diff --git a/.github/actions/ensure-active-weka-mount/action.yml b/.github/actions/ensure-active-weka-mount/action.yml
new file mode 100644
index 00000000000..8a80a8ef436
--- /dev/null
+++ b/.github/actions/ensure-active-weka-mount/action.yml
@@ -0,0 +1,17 @@
+name: "Ensure Active Weka Mount"
+description: "Make sure the Weka mount is active"
+
+inputs:
+ os:
+ description: 'Runner OS'
+ required: true
+
+runs:
+ using: "composite"
+ steps:
+ - name: Ensure active weka mount
+ shell: bash
+ run: |
+ sudo systemctl restart mnt-MLPerf.mount
+ sudo /etc/rc.local
+ ls -al /mnt/MLPerf/bit_error_tests
diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml
index 45f3affb431..8851052019e 100644
--- a/.github/workflows/build-artifact.yaml
+++ b/.github/workflows/build-artifact.yaml
@@ -62,6 +62,7 @@ jobs:
build-artifact:
needs: build-docker-image
if: always()
+ timeout-minutes: 30
strategy:
matrix:
arch: ${{ fromJson(inputs.arch || '["grayskull", "wormhole_b0", "blackhole"]') }}
diff --git a/.github/workflows/perf-models-impl.yaml b/.github/workflows/perf-models-impl.yaml
index d44cc99e715..0fb59e1add7 100644
--- a/.github/workflows/perf-models-impl.yaml
+++ b/.github/workflows/perf-models-impl.yaml
@@ -28,11 +28,7 @@ jobs:
- name: Enable Performance mode
run: |
sudo cpupower frequency-set -g performance
- - name: Ensure weka mount is active
- run: |
- sudo systemctl restart mnt-MLPerf.mount
- sudo /etc/rc.local
- ls -al /mnt/MLPerf/bit_error_tests
+ - uses: ./.github/actions/ensure-active-weka-mount
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
diff --git a/.github/workflows/t3000-demo-tests-impl.yaml b/.github/workflows/t3000-demo-tests-impl.yaml
index ce5f82ac000..defc6b3d2b1 100644
--- a/.github/workflows/t3000-demo-tests-impl.yaml
+++ b/.github/workflows/t3000-demo-tests-impl.yaml
@@ -28,11 +28,7 @@ jobs:
- name: Enable performance mode
run: |
sudo cpupower frequency-set -g performance
- - name: Ensure weka mount is active
- run: |
- sudo systemctl restart mnt-MLPerf.mount
- sudo /etc/rc.local
- ls -al /mnt/MLPerf/bit_error_tests
+ - uses: ./.github/actions/ensure-active-weka-mount
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
diff --git a/.github/workflows/t3000-frequent-tests-impl.yaml b/.github/workflows/t3000-frequent-tests-impl.yaml
index a0edf468a68..2df18fbea23 100644
--- a/.github/workflows/t3000-frequent-tests-impl.yaml
+++ b/.github/workflows/t3000-frequent-tests-impl.yaml
@@ -27,11 +27,7 @@ jobs:
runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-functional"]
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- - name: Ensure weka mount is active
- run: |
- sudo systemctl restart mnt-MLPerf.mount
- sudo /etc/rc.local
- ls -al /mnt/MLPerf/bit_error_tests
+ - uses: ./.github/actions/ensure-active-weka-mount
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
index 379cdf2f284..366ec614e74 100644
--- a/.github/workflows/t3000-model-perf-tests-impl.yaml
+++ b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -30,11 +30,7 @@ jobs:
- name: Enable performance mode
run: |
sudo cpupower frequency-set -g performance
- - name: Ensure weka mount is active
- run: |
- sudo systemctl restart mnt-MLPerf.mount
- sudo /etc/rc.local
- ls -al /mnt/MLPerf/bit_error_tests
+ - uses: ./.github/actions/ensure-active-weka-mount
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
diff --git a/.github/workflows/t3000-unit-tests-impl.yaml b/.github/workflows/t3000-unit-tests-impl.yaml
index a84c55120d0..6634dc9cfd5 100644
--- a/.github/workflows/t3000-unit-tests-impl.yaml
+++ b/.github/workflows/t3000-unit-tests-impl.yaml
@@ -30,6 +30,7 @@ jobs:
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
+ - uses: ./.github/actions/ensure-active-weka-mount
- uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}
diff --git a/.github/workflows/tg-frequent-tests-impl.yaml b/.github/workflows/tg-frequent-tests-impl.yaml
index 57be999d057..aaf5208327b 100644
--- a/.github/workflows/tg-frequent-tests-impl.yaml
+++ b/.github/workflows/tg-frequent-tests-impl.yaml
@@ -36,7 +36,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent regression tests
- timeout-minutes: 60
+ timeout-minutes: 90
run: |
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
diff --git a/.github/workflows/tg-model-perf-tests-impl.yaml b/.github/workflows/tg-model-perf-tests-impl.yaml
index dd10b6109a9..255a934a423 100644
--- a/.github/workflows/tg-model-perf-tests-impl.yaml
+++ b/.github/workflows/tg-model-perf-tests-impl.yaml
@@ -37,11 +37,7 @@ jobs:
- name: Enable performance mode
run: |
sudo cpupower frequency-set -g performance
- - name: Ensure weka mount is active
- run: |
- sudo systemctl restart mnt-MLPerf.mount
- sudo /etc/rc.local
- ls -al /mnt/MLPerf/bit_error_tests
+ - uses: ./.github/actions/ensure-active-weka-mount
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
diff --git a/.github/workflows/tgg-model-perf-tests-impl.yaml b/.github/workflows/tgg-model-perf-tests-impl.yaml
index f3d44f2e2ba..df523469fbe 100644
--- a/.github/workflows/tgg-model-perf-tests-impl.yaml
+++ b/.github/workflows/tgg-model-perf-tests-impl.yaml
@@ -37,11 +37,7 @@ jobs:
- name: Enable performance mode
run: |
sudo cpupower frequency-set -g performance
- - name: Ensure weka mount is active
- run: |
- sudo systemctl restart mnt-MLPerf.mount
- sudo /etc/rc.local
- ls -al /mnt/MLPerf/bit_error_tests
+ - uses: ./.github/actions/ensure-active-weka-mount
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
diff --git a/.github/workflows/ttnn-post-commit-wrapper.yaml b/.github/workflows/ttnn-post-commit-wrapper.yaml
index b94be9dd0e3..cd134bb9785 100644
--- a/.github/workflows/ttnn-post-commit-wrapper.yaml
+++ b/.github/workflows/ttnn-post-commit-wrapper.yaml
@@ -25,4 +25,4 @@ jobs:
uses: ./.github/workflows/ttnn-post-commit.yaml
with:
arch: ${{ matrix.test-group.arch}}
- runner-label: ${{ matrix.test-group.runner-label}}
+ runner-label: ${{ matrix.test-group.runner-label }}
diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml
index 011376e876c..d234535b80f 100644
--- a/.github/workflows/ttnn-run-sweeps.yaml
+++ b/.github/workflows/ttnn-run-sweeps.yaml
@@ -13,37 +13,54 @@ on:
- add
- ccl.line_all_gather
- ccl.all_gather_n300
+ - eltwise.unary.abs.abs_pytorch2
- eltwise.unary.relu.relu
+ - eltwise.unary.relu.relu_pytorch2
- eltwise.unary.gelu.gelu
+ - eltwise.unary.gelu.gelu_pytorch2
+ - eltwise.unary.hardsigmoid.hardsigmoid_pytorch2
+ - eltwise.unary.leaky_relu.leaky_relu_pytorch2
- eltwise.unary.cos.cos
+ - eltwise.unary.cos.cos_pytorch2
- eltwise.unary.sin.sin
+ - eltwise.unary.sin.sin_pytorch2
+ - eltwise.unary.tril.tril_pytorch2
- eltwise.unary.clamp.clamp
- eltwise.unary.clip.clip
- eltwise.unary.cbrt.cbrt
- eltwise.unary.rsub.rsub
+ - eltwise.unary.rsub.rsub_pytorch2
+ - eltwise.unary.rsqrt.rsqrt_pytorch2
- eltwise.unary.rdiv.rdiv
- eltwise.unary.frac.frac
- eltwise.unary.ceil.ceil
+ - eltwise.unary.ceil.ceil_pytorch2
- eltwise.unary.trunc.trunc
- eltwise.unary.floor.floor
+ - eltwise.unary.floor.floor_pytorch2
- eltwise.unary.clone.clone
- eltwise.unary.elu.elu
+ - eltwise.unary.elu.elu_pytorch2
- eltwise.unary.erfc.erfc
- eltwise.unary.exp.exp
+ - eltwise.unary.exp.exp_pytorch2
- eltwise.unary.exp2.exp2
- eltwise.unary.expm1.expm1
- eltwise.unary.tanh.tanh
+ - eltwise.unary.tanh.tanh_pytorch2
- eltwise.unary.sign.sign
- eltwise.unary.rad2deg.rad2deg
- eltwise.unary.deg2rad.deg2rad
- eltwise.unary.relu6.relu6
- eltwise.unary.log.log
+ - eltwise.unary.log.log_pytorch2
- eltwise.unary.log1p.log1p
- eltwise.unary.log2.log2
- eltwise.unary.log10.log10
- eltwise.unary.bitwise.bitwise_and
- eltwise.unary.bitwise.bitwise_left_shift
- eltwise.unary.bitwise.bitwise_not
+ - eltwise.unary.bitwise.bitwise_not_pytorch2
- eltwise.unary.bitwise.bitwise_or
- eltwise.unary.bitwise.bitwise_right_shift
- eltwise.unary.bitwise.bitwise_xor
@@ -55,9 +72,10 @@ on:
- eltwise.unary.erfinv.erfinv
- eltwise.unary.i0.i0
- eltwise.unary.silu.silu
+ - eltwise.unary.silu.silu_pytorch2
- eltwise.unary.glu.glu
- - eltwise.unary.lgamma.lgamma
- eltwise.unary.sigmoid.sigmoid
+ - eltwise.unary.sigmoid.sigmoid_pytorch2
- eltwise.unary.sigmoid_accurate.sigmoid_accurate
- eltwise.unary.tril.tril
- eltwise.unary.triu.triu
@@ -74,8 +92,28 @@ on:
- eltwise.unary.sinh.sinh
- eltwise.unary.relu_min.relu_min
- eltwise.unary.relu_max.relu_max
+ - eltwise.unary.softplus.softplus
+ - eltwise.unary_backward.clamp_bw.clamp_bw
+ - eltwise.unary_backward.hardtanh_bw.hardtanh_bw
+ - eltwise.unary_backward.mul_bw.mul_bw
+ - eltwise.unary_backward.softplus_bw.softplus_bw
+ - eltwise.unary_backward.threshold_bw.threshold_bw
+ - eltwise.unary_backward.div_bw.div_bw
+ - eltwise.unary_backward.log_bw.log_bw
+ - eltwise.unary_backward.relu6_bw.relu6_bw
+ - eltwise.unary.lgamma
+ - eltwise.unary.logit
+ - eltwise.unary.mish
+ - eltwise.unary.multigammaln
+ - eltwise.unary.isfinite
+ - eltwise.unary.isinf
+ - eltwise.unary.isnan
+ - eltwise.unary.isneginf
+ - eltwise.unary.isposinf
+ - eltwise.binary.add.add_all_pytorch2
- eltwise.binary.subtract.subtract
- eltwise.binary.multiply.multiply
+ - eltwise.binary.multiply.mul_tensor_pytorch2
- eltwise.binary.div.div
- eltwise.binary.div_no_nan.div_no_nan
- eltwise.binary.logical_or.logical_or_
@@ -90,9 +128,14 @@ on:
- eltwise.binary.remainder.remainder
- eltwise.binary.squared_difference.squared_difference
- eltwise.binary.squared_difference_output.squared_difference_output
+ - eltwise.binary.remainder.remainder_scalar_pytorch2
- eltwise.binary.bcast.bcast_h_sharded
- eltwise.binary.bcast.bcast
+ - eltwise.binary.eq.eq_scalar_pytorch2
+ - eltwise.binary.gt.gt_scalar_pytorch2
+ - eltwise.binary.le.le_tensor_pytorch2
- eltwise.binary.fmod.fmod
+ - eltwise.binary.floor_divide.floor_divide_pytorch2
- eltwise.binary.logaddexp.logaddexp
- eltwise.binary.ldexp.ldexp
- eltwise.binary.hypot.hypot
@@ -100,11 +143,18 @@ on:
- eltwise.composite.binary.addalpha.addalpha
- eltwise.composite.binary.subalpha.subalpha
- eltwise.composite.binary.minimum.minimum
+ - eltwise.composite.binary.minimum.minimum_pytorch2
- eltwise.composite.binary.maximum.maximum
+ - eltwise.composite.binary.maximum.maximum_pytorch2
+ - eltwise.composite.binary.pow.pow_pytorch2
+ - eltwise.composite.binary.pow.pow_scalar_pytorch2
+ - eltwise.composite.binary.pow.pow_tensor_pytorch2
- eltwise.ternary.addcmul.addcmul
- eltwise.ternary.addcdiv.addcdiv
- eltwise.ternary.mac.mac
+ - eltwise.ternary.lerp
- eltwise.ternary.where.where
+ - eltwise.ternary.where.where_pytorch2
- matmul.full.matmul_default_block_sharded
- matmul.full.matmul_default_height_sharded
- matmul.full.matmul_default_interleaved
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ca911f87ae..521ec920bd6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,11 @@
cmake_minimum_required(VERSION 3.16)
cmake_policy(VERSION 3.16)
+# Sanity check: forgetting to clone submodules is a common omission that results in a poor error message.
+if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/tt_metal/third_party/umd/CMakeLists.txt")
+ message(FATAL_ERROR "Missing submodules. Run: git submodule update --init --recursive")
+endif()
+
############################################
# Project setup
############################################
diff --git a/CODEOWNERS b/CODEOWNERS
index 81e62bd088a..2f5b8e2b2ac 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -126,10 +126,10 @@ tests/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt
/models/ @tt-rkim @uaydonat
/models/*/**
models/conv_on_device_utils*.py @mywoodstock @shwetankTT @sankarmanoj-tt
-functional_*/ @eyonland @patrickroberts @yan-zaretskiy @cfjchu @xanderchin
-models/demos @eyonland @patrickroberts @yan-zaretskiy @cfjchu @xanderchin
+functional_*/ @uaydonat @esmalTT
+models/demos @uaydonat @tt-rkim
models/demos/metal_BERT_large_11 @tt-aho @TT-BrianLiu
-models/demos/wormhole @uaydonat @eyonland
+models/demos/wormhole @uaydonat @tt-rkim
models/demos/t3000 @uaydonat
models/demos/falcon7b_common @skhorasganiTT @djordje-tt @uaydonat
models/demos/wormhole/mamba @esmalTT @uaydonat @kpaigwar
@@ -145,7 +145,7 @@ models/demos/t3000/llama3_70b @cglagovichTT @uaydonat @johanna-rock-tt @djordje-
models/demos/t3000/mixtral8x7b @yieldthought @mtairum @uaydonat
models/demos/tg/llama3_70b @cglagovichTT @uaydonat @johanna-rock-tt @djordje-tt @kpaigwar
models/demos/tg/falcon7b @skhorasganiTT @djordje-tt @uaydonat
-models/demos/grayskull @uaydonat @eyonland
+models/demos/grayskull @uaydonat @tt-rkim
models/demos/**/*resnet* @mywoodstock @shwetankTT @tt-aho
models/experimental/functional_unet @esmalTT @uaydonat @mywoodstock
models/perf/ @uaydonat @tt-rkim
diff --git a/METALIUM_GUIDE.md b/METALIUM_GUIDE.md
index 55ae85c85c3..a68f5ba129e 100644
--- a/METALIUM_GUIDE.md
+++ b/METALIUM_GUIDE.md
@@ -126,7 +126,7 @@ kernel:
namespace NAMESPACE {
void MAIN {
mm_init();
- acquire_dst(tt::DstMode::Tile);
+ acquire_dst();
cb_wait_front(tt::CB::c_in0, /* number of tiles */ 1);
cb_wait_front(tt::CB::c_in1, /* number of tiles */ 1);
@@ -140,7 +140,7 @@ void MAIN {
pack_tile(0, tt::CB::c_out0);
cb_push_back(tt::CB::c_out0, /* number of tiles */ 1);
- release_dst(tt::DstMode::Tile);
+ release_dst();
}
} // namespace NAMESPACE
```
@@ -149,7 +149,7 @@ It takes two matrix tiles from `tt::CB::c_in0` and `tt::CB::c_in1` in L1 and
conducts a single-tile matrix multiplication. Finally, it packs the result to
`tt::CB::c_out0`.
-Note that tile registers are acquired by `acquire_dst(..)`, but actually we can
+Note that tile registers are acquired by `acquire_dst()`, but we can also
 use the `tile_regs_..()` functions for a more fine-grained tile register locking
 mechanism. We will explain the details at the end of this section.
@@ -226,10 +226,10 @@ inline __attribute__((always_inline)) void cb_wait_front(uint32_t cbid, uint32_t
}
```
-Another interesting function is `acquire_dst(tt::DstMode mode)`:
+Another interesting function is `acquire_dst()`:
* The UNPACK kernel has an empty one:
```
-inline __attribute__((always_inline)) void acquire_dst(tt::DstMode mode) {
+inline __attribute__((always_inline)) void acquire_dst() {
;
;
@@ -237,7 +237,7 @@ inline __attribute__((always_inline)) void acquire_dst(tt::DstMode mode) {
```
* The MATH kernel waits for DEST to become available:
```
-inline __attribute__((always_inline)) void acquire_dst(tt::DstMode mode) {
+inline __attribute__((always_inline)) void acquire_dst() {
( llk_math_wait_for_dest_available() );
;
@@ -245,7 +245,7 @@ inline __attribute__((always_inline)) void acquire_dst(tt::DstMode mode) {
```
* The PACK kernel waits for the end of the MATH kernel:
```
-inline __attribute__((always_inline)) void acquire_dst(tt::DstMode mode) {
+inline __attribute__((always_inline)) void acquire_dst() {
;
( llk_packer_wait_for_math_done() );
@@ -254,14 +254,14 @@ inline __attribute__((always_inline)) void acquire_dst(tt::DstMode mode) {
[Its implementation](https://github.com/tenstorrent/tt-metal/blob/6d4951a20ca4c392888f924f038ae0780a8cc656/tt_metal/include/compute_kernel_api/reg_api.h#L28-L32) matches the preprocessed code:
```
-ALWI void acquire_dst(tt::DstMode mode) {
+ALWI void acquire_dst() {
MATH(( llk_math_wait_for_dest_available() ));
PACK(( llk_packer_wait_for_math_done() ));
}
```
-Based on the implementation of `acquire_dst(..)`, if we use it, we can guess it
+Based on the implementation of `acquire_dst()`, we can infer that
 UNPACK, MATH, and PACK execute in order, which helps you follow the
 execution order and the instructions that actually run on each kernel.
@@ -292,7 +292,7 @@ ALWI void tile_regs_release() {
}
```
-We can replace `acquire_dst(..)` and `release_dst(..)` from the above example
+We can replace `acquire_dst()` and `release_dst()` in the above example
with `tile_regs_..()` functions like:
```
namespace NAMESPACE {
diff --git a/README.md b/README.md
index a96ce18abc0..cbad5ca6cb6 100644
--- a/README.md
+++ b/README.md
@@ -21,22 +21,24 @@
---
## LLMs
-| Model | Batch | Hardware | ttft (s) | t/s/u | Target t/s/u | Release |
-|----------------------------------------------------------------------|-------|----------------------------------------------------------|------------|-------|--------------|---------------------------------------------------------------------------|
-| [Falcon7B-decode](./models/demos/ttnn_falcon7b) | 32 | [e150](https://tenstorrent.com/hardware/grayskull) | | 4.2 | 4.4 | |
-| [Falcon7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.07 | 16.7 | 26 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [Mistral-7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) |
-| [Mamba-2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.04 | 12.3 | 41 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) |
-| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.20 | 21.4 | 23 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [Falcon7B (data parallel)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.10 | 14.1 | 26 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [LLaMA-2-70B - (tensor parallel)](./models/demos/t3000/llama2_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.19 | 15.1 | 20 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [LLaMA-3.1-70B (tensor parallel)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.19 | 15.1 | 20 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [Falcon40B (tensor parallel)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [Mixtral7Bx8 (tensor parallel)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.23 | 14.2 | 33 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-| [Falcon7B (data parallel)](./models/demos/tg/falcon7b) |1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 0.24 | 4.3 | 26 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
-> **Last Update:** September 23, 2024
+| Model                                                          | Batch | Hardware                                                 | ttft (s) | t/s/u | Target t/s/u    | t/s    | Release                                                                     |
+|---------------------------------------------------------------|-------|----------------------------------------------------------|----------|-------|-----------------|--------|---------------------------------------------------------------------------|
+| [Falcon7B-decode](./models/demos/ttnn_falcon7b) | 32 | [e150](https://tenstorrent.com/hardware/grayskull) | | 4.2 | 4.4 | 134.4 | |
+| [Falcon7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.07 | 16.7 | 26 | 534.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+| [Mistral-7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | 316.8 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) |
+| [Mamba-2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.04 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) |
+| [LLaMA-3.1-8B](./models/demos/wormhole/llama31_8b) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.20 | 21.4 | 23 | 21.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+| [Falcon7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.10 | 14.4 | 26 | 3686.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+| [LLaMA-2-70B - (TP=8)](./models/demos/t3000/llama2_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.19 | 15.1 | 20 | 483.2 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+| [LLaMA-3.1-70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.19 | 15.1 | 20 | 483.2 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+| [Falcon40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+| [Mixtral7Bx8 (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 0.23 | 14.2 | 33 | 454.4 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+| [Falcon7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 0.24 | 4.4 | 26 | 4505.6 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+| [LLaMA-3.1-70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 0.19 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) |
+> **Last Update:** October 7, 2024
> **Notes:**
+> - TP = Tensor Parallel, DP = Data Parallel; these define the parallelization factors across multiple devices.
> - The reported LLM performance is for an input sequence length (number of rows filled in the KV cache) of 128 for all models except Mamba (which can accept any sequence length).
> - The t/s/u reported is the throughput of the first token generated after prefill, i.e. 1 / inter token latency.
@@ -45,22 +47,20 @@
|-----------------------------------------------------------------------------|-------|----------------------------------------------------------|---------|------------|-------------|
| [ResNet-50 (224x224)](./models/demos/grayskull/resnet50) | 20 | [e150](https://tenstorrent.com/hardware/grayskull) | 5,100 | 10,000 | |
| [ResNet-50 (224x224)](./models/demos/wormhole/resnet50) | 16 | [n150](https://tenstorrent.com/hardware/wormhole) | 4,100 | 7,000 | |
-| [ResNet-50 (224x224) (data parallel)](./models/demos/t3000/resnet50) | 128 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 32,250 | 56,000 | |
-| [ResNet-50 (224x224) (data parallel)](./models/demos/tg/resnet50) | 512 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 66,150 | 224,000 | |
-| [ResNet-50 (224x224) (data parallel)](./models/demos/tgg/resnet50) | 1024 | [Two Galaxies](https://tenstorrent.com/hardware/galaxy) | 128,800 | 448,000 | |
+| [ResNet-50 (224x224) (DP=8)](./models/demos/t3000/resnet50) | 128 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 32,250 | 56,000 | |
+| [ResNet-50 (224x224) (DP=32)](./models/demos/tg/resnet50) | 512 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 95,900 | 224,000 | |
+| [ResNet-50 (224x224) (DP=64)](./models/demos/tgg/resnet50) | 1024 | [Two Galaxies](https://tenstorrent.com/hardware/galaxy) | 128,800 | 448,000 | |
| [ViT](./models/demos/grayskull/vit) | 9 | [e150](https://tenstorrent.com/hardware/grayskull) | 1,360 | 2,000 | |
| [ViT](./models/demos/wormhole/vit) | 8 | [n150](https://tenstorrent.com/hardware/wormhole) | 912 | 1,600 | |
| [Stable Diffusion 1.4 (512x512)](./models/demos/wormhole/stable_diffusion) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.167 | 0.3 | |
## NLPs
-| Model | Batch | Hardware | sen/sec | Target sen/sec | Release |
-|-----------------------------------------------------|-------|----------------------------------------------------|-----------|----------------|-------------|
-| [BERT-Large](./models/demos/metal_BERT_large_11/) | 12 | [e150](https://tenstorrent.com/hardware/grayskull) | 370 | 410 | |
-| [BERT-Large](./models/demos/metal_BERT_large_11/) | 8 | [n150](https://tenstorrent.com/hardware/wormhole) | 270 | 400 | |
-| [T5 small](.models/demos/grayskull/t5) | | [e150](https://tenstorrent.com/hardware/grayskull) | 140 | | |
-| [Bloom](.models/demos/grayskull/functional_bloom) | | [e150](https://tenstorrent.com/hardware/grayskull) | 70 | | |
-
-
+| Model | Batch | Hardware | sen/sec | Target sen/sec | Release |
+|-----------------------------------------------------|-------|----------------------------------------------------|---------|----------------|---------|
+| [BERT-Large](./models/demos/metal_BERT_large_11/) | 12 | [e150](https://tenstorrent.com/hardware/grayskull) | 370 | 410 | |
+| [BERT-Large](./models/demos/metal_BERT_large_11/) | 8 | [n150](https://tenstorrent.com/hardware/wormhole) | 270 | 400 | |
+| [T5 small](./models/demos/grayskull/t5)             |       | [e150](https://tenstorrent.com/hardware/grayskull) | 140     |                |         |
+| [Bloom](./models/demos/grayskull/functional_bloom)  |       | [e150](https://tenstorrent.com/hardware/grayskull) | 70      |                |         |
## Model Updates
For the latest model updates and features, please see [MODEL_UPDATES.md](models/MODEL_UPDATES.md)
diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
index d222640ea41..a598b632fc7 100644
--- a/cmake/dependencies.cmake
+++ b/cmake/dependencies.cmake
@@ -55,3 +55,23 @@ CPMAddPackage(
GITHUB_REPOSITORY boost-ext/reflect
GIT_TAG v1.1.1
)
+
+############################################################################################################################
+# magic_enum : https://github.com/Neargye/magic_enum
+############################################################################################################################
+
+CPMAddPackage(
+ NAME magic_enum
+ GITHUB_REPOSITORY Neargye/magic_enum
+ GIT_TAG v0.9.6
+)
+
+############################################################################################################################
+# fmt : https://github.com/fmtlib/fmt
+############################################################################################################################
+
+CPMAddPackage(
+ NAME fmt
+ GITHUB_REPOSITORY fmtlib/fmt
+ GIT_TAG 11.0.1
+)
diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/acquire_dst.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/acquire_dst.rst
index c6b725cf683..10c52fc84d1 100644
--- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/acquire_dst.rst
+++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/acquire_dst.rst
@@ -1,4 +1,4 @@
acquire_dst
===========
-.. doxygenfunction:: acquire_dst(tt::DstMode mode)
+.. doxygenfunction:: acquire_dst()
diff --git a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/release_dst.rst b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/release_dst.rst
index 481b642fac9..804b60edf61 100644
--- a/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/release_dst.rst
+++ b/docs/source/tt-metalium/tt_metal/apis/kernel_apis/compute/release_dst.rst
@@ -1,4 +1,4 @@
release_dst
===========
-.. doxygenfunction:: release_dst(tt::DstMode mode)
+.. doxygenfunction:: release_dst()
diff --git a/docs/source/ttnn/ttnn/api.rst b/docs/source/ttnn/ttnn/api.rst
index 12292183f9d..9acf162cc83 100644
--- a/docs/source/ttnn/ttnn/api.rst
+++ b/docs/source/ttnn/ttnn/api.rst
@@ -440,6 +440,17 @@ Normalization
ttnn.layer_norm
ttnn.rms_norm
+
+Moreh Operations
+================
+
+.. autosummary::
+ :toctree: api
+ :nosignatures:
+ :template: function.rst
+
+ ttnn.moreh_sum
+
Transformer
===========
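For reference, a minimal usage sketch of the newly documented `ttnn.moreh_sum`. The tensor shape, the `dim` argument, and the device open/close boilerplate are illustrative assumptions, not taken from this diff:

```python
import torch
import ttnn

# A minimal sketch, assuming device 0 is available.
device = ttnn.open_device(device_id=0)

# moreh_sum reduces along a dimension; dim=1 is an assumed example value.
torch_input = torch.rand((1, 2, 32, 32), dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_input, layout=ttnn.TILE_LAYOUT, device=device)
output_tensor = ttnn.moreh_sum(input_tensor, dim=1)

result = ttnn.to_torch(output_tensor)
ttnn.close_device(device)
```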
diff --git a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst
index c6f12613744..7a87a746005 100644
--- a/docs/source/ttnn/ttnn/dependencies/tt_lib.rst
+++ b/docs/source/ttnn/ttnn/dependencies/tt_lib.rst
@@ -34,7 +34,7 @@ New Device Operation
struct {
void validate(const std::vector &input_tensors) const;
- std::vector compute_output_shapes(const std::vector &input_tensors) const;
+ std::vector compute_output_shapes(const std::vector &input_tensors) const;
std::vector create_output_tensors(const std::vector &input_tensors) const;
operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, std::vector &output_tensors) const;
};
@@ -48,7 +48,7 @@ New Device Operation with a member
int some_member
void validate(const std::vector &input_tensors) const;
- std::vector compute_output_shapes(const std::vector &input_tensors) const;
+ std::vector compute_output_shapes(const std::vector &input_tensors) const;
std::vector create_output_tensors(const std::vector &input_tensors) const;
operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, std::vector &output_tensors) const;
};
@@ -61,7 +61,7 @@ New Device Operation with Optional Input Tensors
struct {
void validate(const std::vector &input_tensors,
const std::vector>& optional_input_tensors) const;
- std::vector compute_output_shapes(const std::vector &input_tensors) const;
+ std::vector compute_output_shapes(const std::vector &input_tensors) const;
std::vector create_output_tensors(const std::vector &input_tensors) const;
operation::ProgramWithCallbacks create_program(
const std::vector& input_tensors,
@@ -80,7 +80,7 @@ and create_output_tensors with the additional parameter for the output_tensors.
struct {
void validate_with_output_tensors(const std::vector &input_tensors, const std::vector>& output_tensors) const;
- std::vector compute_output_shapes(const std::vector &input_tensors) const;
+ std::vector compute_output_shapes(const std::vector &input_tensors) const;
std::vector> create_output_tensors(const std::vector &input_tensors, const std::vector>& output_tensors) const;
operation::ProgramWithOptionalOutputTensors create_program(const std::vector& input_tensors, std::vector> &output_tensors) const;
@@ -209,17 +209,17 @@ TT-LIB API through ``tt_lib``
Primary Operations
==================
-.. autofunction:: tt_lib.operations.primary.moreh_softmax
+.. autofunction:: ttnn.operations.moreh.softmax
-.. autofunction:: tt_lib.operations.primary.moreh_softmax_backward
+.. autofunction:: ttnn.operations.moreh.softmax_backward
-.. autofunction:: tt_lib.operations.primary.moreh_softmin
+.. autofunction:: ttnn.operations.moreh.softmin
-.. autofunction:: tt_lib.operations.primary.moreh_softmin_backward
+.. autofunction:: ttnn.operations.moreh.softmin_backward
-.. autofunction:: tt_lib.operations.primary.moreh_logsoftmax
+.. autofunction:: ttnn.operations.moreh.logsoftmax
-.. autofunction:: tt_lib.operations.primary.moreh_logsoftmax_backward
+.. autofunction:: ttnn.operations.moreh.logsoftmax_backward
.. autofunction:: ttnn.operations.moreh.mean
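A hedged sketch of the call-site migration these doc changes imply: the ops move from the `tt_lib.operations.primary` namespace to `ttnn.operations.moreh`. The `dim` argument is an illustrative assumption; check the op's generated docs for the real signature.

```python
import torch
import ttnn

device = ttnn.open_device(device_id=0)
x = ttnn.from_torch(
    torch.rand((1, 1, 32, 32), dtype=torch.bfloat16),
    layout=ttnn.TILE_LAYOUT,
    device=device,
)

# Before: out = tt_lib.operations.primary.moreh_softmax(x, dim=-1)
# After: the same op now lives in the ttnn namespace.
out = ttnn.operations.moreh.softmax(x, dim=-1)

ttnn.close_device(device)
```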
diff --git a/models/MODEL_HYBRID_TP_DP.md b/models/MODEL_HYBRID_TP_DP.md
new file mode 100644
index 00000000000..299cfdc369c
--- /dev/null
+++ b/models/MODEL_HYBRID_TP_DP.md
@@ -0,0 +1,44 @@
+# Hybrid Tensor and Data Parallelism Implementation
+
+This short guide explains how to add hybrid tensor and data parallelism to your model using submesh tiling across a larger mesh.
+
+## Overview of Changes
+
+The main changes involve:
+
+1. Creating multiple submeshes from the main mesh
+2. Running the model on each submesh
+3. Capturing and replaying a trace across all submeshes in parallel
+
+## Key Implementation Details
+
+### 1. Submesh Creation
+
+```python
+ # Work with submesh device as you would with a regular ttnn.MeshDevice
+ submesh_devices: List[ttnn.MeshDevice] = mesh_device.create_submeshes((2, 4), ttnn.MeshType.Ring)
+```
+
+### 2. Compile & Run the Model on Each Submesh
+
+```python
+ # Run the model on each submesh
+ for submesh_device in submesh_devices:
+ model(..., device=submesh_device)
+```
+
+### 3. Capture & Replay the Trace
+
+```python
+
+ # Capture Model Trace spanning all submeshes
+ trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
+ for submesh_device in submesh_devices:
+ model(..., device=submesh_device) # Run the model on each submesh
+ ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)
+
+ # Execute Model Trace across all submeshes in parallel
+ ttnn.execute_trace(mesh_device, trace_id, blocking=False)
+ ttnn.release_trace(mesh_device, trace_id)
+
+```
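
Taken together, a compact end-to-end sketch of the three steps above; the `model` callable and the 2x4 submesh shape are placeholders carried over from the snippets:

```python
from typing import List

import ttnn


def run_hybrid_tp_dp(mesh_device: ttnn.MeshDevice, model) -> None:
    # 1. Tile the parent mesh into submeshes (shape and topology are examples).
    submesh_devices: List[ttnn.MeshDevice] = mesh_device.create_submeshes((2, 4), ttnn.MeshType.Ring)

    # 2. Compile and run the model once on each submesh.
    for submesh_device in submesh_devices:
        model(device=submesh_device)

    # 3. Capture a single trace spanning all submeshes, then replay it in parallel.
    trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
    for submesh_device in submesh_devices:
        model(device=submesh_device)
    ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)

    ttnn.execute_trace(mesh_device, trace_id, blocking=False)
    ttnn.release_trace(mesh_device, trace_id)
```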
diff --git a/models/MODEL_UPDATES.md b/models/MODEL_UPDATES.md
index 1c3e02e2651..feaa61cc031 100644
--- a/models/MODEL_UPDATES.md
+++ b/models/MODEL_UPDATES.md
@@ -4,6 +4,13 @@
>
> Please refer to the front-page [README](../README.md) for the latest verified release for each model.
+## October 7, 2024
+
+### [Llama 3.1 - 8B](demos/wormhole/llama31_8b)
+- Added support for continuous batching
+- Added paged caching support for PagedAttention
+- Added a demo which runs with TT-NN tracing (23 t/s/u decode on main)
+
## September 23, 2024
### [Llama 3/3.1 - 70B](demos/t3000/llama3_70b)
diff --git a/models/demos/t3000/falcon40b/demo/demo.py b/models/demos/t3000/falcon40b/demo/demo.py
index e04e903ed34..9e3451dde37 100644
--- a/models/demos/t3000/falcon40b/demo/demo.py
+++ b/models/demos/t3000/falcon40b/demo/demo.py
@@ -527,8 +527,7 @@ def run_falcon_demo_kv(
if not perf_mode:
print_output_prompts(generated_ids, tokenizer)
- for device in devices:
- device.disable_and_clear_program_cache()
+ mesh_device.disable_and_clear_program_cache()
generated_text = tokenizer.batch_decode(generated_ids.tolist())
diff --git a/models/demos/t3000/falcon40b/demo/expected_output_data.json b/models/demos/t3000/falcon40b/demo/expected_output_data.json
index 5821c3c7b42..380fb5eeff1 100644
--- a/models/demos/t3000/falcon40b/demo/expected_output_data.json
+++ b/models/demos/t3000/falcon40b/demo/expected_output_data.json
@@ -1 +1 @@
-["List the first 5 prime numbers \nThe first 5 prime numbers are 2, 3, 5, 7, and 11. ", "Give a brief history of the internet \nThe internet was invented in the late 1960s by a group of researchers at the University of California, Los Angeles (UCLA). The first message was sent between two computers in 1969, and the first email was sent in 1971. The internet grew rapidly in the 1990s, and by the end of the decade, it had become a global phenomenon. Today, the internet is used for everything from shopping to social media to streaming movies and TV shows. ", "Describe to me some good coding practices \nSome good coding practices include: \n\n1. Properly commenting code to make it easier to understand and maintain. \n2. Using consistent naming conventions for variables and functions. \n3. Keeping code organized and modularized. \n4. Testing code thoroughly before deploying it. \n5. Using version control to track changes and revert mistakes. \n6. Avoiding unnecessary complexity and over-engineering. \n7. Writing clean and readable code. \n8. Using appropriate data types and avoiding unnecessary conversions. \n9. Minimizing the use of global variables", "write a short poem about Paris in English\nParis is a city of love and romance,\nWhere the streets are filled with art and culture,\nThe Eiffel Tower stands tall and proud,\nAnd the Seine River flows through the heart of the city,\nParis is a city of beauty and charm,\nWhere the people are friendly and welcoming,\nThe cafes and restaurants are filled with delicious food,\nAnd the museums and galleries are filled with treasures,\nParis is a city of history and tradition,\nWhere the monuments and landmarks are breathtaking,\nThe architecture is stunning and unique,\nAnd the city is full of life and", "Who is the inventor of the telephone?\nAlexander Graham Bell is credited with inventing the telephone in 1876. ", "write a short poem about Istanbul in English\nIstanbul is a city of contrasts,\nWhere East meets West,\nWhere ancient meets modern,\nWhere old meets new,\nWhere past meets present,\nWhere history meets future,\nWhere tradition meets innovation,\nWhere culture meets commerce,\nWhere religion meets secularism,\nWhere art meets architecture,\nWhere beauty meets chaos,\nWhere diversity meets unity,\nWhere the old city meets the new city,\nWhere the past meets the future,\nWhere the East meets the West,\nWhere the old meets the new,\nWhere the ancient meets the modern,\nWhere the", "What are the tourist attractions in Paris?\nParis is home to many famous tourist attractions such as the Eiffel Tower, Notre-Dame Cathedral, the Louvre Museum, the Champs-\u00c9lys\u00e9es, the Palace of Versailles, and the Seine River. Other popular attractions include the Arc de Triomphe, Montmartre, and the Parisian parks such as Jardin des Tuileries and Parc de la Villette. ", "How many countries are in Africa? \nThere are 54 countries in Africa. ", "what is the capital of USA? \nThe capital of USA is Washington D.C. ", "what is the capital of Canada? \nThe capital of Canada is Ottawa. ", "what is the capital of UK? \nThe capital of UK is London. ", "what is the capital of Germany? \nThe capital of Germany is Berlin. ", "what is the capital of France? \nThe capital of France is Paris. ", "what is the capital of Japan? \nThe capital of Japan is Tokyo. ", "what is the capital of India? \nThe capital of India is New Delhi. ", "what is the capital of China? \nThe capital of China is Beijing. ", "what is the currency of Cuba? 
\nThe currency of Cuba is the Cuban peso (CUP). ", "what is the currency of Lebanon? \nThe currency of Lebanon is the Lebanese pound (LBP). ", "what is the currency of Brazil? \nThe currency of Brazil is the Brazilian Real (BRL). ", "what is the currency of Australia? \nThe currency of Australia is the Australian dollar (AUD). ", "what is the currency of Jamaica? \nThe currency of Jamaica is the Jamaican dollar (JMD). ", "what is the currency of Egypt? \nThe currency of Egypt is the Egyptian pound (EGP). ", "what is the currency of Uzbekistan? \nThe currency of Uzbekistan is the Uzbekistani som (UZS). ", "what is the currency of Argentina? \nThe currency of Argentina is the Argentine peso. ", "describe the geographic location of London in UK\nLondon is located in the southeast of England, on the River Thames. It is the capital city of the United Kingdom and the largest city in Europe. ", "describe the geographic location of Toronto in Canada\nToronto is located in the southern part of Ontario, Canada. It is situated on the northwestern shore of Lake Ontario, and is the largest city in Canada. ", "describe the geographic location of Madrid in Spain\nMadrid is located in the center of Spain, in the heart of the Iberian Peninsula. It is the capital city of Spain and the largest city in the country. Madrid is situated in a valley surrounded by mountains, which gives it a unique climate and geography. ", "describe the geographic location of Paris in France\nParis is located in the north-central part of France, on the Seine River. It is the capital city of France and the largest city in the country. ", "describe the geographic location of Rome in Italy\nRome is located in central Italy, on the Tiber River. It is the capital city of Italy and the largest city in the country. ", "describe the geographic location of Istanbul in Turkey\nIstanbul is located in the northwest corner of Turkey, on the Bosphorus Strait, which connects the Black Sea to the Sea of Marmara. It is the largest city in Turkey and the fifth largest city in the world. ", "describe the geographic location of Shanghai in China\nShanghai is located in eastern China, on the Yangtze River Delta. It is the largest city in China and one of the largest cities in the world. ", "describe the geographic location of Lagos in Nigeria\nLagos is located in the southwestern part of Nigeria, on the Gulf of Guinea. It is the largest city in Nigeria and the fifth largest city in Africa. Lagos is also the economic and cultural hub of Nigeria, with a population of over 20 million people. "]
+["List the first 5 prime numbers \nThe first 5 prime numbers are 2, 3, 5, 7, and 11. ", "Give a brief history of the internet \nThe internet was invented in the late 1960s by computer scientists at the University of California, Los Angeles (UCLA). It was originally called ARPANET and was designed to allow scientists to share information and resources across different computer networks. In the 1990s, the internet became more widely available to the public and began to transform the way people communicate and access information. Today, the internet is a ubiquitous part of modern life, with billions of people using it daily for everything from shopping to social media to streaming entertainment. ", "Describe to me some good coding practices \nSome good coding practices include: \n\n1. Properly commenting code to make it easier to understand and maintain. \n2. Using consistent naming conventions for variables and functions. \n3. Writing clean and readable code that is easy to debug. \n4. Avoiding unnecessary complexity and keeping code simple and concise. \n5. Using version control to track changes and revert mistakes. \n6. Testing code thoroughly before deploying it. \n7. Keeping up-to-date with industry standards and best practices. \n8. Collaborating with other developers to improve code", "write a short poem about Paris in English\nParis is a city of love and romance,\nWhere the streets are filled with art and culture,\nThe Eiffel Tower stands tall and proud,\nAnd the Seine River flows through the heart of the city,\nParis is a city of dreams and possibilities,\nWhere the people are friendly and welcoming,\nThe cafes and restaurants are filled with delicious food,\nAnd the museums and galleries are filled with treasures,\nParis is a city of beauty and charm,\nWhere the architecture is stunning and the parks are lush,\nThe city is alive with energy and excitement,\nAnd the people", "Who is the inventor of the telephone?\nAlexander Graham Bell is credited with inventing the telephone in 1876. ", "write a short poem about Istanbul in English\nIstanbul is a city of contrasts,\nWhere East meets West,\nWhere ancient meets modern,\nWhere old meets new,\nWhere past meets present,\nWhere history meets future,\nWhere tradition meets innovation,\nWhere culture meets commerce,\nWhere religion meets secularism,\nWhere art meets architecture,\nWhere beauty meets chaos,\nWhere diversity meets unity,\nWhere the old city meets the new city,\nWhere the past meets the future,\nWhere the East meets the West,\nWhere the old meets the new,\nWhere the ancient meets the modern,\nWhere the", "What are the tourist attractions in Paris?\nParis is home to many famous landmarks and attractions such as the Eiffel Tower, Notre-Dame Cathedral, the Louvre Museum, the Champs-\u00c9lys\u00e9es, the Palace of Versailles, and the Seine River. Other popular attractions include the Montmartre district, the Arc de Triomphe, and the Parisian parks such as Jardin des Tuileries and Parc de la Villette. ", "How many countries are in Africa? \nThere are 54 countries in Africa. ", "what is the capital of USA? \nThe capital of USA is Washington D.C. ", "what is the capital of Canada? \nThe capital of Canada is Ottawa. ", "what is the capital of UK? \nThe capital of UK is London. ", "what is the capital of Germany? \nThe capital of Germany is Berlin. ", "what is the capital of France? \nThe capital of France is Paris. ", "what is the capital of Japan? \nThe capital of Japan is Tokyo. ", "what is the capital of India? 
\nThe capital of India is New Delhi. ", "what is the capital of China? \nThe capital of China is Beijing. ", "what is the currency of Cuba? \nThe currency of Cuba is the Cuban peso (CUP). ", "what is the currency of Lebanon? \nThe currency of Lebanon is the Lebanese pound (LBP). ", "what is the currency of Brazil? \nThe currency of Brazil is the Brazilian Real (BRL). ", "what is the currency of Australia? \nThe currency of Australia is the Australian dollar (AUD). ", "what is the currency of Jamaica? \nThe currency of Jamaica is the Jamaican dollar. ", "what is the currency of Egypt? \nThe currency of Egypt is the Egyptian pound (EGP). ", "what is the currency of Uzbekistan? \nThe currency of Uzbekistan is the Uzbekistani som (UZS). ", "what is the currency of Argentina? \nThe currency of Argentina is the Argentine peso. ", "describe the geographic location of London in UK\nLondon is located in the southeast of England, on the River Thames. It is the capital city of the United Kingdom and the largest city in Europe. ", "describe the geographic location of Toronto in Canada\nToronto is located in the province of Ontario, Canada. It is situated on the northwestern shore of Lake Ontario, and is the largest city in Canada. Toronto is also the fourth largest city in North America, with a population of over 2.8 million people. ", "describe the geographic location of Madrid in Spain\nMadrid is located in the center of Spain, in the region of Madrid. It is the capital city of Spain and the largest city in the country. Madrid is situated on a plateau at an elevation of 2,180 feet (660 meters) above sea level. ", "describe the geographic location of Paris in France\nParis is located in the north-central part of France, on the Seine River. It is the capital city of France and the largest city in the country. ", "describe the geographic location of Rome in Italy\nRome is located in central Italy, on the Tiber River. It is the capital city of Italy and the largest city in the country. ", "describe the geographic location of Istanbul in Turkey\nIstanbul is located in Turkey, on the Bosphorus Strait, which connects the Black Sea to the Sea of Marmara. It is the largest city in Turkey and the fifth largest city in the world. ", "describe the geographic location of Shanghai in China\nShanghai is located in eastern China, on the Yangtze River Delta. It is the largest city in China and one of the largest cities in the world. ", "describe the geographic location of Lagos in Nigeria\nLagos is located in the southwestern part of Nigeria, on the Gulf of Guinea. It is the largest city in Nigeria and the second largest city in Africa. Lagos is also the economic and cultural center of Nigeria, with a population of over 20 million people. "]
diff --git a/models/demos/t3000/falcon40b/tests/test_falcon_prefill_determinism.py b/models/demos/t3000/falcon40b/tests/test_falcon_prefill_determinism.py
index ba2d857dc5d..993437e0c65 100644
--- a/models/demos/t3000/falcon40b/tests/test_falcon_prefill_determinism.py
+++ b/models/demos/t3000/falcon40b/tests/test_falcon_prefill_determinism.py
@@ -276,8 +276,7 @@ def test_falcon_prefill_end_to_end_determinism(
input_shape = [batch, seq_len]
model_config = get_model_config(model_config_str, "prefill", input_shape, num_devices)
- devices = t3k_mesh_device.get_devices()
- compute_grid_size = devices[0].compute_with_storage_grid_size()
+ compute_grid_size = t3k_mesh_device.compute_with_storage_grid_size()
if compute_grid_size.x < model_config["MAX_GRID_SIZE"][0] or compute_grid_size.y < model_config["MAX_GRID_SIZE"][1]:
pytest.skip(f"Requires grid size of at least {model_config['MAX_GRID_SIZE']} to run")
@@ -286,8 +285,7 @@ def test_falcon_prefill_end_to_end_determinism(
)
if enable_program_cache:
- for device in devices:
- device.enable_program_cache()
+ t3k_mesh_device.enable_program_cache()
run_test_falcon_prefill_end_to_end_determinism(
t3k_mesh_device,
@@ -304,5 +302,4 @@ def test_falcon_prefill_end_to_end_determinism(
)
if enable_program_cache:
- for device in devices:
- device.disable_and_clear_program_cache()
+ t3k_mesh_device.disable_and_clear_program_cache()
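Both falcon40b changes follow the same pattern: `MeshDevice` mirrors the per-device program-cache API, so loops over the constituent devices collapse into a single mesh-level call. A hedged sketch of the resulting usage (the 2x4 mesh shape and the open/close helpers are assumptions for illustration):

```python
import ttnn

# Assumption: an eight-device T3000-style mesh opened as 2x4.
mesh_device = ttnn.open_mesh_device(ttnn.MeshShape(2, 4))

# Before: for device in mesh_device.get_devices(): device.enable_program_cache()
# After: one call applies to every device in the mesh.
mesh_device.enable_program_cache()

# ... build and run programs; compiled kernels are cached across runs ...

mesh_device.disable_and_clear_program_cache()
ttnn.close_mesh_device(mesh_device)
```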
diff --git a/models/demos/t3000/llama2_70b/demo/data/llama3_ground_truth.json b/models/demos/t3000/llama2_70b/demo/data/llama3_ground_truth.json
index e59003fae4c..0d6f8a194d7 100644
--- a/models/demos/t3000/llama2_70b/demo/data/llama3_ground_truth.json
+++ b/models/demos/t3000/llama2_70b/demo/data/llama3_ground_truth.json
@@ -1,34 +1,34 @@
[
"<|begin_of_text|>Tenstorrent is an AI startup whose RISC-V hardware aims to define a new spatial computing platform for the next century. In this interview with CEO and legendary chip architect Jim Keller we learn of their plans to revolutionize the industry. Tenstorrent is an AI startup whose RISC-V hardware aims to define a new spatial computing platform for the next century. In this interview with CEO and legendary chip architect Jim Keller we learn of their plans to revolutionize the industry.\nTenstorrent is an AI startup whose RISC-V hardware aims to define a new spatial computing platform for the next century. In this interview with CEO and legendary chip architect Jim Keller we learn of their plans to revolutionize the industry. Tenstorrent is an AI startup whose RISC-V hardware aims to define a new spatial computing platform for the next century. In this interview with CEO and legendary chip architect",
"<|begin_of_text|>It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way \u2013 in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree",
- "<|begin_of_text|>I like to think (and the sooner the better!) of a cybernetic meadow where mammals and computers live together in mutually programming harmony like pure water touching clear sky. I like to think (right now, please!) of a cybernetic forest filled with pines and electronics where deer stroll peacefully past wise computers that their hooves and horns have created. I like to think (it has to be!) of a cybernetic ecology where we are responsible for the programming of our computers, of a humane meadow of technical immortality.\nI like to think (it must be!) of a cybernetic forest filled with pines and electronics and peo\u00adple strolling about under them, their brains wave fronts",
- "<|begin_of_text|>We the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defence, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America.\nArticle. I.\nAll legislative Powers herein granted shall be vested in a Congress of the United States, which shall consist of a Senate and House of Representatives.\nThe House of Representatives shall be composed of Members chosen every second Year by the People of the several States, and the Electors in each State shall have the Qualifications requisite for Electors of the most",
- "<|begin_of_text|>Katherine Johnson (August 26, 1918 - February 24, 2020) was an African-American mathematician whose calculations of orbital mechanics as a NASA employee were critical to the success of the first and subsequent U.S. crewed spaceflights. During her 35-year career at NASA and its predecessor, she earned a reputation for mastering complex manual calculations and helped pioneer the use of computers to perform the tasks. The space agency noted her \"historical role as one of the first African-American women to work as a NASA scientist\". Johnson's work included calculating trajectories, launch windows, and emergency return paths for Project Mercury spaceflights. In 2015, President Barack Obama awarded Johnson the Presidential Medal of Freedom. In 2016",
- "<|begin_of_text|>Knock, knock. Who's there? The police. The police who? The police who are here to arrest you for not having a sense of humor.",
+ "<|begin_of_text|>I like to think (and the sooner the better!) of a cybernetic meadow where mammals and computers live together in mutually programming harmony like pure water touching clear sky. I like to think (right now, please!) of a cybernetic forest filled with pines and electronics where deer stroll peacefully past wise computers that their hooves and horns have grown, where owls and umber hawks assess their fame with accurate iridescence and where, in a friendly way, the wrens and curlews sing to microphones. I like to think (it has to be!) of a cybernetic ecology where we are responsible for the programming of our lives. People who will not acknowledge this are living and",
+ "<|begin_of_text|>We the People of the United States, in Order to form a more perfect Union, establish Justice, insure domestic Tranquility, provide for the common defence, promote the general Welfare, and secure the Blessings of Liberty to ourselves and our Posterity, do ordain and establish this Constitution for the United States of America.\nSection 1. All legislative Powers herein granted shall be vested in a Congress of the United States, which shall consist of a Senate and House of Representatives.\nSection 2. The House of Representatives shall be composed of Members chosen every second Year by the People of the several States, and the Electors in each State shall have the Qualifications requisite for Elect",
+ "<|begin_of_text|>Katherine Johnson (August 26, 1918 - February 24, 2020) was an African-American mathematician whose calculations of orbital mechanics as a NASA employee were critical to the success of the first and subsequent U.S. crewed spaceflights. During her 35-year career at NASA and its predecessor, she earned a reputation for mastering complex manual calculations and helped pioneer the use of computers to perform the tasks. The space agency noted her \"historical role as one of the first African-American women to work as a NASA scientist\". Johnson's work included calculating trajectories, launch windows, and emergency return paths for Project Mercury spaceflights, including those for astronauts Alan Shepard and John Glenn, and the 1969 Apollo 11 flight to",
+ "<|begin_of_text|>Knock, knock. Who's there? It's the police. The police who? The police who are here to arrest you for not having a sense of humor.",
"<|begin_of_text|>Count to a hundred: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77",
"<|begin_of_text|>Not like the brazen giant of Greek fame, With conquering limbs astride from land to land; Here at our sea-washed, sunset gates shall stand A mighty woman with a torch, whose flame Is the imprisoned lightning, and her name Mother of Exiles. From her beacon-hand Glows world-wide welcome; her mild eyes command The air-bridged harbor that twin cities frame. \"Keep, ancient lands, your storied pomp!\" cries she With silent lips. \"Give me your tired, your poor, Your huddled masses yearning to breathe free, The wretched refuse of your teeming shore. Send these, the homeless, tempest-tost to me,",
- "<|begin_of_text|>Roses are red, violets are blue, and this Valentine\u2019s Day, we\u2019re celebrating the love we have for our favorite things. From the people who make our lives better to the things that make us smile, we\u2019re sharing the love this February 14th. So whether you\u2019re single or taken, we hope you\u2019ll join us in celebrating all the things we love!\nThis article is all about helping you get to know the Roses are red, violets are blue for PC better and install it on your PC. Here are the technical specifications you want to know about beforehand:\nHow To Install & Download Roses are red, violets are blue For PC Windows 10",
+ "<|begin_of_text|>Roses are red, violets are blue, and this Valentine\u2019s Day, we\u2019re celebrating the love we have for our favorite things. From the people who make our lives better to the things that make us smile, we\u2019re sharing the love this February 14th. So, whether you\u2019re single or taken, we hope you\u2019ll join us in celebrating all the things we love!\nThis article is all about helping you get to know the Roses are red, violets are blue for PC better and install it on your PC. Here are the technical specifications you want to know about beforehand:\nHow To Install & Download Roses are red, violets are blue For PC Windows ",
"<|begin_of_text|>Albert Einstein (14 March 1879 - 18 April 1955) was a German-born theoretical physicist. He developed the general theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics). Einstein's work is also known for its influence on the philosophy of science. Einstein is best known in popular culture for his mass\u2013energy equivalence formula E = mc2 (which has been dubbed \"the world's most famous equation\"). He received the 1921 Nobel Prize in Physics for his \"services to theoretical physics\", in particular his discovery of the law of the photoelectric effect, a pivotal step in the evolution of quantum theory.",
- "<|begin_of_text|>The journey of a thousand miles begins with a single step. \u2013 Lao Tzu\nI have been thinking about this quote a lot lately. I have been thinking about it in terms of my own life and in terms of the lives of my students. I have been thinking about it in terms of the journey of a thousand miles and in terms of the journey of a single step.\nI have been thinking about this quote in terms of the journey of a thousand miles because I have been thinking about the journey of a thousand miles in terms of the journey of a single step. I have been thinking about the journey of a thousand miles in terms of the journey of a single",
- "<|begin_of_text|>When I find myself in times of trouble, Mother Mary comes to me, speaking words of wisdom, let it be. And in my hour of darkness she is standing right in front of me, speaking words of wisdom, let it be. Let it be, let it be, let it be, let it be. Whisper words of wisdom, let it be. And when the broken hearted people living in the world agree, there will be an answer, let it be. For though they may be parted there is still a chance that they will see, there will be an answer. Let it be. Let it be, let it be, let it be, let it",
+ "<|begin_of_text|>The journey of a thousand miles begins with a single step. \u2013 Lao Tzu\nI have been thinking about this quote a lot lately. I have been thinking about it in terms of my own life and in terms of the lives of my students. I have been thinking about it in terms of the journey of a thousand miles and in terms of the journey of a single step. I have been thinking about it in terms of the journey of a thousand miles and in terms of the journey of a single step.\nI have been thinking about it in terms of the journey of a thousand miles and in terms of the journey of a single step. I have been thinking about",
+ "<|begin_of_text|>When I find myself in times of trouble, Mother Mary comes to me, speaking words of wisdom, let it be. And in my hour of darkness she is standing right in front of me, speaking words of wisdom, let it be. Let it be, let it be, let it be, let it be. Whisper words of wisdom, let it be. And when the broken hearted people living in the world agree, there will be an answer, let it be. For though they may be parted there is still a chance that they will see, there will be an answer. let it be. Let it be, let it be, ..... yeah ..... let it be",
"<|begin_of_text|>Shall I compare thee to a summer's day? Thou art more lovely and more temperate: Rough winds do shake the darling buds of May, And summer's lease hath all too short a date: Sometime too hot the eye of heaven shines, And often is his gold complexion dimm'd; And every fair from fair sometime declines, By chance or nature's changing course untrimm'd; But thy eternal summer shall not fade Nor lose possession of that fair thou owest; Nor shall Death brag thou wander'st in his shade, When in eternal lines to time thou growest: So long as men can breathe or eyes can see, So long lives this and this gives life to thee.\n",
"<|begin_of_text|>Rachel Carson (May 27, 1907 - April 14, 1964) was an American marine biologist and nature writer whose writings are credited with advancing the global environmental movement. Carson started her career as a biologist in the U.S. Bureau of Fisheries, and became a full-time nature writer in the 1950s. Her widely praised 1951 bestseller The Sea Around Us won her a U.S. National Book Award, recognition as a gifted writer, and financial security. Her next book, The Edge of the Sea, and the reissued version of her first book, Under the Sea Wind, were also bestsellers. This sea trilogy explores the whole of ocean life from the shores to the depths. Carson's writing career",
- "<|begin_of_text|>Two roads diverged in a yellow wood, and I took the one less traveled by, and that has made all the difference. -Robert Frost\nI have always been a fan of Robert Frost. I love his poetry and his ability to capture the essence of life in a few short lines. This particular poem, \u201cThe Road Not Taken,\u201d has always been one of my favorites. It speaks to the choices we make in life and how they can shape our future.\nThe poem begins with the speaker standing at a fork in the road. He has to choose which path to take. He knows that both paths are equally worn, so he cannot base his decision on that. He decides",
- "<|begin_of_text|>Save tonight and fight the break of dawn / come tomorrow, tomorrow I'll be gone\nI'm not sure if I've ever mentioned this before, but I'm a huge fan of the Swedish pop group Eagle-Eye Cherry. I've been a fan since I first heard his song \"Save Tonight\" on the radio in 1997. I was in high school at the time, and I remember thinking that it was the most beautiful song I had ever heard. I still think it's a beautiful song, and I still love it. I also love his other songs, such as \"Falling in Love Again,\" \"Are You Still Having Fun?,\" and \"Long Way Around",
+ "<|begin_of_text|>Two roads diverged in a yellow wood, and I took the one less traveled by, and that has made all the difference. -Robert Frost\nI have always been a fan of Robert Frost\u2019s poetry. I love the way he uses nature to express his thoughts and feelings. This particular poem is one of my favorites because it speaks to me on a personal level.\nI have always been a bit of a rebel. I never liked following the crowd or doing what was expected of me. I always wanted to forge my own path and do things my own way. This poem speaks to that part of me.\nThe road less traveled is often the more difficult road to take. It is",
+ "<|begin_of_text|>Save tonight and fight the break of dawn / come tomorrow, tomorrow I'll be gone\nI'm not sure if I'm going to be able to sleep tonight. I'm not sure if I want to. I'm not sure if I want to be awake, either. I'm not sure if I want to be alive. I'm not sure if I want to be dead. I'm not sure if I want to be anything. I'm not sure if I want to be nothing. I'm not sure if I want to be here. I'm not sure if I want to be there. I'm not sure if I want to be anywhere. I'm not sure if I",
"<|begin_of_text|>The first thousand digits of PI: 3.14159265358979323846 26433832795028841971 69399375105820974944 59230781640628620899 86280348253421170679 82148086513282306647 09384460955058223172 53594081284811174502 84102701938521105559 64462294895493038196 44288109756659334461 28475648233786783165 27120190914564856692 34603486104543266482 13393607260249141273 72458700660631558817 48815209209628292540",
- "<|begin_of_text|>Thirty days hath September, April, June, and November. All the rest have thirty-one, except for February, which has twenty-eight. And if it\u2019s a leap year, then it has twenty-nine. This is a mnemonic device that helps us remember how many days are in each month. But what if we wanted to know how many days are in a year? We could use a similar mnemonic device: \u201cThirty days hath September, April, June, and November. All the rest have thirty-one, except for February, which has twenty-eight. And if it\u2019s a leap year, then it has twenty-nine. So there are 365 days in a year, except for",
+ "<|begin_of_text|>Thirty days hath September, April, June, and November. All the rest have thirty-one, except for February, which has twenty-eight. And if it\u2019s a leap year, then it has twenty-nine. This is the rhyme that I learned as a child to remember how many days are in each month. I don\u2019t know if it\u2019s still taught in schools, but it\u2019s a good way to remember. I\u2019m not sure why it\u2019s important to remember, but it\u2019s a good way to remember.\nThe rhyme is a mnemonic device that helps people remember the number of days in each month. It is a simple way to remember the number of days in each month, and",
"<|begin_of_text|>If you want to live a happy life, tie it to a goal, not to people or things. - Albert Einstein\nIf you want to live a happy life, tie it to a goal, not to people or things.\nI have a lot of things to prove to myself. One is that I can live my life fearlessly. - Oprah Winfrey\nI have a lot of things to prove to myself. One is that I can live my life fearlessly.\nI have a lot of things to prove to myself. One is that I can live my life fearlessly. - Oprah Winfrey\nI have a lot of things to prove to myself. One is that",
- "<|begin_of_text|>Ada Lovelace (10 December 1815 - 27 November 1852) was an English mathematician and writer, chiefly known for her work on Charles Babbage's early mechanical general-purpose computer, the Analytical Engine. Her notes on the engine include what is recognised as the first algorithm intended to be processed by a machine. Because of this, she is often regarded as the first computer programmer.\nAda Lovelace was born Augusta Ada Byron, the only legitimate child of the poet Lord Byron. She was the child of the short-lived marriage between the Romantic poet and Anne Isabella \"Annabella\" Milbanke. Byron separated from his wife a month after Ada was born and left England forever four months later, eventually dying of",
+ "<|begin_of_text|>Ada Lovelace (10 December 1815 - 27 November 1852) was an English mathematician and writer, chiefly known for her work on Charles Babbage's early mechanical general-purpose computer, the Analytical Engine. Her notes on the engine include what is recognised as the first algorithm intended to be processed by a machine. Because of this, she is often regarded as the first computer programmer.\nAda Lovelace was born Augusta Ada Byron, the only legitimate child of the poet Lord Byron and his wife Anne Isabella Byron. She was named after Byron's half-sister, Augusta Leigh, and was called \"Ada\" by Byron himself. On 16 January 1816, at Lord Byron's own insistence, Annabella",
"<|begin_of_text|>Call me Ishmael. Some years ago\u2014never mind how long precisely\u2014having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world. It is a way I have of driving off the spleen and regulating the circulation. Whenever I find myself growing grim about the mouth; whenever it is a damp, drizzly November in my soul; whenever I find myself involuntarily pausing before coffin warehouses, and bringing up the rear of every funeral I meet; and especially whenever my hypos get such an upper hand of me, that it requires a strong moral principle",
- "<|begin_of_text|>The true sign of intelligence is not knowledge but imagination. - Albert Einstein\n\"The true sign of intelligence is not knowledge but imagination.\"Albert Einstein\nThe True , The True Knowledge , The True Meaning , The True You , The True Light , The True Self , The True Church , The True God , The True Israel , The True Vine , The True Way , The True Word , The True Worship , The True Worshipers , The True , The True Believer , The True Church of God , The True God of Israel , The True Israel of God , The True Light of God , The True Meaning of Christmas , The True Meaning of Easter , The True Meaning of Life , The",
+ "<|begin_of_text|>The true sign of intelligence is not knowledge but imagination. - Albert Einstein\n\"The true sign of intelligence is not knowledge but imagination.\"Albert Einstein\nThe True , The True Knowledge , The True Meaning , The True You , The True Light , The True Self , The True Church , The True Gospel , The True Israel , The True Vine , The True Way , The True Word , The True Worship , The True Worshipers , The True , The True Believer , The True Church of God , The True God , The True Gospel of Jesus Christ , The True Israel of God , The True Light of God , The True Meaning of Christmas , The True Meaning of Easter , The True",
"<|begin_of_text|>Consider the sequence of prime numbers: 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, ",
"<|begin_of_text|>Fibonacci sequence unfurls like a mathematical nautilus: 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887, 9227465, 14930352, 24157817,",
- "<|begin_of_text|>Once upon a time, in a land full of dragons, there lived a young girl named Lily. Lily was a brave and adventurous girl who loved to explore the world around her. She was always curious about the dragons that lived in the land and wanted to learn more about them.\nOne day, Lily decided to go on a journey to find out more about the dragons. She packed her bag and set off on her adventure. Along the way, she met many interesting people and creatures, but none of them could tell her anything about the dragons.\nFinally, Lily reached a village where she heard stories of a wise old man who knew everything about the dragons. She went to see him and asked him",
- "<|begin_of_text|>A duck walks into a store and asks the clerk, \"Do you have any grapes?\"\nThe clerk says, \"No, we don't have any grapes.\"\nThe next day, the duck walks into the store and asks the clerk, \"Do you have any grapes?\"\nThe clerk says, \"No, we don't have any grapes. If you come in here again and ask for grapes, I'm going to nail your feet to the floor.\"\nThe next day, the duck walks into the store and asks the clerk, \"Do you have any nails?\"\nThe clerk says, \"No, we don't have any nails.\"\nThe duck says, \"Do you have any grapes?\"",
- "<|begin_of_text|>I heard there was a secret chord\nThat David played, and it pleased the Lord\nBut you don't really care for music, do you?\nIt goes like this, the fourth, the fifth\nThe minor fall, the major lift\nYour faith was strong but you needed proof\nYou saw her bathing on the roof\nHer beauty and the moonlight overthrew you\nShe tied you to a kitchen chair\nShe broke your throne, and she cut your hair\nAnd from your lips she drew the Hallelujah\nI've seen this room and I've walked this floor\nI used to live alone before I knew you\nI've seen your flag",
+ "<|begin_of_text|>Once upon a time, in a land full of dragons, there was a young girl named Lily. Lily was a brave and adventurous girl who loved to explore the world around her. She was always curious about the dragons that lived in the land and wanted to learn more about them.\nOne day, Lily decided to go on a journey to find out more about the dragons. She packed her bag and set off on her adventure. Along the way, she met many interesting people and animals, but none of them could tell her anything about the dragons.\nFinally, Lily reached a village where she met an old man who told her stories about the dragons. He told her that the dragons were powerful creatures that",
+ "<|begin_of_text|>A duck walks into a store and asks the clerk, \"Do you have any grapes?\"\nThe clerk says, \"No, we don't have any grapes.\"\nThe next day, the duck walks into the store and asks the clerk, \"Do you have any grapes?\"\nThe clerk says, \"No, we don't have any grapes.\"\nThe next day, the duck walks into the store and asks the clerk, \"Do you have any grapes?\"\nThe clerk says, \"No, we don't have any grapes.\"\nThe next day, the duck walks into the store and asks the clerk, \"Do you have any grapes?\"\nThe clerk says, \"No, we don't have",
+ "<|begin_of_text|>I heard there was a secret chord\nThat David played, and it pleased the Lord\nBut you don't really care for music, do you?\nIt goes like this, the fourth, the fifth\nThe minor fall, the major lift\nYour faith was strong but you needed proof\nHer beauty in the moonlight overthrew you\nShe tied you to a kitchen chair\nShe broke your throne, and she cut your hair\nAnd from your lips she drew the Hallelujah\nI've seen this room and I've walked this floor\nI used to live alone before I knew you\nI've seen your flag on the marble arch\nLove is not",
"<|begin_of_text|>It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters. \"My dear Mr. Bennet,\" said his lady to him one day, \"have you heard that Netherfield Park is let at last?\" Mr. Bennet replied that he had not. \"But it is,\" returned she; \"for Mrs. Long has just been here, and she told me all",
- "<|begin_of_text|>Shakespeare, William (bapt. 26 April 1564 - 23 April 1616) was an English poet and playwright, widely regarded as the greatest writer in the English language and the world's pre-eminent dramatist. He is often called England's national poet and the \"Bard of Avon\". His surviving works, including some collaborations, consist of 38 plays, 154 sonnets, two long narrative poems, and several other poems. His plays have been translated into every major living language and are performed more often than those of any other playwright.\nShakespeare was born and brought up in Stratford-upon-Avon. At the age of 18, he married Anne Hathaway, who bore him three children: Sus",
- "<|begin_of_text|>The quality of mercy is not strained. It droppeth as the gentle rain from heaven upon the place beneath. It is twice blest: It blesseth him that gives and him that takes. 'Tis mightiest in the mightiest; it becomes the throned monarch better than his crown. His scepter shows the force of temporal power, the attribute to awe and majesty wherein doth sit the dread and fear of kings; but mercy is above this sceptered sway. It is enthroned in the hearts of kings; it is an attribute to God himself; and earthly power doth then show likest God's when mercy seasons justice",
- "<|begin_of_text|>The Deliverator belongs to an elite order, a hallow subcategory. He's got esprit up to here. Right now he is preparing to deliver a pizza within twenty-three minutes. He's going to open the door of his car, hump a box into the kitchen of the address, and scram. Here's how long that ought to take: the door of his car has a streetlight shining on it, so it won't be a problem to see the latch. Opening it and swinging it open will take a second or two. Same thing with swinging it closed after he gets out, always assuming that it doesn't stick on his sweats. The only wild card is the car",
+ "<|begin_of_text|>Shakespeare, William (bapt. 26 April 1564 - 23 April 1616) was an English poet and playwright, widely regarded as the greatest writer in the English language and the world's pre-eminent dramatist. He is often called England's national poet and the \"Bard of Avon\" (or simply \"The Bard\"). His surviving works consist of 38 plays, 154 sonnets, two long narrative poems, and several other poems. His plays have been translated into every major living language, and are performed more often than those of any other playwright. Shakespeare was born and raised in Stratford-upon-Avon. At the age of 18 he married Anne Hathaway, who bore him three children: Sus",
+ "<|begin_of_text|>The quality of mercy is not strained. It droppeth as the gentle rain from heaven upon the place beneath. It is twice blest. It blesseth him that gives and him that takes. 'Tis mightiest in the mightiest. It becomes the throned monarch better than his crown. His scepter shows the force of temporal power, the attribute to awe and majesty wherein doth sit the dread and fear of kings. But mercy is above this sceptered sway. It is enthroned in the hearts of kings. It is an attribute to God himself. And earthly power doth then show likest God's when mercy seasons justice",
+ "<|begin_of_text|>The Deliverator belongs to an elite order, a hallow subcategory. He's got esprit up to here. Right now he is preparing to deliver a pizza within twenty-three minutes. He's going to open the door, bend over, and pick the pizza up. He knows that when he does, he is going to see a pair of young, playful, and extraordinarily beautiful Asian-fusion women wearing expensive, revealing black underwear. He knows this because they will be photographed in this condition, and this photograph will be tacked up in the office, at CosaNostra Pizza #3569, for the rest of his life, right next to the other eleven thousand photographs of beautiful young",
"<|begin_of_text|>Counting in binary: 0000, 0001, 0010, 0011, 0100, 0101, 0110, 0111, 1000, 1001, 1010, 1011, 1100, 1101, 1110, 1111, 10000, 10001, 10010, 10011, 10100, 10101, 10110, 10111, 11000, 11001, 11010, 11011, 11100, 11101, 11110, 11111, 100000, 100001, 100010, 100011,"
]
diff --git a/models/demos/t3000/llama2_70b/tests/test_llama_generation.py b/models/demos/t3000/llama2_70b/tests/test_llama_generation.py
index b5b2286aa81..f5af555dc39 100644
--- a/models/demos/t3000/llama2_70b/tests/test_llama_generation.py
+++ b/models/demos/t3000/llama2_70b/tests/test_llama_generation.py
@@ -152,9 +152,7 @@ def test_LlamaModel_inference(
pytest.skip(f"Requires grid size of at least {model_config['MAX_GRID_SIZE']} to run")
t3k_mesh_device.enable_async(True)
- for device_id in t3k_mesh_device.get_device_ids():
- device = t3k_mesh_device.get_device(device_id)
- device.enable_program_cache()
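+ # Enabling the program cache on the mesh device now fans out to every member device,
+ # replacing the old per-device loop.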
+ t3k_mesh_device.enable_program_cache()
args = construct_arg(
implementation=implementation,
diff --git a/models/demos/t3000/llama2_70b/tests/test_llama_perf.py b/models/demos/t3000/llama2_70b/tests/test_llama_perf.py
index 131f8abf965..de8fab5b8c2 100644
--- a/models/demos/t3000/llama2_70b/tests/test_llama_perf.py
+++ b/models/demos/t3000/llama2_70b/tests/test_llama_perf.py
@@ -315,9 +315,7 @@ def test_Llama_perf_host(
pytest.skip(f"Requires grid size of at least {model_config['MAX_GRID_SIZE']} to run")
t3k_mesh_device.enable_async(True)
- for i in t3k_mesh_device.get_device_ids():
- device = t3k_mesh_device.get_device(i)
- device.enable_program_cache()
+ t3k_mesh_device.enable_program_cache()
disable_compilation_reports()
run_test_LlamaModel_end_to_end(
diff --git a/models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py b/models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py
index fbccd4176c3..3526206b852 100644
--- a/models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py
+++ b/models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py
@@ -28,6 +28,8 @@
)
from models.perf.perf_utils import prep_perf_report
+from collections import defaultdict
+
def get_decode_time(profiler, start_token, end_token):
total_time = 0
@@ -254,3 +256,219 @@ def test_Llama_perf_host(
tokenizer_path,
cache_path,
)
+
+
+def run_test_LlamaModel_end_to_end_hybrid_data_tensor_parallel(
+ mesh_device,
+ llama_version,
+ batch,
+ seq_len,
+ max_context_len,
+ model_config,
+ n_layers,
+ n_devices,
+ generation_length,
+ expected_compile_time,
+ expected_inference_time,
+ ckpt_dir,
+ tokenizer_path,
+ cache_path,
+):
+ # Prepare paths and devices
+ skip_model_load = should_skip_model_load()
+
+ logger.info(f"Running num_layer: {n_layers}")
+
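+ # Build only a 1-layer reference model here: it is used for its tokenizer and params,
+ # while the full n_layers weights are loaded separately via load_llama_state_dict below.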
+ generator = Llama.build(
+ ckpt_dir,
+ tokenizer_path,
+ max_seq_len=max_context_len,
+ max_batch_size=batch,
+ n_layers=1,
+ skip_model_load=skip_model_load,
+ )
+ hugging_face_reference_model, tokenizer = generator.model, generator.tokenizer
+ hugging_face_reference_model.eval()
+ # Load the full n_layers state dict straight from the checkpoint rather than from the 1-layer reference model.
+ state_dict = load_llama_state_dict(ckpt_dir, n_layers=n_layers)
+ configuration = hugging_face_reference_model.params
+
+ # Prepare input -----------------------------------------------------------------------
+ torch.manual_seed(0)
+ total_len = min(max_context_len, generation_length + 1)
+ n_iters = 100 # Number of iterations to run in order to get a perf estimate
+ tokens = torch.randint(0, 10000, (batch, 1), dtype=torch.long)
+ # Clear global profiler state before starting measurements
+ profiler.clear()
+
+ submesh_to_metadata = defaultdict(dict)
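+ # Hybrid parallelism: carve the (8, 4) mesh into (2, 4) ring submeshes. Each submesh
+ # hosts a full tensor-parallel model replica; the replicas act as data-parallel copies.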
+ submeshes = mesh_device.create_submeshes((2, 4), ttnn.MeshType.Ring)
+ for submesh in submeshes:
+ # Set up model -----------------------------------------------------------------------
+ logger.info("Moving weights to devices; might take some time...")
+ profiler.start("TT_llama_model_setup")
+ tt_model = TtLlamaModel_optimized(
+ submesh,
+ state_dict,
+ BASE_URL,
+ n_layers,
+ model_config,
+ configuration,
+ cache_path=cache_path,
+ read_cache=True,
+ )
+
+ for i in submesh.get_device_ids():
+ device = submesh.get_device(i)
+ ttnn.synchronize_device(device)
+
+ profiler.end("TT_llama_model_setup")
+
+ ##### Prepare Inputs #####
+ prev_pos = total_len - 1
+ tt_inp_emb, prev_pos, rot_mat, cache_idxs = tt_model.prepare_inputs(tokens, prev_pos)
+ tt_inp_emb = ttnn.to_device(tt_inp_emb, submesh, memory_config=ttnn.DRAM_MEMORY_CONFIG)
+ tt_inp_emb = tt_model.tt_embd(tt_inp_emb)
+ tt_inp_emb = ttnn.interleaved_to_sharded(tt_inp_emb, tt_model.model_config["WORD_EMBEDDING_OUTPUT_MEMCFG"])
+
+ rot_mat = ttnn.to_device(rot_mat, submesh, memory_config=tt_model.model_config["ROT_MAT_MM_IN1_MEMCFG"])
+ cache_idxs = ttnn.to_device(cache_idxs, submesh, memory_config=ttnn.DRAM_MEMORY_CONFIG)
+
+ ##### Compile Model #####
+ logger.info("Compiling model")
+ profiler.start("compile_time")
+ tt_logits = tt_model(tt_inp_emb, rot_mat, prev_pos, cache_idxs=cache_idxs, mode="decode")
+ tt_logits = ttnn.all_gather(tt_logits, dim=3, num_links=1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
+ tt_logits_tensors = ttnn.get_device_tensors(tt_logits)
+ logits_rm = ttnn.to_layout(tt_logits_tensors[0], ttnn.ROW_MAJOR_LAYOUT)
+ logits = ttnn.to_torch(logits_rm)
+ profiler.end("compile_time")
+ profiler.print()
+ compile_iter_time = profiler.get("compile_time")
+ logger.info(f"decode with compile time, single iter latency: {compile_iter_time}")
+
+ submesh_to_metadata[submesh.get_mesh_id()] = {
+ "submesh": submesh,
+ "logits_rm": logits_rm,
+ "tt_model": tt_model,
+ "prev_pos": prev_pos,
+ "tt_inp_emb": tt_inp_emb,
+ "rot_mat": rot_mat,
+ "cache_idxs": cache_idxs,
+ }
+
+ ##### Capture Trace #####
+ logger.info("Capturing trace")
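+ # A single trace on the parent mesh records the decode step of every submesh, so one
+ # replay per iteration avoids per-submesh host dispatch overhead.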
+ trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
+
+ for submesh in submeshes:
+ mesh_id = submesh.get_mesh_id()
+ tt_model = submesh_to_metadata[mesh_id]["tt_model"]
+ tt_inp_emb = submesh_to_metadata[mesh_id]["tt_inp_emb"]
+ rot_mat = submesh_to_metadata[mesh_id]["rot_mat"]
+ cache_idxs = submesh_to_metadata[mesh_id]["cache_idxs"]
+ prev_pos = submesh_to_metadata[mesh_id]["prev_pos"]
+
+ tt_logits = tt_model(tt_inp_emb, rot_mat, prev_pos, cache_idxs=cache_idxs, mode="decode")
+ tt_logits = ttnn.all_gather(tt_logits, dim=3, num_links=1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
+ tt_logits_tensors = ttnn.get_device_tensors(tt_logits)
+ logits_rm = ttnn.to_layout(tt_logits_tensors[0], ttnn.ROW_MAJOR_LAYOUT)
+ ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)
+
+ ##### Execute Trace #####
+ logger.info("Executing trace")
+ profiler.start("end_to_end_inference")
+ for i in range(n_iters):
+ ttnn.execute_trace(mesh_device, trace_id, blocking=False)
+ logits = ttnn.to_torch(logits_rm)
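+ # execute_trace is non-blocking; the readback above synchronizes with the device,
+ # so the timed window covers all n_iters trace replays.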
+ profiler.end("end_to_end_inference")
+ ttnn.release_trace(mesh_device, trace_id)
+
+ profiler.print()
+ loop_time = profiler.get("end_to_end_inference")
+ iter_time = loop_time / n_iters
+ logger.info(f"decode cached, single iter latency: {iter_time}")
+
+ comment = f"num_layers={n_layers}L_n_devices={n_devices}"
+
+ prep_perf_report(
+ model_name=f"{llama_version}_70b_{comment}",
+ batch_size=batch,
+ inference_and_compile_time=compile_iter_time,
+ inference_time=iter_time,
+ expected_compile_time=expected_compile_time,
+ expected_inference_time=expected_inference_time,
+ comments=comment,
+ )
+
+ tokens_per_s_per_user = 1 / iter_time
+ tokens_per_s_overall = tokens_per_s_per_user * batch * len(submeshes)
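+ # Overall throughput multiplies per-user throughput by batch size and by the number of
+ # data-parallel submesh replicas.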
+
+ logger.info(f"Time per iteration: {iter_time}")
+ logger.info(f"Tokens per s per user: {tokens_per_s_per_user}")
+ logger.info(f"Tokens per s overall: {tokens_per_s_overall}")
+
+ # assert compile_time <= expected_compile_time
+ assert iter_time <= expected_inference_time
+
+
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.timeout(4500)
+@pytest.mark.model_perf_tg
+@pytest.mark.parametrize(
+ "llama_version",
+ (("llama3"),),
+)
+@pytest.mark.parametrize(
+ "generation_length, expected_compile_time, expected_inference_time, batch, seq_len, max_context_len",
+ (
+ (32, 10000, 0.0653 + 0.01, 32, 1, 4096),
+ (128, 10000, 0.0655 + 0.01, 32, 1, 4096),
+ (2048, 10000, 0.0771 + 0.01, 32, 1, 4096),
+ (8192, 10000, 0.0825 + 0.01, 16, 1, 8192),
+ (128 * 1024, 10000, 0.0918 + 0.01, 1, 1, 128 * 1024),
+ ),
+ ids=["gen32", "gen128", "gen2k", "gen8k", "gen128k"],
+)
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 20000000}], indirect=True)
+@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
+def test_Llama_perf_hybrid_data_tensor_parallel(
+ mesh_device,
+ generation_length,
+ expected_compile_time,
+ expected_inference_time,
+ batch,
+ seq_len,
+ max_context_len,
+ llama_version,
+ use_program_cache,
+ n_layers=80,
+ n_devices=8,
+):
+ model_config, ckpt_dir, tokenizer_path, cache_path = setup_llama_env(
+ llama_version=llama_version,
+ max_batch_size=batch,
+ max_context_len=max_context_len,
+ )
+
+ check_mesh_device(mesh_device, model_config)
+ mesh_device.enable_async(True)
+
+ disable_compilation_reports()
+
+ run_test_LlamaModel_end_to_end_hybrid_data_tensor_parallel(
+ mesh_device,
+ llama_version,
+ batch,
+ seq_len,
+ max_context_len,
+ model_config,
+ n_layers,
+ n_devices,
+ generation_length,
+ expected_compile_time,
+ expected_inference_time,
+ ckpt_dir,
+ tokenizer_path,
+ cache_path,
+ )
diff --git a/models/demos/t3000/llama2_70b/tests/test_llama_stress_test.py b/models/demos/t3000/llama2_70b/tests/test_llama_stress_test.py
index 621fcd2b3a7..a5b9edc7f81 100644
--- a/models/demos/t3000/llama2_70b/tests/test_llama_stress_test.py
+++ b/models/demos/t3000/llama2_70b/tests/test_llama_stress_test.py
@@ -147,9 +147,7 @@ def test_Llama_stress_test(
if compute_grid_size.x < model_config["MAX_GRID_SIZE"][0] or compute_grid_size.y < model_config["MAX_GRID_SIZE"][1]:
pytest.skip(f"Requires grid size of at least {model_config['MAX_GRID_SIZE']} to run")
- for i in t3k_mesh_device.get_device_ids():
- device = t3k_mesh_device.get_device(i)
- device.enable_program_cache()
+ t3k_mesh_device.enable_program_cache()
disable_compilation_reports()
run_test_LlamaModel_stress_test(
devices,
diff --git a/models/demos/tg/llama3_70b/tests/test_llama_perf.py b/models/demos/tg/llama3_70b/tests/test_llama_perf.py
index ce9a16095ce..3190abc90d3 100644
--- a/models/demos/tg/llama3_70b/tests/test_llama_perf.py
+++ b/models/demos/tg/llama3_70b/tests/test_llama_perf.py
@@ -197,9 +197,7 @@ def test_Llama_perf_host(
check_mesh_device(mesh_device, model_config)
mesh_device.enable_async(True)
-
- for device in mesh_device.get_devices():
- device.enable_program_cache()
+ mesh_device.enable_program_cache()
disable_compilation_reports()
run_test_LlamaModel_end_to_end(
diff --git a/models/demos/tg/resnet50/tests/test_resnet50_performant.py b/models/demos/tg/resnet50/tests/test_resnet50_performant.py
index c3f10fe9272..6a82bbab68c 100644
--- a/models/demos/tg/resnet50/tests/test_resnet50_performant.py
+++ b/models/demos/tg/resnet50/tests/test_resnet50_performant.py
@@ -77,3 +77,69 @@ def test_run_resnet50_trace_inference(
math_fidelity,
model_location_generator,
)
+
+
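+# The 2CQ variants below use a second command queue so host-to-device input transfers can
+# overlap with execution; the trace variant additionally replays a captured trace
+# (trace_region_size reserves device memory for it).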
+@run_for_wormhole_b0()
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "num_command_queues": 2}], indirect=True)
+@pytest.mark.parametrize(
+ "device_batch_size, act_dtype, weight_dtype, math_fidelity",
+ ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
+)
+@pytest.mark.parametrize("enable_async_mode", [True], indirect=True)
+@pytest.mark.parametrize(
+ "mesh_device",
+ ((8, 4),),
+ indirect=True,
+)
+def test_run_resnet50_2cqs_inference(
+ mesh_device,
+ use_program_cache,
+ device_batch_size,
+ act_dtype,
+ weight_dtype,
+ math_fidelity,
+ enable_async_mode,
+ model_location_generator,
+):
+ run_resnet50_2cqs_inference(
+ mesh_device,
+ device_batch_size,
+ act_dtype,
+ weight_dtype,
+ math_fidelity,
+ model_location_generator,
+ )
+
+
+@run_for_wormhole_b0()
+@pytest.mark.parametrize(
+ "device_params", [{"l1_small_size": 24576, "trace_region_size": 800768, "num_command_queues": 2}], indirect=True
+)
+@pytest.mark.parametrize(
+ "device_batch_size, act_dtype, weight_dtype, math_fidelity",
+ ((16, ttnn.bfloat8_b, ttnn.bfloat8_b, ttnn.MathFidelity.LoFi),),
+)
+@pytest.mark.parametrize("enable_async_mode", [True], indirect=True)
+@pytest.mark.parametrize(
+ "mesh_device",
+ ((8, 4),),
+ indirect=True,
+)
+def test_run_resnet50_trace_2cqs_inference(
+ mesh_device,
+ use_program_cache,
+ device_batch_size,
+ act_dtype,
+ weight_dtype,
+ math_fidelity,
+ enable_async_mode,
+ model_location_generator,
+):
+ run_resnet50_trace_2cqs_inference(
+ mesh_device,
+ device_batch_size,
+ act_dtype,
+ weight_dtype,
+ math_fidelity,
+ model_location_generator,
+ )
diff --git a/models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py b/models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py
index c52d9d4fc28..1de4f9a058c 100644
--- a/models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py
+++ b/models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py
@@ -296,8 +296,7 @@ def test_t3k_falcon_causal_lm_with_trace(
num_loops,
):
t3k_mesh_device.enable_async(enable_async)
- for device in t3k_mesh_device.get_device_ids():
- t3k_mesh_device.get_device(device).enable_program_cache()
+ t3k_mesh_device.enable_program_cache()
torch.manual_seed(0)
batch = device_batch_size * t3k_mesh_device.get_num_devices()
diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py
index 9d7f3efed24..d8d09562b8a 100644
--- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py
+++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py
@@ -639,6 +639,8 @@ def __init__(
width=self.conv1_output_width,
in_channels=self.conv1_input_channels,
out_channels=self.conv1_output_channels,
+ kernel_size=[self.conv1_kernel_size[0], self.conv1_kernel_size[1]],
+ stride=[self.conv1_stride[0], self.conv1_stride[1]],
)
def __del__(self):
diff --git a/models/demos/wormhole/mamba/demo/demo.py b/models/demos/wormhole/mamba/demo/demo.py
index c18c22314ef..d5a4017a0b8 100644
--- a/models/demos/wormhole/mamba/demo/demo.py
+++ b/models/demos/wormhole/mamba/demo/demo.py
@@ -219,6 +219,7 @@ def run_mamba_demo(
cache_dir: Optional[str] = None,
display: bool = True,
prefill_chunk_size: int = 32,
+ assert_on_performance_measurements: bool = True,
):
profiler = BenchmarkProfiler()
profiler.start("run")
@@ -345,6 +346,8 @@ def callback(token: torch.Tensor, inference_time: float) -> None:
prefill_time_to_token_per_user = prefill_stats.mean_throughput_per_user
decode_time_to_token_per_user = decode_stats.mean_throughput_per_user
+ time_to_first_token = 1 / (prefill_time_to_token_per_user + decode_time_to_token_per_user) # seconds: reciprocal of the combined prefill+decode per-user throughput (t/s/u)
+
measurements = {
"total_demo_time": profiler.get_duration("run"),
"compile_prefill": profiler.get_duration("compile_prefill"),
@@ -352,10 +355,10 @@ def callback(token: torch.Tensor, inference_time: float) -> None:
"inference_prefill": prefill_stats.total_time,
"inference_decode": decode_stats.total_time,
"prefill_t/s": prefill_stats.mean_throughput,
- "prefill_time_to_token": prefill_stats.total_time,
+ "prefill_time_to_token": time_to_first_token,
"decode_t/s": decode_stats.mean_throughput,
"decode_t/s/u": decode_stats.mean_throughput_per_user,
- "prefill_decode_t/s/u": 1 / (prefill_time_to_token_per_user + decode_time_to_token_per_user), # t/s/u
+ "prefill_decode_t/s/u": time_to_first_token,
"token_verification": 1, # This is checked by the caller - but we could also do a match here
}
@@ -367,7 +370,7 @@ def callback(token: torch.Tensor, inference_time: float) -> None:
logger.info(
f"Decode throughput: {decode_stats.mean_throughput:.1f} t/s, {decode_stats.mean_throughput_per_user:.2f} t/s/u"
)
- logger.info(f"Time to first token: {(1e3 * measurements['prefill_decode_t/s/u']):.2f} ms")
+ logger.info(f"Time to first token: {(1e3 * time_to_first_token):.2f} ms")
chunk_size_to_prefill_targets_tok_per_s = {32: 135.0, 128: 270.0} # perf is different for different chunk sizes
targets = {
@@ -390,7 +393,10 @@ def callback(token: torch.Tensor, inference_time: float) -> None:
output_sequence_length=tokenized_prompts.shape[1] + generated_sequence_length,
)
- verify_perf(measurements, targets)
+ if assert_on_performance_measurements:
+ verify_perf(measurements, targets)
+ else:
+ logger.warning("Skipping performance checks (this is expected for functional tests)")
return DemoResult(generated_text=token_display.sequences)
diff --git a/models/demos/wormhole/mamba/tests/test_mamba_demo.py b/models/demos/wormhole/mamba/tests/test_mamba_demo.py
index f5496420b51..1680c933fb8 100644
--- a/models/demos/wormhole/mamba/tests/test_mamba_demo.py
+++ b/models/demos/wormhole/mamba/tests/test_mamba_demo.py
@@ -44,6 +44,7 @@ def test_demo(
get_tt_cache_path,
max_gen_len,
prefill_chunk_size,
+ reset_seeds,
):
assert len(user_input) == len(expected_output)
@@ -55,6 +56,7 @@ def test_demo(
display=True,
cache_dir=get_tt_cache_path(model_version),
prefill_chunk_size=prefill_chunk_size,
+ assert_on_performance_measurements=False, # Don't check performance for functional tests
)
expected = user_input[0] + expected_output[0]
diff --git a/models/demos/wormhole/mamba/tests/test_mamba_perf.py b/models/demos/wormhole/mamba/tests/test_mamba_perf.py
index 5e69de74619..b515c52e345 100644
--- a/models/demos/wormhole/mamba/tests/test_mamba_perf.py
+++ b/models/demos/wormhole/mamba/tests/test_mamba_perf.py
@@ -36,8 +36,8 @@ def is_nearby(actual: float, expected: float, lower_margin: float = 0.03, upper_
@pytest.mark.parametrize(
"model_version, mode, batch_size, sequence_length, iterations, expected_compile_time, expected_inference_time",
(
- ("state-spaces/mamba-2.8b", ModelMode.DECODE, 32, 1, 8, 15.0, 0.110),
- ("state-spaces/mamba-2.8b", ModelMode.PREFILL, 1, 128, 8, 27.0, 0.520),
+ ("state-spaces/mamba-2.8b", ModelMode.DECODE, 32, 1, 8, 18.0, 0.110),
+ ("state-spaces/mamba-2.8b", ModelMode.PREFILL, 1, 128, 8, 30.0, 0.520),
),
)
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@@ -129,7 +129,7 @@ def test_mamba_perf_e2e(
upper_margin = MARGIN
if not is_nearby(inference_time, expected_inference_time, lower_margin=lower_margin, upper_margin=upper_margin):
logger.warning(
- "Inference time does not match (within some margin) the expected value (was {inference_time:2f} but expected {expected_inference_time:2f})"
+ f"Inference time does not match (within some margin) the expected value (was {inference_time:.2f} but expected {expected_inference_time:.2f})"
)
if not is_nearby(compile_time, expected_compile_time, lower_margin=lower_margin, upper_margin=upper_margin):
@@ -142,33 +142,30 @@ def test_mamba_perf_e2e(
@pytest.mark.timeout(600)
@pytest.mark.models_device_performance_bare_metal
@pytest.mark.parametrize(
- "batch, warmup, expected_device_fw_duration_ms",
- ((32, True, 1.66),),
+ "batch, expected_layer_duration_ms",
+ ((32, 1.689),),
)
-def test_mamba_perf_device(batch, warmup, expected_device_fw_duration_ms, reset_seeds):
+def test_mamba_perf_device(batch, expected_layer_duration_ms):
subdir = "ttnn_mamba"
- margin = 0.03
- if warmup:
- inference_iterations = 2
- else:
- inference_iterations = 1
- command = f"pytest models/demos/wormhole/mamba/tests/test_mamba_model.py::test_device_perf[{inference_iterations}]"
+ margin = 0.01
+ command = "pytest models/demos/wormhole/mamba/tests/test_mamba_model.py::test_device_perf[1]"
cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
# Convert expected perf (ms) to samples/s
- expected_device_fw_duration_ns = expected_device_fw_duration_ms * 1e6 # ms to ns
- expected_total_device_fw_samples = get_samples_per_s(expected_device_fw_duration_ns * inference_iterations, batch)
-
- inference_time_key = "AVG DEVICE FW SAMPLES/S"
- expected_perf_cols = {inference_time_key: expected_total_device_fw_samples}
+ expected_layer_duration_ns = expected_layer_duration_ms * 1e6 # ms to ns
+ expected_total_layer_samples_per_s = get_samples_per_s(expected_layer_duration_ns, batch)
+ inference_time_key = "AVG DEVICE KERNEL SAMPLES/S"
+ expected_perf_cols = {inference_time_key: expected_total_layer_samples_per_s}
post_processed_results = run_device_perf(command, subdir, 1, cols, batch)
+ logger.info(
+ f"Checking device performance... Expecting {expected_total_layer_samples_per_s} samples/sec (equivalent to {expected_layer_duration_ms} ms per layer)"
+ )
expected_results = check_device_perf(post_processed_results, margin, expected_perf_cols, assert_on_fail=True)
- comment = ""
prep_device_perf_report(
model_name=f"mamba-2.8b_batch_{batch}",
batch_size=batch,
post_processed_results=post_processed_results,
expected_results=expected_results,
- comments=comment,
+ comments="",
)
diff --git a/scripts/build_scripts/build_with_profiler_opt.sh b/scripts/build_scripts/build_with_profiler_opt.sh
index 1c6135e28ad..83ab526d393 100755
--- a/scripts/build_scripts/build_with_profiler_opt.sh
+++ b/scripts/build_scripts/build_with_profiler_opt.sh
@@ -11,7 +11,7 @@ if [[ -z "$ARCH_NAME" ]]; then
exit 1
fi
-cmake -B build -G Ninja -DENABLE_TRACY=ON -DTT_METAL_BUILD_TESTS=ON -DTTNN_BUILD_TESTS=ON
+cmake -B build -G Ninja -DENABLE_TRACY=ON -DTT_METAL_BUILD_TESTS=ON -DTTNN_BUILD_TESTS=ON -DBUILD_PROGRAMMING_EXAMPLES=ON
if [[ $1 == "NO_CLEAN" ]]; then
cmake --build build
diff --git a/scripts/tools_setup_common.sh b/scripts/tools_setup_common.sh
index 2f04ab3af9a..2d5ed561b83 100644
--- a/scripts/tools_setup_common.sh
+++ b/scripts/tools_setup_common.sh
@@ -10,6 +10,9 @@ fi
PROFILER_SCRIPTS_ROOT=$TT_METAL_HOME/tt_metal/tools/profiler
PROFILER_TEST_SCRIPTS_ROOT=$TT_METAL_HOME/tests/tt_metal/tools/profiler
PROFILER_ARTIFACTS_DIR=$TT_METAL_HOME/generated/profiler
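+# Allow overriding the default artifacts location (e.g. from CI) via TT_METAL_PROFILER_DIR.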
+if [[ -n "$TT_METAL_PROFILER_DIR" ]]; then
+ PROFILER_ARTIFACTS_DIR=$TT_METAL_PROFILER_DIR
+fi
PROFILER_OUTPUT_DIR=$PROFILER_ARTIFACTS_DIR/reports
remove_default_log_locations(){
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 24088c4ba89..1474dc932c1 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -2,7 +2,7 @@
enable_testing()
include(GoogleTest)
add_library(test_common_libs INTERFACE)
-target_link_libraries(test_common_libs INTERFACE pthread gtest gtest_main)
+target_link_libraries(test_common_libs INTERFACE pthread gtest gtest_main magic_enum fmt)
if(TT_METAL_BUILD_TESTS)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tt_metal/tt_metal)
@@ -12,4 +12,3 @@ if(TTNN_BUILD_TESTS)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tt_eager) # this should go away and be replaced with link to ttnn
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ttnn/unit_tests/gtests)
endif(TTNN_BUILD_TESTS)
-
diff --git a/tests/end_to_end_tests/test_ttnn.py b/tests/end_to_end_tests/test_ttnn.py
index 4166c7d7558..5d1e2dd2401 100644
--- a/tests/end_to_end_tests/test_ttnn.py
+++ b/tests/end_to_end_tests/test_ttnn.py
@@ -7,7 +7,7 @@
import ttnn
import torch
-import ttnn.operations.binary
+import ttnn.operations.eltwise.binary
@pytest.mark.eager_host_side
diff --git a/tests/scripts/run_moreh_microbenchmark.sh b/tests/scripts/run_moreh_microbenchmark.sh
index de20e4221ac..cdccd2f8302 100755
--- a/tests/scripts/run_moreh_microbenchmark.sh
+++ b/tests/scripts/run_moreh_microbenchmark.sh
@@ -33,8 +33,12 @@ run_profiling_test() {
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_l1 -k $ARCH_NAME
if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
- pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME
- pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME
+ pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME
+ pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME
+ fi
+ # Bypass wormhole_b0 for now, until the fast-dispatch (FD) cores can be moved to the last column.
+ if [[ "$ARCH_NAME" != "wormhole_b0" ]]; then
+ pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_l1_write_core -k $ARCH_NAME
fi
}
diff --git a/tests/scripts/test_moreh_microbenchmark.py b/tests/scripts/test_moreh_microbenchmark.py
index 3d9d8a50782..dc1e3b9b4c9 100755
--- a/tests/scripts/test_moreh_microbenchmark.py
+++ b/tests/scripts/test_moreh_microbenchmark.py
@@ -265,6 +265,28 @@ def run_dram_read_cmd(k, n, num_blocks, df, num_banks, bank_start_id):
run_moreh_single_test("DRAM BW test multi-core", command)
+def run_dram_read_l1_write_cmd(k, n, num_blocks, df, num_banks, bank_start_id):
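+ # Launches the 9_dram_adjacent_read_remote_l1_write microbenchmark with the device
+ # profiler enabled; --bypass-check skips output validation so only bandwidth is measured.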
+ command = (
+ "TT_METAL_DEVICE_PROFILER=1 ./build/test/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write "
+ + " --k "
+ + str(k)
+ + " --n "
+ + str(n)
+ + " --num-blocks "
+ + str(num_blocks)
+ + " --num-tests "
+ + str(1)
+ + " --data-type "
+ + str(df)
+ + " --num-banks "
+ + str(num_banks)
+ + " --bank-start-id "
+ + str(bank_start_id)
+ + " --bypass-check "
+ )
+ run_moreh_single_test("DRAM BW test multi-core", command)
+
+
# noc
def test_noc_local(r=9, c=12, nt=256, cb=1):
command = (
@@ -672,6 +694,51 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
assert bw_bound <= throughput
+@pytest.mark.parametrize(
+ "arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id",
+ [
+ ("grayskull", 1202, np.array([32768 * 2, 8 * 128]), 1, 64, 1, 8, 0),
+ ("wormhole_b0", 1000, np.array([32768 * 2, 12 * 128]), 1, 64, 1, 12, 0),
+ ("blackhole", 800, np.array([32768 * 8, 8 * 128]), 1, 256, 1, 8, 0),
+ ],
+)
+def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
+ data = []
+ cycle_list = []
+ time_list = []
+ throughput_list = []
+ for _ in range(num_tests):
+ k = int(test_vector[0])
+ n = int(test_vector[1])
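+ # input_size is in bytes; data_format 0 is assumed to be bfp8_b (1088 B per
+ # 1024-element tile) and 1 to be bfloat16 (2 B per element).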
+ if data_format == 0:
+ input_size = k * n * 1088 // 1024
+ elif data_format == 1:
+ input_size = k * n * 2048 // 1024
+ run_dram_read_l1_write_cmd(k, n, nblock, data_format, num_banks, bank_start_id)
+ cycle = profile_results_kernel_duration()
+ time = cycle / freq / 1000.0 / 1000.0
+ throughput = input_size / cycle * freq / 1000.0
+ cycle_list.append(cycle)
+ time_list.append(time)
+ throughput_list.append(throughput)
+ cycle = sum(cycle_list) / len(cycle_list)
+ time = sum(time_list) / len(time_list)
+ throughput = sum(throughput_list) / len(throughput_list)
+ logger.info("DRAM read cycle: " + str(cycle))
+ logger.info("DRAM read time: " + str(time))
+ logger.info("DRAM read throughput: " + str(throughput))
+ data.append([throughput])
+ # Check that the averaged throughput is at or above the per-arch lower bound (GB/s).
+ dev_freq = get_device_freq()
+ if arch == "grayskull":
+ bw_bound = 100.0
+ elif arch == "wormhole_b0":
+ bw_bound = 260.0
+ elif arch == "blackhole":
+ bw_bound = 340.0
+ assert bw_bound <= throughput
+
+
@pytest.mark.parametrize(
"arch, freq, r, c, test_vector_global, test_vector_local",
[
diff --git a/tests/scripts/tg/run_tg_model_perf_tests.sh b/tests/scripts/tg/run_tg_model_perf_tests.sh
index 9501e79e423..d86a7a96688 100755
--- a/tests/scripts/tg/run_tg_model_perf_tests.sh
+++ b/tests/scripts/tg/run_tg_model_perf_tests.sh
@@ -4,6 +4,7 @@ run_tg_llm_tests() {
echo "LOG_METAL: Running run_t3000_llama2_70b_tests"
pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" --timeout=600 ; fail+=$?
+ pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_tg" --timeout=600 ; fail+=$?
# Merge all the generated reports
env python models/perf/merge_perf_results.py; fail+=$?
diff --git a/tests/sweep_framework/sweeps/ccl/all_gather_n300.py b/tests/sweep_framework/sweeps/ccl/all_gather_n300.py
index 963e7f48a7c..bd56a03a62b 100644
--- a/tests/sweep_framework/sweeps/ccl/all_gather_n300.py
+++ b/tests/sweep_framework/sweeps/ccl/all_gather_n300.py
@@ -105,9 +105,6 @@ def run(
logger.info(f"Input shape: {input_shape}")
logger.info(f"dim: {dim}")
- # for device in devices:
- # device.disable_and_clear_program_cache()
-
input_tensor = torch.rand(input_shape).bfloat16()
input_tensors = torch.chunk(input_tensor, num_devices, dim)
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/add/add_all_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/binary/add/add_all_pytorch2.py
new file mode 100644
index 00000000000..408f0c117fa
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/binary/add/add_all_pytorch2.py
@@ -0,0 +1,593 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which associates the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ {"self": [0, 1], "other": [0, 1]},
+ {"self": [0], "other": [0]},
+ {"self": [1, 1, 1024], "other": [1, 1, 1024]},
+ {"self": [1, 1, 16, 32], "other": [1, 1, 16, 32]},
+ {"self": [1, 1, 3072], "other": [1, 1, 3072]},
+ {"self": [1, 1, 4096], "other": [1, 1, 4096]},
+ {"self": [1, 1, 512], "other": [1, 1, 512]},
+ {"self": [1, 1, 7, 64], "other": [1, 1, 7, 64]},
+ {"self": [1, 1, 768], "other": [1, 1, 768]},
+ {"self": [1, 1, 768], "other": [1, 768]},
+ {"self": [1, 10, 1024], "other": [1, 10, 1024]},
+ {"self": [1, 10, 512], "other": [1, 10, 512]},
+ {"self": [1, 10, 768], "other": [1, 10, 768]},
+ {"self": [1, 1008, 7, 7], "other": [1, 1008, 7, 7]},
+ {"self": [1, 1024, 10, 10], "other": [1, 1024, 10, 10]},
+ {"self": [1, 1024, 14, 14], "other": [1, 1024, 14, 14]},
+ {"self": [1, 1024, 16, 16], "other": [1, 1024, 16, 16]},
+ {"self": [1, 1024, 160], "other": [1, 1024, 160]},
+ {"self": [1, 1024, 256], "other": [256]},
+ {"self": [1, 1024, 45, 80], "other": [1, 1024, 1, 1]},
+ {"self": [1, 1024, 45, 80], "other": [1, 1024, 45, 80]},
+ {"self": [1, 1024, 50, 68], "other": [1, 1024, 1, 1]},
+ {"self": [1, 1024, 50, 68], "other": [1, 1024, 50, 68]},
+ {"self": [1, 1024, 640], "other": [1, 1024, 640]},
+ {"self": [1, 1024, 7, 7], "other": [1, 1024, 7, 7]},
+ {"self": [1, 1024], "other": [1, 1024]},
+ {"self": [1, 104, 28, 28], "other": [1, 104, 28, 28]},
+ {"self": [1, 1056, 48, 48], "other": [1, 1056, 48, 48]},
+ {"self": [1, 112, 14, 14], "other": [1, 112, 14, 14]},
+ {"self": [1, 112, 15, 15], "other": [1, 112, 15, 15]},
+ {"self": [1, 112, 20, 20], "other": [1, 112, 20, 20]},
+ {"self": [1, 112, 24, 24], "other": [1, 112, 24, 24]},
+ {"self": [1, 12, 1, 10], "other": [1, 1, 1, 10]},
+ {"self": [1, 12, 1, 10], "other": [1, 12, 1, 10]},
+ {"self": [1, 12, 1, 1], "other": [1, 1, 1, 1]},
+ {"self": [1, 12, 1, 1], "other": [1, 12, 1, 1]},
+ {"self": [1, 12, 1, 24], "other": [1, 1, 1, 24]},
+ {"self": [1, 12, 1, 2], "other": [1, 1, 1, 2]},
+ {"self": [1, 12, 1, 2], "other": [1, 12, 1, 2]},
+ {"self": [1, 12, 1, 46], "other": [1, 1, 1, 46]},
+ {"self": [1, 12, 10, 10], "other": [1, 1, 1, 10]},
+ {"self": [1, 12, 10, 10], "other": [1, 12, 10, 10]},
+ {"self": [1, 12, 12, 12], "other": [1, 1, 1, 12]},
+ {"self": [1, 12, 128], "other": [1, 12, 128]},
+ {"self": [1, 12, 14, 14], "other": [1, 1, 1, 14]},
+ {"self": [1, 12, 197, 197], "other": [1, 12, 197, 197]},
+ {"self": [1, 12, 201, 201], "other": [1, 1, 1, 201]},
+ {"self": [1, 12, 24, 24], "other": [1, 1, 24, 24]},
+ {"self": [1, 12, 25, 25], "other": [1, 1, 1, 25]},
+ {"self": [1, 12, 3072], "other": [1, 12, 3072]},
+ {"self": [1, 12, 45, 45], "other": [1, 1, 45, 45]},
+ {"self": [1, 12, 7, 7], "other": [1, 1, 1, 7]},
+ {"self": [1, 12, 768], "other": [1, 12, 768]},
+ {"self": [1, 12, 9, 9], "other": [1, 1, 1, 9]},
+ {"self": [1, 120, 17, 17], "other": [1, 120, 17, 17]},
+ {"self": [1, 120, 28, 28], "other": [1, 120, 28, 28]},
+ {"self": [1, 1200, 320], "other": [1, 1200, 320]},
+ {"self": [1, 1232, 14, 14], "other": [1, 1232, 14, 14]},
+ {"self": [1, 128, 100, 136], "other": [1, 128, 1, 1]},
+ {"self": [1, 128, 128, 128], "other": [1, 128, 128, 128]},
+ {"self": [1, 128, 1536], "other": [1, 128, 1536]},
+ {"self": [1, 128, 180, 320], "other": [1, 128, 1, 1]},
+ {"self": [1, 128, 200, 272], "other": [1, 128, 1, 1]},
+ {"self": [1, 128, 28, 28], "other": [1, 128, 28, 28]},
+ {"self": [1, 128, 56, 56], "other": [1, 128, 56, 56]},
+ {"self": [1, 128, 75, 75], "other": [1, 128, 75, 75]},
+ {"self": [1, 128, 90, 160], "other": [1, 128, 1, 1]},
+ {"self": [1, 1280, 16, 16], "other": [1, 1280, 1, 1]},
+ {"self": [1, 1280, 16, 16], "other": [1, 1280, 16, 16]},
+ {"self": [1, 1280, 8, 8], "other": [1, 1280, 1, 1]},
+ {"self": [1, 1280, 8, 8], "other": [1, 1280, 8, 8]},
+ {"self": [1, 1344, 14, 14], "other": [1, 1344, 14, 14]},
+ {"self": [1, 136, 19, 19], "other": [1, 136, 19, 19]},
+ {"self": [1, 1370, 1280], "other": [1, 1370, 1280]},
+ {"self": [1, 1392, 14, 14], "other": [1, 1392, 14, 14]},
+ {"self": [1, 14, 128], "other": [1, 14, 128]},
+ {"self": [1, 14, 14, 384], "other": [1, 14, 14, 384]},
+ {"self": [1, 14, 14, 512], "other": [1, 14, 14, 512]},
+ {"self": [1, 14, 3072], "other": [1, 14, 3072]},
+ {"self": [1, 14, 768], "other": [1, 14, 768]},
+ {"self": [1, 144, 28, 28], "other": [1, 144, 28, 28]},
+ {"self": [1, 144, 7, 7], "other": [1, 144, 7, 7]},
+ {"self": [1, 1445, 192], "other": [1, 1445, 192]},
+ {"self": [1, 15, 1024], "other": [1, 15, 1024]},
+ {"self": [1, 15, 512], "other": [1, 15, 512]},
+ {"self": [1, 1500, 768], "other": [1, 1500, 768]},
+ {"self": [1, 1500, 768], "other": [1500, 768]},
+ {"self": [1, 1512, 7, 7], "other": [1, 1512, 7, 7]},
+ {"self": [1, 16, 1, 10], "other": [1, 1, 1, 10]},
+ {"self": [1, 16, 1, 10], "other": [1, 16, 1, 10]},
+ {"self": [1, 16, 1, 1], "other": [1, 1, 1, 1]},
+ {"self": [1, 16, 1, 1], "other": [1, 16, 1, 1]},
+ {"self": [1, 16, 1, 2], "other": [1, 1, 1, 2]},
+ {"self": [1, 16, 1, 2], "other": [1, 16, 1, 2]},
+ {"self": [1, 16, 1, 60], "other": [1, 1, 1, 60]},
+ {"self": [1, 16, 1, 6], "other": [1, 1, 1, 6]},
+ {"self": [1, 16, 10, 10], "other": [1, 1, 1, 10]},
+ {"self": [1, 16, 10, 10], "other": [1, 16, 10, 10]},
+ {"self": [1, 16, 112, 112], "other": [1, 16, 112, 112]},
+ {"self": [1, 16, 16, 384], "other": [1, 16, 16, 384]},
+ {"self": [1, 16, 16, 512], "other": [1, 16, 16, 512]},
+ {"self": [1, 16, 160, 160], "other": [1, 16, 160, 160]},
+ {"self": [1, 16, 19, 19], "other": [1, 1, 19, 19]},
+ {"self": [1, 16, 197, 197], "other": [1, 16, 197, 197]},
+ {"self": [1, 16, 256, 256], "other": [1, 1, 1, 256]},
+ {"self": [1, 16, 5, 5], "other": [1, 1, 1, 5]},
+ {"self": [1, 16, 59, 59], "other": [1, 1, 59, 59]},
+ {"self": [1, 16, 6, 49, 49], "other": [1, 16, 1, 49, 49]},
+ {"self": [1, 16, 6, 64, 64], "other": [1, 16, 1, 64, 64]},
+ {"self": [1, 16, 768], "other": [1, 16, 768]},
+ {"self": [1, 16, 8, 49, 49], "other": [1, 16, 1, 49, 49]},
+ {"self": [1, 16, 8, 64, 64], "other": [1, 16, 1, 64, 64]},
+ {"self": [1, 16, 9, 9], "other": [1, 1, 1, 9]},
+ {"self": [1, 160, 14, 14], "other": [1, 160, 14, 14]},
+ {"self": [1, 160, 24, 24], "other": [1, 160, 24, 24]},
+ {"self": [1, 160, 7, 7], "other": [1, 160, 7, 7]},
+ {"self": [1, 16384, 256], "other": [256]},
+ {"self": [1, 16384, 32], "other": [1, 16384, 32]},
+ {"self": [1, 168, 28, 28], "other": [1, 168, 28, 28]},
+ {"self": [1, 18, 56, 56], "other": [1, 18, 56, 56]},
+ {"self": [1, 19, 1024], "other": [1, 19, 1024]},
+ {"self": [1, 192, 28, 28], "other": [1, 192, 28, 28]},
+ {"self": [1, 192, 32, 42], "other": [1, 192, 32, 42]},
+ {"self": [1, 192, 7, 7], "other": [1, 192, 7, 7]},
+ {"self": [1, 192, 8, 8], "other": [1, 192, 8, 8]},
+ {"self": [1, 1920, 7, 7], "other": [1, 1920, 7, 7]},
+ {"self": [1, 19200, 64], "other": [1, 19200, 64]},
+ {"self": [1, 193, 768], "other": [1, 193, 768]},
+ {"self": [1, 196, 768], "other": [1, 196, 768]},
+ {"self": [1, 197, 1024], "other": [1, 197, 1024]},
+ {"self": [1, 197, 768], "other": [1, 197, 768]},
+ {"self": [1, 201, 768], "other": [1, 201, 768]},
+ {"self": [1, 2016, 7, 7], "other": [1, 2016, 7, 7]},
+ {"self": [1, 2048, 23, 40], "other": [1, 2048, 1, 1]},
+ {"self": [1, 2048, 23, 40], "other": [1, 2048, 23, 40]},
+ {"self": [1, 2048, 25, 34], "other": [1, 2048, 1, 1]},
+ {"self": [1, 2048, 25, 34], "other": [1, 2048, 25, 34]},
+ {"self": [1, 2048, 7, 7], "other": [1, 2048, 7, 7]},
+ {"self": [1, 2048, 768], "other": [1, 2048, 768]},
+ {"self": [1, 2048, 768], "other": [2048, 768]},
+ {"self": [1, 208, 14, 14], "other": [1, 208, 14, 14]},
+ {"self": [1, 208, 9, 9], "other": [1, 208, 9, 9]},
+ {"self": [1, 216, 28, 28], "other": [1, 216, 28, 28]},
+ {"self": [1, 224, 56, 56], "other": [1, 224, 56, 56]},
+ {"self": [1, 232, 10, 10], "other": [1, 232, 10, 10]},
+ {"self": [1, 232, 56, 56], "other": [1, 232, 56, 56]},
+ {"self": [1, 24, 28, 28], "other": [1, 24, 28, 28]},
+ {"self": [1, 24, 49, 49], "other": [1, 24, 49, 49]},
+ {"self": [1, 24, 56, 56], "other": [1, 24, 56, 56]},
+ {"self": [1, 24, 60, 60], "other": [1, 24, 60, 60]},
+ {"self": [1, 24, 64, 64], "other": [1, 24, 64, 64]},
+ {"self": [1, 24, 65, 65], "other": [1, 24, 65, 65]},
+ {"self": [1, 24, 768], "other": [1, 24, 768]},
+ {"self": [1, 24, 80, 80], "other": [1, 24, 80, 80]},
+ {"self": [1, 240, 28, 28], "other": [1, 240, 28, 28]},
+ {"self": [1, 25, 768], "other": [1, 25, 768]},
+ {"self": [1, 2520, 7, 7], "other": [1, 2520, 7, 7]},
+ {"self": [1, 256, 100, 136], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 100, 136], "other": [1, 256, 100, 136]},
+ {"self": [1, 256, 1024], "other": [1, 256, 1024]},
+ {"self": [1, 256, 128, 128], "other": [1, 256, 128, 128]},
+ {"self": [1, 256, 1280], "other": [1, 256, 1280]},
+ {"self": [1, 256, 14, 14], "other": [1, 256, 14, 14]},
+ {"self": [1, 256, 180, 320], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 180, 320], "other": [1, 256, 180, 320]},
+ {"self": [1, 256, 200, 272], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 200, 272], "other": [1, 256, 200, 272]},
+ {"self": [1, 256, 256], "other": [1, 256, 256]},
+ {"self": [1, 256, 256], "other": [256]},
+ {"self": [1, 256, 28, 28], "other": [1, 256, 28, 28]},
+ {"self": [1, 256, 38, 38], "other": [1, 256, 38, 38]},
+ {"self": [1, 256, 384], "other": [1, 256, 384]},
+ {"self": [1, 256, 45, 80], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 50, 68], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 50, 68], "other": [1, 256, 50, 68]},
+ {"self": [1, 256, 512], "other": [1, 256, 512]},
+ {"self": [1, 256, 56, 56], "other": [1, 256, 56, 56]},
+ {"self": [1, 256, 64, 64], "other": [1, 256, 64, 64]},
+ {"self": [1, 256, 75, 75], "other": [1, 256, 75, 75]},
+ {"self": [1, 256, 90, 160], "other": [1, 256, 1, 1]},
+ {"self": [1, 272, 12, 12], "other": [1, 272, 12, 12]},
+ {"self": [1, 28, 28, 192], "other": [1, 28, 28, 192]},
+ {"self": [1, 28, 28, 256], "other": [1, 28, 28, 256]},
+ {"self": [1, 288, 14, 14], "other": [1, 288, 14, 14]},
+ {"self": [1, 2904, 24, 24], "other": [1, 2904, 24, 24]},
+ {"self": [1, 3, 16, 16, 2], "other": [1, 3, 16, 16, 2]},
+ {"self": [1, 3, 300, 300], "other": [1, 3, 300, 300]},
+ {"self": [1, 3, 32, 32, 2], "other": [1, 3, 32, 32, 2]},
+ {"self": [1, 3, 320, 320], "other": [1, 3, 320, 320]},
+ {"self": [1, 3, 64, 64, 2], "other": [1, 3, 64, 64, 2]},
+ {"self": [1, 3, 800, 1066], "other": [1, 3, 800, 1066]},
+ {"self": [1, 300, 512], "other": [1, 300, 512]},
+ {"self": [1, 3024, 7, 7], "other": [1, 3024, 7, 7]},
+ {"self": [1, 32, 1536], "other": [1, 32, 1536]},
+ {"self": [1, 32, 24576], "other": [1, 32, 24576]},
+ {"self": [1, 32, 28, 28], "other": [1, 32, 28, 28]},
+ {"self": [1, 32, 32, 192], "other": [1, 32, 32, 192]},
+ {"self": [1, 32, 32, 256], "other": [1, 32, 32, 256]},
+ {"self": [1, 32, 49, 49], "other": [1, 32, 49, 49]},
+ {"self": [1, 32, 56, 56], "other": [1, 32, 56, 56]},
+ {"self": [1, 32, 64, 64], "other": [1, 32, 64, 64]},
+ {"self": [1, 32, 75, 75], "other": [1, 32, 75, 75]},
+ {"self": [1, 32, 95, 95], "other": [1, 32, 95, 95]},
+ {"self": [1, 320, 14, 14], "other": [1, 320, 14, 14]},
+ {"self": [1, 320, 64, 64], "other": [1, 320, 1, 1]},
+ {"self": [1, 320, 64, 64], "other": [1, 320, 64, 64]},
+ {"self": [1, 336, 14, 14], "other": [1, 336, 14, 14]},
+ {"self": [1, 336, 56, 56], "other": [1, 336, 56, 56]},
+ {"self": [1, 36, 28, 28], "other": [1, 36, 28, 28]},
+ {"self": [1, 3712, 7, 7], "other": [1, 3712, 7, 7]},
+ {"self": [1, 4, 12, 49, 49], "other": [1, 4, 1, 49, 49]},
+ {"self": [1, 4, 12, 64, 64], "other": [1, 4, 1, 64, 64]},
+ {"self": [1, 4, 16, 49, 49], "other": [1, 4, 1, 49, 49]},
+ {"self": [1, 4, 16, 64, 64], "other": [1, 4, 1, 64, 64]},
+ {"self": [1, 4, 768], "other": [1, 4, 768]},
+ {"self": [1, 4, 768], "other": [4, 768]},
+ {"self": [1, 40, 14, 14], "other": [1, 40, 14, 14]},
+ {"self": [1, 40, 28, 28], "other": [1, 40, 28, 28]},
+ {"self": [1, 40, 30, 30], "other": [1, 40, 30, 30]},
+ {"self": [1, 40, 40, 40], "other": [1, 40, 40, 40]},
+ {"self": [1, 400, 7, 7], "other": [1, 400, 7, 7]},
+ {"self": [1, 408, 14, 14], "other": [1, 408, 14, 14]},
+ {"self": [1, 4096, 256], "other": [256]},
+ {"self": [1, 4096, 320], "other": [1, 4096, 320]},
+ {"self": [1, 4096, 64], "other": [1, 4096, 64]},
+ {"self": [1, 432, 14, 14], "other": [1, 432, 14, 14]},
+ {"self": [1, 440, 7, 7], "other": [1, 440, 7, 7]},
+ {"self": [1, 448, 28, 28], "other": [1, 448, 28, 28]},
+ {"self": [1, 45, 3072], "other": [1, 45, 3072]},
+ {"self": [1, 45, 768], "other": [1, 45, 768]},
+ {"self": [1, 48, 14, 14], "other": [1, 48, 14, 14]},
+ {"self": [1, 48, 33, 33], "other": [1, 48, 33, 33]},
+ {"self": [1, 48, 38, 38], "other": [1, 48, 38, 38]},
+ {"self": [1, 48, 56, 56], "other": [1, 48, 56, 56]},
+ {"self": [1, 4800, 128], "other": [1, 4800, 128]},
+ {"self": [1, 5, 1024], "other": [1, 5, 1024]},
+ {"self": [1, 5, 16, 32], "other": [1, 5, 16, 32]},
+ {"self": [1, 5, 4096], "other": [1, 5, 4096]},
+ {"self": [1, 50, 1024], "other": [1, 50, 1024]},
+ {"self": [1, 50, 768], "other": [1, 50, 768]},
+ {"self": [1, 512, 100, 136], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 100, 136], "other": [1, 512, 100, 136]},
+ {"self": [1, 512, 14, 14], "other": [1, 512, 14, 14]},
+ {"self": [1, 512, 23, 40], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 25, 34], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 28, 28], "other": [1, 512, 28, 28]},
+ {"self": [1, 512, 32, 32], "other": [1, 512, 32, 32]},
+ {"self": [1, 512, 45, 80], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 50, 68], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 7, 7], "other": [1, 512, 7, 7]},
+ {"self": [1, 512, 90, 160], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 90, 160], "other": [1, 512, 90, 160]},
+ {"self": [1, 528, 96, 96], "other": [1, 528, 96, 96]},
+ {"self": [1, 56, 48, 48], "other": [1, 56, 48, 48]},
+ {"self": [1, 56, 56, 128], "other": [1, 56, 56, 128]},
+ {"self": [1, 56, 56, 96], "other": [1, 56, 56, 96]},
+ {"self": [1, 576, 14, 14], "other": [1, 576, 14, 14]},
+ {"self": [1, 59, 1024], "other": [1, 59, 1024]},
+ {"self": [1, 6, 1, 15], "other": [1, 1, 1, 15]},
+ {"self": [1, 6, 1, 15], "other": [1, 6, 1, 15]},
+ {"self": [1, 6, 1, 17], "other": [1, 1, 1, 17]},
+ {"self": [1, 6, 1, 17], "other": [1, 6, 1, 17]},
+ {"self": [1, 6, 1, 1], "other": [1, 1, 1, 1]},
+ {"self": [1, 6, 1, 1], "other": [1, 6, 1, 1]},
+ {"self": [1, 6, 1, 2], "other": [1, 1, 1, 2]},
+ {"self": [1, 6, 1, 2], "other": [1, 6, 1, 2]},
+ {"self": [1, 6, 15, 15], "other": [1, 1, 1, 15]},
+ {"self": [1, 6, 15, 15], "other": [1, 6, 15, 15]},
+ {"self": [1, 64, 120, 160], "other": [1, 64, 120, 160]},
+ {"self": [1, 64, 1280], "other": [1, 64, 1280]},
+ {"self": [1, 64, 14, 14], "other": [1, 64, 14, 14]},
+ {"self": [1, 64, 180, 320], "other": [1, 64, 1, 1]},
+ {"self": [1, 64, 200, 272], "other": [1, 64, 1, 1]},
+ {"self": [1, 64, 240, 320], "other": [1, 64, 240, 320]},
+ {"self": [1, 64, 256, 256], "other": [1, 64, 256, 256]},
+ {"self": [1, 64, 28, 28], "other": [1, 64, 28, 28]},
+ {"self": [1, 64, 3, 49, 49], "other": [1, 64, 1, 49, 49]},
+ {"self": [1, 64, 3, 64, 64], "other": [1, 64, 1, 64, 64]},
+ {"self": [1, 64, 30, 40], "other": [1, 64, 30, 40]},
+ {"self": [1, 64, 360, 640], "other": [1, 64, 1, 1]},
+ {"self": [1, 64, 4, 49, 49], "other": [1, 64, 1, 49, 49]},
+ {"self": [1, 64, 4, 64, 64], "other": [1, 64, 1, 64, 64]},
+ {"self": [1, 64, 400, 544], "other": [1, 64, 1, 1]},
+ {"self": [1, 64, 480, 640], "other": [1, 64, 480, 640]},
+ {"self": [1, 64, 56, 56], "other": [1, 64, 56, 56]},
+ {"self": [1, 64, 60, 80], "other": [1, 64, 60, 80]},
+ {"self": [1, 64, 6144], "other": [1, 64, 6144]},
+ {"self": [1, 64, 64, 128], "other": [1, 64, 64, 128]},
+ {"self": [1, 64, 64, 96], "other": [1, 64, 64, 96]},
+ {"self": [1, 64, 9, 9], "other": [1, 1, 1, 9]},
+ {"self": [1, 640, 32, 32], "other": [1, 640, 1, 1]},
+ {"self": [1, 640, 32, 32], "other": [1, 640, 32, 32]},
+ {"self": [1, 672, 28, 28], "other": [1, 672, 28, 28]},
+ {"self": [1, 672, 7, 7], "other": [1, 672, 7, 7]},
+ {"self": [1, 696, 28, 28], "other": [1, 696, 28, 28]},
+ {"self": [1, 7, 3072], "other": [1, 7, 3072]},
+ {"self": [1, 7, 4544], "other": [1, 7, 4544]},
+ {"self": [1, 7, 7, 1024], "other": [1, 7, 7, 1024]},
+ {"self": [1, 7, 7, 768], "other": [1, 7, 7, 768]},
+ {"self": [1, 7, 768], "other": [1, 7, 768]},
+ {"self": [1, 71, 7, 64], "other": [1, 71, 7, 64]},
+ {"self": [1, 71, 7, 7], "other": [7, 7]},
+ {"self": [1, 72, 14, 14], "other": [1, 72, 14, 14]},
+ {"self": [1, 72, 56, 56], "other": [1, 72, 56, 56]},
+ {"self": [1, 720, 14, 14], "other": [1, 720, 14, 14]},
+ {"self": [1, 728, 19, 19], "other": [1, 728, 19, 19]},
+ {"self": [1, 728, 38, 38], "other": [1, 728, 38, 38]},
+ {"self": [1, 7392, 12, 12], "other": [1, 7392, 12, 12]},
+ {"self": [1, 768, 384], "other": [384]},
+ {"self": [1, 784, 7, 7], "other": [1, 784, 7, 7]},
+ {"self": [1, 8, 1, 10], "other": [1, 1, 1, 10]},
+ {"self": [1, 8, 1, 10], "other": [1, 8, 1, 10]},
+ {"self": [1, 8, 1, 1], "other": [1, 1, 1, 1]},
+ {"self": [1, 8, 1, 1], "other": [1, 8, 1, 1]},
+ {"self": [1, 8, 1, 2], "other": [1, 1, 1, 2]},
+ {"self": [1, 8, 1, 2], "other": [1, 8, 1, 2]},
+ {"self": [1, 8, 10, 10], "other": [1, 1, 1, 10]},
+ {"self": [1, 8, 10, 10], "other": [1, 8, 10, 10]},
+ {"self": [1, 8, 256, 2048], "other": [1, 1, 1, 2048]},
+ {"self": [1, 8, 768], "other": [1, 8, 768]},
+ {"self": [1, 8, 8, 1024], "other": [1, 8, 8, 1024]},
+ {"self": [1, 8, 8, 768], "other": [1, 8, 8, 768]},
+ {"self": [1, 80, 10, 10], "other": [1, 80, 10, 10]},
+ {"self": [1, 80, 14, 14], "other": [1, 80, 14, 14]},
+ {"self": [1, 80, 15, 15], "other": [1, 80, 15, 15]},
+ {"self": [1, 80, 20, 20], "other": [1, 80, 20, 20]},
+ {"self": [1, 80, 56, 56], "other": [1, 80, 56, 56]},
+ {"self": [1, 88, 17, 17], "other": [1, 88, 17, 17]},
+ {"self": [1, 888, 7, 7], "other": [1, 888, 7, 7]},
+ {"self": [1, 896, 14, 14], "other": [1, 896, 14, 14]},
+ {"self": [1, 9, 1024], "other": [1, 9, 1024]},
+ {"self": [1, 9, 128], "other": [1, 9, 128]},
+ {"self": [1, 9, 16384], "other": [1, 9, 16384]},
+ {"self": [1, 9, 2048], "other": [1, 9, 2048]},
+ {"self": [1, 9, 3072], "other": [1, 9, 3072]},
+ {"self": [1, 9, 4096], "other": [1, 9, 4096]},
+ {"self": [1, 9, 768], "other": [1, 9, 768]},
+ {"self": [1, 9, 8192], "other": [1, 9, 8192]},
+ {"self": [1, 912, 7, 7], "other": [1, 912, 7, 7]},
+ {"self": [1, 96, 14, 14], "other": [1, 96, 14, 14]},
+ {"self": [1, 96, 19, 19], "other": [1, 96, 19, 19]},
+ {"self": [1, 96, 56, 56], "other": [1, 96, 56, 56]},
+ {"self": [1, 96, 7, 7], "other": [1, 96, 7, 7]},
+ {"self": [1, 96, 80], "other": [1, 96, 80]},
+ {"self": [10, 10], "other": [10, 10]},
+ {"self": [100, 1, 256], "other": [100, 1, 256]},
+ {"self": [12, 24, 24], "other": [12, 24, 24]},
+ {"self": [13600, 1, 4], "other": [1, 9, 4]},
+ {"self": [15, 15], "other": [15, 15]},
+ {"self": [16, 6, 49, 49], "other": [1, 6, 49, 49]},
+ {"self": [16, 6, 64, 64], "other": [1, 6, 64, 64]},
+ {"self": [16, 8, 49, 49], "other": [1, 8, 49, 49]},
+ {"self": [16, 8, 64, 64], "other": [1, 8, 64, 64]},
+ {"self": [2, 7, 512], "other": [1, 7, 512]},
+ {"self": [2, 7, 512], "other": [2, 7, 512]},
+ {"self": [2, 8, 7, 7], "other": [2, 1, 7, 7]},
+ {"self": [2048, 262], "other": [262]},
+ {"self": [221, 1, 4], "other": [1, 9, 4]},
+ {"self": [25, 4], "other": [25, 1]},
+ {"self": [3234, 1], "other": [3234, 1]},
+ {"self": [3234, 2], "other": [3234, 2]},
+ {"self": [3234], "other": [3234]},
+ {"self": [3400, 1, 4], "other": [1, 9, 4]},
+ {"self": [4, 12, 49, 49], "other": [1, 12, 49, 49]},
+ {"self": [4, 12, 64, 64], "other": [1, 12, 64, 64]},
+ {"self": [4, 16, 49, 49], "other": [1, 16, 49, 49]},
+ {"self": [4, 16, 64, 64], "other": [1, 16, 64, 64]},
+ {"self": [59, 1024], "other": [59, 1024]},
+ {"self": [63, 1, 4], "other": [1, 9, 4]},
+ {"self": [64, 3, 49, 49], "other": [1, 3, 49, 49]},
+ {"self": [64, 3, 64, 64], "other": [1, 3, 64, 64]},
+ {"self": [64, 4, 49, 49], "other": [1, 4, 49, 49]},
+ {"self": [64, 4, 64, 64], "other": [1, 4, 64, 64]},
+ {"self": [850, 1, 4], "other": [1, 9, 4]},
+ {"self": [8732, 1], "other": [8732, 1]},
+ {"self": [8732, 2], "other": [8732, 2]},
+ {"self": [8732], "other": [8732]},
+ {"self": [], "other": []},
+ {"self": [920, 1, 256], "other": [256]},
+ {"self": [920, 1, 256], "other": [920, 1, 256]},
+ {"self": [1, 1, 1, 42], "other": -6.0},
+ {"self": [1, 1, 1, 42], "other": 0.5},
+ {"self": [1, 1, 1, 42], "other": 1.0},
+ {"self": [1, 1, 1, 42], "other": 1.0},
+ {"self": [1, 1, 1, 42], "other": 2.0},
+ {"self": [1, 1, 1024], "other": 1.0},
+ {"self": [1, 1, 1], "other": 1e-06},
+ {"self": [1, 1, 224, 224], "other": -0.030000000000000027},
+ {"self": [1, 1, 224, 224], "other": -0.08799999999999997},
+ {"self": [1, 1, 224, 224], "other": -0.18799999999999994},
+ {"self": [1, 1, 3072], "other": 1.0},
+ {"self": [1, 1, 32, 1], "other": -6.0},
+ {"self": [1, 1, 32, 1], "other": 0.5},
+ {"self": [1, 1, 32, 1], "other": 1.0},
+ {"self": [1, 1, 32, 1], "other": 1.0},
+ {"self": [1, 1, 32, 1], "other": 2.0},
+ {"self": [1, 1, 4096], "other": 1.0},
+ {"self": [1, 1, 40], "other": 1e-06},
+ {"self": [1, 10, 1], "other": 1e-06},
+ {"self": [1, 1024, 1, 1], "other": 0.0},
+ {"self": [1, 1024, 1, 1], "other": 1e-05},
+ {"self": [1, 10], "other": 0.0},
+ {"self": [1, 10], "other": 1.0},
+ {"self": [1, 12, 3072], "other": 1.0},
+ {"self": [1, 128, 1, 1], "other": 0.0},
+ {"self": [1, 128, 1, 1], "other": 1e-05},
+ {"self": [1, 14, 3072], "other": 1.0},
+ {"self": [1, 15, 1024], "other": 1.0},
+ {"self": [1, 15, 1], "other": 1e-06},
+ {"self": [1, 19], "other": 2.0},
+ {"self": [1, 1], "other": 0.0},
+ {"self": [1, 1], "other": 16.0},
+ {"self": [1, 1], "other": 2.0},
+ {"self": [1, 2048, 1, 1], "other": 0.0},
+ {"self": [1, 2048, 1, 1], "other": 1e-05},
+ {"self": [1, 23, 1], "other": 1e-06},
+ {"self": [1, 256, 1, 1], "other": 0.0},
+ {"self": [1, 256, 1, 1], "other": 1e-05},
+ {"self": [1, 32, 6144], "other": 1.0},
+ {"self": [1, 32, 6144], "other": 1.0},
+ {"self": [1, 45, 3072], "other": 1.0},
+ {"self": [1, 5, 4096], "other": 1.0},
+ {"self": [1, 512, 1, 1], "other": 0.0},
+ {"self": [1, 512, 1, 1], "other": 1e-05},
+ {"self": [1, 59], "other": 2.0},
+ {"self": [1, 64, 1, 1], "other": 0.0},
+ {"self": [1, 64, 1, 1], "other": 1e-05},
+ {"self": [1, 7, 3072], "other": 1.0},
+ {"self": [1, 9, 128], "other": 1.0},
+ {"self": [1, 9, 16384], "other": 1.0},
+ {"self": [1, 9, 3072], "other": 1.0},
+ {"self": [1, 9, 4096], "other": 1.0},
+ {"self": [1, 9, 8192], "other": 1.0},
+ {"self": [10, 10], "other": 0.0},
+ {"self": [10, 10], "other": 8.0},
+ {"self": [100], "other": 0.0},
+ {"self": [1066], "other": 0.5},
+ {"self": [10], "other": 0.5},
+ {"self": [120], "other": 0.5},
+ {"self": [128], "other": 0.5},
+ {"self": [12], "other": 0.0},
+ {"self": [136], "other": 0.0},
+ {"self": [14], "other": 0.0},
+ {"self": [15, 15], "other": 0.0},
+ {"self": [15, 15], "other": 8.0},
+ {"self": [160], "other": 0.5},
+ {"self": [16], "other": 0.0},
+ {"self": [17, 17], "other": 0.0},
+ {"self": [17, 17], "other": 16.0},
+ {"self": [19], "other": 0.5},
+ {"self": [1], "other": 0.5},
+ {"self": [2, 2], "other": 0.0},
+ {"self": [2, 2], "other": 16.0},
+ {"self": [20], "other": 0.5},
+ {"self": [23], "other": 0.0},
+ {"self": [24, 24], "other": 160.0},
+ {"self": [240], "other": 0.5},
+ {"self": [28], "other": 0.0},
+ {"self": [2], "other": 0.5},
+ {"self": [300], "other": 0.5},
+ {"self": [30], "other": 0.5},
+ {"self": [320], "other": 0.5},
+ {"self": [32], "other": 0.0},
+ {"self": [38], "other": 0.5},
+ {"self": [3], "other": 0.5},
+ {"self": [40], "other": 0.0},
+ {"self": [40], "other": 0.5},
+ {"self": [480], "other": 0.5},
+ {"self": [50], "other": 0.0},
+ {"self": [56], "other": 0.0},
+ {"self": [5], "other": 0.5},
+ {"self": [60], "other": 0.5},
+ {"self": [640], "other": 0.5},
+ {"self": [64], "other": 0.0},
+ {"self": [68], "other": 0.0},
+ {"self": [7], "other": 0.0},
+ {"self": [800], "other": 0.5},
+ {"self": [80], "other": 0.5},
+ {"self": [], "other": 1},
+ # {"self": [s0 + 1, s0 + 1], "other": 16},
+ # {"self": [s0 + 1, s0 + 1], "other": 0},
+ # {"self": [1, 16, 1, "s0 + 1"], "other": [1, 1, 1, "s0 + 1"]},
+ # {"self": [1, 16, 1, "s0 + 1"], "other": [1, 16, 1, "s0 + 1"]},
+ # {"self": [1, 8, 1, "s0 + 1"], "other": [1, 1, 1, "s0 + 1"]},
+ # {"self": [1, 8, 1, "s0 + 1"], "other": [1, 8, 1, "s0 + 1"]},
+ # {"self": [1, 6, 1, "s0 + 1"], "other": [1, 1, 1, "s0 + 1"]},
+ # {"self": [1, 6, 1, "s0 + 1"], "other": [1, 6, 1, "s0 + 1"]},
+ # {"self": [1, 12, 1, "s0 + 1"], "other": [1, 1, 1, "s0 + 1"]},
+ # {"self": [1, 12, 1, "s0 + 1"], "other": [1, 12, 1, "s0 + 1"]},
+ # {"self": [1, 32, "s0", "s1"], "other": [1, 32, "s0", "s1"]},
+ # {"self": [1, 12, 1, "s10 + 1"], "other": [1, 1, 1, "s10 + 1"]},
+ # {"self": [1, 64, "s1", "s2"], "other": [1, 64, "s1", "s2"]},
+ # {"self": [1, 128, "s1", "s2"], "other": [1, 128, "s1", "s2"]},
+ # {"self": [1, 16, 1, "s10 + 1"], "other": [1, 1, 1, "s10 + 1"]},
+ # {"self": [1, 256, "s1", "s2"], "other": [1, 256, "s1", "s2"]},
+ # {"self": [1, "s0", 768], "other": [1, "s0", 768]}
+ ],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_b_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_b_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the results it returns will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_b_dtype,
+ input_a_layout,
+ input_b_layout,
+ input_a_memory_config,
+ input_b_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape["self"])
+
+ if isinstance(input_shape["other"], list):
+ torch_input_tensor_b = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype
+ )(input_shape["other"])
+ else:
+ torch_input_tensor_b = torch.tensor(input_shape["other"], dtype=torch.float32)
+
+ golden_function = ttnn.get_golden_function(ttnn.add)
+ torch_output_tensor = golden_function(torch_input_tensor_a, torch_input_tensor_b)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ input_tensor_b = ttnn.from_torch(
+ torch_input_tensor_b,
+ dtype=input_b_dtype,
+ layout=input_b_layout,
+ device=device,
+ memory_config=input_b_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.add(input_tensor_a, input_tensor_b)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, pcc=0.9999), e2e_perf]
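+
+
+# Minimal standalone sketch (illustrative only, not part of the sweep infra):
+# it shows how one vector from the "nightly" suite above maps onto the run()
+# contract. Assumes a local device 0 is available; the vector and memory
+# configs chosen here are arbitrary picks from the suite.
+if __name__ == "__main__":
+ device = ttnn.open_device(device_id=0)
+ try:
+ result, e2e_perf = run(
+ {"self": [1, 128, 1536], "other": [1, 128, 1536]},
+ input_a_dtype=ttnn.bfloat16,
+ input_b_dtype=ttnn.bfloat16,
+ input_a_layout=ttnn.TILE_LAYOUT,
+ input_b_layout=ttnn.TILE_LAYOUT,
+ input_a_memory_config=ttnn.DRAM_MEMORY_CONFIG,
+ input_b_memory_config=ttnn.DRAM_MEMORY_CONFIG,
+ device=device,
+ )
+ print(result, e2e_perf)
+ finally:
+ ttnn.close_device(device)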
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/div/div.py b/tests/sweep_framework/sweeps/eltwise/binary/div/div.py
index c45a8f4375f..0a8605b5de0 100644
--- a/tests/sweep_framework/sweeps/eltwise/binary/div/div.py
+++ b/tests/sweep_framework/sweeps/eltwise/binary/div/div.py
@@ -26,10 +26,10 @@
# Developers can create their own generator functions and pass them to the parameters as inputs.
parameters = {
"nightly": {
- "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
- + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
- + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 8)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 8)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 8),
"accurate_mode": [True, False],
+ "round_mode": ["None", "floor", "trunc"],
"round_mode": [None],
"input_a_dtype": [ttnn.bfloat16],
"input_b_dtype": [ttnn.bfloat16],
@@ -40,11 +41,11 @@
"output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
},
"xfail": {
- "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
- + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
- + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 4)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 4)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 4),
"accurate_mode": [True, False],
- "round_mode": [None],
+ "round_mode": ["None", "floor", "trunc"],
"input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
"input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
"input_a_layout": [ttnn.TILE_LAYOUT],
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/eq/eq_scalar_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/binary/eq/eq_scalar_pytorch2.py
new file mode 100644
index 00000000000..6a8a955225f
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/binary/eq/eq_scalar_pytorch2.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 1, 256],
+ [1, 16],
+ [1, 7],
+ [1, 7],
+ [16, 49, 49],
+ [16, 64, 64],
+ [1],
+ [1],
+ [4, 49, 49],
+ [4, 64, 64],
+ [64, 49, 49],
+ [64, 64, 64],
+ ],
+ "scalar": [1, 0, 1, 50256, 0, 0, 1, 50256, 0, 0, 0, 0],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
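+
+# Note: the parameter lists above are expanded combinatorially when vectors are
+# generated (every shape runs against every scalar), so repeated entries only
+# add duplicate vectors. With inputs drawn from [-100, 100] in run() below, a
+# scalar like 50256 exercises the all-false path of ttnn.eq.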
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the results it returns will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ scalar,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.eq)
+ torch_output_tensor = golden_function(torch_input_tensor_a, scalar)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.eq(input_tensor_a, scalar, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/floor_divide/floor_divide_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/binary/floor_divide/floor_divide_pytorch2.py
new file mode 100644
index 00000000000..efb9b66b64b
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/binary/floor_divide/floor_divide_pytorch2.py
@@ -0,0 +1,97 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [128],
+ ],
+ "scalar": [
+ 2,
+ ],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
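+# Defining a generator named mesh_device_fixture overrides the infra's default
+# device: code before the yield is setup, the yielded (device, name) pair
+# supplies the device handle and a human-readable label, and code after the
+# yield runs as teardown. Here it also gates the suite to Wormhole_B0 hardware.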
+def mesh_device_fixture():
+ device = ttnn.open_device(device_id=0)
+ assert ttnn.device.is_wormhole_b0(device), "This op is available for Wormhole_B0 only"
+ yield (device, "Wormhole_B0")
+ ttnn.close_device(device)
+ del device
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the results it returns will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ scalar,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor_b = torch.tensor(scalar, dtype=torch.float32)
+
+ golden_function = ttnn.get_golden_function(ttnn.floor_div)
+ torch_output_tensor = golden_function(torch_input_tensor_a, torch_input_tensor_b)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+ input_tensor_b = ttnn.from_torch(
+ torch_input_tensor_b,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.floor_div(input_tensor_a, input_tensor_b, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
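+
+
+# CPU-side sanity sketch (illustrative only, not executed by the sweep infra):
+# the golden function is expected to follow torch.floor_divide semantics,
+# rounding toward negative infinity.
+if __name__ == "__main__":
+ golden = ttnn.get_golden_function(ttnn.floor_div)
+ a = torch.tensor([7.0, -7.0])
+ b = torch.tensor(2.0)
+ print(golden(a, b))  # expected: tensor([ 3., -4.])
+ print(torch.floor_divide(a, b))  # reference: tensor([ 3., -4.])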
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/gt/gt_scalar_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/binary/gt/gt_scalar_pytorch2.py
new file mode 100644
index 00000000000..0910775d42a
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/binary/gt/gt_scalar_pytorch2.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [10, 10],
+ [15, 15],
+ [],
+ ],
+ "scalar": [0, 0],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the results it returns will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ scalar,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.gt)
+ torch_output_tensor = golden_function(torch_input_tensor_a, scalar)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.gt(input_tensor_a, scalar, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/le/le_tensor_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/binary/le/le_tensor_pytorch2.py
new file mode 100644
index 00000000000..d119ced4892
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/binary/le/le_tensor_pytorch2.py
@@ -0,0 +1,88 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape_a": [[1, 1, 1]],
+ "input_shape_b": [[1, 1, 1]],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_b_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_b_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the results it returns will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape_a,
+ input_shape_b,
+ input_a_dtype,
+ input_b_dtype,
+ input_a_layout,
+ input_b_layout,
+ input_a_memory_config,
+ input_b_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape_a)
+ torch_input_tensor_b = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype
+ )(input_shape_b)
+
+ golden_function = ttnn.get_golden_function(ttnn.le)
+ torch_output_tensor = golden_function(torch_input_tensor_a, torch_input_tensor_b)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+ input_tensor_b = ttnn.from_torch(
+ torch_input_tensor_b,
+ dtype=input_b_dtype,
+ layout=input_b_layout,
+ device=device,
+ memory_config=input_b_memory_config,
+ )
+ start_time = start_measuring_time()
+ result = ttnn.le(input_tensor_a, input_tensor_b)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor), e2e_perf]
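+
+
+# CPU-side sanity sketch (illustrative only, not executed by the sweep infra):
+# the golden function for ttnn.le is expected to behave like elementwise torch.le.
+if __name__ == "__main__":
+ golden = ttnn.get_golden_function(ttnn.le)
+ a = torch.tensor([1.0, 2.0, 3.0])
+ b = torch.tensor([2.0, 2.0, 1.0])
+ print(golden(a, b))  # expected truth values: [True, True, False]
+ print(torch.le(a, b))  # reference: tensor([ True,  True, False])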
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/multiply/mul_tensor_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/binary/multiply/mul_tensor_pytorch2.py
new file mode 100644
index 00000000000..37c1414c713
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/binary/multiply/mul_tensor_pytorch2.py
@@ -0,0 +1,472 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ {"self": [0], "other": 0.5},
+ {"self": [1, 1, 1, 10], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 12], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 14], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 15], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 17], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 1], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 201], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 2048], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 256], "other": -3.3895313892515355e38},
+ {"self": [1, 1, 1, 25], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 2], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 42], "other": -0.75},
+ {"self": [1, 1, 1, 42], "other": 1.25},
+ {"self": [1, 1, 1, 42], "other": 1.9761904761904763},
+ {"self": [1, 1, 1, 5], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 6], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1, 7], "other": -3.3895313892515355e38},
+ {"self": [1, 1, 1, 8], "other": -3.3895313892515355e38},
+ {"self": [1, 1, 1, 9], "other": -3.4028234663852886e38},
+ {"self": [1, 1, 1024], "other": 0.03125},
+ {"self": [1, 1, 1024], "other": 0.044715},
+ {"self": [1, 1, 1024], "other": 0.125},
+ {"self": [1, 1, 1024], "other": 0.5},
+ {"self": [1, 1, 1024], "other": 0.7978845608028654},
+ {"self": [1, 1, 224, 224], "other": 0.448},
+ {"self": [1, 1, 224, 224], "other": 0.45},
+ {"self": [1, 1, 224, 224], "other": 0.458},
+ {"self": [1, 1, 256], "other": 1.0},
+ {"self": [1, 1, 3072], "other": 0.044715},
+ {"self": [1, 1, 3072], "other": 0.5},
+ {"self": [1, 1, 3072], "other": 0.7978845608028654},
+ {"self": [1, 1, 32, 1], "other": -0.75},
+ {"self": [1, 1, 32, 1], "other": 1.25},
+ {"self": [1, 1, 32, 1], "other": 1.5625},
+ {"self": [1, 1, 4096], "other": 0.044715},
+ {"self": [1, 1, 4096], "other": 0.5},
+ {"self": [1, 1, 4096], "other": 0.7978845608028654},
+ {"self": [1, 1, 480, 640], "other": 10.0},
+ {"self": [1, 1, 512], "other": 0.04419417382415922},
+ {"self": [1, 1, 768], "other": 0.03608439182435161},
+ {"self": [1, 1, 768], "other": 0.125},
+ {"self": [1, 12, 3072], "other": 0.044715},
+ {"self": [1, 12, 3072], "other": 0.5},
+ {"self": [1, 12, 3072], "other": 0.7978845608028654},
+ {"self": [1, 12, 64, 64], "other": 16.0},
+ {"self": [1, 14, 3072], "other": 0.044715},
+ {"self": [1, 14, 3072], "other": 0.5},
+ {"self": [1, 14, 3072], "other": 0.7978845608028654},
+ {"self": [1, 15, 1024], "other": 0.044715},
+ {"self": [1, 15, 1024], "other": 0.5},
+ {"self": [1, 15, 1024], "other": 0.7978845608028654},
+ {"self": [1, 16, 64, 64], "other": 16.0},
+ {"self": [1, 160], "other": 1.0},
+ {"self": [1, 19, 1024], "other": 0.125},
+ {"self": [1, 19, 1024], "other": 32.0},
+ {"self": [1, 1], "other": 0.0},
+ {"self": [1, 1], "other": 16.0},
+ {"self": [1, 1], "other": 50258.0},
+ {"self": [1, 1], "other": 50259.0},
+ {"self": [1, 1], "other": 50359.0},
+ {"self": [1, 1], "other": 50363.0},
+ {"self": [1, 23, 40], "other": 6.283185307179586},
+ {"self": [1, 24, 49, 32], "other": 0.1767766952966369},
+ {"self": [1, 24, 64, 64], "other": 16.0},
+ {"self": [1, 24, 768], "other": 0.125},
+ {"self": [1, 3, 16, 16, 2], "other": 2.0},
+ {"self": [1, 3, 32, 32, 2], "other": 2.0},
+ {"self": [1, 3, 64, 64, 2], "other": 2.0},
+ {"self": [1, 3, 64, 64], "other": 16.0},
+ {"self": [1, 32, 49, 32], "other": 0.1767766952966369},
+ {"self": [1, 32, 6144], "other": 0.044715},
+ {"self": [1, 32, 6144], "other": 0.5},
+ {"self": [1, 32, 6144], "other": 0.79788456},
+ {"self": [1, 32, 64, 64], "other": 16.0},
+ {"self": [1, 4, 64, 64], "other": 16.0},
+ {"self": [1, 45, 3072], "other": 0.044715},
+ {"self": [1, 45, 3072], "other": 0.5},
+ {"self": [1, 45, 3072], "other": 0.7978845608028654},
+ {"self": [1, 5, 4096], "other": 0.044715},
+ {"self": [1, 5, 4096], "other": 0.5},
+ {"self": [1, 5, 4096], "other": 0.7978845608028654},
+ {"self": [1, 50, 3072], "other": 1.702},
+ {"self": [1, 50, 768], "other": 0.125},
+ {"self": [1, 59, 1024], "other": 0.125},
+ {"self": [1, 6, 64, 64], "other": 16.0},
+ {"self": [1, 7, 3072], "other": 0.044715},
+ {"self": [1, 7, 3072], "other": 0.5},
+ {"self": [1, 7, 3072], "other": 0.7978845608028654},
+ {"self": [1, 8, 64, 64], "other": 16.0},
+ {"self": [1, 9, 128], "other": 0.044715},
+ {"self": [1, 9, 128], "other": 0.5},
+ {"self": [1, 9, 128], "other": 0.7978845608028654},
+ {"self": [1, 9, 16384], "other": 0.044715},
+ {"self": [1, 9, 16384], "other": 0.5},
+ {"self": [1, 9, 16384], "other": 0.7978845608028654},
+ {"self": [1, 9, 3072], "other": 0.044715},
+ {"self": [1, 9, 3072], "other": 0.5},
+ {"self": [1, 9, 3072], "other": 0.7978845608028654},
+ {"self": [1, 9, 4096], "other": 0.044715},
+ {"self": [1, 9, 4096], "other": 0.5},
+ {"self": [1, 9, 4096], "other": 0.7978845608028654},
+ {"self": [1, 9, 8192], "other": 0.044715},
+ {"self": [1, 9, 8192], "other": 0.5},
+ {"self": [1, 9, 8192], "other": 0.7978845608028654},
+ {"self": [10, 10], "other": 16.0},
+ {"self": [10, 10], "other": 8.0},
+ {"self": [100], "other": 0.5},
+ {"self": [1066], "other": 0.600375234521576},
+ {"self": [120], "other": 0.5},
+ {"self": [128], "other": 0.125},
+ {"self": [128], "other": 0.25},
+ {"self": [128], "other": 0.5},
+ {"self": [128], "other": 1.0},
+ {"self": [128], "other": 2.0},
+ {"self": [12], "other": 32.0},
+ {"self": [136], "other": 0.5},
+ {"self": [14], "other": 0.5},
+ {"self": [15, 15], "other": 16.0},
+ {"self": [15, 15], "other": 8.0},
+ {"self": [16, 6, 49, 32], "other": 0.1767766952966369},
+ {"self": [16, 8, 49, 32], "other": 0.1767766952966369},
+ {"self": [160], "other": -9.210340371976184},
+ {"self": [160], "other": 0.5},
+ {"self": [16], "other": 0.5},
+ {"self": [16], "other": 32.0},
+ {"self": [17, 17], "other": 16.0},
+ {"self": [2, 2], "other": 16.0},
+ {"self": [2, 7, 2048], "other": 1.702},
+ {"self": [2, 7, 512], "other": 0.125},
+ {"self": [23], "other": 31.304347826086957},
+ {"self": [240], "other": 0.5},
+ {"self": [28], "other": 0.25},
+ {"self": [28], "other": 0.5},
+ {"self": [300], "other": 1.6},
+ {"self": [300], "other": 2.1333333333333333},
+ {"self": [30], "other": 0.5},
+ {"self": [320], "other": 0.5},
+ {"self": [320], "other": 1.0},
+ {"self": [320], "other": 1.5},
+ {"self": [320], "other": 2.0},
+ {"self": [3234, 2], "other": 0.5},
+ {"self": [3234], "other": 0.5},
+ {"self": [32], "other": 0.5},
+ {"self": [4, 12, 49, 32], "other": 0.1767766952966369},
+ {"self": [4, 16, 49, 32], "other": 0.1767766952966369},
+ {"self": [40], "other": 0.5},
+ {"self": [40], "other": 32.0},
+ {"self": [480], "other": 0.5},
+ {"self": [50], "other": 0.5},
+ {"self": [56], "other": 0.125},
+ {"self": [56], "other": 0.25},
+ {"self": [56], "other": 0.5},
+ {"self": [60], "other": 0.5},
+ {"self": [64, 3, 49, 32], "other": 0.1767766952966369},
+ {"self": [64, 4, 49, 32], "other": 0.1767766952966369},
+ {"self": [640], "other": 0.5},
+ {"self": [64], "other": 0.5},
+ {"self": [68], "other": 0.5},
+ {"self": [7], "other": 0.42857142857142855},
+ {"self": [800], "other": 0.6},
+ {"self": [80], "other": 0.5},
+ {"self": [8732, 2], "other": 0.5},
+ {"self": [8732], "other": 0.5},
+ # vec other
+ {"self": [0, 1], "other": [0, 1]},
+ {"self": [0], "other": []},
+ {"self": [1, 1, 1, 17], "other": [1, 1, 1, 17]},
+ {"self": [1, 1, 1, 1], "other": [1, 1, 1, 1]},
+ {"self": [1, 1, 1, 2], "other": [1, 1, 1, 2]},
+ {"self": [1, 1, 1, 42], "other": [1, 1, 1, 42]},
+ {"self": [1, 1, 1024], "other": [1, 1, 1024]},
+ {"self": [1, 1, 1024], "other": [1, 1, 1]},
+ {"self": [1, 1, 16, 32], "other": [1, 1, 1, 32]},
+ {"self": [1, 1, 3072], "other": [1, 1, 3072]},
+ {"self": [1, 1, 32, 1], "other": [1, 1, 32, 1]},
+ {"self": [1, 1, 4096], "other": [1, 1, 4096]},
+ {"self": [1, 1, 512], "other": [1, 1, 1]},
+ {"self": [1, 1, 7, 64], "other": [1, 1, 7, 64]},
+ {"self": [1, 1, 768], "other": [1, 1, 1]},
+ {"self": [1, 10, 1024], "other": [1, 10, 1]},
+ {"self": [1, 10, 512], "other": [1, 10, 1]},
+ {"self": [1, 10, 768], "other": [1, 10, 1]},
+ {"self": [1, 1024, 1, 1], "other": [1, 1024, 1, 1]},
+ {"self": [1, 1024, 2560], "other": [1, 1024, 2560]},
+ {"self": [1, 1024, 45, 80], "other": [1, 1024, 1, 1]},
+ {"self": [1, 1024, 50, 68], "other": [1, 1024, 1, 1]},
+ {"self": [1, 1024, 7, 7], "other": [1, 1024, 1, 1]},
+ {"self": [1, 104, 1, 1], "other": [1, 104, 28, 28]},
+ {"self": [1, 1056, 1, 1], "other": [1, 1056, 48, 48]},
+ {"self": [1, 10], "other": [1, 10]},
+ {"self": [1, 12, 3072], "other": [1, 12, 3072]},
+ {"self": [1, 120, 1, 1], "other": [1, 120, 14, 14]},
+ {"self": [1, 120, 1, 1], "other": [1, 120, 28, 28]},
+ {"self": [1, 120, 1, 1], "other": [1, 120, 40, 40]},
+ {"self": [1, 120, 28, 28], "other": [1, 120, 1, 1]},
+ {"self": [1, 120, 28, 28], "other": [1, 120, 28, 28]},
+ {"self": [1, 1232, 1, 1], "other": [1, 1232, 14, 14]},
+ {"self": [1, 128, 1, 1], "other": [1, 128, 1, 1]},
+ {"self": [1, 128, 100, 136], "other": [1, 128, 1, 1]},
+ {"self": [1, 128, 180, 320], "other": [1, 128, 1, 1]},
+ {"self": [1, 128, 200, 272], "other": [1, 128, 1, 1]},
+ {"self": [1, 128, 90, 160], "other": [1, 128, 1, 1]},
+ {"self": [1, 1392, 1, 1], "other": [1, 1392, 14, 14]},
+ {"self": [1, 14, 3072], "other": [1, 14, 3072]},
+ {"self": [1, 144, 1, 1], "other": [1, 144, 14, 14]},
+ {"self": [1, 144, 1, 1], "other": [1, 144, 28, 28]},
+ {"self": [1, 15, 1024], "other": [1, 15, 1024]},
+ {"self": [1, 15, 512], "other": [1, 15, 1]},
+ {"self": [1, 1512, 1, 1], "other": [1, 1512, 7, 7]},
+ {"self": [1, 16, 1, 1], "other": [1, 16, 56, 56]},
+ {"self": [1, 184, 14, 14], "other": [1, 184, 14, 14]},
+ {"self": [1, 192, 32, 42], "other": [1, 1, 1, 42]},
+ {"self": [1, 192, 32, 42], "other": [1, 1, 32, 1]},
+ {"self": [1, 1], "other": [1, 160]},
+ {"self": [1, 200, 14, 14], "other": [1, 200, 14, 14]},
+ {"self": [1, 2016, 1, 1], "other": [1, 2016, 7, 7]},
+ {"self": [1, 2048, 1, 1], "other": [1, 2048, 1, 1]},
+ {"self": [1, 2048, 23, 40], "other": [1, 2048, 1, 1]},
+ {"self": [1, 2048, 25, 34], "other": [1, 2048, 1, 1]},
+ {"self": [1, 208, 1, 1], "other": [1, 208, 14, 14]},
+ {"self": [1, 216, 1, 1], "other": [1, 216, 28, 28]},
+ {"self": [1, 224, 1, 1], "other": [1, 224, 56, 56]},
+ {"self": [1, 232, 1, 1], "other": [1, 232, 56, 56]},
+ {"self": [1, 24, 64, 64], "other": [24, 1, 1]},
+ {"self": [1, 240, 1, 1], "other": [1, 240, 14, 14]},
+ {"self": [1, 240, 28, 28], "other": [1, 240, 28, 28]},
+ {"self": [1, 256, 1, 1], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 100, 136], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 128, 128], "other": [128, 1]},
+ {"self": [1, 256, 128, 128], "other": [128]},
+ {"self": [1, 256, 180, 320], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 200, 272], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 45, 80], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 50, 68], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 5120], "other": [1, 256, 5120]},
+ {"self": [1, 256, 56, 56], "other": [1, 256, 1, 1]},
+ {"self": [1, 256, 90, 160], "other": [1, 256, 1, 1]},
+ {"self": [1, 288, 1, 1], "other": [1, 288, 7, 7]},
+ {"self": [1, 2904, 1, 1], "other": [1, 2904, 24, 24]},
+ {"self": [1, 3, 16, 16, 2], "other": [1, 3, 16, 16, 2]},
+ {"self": [1, 3, 16, 16, 2], "other": []},
+ {"self": [1, 3, 300, 300], "other": [300, 1]},
+ {"self": [1, 3, 300, 300], "other": [300]},
+ {"self": [1, 3, 32, 32, 2], "other": [1, 3, 32, 32, 2]},
+ {"self": [1, 3, 32, 32, 2], "other": []},
+ {"self": [1, 3, 320, 320], "other": [320, 1]},
+ {"self": [1, 3, 320, 320], "other": [320]},
+ {"self": [1, 3, 64, 64, 2], "other": [1, 3, 64, 64, 2]},
+ {"self": [1, 3, 64, 64, 2], "other": []},
+ {"self": [1, 3, 800, 1066], "other": [1066]},
+ {"self": [1, 3, 800, 1066], "other": [800, 1]},
+ {"self": [1, 3024, 1, 1], "other": [1, 3024, 7, 7]},
+ {"self": [1, 32, 6144], "other": [1, 32, 6144]},
+ {"self": [1, 32, 64, 64], "other": [32, 1, 1]},
+ {"self": [1, 320, 1, 1], "other": [1, 320, 14, 14]},
+ {"self": [1, 32], "other": [1, 32]},
+ {"self": [1, 336, 1, 1], "other": [1, 336, 14, 14]},
+ {"self": [1, 3712, 1, 1], "other": [1, 3712, 7, 7]},
+ {"self": [1, 4096, 1280], "other": [1, 4096, 1280]},
+ {"self": [1, 440, 1, 1], "other": [1, 440, 7, 7]},
+ {"self": [1, 448, 1, 1], "other": [1, 448, 28, 28]},
+ {"self": [1, 45, 3072], "other": [1, 45, 3072]},
+ {"self": [1, 48, 1, 1], "other": [1, 48, 56, 56]},
+ {"self": [1, 480, 1, 1], "other": [1, 480, 10, 10]},
+ {"self": [1, 480, 1, 1], "other": [1, 480, 14, 14]},
+ {"self": [1, 480, 1, 1], "other": [1, 480, 20, 20]},
+ {"self": [1, 480, 14, 14], "other": [1, 480, 1, 1]},
+ {"self": [1, 480, 14, 14], "other": [1, 480, 14, 14]},
+ {"self": [1, 5, 16, 32], "other": [1, 5, 1, 32]},
+ {"self": [1, 5, 4096], "other": [1, 5, 4096]},
+ {"self": [1, 50, 3072], "other": [1, 50, 3072]},
+ {"self": [1, 512, 1, 1], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 1, 1], "other": [1, 512, 38, 38]},
+ {"self": [1, 512, 100, 136], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 23, 40], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 25, 34], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 28, 28], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 45, 80], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 50, 68], "other": [1, 512, 1, 1]},
+ {"self": [1, 512, 90, 160], "other": [1, 512, 1, 1]},
+ {"self": [1, 528, 1, 1], "other": [1, 528, 96, 96]},
+ {"self": [1, 576, 1, 1], "other": [1, 576, 14, 14]},
+ {"self": [1, 576, 1, 1], "other": [1, 576, 7, 7]},
+ {"self": [1, 59], "other": [1, 59]},
+ {"self": [1, 60], "other": [1, 60]},
+ {"self": [1, 64, 1, 1], "other": [1, 64, 1, 1]},
+ {"self": [1, 64, 1, 1], "other": [1, 64, 56, 56]},
+ {"self": [1, 64, 120, 160], "other": [1, 1, 120, 160]},
+ {"self": [1, 64, 120, 160], "other": [120, 1]},
+ {"self": [1, 64, 120, 160], "other": [160]},
+ {"self": [1, 64, 180, 320], "other": [1, 64, 1, 1]},
+ {"self": [1, 64, 200, 272], "other": [1, 64, 1, 1]},
+ {"self": [1, 64, 240, 320], "other": [240, 1]},
+ {"self": [1, 64, 240, 320], "other": [320]},
+ {"self": [1, 64, 30, 40], "other": [1, 1, 30, 40]},
+ {"self": [1, 64, 30, 40], "other": [30, 1]},
+ {"self": [1, 64, 30, 40], "other": [40]},
+ {"self": [1, 64, 360, 640], "other": [1, 64, 1, 1]},
+ {"self": [1, 64, 400, 544], "other": [1, 64, 1, 1]},
+ {"self": [1, 64, 480, 640], "other": [480, 1]},
+ {"self": [1, 64, 480, 640], "other": [640]},
+ {"self": [1, 64, 5120], "other": [1, 64, 5120]},
+ {"self": [1, 64, 60, 80], "other": [1, 1, 60, 80]},
+ {"self": [1, 64, 60, 80], "other": [60, 1]},
+ {"self": [1, 64, 60, 80], "other": [80]},
+ {"self": [1, 672, 1, 1], "other": [1, 672, 10, 10]},
+ {"self": [1, 672, 1, 1], "other": [1, 672, 14, 14]},
+ {"self": [1, 672, 1, 1], "other": [1, 672, 20, 20]},
+ {"self": [1, 672, 1, 1], "other": [1, 672, 7, 7]},
+ {"self": [1, 672, 14, 14], "other": [1, 672, 1, 1]},
+ {"self": [1, 672, 14, 14], "other": [1, 672, 14, 14]},
+ {"self": [1, 672, 7, 7], "other": [1, 672, 1, 1]},
+ {"self": [1, 696, 1, 1], "other": [1, 696, 28, 28]},
+ {"self": [1, 7, 3072], "other": [1, 7, 3072]},
+ {"self": [1, 71, 7, 64], "other": [1, 1, 7, 64]},
+ {"self": [1, 72, 1, 1], "other": [1, 72, 28, 28]},
+ {"self": [1, 72, 1, 1], "other": [1, 72, 40, 40]},
+ {"self": [1, 72, 1, 1], "other": [1, 72, 56, 56]},
+ {"self": [1, 72, 28, 28], "other": [1, 72, 1, 1]},
+ {"self": [1, 72, 56, 56], "other": [1, 72, 56, 56]},
+ {"self": [1, 7392, 1, 1], "other": [1, 7392, 12, 12]},
+ {"self": [1, 768, 14, 14], "other": [1, 768, 1, 1]},
+ {"self": [1, 784, 1, 1], "other": [1, 784, 7, 7]},
+ {"self": [1, 888, 1, 1], "other": [1, 888, 7, 7]},
+ {"self": [1, 896, 1, 1], "other": [1, 896, 14, 14]},
+ {"self": [1, 9, 128], "other": [1, 9, 128]},
+ {"self": [1, 9, 16384], "other": [1, 9, 16384]},
+ {"self": [1, 9, 3072], "other": [1, 9, 3072]},
+ {"self": [1, 9, 4096], "other": [1, 9, 4096]},
+ {"self": [1, 9, 8192], "other": [1, 9, 8192]},
+ {"self": [1, 96, 1, 1], "other": [1, 96, 14, 14]},
+ {"self": [1, 960, 1, 1], "other": [1, 960, 7, 7]},
+ {"self": [1, 960, 7, 7], "other": [1, 960, 1, 1]},
+ {"self": [1, 960, 7, 7], "other": [1, 960, 7, 7]},
+ {"self": [100], "other": []},
+ {"self": [1024], "other": [1, 1, 1024]},
+ {"self": [1024], "other": [1, 10, 1024]},
+ {"self": [1024], "other": [1, 197, 1024]},
+ {"self": [12], "other": []},
+ {"self": [136], "other": []},
+ {"self": [13], "other": []},
+ {"self": [16, 1], "other": [1, 1, 32]},
+ {"self": [16, 6, 64, 64], "other": [6, 1, 1]},
+ {"self": [16, 8, 64, 64], "other": [8, 1, 1]},
+ {"self": [17], "other": []},
+ {"self": [1], "other": [1]},
+ {"self": [2, 1], "other": []},
+ {"self": [2, 7, 2048], "other": [2, 7, 2048]},
+ {"self": [25], "other": []},
+ {"self": [300], "other": []},
+ {"self": [3234, 1], "other": [3234, 1]},
+ {"self": [3234, 2], "other": [2]},
+ {"self": [34], "other": []},
+ {"self": [4, 12, 64, 64], "other": [12, 1, 1]},
+ {"self": [4, 16, 64, 64], "other": [16, 1, 1]},
+ {"self": [50], "other": []},
+ {"self": [512], "other": [1, 1, 512]},
+ {"self": [512], "other": [1, 10, 512]},
+ {"self": [512], "other": [1, 15, 512]},
+ {"self": [64, 3, 64, 64], "other": [3, 1, 1]},
+ {"self": [64, 4, 64, 64], "other": [4, 1, 1]},
+ {"self": [68], "other": []},
+ {"self": [768], "other": [1, 1, 768]},
+ {"self": [768], "other": [1, 10, 768]},
+ {"self": [768], "other": [1, 197, 768]},
+ {"self": [7], "other": []},
+ {"self": [8732, 1], "other": [8732, 1]},
+ {"self": [8732, 2], "other": [2]},
+ {"self": [9], "other": []},
+ {"self": [], "other": [0, 1]},
+ {"self": [], "other": [1, 1, 768]},
+ {"self": [], "other": [1, 24, 768]},
+ {"self": [], "other": [3234, 1]},
+ {"self": [], "other": [8732, 1]},
+ ],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_b_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_b_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the results it returns will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_b_dtype,
+ input_a_layout,
+ input_b_layout,
+ input_a_memory_config,
+ input_b_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape["self"])
+
+ if isinstance(input_shape["other"], list):
+ torch_input_tensor_b = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype
+ )(input_shape["other"])
+ else:
+ torch_input_tensor_b = torch.tensor(input_shape["other"], dtype=torch.float32)
+
+ golden_function = ttnn.get_golden_function(ttnn.mul)
+ torch_output_tensor = golden_function(torch_input_tensor_a, torch_input_tensor_b)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ input_tensor_b = ttnn.from_torch(
+ torch_input_tensor_b,
+ dtype=input_b_dtype,
+ layout=input_b_layout,
+ device=device,
+ memory_config=input_b_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.mul(input_tensor_a, input_tensor_b)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, pcc=0.99), e2e_perf]
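For readers skimming this PR: each suite above is expanded into individual test vectors before `run` is ever called, one vector per combination of shape, dtype, layout, and memory config. A minimal sketch of that expansion, assuming a plain cartesian product (the actual runner in `tests/sweep_framework` also handles vector IDs, serialization, and timeouts; `expand_suite` below is a hypothetical helper, not framework API):

```python
from itertools import product

def expand_suite(suite: dict) -> list[dict]:
    # Cartesian product over every parameter list in the suite.
    keys = list(suite.keys())
    return [dict(zip(keys, combo)) for combo in product(*suite.values())]

# Each dict in the result maps directly onto run()'s keyword arguments.
```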
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/remainder/remainder_scalar_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/binary/remainder/remainder_scalar_pytorch2.py
new file mode 100644
index 00000000000..5ac35667ba3
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/binary/remainder/remainder_scalar_pytorch2.py
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1],
+ ],
+ "scalar": [7],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ scalar,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.remainder)
+ torch_output_tensor = golden_function(torch_input_tensor_a, scalar)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.remainder(input_tensor_a, scalar, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
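One behavioural note for reading PCC failures in this suite: the golden path follows Python modulo semantics, where the result takes the sign of the divisor, unlike C-style `fmod`. A quick torch illustration:

```python
import torch

x = torch.tensor([-7.5, 7.5])
print(torch.remainder(x, 7))  # tensor([6.5000, 0.5000]) -- sign follows the divisor
print(torch.fmod(x, 7))       # tensor([-0.5000, 0.5000]) -- sign follows the dividend
```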
diff --git a/tests/sweep_framework/sweeps/eltwise/composite/binary/maximum/maximum_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/composite/binary/maximum/maximum_pytorch2.py
new file mode 100644
index 00000000000..55401a3e5ff
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/composite/binary/maximum/maximum_pytorch2.py
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ {"shape1": [1, 16, 1, 60], "shape2": []},
+ # {"shape1": [1,16,s10+1], "shape2": []},
+ {"shape1": [1, 16, 19, 19], "shape2": []},
+ {"shape1": [1, 16, 59, 59], "shape2": []},
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_b_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_b_dtype,
+ input_a_layout,
+ input_b_layout,
+ input_a_memory_config,
+ input_b_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape["shape1"])
+
+ torch_input_tensor_b = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype
+ )(input_shape["shape2"])
+
+ golden_function = ttnn.get_golden_function(ttnn.maximum)
+ torch_output_tensor = golden_function(torch_input_tensor_a, torch_input_tensor_b)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ input_tensor_b = ttnn.from_torch(
+ torch_input_tensor_b,
+ dtype=input_b_dtype,
+ layout=input_b_layout,
+ device=device,
+ memory_config=input_b_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.maximum(input_tensor_a, input_tensor_b, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
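The `"shape2": []` entries above yield zero-dimensional tensors, so the second operand broadcasts as a scalar against `shape1`. A small demonstration of the broadcasting the golden comparison relies on:

```python
import torch

a = torch.randn(1, 16, 19, 19)
b = torch.randn(())  # shape [], a 0-d scalar tensor
assert torch.maximum(a, b).shape == (1, 16, 19, 19)
```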
diff --git a/tests/sweep_framework/sweeps/eltwise/composite/binary/minimum/minimum_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/composite/binary/minimum/minimum_pytorch2.py
new file mode 100644
index 00000000000..22b185e425f
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/composite/binary/minimum/minimum_pytorch2.py
@@ -0,0 +1,98 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 1],
+ [10, 10],
+ [15, 15],
+ [17, 17],
+ # [s0+1, s0+1],
+ [2, 2],
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_b_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_b_dtype,
+ input_a_layout,
+ input_b_layout,
+ input_a_memory_config,
+ input_b_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor_b = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.minimum)
+ torch_output_tensor = golden_function(torch_input_tensor_a, torch_input_tensor_b)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ input_tensor_b = ttnn.from_torch(
+ torch_input_tensor_b,
+ dtype=input_b_dtype,
+ layout=input_b_layout,
+ device=device,
+ memory_config=input_b_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.minimum(input_tensor_a, input_tensor_b, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_pytorch2.py
new file mode 100644
index 00000000000..3f25722d19f
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_pytorch2.py
@@ -0,0 +1,104 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ {"shape": [1, 1, 1024], "exponent": 2},
+ {"shape": [1, 1, 1024], "exponent": 3.0},
+ {"shape": [1, 1, 3072], "exponent": 3.0},
+ {"shape": [1, 1, 4096], "exponent": 3.0},
+ {"shape": [1, 1, 512], "exponent": 2},
+ {"shape": [1, 1, 768], "exponent": 2},
+ {"shape": [1, 10, 1024], "exponent": 2},
+ {"shape": [1, 10, 512], "exponent": 2},
+ {"shape": [1, 10, 768], "exponent": 2},
+ {"shape": [1, 12, 3072], "exponent": 3.0},
+ {"shape": [1, 14, 3072], "exponent": 3.0},
+ {"shape": [1, 15, 1024], "exponent": 3.0},
+ {"shape": [1, 15, 512], "exponent": 2},
+ {"shape": [1, 3, 16, 16, 2], "exponent": 2},
+ {"shape": [1, 3, 32, 32, 2], "exponent": 2},
+ {"shape": [1, 3, 64, 64, 2], "exponent": 2},
+ {"shape": [1, 45, 3072], "exponent": 3.0},
+ {"shape": [1, 5, 4096], "exponent": 3.0},
+ {"shape": [1, 7, 3072], "exponent": 3.0},
+ {"shape": [1, 9, 128], "exponent": 3.0},
+ {"shape": [1, 9, 16384], "exponent": 3.0},
+ {"shape": [1, 9, 3072], "exponent": 3.0},
+ {"shape": [1, 9, 4096], "exponent": 3.0},
+ {"shape": [1, 9, 8192], "exponent": 3.0},
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_b_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_b_dtype,
+ input_a_layout,
+ input_b_layout,
+ input_a_memory_config,
+ input_b_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape["shape"])
+
+ value = input_shape["exponent"]
+ torch_output_tensor = torch.pow(torch_input_tensor_a, value)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.pow(input_tensor_a, exponent=value, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
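Every exponent in this suite is integral (2 or 3.0), which keeps `pow` well-defined on the negative half of the [-100, 100] input range; a fractional exponent would produce NaNs and sink the PCC comparison:

```python
import torch

print(torch.pow(torch.tensor(-2.0), 3.0))  # tensor(-8.) -- integral exponent is fine
print(torch.pow(torch.tensor(-2.0), 0.5))  # tensor(nan) -- fractional exponent on a negative base
```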
diff --git a/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_scalar_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_scalar_pytorch2.py
new file mode 100644
index 00000000000..8dfb1bd1dc8
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_scalar_pytorch2.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ {"value": 10000, "shape": [128]},
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_b_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_b_dtype,
+ input_a_layout,
+ input_b_layout,
+ input_a_memory_config,
+ input_b_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape["shape"])
+
+ value = input_shape["value"]
+ golden_function = ttnn.get_golden_function(ttnn.pow)
+ # Match the device op below, which raises the scalar base to the tensor exponent.
+ torch_output_tensor = golden_function(value, torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.pow(value, exponent=input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
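This suite covers the scalar-base form of `pow` (a number raised to a tensor exponent), the shape PyTorch 2 typically traces out of sinusoidal and rotary position embeddings. A sketch of that kind of source computation, with illustrative names only:

```python
import torch

dim = 128  # illustrative embedding dimension
exponents = torch.arange(0, dim, 2, dtype=torch.float32) / dim
inv_freq = 1.0 / torch.pow(10000, exponents)  # scalar base, tensor exponent
```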
diff --git a/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_tensor_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_tensor_pytorch2.py
new file mode 100644
index 00000000000..911d33e7df5
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_tensor_pytorch2.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ {"shape1": [], "shape2": [16]},
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_b_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_b_dtype,
+ input_a_layout,
+ input_b_layout,
+ input_a_memory_config,
+ input_b_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape["shape1"])
+
+ torch_input_tensor_b = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype
+ )(input_shape["shape2"])
+
+ torch_output_tensor = torch.pow(torch_input_tensor_a, torch_input_tensor_b)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ input_tensor_b = ttnn.from_torch(
+ torch_input_tensor_b,
+ dtype=input_b_dtype,
+ layout=input_b_layout,
+ device=device,
+ memory_config=input_b_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.pow(input_tensor_a, input_tensor_b, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/ternary/lerp.py b/tests/sweep_framework/sweeps/eltwise/ternary/lerp.py
new file mode 100644
index 00000000000..c69560f3474
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/ternary/lerp.py
@@ -0,0 +1,112 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 360
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 8)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 8)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 8),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_b_layout": [ttnn.TILE_LAYOUT],
+ "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_c_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_c_layout": [ttnn.TILE_LAYOUT],
+ "input_c_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ input_b_dtype,
+ input_b_layout,
+ input_b_memory_config,
+ input_c_dtype,
+ input_c_layout,
+ input_c_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor_b = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype
+ )(input_shape)
+
+ torch_input_tensor_c = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_c_dtype
+ )(input_shape)
+
+ torch_output_tensor = torch.lerp(torch_input_tensor_a, torch_input_tensor_b, torch_input_tensor_c)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ input_tensor_b = ttnn.from_torch(
+ torch_input_tensor_b,
+ dtype=input_b_dtype,
+ layout=input_b_layout,
+ device=device,
+ memory_config=input_b_memory_config,
+ )
+
+ input_tensor_c = ttnn.from_torch(
+ torch_input_tensor_c,
+ dtype=input_c_dtype,
+ layout=input_c_layout,
+ device=device,
+ memory_config=input_c_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.lerp(input_tensor_a, input_tensor_b, input_tensor_c, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
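For reference when reading the golden path: `torch.lerp(a, b, w)` computes `a + w * (b - a)` elementwise, so with weights sampled from [-100, 100] the test exercises extrapolation well beyond the endpoints, not just interpolation:

```python
import torch

a, b, w = torch.tensor(1.0), torch.tensor(3.0), torch.tensor(0.25)
assert torch.lerp(a, b, w) == a + w * (b - a)  # tensor(1.5000)
```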
diff --git a/tests/sweep_framework/sweeps/eltwise/ternary/where/where_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/ternary/where/where_pytorch2.py
new file mode 100644
index 00000000000..720b4e1d33d
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/ternary/where/where_pytorch2.py
@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt, gen_bin
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "where") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "where": {
+ "input_shape": [
+ {"shape1": [1, 1, 1, 46], "shape2": [1, 12, 1, 46], "shape3": []},
+ {"shape1": [1, 1, 1, 6], "shape2": [1, 16, 1, 6], "shape3": []},
+ # {"shape1": [1, 1, 1, "s10 + 1"], "shape2": [1, 12, 1, "s10 + 1"], "shape3": []},
+ # {"shape1": [1, 1, 1, "s10 + 1"], "shape2": [1, 16, 1, "s10 + 1"], "shape3": []},
+ {"shape1": [1, 1, 256], "shape2": [1, 1, 256], "shape3": []},
+ {"shape1": [1, 1, 45, 45], "shape2": [1, 12, 45, 45], "shape3": []},
+ {"shape1": [1, 1, 5, 5], "shape2": [1, 16, 5, 5], "shape3": []},
+ {"shape1": [1, 1, 7, 7], "shape2": [1, 12, 7, 7], "shape3": []},
+ {"shape1": [1, 1], "shape2": [1, 1], "shape3": [1, 1]},
+ # {"shape1": [1, "s0", 256], "shape2": [1, "s0", 256], "shape3": []},
+ {"shape1": [10, 10], "shape2": [10, 10], "shape3": [10, 10]},
+ {"shape1": [15, 15], "shape2": [15, 15], "shape3": [15, 15]},
+ {"shape1": [17, 17], "shape2": [17, 17], "shape3": [17, 17]},
+ {"shape1": [2, 2], "shape2": [2, 2], "shape3": [2, 2]},
+ # {"shape1": ["s0 + 1", "s0 + 1"], "shape2": ["s0 + 1", "s0 + 1"], "shape3": []},
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_c_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_b_layout": [ttnn.TILE_LAYOUT],
+ "input_c_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_b_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_c_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_b_dtype,
+ input_c_dtype,
+ input_a_layout,
+ input_b_layout,
+ input_c_layout,
+ input_a_memory_config,
+ input_b_memory_config,
+ input_c_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(gen_bin, input_a_dtype)(input_shape["shape1"])
+ torch_input_tensor_b = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_b_dtype
+ )(input_shape["shape2"])
+ torch_input_tensor_c = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_c_dtype
+ )(input_shape["shape3"])
+
+ golden_function = ttnn.get_golden_function(ttnn.where)
+ torch_output_tensor = golden_function(torch_input_tensor_a > 0, torch_input_tensor_b, torch_input_tensor_c)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ input_tensor_b = ttnn.from_torch(
+ torch_input_tensor_b,
+ dtype=input_b_dtype,
+ layout=input_b_layout,
+ device=device,
+ memory_config=input_b_memory_config,
+ )
+
+ input_tensor_c = ttnn.from_torch(
+ torch_input_tensor_c,
+ dtype=input_c_dtype,
+ layout=input_c_layout,
+ device=device,
+ memory_config=input_c_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.where(input_tensor_a, input_tensor_b, input_tensor_c, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
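The condition tensor comes from `gen_bin`, which emits 0/1 values; the golden call converts it to a boolean mask with `> 0`, matching the device op's nonzero-is-true convention. A minimal illustration of the semantics under test:

```python
import torch

cond = torch.tensor([[0.0, 1.0], [1.0, 0.0]])
b = torch.full((2, 2), 2.0)
c = torch.full((2, 2), -2.0)
print(torch.where(cond > 0, b, c))  # tensor([[-2., 2.], [2., -2.]])
```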
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/lgamma/lgamma.py b/tests/sweep_framework/sweeps/eltwise/unary/abs/abs_pytorch2.py
similarity index 85%
rename from tests/sweep_framework/sweeps/eltwise/unary/lgamma/lgamma.py
rename to tests/sweep_framework/sweeps/eltwise/unary/abs/abs_pytorch2.py
index 51597d92f37..0e262cc9980 100644
--- a/tests/sweep_framework/sweeps/eltwise/unary/lgamma/lgamma.py
+++ b/tests/sweep_framework/sweeps/eltwise/unary/abs/abs_pytorch2.py
@@ -25,9 +25,10 @@
# Developers can create their own generator functions and pass them to the parameters as inputs.
parameters = {
"nightly": {
- "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
- + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
- + gen_shapes([32, 32], [256, 256], [32, 32], 32),
+ "input_shape": [
+ [10, 10],
+ [15, 15],
+ ],
"input_a_dtype": [ttnn.bfloat16],
"input_a_layout": [ttnn.TILE_LAYOUT],
"input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
@@ -53,9 +54,11 @@ def run(
torch.manual_seed(data_seed)
torch_input_tensor_a = gen_func_with_cast_tt(
- partial(torch_random, low=0.1, high=1000, dtype=torch.float32), input_a_dtype
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
)(input_shape)
- torch_output_tensor = torch.lgamma(torch_input_tensor_a)
+
+ golden_function = ttnn.get_golden_function(ttnn.abs)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
input_tensor_a = ttnn.from_torch(
torch_input_tensor_a,
@@ -66,7 +69,7 @@ def run(
)
start_time = start_measuring_time()
- result = ttnn.lgamma(input_tensor_a, memory_config=output_memory_config)
+ result = ttnn.abs(input_tensor_a, memory_config=output_memory_config)
output_tensor = ttnn.to_torch(result)
e2e_perf = stop_measuring_time(start_time)
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/bitwise/bitwise_not_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/bitwise/bitwise_not_pytorch2.py
new file mode 100644
index 00000000000..91f5e28af58
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/bitwise/bitwise_not_pytorch2.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [[1, 23, 40]],
+ "input_a_dtype": [ttnn.int32],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+def mesh_device_fixture():
+ device = ttnn.open_device(device_id=0)
+ assert ttnn.device.is_wormhole_b0(device), "This op is available for Wormhole_B0 only"
+ yield (device, "Wormhole_B0")
+ ttnn.close_device(device)
+ del device
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-2147483647, high=2147483648, dtype=torch.int64), input_a_dtype
+ )(input_shape)
+
+ torch_output_tensor = torch.bitwise_not(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.bitwise_not(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
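Since two's-complement NOT satisfies `~x == -x - 1`, this op is exactly checkable; the PCC threshold above is a loose bound on what should be a bit-exact match:

```python
import torch

x = torch.tensor([-2147483647, 0, 5], dtype=torch.int32)
assert torch.equal(torch.bitwise_not(x), -x - 1)
```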
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_pytorch2.py
new file mode 100644
index 00000000000..03fa811c4a1
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_pytorch2.py
@@ -0,0 +1,81 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random, is_wormhole_b0
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [[1066], [120], [128], [160], [240], [300], [30], [320], [40], [480], [60], [640], [800], [80]],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+def mesh_device_fixture():
+ device = ttnn.open_device(device_id=0)
+ assert ttnn.device.is_wormhole_b0(device), "This op is available for Wormhole_B0 only"
+ yield (device, "Wormhole_B0")
+ ttnn.close_device(device)
+ del device
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.ceil)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.ceil(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/clamp/clamp.py b/tests/sweep_framework/sweeps/eltwise/unary/clamp/clamp.py
index 7b63fcd562d..0261f6b6758 100644
--- a/tests/sweep_framework/sweeps/eltwise/unary/clamp/clamp.py
+++ b/tests/sweep_framework/sweeps/eltwise/unary/clamp/clamp.py
@@ -27,7 +27,7 @@
parameters = {
"nightly": {
"input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 32),
- "mode": [None, "min", "max"],
+ "mode": ["both", "min", "max"],
"input_a_dtype": [ttnn.bfloat16],
"input_a_layout": [ttnn.TILE_LAYOUT],
"input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
@@ -60,9 +60,9 @@ def run(
low, high = gen_low_high_scalars()
if mode == "min":
- low, high = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item(), None
+ high = None
elif mode == "max":
- low, high = None, torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
+ low = None
torch_output_tensor = torch.clamp(torch_input_tensor_a, low, high)
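With the rename from `None` to `"both"`, the three modes now map directly onto `torch.clamp`'s optional bounds:

```python
import torch

x = torch.tensor([-5.0, 0.0, 5.0])
print(torch.clamp(x, -1.0, 1.0))  # "both": tensor([-1., 0., 1.])
print(torch.clamp(x, min=-1.0))   # "min":  tensor([-1., 0., 5.])
print(torch.clamp(x, max=1.0))    # "max":  tensor([-5., 0., 1.])
```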
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/cos/cos_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/cos/cos_pytorch2.py
new file mode 100644
index 00000000000..9cb49e3f023
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/cos/cos_pytorch2.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [[1, 160], [1, 23, 40, 64]],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=0, high=6.283185307179586, dtype=torch.float16), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.cos)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.cos(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/elu/elu_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/elu/elu_pytorch2.py
new file mode 100644
index 00000000000..b69687ef84a
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/elu/elu_pytorch2.py
@@ -0,0 +1,77 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 128, 28, 28],
+ ],
+ "alpha": [1.0],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ alpha,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.elu)
+ torch_output_tensor = golden_function(torch_input_tensor_a, alpha=alpha)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.elu(input_tensor_a, alpha=alpha, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
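For reference, ELU with `alpha=1.0` is the identity for positive inputs and `alpha * (exp(x) - 1)` otherwise, which is the behaviour the golden function should reproduce:

```python
import torch
import torch.nn.functional as F

x = torch.tensor([-2.0, 0.0, 2.0])
expected = torch.where(x > 0, x, 1.0 * (torch.exp(x) - 1))
assert torch.allclose(F.elu(x, alpha=1.0), expected)
```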
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/exp/exp_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/exp/exp_pytorch2.py
new file mode 100644
index 00000000000..90f50db4656
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/exp/exp_pytorch2.py
@@ -0,0 +1,87 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [0, 1],
+ [12, 1, 1],
+ [16, 1, 1],
+ [160],
+ [24, 1, 1],
+ [3, 1, 1],
+ [32, 1, 1],
+ [3234, 1],
+ [4, 1, 1],
+ [6, 1, 1],
+ [8, 1, 1],
+ [8732, 1],
+ [],
+ ],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-10, high=10, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.exp)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.exp(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
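The narrower [-10, 10] generation range here looks deliberate: exp() grows fast enough that bfloat16 overflows to inf well before the [-100, 100] range used elsewhere in this PR, which would sink the PCC check:

```python
import torch

print(torch.exp(torch.tensor(10.0, dtype=torch.bfloat16)))   # tensor(22016., dtype=torch.bfloat16)
print(torch.exp(torch.tensor(100.0, dtype=torch.bfloat16)))  # tensor(inf, dtype=torch.bfloat16)
```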
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/floor/floor_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/floor/floor_pytorch2.py
new file mode 100644
index 00000000000..e3328020f49
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/floor/floor_pytorch2.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random, is_wormhole_b0
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [[1, 1, 1, 42], [1, 1, 32, 1]],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+def mesh_device_fixture():
+ device = ttnn.open_device(device_id=0)
+ assert ttnn.device.is_wormhole_b0(device), "This op is available for Wormhole_B0 only"
+ yield (device, "Wormhole_B0")
+ ttnn.close_device(device)
+ del device
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.floor)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.floor(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/gelu/gelu_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/gelu/gelu_pytorch2.py
new file mode 100644
index 00000000000..595c6613e0d
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/gelu/gelu_pytorch2.py
@@ -0,0 +1,129 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 1, 3072],
+ [1, 10, 3072],
+ [1, 10, 768],
+ [1, 1024, 2560],
+ [1, 1024, 512],
+ [1, 1024, 640],
+ [1, 1200, 1280],
+ [1, 1370, 5120],
+ [1, 14, 14, 1536],
+ [1, 14, 14, 2048],
+ [1, 1445, 768],
+ [1, 1500, 3072],
+ [1, 1536],
+ [1, 16, 16, 1536],
+ [1, 16, 16, 2048],
+ [1, 16, 3072],
+ [1, 16384, 128],
+ [1, 19, 4096],
+ [1, 19200, 256],
+ [1, 196, 3072],
+ [1, 197, 3072],
+ [1, 197, 4096],
+ [1, 201, 3072],
+ [1, 2048, 768],
+ [1, 24, 3072],
+ [1, 25, 3072],
+ [1, 256, 1024],
+ [1, 256, 1280],
+ [1, 256, 256],
+ [1, 256, 4096],
+ [1, 256, 5120],
+ [1, 28, 28, 1024],
+ [1, 28, 28, 768],
+ [1, 300, 2048],
+ [1, 32, 32, 1024],
+ [1, 32, 32, 768],
+ [1, 4, 3072],
+ [1, 4096, 1280],
+ [1, 4096, 256],
+ [1, 4800, 512],
+ [1, 50, 3072],
+ [1, 50, 4096],
+ [1, 56, 56, 384],
+ [1, 56, 56, 512],
+ [1, 64, 5120],
+ [1, 64, 64, 384],
+ [1, 64, 64, 512],
+ [1, 7, 18176],
+ [1, 7, 7, 3072],
+ [1, 7, 7, 4096],
+ [1, 768, 1500],
+ [1, 768, 3000],
+ [1, 768, 384],
+ [1, 8, 8, 3072],
+ [1, 8, 8, 4096],
+ ],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float16), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.gelu)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.gelu(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/hardsigmoid/hardsigmoid_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/hardsigmoid/hardsigmoid_pytorch2.py
new file mode 100644
index 00000000000..cd9266e2bc7
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/hardsigmoid/hardsigmoid_pytorch2.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 1024, 1, 1],
+ [1, 120, 1, 1],
+ [1, 144, 1, 1],
+ [1, 16, 1, 1],
+ [1, 240, 1, 1],
+ [1, 256, 1, 1],
+ [1, 288, 1, 1],
+ [1, 480, 1, 1],
+ [1, 512, 1, 1],
+ [1, 576, 1, 1],
+ [1, 672, 1, 1],
+ [1, 72, 1, 1],
+ [1, 768, 1, 1],
+ [1, 96, 1, 1],
+ [1, 960, 1, 1],
+ ],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float16), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.hardsigmoid)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.hardsigmoid(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/isfinite.py b/tests/sweep_framework/sweeps/eltwise/unary/isfinite.py
new file mode 100644
index 00000000000..712e704f0a1
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/isfinite.py
@@ -0,0 +1,72 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt, gen_rand_inf
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random, is_wormhole_b0
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 64),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
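+
+
+# Hedged sketch of the shape sweep above. Assumption (inferred from the call
+# signature): gen_shapes(start, end, step, num) samples `num` shapes from the
+# per-dimension grid running from `start` to `end` in increments of `step`.
+# A hypothetical stand-in for illustration only, not the framework's code:
+def _gen_shapes_sketch(start, end, step, num):
+    return [
+        [random.randrange(lo, hi + 1, st) for lo, hi, st in zip(start, end, step)]
+        for _ in range(num)
+    ]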
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_rand_inf(input_shape, low=-100, high=100)
+ torch_output_tensor = torch.isfinite(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.isfinite(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/isinf.py b/tests/sweep_framework/sweeps/eltwise/unary/isinf.py
new file mode 100644
index 00000000000..34b680170d4
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/isinf.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt, gen_rand_inf
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random, is_wormhole_b0
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 64),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_rand_inf(input_shape, low=-100, high=100)
+ torch_output_tensor = torch.isinf(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.isinf(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/isnan.py b/tests/sweep_framework/sweeps/eltwise/unary/isnan.py
new file mode 100644
index 00000000000..765a0c88729
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/isnan.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt, gen_rand_inf
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random, is_wormhole_b0
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 64),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_rand_inf(input_shape, low=-100, high=100)
+ torch_output_tensor = torch.isnan(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.isnan(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/isneginf.py b/tests/sweep_framework/sweeps/eltwise/unary/isneginf.py
new file mode 100644
index 00000000000..5d9eca50a24
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/isneginf.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt, gen_rand_inf
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random, is_wormhole_b0
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 64),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_rand_inf(input_shape, low=-100, high=100)
+ torch_output_tensor = torch.isneginf(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.isneginf(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/isposinf.py b/tests/sweep_framework/sweeps/eltwise/unary/isposinf.py
new file mode 100644
index 00000000000..f4d825a4812
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/isposinf.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt, gen_rand_inf
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random, is_wormhole_b0
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 64),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_rand_inf(input_shape, low=-100, high=100)
+ torch_output_tensor = torch.isposinf(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.isposinf(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/leaky_relu/leaky_relu_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/leaky_relu/leaky_relu_pytorch2.py
new file mode 100644
index 00000000000..5f0078fb17b
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/leaky_relu/leaky_relu_pytorch2.py
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 1024, 16, 16],
+ [1, 128, 128, 128],
+ [1, 128, 1536],
+ [1, 128, 32, 32],
+ [1, 128, 64, 64],
+ [1, 256, 16, 16],
+ [1, 256, 32, 32],
+ [1, 256, 384],
+ [1, 256, 64, 64],
+ [1, 32, 24576],
+ [1, 32, 256, 256],
+ [1, 32, 512, 512],
+ [1, 512, 16, 16],
+ [1, 512, 32, 32],
+ [1, 512, 96],
+ [1, 64, 128, 128],
+ [1, 64, 256, 256],
+ [1, 64, 6144],
+ ],
+ "negative_slope": [0.1],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ negative_slope,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float16), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.leaky_relu)
+ torch_output_tensor = golden_function(torch_input_tensor_a, negative_slope=negative_slope)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.leaky_relu(input_tensor_a, negative_slope=negative_slope, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
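+
+
+# Sanity sketch. Assumption: the golden function registered for ttnn.leaky_relu
+# matches torch.nn.functional.leaky_relu for the same negative_slope.
+if __name__ == "__main__":
+    x = torch.linspace(-3, 3, 7)
+    print(torch.nn.functional.leaky_relu(x, negative_slope=0.1))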
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/lgamma.py b/tests/sweep_framework/sweeps/eltwise/unary/lgamma.py
new file mode 100644
index 00000000000..d25d2473319
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/lgamma.py
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly" and "xfail") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 32)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 32)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 32),
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+ "xfail": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 4)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 4)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 4),
+ "input_a_dtype": [ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=0.0001, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
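+    # Inputs are kept strictly positive: torch.lgamma has poles at zero and the
+    # negative integers, where a PCC comparison would be meaningless.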
+ torch_output_tensor = torch.lgamma(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.lgamma(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/log/log_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/log/log_pytorch2.py
new file mode 100644
index 00000000000..e53c4b11c7f
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/log/log_pytorch2.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [[1, 1], [10, 10], [15, 15], [17, 17], [2, 2]],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=1, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.log)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.log(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/logit.py b/tests/sweep_framework/sweeps/eltwise/unary/logit.py
new file mode 100644
index 00000000000..b4cdfb87a40
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/logit.py
@@ -0,0 +1,87 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly" and "xfail") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 16),
+ "eps": [0, 10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1],
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+ "xfail": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 1)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 1)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 1),
+ "eps": [0, 10e-6, 10e-5, 10e-4, 10e-3, 10e-2, 10e-1],
+ "input_a_dtype": [ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ eps,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_output_tensor = torch.logit(torch_input_tensor_a, eps)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.logit(input_tensor_a, eps=eps, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.99)
+ return [pcc, e2e_perf]
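+
+
+# Quick local illustration of the eps handling assumed above: torch.logit
+# clamps its input to [eps, 1 - eps] before computing log(p / (1 - p)), so
+# the [-100, 100] inputs generated here mostly land on the clamped boundaries.
+if __name__ == "__main__":
+    x = torch.tensor([-2.0, 0.25, 0.5, 3.0])
+    print(torch.logit(x, 1e-3))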
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/mish.py b/tests/sweep_framework/sweeps/eltwise/unary/mish.py
new file mode 100644
index 00000000000..a9708746c56
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/mish.py
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 16),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# def mesh_device_fixture():
+# device = ttnn.open_device(device_id=0)
+#     assert not ttnn.device.is_grayskull(device), "This op is not supported on Grayskull"
+# device_name = os.environ.get("ARCH_NAME", os.environ.get("TT_ARCH_NAME", "default")).lower()
+# yield (device, device_name)
+# ttnn.close_device(device)
+# del device
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ mish = torch.nn.Mish()
+ torch_output_tensor = mish(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.mish(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/multigammaln.py b/tests/sweep_framework/sweeps/eltwise/unary/multigammaln.py
new file mode 100644
index 00000000000..89744611949
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/multigammaln.py
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly" and "xfail") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 16)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 16)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 16),
+ "input_a_dtype": [ttnn.bfloat16],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+ "xfail": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 4)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 4)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 4),
+ "input_a_dtype": [ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=1.6, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
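+    # Domain note: multigammaln(x, p) with p = 4 requires x > (p - 1) / 2 = 1.5,
+    # hence the 1.6 lower bound on the generated inputs.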
+ torch_output_tensor = torch.special.multigammaln(torch_input_tensor_a, 4)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.multigammaln(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv.py b/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv.py
index 10bf13e50aa..138fe8929f7 100644
--- a/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv.py
+++ b/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv.py
@@ -28,7 +28,7 @@
"input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 64),
"exclude_range": [[-1, 1]],
"input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
- "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
"input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
"output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
},
@@ -43,6 +43,15 @@
}
+# invalidate_vector is called during the generation phase; each generated vector is passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns (False, None) if the vector is valid, and (True, str) with a reason for invalidation if it is not.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+ return True, "Row Major layout is not supported"
+ return False, None
+
+
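+# Hedged usage sketch: during generation the runner is assumed to call
+# invalidate_vector on every candidate vector and to record, but skip,
+# the invalid ones, roughly:
+#
+#     is_invalid, reason = invalidate_vector(vector)
+#     if is_invalid:
+#         mark_skipped(vector, reason)  # hypothetical runner-side helper
+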
# This is the run instructions for the test, defined by the developer.
# The run function must take the above-defined parameters as inputs.
# The runner will call this run function with each test vector, and the returned results from this function will be stored.
@@ -66,7 +75,8 @@ def run(
factor = torch.tensor(1, dtype=torch.bfloat16).uniform_(0.1, 10.0).item()
- torch_output_tensor = torch.div(factor, torch_input_tensor_a)
+ golden_function = ttnn.get_golden_function(ttnn.rdiv)
+ torch_output_tensor = golden_function(torch_input_tensor_a, factor)
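+    # Assumed semantics: the golden function registered for ttnn.rdiv computes
+    # factor / input elementwise, matching the torch.div call it replaces.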
input_tensor_a = ttnn.from_torch(
torch_input_tensor_a,
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/relu/relu_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/relu/relu_pytorch2.py
new file mode 100644
index 00000000000..42c81c6886c
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/relu/relu_pytorch2.py
@@ -0,0 +1,504 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 1, 2048],
+ [1, 1, 256],
+ [1, 1, 3072],
+ [1, 1, 4096],
+ [1, 1, 768],
+ [1, 10, 2048],
+ [1, 10, 3072],
+ [1, 10, 4096],
+ [1, 100, 14, 14],
+ [1, 100, 192],
+ [1, 1008, 14, 14],
+ [1, 1008, 7, 7],
+ [1, 1024, 10, 10],
+ [1, 1024, 14, 14],
+ [1, 1024, 19, 19],
+ [1, 1024, 28, 28],
+ [1, 1024, 45, 80],
+ [1, 1024, 50, 68],
+ [1, 1024, 7, 7],
+ [1, 104, 28, 28],
+ [1, 104, 56, 56],
+ [1, 1056, 14, 14],
+ [1, 1056, 48, 48],
+ [1, 1056, 7, 7],
+ [1, 1056, 96, 96],
+ [1, 1088, 14, 14],
+ [1, 1088, 7, 7],
+ [1, 110, 1, 1],
+ [1, 1104, 14, 14],
+ [1, 1104, 7, 7],
+ [1, 112, 1, 1],
+ [1, 112, 14, 14],
+ [1, 1120, 14, 14],
+ [1, 1120, 7, 7],
+ [1, 1152, 14, 14],
+ [1, 1152, 7, 7],
+ [1, 1184, 14, 14],
+ [1, 1184, 7, 7],
+ [1, 12, 1, 1],
+ [1, 120, 1, 1],
+ [1, 120, 28, 28],
+ [1, 120, 40, 40],
+ [1, 120, 56, 56],
+ [1, 1200, 14, 14],
+ [1, 1200, 7, 7],
+ [1, 1216, 14, 14],
+ [1, 1216, 7, 7],
+ [1, 1232, 14, 14],
+ [1, 1232, 28, 28],
+ [1, 1248, 14, 14],
+ [1, 1248, 7, 7],
+ [1, 128, 10, 10],
+ [1, 128, 100, 136],
+ [1, 128, 112, 112],
+ [1, 128, 14, 14],
+ [1, 128, 150, 150],
+ [1, 128, 17, 17],
+ [1, 128, 180, 320],
+ [1, 128, 200, 272],
+ [1, 128, 28, 28],
+ [1, 128, 3, 3],
+ [1, 128, 5, 5],
+ [1, 128, 56, 56],
+ [1, 128, 64, 64],
+ [1, 128, 7, 7],
+ [1, 128, 75, 75],
+ [1, 128, 90, 160],
+ [1, 1280, 1, 1],
+ [1, 1280, 14, 14],
+ [1, 1280, 7, 7],
+ [1, 128],
+ [1, 1296, 14, 14],
+ [1, 1296, 7, 7],
+ [1, 12],
+ [1, 1312, 14, 14],
+ [1, 1312, 7, 7],
+ [1, 132, 1, 1],
+ [1, 1344, 14, 14],
+ [1, 1344, 28, 28],
+ [1, 1344, 7, 7],
+ [1, 1376, 14, 14],
+ [1, 1376, 7, 7],
+ [1, 1392, 14, 14],
+ [1, 1392, 28, 28],
+ [1, 1392, 7, 7],
+ [1, 1408, 14, 14],
+ [1, 1408, 7, 7],
+ [1, 144, 1, 1],
+ [1, 144, 14, 14],
+ [1, 144, 28, 28],
+ [1, 144, 56, 56],
+ [1, 144, 7, 7],
+ [1, 1440, 14, 14],
+ [1, 1440, 7, 7],
+ [1, 1472, 14, 14],
+ [1, 1472, 7, 7],
+ [1, 1488, 14, 14],
+ [1, 1488, 7, 7],
+ [1, 15, 15, 512],
+ [1, 1504, 14, 14],
+ [1, 1504, 7, 7],
+ [1, 1512, 14, 14],
+ [1, 1512, 7, 7],
+ [1, 1536, 10, 10],
+ [1, 1536, 14, 14],
+ [1, 1536, 7, 7],
+ [1, 1568, 14, 14],
+ [1, 1568, 7, 7],
+ [1, 1584, 14, 14],
+ [1, 1584, 7, 7],
+ [1, 16, 1, 1],
+ [1, 16, 112, 112],
+ [1, 16, 14, 14],
+ [1, 16, 160, 160],
+ [1, 16, 224, 224],
+ [1, 16, 28, 28],
+ [1, 16, 56, 56],
+ [1, 160, 14, 14],
+ [1, 160, 28, 28],
+ [1, 160, 56, 56],
+ [1, 160, 7, 7],
+ [1, 1600, 14, 14],
+ [1, 1600, 7, 7],
+ [1, 1632, 14, 14],
+ [1, 1632, 7, 7],
+ [1, 1664, 14, 14],
+ [1, 1664, 7, 7],
+ [1, 168, 1, 1],
+ [1, 168, 28, 28],
+ [1, 168, 56, 56],
+ [1, 1680, 14, 14],
+ [1, 1680, 7, 7],
+ [1, 1696, 14, 14],
+ [1, 1696, 7, 7],
+ [1, 1728, 14, 14],
+ [1, 1728, 7, 7],
+ [1, 174, 1, 1],
+ [1, 1760, 14, 14],
+ [1, 1760, 7, 7],
+ [1, 1776, 14, 14],
+ [1, 1776, 7, 7],
+ [1, 1792, 14, 14],
+ [1, 1792, 7, 7],
+ [1, 18, 1, 1],
+ [1, 18, 14, 14],
+ [1, 18, 28, 28],
+ [1, 18, 56, 56],
+ [1, 1824, 14, 14],
+ [1, 1824, 7, 7],
+ [1, 1856, 7, 7],
+ [1, 1872, 14, 14],
+ [1, 1872, 7, 7],
+ [1, 1888, 7, 7],
+ [1, 192, 14, 14],
+ [1, 192, 17, 17],
+ [1, 192, 28, 28],
+ [1, 192, 35, 35],
+ [1, 192, 56, 56],
+ [1, 192, 7, 7],
+ [1, 192, 8, 8],
+ [1, 1920, 14, 14],
+ [1, 1920, 7, 7],
+ [1, 196, 1, 1],
+ [1, 1968, 14, 14],
+ [1, 1968, 7, 7],
+ [1, 20, 1, 1],
+ [1, 2016, 14, 14],
+ [1, 2016, 7, 7],
+ [1, 2048, 10, 10],
+ [1, 2048, 14, 14],
+ [1, 2048, 23, 40],
+ [1, 2048, 25, 34],
+ [1, 2048, 7, 7],
+ [1, 2064, 14, 14],
+ [1, 2064, 7, 7],
+ [1, 208, 14, 14],
+ [1, 208, 28, 28],
+ [1, 2112, 14, 14],
+ [1, 2112, 7, 7],
+ [1, 216, 28, 28],
+ [1, 216, 56, 56],
+ [1, 2160, 7, 7],
+ [1, 2208, 7, 7],
+ [1, 222, 1, 1],
+ [1, 224, 1, 1],
+ [1, 224, 112, 112],
+ [1, 224, 14, 14],
+ [1, 224, 17, 17],
+ [1, 224, 28, 28],
+ [1, 224, 35, 35],
+ [1, 224, 56, 56],
+ [1, 224, 7, 7],
+ [1, 232, 112, 112],
+ [1, 232, 56, 56],
+ [1, 24, 1, 1],
+ [1, 24, 112, 112],
+ [1, 24, 14, 14],
+ [1, 240, 1, 1],
+ [1, 240, 14, 14],
+ [1, 240, 28, 28],
+ [1, 240, 56, 56],
+ [1, 2520, 14, 14],
+ [1, 2520, 7, 7],
+ [1, 256, 1, 1],
+ [1, 256, 100, 136],
+ [1, 256, 112, 112],
+ [1, 256, 128, 128],
+ [1, 256, 13, 17],
+ [1, 256, 14, 14],
+ [1, 256, 17, 17],
+ [1, 256, 180, 320],
+ [1, 256, 19, 19],
+ [1, 256, 200, 272],
+ [1, 256, 25, 34],
+ [1, 256, 28, 28],
+ [1, 256, 3, 3],
+ [1, 256, 32, 32],
+ [1, 256, 38, 38],
+ [1, 256, 45, 80],
+ [1, 256, 5, 5],
+ [1, 256, 50, 68],
+ [1, 256, 56, 56],
+ [1, 256, 7, 7],
+ [1, 256, 7, 9],
+ [1, 256, 75, 75],
+ [1, 256, 8, 8],
+ [1, 256, 90, 160],
+ [1, 26, 1, 1],
+ [1, 264, 1, 1],
+ [1, 288, 14, 14],
+ [1, 288, 28, 28],
+ [1, 288, 56, 56],
+ [1, 2904, 24, 24],
+ [1, 2904, 48, 48],
+ [1, 30, 1, 1],
+ [1, 3024, 14, 14],
+ [1, 3024, 7, 7],
+ [1, 308, 1, 1],
+ [1, 32, 1, 1],
+ [1, 32, 112, 112],
+ [1, 32, 120, 160],
+ [1, 32, 14, 14],
+ [1, 32, 147, 147],
+ [1, 32, 149, 149],
+ [1, 32, 150, 150],
+ [1, 32, 192, 192],
+ [1, 32, 256, 256],
+ [1, 32, 26, 26],
+ [1, 32, 28, 28],
+ [1, 32, 30, 40],
+ [1, 32, 56, 56],
+ [1, 32, 60, 80],
+ [1, 32, 7, 7],
+ [1, 320, 14, 14],
+ [1, 320, 17, 17],
+ [1, 320, 28, 28],
+ [1, 320, 7, 7],
+ [1, 320, 8, 8],
+ [1, 336, 112, 112],
+ [1, 336, 14, 14],
+ [1, 336, 28, 28],
+ [1, 336, 56, 56],
+ [1, 348, 1, 1],
+ [1, 352, 14, 14],
+ [1, 352, 28, 28],
+ [1, 36, 1, 1],
+ [1, 36, 14, 14],
+ [1, 36, 28, 28],
+ [1, 36, 56, 56],
+ [1, 3712, 14, 14],
+ [1, 3712, 7, 7],
+ [1, 384, 14, 14],
+ [1, 384, 17, 17],
+ [1, 384, 28, 28],
+ [1, 384, 56, 56],
+ [1, 384, 7, 7],
+ [1, 384, 8, 8],
+ [1, 4, 14, 14],
+ [1, 40, 1, 1],
+ [1, 400, 14, 14],
+ [1, 400, 7, 7],
+ [1, 408, 14, 14],
+ [1, 408, 28, 28],
+ [1, 4096],
+ [1, 416, 14, 14],
+ [1, 416, 28, 28],
+ [1, 432, 14, 14],
+ [1, 432, 28, 28],
+ [1, 440, 14, 14],
+ [1, 440, 7, 7],
+ [1, 448, 14, 14],
+ [1, 448, 28, 28],
+ [1, 448, 56, 56],
+ [1, 448, 8, 8],
+ [1, 48, 112, 112],
+ [1, 48, 14, 14],
+ [1, 48, 56, 56],
+ [1, 48, 7, 7],
+ [1, 480, 14, 14],
+ [1, 480, 28, 28],
+ [1, 480, 7, 7],
+ [1, 512, 10, 10],
+ [1, 512, 100, 136],
+ [1, 512, 14, 14],
+ [1, 512, 16, 16],
+ [1, 512, 19, 19],
+ [1, 512, 23, 40],
+ [1, 512, 25, 34],
+ [1, 512, 28, 28],
+ [1, 512, 38, 38],
+ [1, 512, 45, 80],
+ [1, 512, 50, 68],
+ [1, 512, 56, 56],
+ [1, 512, 7, 7],
+ [1, 512, 8, 8],
+ [1, 512, 90, 160],
+ [1, 52, 1, 1],
+ [1, 528, 14, 14],
+ [1, 528, 192, 192],
+ [1, 528, 28, 28],
+ [1, 528, 96, 96],
+ [1, 54, 1, 1],
+ [1, 544, 14, 14],
+ [1, 544, 7, 7],
+ [1, 56, 1, 1],
+ [1, 576, 14, 14],
+ [1, 576, 28, 28],
+ [1, 576, 7, 7],
+ [1, 58, 1, 1],
+ [1, 60, 28, 28],
+ [1, 608, 14, 14],
+ [1, 608, 7, 7],
+ [1, 624, 14, 14],
+ [1, 624, 28, 28],
+ [1, 64, 1, 1],
+ [1, 64, 112, 112],
+ [1, 64, 120, 160],
+ [1, 64, 128, 128],
+ [1, 64, 14, 14],
+ [1, 64, 147, 147],
+ [1, 64, 150, 150],
+ [1, 64, 160, 160],
+ [1, 64, 180, 320],
+ [1, 64, 200, 272],
+ [1, 64, 224, 224],
+ [1, 64, 24, 24],
+ [1, 64, 28, 28],
+ [1, 64, 30, 40],
+ [1, 64, 300, 300],
+ [1, 64, 35, 35],
+ [1, 64, 360, 640],
+ [1, 64, 400, 544],
+ [1, 64, 480, 640],
+ [1, 64, 56, 56],
+ [1, 64, 60, 80],
+ [1, 64, 73, 73],
+ [1, 64, 80, 80],
+ [1, 640, 14, 14],
+ [1, 640, 7, 7],
+ [1, 64],
+ [1, 672, 14, 14],
+ [1, 672, 28, 28],
+ [1, 672, 56, 56],
+ [1, 672, 7, 7],
+ [1, 696, 28, 28],
+ [1, 696, 56, 56],
+ [1, 704, 14, 14],
+ [1, 704, 7, 7],
+ [1, 72, 1, 1],
+ [1, 72, 112, 112],
+ [1, 72, 14, 14],
+ [1, 72, 28, 28],
+ [1, 72, 40, 40],
+ [1, 72, 56, 56],
+ [1, 72, 80, 80],
+ [1, 720, 14, 14],
+ [1, 720, 28, 28],
+ [1, 726, 1, 1],
+ [1, 728, 19, 19],
+ [1, 728, 38, 38],
+ [1, 736, 14, 14],
+ [1, 736, 7, 7],
+ [1, 7392, 12, 12],
+ [1, 7392, 24, 24],
+ [1, 768, 14, 14],
+ [1, 768, 28, 28],
+ [1, 768, 7, 7],
+ [1, 784, 14, 14],
+ [1, 784, 7, 7],
+ [1, 8, 1, 1],
+ [1, 8, 112, 112],
+ [1, 80, 1, 1],
+ [1, 80, 112, 112],
+ [1, 80, 56, 56],
+ [1, 800, 14, 14],
+ [1, 800, 7, 7],
+ [1, 816, 14, 14],
+ [1, 832, 14, 14],
+ [1, 832, 7, 7],
+ [1, 84, 1, 1],
+ [1, 864, 14, 14],
+ [1, 864, 7, 7],
+ [1, 88, 28, 28],
+ [1, 888, 14, 14],
+ [1, 888, 7, 7],
+ [1, 896, 14, 14],
+ [1, 896, 28, 28],
+ [1, 896, 7, 7],
+ [1, 912, 14, 14],
+ [1, 912, 7, 7],
+ [1, 92, 14, 14],
+ [1, 928, 14, 14],
+ [1, 928, 7, 7],
+ [1, 96, 112, 112],
+ [1, 96, 14, 14],
+ [1, 96, 28, 28],
+ [1, 96, 35, 35],
+ [1, 96, 56, 56],
+ [1, 96, 71, 71],
+ [1, 96, 73, 73],
+ [1, 960, 14, 14],
+ [1, 960, 7, 7],
+ [1, 992, 14, 14],
+ [1, 992, 7, 7],
+ # [1, "s0", 256],
+ # [1, "s0", 768],
+ [100, 1, 2048],
+ [59, 4096],
+ [6, 1, 100, 256],
+ [920, 1, 2048],
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.relu)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.relu(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/rsqrt/rsqrt_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/rsqrt/rsqrt_pytorch2.py
new file mode 100644
index 00000000000..ee0d03fba83
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/rsqrt/rsqrt_pytorch2.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 1, 1],
+ [1, 10, 1],
+ [1, 1024, 1, 1],
+ [1, 128, 1, 1],
+ [1, 15, 1],
+ [1, 2048, 1, 1],
+ [1, 256, 1, 1],
+ [1, 512, 1, 1],
+ [1, 64, 1, 1],
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.rsqrt)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.rsqrt(input_tensor_a, fast_and_approximate_mode=True, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub.py b/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub.py
index ab5b225d7ad..de02011284e 100644
--- a/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub.py
+++ b/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub.py
@@ -28,13 +28,22 @@
"nightly": {
"input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 64),
"input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
- "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
"input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
"output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
},
}
+# invalidate_vector is called during the generation phase, with each vector passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns (False, None) if the vector is valid, and (True, str) with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+ return True, "Row Major layout is not supported"
+ return False, None
+
+
# This is the run instructions for the test, defined by the developer.
# The run function must take the above-defined parameters as inputs.
# The runner will call this run function with each test vector, and the returned results from this function will be stored.
@@ -57,7 +66,8 @@ def run(
factor = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
- torch_output_tensor = torch.sub(factor, torch_input_tensor_a)
+ golden_function = ttnn.get_golden_function(ttnn.rsub)
+ torch_output_tensor = golden_function(torch_input_tensor_a, factor)
input_tensor_a = ttnn.from_torch(
torch_input_tensor_a,
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_pytorch2.py
new file mode 100644
index 00000000000..0d5fd280dd8
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_pytorch2.py
@@ -0,0 +1,127 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
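+# The shapes below are taken from traced PyTorch 2 models; entries left commented out contain symbolic dims (s0, s10, ...) that a static sweep cannot instantiate.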
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 1, 1, 10],
+ [1, 1, 1, 12],
+ [1, 1, 1, 14],
+ [1, 1, 1, 15],
+ [1, 1, 1, 17],
+ [1, 1, 1, 1],
+ [1, 1, 1, 201],
+ [1, 1, 1, 2048],
+ [1, 1, 1, 24],
+ [1, 1, 1, 256],
+ [1, 1, 1, 25],
+ [1, 1, 1, 2],
+ [1, 1, 1, 42],
+ [1, 1, 1, 42],
+ [1, 1, 1, 46],
+ [1, 1, 1, 5],
+ [1, 1, 1, 60],
+ [1, 1, 1, 6],
+ [1, 1, 1, 7],
+ [1, 1, 1, 8],
+ [1, 1, 1, 9],
+ # [1, 1, 1, s0 + 1],
+ # [1, 1, 1, s0],
+ # [1, 1, 1, s10 + 1],
+ [1, 1, 19, 19],
+ [1, 1, 24, 24],
+ [1, 1, 32, 1],
+ [1, 1, 32, 1],
+ [1, 1, 32, 32],
+ [1, 1, 45, 45],
+ [1, 1, 59, 59],
+ [1, 192],
+ [1066],
+ [120, 1],
+ [128, 1],
+ [128],
+ [160],
+ [2, 1, 7, 7],
+ [240, 1],
+ [30, 1],
+ [300, 1],
+ [300],
+ [320, 1],
+ [320],
+ [40],
+ [480, 1],
+ [60, 1],
+ [640],
+ [800, 1],
+ [80],
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
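+ # Traced PyTorch 2 graphs lower expressions like 1 - x to rsub with a scalar of 1, so the factor is fixed at 1.0 here.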
+ factor = 1.0
+
+ golden_function = ttnn.get_golden_function(ttnn.rsub)
+ torch_output_tensor = golden_function(torch_input_tensor_a, factor)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.rsub(input_tensor_a, value=factor, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/sigmoid/sigmoid_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/sigmoid/sigmoid_pytorch2.py
new file mode 100644
index 00000000000..45b085082a0
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/sigmoid/sigmoid_pytorch2.py
@@ -0,0 +1,131 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 1, 256, 256],
+ [1, 1, 480, 640],
+ [1, 100, 4],
+ [1, 104, 1, 1],
+ [1, 1056, 1, 1],
+ [1, 12, 64, 64],
+ [1, 120, 1, 1],
+ [1, 120, 14, 14],
+ [1, 1232, 1, 1],
+ [1, 1392, 1, 1],
+ [1, 144, 1, 1],
+ [1, 1512, 1, 1],
+ [1, 16, 64, 64],
+ [1, 184, 7, 7],
+ [1, 2, 120, 160],
+ [1, 2, 30, 40],
+ [1, 2, 60, 80],
+ [1, 200, 7, 7],
+ [1, 2016, 1, 1],
+ [1, 208, 1, 1],
+ [1, 216, 1, 1],
+ [1, 224, 1, 1],
+ [1, 232, 1, 1],
+ [1, 24, 64, 64],
+ [1, 240, 14, 14],
+ [1, 2904, 1, 1],
+ [1, 3, 16, 16, 85],
+ [1, 3, 32, 32, 85],
+ [1, 3, 64, 64, 85],
+ [1, 3, 64, 64],
+ [1, 3024, 1, 1],
+ [1, 32, 64, 64],
+ [1, 320, 1, 1],
+ [1, 336, 1, 1],
+ [1, 3712, 1, 1],
+ [1, 4, 64, 64],
+ [1, 440, 1, 1],
+ [1, 448, 1, 1],
+ [1, 48, 1, 1],
+ [1, 480, 7, 7],
+ [1, 50, 3072],
+ [1, 528, 1, 1],
+ [1, 576, 1, 1],
+ [1, 6, 64, 64],
+ [1, 64, 1, 1],
+ [1, 672, 7, 7],
+ [1, 696, 1, 1],
+ [1, 72, 1, 1],
+ [1, 72, 28, 28],
+ [1, 7392, 1, 1],
+ [1, 784, 1, 1],
+ [1, 8, 64, 64],
+ [1, 888, 1, 1],
+ [1, 896, 1, 1],
+ [1, 960, 3, 3],
+ [2, 7, 2048],
+ [6, 1, 100, 4],
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.sigmoid)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.sigmoid(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/silu/silu_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/silu/silu_pytorch2.py
new file mode 100644
index 00000000000..3a6fe10f458
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/silu/silu_pytorch2.py
@@ -0,0 +1,101 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
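+# The shapes below are taken from traced PyTorch 2 models; entries left commented out contain symbolic dims (s0, s1, s2) that a static sweep cannot instantiate.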
+parameters = {
+ "nightly": {
+ "input_shape": [
+ # [1, 128, ((s1 - 1)//2) + 1, ((s2 - 1)//2) + 1],
+ # [1, 128, s1, s2],
+ [1, 1280, 16, 16],
+ [1, 1280, 32, 32],
+ [1, 1280, 8, 8],
+ [1, 1280],
+ [1, 1920, 16, 16],
+ [1, 1920, 32, 32],
+ # [1, 256, ((s1 - 1)//2) + 1, ((s2 - 1)//2) + 1],
+ # [1, 256, s0, s1],
+ # [1, 256, s1, s2],
+ [1, 2560, 16, 16],
+ [1, 2560, 8, 8],
+ [1, 32, 256, 256],
+ # [1, 32, s0, s1],
+ [1, 320, 32, 32],
+ [1, 320, 64, 64],
+ # [1, 512, ((s1 - 1)//2) + 1, ((s2 - 1)//2) + 1],
+ # [1, 512, s1, s2],
+ # [1, 64, ((s1 - 1)//2) + 1, ((s2 - 1)//2) + 1],
+ # [1, 64, s0, s1],
+ # [1, 64, s1, s2],
+ [1, 640, 16, 16],
+ [1, 640, 32, 32],
+ [1, 640, 64, 64],
+ [1, 960, 32, 32],
+ [1, 960, 64, 64],
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.silu)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.silu(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/sin/sin_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/sin/sin_pytorch2.py
new file mode 100644
index 00000000000..f09fc207b12
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/sin/sin_pytorch2.py
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 160],
+ [1, 23, 40, 64],
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
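+ # Sample inputs across one full period [0, 2*pi) so the sweep exercises the entire output range of sine.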
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=0, high=6.283185307179586, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.sin)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.sin(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/softplus/softplus.py b/tests/sweep_framework/sweeps/eltwise/unary/softplus/softplus.py
new file mode 100644
index 00000000000..e0b0780bb0f
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/softplus/softplus.py
@@ -0,0 +1,87 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+low = 0
+high = 100
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "xfail") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "xfail": {
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 32),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+def mesh_device_fixture():
+ device = ttnn.open_device(device_id=0)
+ assert ttnn.device.is_wormhole_b0(device), "This op is available for Wormhole_B0 only"
+ yield (device, "Wormhole_B0")
+ ttnn.close_device(device)
+ del device
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
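+ # softplus(x) = (1 / beta) * log(1 + exp(beta * x)); torch reverts to the linear function where input * beta > threshold.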
+ beta = torch.tensor(1, dtype=torch.bfloat16).uniform_(low, high).item()
+ threshold = torch.tensor(1, dtype=torch.bfloat16).uniform_(low, high).item()
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_output_tensor = torch.nn.functional.softplus(torch_input_tensor_a, beta=beta, threshold=threshold)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.softplus(input_tensor_a, beta=beta, threshold=threshold, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/tanh/tanh_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/tanh/tanh_pytorch2.py
new file mode 100644
index 00000000000..8d5c2746dc0
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/tanh/tanh_pytorch2.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [1, 1, 1024],
+ [1, 1, 24576],
+ [1, 1, 3072],
+ [1, 1, 4096],
+ [1, 12, 3072],
+ [1, 14, 3072],
+ [1, 15, 1024],
+ [1, 256, 96],
+ [1, 32, 6144],
+ [1, 45, 3072],
+ [1, 5, 4096],
+ [1, 7, 3072],
+ [1, 768],
+ [1, 9, 128],
+ [1, 9, 16384],
+ [1, 9, 3072],
+ [1, 9, 4096],
+ [1, 9, 8192],
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.tanh)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.tanh(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/tril/tril_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/unary/tril/tril_pytorch2.py
new file mode 100644
index 00000000000..82a90241b30
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/tril/tril_pytorch2.py
@@ -0,0 +1,75 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": [
+ [7, 7],
+ ],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_a_dtype,
+ input_a_layout,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
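+ # The sampled value range is arbitrary for tril: the op only zeroes the upper triangle, so magnitudes do not affect correctness.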
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=0, high=6.283185307179586, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ golden_function = ttnn.get_golden_function(ttnn.tril)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.tril(input_tensor_a, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/clamp_bw/clamp_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/clamp_bw/clamp_bw.py
new file mode 100644
index 00000000000..9d9dcfda635
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/clamp_bw/clamp_bw.py
@@ -0,0 +1,105 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes, gen_low_high_scalars
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 8)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 8)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 8),
+ "mode": ["both", "min", "max"],
+ "grad_dtype": [ttnn.bfloat16],
+ "input_a_dtype": [ttnn.bfloat16],
+ "grad_layout": [ttnn.TILE_LAYOUT],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ mode,
+ grad_dtype,
+ input_a_dtype,
+ grad_layout,
+ input_a_layout,
+ grad_memory_config,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+ )(input_shape)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_input_tensor_a.requires_grad = True
+ torch_input_tensor_a.retain_grad()
+
+ low, high = gen_low_high_scalars()
+
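+ # mode picks which bounds are exercised: "min" keeps only the lower bound, "max" only the upper one, and "both" applies both.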
+ if mode == "min":
+ high = None
+ elif mode == "max":
+ low = None
+
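+ # Golden path: run the clamp forward in torch, backpropagate the generated grad, and read the reference gradient off input.grad.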
+ intermediate_result = torch.clamp(torch_input_tensor_a, low, high)
+ intermediate_result.backward(gradient=torch_grad_tensor)
+ torch_output_tensor = torch_input_tensor_a.grad
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=grad_dtype,
+ layout=grad_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+
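+ # detach().clone() hands the device a copy with the same values torch differentiated, but without autograd history attached.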
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a.detach().clone(),
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.clamp_bw(grad_tensor, input_tensor_a, min=low, max=high, memory_config=output_memory_config)[0]
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/div_bw/div_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/div_bw/div_bw.py
new file mode 100644
index 00000000000..6e45912b22e
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/div_bw/div_bw.py
@@ -0,0 +1,111 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 8)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 8)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 8),
+ "grad_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "grad_layout": [ttnn.TILE_LAYOUT],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# TO-DO: create an issue, since these constraints are not mentioned in the documentation.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ # The documentation states that this op supports ROW_MAJOR.
+ # TO-DO: create an issue on this matter.
+ if test_vector["grad_layout"] == ttnn.ROW_MAJOR_LAYOUT or test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+ return True, "Inputs to eltwise binary must be tilized"
+ if test_vector["input_a_dtype"] == ttnn.bfloat8_b and test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if test_vector["grad_dtype"] == ttnn.bfloat8_b and test_vector["grad_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+ return True, "bfloat8_b is only supported on tiled layout"
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ grad_dtype,
+ input_a_dtype,
+ grad_layout,
+ input_a_layout,
+ grad_memory_config,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+ )(input_shape)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_input_tensor_a.requires_grad = True
+ torch_input_tensor_a.retain_grad()
+
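+ # Sampling the scalar through a bfloat16 tensor guarantees the divisor is exactly representable in the device's bfloat16 arithmetic.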
+ scalar = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
+
+ intermediate_result = torch.div(torch_input_tensor_a, scalar)
+ intermediate_result.backward(gradient=torch_grad_tensor)
+ torch_output_tensor = torch_input_tensor_a.grad
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=grad_dtype,
+ layout=grad_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a.detach().clone(),
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.div_bw(grad_tensor, input_tensor_a, scalar, memory_config=output_memory_config)[0]
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/hardtanh_bw/hardtanh_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/hardtanh_bw/hardtanh_bw.py
new file mode 100644
index 00000000000..0b52a8f1970
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/hardtanh_bw/hardtanh_bw.py
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+ "grad_dtype": [ttnn.bfloat16],
+ "input_a_dtype": [ttnn.bfloat16],
+ "grad_layout": [ttnn.TILE_LAYOUT],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ grad_dtype,
+ input_a_dtype,
+ grad_layout,
+ input_a_layout,
+ grad_memory_config,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+ )(input_shape)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_input_tensor_a.requires_grad = True
+ torch_input_tensor_a.retain_grad()
+
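+ # hardtanh clips to its default interval [-1, 1]; the gradient passes through only where the input lies strictly inside it.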
+ intermediate_result = torch.nn.functional.hardtanh(torch_input_tensor_a)
+ intermediate_result.backward(gradient=torch_grad_tensor)
+ torch_output_tensor = torch_input_tensor_a.grad
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=grad_dtype,
+ layout=grad_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a.detach().clone(),
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.hardtanh_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0]
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/log_bw/log_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/log_bw/log_bw.py
new file mode 100644
index 00000000000..2f42574a4e0
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/log_bw/log_bw.py
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+ "grad_dtype": [ttnn.bfloat16],
+ "input_a_dtype": [ttnn.bfloat16],
+ "grad_layout": [ttnn.TILE_LAYOUT],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ grad_dtype,
+ input_a_dtype,
+ grad_layout,
+ input_a_layout,
+ grad_memory_config,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_grad_tensor = gen_func_with_cast_tt(partial(torch_random, low=-10, high=10, dtype=torch.float32), grad_dtype)(
+ input_shape
+ )
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-10, high=10, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_input_tensor_a.requires_grad = True
+ torch_input_tensor_a.retain_grad()
+
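+ # The sample range [-10, 10] includes non-positive values where log itself is NaN, but autograd's gradient 1/x stays finite for x < 0.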
+ intermediate_result = torch.log(torch_input_tensor_a)
+ intermediate_result.backward(gradient=torch_grad_tensor)
+ torch_output_tensor = torch_input_tensor_a.grad
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=grad_dtype,
+ layout=grad_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a.detach().clone(),
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.log_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0]
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/mul_bw/mul_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/mul_bw/mul_bw.py
new file mode 100644
index 00000000000..b8d8daf2462
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/mul_bw/mul_bw.py
@@ -0,0 +1,113 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes, tensor_to_dtype
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 4)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 4)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 4),
+ "grad_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "grad_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
+ "input_a_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
+ "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# invalidate_vector is called during the generation phase, with each vector passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns (False, None) if the vector is valid, and (True, str) with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ # The documentation states that this op supports ROW_MAJOR.
+ # TO-DO: create an issue on this matter.
+ if test_vector["grad_layout"] == ttnn.ROW_MAJOR_LAYOUT or test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT:
+ return True, "Inputs to eltwise binary must be tilized"
+ if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is not supported on row major layout"
+ if test_vector["grad_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["grad_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is not supported on row major layout"
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ grad_dtype,
+ input_a_dtype,
+ grad_layout,
+ input_a_layout,
+ grad_memory_config,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+ )(input_shape)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_input_tensor_a.requires_grad = True
+ torch_input_tensor_a.retain_grad()
+
+ scalar = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
+
+ intermediate_result = torch.mul(torch_input_tensor_a, scalar)
+ intermediate_result.backward(gradient=torch_grad_tensor)
+ torch_output_tensor = torch_input_tensor_a.grad
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=grad_dtype,
+ layout=grad_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a.detach().clone(),
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.mul_bw(grad_tensor, input_tensor_a, scalar, memory_config=output_memory_config)[0]
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/relu6_bw/relu6_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/relu6_bw/relu6_bw.py
new file mode 100644
index 00000000000..ff86458ab9b
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/relu6_bw/relu6_bw.py
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+ "grad_dtype": [ttnn.bfloat16],
+ "input_a_dtype": [ttnn.bfloat16],
+ "grad_layout": [ttnn.TILE_LAYOUT],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ grad_dtype,
+ input_a_dtype,
+ grad_layout,
+ input_a_layout,
+ grad_memory_config,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+ )(input_shape)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_input_tensor_a.requires_grad = True
+ torch_input_tensor_a.retain_grad()
+
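+ # relu6(x) = min(max(0, x), 6), so the gradient is 1 only where 0 < x < 6 and 0 elsewhere.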
+ intermediate_result = torch.nn.functional.relu6(torch_input_tensor_a)
+ intermediate_result.backward(gradient=torch_grad_tensor)
+ torch_output_tensor = torch_input_tensor_a.grad
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=grad_dtype,
+ layout=grad_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a.detach().clone(),
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.relu6_bw(grad_tensor, input_tensor_a, memory_config=output_memory_config)[0]
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/softplus_bw/softplus_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/softplus_bw/softplus_bw.py
new file mode 100644
index 00000000000..cda40ebe077
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/softplus_bw/softplus_bw.py
@@ -0,0 +1,104 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "xfail") which will associate the test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "xfail": {
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+ "grad_dtype": [ttnn.bfloat16],
+ "input_a_dtype": [ttnn.bfloat16],
+ "grad_layout": [ttnn.TILE_LAYOUT],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ grad_dtype,
+ input_a_dtype,
+ grad_layout,
+ input_a_layout,
+ grad_memory_config,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+ )(input_shape)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_input_tensor_a.requires_grad = True
+ torch_input_tensor_a.retain_grad()
+
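+ # beta == 0 makes softplus degenerate (the closed form divides by beta), so resample until it is non-zero.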
+ beta = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
+ threshold = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
+ while beta == 0.0:
+ beta = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
+
+ intermediate_result = torch.nn.functional.softplus(torch_input_tensor_a, beta=beta, threshold=threshold)
+ intermediate_result.backward(gradient=torch_grad_tensor)
+ torch_output_tensor = torch_input_tensor_a.grad
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=grad_dtype,
+ layout=grad_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a.detach().clone(),
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.softplus_bw(
+ grad_tensor, input_tensor_a, beta=beta, threshold=threshold, memory_config=output_memory_config
+ )[0]
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/threshold_bw/threshold_bw.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/threshold_bw/threshold_bw.py
new file mode 100644
index 00000000000..3a24af1d54c
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/threshold_bw/threshold_bw.py
@@ -0,0 +1,101 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import random
+import ttnn
+from tests.sweep_framework.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 30
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites whose keys are the arguments to the run function and whose values are lists of possible inputs.
+# Each suite has a key name (in this case "nightly") which associates the generated test vectors with this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16)
+ + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16)
+ + gen_shapes([32, 32], [256, 256], [32, 32], 16),
+ "grad_dtype": [ttnn.bfloat16],
+ "input_a_dtype": [ttnn.bfloat16],
+ "grad_layout": [ttnn.TILE_LAYOUT],
+ "input_a_layout": [ttnn.TILE_LAYOUT],
+ "grad_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the results returned from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ grad_dtype,
+ input_a_dtype,
+ grad_layout,
+ input_a_layout,
+ grad_memory_config,
+ input_a_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), grad_dtype
+ )(input_shape)
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_input_tensor_a.requires_grad = True
+ torch_input_tensor_a.retain_grad()
+
+ threshold = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
+ value = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
+
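+ # Reference gradient: torch.threshold is the identity where input > threshold and the constant `value`
+ # elsewhere, so the gradient equals grad where input > threshold and 0 elsewhere.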
+ intermediate_result = torch.threshold(torch_input_tensor_a, threshold, value)
+ intermediate_result.backward(gradient=torch_grad_tensor)
+ torch_output_tensor = torch_input_tensor_a.grad
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=grad_dtype,
+ layout=grad_layout,
+ device=device,
+ memory_config=grad_memory_config,
+ )
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a.detach().clone(),
+ dtype=input_a_dtype,
+ layout=input_a_layout,
+ device=device,
+ memory_config=input_a_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.threshold_bw(
+ grad_tensor, input_tensor_a, threshold, value, memory_config=output_memory_config
+ )[0]
+ output_tensor = ttnn.to_torch(output_tensor)
+ e2e_perf = stop_measuring_time(start_time)
+
+ return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
diff --git a/tests/tt_eager/ops/kernel/eltwise_sfpu.cpp b/tests/tt_eager/ops/kernel/eltwise_sfpu.cpp
index 6ec7b2de5b0..d65c4bdf818 100644
--- a/tests/tt_eager/ops/kernel/eltwise_sfpu.cpp
+++ b/tests/tt_eager/ops/kernel/eltwise_sfpu.cpp
@@ -20,7 +20,7 @@ void MAIN {
uint32_t block_index = 0;
cb_reserve_back(tt::CB::c_out0, per_core_block_dim);
uint32_t tile_index = 0;
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
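+ // The DstMode argument was removed from acquire_dst()/release_dst() in the updated compute kernel API.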
// Pop tile after tile, copy to DST and pack
cb_wait_front(tt::CB::c_in0, 1);
@@ -36,7 +36,7 @@ void MAIN {
cb_pop_front(tt::CB::c_in0, 1);
- release_dst(tt::DstMode::Half);
+ release_dst();
cb_push_back(tt::CB::c_out0, per_core_block_dim);
diff --git a/tests/tt_eager/ops/test_bcast_op.cpp b/tests/tt_eager/ops/test_bcast_op.cpp
index 9de793a85ba..0a7fe9ecd98 100644
--- a/tests/tt_eager/ops/test_bcast_op.cpp
+++ b/tests/tt_eager/ops/test_bcast_op.cpp
@@ -6,7 +6,7 @@
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/bcast/bcast.hpp"
#include "common/constants.hpp"
-#include "third_party/magic_enum/magic_enum.hpp"
+#include <magic_enum.hpp>
#include
#include
diff --git a/tests/tt_eager/ops/test_sfpu.cpp b/tests/tt_eager/ops/test_sfpu.cpp
index 942ea4bfb9d..40543974d05 100644
--- a/tests/tt_eager/ops/test_sfpu.cpp
+++ b/tests/tt_eager/ops/test_sfpu.cpp
@@ -8,7 +8,7 @@
#include
#include
-#include "third_party/magic_enum/magic_enum.hpp"
+#include <magic_enum.hpp>
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
diff --git a/tests/tt_eager/ops_device_perf/run_op_profiling.py b/tests/tt_eager/ops_device_perf/run_op_profiling.py
index b4e12f50c02..a8241c42d7a 100644
--- a/tests/tt_eager/ops_device_perf/run_op_profiling.py
+++ b/tests/tt_eager/ops_device_perf/run_op_profiling.py
@@ -7,14 +7,13 @@
from loguru import logger
import numpy as np
import pytest
+from pathlib import Path
-from tt_metal.tools.profiler.process_model_log import run_device_profiler, post_process_ops_log
+from tt_metal.tools.profiler.process_model_log import run_device_profiler, post_process_ops_log, get_profiler_folder
from models.utility_functions import is_wormhole_b0, is_blackhole
-from tt_metal.tools.profiler.common import PROFILER_LOGS_DIR, PROFILER_DEVICE_SIDE_LOG
-
-profiler_log_path = PROFILER_LOGS_DIR / PROFILER_DEVICE_SIDE_LOG
+from tt_metal.tools.profiler.common import PROFILER_LOGS_DIR, PROFILER_DEVICE_SIDE_LOG, generate_logs_folder
from tt_metal.tools.profiler.process_device_log import import_log_run_stats
import tt_metal.tools.profiler.device_post_proc_config as device_post_proc_config
@@ -75,15 +74,16 @@ def run_op_test():
op_name = "tt::operations::primary::Matmul"
duration_cols = ["DEVICE KERNEL DURATION [ns]"]
+ profiler_out_dir = "op_profiler_results"
run_device_profiler(
- "pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_2d.py", "op_profiler_results"
+ "pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_matmul_1d_2d.py", profiler_out_dir
)
- results = post_process_ops_log("op_profiler_results", duration_cols, False, op_name)
+ results = post_process_ops_log(profiler_out_dir, duration_cols, False, op_name)
kernel_durations_ns = results[duration_cols[0]]
setup = device_post_proc_config.default_setup()
- setup.deviceInputLog = profiler_log_path
+ setup.deviceInputLog = generate_logs_folder(get_profiler_folder(profiler_out_dir)) / PROFILER_DEVICE_SIDE_LOG
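+ # The device-side log now lives under this run's profiler output folder rather than the global PROFILER_LOGS_DIR.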
deviceData = import_log_run_stats(setup)
freq = deviceData["deviceInfo"]["freq"]
freq_to_cycle_ratio = freq / 1000.0
diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py
index 86c46e8b8f6..3a238ff96b7 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_copy.py
@@ -66,65 +66,3 @@ def test_run_copy_op(
device,
test_args,
)
-
-
-@pytest.mark.parametrize(
- "input_shapes",
- [
- [[1, 1, 1, 30]], # Single core
- [[1, 1, 300, 380]], # multi core
- [[1, 3, 320, 380]], # multi core
- [[1, 1, 32, 32]], # Single core
- [[1, 1, 320, 384]], # Multi core
- [[1, 3, 320, 384]], # Multi core
- ],
-)
-@pytest.mark.parametrize(
- "input_mem_config",
- mem_configs,
-)
-@pytest.mark.parametrize(
- "dst_mem_config",
- mem_configs,
-)
-@pytest.mark.parametrize(
- "output_type",
- [
- ttnn.bfloat16,
- ],
-)
-@pytest.mark.parametrize(
- "input_type",
- [
- torch.float32,
- torch.float16,
- torch.bfloat16,
- ],
-)
-class TestClone:
- def test_run_clone_op(
- self,
- input_type,
- output_type,
- input_shapes,
- input_mem_config,
- dst_mem_config,
- device,
- function_level_defaults,
- ):
- datagen_func = [
- generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), input_type)
- ]
- test_args = generation_funcs.gen_default_dtype_layout_device(input_shapes)[0]
- test_args["input_mem_config"] = [input_mem_config]
- test_args["dtype"] = [output_type]
- test_args.update({"output_mem_config": dst_mem_config})
- comparison_func = partial(comparison_funcs.comp_allclose, rtol=1e-1, atol=1e-1)
- run_single_pytorch_test(
- "clone",
- input_shapes,
- datagen_func,
- comparison_func,
- device,
- test_args,
- )
diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py
index d77f952d7fd..7362670f625 100644
--- a/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py
+++ b/tests/tt_eager/python_api_testing/sweep_tests/pytorch_ops.py
@@ -1843,36 +1843,6 @@ def log_bw(x, y, *args, **kwargs):
return in_data.grad
-def gt_bw(x, *args, **kwargs):
- grad_data = x
-
- pyt_y = torch.zeros_like(grad_data)
-
- golden_tensor = pyt_y
-
- return golden_tensor
-
-
-def lt_bw(x, *args, **kwargs):
- grad_data = x
-
- pyt_y = torch.zeros_like(grad_data)
-
- golden_tensor = pyt_y
-
- return golden_tensor
-
-
-def ne_bw(x, *args, **kwargs):
- grad_data = x
-
- pyt_y = torch.zeros_like(grad_data)
-
- golden_tensor = pyt_y
-
- return golden_tensor
-
-
def rsub_bw(x, y, z, *args, **kwargs):
grad_data = x
in_data = y
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_logsoftmax.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_logsoftmax.py
deleted file mode 100644
index 61d73aa759d..00000000000
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_logsoftmax.py
+++ /dev/null
@@ -1,414 +0,0 @@
-# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-
-# SPDX-License-Identifier: Apache-2.0
-
-import torch
-
-import ttnn
-import pytest
-from models.utility_functions import comp_allclose_and_pcc
-from loguru import logger
-import torch.nn.functional as F
-from models.utility_functions import is_wormhole_b0
-
-from tests.tt_eager.python_api_testing.unit_testing.misc.test_utils import (
- get_compute_kernel_options,
- compute_kernel_options,
- compute_kernel_ids,
-)
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((50, 32), 1), # single tile
- ((3, 32, 32 * 5), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_logsoftmax_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
-
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16) + 100
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).pad_to_tile(float("nan")).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = F.log_softmax(x, dim)
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax(
- dev_x, dim, compute_kernel_config=compute_kernel_config
- )
-
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
- assert list(tt_dev.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_dev.to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.1
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_logsoftmax_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
-
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16) + 100
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = F.log_softmax(x, dim)
- strategy = (
- ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_W
- if dim == 3
- else ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_H
- )
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax(
- dev_x, dim, None, strategy, compute_kernel_config=compute_kernel_config
- )
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.1
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_logsoftmax_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).pad_to_tile(float("nan")).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = F.log_softmax(x, dim)
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax(
- dev_x, dim, compute_kernel_config=compute_kernel_config
- )
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.1
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 15, 32, 32), 1), # single tile c
- ((1, 15, 32 * 7, 32 * 5), 1), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_logsoftmax_for_dim_nc(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16) + 100
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).pad_to_tile(float("nan")).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = F.log_softmax(x, dim)
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax(
- dev_x, dim, compute_kernel_config=compute_kernel_config
- )
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.1
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((32, 32), 1), # single tile
- ((3, 32, 32 * 2), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 5, 32), 2), # multiple tiles per core
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_logsoftmax_backward_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = F.log_softmax(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax_backward(
- dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
- )
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.5
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_logsoftmax_backward_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = F.log_softmax(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- strategy = (
- ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_W
- if dim == 3
- else ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_H
- )
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax_backward(
- dev_y, dev_dy, dim, None, strategy, compute_kernel_config=compute_kernel_config
- )
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.5
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_logsoftmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = F.log_softmax(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).pad_to_tile(float("10")).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).pad_to_tile(float("200")).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax_backward(
- dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
- )
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.1
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 15, 32, 32), 1), # single tile c
- ((1, 15, 32 * 7, 32 * 5), 1), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_logsoftmax_backward_for_dim_nc(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = F.log_softmax(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).pad_to_tile(float("10")).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).pad_to_tile(float("10")).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax_backward(
- dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
- )
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.5
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (((32, 32), 1),), # single tile
-)
-@pytest.mark.parametrize(
- "optional_output_tensor",
- (True, False),
-)
-def test_logsoftmax_optional_output_tensor(shape_dim, optional_output_tensor, device):
- device.enable_program_cache()
-
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
-
- # cpu calculation
- tt_cpu = F.log_softmax(x, dim)
-
- # npu calculation
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- if optional_output_tensor:
- dev_y = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax(dev_x, dim, dev_y)
- else:
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax(dev_x, dim)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.info(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (((32, 32), 1),), # single tile
-)
-@pytest.mark.parametrize(
- "optional_output_tensor",
- (True, False),
-)
-def test_logsoftmax_backward_optional_output_tensor(shape_dim, optional_output_tensor, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- # cpu calculation
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = F.log_softmax(x, dim)
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- y.backward(dy)
-
- # npu calculation
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- if optional_output_tensor:
- dev_dx = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax_backward(dev_y, dev_dy, dim, dev_dx)
- else:
- tt_npu = ttnn.experimental.operations.primary.moreh_logsoftmax_backward(dev_y, dev_dy, dim)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.info(out)
- assert passing
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_softmax.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_softmax.py
deleted file mode 100644
index 498891c1432..00000000000
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_softmax.py
+++ /dev/null
@@ -1,479 +0,0 @@
-# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-
-# SPDX-License-Identifier: Apache-2.0
-
-import torch
-
-import ttnn
-import pytest
-from models.utility_functions import comp_allclose_and_pcc
-from loguru import logger
-from models.utility_functions import is_wormhole_b0
-
-from tests.tt_eager.python_api_testing.unit_testing.misc.test_utils import (
- get_compute_kernel_options,
- compute_kernel_options,
- compute_kernel_ids,
-)
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((32, 32), 1), # single tile
- ((3, 32, 32 * 5), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmax_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
-
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16) + 100
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = torch.softmax(x, dim)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax(dev_x, dim, compute_kernel_config=compute_kernel_config)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmax_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
-
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16) + 100
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = torch.softmax(x, dim)
-
- strategy = (
- ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_W
- if dim == 3
- else ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_H
- )
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax(
- dev_x, dim, None, strategy, compute_kernel_config=compute_kernel_config
- )
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmax_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).pad_to_tile(float("nan")).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = torch.softmax(x, dim)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax(dev_x, dim, compute_kernel_config=compute_kernel_config)
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 15, 32, 32), 1), # single tile c
- ((1, 15, 32 * 7, 32 * 5), 1), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmax_for_dim_nc(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16) + 100
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).pad_to_tile(float("7")).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = torch.softmax(x, dim)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax(dev_x, dim, compute_kernel_config=compute_kernel_config)
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((32, 32), 1), # single tile
- ((3, 32, 32 * 5), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmax_backward_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = torch.softmax(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax_backward(
- dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
- )
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmax_backward_large_algorithmfor_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = torch.softmax(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
-
- strategy = (
- ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_W
- if dim == 3
- else ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_H
- )
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax_backward(
- dev_y, dev_dy, dim, None, strategy, compute_kernel_config=compute_kernel_config
- )
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = torch.softmax(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).pad_to_tile(float("10")).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).pad_to_tile(float("20")).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax_backward(
- dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
- )
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((15, 32, 32), 0), # single tile c
- ((15, 32 * 7, 32 * 5), 0), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmax_backward_for_dim_nc(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = torch.softmax(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax_backward(
- dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
- )
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT)
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim_strategy",
- (
- ((32, 32), 1, ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.SMALL_W),
- ((32, 32), 0, ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.SMALL_H),
- ((32, 32), 1, ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_W),
- ((32, 32), 0, ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_H),
- ((1, 1, 32, 32), 1, ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_C),
- ((1, 1, 32, 32), 0, ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_C),
- ),
-)
-def test_softmax_callback(shape_dim_strategy, device):
- device.enable_program_cache()
-
- shape, dim, strategy = shape_dim_strategy
- torch.manual_seed(0)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = torch.softmax(x, dim)
- for i in range(2):
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax(dev_x, dim, None, strategy)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim_strategy",
- (
- ((32, 32), 1, ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.SMALL_W),
- ((32, 32), 0, ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.SMALL_H),
- ((32, 32), 1, ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_W),
- ((32, 32), 0, ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_H),
- ((1, 1, 32, 32), 1, ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_C),
- ((1, 1, 32, 32), 0, ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_C),
- ),
-)
-def test_softmax_backward_callback(shape_dim_strategy, device):
- device.enable_program_cache()
- shape, dim, strategy = shape_dim_strategy
- torch.manual_seed(0)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = torch.softmax(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- for i in range(2):
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax_backward(dev_y, dev_dy, dim, None, strategy)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (((32, 32), 1),), # single tile
-)
-@pytest.mark.parametrize(
- "optional_output_tensor",
- (True, False),
-)
-def test_softmax_optional_output_tensor(shape_dim, optional_output_tensor, device):
- device.enable_program_cache()
-
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
-
- # cpu calculation
- tt_cpu = torch.softmax(x, dim)
-
- # npu calculation
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- if optional_output_tensor:
- dev_y = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax(dev_x, dim, dev_y)
- else:
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax(dev_x, dim)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.info(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (((32, 32), 1),), # single tile
-)
-@pytest.mark.parametrize(
- "optional_output_tensor",
- (True, False),
-)
-def test_softmax_backward_optional_output_tensor(shape_dim, optional_output_tensor, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- # cpu calculation
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = torch.softmax(x, dim)
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- y.backward(dy)
-
- # npu calculation
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- if optional_output_tensor:
- dev_dx = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax_backward(dev_y, dev_dy, dim, dev_dx)
- else:
- tt_npu = ttnn.experimental.operations.primary.moreh_softmax_backward(dev_y, dev_dy, dim)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.info(out)
- assert passing
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_softmin.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_softmin.py
deleted file mode 100644
index 19f77781990..00000000000
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_softmin.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-
-# SPDX-License-Identifier: Apache-2.0
-
-import torch
-
-import ttnn
-import pytest
-from models.utility_functions import comp_allclose_and_pcc
-from loguru import logger
-import torch.nn.functional as F
-from models.utility_functions import is_wormhole_b0
-
-from tests.tt_eager.python_api_testing.unit_testing.misc.test_utils import (
- get_compute_kernel_options,
- compute_kernel_options,
- compute_kernel_ids,
-)
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((32, 32), 1), # single tile
- ((3, 32, 32 * 5), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmin_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
-
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = F.softmin(x, dim)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin(dev_x, dim, compute_kernel_config=compute_kernel_config)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmin_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
-
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = F.softmin(x, dim)
- strategy = (
- ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_W
- if dim == 3
- else ttnn.experimental.operations.primary.MorehSoftmaxOpParallelizationStrategy.LARGE_H
- )
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin(
- dev_x, dim, None, strategy, compute_kernel_config=compute_kernel_config
- )
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmin_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).pad_to_tile(float("nan")).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = F.softmin(x, dim)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin(dev_x, dim, compute_kernel_config=compute_kernel_config)
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 15, 32, 32), 1), # single tile c
- ((1, 15, 32 * 7, 32 * 5), 1), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmin_for_dim_nc(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
-
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).pad_to_tile(float("7")).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_cpu = F.softmin(x, dim)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin(dev_x, dim, compute_kernel_config=compute_kernel_config)
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((32, 32), 1), # single tile
- ((3, 32, 32 * 5), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmin_backward_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = F.softmin(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin_backward(
- dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
- )
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmin_backward_large_algorithmfor_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = F.softmin(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- strategy = (
- ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_W
- if dim == 3
- else ttnn.experimental.operations.primary.MorehSoftmaxBackwardOpParallelizationStrategy.LARGE_H
- )
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin_backward(
- dev_y, dev_dy, dim, None, strategy, compute_kernel_config=compute_kernel_config
- )
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmin_backward_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = F.softmin(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).pad_to_tile(float("10")).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).pad_to_tile(float("20")).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin_backward(
- dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
- )
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (
- ((1, 15, 32, 32), 1), # single tile c
- ((1, 15, 32 * 7, 32 * 5), 1), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
- ),
-)
-@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
-def test_softmin_backward_for_dim_nc(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- compute_kernel_config = get_compute_kernel_options(compute_kernel_options)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = F.softmin(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).pad_to_tile(float("10")).to(ttnn.TILE_LAYOUT).to(device)
-
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).pad_to_tile(float("10")).to(ttnn.TILE_LAYOUT).to(device)
-
- y.backward(dy)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin_backward(
- dev_y, dev_dy, dim, compute_kernel_config=compute_kernel_config
- )
- tt_npu = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.debug(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (((32, 32), 1),), # single tile
-)
-@pytest.mark.parametrize(
- "optional_output_tensor",
- (True, False),
-)
-def test_softmin_optional_output_tensor(shape_dim, optional_output_tensor, device):
- device.enable_program_cache()
-
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
-
- # cpu calculation
- tt_cpu = F.softmin(x, dim)
-
- # npu calculation
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- if optional_output_tensor:
- dev_y = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin(dev_x, dim, dev_y)
- else:
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin(dev_x, dim)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
- logger.info(out)
- assert passing
-
-
-@pytest.mark.parametrize(
- "shape_dim",
- (((32, 32), 1),), # single tile
-)
-@pytest.mark.parametrize(
- "optional_output_tensor",
- (True, False),
-)
-def test_softmin_backward_optional_output_tensor(shape_dim, optional_output_tensor, device):
- device.enable_program_cache()
- shape, dim = shape_dim
- torch.manual_seed(0)
-
- # cpu calculation
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
-
- y = F.softmin(x, dim)
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- y.backward(dy)
-
- # npu calculation
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
-
- if optional_output_tensor:
- dev_dx = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin_backward(dev_y, dev_dy, dim, dev_dx)
- else:
- tt_npu = ttnn.experimental.operations.primary.moreh_softmin_backward(dev_y, dev_dy, dim)
-
- assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
- tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
-
- rtol = atol = 0.05
- passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
- logger.info(out)
- assert passing
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_unary_ops_ttnn.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_unary_ops_ttnn.py
index bda1f32c355..7cf8ea27cfd 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_unary_ops_ttnn.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_unary_ops_ttnn.py
@@ -463,34 +463,13 @@ def test_unary_gelu_ttnn(input_shapes, fast_and_approx, device):
(torch.Size([1, 3, 320, 384])),
),
)
-@pytest.mark.parametrize("negative_slope", [1.0, 5.0, 10.0])
+@pytest.mark.parametrize("negative_slope", [1.0, 5.0, 10.0, 0.1])
def test_unary_leaky_relu_ttnn(input_shapes, negative_slope, device):
in_data, input_tensor = data_gen_with_range(input_shapes, -10, 10, device)
_, output_tensor = data_gen_with_range(input_shapes, -1, 1, device)
cq_id = 0
- ttnn.leaky_relu(input_tensor, slope=negative_slope, output_tensor=output_tensor, queue_id=cq_id)
- golden_tensor = torch.nn.functional.leaky_relu(in_data, negative_slope)
-
- comp_pass = compare_pcc([output_tensor], [golden_tensor])
- assert comp_pass
-
-
-@pytest.mark.parametrize(
- "input_shapes",
- (
- (torch.Size([1, 1, 32, 32])),
- (torch.Size([1, 1, 320, 384])),
- (torch.Size([1, 3, 320, 384])),
- ),
-)
-@pytest.mark.parametrize("negative_slope", [1.0, 5.0, 10.0])
-def test_unary_leaky_relu_ttnn(input_shapes, negative_slope, device):
- in_data, input_tensor = data_gen_with_range(input_shapes, -10, 10, device)
- _, output_tensor = data_gen_with_range(input_shapes, -1, 1, device)
-
- cq_id = 0
- ttnn.leaky_relu(input_tensor, slope=negative_slope, output_tensor=output_tensor, queue_id=cq_id)
+ ttnn.leaky_relu(input_tensor, negative_slope=negative_slope, output_tensor=output_tensor, queue_id=cq_id)
golden_tensor = torch.nn.functional.leaky_relu(in_data, negative_slope)
comp_pass = compare_pcc([output_tensor], [golden_tensor])
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp
index 731a9d21fae..79ef28eca0f 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp
@@ -58,7 +58,7 @@ void MAIN {
int in1_index_subblock_offset = 0;
for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) {
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
if (enable_reload) {
// Reconfigure input
@@ -99,7 +99,7 @@ void MAIN {
pack_tile(i, mm_bias_intermediate_cb_id);
}
cb_push_back(mm_bias_intermediate_cb_id, out_subblock_num_tiles);
- release_dst(tt::DstMode::Half);
+ release_dst();
// Redundant wait since we know data was just pushed
cb_wait_front(mm_bias_intermediate_cb_id, out_subblock_num_tiles);
@@ -109,7 +109,7 @@ void MAIN {
unpack_reconfig_data_format(mm_bias_intermediate_cb_id, bias_cb_id);
// reconfigure packer df for out
pack_reconfig_data_format(out_cb_id);
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
for (uint32_t i = 0, j = 0; j < out_subblock_h; j++) {
uint32_t bcast_tile_idx = in1_index_subblock_offset;
for (uint32_t k = 0; k < out_subblock_w; k++, i++) {
@@ -158,7 +158,7 @@ void MAIN {
cb_push_back(mm_partials_cb_id, out_subblock_num_tiles);
}
- release_dst(tt::DstMode::Half);
+ release_dst();
in1_index_subblock_offset += out_subblock_w;
}
in0_index_subblock_offset += in0_subblock_num_tiles;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
new file mode 100644
index 00000000000..48c659c54ce
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
@@ -0,0 +1,117 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+
+#include "debug/dprint.h"
+
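+// Issues a single DRAM-sharded read by programming the NCRISC read command
+// buffer directly: the DRAM bank base address and page size are compile-time
+// template parameters, and use_vc optionally pins the request to a static
+// virtual channel (the host assigns alternating VCs per bank to spread traffic).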
+template <uint32_t bank_base_address, uint32_t page_size, bool use_vc>
+FORCE_INLINE
+void noc_async_read_tile_dram_sharded(uint32_t src_addr, uint32_t dest_addr, uint32_t bank_id = 0, const uint32_t vc = 0) {
+ uint32_t src_addr_;
+ uint32_t src_noc_xy;
+
+ src_addr_ = src_addr + bank_base_address;
+ src_addr_ += bank_to_dram_offset[bank_id];
+ src_noc_xy = dram_bank_to_noc_xy[noc_index][bank_id];
+
+ WAYPOINT("NRTW");
+ DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc_index, get_noc_addr_helper(src_noc_xy, src_addr_), dest_addr, page_size);
+ while (!noc_cmd_buf_ready(noc_index, NCRISC_RD_CMD_BUF));
+ WAYPOINT("NRTD");
+
+ if constexpr(use_vc) {
+ uint32_t noc_rd_cmd_field = NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc);
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CTRL, noc_rd_cmd_field);
+ }
+
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_RET_ADDR_LO, dest_addr);
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_LO, src_addr_); // (uint32_t)src_addr
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE, page_size); // len_bytes
+ NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
+ noc_reads_num_issued[noc_index] += 1;
+}
+
+void kernel_main() {
+ constexpr uint32_t input_addr = get_compile_time_arg_val(0);
+ constexpr uint32_t input_start_tile_id = get_compile_time_arg_val(1);
+ constexpr uint32_t num_blocks = get_compile_time_arg_val(2);
+ constexpr uint32_t num_pages = get_compile_time_arg_val(3);
+ constexpr uint32_t block_num_tiles = get_compile_time_arg_val(4);
+ constexpr uint32_t page_size = get_compile_time_arg_val(5);
+
+ constexpr uint32_t block_size_bytes = page_size * num_pages;
+
+ const uint32_t bank_id = get_arg_val<uint32_t>(0);
+ const uint32_t vc = get_arg_val<uint32_t>(1);
+
+ constexpr uint32_t cb_id = 0;
+
+ uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, bank_id, vc);
+ uint32_t src_read_addr = 0;
+
+#ifdef ARCH_GRAYSKULL
+ for (uint32_t block = 0; block < num_blocks; ++block) {
+ // Operand 1
+ cb_reserve_back(cb_id, block_num_tiles);
+ auto l1_write_addr = get_write_ptr(cb_id);
+
+ for (uint32_t h = 0; h < num_pages; ++h) {
+ noc_async_read_tile_dram_sharded_with_state(src_base_addr, src_read_addr, l1_write_addr);
+ src_read_addr += page_size;
+ l1_write_addr += page_size;
+ }
+
+ noc_async_read_barrier();
+ cb_push_back(cb_id, block_num_tiles);
+ }
+#else
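+ // Triple buffering with NoC transaction IDs: up to 3 blocks are kept in
+ // flight, each read tagged with a trid cycling through 1..3. Once only two
+ // free slots remain, wait on the oldest trid, release that block to the
+ // consumer, and reserve CB space for the next one, so new reads overlap
+ // with draining finished blocks.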
+ constexpr uint32_t total_num_blocks_in_buffer = 3;
+ constexpr uint32_t total_num_trid = 4;
+ uint32_t num_free_blocks_in_buffer = total_num_blocks_in_buffer;
+ uint32_t curr_block_trid = 1;
+ uint32_t block_trid_to_wait = 1;
+
+ cb_reserve_back(cb_id, block_num_tiles);
+ uint32_t l1_write_addr_offset = 0;
+ uint32_t l1_write_addr_start = get_write_ptr(cb_id);
+ uint32_t l1_write_addr = l1_write_addr_start;
+ for (uint32_t block = 0; block < num_blocks; ++block) {
+ noc_async_read_tile_dram_sharded_set_trid(curr_block_trid);
+
+ for (uint32_t h = 0; h < num_pages; ++h) {
+ noc_async_read_tile_dram_sharded_with_state_with_trid(
+ src_base_addr, src_read_addr, l1_write_addr, curr_block_trid);
+ src_read_addr += page_size;
+ l1_write_addr += page_size;
+ }
+
+ if (num_free_blocks_in_buffer == 2) {
+ noc_async_read_barrier_with_trid(block_trid_to_wait);
+ cb_push_back(cb_id, block_num_tiles);
+ // wait for next block trid
+ block_trid_to_wait = block_trid_to_wait == 3 ? 1 : (block_trid_to_wait + 1);
+ // reserve for next block
+ cb_reserve_back(cb_id, block_num_tiles * 2);
+ } else {
+ num_free_blocks_in_buffer -= 1;
+ }
+
+ if (curr_block_trid == total_num_blocks_in_buffer) {
+ l1_write_addr_offset = 0;
+ curr_block_trid = 1;
+ } else {
+ l1_write_addr_offset += block_size_bytes;
+ curr_block_trid += 1;
+ }
+ l1_write_addr = l1_write_addr_start + l1_write_addr_offset;
+ }
+ // last block to wait
+ noc_async_read_barrier_with_trid(block_trid_to_wait);
+ cb_push_back(cb_id, block_num_tiles);
+#endif
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/writer_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/writer_l1.cpp
new file mode 100644
index 00000000000..3184c98f187
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/writer_l1.cpp
@@ -0,0 +1,50 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+
+#include "debug/dprint.h"
+
+
+void kernel_main() {
+ constexpr uint32_t num_blocks = get_compile_time_arg_val(0);
+ constexpr uint32_t num_pages = get_compile_time_arg_val(1);
+ constexpr uint32_t block_num_tiles = get_compile_time_arg_val(2);
+ constexpr uint32_t page_size = get_compile_time_arg_val(3);
+ constexpr uint32_t noc = get_compile_time_arg_val(4);
+
+ const uint32_t vc = get_arg_val<uint32_t>(0);
+ const uint32_t noc_x = get_arg_val<uint32_t>(1);
+ const uint32_t noc_y = get_arg_val<uint32_t>(2);
+
+ constexpr uint32_t cb_id = 0;
+
+ uint32_t l1_write_addr = get_write_ptr(cb_id);
+ const uint64_t l1_noc_write_addr = get_noc_addr(noc_x, noc_y, l1_write_addr, noc);
+
+ noc_async_write_one_packet_set_state(l1_noc_write_addr, page_size, noc, vc);
+
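+ // Note: remote_l1_write_addr restarts at l1_noc_write_addr on every block,
+ // so each block overwrites the same region of the receiver core's L1; the
+ // benchmark measures write bandwidth, and validation only checks the last
+ // block.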
+ for (uint32_t block = 0; block < num_blocks; ++block) {
+
+ auto remote_l1_write_addr = l1_noc_write_addr;
+
+ cb_wait_front(cb_id, block_num_tiles);
+ auto l1_read_addr = get_read_ptr(cb_id);
+
+ for (uint32_t h = 0; h < num_pages; ++h) {
+ noc_async_write_one_packet_with_state(l1_read_addr, remote_l1_write_addr, noc);
+ l1_read_addr += page_size;
+ remote_l1_write_addr += page_size;
+ }
+
+ noc_async_write_barrier(noc);
+
+ cb_pop_front(cb_id, block_num_tiles);
+
+ }
+
+
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
new file mode 100644
index 00000000000..cf618979b32
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
@@ -0,0 +1,959 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <algorithm>
+#include <cassert>
+#include <cerrno>
+#include <chrono>
+#include <cstdlib>
+#include <cstring>
+#include <tuple>
+#include <vector>
+
+#include "common/bfloat8.hpp"
+#include "common/bfloat16.hpp"
+#include "common/tt_backend_api_types.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+#include "tt_metal/detail/util.hpp"
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp"
+#include "tt_metal/common/work_split.hpp"
+#include <yaml-cpp/yaml.h>
+
+using namespace tt;
+using std::chrono::duration_cast;
+using std::chrono::microseconds;
+
+////////////////////////////////////////////////////////////////////////////////
+// A tensix core that's next to a DRAM bank reads from the bank and writes to
+// the neighbouring receiver tensix core. It creates a bfloat16/bfloat8_b format
+// DRAM buffer of a given input size and writes it to the DRAM banks in
+// round-robin style.
+//
+// Disclaimer:
+// - This benchmark is designed to support an input size larger than 4GB. But
+// current tt-metal does not seem to support buffer allocation larger than 4GB
+// yet.
+// - Also, detail::ReadFromBuffer API used in DRAM write test may take a long time if
+// the input size is large.
+//
+// Usage example:
+// ./test_dram_read_l1_write
+// --k (K dim in elements, default 8192)
+// --n (N dim in elements, default 12*128)
+// --num-blocks (default 8)
+// --num-tests (default 1)
+// --data-type (0: bfp8_b, 1: bfloat16, default 0)
+// --num-banks (default 12)
+// --bank-start-id (default 0)
+// --use-device-profiler (set to use the device profiler for measurement)
+// --bypass-check (set to bypass checking performance criteria fulfillment)
+////////////////////////////////////////////////////////////////////////////////
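+//
+// An illustrative invocation (binary path and flag values assumed, not prescriptive):
+// ./build/test/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write --k 8192 --n 1536 --num-blocks 8 --num-banks 12 --data-type 0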
+
+
+
+template <typename T>
+std::vector<T> slice_vec(std::vector<T> const &v, int m, int n) {
+ auto first = v.cbegin() + m;
+ auto last = v.cbegin() + n + 1;
+
+ std::vector<T> vec(first, last);
+ return vec;
+}
+
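+// Chooses the largest page size that is a multiple of the tile size, at most
+// 8192 B, and divides the total block size evenly. E.g. with bfloat16 tiles
+// (2048 B) and 32 tiles per block: total = 65536 B, and the initial
+// (8192 / 2048) * 2048 = 8192 B page divides it evenly, giving num_pages = 8.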
+void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t tile_size, uint32_t& page_size, uint32_t& num_pages) {
+ uint64_t total_size = static_cast<uint64_t>(num_tiles) * tile_size;
+
+ page_size = (8192 / tile_size) * tile_size;
+ while (total_size % page_size != 0 && page_size >= tile_size) {
+ page_size -= tile_size;
+ }
+ num_pages = total_size / page_size;
+}
+
+std::tuple<tt_metal::Program, tt_metal::KernelHandle, uint32_t> create_program(
+ tt_metal::Device *device,
+ const CoreRangeSet &all_dram_reader_cores,
+ const CoreRangeSet &all_l1_receiver_cores,
+ const uint32_t &single_tile_size,
+ const tt::DataFormat &tile_format,
+ uint32_t num_tiles_cb,
+ uint32_t num_tiles_per_core,
+ uint32_t k,
+ uint32_t n,
+ uint32_t num_blocks,
+ uint32_t num_banks,
+ std::vector<CoreCoord> all_dram_reader_cores_ordered,
+ std::vector<CoreCoord> all_l1_writer_cores_ordered,
+ uint32_t bank_start_id,
+ const uint32_t &input_buffer_addr) {
+ tt_metal::Program program = tt_metal::Program();
+
+ uint32_t start_tile_id = 0;
+ uint32_t kt = k / 32;
+ uint32_t nt = n / 32;
+ uint32_t block_h = kt / num_blocks;
+ uint32_t block_w = nt / num_banks;
+ uint32_t block_num_tiles = block_h * block_w;
+
+ // DRAM reader CB
+ uint32_t reader_cb_index = 0;
+ uint32_t reader_cb_size = block_h * block_w * single_tile_size * 3;
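+ // sized for 3 blocks so the reader can keep up to 3 reads in flight (see the
+ // trid-based triple buffering in reader_dram.cpp)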
+ uint32_t page_size, num_pages;
+ get_max_page_size_and_num_pages(block_num_tiles, single_tile_size, page_size, num_pages);
+
+ uint32_t reader_cb_addr = device->get_base_allocator_addr(HalMemType::L1);
+ tt_metal::CircularBufferConfig reader_cb_config =
+ tt_metal::CircularBufferConfig(reader_cb_size, {{reader_cb_index, tile_format}})
+ .set_page_size(reader_cb_index, single_tile_size);
+ auto reader_cb = tt_metal::CreateCircularBuffer(program, all_dram_reader_cores, reader_cb_config);
+
+ std::vector<uint32_t> reader_compile_time_args = {
+ (std::uint32_t) input_buffer_addr,
+ (std::uint32_t) start_tile_id,
+ (std::uint32_t) num_blocks,
+ (std::uint32_t) num_pages,
+ (std::uint32_t) block_num_tiles,
+ (std::uint32_t) page_size,
+ (std::uint32_t) tt_metal::NOC::RISCV_0_default
+ };
+
+ auto reader_kernel = tt_metal::CreateKernel(
+ program,
+ "tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp",
+ all_dram_reader_cores,
+ tt_metal::DataMovementConfig{
+ .processor = tt_metal::DataMovementProcessor::RISCV_0,
+ .noc = tt_metal::NOC::RISCV_0_default,
+ .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
+ .compile_args = reader_compile_time_args});
+
+ std::vector<uint32_t> writer_compile_time_args = {
+ (std::uint32_t) num_blocks,
+ (std::uint32_t) num_pages,
+ (std::uint32_t) block_num_tiles,
+ (std::uint32_t) page_size,
+ (std::uint32_t) tt_metal::NOC::RISCV_0_default
+ };
+
+ auto writer_kernel = tt_metal::CreateKernel(
+ program,
+ "tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/writer_l1.cpp",
+ all_dram_reader_cores,
+ tt_metal::DataMovementConfig{
+ .processor = tt_metal::DataMovementProcessor::RISCV_1,
+ .noc = tt_metal::NOC::RISCV_1_default,
+ .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
+ .compile_args = writer_compile_time_args});
+
+ std::vector<uint32_t> bank_ids;
+ for (int i=0; i < all_dram_reader_cores_ordered.size(); i++) {
+ auto core = all_dram_reader_cores_ordered[i];
+ uint32_t bank_id = i + bank_start_id;
+ uint32_t vc = bank_id & 0x1;
+
+ bank_ids.push_back(bank_id);
+
+ for (int j = 0; j < i; ++j) {
+ auto core_ = all_dram_reader_cores_ordered[j];
+ // if an earlier reader on the same row was assigned the same vc, flip it
+ if (core_.y == core.y and ((bank_id & 0x1) == (bank_ids[j] & 0x1))) {
+ vc = (vc + 1) & 0x1;
+ break;
+ }
+ }
+
+ std::vector<uint32_t> reader_rt_args = {
+ (std::uint32_t) bank_id,
+ (std::uint32_t) vc
+ };
+
+ log_info("core: {}, vc: {}", core, vc);
+
+ tt_metal::SetRuntimeArgs(program, reader_kernel, core, reader_rt_args);
+
+ auto writer_core = all_l1_writer_cores_ordered[i];
+ auto writer_core_phy = device->worker_core_from_logical_core(writer_core);
+
+ std::vector writer_rt_args = {
+ (std::uint32_t) (vc + 2) & 0x3,
+ (std::uint32_t) writer_core_phy.x,
+ (std::uint32_t) writer_core_phy.y
+ };
+
+ tt_metal::SetRuntimeArgs(program, writer_kernel, core, writer_rt_args);
+ }
+ return {std::move(program), reader_kernel, reader_cb_addr};
+}
+
+
+bool validation(
+ tt_metal::Device *device,
+ tt_metal::Buffer &input_buffer,
+ std::vector<uint32_t> &input_vec,
+ const uint32_t &num_cores,
+ std::vector<CoreCoord> &all_cores,
+ const uint32_t &num_tiles_per_core,
+ const uint32_t &cb_addr,
+ const uint32_t &single_tile_size,
+ uint32_t num_tiles_cb,
+ uint32_t df,
+ uint32_t num_banks,
+ uint32_t num_blocks,
+ uint32_t block_h,
+ uint32_t block_w,
+ uint32_t num_datum_per_slice) {
+
+ uint32_t core_id = 0;
+ for (auto core: all_cores) {
+ std::vector<uint32_t> result_vec;
+ tt_metal::detail::ReadFromDeviceL1(
+ device, core, cb_addr, num_tiles_cb * single_tile_size, result_vec);
+
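+ // The receiver CB region is overwritten once per block, so only the final
+ // block is resident; compare it against this core's slice of the last block
+ // of the round-robin-banked input.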
+ uint32_t num_datum_per_block = block_h * block_w * num_datum_per_slice;
+ uint32_t tensor_slice_stride = core_id * num_datum_per_slice;
+ uint32_t last_block_offset = (num_blocks - 1) * num_datum_per_block * num_banks;
+ uint32_t start_index = tensor_slice_stride + last_block_offset;
+ uint32_t num_slices = block_h * block_w;
+
+ if (df == 0) {
+ auto result_bfp8 = unpack_bfp8_tiles_into_float_vec(result_vec, true, true);
+ auto input_bfp8 = unpack_bfp8_tiles_into_float_vec(input_vec, true, true);
+
+ for (uint32_t i=0; i < num_slices; ++i) {
+ uint32_t input_step = start_index + i * num_datum_per_slice * num_banks;
+ std::vector<float> input_slice(input_bfp8.begin() + input_step, input_bfp8.begin() + input_step + num_datum_per_slice);
+ uint32_t result_step = i * num_datum_per_slice;
+ std::vector<float> result_slice(result_bfp8.begin() + result_step, result_bfp8.begin() + result_step + num_datum_per_slice);
+
+ if (input_slice != result_slice) {
+ return false;
+ }
+ }
+
+ } else {
+ auto result_bf16 = unpack_uint32_vec_into_bfloat16_vec(result_vec);
+ auto input_bf16 = unpack_uint32_vec_into_bfloat16_vec(input_vec);
+
+ for (uint32_t i=0; i < num_slices; ++i) {
+ uint32_t input_step = start_index + i * num_datum_per_slice * num_banks;
+ std::vector<bfloat16> input_slice(input_bf16.begin() + input_step, input_bf16.begin() + input_step + num_datum_per_slice);
+ uint32_t result_step = i * num_datum_per_slice;
+ std::vector<bfloat16> result_slice(result_bf16.begin() + result_step, result_bf16.begin() + result_step + num_datum_per_slice);
+
+ if (input_slice != result_slice) {
+ return false;
+ }
+ }
+ }
+ core_id ++;
+ }
+ return true;
+}
+
+uint32_t get_dram_bandwidth(tt::ARCH arch) {
+ constexpr uint32_t GS_DRAM_BANDWIDTH_GB_PER_SEC = 100;
+ constexpr uint32_t WH_DRAM_BANDWIDTH_GB_PER_SEC = 384;
+
+ uint32_t dram_bandwidth_gb_per_sec = 0;
+ if (arch == tt::ARCH::WORMHOLE || arch == tt::ARCH::WORMHOLE_B0) {
+ dram_bandwidth_gb_per_sec = WH_DRAM_BANDWIDTH_GB_PER_SEC;
+ } else if (arch == tt::ARCH::GRAYSKULL) {
+ dram_bandwidth_gb_per_sec = GS_DRAM_BANDWIDTH_GB_PER_SEC;
+ }
+ return dram_bandwidth_gb_per_sec;
+}
+
+
+void get_dram_reader_core_coords_blackhole(
+ tt_metal::Device* device, CoreRangeSet& all_cores, std::vector<CoreCoord>& all_cores_ordered) {
+
+ const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id());
+ uint32_t full_grid_size_x = soc_d.grid_size.x;
+
+ // get all the logical coord
+ auto compute_with_storage_grid_size = device->compute_with_storage_grid_size();
+ uint32_t num_cores_x = compute_with_storage_grid_size.x;
+ uint32_t num_cores_y = compute_with_storage_grid_size.y;
+
+ // get dram banks and coords
+ uint32_t num_banks = device->num_dram_channels();
+ uint32_t max_bank_id = num_banks - 1;
+ std::vector<CoreCoord> dram_coord_phy;
+ for (int i = 0; i < num_banks; ++i) {
+ dram_coord_phy.push_back(device->dram_core_from_dram_channel(i));
+ }
+
+ // get worker logical coords
+ std::vector<CoreCoord> all_worker_cores_logical;
+ for (int i = 0; i < num_cores_x; ++i) {
+ for (int j = 0; j < num_cores_y; ++j) {
+ all_worker_cores_logical.push_back(CoreCoord(i, j));
+ }
+ }
+
+ // get x coords of the workers
+ std::vector<uint32_t> all_worker_cores_x_physical;
+ for (int i = 0; i < num_cores_x; ++i) {
+ auto core_phy = device->worker_core_from_logical_core(CoreCoord(i, 0));
+ all_worker_cores_x_physical.push_back(core_phy.x);
+ }
+
+ // get the harvested columns; we treat dram and eth cores as harvested as well
+ std::vector<uint32_t> harvested_cols;
+ for (int i = 0; i < full_grid_size_x; ++i) {
+ auto x = i;
+
+ if (std::find(all_worker_cores_x_physical.begin(), all_worker_cores_x_physical.end(), x) ==
+ all_worker_cores_x_physical.end()) {
+ harvested_cols.push_back(x);
+ }
+ }
+
+ // get the adjacent cores of the DRAM banks
+ std::vector<CoreCoord> adj_core_physical;
+ for (int i = 0; i < num_banks; ++i) {
+ auto dram_core = dram_coord_phy[i];
+ uint32_t adj_core_x = dram_core.x + 1;
+ uint32_t adj_core_y = dram_core.y;
+ adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
+ }
+
+ // move workers if they are in harvested columns
+ for (auto& coord : adj_core_physical) {
+ auto x = coord.x;
+
+ // if the column is harvested, move the core right by 1
+ while (std::find(harvested_cols.begin(), harvested_cols.end(), x) != harvested_cols.end() and x < (full_grid_size_x - 1)) {
+ x += 1;
+ }
+
+ coord.x = x;
+ }
+
+ // find the logical coord from physical coord
+ std::vector<CoreCoord> adj_core_logical_realloc;
+ for (int i = 0; i < adj_core_physical.size(); ++i) {
+ for (int j = 0; j < all_worker_cores_logical.size(); ++j) {
+ auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]);
+ if (adj_core_physical[i] == core) {
+ adj_core_logical_realloc.push_back(all_worker_cores_logical[j]);
+ }
+ }
+ }
+
+ // create sets
+ std::set<CoreRange> all_cores_set;
+ for (int i = 0; i < num_banks; ++i) {
+ all_cores_set.insert(CoreRange(adj_core_logical_realloc[i]));
+ }
+ all_cores = CoreRangeSet(all_cores_set);
+ all_cores_ordered = adj_core_logical_realloc;
+}
+
+
+void get_l1_writer_core_coords_blackhole(
+ tt_metal::Device* device, std::vector<CoreCoord>& all_dram_reader_cores, CoreRangeSet& all_cores, std::vector<CoreCoord>& all_cores_ordered) {
+
+ const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id());
+ uint32_t full_grid_size_x = soc_d.grid_size.x;
+
+ // get all the logical coord
+ auto compute_with_storage_grid_size = device->compute_with_storage_grid_size();
+ uint32_t num_cores_x = compute_with_storage_grid_size.x;
+ uint32_t num_cores_y = compute_with_storage_grid_size.y;
+
+ // get worker logical coords
+ std::vector<CoreCoord> all_worker_cores_logical;
+ for (int i = 0; i < num_cores_x; ++i) {
+ for (int j = 0; j < num_cores_y; ++j) {
+ all_worker_cores_logical.push_back(CoreCoord(i, j));
+ }
+ }
+
+ // get x coords of the workers
+ std::vector<uint32_t> all_worker_cores_x_physical;
+ for (int i = 0; i < num_cores_x; ++i) {
+ auto core_phy = device->worker_core_from_logical_core(CoreCoord(i, 0));
+ all_worker_cores_x_physical.push_back(core_phy.x);
+ }
+
+ // get the harvested columns; we treat dram and eth cores as harvested as well
+ std::vector<uint32_t> harvested_cols;
+ for (int i = 0; i < full_grid_size_x; ++i) {
+ auto x = i;
+
+ if (std::find(all_worker_cores_x_physical.begin(), all_worker_cores_x_physical.end(), x) ==
+ all_worker_cores_x_physical.end()) {
+ harvested_cols.push_back(x);
+ }
+ }
+
+ // get the adjacent cores of the DRAM readers; for blackhole the l1 writers sit to the right of the DRAM readers
+ std::vector<CoreCoord> adj_core_physical;
+ for (int i = 0; i < all_dram_reader_cores.size(); ++i) {
+ auto dram_reader_core = all_dram_reader_cores[i];
+ auto dram_reader_core_phy = device->worker_core_from_logical_core(dram_reader_core);
+ uint32_t adj_core_x = dram_reader_core_phy.x + 1;
+ uint32_t adj_core_y = dram_reader_core_phy.y;
+ adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
+ }
+
+ // move workers if they are in harvested columns
+ for (auto& coord : adj_core_physical) {
+ auto x = coord.x;
+
+ // if the column is harvested, move the core right by 1
+ while (std::find(harvested_cols.begin(), harvested_cols.end(), x) != harvested_cols.end() and x < (full_grid_size_x - 1)) {
+ x += 1;
+ }
+
+ coord.x = x;
+ }
+
+ // find the logical coord from physical coord
+ std::vector<CoreCoord> adj_core_logical_realloc;
+ for (int i = 0; i < adj_core_physical.size(); ++i) {
+ for (int j = 0; j < all_worker_cores_logical.size(); ++j) {
+ auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]);
+ if (adj_core_physical[i] == core) {
+ adj_core_logical_realloc.push_back(all_worker_cores_logical[j]);
+ }
+ }
+ }
+
+ // create sets
+ std::set<CoreRange> all_cores_set;
+ for (int i = 0; i < adj_core_logical_realloc.size(); ++i) {
+ all_cores_set.insert(CoreRange(adj_core_logical_realloc[i]));
+ }
+ all_cores = CoreRangeSet(all_cores_set);
+ all_cores_ordered = adj_core_logical_realloc;
+}
+
+void get_dram_reader_core_coords_grayskull(
+ tt_metal::Device* device, CoreRangeSet& all_cores, std::vector<CoreCoord>& all_cores_ordered) {
+
+ const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id());
+ uint32_t full_grid_size_y = soc_d.grid_size.y;
+
+ // get all the logical coord
+ auto compute_with_storage_grid_size = device->compute_with_storage_grid_size();
+ uint32_t num_cores_x = compute_with_storage_grid_size.x;
+ uint32_t num_cores_y = compute_with_storage_grid_size.y;
+
+ // get dram banks and coords
+ uint32_t num_banks = device->num_dram_channels();
+ uint32_t max_bank_id = num_banks - 1;
+ std::vector<CoreCoord> dram_coord_phy;
+ for (int i = 0; i < num_banks; ++i) {
+ dram_coord_phy.push_back(device->dram_core_from_dram_channel(i));
+ }
+
+ // get worker logical coords
+ std::vector<CoreCoord> all_worker_cores_logical;
+ for (int i = 0; i < num_cores_x; ++i) {
+ for (int j = 0; j < num_cores_y; ++j) {
+ all_worker_cores_logical.push_back(CoreCoord(i, j));
+ }
+ }
+
+ // get y coords of the workers
+ std::vector<uint32_t> all_worker_cores_y_physical;
+ for (int i = 0; i < num_cores_y; ++i) {
+ auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i));
+ all_worker_cores_y_physical.push_back(core_phy.y);
+ }
+
+ // get the harvested rows; we treat dram and eth cores as harvested as well
+ std::vector<uint32_t> harvested_rows;
+ for (int i = 0; i < full_grid_size_y; ++i) {
+ auto y = i;
+
+ if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) ==
+ all_worker_cores_y_physical.end()) {
+ harvested_rows.push_back(y);
+ }
+ }
+
+ // get the adjacent cores of the DRAM banks
+ std::vector<CoreCoord> adj_core_physical;
+ for (int i = 0; i < num_banks; ++i) {
+ auto dram_core = dram_coord_phy[i];
+ uint32_t adj_core_x = dram_core.x;
+ uint32_t adj_core_y = dram_core.y + 1;
+ adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
+ }
+
+ // move workers if they are in harvested rows
+ for (auto& coord : adj_core_physical) {
+ auto y = coord.y;
+
+ // if row is harvested, move core down by 1
+ while (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() and y < (full_grid_size_y - 1)) {
+ y += 1;
+ }
+
+ coord.y = y;
+ }
+
+ // find the logical coord from physical coord
+ std::vector<CoreCoord> adj_core_logical_realloc;
+ for (int i = 0; i < adj_core_physical.size(); ++i) {
+ for (int j = 0; j < all_worker_cores_logical.size(); ++j) {
+ auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]);
+ if (adj_core_physical[i] == core) {
+ adj_core_logical_realloc.push_back(all_worker_cores_logical[j]);
+ }
+ }
+ }
+
+ // create sets
+ std::set<CoreRange> all_cores_set;
+ for (int i = 0; i < num_banks; ++i) {
+ all_cores_set.insert(CoreRange(adj_core_logical_realloc[i]));
+ }
+ all_cores = CoreRangeSet(all_cores_set);
+ all_cores_ordered = adj_core_logical_realloc;
+}
+
+void get_l1_writer_core_coords_grayskull(
+ tt_metal::Device* device, std::vector<CoreCoord>& all_dram_reader_cores, CoreRangeSet& all_cores, std::vector<CoreCoord>& all_cores_ordered) {
+
+ const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id());
+ uint32_t full_grid_size_y = soc_d.grid_size.y;
+
+ // get all the logical coord
+ auto compute_with_storage_grid_size = device->compute_with_storage_grid_size();
+ uint32_t num_cores_x = compute_with_storage_grid_size.x;
+ uint32_t num_cores_y = compute_with_storage_grid_size.y;
+
+ // get worker logical coords
+ std::vector<CoreCoord> all_worker_cores_logical;
+ for (int i = 0; i < num_cores_x; ++i) {
+ for (int j = 0; j < num_cores_y; ++j) {
+ all_worker_cores_logical.push_back(CoreCoord(i, j));
+ }
+ }
+
+ // get y coords of the workers
+ std::vector<uint32_t> all_worker_cores_y_physical;
+ for (int i = 0; i < num_cores_y; ++i) {
+ auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i));
+ all_worker_cores_y_physical.push_back(core_phy.y);
+ }
+
+ // get the harvested rows; we treat dram and eth cores as harvested as well
+ std::vector<uint32_t> harvested_rows;
+ for (int i = 0; i < full_grid_size_y; ++i) {
+ auto y = i;
+
+ if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) ==
+ all_worker_cores_y_physical.end()) {
+ harvested_rows.push_back(y);
+ }
+ }
+
+ // get the adjacent cores of the DRAM readers; for grayskull the l1 writers sit below the DRAM readers
+ std::vector<CoreCoord> adj_core_physical;
+ for (int i = 0; i < all_dram_reader_cores.size(); ++i) {
+ auto dram_reader_core = all_dram_reader_cores[i];
+ auto dram_reader_core_phy = device->worker_core_from_logical_core(dram_reader_core);
+ uint32_t adj_core_x = dram_reader_core_phy.x;
+ uint32_t adj_core_y = dram_reader_core_phy.y + 1;
+ adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
+ }
+
+ // move workers if they are in harvested rows
+ for (auto& coord : adj_core_physical) {
+ auto y = coord.y;
+
+ // if row is harvested, move core down by 1
+ while (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() and y < (full_grid_size_y - 1)) {
+ y += 1;
+ }
+
+ coord.y = y;
+ }
+
+ // find the logical coord from physical coord
+ std::vector<CoreCoord> adj_core_logical_realloc;
+ for (int i = 0; i < adj_core_physical.size(); ++i) {
+ for (int j = 0; j < all_worker_cores_logical.size(); ++j) {
+ auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]);
+ if (adj_core_physical[i] == core) {
+ adj_core_logical_realloc.push_back(all_worker_cores_logical[j]);
+ }
+ }
+ }
+
+ // create sets
+ std::set<CoreRange> all_cores_set;
+ for (int i = 0; i < adj_core_logical_realloc.size(); ++i) {
+ all_cores_set.insert(CoreRange(adj_core_logical_realloc[i]));
+ }
+ all_cores = CoreRangeSet(all_cores_set);
+ all_cores_ordered = adj_core_logical_realloc;
+}
+
+void get_dram_reader_core_coords_wormhole_b0(
+ tt_metal::Device* device, CoreRangeSet& all_cores, std::vector<CoreCoord>& all_cores_ordered) {
+
+ // get all the logical coord
+ auto compute_with_storage_grid_size = device->compute_with_storage_grid_size();
+ uint32_t num_cores_x = compute_with_storage_grid_size.x;
+ uint32_t num_cores_y = compute_with_storage_grid_size.y;
+
+ // get dram banks and coords
+ uint32_t num_banks = device->num_dram_channels();
+ uint32_t max_bank_id = num_banks - 1;
+ std::vector<CoreCoord> dram_coord_phy; dram_coord_phy.reserve(num_banks);
+ for (int i = 0; i < num_banks; ++i) {
+ dram_coord_phy.push_back(device->dram_core_from_dram_channel(i));
+ }
+
+ // get worker logical coords
+ std::vector<CoreCoord> all_worker_cores_logical; all_worker_cores_logical.reserve(num_cores_x * num_cores_y);
+ for (int i = 0; i < num_cores_x; ++i) {
+ for (int j = 0; j < num_cores_y; ++j) {
+ all_worker_cores_logical.push_back(CoreCoord(i, j));
+ }
+ }
+
+ // get the adjacent cores of the DRAM banks
+ std::vector<CoreCoord> adj_core_physical; adj_core_physical.reserve(num_banks);
+ for (int i = 0; i < num_banks; ++i) {
+ auto dram_core = dram_coord_phy[i];
+ uint32_t adj_core_x = dram_core.x + 1;
+ uint32_t adj_core_y = dram_core.y;
+ adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
+ }
+
+ // find the logical coord from physical coord
+ std::vector<CoreCoord> adj_core_logical; adj_core_logical.reserve(num_banks);
+ for (int i = 0; i < adj_core_physical.size(); ++i) {
+ for (int j = 0; j < all_worker_cores_logical.size(); ++j) {
+ auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]);
+ if (adj_core_physical[i] == core) {
+ adj_core_logical.push_back(all_worker_cores_logical[j]);
+ }
+ }
+ }
+
+ // create sets
+ std::set<CoreRange> all_cores_set;
+ for (int i = 0; i < num_banks; ++i) {
+ all_cores_set.insert(CoreRange(adj_core_logical[i]));
+ }
+ all_cores = CoreRangeSet(all_cores_set);
+ all_cores_ordered = adj_core_logical;
+}
+
+
+void get_l1_writer_core_coords_wormhole_b0(
+ tt_metal::Device* device, std::vector<CoreCoord>& all_dram_reader_cores, CoreRangeSet& all_cores, std::vector<CoreCoord>& all_cores_ordered) {
+
+ // get all the logical coord
+ auto compute_with_storage_grid_size = device->compute_with_storage_grid_size();
+ uint32_t num_cores_x = compute_with_storage_grid_size.x;
+ uint32_t num_cores_y = compute_with_storage_grid_size.y;
+
+ // get worker logical coords
+ std::vector<CoreCoord> all_worker_cores_logical;
+ for (int i = 0; i < num_cores_x; ++i) {
+ for (int j = 0; j < num_cores_y; ++j) {
+ all_worker_cores_logical.push_back(CoreCoord(i, j));
+ }
+ }
+
+ // get the adjacent cores of the DRAM readers; for wormhole the l1 writers sit to the right of the DRAM readers
+ std::vector<CoreCoord> adj_core_physical;
+ for (int i = 0; i < all_dram_reader_cores.size(); ++i) {
+ auto dram_reader_core = all_dram_reader_cores[i];
+ auto dram_reader_core_phy = device->worker_core_from_logical_core(dram_reader_core);
+ uint32_t adj_core_x = dram_reader_core_phy.x + 1;
+ uint32_t adj_core_y = dram_reader_core_phy.y;
+ adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
+ }
+
+ // find the logical coord from physical coord
+ std::vector<CoreCoord> adj_core_logical_realloc;
+ for (int i = 0; i < adj_core_physical.size(); ++i) {
+ for (int j = 0; j < all_worker_cores_logical.size(); ++j) {
+ auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]);
+ if (adj_core_physical[i] == core) {
+ adj_core_logical_realloc.push_back(all_worker_cores_logical[j]);
+ }
+ }
+ }
+
+ // create sets
+ std::set<CoreRange> all_cores_set;
+ for (int i = 0; i < adj_core_logical_realloc.size(); ++i) {
+ all_cores_set.insert(CoreRange(adj_core_logical_realloc[i]));
+ }
+ all_cores = CoreRangeSet(all_cores_set);
+ all_cores_ordered = adj_core_logical_realloc;
+}
+
+int main(int argc, char **argv) {
+ if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) {
+ log_error("Test not supported w/ slow dispatch, exiting");
+ }
+
+ bool pass = true;
+ bool use_device_profiler = false;
+ bool bypass_check = false;
+ uint32_t df = 0;
+ std::vector<double> dram_bandwidth;
+ uint32_t num_tests = 1;
+ uint32_t num_blocks = 8;
+ uint64_t k = 8192, n = 128;
+ uint32_t dram_bandwidth_spec = 0;
+ uint32_t num_banks = 1;
+ uint32_t bank_start_id = 1;
+
+ log_info("start DRAM benchmark");
+
+ try {
+ ////////////////////////////////////////////////////////////////////////////
+ // Initial Runtime Args Parse
+ ////////////////////////////////////////////////////////////////////////////
+ std::vector<std::string> input_args(argv, argv + argc);
+ try {
+ std::tie(k, input_args) =
+ test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192);
+
+ std::tie(n, input_args) =
+ test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 12*128);
+
+ std::tie(num_blocks, input_args) =
+ test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-blocks", 8);
+
+ std::tie(num_tests, input_args) =
+ test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-tests", 1);
+
+ std::tie(use_device_profiler, input_args) =
+ test_args::has_command_option_and_remaining_args(input_args, "--use-device-profiler");
+
+ std::tie(bypass_check, input_args) =
+ test_args::has_command_option_and_remaining_args(input_args, "--bypass-check");
+
+ std::tie(df, input_args) =
+ test_args::get_command_option_uint32_and_remaining_args(input_args, "--data-type", 0);
+
+ std::tie(num_banks, input_args) =
+ test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-banks", 12);
+
+ std::tie(bank_start_id, input_args) =
+ test_args::get_command_option_uint32_and_remaining_args(input_args, "--bank-start-id", 0);
+
+ test_args::validate_remaining_args(input_args);
+ } catch (const std::exception &e) {
+ log_error(tt::LogTest, "Command line arguments found exception", e.what());
+ TT_ASSERT(false);
+ }
+
+ if (use_device_profiler) {
+#if !defined(TRACY_ENABLE)
+ log_error(
+ LogTest,
+ "Metal library and test code should be build with "
+ "profiler option using ./scripts/build_scripts/build_with_profiler_opt.sh");
+#endif
+ auto device_profiler = getenv("TT_METAL_DEVICE_PROFILER");
+ TT_FATAL(
+ device_profiler,
+ "Before running the program, do one of the following in a shell: "
+ "either export the environment variable by executing export TT_METAL_DEVICE_PROFILER=1, "
+ "or run the program with TT_METAL_DEVICE_PROFILER=1 prefixed to the command");
+ }
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Parameters Setup
+ ////////////////////////////////////////////////////////////////////////////
+ uint32_t input_size = 0;
+ tt::DataFormat tile_format = tt::DataFormat::Bfp8_b;
+ if (df == 0) {
+ input_size = k * n * 1088 / 1024;
+ tile_format = tt::DataFormat::Bfp8_b;
+ } else if (df == 1) {
+ input_size = k * n * 2;
+ tile_format = tt::DataFormat::Float16_b;
+ } else {
+ TT_THROW("Input data format {} is invalid. Please change.", df);
+ }
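+ // A 32x32 Bfp8_b tile occupies 1088 B (1024 mantissa bytes plus 64 B of
+ // shared exponents), hence the k * n * 1088 / 1024 sizing above; Float16_b
+ // is a flat 2 B per element.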
+ uint32_t kt = k / 32;
+ uint32_t nt = n / 32;
+ uint32_t block_h = kt / num_blocks;
+ uint32_t block_w = nt / num_banks;
+ uint32_t num_datum_per_slice = 32 * 32;
+
+ uint32_t single_tile_size = tt_metal::detail::TileSize(tile_format);
+ if (input_size % single_tile_size != 0) {
+ auto align_to_single_tile = [=](uint64_t value) -> uint64_t {
+ return ((value + (single_tile_size - 1)) / single_tile_size) * single_tile_size;
+ };
+
+ auto input_size_aligned = align_to_single_tile(input_size);
+ log_info(LogTest, "input size {} is aligned to {} bytes", input_size, input_size_aligned);
+ input_size = input_size_aligned;
+ }
+ ////////////////////////////////////////////////////////////////////////////
+ // Device Setup
+ ////////////////////////////////////////////////////////////////////////////
+ int device_id = 0;
+ tt_metal::Device *device = tt_metal::CreateDevice(device_id);
+ dram_bandwidth_spec = get_dram_bandwidth(device->arch());
+
+ TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0, "device must be wh_b0");
+
+ int clock_freq_mhz = get_tt_npu_clock(device);
+
+ uint32_t num_tiles = static_cast<uint32_t>((input_size + single_tile_size - 1) / single_tile_size);
+ uint32_t num_cores = num_banks; // number of DRAM banks
+
+ CoreRangeSet all_dram_reader_cores = CoreRangeSet{{}};
+ std::vector<CoreCoord> all_dram_reader_cores_ordered;
+ CoreRangeSet all_l1_receiver_cores = CoreRangeSet{{}};
+ std::vector<CoreCoord> all_l1_writer_cores_ordered;
+ if (device->arch() == tt::ARCH::BLACKHOLE) {
+ get_dram_reader_core_coords_blackhole(device, all_dram_reader_cores, all_dram_reader_cores_ordered);
+ get_l1_writer_core_coords_blackhole(device, all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered);
+ } else if (device->arch() == tt::ARCH::WORMHOLE_B0) {
+ get_dram_reader_core_coords_wormhole_b0(device, all_dram_reader_cores, all_dram_reader_cores_ordered);
+ get_l1_writer_core_coords_wormhole_b0(device, all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered);
+ } else {
+ get_dram_reader_core_coords_grayskull(device, all_dram_reader_cores, all_dram_reader_cores_ordered);
+ get_l1_writer_core_coords_grayskull(device, all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered);
+ }
+
+ uint32_t num_tiles_per_core = num_tiles / num_cores;
+ uint32_t num_tiles_cb = num_tiles_per_core / num_blocks;
+
+ log_info("all_dram_reader_cores");
+ for (auto core: all_dram_reader_cores_ordered) {
+ auto phys_core = device->worker_core_from_logical_core(core);
+ log_info("logical core: {}, physical core: {}", core, phys_core);
+ }
+ log_info("all_l1_writer_cores");
+ for (auto core: all_l1_writer_cores_ordered) {
+ auto phys_core = device->worker_core_from_logical_core(core);
+ log_info("logical core: {}, physical core: {}", core, phys_core);
+ }
+
+ log_info(
+ LogTest,
+ "Measuring DRAM bandwidth for input_size = {} bytes ({:.3f} MB, "
+ "{} tiles), using {} cores",
+ input_size,
+ static_cast<double>(input_size) / 1024 / 1024,
+ num_tiles,
+ num_cores);
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Input Setup
+ ////////////////////////////////////////////////////////////////////////////
+ std::vector<uint32_t> input_vec;
+ if (tile_format == tt::DataFormat::Bfp8_b) {
+ // input_vec = create_constant_vector_of_bfp8(
+ // input_size, 100, true);
+ input_vec = create_random_vector_of_bfp8(
+ input_size, true, 100, 1234);
+ } else {
+ // input_vec = create_constant_vector_of_bfloat16(
+ // input_size * total_banks / num_banks, 100);
+ input_vec = create_random_vector_of_bfloat16(
+ input_size, 100, 1234);
+ }
+
+ tt_metal::Buffer input_buffer(
+ device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Application Setup
+ ////////////////////////////////////////////////////////////////////////////
+ auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer.address());
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Copy Input To DRAM or L1
+ ////////////////////////////////////////////////////////////////////////////
+ tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Execution Application
+ ////////////////////////////////////////////////////////////////////////////
+ tt_metal::detail::CompileProgram(device, program);
+
+ log_info(LogTest, "Num tests {}", num_tests);
+ for (uint32_t i = 0; i < num_tests; ++i) {
+ auto t_begin = std::chrono::steady_clock::now();
+ EnqueueProgram(device->command_queue(), program, false);
+ Finish(device->command_queue());
+ tt_metal::DumpDeviceProfileResults(device, program);
+ auto t_end = std::chrono::steady_clock::now();
+ auto elapsed_us = duration_cast<microseconds>(t_end - t_begin).count();
+ dram_bandwidth.push_back((input_size / 1024.0 / 1024.0 / 1024.0) / (elapsed_us / 1000.0 / 1000.0));
+ log_info(
+ LogTest,
+ "Time elapsed for DRAM accesses: {:.3f}ms ({:.3f}GB/s)",
+ elapsed_us / 1000.0,
+ dram_bandwidth[i]);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Validation & Teardown
+ ////////////////////////////////////////////////////////////////////////////
+
+ pass = validation(
+ device,
+ input_buffer,
+ input_vec,
+ num_cores,
+ all_l1_writer_cores_ordered,
+ num_tiles_per_core,
+ output_cb_addr,
+ single_tile_size,
+ num_tiles_cb,
+ df,
+ num_banks,
+ num_blocks,
+ block_h,
+ block_w,
+ num_datum_per_slice);
+
+ pass &= tt_metal::CloseDevice(device);
+ } catch (const std::exception &e) {
+ pass = false;
+ // Capture the exception error message
+ log_error(LogTest, "{}", e.what());
+ // Capture system call errors that may have returned from driver/kernel
+ log_error(LogTest, "System error message: {}", std::strerror(errno));
+ }
+
+ // Determine if it passes performance goal
+ auto avg_dram_bandwidth = calculate_average(dram_bandwidth);
+ if (pass && bypass_check == false) {
+ // goal is 90% of peak DRAM bandwidth performance
+ double target_bandwidth = static_cast<double>(dram_bandwidth_spec) * 0.9;
+ if (avg_dram_bandwidth < target_bandwidth) {
+ pass = false;
+ log_error(
+ LogTest,
+ "The DRAM bandwidth does not meet the criteria. "
+ "Current: {:.3f}GB/s, goal: {:.3f}GB/s",
+ avg_dram_bandwidth,
+ target_bandwidth);
+ }
+ }
+
+ if (pass) {
+ log_info(LogTest, "Test Passed");
+ } else {
+ log_error(LogTest, "Test Failed");
+ }
+
+ return 0;
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
index c855cac5c49..94875c6114f 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
@@ -37,6 +37,7 @@ set(PERF_MICROBENCH_TESTS_SRCS
6_dram_offchip/test_dram_offchip.cpp
7_kernel_launch/test_kernel_launch.cpp
8_dram_adjacent_core_read/test_dram_read.cpp
+ 9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
)
foreach (TEST_SRC ${PERF_MICROBENCH_TESTS_SRCS})
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp
index 4b8564ea2b7..879d11a1b4c 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/common/util.hpp
@@ -25,7 +25,7 @@ inline uint64_t get_t0_to_any_riscfw_end_cycle(tt::tt_metal::Device *device, con
uint64_t min_cycle = -1;
uint64_t max_cycle = 0;
dprint_buf_msg_t *dprint_msg =
- hal.get_dev_addr<dprint_buf_msg_t *>(HalProgrammableCoreType::TENSIX, HalMemAddrType::DPRINT);
+ hal.get_dev_addr<dprint_buf_msg_t *>(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::DPRINT);
// This works for tensix only, will need to be updated for eth
vector<uint32_t> print_buffer_addrs = {
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp
index 563a77d51b0..731ac407e0f 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp
@@ -8,28 +8,41 @@ void kernel_main() {
#else
uint32_t page_size = get_arg_val<uint32_t>(0);
#endif
+
cb_reserve_back(0, PAGE_COUNT);
uint32_t cb_addr = get_write_ptr(0);
for (int i = 0; i < ITERATIONS; i++) {
uint32_t read_ptr = cb_addr;
+ uint32_t write_ptr = cb_addr;
for (int j = 0; j < PAGE_COUNT; j++) {
+
#if DRAM_BANKED
uint64_t noc_addr = get_dram_noc_addr(j, page_size, 0);
#else
uint64_t noc_addr = NOC_XY_ADDR(NOC_X(NOC_ADDR_X), NOC_Y(NOC_ADDR_Y), NOC_MEM_ADDR);
#endif
-#if READ_ONE_PACKET
+
+#if ISSUE_MCAST
+ uint64_t dst_noc_multicast_addr =
+ get_noc_multicast_addr(NOC_ADDR_X, NOC_ADDR_Y, MCAST_NOC_END_ADDR_X, MCAST_NOC_END_ADDR_Y, NOC_MEM_ADDR);
+ noc_async_write_multicast(write_ptr, dst_noc_multicast_addr, page_size, NUM_MCAST_DESTS);
+#elif READ_ONE_PACKET
noc_async_read_one_packet(noc_addr, read_ptr, page_size);
#else
noc_async_read(noc_addr, read_ptr, page_size);
#endif
+
#if LATENCY
noc_async_read_barrier();
+ noc_async_write_barrier();
#endif
+
read_ptr += page_size;
+ write_ptr += page_size;
}
}
#if !LATENCY
noc_async_read_barrier();
+ noc_async_write_barrier();
#endif
}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/run_bw_and_latency.sh b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/run_bw_and_latency.sh
new file mode 100755
index 00000000000..90a4972019b
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/run_bw_and_latency.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+if [ "$ARCH_NAME" = "grayskull" ]; then
+ echo "Configured core range for grayskull"
+ max_x=11
+ max_y=8
+elif [ "$ARCH_NAME" = "wormhole_b0" ]; then
+ echo "Configured core range for wormhole_b0"
+ max_x=7
+ max_y=6
+elif [ "$ARCH_NAME" = "blackhole" ]; then
+ echo "Configured core range for blackhole"
+ max_x=12
+ max_y=9
+else
+ echo "Unknown arch: $ARCH_NAME"
+ exit 1
+fi
+
+function get_half_way_away_core_x() {
+ half_way_away_core_x=$(( ($1 + (($max_x + 1) / 2)) % ($max_x + 1) ))
+ echo $half_way_away_core_x
+}
+
+function get_half_way_away_core_y() {
+ half_way_away_core_y=$(( ($1 + (($max_y + 1) / 2)) % ($max_y + 1) ))
+ echo $half_way_away_core_y
+}
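+
+# Example (wormhole_b0, max_x=7): core x=2 maps to (2 + 4) % 8 = 6, i.e. the
+# core half a grid-width away, wrapping at the grid edge.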
+
+function read_from_half_way_away_core() {
+ half_way_away_core_x=$(get_half_way_away_core_x $1)
+ half_way_away_core_y=$(get_half_way_away_core_y $2)
+ echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 2 -rx $1 -ry $2 -sx $half_way_away_core_x -sy $half_way_away_core_y"
+ ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 2 -rx $1 -ry $2 -sx $half_way_away_core_x -sy $half_way_away_core_y
+}
+
+function mcast_write_to_half_way_away_core() {
+ half_way_away_core_x=$(get_half_way_away_core_x $1)
+ half_way_away_core_y=$(get_half_way_away_core_y $2)
+ echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $half_way_away_core_x -sy $half_way_away_core_y -tx $half_way_away_core_x -ty $half_way_away_core_y"
+ ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $half_way_away_core_x -sy $half_way_away_core_y -tx $half_way_away_core_x -ty $half_way_away_core_y
+}
+
+function mcast_write_to_adjacent_core() {
+ adj_core_y=$(($2 + 1))
+ if [ $adj_core_y -gt $max_y ]; then
+ adj_core_y=$(($2 - 1))
+ fi
+ echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $1 -sy $adj_core_y -tx $1 -ty $adj_core_y"
+ ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $1 -sy $adj_core_y -tx $1 -ty $adj_core_y
+}
+
+function mcast_write_from_core_after_curr_core_to_half_way_away_core() {
+ half_way_away_core_x=$(get_half_way_away_core_x $1)
+ half_way_away_core_y=$(get_half_way_away_core_y $2)
+ mcast_start_y=$(($2 + 1))
+ echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $1 -sy $mcast_start_y -tx $half_way_away_core_x -ty $half_way_away_core_y"
+ ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $1 -sy $mcast_start_y -tx $half_way_away_core_x -ty $half_way_away_core_y
+}
+
+for ((x=0; x<=max_x; x++)); do
+ for ((y=0; y<=max_y; y++)); do
+ read_from_half_way_away_core $x $y
+ mcast_write_to_half_way_away_core $x $y
+ mcast_write_to_adjacent_core $x $y
+ mcast_write_from_core_after_curr_core_to_half_way_away_core $x $y
+
+ if [ $y -eq 0 ]; then
+ mcast_start_y=$(($y + 1))
+ echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $x -ry $y -sx 0 -sy $mcast_start_y -tx $max_x -ty $max_y"
+ ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $x -ry $y -sx 0 -sy $mcast_start_y -tx $max_x -ty $max_y
+ fi
+
+ if [ $y -eq $max_y ]; then
+ mcast_end_y=$(($y - 1))
+ echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $x -ry $y -sx 0 -sy 0 -tx $max_x -ty $mcast_end_y"
+ ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $x -ry $y -sx 0 -sy 0 -tx $max_x -ty $mcast_end_y
+ fi
+ done
+done
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
index e442ae3f91a..e8999324dc3 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp
@@ -3,9 +3,13 @@
// SPDX-License-Identifier: Apache-2.0
#include
+#include
#include
#include
+#include
+#include "core_coord.h"
+#include "logger.hpp"
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "tt_metal/llrt/rtoptions.hpp"
@@ -22,7 +26,7 @@ constexpr uint32_t DEFAULT_BATCH_SIZE_K = 512;
//////////////////////////////////////////////////////////////////////////////////////////
// Test dispatch program performance
//
-// Test read bw and latency from host/dram/l1
+// Test read/write bw and latency from host/dram/l1
//////////////////////////////////////////////////////////////////////////////////////////
using namespace tt;
@@ -30,6 +34,7 @@ uint32_t iterations_g = DEFAULT_ITERATIONS;
uint32_t warmup_iterations_g = DEFAULT_WARMUP_ITERATIONS;
CoreRange worker_g = {{0, 0}, {0, 0}};
CoreCoord src_worker_g = {0, 0};
+CoreRange mcast_src_workers_g = {{0, 0}, {0, 0}};
uint32_t page_size_g;
uint32_t page_count_g;
uint32_t source_mem_g;
@@ -53,15 +58,17 @@ void init(int argc, char **argv) {
log_info(LogTest, " -i: iterations (default {})", DEFAULT_ITERATIONS);
log_info(LogTest, " -bs: batch size in K of data to xfer in one iteration (default {}K)", DEFAULT_BATCH_SIZE_K);
log_info(LogTest, " -p: page size (default {})", DEFAULT_PAGE_SIZE);
- log_info(LogTest, " -m: source mem, 0:PCIe, 1:DRAM, 2:L1, 3:ALL_DRAMs, 4:HOST_READ, 5:HOST_WRITE (default 0:PCIe)");
+ log_info(LogTest, " -m: source mem, 0:PCIe, 1:DRAM, 2:L1, 3:ALL_DRAMs, 4:HOST_READ, 5:HOST_WRITE, 6:MULTICAST_WRITE (default 0:PCIe)");
log_info(LogTest, " -l: measure latency (default is bandwidth)");
- log_info(LogTest, " -rx: X of core to issue read (default {})", 1);
- log_info(LogTest, " -ry: Y of core to issue read (default {})", 0);
+ log_info(LogTest, " -rx: X of core to issue read or write (default {})", 1);
+ log_info(LogTest, " -ry: Y of core to issue read or write (default {})", 0);
+ log_info(LogTest, " -sx: when reading from L1, X of core to read from. when issuing a multicast write, X of start core to write to. (default {})", 0);
+ log_info(LogTest, " -sy: when reading from L1, Y of core to read from. when issuing a multicast write, Y of start core to write to. (default {})", 0);
+ log_info(LogTest, " -tx: when issuing a multicast write, X of end core to write to (default {})", 0);
+ log_info(LogTest, " -ty: when issuing a multicast write, Y of end core to write to (default {})", 0);
log_info(LogTest, " -c: when reading from dram, DRAM channel (default 0)");
- log_info(LogTest, " -sx: when reading from L1, X of core to read from (default {})", 0);
- log_info(LogTest, " -sy: when reading from L1, Y of core to read (default {})", 0);
log_info(LogTest, " -f: time just the finish call (use w/ lazy mode) (default disabled)");
- log_info(LogTest, " -o: use read_one_packet API. restrices page size to 8K max (default {})", 0);
+ log_info(LogTest, " -o: use read_one_packet API. restricts page size to 8K max (default {})", 0);
log_info(LogTest, " -z: enable dispatch lazy mode (default disabled)");
log_info(LogTest, " -hr: hammer write_reg while executing (for PCIe test)");
log_info(LogTest, " -hp: hammer hugepage PCIe memory while executing (for PCIe test)");
@@ -80,9 +87,11 @@ void init(int argc, char **argv) {
hammer_pcie_type_g = test_args::get_command_option_uint32(input_args, "-hpt", 0);
time_just_finish_g = test_args::has_command_option(input_args, "-f");
source_mem_g = test_args::get_command_option_uint32(input_args, "-m", 0);
- dram_channel_g = test_args::get_command_option_uint32(input_args, "-c", 0);
uint32_t src_core_x = test_args::get_command_option_uint32(input_args, "-sx", 0);
uint32_t src_core_y = test_args::get_command_option_uint32(input_args, "-sy", 0);
+ uint32_t mcast_end_core_x = test_args::get_command_option_uint32(input_args, "-tx", 0);
+ uint32_t mcast_end_core_y = test_args::get_command_option_uint32(input_args, "-ty", 0);
+ dram_channel_g = test_args::get_command_option_uint32(input_args, "-c", 0);
uint32_t size_bytes = test_args::get_command_option_uint32(input_args, "-bs", DEFAULT_BATCH_SIZE_K) * 1024;
latency_g = test_args::has_command_option(input_args, "-l");
page_size_g = test_args::get_command_option_uint32(input_args, "-p", DEFAULT_PAGE_SIZE);
@@ -96,6 +105,25 @@ void init(int argc, char **argv) {
worker_g = CoreRange({core_x, core_y}, {core_x, core_y});
src_worker_g = {src_core_x, src_core_y};
+
+ if (source_mem_g == 6)
+ {
+ if (mcast_end_core_x < src_core_x || mcast_end_core_y < src_core_y)
+ {
+ log_info(LogTest, "X of end core must be >= X of start core, Y of end core must be >= Y of start core");
+ exit(-1);
+ }
+
+ mcast_src_workers_g = CoreRange({src_core_x, src_core_y}, {mcast_end_core_x, mcast_end_core_y});
+
+ if (mcast_src_workers_g.intersects(worker_g)) {
+ log_info(
+ LogTest,
+ "Multicast destination rectangle and core that issues the multicast cannot overlap - Multicast "
+ "destination rectangle: {} Master core: {}", mcast_src_workers_g.str(), worker_g.start_coord.str());
+ exit(-1);
+ }
+ }
}
#define CACHE_LINE_SIZE 64
@@ -136,6 +164,10 @@ int main(int argc, char **argv) {
uint32_t noc_addr_x, noc_addr_y;
uint64_t noc_mem_addr = 0;
uint32_t dram_banked = 0;
+ uint32_t issue_mcast = 0;
+ uint32_t num_mcast_dests = mcast_src_workers_g.size();
+ uint32_t mcast_noc_addr_end_x = 0;
+ uint32_t mcast_noc_addr_end_y = 0;
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id());
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
@@ -202,6 +234,18 @@ int main(int argc, char **argv) {
noc_addr_y = w.y;
}
break;
+ case 6:
+ {
+ src_mem = "FROM_L1_TO_MCAST";
+ issue_mcast = 1;
+ CoreCoord start = device->physical_core_from_logical_core(mcast_src_workers_g.start_coord, CoreType::WORKER);
+ CoreCoord end = device->physical_core_from_logical_core(mcast_src_workers_g.end_coord, CoreType::WORKER);
+ noc_addr_x = start.x;
+ noc_addr_y = start.y;
+ mcast_noc_addr_end_x = end.x;
+ mcast_noc_addr_end_y = end.y;
+ }
+ break;
}
std::map<string, string> defines = {
@@ -212,7 +256,11 @@ int main(int argc, char **argv) {
{"NOC_ADDR_Y", std::to_string(noc_addr_y)},
{"NOC_MEM_ADDR", std::to_string(noc_mem_addr)},
{"READ_ONE_PACKET", std::to_string(read_one_packet_g)},
- {"DRAM_BANKED", std::to_string(dram_banked)}
+ {"DRAM_BANKED", std::to_string(dram_banked)},
+ {"ISSUE_MCAST", std::to_string(issue_mcast)},
+ {"NUM_MCAST_DESTS", std::to_string(num_mcast_dests)},
+ {"MCAST_NOC_END_ADDR_X", std::to_string(mcast_noc_addr_end_x)},
+ {"MCAST_NOC_END_ADDR_Y", std::to_string(mcast_noc_addr_end_y)}
};
if (!page_size_as_runtime_arg_g) {
defines.insert(pair("PAGE_SIZE", std::to_string(page_size_g)));
@@ -243,11 +291,23 @@ int main(int argc, char **argv) {
log_info(LogTest, "Reading: {} - core ({}, {})", src_mem, w.x, w.y);
} else if (source_mem_g == 5) {
log_info(LogTest, "Writing: {} - core ({}, {})", src_mem, w.x, w.y);
+ } else if (source_mem_g == 6) {
+ log_info(LogTest, "Writing: {} - core grid [({}, {}) - ({}, {})]", src_mem, noc_addr_x, noc_addr_y, mcast_noc_addr_end_x, mcast_noc_addr_end_y);
} else {
log_info(LogTest, "Reading: {} - core ({}, {})", src_mem, noc_addr_x, noc_addr_y);
}
- if (source_mem_g != 4) {
- log_info(LogTest, "Using API: {}", read_one_packet_g ? "noc_async_read_one_packet" : "noc_async_read");
+ if (source_mem_g < 4 || source_mem_g == 6) {
+ std::string api;
+ if (issue_mcast) {
+ api = "noc_async_write_multicast";
+ }
+ else if (read_one_packet_g) {
+ api = "noc_async_read_one_packet";
+ }
+ else {
+ api = "noc_async_read";
+ }
+ log_info(LogTest, "Using API: {}", api);
log_info(LogTest, "Lazy: {}", lazy_g);
log_info(LogTest, "Page size ({}): {}", page_size_as_runtime_arg_g ? "runtime arg" : "compile time define", page_size_g);
log_info(LogTest, "Size per iteration: {}", page_count_g * page_size_g);
@@ -259,7 +319,7 @@ int main(int argc, char **argv) {
vector<uint32_t> blank(page_size_g / sizeof(uint32_t));
std::chrono::duration<double> elapsed_seconds;
- if (source_mem_g < 4) {
+ if (source_mem_g < 4 || source_mem_g == 6) {
// Cache stuff
for (int i = 0; i < warmup_iterations_g; i++) {
EnqueueProgram(cq, program, false);
@@ -313,7 +373,7 @@ int main(int argc, char **argv) {
Finish(cq);
auto end = std::chrono::system_clock::now();
elapsed_seconds = (end-start);
- } else {
+ } else if (source_mem_g == 4 || source_mem_g == 5) {
vector<uint32_t> vec;
vec.resize(page_size_g / sizeof(uint32_t));
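The -m 6 path above only plumbs the multicast grid into compile-time defines; the kernel side is not part of this hunk. As a rough sketch of how a kernel might consume those defines, assuming the standard dataflow_api.h helpers get_noc_multicast_addr() and noc_async_write_multicast() (the local source address src_l1_addr is a placeholder, not taken from this diff):

#if ISSUE_MCAST
uint32_t src_l1_addr = 0x10000;  // placeholder L1 source buffer
// Span the destination rectangle configured by the host via the defines above.
uint64_t mcast_dst_noc_addr = get_noc_multicast_addr(
    NOC_ADDR_X, NOC_ADDR_Y,                      // grid start (physical coords)
    MCAST_NOC_END_ADDR_X, MCAST_NOC_END_ADDR_Y,  // grid end (physical coords)
    NOC_MEM_ADDR);                               // L1 offset on each destination
// One write replicated to every core in the rectangle; the issuing core sits
// outside the rectangle, which is what the host-side overlap check enforces.
noc_async_write_multicast(src_l1_addr, mcast_dst_noc_addr, PAGE_SIZE, NUM_MCAST_DESTS);
noc_async_write_barrier();
#endif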
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp
index 47657c7059d..6722056c2bf 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp
@@ -58,7 +58,7 @@ void MAIN {
int in1_index_subblock_offset = 0;
for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) {
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
if (enable_reload) {
// Reconfigure input
@@ -98,7 +98,7 @@ void MAIN {
pack_tile(i, mm_bias_intermediate_cb_id);
}
cb_push_back(mm_bias_intermediate_cb_id, out_subblock_num_tiles);
- release_dst(tt::DstMode::Half);
+ release_dst();
// Redundant wait since we know data was just pushed
cb_wait_front(mm_bias_intermediate_cb_id, out_subblock_num_tiles);
@@ -108,7 +108,7 @@ void MAIN {
unpack_reconfig_data_format(mm_bias_intermediate_cb_id, bias_cb_id);
// reconfigure packer df for out
pack_reconfig_data_format(out_cb_id);
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
for (uint32_t i = 0, j = 0; j < out_subblock_h; j++) {
uint32_t bcast_tile_idx = in1_index_subblock_offset;
for (uint32_t k = 0; k < out_subblock_w; k++, i++) {
@@ -150,7 +150,7 @@ void MAIN {
cb_push_back(mm_partials_cb_id, out_subblock_num_tiles);
}
- release_dst(tt::DstMode::Half);
+ release_dst();
in1_index_subblock_offset += out_subblock_w;
}
in0_index_subblock_offset += in0_subblock_num_tiles;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp
index 56bb8f17a4f..60d3136267d 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp
@@ -16,14 +16,14 @@ void MAIN {
for (uint32_t mt = 0; mt < sub_Mt; ++mt) {
for (uint32_t nt = 0; nt < sub_Nt; ++nt) {
- acquire_dst(tt::DstMode::Full);
+ acquire_dst();
for (uint32_t kt = 0; kt < Kt; ++kt) {
matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, mt * Kt + kt, nt * Kt + kt, 0, false);
}
cb_reserve_back(tt::CB::c_out0, onetile);
pack_tile(0, tt::CB::c_out0);
cb_push_back(tt::CB::c_out0, onetile);
- release_dst(tt::DstMode::Full);
+ release_dst();
}
}
}
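Every kernel hunk in this diff applies the same mechanical change: acquire_dst() and release_dst() lose their tt::DstMode argument, with the half/full DST synchronization choice moving to the kernel's ComputeConfig (the dst_full_sync_en field exercised by the unit tests later in this diff). A minimal sketch of the resulting kernel shape, using a hypothetical single-tile copy loop with the standard CB helpers:

namespace NAMESPACE {
void MAIN {
    constexpr uint32_t onetile = 1;
    uint32_t num_tiles = get_compile_time_arg_val(0);
    unary_op_init_common(tt::CB::c_in0);
    for (uint32_t t = 0; t < num_tiles; ++t) {
        cb_wait_front(tt::CB::c_in0, onetile);
        cb_reserve_back(tt::CB::c_out0, onetile);
        acquire_dst();                   // was: acquire_dst(tt::DstMode::Half);
        copy_tile(tt::CB::c_in0, 0, 0);  // stage the tile in DST
        pack_tile(0, tt::CB::c_out0);
        release_dst();                   // was: release_dst(tt::DstMode::Half);
        cb_pop_front(tt::CB::c_in0, onetile);
        cb_push_back(tt::CB::c_out0, onetile);
    }
}
} // namespace NAMESPACE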
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp
index 747765489ac..1220f3e935d 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp
@@ -24,7 +24,7 @@ void MAIN {
cb_reserve_back(tt::CB::c_out0, onetile);
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
cb_wait_front(tt::CB::c_in0, onetile);
@@ -33,7 +33,7 @@ void MAIN {
cb_pop_front(tt::CB::c_in0, onetile);
- release_dst(tt::DstMode::Half);
+ release_dst();
cb_push_back(tt::CB::c_out0, onetile);
cb_pop_front(tt::CB::c_in1, onetile);
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp
index 230ee8b9c36..499afa82fad 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp
@@ -26,7 +26,7 @@ void MAIN {
#endif
cb_reserve_back(tt::CB::c_out0, onetile);
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
cb_wait_front(tt::CB::c_in0, onetile);
@@ -37,7 +37,7 @@ void MAIN {
#ifndef BCAST_SCALAR
cb_pop_front(tt::CB::c_in1, onetile);
#endif
- release_dst(tt::DstMode::Half);
+ release_dst();
cb_push_back(tt::CB::c_out0, onetile);
} } }
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp
index 0de0e2f82c0..ec6f71c0023 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp
@@ -23,14 +23,14 @@ void MAIN {
cb_reserve_back(tt::CB::c_out0, onetile);
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
cb_wait_front(tt::CB::c_in0, onetile);
BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0);
pack_tile(0, tt::CB::c_out0);
cb_pop_front(tt::CB::c_in0, onetile);
- release_dst(tt::DstMode::Half);
+ release_dst();
cb_push_back(tt::CB::c_out0, onetile);
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp
index 6e42eb29d49..d62a8e06e98 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp
@@ -31,7 +31,7 @@ void MAIN {
for (uint32_t mt_C = 0; mt_C < Mt; ++mt_C) // output tile of C
for (uint32_t nt_C = 0; nt_C < Nt; ++nt_C) // output tile index of C
{
- acquire_dst(tt::DstMode::Full);
+ acquire_dst();
for (uint32_t kt = 0; kt < Kt; kt++) {
cb_wait_front(tt::CB::c_in0, onetile);
cb_wait_front(tt::CB::c_in1, onetile);
@@ -46,7 +46,7 @@ void MAIN {
pack_tile(0, tt::CB::c_out0);
cb_push_back(tt::CB::c_out0, onetile);
- release_dst(tt::DstMode::Full);
+ release_dst();
}
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp
index ec293c8c7bb..2ab808f2f32 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp
@@ -41,7 +41,7 @@ void MAIN {
int in1_index_subblock_offset = 0;
for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) {
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
if (enable_reload) {
copy_tile_to_dst_init_short();
@@ -91,7 +91,7 @@ void MAIN {
cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles);
}
- release_dst(tt::DstMode::Half);
+ release_dst();
in1_index_subblock_offset += out_subblock_w;
}
in0_index_subblock_offset += in0_subblock_num_tiles;
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
index 47657c7059d..6722056c2bf 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp
@@ -58,7 +58,7 @@ void MAIN {
int in1_index_subblock_offset = 0;
for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) {
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
if (enable_reload) {
// Reconfigure input
@@ -98,7 +98,7 @@ void MAIN {
pack_tile(i, mm_bias_intermediate_cb_id);
}
cb_push_back(mm_bias_intermediate_cb_id, out_subblock_num_tiles);
- release_dst(tt::DstMode::Half);
+ release_dst();
// Redundant wait since we know data was just pushed
cb_wait_front(mm_bias_intermediate_cb_id, out_subblock_num_tiles);
@@ -108,7 +108,7 @@ void MAIN {
unpack_reconfig_data_format(mm_bias_intermediate_cb_id, bias_cb_id);
// reconfigure packer df for out
pack_reconfig_data_format(out_cb_id);
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
for (uint32_t i = 0, j = 0; j < out_subblock_h; j++) {
uint32_t bcast_tile_idx = in1_index_subblock_offset;
for (uint32_t k = 0; k < out_subblock_w; k++, i++) {
@@ -150,7 +150,7 @@ void MAIN {
cb_push_back(mm_partials_cb_id, out_subblock_num_tiles);
}
- release_dst(tt::DstMode::Half);
+ release_dst();
in1_index_subblock_offset += out_subblock_w;
}
in0_index_subblock_offset += in0_subblock_num_tiles;
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp
index d2b042f7238..632fc69018f 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp
@@ -46,7 +46,7 @@ void MAIN {
int in1_index_subblock_offset = 0;
for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) {
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
if (enable_reload) {
// Reconfigure input
@@ -98,7 +98,7 @@ void MAIN {
cb_push_back(mm_partials_cb_id, out_subblock_num_tiles);
}
- release_dst(tt::DstMode::Half);
+ release_dst();
in1_index_subblock_offset += out_subblock_w;
}
in0_index_subblock_offset += in0_subblock_num_tiles;
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp
index b12c847b1df..3c972131d6b 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp
@@ -71,10 +71,10 @@ inline void tilize_in(
for (uint32_t n = 0; n < num_out_subblocks_in_col; n++) {
for (uint32_t w = 0; w < out_subblock_w; w++) {
uint32_t tile_index = block_offset + within_block_index + w;
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
copy_tile(interm_cb_id, tile_index, 0);
pack_tile(0, reblock_cb_id);
- release_dst(tt::DstMode::Half);
+ release_dst();
}
block_offset += out_subblock_num_tiles;
}
@@ -165,7 +165,7 @@ void MAIN {
for (uint32_t in0_subblock_i = 0; in0_subblock_i < in0_num_subblocks; ++in0_subblock_i) {
int in1_index_subblock_offset = 0;
for (uint32_t in1_subblock_i = 0; in1_subblock_i < in1_num_subblocks; ++in1_subblock_i) {
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
if (enable_reload) {
// Reconfigure input
copy_tile_to_dst_init_short_with_dt(in1_cb_id, matmul_partials_cb);
@@ -201,7 +201,7 @@ void MAIN {
if (last_out) {
// first move the current result from dst to interim CB
pack_matmul_subblock(out_for_bias_cb_id, out_subblock_num_tiles);
- release_dst(tt::DstMode::Half);
+ release_dst();
// reconfig unpacker df for src B
// unpack_reconfig_data_format(out_for_bias_cb_id, bias_cb_id);
// bcast add data from bias_cb_id
@@ -210,7 +210,7 @@ void MAIN {
add_bcast_rows_init_short();
// reconfig packer df for out
// pack_reconfig_data_format(out_cb_id);
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
uint32_t i = 0;
for (uint32_t h = 0; h < out_subblock_h; ++ h) {
uint32_t bcast_tile_i = bias_block_offset + in1_index_subblock_offset;
@@ -244,7 +244,7 @@ void MAIN {
: out_cb_id)
: matmul_partials_cb;
pack_matmul_subblock(curr_matmul_out_cb, out_subblock_num_tiles);
- release_dst(tt::DstMode::Half);
+ release_dst();
in1_index_subblock_offset += out_subblock_w;
} // for in1_num_subblocks
#ifndef FUSE_BIAS
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/broadcast.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/broadcast.cpp
index cf60e0652a1..267be6ebc2e 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/broadcast.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/broadcast.cpp
@@ -19,7 +19,7 @@ void MAIN {
cb_wait_front(tt::CB::c_in1, onetile);
cb_reserve_back(tt::CB::c_out0, onetile);
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
cb_wait_front(tt::CB::c_in0, onetile);
#ifndef BCAST_SPECIFIC
@@ -30,7 +30,7 @@ void MAIN {
pack_tile(0, tt::CB::c_out0);
cb_pop_front(tt::CB::c_in0, onetile);
- release_dst(tt::DstMode::Half);
+ release_dst();
cb_push_back(tt::CB::c_out0, onetile);
cb_pop_front(tt::CB::c_in1, onetile);
}
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/cumsum.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/cumsum.cpp
index 5cd4b60b14f..c72464280c5 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/cumsum.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/cumsum.cpp
@@ -30,7 +30,7 @@ void MAIN {
for(uint32_t wt = 0; wt < Wt; ++wt) {
for(uint32_t ht = 0; ht < Ht; ++ht) {
cb_reserve_back(tt::CB::c_out0, onetile);
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
cb_wait_front(tt::CB::c_in0, onetile);
#ifndef ROWWISE
@@ -48,7 +48,7 @@ void MAIN {
pack_tile(0, tt::CB::c_out0);
cb_pop_front(tt::CB::c_in0, onetile);
- release_dst(tt::DstMode::Half);
+ release_dst();
cb_push_back(tt::CB::c_out0, onetile);
}
}
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/dropout_sfpu.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/dropout_sfpu.cpp
index a55ce9ba155..5f43fc0b346 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/dropout_sfpu.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/dropout_sfpu.cpp
@@ -21,7 +21,7 @@ void MAIN {
for (uint32_t block_index = 0; block_index < per_core_block_cnt; block_index++) {
cb_reserve_back(tt::CB::c_out0, per_core_block_dim);
for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) {
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
// Pop tile after tile, copy to DST and pack
cb_wait_front(tt::CB::c_in0, 1);
@@ -34,7 +34,7 @@ void MAIN {
cb_pop_front(tt::CB::c_in0, 1);
- release_dst(tt::DstMode::Half);
+ release_dst();
}
cb_push_back(tt::CB::c_out0, per_core_block_dim);
}
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp
index 1e7c029d9a3..41e494d29b8 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp
@@ -15,7 +15,7 @@ void MAIN {
unary_op_init_common(tt::CB::c_in0);
for(uint32_t b=0;b<per_core_tile_cnt;++b) {
copy_tile_to_dst_init_short();
cb_wait_front(partials_cb, out_block_num_tiles);
@@ -68,7 +68,7 @@ void MAIN {
cb_push_back(partials_cb, out_block_num_tiles);
}
}
- release_dst(tt::DstMode::Half);
+ release_dst();
}
}
} // namespace NAMESPACE
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp
index 6190640c8bd..8c50ca5a30e 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp
@@ -23,7 +23,7 @@ void MAIN {
// we are looking at block
// out = in0[r x k]*in1[k x c]
mm_init();
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
uint32_t out_tile_index = 0;
uint32_t in0_index_r_offset = 0;
@@ -50,6 +50,6 @@ void MAIN {
pack_tile(tile_index, out_cb);
}
cb_push_back(out_cb, out_num_tiles);
- release_dst(tt::DstMode::Half);
+ release_dst();
}
} // namespace NAMESPACE
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp
index 8792a0af75e..cb8eb194d98 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp
@@ -21,14 +21,14 @@ void MAIN {
const bool transpose = false;
mm_init();
cb_reserve_back(out_cb, num_out_tiles);
- acquire_dst(tt::DstMode::Half);
+ acquire_dst();
cb_wait_front(in0_cb, num_in0_tiles);
cb_wait_front(in1_cb, num_in1_tiles);
matmul_tiles(in0_cb, in1_cb, in0_tile_index, in1_tile_index, out_tile_index, transpose);
pack_tile(0, out_cb);
cb_pop_front(in0_cb, num_in0_tiles);
cb_pop_front(in1_cb, num_in1_tiles);
- release_dst(tt::DstMode::Half);
+ release_dst();
cb_push_back(out_cb, num_out_tiles);
}
} // namespace NAMESPACE
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unpack_tilizeA_B.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unpack_tilizeA_B.cpp
index 6a49900ac5c..dc56a879feb 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/unpack_tilizeA_B.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/unpack_tilizeA_B.cpp
@@ -41,11 +41,11 @@ void MAIN {
unpack_tilizeA_B_block(tt::CB::c_in0, tt::CB::c_in1, per_core_block_tile_cnt, b);
for(uint i=0; i<per_core_block_tile_cnt; ++i) {
- unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
+ .single_tile_size = 2 * 1024,
+ .num_tiles = 8,
+ .reader_ublock = 8,
+ .writer_ublock = 8,
+ .compute_ublock = 8,
+ .src0_cb_index = 0,
+ .ouput_cb_index = 16,
+ .dst_full_sync_en = dst_full_sync_en
+ };
+ unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
+ }
}
TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR8W8C1) {
- unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
- .single_tile_size = 2 * 1024,
- .num_tiles = 8,
- .reader_ublock = 8,
- .writer_ublock = 8,
- .compute_ublock = 1,
- .src0_cb_index = 0,
- .ouput_cb_index = 16
- };
- unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
+ .single_tile_size = 2 * 1024,
+ .num_tiles = 8,
+ .reader_ublock = 8,
+ .writer_ublock = 8,
+ .compute_ublock = 1,
+ .src0_cb_index = 0,
+ .ouput_cb_index = 16,
+ .dst_full_sync_en = dst_full_sync_en
+ };
+ unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
+ }
}
TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR8W1C1) {
- unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
- .single_tile_size = 2 * 1024,
- .num_tiles = 8,
- .reader_ublock = 8,
- .writer_ublock = 1,
- .compute_ublock = 1,
- .src0_cb_index = 0,
- .ouput_cb_index = 16
- };
- unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
+ .single_tile_size = 2 * 1024,
+ .num_tiles = 8,
+ .reader_ublock = 8,
+ .writer_ublock = 1,
+ .compute_ublock = 1,
+ .src0_cb_index = 0,
+ .ouput_cb_index = 16,
+ .dst_full_sync_en = dst_full_sync_en
+ };
+ unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
+ }
}
TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR1W1C1) {
- unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
- .single_tile_size = 2 * 1024,
- .num_tiles = 1,
- .reader_ublock = 1,
- .writer_ublock = 1,
- .compute_ublock = 1,
- .src0_cb_index = 0,
- .ouput_cb_index = 16
- };
- unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
+ .single_tile_size = 2 * 1024,
+ .num_tiles = 1,
+ .reader_ublock = 1,
+ .writer_ublock = 1,
+ .compute_ublock = 1,
+ .src0_cb_index = 0,
+ .ouput_cb_index = 16,
+ .dst_full_sync_en = dst_full_sync_en
+ };
+ unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
+ }
}
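The four tests above keep their configs and runner untouched; the only structural change is wrapping each body in a sweep over both DST sync modes, so every case now runs once with full DST synchronization and once with the default half/double-buffered mode. Reduced to its shape, with stand-in names rather than the real harness:

#include <cstdio>
#include <initializer_list>

// Hypothetical stand-in for the real test configs in this file.
struct ExampleConfig {
    unsigned num_tiles = 1;
    bool dst_full_sync_en = false;  // the new field threaded through this PR
};

// Stand-in for run_single_core_copy_block_matmul_partials().
static void run_example(const ExampleConfig& cfg) {
    std::printf("num_tiles=%u dst_full_sync_en=%d\n", cfg.num_tiles, cfg.dst_full_sync_en);
}

int main() {
    // Every pre-existing case now runs twice: once per DST sync mode.
    for (bool dst_full_sync_en : {true, false}) {
        ExampleConfig cfg = {
            .num_tiles = 8,
            .dst_full_sync_en = dst_full_sync_en,
        };
        run_example(cfg);
    }
    return 0;
}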
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp
index ecaf175f9a9..9d48d09ccaa 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp
@@ -17,6 +17,7 @@ struct ReconfigConfig {
bool explicit_reconfig = false;
bool split_src_reconfig = false;
bool l1_acc = false;
+ bool dst_full_sync_en = false;
};
/// @brief Does Dramx3 --> Reader --> CB --> Add with acc --> CB --> Writer --> Dram
@@ -142,7 +143,8 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c
program,
"tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp",
core,
- tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = defines});
+ tt_metal::ComputeConfig{.dst_full_sync_en = test_config.dst_full_sync_en,
+ .compile_args = compute_kernel_args, .defines = defines});
SetRuntimeArgs(
program,
@@ -275,14 +277,18 @@ TEST_F(DeviceFixture, TileCopyReconfigExplicitSplit) {
if (arch == tt::ARCH::GRAYSKULL) {
GTEST_SKIP();
}
- unit_tests::compute::reconfig::ReconfigConfig test_config = {
- .num_tiles = 1,
- .ublock_size_tiles = 1,
- .explicit_reconfig = true,
- .split_src_reconfig = true
- };
- for (unsigned int id = 0; id < num_devices_; id++) {
- ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::reconfig::ReconfigConfig test_config = {
+ .num_tiles = 1,
+ .ublock_size_tiles = 1,
+ .explicit_reconfig = true,
+ .split_src_reconfig = true,
+ .dst_full_sync_en = dst_full_sync_en
+ };
+ for (unsigned int id = 0; id < num_devices_; id++) {
+ ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+ }
}
}
@@ -291,14 +297,18 @@ TEST_F(DeviceFixture, TileCopyReconfigExplicitJoined) {
if (arch == tt::ARCH::GRAYSKULL) {
GTEST_SKIP();
}
- unit_tests::compute::reconfig::ReconfigConfig test_config = {
- .num_tiles = 1,
- .ublock_size_tiles = 1,
- .explicit_reconfig = true,
- .split_src_reconfig = false
- };
- for (unsigned int id = 0; id < num_devices_; id++) {
- ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::reconfig::ReconfigConfig test_config = {
+ .num_tiles = 1,
+ .ublock_size_tiles = 1,
+ .explicit_reconfig = true,
+ .split_src_reconfig = false,
+ .dst_full_sync_en = dst_full_sync_en
+ };
+ for (unsigned int id = 0; id < num_devices_; id++) {
+ ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+ }
}
}
@@ -307,14 +317,18 @@ TEST_F(DeviceFixture, TileCopyReconfigImplicitSplit) {
if (arch == tt::ARCH::GRAYSKULL) {
GTEST_SKIP();
}
- unit_tests::compute::reconfig::ReconfigConfig test_config = {
- .num_tiles = 1,
- .ublock_size_tiles = 1,
- .explicit_reconfig = false,
- .split_src_reconfig = true
- };
- for (unsigned int id = 0; id < num_devices_; id++) {
- ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::reconfig::ReconfigConfig test_config = {
+ .num_tiles = 1,
+ .ublock_size_tiles = 1,
+ .explicit_reconfig = false,
+ .split_src_reconfig = true,
+ .dst_full_sync_en = dst_full_sync_en
+ };
+ for (unsigned int id = 0; id < num_devices_; id++) {
+ ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+ }
}
}
@@ -323,14 +337,18 @@ TEST_F(DeviceFixture, TileCopyReconfigImplicitJoined) {
if (arch == tt::ARCH::GRAYSKULL) {
GTEST_SKIP();
}
- unit_tests::compute::reconfig::ReconfigConfig test_config = {
- .num_tiles = 1,
- .ublock_size_tiles = 1,
- .explicit_reconfig = false,
- .split_src_reconfig = false
- };
- for (unsigned int id = 0; id < num_devices_; id++) {
- ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::reconfig::ReconfigConfig test_config = {
+ .num_tiles = 1,
+ .ublock_size_tiles = 1,
+ .explicit_reconfig = false,
+ .split_src_reconfig = false,
+ .dst_full_sync_en = dst_full_sync_en
+ };
+ for (unsigned int id = 0; id < num_devices_; id++) {
+ ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+ }
}
}
@@ -339,16 +357,20 @@ TEST_F(DeviceFixture, TileCopyReconfigL1Acc) {
if (arch == tt::ARCH::GRAYSKULL) {
GTEST_SKIP();
}
- unit_tests::compute::reconfig::ReconfigConfig test_config = {
- .num_tiles = 1,
- .ublock_size_tiles = 1,
- };
- for (unsigned int id = 0; id < num_devices_; id++) {
- test_config.l1_acc = false;
- ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
- log_info(LogTest, "Passed without L1 accumulation");
- test_config.l1_acc = true;
- ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
- log_info(LogTest, "Passed with L1 accumulation");
+
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::reconfig::ReconfigConfig test_config = {
+ .num_tiles = 1,
+ .ublock_size_tiles = 1,
+ .dst_full_sync_en = dst_full_sync_en
+ };
+ for (unsigned int id = 0; id < num_devices_; id++) {
+ test_config.l1_acc = false;
+ ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+ log_info(LogTest, "Passed without L1 accumulation");
+ test_config.l1_acc = true;
+ ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+ log_info(LogTest, "Passed with L1 accumulation");
+ }
}
}
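One detail worth noting in these config hunks: C++20 designated initializers must appear in member declaration order, which is why .dst_full_sync_en is slotted at the position matching its spot in each struct (last in ReconfigConfig here, between .fp32_dest_acc_en and .math_fidelity in ReduceConfig below) rather than wherever is convenient. A minimal illustration with a hypothetical struct:

#include <cstdio>

struct ConfigLike {
    bool explicit_reconfig = false;
    bool split_src_reconfig = false;
    bool l1_acc = false;
    bool dst_full_sync_en = false;  // appended last in the struct
};

// OK: initializer order matches declaration order.
ConfigLike ok = {.explicit_reconfig = true,
                 .split_src_reconfig = true,
                 .dst_full_sync_en = true};

// Ill-formed in C++20: .dst_full_sync_en is declared after .l1_acc,
// so it cannot precede it in the initializer list.
// ConfigLike bad = {.dst_full_sync_en = true, .l1_acc = true};

int main() { std::printf("%d\n", ok.dst_full_sync_en); }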
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
index 961d5fd111c..c12dfb809be 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
@@ -52,6 +52,7 @@ struct ReduceConfig {
std::vector<uint32_t> result_shape;
bool math_only_reduce = false;
bool fp32_dest_acc_en = false;
+ bool dst_full_sync_en = false;
MathFidelity math_fidelity = MathFidelity::HiFi4;
};
@@ -315,6 +316,7 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig
core,
tt_metal::ComputeConfig{.math_fidelity = test_config.math_fidelity,
.fp32_dest_acc_en = test_config.fp32_dest_acc_en,
+ .dst_full_sync_en = test_config.dst_full_sync_en,
.compile_args = compute_kernel_args,
.defines = reduce_defines});
@@ -382,22 +384,25 @@ TEST_F(DeviceFixture, ComputeReduceH) {
if (math_fid == 1) continue;
for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
for (bool fp32_dest_acc_en : {true, false}) {
- log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}", math_fid, reduce_type, fp32_dest_acc_en);
- ReduceConfig test_config = {
- .shape = shape,
- .reduce_dim = ReduceDim::H,
- .reduce_type = ReduceType(reduce_type),
- .data_gen_rand_max = 10.0f,
- .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
- .data_gen_offset = -10.0f,
- .atol = 1e-2f,
- .rtol = 0.08f,
- .golden_function = unit_tests::compute::gold_reduce_h,
- .result_shape = result_shape,
- .fp32_dest_acc_en = fp32_dest_acc_en,
- .math_fidelity = MathFidelity(math_fid),
- };
- run_single_core_reduce_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+ ReduceConfig test_config = {
+ .shape = shape,
+ .reduce_dim = ReduceDim::H,
+ .reduce_type = ReduceType(reduce_type),
+ .data_gen_rand_max = 10.0f,
+ .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+ .data_gen_offset = -10.0f,
+ .atol = 1e-2f,
+ .rtol = 0.08f,
+ .golden_function = unit_tests::compute::gold_reduce_h,
+ .result_shape = result_shape,
+ .fp32_dest_acc_en = fp32_dest_acc_en,
+ .dst_full_sync_en = dst_full_sync_en,
+ .math_fidelity = MathFidelity(math_fid),
+ };
+ run_single_core_reduce_program(this->devices_.at(0), test_config);
+ }
}
}
}
@@ -411,23 +416,26 @@ TEST_F(DeviceFixture, ComputeReduceW) {
if (math_fid == 1) continue;
for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
for (bool fp32_dest_acc_en : {true, false}) {
- if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
- log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}", math_fid, reduce_type, fp32_dest_acc_en);
- ReduceConfig test_config = {
- .shape = shape,
- .reduce_dim = ReduceDim::W,
- .reduce_type = ReduceType(reduce_type),
- .data_gen_rand_max = 10.0f,
- .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
- .data_gen_offset = -10.0f,
- .atol = 1e-2f,
- .rtol = 0.08f,
- .golden_function = unit_tests::compute::gold_reduce_w,
- .result_shape = result_shape,
- .fp32_dest_acc_en = fp32_dest_acc_en,
- .math_fidelity = MathFidelity(math_fid),
- };
- run_single_core_reduce_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
+ log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+ ReduceConfig test_config = {
+ .shape = shape,
+ .reduce_dim = ReduceDim::W,
+ .reduce_type = ReduceType(reduce_type),
+ .data_gen_rand_max = 10.0f,
+ .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+ .data_gen_offset = -10.0f,
+ .atol = 1e-2f,
+ .rtol = 0.08f,
+ .golden_function = unit_tests::compute::gold_reduce_w,
+ .result_shape = result_shape,
+ .fp32_dest_acc_en = fp32_dest_acc_en,
+ .dst_full_sync_en = dst_full_sync_en,
+ .math_fidelity = MathFidelity(math_fid),
+ };
+ run_single_core_reduce_program(this->devices_.at(0), test_config);
+ }
}
}
}
@@ -441,24 +449,27 @@ TEST_F(DeviceFixture, ComputeReduceHW) {
if (math_fid == 1) continue;
for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
for (bool fp32_dest_acc_en : {true, false}) {
- // Currently fp32 dest unsupported with reduce scalar
- if (fp32_dest_acc_en) continue;
- log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}", math_fid, reduce_type, fp32_dest_acc_en);
- ReduceConfig test_config = {
- .shape = shape,
- .reduce_dim = ReduceDim::HW,
- .reduce_type = ReduceType(reduce_type),
- .data_gen_rand_max = 10.0f,
- .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
- .data_gen_offset = -10.0f,
- .atol = 1e-2f,
- .rtol = 0.08f,
- .golden_function = unit_tests::compute::gold_reduce_hw,
- .result_shape = result_shape,
- .fp32_dest_acc_en = fp32_dest_acc_en,
- .math_fidelity = MathFidelity(math_fid)
- };
- run_single_core_reduce_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ // Currently fp32 dest unsupported with reduce scalar
+ if (fp32_dest_acc_en) continue;
+ log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+ ReduceConfig test_config = {
+ .shape = shape,
+ .reduce_dim = ReduceDim::HW,
+ .reduce_type = ReduceType(reduce_type),
+ .data_gen_rand_max = 10.0f,
+ .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+ .data_gen_offset = -10.0f,
+ .atol = 1e-2f,
+ .rtol = 0.08f,
+ .golden_function = unit_tests::compute::gold_reduce_hw,
+ .result_shape = result_shape,
+ .fp32_dest_acc_en = fp32_dest_acc_en,
+ .dst_full_sync_en = dst_full_sync_en,
+ .math_fidelity = MathFidelity(math_fid)
+ };
+ run_single_core_reduce_program(this->devices_.at(0), test_config);
+ }
}
}
}
@@ -476,23 +487,26 @@ TEST_F(DeviceFixture, ComputeReduceHMathOnly) {
if (math_fid == 1) continue;
for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
for (bool fp32_dest_acc_en : {true, false}) {
- log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}", math_fid, reduce_type, fp32_dest_acc_en);
- ReduceConfig test_config = {
- .shape = shape,
- .reduce_dim = ReduceDim::H,
- .reduce_type = ReduceType(reduce_type),
- .data_gen_rand_max = 10.0f,
- .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
- .data_gen_offset = -10.0f,
- .atol = 1e-2f,
- .rtol = 0.08f,
- .golden_function = unit_tests::compute::gold_reduce_h,
- .result_shape = result_shape,
- .math_only_reduce = true,
- .fp32_dest_acc_en = fp32_dest_acc_en,
- .math_fidelity = MathFidelity(math_fid)
- };
- run_single_core_reduce_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+ ReduceConfig test_config = {
+ .shape = shape,
+ .reduce_dim = ReduceDim::H,
+ .reduce_type = ReduceType(reduce_type),
+ .data_gen_rand_max = 10.0f,
+ .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+ .data_gen_offset = -10.0f,
+ .atol = 1e-2f,
+ .rtol = 0.08f,
+ .golden_function = unit_tests::compute::gold_reduce_h,
+ .result_shape = result_shape,
+ .math_only_reduce = true,
+ .fp32_dest_acc_en = fp32_dest_acc_en,
+ .dst_full_sync_en = dst_full_sync_en,
+ .math_fidelity = MathFidelity(math_fid)
+ };
+ run_single_core_reduce_program(this->devices_.at(0), test_config);
+ }
}
}
}
@@ -506,24 +520,27 @@ TEST_F(DeviceFixture, ComputeReduceWMathOnly) {
if (math_fid == 1) continue;
for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
for (bool fp32_dest_acc_en : {true, false}) {
- if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
- log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}", math_fid, reduce_type, fp32_dest_acc_en);
- ReduceConfig test_config = {
- .shape = shape,
- .reduce_dim = ReduceDim::W,
- .reduce_type = ReduceType(reduce_type),
- .data_gen_rand_max = 10.0f,
- .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
- .data_gen_offset = -10.0f,
- .atol = 1e-2f,
- .rtol = 0.08f,
- .golden_function = unit_tests::compute::gold_reduce_w,
- .result_shape = result_shape,
- .math_only_reduce = true,
- .fp32_dest_acc_en = fp32_dest_acc_en,
- .math_fidelity = MathFidelity(math_fid)
- };
- run_single_core_reduce_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
+ log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+ ReduceConfig test_config = {
+ .shape = shape,
+ .reduce_dim = ReduceDim::W,
+ .reduce_type = ReduceType(reduce_type),
+ .data_gen_rand_max = 10.0f,
+ .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+ .data_gen_offset = -10.0f,
+ .atol = 1e-2f,
+ .rtol = 0.08f,
+ .golden_function = unit_tests::compute::gold_reduce_w,
+ .result_shape = result_shape,
+ .math_only_reduce = true,
+ .fp32_dest_acc_en = fp32_dest_acc_en,
+ .dst_full_sync_en = dst_full_sync_en,
+ .math_fidelity = MathFidelity(math_fid)
+ };
+ run_single_core_reduce_program(this->devices_.at(0), test_config);
+ }
}
}
}
@@ -537,25 +554,28 @@ TEST_F(DeviceFixture, ComputeReduceHWMathOnly) {
if (math_fid == 1) continue;
for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
for (bool fp32_dest_acc_en : {true, false}) {
- // Currently fp32 dest unsupported with reduce scalar
- if (fp32_dest_acc_en) continue;
- log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}", math_fid, reduce_type, fp32_dest_acc_en);
- ReduceConfig test_config = {
- .shape = shape,
- .reduce_dim = ReduceDim::HW,
- .reduce_type = ReduceType(reduce_type),
- .data_gen_rand_max = 10.0f,
- .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
- .data_gen_offset = -10.0f,
- .atol = 1e-2f,
- .rtol = 0.08f,
- .golden_function = unit_tests::compute::gold_reduce_hw,
- .result_shape = result_shape,
- .math_only_reduce = true,
- .fp32_dest_acc_en = fp32_dest_acc_en,
- .math_fidelity = MathFidelity(math_fid)
- };
- run_single_core_reduce_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ // Currently fp32 dest unsupported with reduce scalar
+ if (fp32_dest_acc_en) continue;
+ log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+ ReduceConfig test_config = {
+ .shape = shape,
+ .reduce_dim = ReduceDim::HW,
+ .reduce_type = ReduceType(reduce_type),
+ .data_gen_rand_max = 10.0f,
+ .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+ .data_gen_offset = -10.0f,
+ .atol = 1e-2f,
+ .rtol = 0.08f,
+ .golden_function = unit_tests::compute::gold_reduce_hw,
+ .result_shape = result_shape,
+ .math_only_reduce = true,
+ .fp32_dest_acc_en = fp32_dest_acc_en,
+ .dst_full_sync_en = dst_full_sync_en,
+ .math_fidelity = MathFidelity(math_fid)
+ };
+ run_single_core_reduce_program(this->devices_.at(0), test_config);
+ }
}
}
}
@@ -573,23 +593,26 @@ TEST_F(DeviceFixture, ComputeReduceHShortInit) {
if (math_fid == 1) continue;
for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
for (bool fp32_dest_acc_en : {true, false}) {
- log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}", math_fid, reduce_type, fp32_dest_acc_en);
- ReduceConfig test_config = {
- .short_init = true,
- .shape = shape,
- .reduce_dim = ReduceDim::H,
- .reduce_type = ReduceType(reduce_type),
- .data_gen_rand_max = 10.0f,
- .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
- .data_gen_offset = -10.0f,
- .atol = 1e-2f,
- .rtol = 0.08f,
- .golden_function = unit_tests::compute::gold_reduce_h,
- .result_shape = result_shape,
- .fp32_dest_acc_en = fp32_dest_acc_en,
- .math_fidelity = MathFidelity(math_fid)
- };
- run_single_core_reduce_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+ ReduceConfig test_config = {
+ .short_init = true,
+ .shape = shape,
+ .reduce_dim = ReduceDim::H,
+ .reduce_type = ReduceType(reduce_type),
+ .data_gen_rand_max = 10.0f,
+ .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+ .data_gen_offset = -10.0f,
+ .atol = 1e-2f,
+ .rtol = 0.08f,
+ .golden_function = unit_tests::compute::gold_reduce_h,
+ .result_shape = result_shape,
+ .fp32_dest_acc_en = fp32_dest_acc_en,
+ .dst_full_sync_en = dst_full_sync_en,
+ .math_fidelity = MathFidelity(math_fid)
+ };
+ run_single_core_reduce_program(this->devices_.at(0), test_config);
+ }
}
}
}
@@ -603,24 +626,27 @@ TEST_F(DeviceFixture, ComputeReduceWShortInit) {
if (math_fid == 1) continue;
for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
for (bool fp32_dest_acc_en : {true, false}) {
- if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
- log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}", math_fid, reduce_type, fp32_dest_acc_en);
- ReduceConfig test_config = {
- .short_init = true,
- .shape = shape,
- .reduce_dim = ReduceDim::W,
- .reduce_type = ReduceType(reduce_type),
- .data_gen_rand_max = 10.0f,
- .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
- .data_gen_offset = -10.0f,
- .atol = 1e-2f,
- .rtol = 0.08f,
- .golden_function = unit_tests::compute::gold_reduce_w,
- .result_shape = result_shape,
- .fp32_dest_acc_en = fp32_dest_acc_en,
- .math_fidelity = MathFidelity(math_fid)
- };
- run_single_core_reduce_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
+ log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+ ReduceConfig test_config = {
+ .short_init = true,
+ .shape = shape,
+ .reduce_dim = ReduceDim::W,
+ .reduce_type = ReduceType(reduce_type),
+ .data_gen_rand_max = 10.0f,
+ .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+ .data_gen_offset = -10.0f,
+ .atol = 1e-2f,
+ .rtol = 0.08f,
+ .golden_function = unit_tests::compute::gold_reduce_w,
+ .result_shape = result_shape,
+ .fp32_dest_acc_en = fp32_dest_acc_en,
+ .dst_full_sync_en = dst_full_sync_en,
+ .math_fidelity = MathFidelity(math_fid)
+ };
+ run_single_core_reduce_program(this->devices_.at(0), test_config);
+ }
}
}
}
@@ -634,25 +660,28 @@ TEST_F(DeviceFixture, ComputeReduceHWShortInit) {
if (math_fid == 1) continue;
for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
for (bool fp32_dest_acc_en : {true, false}) {
- // Currently fp32 dest unsupported with reduce scalar
- if (fp32_dest_acc_en) continue;
- log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}", math_fid, reduce_type, fp32_dest_acc_en);
- ReduceConfig test_config = {
- .short_init = true,
- .shape = shape,
- .reduce_dim = ReduceDim::HW,
- .reduce_type = ReduceType(reduce_type),
- .data_gen_rand_max = 10.0f,
- .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
- .data_gen_offset = -10.0f,
- .atol = 1e-2f,
- .rtol = 0.08f,
- .golden_function = unit_tests::compute::gold_reduce_hw,
- .result_shape = result_shape,
- .fp32_dest_acc_en = fp32_dest_acc_en,
- .math_fidelity = MathFidelity(math_fid)
- };
- run_single_core_reduce_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ // Currently fp32 dest unsupported with reduce scalar
+ if (fp32_dest_acc_en) continue;
+ log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+ ReduceConfig test_config = {
+ .short_init = true,
+ .shape = shape,
+ .reduce_dim = ReduceDim::HW,
+ .reduce_type = ReduceType(reduce_type),
+ .data_gen_rand_max = 10.0f,
+ .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+ .data_gen_offset = -10.0f,
+ .atol = 1e-2f,
+ .rtol = 0.08f,
+ .golden_function = unit_tests::compute::gold_reduce_hw,
+ .result_shape = result_shape,
+ .fp32_dest_acc_en = fp32_dest_acc_en,
+ .dst_full_sync_en = dst_full_sync_en,
+ .math_fidelity = MathFidelity(math_fid)
+ };
+ run_single_core_reduce_program(this->devices_.at(0), test_config);
+ }
}
}
}
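All six reduce tests now share the same four-axis sweep: math fidelity, reduce type, fp32 dest accumulation, and DST sync mode, with the arch- and feature-specific guards kept inside the innermost loop. One observation on the structure: those guards could be hoisted above the new dst_full_sync_en loop to skip the redundant iterations a little earlier, though the behavior is identical. The sweep's shape, with stand-in names:

#include <cstdio>
#include <initializer_list>

// Stand-in for run_single_core_reduce_program() and the real enums.
static void run_case(int fid, int type, bool fp32, bool dst_full) {
    std::printf("fid=%d type=%d fp32=%d dst_full=%d\n", fid, type, fp32, dst_full);
}

int main() {
    const bool is_grayskull = false;  // placeholder for the this->arch_ check
    for (int math_fid = 0; math_fid <= 4; ++math_fid) {
        if (math_fid == 1) continue;  // fidelity 1 is skipped in every test above
        for (int reduce_type = 0; reduce_type <= 1; ++reduce_type) {  // stand-in for SUM..MAX
            for (bool fp32_dest_acc_en : {true, false}) {
                for (bool dst_full_sync_en : {true, false}) {
                    if (fp32_dest_acc_en && is_grayskull) continue;  // arch guard
                    run_case(math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
                }
            }
        }
    }
    return 0;
}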
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp
index 7abc33e2ef1..c96a44be6d2 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp
@@ -43,6 +43,7 @@ using GoldenFunc = std::variant<
struct TestConfig {
bool short_init = false;
+ bool dst_full_sync_en = false;
uint32_t input_single_tile_size;
uint32_t output_single_tile_size;
uint32_t num_tiles_r;
@@ -169,7 +170,8 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig&
program,
compute_kernel,
core,
- tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = defines}
+ tt_metal::ComputeConfig{.dst_full_sync_en = test_config.dst_full_sync_en,
+ .compile_args = compute_kernel_args, .defines = defines}
);
std::vector<uint32_t> src0_vec = create_arange_vector_of_bfloat16(input_dram_buffer_size, false);
@@ -276,15 +278,18 @@ Following tests are for Unpack Tilize
TEST_F(DeviceFixture, ComputeUnpackTilize) {
vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
for(auto num_tile : num_tiles) {
- unit_tests::compute::tilize::TestConfig test_config = {
- .input_single_tile_size = 2 * 1024,
- .output_single_tile_size = 2 * 1024,
- .num_tiles_r = num_tile[0],
- .num_tiles_c = num_tile[1],
- .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A,
- .golden_function = unit_tests::compute::gold_standard_tilize
- };
- unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::tilize::TestConfig test_config = {
+ .dst_full_sync_en = dst_full_sync_en,
+ .input_single_tile_size = 2 * 1024,
+ .output_single_tile_size = 2 * 1024,
+ .num_tiles_r = num_tile[0],
+ .num_tiles_c = num_tile[1],
+ .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A,
+ .golden_function = unit_tests::compute::gold_standard_tilize
+ };
+ unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ }
}
}
@@ -293,33 +298,40 @@ TEST_F(DeviceFixture, ComputeUnpackTilizeA_B) {
if (arch == tt::ARCH::GRAYSKULL) {
GTEST_SKIP();
}
- unit_tests::compute::tilize::TestConfig test_config = {
- .input_single_tile_size = 2 * 1024,
- .output_single_tile_size = 2 * 1024,
- .num_tiles_r = 2,
- .num_tiles_c = 8,
- .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A_B,
- .golden_function = unit_tests::compute::gold_standard_tilize_w_elwadd
- };
- unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
-}
-TEST_F(DeviceFixture, ComputeUnpackTilizeShortInit) {
- vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
- for(auto num_tile : num_tiles) {
+ for (bool dst_full_sync_en : {true, false}) {
unit_tests::compute::tilize::TestConfig test_config = {
- .short_init = true,
+ .dst_full_sync_en = dst_full_sync_en,
.input_single_tile_size = 2 * 1024,
.output_single_tile_size = 2 * 1024,
- .num_tiles_r = num_tile[0],
- .num_tiles_c = num_tile[1],
- .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A,
- .golden_function = unit_tests::compute::gold_standard_tilize
+ .num_tiles_r = 2,
+ .num_tiles_c = 8,
+ .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A_B,
+ .golden_function = unit_tests::compute::gold_standard_tilize_w_elwadd
};
unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
}
}
+TEST_F(DeviceFixture, ComputeUnpackTilizeShortInit) {
+ vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
+ for(auto num_tile : num_tiles) {
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::tilize::TestConfig test_config = {
+ .short_init = true,
+ .dst_full_sync_en = dst_full_sync_en,
+ .input_single_tile_size = 2 * 1024,
+ .output_single_tile_size = 2 * 1024,
+ .num_tiles_r = num_tile[0],
+ .num_tiles_c = num_tile[1],
+ .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A,
+ .golden_function = unit_tests::compute::gold_standard_tilize
+ };
+ unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ }
+ }
+}
+
/**************************************
Following tests are for Unpack Untilize
***************************************/
@@ -327,31 +339,37 @@ Following tests are for Unpack Untilize
TEST_F(DeviceFixture, ComputeUnpackUntilize) {
vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
for(auto num_tile : num_tiles) {
- unit_tests::compute::tilize::TestConfig test_config = {
- .input_single_tile_size = 2 * 1024,
- .output_single_tile_size = 2 * 1024,
- .num_tiles_r = num_tile[0],
- .num_tiles_c = num_tile[1],
- .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK,
- .golden_function = unit_tests::compute::gold_standard_untilize
- };
- unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::tilize::TestConfig test_config = {
+ .dst_full_sync_en = dst_full_sync_en,
+ .input_single_tile_size = 2 * 1024,
+ .output_single_tile_size = 2 * 1024,
+ .num_tiles_r = num_tile[0],
+ .num_tiles_c = num_tile[1],
+ .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK,
+ .golden_function = unit_tests::compute::gold_standard_untilize
+ };
+ unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ }
}
}
TEST_F(DeviceFixture, ComputeUnpackUntilizeShortInit) {
vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
for(auto num_tile : num_tiles) {
- unit_tests::compute::tilize::TestConfig test_config = {
- .short_init = true,
- .input_single_tile_size = 2 * 1024,
- .output_single_tile_size = 2 * 1024,
- .num_tiles_r = num_tile[0],
- .num_tiles_c = num_tile[1],
- .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK,
- .golden_function = unit_tests::compute::gold_standard_untilize
- };
- unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::tilize::TestConfig test_config = {
+ .short_init = true,
+ .dst_full_sync_en = dst_full_sync_en,
+ .input_single_tile_size = 2 * 1024,
+ .output_single_tile_size = 2 * 1024,
+ .num_tiles_r = num_tile[0],
+ .num_tiles_c = num_tile[1],
+ .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK,
+ .golden_function = unit_tests::compute::gold_standard_untilize
+ };
+ unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ }
}
}
@@ -361,47 +379,55 @@ Following tests are for pack untilize
TEST_F(DeviceFixture, ComputePackUntilize) {
vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
for(auto num_tile : num_tiles) {
- unit_tests::compute::tilize::TestConfig test_config = {
- .input_single_tile_size = 2 * 1024,
- .output_single_tile_size = 2 * 1024,
- .num_tiles_r = num_tile[0],
- .num_tiles_c = num_tile[1],
- .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK,
- .golden_function = unit_tests::compute::gold_standard_untilize
- };
- unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::tilize::TestConfig test_config = {
+ .dst_full_sync_en = dst_full_sync_en,
+ .input_single_tile_size = 2 * 1024,
+ .output_single_tile_size = 2 * 1024,
+ .num_tiles_r = num_tile[0],
+ .num_tiles_c = num_tile[1],
+ .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK,
+ .golden_function = unit_tests::compute::gold_standard_untilize
+ };
+ unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ }
}
}
TEST_F(DeviceFixture, ComputePackUntilizeShortInit) {
vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
for(auto num_tile : num_tiles) {
- unit_tests::compute::tilize::TestConfig test_config = {
- .short_init = true,
- .input_single_tile_size = 2 * 1024,
- .output_single_tile_size = 2 * 1024,
- .num_tiles_r = num_tile[0],
- .num_tiles_c = num_tile[1],
- .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK,
- .golden_function = unit_tests::compute::gold_standard_untilize
- };
- unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::tilize::TestConfig test_config = {
+ .short_init = true,
+ .dst_full_sync_en = dst_full_sync_en,
+ .input_single_tile_size = 2 * 1024,
+ .output_single_tile_size = 2 * 1024,
+ .num_tiles_r = num_tile[0],
+ .num_tiles_c = num_tile[1],
+ .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK,
+ .golden_function = unit_tests::compute::gold_standard_untilize
+ };
+ unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ }
}
-
}
TEST_F(DeviceFixture, ComputePackUntilizeDst) {
vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
for(auto num_tile : num_tiles) {
- unit_tests::compute::tilize::TestConfig test_config = {
- .input_single_tile_size = 2 * 1024,
- .output_single_tile_size = 2 * 1024,
- .num_tiles_r = num_tile[0],
- .num_tiles_c = num_tile[1],
- .untilize_type = unit_tests::compute::tilize::UntilizeType::DST,
- .golden_function = unit_tests::compute::gold_standard_untilize
- };
- unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ unit_tests::compute::tilize::TestConfig test_config = {
+ .dst_full_sync_en = dst_full_sync_en,
+ .input_single_tile_size = 2 * 1024,
+ .output_single_tile_size = 2 * 1024,
+ .num_tiles_r = num_tile[0],
+ .num_tiles_c = num_tile[1],
+ .untilize_type = unit_tests::compute::tilize::UntilizeType::DST,
+ .golden_function = unit_tests::compute::gold_standard_untilize
+ };
+ unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ }
}
}
@@ -412,19 +438,22 @@ TEST_F(DeviceFixture, ComputePackUntilizeDstTinyTile) {
vector<vector<uint32_t> > test_config_values = {{1, 1, 1, 1}, {1, 1, 2, 1}, {1, 2, 2, 1}};
uint32_t face_c_dim = 16;
for(auto test_config_value : test_config_values) {
- uint32_t num_faces_per_tile = test_config_value[2];
- uint32_t face_r_dim = test_config_value[3];
- unit_tests::compute::tilize::TestConfig test_config = {
- .short_init = true,
- .input_single_tile_size = 2 * 1024,
- .output_single_tile_size = 2 * num_faces_per_tile * face_r_dim * face_c_dim,
- .num_tiles_r = test_config_value[0],
- .num_tiles_c = test_config_value[1],
- .num_faces_per_tile = num_faces_per_tile,
- .face_r_dim = face_r_dim,
- .untilize_type = unit_tests::compute::tilize::UntilizeType::DST,
- .golden_function = unit_tests::compute::gold_standard_untilize
- };
- unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ for (bool dst_full_sync_en : {true, false}) {
+ uint32_t num_faces_per_tile = test_config_value[2];
+ uint32_t face_r_dim = test_config_value[3];
+ unit_tests::compute::tilize::TestConfig test_config = {
+ .short_init = true,
+ .dst_full_sync_en = dst_full_sync_en,
+ .input_single_tile_size = 2 * 1024,
+ .output_single_tile_size = 2 * num_faces_per_tile * face_r_dim * face_c_dim,
+ .num_tiles_r = test_config_value[0],
+ .num_tiles_c = test_config_value[1],
+ .num_faces_per_tile = num_faces_per_tile,
+ .face_r_dim = face_r_dim,
+ .untilize_type = unit_tests::compute::tilize::UntilizeType::DST,
+ .golden_function = unit_tests::compute::gold_standard_untilize
+ };
+ unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
+ }
}
}
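The tiny-tile configs above derive output_single_tile_size from the face geometry: 2 bytes per bfloat16 element × num_faces_per_tile × face_r_dim × face_c_dim, with face_c_dim fixed at 16. So {1, 1, 2, 1} yields 2 × 2 × 1 × 16 = 64 bytes, versus the full 2 × 1024 = 2048-byte 32×32 tile. A quick standalone check of that arithmetic:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t face_c_dim = 16;  // fixed in the test above
    // {num_tiles_r, num_tiles_c, num_faces_per_tile, face_r_dim}
    const uint32_t configs[3][4] = {{1, 1, 1, 1}, {1, 1, 2, 1}, {1, 2, 2, 1}};
    for (const auto& c : configs) {
        // bfloat16 is 2 bytes per element.
        uint32_t out_tile_bytes = 2 * c[2] * c[3] * face_c_dim;
        std::printf("faces=%u face_r=%u -> output_single_tile_size=%u bytes\n",
                    c[2], c[3], out_tile_bytes);
    }
    return 0;
}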
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt
index 3662cf6f131..b7714aa7d88 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt
@@ -30,7 +30,7 @@ set(UNIT_TESTS_COMMON_SRC
${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_link_training.cpp
)
add_library(unit_tests_common_o OBJECT ${UNIT_TESTS_COMMON_SRC})
-target_link_libraries(unit_tests_common_o PUBLIC compiler_flags metal_header_directories gtest gtest_main)
+target_link_libraries(unit_tests_common_o PUBLIC compiler_flags metal_header_directories gtest gtest_main magic_enum fmt)
target_include_directories(unit_tests_common_o PUBLIC
${UMD_HOME}
${PROJECT_SOURCE_DIR}
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp
index 5b2ec8fcc43..1ef579e6ed5 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp
@@ -27,6 +27,7 @@ struct MatmulTileConfig {
bool with_bias = false;
bool test_init_short = false;
bool with_dt = true;
+ bool dst_full_sync_en = false;
string reader_kernel;
string compute_kernel;
vector<uint32_t> compute_kernel_args;
@@ -215,6 +216,7 @@ bool matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
cfg.compute_kernel,
core,
tt_metal::ComputeConfig{.math_fidelity = cfg.math_fidelity,
+ .dst_full_sync_en = cfg.dst_full_sync_en,
.compile_args = cfg.compute_kernel_args,
.defines = compute_defines}
);
@@ -292,206 +294,221 @@ bool matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
} // namespace unit_tests_common::matmul::test_matmul_X_tile
TEST_F(CommonFixture, MatmulSingleTile){
- for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
- if (i == 1) continue;
- unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
- .M = 1, .K = 1, .N = 1,
- .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
- .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
- .compute_kernel_args = {
- 1, // block_tile_dim
- 1, // dst_tile_rows
- 1, // dst_tile_cols
- 1, // block_cnt
- 1, // in0_block_tile_cnt
- 1, // in1_block_tile_cnt
- 1 // out_block_tile_cnt
- },
- .math_fidelity = MathFidelity(i)
- };
- SHAPE shape = {1, 1, 32, 32};
- tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
- tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
- auto activations_tile_layout = convert_to_tile_layout(tensor.get_values());
- auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
-
- auto identity = create_identity_matrix(32, 32, 32); //bfloat16 32x32 identity
- auto weights_tile_layout = convert_to_tile_layout(identity);
- auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
-
- for(unsigned int id = 0; id < devices_.size(); id++){
- ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations, weights, tensor));
+ for (bool dst_full_sync_en : {true, false}) {
+ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
+ if (i == 1) continue;
+ unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
+ .M = 1, .K = 1, .N = 1,
+ .dst_full_sync_en = dst_full_sync_en,
+ .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
+ .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
+ .compute_kernel_args = {
+ 1, // block_tile_dim
+ 1, // dst_tile_rows
+ 1, // dst_tile_cols
+ 1, // block_cnt
+ 1, // in0_block_tile_cnt
+ 1, // in1_block_tile_cnt
+ 1 // out_block_tile_cnt
+ },
+ .math_fidelity = MathFidelity(i)
+ };
+ SHAPE shape = {1, 1, 32, 32};
+ tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
+ tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
+ auto activations_tile_layout = convert_to_tile_layout(tensor.get_values());
+ auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
+
+ auto identity = create_identity_matrix(32, 32, 32); //bfloat16 32x32 identity
+ auto weights_tile_layout = convert_to_tile_layout(identity);
+ auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
+
+ for(unsigned int id = 0; id < devices_.size(); id++){
+ ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations, weights, tensor));
+ }
}
}
}
TEST_F(CommonFixture, MatmulMultiTile){
- for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
- if (i == 1) continue;
- uint32_t M = 4;
- uint32_t N = 4;
- uint32_t K = 4;
- unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
- .M = M, .K = K, .N = N,
- .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
- .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp",
- .compute_kernel_args = {
- 1, // block_tile_dim, within block, how many tiles are on the K dim
- M, // dst_tile_rows
- N, // dst_tile_cols
- K, // block_cnt, across blocks, how many tiles are on the K dim
- M, // in0_block_tile_cnt, M * block_tile_dim
- N, // in1_block_tile_cnt, N * block_tile_dim
- (M * N), // out_block_tile_cnt
- matmul_config.with_bias // whether or not to use bias
- },
- .math_fidelity = MathFidelity(i)
- };
- tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
- SHAPE shape = {1, 1, M * 32, K * 32};
- tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
- auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
- auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
- auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
- auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
-
- auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
- auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
- auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
- auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
-
- for(unsigned int id = 0; id < devices_.size(); id++){
- ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
- log_info(LogTest, "Multi tile with no bias passed");
- matmul_config.with_bias = true;
- ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
- log_info(LogTest, "Multi tile with bias passed");
+ for (bool dst_full_sync_en : {true, false}) {
+ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
+ if (i == 1) continue;
+ uint32_t M = 4;
+ uint32_t N = 4;
+ uint32_t K = 4;
+ unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
+ .M = M, .K = K, .N = N,
+ .dst_full_sync_en = dst_full_sync_en,
+ .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
+ .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp",
+ .compute_kernel_args = {
+ 1, // block_tile_dim, within block, how many tiles are on the K dim
+ M, // dst_tile_rows
+ N, // dst_tile_cols
+ K, // block_cnt, across blocks, how many tiles are on the K dim
+ M, // in0_block_tile_cnt, M * block_tile_dim
+ N, // in1_block_tile_cnt, N * block_tile_dim
+ (M * N), // out_block_tile_cnt
+ matmul_config.with_bias // whether or not to use bias
+ },
+ .math_fidelity = MathFidelity(i)
+ };
+ tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
+ SHAPE shape = {1, 1, M * 32, K * 32};
+ tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
+ auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
+ auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
+ auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
+ auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
+
+ auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
+ auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
+ auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
+ auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
+
+ for(unsigned int id = 0; id < devices_.size(); id++){
+ ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
+ log_info(LogTest, "Multi tile with no bias passed");
+ matmul_config.with_bias = true;
+ ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
+ log_info(LogTest, "Multi tile with bias passed");
+ }
}
}
}
TEST_F(CommonFixture, MatmulBlock){
- for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
- if (i == 1) continue;
- uint32_t M = 4;
- uint32_t N = 4;
- uint32_t K = 4;
- unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
- .M = M, .K = K, .N = N,
- .test_init_short = false,
- .with_dt = false,
- .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
- .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
- .compute_kernel_args = {
- 1, // block_tile_dim, within block, how many tiles are on the K dim
- M, // dst_tile_rows
- N, // dst_tile_cols
- K, // block_cnt, across blocks, how many tiles are on the K dim
- M, // in0_block_tile_cnt, M * block_tile_dim
- N, // in1_block_tile_cnt, N * block_tile_dim
- (M * N), // out_block_tile_cnt
- },
- .math_fidelity = MathFidelity(i)
- };
- tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
- SHAPE shape = {1, 1, M * 32, K * 32};
- tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
- auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
- auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
- auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
- auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
-
- auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
- auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
- auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
- auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
-
- for(unsigned int id = 0; id < devices_.size(); id++){
- ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
+ for (bool dst_full_sync_en : {true, false}) {
+ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
+ if (i == 1) continue;
+ uint32_t M = 4;
+ uint32_t N = 4;
+ uint32_t K = 4;
+ unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
+ .M = M, .K = K, .N = N,
+ .test_init_short = false,
+ .with_dt = false,
+ .dst_full_sync_en = dst_full_sync_en,
+ .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
+ .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
+ .compute_kernel_args = {
+ 1, // block_tile_dim, within block, how many tiles are on the K dim
+ M, // dst_tile_rows
+ N, // dst_tile_cols
+ K, // block_cnt, across blocks, how many tiles are on the K dim
+ M, // in0_block_tile_cnt, M * block_tile_dim
+ N, // in1_block_tile_cnt, N * block_tile_dim
+ (M * N), // out_block_tile_cnt
+ },
+ .math_fidelity = MathFidelity(i)
+ };
+ tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
+ SHAPE shape = {1, 1, M * 32, K * 32};
+ tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
+ auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
+ auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
+ auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
+ auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
+
+ auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
+ auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
+ auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
+ auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
+
+ for(unsigned int id = 0; id < devices_.size(); id++){
+ ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
+ }
}
}
}
TEST_F(CommonFixture, MatmulBlockInitShort){
- for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
- if (i == 1) continue;
- uint32_t M = 4;
- uint32_t N = 4;
- uint32_t K = 4;
- unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
- .M = M, .K = K, .N = N,
- .test_init_short = true,
- .with_dt = false,
- .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
- .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
- .compute_kernel_args = {
- 1, // block_tile_dim, within block, how many tiles are on the K dim
- M, // dst_tile_rows
- N, // dst_tile_cols
- K, // block_cnt, across blocks, how many tiles are on the K dim
- M, // in0_block_tile_cnt, M * block_tile_dim
- N, // in1_block_tile_cnt, N * block_tile_dim
- (M * N), // out_block_tile_cnt
- },
- .math_fidelity = MathFidelity(i)
- };
- tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
- SHAPE shape = {1, 1, M * 32, K * 32};
- tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
- auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
- auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
- auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
- auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
-
- auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
- auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
- auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
- auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
-
- for(unsigned int id = 0; id < devices_.size(); id++){
- ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
+ for (bool dst_full_sync_en : {true, false}) {
+ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
+ if (i == 1) continue;
+ uint32_t M = 4;
+ uint32_t N = 4;
+ uint32_t K = 4;
+ unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
+ .M = M, .K = K, .N = N,
+ .test_init_short = true,
+ .with_dt = false,
+ .dst_full_sync_en = dst_full_sync_en,
+ .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
+ .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
+ .compute_kernel_args = {
+ 1, // block_tile_dim, within block, how many tiles are on the K dim
+ M, // dst_tile_rows
+ N, // dst_tile_cols
+ K, // block_cnt, across blocks, how many tiles are on the K dim
+ M, // in0_block_tile_cnt, M * block_tile_dim
+ N, // in1_block_tile_cnt, N * block_tile_dim
+ (M * N), // out_block_tile_cnt
+ },
+ .math_fidelity = MathFidelity(i)
+ };
+ tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
+ SHAPE shape = {1, 1, M * 32, K * 32};
+ tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
+ auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
+ auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
+ auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
+ auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
+
+ auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
+ auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
+ auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
+ auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
+
+ for(unsigned int id = 0; id < devices_.size(); id++){
+ ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
+ }
}
}
}
TEST_F(CommonFixture, MatmulBlockInitShortWithDt){
- for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
- if (i == 1) continue;
- uint32_t M = 4;
- uint32_t N = 4;
- uint32_t K = 4;
- unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
- .M = M, .K = K, .N = N,
- .test_init_short = true,
- .with_dt = true,
- .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
- .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
- .compute_kernel_args = {
- 1, // block_tile_dim, within block, how many tiles are on the K dim
- M, // dst_tile_rows
- N, // dst_tile_cols
- K, // block_cnt, across blocks, how many tiles are on the K dim
- M, // in0_block_tile_cnt, M * block_tile_dim
- N, // in1_block_tile_cnt, N * block_tile_dim
- (M * N), // out_block_tile_cnt
- },
- .math_fidelity = MathFidelity(i)
- };
- tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
- SHAPE shape = {1, 1, M * 32, K * 32};
- tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
- auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
- auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
- auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
- auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
-
- auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
- auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
- auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
- auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
-
- for(unsigned int id = 0; id < devices_.size(); id++){
- ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
+ for (bool dst_full_sync_en : {true, false}) {
+ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
+ if (i == 1) continue;
+ uint32_t M = 4;
+ uint32_t N = 4;
+ uint32_t K = 4;
+ unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
+ .M = M, .K = K, .N = N,
+ .test_init_short = true,
+ .with_dt = true,
+ .dst_full_sync_en = dst_full_sync_en,
+ .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
+ .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
+ .compute_kernel_args = {
+ 1, // block_tile_dim, within block, how many tiles are on the K dim
+ M, // dst_tile_rows
+ N, // dst_tile_cols
+ K, // block_cnt, across blocks, how many tiles are on the K dim
+ M, // in0_block_tile_cnt, M * block_tile_dim
+ N, // in1_block_tile_cnt, N * block_tile_dim
+ (M * N), // out_block_tile_cnt
+ },
+ .math_fidelity = MathFidelity(i)
+ };
+ tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
+ SHAPE shape = {1, 1, M * 32, K * 32};
+ tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
+ auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
+ auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
+ auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
+ auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
+
+ auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
+ auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
+ auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
+ auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
+
+ for(unsigned int id = 0; id < devices_.size(); id++){
+ ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
+ }
}
}
}
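
Note on field placement: in every MatmulTileConfig initializer above, .dst_full_sync_en sits between the existing bool members and the kernel paths. That ordering is forced, not cosmetic: C++20 designated initializers must appear in declaration order, and the new member is declared after with_dt in the struct hunk near the top of this file. A small standalone illustration of the rule:

    #include <cstdint>

    // Mirrors the member order added to MatmulTileConfig in this diff.
    struct MatmulTileConfig {
        uint32_t M = 1, K = 1, N = 1;
        bool with_bias = false;
        bool test_init_short = false;
        bool with_dt = true;
        bool dst_full_sync_en = false;  // new member, declared after with_dt
    };

    int main() {
        // Designated initializers must follow declaration order, so
        // .dst_full_sync_en has to come after .test_init_short / .with_dt;
        // writing them in any other order fails to compile.
        MatmulTileConfig cfg{
            .M = 4, .K = 4, .N = 4,
            .test_init_short = true,
            .with_dt = false,
            .dst_full_sync_en = true,
        };
        return cfg.dst_full_sync_en ? 0 : 1;
    }
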
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp
index 6eccf28268a..2734b791417 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp
@@ -147,7 +147,7 @@ void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord
read_vec = tt::llrt::read_hex_vec_from_core (
device->id(),
phys_core,
- device->get_dev_addr(phys_core, HalMemAddrType::WATCHER) + offsetof(watcher_msg_t, debug_insert_delays),
+ device->get_dev_addr(phys_core, HalL1MemAddrType::WATCHER) + offsetof(watcher_msg_t, debug_insert_delays),
sizeof(debug_insert_delays_msg_t));
log_info(tt::LogTest, "Read back debug_insert_delays: 0x{:x}", read_vec[0]);
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
index eaab46ef4f1..3194e16e35c 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
@@ -588,7 +588,7 @@ bool test_increment_runtime_args_sanity(Device* device, const DummyProgramConfig
break;
case tt::RISCV::ERISC: {
HalProgrammableCoreType eth_core_type = idle_eth ? HalProgrammableCoreType::IDLE_ETH : HalProgrammableCoreType::ACTIVE_ETH;
- unique_args_addr = hal.get_dev_addr(eth_core_type, HalMemAddrType::UNRESERVED);
+ unique_args_addr = hal.get_dev_addr(eth_core_type, HalL1MemAddrType::UNRESERVED);
common_args_addr = unique_args_addr + 1 * 256 * sizeof(uint32_t);
compile_args[2] = unique_args_addr;
compile_args[3] = common_args_addr;
diff --git a/tests/ttnn/profiling/ops_for_profiling.py b/tests/ttnn/profiling/ops_for_profiling.py
index 78cc71fc9f9..499df79b9cb 100644
--- a/tests/ttnn/profiling/ops_for_profiling.py
+++ b/tests/ttnn/profiling/ops_for_profiling.py
@@ -339,6 +339,10 @@ def primary_moreh_mean_backward(x, y):
ttnn.operations.moreh.mean_backward(x, dim=[0], keepdim=True, input_grad=y)
+def primary_moreh_sum(x):
+ ttnn.operations.moreh.sum(x, dim=[0])
+
+
def celu_bw(x, y):
ttnn.celu_bw(x, y, alpha=1)
diff --git a/tests/ttnn/python_api_testing/sweep_tests/op_map.py b/tests/ttnn/python_api_testing/sweep_tests/op_map.py
index 824cb5a5799..49f08546578 100644
--- a/tests/ttnn/python_api_testing/sweep_tests/op_map.py
+++ b/tests/ttnn/python_api_testing/sweep_tests/op_map.py
@@ -810,18 +810,6 @@
"tt_op": ttnn_ops.relu_bw,
"pytorch_op": pytorch_ops.relu_bw,
},
- "gt-bw": {
- "tt_op": ttnn_ops.gt_bw,
- "pytorch_op": pytorch_ops.gt_bw,
- },
- "lt-bw": {
- "tt_op": ttnn_ops.gt_bw,
- "pytorch_op": pytorch_ops.gt_bw,
- },
- "ne-bw": {
- "tt_op": ttnn_ops.ne_bw,
- "pytorch_op": pytorch_ops.ne_bw,
- },
"log10-bw": {
"tt_op": ttnn_ops.log10_bw,
"pytorch_op": pytorch_ops.log10_bw,
diff --git a/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py b/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py
index 7606da60477..5388de12fc4 100644
--- a/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py
+++ b/tests/ttnn/python_api_testing/sweep_tests/ttnn_ops.py
@@ -3791,66 +3791,6 @@ def relu_bw(
return ttnn_tensor_to_torch(t2)
-def gt_bw(
- x, # grad_tensor
- y, # input_tensor
- *args,
- scalar,
- device,
- dtype,
- layout,
- input_mem_config,
- output_mem_config,
- **kwargs,
-):
- t0 = setup_ttnn_tensor(x, device, layout[0], input_mem_config[0], dtype[0])
- t1 = setup_ttnn_tensor(y, device, layout[1], input_mem_config[1], dtype[1])
-
- t2 = ttnn.gt_bw(t0, t1, alpha=scalar, memory_config=output_mem_config)[0]
-
- return ttnn_tensor_to_torch(t2)
-
-
-def lt_bw(
- x, # grad_tensor
- y, # input_tensor
- *args,
- scalar,
- device,
- dtype,
- layout,
- input_mem_config,
- output_mem_config,
- **kwargs,
-):
- t0 = setup_ttnn_tensor(x, device, layout[0], input_mem_config[0], dtype[0])
- t1 = setup_ttnn_tensor(y, device, layout[1], input_mem_config[1], dtype[1])
-
- t2 = ttnn.lt_bw(t0, t1, alpha=scalar, memory_config=output_mem_config)[0]
-
- return ttnn_tensor_to_torch(t2)
-
-
-def ne_bw(
- x, # grad_tensor
- y, # input_tensor
- *args,
- scalar,
- device,
- dtype,
- layout,
- input_mem_config,
- output_mem_config,
- **kwargs,
-):
- t0 = setup_ttnn_tensor(x, device, layout[0], input_mem_config[0], dtype[0])
- t1 = setup_ttnn_tensor(y, device, layout[1], input_mem_config[1], dtype[1])
-
- t2 = ttnn.ne_bw(t0, t1, alpha=scalar, memory_config=output_mem_config)[0]
-
- return ttnn_tensor_to_torch(t2)
-
-
def log10_bw(
x,
y,
diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp
index 88bb90b22c6..654e9bd5d54 100644
--- a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp
+++ b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp
@@ -32,12 +32,11 @@ void run_create_tensor_test(tt::tt_metal::Device* device, ttnn::SimpleShape inpu
host_data[i] = 1;
}
- ttnn::Shape shape(input_shape.as_vector());
- auto input_buffer = ttnn::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, shape, dtype, Layout::TILE, mem_cfg);
+ auto input_buffer = ttnn::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape, dtype, Layout::TILE, mem_cfg);
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
- Tensor input_tensor = Tensor(input_storage, shape, dtype, Layout::TILE);
+ Tensor input_tensor = Tensor(input_storage, input_shape, dtype, Layout::TILE);
tt::log_debug("input_data: \n {}", input_tensor.write_to_string());
ttnn::write_buffer(io_cq, input_tensor, {host_data});
diff --git a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp
index 3b6daa1f0cc..5734bdc8924 100644
--- a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp
+++ b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp
@@ -54,8 +54,8 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) {
auto workload_event = std::make_shared<Event>();
// Running sum-reduce with preallocated output
// Preallocate Input and Output Tensors on Device
- auto input_buffer = ttnn::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg);
- auto output_buffer = ttnn::allocate_buffer_on_device(output_buf_size_datums * datum_size_bytes, device, np_out.get_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg);
+ auto input_buffer = ttnn::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape.padded_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg);
+ auto output_buffer = ttnn::allocate_buffer_on_device(output_buf_size_datums * datum_size_bytes, device, np_out.get_padded_shape(), DataType::BFLOAT16, Layout::TILE, mem_cfg);
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
auto output_storage = tt::tt_metal::DeviceStorage{output_buffer};
Tensor input_tensor = Tensor(input_storage, input_shape, DataType::BFLOAT16, Layout::TILE);
@@ -105,7 +105,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) {
std::vector<float> inputs = {4, 9, 16, 25, 36, 64};
uint32_t io_cq = 1;
uint32_t workload_dispatch_cq = 0;
- ttnn::Shape shape = ttnn::Shape(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}));
+ ttnn::SimpleShape shape{1, 1, 1024, 1024};
auto host_data = std::shared_ptr<bfloat16[]>(new bfloat16[buf_size_datums]);
auto readback_data = std::shared_ptr<bfloat16[]>(new bfloat16[buf_size_datums]);
@@ -158,7 +158,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeBufferDestructor) {
uint32_t buf_size_datums = 1024 * 1024;
uint32_t datum_size_bytes = 2;
- ttnn::Shape shape = ttnn::Shape(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}));
+ ttnn::SimpleShape shape{1, 1, 1024, 1024};
// Inside the loop, initialize a buffer with limited lifetime.
// This will asynchronously allocate the buffer, wait for the allocation to complete (address to be assigned to the buffer), destroy the buffer (which will asynchronously
// deallocate the buffer) in a loop
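
Note: the recurring shape change in these gtest files replaces a ttnn::Shape built from tt::tt_metal::LegacyShape with a ttnn::SimpleShape constructed directly from its dimensions, and test_create_tensor.cpp above now passes the SimpleShape straight through instead of round-tripping via ttnn::Shape. A rough sketch of the idea; this SimpleShape is a local stand-in modeled on the call sites above, not the real ttnn type:

    #include <array>
    #include <cstdint>
    #include <functional>
    #include <numeric>

    // Local stand-in: a plain 4-D extent type, constructible from its dims.
    struct SimpleShape {
        std::array<uint32_t, 4> dims;
        uint64_t volume() const {
            return std::accumulate(dims.begin(), dims.end(), uint64_t{1},
                                   std::multiplies<>{});
        }
    };

    int main() {
        SimpleShape shape{{1, 1, 1024, 1024}};    // no LegacyShape wrapper needed
        constexpr uint64_t datum_size_bytes = 2;  // bfloat16
        // Mirrors the buf_size_datums * datum_size_bytes arithmetic in the tests.
        uint64_t buf_size = shape.volume() * datum_size_bytes;
        return buf_size == uint64_t{1024} * 1024 * 2 ? 0 : 1;
    }
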
diff --git a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp
index 18e635aa4f1..df3476bd545 100644
--- a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp
+++ b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp
@@ -27,7 +27,14 @@ std::vector<Tensor> run_operation(
const operation::OptionalTensors& optional_output_tensors = {}) {
static_assert(operation::detail::is_device_operation(), "ttnn::run_operation can only dispatch Device Operations!");
// Create output tensor vector by examining the number of output shapes created by the device operation
- std::vector<Tensor> outputs(operation::DeviceOperation(devop).compute_output_shapes(input_tensors).size());
+ auto output_shapes = operation::DeviceOperation(devop).compute_output_shapes(input_tensors);
+ size_t output_shapes_size = 0;
+ if (std::holds_alternative<std::vector<ttnn::SimpleShape>>(output_shapes)) {
+ output_shapes_size = std::get<std::vector<ttnn::SimpleShape>>(output_shapes).size();
+ } else {
+ output_shapes_size = std::get<std::vector<tt::tt_metal::LegacyShape>>(output_shapes).size();
+ }
+ std::vector<Tensor> outputs(output_shapes_size);
// Populate the workers of the output tensors, based on the input tensors. This is needed for the async engine.
for (int i = 0; i < outputs.size(); i++) {
outputs[i] = Tensor(operation::get_workers_for_op_output(std::move(input_tensors), std::move(optional_input_tensors)));
@@ -117,7 +124,7 @@ TEST(GalaxyTests, TestAllGatherDeadlock) {
.memory_layout = TensorMemoryLayout::INTERLEAVED,
.buffer_type = BufferType::DRAM,
.shard_spec = std::nullopt};
- ttnn::Shape shape = ttnn::Shape(LegacyShape({1, 1, 32, 16384}));
+ ttnn::SimpleShape shape{1, 1, 32, 16384};
const uint32_t buf_size_datums = 32 * 16384;
const uint32_t datum_size_bytes = 2;
auto host_data = std::shared_ptr<bfloat16[]>(new bfloat16[buf_size_datums]);
@@ -210,7 +217,7 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) {
.memory_layout = TensorMemoryLayout::INTERLEAVED,
.buffer_type = BufferType::DRAM,
.shard_spec = std::nullopt};
- ttnn::Shape shape = ttnn::Shape(LegacyShape({1, 2, 256, static_cast<uint32_t>(256 * ring_devices.size())}));
+ ttnn::SimpleShape shape{1, 2, 256, static_cast<uint32_t>(256 * ring_devices.size())};
const uint32_t buf_size_datums = 2 * 256 * 256 * ring_devices.size();
const uint32_t datum_size_bytes = 2;
// Output of reduce scatter is input_numel / num_devices_used_in_scatter_op
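
Note: the holds_alternative/get ladder in the first hunk of this file assumes compute_output_shapes now returns a std::variant over two shape-vector alternatives. Under that same assumption, std::visit expresses the size dispatch in one line, since both alternatives expose .size(); a sketch with stand-in shape types:

    #include <cstddef>
    #include <variant>
    #include <vector>

    // Stand-ins for the two assumed variant alternatives.
    struct LegacyShape {};
    struct SimpleShape {};
    using OutputShapes =
        std::variant<std::vector<LegacyShape>, std::vector<SimpleShape>>;

    // One generic lambda covers both alternatives.
    std::size_t count_outputs(const OutputShapes& shapes) {
        return std::visit([](const auto& v) { return v.size(); }, shapes);
    }

    int main() {
        OutputShapes shapes = std::vector<SimpleShape>(3);
        return count_outputs(shapes) == 3 ? 0 : 1;
    }
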
diff --git a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp
index e06824058b6..52f4320fba0 100644
--- a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp
+++ b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp
@@ -44,7 +44,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ1) {
.buffer_type = BufferType::DRAM,
.shard_spec = std::nullopt};
- ttnn::Shape shape = ttnn::Shape(tt::tt_metal::LegacyShape({1, 3, 2048, 2048}));
+ ttnn::SimpleShape shape{1, 3, 2048, 2048};
uint32_t buf_size_datums = 2048 * 2048 * 3;
uint32_t datum_size_bytes = 2;
auto host_data = std::shared_ptr<bfloat16[]>(new bfloat16[buf_size_datums]);
@@ -94,7 +94,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ0) {
.buffer_type = BufferType::DRAM,
.shard_spec = std::nullopt};
- ttnn::Shape shape = ttnn::Shape(tt::tt_metal::LegacyShape({1, 3, 2048, 2048}));
+ ttnn::SimpleShape shape{1, 3, 2048, 2048};
uint32_t buf_size_datums = 2048 * 2048 * 3;
uint32_t datum_size_bytes = 2;
auto host_data = std::shared_ptr<bfloat16[]>(new bfloat16[buf_size_datums]);
@@ -145,7 +145,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceWithCQ1Only) {
.buffer_type = BufferType::DRAM,
.shard_spec = std::nullopt};
- ttnn::Shape shape = ttnn::Shape(tt::tt_metal::LegacyShape({1, 3, 2048, 2048}));
+ ttnn::SimpleShape shape{1, 3, 2048, 2048};
uint32_t buf_size_datums = 2048 * 2048 * 3;
uint32_t datum_size_bytes = 2;
auto host_data = std::shared_ptr<bfloat16[]>(new bfloat16[buf_size_datums]);
diff --git a/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp b/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp
index d6590d9a395..20e3350dc38 100644
--- a/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp
+++ b/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp
@@ -39,7 +39,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiProducerLockBasedQueue) {
uint32_t tensor_buf_size = 1024 * 1024;
uint32_t datum_size_bytes = 2;
- ttnn::Shape tensor_shape = ttnn::Shape(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}));
+ ttnn::SimpleShape tensor_shape{1, 1, 1024, 1024};
auto t0_host_data = std::shared_ptr<bfloat16[]>(new bfloat16[tensor_buf_size]);
auto t0_readback_data = std::shared_ptr<bfloat16[]>(new bfloat16[tensor_buf_size]);
auto t1_host_data = std::shared_ptr<bfloat16[]>(new bfloat16[tensor_buf_size]);
@@ -117,7 +117,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiAppThreadSync) {
std::shared_ptr<Event> write_event = std::make_shared<Event>();
std::shared_ptr<Event> read_event = std::make_shared<Event>();
- ttnn::Shape tensor_shape = ttnn::Shape(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}));
+ ttnn::SimpleShape tensor_shape{1, 1, 1024, 1024};
auto host_data = std::shared_ptr<bfloat16[]>(new bfloat16[tensor_buf_size]);
auto allocated_buffer = ttnn::allocate_buffer_on_device(tensor_buf_size * datum_size_bytes, device, tensor_shape, DataType::BFLOAT16, Layout::TILE, mem_cfg);
auto allocated_storage = tt::tt_metal::DeviceStorage{allocated_buffer};
diff --git a/tests/ttnn/unit_tests/gtests/test_repeat_interleave.cpp b/tests/ttnn/unit_tests/gtests/test_repeat_interleave.cpp
index 884ee2475e3..1dee81c29e0 100644
--- a/tests/ttnn/unit_tests/gtests/test_repeat_interleave.cpp
+++ b/tests/ttnn/unit_tests/gtests/test_repeat_interleave.cpp
@@ -30,7 +30,7 @@ void run_repeat_interleave_test(tt::tt_metal::Device* device, const uint32_t rep
const uint32_t input_buf_size_datums = 32 * 32;
const uint32_t output_buf_size_datums = input_buf_size_datums * repeats;
const uint32_t datum_size_bytes = 2;
- ttnn::Shape input_shape = ttnn::Shape(tt::tt_metal::LegacyShape({1, 1, 32, 32}));
+ ttnn::SimpleShape input_shape{1, 1, 32, 32};
auto host_data = std::shared_ptr<uint16_t[]>(new uint16_t[input_buf_size_datums]);
auto readback_data = std::shared_ptr<uint16_t[]>(new uint16_t[output_buf_size_datums]);
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_lt.py b/tests/ttnn/unit_tests/operations/backward/test_backward_lt.py
deleted file mode 100644
index 170a41446d4..00000000000
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_lt.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-
-# SPDX-License-Identifier: Apache-2.0
-
-import torch
-import pytest
-import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
-
-
-@pytest.mark.parametrize(
- "input_shapes",
- (
- (torch.Size([1, 1, 32, 32])),
- (torch.Size([1, 1, 320, 384])),
- (torch.Size([1, 3, 320, 384])),
- ),
-)
-def test_bw_lt(input_shapes, device):
- in_data, input_tensor = data_gen_with_range(input_shapes, -100, 100, device, True)
- other_data, other_tensor = data_gen_with_range(input_shapes, -100, 100, device, True)
- grad_data, grad_tensor = data_gen_with_range(input_shapes, -100, 100, device)
-
- tt_output_tensor_on_device = ttnn.lt_bw(grad_tensor, input_tensor, other_tensor)
-
- golden_function = ttnn.get_golden_function(ttnn.lt_bw)
- golden_tensor = golden_function(grad_data, in_data, other_data)
-
- status = compare_pcc(tt_output_tensor_on_device, golden_tensor)
- assert status
-
-
-@pytest.mark.parametrize(
- "input_shapes",
- (
- (torch.Size([1, 1, 32, 32])),
- (torch.Size([1, 1, 320, 384])),
- (torch.Size([1, 3, 320, 384])),
- ),
-)
-@pytest.mark.parametrize("are_required_outputs", [[True, True], [True, False], [False, True]])
-def test_bw_lt_with_opt_output(input_shapes, device, are_required_outputs):
- in_data, input_tensor = data_gen_with_range(input_shapes, -100, 100, device, True)
- other_data, other_tensor = data_gen_with_range(input_shapes, -90, 100, device, True)
- grad_data, grad_tensor = data_gen_with_range(input_shapes, -70, 90, device)
- input_grad = None
- other_grad = None
-
- if are_required_outputs[0]:
- _, input_grad = data_gen_with_range(input_shapes, -1, 1, device)
- if are_required_outputs[1]:
- _, other_grad = data_gen_with_range(input_shapes, -1, 1, device)
-
- cq_id = 0
-
- pages_before = ttnn._ttnn.reports.get_buffer_pages()
- ttnn.lt_bw(
- grad_tensor,
- input_tensor,
- other_tensor,
- are_required_outputs=are_required_outputs,
- input_grad=input_grad,
- other_grad=other_grad,
- queue_id=cq_id,
- )
- assert len(pages_before) == len(ttnn._ttnn.reports.get_buffer_pages())
- tt_output_tensor_on_device = [input_grad, other_grad]
-
- golden_function = ttnn.get_golden_function(ttnn.lt_bw)
- golden_tensor = golden_function(grad_data, in_data, other_data)
-
- status = True
- for i in range(len(are_required_outputs)):
- if are_required_outputs[i]:
- status = status & compare_pcc([tt_output_tensor_on_device[i]], [golden_tensor[i]])
- assert status
-
-
-@pytest.mark.parametrize(
- "input_shapes",
- (
- (torch.Size([1, 1, 32, 32])),
- (torch.Size([1, 1, 320, 384])),
- (torch.Size([1, 3, 320, 384])),
- ),
-)
-@pytest.mark.parametrize("scalar", [1.0, 0.5, 0.035])
-def test_bw_lt_scalar(input_shapes, scalar, device):
- in_data, input_tensor = data_gen_with_range(input_shapes, -100, 100, device, True)
- grad_data, grad_tensor = data_gen_with_range(input_shapes, -100, 100, device)
-
- tt_output_tensor_on_device = ttnn.lt_bw(grad_tensor, input_tensor, scalar)
-
- golden_function = ttnn.get_golden_function(ttnn.lt_bw)
- golden_tensor = golden_function(grad_data, in_data, scalar)
-
- status = compare_pcc(tt_output_tensor_on_device, golden_tensor)
- assert status
-
-
-@pytest.mark.parametrize(
- "input_shapes",
- (
- (torch.Size([1, 1, 32, 32])),
- (torch.Size([1, 1, 320, 384])),
- (torch.Size([1, 3, 320, 384])),
- ),
-)
-@pytest.mark.parametrize("scalar", [1.0, 0.5, 0.035])
-def test_bw_lt_with_scalar_opt_output(input_shapes, device, scalar):
- in_data, input_tensor = data_gen_with_range(input_shapes, -100, 100, device, True)
- grad_data, grad_tensor = data_gen_with_range(input_shapes, -70, 90, device)
- input_grad = None
- _, input_grad = data_gen_with_range(input_shapes, -1, 1, device)
-
- cq_id = 0
-
- pages_before = ttnn._ttnn.reports.get_buffer_pages()
- ttnn.lt_bw(
- grad_tensor,
- input_tensor,
- scalar,
- input_grad=input_grad,
- queue_id=cq_id,
- )
- assert len(pages_before) == len(ttnn._ttnn.reports.get_buffer_pages())
- tt_output_tensor_on_device = [input_grad]
-
- golden_function = ttnn.get_golden_function(ttnn.lt_bw)
- golden_tensor = golden_function(grad_data, in_data, scalar)
-
- status = compare_pcc(tt_output_tensor_on_device, golden_tensor)
- assert status
-
-
-@pytest.mark.parametrize(
- "input_shapes",
- (
- (torch.Size([1, 1, 32, 32])),
- (torch.Size([1, 1, 320, 384])),
- (torch.Size([1, 3, 320, 384])),
- ),
-)
-@pytest.mark.parametrize("are_required_outputs", [[True, True], [True, False], [False, True]])
-def test_bw_lt_with_opt_output_opt_qid(input_shapes, device, are_required_outputs):
- in_data, input_tensor = data_gen_with_range(input_shapes, -100, 100, device, True)
- other_data, other_tensor = data_gen_with_range(input_shapes, -90, 100, device, True)
- grad_data, grad_tensor = data_gen_with_range(input_shapes, -70, 90, device)
- input_grad = None
- other_grad = None
-
- if are_required_outputs[0]:
- _, input_grad = data_gen_with_range(input_shapes, -1, 1, device)
- if are_required_outputs[1]:
- _, other_grad = data_gen_with_range(input_shapes, -1, 1, device)
-
- pages_before = ttnn._ttnn.reports.get_buffer_pages()
- ttnn.lt_bw(
- grad_tensor,
- input_tensor,
- other_tensor,
- are_required_outputs=are_required_outputs,
- input_grad=input_grad,
- other_grad=other_grad,
- )
- assert len(pages_before) == len(ttnn._ttnn.reports.get_buffer_pages())
- tt_output_tensor_on_device = [input_grad, other_grad]
-
- golden_function = ttnn.get_golden_function(ttnn.lt_bw)
- golden_tensor = golden_function(grad_data, in_data, other_data)
-
- status = True
- for i in range(len(are_required_outputs)):
- if are_required_outputs[i]:
- status = status & compare_pcc([tt_output_tensor_on_device[i]], [golden_tensor[i]])
- assert status
-
-
-@pytest.mark.parametrize(
- "input_shapes",
- (
- (torch.Size([1, 1, 32, 32])),
- (torch.Size([1, 1, 320, 384])),
- (torch.Size([1, 3, 320, 384])),
- ),
-)
-@pytest.mark.parametrize("scalar", [1.0, 0.5, 0.035])
-def test_bw_lt_with_scalar_opt_output_opt_qid(input_shapes, device, scalar):
- in_data, input_tensor = data_gen_with_range(input_shapes, -100, 100, device, True)
- grad_data, grad_tensor = data_gen_with_range(input_shapes, -70, 90, device)
- input_grad = None
- _, input_grad = data_gen_with_range(input_shapes, -1, 1, device)
-
- pages_before = ttnn._ttnn.reports.get_buffer_pages()
- ttnn.lt_bw(
- grad_tensor,
- input_tensor,
- scalar,
- input_grad=input_grad,
- )
- assert len(pages_before) == len(ttnn._ttnn.reports.get_buffer_pages())
- tt_output_tensor_on_device = [input_grad]
-
- golden_function = ttnn.get_golden_function(ttnn.lt_bw)
- golden_tensor = golden_function(grad_data, in_data, scalar)
-
- status = compare_pcc(tt_output_tensor_on_device, golden_tensor)
- assert status
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/backward_complex_utility_funcs.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/backward_complex_utility_funcs.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/backward_complex_utility_funcs.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/backward_complex_utility_funcs.py
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_abs.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_abs.py
similarity index 93%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_abs.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_abs.py
index 3305f9ef35e..4da79b2f824 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_abs.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_abs.py
@@ -11,11 +11,11 @@
import pytest
import ttnn
from loguru import logger
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_angle.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_angle.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_angle.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_angle.py
index 86c8b1156c5..429bb518d69 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_angle.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_angle.py
@@ -11,11 +11,11 @@
import pytest
import ttnn
from loguru import logger
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_add.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_add.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_add.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_add.py
index 164d78baf73..bcf1fd9861c 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_add.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_add.py
@@ -15,7 +15,7 @@
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_div.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_div.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_div.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_div.py
index 3aa19df1c67..9572487d078 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_div.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_div.py
@@ -19,7 +19,7 @@
is_wormhole_b0,
is_blackhole,
)
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_mul.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_mul.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_mul.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_mul.py
index f5e88bc9c4f..588bbf32f00 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_mul.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_mul.py
@@ -16,7 +16,7 @@
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_sub.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_sub.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_sub.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_sub.py
index ee43e94e8c0..60fb342ae1c 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_complex_sub.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_complex_sub.py
@@ -10,7 +10,7 @@
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_conj.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_conj.py
similarity index 90%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_conj.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_conj.py
index 25ce48408fb..3845ae09789 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_conj.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_conj.py
@@ -11,11 +11,11 @@
import pytest
import ttnn
from loguru import logger
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_imag.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_imag.py
similarity index 94%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_imag.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_imag.py
index e006f5d82d9..10b4ef3a74b 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_imag.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_imag.py
@@ -15,7 +15,7 @@
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_polar.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_polar.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_polar.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_polar.py
index 9c1e910ccd5..36ae664a3cd 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_polar.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_polar.py
@@ -16,7 +16,7 @@
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_real.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_real.py
similarity index 94%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_real.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_real.py
index 36e3e680e98..89a94d02360 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_real.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_real.py
@@ -15,7 +15,7 @@
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_recip.py b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_recip.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_recip.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_recip.py
index ccfa6c64215..3a5e8cba83d 100644
--- a/tests/ttnn/unit_tests/operations/backward/complex_ops/test_backward_recip.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/complex_ops/test_backward_recip.py
@@ -11,11 +11,11 @@
import pytest
import ttnn
from loguru import logger
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0, skip_for_wormhole_b0, is_blackhole
-from tests.ttnn.unit_tests.operations.backward.complex_ops.backward_complex_utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.complex_ops.backward_complex_utility_funcs import (
Complex,
convert_to_torch_tensor,
random_complex_tensor,
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_abs.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_abs.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_abs.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_abs.py
index 0e4663b7f52..630242e0dc2 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_abs.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_abs.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_acos.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_acos.py
similarity index 87%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_acos.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_acos.py
index e83d92d594d..d4e4a190747 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_acos.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_acos.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_acosh.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_acosh.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_acosh.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_acosh.py
index 08e9da6b616..137918090ef 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_acosh.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_acosh.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_add.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_add.py
similarity index 96%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_add.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_add.py
index 19029f0ae6f..835ac41dadd 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_add.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_add.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_addalpha.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_addalpha.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_addalpha.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_addalpha.py
index 7ae316a5297..048d087cf6e 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_addalpha.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_addalpha.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_addcdiv.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_addcdiv.py
similarity index 91%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_addcdiv.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_addcdiv.py
index 70007b0bcfe..2df72410854 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_addcdiv.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_addcdiv.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_addcmul.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_addcmul.py
similarity index 91%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_addcmul.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_addcmul.py
index 8252447408d..cf97d53a7d4 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_addcmul.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_addcmul.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_asin.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_asin.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_asin.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_asin.py
index e23e5eac232..39e48022133 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_asin.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_asin.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_asinh.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_asinh.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_asinh.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_asinh.py
index eb231a9a450..023071ad3b2 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_asinh.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_asinh.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_assign.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_assign.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_assign.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_assign.py
index 48d9115fa36..3115c447a1b 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_assign.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_assign.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_atan.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_atan.py
similarity index 91%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_atan.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_atan.py
index 686bf6af4d9..91aae353528 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_atan.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_atan.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
compare_pcc,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_atan2.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_atan2.py
similarity index 93%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_atan2.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_atan2.py
index bff00df6cb4..528c129380d 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_atan2.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_atan2.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
compare_pcc,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_atanh.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_atanh.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_atanh.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_atanh.py
index c69a6f6fb59..ef877ccbf02 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_atanh.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_atanh.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_bias_gelu.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_bias_gelu.py
similarity index 96%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_bias_gelu.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_bias_gelu.py
index 9ce2c7b55f5..af2b406e292 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_bias_gelu.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_bias_gelu.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_ceil.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_ceil.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_ceil.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_ceil.py
index b9b540ec2e0..04c07035493 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_ceil.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_ceil.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_celu.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_celu.py
similarity index 93%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_celu.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_celu.py
index b42841ad356..7be5986f39f 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_celu.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_celu.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_clamp.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_clamp.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_clamp.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_clamp.py
index e213e1103d9..7720d27bf2f 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_clamp.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_clamp.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_concat.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_concat.py
similarity index 98%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_concat.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_concat.py
index e239e2d1aff..e589a9d3b86 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_concat.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_concat.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_cos.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_cos.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_cos.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_cos.py
index 0a4fd1e29ba..9ce9c1a0788 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_cos.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_cos.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
from math import pi


diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_cosh.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_cosh.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_cosh.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_cosh.py
index b61c6fe0332..702289300a1 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_cosh.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_cosh.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
compare_pcc,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_deg2rad.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_deg2rad.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_deg2rad.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_deg2rad.py
index 694679c3edc..63955715069 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_deg2rad.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_deg2rad.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_digamma.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_digamma.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_digamma.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_digamma.py
index 10a5fc3bf94..b776a6312dc 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_digamma.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_digamma.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_div.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_div.py
similarity index 98%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_div.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_div.py
index c8f1bd584cc..66543b493ac 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_div.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_div.py
@@ -5,7 +5,11 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, data_gen_with_val, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
+ data_gen_with_range,
+ data_gen_with_val,
+ compare_pcc,
+)
from models.utility_functions import (
is_wormhole_b0,
is_blackhole,
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_div_no_nan.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_div_no_nan.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_div_no_nan.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_div_no_nan.py
index 70d3ba39cc4..60bc50101f6 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_div_no_nan.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_div_no_nan.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_elu.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_elu.py
similarity index 93%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_elu.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_elu.py
index 8114d3022c1..f93abf03e8b 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_elu.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_elu.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_embedding.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_embedding.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_embedding.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_embedding.py
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_erf.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_erf.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_erf.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_erf.py
index 0278ad60e29..336965087e9 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_erf.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_erf.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_erfc.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_erfc.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_erfc.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_erfc.py
index 19a0cfc206c..6d91131c415 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_erfc.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_erfc.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_erfinv.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_erfinv.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_erfinv.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_erfinv.py
index 4c3197ce763..d235f11019e 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_erfinv.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_erfinv.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_exp.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_exp.py
similarity index 93%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_exp.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_exp.py
index 9de5f8cbc1e..ab779433edf 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_exp.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_exp.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_exp2.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_exp2.py
similarity index 87%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_exp2.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_exp2.py
index 825e8232150..52511cdeaff 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_exp2.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_exp2.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_expm1.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_expm1.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_expm1.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_expm1.py
index 5ea5b9672b2..df8fbdf8153 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_expm1.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_expm1.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_fill.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_fill.py
similarity index 96%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_fill.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_fill.py
index 16a9711867b..228e139f05e 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_fill.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_fill.py
@@ -6,7 +6,7 @@
import pytest
import ttnn
from models.utility_functions import is_wormhole_b0, is_blackhole
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
compare_all_close,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_fill_zero.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_fill_zero.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_fill_zero.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_fill_zero.py
index cabdb82b516..6c946d303a3 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_fill_zero.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_fill_zero.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_floor.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_floor.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_floor.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_floor.py
index 0229e5dca28..afe14c79ec4 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_floor.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_floor.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_fmod.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_fmod.py
similarity index 94%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_fmod.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_fmod.py
index e9f0cc64293..d5ecc666e50 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_fmod.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_fmod.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range
from models.utility_functions import skip_for_grayskull


diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_frac.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_frac.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_frac.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_frac.py
index 2e256084db6..1f148d4fab4 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_frac.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_frac.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_gelu.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_gelu.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_gelu.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_gelu.py
index 236073e3cb9..1b514166b83 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_gelu.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_gelu.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_hardshrink.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardshrink.py
similarity index 93%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_hardshrink.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardshrink.py
index 5700e708382..96812608e00 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_hardshrink.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardshrink.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_hardsigmoid.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardsigmoid.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_hardsigmoid.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardsigmoid.py
index d7ec46359bd..535c0844211 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_hardsigmoid.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardsigmoid.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_hardswish.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardswish.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_hardswish.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardswish.py
index 14b51e51175..8241bb4b379 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_hardswish.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardswish.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_hardtanh.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardtanh.py
similarity index 93%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_hardtanh.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardtanh.py
index d8e20cf43c3..75b0f22adab 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_hardtanh.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hardtanh.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_hypot.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hypot.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_hypot.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hypot.py
index 5e28b34e894..05c199fb804 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_hypot.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_hypot.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_i0.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_i0.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_i0.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_i0.py
index fa9a40f2e8f..20aa2720a18 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_i0.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_i0.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_ldexp.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_ldexp.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_ldexp.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_ldexp.py
index 36893e981ef..67ede15225b 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_ldexp.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_ldexp.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_leaky_relu.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_leaky_relu.py
similarity index 93%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_leaky_relu.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_leaky_relu.py
index 5c33b4f1664..41f87a4e4a6 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_leaky_relu.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_leaky_relu.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_lerp.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_lerp.py
similarity index 94%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_lerp.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_lerp.py
index 9bc3a6584cf..676a5885b45 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_lerp.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_lerp.py
@@ -6,7 +6,7 @@
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_lgamma.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_lgamma.py
similarity index 91%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_lgamma.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_lgamma.py
index d85b05be42d..2d066175651 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_lgamma.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_lgamma.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
compare_pcc,
data_gen_with_range,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_log.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_log.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log.py
index 9b460ec1e32..85db970d27b 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_log.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_val,
compare_pcc,
data_gen_with_range,
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_log10.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log10.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_log10.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log10.py
index 20ca394a6bf..0c8e4aea027 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_log10.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log10.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_log1p.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log1p.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_log1p.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log1p.py
index f1bf22ff16a..c0e04042a46 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_log1p.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log1p.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_log2.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log2.py
similarity index 91%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_log2.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log2.py
index 5d8256b1527..820fa92cee3 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_log2.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log2.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
compare_pcc,
data_gen_with_range,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_log_sigmoid.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log_sigmoid.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_log_sigmoid.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log_sigmoid.py
index 82ba28fd9b4..bd3def80766 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_log_sigmoid.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_log_sigmoid.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
compare_pcc,
data_gen_with_val,
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_logaddexp.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logaddexp.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_logaddexp.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logaddexp.py
index 4215c7de991..db0c1d6453f 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_logaddexp.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logaddexp.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_logaddexp2.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logaddexp2.py
similarity index 92%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_logaddexp2.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logaddexp2.py
index 89009a1ba6c..6a2f56a2745 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_logaddexp2.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logaddexp2.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
compare_pcc,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_logit.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logit.py
similarity index 91%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_logit.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logit.py
index ac7cc3811ab..576c0d7be79 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_logit.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logit.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
compare_pcc,
data_gen_with_range,
data_gen_with_val,
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_logiteps.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logiteps.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_logiteps.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logiteps.py
index 5546090dcf7..bbe3733444b 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_logiteps.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_logiteps.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
data_gen_with_val,
compare_pcc,
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_max.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_max.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_max.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_max.py
index bede78fff64..9025bf37420 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_max.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_max.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_min.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_min.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_min.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_min.py
index 26a3712b881..dcf48a8a79c 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_min.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_min.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_mul.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_mul.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_mul.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_mul.py
index 06becdfca1e..71d142b4692 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_mul.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_mul.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_mvlgamma.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_mvlgamma.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_mvlgamma.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_mvlgamma.py
index a3877f5403a..1b4bace286e 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_mvlgamma.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_mvlgamma.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_neg.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_neg.py
similarity index 96%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_neg.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_neg.py
index 573b8fc8822..a103128a4db 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_neg.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_neg.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_polygamma.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_polygamma.py
similarity index 96%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_polygamma.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_polygamma.py
index d2a1670bbb3..b2053185bea 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_polygamma.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_polygamma.py
@@ -5,7 +5,11 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range, data_gen_with_val
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
+ compare_pcc,
+ data_gen_with_range,
+ data_gen_with_val,
+)
from models.utility_functions import (
is_wormhole_b0,
is_blackhole,
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_pow.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_pow.py
similarity index 98%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_pow.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_pow.py
index 393184e7bb9..aa5d72dc4c0 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_pow.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_pow.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
compare_pcc,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_prod.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_prod.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_prod.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_prod.py
index bd0db0d4eda..1dada61fe41 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_prod.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_prod.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_pt_tt,
data_gen_pt_tt_prod,
compare_results,
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_rad2deg.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rad2deg.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_rad2deg.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rad2deg.py
index 54f72d9e1fe..8eaed4c7c89 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_rad2deg.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rad2deg.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_rdiv.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rdiv.py
similarity index 94%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_rdiv.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rdiv.py
index 1b7afe5b9ed..e00b9da6eed 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_rdiv.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rdiv.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_pt_tt, compare_results
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_pt_tt, compare_results


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_reciprocal.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_reciprocal.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_reciprocal.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_reciprocal.py
index 009dab096ca..a226f2f6137 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_reciprocal.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_reciprocal.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
compare_pcc,
data_gen_with_range,
data_gen_with_val,
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_relu.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_relu.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_relu.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_relu.py
index 25aaa9c840f..76bcad4413d 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_relu.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_relu.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_relu6.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_relu6.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_relu6.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_relu6.py
index a34cdedbcb7..ec96169e8aa 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_relu6.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_relu6.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc


@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_remainder.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_remainder.py
similarity index 94%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_remainder.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_remainder.py
index e20164644f1..1bb21aaa033 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_remainder.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_remainder.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range
from models.utility_functions import skip_for_grayskull


diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_repeat.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_repeat.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_repeat.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_repeat.py
index 718ec611a5c..5aead86970f 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_repeat.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_repeat.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_pt_tt, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_pt_tt, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_round.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_round.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_round.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_round.py
index 345f027743e..cbf5af1c47c 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_round.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_round.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_rpow.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rpow.py
similarity index 92%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_rpow.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rpow.py
index f82cc3d4ffd..fcaa965b3af 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_rpow.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rpow.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
compare_pcc,
data_gen_with_range,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_rsqrt.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rsqrt.py
similarity index 94%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_rsqrt.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rsqrt.py
index 7411f22ff94..6430d29cdd1 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_rsqrt.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rsqrt.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_rsub.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rsub.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_rsub.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rsub.py
index ec8647a90ab..452846d4a8e 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_rsub.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rsub.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_selu.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_selu.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_selu.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_selu.py
index f46107c1257..60a32f18abe 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_selu.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_selu.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_sigmoid.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sigmoid.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_sigmoid.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sigmoid.py
index c8ccd418f1e..6c0d38576ac 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_sigmoid.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sigmoid.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_sign.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sign.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_sign.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sign.py
index d2a36a493de..abbd7a1335d 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_sign.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sign.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_silu.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_silu.py
similarity index 94%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_silu.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_silu.py
index 21b41919f66..1274ede323c 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_silu.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_silu.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_sin.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sin.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_sin.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sin.py
index f3b22857581..79d4b4ad05c 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_sin.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sin.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
from math import pi
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_sinh.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sinh.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_sinh.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sinh.py
index c1c233bfaa9..26b511683bf 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_sinh.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sinh.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
from models.utility_functions import (
is_wormhole_b0,
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_softplus.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_softplus.py
similarity index 94%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_softplus.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_softplus.py
index f1fcc514cf9..7d191595c64 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_softplus.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_softplus.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_softshrink.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_softshrink.py
similarity index 95%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_softshrink.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_softshrink.py
index bf3651a8fac..275be0d5d39 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_softshrink.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_softshrink.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
compare_results,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_softsign.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_softsign.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_softsign.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_softsign.py
index b83ab110341..a4c5d1837a3 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_softsign.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_softsign.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_sqrt.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sqrt.py
similarity index 93%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_sqrt.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sqrt.py
index d5ab971374c..e435b33c390 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_sqrt.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sqrt.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_square.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_square.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_square.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_square.py
index 4cf8da1b250..86bfae70d69 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_square.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_square.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_squared_difference.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_squared_difference.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_squared_difference.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_squared_difference.py
index 965c769ab38..11404a49303 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_squared_difference.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_squared_difference.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_sub.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sub.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_sub.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sub.py
index a9f83a26c5e..135103a03f9 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_sub.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_sub.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_subalpha.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_subalpha.py
similarity index 96%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_subalpha.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_subalpha.py
index d0c6ac83de0..2b6870dd002 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_subalpha.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_subalpha.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_tan.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_tan.py
similarity index 92%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_tan.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_tan.py
index d460fcd3aab..a5ab6581cf7 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_tan.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_tan.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
compare_results,
)
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_tanh.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_tanh.py
similarity index 94%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_tanh.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_tanh.py
index b9206bf6a3b..e4e74470648 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_tanh.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_tanh.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_tanhshrink.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_tanhshrink.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_tanhshrink.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_tanhshrink.py
index 4d9183f3a01..cf74e5bd8bc 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_tanhshrink.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_tanhshrink.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_threshold.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_threshold.py
similarity index 90%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_threshold.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_threshold.py
index 34e62c8c5fd..05fdabe68e5 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_threshold.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_threshold.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_trunc.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_trunc.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_trunc.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_trunc.py
index ffc71489351..5ef3b2b91ac 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_trunc.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_trunc.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import compare_pcc, data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import compare_pcc, data_gen_with_range
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_where.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_where.py
similarity index 96%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_where.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_where.py
index a8da1533eea..8a28bccb2fc 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_where.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_where.py
@@ -6,7 +6,7 @@
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/test_backward_xlogy.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_xlogy.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/backward/test_backward_xlogy.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_xlogy.py
index 64a4af879ad..ae7990a164c 100644
--- a/tests/ttnn/unit_tests/operations/backward/test_backward_xlogy.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_xlogy.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/backward/utility_funcs.py b/tests/ttnn/unit_tests/operations/eltwise/backward/utility_funcs.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/backward/utility_funcs.py
rename to tests/ttnn/unit_tests/operations/eltwise/backward/utility_funcs.py
diff --git a/tests/ttnn/unit_tests/operations/complex/test_complex_conj.py b/tests/ttnn/unit_tests/operations/eltwise/complex/test_complex_conj.py
similarity index 89%
rename from tests/ttnn/unit_tests/operations/complex/test_complex_conj.py
rename to tests/ttnn/unit_tests/operations/eltwise/complex/test_complex_conj.py
index 383dcb896b5..2ffa699cedb 100644
--- a/tests/ttnn/unit_tests/operations/complex/test_complex_conj.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/complex/test_complex_conj.py
@@ -7,11 +7,11 @@
import pytest
import ttnn
from loguru import logger
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0, skip_for_grayskull
-from tests.ttnn.unit_tests.operations.complex.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.complex.utility_funcs import (
convert_complex_to_torch_tensor,
random_complex_tensor,
)
diff --git a/tests/ttnn/unit_tests/operations/complex/utility_funcs.py b/tests/ttnn/unit_tests/operations/eltwise/complex/utility_funcs.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/complex/utility_funcs.py
rename to tests/ttnn/unit_tests/operations/eltwise/complex/utility_funcs.py
diff --git a/tests/ttnn/unit_tests/operations/test_activation.py b/tests/ttnn/unit_tests/operations/eltwise/test_activation.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/test_activation.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_activation.py
index 56eb3293c33..2f3f4b41865 100644
--- a/tests/ttnn/unit_tests/operations/test_activation.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_activation.py
@@ -186,6 +186,38 @@ def torch_prelu(x, *args, weight, **kwargs):
return result
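+# The helpers below follow run_activation_test_scalarB, but pass the scalar through
+# the op-specific keyword (alpha for elu, negative_slope for leaky_relu) so the
+# golden function and the ttnn call receive identically named arguments.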
+def run_activation_test_elu(device, h, w, scalar, ttnn_function, pcc=0.99):
+ torch.manual_seed(0)
+
+ torch_input_tensor_a = torch.rand((h, w), dtype=torch.bfloat16)
+ golden_function = ttnn.get_golden_function(ttnn_function)
+ torch_output_tensor = golden_function(torch_input_tensor_a, alpha=scalar)
+
+ input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
+
+ output_tensor = ttnn_function(input_tensor_a, alpha=scalar)
+ output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT)
+ output_tensor = ttnn.from_device(output_tensor)
+ output_tensor = ttnn.to_torch(output_tensor)
+ assert_with_pcc(torch_output_tensor, output_tensor, pcc)
+
+
+def run_activation_test_leaky_relu(device, h, w, scalar, ttnn_function, pcc=0.99):
+ torch.manual_seed(0)
+
+ torch_input_tensor_a = torch.rand((h, w), dtype=torch.bfloat16)
+ golden_function = ttnn.get_golden_function(ttnn_function)
+ torch_output_tensor = golden_function(torch_input_tensor_a, negative_slope=scalar)
+
+ input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
+
+ output_tensor = ttnn_function(input_tensor_a, negative_slope=scalar)
+ output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT)
+ output_tensor = ttnn.from_device(output_tensor)
+ output_tensor = ttnn.to_torch(output_tensor)
+ assert_with_pcc(torch_output_tensor, output_tensor, pcc)
+
+
def run_activation_test_scalarB(device, h, w, scalar, ttnn_function, pcc=0.99):
torch.manual_seed(0)
@@ -222,7 +254,7 @@ def run_activation_test_scalarB_key(device, h, w, value, ttnn_function, pcc=0.99
@pytest.mark.parametrize("h", [64])
@pytest.mark.parametrize("w", [128])
def test_scalarB_elu(device, h, w, scalar):
- run_activation_test_scalarB(device, h, w, scalar, ttnn.elu)
+ run_activation_test_elu(device, h, w, scalar, ttnn.elu)
@pytest.mark.parametrize("alpha", [1, 2.5, 5.0])
@@ -268,11 +300,11 @@ def test_scalarB_heaviside(device, h, w, value):
run_activation_test_scalarB_key(device, h, w, value, ttnn.heaviside)
-@pytest.mark.parametrize("scalar", [-0.5, 0, 0.5])
+@pytest.mark.parametrize("scalar", [-0.5, 0, 0.1, 0.01, 0.5])
@pytest.mark.parametrize("h", [64])
@pytest.mark.parametrize("w", [128])
def test_scalarB_leaky_relu(device, h, w, scalar):
- run_activation_test_scalarB(device, h, w, scalar, ttnn.leaky_relu)
+ run_activation_test_leaky_relu(device, h, w, scalar, ttnn.leaky_relu)
@pytest.mark.parametrize("weight", [-0.5, 1.0, 0.5])
diff --git a/tests/ttnn/unit_tests/operations/test_add.py b/tests/ttnn/unit_tests/operations/eltwise/test_add.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_add.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_add.py
diff --git a/tests/ttnn/unit_tests/operations/test_backward.py b/tests/ttnn/unit_tests/operations/eltwise/test_backward.py
similarity index 98%
rename from tests/ttnn/unit_tests/operations/test_backward.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_backward.py
index afad991bd33..5530a9ababb 100644
--- a/tests/ttnn/unit_tests/operations/test_backward.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_backward.py
@@ -9,7 +9,7 @@
import ttnn
from models.utility_functions import is_wormhole_b0, is_blackhole
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_val,
compare_all_close,
)
diff --git a/tests/ttnn/unit_tests/operations/test_binary_composite.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py
similarity index 99%
rename from tests/ttnn/unit_tests/operations/test_binary_composite.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py
index f5460e85cf9..89e74e1f85b 100644
--- a/tests/ttnn/unit_tests/operations/test_binary_composite.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py
@@ -6,7 +6,7 @@
import pytest
import random
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
data_gen_with_range,
data_gen_with_range_int,
compare_pcc,
diff --git a/tests/ttnn/unit_tests/operations/test_binary_scalar.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_scalar.py
similarity index 90%
rename from tests/ttnn/unit_tests/operations/test_binary_scalar.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_binary_scalar.py
index a421f155fca..a7adfeaa031 100644
--- a/tests/ttnn/unit_tests/operations/test_binary_scalar.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_scalar.py
@@ -6,7 +6,7 @@
import pytest
import ttnn
import random
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/test_complex.py b/tests/ttnn/unit_tests/operations/eltwise/test_complex.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_complex.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_complex.py
diff --git a/tests/ttnn/unit_tests/operations/test_complex_tensor.py b/tests/ttnn/unit_tests/operations/eltwise/test_complex_tensor.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/test_complex_tensor.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_complex_tensor.py
index b12ec3b0b4d..98e080977f6 100644
--- a/tests/ttnn/unit_tests/operations/test_complex_tensor.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_complex_tensor.py
@@ -7,11 +7,11 @@
import pytest
import ttnn
from loguru import logger
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal, comp_allclose
from models.utility_functions import is_wormhole_b0
-from tests.ttnn.unit_tests.operations.complex.utility_funcs import (
+from tests.ttnn.unit_tests.operations.eltwise.complex.utility_funcs import (
convert_complex_to_torch_tensor,
random_complex_tensor,
)
diff --git a/tests/ttnn/unit_tests/operations/test_composite.py b/tests/ttnn/unit_tests/operations/eltwise/test_composite.py
similarity index 99%
rename from tests/ttnn/unit_tests/operations/test_composite.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_composite.py
index 89479b9e858..5f43cd2ee17 100644
--- a/tests/ttnn/unit_tests/operations/test_composite.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_composite.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
from models.utility_functions import skip_for_grayskull, is_wormhole_b0, is_blackhole
diff --git a/tests/ttnn/unit_tests/operations/test_elt_binary.py b/tests/ttnn/unit_tests/operations/eltwise/test_elt_binary.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_elt_binary.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_elt_binary.py
diff --git a/tests/ttnn/unit_tests/operations/test_eltwise_logical_and_.py b/tests/ttnn/unit_tests/operations/eltwise/test_eltwise_logical_and_.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_eltwise_logical_and_.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_eltwise_logical_and_.py
diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_eltwise_softplus_inf.py b/tests/ttnn/unit_tests/operations/eltwise/test_eltwise_softplus_inf.py
new file mode 100644
index 00000000000..bff785ebee2
--- /dev/null
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_eltwise_softplus_inf.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from loguru import logger
+import random
+import pytest
+import torch
+import ttnn
+import traceback
+
+from tests.ttnn.python_api_testing.sweep_tests import ttnn_ops
+from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc
+from models.utility_functions import skip_for_grayskull
+
+
+def run_eltwise_softplus_tests(
+ input_shape,
+ dtype,
+ dlayout,
+ in_mem_config,
+ output_mem_config,
+ beta,
+ threshold,
+ data_seed,
+ device,
+):
+ torch.manual_seed(data_seed)
+ x = torch.Tensor(size=input_shape[0]).uniform_(-100, 100)
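+    # Inputs span [-100, 100], well past the softplus threshold used below, so both
+    # the log1p region and the linear passthrough region are exercised.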
+
+ try:
+ # get ref result
+ ref_value = torch.nn.functional.softplus(x, beta=beta, threshold=threshold)
+
+ x = ttnn_ops.setup_ttnn_tensor(x, device, dlayout[0], in_mem_config[0], dtype[0])
+ tt_result = ttnn.softplus(x, beta=beta, threshold=threshold, memory_config=output_mem_config)
+
+ tt_result = ttnn_ops.ttnn_tensor_to_torch(tt_result, output_mem_config)
+
+ except Exception as e:
+ logger.warning(f"Test execution crashed: {e}")
+ print(traceback.format_exc())
+ raise e
+
+ assert len(tt_result.shape) == len(ref_value.shape)
+ assert tt_result.shape == ref_value.shape
+
+ # compare tt and golden outputs
+ success, pcc_value = comp_pcc(ref_value, tt_result)
+ logger.debug(pcc_value)
+ logger.debug(success)
+
+ assert success
+
+
+test_sweep_args = [
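+    # (input_shape, dtype, layout, input mem config, output mem config, beta, threshold, seed);
+    # beta=0.0 drives the torch reference to +inf everywhere (the 1/beta factor diverges),
+    # which is the inf-propagation case this file is named for.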
+ (
+ [(6, 6, 192, 224)],
+ [ttnn.bfloat16],
+ [ttnn.TILE_LAYOUT],
+ [ttnn.DRAM_MEMORY_CONFIG],
+ ttnn.L1_MEMORY_CONFIG,
+ 0.0,
+ 28.125,
+ 19042500,
+ ),
+]
+
+
+@skip_for_grayskull("Softplus is not available in Grayskull")
+@pytest.mark.parametrize(
+ "input_shape, dtype, dlayout, in_mem_config, out_mem_config, beta, threshold, data_seed",
+ (test_sweep_args),
+)
+def test_eltwise_softplus(
+ input_shape, dtype, dlayout, in_mem_config, out_mem_config, beta, threshold, data_seed, device
+):
+ run_eltwise_softplus_tests(
+ input_shape, dtype, dlayout, in_mem_config, out_mem_config, beta, threshold, data_seed, device
+ )
diff --git a/tests/ttnn/unit_tests/operations/test_eltwise_typecast.py b/tests/ttnn/unit_tests/operations/eltwise/test_eltwise_typecast.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_eltwise_typecast.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_eltwise_typecast.py
diff --git a/tests/ttnn/unit_tests/operations/test_inplace.py b/tests/ttnn/unit_tests/operations/eltwise/test_inplace.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_inplace.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_inplace.py
diff --git a/tests/ttnn/unit_tests/operations/test_math.py b/tests/ttnn/unit_tests/operations/eltwise/test_math.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_math.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_math.py
diff --git a/tests/ttnn/unit_tests/operations/test_math_binary.py b/tests/ttnn/unit_tests/operations/eltwise/test_math_binary.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_math_binary.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_math_binary.py
diff --git a/tests/ttnn/unit_tests/operations/test_mul.py b/tests/ttnn/unit_tests/operations/eltwise/test_mul.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_mul.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_mul.py
diff --git a/tests/ttnn/unit_tests/operations/test_pow.py b/tests/ttnn/unit_tests/operations/eltwise/test_pow.py
similarity index 88%
rename from tests/ttnn/unit_tests/operations/test_pow.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_pow.py
index d0eed9fc9d0..51296079a5e 100644
--- a/tests/ttnn/unit_tests/operations/test_pow.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_pow.py
@@ -5,7 +5,7 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/test_relational.py b/tests/ttnn/unit_tests/operations/eltwise/test_relational.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_relational.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_relational.py
diff --git a/tests/ttnn/unit_tests/operations/test_sub.py b/tests/ttnn/unit_tests/operations/eltwise/test_sub.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_sub.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_sub.py
diff --git a/tests/ttnn/unit_tests/operations/test_ternary.py b/tests/ttnn/unit_tests/operations/eltwise/test_ternary.py
similarity index 100%
rename from tests/ttnn/unit_tests/operations/test_ternary.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_ternary.py
diff --git a/tests/ttnn/unit_tests/operations/test_ternary_composite.py b/tests/ttnn/unit_tests/operations/eltwise/test_ternary_composite.py
similarity index 97%
rename from tests/ttnn/unit_tests/operations/test_ternary_composite.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_ternary_composite.py
index 9d75a2eabfb..2b9207dbbeb 100644
--- a/tests/ttnn/unit_tests/operations/test_ternary_composite.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_ternary_composite.py
@@ -5,7 +5,11 @@
import torch
import pytest
import ttnn
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, data_gen_with_val, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import (
+ data_gen_with_range,
+ data_gen_with_val,
+ compare_pcc,
+)
@pytest.mark.parametrize(
diff --git a/tests/ttnn/unit_tests/operations/test_unary.py b/tests/ttnn/unit_tests/operations/eltwise/test_unary.py
similarity index 99%
rename from tests/ttnn/unit_tests/operations/test_unary.py
rename to tests/ttnn/unit_tests/operations/eltwise/test_unary.py
index b4a3fca3c4c..e70e76a3b2a 100644
--- a/tests/ttnn/unit_tests/operations/test_unary.py
+++ b/tests/ttnn/unit_tests/operations/eltwise/test_unary.py
@@ -9,7 +9,7 @@
import ttnn
from tests.ttnn.utils_for_testing import assert_with_pcc, assert_equal
-from tests.ttnn.unit_tests.operations.backward.utility_funcs import data_gen_with_range, compare_pcc
+from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc
from models.utility_functions import torch_random, skip_for_grayskull, is_wormhole_b0, is_blackhole
diff --git a/tests/ttnn/unit_tests/operations/test_clone.py b/tests/ttnn/unit_tests/operations/test_clone.py
new file mode 100644
index 00000000000..6684cd3f8b2
--- /dev/null
+++ b/tests/ttnn/unit_tests/operations/test_clone.py
@@ -0,0 +1,256 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+import torch
+
+import ttnn
+from models.utility_functions import comp_allclose_and_pcc
+from loguru import logger
+
+from tests.ttnn.unit_tests.operations.test_utils import (
+ to_cpu,
+ to_npu,
+)
+
+
+def get_lib_dtype(lib, dtype):
+ """
+ Maps string-based data types to their corresponding library-specific dtypes.
+
+ Parameters:
+ lib: library module (e.g., torch, ttnn)
+ The library for which the dtype mapping is required.
+ dtype: str
+ The string representation of the data type (e.g., 'bfloat16', 'float32', 'int32').
+
+ Returns:
+ Corresponding library-specific dtype or None if not found.
+ """
+ dtype_map = {
+ "bfloat16": lib.bfloat16,
+ "float32": lib.float32,
+ "int32": lib.int32,
+ }
+ return dtype_map.get(dtype, None)
+
+
+def run_clone(
+ shape,
+ input_memory_config,
+ output_memory_config,
+ input_dtype,
+ output_dtype,
+ tilized,
+ device,
+):
+ """
+ Function to test the clone operation on NPU. Generates random input data, clones it on NPU,
+ and compares the output with the CPU clone for correctness.
+
+ Parameters:
+ shape: tuple
+ Shape of the input tensor.
+ input_memory_config: MemoryConfig
+ Memory configuration for the input tensor on NPU.
+ output_memory_config: MemoryConfig
+ Memory configuration for the output tensor on NPU.
+ input_dtype: str
+ Data type of the input tensor ('int32' or other).
+ output_dtype: str or None
+ Data type of the output tensor (must be None or match input_dtype when not tilized).
+ tilized: bool
+ Whether to use TILE_LAYOUT or ROW_MAJOR_LAYOUT for NPU tensor.
+ device: ttnn.device
+ Device where the operation is performed (e.g., NPU device).
+
+ Raises:
+ pytest.skip: When certain conditions on dtype mismatch or layout are not met.
+ """
+ if input_dtype == "int32":
+ cpu_input = torch.randint(low=-10, high=11, size=shape, dtype=get_lib_dtype(torch, input_dtype))
+ else:
+ cpu_input = 2 * torch.rand(size=shape, dtype=get_lib_dtype(torch, input_dtype)) - 1
+
+ if input_dtype == "int32":
+ if output_dtype and output_dtype != "int32":
+ pytest.skip("For int32 input, output_dtype must be None or int32.")
+ if output_dtype == "int32" and input_dtype != "int32":
+ pytest.skip("For int32 output, input_dtype must also be int32.")
+ if output_dtype != input_dtype and output_dtype and not tilized:
+ pytest.skip("When not tilized, dtype conversion is not supported.")
+
+ npu_input = to_npu(
+ cpu_input,
+ device,
+ npu_dtype=get_lib_dtype(ttnn, input_dtype),
+ npu_layout=ttnn.TILE_LAYOUT if tilized else ttnn.ROW_MAJOR_LAYOUT,
+ ).to(device, input_memory_config)
+
+ npu_output = ttnn.clone(
+ npu_input,
+ dtype=get_lib_dtype(ttnn, output_dtype),
+ memory_config=output_memory_config,
+ )
+
+ cpu_output = to_cpu(npu_output, shape)
+
+ passing, out = comp_allclose_and_pcc(torch.ops.aten.clone(cpu_input), cpu_output, rtol=0.01, atol=0.01)
+ logger.info(out)
+ assert passing
+
+
+memory_config_list = [
+ ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM),
+ ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1),
+]
+
+
+@pytest.mark.parametrize(
+ "shape",
+ [
+ [10], # 1d
+ [10, 10], # 2d
+ [10, 10, 10], # 3d
+ [10, 10, 10, 10], # 4d
+ [1, 1, 1, 30], # Single core
+ [1, 1, 300, 380], # Multi core
+ [1, 3, 320, 380], # Multi core
+ [1, 1, 32, 32], # Single core
+ [1, 1, 320, 384], # Multi core
+ [1, 3, 320, 384], # Multi core
+        [38, 2, 99, 181],  # Odd last dim
+        [5, 33, 319, 381],  # Odd last dim
+ ],
+)
+@pytest.mark.parametrize(
+ "tilized",
+ [True, False],
+)
+def test_clone_shape(
+ shape,
+ tilized,
+ device,
+):
+ """
+ Test case to verify the clone operation on different tensor shapes and layouts (tilized or not).
+ """
+ torch.manual_seed(2024)
+ run_clone(
+ shape,
+ memory_config_list[0],
+ memory_config_list[0],
+ "bfloat16",
+ None,
+ tilized,
+ device,
+ )
+
+
+@pytest.mark.parametrize(
+ "input_memory_config",
+ memory_config_list,
+)
+@pytest.mark.parametrize(
+ "output_memory_config",
+ [*memory_config_list, None],
+)
+@pytest.mark.parametrize(
+ "tilized",
+ [True, False],
+)
+def test_clone_memory_config(
+ input_memory_config,
+ output_memory_config,
+ tilized,
+ device,
+):
+ """
+ Test case to verify the clone operation with different memory configurations (input/output)
+ and layout configurations (tilized or not).
+ """
+ torch.manual_seed(2024)
+ run_clone(
+ [1, 3, 320, 384],
+ input_memory_config,
+ output_memory_config,
+ "bfloat16",
+ None,
+ tilized,
+ device,
+ )
+
+
+@pytest.mark.parametrize(
+ "input_dtype",
+ [
+ "bfloat16",
+ "float32",
+ "int32",
+ ],
+)
+@pytest.mark.parametrize(
+ "output_dtype",
+ [
+ "bfloat16",
+ "float32",
+ "int32",
+ None,
+ ],
+)
+@pytest.mark.parametrize(
+ "tilized",
+ [True, False],
+)
+def test_clone_dtype_conversion(
+ input_dtype,
+ output_dtype,
+ tilized,
+ device,
+):
+ """
+ Test case to verify the clone operation with various input/output dtype combinations.
+ """
+ torch.manual_seed(2024)
+ run_clone(
+ [1, 3, 320, 384],
+ memory_config_list[0],
+ memory_config_list[0],
+ input_dtype,
+ output_dtype,
+ tilized,
+ device,
+ )
+
+
+@pytest.mark.parametrize(
+ "tilized",
+ [True, False],
+)
+def test_clone_callback(
+ tilized,
+ device,
+ use_program_cache,
+):
+ """
+    Test case to verify that the clone operation reuses its cached program across repeated runs.
+ """
+ torch.manual_seed(2024)
+ num_program_cache_entries_list = []
+ for i in range(2):
+ run_clone(
+ [1, 3, 320, 384],
+ memory_config_list[0],
+ memory_config_list[0],
+ "bfloat16",
+ None,
+ tilized,
+ device,
+ )
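+        # Allocate a fresh dummy tensor between iterations so buffers do not land at
+        # identical addresses, which could mask an address-dependent program-cache bug.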
+ torch_dummy = torch.randn([32, 32])
+ tt_dummy = to_npu(torch_dummy, device)
+ num_program_cache_entries_list.append(device.num_program_cache_entries())
+ logger.info(f"num_program_cache_entries_list={num_program_cache_entries_list}")
+ assert num_program_cache_entries_list[0] > 0
+ assert num_program_cache_entries_list[0] == num_program_cache_entries_list[1]
diff --git a/tests/ttnn/unit_tests/operations/test_conv1d.py b/tests/ttnn/unit_tests/operations/test_conv1d.py
index ed9d7cb2ac6..3e7a1496c63 100644
--- a/tests/ttnn/unit_tests/operations/test_conv1d.py
+++ b/tests/ttnn/unit_tests/operations/test_conv1d.py
@@ -45,6 +45,7 @@ def run_conv(
deallocate_activation=True,
debug=False,
groups=1,
+ auto_shard=False,
):
# has_bias = False
has_bias = False
@@ -78,13 +79,17 @@ def run_conv(
tt_input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16)
+ shard_layout = (
+ ttnn.TensorMemoryLayout.HEIGHT_SHARDED if use_1d_systolic_array else ttnn.TensorMemoryLayout.BLOCK_SHARDED
+ )
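+    # auto_shard leaves shard_layout unset so the op can choose a layout itself
+    # (an assumption based on the flag name; the config otherwise forces HEIGHT/BLOCK sharding).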
+ if auto_shard:
+ shard_layout = None
+
conv_config = ttnn.Conv1dConfig(
dtype=output_dtype,
weights_dtype=weights_dtype,
math_fidelity=math_fidelity,
- shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED
- if use_1d_systolic_array
- else ttnn.TensorMemoryLayout.BLOCK_SHARDED,
+ shard_layout=shard_layout,
input_channels_alignment=(16 if use_shallow_conv_variant else 32),
deallocate_activation=deallocate_activation,
fp32_dest_acc_enabled=fp32_accum,
@@ -214,6 +219,7 @@ def test_conv1d_mamba(
padded_input_channels=None,
output_layout=output_layout,
groups=groups,
+ auto_shard=True,
)
diff --git a/tests/ttnn/unit_tests/operations/test_matmul.py b/tests/ttnn/unit_tests/operations/test_matmul.py
index 1b9015471d9..f0aad38592b 100644
--- a/tests/ttnn/unit_tests/operations/test_matmul.py
+++ b/tests/ttnn/unit_tests/operations/test_matmul.py
@@ -1346,3 +1346,30 @@ def core_range_for_num_cores(num_cores):
matmul_output = matmul_output + bias_tensor
assert_with_pcc(matmul_output, tt_mm_out, pcc=0.993)
+
+
+@pytest.mark.parametrize("M", [32, 128])
+@pytest.mark.parametrize("K", [32, 128])
+@pytest.mark.parametrize("N", [32, 128])
+def test_alternating_dst_sync_mode_matmul(device, M, K, N):
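+    # Alternate half and full DST sync mode (selected via core_grid) across three
+    # matmuls to check that switching modes between cached programs keeps results correct.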
+ torch.manual_seed(0)
+ torch_input_tensor_a = torch.randn([1, 1, M, K], dtype=torch.bfloat16)
+ torch_input_tensor_b = torch.randn([1, 1, K, N], dtype=torch.bfloat16)
+ torch_output_tensor = torch.matmul(torch_input_tensor_a, torch_input_tensor_b)
+
+ input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
+ input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device)
+ # Half sync mode
+ output1 = ttnn.matmul(input_tensor_a, input_tensor_b, core_grid=ttnn.CoreGrid(y=4, x=4))
+ # Full sync mode
+ output2 = ttnn.matmul(input_tensor_a, input_tensor_b)
+ # Half sync mode
+ output3 = ttnn.matmul(input_tensor_a, input_tensor_b, core_grid=ttnn.CoreGrid(y=4, x=4))
+
+ pcc = 0.99
+ output_tensor = ttnn.to_torch(output1)
+ assert_with_pcc(torch_output_tensor, output_tensor, pcc=pcc)
+ output_tensor = ttnn.to_torch(output2)
+ assert_with_pcc(torch_output_tensor, output_tensor, pcc=pcc)
+ output_tensor = ttnn.to_torch(output3)
+ assert_with_pcc(torch_output_tensor, output_tensor, pcc=pcc)
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_adam.py b/tests/ttnn/unit_tests/operations/test_moreh_adam.py
index a065103d50c..a1a49aa1b1e 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_adam.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_adam.py
@@ -8,7 +8,9 @@
import ttnn
import pytest
-from models.utility_functions import is_wormhole_b0, comp_allclose_and_pcc, comp_pcc, is_wormhole_b0
+from models.utility_functions import (
+ comp_allclose_and_pcc,
+)
from loguru import logger
from tests.tt_eager.python_api_testing.unit_testing.misc.test_utils import (
get_compute_kernel_options,
@@ -141,3 +143,22 @@ def forward(self, x):
logger.debug(f"Out passing (max_exp_avg_sq)={passing}")
logger.debug(f"Output pcc={out}")
assert passing
+
+
+@pytest.mark.parametrize(
+ "params",
+ (
+ # shape, lr, betas, eps, weight_decay, amsgrad, fp32_dest_acc_en
+ ([32, 32], 0.0, (0.9, 0.999), 1e-06, 0.0, True, True),
+ ([2, 2, 2, 2, 2, 2, 64, 64], 0.0, (0.9, 0.999), 1e-06, 0.0, False, False),
+ ),
+)
+def test_moreh_adam_enable_cache(params, device, use_program_cache):
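+    # amsgrad is toggled on odd iterations, so the four runs compile exactly two
+    # distinct programs; the cache must hold two entries and no more.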
+ for i in range(4):
+ shape, lr, betas, eps, weight_decay, amsgrad, fp32_dest_acc_en = params
+ if i % 2 == 1:
+ amsgrad = not amsgrad
+
+ test_moreh_adam(shape, lr, betas, eps, weight_decay, amsgrad, fp32_dest_acc_en, device)
+
+ assert device.num_program_cache_entries() == 2
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_cumsum.py b/tests/ttnn/unit_tests/operations/test_moreh_cumsum.py
similarity index 54%
rename from tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_cumsum.py
rename to tests/ttnn/unit_tests/operations/test_moreh_cumsum.py
index 206bbcf207e..34048c0ec00 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_moreh_cumsum.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_cumsum.py
@@ -4,12 +4,17 @@
import pytest
import torch
+import ttnn
+
from loguru import logger
-import ttnn
from models.utility_functions import comp_allclose_and_pcc
-from tests.tt_eager.python_api_testing.unit_testing.misc.test_utils import TILE_HEIGHT, TILE_WIDTH
+from tests.ttnn.unit_tests.operations.test_utils import TILE_HEIGHT, TILE_WIDTH
+
+
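+# from_torch replaces the manual ttnn.Tensor(...).pad_to_tile(...).to(...) chain
+# that the hunks below remove.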
+def create_tt_tensor(tensor: torch.Tensor, dtype, device, layout):
+ return ttnn.from_torch(tensor, dtype=dtype, layout=layout, device=device)
def get_tensors(input_shape, output_shape, device):
@@ -21,8 +26,8 @@ def get_tensors(input_shape, output_shape, device):
torch_input = torch.randint(-2, 3, input_shape, dtype=cpu_dtype, requires_grad=True)
torch_output = torch.randint(-2, 3, output_shape, dtype=cpu_dtype)
- tt_input = ttnn.Tensor(torch_input, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device)
- tt_output = ttnn.Tensor(torch_output, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device)
+ tt_input = create_tt_tensor(torch_input, npu_dtype, device, npu_layout)
+ tt_output = create_tt_tensor(torch_output, npu_dtype, device, npu_layout)
return tt_input, tt_output, torch_input
@@ -74,7 +79,7 @@ def test_moreh_cumsum_dim(input_shape, dim, device):
cpu_layout = ttnn.ROW_MAJOR_LAYOUT
tt_output_cpu = (
- ttnn.experimental.operations.primary.moreh_cumsum(tt_input, tt_output, dim=dim)
+ ttnn.operations.moreh.cumsum(tt_input, dim, output=tt_output)
.cpu()
.to(cpu_layout)
.unpad_from_tile(output_shape)
@@ -114,7 +119,7 @@ def test_moreh_cumsum_dim(input_shape, dim, device):
),
ids=["0", "1"],
)
-def test_moreh_cumsumsum_backward(input_shape, dim, device):
+def test_moreh_cumsum_backward(input_shape, dim, device):
output_shape = input_shape.copy()
(_, _, torch_input) = get_tensors(input_shape, output_shape, device)
@@ -125,7 +130,7 @@ def test_moreh_cumsumsum_backward(input_shape, dim, device):
cpu_layout = ttnn.ROW_MAJOR_LAYOUT
tt_input_grad_cpu = (
- ttnn.experimental.operations.primary.moreh_cumsum_backward(tt_output_grad, tt_input_grad, dim=dim)
+ ttnn.operations.moreh.cumsum_backward(tt_output_grad, dim, input_grad=tt_input_grad)
.cpu()
.to(cpu_layout)
.unpad_from_tile(input_shape)
@@ -140,3 +145,100 @@ def test_moreh_cumsumsum_backward(input_shape, dim, device):
logger.debug(f"Output pcc={output_pcc}")
assert passing
+
+
+@pytest.mark.parametrize(
+ "input_shape",
+ (
+ ([1, 1, TILE_HEIGHT - 1, TILE_WIDTH - 1]),
+ ([4, 4, TILE_HEIGHT * 12 - 1, TILE_WIDTH * 30 - 1]),
+ ),
+ ids=[
+ "1, 1, TILE_HEIGHT-1,TILE_WIDTH - 1",
+ "4, 4, TILE_HEIGHT * 12 - 1, TILE_WIDTH * 30 - 1",
+ ],
+)
+@pytest.mark.parametrize(
+ "dim",
+ (
+ 0,
+ 1,
+ ),
+ ids=["0", "1"],
+)
+def test_moreh_cumsum_callback(input_shape, dim, device, use_program_cache):
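+    # The second iteration must hit the program cache compiled by the first, so the
+    # entry count stays at exactly one.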
+ output_shape = input_shape.copy()
+
+ (tt_input, tt_output, torch_input) = get_tensors(input_shape, output_shape, device)
+
+ torch_output = torch.cumsum(torch_input, dim)
+
+ cpu_layout = ttnn.ROW_MAJOR_LAYOUT
+
+    # test for equivalence
+ rtol = atol = 0.1
+
+ for i in range(2):
+ tt_output_cpu = (
+ ttnn.operations.moreh.cumsum(tt_input, dim).cpu().to(cpu_layout).unpad_from_tile(output_shape).to_torch()
+ )
+
+ passing, output_pcc = comp_allclose_and_pcc(torch_output, tt_output_cpu, pcc=0.999, rtol=rtol, atol=atol)
+
+ logger.debug(f"Out passing={passing}")
+ logger.debug(f"Output pcc={output_pcc}")
+
+ assert passing
+ assert device.num_program_cache_entries() == 1
+
+
+@pytest.mark.parametrize(
+ "input_shape",
+ (
+ ([1, 1, TILE_HEIGHT - 1, TILE_WIDTH - 1]),
+ ([4, 4, TILE_HEIGHT * 12 - 1, TILE_WIDTH * 30 - 1]),
+ ),
+ ids=[
+ "1, 1, TILE_HEIGHT-1,TILE_WIDTH - 1",
+ "4, 4, TILE_HEIGHT * 12 - 1, TILE_WIDTH * 30 - 1",
+ ],
+)
+@pytest.mark.parametrize(
+ "dim",
+ (
+ 0,
+ 1,
+ ),
+ ids=["0", "1"],
+)
+def test_moreh_cumsum_backward_callback(input_shape, dim, device, use_program_cache):
+ output_shape = input_shape.copy()
+
+ (_, _, torch_input) = get_tensors(input_shape, output_shape, device)
+ (tt_output_grad, tt_input_grad, torch_output_grad) = get_backward_tensors(output_shape, input_shape, device)
+
+ torch_output = torch.cumsum(torch_input, dim)
+ torch_output.backward(torch_output_grad)
+
+ cpu_layout = ttnn.ROW_MAJOR_LAYOUT
+    # test for equivalence
+ rtol = atol = 0.1
+
+ for i in range(2):
+ tt_input_grad_cpu = (
+ ttnn.operations.moreh.cumsum_backward(tt_output_grad, dim)
+ .cpu()
+ .to(cpu_layout)
+ .unpad_from_tile(input_shape)
+ .to_torch()
+ )
+
+ passing, output_pcc = comp_allclose_and_pcc(
+ torch_input.grad, tt_input_grad_cpu, pcc=0.999, rtol=rtol, atol=atol
+ )
+
+ logger.debug(f"Out passing={passing}")
+ logger.debug(f"Output pcc={output_pcc}")
+
+ assert passing
+ assert device.num_program_cache_entries() == 1
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py b/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py
index 9bfc65aaf3f..2fa86da612c 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_dot_backward.py
@@ -117,8 +117,8 @@ def test_moreh_matmul_1d_backward(input_shape, requires_grad, device):
torch_out.backward(torch_output_grad)
# tt matmul backward
- ttnn.experimental.operations.primary.moreh_matmul_backward(
- tt_output_grad, tt_input, tt_other, (require_input_grad, require_other_grad), tt_input_grad, tt_other_grad
+ ttnn.operations.moreh.dot_backward(
+ tt_output_grad, tt_input, tt_other, input_grad=tt_input_grad, other_grad=tt_other_grad
)
# test for equivalance
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_getitem.py b/tests/ttnn/unit_tests/operations/test_moreh_getitem.py
index 7814eed71b3..e67bdaba854 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_getitem.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_getitem.py
@@ -359,7 +359,7 @@ def test_getitem_tilized_one_index(shape_index_dim, dtype, index_size, row_major
else:
dev_idx = (
ttnn.Tensor(idx, ttnn.int32)
- .reshape([1, 1, 1, 1, index_size])
+ .reshape([1, index_size])
.pad_to_tile(float("nan"))
.to(ttnn.TILE_LAYOUT)
.to(device)
@@ -452,7 +452,7 @@ def test_getitem_tilized_two_indices(shape_index_dims, dtype, index_size, row_ma
else:
dev_idx = (
ttnn.Tensor(idx, ttnn.int32)
- .reshape([1, 1, 1, 1, index_size])
+ .reshape([1, index_size])
.pad_to_tile(float("nan"))
.to(ttnn.TILE_LAYOUT)
.to(device)
@@ -541,7 +541,7 @@ def test_getitem_tilized_three_indices(shape_index_dims, dtype, index_size, row_
else:
dev_idx = (
ttnn.Tensor(idx, ttnn.int32)
- .reshape([1, 1, 1, 1, index_size])
+ .reshape([1, index_size])
.pad_to_tile(float("nan"))
.to(ttnn.TILE_LAYOUT)
.to(device)
@@ -625,7 +625,7 @@ def test_getitem_tilized_four_indices(shape_index_dims, dtype, index_size, row_m
else:
dev_idx = (
ttnn.Tensor(idx, ttnn.int32)
- .reshape([1, 1, 1, 1, index_size])
+ .reshape([1, index_size])
.pad_to_tile(float("nan"))
.to(ttnn.TILE_LAYOUT)
.to(device)
@@ -706,7 +706,7 @@ def test_getitem_tilized_five_indices(shape_index_dims, dtype, index_size, row_m
else:
dev_idx = (
ttnn.Tensor(idx, ttnn.int32)
- .reshape([1, 1, 1, 1, index_size])
+ .reshape([1, index_size])
.pad_to_tile(float("nan"))
.to(ttnn.TILE_LAYOUT)
.to(device)
@@ -751,7 +751,7 @@ def run_moreh_geitem_tilized_one_index(shape_index_dim, dtype, index_size, row_m
else:
dev_idx = (
ttnn.Tensor(idx, ttnn.int32)
- .reshape([1, 1, 1, 1, index_size])
+ .reshape([1, index_size])
.pad_to_tile(float("nan"))
.to(ttnn.TILE_LAYOUT)
.to(device)
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_linear.py b/tests/ttnn/unit_tests/operations/test_moreh_linear.py
index 12109abba8e..2cdb0c3be32 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_linear.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_linear.py
@@ -5,8 +5,8 @@
import pytest
import torch
import ttnn
-from models.utility_functions import comp_allclose_and_pcc
-from tests.ttnn.unit_tests.operations.test_moreh_matmul import get_tensors
+from models.utility_functions import comp_allclose_and_pcc, skip_for_grayskull
+from tests.ttnn.unit_tests.operations.test_moreh_matmul import get_tensors, get_bias_tensors
from loguru import logger
from tests.ttnn.unit_tests.operations.test_utils import (
get_compute_kernel_options,
@@ -15,24 +15,6 @@
)
-# TODO: add this feature in get_tensors method
-def get_bias_tensors(bias_shape, require_bias_grad, device):
- npu_dtype = ttnn.bfloat16
- cpu_dtype = torch.bfloat16
- npu_layout = ttnn.TILE_LAYOUT
- cpu_layout = ttnn.ROW_MAJOR_LAYOUT
-
- bias = torch.randint(-10, 10, bias_shape, dtype=cpu_dtype)
- tt_bias = ttnn.Tensor(bias, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device)
-
- tt_bias_grad = None
- if require_bias_grad:
- bias_grad = torch.full(bias_shape, float("nan"), dtype=cpu_dtype)
- tt_bias_grad = ttnn.Tensor(bias_grad, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device)
-
- return tt_bias, bias, tt_bias_grad
-
-
def moreh_linear(shapes, has_bias, has_output, compute_kernel_config, device):
torch.manual_seed(3072)
input_shape, weight_shape, bias_shape, output_shape = shapes
@@ -267,3 +249,93 @@ def test_moreh_linear_backward_enable_cache(shapes, device, use_program_cache):
num_program_cache_entries_list.append(device.num_program_cache_entries())
assert passing
assert len(set(num_program_cache_entries_list)) == 1
+
+
+@skip_for_grayskull("GS does not support fp32")
+@pytest.mark.parametrize(
+ "shapes",
+ (
+ # input, weight, bias(1d or scalar), output
+ # GPT2-Small cases
+ ([8, 512, 768], [2304, 768], [1, 2304], [8, 512, 2304]),
+ ([8, 512, 768], [768, 768], [1, 768], [8, 512, 768]),
+ ([8, 512, 768], [3072, 768], [1, 3072], [8, 512, 3072]),
+ ),
+)
+def test_moreh_bias_backward_fp32(shapes, device):
+ torch.manual_seed(3072)
+ compute_kernel_fp32_config = get_compute_kernel_options(True)
+ compute_kernel_config = get_compute_kernel_options(False)
+ requires_input_grad, requires_weight_grad, requires_bias_grad = (True, False, True)
+ input_shape, weight_shape, bias_shape, output_shape = shapes
+ (
+ tt_input,
+ tt_weight,
+ _,
+ tt_output_grad,
+ tt_input_grad,
+ _,
+ torch_input,
+ torch_weight,
+ torch_output_grad,
+ ) = get_tensors(
+ input_shape, weight_shape, output_shape, requires_input_grad, requires_weight_grad, False, device, False
+ )
+ tt_bias, torch_bias, tt_bias_grad = get_bias_tensors(bias_shape, requires_bias_grad, device, False)
+ (_, _, _, _, tt_input_grad_fp32, _, _, _, _) = get_tensors(
+ input_shape, weight_shape, output_shape, requires_input_grad, requires_weight_grad, False, device, False
+ )
+ (_, _, tt_bias_grad_fp32) = get_bias_tensors(bias_shape, requires_bias_grad, device, False)
+ ## tt linear backward (fp32 mode)
+ tt_input_grad_fp32, _, tt_bias_grad_fp32 = ttnn.operations.moreh.linear_backward(
+ tt_output_grad,
+ tt_input,
+ tt_weight,
+ are_required_outputs=(requires_input_grad, requires_weight_grad, requires_bias_grad),
+ bias=tt_bias,
+ input_grad=tt_input_grad_fp32,
+ weight_grad=None,
+ bias_grad=tt_bias_grad_fp32,
+ compute_kernel_config=compute_kernel_fp32_config,
+ )
+ ## tt linear backward (bf16 mode)
+ tt_input_grad, _, tt_bias_grad = ttnn.operations.moreh.linear_backward(
+ tt_output_grad,
+ tt_input,
+ tt_weight,
+ are_required_outputs=(requires_input_grad, requires_weight_grad, requires_bias_grad),
+ bias=tt_bias,
+ input_grad=tt_input_grad,
+ weight_grad=None,
+ bias_grad=tt_bias_grad,
+ compute_kernel_config=compute_kernel_config,
+ )
+ torch_input_fp32 = torch_input.float()
+ torch_weight_fp32 = torch_weight.float()
+ torch_bias_fp32 = torch_bias.float()
+ ## reference
+ torch_output = torch.nn.functional.linear(
+ torch_input_fp32.requires_grad_(requires_input_grad),
+ torch_weight_fp32.requires_grad_(requires_weight_grad),
+ torch_bias_fp32.requires_grad_(requires_bias_grad),
+ )
+ torch_output.backward(torch_output_grad.float())
+ ## test for equivalence
+ rtol = atol = 0.1
+ tt_bias_grad_fp32_cpu = tt_bias_grad_fp32.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(bias_shape).to_torch()
+ tt_bias_grad_cpu = tt_bias_grad.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(bias_shape).to_torch()
+ passing, output_pcc = comp_allclose_and_pcc(
+ torch_bias_fp32.grad, tt_bias_grad_fp32_cpu, pcc=0.98, rtol=rtol, atol=atol
+ )
+ logger.debug(f"Out passing={passing}")
+ logger.debug(f"Output pcc={output_pcc}")
+ assert passing
+ diff_fp32 = torch.abs(torch_bias_fp32.grad - tt_bias_grad_fp32_cpu)
+ logger.debug(f"std={torch.std(diff_fp32)}")
+ logger.debug(f"mean={diff_fp32.mean()}")
+ logger.debug(f"topk(5) {torch.topk(diff_fp32.reshape(-1), 5)}")
+ diff = torch.abs(torch_bias_fp32.grad - tt_bias_grad_cpu)
+ logger.debug(f"std={torch.std(diff)}")
+ logger.debug(f"mean={diff.mean()}")
+ logger.debug(f"topk(5) {torch.topk(diff.reshape(-1), 5)}")
+ assert diff_fp32.mean() < diff.mean()
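
The new `test_moreh_bias_backward_fp32` runs the same backward pass twice, once with fp32 destination accumulation and once with bf16, and requires the fp32 path to track a float32 torch reference more closely. A condensed sketch of the acceptance criterion (tensor names from the diff; the bf16 diff is renamed here for clarity):

    # Mean absolute error of the fp32-accumulated bias gradient must beat
    # the bf16-accumulated one against the float32 reference.
    diff_fp32 = torch.abs(torch_bias_fp32.grad - tt_bias_grad_fp32_cpu)
    diff_bf16 = torch.abs(torch_bias_fp32.grad - tt_bias_grad_cpu)
    assert diff_fp32.mean() < diff_bf16.mean()
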
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py b/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py
index 10ed7d6adbb..0a09b77ba22 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_logsoftmax.py
@@ -11,24 +11,25 @@
import torch.nn.functional as F
from models.utility_functions import is_wormhole_b0
-from tests.tt_eager.python_api_testing.unit_testing.misc.test_utils import (
+from tests.ttnn.unit_tests.operations.test_utils import (
get_compute_kernel_options,
compute_kernel_options,
compute_kernel_ids,
+ to_npu,
)
@pytest.mark.parametrize(
"shape_dim",
(
- ((50, 32), 1), # single tile
- ((3, 32, 32 * 5), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
+ [[50, 32], 1], # single tile
+ [[3, 32, 32 * 5], 2], # multiple tiles along dim W
+ [[5, 6, 32, 32], 3], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 3], # multiple tiles per core
+ [[32, 32], 0], # single tile
+ [[3, 32 * 5, 32], 1], # multiple tiles along dim H
+ [[5, 6, 32, 32], 2], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 2], # multiple tiles per core
),
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
@@ -60,8 +61,8 @@ def test_logsoftmax_for_dim_hw(shape_dim, compute_kernel_options, device):
@pytest.mark.parametrize(
"shape_dim",
(
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
+ [[2, 3, 32 * 4, 32 * 5], 3],
+ [[2, 3, 32 * 4, 32 * 5], 2],
),
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
@@ -99,10 +100,10 @@ def test_logsoftmax_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options
@pytest.mark.parametrize(
"shape_dim",
(
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
+ [[1, 1, 10, 15], 3], # single tile
+ [[1, 1, 10, 32 * 2 + 10], 3], # multiple tiles along dim
+ [[1, 1, 15, 10], 2], # single tile
+ [[1, 1, 32 * 2 + 10, 32], 2], # multiple tiles along dim
),
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
@@ -133,12 +134,12 @@ def test_logsoftmax_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_opti
@pytest.mark.parametrize(
"shape_dim",
(
- ((1, 15, 32, 32), 1), # single tile c
- ((1, 15, 32 * 7, 32 * 5), 1), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
+ [[1, 15, 32, 32], 1], # single tile c
+ [[1, 15, 32 * 7, 32 * 5], 1], # multiple cores
+ [[109, 15, 32, 32], 1], # multiple tiles per core
+ [[15, 1, 32, 32], 0], # single tile n
+ [[15, 1, 32 * 7, 32 * 5], 0], # multiple cores
+ [[15, 109, 32 * 2, 32 * 2], 0], # multiple tiles per core
),
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
@@ -169,14 +170,14 @@ def test_logsoftmax_for_dim_nc(shape_dim, compute_kernel_options, device):
@pytest.mark.parametrize(
"shape_dim",
(
- ((32, 32), 1), # single tile
- ((3, 32, 32 * 2), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 5, 32), 2), # multiple tiles per core
+ [[32, 32], 1], # single tile
+ [[3, 32, 32 * 2], 2], # multiple tiles along dim W
+ [[5, 6, 32, 32], 3], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 3], # multiple tiles per core
+ [[32, 32], 0], # single tile
+ [[3, 32 * 5, 32], 1], # multiple tiles along dim H
+ [[5, 6, 32, 32], 2], # multiple cores
+ [[10, 20, 32 * 5, 32], 2], # multiple tiles per core
),
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
@@ -210,8 +211,8 @@ def test_logsoftmax_backward_for_dim_hw(shape_dim, compute_kernel_options, devic
@pytest.mark.parametrize(
"shape_dim",
(
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
+ [[2, 3, 32 * 4, 32 * 5], 3],
+ [[2, 3, 32 * 4, 32 * 5], 2],
),
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
@@ -252,10 +253,10 @@ def test_logsoftmax_backward_large_algorithm_for_dim_hw(shape_dim, compute_kerne
@pytest.mark.parametrize(
"shape_dim",
(
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
+ [[1, 1, 10, 15], 3], # single tile
+ [[1, 1, 10, 32 * 2 + 10], 3], # multiple tiles along dim
+ [[1, 1, 15, 10], 2], # single tile
+ [[1, 1, 32 * 2 + 10, 32], 2], # multiple tiles along dim
),
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
@@ -290,12 +291,12 @@ def test_logsoftmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, compute_ke
@pytest.mark.parametrize(
"shape_dim",
(
- ((1, 15, 32, 32), 1), # single tile c
- ((1, 15, 32 * 7, 32 * 5), 1), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
+ [[1, 15, 32, 32], 1], # single tile c
+ [[1, 15, 32 * 7, 32 * 5], 1], # multiple cores
+ [[109, 15, 32, 32], 1], # multiple tiles per core
+ [[15, 1, 32, 32], 0], # single tile n
+ [[15, 1, 32 * 7, 32 * 5], 0], # multiple cores
+ [[15, 109, 32 * 2, 32 * 2], 0], # multiple tiles per core
),
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
@@ -328,7 +329,9 @@ def test_logsoftmax_backward_for_dim_nc(shape_dim, compute_kernel_options, devic
@pytest.mark.parametrize(
"shape_dim",
- (((32, 32), 1),), # single tile
+ [
+ [[32, 32], 1],
+ ], # single tile
)
@pytest.mark.parametrize(
"optional_output_tensor",
@@ -365,7 +368,9 @@ def test_logsoftmax_optional_output_tensor(shape_dim, optional_output_tensor, de
@pytest.mark.parametrize(
"shape_dim",
- (((32, 32), 1),), # single tile
+ [
+ [[32, 32], 1],
+ ], # single tile
)
@pytest.mark.parametrize(
"optional_output_tensor",
@@ -400,3 +405,96 @@ def test_logsoftmax_backward_optional_output_tensor(shape_dim, optional_output_t
passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
logger.info(out)
assert passing
+
+
+@pytest.mark.parametrize(
+ "shape_dim_strategy",
+ (
+ [[50, 32], 1, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.SMALL_W],
+ [[32, 32], 0, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.SMALL_H],
+ [[2, 3, 32 * 4, 32 * 5], 3, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_W],
+ [[2, 3, 32 * 4, 32 * 5], 2, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_H],
+ [[1, 15, 32, 32], 1, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_C],
+ ),
+)
+def test_logsoftmax_callback(shape_dim_strategy, device, use_program_cache):
+ shape, dim, strategy = shape_dim_strategy
+ torch.manual_seed(0)
+
+ for i in range(2):
+ x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16) + 100 + i
+ tt_cpu = F.log_softmax(x, dim)
+ dev_x = ttnn.Tensor(x, ttnn.bfloat16).pad_to_tile(float("nan")).to(ttnn.TILE_LAYOUT).to(device)
+ tt_npu = ttnn.operations.moreh.logsoftmax(dev_x, dim, strategy=strategy)
+ if i == 0:
+ num_program_cache_entries = device.num_program_cache_entries()
+ assert num_program_cache_entries > 0
+ else:
+ assert device.num_program_cache_entries() == num_program_cache_entries
+ torch_dummy = torch.randn([32, 32])
+ tt_dummy = to_npu(torch_dummy, device)
+
+ tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).unpad_from_tile(shape)
+ assert list(tt_dev.shape.with_tile_padding()) == list(tt_cpu.shape)
+ tt_dev = tt_dev.to_torch().to(torch.bfloat16)
+
+ rtol = atol = 0.1
+ passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
+ logger.debug(out)
+ assert passing
+
+
+def logsoftmax_backward_step(dev_y, dev_dy, dim, strategy, device, num_program_cache_entries=None):
+ """
+ Runs a single step of logsoftmax_backward and checks the program cache if needed.
+ """
+ tt_npu = ttnn.operations.moreh.logsoftmax_backward(dev_y, dev_dy, dim, strategy=strategy)
+
+ if num_program_cache_entries is not None:
+ assert device.num_program_cache_entries() == num_program_cache_entries
+ else:
+ num_program_cache_entries = device.num_program_cache_entries()
+ assert num_program_cache_entries > 0
+
+ torch_dummy = torch.randn([32, 32])
+ tt_dummy = to_npu(torch_dummy, device)
+
+ return tt_npu, num_program_cache_entries
+
+
+@pytest.mark.parametrize(
+ "shape_dim_strategy",
+ (
+ [[32, 32], 1, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.SMALL_W],
+ [[32, 32], 0, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.SMALL_H],
+ [[2, 3, 32 * 4, 32 * 5], 3, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_W],
+ [[2, 3, 32 * 4, 32 * 5], 2, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_H],
+ [[1, 15, 32, 32], 1, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_C],
+ ),
+)
+def test_logsoftmax_backward_callback(shape_dim_strategy, device, use_program_cache):
+ shape, dim, strategy = shape_dim_strategy
+ torch.manual_seed(0)
+
+ num_program_cache_entries = None
+ for i in range(2):
+ x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
+ y = F.log_softmax(x, dim)
+ dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
+
+ dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
+ dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
+
+ y.backward(dy)
+
+ tt_npu, num_program_cache_entries = logsoftmax_backward_step(
+ dev_y, dev_dy, dim, strategy, device, num_program_cache_entries
+ )
+
+ assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
+ tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
+
+ rtol = atol = 0.5
+ passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
+ logger.debug(out)
+ assert passing
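
The callback tests above share one pattern: run the op twice under `use_program_cache`, record `device.num_program_cache_entries()` on the first pass, and assert the count is unchanged on the second (i.e., the cached program was reused). The dummy tensor allocated between passes is presumably there to shift buffer addresses so the cache hit is exercised with fresh runtime arguments. A hypothetical helper distilling the check:

    def assert_program_cache_hit(device, prev_entries=None):
        """First call records the entry count; later calls assert that no new
        programs were compiled. Returns the count for the next call."""
        entries = device.num_program_cache_entries()
        if prev_entries is None:
            assert entries > 0
        else:
            assert entries == prev_entries
        return entries
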
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_matmul.py b/tests/ttnn/unit_tests/operations/test_moreh_matmul.py
index 3dcff16cfd4..34350dfcabd 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_matmul.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_matmul.py
@@ -48,7 +48,11 @@ def get_tensors(
# tensors for backward
output_grad = tt_output_grad = torch_output_grad = tt_input_grad = tt_other_grad = None
if require_input_grad or require_other_grad:
- output_grad = torch.randint(-2, 3, output_shape, dtype=cpu_dtype)
+ output_grad = (
+ torch.randint(-2, 3, output_shape, dtype=cpu_dtype)
+ if use_randint
+ else torch.rand(output_shape, dtype=cpu_dtype)
+ )
tt_output_grad = ttnn.Tensor(output_grad, npu_dtype).pad_to_tile(float(-1)).to(npu_layout).to(device)
torch_output_grad = output_grad[0][0][0][0] if is_1d else output_grad
@@ -81,6 +85,24 @@ def get_tensors(
)
+def get_bias_tensors(bias_shape, require_bias_grad, device, use_int=True):
+ npu_dtype = ttnn.bfloat16
+ cpu_dtype = torch.bfloat16
+ npu_layout = ttnn.TILE_LAYOUT
+ cpu_layout = ttnn.ROW_MAJOR_LAYOUT
+ bias = (
+ torch.randint(-10, 10, bias_shape, dtype=cpu_dtype)
+ if use_int
+ else torch.rand(bias_shape, dtype=cpu_dtype) * 10 - 5
+ )
+ tt_bias = ttnn.Tensor(bias, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device)
+ tt_bias_grad = None
+ if require_bias_grad:
+ bias_grad = torch.full(bias_shape, float("nan"), dtype=cpu_dtype)
+ tt_bias_grad = ttnn.Tensor(bias_grad, npu_dtype).pad_to_tile(float("nan")).to(npu_layout).to(device)
+ return tt_bias, bias, tt_bias_grad
+
+
def moreh_matmul(params, has_output, compute_kernel_config, device):
torch.manual_seed(3072)
input_shape, other_shape, output_shape, transpose_input, transpose_other = params
@@ -200,7 +222,7 @@ def test_moreh_matmul_enable_cache(params, device, use_program_cache):
assert device.num_program_cache_entries() == 2
-@skip_for_grayskull("Doesn't seem to work properly on Grayskull devices. Wormhole_b0 devices work fine.")
+@skip_for_grayskull("GS does not support fp32")
@pytest.mark.parametrize(
"params",
(
@@ -306,3 +328,215 @@ def test_moreh_matmul_1d(input_shape, device):
logger.debug(f"Output pcc={output_pcc}")
assert passing
+
+
+@pytest.mark.parametrize(
+ "params",
+ (
+ # input, other, output shape
+ ([3, 128, 96], [3, 4, 1, 96, 256], [3, 4, 3, 128, 256]),
+ ([3, 3, 313, 511], [3, 3, 511, 765], [3, 3, 313, 765]),
+ ([3, 1, 2, 1, 4, 1, 319, 95], [4, 2, 95, 470], [3, 1, 2, 1, 4, 2, 319, 470]),
+ ([3, 2, 1, 470, 95], [2, 1, 3, 1, 2, 2, 95, 319], [2, 1, 3, 3, 2, 2, 470, 319]),
+ ),
+)
+@pytest.mark.parametrize(
+ "requires_grad",
+ (
+ (True, False),
+ (False, True),
+ (True, True),
+ ),
+)
+def test_moreh_matmul_backward(params, requires_grad, device):
+ torch.manual_seed(3072)
+ input_shape, other_shape, output_shape = params
+ require_input_grad, require_other_grad = requires_grad
+
+ # get tensors
+ (
+ tt_input,
+ tt_other,
+ _,
+ tt_output_grad,
+ tt_input_grad,
+ tt_other_grad,
+ torch_input,
+ torch_other,
+ torch_output_grad,
+ ) = get_tensors(input_shape, other_shape, output_shape, require_input_grad, require_other_grad, False, device)
+
+ # torch matmul
+ torch_out = torch.matmul(
+ torch_input.requires_grad_(require_input_grad), torch_other.requires_grad_(require_other_grad)
+ )
+ torch_out.backward(torch_output_grad)
+
+ # tt matmul backward
+ tt_input_grad, tt_other_grad = ttnn.operations.moreh.matmul_backward(
+ tt_output_grad,
+ tt_input,
+ tt_other,
+ are_required_outputs=(require_input_grad, require_other_grad),
+ input_a_grad=tt_input_grad,
+ input_b_grad=tt_other_grad,
+ )
+
+ # test for equivalence
+ rtol = atol = 0.1
+ cpu_layout = ttnn.ROW_MAJOR_LAYOUT
+ if require_input_grad:
+ ttcpu_input_grad = tt_input_grad.cpu().to(cpu_layout).unpad_from_tile(input_shape).to_torch()
+ passing, output_pcc = comp_allclose_and_pcc(torch_input.grad, ttcpu_input_grad, pcc=0.999, rtol=rtol, atol=atol)
+ logger.debug(f"input_grad passing={passing}")
+ logger.debug(f"input_grad pcc={output_pcc}")
+ assert passing
+ else:
+ assert tt_input_grad is None
+
+ if require_other_grad:
+ ttcpu_other_grad = tt_other_grad.cpu().to(cpu_layout).unpad_from_tile(other_shape).to_torch()
+ passing, output_pcc = comp_allclose_and_pcc(torch_other.grad, ttcpu_other_grad, pcc=0.999, rtol=rtol, atol=atol)
+ logger.debug(f"other_grad passing={passing}")
+ logger.debug(f"other_grad pcc={output_pcc}")
+ assert passing
+ else:
+ assert tt_other_grad is None
+
+
+@pytest.mark.parametrize(
+ "input_shape",
+ (
+ [1, 1, 1, 10], # test not a multiple of 32 case
+ [1, 1, 1, 32], # test single tile
+ [1, 1, 1, 352], # test multiple tiles
+ [1, 1, 1, 323], # test multiple tiles, not a multiple of 32
+ ),
+)
+@pytest.mark.parametrize(
+ "requires_grad",
+ (
+ (True, False),
+ (False, True),
+ (True, True),
+ ),
+)
+def test_moreh_matmul_1d_backward(input_shape, requires_grad, device):
+ torch.manual_seed(3072)
+ require_input_grad, require_other_grad = requires_grad
+ output_shape = [1, 1, 1, 1]
+ # get tensors
+ (
+ tt_input,
+ tt_other,
+ _,
+ tt_output_grad,
+ tt_input_grad,
+ tt_other_grad,
+ torch_input,
+ torch_other,
+ torch_output_grad,
+ ) = get_tensors(input_shape, input_shape, output_shape, require_input_grad, require_other_grad, True, device)
+
+ # torch matmul
+ torch_out = torch.matmul(
+ torch_input.requires_grad_(require_input_grad), torch_other.requires_grad_(require_other_grad)
+ )
+ torch_out.backward(torch_output_grad)
+
+ # tt matmul backward
+ for _ in range(2):
+ ttnn.operations.moreh.matmul_backward(
+ tt_output_grad,
+ tt_input,
+ tt_other,
+ are_required_outputs=(require_input_grad, require_other_grad),
+ input_a_grad=tt_input_grad,
+ input_b_grad=tt_other_grad,
+ )
+
+ # test for equivalence
+ rtol = atol = 0.1
+ cpu_layout = ttnn.ROW_MAJOR_LAYOUT
+ if require_input_grad:
+ ttcpu_input_grad = tt_input_grad.cpu().to(cpu_layout).unpad_from_tile(input_shape).to_torch()
+
+ passing, output_pcc = comp_allclose_and_pcc(
+ torch_input.grad, ttcpu_input_grad.reshape(-1), pcc=0.999, rtol=rtol, atol=atol
+ )
+ logger.debug(f"input_grad passing={passing}")
+ logger.debug(f"input_grad pcc={output_pcc}")
+ assert passing
+
+ if require_other_grad:
+ ttcpu_other_grad = tt_other_grad.cpu().to(cpu_layout).unpad_from_tile(input_shape).to_torch()
+
+ passing, output_pcc = comp_allclose_and_pcc(
+ torch_other.grad, ttcpu_other_grad.reshape(-1), pcc=0.999, rtol=rtol, atol=atol
+ )
+ logger.debug(f"other_grad passing={passing}")
+ logger.debug(f"other_grad pcc={output_pcc}")
+ assert passing
+
+
+@skip_for_grayskull("GS does not support fp32")
+@pytest.mark.parametrize(
+ "params",
+ (
+ # input, other, output shape, transpose input, other
+ ([31, 3100], [3100, 31], [31, 31], False, False),
+ ),
+)
+def test_moreh_matmul_with_bias_add_fp32_dest_acc(params, device):
+ torch.manual_seed(3072)
+ input_shape, other_shape, output_shape, transpose_input, transpose_other = params
+ tt_input, tt_other, tt_output_fp32, _, _, _, torch_input, torch_other, _ = get_tensors(
+ input_shape, other_shape, output_shape, False, False, False, device, use_randint=False
+ )
+ tt_bias, torch_bias, _ = get_bias_tensors([1, 31], False, device, False)
+ compute_kernel_config_fp32_dest_acc = get_compute_kernel_options(True)
+ compute_kernel_config_bf16_dest_acc = get_compute_kernel_options(False)
+ torch_input = torch_input.transpose(-1, -2) if transpose_input else torch_input
+ torch_other = torch_other.transpose(-1, -2) if transpose_other else torch_other
+ # tt matmul
+ tt_output_fp32 = ttnn.operations.moreh.matmul(
+ tt_input,
+ tt_other,
+ transpose_input=transpose_input,
+ transpose_other=transpose_other,
+ output=tt_output_fp32,
+ bias=tt_bias,
+ compute_kernel_config=compute_kernel_config_fp32_dest_acc,
+ )
+ tt_output_fp16 = ttnn.operations.moreh.matmul(
+ tt_input,
+ tt_other,
+ transpose_input=transpose_input,
+ transpose_other=transpose_other,
+ bias=tt_bias,
+ compute_kernel_config=compute_kernel_config_bf16_dest_acc,
+ )
+ cpu_layout = ttnn.ROW_MAJOR_LAYOUT
+ tt_output_cpu_fp32 = tt_output_fp32.cpu().to(cpu_layout).unpad_from_tile(output_shape).to_torch()
+ tt_output_cpu_bf16 = tt_output_fp16.cpu().to(cpu_layout).unpad_from_tile(output_shape).to_torch()
+ # torch matmul (float)
+ torch_out = torch.matmul(torch_input.float(), torch_other.float()) + torch_bias
+ # test for equivalence
+ rtol = atol = 0.1
+ passing, output_pcc = comp_allclose_and_pcc(torch_out, tt_output_cpu_fp32, pcc=0.99, rtol=rtol, atol=atol)
+ logger.debug(f"Out passing={passing}")
+ logger.debug(f"Output pcc={output_pcc}")
+ diff = torch.abs(torch_out - tt_output_cpu_fp32)
+ logger.debug(f"std={torch.std(diff)}")
+ logger.debug(f"mean={diff.mean()}")
+ logger.debug(f"topk(5) {torch.topk(diff.reshape(-1), 5)}")
+ assert passing
+ torch_out = torch.matmul(torch_input.bfloat16(), torch_other.bfloat16())
+ passing, output_pcc = comp_allclose_and_pcc(torch_out, tt_output_cpu_bf16, pcc=0.99, rtol=rtol, atol=atol)
+ logger.debug(f"Out passing={passing}")
+ logger.debug(f"Output pcc={output_pcc}")
+ diff_fp16 = torch.abs(torch_out - tt_output_cpu_bf16)
+ logger.debug(f"std={torch.std(diff_fp16)}")
+ logger.debug(f"mean={diff_fp16.mean()}")
+ logger.debug(f"topk(5) {torch.topk(diff_fp16.reshape(-1), 5)}")
+ assert diff.mean() < diff_fp16.mean()
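
`get_bias_tensors` now lives in test_moreh_matmul.py so the linear tests can share it, and grows a `use_int` switch: integer-valued biases by default, or uniform floats in [-5, 5) for the fp32-accuracy tests. Usage as in the hunk above:

    # use_int=False draws bias values from torch.rand(...) * 10 - 5.
    tt_bias, torch_bias, _ = get_bias_tensors([1, 31], False, device, False)
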
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_softmax.py b/tests/ttnn/unit_tests/operations/test_moreh_softmax.py
index bd286abe891..58ad9c82e68 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_softmax.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_softmax.py
@@ -10,30 +10,29 @@
from loguru import logger
from models.utility_functions import is_wormhole_b0
-from tests.tt_eager.python_api_testing.unit_testing.misc.test_utils import (
+from tests.ttnn.unit_tests.operations.test_utils import (
get_compute_kernel_options,
compute_kernel_options,
compute_kernel_ids,
+ to_npu,
)
@pytest.mark.parametrize(
"shape_dim",
- (
- ((32, 32), 1), # single tile
- ((3, 32, 32 * 5), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
- ),
+ [
+ [[32, 32], 1], # single tile
+ [[3, 32, 32 * 5], 2], # multiple tiles along dim W
+ [[5, 6, 32, 32], 3], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 3], # multiple tiles per core
+ [[32, 32], 0], # single tile
+ [[3, 32 * 5, 32], 1], # multiple tiles along dim H
+ [[5, 6, 32, 32], 2], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 2], # multiple tiles per core
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmax_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
-
shape, dim = shape_dim
torch.manual_seed(0)
@@ -57,15 +56,13 @@ def test_softmax_for_dim_hw(shape_dim, compute_kernel_options, device):
@pytest.mark.parametrize(
"shape_dim",
- (
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
- ),
+ [
+ [[2, 3, 32 * 4, 32 * 5], 3],
+ [[2, 3, 32 * 4, 32 * 5], 2],
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmax_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
-
shape, dim = shape_dim
torch.manual_seed(0)
@@ -95,16 +92,15 @@ def test_softmax_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, d
@pytest.mark.parametrize(
"shape_dim",
- (
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
- ),
+ [
+ [[1, 1, 10, 15], 3], # single tile
+ [[1, 1, 10, 32 * 2 + 10], 3], # multiple tiles along dim
+ [[1, 1, 15, 10], 2], # single tile
+ [[1, 1, 32 * 2 + 10, 32], 2], # multiple tiles along dim
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmax_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -129,18 +125,17 @@ def test_softmax_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options
@pytest.mark.parametrize(
"shape_dim",
- (
- ((1, 15, 32, 32), 1), # single tile c
- ((1, 15, 32 * 7, 32 * 5), 1), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
- ),
+ [
+ [[1, 15, 32, 32], 1], # single tile c
+ [[1, 15, 32 * 7, 32 * 5], 1], # multiple cores
+ [[109, 15, 32, 32], 1], # multiple tiles per core
+ [[15, 1, 32, 32], 0], # single tile n
+ [[15, 1, 32 * 7, 32 * 5], 0], # multiple cores
+ [[15, 109, 32 * 2, 32 * 2], 0], # multiple tiles per core
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmax_for_dim_nc(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -165,20 +160,19 @@ def test_softmax_for_dim_nc(shape_dim, compute_kernel_options, device):
@pytest.mark.parametrize(
"shape_dim",
- (
- ((32, 32), 1), # single tile
- ((3, 32, 32 * 5), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
- ),
+ [
+ [[32, 32], 1], # single tile
+ [[3, 32, 32 * 5], 2], # multiple tiles along dim W
+ [[5, 6, 32, 32], 3], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 3], # multiple tiles per core
+ [[32, 32], 0], # single tile
+ [[3, 32 * 5, 32], 1], # multiple tiles along dim H
+ [[5, 6, 32, 32], 2], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 2], # multiple tiles per core
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmax_backward_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -206,14 +200,13 @@ def test_softmax_backward_for_dim_hw(shape_dim, compute_kernel_options, device):
@pytest.mark.parametrize(
"shape_dim",
- (
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
- ),
+ [
+ [[2, 3, 32 * 4, 32 * 5], 3],
+ [[2, 3, 32 * 4, 32 * 5], 2],
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmax_backward_large_algorithmfor_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -249,16 +242,15 @@ def test_softmax_backward_large_algorithmfor_dim_hw(shape_dim, compute_kernel_op
@pytest.mark.parametrize(
"shape_dim",
- (
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
- ),
+ [
+ [[1, 1, 10, 15], 3], # single tile
+ [[1, 1, 10, 32 * 2 + 10], 3], # multiple tiles along dim
+ [[1, 1, 15, 10], 2], # single tile
+ [[1, 1, 32 * 2 + 10, 32], 2], # multiple tiles along dim
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -287,18 +279,17 @@ def test_softmax_backward_not_multiple_of_32_for_dim_hw(shape_dim, compute_kerne
@pytest.mark.parametrize(
"shape_dim",
- (
- ((15, 32, 32), 0), # single tile c
- ((15, 32 * 7, 32 * 5), 0), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
- ),
+ [
+ [[15, 32, 32], 0], # single tile c
+ [[15, 32 * 7, 32 * 5], 0], # multiple cores
+ [[109, 15, 32, 32], 1], # multiple tiles per core
+ [[15, 1, 32, 32], 0], # single tile n
+ [[15, 1, 32 * 7, 32 * 5], 0], # multiple cores
+ [[15, 109, 32 * 2, 32 * 2], 0], # multiple tiles per core
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmax_backward_for_dim_nc(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -326,28 +317,33 @@ def test_softmax_backward_for_dim_nc(shape_dim, compute_kernel_options, device):
@pytest.mark.parametrize(
"shape_dim_strategy",
- (
- ((32, 32), 1, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.SMALL_W),
- ((32, 32), 0, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.SMALL_H),
- ((32, 32), 1, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_W),
- ((32, 32), 0, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_H),
- ((1, 1, 32, 32), 1, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_C),
- ((1, 1, 32, 32), 0, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_C),
- ),
+ [
+ [[32, 32], 1, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.SMALL_W],
+ [[32, 32], 0, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.SMALL_H],
+ [[32, 32], 1, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_W],
+ [[32, 32], 0, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_H],
+ [[1, 1, 32, 32], 1, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_C],
+ [[1, 1, 32, 32], 0, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_C],
+ ],
)
-def test_softmax_callback(shape_dim_strategy, device):
- device.enable_program_cache()
-
+def test_softmax_callback(shape_dim_strategy, device, use_program_cache):
shape, dim, strategy = shape_dim_strategy
torch.manual_seed(0)
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
+ for i in range(2):
+ x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
+ dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- tt_cpu = torch.softmax(x, dim)
- for i in range(2):
+ tt_cpu = torch.softmax(x, dim)
tt_npu = ttnn.operations.moreh.softmax(dev_x, dim, strategy=strategy)
+ if i == 0:
+ num_program_cache_entries = device.num_program_cache_entries()
+ assert num_program_cache_entries > 0
+ else:
+ assert device.num_program_cache_entries() == num_program_cache_entries
+ torch_dummy = torch.randn([32, 32])
+ tt_dummy = to_npu(torch_dummy, device)
assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
@@ -360,31 +356,36 @@ def test_softmax_callback(shape_dim_strategy, device):
@pytest.mark.parametrize(
"shape_dim_strategy",
- (
- ((32, 32), 1, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.SMALL_W),
- ((32, 32), 0, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.SMALL_H),
- ((32, 32), 1, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_W),
- ((32, 32), 0, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_H),
- ((1, 1, 32, 32), 1, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_C),
- ((1, 1, 32, 32), 0, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_C),
- ),
+ [
+ [[32, 32], 1, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.SMALL_W],
+ [[32, 32], 0, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.SMALL_H],
+ [[32, 32], 1, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_W],
+ [[32, 32], 0, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_H],
+ [[1, 1, 32, 32], 1, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_C],
+ ],
)
-def test_softmax_backward_callback(shape_dim_strategy, device):
- device.enable_program_cache()
+def test_softmax_backward_callback(shape_dim_strategy, device, use_program_cache):
shape, dim, strategy = shape_dim_strategy
torch.manual_seed(0)
- x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
+ for i in range(2):
+ x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
- y = torch.softmax(x, dim)
- dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
+ y = torch.softmax(x, dim)
+ dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
- dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
+ dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
+ dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
- y.backward(dy)
- for i in range(2):
+ y.backward(dy)
tt_npu = ttnn.operations.moreh.softmax_backward(dev_y, dev_dy, dim, strategy=strategy)
+ if i == 0:
+ num_program_cache_entries = device.num_program_cache_entries()
+ assert num_program_cache_entries > 0
+ else:
+ assert device.num_program_cache_entries() == num_program_cache_entries
+ torch_dummy = torch.randn([32, 32])
+ tt_dummy = to_npu(torch_dummy, device)
assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
@@ -397,15 +398,15 @@ def test_softmax_backward_callback(shape_dim_strategy, device):
@pytest.mark.parametrize(
"shape_dim",
- (((32, 32), 1),), # single tile
+ [
+ [[32, 32], 1],
+ ], # single tile
)
@pytest.mark.parametrize(
"optional_output_tensor",
- (True, False),
+ [True, False],
)
def test_softmax_optional_output_tensor(shape_dim, optional_output_tensor, device):
- device.enable_program_cache()
-
shape, dim = shape_dim
torch.manual_seed(0)
@@ -434,14 +435,15 @@ def test_softmax_optional_output_tensor(shape_dim, optional_output_tensor, devic
@pytest.mark.parametrize(
"shape_dim",
- (((32, 32), 1),), # single tile
+ [
+ [[32, 32], 1],
+ ], # single tile
)
@pytest.mark.parametrize(
"optional_output_tensor",
- (True, False),
+ [True, False],
)
def test_softmax_backward_optional_output_tensor(shape_dim, optional_output_tensor, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
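
Throughout this file the manual `device.enable_program_cache()` calls are dropped in favor of the `use_program_cache` pytest fixture, which presumably manages cache enablement for the test's lifetime instead of mutating device state inside the test body; the tuple-of-tuples parametrize payloads also become lists, a purely presentational change. The fixture is consumed by name only:

    def test_softmax_callback(shape_dim_strategy, device, use_program_cache):
        ...  # requesting the fixture enables the program cache for this test
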
diff --git a/tests/ttnn/unit_tests/operations/test_moreh_softmin.py b/tests/ttnn/unit_tests/operations/test_moreh_softmin.py
index fb00e9d34c1..4e079c0c41c 100644
--- a/tests/ttnn/unit_tests/operations/test_moreh_softmin.py
+++ b/tests/ttnn/unit_tests/operations/test_moreh_softmin.py
@@ -11,30 +11,29 @@
import torch.nn.functional as F
from models.utility_functions import is_wormhole_b0
-from tests.tt_eager.python_api_testing.unit_testing.misc.test_utils import (
+from tests.ttnn.unit_tests.operations.test_utils import (
get_compute_kernel_options,
compute_kernel_options,
compute_kernel_ids,
+ to_npu,
)
@pytest.mark.parametrize(
"shape_dim",
- (
- ((32, 32), 1), # single tile
- ((3, 32, 32 * 5), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
- ),
+ [
+ [[32, 32], 1], # single tile
+ [[3, 32, 32 * 5], 2], # multiple tiles along dim W
+ [[5, 6, 32, 32], 3], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 3], # multiple tiles per core
+ [[32, 32], 0], # single tile
+ [[3, 32 * 5, 32], 1], # multiple tiles along dim H
+ [[5, 6, 32, 32], 2], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 2], # multiple tiles per core
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmin_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
-
shape, dim = shape_dim
torch.manual_seed(0)
@@ -58,15 +57,13 @@ def test_softmin_for_dim_hw(shape_dim, compute_kernel_options, device):
@pytest.mark.parametrize(
"shape_dim",
- (
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
- ),
+ [
+ [[2, 3, 32 * 4, 32 * 5], 3],
+ [[2, 3, 32 * 4, 32 * 5], 2],
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmin_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
-
shape, dim = shape_dim
torch.manual_seed(0)
@@ -95,16 +92,15 @@ def test_softmin_large_algorithm_for_dim_hw(shape_dim, compute_kernel_options, d
@pytest.mark.parametrize(
"shape_dim",
- (
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
- ),
+ [
+ [[1, 1, 10, 15], 3], # single tile
+ [[1, 1, 10, 32 * 2 + 10], 3], # mutiple tile with dim
+ [[1, 1, 15, 10], 2], # single tile
+ [[1, 1, 32 * 2 + 10, 32], 2], # mutiple tile with dim
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmin_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -129,18 +125,17 @@ def test_softmin_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options
@pytest.mark.parametrize(
"shape_dim",
- (
- ((1, 15, 32, 32), 1), # single tile c
- ((1, 15, 32 * 7, 32 * 5), 1), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
- ),
+ [
+ [[1, 15, 32, 32], 1], # single tile c
+ [[1, 15, 32 * 7, 32 * 5], 1], # mutiple cores
+ [[109, 15, 32, 32], 1], # mutiple tiles per cores
+ [[15, 1, 32, 32], 0], # single tile n
+ [[15, 1, 32 * 7, 32 * 5], 0], # mutiple cores
+ [[15, 109, 32 * 2, 32 * 2], 0], # mutiple tiles per cores
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmin_for_dim_nc(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -165,20 +160,19 @@ def test_softmin_for_dim_nc(shape_dim, compute_kernel_options, device):
@pytest.mark.parametrize(
"shape_dim",
- (
- ((32, 32), 1), # single tile
- ((3, 32, 32 * 5), 2), # mutiple tile with dim W
- ((5, 6, 32, 32), 3), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 3), # multiple tiles per core
- ((32, 32), 0), # single tile
- ((3, 32 * 5, 32), 1), # mutiple tile with dim H
- ((5, 6, 32, 32), 2), # multiple cores
- ((10, 20, 32 * 3, 32 * 5), 2), # multiple tiles per core
- ),
+ [
+ [[32, 32], 1], # single tile
+ [[3, 32, 32 * 5], 2], # multiple tiles along dim W
+ [[5, 6, 32, 32], 3], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 3], # multiple tiles per core
+ [[32, 32], 0], # single tile
+ [[3, 32 * 5, 32], 1], # multiple tiles along dim H
+ [[5, 6, 32, 32], 2], # multiple cores
+ [[10, 20, 32 * 3, 32 * 5], 2], # multiple tiles per core
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmin_backward_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -206,14 +200,13 @@ def test_softmin_backward_for_dim_hw(shape_dim, compute_kernel_options, device):
@pytest.mark.parametrize(
"shape_dim",
- (
- ((2, 3, 32 * 4, 32 * 5), 3),
- ((2, 3, 32 * 4, 32 * 5), 2),
- ),
+ [
+ [[2, 3, 32 * 4, 32 * 5], 3],
+ [[2, 3, 32 * 4, 32 * 5], 2],
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmin_backward_large_algorithmfor_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -248,16 +241,15 @@ def test_softmin_backward_large_algorithmfor_dim_hw(shape_dim, compute_kernel_op
@pytest.mark.parametrize(
"shape_dim",
- (
- ((1, 1, 10, 15), 3), # single tile
- ((1, 1, 10, 32 * 2 + 10), 3), # mutiple tile with dim
- ((1, 1, 15, 10), 2), # single tile
- ((1, 1, 32 * 2 + 10, 32), 2), # mutiple tile with dim
- ),
+ [
+ [[1, 1, 10, 15], 3], # single tile
+ [[1, 1, 10, 32 * 2 + 10], 3], # multiple tiles along dim
+ [[1, 1, 15, 10], 2], # single tile
+ [[1, 1, 32 * 2 + 10, 32], 2], # multiple tiles along dim
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmin_backward_not_multiple_of_32_for_dim_hw(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -286,18 +278,17 @@ def test_softmin_backward_not_multiple_of_32_for_dim_hw(shape_dim, compute_kerne
@pytest.mark.parametrize(
"shape_dim",
- (
- ((1, 15, 32, 32), 1), # single tile c
- ((1, 15, 32 * 7, 32 * 5), 1), # mutiple cores
- ((109, 15, 32, 32), 1), # mutiple tiles per cores
- ((15, 1, 32, 32), 0), # single tile n
- ((15, 1, 32 * 7, 32 * 5), 0), # mutiple cores
- ((15, 109, 32 * 2, 32 * 2), 0), # mutiple tiles per cores
- ),
+ [
+ [[1, 15, 32, 32], 1], # single tile c
+ [[1, 15, 32 * 7, 32 * 5], 1], # mutiple cores
+ [[109, 15, 32, 32], 1], # mutiple tiles per cores
+ [[15, 1, 32, 32], 0], # single tile n
+ [[15, 1, 32 * 7, 32 * 5], 0], # mutiple cores
+ [[15, 109, 32 * 2, 32 * 2], 0], # mutiple tiles per cores
+ ],
)
@pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids)
def test_softmin_backward_for_dim_nc(shape_dim, compute_kernel_options, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -325,15 +316,15 @@ def test_softmin_backward_for_dim_nc(shape_dim, compute_kernel_options, device):
@pytest.mark.parametrize(
"shape_dim",
- (((32, 32), 1),), # single tile
+ [
+ [[32, 32], 1],
+ ], # single tile
)
@pytest.mark.parametrize(
"optional_output_tensor",
- (True, False),
+ [True, False],
)
def test_softmin_optional_output_tensor(shape_dim, optional_output_tensor, device):
- device.enable_program_cache()
-
shape, dim = shape_dim
torch.manual_seed(0)
@@ -362,14 +353,15 @@ def test_softmin_optional_output_tensor(shape_dim, optional_output_tensor, devic
@pytest.mark.parametrize(
"shape_dim",
- (((32, 32), 1),), # single tile
+ [
+ [[32, 32], 1],
+ ], # single tile
)
@pytest.mark.parametrize(
"optional_output_tensor",
- (True, False),
+ [True, False],
)
def test_softmin_backward_optional_output_tensor(shape_dim, optional_output_tensor, device):
- device.enable_program_cache()
shape, dim = shape_dim
torch.manual_seed(0)
@@ -397,3 +389,96 @@ def test_softmin_backward_optional_output_tensor(shape_dim, optional_output_tens
passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
logger.info(out)
assert passing
+
+
+@pytest.mark.parametrize(
+ "shape_dim_strategy",
+ [
+ [[32, 32], 1, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.SMALL_W],
+ [[32, 32], 0, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.SMALL_H],
+ [[2, 3, 32 * 4, 32 * 5], 3, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_W],
+ [[2, 3, 32 * 4, 32 * 5], 2, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_H],
+ [[1, 15, 32, 32], 1, ttnn.operations.moreh.SoftmaxOpParallelizationStrategy.LARGE_C],
+ ],
+)
+def test_softmin_callback(shape_dim_strategy, device, use_program_cache):
+ shape, dim, strategy = shape_dim_strategy
+ torch.manual_seed(0)
+
+ for i in range(2):
+ x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16) + i
+
+ dev_x = ttnn.Tensor(x, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
+
+ tt_cpu = F.softmin(x, dim)
+ tt_npu = ttnn.operations.moreh.softmin(dev_x, dim, strategy=strategy)
+ if i == 0:
+ num_program_cache_entries = device.num_program_cache_entries()
+ assert num_program_cache_entries > 0
+ else:
+ assert device.num_program_cache_entries() == num_program_cache_entries
+ torch_dummy = torch.randn([32, 32])
+ tt_dummy = to_npu(torch_dummy, device)
+
+ assert list(tt_npu.shape.with_tile_padding()) == list(tt_cpu.shape)
+ tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
+
+ rtol = atol = 0.05
+ passing, out = comp_allclose_and_pcc(tt_cpu, tt_dev, rtol=rtol, atol=atol)
+ logger.debug(out)
+ assert passing
+
+
+def softmin_backward_step(dev_y, dev_dy, dim, strategy, device, num_program_cache_entries=None):
+ """
+ Runs a single step of softmin_backward and checks the program cache if needed.
+ """
+ tt_npu = ttnn.operations.moreh.softmin_backward(dev_y, dev_dy, dim, strategy=strategy)
+
+ if num_program_cache_entries is not None:
+ assert device.num_program_cache_entries() == num_program_cache_entries
+ else:
+ num_program_cache_entries = device.num_program_cache_entries()
+ assert num_program_cache_entries > 0
+
+ torch_dummy = torch.randn([32, 32])
+ tt_dummy = to_npu(torch_dummy, device)
+
+ return tt_npu, num_program_cache_entries
+
+
+@pytest.mark.parametrize(
+ "shape_dim_strategy",
+ [
+ [[32, 32], 1, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.SMALL_W],
+ [[32, 32], 0, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.SMALL_H],
+ [[2, 3, 32 * 4, 32 * 5], 3, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_W],
+ [[2, 3, 32 * 4, 32 * 5], 2, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_H],
+ [[1, 15, 32, 32], 1, ttnn.operations.moreh.SoftmaxBackwardOpParallelizationStrategy.LARGE_C],
+ ],
+)
+def test_softmin_backward_callback(shape_dim_strategy, device, use_program_cache):
+ shape, dim, strategy = shape_dim_strategy
+ torch.manual_seed(0)
+
+ num_program_cache_entries = None
+ for i in range(2):
+ x = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16).requires_grad_(True)
+ y = F.softmin(x, dim)
+ dev_y = ttnn.Tensor(y, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
+
+ dy = torch.randint(low=0, high=4, size=shape).to(torch.bfloat16)
+ dev_dy = ttnn.Tensor(dy, ttnn.bfloat16).to(ttnn.TILE_LAYOUT).to(device)
+
+ y.backward(dy)
+ tt_npu, num_program_cache_entries = softmin_backward_step(
+ dev_y, dev_dy, dim, strategy, device, num_program_cache_entries
+ )
+
+ assert list(tt_npu.shape.with_tile_padding()) == list(x.grad.shape)
+ tt_dev = tt_npu.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch().to(torch.bfloat16)
+
+ rtol = atol = 0.05
+ passing, out = comp_allclose_and_pcc(x.grad, tt_dev, rtol=rtol, atol=atol)
+ logger.debug(out)
+ assert passing
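
The softmin callback tests mirror the softmax ones because softmin is softmax of the negated input, so the same parallelization strategies apply. A quick identity check in torch:

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 32)
    assert torch.allclose(F.softmin(x, dim=1), torch.softmax(-x, dim=1))
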
diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
index d04e1ee385d..0217433c0e2 100644
--- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py
+++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -71,6 +71,7 @@ def run_conv(
groups=1,
has_bias=True,
shard_layout=None,
+ auto_shard=False,
):
torch.manual_seed(0)
conv_input_shape = [batch_size, input_channels, input_height, input_width]
@@ -115,7 +116,7 @@ def run_conv(
tt_input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16)
- if shard_layout is None:
+ if shard_layout is None and not auto_shard:
shard_layout = (
ttnn.TensorMemoryLayout.HEIGHT_SHARDED if use_1d_systolic_array else ttnn.TensorMemoryLayout.BLOCK_SHARDED
)
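
The `auto_shard` knob threads through `run_conv` so tests can leave `shard_layout` unset and let the conv path pick a memory layout itself; the explicit HEIGHT/BLOCK fallback now only fires when auto-sharding is off. The resulting control flow, as in the hunk above:

    if shard_layout is None and not auto_shard:
        # explicit fallback only when auto-sharding is disabled
        shard_layout = (
            ttnn.TensorMemoryLayout.HEIGHT_SHARDED
            if use_1d_systolic_array
            else ttnn.TensorMemoryLayout.BLOCK_SHARDED
        )
    # under auto_shard, shard_layout stays None and conv2d chooses a layout
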
@@ -249,13 +250,14 @@ def run_conv_with_split(
torch_input2_tensor = torch.permute(split_input_tensors[1], (0, 2, 3, 1))
reader_patterns_cache = {}
+ shard_layout = (
+ ttnn.TensorMemoryLayout.HEIGHT_SHARDED if use_1d_systolic_array else ttnn.TensorMemoryLayout.BLOCK_SHARDED
+ )
conv_config = ttnn.Conv2dConfig(
dtype=activations_dtype,
weights_dtype=weights_dtype,
math_fidelity=math_fidelity,
- shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED
- if use_1d_systolic_array
- else ttnn.TensorMemoryLayout.BLOCK_SHARDED,
+ shard_layout=shard_layout,
fp32_dest_acc_enabled=fp32_accum,
packer_l1_accum_enabled=packer_l1_acc,
# input_channels_alignment=(16 if use_shallow_conv_variant else 32),
@@ -346,6 +348,7 @@ def run_conv_with_split(
"activations_dtype",
[ttnn.bfloat16],
)
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_conv_ws(
device,
use_program_cache,
@@ -362,6 +365,7 @@ def test_conv_ws(
has_bias,
weights_dtype,
activations_dtype,
+ auto_shard,
):
stride_h = stride
stride_w = stride
@@ -419,7 +423,7 @@ def test_conv_ws(
dtype=activations_dtype,
weights_dtype=weights_dtype,
math_fidelity=ttnn.MathFidelity.HiFi4,
- shard_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
+ shard_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED if not auto_shard else None,
input_channels_alignment=32,
deallocate_activation=deallocate_activation,
fp32_dest_acc_enabled=fp32_accum,
@@ -498,6 +502,7 @@ def test_conv_ws(
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("output_layout", [ttnn.TILE_LAYOUT])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
@skip_for_grayskull()
def test_conv_for_segformer_512x512(
device,
@@ -521,6 +526,7 @@ def test_conv_for_segformer_512x512(
use_shallow_conv_variant,
groups,
output_layout,
+ auto_shard,
):
run_conv(
device,
@@ -544,6 +550,7 @@ def test_conv_for_segformer_512x512(
groups=groups,
output_layout=output_layout,
has_bias=False,
+ auto_shard=auto_shard,
)
@@ -585,6 +592,7 @@ def test_conv_for_segformer_512x512(
[ttnn.bfloat16, ttnn.bfloat8_b],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_resnet50_conv_gs(
device,
use_program_cache,
@@ -604,6 +612,7 @@ def test_resnet50_conv_gs(
pad_w,
use_1d_systolic_array,
config_override,
+ auto_shard,
):
if is_blackhole():
pytest.skip("This test is for Grayskull only")
@@ -646,6 +655,7 @@ def test_resnet50_conv_gs(
use_shallow_conv_variant=input_channels == 16,
padded_input_channels=16 if input_channels == 16 else None,
debug=not (batch_size == 20 and input_height == 115),
+ auto_shard=auto_shard,
)
@@ -713,6 +723,7 @@ def test_resnet50_conv_gs(
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("packer_l1_acc", [True, False], ids=["pack_l1", "no_pack_l1"])
@pytest.mark.parametrize("has_bias", [True, False], ids=["with_bias", "no_bias"])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_resnet50_conv_wh(
device,
use_program_cache,
@@ -734,6 +745,7 @@ def test_resnet50_conv_wh(
config_override,
packer_l1_acc,
has_bias,
+ auto_shard,
):
if device.core_grid.y == 7:
pytest.skip("Issue #6992: Statically allocated circular buffers in program clash with L1 buffers on core range")
@@ -781,6 +793,7 @@ def test_resnet50_conv_wh(
packer_l1_acc=packer_l1_acc,
fp32_accum=False,
has_bias=has_bias,
+ auto_shard=auto_shard,
)
@@ -838,6 +851,7 @@ def test_resnet50_conv_wh(
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4])
@pytest.mark.parametrize("packer_l1_acc", [True, False], ids=["pack_l1", "no_pack_l1"])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_resnet50_conv_wh_fp32(
device,
use_program_cache,
@@ -859,6 +873,7 @@ def test_resnet50_conv_wh_fp32(
use_1d_systolic_array,
config_override,
packer_l1_acc,
+ auto_shard,
):
if batch_size > 8 and (activations_dtype != ttnn.bfloat8_b or weights_dtype != ttnn.bfloat8_b):
pytest.skip("Batch > 8 must be run fully bfp8")
@@ -899,6 +914,7 @@ def test_resnet50_conv_wh_fp32(
fp32_accum=fp32_accum,
packer_l1_acc=packer_l1_acc,
transpose_mcast=use_1d_systolic_array, ## use RM (transpose_mcast=False) with 2D on WH
+ auto_shard=auto_shard,
)
@@ -1249,6 +1265,7 @@ def test_sd_conv_wh(
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_unet_conv(
device,
use_program_cache,
@@ -1270,6 +1287,7 @@ def test_unet_conv(
config_override,
use_shallow_conv_variant,
output_layout,
+ auto_shard,
):
if is_blackhole():
pytest.skip("This test is for Grayskull only")
@@ -1299,6 +1317,7 @@ def test_unet_conv(
use_shallow_conv_variant=use_shallow_conv_variant,
padded_input_channels=16 if input_channels == 3 else None,
output_layout=output_layout,
+ auto_shard=auto_shard,
)
@@ -1339,6 +1358,7 @@ def test_unet_conv(
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_unet_conv_wh(
device,
use_program_cache,
@@ -1360,6 +1380,7 @@ def test_unet_conv_wh(
config_override,
use_shallow_conv_variant,
output_layout,
+ auto_shard,
):
if (device.compute_with_storage_grid_size().x, device.compute_with_storage_grid_size().y) == (8, 7):
pytest.skip("Test is not supported on n300 (8,7) grid")
@@ -1389,6 +1410,7 @@ def test_unet_conv_wh(
transpose_mcast=use_1d_systolic_array, ## use RM (transpose_mcast=False) with 2D on WH
padded_input_channels=None,
output_layout=output_layout,
+ auto_shard=auto_shard,
)
@@ -1406,6 +1428,7 @@ def test_unet_conv_wh(
),
)
@pytest.mark.parametrize("use_1d_systolic_array", [False, True])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_halo_reshard_conv(
device,
use_program_cache,
@@ -1422,6 +1445,7 @@ def test_halo_reshard_conv(
pad_h,
pad_w,
config_override,
+ auto_shard,
):
math_fidelity = ttnn.MathFidelity.HiFi4
activations_dtype = ttnn.bfloat16
@@ -1445,6 +1469,7 @@ def test_halo_reshard_conv(
pad_w,
use_1d_systolic_array,
config_override,
+ auto_shard=auto_shard,
)
@@ -1461,6 +1486,7 @@ def test_halo_reshard_conv(
),
)
@pytest.mark.parametrize("use_1d_systolic_array", [False, True])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_conv_core_nondivis(
device,
use_program_cache,
@@ -1478,6 +1504,7 @@ def test_conv_core_nondivis(
pad_w,
config_override,
xfail,
+ auto_shard,
):
if xfail:
pytest.xfail()
@@ -1504,6 +1531,7 @@ def test_conv_core_nondivis(
pad_w,
use_1d_systolic_array,
config_override,
+ auto_shard=auto_shard,
)
@@ -1538,6 +1566,7 @@ def test_conv_core_nondivis(
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("output_layout", [ttnn.TILE_LAYOUT])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
@pytest.mark.parametrize(
"filter, dilation, pad",
[
@@ -1563,6 +1592,7 @@ def test_conv_dilation(
pad,
output_layout,
dilation,
+ auto_shard,
):
config_override = {"act_block_w_div": act_block_w_div}
run_conv(
@@ -1587,6 +1617,7 @@ def test_conv_dilation(
output_layout=output_layout,
dilation=dilation,
has_bias=False,
+ auto_shard=auto_shard,
)
@@ -1632,6 +1663,8 @@ def test_conv_dilation(
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("output_layout", [ttnn.TILE_LAYOUT])
+# TODO: Re-enable this when the auto shard heuristic is improved; currently we run out of L1 for some test cases.
+# @pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_conv_groups(
device,
use_program_cache,
@@ -1745,6 +1778,7 @@ def test_conv_groups(
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
# @pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT])
@pytest.mark.parametrize("output_layout", [ttnn.TILE_LAYOUT])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_yolov4_conv_groups_larger_than_one(
device,
use_program_cache,
@@ -1767,6 +1801,7 @@ def test_yolov4_conv_groups_larger_than_one(
use_shallow_conv_variant,
groups,
output_layout,
+ auto_shard,
):
if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == ttnn.bfloat8_b:
pytest.skip("Row major layout not compatible with bfloat8_b")
@@ -1794,6 +1829,7 @@ def test_yolov4_conv_groups_larger_than_one(
groups=groups,
padded_input_channels=16 if input_channels == 3 else None,
output_layout=output_layout,
+ auto_shard=auto_shard,
)
@@ -1816,6 +1852,7 @@ def test_yolov4_conv_groups_larger_than_one(
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("output_layout", [ttnn.TILE_LAYOUT])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_swin_s_conv(
device,
use_program_cache,
@@ -1838,6 +1875,7 @@ def test_swin_s_conv(
use_shallow_conv_variant,
groups,
output_layout,
+ auto_shard,
):
if device.core_grid.y == 7:
pytest.skip("This test is not supported for N300")
@@ -1864,6 +1902,7 @@ def test_swin_s_conv(
use_shallow_conv_variant=use_shallow_conv_variant,
groups=groups,
output_layout=output_layout,
+ auto_shard=auto_shard,
)
@@ -1893,6 +1932,7 @@ def test_swin_s_conv(
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
@pytest.mark.parametrize("output_layout", [ttnn.TILE_LAYOUT])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
@skip_for_grayskull()
def test_conv_for_segformer_512x512(
device,
@@ -1916,6 +1956,7 @@ def test_conv_for_segformer_512x512(
use_shallow_conv_variant,
groups,
output_layout,
+ auto_shard,
):
run_conv(
device,
@@ -1939,6 +1980,7 @@ def test_conv_for_segformer_512x512(
groups=groups,
output_layout=output_layout,
shard_layout=shard_layout,
+ auto_shard=auto_shard,
)
@@ -1963,6 +2005,7 @@ def test_conv_for_segformer_512x512(
[ttnn.bfloat8_b],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
+@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_model_k_256x256(
device,
use_program_cache,
@@ -1984,6 +2027,7 @@ def test_model_k_256x256(
dilation_w,
groups,
use_1d_systolic_array,
+ auto_shard,
):
run_conv(
device,
@@ -2004,6 +2048,7 @@ def test_model_k_256x256(
use_1d_systolic_array,
None,
dilation=dilation_h,
+ auto_shard=auto_shard,
)
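
Note: every hunk in the conv test file above applies the same three-part change: a new parametrize decorator, a matching function argument, and the flag forwarded into run_conv. A minimal, self-contained sketch of that pattern follows; run_conv_sketch is a hypothetical stand-in for the real run_conv helper defined earlier in the file.

import pytest

def run_conv_sketch(device, auto_shard=False, **conv_kwargs):
    # Hypothetical stand-in for run_conv: when auto_shard is True, the real
    # helper omits the explicit shard_layout so conv2d picks one itself.
    return {"auto_shard": auto_shard, **conv_kwargs}

@pytest.mark.parametrize("auto_shard", [True, False], ids=["auto_shard", "no_auto_shard"])
def test_example_conv(auto_shard):
    result = run_conv_sketch(device=None, auto_shard=auto_shard)
    assert result["auto_shard"] is auto_shard
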
diff --git a/tests/ttnn/unit_tests/test_multi_device_async.py b/tests/ttnn/unit_tests/test_multi_device_async.py
index 62be5ba5b63..5a8890c497e 100644
--- a/tests/ttnn/unit_tests/test_multi_device_async.py
+++ b/tests/ttnn/unit_tests/test_multi_device_async.py
@@ -149,9 +149,8 @@ def test_multi_device_unary_binary_op_chain(pcie_mesh_device, program_cache, sha
from ttnn import ShardTensorToMesh, ConcatMeshToTensor
pcie_mesh_device.enable_async(True)
- for device in pcie_mesh_device.get_device_ids():
- if program_cache:
- pcie_mesh_device.get_device(device).enable_program_cache()
+ if program_cache:
+ pcie_mesh_device.enable_program_cache()
torch_silu = torch.nn.SiLU()
for i in range(50):
@@ -190,9 +189,8 @@ def test_multi_device_data_parallel_op_chain(pcie_mesh_device, program_cache, in
from ttnn import ShardTensorToMesh, ConcatMeshToTensor, ReplicateTensorToMesh
pcie_mesh_device.enable_async(True)
- for device in pcie_mesh_device.get_device_ids():
- if program_cache:
- pcie_mesh_device.get_device(device).enable_program_cache()
+ if program_cache:
+ pcie_mesh_device.enable_program_cache()
torch_silu = torch.nn.SiLU()
torch_mish = torch.nn.Mish()
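
Note: the hunks above collapse the per-device loop into a single call on the mesh handle. A minimal sketch of the resulting setup pattern, assuming only the two mesh-device methods used in the diff (enable_async, enable_program_cache); prepare_mesh is a hypothetical helper, and the fan-out to the individual devices is handled inside ttnn.

def prepare_mesh(mesh_device, program_cache=True):
    # One call on the mesh handle now covers every device it contains,
    # replacing the old get_device_ids()/get_device(id) loop.
    mesh_device.enable_async(True)
    if program_cache:
        mesh_device.enable_program_cache()
    return mesh_device
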
diff --git a/tests/ttnn/unit_tests/test_multi_device_events.py b/tests/ttnn/unit_tests/test_multi_device_events.py
index c83d5c693a2..0217fe9f33f 100644
--- a/tests/ttnn/unit_tests/test_multi_device_events.py
+++ b/tests/ttnn/unit_tests/test_multi_device_events.py
@@ -21,8 +21,7 @@ def test_multi_device_events(t3k_mesh_device, shape):
# Enable Program Cache and Async Mode
t3k_mesh_device.enable_async(True)
- for device_id in t3k_mesh_device.get_device_ids():
- t3k_mesh_device.get_device(device_id).enable_program_cache()
+ t3k_mesh_device.enable_program_cache()
# Preallocate activation tensors.
input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, t3k_mesh_device)
diff --git a/tests/ttnn/unit_tests/test_multi_device_trace.py b/tests/ttnn/unit_tests/test_multi_device_trace.py
index 1fc07590d90..2e81db7b248 100644
--- a/tests/ttnn/unit_tests/test_multi_device_trace.py
+++ b/tests/ttnn/unit_tests/test_multi_device_trace.py
@@ -28,8 +28,7 @@ def test_multi_device_single_trace(t3k_mesh_device, shape, use_all_gather, enabl
# Trace requires program cache to be enabled
t3k_mesh_device.enable_async(enable_async)
- for device_id in t3k_mesh_device.get_device_ids():
- t3k_mesh_device.get_device(device_id).enable_program_cache()
+ t3k_mesh_device.enable_program_cache()
# Preallocate activation tensors. These will be used when capturing and executing the trace
input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, t3k_mesh_device)
@@ -142,8 +141,7 @@ def test_multi_device_multi_trace(t3k_mesh_device, shape, use_all_gather, enable
# Trace requires program cache to be enabled
t3k_mesh_device.enable_async(enable_async)
- for device_id in t3k_mesh_device.get_device_ids():
- t3k_mesh_device.get_device(device_id).enable_program_cache()
+ t3k_mesh_device.enable_program_cache()
# Preallocate activation tensors. These will be used when capturing and executing the trace
input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, t3k_mesh_device)
diff --git a/tests/ttnn/unit_tests/test_multi_device_trace_TG.py b/tests/ttnn/unit_tests/test_multi_device_trace_TG.py
index 7836c9de402..60c5f57d613 100644
--- a/tests/ttnn/unit_tests/test_multi_device_trace_TG.py
+++ b/tests/ttnn/unit_tests/test_multi_device_trace_TG.py
@@ -27,8 +27,7 @@ def test_multi_device_single_trace(mesh_device, shape, enable_async, enable_mult
pytest.skip("Test is only valid on Galaxy")
# Trace requires program cache to be enabled
mesh_device.enable_async(True)
- for device_id in mesh_device.get_device_ids():
- mesh_device.get_device(device_id).enable_program_cache()
+ mesh_device.enable_program_cache()
# Preallocate activation tensors. These will be used when capturing and executing the trace
input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, mesh_device)
@@ -129,8 +128,7 @@ def test_multi_device_multi_trace(mesh_device, shape, enable_async, enable_multi
# Trace requires program cache to be enabled
mesh_device.enable_async(True)
- for device_id in mesh_device.get_device_ids():
- mesh_device.get_device(device_id).enable_program_cache()
+ mesh_device.enable_program_cache()
# Preallocate activation tensors. These will be used when capturing and executing the trace
input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, mesh_device)
diff --git a/tests/ttnn/unit_tests/test_multi_device_trace_tgg.py b/tests/ttnn/unit_tests/test_multi_device_trace_tgg.py
index ddb354dc365..9eb27afe2e1 100644
--- a/tests/ttnn/unit_tests/test_multi_device_trace_tgg.py
+++ b/tests/ttnn/unit_tests/test_multi_device_trace_tgg.py
@@ -27,8 +27,7 @@ def test_multi_device_single_trace(mesh_device, shape, enable_async, enable_mult
pytest.skip("Test is only valid on TGG")
# Trace requires program cache to be enabled
mesh_device.enable_async(True)
- for device_id in mesh_device.get_device_ids():
- mesh_device.get_device(device_id).enable_program_cache()
+ mesh_device.enable_program_cache()
# Preallocate activation tensors. These will be used when capturing and executing the trace
input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, mesh_device)
@@ -128,8 +127,7 @@ def test_multi_device_multi_trace(mesh_device, shape, enable_async, enable_multi
# Trace requires program cache to be enabled
mesh_device.enable_async(True)
- for device_id in mesh_device.get_device_ids():
- mesh_device.get_device(device_id).enable_program_cache()
+ mesh_device.enable_program_cache()
# Preallocate activation tensors. These will be used when capturing and executing the trace
input_0_dev = ttnn.allocate_tensor_on_device(ttnn.Shape(shape), ttnn.bfloat16, ttnn.TILE_LAYOUT, mesh_device)
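
Note: each trace test above enables the program cache before capturing, since trace replay executes programs out of the cache. A condensed sketch of the capture/replay flow these tests share, using the ttnn trace calls that appear in the files themselves (begin_trace_capture / end_trace_capture / execute_trace); run_with_trace and op are hypothetical names.

import ttnn

def run_with_trace(mesh_device, input_dev, op):
    # Trace requires the program cache: enable it before capture.
    mesh_device.enable_program_cache()

    # Warm-up run compiles the program and populates the cache.
    output = op(input_dev)

    # Capture one execution into a trace, then replay it on demand.
    tid = ttnn.begin_trace_capture(mesh_device, cq_id=0)
    output = op(input_dev)
    ttnn.end_trace_capture(mesh_device, tid, cq_id=0)

    ttnn.execute_trace(mesh_device, tid, cq_id=0, blocking=True)
    return output
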
diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt
index d5c9fa860e1..cfecfa2cdb7 100644
--- a/tt_metal/CMakeLists.txt
+++ b/tt_metal/CMakeLists.txt
@@ -23,13 +23,10 @@ set(TT_METAL_OBJECTS
add_library(tt_metal ${TT_METAL_OBJECTS})
-target_link_libraries(tt_metal PUBLIC metal_header_directories umd_device metal_common_libs)
+target_link_libraries(tt_metal PUBLIC metal_header_directories umd_device metal_common_libs magic_enum fmt)
target_precompile_headers(tt_metal PRIVATE
- ${CMAKE_CURRENT_SOURCE_DIR}/third_party/magic_enum/magic_enum.hpp
${CMAKE_CURRENT_SOURCE_DIR}/third_party/tracy/public/tracy/Tracy.hpp
- ${CMAKE_CURRENT_SOURCE_DIR}/third_party/fmt/fmt/core.h
- ${CMAKE_CURRENT_SOURCE_DIR}/third_party/fmt/fmt/format.h