Adding more tosa tests for combined precision inputs and broadcast (#650)

* Add floatxfloat_sub_elem tosa test
* Add floatxfloat_add_elem tosa test
* Add floatxfloat_sel tosa test
* Add bf16xfloat_sub_elem tosa test
* Add bf16xfloat_add_elem tosa test
* Add i16xi16_sub_elem broadcast tests
* Add i8xi8_sub_elem broadcast tests
* Reorganize bf16xbf16 broadcast tosa tests
* Add floatxfloat_sub_elem broadcast tests
* Fix tosa lowering pipeline for bf16xbf16 sub_elem broadcast tests
Xilinx · Sep 21, 2023 · 067b00b · 067b00b
1 parent 3a5e6ea
commit 067b00b
Show file tree

Hide file tree

Showing 72 changed files with 1,197 additions and 76 deletions.
diff --git a/...16_sub_elem_2d_broadcast_1d_unit_dim.mlir → ...16_sub_elem_2d_broadcast_1d_unit_dim.mlir b/...16_sub_elem_2d_broadcast_1d_unit_dim.mlir → ...16_sub_elem_2d_broadcast_1d_unit_dim.mlir
@@ -3,13 +3,13 @@
 
 // XFAIL: *
 // REQUIRES: valid_xchess_license
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor))" -o linalg.mlir
 // RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
 // RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
 // RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/../testbench.cc dut.cc
 // RUN: mkdir -p data
-// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
 
@@ -20,5 +20,3 @@ module {
     return %1 : tensor<16x1024xbf16>
   }
 }
-
-
diff --git a/...16xbf16_sub_elem_2d_broadcast_scalar.mlir → ...16xbf16_sub_elem_2d_broadcast_scalar.mlir b/...16xbf16_sub_elem_2d_broadcast_scalar.mlir → ...16xbf16_sub_elem_2d_broadcast_scalar.mlir
@@ -3,13 +3,13 @@
 
 // XFAIL: *
 // REQUIRES: valid_xchess_license
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-make-broadcastable, tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor))" -o linalg.mlir
 // RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
 // RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
 // RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/../testbench.cc dut.cc
 // RUN: mkdir -p data
-// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
 

diff --git a/...b_elem_2d_broadcast_1d_unit_dim/defines.h → ...16_sub_elem_16x1024_broadcast_1/defines.h b/...b_elem_2d_broadcast_1d_unit_dim/defines.h → ...16_sub_elem_16x1024_broadcast_1/defines.h
diff --git a/...f16_sub_elem_2d_broadcast_1d/testbench.cc → ...sub_elem_16x1024_broadcast_1/testbench.cc b/...f16_sub_elem_2d_broadcast_1d/testbench.cc → ...sub_elem_16x1024_broadcast_1/testbench.cc
diff --git a/...d/bf16xbf16_sub_elem_2d_broadcast_1d.mlir → ...d/bf16xbf16_sub_elem_2d_broadcast_1d.mlir b/...d/bf16xbf16_sub_elem_2d_broadcast_1d.mlir → ...d/bf16xbf16_sub_elem_2d_broadcast_1d.mlir
@@ -6,9 +6,9 @@
 // RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
 // RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
 // RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/../testbench.cc dut.cc
 // RUN: mkdir -p data
-// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
 

diff --git a/...bf16xbf16_sub_elem_2d_broadcast_1d/dut.cc → ...bf16xbf16_sub_elem_2d_broadcast_1d/dut.cc b/...bf16xbf16_sub_elem_2d_broadcast_1d/dut.cc → ...bf16xbf16_sub_elem_2d_broadcast_1d/dut.cc
diff --git a/...f16_sub_elem_2d_broadcast_1d_reshape.mlir → ...f16_sub_elem_2d_broadcast_1d_reshape.mlir b/...f16_sub_elem_2d_broadcast_1d_reshape.mlir → ...f16_sub_elem_2d_broadcast_1d_reshape.mlir
@@ -6,9 +6,9 @@
 // RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
 // RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
 // RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/../testbench.cc dut.cc
 // RUN: mkdir -p data
-// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
 

diff --git a/...6_sub_elem_2d_broadcast_1d_reshape/dut.cc → ...6_sub_elem_2d_broadcast_1d_reshape/dut.cc b/...6_sub_elem_2d_broadcast_1d_reshape/dut.cc → ...6_sub_elem_2d_broadcast_1d_reshape/dut.cc
diff --git a/...d/bf16xbf16_sub_elem_2d_broadcast_2d.mlir → ...d/bf16xbf16_sub_elem_2d_broadcast_2d.mlir b/...d/bf16xbf16_sub_elem_2d_broadcast_2d.mlir → ...d/bf16xbf16_sub_elem_2d_broadcast_2d.mlir
@@ -6,9 +6,9 @@
 // RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --linalg-fold-unit-extent-dims --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
 // RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
 // RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
-// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/../testbench.cc dut.cc
 // RUN: mkdir -p data
-// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
 // RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
 // CHECK: TEST PASSED
 

diff --git a/...bf16xbf16_sub_elem_2d_broadcast_2d/dut.cc → ...bf16xbf16_sub_elem_2d_broadcast_2d/dut.cc b/...bf16xbf16_sub_elem_2d_broadcast_2d/dut.cc → ...bf16xbf16_sub_elem_2d_broadcast_2d/dut.cc
diff --git a/...6xbf16_sub_elem_2d_broadcast_1d/defines.h → ...sub_elem_16x1024_broadcast_1024/defines.h b/...6xbf16_sub_elem_2d_broadcast_1d/defines.h → ...sub_elem_16x1024_broadcast_1024/defines.h
diff --git a/...elem_2d_broadcast_1d_reshape/testbench.cc → ..._elem_16x1024_broadcast_1024/testbench.cc b/...elem_2d_broadcast_1d_reshape/testbench.cc → ..._elem_16x1024_broadcast_1024/testbench.cc
diff --git a/test/Integration/Dialect/TOSA/bf16xfloat_add_elem/bf16xfloat_add_elem.mlir b/test/Integration/Dialect/TOSA/bf16xfloat_add_elem/bf16xfloat_add_elem.mlir
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
+// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: mkdir -p %t
+// RUN: mkdir -p %basename_t
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module {
+  func.func @dut(%arg0: tensor<1024xbf16>, %arg1: tensor<1024xf32>) -> (tensor<1024xf32>) {
+    %1 = "tosa.cast" (%arg0) : (tensor<1024xbf16>)  -> (tensor<1024xf32>)
+    %2 = "tosa.add"(%1,%arg1) : (tensor<1024xf32>, tensor<1024xf32>)  -> (tensor<1024xf32>)
+    return %2 : tensor<1024xf32>
+  }
+}
+
diff --git a/test/Integration/Dialect/TOSA/bf16xfloat_add_elem/defines.h b/test/Integration/Dialect/TOSA/bf16xfloat_add_elem/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned const IN0_SIZE = 1024;
+constexpr unsigned const IN1_SIZE = 1024;
+constexpr unsigned const OUT0_SIZE = 1024;
diff --git a/test/Integration/Dialect/TOSA/bf16xfloat_add_elem/dut.cc b/test/Integration/Dialect/TOSA/bf16xfloat_add_elem/dut.cc
@@ -0,0 +1,20 @@
+// clang-format off
+void dut(bfloat16 * restrict v1, float * restrict v2, float * restrict v3) {
+  size_t v4 = 0;
+  size_t v5 = 1024;
+  size_t v6 = 16;
+  for (size_t v7 = v4; v7 < v5; v7 += v6)
+  chess_prepare_for_pipelining
+  chess_loop_range(64, 64)
+  {
+    v16bfloat16 v8 = *(v16bfloat16 *)(v1 + v7);
+    v16float v9 = *(v16float *)(v2 + v7);
+    v16accfloat v10 = ups_to_v16accfloat(v8);
+    v16accfloat v11 = v16accfloat(v9);
+    v16accfloat v12 = add(v10, v11);
+    v16float v13 = v16float(v12);
+    *(v16float *)(v3 + v7) = v13;
+  }
+  return;
+}
+// clang-format on
diff --git a/...lem_2d_broadcast_1d_unit_dim/testbench.cc → ...ect/TOSA/bf16xfloat_add_elem/testbench.cc b/...lem_2d_broadcast_1d_unit_dim/testbench.cc → ...ect/TOSA/bf16xfloat_add_elem/testbench.cc
@@ -4,22 +4,21 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
-void dut(bfloat16 *restrict in0, bfloat16 *restrict in1,
-         bfloat16 *restrict out0);
-void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0);
+void dut(bfloat16 *restrict in0, float *restrict in1, float *restrict out0);
+void dut_ref(bfloat16 *in0, float *in1, float *out0);
 
 alignas(32) bfloat16 g_in0[IN0_SIZE];
-alignas(32) bfloat16 g_in1[IN1_SIZE];
-alignas(32) bfloat16 g_out0[OUT0_SIZE];
-alignas(32) bfloat16 g_out0Ref[OUT0_SIZE];
+alignas(32) float g_in1[IN1_SIZE];
+alignas(32) float g_out0[OUT0_SIZE];
+alignas(32) float g_out0Ref[OUT0_SIZE];
 
 int main(int argc, char *argv[]) {
   std::string dataDir(TO_STR(DATA_DIR));
   srand(10);
   std::generate(g_in0, g_in0 + IN0_SIZE,
                 [&]() { return random_bfloat16(-10, 10, 2); });
   std::generate(g_in1, g_in1 + IN1_SIZE,
-                [&]() { return random_bfloat16(-10, 10, 2); });
+                [&]() { return random_float(-80, 80, 10); });
 
   writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");
   writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");
@@ -39,7 +38,7 @@ int main(int argc, char *argv[]) {
   writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");
 
   bool ok = true;
-  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 1e-2, 1e-2);
 
   if (ok)
     printf("TEST PASSED\n");
@@ -49,8 +48,8 @@ int main(int argc, char *argv[]) {
   return ok ? 0 : 1;
 }
 
-void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0) {
+void dut_ref(bfloat16 *in0, float *in1, float *out0) {
   for (unsigned k = 0; k < OUT0_SIZE; k += 1) {
-    out0[k] = in0[k] - in1[k % IN1_SIZE];
+    out0[k] = in0[k] + in1[k];
   }
 }
diff --git a/test/Integration/Dialect/TOSA/bf16xfloat_sub_elem/bf16xfloat_sub_elem.mlir b/test/Integration/Dialect/TOSA/bf16xfloat_sub_elem/bf16xfloat_sub_elem.mlir
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine -o aievec.mlir
+// RUN: aie-translate aievec.mlir -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module {
+  func.func @dut(%arg0: tensor<1024xbf16>, %arg1: tensor<1024xf32>) -> (tensor<1024xf32>) {
+    %1 = "tosa.cast" (%arg0) : (tensor<1024xbf16>)  -> (tensor<1024xf32>)
+    %2 = "tosa.sub"(%1,%arg1) : (tensor<1024xf32>, tensor<1024xf32>)  -> (tensor<1024xf32>)
+    return %2 : tensor<1024xf32>
+  }
+}
+
diff --git a/test/Integration/Dialect/TOSA/bf16xfloat_sub_elem/defines.h b/test/Integration/Dialect/TOSA/bf16xfloat_sub_elem/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned const IN0_SIZE = 1024;
+constexpr unsigned const IN1_SIZE = 1024;
+constexpr unsigned const OUT0_SIZE = 1024;
diff --git a/test/Integration/Dialect/TOSA/bf16xfloat_sub_elem/dut.cc b/test/Integration/Dialect/TOSA/bf16xfloat_sub_elem/dut.cc
@@ -0,0 +1,20 @@
+// clang-format off
+void dut(bfloat16 * restrict v1, float * restrict v2, float * restrict v3) {
+  size_t v4 = 0;
+  size_t v5 = 1024;
+  size_t v6 = 16;
+  for (size_t v7 = v4; v7 < v5; v7 += v6)
+  chess_prepare_for_pipelining
+  chess_loop_range(64, 64)
+  {
+    v16bfloat16 v8 = *(v16bfloat16 *)(v1 + v7);
+    v16float v9 = *(v16float *)(v2 + v7);
+    v16accfloat v10 = ups_to_v16accfloat(v8);
+    v16accfloat v11 = v16accfloat(v9);
+    v16accfloat v12 = sub(v10, v11);
+    v16float v13 = v16float(v12);
+    *(v16float *)(v3 + v7) = v13;
+  }
+  return;
+}
+// clang-format on
diff --git a/...f16_sub_elem_2d_broadcast_2d/testbench.cc → ...ect/TOSA/bf16xfloat_sub_elem/testbench.cc b/...f16_sub_elem_2d_broadcast_2d/testbench.cc → ...ect/TOSA/bf16xfloat_sub_elem/testbench.cc
@@ -4,22 +4,21 @@
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
-void dut(bfloat16 *restrict in0, bfloat16 *restrict in1,
-         bfloat16 *restrict out0);
-void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0);
+void dut(bfloat16 *restrict in0, float *restrict in1, float *restrict out0);
+void dut_ref(bfloat16 *in0, float *in1, float *out0);
 
 alignas(32) bfloat16 g_in0[IN0_SIZE];
-alignas(32) bfloat16 g_in1[IN1_SIZE];
-alignas(32) bfloat16 g_out0[OUT0_SIZE];
-alignas(32) bfloat16 g_out0Ref[OUT0_SIZE];
+alignas(32) float g_in1[IN1_SIZE];
+alignas(32) float g_out0[OUT0_SIZE];
+alignas(32) float g_out0Ref[OUT0_SIZE];
 
 int main(int argc, char *argv[]) {
   std::string dataDir(TO_STR(DATA_DIR));
   srand(10);
   std::generate(g_in0, g_in0 + IN0_SIZE,
                 [&]() { return random_bfloat16(-10, 10, 2); });
   std::generate(g_in1, g_in1 + IN1_SIZE,
-                [&]() { return random_bfloat16(-10, 10, 2); });
+                [&]() { return random_float(-80, 80, 10); });
 
   writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");
   writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");
@@ -39,7 +38,7 @@ int main(int argc, char *argv[]) {
   writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");
 
   bool ok = true;
-  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE);
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 1e-2, 1e-2);
 
   if (ok)
     printf("TEST PASSED\n");
@@ -49,8 +48,8 @@ int main(int argc, char *argv[]) {
   return ok ? 0 : 1;
 }
 
-void dut_ref(bfloat16 *in0, bfloat16 *in1, bfloat16 *out0) {
+void dut_ref(bfloat16 *in0, float *in1, float *out0) {
   for (unsigned k = 0; k < OUT0_SIZE; k += 1) {
-    out0[k] = in0[k] - in1[k % IN1_SIZE];
+    out0[k] = in0[k] - in1[k];
   }
 }
diff --git a/test/Integration/Dialect/TOSA/floatxfloat_add_elem/defines.h b/test/Integration/Dialect/TOSA/floatxfloat_add_elem/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned const IN0_SIZE = 1024;
+constexpr unsigned const IN1_SIZE = 1024;
+constexpr unsigned const OUT0_SIZE = 1024;
diff --git a/test/Integration/Dialect/TOSA/floatxfloat_add_elem/dut.cc b/test/Integration/Dialect/TOSA/floatxfloat_add_elem/dut.cc
@@ -0,0 +1,20 @@
+// clang-format off
+void dut(float * restrict v1, float * restrict v2, float * restrict v3) {
+  size_t v4 = 0;
+  size_t v5 = 1024;
+  size_t v6 = 16;
+  for (size_t v7 = v4; v7 < v5; v7 += v6)
+  chess_prepare_for_pipelining
+  chess_loop_range(64, 64)
+  {
+    v16float v8 = *(v16float *)(v1 + v7);
+    v16float v9 = *(v16float *)(v2 + v7);
+    v16accfloat v10 = v16accfloat(v8);
+    v16accfloat v11 = v16accfloat(v9);
+    v16accfloat v12 = add(v10, v11);
+    v16float v13 = v16float(v12);
+    *(v16float *)(v3 + v7) = v13;
+  }
+  return;
+}
+// clang-format on
diff --git a/test/Integration/Dialect/TOSA/floatxfloat_add_elem/floatxfloat_add_elem.mlir b/test/Integration/Dialect/TOSA/floatxfloat_add_elem/floatxfloat_add_elem.mlir
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+// REQUIRES: valid_xchess_license
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" -o linalg.mlir
+// RUN: mlir-opt linalg.mlir --linalg-fuse-elementwise-ops --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs allow-unknown-ops bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map" --drop-equivalent-buffer-results --buffer-results-to-out-params --buffer-deallocation --canonicalize --cse --convert-linalg-to-affine-loops --affine-super-vectorize="virtual-vector-size=16" -o affine.mlir 
+// RUN: aie-opt affine.mlir --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+
+module {
+  func.func @dut(%arg0: tensor<1024xf32>, %arg1: tensor<1024xf32>) -> (tensor<1024xf32>) {
+    %1 = "tosa.add"(%arg0,%arg1) : (tensor<1024xf32>, tensor<1024xf32>)  -> (tensor<1024xf32>)
+    return %1 : tensor<1024xf32>
+  }
+}
+
diff --git a/test/Integration/Dialect/TOSA/floatxfloat_add_elem/testbench.cc b/test/Integration/Dialect/TOSA/floatxfloat_add_elem/testbench.cc
@@ -0,0 +1,55 @@
+#include "../common/testbench.h"
+#include "defines.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+void dut(float *restrict in0, float *restrict in1, float *restrict out0);
+void dut_ref(float *in0, float *in1, float *out0);
+
+alignas(32) float g_in0[IN0_SIZE];
+alignas(32) float g_in1[IN1_SIZE];
+alignas(32) float g_out0[OUT0_SIZE];
+alignas(32) float g_out0Ref[OUT0_SIZE];
+
+int main(int argc, char *argv[]) {
+  std::string dataDir(TO_STR(DATA_DIR));
+  srand(10);
+  std::generate(g_in0, g_in0 + IN0_SIZE,
+                [&]() { return random_float(-80, 80, 10); });
+  std::generate(g_in1, g_in1 + IN1_SIZE,
+                [&]() { return random_float(-80, 80, 10); });
+
+  writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");
+  writeData(g_in1, IN1_SIZE, dataDir + "/in1.txt");
+
+  chess_memory_fence();
+  auto cyclesBegin = chess_cycle_count();
+  dut(g_in0, g_in1, g_out0);
+  auto cyclesEnd = chess_cycle_count();
+  chess_memory_fence();
+
+  auto cycleCount = (int)(cyclesEnd - cyclesBegin);
+  reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");
+
+  writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");
+
+  dut_ref(g_in0, g_in1, g_out0Ref);
+  writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");
+
+  bool ok = true;
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 1e-2, 1e-2);
+
+  if (ok)
+    printf("TEST PASSED\n");
+  else
+    printf("TEST FAILED\n");
+
+  return ok ? 0 : 1;
+}
+
+void dut_ref(float *in0, float *in1, float *out0) {
+  for (unsigned k = 0; k < OUT0_SIZE; k += 1) {
+    out0[k] = in0[k] + in1[k];
+  }
+}
diff --git a/test/Integration/Dialect/TOSA/floatxfloat_sel/defines.h b/test/Integration/Dialect/TOSA/floatxfloat_sel/defines.h
@@ -0,0 +1,4 @@
+#pragma once
+constexpr unsigned const IN0_SIZE = 1024;
+constexpr unsigned const IN1_SIZE = 1024;
+constexpr unsigned const OUT0_SIZE = 1024;