From 01f83f583ffba8e569e287f09cfd1857259de5ea Mon Sep 17 00:00:00 2001 From: hayden-brown Date: Wed, 31 Jul 2024 23:55:19 +0800 Subject: [PATCH 1/2] Llama2 model Operator/Layer level instance extraction --- examples/BuddyNext/README.md | 408 +++++++++++++++++++ examples/BuddyNext/makefile | 431 +++++++++++++++++++++ examples/BuddyNext/next-fc.mlir | 78 ++++ examples/BuddyNext/next-ffn.mlir | 98 +++++ examples/BuddyNext/next-fpowi.mlir | 70 ++++ examples/BuddyNext/next-matmul.mlir | 63 +++ examples/BuddyNext/next-mul.mlir | 65 ++++ examples/BuddyNext/next-negate.mlir | 64 +++ examples/BuddyNext/next-reciprocal.mlir | 64 +++ examples/BuddyNext/next-reducesum.mlir | 64 +++ examples/BuddyNext/next-rmsnorm.mlir | 85 ++++ examples/BuddyNext/next-rsqrt.mlir | 62 +++ examples/BuddyNext/next-selfattention.mlir | 226 +++++++++++ examples/BuddyNext/next-softmax.mlir | 72 ++++ examples/BuddyNext/next-transpose.mlir | 65 ++++ 15 files changed, 1915 insertions(+) create mode 100644 examples/BuddyNext/README.md create mode 100644 examples/BuddyNext/next-fc.mlir create mode 100644 examples/BuddyNext/next-ffn.mlir create mode 100644 examples/BuddyNext/next-fpowi.mlir create mode 100644 examples/BuddyNext/next-matmul.mlir create mode 100644 examples/BuddyNext/next-mul.mlir create mode 100644 examples/BuddyNext/next-negate.mlir create mode 100644 examples/BuddyNext/next-reciprocal.mlir create mode 100644 examples/BuddyNext/next-reducesum.mlir create mode 100644 examples/BuddyNext/next-rmsnorm.mlir create mode 100644 examples/BuddyNext/next-rsqrt.mlir create mode 100644 examples/BuddyNext/next-selfattention.mlir create mode 100644 examples/BuddyNext/next-softmax.mlir create mode 100644 examples/BuddyNext/next-transpose.mlir diff --git a/examples/BuddyNext/README.md b/examples/BuddyNext/README.md new file mode 100644 index 0000000000..423c8b73d9 --- /dev/null +++ b/examples/BuddyNext/README.md @@ -0,0 +1,408 @@ +# Llama 2 Operator/Layer level instance extraction + +--- + +## Operator Level: + +### **TOSA Dialect** + +#### `tosa.mul` + + make next-mul-run + +- **Input Tensors**: + - Shape: `tensor<1xf32>` + - Example: `[3.0]` + + - Shape: `tensor<1x40x1xf32>` + - Example: `[[[2.0], [2.0], ..., [2.0]]]` (40 elements) +- **Output Tensor**: + - Shape: `tensor<1x40x1xf32>` + - Example: All elements will be `6.0` after the multiplication operation. +- **Multiplication Operation**: + - The `tosa.mul` operation is applied to the input tensors `%arg0` and `%arg1`, performing an element-wise multiplication. +- **Timing:** + - elapsed time: 0.000380993 + +#### `tosa.negate` + + make next-negate-run + +- **Input Tensor**: + - Shape: `tensor<1x32x40x64xf32>` + - Example: All elements initialized to `1.0`. +- **Output Tensor**: + - Shape: `tensor<1x32x40x64xf32>` + - Example: All elements will be `-1.0` after the negate operation. +- **Negate Operation**: + - The `tosa.negate` operation is applied to the input tensor `%arg0`, which negates each element in the tensor. +- **Timing:** + - elapsed time: 0.000413179 + +#### `tosa.reciprocal` + + make next-reciprocal-run + +- **Input Tensor**: + - Shape: `tensor<1x10xf32>` + - Example: All elements initialized to `[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]`. +- **Output Tensor**: + - Shape: `tensor<1x10xf32>` + - Example: All elements will be the reciprocal of the input tensor elements, i.e., `[1.0, 0.5, 0.333, 0.25, 0.2, 0.166, 0.142, 0.125, 0.111, 0.1]`. 
+- **Reciprocal Operation**: + - The `tosa.reciprocal` operation is applied to the input tensor `%arg0`, which computes the reciprocal (1/x) of each element in the tensor. +- **Timing:** + - elapsed time: 0.000286102 + +#### `tosa.reduce_sum` + + make next-reducesum-run + +- **Input Tensor**: + - Shape: `tensor<1x40x4096xf32>` + - Example: All elements initialized to `1.0`. +- **Output Tensor**: + - Shape: `tensor<1x40x1xf32>` + - Example: Each element in the output tensor is the sum of 4096 elements from the corresponding dimension of the input tensor, which will be `4096.0` for each element. +- **Reduce Sum Operation**: + - The `tosa.reduce_sum` operation is applied to the input tensor `%arg0`, summing elements along the `axis=2` dimension. This reduces the shape of the tensor from `[1, 40, 4096]` to `[1, 40, 1]`. +- **Timing:** + - elapsed time: 0.000262976 + +#### `tosa.rsqrt` + + make next-rsqrt-run + +- **Input Tensor**: + - Shape: `tensor<1x40x1xf32>` + - Example: All elements initialized to `3.0`. +- **Output Tensor**: + - Shape: `tensor<1x40x1xf32>` + - Example: Each element in the output tensor will be the reciprocal of the square root of the corresponding element in the input tensor, which will be approximately `0.57735` for each element. +- **Rsqrt Operation**: + - The `tosa.rsqrt` operation is applied to the input tensor `%arg0`, which computes the reciprocal of the square root of each element in the tensor. +- **Timing:** + - elapsed time: 3.09944e-06 + +#### `tosa.transpose` + + make next-transpose-run + +- **Input Tensor**: + - Shape: `tensor<1x40x32x128xf32>` + - Example: All elements initialized to `1.0`. +- **Output Tensor**: + - Shape: `tensor<1x32x40x128xf32>` + - Example: The tensor after transposing will have the elements permuted according to the permutation vector `[0, 2, 1, 3]`. Given that all elements are initialized to `1.0`, the values remain `1.0` but the shape is permuted. +- **Transpose Operation**: + - The `tosa.transpose` operation is applied to the input tensor `%arg0` with the permutation vector `%perm`, which rearranges the dimensions of the input tensor according to `[0, 2, 1, 3]`.- The permutation `[0, 2, 1, 3]` means: + - The first dimension remains the same. + - The second dimension (40) is swapped with the third dimension (32). + - The fourth dimension (128) remains the same. + - Therefore, the input tensor shape `[1, 40, 32, 128]` is transposed to `[1, 32, 40, 128]`. +- **Timing:** + - elapsed time: 0.000138044 + +### **Math Dialect** + +#### `math.fpowi` + + make next-fpowi-run + +- **Input Tensor**: + - Shape: `tensor<1x32x40x64xf32>` + - Example: All elements initialized to `5.0`. +- **Output Tensor**: + - Shape: `tensor<1x32x40x64xf32>` + - Example: Each element in the output tensor will be the value of the corresponding element in the input tensor raised to the power of `2`, i.e., `25.0` for each element. +- **Power Operation**: + - The `math.fpowi` operation is applied to each element in the input tensor `%arg0`, raising it to the power of `2`. + - For example, if an element in the input tensor is `5.0`, the corresponding element in the output tensor will be `5.0^2 = 25.0`. +- **Timing:** + - elapsed time: 8.29697e-05 + +### **Linalg Dialect** + +#### `linalg.matmul` + +make next-matmul-run + +- **Input Tensors**: + - Shape: `tensor<40x4096xf32>` + - Example: All elements initialized to `3.0`. + + - Shape: `tensor<4096x4096xf32>` + - Example: All elements initialized to `2.0`. 
+- **Output Tensor**:
+  - Shape: `tensor<40x4096xf32>`
+  - Example: Each element in the output tensor will be the result of the matrix multiplication of the input tensors. Given the initialization, each element will be `3.0 * 2.0 * 4096 = 24576.0`.
+- **Matrix Multiplication Operation**:
+  - The `linalg.matmul` operation is applied to the input tensors `%arg0` and `%arg1`, performing matrix multiplication.
+  - The output tensor `%arg2` is the result of the matrix multiplication, where each element is calculated as the sum of the element-wise products of the rows of the first matrix and the columns of the second matrix.
+- **Timing:**
+  - elapsed time: 7.42794
+
+---
+
+## Layer Level
+
+#### `Fully Connected Layer`
+
+    `make next-fc-run`
+
+- **Input Tensors**:
+  - Shape: `tensor<1x40x4096xf32>`
+  - Example: All elements initialized to `3.0`.
+
+  - Shape: `tensor<4096x4096xf32>`
+  - Example: All elements initialized to `2.0`.
+
+  - Shape: `tensor<4096x4096xf32>`
+  - Example: All elements initialized to `1.0`.
+
+  - Shape: `tensor<1x40x4096xf32>`
+  - Example: All elements initialized to `4.0`.
+- **Output Tensor**:
+  - Shape: `tensor<1x40x4096xf32>`
+  - Example: The exact values will depend on the computations performed during the fully connected layer operations, which include multiplication, transposition, and reshaping.
+- **Fully Connected Layer Operations**:
+  1. **Multiplication**:
+     - `%41 = tosa.mul %arg0, %arg3` multiplies the elements of `%arg0` and `%arg3` element-wise.
+     - Example: The result tensor will have elements initialized to `3.0 * 4.0 = 12.0`.
+  2. **Transpose**:
+     - `%43 = tosa.transpose %arg1, %42` transposes the tensor `%arg1` according to the permutation `[1, 0]`.
+     - Example: The tensor shape remains `[4096x4096]`.
+  3. **Reshape**:
+     - `%44 = tosa.reshape %41` reshapes the tensor from `tensor<1x40x4096xf32>` to `tensor<40x4096xf32>`.
+  4. **Matrix Multiplication**:
+     - `%45 = linalg.matmul` performs matrix multiplication on the reshaped tensor and the transposed tensor.
+     - Example: Each element of the resulting `tensor<40x4096xf32>` will be `12.0 * 2.0 * 4096 = 98304.0`.
+     - The result is reshaped back to `tensor<1x40x4096xf32>`.
+  5. **Second Transpose and Reshape**:
+     - Similar transpose and reshape operations are performed on `%arg2` and the result tensor `%41`.
+  6. **Second Matrix Multiplication**:
+     - `%50 = linalg.matmul` performs matrix multiplication on the reshaped tensors, and the result is reshaped back to `tensor<1x40x4096xf32>`.
+     - Example: Each element of the resulting `tensor<40x4096xf32>` will be `12.0 * 1.0 * 4096 = 49152.0`.
+     - The final output tensor will have the shape `tensor<1x40x4096xf32>` with each element being `49152.0`.
+- **Timing:**
+  - elapsed time: 10.8429
+
+#### `Feed Forward Network`
+
+    `make next-ffn-run`
+
+- **Input Tensors**:
+  - Shape: `tensor<1x40x4096xf32>`
+  - Example: All elements initialized to `3.0`.
+
+  - Shape: `tensor<4096xf32>`
+  - Example: All elements initialized to `1.0`.
+
+  - Shape: `tensor<11008x4096xf32>`
+  - Example: All elements initialized to `1.0`.
+
+  - Shape: `tensor<11008x4096xf32>`
+  - Example: All elements initialized to `2.0`.
+
+  - Shape: `tensor<4096x11008xf32>`
+  - Example: All elements initialized to `1.0`.
+- **Output Tensor**:
+  - Shape: `tensor<1x40x4096xf32>`
+- **Feed Forward Network Operations**:
+  1.
**Multiplication**: + - `%138 = tosa.reshape %arg9 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>` + - `%139 = tosa.mul %138, %arg0 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>` + - Example: The result tensor will have elements initialized to `1.0 * 3.0 = 3.0`. + 2. **Transpose**: + - `%141 = tosa.transpose %arg10, %140 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>` + - Example: The tensor shape remains `[4096x11008]`. + 3. **Reshape**: + - `%142 = tosa.reshape %139 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>` + 4. **Matrix Multiplication**: + - `%143 = linalg.matmul {cast = #linalg.type_fn} ins(%142, %141 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_24 : tensor<40x11008xf32>) -> tensor<40x11008xf32>` + - Example: Each element of the resulting `tensor<40x11008xf32>` will be the sum of the products of corresponding elements from the input tensor and the transposed weight tensor, resulting in a tensor with elements calculated as `3.0 * 1.0 * 4096 = 12288.0`. + 5. **Reshape**: + - `%144 = tosa.reshape %143 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>` + 6. **Sigmoid and Multiplication**: + - `%145 = tosa.sigmoid %144 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>` + - Example: Each element of the resulting tensor will be the sigmoid of `12288.0`, which is very close to `1.0` because the sigmoid function asymptotically approaches `1` for large positive inputs. + - `%146 = tosa.mul %144, %145 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>` + - Example: Each element of the resulting tensor will be `12288.0 * 1.0 = 12288.0`. + 7. **Second Transpose and Reshape**: + - Similar transpose and reshape operations are performed on `%arg11` and the result tensor `%146`. + 8. **Second Matrix Multiplication**: + - `%150 = linalg.matmul {cast = #linalg.type_fn} ins(%149, %148 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_25 : tensor<40x11008xf32>) -> tensor<40x11008xf32>` + - Example: Each element of the resulting tensor will be `3.0 * 2.0 * 4096 = 24576.0`. + - The result is reshaped back to `tensor<1x40x11008xf32>`. + 9. **Final Multiplication and Matrix Multiplication**: + - `%152 = tosa.mul %146, %151 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>` + - Example: Each element of the resulting tensor will be `12288.0 * 24576.0 = 301989888.0`. + - `%156 = linalg.matmul {cast = #linalg.type_fn} ins(%155, %154 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_26 : tensor<40x4096xf32>) -> tensor<40x4096xf32>` + - Example: Each element of the resulting tensor will be the sum of products of elements from the tensor of `301989888.0` and the weight tensor, resulting in very large values. + 10. **Addition**: + - `%158 = tosa.add %arg0, %157 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>` + - The final output tensor will be the addition of the original input tensor `%arg0` and the resulting tensor from the previous computations. +- **Timing:** + - elapsed time: 56.0974 + +#### `RMSNorm` + + `make next-rmsnorm-run` + +- **Input Tensor**: + - Shape: `tensor<1x40x4096xf32>` + - Example: All elements initialized to `3.0`. +- **Output Tensor**: + - Shape: `tensor<1x40x4096xf32>` + - Example: Each element in the output tensor will be the result of the RMSNorm operations applied to the input tensor. +- **RMSNorm Operations**: + 1. 
**Square Elements**: + - `%31 = linalg.generic` squares each element in the input tensor `%arg0`. + - Example: Each element will be `3.0^2 = 9.0`. + 2. **Reduce Sum**: + - `%32 = tosa.reduce_sum %31 {axis = 2 : i32}` sums the squared elements along the last dimension. + - Example: Each element in the resulting tensor will be the sum of `4096` squared elements, `9.0 * 4096 = 36864.0`. + 3. **Reciprocal and Multiplication**: + - `%34 = tosa.reciprocal %33` computes the reciprocal of a constant tensor value `4096.0`. + - `%35 = tosa.mul %34, %32` multiplies the reciprocal with the sum of squares. + - Example: Each element will be `1/4096 * 36864.0 = 9.0`. + 4. **Add Small Constant**: + - `%37 = tosa.add %35, %36` adds a small constant `1e-5` to the result. + - Example: Each element will be `9.0 + 1e-5`. + 5. **Reciprocal Square Root**: + - `%38 = tosa.rsqrt %37` computes the reciprocal square root of the result. + - Example: Each element will be approximately `1 / sqrt(9.0 + 1e-5) ≈ 0.333333`. + 6. **Final Multiplication**: + - `%39 = tosa.mul %arg0, %38` multiplies the original input tensor `%arg0` by the reciprocal square root. + - Example: Each element will be `3.0 * 0.333333 = 0.999999`. +- **Timing:** + - elapsed time: 0.000798941 + +#### `Softmax` + + `make next-softmax-run` + +- **Input Tensors**: + + - Shape: `tensor<1x32x40x40xf32>` + - Example: All elements initialized to `3.0`. + + - Shape: `tensor<1x1x40x40xf32>` + - Example: All elements initialized to `0.0`. + +- **Output Tensor**: + + - Shape: `tensor<1x32x40x40xf32>` + - Example: Each element in the output tensor will be the result of the softmax operations applied to the input tensor. The elements will sum to `1` along the last axis (softmax dimension). + +- **Softmax Operations**: + + 1. **Scaling**: + + - `%101 = tosa.reciprocal %100` computes the reciprocal of a constant tensor value `11.3137083`. + - `%102 = tosa.mul %arg0, %101` scales the input tensor `%arg0` by multiplying with the reciprocal. + - Example: Each element will be `3.0 / 11.3137083 ≈ 0.265`. + + 2. **Addition**: + - `%103 = tosa.add %102, %arg1` adds the second input tensor `%arg1` to the scaled tensor. + - Example: Each element will remain `0.265` as `%arg1` is all zeros. + 3. **Max Reduction**: + - `%104 = tosa.reduce_max %103` computes the maximum value along the last dimension (axis 3). + - Example: The maximum value along each `40x40` slice will be `0.265`. + 4. **Subtraction**: + - `%105 = tosa.sub %103, %104` subtracts the maximum value from each element to ensure numerical stability. + - Example: Each element will be `0.265 - 0.265 = 0.0`. + 5. **Exponentiation**: + - `%106 = tosa.exp %105` applies the exponential function to each element. + - Example: Each element will be `exp(0.0) = 1.0`. + 6. **Sum Reduction**: + - `%107 = tosa.reduce_sum %106` computes the sum of exponentials along the last dimension (axis 3). + - Example: The sum along each `40x40` slice will be `40` since each element is `1.0`. + 7. **Reciprocal of Sum**: + - `%108 = tosa.reciprocal %107` computes the reciprocal of the sum of exponentials. + - Example: Each element will be `1 / 40 = 0.025`. + 8. **Final Multiplication**: + - `%109 = tosa.mul %106, %108` multiplies the exponentials by the reciprocal of their sum to normalize them. + - Example: Each element will be `1.0 * 0.025 = 0.025`. 
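+
+For reference, the operator sequence above can be assembled into a small standalone TOSA function. The following is an illustrative sketch reconstructed from the steps listed above, not an excerpt of `next-softmax.mlir`; in particular, the rank-4 shape chosen for the `11.3137083` scale constant and the SSA names are assumptions.
+
+```mlir
+// Schematic softmax over the last axis, following the steps described above.
+func.func @softmax_sketch(%scores: tensor<1x32x40x40xf32>,
+                          %mask: tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32> {
+  // Scale by 1 / 11.3137083 (the constant shape is assumed here).
+  %cst    = "tosa.const"() <{value = dense<11.3137083> : tensor<1x1x1x1xf32>}> : () -> tensor<1x1x1x1xf32>
+  %rcp_c  = tosa.reciprocal %cst : (tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+  %scaled = tosa.mul %scores, %rcp_c {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x1x1x1xf32>) -> tensor<1x32x40x40xf32>
+  // Add the (broadcast) attention mask.
+  %biased = tosa.add %scaled, %mask : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
+  // Numerically stable softmax: subtract the row maximum before exponentiation.
+  %max    = tosa.reduce_max %biased {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+  %sub    = tosa.sub %biased, %max : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+  %exp    = tosa.exp %sub : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+  %sum    = tosa.reduce_sum %exp {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+  %rcp    = tosa.reciprocal %sum : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
+  %probs  = tosa.mul %exp, %rcp {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+  return %probs : tensor<1x32x40x40xf32>
+}
+```
+
+With all scores equal, every exponential is `1.0` and every row of 40 elements sums to `40`, which is why each normalized element above works out to `0.025`.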
+
+- **Timing:**
+
+  - elapsed time: 0.000925779
+
+#### `Self-Attention`
+
+    `make next-selfattention-run`
+
+- **Input Tensors**:
+  - `tensor<1x1x4096xf32>`
+    - Shape: `[1, 1, 4096]`
+    - Example: All elements initialized to `3.0`.
+  - `tensor<1x40x4096xf32>`
+    - Shape: `[1, 40, 4096]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<40xi64>`
+    - Shape: `[40]`
+    - Example: All elements initialized to `2`.
+  - `tensor<4096x4096xf32>`
+    - Shape: `[4096, 4096]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<4096x4096xf32>`
+    - Shape: `[4096, 4096]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<4096x4096xf32>`
+    - Shape: `[4096, 4096]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<1x1x2048x128xf32>`
+    - Shape: `[1, 1, 2048, 128]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<1x1x2048x128xf32>`
+    - Shape: `[1, 1, 2048, 128]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<4096x4096xf32>`
+    - Shape: `[4096, 4096]`
+    - Example: All elements initialized to `2.0`.
+  - `tensor<1x1x40x40xf32>`
+    - Shape: `[1, 1, 40, 40]`
+    - Example: All elements initialized to `0.0`.
+- **Output Tensor**:
+  - Shape: `tensor<1x40x4096xf32>`
+  - Example: Each element in the output tensor will be the result of the self-attention operations applied to the input tensors.
+- **Self-Attention Operations**:
+  1. **Compute Query, Key, and Value Matrices**:
+     - **Query**:
+       - `%41 = tosa.mul %arg0, %arg1` scales the input tensor.
+       - Example: Each element will be `3.0 * 1.0 = 3.0`.
+       - `%45` and `%46` involve transposition and reshaping.
+       - Example: Elements remain `3.0`.
+     - **Key**:
+       - `%50` and `%51` involve similar transposition and reshaping as Query.
+       - Example: Elements remain `3.0`.
+     - **Value**:
+       - `%55` and `%56` involve similar transposition and reshaping as Query.
+       - Example: Elements remain `3.0`.
+  2. **Apply Rotary Positional Encoding (RoPE) to Q and K Vectors**:
+     - **Query RoPE**:
+       - Transpose and reshape operations (`%57`, `%58`, `%59`).
+       - Example: Shape transformed to `1x32x40x128`.
+     - **Key RoPE**:
+       - Similar transpose and reshape operations (`%60`, `%61`, `%62`).
+     - **Value**:
+       - Similar transpose and reshape operations (`%63`, `%64`, `%65`); no rotary encoding is applied to the value tensor.
+  3. **Compute Softmax(Q, K) and the Self-Attention Output** (a schematic per-head TOSA version of this step is sketched after this list):
+     - **Attention Scores**:
+       - Slice extraction and generic operations compute the intermediate values (`%66` to `%79`).
+       - Multiplication of Q and K with the positional encoding applied (`%80` to `%83`).
+     - **Softmax**:
+       - `%84 = tosa.add %80, %83` sums the attention scores.
+       - The softmax function is applied over the attention scores.
+     - **Self-Attention Output**:
+       - The output is computed by multiplying the attention scores with the Value matrix (`%112` to `%116`).
+       - Transpose and reshape operations produce the final layout (`%117` to `%121`).
+       - Example: Each element in the output tensor is a weighted sum of values, with weights given by the softmax-scaled dot product of queries and keys.
+  4. **Final Matrix Multiplication and Addition**:
+     - `%125 = linalg.matmul` performs matrix multiplication on the reshaped tensor and the transposed weight tensor.
+     - `%127 = tosa.add %arg1, %126` adds the original input tensor `%arg1` to the result of the matrix multiplication.
+     - Example: The output tensor shape is `tensor<1x40x4096xf32>`, and each element will be the sum of the original input tensor elements and the matrix multiplication result.
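+
+To make the data flow in step 3 concrete, the following standalone TOSA sketch shows the scaled-dot-product core for a single attention head. It is illustrative only and is not an excerpt of `next-selfattention.mlir`: the per-head shapes (`1x40x128`), the per-head mask shape, and the SSA names are assumptions, and the RoPE application and the final output projection are omitted.
+
+```mlir
+// Schematic single-head attention: out = softmax(Q K^T / sqrt(128) + mask) V.
+func.func @attention_head_sketch(%q: tensor<1x40x128xf32>, %k: tensor<1x40x128xf32>,
+                                 %v: tensor<1x40x128xf32>, %mask: tensor<1x40x40xf32>) -> tensor<1x40x128xf32> {
+  // K^T for the batched matmul.
+  %perm   = "tosa.const"() <{value = dense<[0, 2, 1]> : tensor<3xi32>}> : () -> tensor<3xi32>
+  %kt     = tosa.transpose %k, %perm : (tensor<1x40x128xf32>, tensor<3xi32>) -> tensor<1x128x40xf32>
+  // Raw scores Q * K^T, then scale by 1 / sqrt(128) = 1 / 11.3137083.
+  %scores = tosa.matmul %q, %kt : (tensor<1x40x128xf32>, tensor<1x128x40xf32>) -> tensor<1x40x40xf32>
+  %scale  = "tosa.const"() <{value = dense<0.0883883476> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32>
+  %scaled = tosa.mul %scores, %scale {shift = 0 : i8} : (tensor<1x40x40xf32>, tensor<1x1x1xf32>) -> tensor<1x40x40xf32>
+  %masked = tosa.add %scaled, %mask : (tensor<1x40x40xf32>, tensor<1x40x40xf32>) -> tensor<1x40x40xf32>
+  // Numerically stable softmax along the last axis (same steps as the Softmax layer above).
+  %max    = tosa.reduce_max %masked {axis = 2 : i32} : (tensor<1x40x40xf32>) -> tensor<1x40x1xf32>
+  %sub    = tosa.sub %masked, %max : (tensor<1x40x40xf32>, tensor<1x40x1xf32>) -> tensor<1x40x40xf32>
+  %exp    = tosa.exp %sub : (tensor<1x40x40xf32>) -> tensor<1x40x40xf32>
+  %sum    = tosa.reduce_sum %exp {axis = 2 : i32} : (tensor<1x40x40xf32>) -> tensor<1x40x1xf32>
+  %rcp    = tosa.reciprocal %sum : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+  %probs  = tosa.mul %exp, %rcp {shift = 0 : i8} : (tensor<1x40x40xf32>, tensor<1x40x1xf32>) -> tensor<1x40x40xf32>
+  // Weighted sum of values.
+  %out    = tosa.matmul %probs, %v : (tensor<1x40x40xf32>, tensor<1x40x128xf32>) -> tensor<1x40x128xf32>
+  return %out : tensor<1x40x128xf32>
+}
+```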
+- **Timing:** + - elapsed time: 48.4356 \ No newline at end of file diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile index 443907d352..78f3937608 100644 --- a/examples/BuddyNext/makefile +++ b/examples/BuddyNext/makefile @@ -230,3 +230,434 @@ next-rope-run: -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-matmul-run: + @${MLIR_OPT} ./next-matmul.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + + +next-rmsnorm-run: + @${MLIR_OPT} ./next-rmsnorm.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-rsqrt-run: + @${MLIR_OPT} ./next-rsqrt.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + + +next-mul-run: + 
@${MLIR_OPT} ./next-mul.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-softmax-run: + @${MLIR_OPT} ./next-softmax.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-ffn-run: + @${MLIR_OPT} ./next-ffn.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-transpose-run: + @${MLIR_OPT} ./next-transpose.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + 
-finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-negate-run: + @${MLIR_OPT} ./next-negate.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-fpowi-run: + @${MLIR_OPT} ./next-fpowi.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reducesum-run: + @${MLIR_OPT} ./next-reducesum.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + 
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reciprocal-run: + @${MLIR_OPT} ./next-reciprocal.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-fc-run: + @${MLIR_OPT} ./next-fc.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-selfattention-run: + @${MLIR_OPT} ./next-selfattention.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} diff --git a/examples/BuddyNext/next-fc.mlir b/examples/BuddyNext/next-fc.mlir new file mode 100644 index 0000000000..89593d8cf6 --- /dev/null +++ b/examples/BuddyNext/next-fc.mlir @@ -0,0 +1,78 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ 
+// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 + +func.func @kernel_fc_layer(%arg0 : tensor<1x40x4096xf32>, %arg1 : tensor<4096x4096xf32>, %arg2 : tensor<4096x4096xf32>, %arg3 : tensor<1x40x4096xf32>) { +%t_start = call @rtclock() : () -> f64 + +%cst_0 = arith.constant dense<0.0> : tensor<40x4096xf32> +%cst_1 = arith.constant dense<0.0> : tensor<40x4096xf32> + +%41 = tosa.mul %arg0, %arg3 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> +%42 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> +%43 = tosa.transpose %arg1, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> +%44 = tosa.reshape %41 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> +%45 = linalg.matmul {cast = #linalg.type_fn} ins(%44, %43 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_0 : tensor<40x4096xf32>) -> tensor<40x4096xf32> +%46 = tosa.reshape %45 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + +%47 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> +%48 = tosa.transpose %arg2, %47 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> +%49 = tosa.reshape %41 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> +%50 = linalg.matmul {cast = #linalg.type_fn} ins(%49, %48 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1 : tensor<40x4096xf32>) -> tensor<40x4096xf32> +%51 = tosa.reshape %50 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + +%t_end = call @rtclock() : () -> f64 +%time = arith.subf %t_end, %t_start : f64 + +%tensor_unranked = tensor.cast %51 : tensor<1x40x4096xf32> to tensor<*xf32> + +call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () +vector.print %time : f64 + +return +} + +func.func @main() { +%input_tensor_1 = arith.constant dense<3.0> : tensor<1x40x4096xf32> +%input_tensor_2 = arith.constant dense<2.0> : tensor<4096x4096xf32> +%input_tensor_3 = arith.constant dense<1.0> : tensor<4096x4096xf32> +%input_tensor_4 = arith.constant dense<4.0> : tensor<1x40x4096xf32> + +call @kernel_fc_layer(%input_tensor_1, %input_tensor_2, %input_tensor_3, %input_tensor_4) : (tensor<1x40x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<1x40x4096xf32>) -> () + +return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-ffn.mlir 
b/examples/BuddyNext/next-ffn.mlir new file mode 100644 index 0000000000..f132f62d97 --- /dev/null +++ b/examples/BuddyNext/next-ffn.mlir @@ -0,0 +1,98 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +#map = affine_map<(d0, d1, d2) -> (d1)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map3 = affine_map<(d0, d1) -> (d0, d1)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map6 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)> +#map7 = affine_map<(d0, d1) -> (0, d0, d1)> + +func.func private @rtclock() -> f64 + +func.func @kernel_ffn(%arg0: tensor<1x40x4096xf32>, %arg9: tensor<4096xf32>, %arg10: tensor<11008x4096xf32>, %arg11: tensor<11008x4096xf32>, %arg12: tensor<4096x11008xf32>) { + %t_start = call @rtclock() : () -> f64 + + // FFN + %138 = tosa.reshape %arg9 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32> + %139 = tosa.mul %138, %arg0 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> + %140 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %141 = tosa.transpose %arg10, %140 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32> + %142 = tosa.reshape %139 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_24 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32> + %143 = linalg.matmul {cast = #linalg.type_fn} ins(%142, %141 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_24 : tensor<40x11008xf32>) -> tensor<40x11008xf32> + %144 = tosa.reshape %143 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32> + %145 = tosa.sigmoid %144 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32> + %146 = tosa.mul %144, %145 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32> + %147 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %148 = tosa.transpose %arg11, %147 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32> + %149 = tosa.reshape %139 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_25 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32> + %150 = linalg.matmul {cast = #linalg.type_fn} ins(%149, 
%148 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_25 : tensor<40x11008xf32>) -> tensor<40x11008xf32> + %151 = tosa.reshape %150 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32> + %152 = tosa.mul %146, %151 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32> + %153 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %154 = tosa.transpose %arg12, %153 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32> + %155 = tosa.reshape %152 {new_shape = array} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32> + %cst_26 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32> + %156 = linalg.matmul {cast = #linalg.type_fn} ins(%155, %154 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_26 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %157 = tosa.reshape %156 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + %158 = tosa.add %arg0, %157 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %158 : tensor<1x40x4096xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return +} + +func.func @main() { + %input_tensor = arith.constant dense<3.0> : tensor<1x40x4096xf32> + %weight1 = arith.constant dense<1.0> : tensor<4096xf32> + %weight2 = arith.constant dense<1.0> : tensor<11008x4096xf32> + %weight3 = arith.constant dense<2.0> : tensor<11008x4096xf32> + %weight4 = arith.constant dense<1.0> : tensor<4096x11008xf32> + + call @kernel_ffn(%input_tensor, %weight1, %weight2, %weight3, %weight4) : (tensor<1x40x4096xf32>, tensor<4096xf32>, tensor<11008x4096xf32>, tensor<11008x4096xf32>, tensor<4096x11008xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-fpowi.mlir b/examples/BuddyNext/next-fpowi.mlir new file mode 100644 index 0000000000..fca13fd2ed --- /dev/null +++ b/examples/BuddyNext/next-fpowi.mlir @@ -0,0 +1,70 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func 
@kernel_fpowi(%arg0: tensor<1x32x40x64xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Power operation + %c2_i32 = arith.constant 2 : i32 + %output_tensor = tensor.empty() : tensor<1x32x40x64xf32> + %result = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x32x40x64xf32>) outs(%output_tensor : tensor<1x32x40x64xf32>) { + ^bb0(%in: f32, %out: f32): + %0 = math.fpowi %in, %c2_i32 : f32, i32 + linalg.yield %0 : f32 + } -> tensor<1x32x40x64xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %result : tensor<1x32x40x64xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor = arith.constant dense<5.0> : tensor<1x32x40x64xf32> + + call @kernel_fpowi(%input_tensor) : (tensor<1x32x40x64xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr : tensor<*xf32>) +} \ No newline at end of file diff --git a/examples/BuddyNext/next-matmul.mlir b/examples/BuddyNext/next-matmul.mlir new file mode 100644 index 0000000000..a81e78385a --- /dev/null +++ b/examples/BuddyNext/next-matmul.mlir @@ -0,0 +1,63 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 + +func.func @kernel_matmul(%arg0 : tensor<40x4096xf32>, %arg1 : tensor<4096x4096xf32>, %arg2 : tensor<40x4096xf32>) { + %t_start = call @rtclock() : () -> f64 + + %matmul_result = linalg.matmul {cast = #linalg.type_fn} ins(%arg0, %arg1 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%arg2 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %matmul_result : tensor<40x4096xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return +} + +func.func @main() { + %input_tensor_1 = arith.constant dense<3.0> : tensor<40x4096xf32> + %input_tensor_2 = arith.constant dense<2.0> : tensor<4096x4096xf32> + %output_tensor = arith.constant dense<0.0> : 
tensor<40x4096xf32> + + call @kernel_matmul(%input_tensor_1, %input_tensor_2, %output_tensor) : (tensor<40x4096xf32>, tensor<4096x4096xf32>, tensor<40x4096xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-mul.mlir b/examples/BuddyNext/next-mul.mlir new file mode 100644 index 0000000000..8b2d5ae677 --- /dev/null +++ b/examples/BuddyNext/next-mul.mlir @@ -0,0 +1,65 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func @kernel_mul(%arg0: tensor<1xf32>, %arg1: tensor<1x40x1xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Perform the multiplication operation + %mul_result = tosa.mul %arg0, %arg1 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %mul_result : tensor<1x40x1xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor_1 = arith.constant dense<3.0> : tensor<1xf32> + %input_tensor_2 = arith.constant dense<2.0> : tensor<1x40x1xf32> + + call @kernel_mul(%input_tensor_1, %input_tensor_2) : (tensor<1xf32>, tensor<1x40x1xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr: tensor<*xf32>) +} \ No newline at end of file diff --git a/examples/BuddyNext/next-negate.mlir b/examples/BuddyNext/next-negate.mlir new file mode 100644 index 0000000000..d11c628ee6 --- /dev/null +++ b/examples/BuddyNext/next-negate.mlir @@ -0,0 +1,64 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: 
-expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func @kernel_negate(%arg0: tensor<1x32x40x64xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Negate operation + %negated = tosa.negate %arg0 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %negated : tensor<1x32x40x64xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor = arith.constant dense<1.0> : tensor<1x32x40x64xf32> + + call @kernel_negate(%input_tensor) : (tensor<1x32x40x64xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr : tensor<*xf32>) +} \ No newline at end of file diff --git a/examples/BuddyNext/next-reciprocal.mlir b/examples/BuddyNext/next-reciprocal.mlir new file mode 100644 index 0000000000..98469786cd --- /dev/null +++ b/examples/BuddyNext/next-reciprocal.mlir @@ -0,0 +1,64 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func @kernel_reciprocal(%arg0: tensor<1x10xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Reciprocal operation + %result = tosa.reciprocal %arg0 : (tensor<1x10xf32>) -> tensor<1x10xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %result : tensor<1x10xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor = 
"tosa.const"() {value = dense<[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]]> : tensor<1x10xf32>} : () -> tensor<1x10xf32> + + call @kernel_reciprocal(%input_tensor) : (tensor<1x10xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr : tensor<*xf32>) +} \ No newline at end of file diff --git a/examples/BuddyNext/next-reducesum.mlir b/examples/BuddyNext/next-reducesum.mlir new file mode 100644 index 0000000000..825aeae113 --- /dev/null +++ b/examples/BuddyNext/next-reducesum.mlir @@ -0,0 +1,64 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func @kernel_reduce_sum(%arg0: tensor<1x40x4096xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Reduce sum operation + %result = tosa.reduce_sum %arg0 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %result : tensor<1x40x1xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor = arith.constant dense<1.0> : tensor<1x40x4096xf32> + + call @kernel_reduce_sum(%input_tensor) : (tensor<1x40x4096xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr : tensor<*xf32>) +} \ No newline at end of file diff --git a/examples/BuddyNext/next-rmsnorm.mlir b/examples/BuddyNext/next-rmsnorm.mlir new file mode 100644 index 0000000000..7cb4e2c844 --- /dev/null +++ b/examples/BuddyNext/next-rmsnorm.mlir @@ -0,0 +1,85 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: 
-memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +#map = affine_map<(d0, d1, d2) -> (d1)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map3 = affine_map<(d0, d1) -> (d0, d1)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map6 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)> +#map7 = affine_map<(d0, d1) -> (0, d0, d1)> + +func.func private @rtclock() -> f64 + +func.func @kernel_rmsnorm(%arg0: tensor<1x40x4096xf32>) { + %t_start = call @rtclock() : () -> f64 + + // RMSNorm operations + %30 = tensor.empty() : tensor<1x40x4096xf32> + %c2_i32 = arith.constant 2 : i32 + %31 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x40x4096xf32>) outs(%30 : tensor<1x40x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %4175 = math.fpowi %in, %c2_i32 : f32, i32 + linalg.yield %4175 : f32 + } -> tensor<1x40x4096xf32> + %32 = tosa.reduce_sum %31 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32> + %33 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1x1xf32>}> : () -> tensor<1x1xf32> + %34 = tosa.reciprocal %33 : (tensor<1x1xf32>) -> tensor<1x1xf32> + %35 = tosa.mul %34, %32 {shift = 0 : i8} : (tensor<1x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32> + %36 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32> + %37 = tosa.add %35, %36 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32> + %38 = tosa.rsqrt %37 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32> + %39 = tosa.mul %arg0, %38 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %39 : tensor<1x40x4096xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return +} + +func.func @main() { + %input_tensor_1 = arith.constant dense<3.0> : tensor<1x40x4096xf32> + + call @kernel_rmsnorm(%input_tensor_1) : (tensor<1x40x4096xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-rsqrt.mlir b/examples/BuddyNext/next-rsqrt.mlir new file mode 100644 index 0000000000..6e8d806834 --- /dev/null +++ b/examples/BuddyNext/next-rsqrt.mlir @@ -0,0 +1,62 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: 
-buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 + +func.func @kernel_rsqrt(%arg0 : tensor<1x40x1xf32>) { + %t_start = call @rtclock() : () -> f64 + + // rsqrt operation + %rsqrt_result = tosa.rsqrt %arg0 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %rsqrt_result : tensor<1x40x1xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return +} + +func.func @main() { + %input_tensor = arith.constant dense<3.0> : tensor<1x40x1xf32> + + call @kernel_rsqrt(%input_tensor) : (tensor<1x40x1xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-selfattention.mlir b/examples/BuddyNext/next-selfattention.mlir new file mode 100644 index 0000000000..7976a1b96a --- /dev/null +++ b/examples/BuddyNext/next-selfattention.mlir @@ -0,0 +1,226 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +#map = affine_map<(d0, d1, d2) -> (d1)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map3 = affine_map<(d0, d1) -> (d0, d1)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map6 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)> +#map7 = affine_map<(d0, d1) -> (0, d0, d1)> +func.func private @rtclock() -> f64 + +func.func @kernel_self_attention(%arg0 : tensor<1x1x4096xf32>, %arg1 : 
tensor<1x40x4096xf32>, %arg2 : tensor<40xi64>, %arg3 : tensor<4096x4096xf32>, %arg4 : tensor<4096x4096xf32>, %arg5 : tensor<4096x4096xf32>, %arg6 : tensor<1x1x2048x128xf32>, %arg7 : tensor<1x1x2048x128xf32>, %arg8 : tensor<4096x4096xf32>, %arg9 : tensor<1x1x40x40xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Compute the Query, Key, and Value matrices + %41 = tosa.mul %arg0, %arg1 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> + + %42 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %43 = tosa.transpose %arg3, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %44 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_6 = arith.constant dense<0.0> : tensor<40x4096xf32> + %45 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%44, %43 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_6 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %46 = tosa.reshape %45 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %47 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %48 = tosa.transpose %arg4, %47 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %49 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_7 = arith.constant dense<0.0> : tensor<40x4096xf32> + %50 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%49, %48 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_7 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %51 = tosa.reshape %50 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %52 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %53 = tosa.transpose %arg5, %52 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %54 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_8 = arith.constant dense<0.0> : tensor<40x4096xf32> + %55 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%54, %53 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_8 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %56 = tosa.reshape %55 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + // Apply RoPE (rotary position embedding) to the Q and K vectors + %57 = tosa.reshape %46 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32> + %58 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> + %59 = tosa.transpose %57, %58 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32> + + %60 = tosa.reshape %51 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32> + %61 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> + %62 = tosa.transpose %60, %61 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32> + + %63 = tosa.reshape %56 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32> + %64 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> + %65 = tosa.transpose %63, %64 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32> + + // Compute Softmax(Q, K) and the self-attention output + %extracted_slice_9 = tensor.extract_slice %arg6[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_10 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_11 
= tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32> + %extracted_slice_12 = tensor.extract_slice %arg7[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_13 = tensor.extract_slice %extracted_slice_12[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_14 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32> + %66 = tensor.empty() : tensor<1x40x128xf32> + %67 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_11 : tensor<1x1x40x128xf32>) outs(%66 : tensor<1x40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x40x128xf32> + %68 = tensor.empty() : tensor<40x128xf32> + %69 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%67 : tensor<1x40x128xf32>) outs(%68 : tensor<40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<40x128xf32> + %70 = tensor.empty() : tensor<1x40x128xf32> + %71 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_14 : tensor<1x1x40x128xf32>) outs(%70 : tensor<1x40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x40x128xf32> + %72 = tensor.empty() : tensor<40x128xf32> + %73 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%71 : tensor<1x40x128xf32>) outs(%72 : tensor<40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<40x128xf32> + + %74 = tensor.empty() : tensor<1x40x128xf32> + %arg2_converted = tosa.reshape %arg2 {new_shape = array<i64: 1, 40>} : (tensor<40xi64>) -> tensor<1x40xi64> + %75 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg2_converted : tensor<1x40xi64>) outs(%74 : tensor<1x40x128xf32>) { + ^bb0(%in: i64, %out: f32): + %4175 = arith.index_cast %in : i64 to index + %4176 = linalg.index 1 : index + %extracted = tensor.extract %69[%4175, %4176] : tensor<40x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x40x128xf32> + %76 = tosa.reshape %75 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32> + + %77 = tensor.empty() : tensor<1x40x128xf32> + %78 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg2_converted : tensor<1x40xi64>) outs(%77 : tensor<1x40x128xf32>) { + ^bb0(%in: i64, %out: f32): + %4175 = arith.index_cast %in : i64 to index + %4176 = linalg.index 1 : index + %extracted = tensor.extract %73[%4175, %4176] : tensor<40x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x40x128xf32> + %79 = tosa.reshape %78 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32> + + %80 = tosa.mul %59, %76 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32> + %extracted_slice_15 = tensor.extract_slice %59[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %extracted_slice_16 = tensor.extract_slice %59[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %81 = tosa.negate %extracted_slice_16 : (tensor<1x32x40x64xf32>) -> 
tensor<1x32x40x64xf32> + %82 = tensor.empty() : tensor<1x32x40x128xf32> + %inserted_slice = tensor.insert_slice %81 into %82[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32> + %inserted_slice_17 = tensor.insert_slice %extracted_slice_15 into %inserted_slice[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32> + %83 = tosa.mul %inserted_slice_17, %79 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32> + %84 = tosa.add %80, %83 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32> + %85 = tosa.mul %62, %76 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32> + %extracted_slice_18 = tensor.extract_slice %62[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %extracted_slice_19 = tensor.extract_slice %62[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %86 = tosa.negate %extracted_slice_19 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32> + %87 = tensor.empty() : tensor<1x32x40x128xf32> + %inserted_slice_20 = tensor.insert_slice %86 into %87[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32> + %inserted_slice_21 = tensor.insert_slice %extracted_slice_18 into %inserted_slice_20[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32> + + // Compute Softmax(QK/sqrt(d_k)) + %88 = tosa.mul %inserted_slice_21, %79 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32> + %89 = tosa.add %85, %88 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32> + %90 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> + %91 = tosa.transpose %89, %90 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32> + %92 = "tosa.const"() <{value = dense<0.0> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32> + %93 = tosa.add %84, %92 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32> + %94 = tosa.reshape %93 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32> + %95 = "tosa.const"() <{value = dense<0.0> : tensor<1x32x128x40xf32>}> : () -> tensor<1x32x128x40xf32> + %96 = tosa.add %91, %95 : (tensor<1x32x128x40xf32>, tensor<1x32x128x40xf32>) -> tensor<1x32x128x40xf32> + %97 = tosa.reshape %96 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32> + %98 = tosa.matmul %94, %97 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32> + %99 = tosa.reshape %98 {new_shape = array<i64: 1, 32, 40, 40>} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32> + %100 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32> + %101 = tosa.reciprocal %100 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %102 = tosa.mul %99, %101 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %103 = tosa.add %102, %arg9 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32> + %104 = tosa.reduce_max %103 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32> + %105 = tosa.sub %103, %104 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32> + %106 = tosa.exp %105 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %107 = tosa.reduce_sum %106 {axis = 3 : 
i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32> + %108 = tosa.reciprocal %107 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32> + %109 = tosa.mul %106, %108 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32> + + // Compute the self-attention output + %110 = "tosa.const"() <{value = dense<0.0> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32> + %111 = tosa.add %109, %110 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %112 = tosa.reshape %111 {new_shape = array<i64: 32, 40, 40>} : (tensor<1x32x40x40xf32>) -> tensor<32x40x40xf32> + %113 = "tosa.const"() <{value = dense<0.0> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32> + %114 = tosa.add %65, %113 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32> + %115 = tosa.reshape %114 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32> + %116 = tosa.matmul %112, %115 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32> + + %117 = tosa.reshape %116 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32> + %118 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> + %119 = tosa.transpose %117, %118 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32> + %120 = tosa.identity %119 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32> + %121 = tosa.reshape %120 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32> + + %122 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %123 = tosa.transpose %arg8, %122 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %124 = tosa.reshape %121 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_22 = arith.constant dense<0.0> : tensor<40x4096xf32> + %125 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%124, %123 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_22 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %126 = tosa.reshape %125 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + %127 = tosa.add %arg1, %126 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %127 : tensor<1x40x4096xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor_0 = arith.constant dense<3.0> : tensor<1x1x4096xf32> + %input_tensor_1 = arith.constant dense<1.0> : tensor<1x40x4096xf32> + %input_tensor_2 = arith.constant dense<2> : tensor<40xi64> + %input_tensor_3 = arith.constant dense<1.0> : tensor<4096x4096xf32> + %input_tensor_4 = arith.constant dense<1.0> : tensor<4096x4096xf32> + %input_tensor_5 = arith.constant dense<1.0> : tensor<4096x4096xf32> + %input_tensor_6 = arith.constant dense<1.0> : tensor<1x1x2048x128xf32> + %input_tensor_7 = arith.constant dense<1.0> : tensor<1x1x2048x128xf32> + %input_tensor_8 = arith.constant dense<2.0> : tensor<4096x4096xf32> + %input_tensor_9 = arith.constant dense<0.0> : tensor<1x1x40x40xf32> + + call @kernel_self_attention(%input_tensor_0, %input_tensor_1, %input_tensor_2, %input_tensor_3, %input_tensor_4, %input_tensor_5, %input_tensor_6, %input_tensor_7, %input_tensor_8, %input_tensor_9) : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>, tensor<40xi64>, tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>, 
tensor<1x1x2048x128xf32>, tensor<1x1x2048x128xf32>, tensor<4096x4096xf32>, tensor<1x1x40x40xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-softmax.mlir b/examples/BuddyNext/next-softmax.mlir new file mode 100644 index 0000000000..778320ef1a --- /dev/null +++ b/examples/BuddyNext/next-softmax.mlir @@ -0,0 +1,72 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 + +func.func @kernel_softmax(%arg0 : tensor<1x32x40x40xf32>, %arg1 : tensor<1x1x40x40xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Softmax operations + %100 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32> + %101 = tosa.reciprocal %100 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %102 = tosa.mul %arg0, %101 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %103 = tosa.add %102, %arg1 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32> + %104 = tosa.reduce_max %103 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32> + %105 = tosa.sub %103, %104 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32> + %106 = tosa.exp %105 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %107 = tosa.reduce_sum %106 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32> + %108 = tosa.reciprocal %107 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32> + %109 = tosa.mul %106, %108 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %109 : tensor<1x32x40x40xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return +} + +func.func @main() { + %input_tensor_1 = arith.constant dense<3.0> : tensor<1x32x40x40xf32> + %input_tensor_2 = arith.constant dense<0.0> : tensor<1x1x40x40xf32> + + call @kernel_softmax(%input_tensor_1, %input_tensor_2) : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at 
end of file diff --git a/examples/BuddyNext/next-transpose.mlir b/examples/BuddyNext/next-transpose.mlir new file mode 100644 index 0000000000..54c3443c66 --- /dev/null +++ b/examples/BuddyNext/next-transpose.mlir @@ -0,0 +1,65 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func @kernel_transpose(%arg0: tensor<1x40x32x128xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Transpose operation + %perm = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi32>} : () -> tensor<4xi32> + %transposed = tosa.transpose %arg0, %perm : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %transposed : tensor<1x32x40x128xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor = arith.constant dense<1.0> : tensor<1x40x32x128xf32> + + call @kernel_transpose(%input_tensor) : (tensor<1x40x32x128xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr : tensor<*xf32>) +} \ No newline at end of file From 3033e0fb7297c60e8405494e5bac8adc4fcfdc75 Mon Sep 17 00:00:00 2001 From: hayden-brown Date: Thu, 1 Aug 2024 13:12:50 +0800 Subject: [PATCH 2/2] Llama2 model Operator/Layer level instance extraction --- examples/BuddyNext/next-fc.mlir | 5 +++++ examples/BuddyNext/next-ffn.mlir | 18 ++++++++++++------ examples/BuddyNext/next-fpowi.mlir | 5 +++++ examples/BuddyNext/next-matmul.mlir | 4 ++++ examples/BuddyNext/next-mul.mlir | 4 ++++ examples/BuddyNext/next-negate.mlir | 7 ++++++- examples/BuddyNext/next-reciprocal.mlir | 6 +++++- examples/BuddyNext/next-reducesum.mlir | 5 +++++ examples/BuddyNext/next-rmsnorm.mlir | 5 +++++ examples/BuddyNext/next-rsqrt.mlir | 5 +++++ examples/BuddyNext/next-selfattention.mlir | 21 +++++++++++++-------- examples/BuddyNext/next-softmax.mlir | 5 +++++ examples/BuddyNext/next-transpose.mlir | 5 +++++ 13 files changed, 79 insertions(+), 16 deletions(-) diff --git a/examples/BuddyNext/next-fc.mlir b/examples/BuddyNext/next-fc.mlir index 89593d8cf6..3798024300 100644 --- a/examples/BuddyNext/next-fc.mlir +++ b/examples/BuddyNext/next-fc.mlir 
@@ -58,6 +58,11 @@ func.func @kernel_fc_layer(%arg0 : tensor<1x40x4096xf32>, %arg1 : tensor<4096x40 %tensor_unranked = tensor.cast %51 : tensor<1x40x4096xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 4096] strides = [163840, 4096, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [49152{{(, 49152)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-ffn.mlir b/examples/BuddyNext/next-ffn.mlir index f132f62d97..725e98db19 100644 --- a/examples/BuddyNext/next-ffn.mlir +++ b/examples/BuddyNext/next-ffn.mlir @@ -77,6 +77,11 @@ func.func @kernel_ffn(%arg0: tensor<1x40x4096xf32>, %arg9: tensor<4096xf32>, %ar %tensor_unranked = tensor.cast %158 : tensor<1x40x4096xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 4096] strides = [163840, 4096, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [461655{{(, 461655)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 @@ -84,15 +89,16 @@ func.func @kernel_ffn(%arg0: tensor<1x40x4096xf32>, %arg9: tensor<4096xf32>, %ar } func.func @main() { - %input_tensor = arith.constant dense<3.0> : tensor<1x40x4096xf32> - %weight1 = arith.constant dense<1.0> : tensor<4096xf32> - %weight2 = arith.constant dense<1.0> : tensor<11008x4096xf32> - %weight3 = arith.constant dense<2.0> : tensor<11008x4096xf32> - %weight4 = arith.constant dense<1.0> : tensor<4096x11008xf32> + %input_tensor = arith.constant dense<0.5> : tensor<1x40x4096xf32> + %weight1 = arith.constant dense<0.1> : tensor<4096xf32> + %weight2 = arith.constant dense<0.1> : tensor<11008x4096xf32> + %weight3 = arith.constant dense<0.1> : tensor<11008x4096xf32> + %weight4 = arith.constant dense<0.1> : tensor<4096x11008xf32> + // Print timings. 
call @kernel_ffn(%input_tensor, %weight1, %weight2, %weight3, %weight4) : (tensor<1x40x4096xf32>, tensor<4096xf32>, tensor<11008x4096xf32>, tensor<11008x4096xf32>, tensor<4096x11008xf32>) -> () return } -func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file +func.func private @printMemrefF32(%ptr : tensor<*xf32>) diff --git a/examples/BuddyNext/next-fpowi.mlir b/examples/BuddyNext/next-fpowi.mlir index fca13fd2ed..79274c58e7 100644 --- a/examples/BuddyNext/next-fpowi.mlir +++ b/examples/BuddyNext/next-fpowi.mlir @@ -52,6 +52,11 @@ module { %tensor_unranked = tensor.cast %result : tensor<1x32x40x64xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 64] strides = [81920, 2560, 64, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [25{{(, 25)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-matmul.mlir b/examples/BuddyNext/next-matmul.mlir index a81e78385a..72217cd121 100644 --- a/examples/BuddyNext/next-matmul.mlir +++ b/examples/BuddyNext/next-matmul.mlir @@ -44,6 +44,10 @@ func.func @kernel_matmul(%arg0 : tensor<40x4096xf32>, %arg1 : tensor<4096x4096xf %tensor_unranked = tensor.cast %matmul_result : tensor<40x4096xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [40, 4096] strides = [4096, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [24576{{(, 24576)*}}] + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-mul.mlir b/examples/BuddyNext/next-mul.mlir index 8b2d5ae677..b1c3d03987 100644 --- a/examples/BuddyNext/next-mul.mlir +++ b/examples/BuddyNext/next-mul.mlir @@ -46,6 +46,10 @@ module { %tensor_unranked = tensor.cast %mul_result : tensor<1x40x1xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 1] strides = [40, 1, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [6{{(, 6)*}}] + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-negate.mlir b/examples/BuddyNext/next-negate.mlir index d11c628ee6..e05805c85d 100644 --- a/examples/BuddyNext/next-negate.mlir +++ b/examples/BuddyNext/next-negate.mlir @@ -30,7 +30,7 @@ // RUN: | mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ -// RUN: | FileCheck %s +// RUN: | FileCheck %s module { func.func private @rtclock() -> f64 @@ -46,6 +46,11 @@ module { %tensor_unranked = tensor.cast %negated : tensor<1x32x40x64xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 64] strides = [81920, 2560, 64, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [-1{{(, -1)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-reciprocal.mlir b/examples/BuddyNext/next-reciprocal.mlir index 98469786cd..e664f56bc8 100644 --- a/examples/BuddyNext/next-reciprocal.mlir +++ b/examples/BuddyNext/next-reciprocal.mlir @@ -46,6 +46,10 @@ module { %tensor_unranked = tensor.cast %result : tensor<1x10xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [1, 10] strides = [10, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [0.5{{(, 
0.5)*}}] + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 @@ -53,7 +57,7 @@ module { } func.func @main() { - %input_tensor = "tosa.const"() {value = dense<[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]]> : tensor<1x10xf32>} : () -> tensor<1x10xf32> + %input_tensor = "tosa.const"() {value = dense<2.0> : tensor<1x10xf32>} : () -> tensor<1x10xf32> call @kernel_reciprocal(%input_tensor) : (tensor<1x10xf32>) -> () diff --git a/examples/BuddyNext/next-reducesum.mlir b/examples/BuddyNext/next-reducesum.mlir index 825aeae113..92aca0ceac 100644 --- a/examples/BuddyNext/next-reducesum.mlir +++ b/examples/BuddyNext/next-reducesum.mlir @@ -46,6 +46,11 @@ module { %tensor_unranked = tensor.cast %result : tensor<1x40x1xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 1] strides = [40, 1, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [4096{{(, 4096)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-rmsnorm.mlir b/examples/BuddyNext/next-rmsnorm.mlir index 7cb4e2c844..f4b21891f0 100644 --- a/examples/BuddyNext/next-rmsnorm.mlir +++ b/examples/BuddyNext/next-rmsnorm.mlir @@ -68,6 +68,11 @@ func.func @kernel_rmsnorm(%arg0: tensor<1x40x4096xf32>) { %tensor_unranked = tensor.cast %39 : tensor<1x40x4096xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 4096] strides = [163840, 4096, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [0.999999{{(, 0.999999)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-rsqrt.mlir b/examples/BuddyNext/next-rsqrt.mlir index 6e8d806834..468f9ec961 100644 --- a/examples/BuddyNext/next-rsqrt.mlir +++ b/examples/BuddyNext/next-rsqrt.mlir @@ -45,6 +45,11 @@ func.func @kernel_rsqrt(%arg0 : tensor<1x40x1xf32>) { %tensor_unranked = tensor.cast %rsqrt_result : tensor<1x40x1xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 1] strides = [40, 1, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [0.57735{{(, 0.57735)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-selfattention.mlir b/examples/BuddyNext/next-selfattention.mlir index 7976a1b96a..aeb6cf09ea 100644 --- a/examples/BuddyNext/next-selfattention.mlir +++ b/examples/BuddyNext/next-selfattention.mlir @@ -200,6 +200,11 @@ func.func @kernel_self_attention(%arg0 : tensor<1x1x4096xf32>, %arg1 : tensor<1x %tensor_unranked = tensor.cast %127 : tensor<1x40x4096xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 4096] strides = [163840, 4096, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [83883.8{{(, 83883.8)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 @@ -207,15 +212,15 @@ func.func @kernel_self_attention(%arg0 : tensor<1x1x4096xf32>, %arg1 : tensor<1x } func.func @main() { - %input_tensor_0 = arith.constant dense<3.0> : tensor<1x1x4096xf32> - %input_tensor_1 = arith.constant dense<1.0> : tensor<1x40x4096xf32> - %input_tensor_2 = arith.constant dense<2> : tensor<40xi64> - %input_tensor_3 = arith.constant dense<1.0> : tensor<4096x4096xf32> - %input_tensor_4 = arith.constant dense<1.0> : tensor<4096x4096xf32> - %input_tensor_5 = 
arith.constant dense<1.0> : tensor<4096x4096xf32> + %input_tensor_0 = arith.constant dense<1.0> : tensor<1x1x4096xf32> + %input_tensor_1 = arith.constant dense<0.1> : tensor<1x40x4096xf32> + %input_tensor_2 = arith.constant dense<1> : tensor<40xi64> + %input_tensor_3 = arith.constant dense<0.5> : tensor<4096x4096xf32> + %input_tensor_4 = arith.constant dense<0.1> : tensor<4096x4096xf32> + %input_tensor_5 = arith.constant dense<0.1> : tensor<4096x4096xf32> %input_tensor_6 = arith.constant dense<1.0> : tensor<1x1x2048x128xf32> - %input_tensor_7 = arith.constant dense<1.0> : tensor<1x1x2048x128xf32> - %input_tensor_8 = arith.constant dense<2.0> : tensor<4096x4096xf32> + %input_tensor_7 = arith.constant dense<0.1> : tensor<1x1x2048x128xf32> + %input_tensor_8 = arith.constant dense<0.5> : tensor<4096x4096xf32> %input_tensor_9 = arith.constant dense<0.0> : tensor<1x1x40x40xf32> call @kernel_self_attention(%input_tensor_0, %input_tensor_1, %input_tensor_2, %input_tensor_3, %input_tensor_4, %input_tensor_5, %input_tensor_6, %input_tensor_7, %input_tensor_8, %input_tensor_9) : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>, tensor<40xi64>, tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<1x1x2048x128xf32>, tensor<1x1x2048x128xf32>, tensor<4096x4096xf32>, tensor<1x1x40x40xf32>) -> () diff --git a/examples/BuddyNext/next-softmax.mlir b/examples/BuddyNext/next-softmax.mlir index 778320ef1a..98b2e37cdb 100644 --- a/examples/BuddyNext/next-softmax.mlir +++ b/examples/BuddyNext/next-softmax.mlir @@ -54,6 +54,11 @@ func.func @kernel_softmax(%arg0 : tensor<1x32x40x40xf32>, %arg1 : tensor<1x1x40x %tensor_unranked = tensor.cast %109 : tensor<1x32x40x40xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 40] strides = [51200, 1600, 40, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [0.025{{(, 0.025)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-transpose.mlir b/examples/BuddyNext/next-transpose.mlir index 54c3443c66..63e942668e 100644 --- a/examples/BuddyNext/next-transpose.mlir +++ b/examples/BuddyNext/next-transpose.mlir @@ -47,6 +47,11 @@ module { %tensor_unranked = tensor.cast %transposed : tensor<1x32x40x128xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 128] strides = [163840, 5120, 128, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [1{{(, 1)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64