From 01f83f583ffba8e569e287f09cfd1857259de5ea Mon Sep 17 00:00:00 2001 From: hayden-brown Date: Wed, 31 Jul 2024 23:55:19 +0800 Subject: [PATCH 1/2] Llama2 model Operator/Layer level instance extraction --- examples/BuddyNext/README.md | 408 +++++++++++++++++++ examples/BuddyNext/makefile | 431 +++++++++++++++++++++ examples/BuddyNext/next-fc.mlir | 78 ++++ examples/BuddyNext/next-ffn.mlir | 98 +++++ examples/BuddyNext/next-fpowi.mlir | 70 ++++ examples/BuddyNext/next-matmul.mlir | 63 +++ examples/BuddyNext/next-mul.mlir | 65 ++++ examples/BuddyNext/next-negate.mlir | 64 +++ examples/BuddyNext/next-reciprocal.mlir | 64 +++ examples/BuddyNext/next-reducesum.mlir | 64 +++ examples/BuddyNext/next-rmsnorm.mlir | 85 ++++ examples/BuddyNext/next-rsqrt.mlir | 62 +++ examples/BuddyNext/next-selfattention.mlir | 226 +++++++++++ examples/BuddyNext/next-softmax.mlir | 72 ++++ examples/BuddyNext/next-transpose.mlir | 65 ++++ 15 files changed, 1915 insertions(+) create mode 100644 examples/BuddyNext/README.md create mode 100644 examples/BuddyNext/next-fc.mlir create mode 100644 examples/BuddyNext/next-ffn.mlir create mode 100644 examples/BuddyNext/next-fpowi.mlir create mode 100644 examples/BuddyNext/next-matmul.mlir create mode 100644 examples/BuddyNext/next-mul.mlir create mode 100644 examples/BuddyNext/next-negate.mlir create mode 100644 examples/BuddyNext/next-reciprocal.mlir create mode 100644 examples/BuddyNext/next-reducesum.mlir create mode 100644 examples/BuddyNext/next-rmsnorm.mlir create mode 100644 examples/BuddyNext/next-rsqrt.mlir create mode 100644 examples/BuddyNext/next-selfattention.mlir create mode 100644 examples/BuddyNext/next-softmax.mlir create mode 100644 examples/BuddyNext/next-transpose.mlir diff --git a/examples/BuddyNext/README.md b/examples/BuddyNext/README.md new file mode 100644 index 0000000000..423c8b73d9 --- /dev/null +++ b/examples/BuddyNext/README.md @@ -0,0 +1,408 @@ +# Llama 2 Operator/Layer level instance extraction + +--- + +## Operator Level: + +### **TOSA Dialect** + +#### `tosa.mul` + + make next-mul-run + +- **Input Tensors**: + - Shape: `tensor<1xf32>` + - Example: `[3.0]` + + - Shape: `tensor<1x40x1xf32>` + - Example: `[[[2.0], [2.0], ..., [2.0]]]` (40 elements) +- **Output Tensor**: + - Shape: `tensor<1x40x1xf32>` + - Example: All elements will be `6.0` after the multiplication operation. +- **Multiplication Operation**: + - The `tosa.mul` operation is applied to the input tensors `%arg0` and `%arg1`, performing an element-wise multiplication. +- **Timing:** + - elapsed time: 0.000380993 + +#### `tosa.negate` + + make next-negate-run + +- **Input Tensor**: + - Shape: `tensor<1x32x40x64xf32>` + - Example: All elements initialized to `1.0`. +- **Output Tensor**: + - Shape: `tensor<1x32x40x64xf32>` + - Example: All elements will be `-1.0` after the negate operation. +- **Negate Operation**: + - The `tosa.negate` operation is applied to the input tensor `%arg0`, which negates each element in the tensor. +- **Timing:** + - elapsed time: 0.000413179 + +#### `tosa.reciprocal` + + make next-reciprocal-run + +- **Input Tensor**: + - Shape: `tensor<1x10xf32>` + - Example: All elements initialized to `[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]`. +- **Output Tensor**: + - Shape: `tensor<1x10xf32>` + - Example: All elements will be the reciprocal of the input tensor elements, i.e., `[1.0, 0.5, 0.333, 0.25, 0.2, 0.166, 0.142, 0.125, 0.111, 0.1]`. 
+- **Reciprocal Operation**: + - The `tosa.reciprocal` operation is applied to the input tensor `%arg0`, which computes the reciprocal (1/x) of each element in the tensor. +- **Timing:** + - elapsed time: 0.000286102 + +#### `tosa.reduce_sum` + + make next-reducesum-run + +- **Input Tensor**: + - Shape: `tensor<1x40x4096xf32>` + - Example: All elements initialized to `1.0`. +- **Output Tensor**: + - Shape: `tensor<1x40x1xf32>` + - Example: Each element in the output tensor is the sum of 4096 elements from the corresponding dimension of the input tensor, which will be `4096.0` for each element. +- **Reduce Sum Operation**: + - The `tosa.reduce_sum` operation is applied to the input tensor `%arg0`, summing elements along the `axis=2` dimension. This reduces the shape of the tensor from `[1, 40, 4096]` to `[1, 40, 1]`. +- **Timing:** + - elapsed time: 0.000262976 + +#### `tosa.rsqrt` + + make next-rsqrt-run + +- **Input Tensor**: + - Shape: `tensor<1x40x1xf32>` + - Example: All elements initialized to `3.0`. +- **Output Tensor**: + - Shape: `tensor<1x40x1xf32>` + - Example: Each element in the output tensor will be the reciprocal of the square root of the corresponding element in the input tensor, which will be approximately `0.57735` for each element. +- **Rsqrt Operation**: + - The `tosa.rsqrt` operation is applied to the input tensor `%arg0`, which computes the reciprocal of the square root of each element in the tensor. +- **Timing:** + - elapsed time: 3.09944e-06 + +#### `tosa.transpose` + + make next-transpose-run + +- **Input Tensor**: + - Shape: `tensor<1x40x32x128xf32>` + - Example: All elements initialized to `1.0`. +- **Output Tensor**: + - Shape: `tensor<1x32x40x128xf32>` + - Example: The tensor after transposing will have the elements permuted according to the permutation vector `[0, 2, 1, 3]`. Given that all elements are initialized to `1.0`, the values remain `1.0` but the shape is permuted. +- **Transpose Operation**: + - The `tosa.transpose` operation is applied to the input tensor `%arg0` with the permutation vector `%perm`, which rearranges the dimensions of the input tensor according to `[0, 2, 1, 3]`.- The permutation `[0, 2, 1, 3]` means: + - The first dimension remains the same. + - The second dimension (40) is swapped with the third dimension (32). + - The fourth dimension (128) remains the same. + - Therefore, the input tensor shape `[1, 40, 32, 128]` is transposed to `[1, 32, 40, 128]`. +- **Timing:** + - elapsed time: 0.000138044 + +### **Math Dialect** + +#### `math.fpowi` + + make next-fpowi-run + +- **Input Tensor**: + - Shape: `tensor<1x32x40x64xf32>` + - Example: All elements initialized to `5.0`. +- **Output Tensor**: + - Shape: `tensor<1x32x40x64xf32>` + - Example: Each element in the output tensor will be the value of the corresponding element in the input tensor raised to the power of `2`, i.e., `25.0` for each element. +- **Power Operation**: + - The `math.fpowi` operation is applied to each element in the input tensor `%arg0`, raising it to the power of `2`. + - For example, if an element in the input tensor is `5.0`, the corresponding element in the output tensor will be `5.0^2 = 25.0`. +- **Timing:** + - elapsed time: 8.29697e-05 + +### **Linalg Dialect** + +#### `linalg.matmul` + +make next-matmul-run + +- **Input Tensors**: + - Shape: `tensor<40x4096xf32>` + - Example: All elements initialized to `3.0`. + + - Shape: `tensor<4096x4096xf32>` + - Example: All elements initialized to `2.0`. 
+- **Output Tensor**:
+  - Shape: `tensor<40x4096xf32>`
+  - Example: Each element in the output tensor will be the result of the matrix multiplication of the input tensors. Given the initialization, each element will be `3.0 * 2.0 * 4096 = 24576.0`.
+- **Matrix Multiplication Operation**:
+  - The `linalg.matmul` operation is applied to the input tensors `%arg0` and `%arg1`, performing matrix multiplication.
+  - The output tensor `%arg2` is the result of the matrix multiplication, where each element is calculated as the sum of the element-wise products of the rows of the first matrix and the columns of the second matrix.
+- **Timing:**
+  - elapsed time: 7.42794
+
+---
+
+## Layer Level
+
+#### `Fully Connected Layer`
+
+    `make next-fc-run`
+
+- **Input Tensors**:
+  - Shape: `tensor<1x40x4096xf32>`
+  - Example: All elements initialized to `3.0`.
+
+  - Shape: `tensor<4096x4096xf32>`
+  - Example: All elements initialized to `2.0`.
+
+  - Shape: `tensor<4096x4096xf32>`
+  - Example: All elements initialized to `1.0`.
+
+  - Shape: `tensor<1x40x4096xf32>`
+  - Example: All elements initialized to `4.0`.
+- **Output Tensor**:
+  - Shape: `tensor<1x40x4096xf32>`
+  - Example: The exact values will depend on the computations performed during the fully connected layer operations, which include multiplication, transposition, and reshaping.
+- **Fully Connected Layer Operations**:
+  1. **Multiplication**:
+     - `%41 = tosa.mul %arg0, %arg3` multiplies the elements of `%arg0` and `%arg3` element-wise.
+     - Example: The result tensor will have elements initialized to `3.0 * 4.0 = 12.0`.
+  2. **Transpose**:
+     - `%43 = tosa.transpose %arg1, %42` transposes the tensor `%arg1` according to the permutation `[1, 0]`.
+     - Example: The tensor shape remains `[4096x4096]`.
+  3. **Reshape**:
+     - `%44 = tosa.reshape %41` reshapes the tensor from `tensor<1x40x4096xf32>` to `tensor<40x4096xf32>`.
+  4. **Matrix Multiplication**:
+     - `%45 = linalg.matmul` performs matrix multiplication on the reshaped tensor and the transposed tensor.
+     - Example: Each element of the resulting `tensor<40x4096xf32>` will be `12.0 * 2.0 * 4096 = 98304.0`.
+     - The result is reshaped back to `tensor<1x40x4096xf32>`.
+  5. **Second Transpose and Reshape**:
+     - Similar transpose and reshape operations are performed on `%arg2` and the result tensor `%41`.
+  6. **Second Matrix Multiplication**:
+     - `%50 = linalg.matmul` performs matrix multiplication on the reshaped tensors, and the result is reshaped back to `tensor<1x40x4096xf32>`.
+     - Example: Each element of the resulting `tensor<40x4096xf32>` will be `12.0 * 1.0 * 4096 = 49152.0`.
+     - The final output tensor will have the shape `tensor<1x40x4096xf32>` with each element being `49152.0`.
+- **Timing:**
+  - elapsed time: 10.8429
+
+#### `Feed Forward Network`
+
+    `make next-ffn-run`
+
+- **Input Tensors**:
+  - Shape: `tensor<1x40x4096xf32>`
+  - Example: All elements initialized to `3.0`.
+
+  - Shape: `tensor<4096xf32>`
+  - Example: All elements initialized to `1.0`.
+
+  - Shape: `tensor<11008x4096xf32>`
+  - Example: All elements initialized to `1.0`.
+
+  - Shape: `tensor<11008x4096xf32>`
+  - Example: All elements initialized to `2.0`.
+
+  - Shape: `tensor<4096x11008xf32>`
+  - Example: All elements initialized to `1.0`.
+- **Output Tensor**:
+  - Shape: `tensor<1x40x4096xf32>`
+- **Feed Forward Network Operations**:
+  1.
**Multiplication**: + - `%138 = tosa.reshape %arg9 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>` + - `%139 = tosa.mul %138, %arg0 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>` + - Example: The result tensor will have elements initialized to `1.0 * 3.0 = 3.0`. + 2. **Transpose**: + - `%141 = tosa.transpose %arg10, %140 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>` + - Example: The tensor shape remains `[4096x11008]`. + 3. **Reshape**: + - `%142 = tosa.reshape %139 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>` + 4. **Matrix Multiplication**: + - `%143 = linalg.matmul {cast = #linalg.type_fn} ins(%142, %141 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_24 : tensor<40x11008xf32>) -> tensor<40x11008xf32>` + - Example: Each element of the resulting `tensor<40x11008xf32>` will be the sum of the products of corresponding elements from the input tensor and the transposed weight tensor, resulting in a tensor with elements calculated as `3.0 * 1.0 * 4096 = 12288.0`. + 5. **Reshape**: + - `%144 = tosa.reshape %143 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>` + 6. **Sigmoid and Multiplication**: + - `%145 = tosa.sigmoid %144 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>` + - Example: Each element of the resulting tensor will be the sigmoid of `12288.0`, which is very close to `1.0` because the sigmoid function asymptotically approaches `1` for large positive inputs. + - `%146 = tosa.mul %144, %145 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>` + - Example: Each element of the resulting tensor will be `12288.0 * 1.0 = 12288.0`. + 7. **Second Transpose and Reshape**: + - Similar transpose and reshape operations are performed on `%arg11` and the result tensor `%146`. + 8. **Second Matrix Multiplication**: + - `%150 = linalg.matmul {cast = #linalg.type_fn} ins(%149, %148 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_25 : tensor<40x11008xf32>) -> tensor<40x11008xf32>` + - Example: Each element of the resulting tensor will be `3.0 * 2.0 * 4096 = 24576.0`. + - The result is reshaped back to `tensor<1x40x11008xf32>`. + 9. **Final Multiplication and Matrix Multiplication**: + - `%152 = tosa.mul %146, %151 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>` + - Example: Each element of the resulting tensor will be `12288.0 * 24576.0 = 301989888.0`. + - `%156 = linalg.matmul {cast = #linalg.type_fn} ins(%155, %154 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_26 : tensor<40x4096xf32>) -> tensor<40x4096xf32>` + - Example: Each element of the resulting tensor will be the sum of products of elements from the tensor of `301989888.0` and the weight tensor, resulting in very large values. + 10. **Addition**: + - `%158 = tosa.add %arg0, %157 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>` + - The final output tensor will be the addition of the original input tensor `%arg0` and the resulting tensor from the previous computations. +- **Timing:** + - elapsed time: 56.0974 + +#### `RMSNorm` + + `make next-rmsnorm-run` + +- **Input Tensor**: + - Shape: `tensor<1x40x4096xf32>` + - Example: All elements initialized to `3.0`. +- **Output Tensor**: + - Shape: `tensor<1x40x4096xf32>` + - Example: Each element in the output tensor will be the result of the RMSNorm operations applied to the input tensor. +- **RMSNorm Operations**: + 1. 
**Square Elements**: + - `%31 = linalg.generic` squares each element in the input tensor `%arg0`. + - Example: Each element will be `3.0^2 = 9.0`. + 2. **Reduce Sum**: + - `%32 = tosa.reduce_sum %31 {axis = 2 : i32}` sums the squared elements along the last dimension. + - Example: Each element in the resulting tensor will be the sum of `4096` squared elements, `9.0 * 4096 = 36864.0`. + 3. **Reciprocal and Multiplication**: + - `%34 = tosa.reciprocal %33` computes the reciprocal of a constant tensor value `4096.0`. + - `%35 = tosa.mul %34, %32` multiplies the reciprocal with the sum of squares. + - Example: Each element will be `1/4096 * 36864.0 = 9.0`. + 4. **Add Small Constant**: + - `%37 = tosa.add %35, %36` adds a small constant `1e-5` to the result. + - Example: Each element will be `9.0 + 1e-5`. + 5. **Reciprocal Square Root**: + - `%38 = tosa.rsqrt %37` computes the reciprocal square root of the result. + - Example: Each element will be approximately `1 / sqrt(9.0 + 1e-5) ≈ 0.333333`. + 6. **Final Multiplication**: + - `%39 = tosa.mul %arg0, %38` multiplies the original input tensor `%arg0` by the reciprocal square root. + - Example: Each element will be `3.0 * 0.333333 = 0.999999`. +- **Timing:** + - elapsed time: 0.000798941 + +#### `Softmax` + + `make next-softmax-run` + +- **Input Tensors**: + + - Shape: `tensor<1x32x40x40xf32>` + - Example: All elements initialized to `3.0`. + + - Shape: `tensor<1x1x40x40xf32>` + - Example: All elements initialized to `0.0`. + +- **Output Tensor**: + + - Shape: `tensor<1x32x40x40xf32>` + - Example: Each element in the output tensor will be the result of the softmax operations applied to the input tensor. The elements will sum to `1` along the last axis (softmax dimension). + +- **Softmax Operations**: + + 1. **Scaling**: + + - `%101 = tosa.reciprocal %100` computes the reciprocal of a constant tensor value `11.3137083`. + - `%102 = tosa.mul %arg0, %101` scales the input tensor `%arg0` by multiplying with the reciprocal. + - Example: Each element will be `3.0 / 11.3137083 ≈ 0.265`. + + 2. **Addition**: + - `%103 = tosa.add %102, %arg1` adds the second input tensor `%arg1` to the scaled tensor. + - Example: Each element will remain `0.265` as `%arg1` is all zeros. + 3. **Max Reduction**: + - `%104 = tosa.reduce_max %103` computes the maximum value along the last dimension (axis 3). + - Example: The maximum value along each `40x40` slice will be `0.265`. + 4. **Subtraction**: + - `%105 = tosa.sub %103, %104` subtracts the maximum value from each element to ensure numerical stability. + - Example: Each element will be `0.265 - 0.265 = 0.0`. + 5. **Exponentiation**: + - `%106 = tosa.exp %105` applies the exponential function to each element. + - Example: Each element will be `exp(0.0) = 1.0`. + 6. **Sum Reduction**: + - `%107 = tosa.reduce_sum %106` computes the sum of exponentials along the last dimension (axis 3). + - Example: The sum along each `40x40` slice will be `40` since each element is `1.0`. + 7. **Reciprocal of Sum**: + - `%108 = tosa.reciprocal %107` computes the reciprocal of the sum of exponentials. + - Example: Each element will be `1 / 40 = 0.025`. + 8. **Final Multiplication**: + - `%109 = tosa.mul %106, %108` multiplies the exponentials by the reciprocal of their sum to normalize them. + - Example: Each element will be `1.0 * 0.025 = 0.025`. 
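+
+For reference, the operator sequence above can be assembled into a small standalone TOSA function. The following is an illustrative sketch reconstructed from the steps listed above, not an excerpt of `next-softmax.mlir`; in particular, the rank-4 shape chosen for the `11.3137083` scale constant and the SSA names are assumptions.
+
+```mlir
+// Schematic softmax over the last axis, following the steps described above.
+func.func @softmax_sketch(%scores: tensor<1x32x40x40xf32>,
+                          %mask: tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32> {
+  // Scale by 1 / 11.3137083 (the constant shape is assumed here).
+  %cst    = "tosa.const"() <{value = dense<11.3137083> : tensor<1x1x1x1xf32>}> : () -> tensor<1x1x1x1xf32>
+  %rcp_c  = tosa.reciprocal %cst : (tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+  %scaled = tosa.mul %scores, %rcp_c {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x1x1x1xf32>) -> tensor<1x32x40x40xf32>
+  // Add the (broadcast) attention mask.
+  %biased = tosa.add %scaled, %mask : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
+  // Numerically stable softmax: subtract the row maximum before exponentiation.
+  %max    = tosa.reduce_max %biased {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+  %sub    = tosa.sub %biased, %max : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+  %exp    = tosa.exp %sub : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+  %sum    = tosa.reduce_sum %exp {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+  %rcp    = tosa.reciprocal %sum : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
+  %probs  = tosa.mul %exp, %rcp {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+  return %probs : tensor<1x32x40x40xf32>
+}
+```
+
+With all scores equal, every exponential is `1.0` and every row of 40 elements sums to `40`, which is why each normalized element above works out to `0.025`.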
+
+- **Timing:**
+
+  - elapsed time: 0.000925779
+
+#### `Self-Attention`
+
+    `make next-selfattention-run`
+
+- **Input Tensors**:
+  - `tensor<1x1x4096xf32>`
+    - Shape: `[1, 1, 4096]`
+    - Example: All elements initialized to `3.0`.
+  - `tensor<1x40x4096xf32>`
+    - Shape: `[1, 40, 4096]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<40xi64>`
+    - Shape: `[40]`
+    - Example: All elements initialized to `2`.
+  - `tensor<4096x4096xf32>`
+    - Shape: `[4096, 4096]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<4096x4096xf32>`
+    - Shape: `[4096, 4096]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<4096x4096xf32>`
+    - Shape: `[4096, 4096]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<1x1x2048x128xf32>`
+    - Shape: `[1, 1, 2048, 128]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<1x1x2048x128xf32>`
+    - Shape: `[1, 1, 2048, 128]`
+    - Example: All elements initialized to `1.0`.
+  - `tensor<4096x4096xf32>`
+    - Shape: `[4096, 4096]`
+    - Example: All elements initialized to `2.0`.
+  - `tensor<1x1x40x40xf32>`
+    - Shape: `[1, 1, 40, 40]`
+    - Example: All elements initialized to `0.0`.
+- **Output Tensor**:
+  - Shape: `tensor<1x40x4096xf32>`
+  - Example: Each element in the output tensor will be the result of the self-attention operations applied to the input tensors.
+- **Self-Attention Operations**:
+  1. **Compute Query, Key, and Value Matrices**:
+     - **Query**:
+       - `%41 = tosa.mul %arg0, %arg1` scales the input tensor.
+       - Example: Each element will be `3.0 * 1.0 = 3.0`.
+       - `%45` and `%46` involve transposition and reshaping.
+       - Example: Elements remain `3.0`.
+     - **Key**:
+       - `%50` and `%51` involve similar transposition and reshaping as Query.
+       - Example: Elements remain `3.0`.
+     - **Value**:
+       - `%55` and `%56` involve similar transposition and reshaping as Query.
+       - Example: Elements remain `3.0`.
+  2. **Apply Rotary Positional Encoding (RoPE) to Q and K Vectors**:
+     - **Query RoPE**:
+       - Transpose and reshape operations (`%57`, `%58`, `%59`).
+       - Example: Shape transformed to `1x32x40x128`.
+     - **Key RoPE**:
+       - Similar transpose and reshape operations (`%60`, `%61`, `%62`).
+     - **Value**:
+       - Similar transpose and reshape operations (`%63`, `%64`, `%65`); no rotary encoding is applied to the value tensor.
+  3. **Compute Softmax(Q, K) and the Self-Attention Output** (a schematic per-head TOSA version of this step is sketched after this list):
+     - **Attention Scores**:
+       - Slice extraction and generic operations compute the intermediate values (`%66` to `%79`).
+       - Multiplication of Q and K with the positional encoding applied (`%80` to `%83`).
+     - **Softmax**:
+       - `%84 = tosa.add %80, %83` sums the attention scores.
+       - The softmax function is applied over the attention scores.
+     - **Self-Attention Output**:
+       - The output is computed by multiplying the attention scores with the Value matrix (`%112` to `%116`).
+       - Transpose and reshape operations produce the final layout (`%117` to `%121`).
+       - Example: Each element in the output tensor is a weighted sum of values, with weights given by the softmax-scaled dot product of queries and keys.
+  4. **Final Matrix Multiplication and Addition**:
+     - `%125 = linalg.matmul` performs matrix multiplication on the reshaped tensor and the transposed weight tensor.
+     - `%127 = tosa.add %arg1, %126` adds the original input tensor `%arg1` to the result of the matrix multiplication.
+     - Example: The output tensor shape is `tensor<1x40x4096xf32>`, and each element will be the sum of the original input tensor elements and the matrix multiplication result.
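+
+To make the data flow in step 3 concrete, the following standalone TOSA sketch shows the scaled-dot-product core for a single attention head. It is illustrative only and is not an excerpt of `next-selfattention.mlir`: the per-head shapes (`1x40x128`), the per-head mask shape, and the SSA names are assumptions, and the RoPE application and the final output projection are omitted.
+
+```mlir
+// Schematic single-head attention: out = softmax(Q K^T / sqrt(128) + mask) V.
+func.func @attention_head_sketch(%q: tensor<1x40x128xf32>, %k: tensor<1x40x128xf32>,
+                                 %v: tensor<1x40x128xf32>, %mask: tensor<1x40x40xf32>) -> tensor<1x40x128xf32> {
+  // K^T for the batched matmul.
+  %perm   = "tosa.const"() <{value = dense<[0, 2, 1]> : tensor<3xi32>}> : () -> tensor<3xi32>
+  %kt     = tosa.transpose %k, %perm : (tensor<1x40x128xf32>, tensor<3xi32>) -> tensor<1x128x40xf32>
+  // Raw scores Q * K^T, then scale by 1 / sqrt(128) = 1 / 11.3137083.
+  %scores = tosa.matmul %q, %kt : (tensor<1x40x128xf32>, tensor<1x128x40xf32>) -> tensor<1x40x40xf32>
+  %scale  = "tosa.const"() <{value = dense<0.0883883476> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32>
+  %scaled = tosa.mul %scores, %scale {shift = 0 : i8} : (tensor<1x40x40xf32>, tensor<1x1x1xf32>) -> tensor<1x40x40xf32>
+  %masked = tosa.add %scaled, %mask : (tensor<1x40x40xf32>, tensor<1x40x40xf32>) -> tensor<1x40x40xf32>
+  // Numerically stable softmax along the last axis (same steps as the Softmax layer above).
+  %max    = tosa.reduce_max %masked {axis = 2 : i32} : (tensor<1x40x40xf32>) -> tensor<1x40x1xf32>
+  %sub    = tosa.sub %masked, %max : (tensor<1x40x40xf32>, tensor<1x40x1xf32>) -> tensor<1x40x40xf32>
+  %exp    = tosa.exp %sub : (tensor<1x40x40xf32>) -> tensor<1x40x40xf32>
+  %sum    = tosa.reduce_sum %exp {axis = 2 : i32} : (tensor<1x40x40xf32>) -> tensor<1x40x1xf32>
+  %rcp    = tosa.reciprocal %sum : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+  %probs  = tosa.mul %exp, %rcp {shift = 0 : i8} : (tensor<1x40x40xf32>, tensor<1x40x1xf32>) -> tensor<1x40x40xf32>
+  // Weighted sum of values.
+  %out    = tosa.matmul %probs, %v : (tensor<1x40x40xf32>, tensor<1x40x128xf32>) -> tensor<1x40x128xf32>
+  return %out : tensor<1x40x128xf32>
+}
+```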
+- **Timing:** + - elapsed time: 48.4356 \ No newline at end of file diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile index 443907d352..78f3937608 100644 --- a/examples/BuddyNext/makefile +++ b/examples/BuddyNext/makefile @@ -230,3 +230,434 @@ next-rope-run: -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-matmul-run: + @${MLIR_OPT} ./next-matmul.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + + +next-rmsnorm-run: + @${MLIR_OPT} ./next-rmsnorm.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-rsqrt-run: + @${MLIR_OPT} ./next-rsqrt.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + + +next-mul-run: + 
@${MLIR_OPT} ./next-mul.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-softmax-run: + @${MLIR_OPT} ./next-softmax.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-ffn-run: + @${MLIR_OPT} ./next-ffn.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-transpose-run: + @${MLIR_OPT} ./next-transpose.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + 
-finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-negate-run: + @${MLIR_OPT} ./next-negate.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-fpowi-run: + @${MLIR_OPT} ./next-fpowi.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reducesum-run: + @${MLIR_OPT} ./next-reducesum.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + 
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-reciprocal-run: + @${MLIR_OPT} ./next-reciprocal.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-fc-run: + @${MLIR_OPT} ./next-fc.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-selfattention-run: + @${MLIR_OPT} ./next-selfattention.mlir \ + -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} diff --git a/examples/BuddyNext/next-fc.mlir b/examples/BuddyNext/next-fc.mlir new file mode 100644 index 0000000000..89593d8cf6 --- /dev/null +++ b/examples/BuddyNext/next-fc.mlir @@ -0,0 +1,78 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ 
+// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 + +func.func @kernel_fc_layer(%arg0 : tensor<1x40x4096xf32>, %arg1 : tensor<4096x4096xf32>, %arg2 : tensor<4096x4096xf32>, %arg3 : tensor<1x40x4096xf32>) { +%t_start = call @rtclock() : () -> f64 + +%cst_0 = arith.constant dense<0.0> : tensor<40x4096xf32> +%cst_1 = arith.constant dense<0.0> : tensor<40x4096xf32> + +%41 = tosa.mul %arg0, %arg3 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> +%42 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> +%43 = tosa.transpose %arg1, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> +%44 = tosa.reshape %41 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> +%45 = linalg.matmul {cast = #linalg.type_fn} ins(%44, %43 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_0 : tensor<40x4096xf32>) -> tensor<40x4096xf32> +%46 = tosa.reshape %45 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + +%47 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> +%48 = tosa.transpose %arg2, %47 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> +%49 = tosa.reshape %41 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> +%50 = linalg.matmul {cast = #linalg.type_fn} ins(%49, %48 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_1 : tensor<40x4096xf32>) -> tensor<40x4096xf32> +%51 = tosa.reshape %50 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + +%t_end = call @rtclock() : () -> f64 +%time = arith.subf %t_end, %t_start : f64 + +%tensor_unranked = tensor.cast %51 : tensor<1x40x4096xf32> to tensor<*xf32> + +call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () +vector.print %time : f64 + +return +} + +func.func @main() { +%input_tensor_1 = arith.constant dense<3.0> : tensor<1x40x4096xf32> +%input_tensor_2 = arith.constant dense<2.0> : tensor<4096x4096xf32> +%input_tensor_3 = arith.constant dense<1.0> : tensor<4096x4096xf32> +%input_tensor_4 = arith.constant dense<4.0> : tensor<1x40x4096xf32> + +call @kernel_fc_layer(%input_tensor_1, %input_tensor_2, %input_tensor_3, %input_tensor_4) : (tensor<1x40x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<1x40x4096xf32>) -> () + +return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-ffn.mlir 
b/examples/BuddyNext/next-ffn.mlir new file mode 100644 index 0000000000..f132f62d97 --- /dev/null +++ b/examples/BuddyNext/next-ffn.mlir @@ -0,0 +1,98 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +#map = affine_map<(d0, d1, d2) -> (d1)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map3 = affine_map<(d0, d1) -> (d0, d1)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map6 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)> +#map7 = affine_map<(d0, d1) -> (0, d0, d1)> + +func.func private @rtclock() -> f64 + +func.func @kernel_ffn(%arg0: tensor<1x40x4096xf32>, %arg9: tensor<4096xf32>, %arg10: tensor<11008x4096xf32>, %arg11: tensor<11008x4096xf32>, %arg12: tensor<4096x11008xf32>) { + %t_start = call @rtclock() : () -> f64 + + // FFN + %138 = tosa.reshape %arg9 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32> + %139 = tosa.mul %138, %arg0 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> + %140 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %141 = tosa.transpose %arg10, %140 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32> + %142 = tosa.reshape %139 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_24 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32> + %143 = linalg.matmul {cast = #linalg.type_fn} ins(%142, %141 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_24 : tensor<40x11008xf32>) -> tensor<40x11008xf32> + %144 = tosa.reshape %143 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32> + %145 = tosa.sigmoid %144 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32> + %146 = tosa.mul %144, %145 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32> + %147 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %148 = tosa.transpose %arg11, %147 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32> + %149 = tosa.reshape %139 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_25 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32> + %150 = linalg.matmul {cast = #linalg.type_fn} ins(%149, 
%148 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_25 : tensor<40x11008xf32>) -> tensor<40x11008xf32> + %151 = tosa.reshape %150 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32> + %152 = tosa.mul %146, %151 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32> + %153 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %154 = tosa.transpose %arg12, %153 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32> + %155 = tosa.reshape %152 {new_shape = array} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32> + %cst_26 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32> + %156 = linalg.matmul {cast = #linalg.type_fn} ins(%155, %154 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_26 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %157 = tosa.reshape %156 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + %158 = tosa.add %arg0, %157 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %158 : tensor<1x40x4096xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return +} + +func.func @main() { + %input_tensor = arith.constant dense<3.0> : tensor<1x40x4096xf32> + %weight1 = arith.constant dense<1.0> : tensor<4096xf32> + %weight2 = arith.constant dense<1.0> : tensor<11008x4096xf32> + %weight3 = arith.constant dense<2.0> : tensor<11008x4096xf32> + %weight4 = arith.constant dense<1.0> : tensor<4096x11008xf32> + + call @kernel_ffn(%input_tensor, %weight1, %weight2, %weight3, %weight4) : (tensor<1x40x4096xf32>, tensor<4096xf32>, tensor<11008x4096xf32>, tensor<11008x4096xf32>, tensor<4096x11008xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-fpowi.mlir b/examples/BuddyNext/next-fpowi.mlir new file mode 100644 index 0000000000..fca13fd2ed --- /dev/null +++ b/examples/BuddyNext/next-fpowi.mlir @@ -0,0 +1,70 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func 
@kernel_fpowi(%arg0: tensor<1x32x40x64xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Power operation + %c2_i32 = arith.constant 2 : i32 + %output_tensor = tensor.empty() : tensor<1x32x40x64xf32> + %result = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x32x40x64xf32>) outs(%output_tensor : tensor<1x32x40x64xf32>) { + ^bb0(%in: f32, %out: f32): + %0 = math.fpowi %in, %c2_i32 : f32, i32 + linalg.yield %0 : f32 + } -> tensor<1x32x40x64xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %result : tensor<1x32x40x64xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor = arith.constant dense<5.0> : tensor<1x32x40x64xf32> + + call @kernel_fpowi(%input_tensor) : (tensor<1x32x40x64xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr : tensor<*xf32>) +} \ No newline at end of file diff --git a/examples/BuddyNext/next-matmul.mlir b/examples/BuddyNext/next-matmul.mlir new file mode 100644 index 0000000000..a81e78385a --- /dev/null +++ b/examples/BuddyNext/next-matmul.mlir @@ -0,0 +1,63 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 + +func.func @kernel_matmul(%arg0 : tensor<40x4096xf32>, %arg1 : tensor<4096x4096xf32>, %arg2 : tensor<40x4096xf32>) { + %t_start = call @rtclock() : () -> f64 + + %matmul_result = linalg.matmul {cast = #linalg.type_fn} ins(%arg0, %arg1 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%arg2 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %matmul_result : tensor<40x4096xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return +} + +func.func @main() { + %input_tensor_1 = arith.constant dense<3.0> : tensor<40x4096xf32> + %input_tensor_2 = arith.constant dense<2.0> : tensor<4096x4096xf32> + %output_tensor = arith.constant dense<0.0> : 
tensor<40x4096xf32> + + call @kernel_matmul(%input_tensor_1, %input_tensor_2, %output_tensor) : (tensor<40x4096xf32>, tensor<4096x4096xf32>, tensor<40x4096xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-mul.mlir b/examples/BuddyNext/next-mul.mlir new file mode 100644 index 0000000000..8b2d5ae677 --- /dev/null +++ b/examples/BuddyNext/next-mul.mlir @@ -0,0 +1,65 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func @kernel_mul(%arg0: tensor<1xf32>, %arg1: tensor<1x40x1xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Perform the multiplication operation + %mul_result = tosa.mul %arg0, %arg1 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %mul_result : tensor<1x40x1xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor_1 = arith.constant dense<3.0> : tensor<1xf32> + %input_tensor_2 = arith.constant dense<2.0> : tensor<1x40x1xf32> + + call @kernel_mul(%input_tensor_1, %input_tensor_2) : (tensor<1xf32>, tensor<1x40x1xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr: tensor<*xf32>) +} \ No newline at end of file diff --git a/examples/BuddyNext/next-negate.mlir b/examples/BuddyNext/next-negate.mlir new file mode 100644 index 0000000000..d11c628ee6 --- /dev/null +++ b/examples/BuddyNext/next-negate.mlir @@ -0,0 +1,64 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: 
-expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func @kernel_negate(%arg0: tensor<1x32x40x64xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Negate operation + %negated = tosa.negate %arg0 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %negated : tensor<1x32x40x64xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor = arith.constant dense<1.0> : tensor<1x32x40x64xf32> + + call @kernel_negate(%input_tensor) : (tensor<1x32x40x64xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr : tensor<*xf32>) +} \ No newline at end of file diff --git a/examples/BuddyNext/next-reciprocal.mlir b/examples/BuddyNext/next-reciprocal.mlir new file mode 100644 index 0000000000..98469786cd --- /dev/null +++ b/examples/BuddyNext/next-reciprocal.mlir @@ -0,0 +1,64 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func @kernel_reciprocal(%arg0: tensor<1x10xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Reciprocal operation + %result = tosa.reciprocal %arg0 : (tensor<1x10xf32>) -> tensor<1x10xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %result : tensor<1x10xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor = 
"tosa.const"() {value = dense<[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]]> : tensor<1x10xf32>} : () -> tensor<1x10xf32> + + call @kernel_reciprocal(%input_tensor) : (tensor<1x10xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr : tensor<*xf32>) +} \ No newline at end of file diff --git a/examples/BuddyNext/next-reducesum.mlir b/examples/BuddyNext/next-reducesum.mlir new file mode 100644 index 0000000000..825aeae113 --- /dev/null +++ b/examples/BuddyNext/next-reducesum.mlir @@ -0,0 +1,64 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func @kernel_reduce_sum(%arg0: tensor<1x40x4096xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Reduce sum operation + %result = tosa.reduce_sum %arg0 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %result : tensor<1x40x1xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor = arith.constant dense<1.0> : tensor<1x40x4096xf32> + + call @kernel_reduce_sum(%input_tensor) : (tensor<1x40x4096xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr : tensor<*xf32>) +} \ No newline at end of file diff --git a/examples/BuddyNext/next-rmsnorm.mlir b/examples/BuddyNext/next-rmsnorm.mlir new file mode 100644 index 0000000000..7cb4e2c844 --- /dev/null +++ b/examples/BuddyNext/next-rmsnorm.mlir @@ -0,0 +1,85 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: 
-memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +#map = affine_map<(d0, d1, d2) -> (d1)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map3 = affine_map<(d0, d1) -> (d0, d1)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map6 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)> +#map7 = affine_map<(d0, d1) -> (0, d0, d1)> + +func.func private @rtclock() -> f64 + +func.func @kernel_rmsnorm(%arg0: tensor<1x40x4096xf32>) { + %t_start = call @rtclock() : () -> f64 + + // RMSNorm operations + %30 = tensor.empty() : tensor<1x40x4096xf32> + %c2_i32 = arith.constant 2 : i32 + %31 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x40x4096xf32>) outs(%30 : tensor<1x40x4096xf32>) { + ^bb0(%in: f32, %out: f32): + %4175 = math.fpowi %in, %c2_i32 : f32, i32 + linalg.yield %4175 : f32 + } -> tensor<1x40x4096xf32> + %32 = tosa.reduce_sum %31 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32> + %33 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1x1xf32>}> : () -> tensor<1x1xf32> + %34 = tosa.reciprocal %33 : (tensor<1x1xf32>) -> tensor<1x1xf32> + %35 = tosa.mul %34, %32 {shift = 0 : i8} : (tensor<1x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32> + %36 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32> + %37 = tosa.add %35, %36 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32> + %38 = tosa.rsqrt %37 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32> + %39 = tosa.mul %arg0, %38 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %39 : tensor<1x40x4096xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return +} + +func.func @main() { + %input_tensor_1 = arith.constant dense<3.0> : tensor<1x40x4096xf32> + + call @kernel_rmsnorm(%input_tensor_1) : (tensor<1x40x4096xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-rsqrt.mlir b/examples/BuddyNext/next-rsqrt.mlir new file mode 100644 index 0000000000..6e8d806834 --- /dev/null +++ b/examples/BuddyNext/next-rsqrt.mlir @@ -0,0 +1,62 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: 
-buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 + +func.func @kernel_rsqrt(%arg0 : tensor<1x40x1xf32>) { + %t_start = call @rtclock() : () -> f64 + + // rsqrt operation + %rsqrt_result = tosa.rsqrt %arg0 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %rsqrt_result : tensor<1x40x1xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return +} + +func.func @main() { + %input_tensor = arith.constant dense<3.0> : tensor<1x40x1xf32> + + call @kernel_rsqrt(%input_tensor) : (tensor<1x40x1xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-selfattention.mlir b/examples/BuddyNext/next-selfattention.mlir new file mode 100644 index 0000000000..7976a1b96a --- /dev/null +++ b/examples/BuddyNext/next-selfattention.mlir @@ -0,0 +1,226 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +#map = affine_map<(d0, d1, d2) -> (d1)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map3 = affine_map<(d0, d1) -> (d0, d1)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map6 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)> +#map7 = affine_map<(d0, d1) -> (0, d0, d1)> +func.func private @rtclock() -> f64 + +func.func @kernel_self_attention(%arg0 : tensor<1x1x4096xf32>, %arg1 : 
tensor<1x40x4096xf32>, %arg2 : tensor<40xi64>, %arg3 : tensor<4096x4096xf32>, %arg4 : tensor<4096x4096xf32>, %arg5 : tensor<4096x4096xf32>, %arg6 : tensor<1x1x2048x128xf32>, %arg7 : tensor<1x1x2048x128xf32>, %arg8 : tensor<4096x4096xf32>, %arg9 : tensor<1x1x40x40xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Compute the Query, Key, and Value matrices + %41 = tosa.mul %arg0, %arg1 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> + + %42 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %43 = tosa.transpose %arg3, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %44 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_6 = arith.constant dense<0.0> : tensor<40x4096xf32> + %45 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%44, %43 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_6 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %46 = tosa.reshape %45 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %47 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %48 = tosa.transpose %arg4, %47 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %49 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_7 = arith.constant dense<0.0> : tensor<40x4096xf32> + %50 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%49, %48 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_7 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %51 = tosa.reshape %50 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + %52 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %53 = tosa.transpose %arg5, %52 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %54 = tosa.reshape %41 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_8 = arith.constant dense<0.0> : tensor<40x4096xf32> + %55 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%54, %53 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_8 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %56 = tosa.reshape %55 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + + // Apply RoPE (rotary position embedding) to the Q and K vectors + %57 = tosa.reshape %46 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32> + %58 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> + %59 = tosa.transpose %57, %58 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32> + + %60 = tosa.reshape %51 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32> + %61 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> + %62 = tosa.transpose %60, %61 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32> + + %63 = tosa.reshape %56 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32> + %64 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> + %65 = tosa.transpose %63, %64 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32> + + // Compute Softmax(Q, K) and the self-attention output + %extracted_slice_9 = tensor.extract_slice %arg6[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_10 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_11 
= tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32> + %extracted_slice_12 = tensor.extract_slice %arg7[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_13 = tensor.extract_slice %extracted_slice_12[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_14 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32> + %66 = tensor.empty() : tensor<1x40x128xf32> + %67 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_11 : tensor<1x1x40x128xf32>) outs(%66 : tensor<1x40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x40x128xf32> + %68 = tensor.empty() : tensor<40x128xf32> + %69 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%67 : tensor<1x40x128xf32>) outs(%68 : tensor<40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<40x128xf32> + %70 = tensor.empty() : tensor<1x40x128xf32> + %71 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_14 : tensor<1x1x40x128xf32>) outs(%70 : tensor<1x40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x40x128xf32> + %72 = tensor.empty() : tensor<40x128xf32> + %73 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%71 : tensor<1x40x128xf32>) outs(%72 : tensor<40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<40x128xf32> + + %74 = tensor.empty() : tensor<1x40x128xf32> + %arg2_converted = tosa.reshape %arg2 {new_shape = array<i64: 1, 40>} : (tensor<40xi64>) -> tensor<1x40xi64> + %75 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg2_converted : tensor<1x40xi64>) outs(%74 : tensor<1x40x128xf32>) { + ^bb0(%in: i64, %out: f32): + %4175 = arith.index_cast %in : i64 to index + %4176 = linalg.index 1 : index + %extracted = tensor.extract %69[%4175, %4176] : tensor<40x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x40x128xf32> + %76 = tosa.reshape %75 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32> + + %77 = tensor.empty() : tensor<1x40x128xf32> + %78 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg2_converted : tensor<1x40xi64>) outs(%77 : tensor<1x40x128xf32>) { + ^bb0(%in: i64, %out: f32): + %4175 = arith.index_cast %in : i64 to index + %4176 = linalg.index 1 : index + %extracted = tensor.extract %73[%4175, %4176] : tensor<40x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x40x128xf32> + %79 = tosa.reshape %78 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32> + + %80 = tosa.mul %59, %76 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32> + %extracted_slice_15 = tensor.extract_slice %59[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %extracted_slice_16 = tensor.extract_slice %59[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %81 = tosa.negate %extracted_slice_16 : (tensor<1x32x40x64xf32>) -> 
tensor<1x32x40x64xf32> + %82 = tensor.empty() : tensor<1x32x40x128xf32> + %inserted_slice = tensor.insert_slice %81 into %82[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32> + %inserted_slice_17 = tensor.insert_slice %extracted_slice_15 into %inserted_slice[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32> + %83 = tosa.mul %inserted_slice_17, %79 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32> + %84 = tosa.add %80, %83 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32> + %85 = tosa.mul %62, %76 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32> + %extracted_slice_18 = tensor.extract_slice %62[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %extracted_slice_19 = tensor.extract_slice %62[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %86 = tosa.negate %extracted_slice_19 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32> + %87 = tensor.empty() : tensor<1x32x40x128xf32> + %inserted_slice_20 = tensor.insert_slice %86 into %87[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32> + %inserted_slice_21 = tensor.insert_slice %extracted_slice_18 into %inserted_slice_20[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32> + + // Compute Softmax(QK/sqrt(d_k)) + %88 = tosa.mul %inserted_slice_21, %79 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32> + %89 = tosa.add %85, %88 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32> + %90 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> + %91 = tosa.transpose %89, %90 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32> + %92 = "tosa.const"() <{value = dense<0.0> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32> + %93 = tosa.add %84, %92 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32> + %94 = tosa.reshape %93 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32> + %95 = "tosa.const"() <{value = dense<0.0> : tensor<1x32x128x40xf32>}> : () -> tensor<1x32x128x40xf32> + %96 = tosa.add %91, %95 : (tensor<1x32x128x40xf32>, tensor<1x32x128x40xf32>) -> tensor<1x32x128x40xf32> + %97 = tosa.reshape %96 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32> + %98 = tosa.matmul %94, %97 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32> + %99 = tosa.reshape %98 {new_shape = array<i64: 1, 32, 40, 40>} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32> + %100 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32> + %101 = tosa.reciprocal %100 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %102 = tosa.mul %99, %101 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %103 = tosa.add %102, %arg9 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32> + %104 = tosa.reduce_max %103 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32> + %105 = tosa.sub %103, %104 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32> + %106 = tosa.exp %105 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %107 = tosa.reduce_sum %106 {axis = 3 : 
i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32> + %108 = tosa.reciprocal %107 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32> + %109 = tosa.mul %106, %108 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32> + + // Compute the self-attention output + %110 = "tosa.const"() <{value = dense<0.0> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32> + %111 = tosa.add %109, %110 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %112 = tosa.reshape %111 {new_shape = array<i64: 32, 40, 40>} : (tensor<1x32x40x40xf32>) -> tensor<32x40x40xf32> + %113 = "tosa.const"() <{value = dense<0.0> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32> + %114 = tosa.add %65, %113 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32> + %115 = tosa.reshape %114 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32> + %116 = tosa.matmul %112, %115 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32> + + %117 = tosa.reshape %116 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32> + %118 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32> + %119 = tosa.transpose %117, %118 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32> + %120 = tosa.identity %119 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32> + %121 = tosa.reshape %120 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32> + + %122 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32> + %123 = tosa.transpose %arg8, %122 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32> + %124 = tosa.reshape %121 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32> + %cst_22 = arith.constant dense<0.0> : tensor<40x4096xf32> + %125 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%124, %123 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_22 : tensor<40x4096xf32>) -> tensor<40x4096xf32> + %126 = tosa.reshape %125 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32> + %127 = tosa.add %arg1, %126 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %127 : tensor<1x40x4096xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor_0 = arith.constant dense<3.0> : tensor<1x1x4096xf32> + %input_tensor_1 = arith.constant dense<1.0> : tensor<1x40x4096xf32> + %input_tensor_2 = arith.constant dense<2> : tensor<40xi64> + %input_tensor_3 = arith.constant dense<1.0> : tensor<4096x4096xf32> + %input_tensor_4 = arith.constant dense<1.0> : tensor<4096x4096xf32> + %input_tensor_5 = arith.constant dense<1.0> : tensor<4096x4096xf32> + %input_tensor_6 = arith.constant dense<1.0> : tensor<1x1x2048x128xf32> + %input_tensor_7 = arith.constant dense<1.0> : tensor<1x1x2048x128xf32> + %input_tensor_8 = arith.constant dense<2.0> : tensor<4096x4096xf32> + %input_tensor_9 = arith.constant dense<0.0> : tensor<1x1x40x40xf32> + + call @kernel_self_attention(%input_tensor_0, %input_tensor_1, %input_tensor_2, %input_tensor_3, %input_tensor_4, %input_tensor_5, %input_tensor_6, %input_tensor_7, %input_tensor_8, %input_tensor_9) : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>, tensor<40xi64>, tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>, 
tensor<1x1x2048x128xf32>, tensor<1x1x2048x128xf32>, tensor<4096x4096xf32>, tensor<1x1x40x40xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/examples/BuddyNext/next-softmax.mlir b/examples/BuddyNext/next-softmax.mlir new file mode 100644 index 0000000000..778320ef1a --- /dev/null +++ b/examples/BuddyNext/next-softmax.mlir @@ -0,0 +1,72 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 + +func.func @kernel_softmax(%arg0 : tensor<1x32x40x40xf32>, %arg1 : tensor<1x1x40x40xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Softmax operations + %100 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32> + %101 = tosa.reciprocal %100 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %102 = tosa.mul %arg0, %101 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %103 = tosa.add %102, %arg1 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32> + %104 = tosa.reduce_max %103 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32> + %105 = tosa.sub %103, %104 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32> + %106 = tosa.exp %105 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32> + %107 = tosa.reduce_sum %106 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32> + %108 = tosa.reciprocal %107 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32> + %109 = tosa.mul %106, %108 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %109 : tensor<1x32x40x40xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return +} + +func.func @main() { + %input_tensor_1 = arith.constant dense<3.0> : tensor<1x32x40x40xf32> + %input_tensor_2 = arith.constant dense<0.0> : tensor<1x1x40x40xf32> + + call @kernel_softmax(%input_tensor_1, %input_tensor_2) : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> () + + return +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at 
end of file diff --git a/examples/BuddyNext/next-transpose.mlir b/examples/BuddyNext/next-transpose.mlir new file mode 100644 index 0000000000..54c3443c66 --- /dev/null +++ b/examples/BuddyNext/next-transpose.mlir @@ -0,0 +1,65 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -func-bufferize \ +// RUN: -arith-bufferize \ +// RUN: -tensor-bufferize \ +// RUN: -buffer-deallocation \ +// RUN: -finalizing-bufferize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +module { + func.func private @rtclock() -> f64 + + func.func @kernel_transpose(%arg0: tensor<1x40x32x128xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Transpose operation + %perm = "tosa.const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi32>} : () -> tensor<4xi32> + %transposed = tosa.transpose %arg0, %perm : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + %tensor_unranked = tensor.cast %transposed : tensor<1x32x40x128xf32> to tensor<*xf32> + + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () + vector.print %time : f64 + + return + } + + func.func @main() { + %input_tensor = arith.constant dense<1.0> : tensor<1x40x32x128xf32> + + call @kernel_transpose(%input_tensor) : (tensor<1x40x32x128xf32>) -> () + + return + } + + func.func private @printMemrefF32(%ptr : tensor<*xf32>) +} \ No newline at end of file From 3033e0fb7297c60e8405494e5bac8adc4fcfdc75 Mon Sep 17 00:00:00 2001 From: hayden-brown Date: Thu, 1 Aug 2024 13:12:50 +0800 Subject: [PATCH 2/2] Llama2 model Operator/Layer level instance extraction --- examples/BuddyNext/next-fc.mlir | 5 +++++ examples/BuddyNext/next-ffn.mlir | 18 ++++++++++++------ examples/BuddyNext/next-fpowi.mlir | 5 +++++ examples/BuddyNext/next-matmul.mlir | 4 ++++ examples/BuddyNext/next-mul.mlir | 4 ++++ examples/BuddyNext/next-negate.mlir | 7 ++++++- examples/BuddyNext/next-reciprocal.mlir | 6 +++++- examples/BuddyNext/next-reducesum.mlir | 5 +++++ examples/BuddyNext/next-rmsnorm.mlir | 5 +++++ examples/BuddyNext/next-rsqrt.mlir | 5 +++++ examples/BuddyNext/next-selfattention.mlir | 21 +++++++++++++-------- examples/BuddyNext/next-softmax.mlir | 5 +++++ examples/BuddyNext/next-transpose.mlir | 5 +++++ 13 files changed, 79 insertions(+), 16 deletions(-) diff --git a/examples/BuddyNext/next-fc.mlir b/examples/BuddyNext/next-fc.mlir index 89593d8cf6..3798024300 100644 --- a/examples/BuddyNext/next-fc.mlir +++ b/examples/BuddyNext/next-fc.mlir 
@@ -58,6 +58,11 @@ func.func @kernel_fc_layer(%arg0 : tensor<1x40x4096xf32>, %arg1 : tensor<4096x40 %tensor_unranked = tensor.cast %51 : tensor<1x40x4096xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 4096] strides = [163840, 4096, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [49152{{(, 49152)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-ffn.mlir b/examples/BuddyNext/next-ffn.mlir index f132f62d97..725e98db19 100644 --- a/examples/BuddyNext/next-ffn.mlir +++ b/examples/BuddyNext/next-ffn.mlir @@ -77,6 +77,11 @@ func.func @kernel_ffn(%arg0: tensor<1x40x4096xf32>, %arg9: tensor<4096xf32>, %ar %tensor_unranked = tensor.cast %158 : tensor<1x40x4096xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 4096] strides = [163840, 4096, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [461655{{(, 461655)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 @@ -84,15 +89,16 @@ func.func @kernel_ffn(%arg0: tensor<1x40x4096xf32>, %arg9: tensor<4096xf32>, %ar } func.func @main() { - %input_tensor = arith.constant dense<3.0> : tensor<1x40x4096xf32> - %weight1 = arith.constant dense<1.0> : tensor<4096xf32> - %weight2 = arith.constant dense<1.0> : tensor<11008x4096xf32> - %weight3 = arith.constant dense<2.0> : tensor<11008x4096xf32> - %weight4 = arith.constant dense<1.0> : tensor<4096x11008xf32> + %input_tensor = arith.constant dense<0.5> : tensor<1x40x4096xf32> + %weight1 = arith.constant dense<0.1> : tensor<4096xf32> + %weight2 = arith.constant dense<0.1> : tensor<11008x4096xf32> + %weight3 = arith.constant dense<0.1> : tensor<11008x4096xf32> + %weight4 = arith.constant dense<0.1> : tensor<4096x11008xf32> + // Print timings. 
call @kernel_ffn(%input_tensor, %weight1, %weight2, %weight3, %weight4) : (tensor<1x40x4096xf32>, tensor<4096xf32>, tensor<11008x4096xf32>, tensor<11008x4096xf32>, tensor<4096x11008xf32>) -> () return } -func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file +func.func private @printMemrefF32(%ptr : tensor<*xf32>) diff --git a/examples/BuddyNext/next-fpowi.mlir b/examples/BuddyNext/next-fpowi.mlir index fca13fd2ed..79274c58e7 100644 --- a/examples/BuddyNext/next-fpowi.mlir +++ b/examples/BuddyNext/next-fpowi.mlir @@ -52,6 +52,11 @@ module { %tensor_unranked = tensor.cast %result : tensor<1x32x40x64xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 64] strides = [81920, 2560, 64, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [25{{(, 25)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-matmul.mlir b/examples/BuddyNext/next-matmul.mlir index a81e78385a..72217cd121 100644 --- a/examples/BuddyNext/next-matmul.mlir +++ b/examples/BuddyNext/next-matmul.mlir @@ -44,6 +44,10 @@ func.func @kernel_matmul(%arg0 : tensor<40x4096xf32>, %arg1 : tensor<4096x4096xf %tensor_unranked = tensor.cast %matmul_result : tensor<40x4096xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [40, 4096] strides = [4096, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [24576{{(, 24576)*}}] + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-mul.mlir b/examples/BuddyNext/next-mul.mlir index 8b2d5ae677..b1c3d03987 100644 --- a/examples/BuddyNext/next-mul.mlir +++ b/examples/BuddyNext/next-mul.mlir @@ -46,6 +46,10 @@ module { %tensor_unranked = tensor.cast %mul_result : tensor<1x40x1xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 1] strides = [40, 1, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [6{{(, 6)*}}] + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-negate.mlir b/examples/BuddyNext/next-negate.mlir index d11c628ee6..e05805c85d 100644 --- a/examples/BuddyNext/next-negate.mlir +++ b/examples/BuddyNext/next-negate.mlir @@ -30,7 +30,7 @@ // RUN: | mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ -// RUN: | FileCheck %s +// RUN: | FileCheck %s module { func.func private @rtclock() -> f64 @@ -46,6 +46,11 @@ module { %tensor_unranked = tensor.cast %negated : tensor<1x32x40x64xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 64] strides = [81920, 2560, 64, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [-1{{(, -1)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-reciprocal.mlir b/examples/BuddyNext/next-reciprocal.mlir index 98469786cd..e664f56bc8 100644 --- a/examples/BuddyNext/next-reciprocal.mlir +++ b/examples/BuddyNext/next-reciprocal.mlir @@ -46,6 +46,10 @@ module { %tensor_unranked = tensor.cast %result : tensor<1x10xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [1, 10] strides = [10, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [0.5{{(, 
0.5)*}}] + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 @@ -53,7 +57,7 @@ module { } func.func @main() { - %input_tensor = "tosa.const"() {value = dense<[[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]]> : tensor<1x10xf32>} : () -> tensor<1x10xf32> + %input_tensor = "tosa.const"() {value = dense<2.0> : tensor<1x10xf32>} : () -> tensor<1x10xf32> call @kernel_reciprocal(%input_tensor) : (tensor<1x10xf32>) -> () diff --git a/examples/BuddyNext/next-reducesum.mlir b/examples/BuddyNext/next-reducesum.mlir index 825aeae113..92aca0ceac 100644 --- a/examples/BuddyNext/next-reducesum.mlir +++ b/examples/BuddyNext/next-reducesum.mlir @@ -46,6 +46,11 @@ module { %tensor_unranked = tensor.cast %result : tensor<1x40x1xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 1] strides = [40, 1, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [4096{{(, 4096)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-rmsnorm.mlir b/examples/BuddyNext/next-rmsnorm.mlir index 7cb4e2c844..f4b21891f0 100644 --- a/examples/BuddyNext/next-rmsnorm.mlir +++ b/examples/BuddyNext/next-rmsnorm.mlir @@ -68,6 +68,11 @@ func.func @kernel_rmsnorm(%arg0: tensor<1x40x4096xf32>) { %tensor_unranked = tensor.cast %39 : tensor<1x40x4096xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 4096] strides = [163840, 4096, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [0.999999{{(, 0.999999)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-rsqrt.mlir b/examples/BuddyNext/next-rsqrt.mlir index 6e8d806834..468f9ec961 100644 --- a/examples/BuddyNext/next-rsqrt.mlir +++ b/examples/BuddyNext/next-rsqrt.mlir @@ -45,6 +45,11 @@ func.func @kernel_rsqrt(%arg0 : tensor<1x40x1xf32>) { %tensor_unranked = tensor.cast %rsqrt_result : tensor<1x40x1xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 1] strides = [40, 1, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [0.57735{{(, 0.57735)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-selfattention.mlir b/examples/BuddyNext/next-selfattention.mlir index 7976a1b96a..aeb6cf09ea 100644 --- a/examples/BuddyNext/next-selfattention.mlir +++ b/examples/BuddyNext/next-selfattention.mlir @@ -200,6 +200,11 @@ func.func @kernel_self_attention(%arg0 : tensor<1x1x4096xf32>, %arg1 : tensor<1x %tensor_unranked = tensor.cast %127 : tensor<1x40x4096xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 4096] strides = [163840, 4096, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [83883.8{{(, 83883.8)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 @@ -207,15 +212,15 @@ func.func @kernel_self_attention(%arg0 : tensor<1x1x4096xf32>, %arg1 : tensor<1x } func.func @main() { - %input_tensor_0 = arith.constant dense<3.0> : tensor<1x1x4096xf32> - %input_tensor_1 = arith.constant dense<1.0> : tensor<1x40x4096xf32> - %input_tensor_2 = arith.constant dense<2> : tensor<40xi64> - %input_tensor_3 = arith.constant dense<1.0> : tensor<4096x4096xf32> - %input_tensor_4 = arith.constant dense<1.0> : tensor<4096x4096xf32> - %input_tensor_5 = 
arith.constant dense<1.0> : tensor<4096x4096xf32> + %input_tensor_0 = arith.constant dense<1.0> : tensor<1x1x4096xf32> + %input_tensor_1 = arith.constant dense<0.1> : tensor<1x40x4096xf32> + %input_tensor_2 = arith.constant dense<1> : tensor<40xi64> + %input_tensor_3 = arith.constant dense<0.5> : tensor<4096x4096xf32> + %input_tensor_4 = arith.constant dense<0.1> : tensor<4096x4096xf32> + %input_tensor_5 = arith.constant dense<0.1> : tensor<4096x4096xf32> %input_tensor_6 = arith.constant dense<1.0> : tensor<1x1x2048x128xf32> - %input_tensor_7 = arith.constant dense<1.0> : tensor<1x1x2048x128xf32> - %input_tensor_8 = arith.constant dense<2.0> : tensor<4096x4096xf32> + %input_tensor_7 = arith.constant dense<0.1> : tensor<1x1x2048x128xf32> + %input_tensor_8 = arith.constant dense<0.5> : tensor<4096x4096xf32> %input_tensor_9 = arith.constant dense<0.0> : tensor<1x1x40x40xf32> call @kernel_self_attention(%input_tensor_0, %input_tensor_1, %input_tensor_2, %input_tensor_3, %input_tensor_4, %input_tensor_5, %input_tensor_6, %input_tensor_7, %input_tensor_8, %input_tensor_9) : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>, tensor<40xi64>, tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<4096x4096xf32>, tensor<1x1x2048x128xf32>, tensor<1x1x2048x128xf32>, tensor<4096x4096xf32>, tensor<1x1x40x40xf32>) -> () diff --git a/examples/BuddyNext/next-softmax.mlir b/examples/BuddyNext/next-softmax.mlir index 778320ef1a..98b2e37cdb 100644 --- a/examples/BuddyNext/next-softmax.mlir +++ b/examples/BuddyNext/next-softmax.mlir @@ -54,6 +54,11 @@ func.func @kernel_softmax(%arg0 : tensor<1x32x40x40xf32>, %arg1 : tensor<1x1x40x %tensor_unranked = tensor.cast %109 : tensor<1x32x40x40xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 40] strides = [51200, 1600, 40, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [0.025{{(, 0.025)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64 diff --git a/examples/BuddyNext/next-transpose.mlir b/examples/BuddyNext/next-transpose.mlir index 54c3443c66..63e942668e 100644 --- a/examples/BuddyNext/next-transpose.mlir +++ b/examples/BuddyNext/next-transpose.mlir @@ -47,6 +47,11 @@ module { %tensor_unranked = tensor.cast %transposed : tensor<1x32x40x128xf32> to tensor<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 128] strides = [163840, 5120, 128, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [ + // CHECK-SAME: [1{{(, 1)*}}], + call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> () vector.print %time : f64