Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename TensorTile to TensorAccessPattern #1941

Merged
merged 10 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions programming_examples/basic/dma_transpose/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx
from aie.helpers.dialects.ext.scf import _for as range_
from aie.helpers.tensortiler import TensorTile
from aie.helpers.taplib import TensorAccessPattern


def my_passthrough(M, K, N, generate_access_map=False):
tensor_ty = np.ndarray[(M, K), np.dtype[np.int32]]
data_transform = TensorTile(
data_transform = TensorAccessPattern(
(M, K), offset=0, sizes=[1, 1, K, M], strides=[1, 1, 1, K]
)
if generate_access_map:
Expand Down Expand Up @@ -57,7 +57,7 @@ def sequence(A, B, C):
metadata=of_in,
bd_id=1,
mem=A,
tensor_tile=data_transform,
tap=data_transform,
issue_token=True,
)
npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
Expand Down
6 changes: 3 additions & 3 deletions programming_examples/basic/dma_transpose/aie2_alt.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx
from aie.helpers.dialects.ext.scf import _for as range_
from aie.helpers.tensortiler import TensorTile
from aie.helpers.taplib import TensorAccessPattern


def my_passthrough(M, K, N, generate_access_map=False):
tensor_ty = np.ndarray[(M, K), np.dtype[np.int32]]
data_transform = TensorTile(
data_transform = TensorAccessPattern(
(M, K), offset=0, sizes=[1, 1, K, M], strides=[1, 1, 1, K]
)
if generate_access_map:
Expand Down Expand Up @@ -54,7 +54,7 @@ def sequence(A, B, C):
# The strides below are configured to read across all rows in the same column
# Stride of K in dim/wrap 2 skips an entire row to read a full column
in_task = shim_dma_single_bd_task(
of_in, A, tensor_tile=data_transform, issue_token=True
of_in, A, tap=data_transform, issue_token=True
)
out_task = shim_dma_single_bd_task(
of_out, C, sizes=[1, 1, 1, N], issue_token=True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.helpers.dialects.ext.scf import _for as range_
from aie.helpers.tensortiler import TensorTile, TensorTileSequence
from aie.helpers.taplib import TensorAccessPattern, TensorAccessSequence

dtype_map = {
"bf16": bfloat16,
Expand Down Expand Up @@ -49,14 +49,14 @@ def main():
)
argparser.add_argument("--trace_size", type=int, default=0)
argparser.add_argument(
"--generate-tiles",
"--generate-taps",
action="store_true",
help="Generate TensorTiles, a Python object to represent each data transfer"
help="Generate TensorAccessPatterns, a Python object to represent each data transfer"
"of the input/output matrices. These objects can be used for visualization.",
)
args = argparser.parse_args()
with mlir_mod_ctx() as ctx:
maybe_tiles = my_matmul(
maybe_taps = my_matmul(
args.M,
args.K,
args.N,
Expand All @@ -68,13 +68,13 @@ def main():
args.dtype_out,
args.b_col_maj,
args.trace_size,
args.generate_tiles,
args.generate_taps,
)
# print(ctx.module.operation.verify())
print(ctx.module)

if args.generate_tiles:
return maybe_tiles
if args.generate_taps:
return maybe_taps


def ceildiv(a, b):
Expand All @@ -93,7 +93,7 @@ def my_matmul(
dtype_out_str,
b_col_maj,
trace_size,
generate_tiles=False,
generate_taps=False,
):
n_aie_rows = 4
n_aie_cores = n_aie_rows * n_aie_cols
Expand Down Expand Up @@ -169,11 +169,11 @@ def my_matmul(
elif n_aie_cols == 4:
dev = AIEDevice.npu1_4col

# These will hold TensorTile objects that represent the runtime
# npu_dma_memcpy_nd operations of this design. They are only used if generate_tiles is true
A_tensor_tiles = []
B_tensor_tiles = []
C_tensor_tiles = []
# These will hold TensorAccessPattern objects that represent the runtime
# npu_dma_memcpy_nd operations of this design. They are only used if generate_taps is true
A_taps = []
B_taps = []
C_taps = []

@device(dev)
def device_body():
Expand Down Expand Up @@ -415,8 +415,8 @@ def sequence(A, B, C):
# Use the calculated sizes/strides/offsets to record the data movement
# caused by the above call to npu_dma_memcpy_nd.
# This line does not change MLIR output at all.
C_tensor_tiles.append(
TensorTile(
C_taps.append(
TensorAccessPattern(
(M, N),
offset=C_offset,
sizes=C_sizes,
Expand Down Expand Up @@ -469,8 +469,8 @@ def sequence(A, B, C):
# Use the calculated sizes/strides/offsets to record the data movement
# caused by the above call to npu_dma_memcpy_nd.
# This line does not change MLIR output at all.
A_tensor_tiles.append(
TensorTile(
A_taps.append(
TensorAccessPattern(
(M, K),
offset=A_offset,
sizes=A_sizes,
Expand Down Expand Up @@ -515,8 +515,8 @@ def sequence(A, B, C):
# Use the calculated sizes/strides/offsets to record the data movement
# caused by the above call to npu_dma_memcpy_nd.
# This line does not change MLIR output at all.
B_tensor_tiles.append(
TensorTile(
B_taps.append(
TensorAccessPattern(
(K, N),
offset=B_col_offset,
sizes=B_sizes,
Expand All @@ -527,13 +527,13 @@ def sequence(A, B, C):
dma_wait(*C_l2l3_fifos)
dma_wait(*C_l2l3_fifos)

if generate_tiles:
# If generate tiles is true, return a representation of tensor tiles
if generate_taps:
# If generate_taps is true, return a representation of tensor access patterns
# representing all the npu_dma_memcpy_nd runtime sequence operations per input/output tensor.
return (
TensorTileSequence.from_tiles(A_tensor_tiles),
TensorTileSequence.from_tiles(B_tensor_tiles),
TensorTileSequence.from_tiles(C_tensor_tiles),
TensorAccessSequence.from_taps(A_taps),
TensorAccessSequence.from_taps(B_taps),
TensorAccessSequence.from_taps(C_taps),
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.helpers.dialects.ext.scf import _for as range_
from aie.helpers.tensortiler import TensorTiler2D, TensorTileSequence
from aie.helpers.taplib import TensorTiler2D, TensorAccessSequence

dtype_map = {
"bf16": bfloat16,
Expand Down Expand Up @@ -49,14 +49,14 @@ def main():
)
argparser.add_argument("--trace_size", type=int, default=0)
argparser.add_argument(
"--generate-tiles",
"--generate-taps",
action="store_true",
help="Generate TensorTiles, a Python object to represent each data transfer"
help="Generate TensorAccessPatterns, a Python object to represent each data transfer"
"of the input/output matrices. These objects can be used for visualization.",
)
args = argparser.parse_args()
with mlir_mod_ctx() as ctx:
maybe_tiles = my_matmul(
maybe_taps = my_matmul(
args.M,
args.K,
args.N,
Expand All @@ -68,13 +68,13 @@ def main():
args.dtype_out,
args.b_col_maj,
args.trace_size,
args.generate_tiles,
args.generate_taps,
)
# print(ctx.module.operation.verify())
print(ctx.module)

if args.generate_tiles:
return maybe_tiles
if args.generate_taps:
return maybe_taps


def ceildiv(a, b):
Expand All @@ -93,7 +93,7 @@ def my_matmul(
dtype_out_str,
b_col_maj,
trace_size,
generate_tiles=False,
generate_taps=False,
):
n_aie_rows = 4
n_aie_cores = n_aie_rows * n_aie_cols
Expand Down Expand Up @@ -169,11 +169,11 @@ def my_matmul(
elif n_aie_cols == 4:
dev = AIEDevice.npu1_4col

# These will hold TensorTile objects that represent the runtime
# npu_dma_memcpy_nd operations of this design. They are only used if generate_tiles is true
A_tensor_tiles = []
B_tensor_tiles = []
C_tensor_tiles = []
# These will hold TensorAccessPattern objects that represent the runtime
# npu_dma_memcpy_nd operations of this design. They are only used if generate_taps is true
A_taps = []
B_taps = []
C_taps = []

@device(dev)
def device_body():
Expand Down Expand Up @@ -433,7 +433,7 @@ def sequence(A, B, C):
for col in range(n_aie_cols):

# This line does not change MLIR output at all - it's just for recording data movement
C_tensor_tiles.append(C_tiles[c_index])
C_taps.append(C_tiles[c_index])

# C Output Transfer:
# The smallest transfer unit is a (m*n_aie_rows)-x-(n)-sized sub-tile of the matrix.
Expand All @@ -457,16 +457,11 @@ def sequence(A, B, C):
c_task = shim_dma_single_bd_task(
C_l2l3_fifos[col],
C,
tensor_tile=C_tiles[c_index],
tap=C_tiles[c_index],
issue_token=True,
)
dma_start_task(c_task)
out_tasks.append(c_task)

# Use the calculated sizes/strides/offsets to record the data movement
# caused by the above call to npu_dma_memcpy_nd.
# This line does not change MLIR output at all.
C_tensor_tiles.append(C_tiles[c_index])
c_index += 1

for tile_row in range(current_tb_n_rows):
Expand Down Expand Up @@ -495,14 +490,13 @@ def sequence(A, B, C):
a_task = shim_dma_single_bd_task(
A_l3l2_fifos[col],
A,
tensor_tile=A_tiles[tile_offset],
tap=A_tiles[tile_offset],
)
dma_start_task(a_task)
in_tasks.append(a_task)
# Use the calculated sizes/strides/offsets to record the data movement
# caused by the above call to npu_dma_memcpy_nd.
# This line does not change MLIR output at all.
A_tensor_tiles.append(A_tiles[tile_offset])

# B input transfer:
# Transfer the first a (n)-wide block of columns of B,
Expand All @@ -525,13 +519,14 @@ def sequence(A, B, C):
b_task = shim_dma_single_bd_task(
B_l3l2_fifos[col],
B,
tensor_tile=B_tiles[col],
tap=B_tiles[col],
)
dma_start_task(b_task)
in_tasks.append(b_task)

# These lines do not change MLIR output at all - they are just for recording data movement
B_tensor_tiles.append(B_tiles[col])
A_taps.append(A_tiles[tile_offset])
B_taps.append(B_tiles[col])
if tb > 0 or (tb == 0 and pingpong > 0):
dma_await_task(*out_tasks)
out_tasks = []
Expand All @@ -542,13 +537,13 @@ def sequence(A, B, C):
if len(in_tasks) > 0:
dma_free_task(*in_tasks)

if generate_tiles:
# If generate tiles is true, return a representation of tensor tiles
if generate_taps:
# If generate taps is true, return a representation of tensor access patterns
# representing all the npu_dma_memcpy_nd runtime sequence operations per input/output tensor.
return (
TensorTileSequence.from_tiles(A_tensor_tiles),
TensorTileSequence.from_tiles(B_tensor_tiles),
TensorTileSequence.from_tiles(C_tensor_tiles),
TensorAccessSequence.from_taps(A_taps),
TensorAccessSequence.from_taps(B_taps),
TensorAccessSequence.from_taps(C_taps),
)


Expand Down
Loading
Loading