Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename TensorTile to TensorAccessPattern #1941

Merged
merged 10 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions programming_examples/basic/dma_transpose/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx
from aie.helpers.dialects.ext.scf import _for as range_
from aie.helpers.tensortiler import TensorTile
from aie.helpers.taplib import TensorAccessPattern


def my_passthrough(M, K, N, generate_access_map=False):
tensor_ty = np.ndarray[(M, K), np.dtype[np.int32]]
data_transform = TensorTile(
data_transform = TensorAccessPattern(
(M, K), offset=0, sizes=[1, 1, K, M], strides=[1, 1, 1, K]
)
if generate_access_map:
Expand Down Expand Up @@ -57,7 +57,7 @@ def sequence(A, B, C):
metadata=of_in,
bd_id=1,
mem=A,
tensor_tile=data_transform,
tap=data_transform,
issue_token=True,
)
npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
Expand Down
6 changes: 3 additions & 3 deletions programming_examples/basic/dma_transpose/aie2_alt.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx
from aie.helpers.dialects.ext.scf import _for as range_
from aie.helpers.tensortiler import TensorTile
from aie.helpers.taplib import TensorAccessPattern


def my_passthrough(M, K, N, generate_access_map=False):
tensor_ty = np.ndarray[(M, K), np.dtype[np.int32]]
data_transform = TensorTile(
data_transform = TensorAccessPattern(
(M, K), offset=0, sizes=[1, 1, K, M], strides=[1, 1, 1, K]
)
if generate_access_map:
Expand Down Expand Up @@ -54,7 +54,7 @@ def sequence(A, B, C):
# The strides below are configured to read across all rows in the same column
# Stride of K in dim/wrap 2 skips an entire row to read a full column
in_task = shim_dma_single_bd_task(
of_in, A, tensor_tile=data_transform, issue_token=True
of_in, A, tap=data_transform, issue_token=True
)
out_task = shim_dma_single_bd_task(
of_out, C, sizes=[1, 1, 1, N], issue_token=True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.helpers.dialects.ext.scf import _for as range_
from aie.helpers.tensortiler import TensorTile, TensorTileSequence
from aie.helpers.taplib import TensorAccessPattern, TensorAccessSequence

dtype_map = {
"bf16": bfloat16,
Expand Down Expand Up @@ -49,14 +49,14 @@ def main():
)
argparser.add_argument("--trace_size", type=int, default=0)
argparser.add_argument(
"--generate-tiles",
"--generate-taps",
action="store_true",
help="Generate TensorTiles, a Python object to represent each data transfer"
help="Generate TensorAccessPatterns, a Python object to represent each data transfer"
"of the input/output matrices. These objects can be used for visualization.",
)
args = argparser.parse_args()
with mlir_mod_ctx() as ctx:
maybe_tiles = my_matmul(
maybe_taps = my_matmul(
args.M,
args.K,
args.N,
Expand All @@ -68,13 +68,13 @@ def main():
args.dtype_out,
args.b_col_maj,
args.trace_size,
args.generate_tiles,
args.generate_taps,
)
# print(ctx.module.operation.verify())
print(ctx.module)

if args.generate_tiles:
return maybe_tiles
if args.generate_taps:
return maybe_taps


def ceildiv(a, b):
Expand All @@ -93,7 +93,7 @@ def my_matmul(
dtype_out_str,
b_col_maj,
trace_size,
generate_tiles=False,
generate_taps=False,
):
n_aie_rows = 4
n_aie_cores = n_aie_rows * n_aie_cols
Expand Down Expand Up @@ -169,11 +169,11 @@ def my_matmul(
elif n_aie_cols == 4:
dev = AIEDevice.npu1_4col

# These will hold TensorTile objects that represent the runtime
# npu_dma_memcpy_nd operations of this design. They are only used if generate_tiles is true
A_tensor_tiles = []
B_tensor_tiles = []
C_tensor_tiles = []
# These will hold TensorAccessPattern objects that represent the runtime
# npu_dma_memcpy_nd operations of this design. They are only used if generate_taps is true
A_taps = []
B_taps = []
C_taps = []

@device(dev)
def device_body():
Expand Down Expand Up @@ -415,8 +415,8 @@ def sequence(A, B, C):
# Use the calculated sizes/strides/offsets to record the data movement
# caused by the above call to npu_dma_memcpy_nd.
# This line does not change MLIR output at all.
C_tensor_tiles.append(
TensorTile(
C_taps.append(
TensorAccessPattern(
(M, N),
offset=C_offset,
sizes=C_sizes,
Expand Down Expand Up @@ -469,8 +469,8 @@ def sequence(A, B, C):
# Use the calculated sizes/strides/offsets to record the data movement
# caused by the above call to npu_dma_memcpy_nd.
# This line does not change MLIR output at all.
A_tensor_tiles.append(
TensorTile(
A_taps.append(
TensorAccessPattern(
(M, K),
offset=A_offset,
sizes=A_sizes,
Expand Down Expand Up @@ -515,8 +515,8 @@ def sequence(A, B, C):
# Use the calculated sizes/strides/offsets to record the data movement
# caused by the above call to npu_dma_memcpy_nd.
# This line does not change MLIR output at all.
B_tensor_tiles.append(
TensorTile(
B_taps.append(
TensorAccessPattern(
(K, N),
offset=B_col_offset,
sizes=B_sizes,
Expand All @@ -527,13 +527,13 @@ def sequence(A, B, C):
dma_wait(*C_l2l3_fifos)
dma_wait(*C_l2l3_fifos)

if generate_tiles:
# If generate tiles is true, return a representation of tensor tiles
if generate_taps:
# If generate_taps is true, return a representation of tensor access patterns
# representing all the npu_dma_memcpy_nd runtime sequence operations per input/output tensor.
return (
TensorTileSequence.from_tiles(A_tensor_tiles),
TensorTileSequence.from_tiles(B_tensor_tiles),
TensorTileSequence.from_tiles(C_tensor_tiles),
TensorAccessSequence.from_taps(A_taps),
TensorAccessSequence.from_taps(B_taps),
TensorAccessSequence.from_taps(C_taps),
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.helpers.dialects.ext.scf import _for as range_
from aie.helpers.tensortiler import TensorTiler2D, TensorTileSequence
from aie.helpers.taplib import TensorTiler2D, TensorAccessSequence

dtype_map = {
"bf16": bfloat16,
Expand Down Expand Up @@ -49,14 +49,14 @@ def main():
)
argparser.add_argument("--trace_size", type=int, default=0)
argparser.add_argument(
"--generate-tiles",
"--generate-taps",
action="store_true",
help="Generate TensorTiles, a Python object to represent each data transfer"
help="Generate TensorAccessPatterns, a Python object to represent each data transfer"
"of the input/output matrices. These objects can be used for visualization.",
)
args = argparser.parse_args()
with mlir_mod_ctx() as ctx:
maybe_tiles = my_matmul(
maybe_taps = my_matmul(
args.M,
args.K,
args.N,
Expand All @@ -68,13 +68,13 @@ def main():
args.dtype_out,
args.b_col_maj,
args.trace_size,
args.generate_tiles,
args.generate_taps,
)
# print(ctx.module.operation.verify())
print(ctx.module)

if args.generate_tiles:
return maybe_tiles
if args.generate_taps:
return maybe_taps


def ceildiv(a, b):
Expand All @@ -93,7 +93,7 @@ def my_matmul(
dtype_out_str,
b_col_maj,
trace_size,
generate_tiles=False,
generate_taps=False,
):
n_aie_rows = 4
n_aie_cores = n_aie_rows * n_aie_cols
Expand Down Expand Up @@ -169,11 +169,11 @@ def my_matmul(
elif n_aie_cols == 4:
dev = AIEDevice.npu1_4col

# These will hold TensorTile objects that represent the runtime
# npu_dma_memcpy_nd operations of this design. They are only used if generate_tiles is true
A_tensor_tiles = []
B_tensor_tiles = []
C_tensor_tiles = []
# These will hold TensorAccessPattern objects that represent the runtime
# npu_dma_memcpy_nd operations of this design. They are only used if generate_taps is true
A_taps = []
B_taps = []
C_taps = []

@device(dev)
def device_body():
Expand Down Expand Up @@ -433,7 +433,7 @@ def sequence(A, B, C):
for col in range(n_aie_cols):

# This line does not change MLIR output at all - it's just for recording data movement
C_tensor_tiles.append(C_tiles[c_index])
C_taps.append(C_tiles[c_index])

# C Output Transfer:
# The smallest transfer unit is a (m*n_aie_rows)-x-(n)-sized sub-tile of the matrix.
Expand All @@ -457,16 +457,11 @@ def sequence(A, B, C):
c_task = shim_dma_single_bd_task(
C_l2l3_fifos[col],
C,
tensor_tile=C_tiles[c_index],
tap=C_tiles[c_index],
issue_token=True,
)
dma_start_task(c_task)
out_tasks.append(c_task)

# Use the calculated sizes/strides/offsets to record the data movement
# caused by the above call to npu_dma_memcpy_nd.
# This line does not change MLIR output at all.
C_tensor_tiles.append(C_tiles[c_index])
c_index += 1

for tile_row in range(current_tb_n_rows):
Expand Down Expand Up @@ -495,14 +490,13 @@ def sequence(A, B, C):
a_task = shim_dma_single_bd_task(
A_l3l2_fifos[col],
A,
tensor_tile=A_tiles[tile_offset],
tap=A_tiles[tile_offset],
)
dma_start_task(a_task)
in_tasks.append(a_task)
# Use the calculated sizes/strides/offsets to record the data movement
# caused by the above call to npu_dma_memcpy_nd.
# This line does not change MLIR output at all.
A_tensor_tiles.append(A_tiles[tile_offset])

# B input transfer:
# Transfer the first a (n)-wide block of columns of B,
Expand All @@ -525,13 +519,14 @@ def sequence(A, B, C):
b_task = shim_dma_single_bd_task(
B_l3l2_fifos[col],
B,
tensor_tile=B_tiles[col],
tap=B_tiles[col],
)
dma_start_task(b_task)
in_tasks.append(b_task)

# These lines do not change MLIR output at all - they are just for recording data movement
B_tensor_tiles.append(B_tiles[col])
A_taps.append(A_tiles[tile_offset])
B_taps.append(B_tiles[col])
if tb > 0 or (tb == 0 and pingpong > 0):
dma_await_task(*out_tasks)
out_tasks = []
Expand All @@ -542,13 +537,13 @@ def sequence(A, B, C):
if len(in_tasks) > 0:
dma_free_task(*in_tasks)

if generate_tiles:
# If generate tiles is true, return a representation of tensor tiles
if generate_taps:
# If generate taps is true, return a representation of tensor access patterns
# representing all the npu_dma_memcpy_nd runtime sequence operations per input/output tensor.
return (
TensorTileSequence.from_tiles(A_tensor_tiles),
TensorTileSequence.from_tiles(B_tensor_tiles),
TensorTileSequence.from_tiles(C_tensor_tiles),
TensorAccessSequence.from_taps(A_taps),
TensorAccessSequence.from_taps(B_taps),
TensorAccessSequence.from_taps(C_taps),
)


Expand Down
Loading
Loading