From f5ca3289edd3f41034c317f50c57eccef51f470b Mon Sep 17 00:00:00 2001 From: Erika Hunhoff Date: Tue, 24 Sep 2024 11:00:03 -0600 Subject: [PATCH] Hide yield statements at end of python scf for-loop body (#1785) --- .../basic/dma_transpose/aie2.py | 7 +- .../matrix_multiplication/cascade/aie2.py | 15 +- .../matrix_vector/aie2.py | 8 +- .../matrix_multiplication/single_core/aie2.py | 13 +- .../matrix_multiplication/whole_array/aie2.py | 15 +- .../basic/matrix_scalar_add/aie2.py | 8 +- .../memtile_repeat/distribute_repeat/aie2.py | 14 +- .../memtile_repeat/simple_repeat/aie2.py | 1 - .../basic/passthrough_dmas/aie2.py | 7 +- .../passthrough_dmas_plio/aie2-input-plio.py | 7 +- .../passthrough_dmas_plio/aie2-output-plio.py | 7 +- .../basic/passthrough_kernel/aie2.py | 5 +- .../basic/passthrough_kernel/test.py | 4 - .../basic/row_wise_bias_add/aie2.py | 12 +- programming_examples/basic/vector_exp/aie2.py | 11 +- .../basic/vector_reduce_add/aie2.py | 6 +- .../basic/vector_reduce_max/aie2.py | 6 +- .../basic/vector_reduce_min/aie2.py | 6 +- .../basic/vector_scalar_add/aie2.py | 8 +- .../basic/vector_scalar_add_runlist/aie2.py | 8 +- .../basic/vector_scalar_mul/aie2.py | 8 +- .../basic/vector_scalar_mul/test.py | 4 - .../basic/vector_vector_add/aie2.py | 11 +- .../basic/vector_vector_modulo/aie2.py | 13 +- .../basic/vector_vector_mul/aie2.py | 13 +- programming_examples/ml/bottleneck/aie2.py | 111 ++++---------- programming_examples/ml/conv2d/aie2.py | 8 +- .../ml/conv2d_fused_relu/aie2.py | 8 +- programming_examples/ml/eltwise_add/aie2.py | 8 +- programming_examples/ml/eltwise_mul/aie2.py | 8 +- programming_examples/ml/relu/aie2.py | 8 +- .../ml/resnet/layers_conv2_x/aie2.py | 145 +++++------------- programming_examples/ml/softmax/aie2.py | 9 +- .../vision/color_detect/aie2_colorDetect.py | 6 +- .../color_threshold/aie2_colorThreshold.py | 18 +-- .../vision/edge_detect/aie2_edgeDetect.py | 18 +-- .../vision/vision_passthrough/aie2.py | 8 +- .../section-2/section-2a/README.md | 4 - .../section-2/section-2d/README.md | 12 +- .../section-2/section-2d/aie2.py | 10 +- .../section-2/section-2d/aie2_multi.py | 10 +- .../01_single_double_buffer/single_buffer.py | 11 +- .../02_external_mem_to_core/ext_to_core.py | 8 +- .../ext_to_core_L2.py | 8 +- .../04_distribute_L2/distribute_L2.py | 20 +-- .../05_join_L2/distribute_and_join_L2.py | 20 +-- .../section-2e/05_join_L2/join_L2.py | 20 +-- programming_guide/section-3/README.md | 6 +- programming_guide/section-3/aie2.py | 8 +- programming_guide/section-3/test.py | 7 - .../section-4/section-4a/aie2.py | 8 +- .../section-4/section-4a/test.py | 6 - .../section-4/section-4b/aie2.py | 8 +- .../section-4/section-4b/test.py | 6 - python/extras/dialects/ext/scf.py | 52 +++++++ ...dd_256_using_dma_op_no_double_buffering.py | 6 +- test/npu-xrt/e2e/test_manual_dpu_args.py | 9 +- .../npu-xrt/e2e/test_offsets_sizes_strides.py | 4 +- test/npu-xrt/e2e/test_repeat_count.py | 11 +- test/npu-xrt/e2e/test_tiled_matrix_add.py | 7 +- ...iled_nonsquare_spatial_tile_matrix_mult.py | 6 - .../test_tiled_nonsquare_tile_matrix_mult.py | 6 +- ...d_nonsquare_tile_matrix_mult_vectorized.py | 12 +- test/npu-xrt/e2e/test_tiled_vec_add.py | 5 +- .../e2e/test_tiled_vec_add_vectorized.py | 4 +- test/npu-xrt/e2e/test_vec_dot.py | 9 +- test/npu-xrt/matrix_transpose/aie2.py | 6 +- test/npu-xrt/nd_memcpy_transforms/aie2.py | 5 +- test/npu-xrt/sync_task_complete_token/aie2.py | 8 +- .../aie2.py | 8 +- test/python/aievec.py | 12 +- test/python/code_region.py | 6 +- test/python/core_ext_kernel.py | 7 +- test/python/npu.py | 19 +-- test/python/trace_utils.py | 5 +- 75 files changed, 322 insertions(+), 629 deletions(-) create mode 100644 python/extras/dialects/ext/scf.py diff --git a/programming_examples/basic/dma_transpose/aie2.py b/programming_examples/basic/dma_transpose/aie2.py index 0562194664..a4852aba62 100644 --- a/programming_examples/basic/dma_transpose/aie2.py +++ b/programming_examples/basic/dma_transpose/aie2.py @@ -10,9 +10,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ N = 4096 M = 64 @@ -45,8 +44,8 @@ def device_body(): # Compute tile 2 @core(ComputeTile2) def core_body(): - for _ in for_(sys.maxsize): - yield_([]) + for _ in range_(sys.maxsize): + pass # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/matrix_multiplication/cascade/aie2.py b/programming_examples/basic/matrix_multiplication/cascade/aie2.py index 51e7f5dddb..fc7aecc45b 100644 --- a/programming_examples/basic/matrix_multiplication/cascade/aie2.py +++ b/programming_examples/basic/matrix_multiplication/cascade/aie2.py @@ -12,7 +12,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ def main(): @@ -277,9 +277,11 @@ def device_body(): @core(core_tiles[row][col], f"mm_{m}x{k}x{n}.o") def core_body(): - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): loop = ( - for_(n_tiles_per_core) if n_tiles_per_core > 1 else range(1) + range_(n_tiles_per_core) + if n_tiles_per_core > 1 + else range(1) ) # Workaround for issue #1547 for _ in loop: if row == 0: @@ -292,7 +294,7 @@ def core_body(): if row == 0: call(zero_scalar, [elem_out]) - for _ in for_(K // k // n_aie_rows): + for _ in range_(K // k // n_aie_rows): elem_in_a = A_l2l1_fifos[row].acquire( ObjectFifoPort.Consume, 1 ) @@ -319,14 +321,9 @@ def core_body(): B_l2l1_fifos[row][col].release( ObjectFifoPort.Consume, 1 ) - yield_([]) if row == 0: C_l1l2_fifos[col].release(ObjectFifoPort.Produce, 1) - yield_([]) - - if n_tiles_per_core > 1: # workaround for issue #1547 - yield_([]) # To/from AIE-array data movement @runtime_sequence( diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py index 5ecc36da04..d518d282f6 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py @@ -9,7 +9,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ def my_matmul(): @@ -144,7 +144,7 @@ def device_body(): # Compute tile i @core(cores[i], f"mv_{m}x{k}.o") def core_body(): - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elem_out = outC_fifos[outC_fifo_names[i]].acquire( ObjectFifoPort.Produce, 1, @@ -154,7 +154,7 @@ def core_body(): else: call(zero_scalar, [elem_out]) - for _ in for_(K_div_k): + for _ in range_(K_div_k): elem_in_a = inA_fifos[inA_fifo_names[i]].acquire( ObjectFifoPort.Consume, 1, @@ -175,13 +175,11 @@ def core_body(): ObjectFifoPort.Consume, 1, ) - yield_([]) outC_fifos[outC_fifo_names[i]].release( ObjectFifoPort.Produce, 1, ) - yield_([]) # To/from AIE-array data movement diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index ddabddbcc5..6c0824e168 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -11,9 +11,9 @@ from aie.extras.context import mlir_mod_ctx from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * import aie.utils.trace as trace_utils from aie.utils.trace import PortEvent +from aie.extras.dialects.ext.scf import _for as range_ def main(): @@ -214,8 +214,8 @@ def device_body(): # Compute tile 2 @core(compute_tile2, f"mm_{m}x{k}x{n}.o") def core_body(): - for _ in for_(0xFFFFFFFF): - for _ in for_(tiles) if tiles > 1 else range(1): # issue #1547 + for _ in range_(0xFFFFFFFF): + for _ in range_(tiles) if tiles > 1 else range(1): # issue #1547 elem_out = memC.acquire(ObjectFifoPort.Produce, 1) if vectorized: call(zero, [elem_out]) @@ -223,7 +223,7 @@ def core_body(): call(zero_scalar, [elem_out]) for _ in ( - for_(K_div_k) if K_div_k > 1 else range(1) + range_(K_div_k) if K_div_k > 1 else range(1) ): # issue #1547 elem_in_a = memA.acquire(ObjectFifoPort.Consume, 1) elem_in_b = memB.acquire(ObjectFifoPort.Consume, 1) @@ -233,13 +233,8 @@ def core_body(): call(matmul_scalar, [elem_in_a, elem_in_b, elem_out]) memA.release(ObjectFifoPort.Consume, 1) memB.release(ObjectFifoPort.Consume, 1) - if K_div_k > 1: - yield_([]) memC.release(ObjectFifoPort.Produce, 1) - if tiles > 1: - yield_([]) - yield_([]) # To/from AIE-array data movement diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 0098acab01..37197a41b1 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -12,7 +12,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ def main(): @@ -299,9 +299,11 @@ def device_body(): @core(core_tiles[row][col], f"mm_{m}x{k}x{n}.o") def core_body(): - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): loop = ( - for_(n_tiles_per_core) if n_tiles_per_core > 1 else range(1) + range_(n_tiles_per_core) + if n_tiles_per_core > 1 + else range(1) ) # Workaround for issue #1547 for _ in loop: elem_out = C_l1l2_fifos[row][col].acquire( @@ -309,7 +311,7 @@ def core_body(): ) call(zero, [elem_out]) - for _ in for_(K // k): + for _ in range_(K // k): elem_in_a = A_l2l1_fifos[row].acquire( ObjectFifoPort.Consume, 1 ) @@ -319,13 +321,8 @@ def core_body(): call(matmul, [elem_in_a, elem_in_b, elem_out]) A_l2l1_fifos[row].release(ObjectFifoPort.Consume, 1) B_l2l1_fifos[col].release(ObjectFifoPort.Consume, 1) - yield_([]) C_l1l2_fifos[row][col].release(ObjectFifoPort.Produce, 1) - yield_([]) - - if n_tiles_per_core > 1: # workaround for issue #1547 - yield_([]) # To/from AIE-array data movement @runtime_sequence( diff --git a/programming_examples/basic/matrix_scalar_add/aie2.py b/programming_examples/basic/matrix_scalar_add/aie2.py index bfdd226186..17afb1e7d1 100644 --- a/programming_examples/basic/matrix_scalar_add/aie2.py +++ b/programming_examples/basic/matrix_scalar_add/aie2.py @@ -8,9 +8,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ import sys @@ -62,17 +62,15 @@ def device_body(): @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1) - for i in for_(TILE_SIZE): + for i in range_(TILE_SIZE): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) of_out1.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement diff --git a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py index 6055f6b580..602aa75652 100644 --- a/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py +++ b/programming_examples/basic/memtile_repeat/distribute_repeat/aie2.py @@ -10,9 +10,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.dialects.ext import arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ dev = AIEDevice.npu1_1col col = 0 @@ -70,32 +70,28 @@ def device_body(): # Compute tile 2 @core(ComputeTile2) def core_body(): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elemOut = of_out2.acquire(ObjectFifoPort.Produce, 1) elemIn = of_in2.acquire(ObjectFifoPort.Consume, 1) - for i in for_(N // 2): + for i in range_(N // 2): v0 = memref.load(elemIn, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elemOut, [i]) - yield_([]) of_in2.release(ObjectFifoPort.Consume, 1) of_out2.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 3 @core(ComputeTile3) def core_body(): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elemOut = of_out3.acquire(ObjectFifoPort.Produce, 1) elemIn = of_in3.acquire(ObjectFifoPort.Consume, 1) - for i in for_(N // 2): + for i in range_(N // 2): v0 = memref.load(elemIn, [i]) v1 = arith.addi(v0, arith.constant(2, T.i32())) memref.store(v1, elemOut, [i]) - yield_([]) of_in3.release(ObjectFifoPort.Consume, 1) of_out3.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement tensor_out_ty = T.memref(out_size, T.i32()) diff --git a/programming_examples/basic/memtile_repeat/simple_repeat/aie2.py b/programming_examples/basic/memtile_repeat/simple_repeat/aie2.py index d1e00a1cd3..e27d73f208 100644 --- a/programming_examples/basic/memtile_repeat/simple_repeat/aie2.py +++ b/programming_examples/basic/memtile_repeat/simple_repeat/aie2.py @@ -10,7 +10,6 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx N = 4096 diff --git a/programming_examples/basic/passthrough_dmas/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py index 7ea797a84d..4bd9266c2c 100644 --- a/programming_examples/basic/passthrough_dmas/aie2.py +++ b/programming_examples/basic/passthrough_dmas/aie2.py @@ -10,9 +10,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ N = 4096 dev = AIEDevice.npu1_1col @@ -54,8 +53,8 @@ def device_body(): # Compute tile 2 @core(ComputeTile2) def core_body(): - for _ in for_(sys.maxsize): - yield_([]) + for _ in range_(sys.maxsize): + pass # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/passthrough_dmas_plio/aie2-input-plio.py b/programming_examples/basic/passthrough_dmas_plio/aie2-input-plio.py index 19d776a772..37c2e340dd 100644 --- a/programming_examples/basic/passthrough_dmas_plio/aie2-input-plio.py +++ b/programming_examples/basic/passthrough_dmas_plio/aie2-input-plio.py @@ -10,9 +10,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ N = 1024 @@ -44,8 +43,8 @@ def device_body(): # Compute tile 2 @core(ComputeTile2) def core_body(): - for _ in for_(sys.maxsize): - yield_([]) + for _ in range_(sys.maxsize): + pass # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/passthrough_dmas_plio/aie2-output-plio.py b/programming_examples/basic/passthrough_dmas_plio/aie2-output-plio.py index 925b86f6da..a3d89fc719 100644 --- a/programming_examples/basic/passthrough_dmas_plio/aie2-output-plio.py +++ b/programming_examples/basic/passthrough_dmas_plio/aie2-output-plio.py @@ -10,9 +10,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ N = 1024 @@ -46,8 +45,8 @@ def device_body(): # Compute tile 2 @core(ComputeTile2) def core_body(): - for _ in for_(sys.maxsize): - yield_([]) + for _ in range_(sys.maxsize): + pass # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index 39ef9106bd..50491765bd 100644 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -10,8 +10,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ import aie.utils.trace as trace_utils @@ -47,13 +47,12 @@ def device_body(): # Compute tile 2 @core(ComputeTile2, "passThrough.cc.o") def core_body(): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) elemIn = of_in.acquire(ObjectFifoPort.Consume, 1) call(passThroughLine, [elemIn, elemOut, lineWidthInBytes]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) # print(ctx.module.operation.verify()) diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py index 814f8c7a6a..927b92a12d 100644 --- a/programming_examples/basic/passthrough_kernel/test.py +++ b/programming_examples/basic/passthrough_kernel/test.py @@ -9,13 +9,9 @@ import numpy as np import pyxrt as xrt import sys -import time from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.context import mlir_mod_ctx -from aie.extras.dialects.ext import memref, arith import aie.utils.test as test_utils diff --git a/programming_examples/basic/row_wise_bias_add/aie2.py b/programming_examples/basic/row_wise_bias_add/aie2.py index dfe8a5ac14..1460d6f71b 100644 --- a/programming_examples/basic/row_wise_bias_add/aie2.py +++ b/programming_examples/basic/row_wise_bias_add/aie2.py @@ -7,9 +7,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ import sys @@ -43,19 +42,16 @@ def device_body(): @core(compute_tile, "kernel.o") def core_body(): - for _ in for_(0xFFFFFFFF): - for _ in for_(N // n): + for _ in range_(0xFFFFFFFF): + for _ in range_(N // n): elem_bias = bias_fifo.acquire(ObjectFifoPort.Consume, 1) - for i in for_(M // m): + for _ in range_(M // m): elem_in = in_fifo.acquire(ObjectFifoPort.Consume, 1) elem_out = out_fifo.acquire(ObjectFifoPort.Produce, 1) call(kernel_func, [elem_in, elem_bias, elem_out]) out_fifo.release(ObjectFifoPort.Produce, 1) in_fifo.release(ObjectFifoPort.Consume, 1) - yield_([]) bias_fifo.release(ObjectFifoPort.Consume, 1) - yield_([]) - yield_([]) @runtime_sequence(complete_in_memref, complete_bias_memref, complete_out_memref) def sequence(inp, bias, out): diff --git a/programming_examples/basic/vector_exp/aie2.py b/programming_examples/basic/vector_exp/aie2.py index f0e8672571..f2b7874bcb 100644 --- a/programming_examples/basic/vector_exp/aie2.py +++ b/programming_examples/basic/vector_exp/aie2.py @@ -10,8 +10,9 @@ from aie.extras.context import mlir_mod_ctx # mlir ctx wrapper from aie.dialects.aiex import * # extended mlir-aie dialect definitions -from aie.dialects.scf import * # scf (strcutred control flow) dialect -from aie.extras.dialects.ext import memref, arith # memref and arithmatic dialects +from aie.extras.dialects.ext.scf import ( + _for as range_, +) # scf (structured control flow) dialect # AI Engine structural design function @@ -91,8 +92,8 @@ def device_body(): # Compute tile i @core(cores[i], "kernels.a") def core_body(): - for _ in for_(0xFFFFFFFF): - for _ in for_(tiles): + for _ in range_(0xFFFFFFFF): + for _ in range_(tiles): elem_out = outC_fifos[outC_fifo_names[i]].acquire( ObjectFifoPort.Produce, 1 ) @@ -106,8 +107,6 @@ def core_body(): outC_fifos[outC_fifo_names[i]].release( ObjectFifoPort.Produce, 1 ) - yield_([]) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.bf16()) diff --git a/programming_examples/basic/vector_reduce_add/aie2.py b/programming_examples/basic/vector_reduce_add/aie2.py index cb0e26a866..652a91eda6 100644 --- a/programming_examples/basic/vector_reduce_add/aie2.py +++ b/programming_examples/basic/vector_reduce_add/aie2.py @@ -10,9 +10,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx -from aie.extras.dialects.ext import memref, arith +from aie.extras.dialects.ext.scf import _for as range_ import sys @@ -55,13 +54,12 @@ def device_body(): # Compute tile 2 @core(ComputeTile2, "reduce_add.cc.o") def core_body(): - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) call(reduce_add_vector, [elem_in, elem_out, N]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/vector_reduce_max/aie2.py b/programming_examples/basic/vector_reduce_max/aie2.py index bc5f30e34c..451f79697d 100644 --- a/programming_examples/basic/vector_reduce_max/aie2.py +++ b/programming_examples/basic/vector_reduce_max/aie2.py @@ -10,9 +10,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx -from aie.extras.dialects.ext import memref, arith +from aie.extras.dialects.ext.scf import _for as range_ import sys @@ -55,13 +54,12 @@ def device_body(): # Compute tile 2 @core(ComputeTile2, "reduce_max.cc.o") def core_body(): - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) call(reduce_max_vector, [elem_in, elem_out, N]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/vector_reduce_min/aie2.py b/programming_examples/basic/vector_reduce_min/aie2.py index c39b9f5ae9..23b4600899 100644 --- a/programming_examples/basic/vector_reduce_min/aie2.py +++ b/programming_examples/basic/vector_reduce_min/aie2.py @@ -10,9 +10,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx -from aie.extras.dialects.ext import memref, arith +from aie.extras.dialects.ext.scf import _for as range_ import sys @@ -55,13 +54,12 @@ def device_body(): # Compute tile 2 @core(ComputeTile2, "reduce_min.cc.o") def core_body(): - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) call(reduce_min_vector, [elem_in, elem_out, N]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/vector_scalar_add/aie2.py b/programming_examples/basic/vector_scalar_add/aie2.py index 754f38c584..b2daaa2575 100644 --- a/programming_examples/basic/vector_scalar_add/aie2.py +++ b/programming_examples/basic/vector_scalar_add/aie2.py @@ -8,9 +8,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ import sys @@ -47,17 +47,15 @@ def device_body(): @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1) - for i in for_(AIE_TILE_WIDTH): + for i in range_(AIE_TILE_WIDTH): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) of_out1.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(PROBLEM_SIZE, T.i32()) diff --git a/programming_examples/basic/vector_scalar_add_runlist/aie2.py b/programming_examples/basic/vector_scalar_add_runlist/aie2.py index 754f38c584..b2daaa2575 100644 --- a/programming_examples/basic/vector_scalar_add_runlist/aie2.py +++ b/programming_examples/basic/vector_scalar_add_runlist/aie2.py @@ -8,9 +8,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ import sys @@ -47,17 +47,15 @@ def device_body(): @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1) - for i in for_(AIE_TILE_WIDTH): + for i in range_(AIE_TILE_WIDTH): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) of_out1.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(PROBLEM_SIZE, T.i32()) diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py index dd02a1010c..0acdd531db 100644 --- a/programming_examples/basic/vector_scalar_mul/aie2.py +++ b/programming_examples/basic/vector_scalar_mul/aie2.py @@ -10,8 +10,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ import aie.utils.trace as trace_utils @@ -63,10 +63,10 @@ def device_body(): @core(ComputeTile2, "scale.o") def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1) # Number of sub-vector "tile" iterations - for _ in for_(N_div_n): + for _ in range_(N_div_n): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) if vectorized: @@ -75,9 +75,7 @@ def core_body(): call(scale_scalar, [elem_in, elem_out, elem_factor, n]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) of_factor.release(ObjectFifoPort.Consume, 1) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.i16()) diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py index 87f2f9fffe..a0ded07998 100644 --- a/programming_examples/basic/vector_scalar_mul/test.py +++ b/programming_examples/basic/vector_scalar_mul/test.py @@ -9,13 +9,9 @@ import numpy as np import pyxrt as xrt import sys -import time from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.context import mlir_mod_ctx -from aie.extras.dialects.ext import memref, arith import aie.utils.test as test_utils import aie.utils.trace as trace_utils diff --git a/programming_examples/basic/vector_vector_add/aie2.py b/programming_examples/basic/vector_vector_add/aie2.py index 2a3595a754..d65759fa6d 100644 --- a/programming_examples/basic/vector_vector_add/aie2.py +++ b/programming_examples/basic/vector_vector_add/aie2.py @@ -10,9 +10,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx from aie.extras.dialects.ext import memref, arith +from aie.extras.dialects.ext.scf import _for as range_ import sys @@ -55,23 +55,20 @@ def device_body(): @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # Number of sub-vector "tile" iterations - for _ in for_(N_div_n): + for _ in range_(N_div_n): elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1) elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) - for i in for_(n): + for i in range_(n): v0 = memref.load(elem_in1, [i]) v1 = memref.load(elem_in2, [i]) v2 = arith.addi(v0, v1) memref.store(v2, elem_out, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) of_in2.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/vector_vector_modulo/aie2.py b/programming_examples/basic/vector_vector_modulo/aie2.py index eb3e8f8d03..98437759e3 100644 --- a/programming_examples/basic/vector_vector_modulo/aie2.py +++ b/programming_examples/basic/vector_vector_modulo/aie2.py @@ -10,11 +10,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx from aie.extras.dialects.ext import memref, arith - -import sys +from aie.extras.dialects.ext.scf import _for as range_ def my_vector_add(): @@ -55,23 +53,20 @@ def device_body(): @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # Number of sub-vector "tile" iterations - for _ in for_(N_div_n): + for _ in range_(N_div_n): elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1) elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) - for i in for_(n): + for i in range_(n): v0 = memref.load(elem_in1, [i]) v1 = memref.load(elem_in2, [i]) v2 = arith.remsi(v0, v1) memref.store(v2, elem_out, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) of_in2.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/basic/vector_vector_mul/aie2.py b/programming_examples/basic/vector_vector_mul/aie2.py index 414d62fa26..a9dc51340e 100644 --- a/programming_examples/basic/vector_vector_mul/aie2.py +++ b/programming_examples/basic/vector_vector_mul/aie2.py @@ -10,11 +10,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx from aie.extras.dialects.ext import memref, arith - -import sys +from aie.extras.dialects.ext.scf import _for as range_ def my_vector_mul(): @@ -55,23 +53,20 @@ def device_body(): @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # Number of sub-vector "tile" iterations - for _ in for_(N_div_n): + for _ in range_(N_div_n): elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1) elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) - for i in for_(n): + for i in range_(n): v0 = memref.load(elem_in1, [i]) v1 = memref.load(elem_in2, [i]) v2 = arith.muli(v0, v1) memref.store(v2, elem_out, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) of_in2.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py index 1899f52206..966ae683c5 100644 --- a/programming_examples/ml/bottleneck/aie2.py +++ b/programming_examples/ml/bottleneck/aie2.py @@ -5,14 +5,13 @@ # # Copyright (C) 2024, Advanced Micro Devices, Inc. +import sys + from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.extras.dialects.ext import memref, arith -from aie.dialects.scf import * +from aie.extras.dialects.ext import memref from aie.extras.context import mlir_mod_ctx -from aie.ir import MemRefType, TypeAttr - -import sys +from aie.extras.dialects.ext.scf import _for as range_ # tracing definitions trace_sz_in_bytes = 8192 @@ -47,68 +46,27 @@ def deviceBody(): int16_ty = IntegerType.get_signless(16) int32_ty = IntegerType.get_signless(32) - tensorLayer1In_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorL1InC, - ), + tensorLayer1In_ty = T.memref( + tensorInW, + 1, + tensorL1InC, int8_ty, ) - weightsLayer1_ty = MemRefType.get((tensorL1InC * tensorL1OutC,), int8_ty) - tensorLayer1Out_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorL1OutC, - ), - uint8_ty, - ) + weightsLayer1_ty = T.memref(tensorL1InC * tensorL1OutC, int8_ty) + tensorLayer1Out_ty = T.memref(tensorInW, 1, tensorL1OutC, uint8_ty) - tensorLayer2In_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorL2InC, - ), - uint8_ty, - ) - weightsLayer2_ty = MemRefType.get( - (3 * 3 * tensorL2InC * tensorL2OutC,), int8_ty - ) - tensorLayer2Out_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorL2OutC // 2, - ), - uint8_ty, - ) + tensorLayer2In_ty = T.memref(tensorInW, 1, tensorL2InC, uint8_ty) + weightsLayer2_ty = T.memref(3 * 3 * tensorL2InC * tensorL2OutC, int8_ty) + tensorLayer2Out_ty = T.memref(tensorInW, 1, tensorL2OutC // 2, uint8_ty) - tensorLayer3In_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorL3InC // 2, - ), - uint8_ty, - ) - weightsLayer3_ty = MemRefType.get((tensorL3InC * tensorL3OutC,), int8_ty) - tensorLayer3Out_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorL3OutC, - ), - uint8_ty, - ) + tensorLayer3In_ty = T.memref(tensorInW, 1, tensorL3InC // 2, uint8_ty) + weightsLayer3_ty = T.memref(tensorL3InC * tensorL3OutC, int8_ty) + tensorLayer3Out_ty = T.memref(tensorInW, 1, tensorL3OutC, uint8_ty) - allWeights_ty = MemRefType.get( - ( - tensorL1InC * tensorL1OutC - + 3 * 3 * tensorL2InC * tensorL2OutC - + tensorL3InC * tensorL3OutC, - ), + allWeights_ty = T.memref( + tensorL1InC * tensorL1OutC + + 3 * 3 * tensorL2InC * tensorL2OutC + + tensorL3InC * tensorL3OutC, int8_ty, ) @@ -239,12 +197,12 @@ def deviceBody(): # 1x1 conv2d @core(ComputeTile2, "conv2dk1.o") def core_body(): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # acquire weights once element0Weights = of_wts_buf_00.acquire(ObjectFifoPort.Consume, 1) scale = memref.load(rtpComputeTile2, [0]) - for _ in for_(tensorInH): + for _ in range_(tensorInH): element0ActivactionsIn = of_inOF_act_L3L2.acquire( ObjectFifoPort.Consume, 1 ) @@ -267,15 +225,14 @@ def core_body(): objectfifo_release(ObjectFifoPort.Consume, "inOF_act_L3L2", 1) objectfifo_release(ObjectFifoPort.Produce, "act_2_3_5", 1) - yield_([]) + objectfifo_release(ObjectFifoPort.Consume, "wts_buf_00", 1) - yield_([]) # 3x3 conv2d OFM 0-31 @core(ComputeTile3, "conv2dk3.o") def core_body(): scale = 11 - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # acquire weights and rtps once element0Weights = wts_buf_01.acquire(ObjectFifoPort.Consume, 1) @@ -307,7 +264,7 @@ def core_body(): objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) # middle - for _ in for_(tensorInH - 2): + for _ in range_(tensorInH - 2): elementActivactionsIn = of_act_2_3_5.acquire( ObjectFifoPort.Consume, 3 ) @@ -335,7 +292,6 @@ def core_body(): objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 1) objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) - yield_([]) # last part elementActivactionsIn = of_act_2_3_5.acquire( @@ -365,13 +321,12 @@ def core_body(): objectfifo_release(ObjectFifoPort.Produce, "act_3_4", 1) objectfifo_release(ObjectFifoPort.Consume, "wts_buf_01", 1) - yield_([]) # 3x3 conv2d OFM 32-63 @core(ComputeTile5, "conv2dk3.o") def core_body(): scale = 11 - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # acquire weights and rtps once element0Weights = wts_buf_01.acquire(ObjectFifoPort.Consume, 1) @@ -404,7 +359,7 @@ def core_body(): objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) # middle - for _ in for_(tensorInH - 2): + for _ in range_(tensorInH - 2): elementActivactionsIn = of_act_2_3_5.acquire( ObjectFifoPort.Consume, 3 ) @@ -432,7 +387,6 @@ def core_body(): objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 1) objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) - yield_([]) # last part elementActivactionsIn = of_act_2_3_5.acquire( @@ -460,19 +414,18 @@ def core_body(): objectfifo_release(ObjectFifoPort.Consume, "act_2_3_5", 2) objectfifo_release(ObjectFifoPort.Produce, "act_5_4", 1) objectfifo_release(ObjectFifoPort.Consume, "wts_buf_01", 1) - yield_([]) # # 1x1 conv2d and add skip @core(ComputeTile4, "conv2dk1_skip.o") def core_body(): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # acquire weights and rtps once element0Weights = wts_buf_02.acquire(ObjectFifoPort.Consume, 1) scale = memref.load(rtpComputeTile4, [0]) skipScale = memref.load(rtpComputeTile4, [1]) - for _ in for_(tensorInH): + for _ in range_(tensorInH): element0ActivactionsIn = act_3_4.acquire( ObjectFifoPort.Consume, 1 ) @@ -503,9 +456,7 @@ def core_body(): objectfifo_release(ObjectFifoPort.Consume, "act_3_4", 1) objectfifo_release(ObjectFifoPort.Consume, "act_5_4", 1) objectfifo_release(ObjectFifoPort.Consume, "skip_buf", 1) - yield_([]) objectfifo_release(ObjectFifoPort.Consume, "wts_buf_02", 1) - yield_([]) # instruction stream generation activationsIn = tensorInW * tensorInH * tensorInC @@ -516,8 +467,8 @@ def core_body(): + tensorL3InC * tensorL3OutC ) - activationsInL3_ty = MemRefType.get((activationsIn,), int8_ty) - weightsInL3_ty = MemRefType.get((totalWeights,), uint8_ty) + activationsInL3_ty = T.memref(activationsIn, int8_ty) + weightsInL3_ty = T.memref(totalWeights, uint8_ty) @runtime_sequence(activationsInL3_ty, weightsInL3_ty, activationsInL3_ty) def sequence(inputFromL3, weightsFromL3, outputToL3): diff --git a/programming_examples/ml/conv2d/aie2.py b/programming_examples/ml/conv2d/aie2.py index baaee53f5f..9fa7a0c4cd 100644 --- a/programming_examples/ml/conv2d/aie2.py +++ b/programming_examples/ml/conv2d/aie2.py @@ -9,9 +9,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ width = 32 height = 32 @@ -106,13 +106,13 @@ def core_body(): ci = 64 co = 64 - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elemWts = of_inOF_wts_0_L3L2.acquire(ObjectFifoPort.Consume, 1) scale = memref.load(rtp2, [0]) # scale = memref.load(rtpComputeTile2, [0]) - for _ in for_(y_dim): + for _ in range_(y_dim): elemIn = of_act_L2_02.acquire(ObjectFifoPort.Consume, 1) elemOut0 = of_out_02_L2.acquire(ObjectFifoPort.Produce, 1) @@ -131,9 +131,7 @@ def core_body(): objectfifo_release(ObjectFifoPort.Consume, "act_L2_02", 1) objectfifo_release(ObjectFifoPort.Produce, "out_02_L2", 1) - yield_([]) objectfifo_release(ObjectFifoPort.Consume, "inOF_wts_0_L3L2", 1) - yield_([]) # To/from AIE-array data movement diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py index 9aa7daac1b..b18f5030ee 100644 --- a/programming_examples/ml/conv2d_fused_relu/aie2.py +++ b/programming_examples/ml/conv2d_fused_relu/aie2.py @@ -9,9 +9,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ width = 32 height = 32 @@ -113,13 +113,13 @@ def core_body(): ci = 64 co = 64 - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elemWts = of_inOF_wts_0_L3L2.acquire(ObjectFifoPort.Consume, 1) scale = memref.load(rtp2, [0]) # scale = memref.load(rtpComputeTile2, [0]) - for _ in for_(y_dim): + for _ in range_(y_dim): elemIn = of_act_L2_02.acquire(ObjectFifoPort.Consume, 1) elemOut0 = of_out_02_L2.acquire(ObjectFifoPort.Produce, 1) @@ -138,9 +138,7 @@ def core_body(): objectfifo_release(ObjectFifoPort.Consume, "act_L2_02", 1) objectfifo_release(ObjectFifoPort.Produce, "out_02_L2", 1) - yield_([]) objectfifo_release(ObjectFifoPort.Consume, "inOF_wts_0_L3L2", 1) - yield_([]) # To/from AIE-array data movement diff --git a/programming_examples/ml/eltwise_add/aie2.py b/programming_examples/ml/eltwise_add/aie2.py index e0a7f4e5b8..ca4e1cae2b 100644 --- a/programming_examples/ml/eltwise_add/aie2.py +++ b/programming_examples/ml/eltwise_add/aie2.py @@ -9,8 +9,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ import aie.utils.trace as trace_utils @@ -118,8 +118,8 @@ def device_body(): # Compute tile i @core(cores[i], "add.o") def core_body(): - for _ in for_(0xFFFFFFFF): - for _ in for_(tiles): + for _ in range_(0xFFFFFFFF): + for _ in range_(tiles): elem_out = outC_fifos[outC_fifo_names[i]].acquire( ObjectFifoPort.Produce, 1 ) @@ -139,8 +139,6 @@ def core_body(): outC_fifos[outC_fifo_names[i]].release( ObjectFifoPort.Produce, 1 ) - yield_([]) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.bf16()) diff --git a/programming_examples/ml/eltwise_mul/aie2.py b/programming_examples/ml/eltwise_mul/aie2.py index f0420abf60..1e0112f3e6 100644 --- a/programming_examples/ml/eltwise_mul/aie2.py +++ b/programming_examples/ml/eltwise_mul/aie2.py @@ -9,8 +9,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ import aie.utils.trace as trace_utils @@ -119,8 +119,8 @@ def device_body(): # Compute tile i @core(cores[i], "mul.o") def core_body(): - for _ in for_(0xFFFFFFFF): - for _ in for_(tiles): + for _ in range_(0xFFFFFFFF): + for _ in range_(tiles): elem_out = outC_fifos[outC_fifo_names[i]].acquire( ObjectFifoPort.Produce, 1 ) @@ -140,8 +140,6 @@ def core_body(): outC_fifos[outC_fifo_names[i]].release( ObjectFifoPort.Produce, 1 ) - yield_([]) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.bf16()) diff --git a/programming_examples/ml/relu/aie2.py b/programming_examples/ml/relu/aie2.py index 3e81173320..d9c2d48627 100644 --- a/programming_examples/ml/relu/aie2.py +++ b/programming_examples/ml/relu/aie2.py @@ -9,8 +9,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ import aie.utils.trace as trace_utils @@ -95,8 +95,8 @@ def device_body(): # Compute tile i @core(cores[i], "relu.o") def core_body(): - for _ in for_(0xFFFFFFFF): - for _ in for_(tiles): + for _ in range_(0xFFFFFFFF): + for _ in range_(tiles): elem_out = outC_fifos[outC_fifo_names[i]].acquire( ObjectFifoPort.Produce, 1 ) @@ -110,8 +110,6 @@ def core_body(): outC_fifos[outC_fifo_names[i]].release( ObjectFifoPort.Produce, 1 ) - yield_([]) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.bf16()) diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 6d96eb79ff..e497e47b83 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -7,11 +7,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.dialects.ext import memref, arith -from aie.dialects.scf import for_, yield_ +from aie.extras.dialects.ext import memref from aie.extras.context import mlir_mod_ctx -from aie.ir import MemRefType, TypeAttr +from aie.extras.dialects.ext.scf import _for as range_ import sys @@ -42,98 +40,37 @@ def deviceBody(): int8_ty = IntegerType.get_signless(8) int32_ty = IntegerType.get_signless(32) - tensorLayer1In_ty_init = MemRefType.get( - ( - tensorInW, - 1, - tensorInCInit, - ), - int8_ty, - ) - tensorLayer1In_ty_rest = MemRefType.get( - ( - tensorInW, - 1, - tensorInCRest, - ), - uint8_ty, - ) - weightsLayer1_ty_init = MemRefType.get( - (tensorInCInit * tensorInCInit,), int8_ty - ) - weightsLayer1_ty_rest = MemRefType.get( - (tensorInCRest * tensorInCInit,), int8_ty - ) + tensorLayer1In_ty_init = T.memref(tensorInW, 1, tensorInCInit, int8_ty) + tensorLayer1In_ty_rest = T.memref(tensorInW, 1, tensorInCRest, uint8_ty) + weightsLayer1_ty_init = T.memref(tensorInCInit * tensorInCInit, int8_ty) + weightsLayer1_ty_rest = T.memref(tensorInCRest * tensorInCInit, int8_ty) - tensorLayer1Out_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorInCInit, - ), - uint8_ty, - ) + tensorLayer1Out_ty = T.memref(tensorInW, 1, tensorInCInit, uint8_ty) - tensorLayer2In_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorInCInit, - ), - uint8_ty, - ) - weightsLayer2_ty = MemRefType.get( - (3 * 3 * tensorInCInit * tensorInCInit,), int8_ty - ) - tensorLayer2Out_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorInCInit // 2, - ), - uint8_ty, - ) + tensorLayer2In_ty = T.memref(tensorInW, 1, tensorInCInit, uint8_ty) + weightsLayer2_ty = T.memref(3 * 3 * tensorInCInit * tensorInCInit, int8_ty) + tensorLayer2Out_ty = T.memref(tensorInW, 1, tensorInCInit // 2, uint8_ty) - tensorLayer3In_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorInCInit // 2, - ), - uint8_ty, - ) - weightsLayer3_ty_init = MemRefType.get( - (2 * tensorInCInit * tensorInCRest,), int8_ty - ) - weightsLayer3_ty_rest = MemRefType.get( - (tensorInCRest // 4 * tensorInCRest,), int8_ty + tensorLayer3In_ty = T.memref(tensorInW, 1, tensorInCInit // 2, uint8_ty) + weightsLayer3_ty_init = T.memref(2 * tensorInCInit * tensorInCRest, int8_ty) + weightsLayer3_ty_rest = T.memref( + tensorInCRest // 4 * tensorInCRest, int8_ty ) - tensorLayer3Out_ty = MemRefType.get( - ( - tensorInW, - 1, - tensorInCRest, - ), - uint8_ty, - ) + tensorLayer3Out_ty = T.memref(tensorInW, 1, tensorInCRest, uint8_ty) - allWeights_ty_init = MemRefType.get( - ( - tensorInCInit * tensorInCInit - + 3 * 3 * tensorInCInit * tensorInCInit - + tensorInCInit * tensorInCRest - + tensorInCInit * tensorInCRest, - ), + allWeights_ty_init = T.memref( + tensorInCInit * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest + + tensorInCInit * tensorInCRest, int8_ty, ) - allWeights_ty_rest = MemRefType.get( - ( - tensorInCRest * tensorInCInit - + 3 * 3 * tensorInCInit * tensorInCInit - + tensorInCInit * tensorInCRest, - ), + allWeights_ty_rest = T.memref( + tensorInCRest * tensorInCInit + + 3 * 3 * tensorInCInit * tensorInCInit + + tensorInCInit * tensorInCRest, int8_ty, ) @@ -528,14 +465,14 @@ def deviceBody(): @core(cores[i][0], conv1_kernels[i]) def core_body(): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # acquire weights once element0Weights = wts_sub_fifos[ wts_sub_fifo_names[i][0] ].acquire(ObjectFifoPort.Consume, 1) scale = memref.load(rtp[i][0], [0]) - for _ in for_(tensorInH): + for _ in range_(tensorInH): element0ActivactionsIn = act1_fifos[ act1_fifo_names[i] ].acquire(ObjectFifoPort.Consume, 1) @@ -576,11 +513,9 @@ def core_body(): objectfifo_release( ObjectFifoPort.Produce, act2_fifo_names[i], 1 ) - yield_([]) objectfifo_release( ObjectFifoPort.Consume, wts_sub_fifo_names[i][0], 1 ) - yield_([]) # 3x3 conv2d OFM 0-31 for i in range(n_cols): @@ -588,7 +523,7 @@ def core_body(): @core(cores[i][1], "conv2dk3.o") def core_body(): scale = 1 - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # acquire weights and rtps once element0Weights = wts_sub_fifos[ @@ -626,7 +561,7 @@ def core_body(): ) # middle - for _ in for_(tensorInH - 2): + for _ in range_(tensorInH - 2): elementActivactionsIn = act2_fifos[ act2_fifo_names[i] ].acquire(ObjectFifoPort.Consume, 3) @@ -658,7 +593,6 @@ def core_body(): objectfifo_release( ObjectFifoPort.Produce, act3_fifo_names_1[i], 1 ) - yield_([]) # last part elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( @@ -696,7 +630,6 @@ def core_body(): objectfifo_release( ObjectFifoPort.Consume, wts_sub_fifo_names[i][1], 1 ) - yield_([]) # 3x3 conv2d OFM 32-63 @@ -705,7 +638,7 @@ def core_body(): @core(cores[i][3], "conv2dk3.o") def core_body(): scale = 1 - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # acquire weights and rtps once element0Weights = wts_sub_fifos[ @@ -744,7 +677,7 @@ def core_body(): ) # middle - for _ in for_(tensorInH - 2): + for _ in range_(tensorInH - 2): elementActivactionsIn = act2_fifos[ act2_fifo_names[i] ].acquire(ObjectFifoPort.Consume, 3) @@ -776,7 +709,6 @@ def core_body(): objectfifo_release( ObjectFifoPort.Produce, act3_fifo_names_2[i], 1 ) - yield_([]) # last part elementActivactionsIn = act2_fifos[act2_fifo_names[i]].acquire( @@ -812,14 +744,13 @@ def core_body(): objectfifo_release( ObjectFifoPort.Consume, wts_sub_fifo_names[i][1], 1 ) - yield_([]) # # 1x1 conv2d and add skip for i in range(n_cols): @core(cores[i][2], conv3_kernels[i]) def core_body(): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): # acquire weights and rtps once element0Weights = wts_sub_fifos[ @@ -833,7 +764,7 @@ def core_body(): scale = memref.load(rtp[i][2], [0]) skipScale = memref.load(rtp[i][2], [1]) - for _ in for_(tensorInH): + for _ in range_(tensorInH): element0ActivactionsIn = act3_fifo_1[ act3_fifo_names_1[i] ].acquire(ObjectFifoPort.Consume, 1) @@ -894,11 +825,9 @@ def core_body(): objectfifo_release( ObjectFifoPort.Consume, skip_fifo_names[i], 1 ) - yield_([]) objectfifo_release( ObjectFifoPort.Consume, wts_sub_fifo_names[i][2], 1 ) - yield_([]) # instruction stream generation activationsIn = tensorInW * tensorInH * tensorInCInit @@ -918,12 +847,12 @@ def core_body(): totalWeights_complete = totalWeights_init + repeat * totalWeights_rest - activationsInL3_ty = MemRefType.get((activationsIn,), int8_ty) - activationsOutL3_ty = MemRefType.get((acitivationsOut,), int8_ty) - weightsInL3_ty_init = MemRefType.get((totalWeights_init,), int8_ty) - weightsInL3_ty_rest = MemRefType.get((totalWeights_rest,), int8_ty) + activationsInL3_ty = T.memref(activationsIn, int8_ty) + activationsOutL3_ty = T.memref(acitivationsOut, int8_ty) + weightsInL3_ty_init = T.memref(totalWeights_init, int8_ty) + weightsInL3_ty_rest = T.memref(totalWeights_rest, int8_ty) - weightsInL3_ty_complete = MemRefType.get((totalWeights_complete,), int8_ty) + weightsInL3_ty_complete = T.memref(totalWeights_complete, int8_ty) @runtime_sequence( activationsInL3_ty, weightsInL3_ty_complete, activationsOutL3_ty diff --git a/programming_examples/ml/softmax/aie2.py b/programming_examples/ml/softmax/aie2.py index 787988651c..dbec68f5dc 100755 --- a/programming_examples/ml/softmax/aie2.py +++ b/programming_examples/ml/softmax/aie2.py @@ -9,9 +9,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.dialects.ext import func from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ import aie.utils.trace as trace_utils @@ -98,8 +97,8 @@ def device_body(): # Compute tile i @core(cores[i], "kernels.a") def core_body(): - for _ in for_(0xFFFFFFFF): - for _ in for_(tiles): + for _ in range_(0xFFFFFFFF): + for _ in range_(tiles): elem_out = outC_fifos[outC_fifo_names[i]].acquire( ObjectFifoPort.Produce, 1 ) @@ -113,8 +112,6 @@ def core_body(): outC_fifos[outC_fifo_names[i]].release( ObjectFifoPort.Produce, 1 ) - yield_([]) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(N, T.bf16()) diff --git a/programming_examples/vision/color_detect/aie2_colorDetect.py b/programming_examples/vision/color_detect/aie2_colorDetect.py index 026aa0a00f..ec490e52c4 100644 --- a/programming_examples/vision/color_detect/aie2_colorDetect.py +++ b/programming_examples/vision/color_detect/aie2_colorDetect.py @@ -10,9 +10,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * from aie.extras.dialects.ext import arith -from aie.dialects.scf import yield_, for_ as range_ from aie.extras.context import mlir_mod_ctx from aie.ir import MemRefType, TypeAttr +from aie.extras.dialects.ext.scf import _for as range_ width = 64 height = 36 @@ -111,7 +111,6 @@ def coreBody(): call(rgba2hueLine, [elemIn, elemOut, arith.constant(lineWidth)]) inOF_L3L2.release(ObjectFifoPort.Consume, 1) OF_2to34.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 3 @core(ComputeTile3, "threshold.cc.o") @@ -152,7 +151,6 @@ def coreBody(): ) OF_3to3.release(ObjectFifoPort.Consume, 1) OF_3to5.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 4 @core(ComputeTile4, "threshold.cc.o") @@ -193,7 +191,6 @@ def coreBody(): ) OF_4to4.release(ObjectFifoPort.Consume, 1) OF_4to5.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 5 @core(ComputeTile5, "combined_bitwiseOR_gray2rgba_bitwiseAND.a") @@ -235,7 +232,6 @@ def coreBody(): OF_5to5b.release(ObjectFifoPort.Consume, 1) inOF_L2L1.release(ObjectFifoPort.Consume, 1) outOF_L1L2.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py index 1e58735aeb..619bb2e0d1 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py @@ -9,9 +9,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ width = 512 height = 9 @@ -109,8 +109,7 @@ def device_body(): # Compute tile 2 @core(ComputeTile2, "threshold.cc.o") def core_body(): - # for _ in for_(4096): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1) elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1) @@ -140,13 +139,11 @@ def core_body(): inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1) outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 3 @core(ComputeTile3, "threshold.cc.o") def core_body(): - # for _ in for_(4096): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1) elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1) # RTPs written from the instruction stream must be read right before the kernel @@ -175,13 +172,11 @@ def core_body(): inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1) outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 4 @core(ComputeTile4, "threshold.cc.o") def core_body(): - # for _ in for_(4096): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1) elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1) @@ -211,13 +206,11 @@ def core_body(): inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1) outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 5 @core(ComputeTile5, "threshold.cc.o") def core_body(): - # for _ in for_(4096): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1) elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1) @@ -247,7 +240,6 @@ def core_body(): inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1) outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py index 933f5404f7..de1e9d44c3 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py @@ -9,9 +9,9 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ width = 64 height = 36 @@ -143,8 +143,7 @@ def device_body(): # Compute tile 2 @core(ComputeTile2, "rgba2gray.cc.o") def core_body(): - for _ in for_(4294967295): - # for _ in for_(36): + for _ in range_(sys.maxsize): elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1) elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1) @@ -152,7 +151,6 @@ def core_body(): inOF_L3L2.release(ObjectFifoPort.Consume, 1) OF_2to3.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 3 @core(ComputeTile3, "filter2d.cc.o") @@ -171,7 +169,7 @@ def core_body(): memref.store(v1, kernel, [2, 1]) memref.store(v0, kernel, [2, 2]) - for _ in for_(4294967295): + for _ in range_(sys.maxsize): # Preamble : Top Border elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2) elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) @@ -189,7 +187,7 @@ def core_body(): OF_3to4.release(ObjectFifoPort.Produce, 1) # Steady State : Middle - for _ in for_(1, heightMinus1): + for _ in range_(1, heightMinus1): elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3) elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) call( @@ -205,7 +203,6 @@ def core_body(): ) OF_2to3.release(ObjectFifoPort.Consume, 1) OF_3to4.release(ObjectFifoPort.Produce, 1) - yield_([]) # Postamble : Bottom Border elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2) @@ -223,7 +220,6 @@ def core_body(): ) OF_2to3.release(ObjectFifoPort.Consume, 2) OF_3to4.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 4 @core(ComputeTile4, "threshold.cc.o") @@ -232,7 +228,7 @@ def core_body(): v_max = arith.constant(255, T.i16()) v_typ = arith.constant(0, T.i8()) - for _ in for_(4294967295): + for _ in range_(sys.maxsize): elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1) elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1) @@ -250,12 +246,11 @@ def core_body(): OF_3to4.release(ObjectFifoPort.Consume, 1) OF_4to5.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 5 @core(ComputeTile5, "combined_gray2rgba_addWeighted.a") def core_body(): - for _ in for_(4294967295): + for _ in range_(sys.maxsize): elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1) elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1) @@ -288,7 +283,6 @@ def core_body(): OF_5to5.release(ObjectFifoPort.Consume, 1) inOF_L2L1.release(ObjectFifoPort.Consume, 1) outOF_L1L2.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement diff --git a/programming_examples/vision/vision_passthrough/aie2.py b/programming_examples/vision/vision_passthrough/aie2.py index abadaa785e..76311159c5 100644 --- a/programming_examples/vision/vision_passthrough/aie2.py +++ b/programming_examples/vision/vision_passthrough/aie2.py @@ -9,8 +9,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext.scf import _for as range_ width = 512 # 1920 // 8 height = 9 # 1080 // 8 @@ -54,15 +54,13 @@ def device_body(): # Compute tile 2 @core(ComputeTile2, "passThrough.cc.o") def core_body(): - for _ in for_(sys.maxsize): - for _ in for_(height): + for _ in range_(sys.maxsize): + for _ in range_(height): elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) elemIn = of_in.acquire(ObjectFifoPort.Consume, 1) call(passThroughLine, [elemIn, elemOut, width]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) - yield_([]) # print(ctx.module.operation.verify()) diff --git a/programming_guide/section-2/section-2a/README.md b/programming_guide/section-2/section-2a/README.md index 2f254e5e69..4ce509da23 100644 --- a/programming_guide/section-2/section-2a/README.md +++ b/programming_guide/section-2/section-2a/README.md @@ -89,7 +89,6 @@ def core_body(): elem0 = of0.acquire(ObjectFifoPort.Produce, 1) call(test_func, [elem0]) of0.release(ObjectFifoPort.Produce, 1) - yield_([]) @core(B) def core_body(): @@ -123,7 +122,6 @@ def core_body(): elem1 = of0.acquire(ObjectFifoPort.Consume, 1) call(test_func2, [elem1]) of0.release(ObjectFifoPort.Consume, 1) - yield_([]) ``` ### Specifying the Object FIFO Depth as an Array @@ -148,7 +146,6 @@ def core_body(): elem0 = of0.acquire(ObjectFifoPort.Produce, 1) call(produce_func, [elem0]) of0.release(ObjectFifoPort.Produce, 1) - yield_([]) @core(B) def core_body(): @@ -156,7 +153,6 @@ def core_body(): elems = of0.acquire(ObjectFifoPort.Consume, 2) call(consume_func, [elems[0], elems[1]]) of0.release(ObjectFifoPort.Consume, 2) - yield_([]) ``` Each iteration: * producer A acquires one object to produce into, calls the kernel function `produce_func` to store new data in it for B to consume, and releases the object, diff --git a/programming_guide/section-2/section-2d/README.md b/programming_guide/section-2/section-2d/README.md index b7fe51f9be..fd9c7d1ded 100644 --- a/programming_guide/section-2/section-2d/README.md +++ b/programming_guide/section-2/section-2d/README.md @@ -112,17 +112,15 @@ The core of this simple design acquires one object of each Object FIFO, adds `1` @core(ComputeTile) def core_body(): # Effective while(1) - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1) - for i in for_(data_size): + for i in range_(data_size): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in0.release(ObjectFifoPort.Consume, 1) of_out0.release(ObjectFifoPort.Produce, 1) - yield_([]) ``` Once again we apply the same logic and use a `for`-loop over our three cores to write the code which will be executed on the three compute tiles. Each tile will index the `inX_fifos` and `outX_fifos` maps to retrieve the Object FIFOs it will acquire and release from. This process results in the following code: ```python @@ -130,25 +128,23 @@ for i in range(n_cores): # Compute tile i @core(ComputeTiles[i]) def core_body(): - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elem_in = inX_fifos[inX_fifo_names[i]].acquire( ObjectFifoPort.Consume, 1 ) elem_out = outX_fifos[outX_fifo_names[i]].acquire( ObjectFifoPort.Produce, 1 ) - for i in for_(tile_size): + for i in range_(tile_size): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) inX_fifos[inX_fifo_names[i]].release( ObjectFifoPort.Consume, 1 ) outX_fifos[outX_fifo_names[i]].release( ObjectFifoPort.Produce, 1 ) - yield_([]) ``` ----- diff --git a/programming_guide/section-2/section-2d/aie2.py b/programming_guide/section-2/section-2d/aie2.py index 5aefaaf543..c624880098 100644 --- a/programming_guide/section-2/section-2d/aie2.py +++ b/programming_guide/section-2/section-2d/aie2.py @@ -10,7 +10,9 @@ from aie.extras.context import mlir_mod_ctx # mlir ctx wrapper from aie.dialects.aiex import * # extended mlir-aie dialect definitions -from aie.dialects.scf import * # scf (strcutred control flow) dialect +from aie.extras.dialects.ext.scf import ( + _for as range_, +) # scf (structured control flow) dialect from aie.extras.dialects.ext import memref, arith # memref and arithmatic dialects buffer_depth = 2 @@ -54,17 +56,15 @@ def device_body(): @core(ComputeTile) def core_body(): # Effective while(1) - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1) - for i in for_(data_size): + for i in range_(data_size): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) of_out1.release(ObjectFifoPort.Produce, 1) - yield_([]) # Print the mlir conversion res = ctx.module.operation.verify() diff --git a/programming_guide/section-2/section-2d/aie2_multi.py b/programming_guide/section-2/section-2d/aie2_multi.py index 0a4f44b72b..4e8e282626 100644 --- a/programming_guide/section-2/section-2d/aie2_multi.py +++ b/programming_guide/section-2/section-2d/aie2_multi.py @@ -10,7 +10,9 @@ from aie.extras.context import mlir_mod_ctx # mlir ctx wrapper from aie.dialects.aiex import * # extended mlir-aie dialect definitions -from aie.dialects.scf import * # scf (strcutred control flow) dialect +from aie.extras.dialects.ext.scf import ( + _for as range_, +) # scf (structured control flow) dialect from aie.extras.dialects.ext import memref, arith # memref and arithmatic dialects n_cores = 3 @@ -82,23 +84,21 @@ def device_body(): # Compute tile i @core(ComputeTiles[i]) def core_body(): - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elem_in = inX_fifos[inX_fifo_names[i]].acquire( ObjectFifoPort.Consume, 1 ) elem_out = outX_fifos[outX_fifo_names[i]].acquire( ObjectFifoPort.Produce, 1 ) - for j in for_(tile_size): + for j in range_(tile_size): v0 = memref.load(elem_in, [j]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [j]) - yield_([]) inX_fifos[inX_fifo_names[i]].release(ObjectFifoPort.Consume, 1) outX_fifos[outX_fifo_names[i]].release( ObjectFifoPort.Produce, 1 ) - yield_([]) # Print the mlir conversion res = ctx.module.operation.verify() diff --git a/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py b/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py index a68b397862..17d6fe4d29 100644 --- a/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py +++ b/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py @@ -7,7 +7,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx @@ -34,23 +34,20 @@ def device_body(): @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(8): + for _ in range_(8): elem_out = of_in.acquire(ObjectFifoPort.Produce, 1) - for i in for_(16): + for i in range_(16): v1 = arith.constant(1, T.i32()) memref.store(v1, elem_out, [i]) - yield_([]) of_in.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 3 @core(ComputeTile3) def core_body(): # Effective while(1) - for _ in for_(8): + for _ in range_(8): elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) of_in.release(ObjectFifoPort.Consume, 1) - yield_([]) res = ctx.module.operation.verify() if res == True: diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py index 0a0c17f1eb..3c01c24f15 100644 --- a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py +++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py @@ -7,7 +7,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx @@ -36,17 +36,15 @@ def device_body(): @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(2): + for _ in range_(2): elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) - for i in for_(24): + for i in range_(24): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py index 4267abd903..81d98292a9 100644 --- a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py +++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py @@ -7,7 +7,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx @@ -41,17 +41,15 @@ def device_body(): @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(6): + for _ in range_(6): elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1) - for i in for_(8): + for i in range_(8): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) of_out1.release(ObjectFifoPort.Produce, 1) - yield_([]) memRef_48_ty = T.memref(48, T.i32()) diff --git a/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py b/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py index e2eb2ed0f7..b9d54e67bf 100644 --- a/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py +++ b/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py @@ -7,7 +7,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx @@ -40,43 +40,37 @@ def device_body(): @core(ComputeTile0) def core_body(): # Effective while(1) - for _ in for_(8): + for _ in range_(8): elem = of_in0.acquire(ObjectFifoPort.Consume, 1) - for i in for_(8): + for i in range_(8): v0 = memref.load(elem, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem, [i]) - yield_([]) of_in0.release(ObjectFifoPort.Consume, 1) - yield_([]) # Compute tile 3 @core(ComputeTile1) def core_body(): # Effective while(1) - for _ in for_(8): + for _ in range_(8): elem = of_in1.acquire(ObjectFifoPort.Consume, 1) - for i in for_(8): + for i in range_(8): v0 = memref.load(elem, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) - yield_([]) # Compute tile 4 @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(8): + for _ in range_(8): elem = of_in2.acquire(ObjectFifoPort.Consume, 1) - for i in for_(8): + for i in range_(8): v0 = memref.load(elem, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem, [i]) - yield_([]) of_in2.release(ObjectFifoPort.Consume, 1) - yield_([]) res = ctx.module.operation.verify() if res == True: diff --git a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py index b34cc32a42..cd55d8c07a 100644 --- a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py +++ b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py @@ -7,7 +7,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx @@ -47,49 +47,43 @@ def device_body(): @core(ComputeTile0) def core_body(): # Effective while(1) - for _ in for_(2): + for _ in range_(2): elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1) - for i in for_(8): + for i in range_(8): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in0.release(ObjectFifoPort.Consume, 1) of_out0.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 3 @core(ComputeTile1) def core_body(): # Effective while(1) - for _ in for_(2): + for _ in range_(2): elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1) - for i in for_(8): + for i in range_(8): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) of_out1.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 4 @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(2): + for _ in range_(2): elem_in = of_in2.acquire(ObjectFifoPort.Consume, 1) elem_out = of_out2.acquire(ObjectFifoPort.Produce, 1) - for i in for_(8): + for i in range_(8): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in2.release(ObjectFifoPort.Consume, 1) of_out2.release(ObjectFifoPort.Produce, 1) - yield_([]) memRef_48_ty = T.memref(48, T.i32()) diff --git a/programming_guide/section-2/section-2e/05_join_L2/join_L2.py b/programming_guide/section-2/section-2e/05_join_L2/join_L2.py index e91c4e6717..1cab577e57 100644 --- a/programming_guide/section-2/section-2e/05_join_L2/join_L2.py +++ b/programming_guide/section-2/section-2e/05_join_L2/join_L2.py @@ -7,7 +7,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import memref, arith from aie.extras.context import mlir_mod_ctx @@ -40,43 +40,37 @@ def device_body(): @core(ComputeTile0) def core_body(): # Effective while(1) - for _ in for_(6): + for _ in range_(6): elem = of_out0.acquire(ObjectFifoPort.Produce, 1) - for i in for_(8): + for i in range_(8): v0 = memref.load(elem, [i]) v1 = arith.constant(1, T.i32()) memref.store(v1, elem, [i]) - yield_([]) of_out0.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 3 @core(ComputeTile1) def core_body(): # Effective while(1) - for _ in for_(6): + for _ in range_(6): elem = of_out1.acquire(ObjectFifoPort.Produce, 1) - for i in for_(8): + for i in range_(8): v0 = memref.load(elem, [i]) v1 = arith.constant(1, T.i32()) memref.store(v1, elem, [i]) - yield_([]) of_out1.release(ObjectFifoPort.Produce, 1) - yield_([]) # Compute tile 4 @core(ComputeTile2) def core_body(): # Effective while(1) - for _ in for_(6): + for _ in range_(6): elem = of_out2.acquire(ObjectFifoPort.Produce, 1) - for i in for_(8): + for i in range_(8): v0 = memref.load(elem, [i]) v1 = arith.constant(1, T.i32()) memref.store(v1, elem, [i]) - yield_([]) of_out2.release(ObjectFifoPort.Produce, 1) - yield_([]) res = ctx.module.operation.verify() if res == True: diff --git a/programming_guide/section-3/README.md b/programming_guide/section-3/README.md index e46da9dc13..0035178837 100644 --- a/programming_guide/section-3/README.md +++ b/programming_guide/section-3/README.md @@ -90,18 +90,16 @@ This access and execute pattern runs on the AIE compute core `ComputeTile2` and @core(ComputeTile2, "scale.o") def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1) # Number of sub-vector "tile" iterations - for _ in for_(4): + for _ in range_(4): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) call(scale_scalar, [elem_in, elem_out, elem_factor, 1024]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) of_factor.release(ObjectFifoPort.Consume, 1) - yield_([]) ``` ## Kernel Code diff --git a/programming_guide/section-3/aie2.py b/programming_guide/section-3/aie2.py index 198eaba58e..44941872ca 100644 --- a/programming_guide/section-3/aie2.py +++ b/programming_guide/section-3/aie2.py @@ -10,7 +10,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.context import mlir_mod_ctx import aie.utils.trace as trace_utils @@ -43,18 +43,16 @@ def device_body(): @core(ComputeTile2, "scale.o") def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1) # Number of sub-vector "tile" iterations - for _ in for_(4): + for _ in range_(4): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) call(scale_scalar, [elem_in, elem_out, elem_factor, 1024]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) of_factor.release(ObjectFifoPort.Consume, 1) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(4096, T.i32()) diff --git a/programming_guide/section-3/test.py b/programming_guide/section-3/test.py index b422ccd242..37fda6938c 100644 --- a/programming_guide/section-3/test.py +++ b/programming_guide/section-3/test.py @@ -9,13 +9,6 @@ import numpy as np import pyxrt as xrt import sys -import time - -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.context import mlir_mod_ctx -from aie.extras.dialects.ext import memref, arith import aie.utils.test as test_utils diff --git a/programming_guide/section-4/section-4a/aie2.py b/programming_guide/section-4/section-4a/aie2.py index 3cde8754c2..b108beabea 100644 --- a/programming_guide/section-4/section-4a/aie2.py +++ b/programming_guide/section-4/section-4a/aie2.py @@ -10,7 +10,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.context import mlir_mod_ctx import aie.utils.trace as trace_utils @@ -43,18 +43,16 @@ def device_body(): @core(ComputeTile2, "scale.o") def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1) # Number of sub-vector "tile" iterations - for _ in for_(4): + for _ in range_(4): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) call(scale_scalar, [elem_in, elem_out, elem_factor, 1024]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) of_factor.release(ObjectFifoPort.Consume, 1) - yield_([]) # To/from AIE-array data movement tensor_ty = T.memref(4096, T.i32()) diff --git a/programming_guide/section-4/section-4a/test.py b/programming_guide/section-4/section-4a/test.py index d4d47cd918..9658675cdb 100644 --- a/programming_guide/section-4/section-4a/test.py +++ b/programming_guide/section-4/section-4a/test.py @@ -11,12 +11,6 @@ import sys import time -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.context import mlir_mod_ctx -from aie.extras.dialects.ext import memref, arith - import aie.utils.test as test_utils diff --git a/programming_guide/section-4/section-4b/aie2.py b/programming_guide/section-4/section-4b/aie2.py index ba3a505812..a04be30d74 100644 --- a/programming_guide/section-4/section-4b/aie2.py +++ b/programming_guide/section-4/section-4b/aie2.py @@ -11,7 +11,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.context import mlir_mod_ctx import aie.utils.trace as trace_utils @@ -48,18 +48,16 @@ def device_body(): @core(ComputeTile2, "scale.o") def core_body(): # Effective while(1) - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1) # Number of sub-vector "tile" iterations - for _ in for_(4): + for _ in range_(4): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) call(scale_scalar, [elem_in, elem_out, elem_factor, 1024]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) of_factor.release(ObjectFifoPort.Consume, 1) - yield_([]) # Set up a circuit-switched flow from core to shim for tracing information if enableTrace: diff --git a/programming_guide/section-4/section-4b/test.py b/programming_guide/section-4/section-4b/test.py index 01330cdc5e..5e2fe78184 100644 --- a/programming_guide/section-4/section-4b/test.py +++ b/programming_guide/section-4/section-4b/test.py @@ -11,12 +11,6 @@ import sys import time -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.extras.context import mlir_mod_ctx -from aie.extras.dialects.ext import memref, arith - import aie.utils.test as test_utils import aie.utils.trace as trace_utils diff --git a/python/extras/dialects/ext/scf.py b/python/extras/dialects/ext/scf.py new file mode 100644 index 0000000000..f1e155d942 --- /dev/null +++ b/python/extras/dialects/ext/scf.py @@ -0,0 +1,52 @@ +from typing import Optional, Sequence + +from ....ir import InsertionPoint, Value +from ....dialects.linalg.opdsl.lang.emitter import _is_index_type +from ....dialects.scf import ForOp, yield_ + +from .arith import constant, index_cast + + +def _for( + start, + stop=None, + step=None, + iter_args: Optional[Sequence[Value]] = None, + insert_yield: bool = True, + *, + loc=None, + ip=None, +): + """ + This is nearly identical to the convenience wrapper in scf, but with the added insert_yield parameter. + The insert_yield parameter defaults to True; if left as True, the user no longer needs to manually insert + yield operations (```yield_([])```). If the user wishes to specify yield directly (such as if there is + a return value from the loop body), insert_yield should be set to False. + """ + if step is None: + step = 1 + if stop is None: + stop = start + start = 0 + params = [start, stop, step] + for i, p in enumerate(params): + if isinstance(p, int): + p = constant(p, index=True) + if not _is_index_type(p.type): + p = index_cast(p) + params[i] = p + + start, stop, step = params + + for_op = ForOp(start, stop, step, iter_args, loc=loc, ip=ip) + iv = for_op.induction_variable + iter_args = tuple(for_op.inner_iter_args) + with InsertionPoint(for_op.body): + if len(iter_args) > 1: + yield iv, iter_args, for_op.results + elif len(iter_args) == 1: + yield iv, iter_args[0], for_op.results[0] + else: + yield iv + if insert_yield: + yield_([]) diff --git a/test/npu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py b/test/npu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py index 531c29f8a2..daf03580d4 100644 --- a/test/npu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py +++ b/test/npu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py @@ -20,7 +20,7 @@ WireBundle, npu_instgen, ) -from aie.dialects.scf import for_ as range_, yield_ +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import arith, func, memref from aie.extras.runtime.passes import run_pipeline @@ -90,13 +90,9 @@ def core(): v0 = memref.load(buffer_0_2, [arg1]) v1 = arith.addi(v0, random_number) memref.store(v1, buffer_0_2_1, [arg1]) - yield_([]) - aie.use_lock(lock_0_2_0, Release) aie.use_lock(lock_0_2_3, Release) - yield_([]) - # this is gibberish - everything from here to the end of "bobsyouruncle" this_is_meaningless_1 = memref.global_( sym_name="this_is_meaningless_1", diff --git a/test/npu-xrt/e2e/test_manual_dpu_args.py b/test/npu-xrt/e2e/test_manual_dpu_args.py index e7cbe8684c..9770a9cf5e 100644 --- a/test/npu-xrt/e2e/test_manual_dpu_args.py +++ b/test/npu-xrt/e2e/test_manual_dpu_args.py @@ -25,10 +25,11 @@ # this is to get the MemRefValue caster inside of aie-python-extras # noinspection PyUnresolvedReferences -from aie.extras.dialects.ext import arith, func, linalg, memref, scf +from aie.extras.dialects.ext import linalg, memref +from aie.extras.dialects.ext.scf import _for as range_ # noinspection PyUnresolvedReferences -from aie.extras.testing import MLIRContext, filecheck, mlir_ctx as ctx +from aie.extras.testing import MLIRContext import aie.extras.types as T from aie.xrt import XCLBin from filelock import FileLock @@ -46,9 +47,6 @@ AcquireGreaterEqual = LockAction.AcquireGreaterEqual Release = LockAction.Release -range_ = scf.range_ -yield_ = scf.yield_ - def test_manual_args(ctx: MLIRContext, workdir: Path): K = 32 @@ -378,7 +376,6 @@ def core(): with aiex.hold_lock(lock_read_weight, lock_send_weight): linalg.fill(i, y) linalg.copy(y, buffer_weight) - yield_() @aie.mem(tile_c_2) def mem_c_2(): diff --git a/test/npu-xrt/e2e/test_offsets_sizes_strides.py b/test/npu-xrt/e2e/test_offsets_sizes_strides.py index d668f81e09..7d00630592 100644 --- a/test/npu-xrt/e2e/test_offsets_sizes_strides.py +++ b/test/npu-xrt/e2e/test_offsets_sizes_strides.py @@ -22,7 +22,7 @@ WireBundle, ) from aie.dialects.linalg.opdsl.ops.core_named_ops import fill as linalg_fill -from aie.dialects.scf import for_ as range_, yield_ +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import arith, linalg # noinspection PyUnresolvedReferences @@ -254,8 +254,6 @@ def core(): aie.use_lock(lock_0_2_read_in_a, Release) aie.use_lock(lock_0_2_read_in_b, Release) aie.use_lock(lock_0_2_write_out_c, Release) - yield_([]) - yield_([]) compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) diff --git a/test/npu-xrt/e2e/test_repeat_count.py b/test/npu-xrt/e2e/test_repeat_count.py index 27bdd7cfee..c0b9db7ae4 100644 --- a/test/npu-xrt/e2e/test_repeat_count.py +++ b/test/npu-xrt/e2e/test_repeat_count.py @@ -8,20 +8,18 @@ import random import sys -from aie.dialects import aie, aiex, scf +from aie.dialects import aie, aiex from aie.dialects.aie import ( AIEDevice, DMAChannelDir, LockAction, WireBundle, ) - -range_ = scf.for_ -yield_ = scf.yield_ +from aie.extras.dialects.ext.scf import _for as range_ # this is to get the MemRefValue caster inside of aie-python-extras # noinspection PyUnresolvedReferences -from aie.extras.dialects.ext import arith, func, linalg, memref +from aie.extras.dialects.ext import linalg, memref import aie.extras.types as T from aie.xrt import XCLBin from filelock import FileLock @@ -32,7 +30,7 @@ import pytest # noinspection PyUnresolvedReferences -from aie.extras.testing import mlir_ctx as ctx, filecheck, MLIRContext +from aie.extras.testing import MLIRContext # needed since the fix isn't defined here nor conftest.py pytest.mark.usefixtures("ctx") @@ -191,7 +189,6 @@ def core(): with aiex.hold_lock(lock_read_weight, lock_send_weight): linalg.fill(col, y) linalg.add(y, buffer_weight, buffer_weight) - yield_([]) @aie.mem(tile_c_2) def mem_c_2(): diff --git a/test/npu-xrt/e2e/test_tiled_matrix_add.py b/test/npu-xrt/e2e/test_tiled_matrix_add.py index d248c769be..4c17eb153b 100644 --- a/test/npu-xrt/e2e/test_tiled_matrix_add.py +++ b/test/npu-xrt/e2e/test_tiled_matrix_add.py @@ -17,7 +17,7 @@ from aie.dialects import aie, aiex from aie.dialects.aie import AIEDevice, DMAChannelDir, LockAction, WireBundle from aie.dialects.linalg.opdsl.ops.core_named_ops import fill as linalg_fill -from aie.dialects.scf import for_ as range_, yield_ +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import arith, linalg # noinspection PyUnresolvedReferences @@ -255,8 +255,6 @@ def core(): aie.use_lock(lock_0_2_read_in_a, Release) aie.use_lock(lock_0_2_read_in_b, Release) aie.use_lock(lock_0_2_write_out_c, Release) - yield_([]) - yield_([]) compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) @@ -493,9 +491,6 @@ def core(): linalg.add(buffer_0_2_a, buffer_0_2_c, buffer_0_2_c) linalg.add(buffer_0_2_b, buffer_0_2_c, buffer_0_2_c) - yield_([]) - yield_([]) - compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) with FileLock("/tmp/npu.lock"): diff --git a/test/npu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py b/test/npu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py index bd417b9426..bb23d31fb2 100644 --- a/test/npu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py +++ b/test/npu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py @@ -44,10 +44,6 @@ # needed since the fix isn't defined here nor conftest.py pytest.mark.usefixtures("ctx") - -range_ = scf.range_ -yield_ = scf.yield_ - DMA = WireBundle.DMA S2MM = DMAChannelDir.S2MM MM2S = DMAChannelDir.MM2S @@ -415,8 +411,6 @@ def matmul_i32_i32_already_vectorized( [j, c0], in_bounds=[True], ) - yield_([]) - yield_([]) def test_tiled_nonsquare_tile_spatial_2x2_vectorized(ctx: MLIRContext, workdir: Path): diff --git a/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py index 1fb2513612..6542c241cb 100644 --- a/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py +++ b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py @@ -21,7 +21,7 @@ LockAction, WireBundle, ) -from aie.dialects.scf import for_ as range_, yield_ +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import linalg # noinspection PyUnresolvedReferences @@ -309,8 +309,6 @@ def core(): aie.use_lock(lock_0_2_read_in_a, Release) aie.use_lock(lock_0_2_read_in_b, Release) aie.use_lock(lock_0_2_write_out_c, Release) - yield_([]) - yield_([]) compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) @@ -567,8 +565,6 @@ def core(): ): linalg.fill(0, buffer_0_2_c) linalg.matmul(buffer_0_2_a, buffer_0_2_b, buffer_0_2_c) - yield_([]) - yield_([]) compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) diff --git a/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py index 4a7a8bc0f3..51a609354b 100644 --- a/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py +++ b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py @@ -24,7 +24,7 @@ WireBundle, ) from aie.dialects.linalg.opdsl.ops.core_named_ops import fill as linalg_fill -from aie.dialects.scf import for_ as range_, yield_ +from aie.extras.dialects.ext.scf import _for as range_ from aie.dialects.transform import any_op_t, apply_registered_pass, get_parent_op from aie.dialects.transform.extras import named_sequence from aie.dialects.transform.loop import loop_unroll @@ -339,8 +339,6 @@ def core(): aie.use_lock(lock_0_2_read_in_a, Release) aie.use_lock(lock_0_2_read_in_b, Release) aie.use_lock(lock_0_2_write_out_c, Release) - yield_([]) - yield_([]) mod_aie.finish() mod_aievec = ExplicitlyManagedModule() @@ -666,9 +664,6 @@ def core(): linalg_fill(arith.constant(0), outs=[buffer_0_2_c]) matmul_i32_i32(buffer_0_2_a, buffer_0_2_b, buffer_0_2_c) - yield_([]) - yield_([]) - mod_aie.finish() mod_aievec = ExplicitlyManagedModule() @@ -798,8 +793,6 @@ def matmul_i32_i32_already_vectorized( permutation_map=perm_map, in_bounds=[True], ) - yield_([]) - yield_([]) def test_tiled_nonsquare_tile_matrix_mult_vectorized_sugar_already_vectorized( @@ -1022,9 +1015,6 @@ def core(): buffer_0_2_a, buffer_0_2_b, buffer_0_2_c ) - yield_([]) - yield_([]) - mod_aie.finish() mod_aievec = ExplicitlyManagedModule() matmul_i32_i32_already_vectorized.emit(force=True) diff --git a/test/npu-xrt/e2e/test_tiled_vec_add.py b/test/npu-xrt/e2e/test_tiled_vec_add.py index c5b66355c6..5bea658521 100644 --- a/test/npu-xrt/e2e/test_tiled_vec_add.py +++ b/test/npu-xrt/e2e/test_tiled_vec_add.py @@ -22,7 +22,7 @@ WireBundle, ) from aie.dialects.linalg.opdsl.ops.core_named_ops import fill as linalg_fill -from aie.dialects.scf import for_ as range_, yield_ +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.dialects.ext import arith, linalg # noinspection PyUnresolvedReferences @@ -239,7 +239,6 @@ def core(): aie.use_lock(lock_0_2_read_in_a, Release) aie.use_lock(lock_0_2_read_in_b, Release) aie.use_lock(lock_0_2_write_out_c, Release) - yield_([]) compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) @@ -420,8 +419,6 @@ def core(): linalg_fill(arith.constant(0), outs=[buffer_0_2_c]) linalg.add(buffer_0_2_a, buffer_0_2_b, buffer_0_2_c) - yield_([]) - compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) with FileLock("/tmp/npu.lock"): diff --git a/test/npu-xrt/e2e/test_tiled_vec_add_vectorized.py b/test/npu-xrt/e2e/test_tiled_vec_add_vectorized.py index cd7212a575..3ccc669497 100644 --- a/test/npu-xrt/e2e/test_tiled_vec_add_vectorized.py +++ b/test/npu-xrt/e2e/test_tiled_vec_add_vectorized.py @@ -24,7 +24,7 @@ WireBundle, ) from aie.dialects.linalg.opdsl.ops.core_named_ops import fill as linalg_fill -from aie.dialects.scf import for_ as range_, yield_ +from aie.extras.dialects.ext.scf import _for as range_ from aie.dialects.transform import any_op_t, apply_registered_pass, get_parent_op from aie.dialects.transform.extras import named_sequence from aie.dialects.transform.structured import structured_match @@ -261,7 +261,6 @@ def core(): aie.use_lock(lock_0_2_read_in_a, Release) aie.use_lock(lock_0_2_read_in_b, Release) aie.use_lock(lock_0_2_write_out_c, Release) - yield_([]) mod_aie.finish() mod_aievec = ExplicitlyManagedModule() @@ -506,7 +505,6 @@ def core(): ): linalg_fill(arith.constant(0), outs=[buffer_0_2_c]) vec_add_i32_i32(buffer_0_2_a, buffer_0_2_b, buffer_0_2_c) - yield_([]) mod_aie.finish() mod_aievec = ExplicitlyManagedModule() diff --git a/test/npu-xrt/e2e/test_vec_dot.py b/test/npu-xrt/e2e/test_vec_dot.py index a34da60a4c..85c9c68ba9 100644 --- a/test/npu-xrt/e2e/test_vec_dot.py +++ b/test/npu-xrt/e2e/test_vec_dot.py @@ -23,14 +23,14 @@ WireBundle, ) from aie.dialects.linalg.opdsl.ops.core_named_ops import fill as linalg_fill -from aie.dialects.scf import for_ as range_, yield_ +from aie.extras.dialects.ext.scf import _for as range_ # this is to get the MemRefValue caster inside of aie-python-extras # noinspection PyUnresolvedReferences -from aie.extras.dialects.ext import arith, func, linalg, memref +from aie.extras.dialects.ext import arith, linalg # noinspection PyUnresolvedReferences -from aie.extras.testing import MLIRContext, filecheck, mlir_ctx as ctx +from aie.extras.testing import MLIRContext import aie.extras.types as T from aie.xrt import XCLBin from filelock import FileLock @@ -250,7 +250,6 @@ def core(): aie.use_lock(lock_0_2_read_in_a, Release) aie.use_lock(lock_0_2_read_in_b, Release) aie.use_lock(lock_0_2_write_out_c, Release) - yield_([]) compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) @@ -438,8 +437,6 @@ def core(): linalg_fill(arith.constant(0), outs=[output]) buffer_0_2_c[0] = v - yield_([]) - compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) with FileLock("/tmp/npu.lock"): diff --git a/test/npu-xrt/matrix_transpose/aie2.py b/test/npu-xrt/matrix_transpose/aie2.py index 112762f3d0..9476ab3544 100644 --- a/test/npu-xrt/matrix_transpose/aie2.py +++ b/test/npu-xrt/matrix_transpose/aie2.py @@ -18,8 +18,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * - +from aie.extras.dialects.ext.scf import _for as range_ matrix_rows = 7 matrix_cols = 19 @@ -59,13 +58,12 @@ def device_body(): # Core @core(tiles[2][0], "kernel.o") def core_body(): - for _ in for_(0, 0xFFFFFFFF): + for _ in range_(0, 0xFFFFFFFF): elem_in = fifo_in.acquire(ObjectFifoPort.Consume, 1) elem_out = fifo_out.acquire(ObjectFifoPort.Produce, 1) call(passthrough_func, [elem_in, elem_out, matrix_size]) fifo_in.release(ObjectFifoPort.Consume, 1) fifo_out.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement @runtime_sequence(matrix_memref, matrix_memref) diff --git a/test/npu-xrt/nd_memcpy_transforms/aie2.py b/test/npu-xrt/nd_memcpy_transforms/aie2.py index 3ae680f7d0..afdca71eaa 100644 --- a/test/npu-xrt/nd_memcpy_transforms/aie2.py +++ b/test/npu-xrt/nd_memcpy_transforms/aie2.py @@ -18,7 +18,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ dtype = T.i16 @@ -63,7 +63,7 @@ def device_body(): # Core @core(tiles[2][0], "kernel.o") def core_body(): - for _ in for_(0, 0xFFFFFFFF): + for _ in range_(0, 0xFFFFFFFF): elem_c = fifo_c.acquire(ObjectFifoPort.Produce, 1) elem_a = fifo_a.acquire(ObjectFifoPort.Consume, 1) elem_b = fifo_b.acquire(ObjectFifoPort.Consume, 1) @@ -81,7 +81,6 @@ def core_body(): fifo_a.release(ObjectFifoPort.Consume, 1) fifo_b.release(ObjectFifoPort.Consume, 1) fifo_c.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement @runtime_sequence(memref_a, memref_b, memref_c) diff --git a/test/npu-xrt/sync_task_complete_token/aie2.py b/test/npu-xrt/sync_task_complete_token/aie2.py index 2acdacf421..29cb6ebb80 100644 --- a/test/npu-xrt/sync_task_complete_token/aie2.py +++ b/test/npu-xrt/sync_task_complete_token/aie2.py @@ -17,7 +17,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ dtype = T.i32 @@ -54,20 +54,18 @@ def device_body(): # Core @core(tiles[2][0]) def core_body(): - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elem_output = fifo_output.acquire(ObjectFifoPort.Produce, 1) zero = constant(T.i32(), 0) memref.store(zero, elem_output, [0]) - for _ in for_(16): + for _ in range_(16): elem_input = fifo_input.acquire(ObjectFifoPort.Consume, 1) a = memref.load(elem_output, [0]) b = memref.load(elem_input, [0]) c = a + b memref.store(c, elem_output, [0]) fifo_input.release(ObjectFifoPort.Consume, 1) - yield_([]) fifo_output.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement @runtime_sequence(memref_t, memref_t) diff --git a/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py b/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py index 5a19f040d6..b8ee75a67f 100644 --- a/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py +++ b/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py @@ -17,7 +17,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ dtype = T.i32 @@ -54,20 +54,18 @@ def device_body(): # Core @core(tiles[2][0]) def core_body(): - for _ in for_(0xFFFFFFFF): + for _ in range_(0xFFFFFFFF): elem_output = fifo_output.acquire(ObjectFifoPort.Produce, 1) zero = constant(T.i32(), 0) memref.store(zero, elem_output, [0]) - for _ in for_(16): + for _ in range_(16): elem_input = fifo_input.acquire(ObjectFifoPort.Consume, 1) a = memref.load(elem_output, [0]) b = memref.load(elem_input, [0]) c = a + b memref.store(c, elem_output, [0]) fifo_input.release(ObjectFifoPort.Consume, 1) - yield_([]) fifo_output.release(ObjectFifoPort.Produce, 1) - yield_([]) # To/from AIE-array data movement @runtime_sequence(memref_t, memref_t) diff --git a/test/python/aievec.py b/test/python/aievec.py index 39a9663fad..14515b28c9 100644 --- a/test/python/aievec.py +++ b/test/python/aievec.py @@ -9,7 +9,7 @@ from aie.extras.dialects.ext.func import func from aie.extras.runtime.passes import Pipeline as p, run_pipeline -from aie.dialects import affine, aievec, scf, tosa, vector +from aie.dialects import affine, aievec, tosa, vector # noinspection PyUnresolvedReferences import aie.dialects.aie @@ -19,9 +19,7 @@ from aie.extras import types as T from aie.ir import AffineMap, AffineDimExpr from util import construct_and_print_module - -range_ = scf.for_ -yield_ = lambda: scf.yield_([]) +from aie.extras.dialects.ext.scf import _for as range_ # CHECK-LABEL: TEST: test_emit @@ -51,7 +49,7 @@ def mul_elem( B: T.memref(2048, T.i16()), C: T.memref(2048, T.i16()), ): - for i in scf.for_(0, 2048, 32): + for i in range_(0, 2048, 32): v0 = aievec.upd(T.vector(32, T.i16()), A, [i]) v1 = aievec.upd(T.vector(32, T.i16()), B, [i]) v2 = aievec.mul_elem( @@ -69,8 +67,6 @@ def mul_elem( in_bounds=[True], ) - scf.yield_([]) - # CHECK-LABEL: func.func @mul_elem( # CHECK-SAME: %[[VAL_0:.*]]: memref<2048xi16>, %[[VAL_1:.*]]: memref<2048xi16>, %[[VAL_2:.*]]: memref<2048xi16>) { # CHECK: %[[VAL_3:.*]] = arith.constant 0 : index @@ -350,8 +346,6 @@ def matmul_i32_i32( permutation_map=perm_map, in_bounds=[True], ) - yield_() - yield_() # CHECK-LABEL: func.func @matmul_i32_i32( # CHECK-SAME: %[[VAL_0:.*]]: memref<16x32xi32>, %[[VAL_1:.*]]: memref<32x16xi32>, %[[VAL_2:.*]]: memref<16x16xi32>) { diff --git a/test/python/code_region.py b/test/python/code_region.py index 4e9f529b8d..3516ebdcbb 100644 --- a/test/python/code_region.py +++ b/test/python/code_region.py @@ -15,12 +15,9 @@ object_fifo_link, tile, ) -from aie.dialects.scf import for_, yield_ -from aie.ir import TypeAttr +from aie.extras.dialects.ext.scf import _for as range_ from util import construct_and_print_module -range_ = for_ - # CHECK: module { # CHECK: aie.device(xcve2802) { @@ -65,4 +62,3 @@ def core_body(): elem0 = of1.acquire(ObjectFifoPort.Consume, 1) res = call("test_func", [elem0], [T.i32()]) of1.release(ObjectFifoPort.Consume, 1) - yield_([]) diff --git a/test/python/core_ext_kernel.py b/test/python/core_ext_kernel.py index 6d58ee168b..4ca923080d 100644 --- a/test/python/core_ext_kernel.py +++ b/test/python/core_ext_kernel.py @@ -17,13 +17,11 @@ end, ) from aie.extras.dialects.ext import arith -from aie.dialects.scf import for_, yield_ -from aie.ir import TypeAttr, Block, InsertionPoint +from aie.extras.dialects.ext.scf import _for as range_ +from aie.ir import Block, InsertionPoint from util import construct_and_print_module -range_ = for_ - # CHECK: module { # CHECK: aie.device(xcve2802) { @@ -73,5 +71,4 @@ def core_ext_kernel(): elem0 = of1.acquire(ObjectFifoPort.Consume, 1) res = call("test_func", [elem0, arith.constant(4)], [T.i32()]) of1.release(ObjectFifoPort.Consume, 1) - yield_([]) end() diff --git a/test/python/npu.py b/test/python/npu.py index 7ea96af716..2f8beccf19 100644 --- a/test/python/npu.py +++ b/test/python/npu.py @@ -24,14 +24,9 @@ tile, ) from aie.dialects.aiex import npu_sync, npu_dma_memcpy_nd, runtime_sequence -from aie.dialects.func import FuncOp -from aie.dialects.scf import for_ -from aie.dialects.scf import yield_ -from aie.ir import TypeAttr +from aie.extras.dialects.ext.scf import _for as range_ from util import construct_and_print_module -range_ = for_ - DMA = WireBundle.DMA S2MM = DMAChannelDir.S2MM MM2S = DMAChannelDir.MM2S @@ -72,8 +67,6 @@ def core_body(): call(scale_int32, [elem_in, elem_out]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) - yield_([]) @runtime_sequence( T.memref(N, T.i32()), T.memref(N, T.i32()), T.memref(N, T.i32()) @@ -171,11 +164,7 @@ def core_body(): call(matmul_scalar, [elem_in_a, elem_in_b, elem_out]) of_inA.release(ObjectFifoPort.Consume, 1) of_inB.release(ObjectFifoPort.Consume, 1) - yield_([]) - of_outC.release(ObjectFifoPort.Produce, 1) - yield_([]) - yield_([]) @runtime_sequence( T.memref(A_sz_in_i32s, T.i32()), @@ -311,7 +300,6 @@ def core_body(): inOF_L2L1.release(ObjectFifoPort.Consume, 1) OF_2to3.release(ObjectFifoPort.Produce, 1) - yield_([]) @core(T3, "filter2d.cc.o") def core_body(): @@ -362,7 +350,6 @@ def core_body(): ) OF_2to3.release(ObjectFifoPort.Consume, 1) OF_3to4.release(ObjectFifoPort.Produce, 1) - yield_([]) # Postamble : Bottom Border elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2) @@ -398,7 +385,6 @@ def core_body(): OF_3to4.release(ObjectFifoPort.Consume, 1) OF_4to5.release(ObjectFifoPort.Produce, 1) - yield_([]) @core(T5, "combined_gray2rgba_addWeighted.a") def core_body(): @@ -435,7 +421,6 @@ def core_body(): OF_5to5.release(ObjectFifoPort.Consume, 1) inOF_L2L1.release(ObjectFifoPort.Consume, 1) outOF_L1L2.release(ObjectFifoPort.Produce, 1) - yield_([]) @runtime_sequence( T.memref(2304, T.i32()), T.memref(2304, T.i32()), T.memref(2304, T.i32()) @@ -487,10 +472,8 @@ def core_body(): v0 = memref.load(elem_in, [i]) v1 = arith.addi(v0, arith.constant(1, T.i32())) memref.store(v1, elem_out, [i]) - yield_([]) of_in1.release(ObjectFifoPort.Consume, 1) of_out1.release(ObjectFifoPort.Produce, 1) - yield_([]) @runtime_sequence( T.memref(64, T.i32()), T.memref(32, T.i32()), T.memref(64, T.i32()) diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py index eb6a16e324..0c8d8159db 100644 --- a/test/python/trace_utils.py +++ b/test/python/trace_utils.py @@ -21,7 +21,7 @@ from aie.dialects.aie import * from aie.dialects.aiex import * -from aie.dialects.scf import * +from aie.extras.dialects.ext.scf import _for as range_ from aie.extras.context import mlir_mod_ctx from aie.utils.trace import * @@ -67,13 +67,12 @@ def device_body(): # Compute tile 2 @core(ComputeTile2, "passThrough.cc.o") def core_body(): - for _ in for_(sys.maxsize): + for _ in range_(sys.maxsize): elemOut = of_out.acquire(ObjectFifoPort.Produce, 1) elemIn = of_in.acquire(ObjectFifoPort.Consume, 1) call(passThroughLine, [elemIn, elemOut, lineWidthInBytes]) of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - yield_([]) # print(ctx.module.operation.verify())