Python buffer API refactoring, RTP Op Wrapper (Xilinx#1818)

fifield · Oct 9, 2024 · de904d3
1 parent 6e47acf
commit de904d3
Show file tree

Hide file tree

Showing 15 changed files with 542 additions and 390 deletions.
diff --git a/programming_examples/basic/matrix_multiplication/cascade/aie2.py b/programming_examples/basic/matrix_multiplication/cascade/aie2.py
@@ -246,7 +246,9 @@ def device_body():
                     )
                 else:
                     C_l1l2_buffers[row][col] = buffer(
-                        core_tiles[row][col], [m, n], dtype_out, f"C_L1L2_{col}_{row}"
+                        core_tiles[row][col],
+                        np.ndarray[(m, n), np.dtype[dtype_out]],
+                        f"C_L1L2_{col}_{row}",
                     )
 
             C_l2l3_fifos[col] = object_fifo(

diff --git a/programming_examples/basic/passthrough_pykernel/aie2.py b/programming_examples/basic/passthrough_pykernel/aie2.py
@@ -21,7 +21,7 @@ def passthroughKernel(vector_size):
 
     @device(AIEDevice.npu1_1col)
     def device_body():
-        # define types - for illustrative purposes, we use equivalent types of both MLIR MemRefType and np.ndarray type in this design
+        # define types
         line_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]]
 
         # AIE Core Python Function declarations

diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py
@@ -142,10 +142,30 @@ def deviceBody():
 
             # runtime parameters
 
-            rtpComputeTile2 = Buffer(ComputeTile2, [16], np.int32, "rtpComputeTile2")
-            rtpComputeTile3 = Buffer(ComputeTile3, [16], np.int32, "rtpComputeTile3")
-            rtpComputeTile4 = Buffer(ComputeTile4, [16], np.int32, "rtpComputeTile4")
-            rtpComputeTile5 = Buffer(ComputeTile5, [16], np.int32, "rtpComputeTile5")
+            rtpComputeTile2 = buffer(
+                ComputeTile2,
+                np.ndarray[(16,), np.dtype[np.int32]],
+                "rtpComputeTile2",
+                use_write_rtp=True,
+            )
+            rtpComputeTile3 = buffer(
+                ComputeTile3,
+                np.ndarray[(16,), np.dtype[np.int32]],
+                "rtpComputeTile3",
+                use_write_rtp=True,
+            )
+            rtpComputeTile4 = buffer(
+                ComputeTile4,
+                np.ndarray[(16,), np.dtype[np.int32]],
+                "rtpComputeTile4",
+                use_write_rtp=True,
+            )
+            rtpComputeTile5 = buffer(
+                ComputeTile5,
+                np.ndarray[(16,), np.dtype[np.int32]],
+                "rtpComputeTile5",
+                use_write_rtp=True,
+            )
 
             # set up data movement with OFs
             # input tensor (with broadcast for skip connection)
@@ -524,13 +544,12 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     npu_write32(0, 2, 0x1D20C, 0x3)
 
                 # write RTP parameters
-                NpuWriteRTPOp("rtpComputeTile2", index=0, value=1)  # scale
-                NpuWriteRTPOp("rtpComputeTile3", index=0, value=1)  # scale
-                NpuWriteRTPOp("rtpComputeTile5", index=0, value=1)  # scale
-                NpuWriteRTPOp(
-                    "rtpComputeTile4", index=0, value=1
-                )  # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input
-                NpuWriteRTPOp("rtpComputeTile4", index=1, value=0)  # skip_scale
+                rtpComputeTile2[0] = 1  # scale
+                rtpComputeTile3[0] = 1  # scale
+                rtpComputeTile5[0] = 1  # scale
+                # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input
+                rtpComputeTile4[0] = 1
+                rtpComputeTile4[1] = 0  # skip_scale
 
                 npu_dma_memcpy_nd(
                     metadata=of_inOF_act_L3L2,

diff --git a/programming_examples/ml/conv2d/aie2.py b/programming_examples/ml/conv2d/aie2.py
@@ -87,7 +87,12 @@ def device_body():
 
             # Set up compute tiles
 
-            rtp2 = Buffer(ComputeTile2, [16], np.int32, "rtp2")
+            rtp2 = buffer(
+                ComputeTile2,
+                np.ndarray[(16,), np.dtype[np.int32]],
+                "rtp2",
+                use_write_rtp=True,
+            )
 
             # Compute tile 2
             @core(ComputeTile2, "conv2dk1_i8.o")
@@ -115,7 +120,7 @@ def core_body():
             # To/from AIE-array data movement
             @runtime_sequence(tensor_ty, weights_ty, tensor_ty)
             def sequence(I, W, O):
-                NpuWriteRTPOp("rtp2", index=0, value=10)
+                rtp2[0] = 10
 
                 npu_dma_memcpy_nd(
                     metadata=of_inOF_act_L3L2,

diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py
@@ -96,7 +96,9 @@ def device_body():
 
             # Set up compute tiles
 
-            rtp2 = Buffer(ComputeTile2, [16], T.i32(), "rtp2")
+            rtp2 = buffer(
+                ComputeTile2, T.memref(16, T.i32()), "rtp2", use_write_rtp=True
+            )
 
             # Compute tile 2
             @core(ComputeTile2, "conv2dk1.o")
@@ -204,7 +206,7 @@ def sequence(I, W, O):
                     # Set start BD to our shim bd_Id (3)
                     npu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)
 
-                NpuWriteRTPOp("rtp2", index=0, value=1)
+                rtp2[0] = 1
 
                 npu_dma_memcpy_nd(
                     metadata=of_inOF_act_L3L2,