Skip to content

Commit

Permalink
Python buffer API refactoring, RTP Op Wrapper (Xilinx#1818)
Browse files: browse the repository at this point in the history
  • Loading branch information
hunhoffe authored Oct 9, 2024
1 parent 6e47acf commit de904d3
Show file tree
Hide file tree
Showing 15 changed files with 542 additions and 390 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,9 @@ def device_body():
)
else:
C_l1l2_buffers[row][col] = buffer(
core_tiles[row][col], [m, n], dtype_out, f"C_L1L2_{col}_{row}"
core_tiles[row][col],
np.ndarray[(m, n), np.dtype[dtype_out]],
f"C_L1L2_{col}_{row}",
)

C_l2l3_fifos[col] = object_fifo(
Expand Down
2 changes: 1 addition & 1 deletion programming_examples/basic/passthrough_pykernel/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def passthroughKernel(vector_size):

@device(AIEDevice.npu1_1col)
def device_body():
# define types - for illustrative purposes, we use equivalent types of both MLIR MemRefType and np.ndarray type in this design
# define types
line_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]]

# AIE Core Python Function declarations
Expand Down
41 changes: 30 additions & 11 deletions programming_examples/ml/bottleneck/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,30 @@ def deviceBody():

# runtime parameters

rtpComputeTile2 = Buffer(ComputeTile2, [16], np.int32, "rtpComputeTile2")
rtpComputeTile3 = Buffer(ComputeTile3, [16], np.int32, "rtpComputeTile3")
rtpComputeTile4 = Buffer(ComputeTile4, [16], np.int32, "rtpComputeTile4")
rtpComputeTile5 = Buffer(ComputeTile5, [16], np.int32, "rtpComputeTile5")
rtpComputeTile2 = buffer(
ComputeTile2,
np.ndarray[(16,), np.dtype[np.int32]],
"rtpComputeTile2",
use_write_rtp=True,
)
rtpComputeTile3 = buffer(
ComputeTile3,
np.ndarray[(16,), np.dtype[np.int32]],
"rtpComputeTile3",
use_write_rtp=True,
)
rtpComputeTile4 = buffer(
ComputeTile4,
np.ndarray[(16,), np.dtype[np.int32]],
"rtpComputeTile4",
use_write_rtp=True,
)
rtpComputeTile5 = buffer(
ComputeTile5,
np.ndarray[(16,), np.dtype[np.int32]],
"rtpComputeTile5",
use_write_rtp=True,
)

# set up data movement with OFs
# input tensor (with broadcast for skip connection)
Expand Down Expand Up @@ -524,13 +544,12 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
npu_write32(0, 2, 0x1D20C, 0x3)

# write RTP parameters
NpuWriteRTPOp("rtpComputeTile2", index=0, value=1) # scale
NpuWriteRTPOp("rtpComputeTile3", index=0, value=1) # scale
NpuWriteRTPOp("rtpComputeTile5", index=0, value=1) # scale
NpuWriteRTPOp(
"rtpComputeTile4", index=0, value=1
) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input
NpuWriteRTPOp("rtpComputeTile4", index=1, value=0) # skip_scale
rtpComputeTile2[0] = 1 # scale
rtpComputeTile3[0] = 1 # scale
rtpComputeTile5[0] = 1 # scale
# scale: conv1x1 uses the same scale as the input, so the scaling factor of the conv1x1 output matches that of the initial input
rtpComputeTile4[0] = 1
rtpComputeTile4[1] = 0 # skip_scale

npu_dma_memcpy_nd(
metadata=of_inOF_act_L3L2,
Expand Down
9 changes: 7 additions & 2 deletions programming_examples/ml/conv2d/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,12 @@ def device_body():

# Set up compute tiles

rtp2 = Buffer(ComputeTile2, [16], np.int32, "rtp2")
rtp2 = buffer(
ComputeTile2,
np.ndarray[(16,), np.dtype[np.int32]],
"rtp2",
use_write_rtp=True,
)

# Compute tile 2
@core(ComputeTile2, "conv2dk1_i8.o")
Expand Down Expand Up @@ -115,7 +120,7 @@ def core_body():
# To/from AIE-array data movement
@runtime_sequence(tensor_ty, weights_ty, tensor_ty)
def sequence(I, W, O):
NpuWriteRTPOp("rtp2", index=0, value=10)
rtp2[0] = 10

npu_dma_memcpy_nd(
metadata=of_inOF_act_L3L2,
Expand Down
6 changes: 4 additions & 2 deletions programming_examples/ml/conv2d_fused_relu/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ def device_body():

# Set up compute tiles

rtp2 = Buffer(ComputeTile2, [16], T.i32(), "rtp2")
rtp2 = buffer(
ComputeTile2, T.memref(16, T.i32()), "rtp2", use_write_rtp=True
)

# Compute tile 2
@core(ComputeTile2, "conv2dk1.o")
Expand Down Expand Up @@ -204,7 +206,7 @@ def sequence(I, W, O):
# Set start BD to our shim bd_Id (3)
npu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)

NpuWriteRTPOp("rtp2", index=0, value=1)
rtp2[0] = 1

npu_dma_memcpy_nd(
metadata=of_inOF_act_L3L2,
Expand Down
Loading

0 comments on commit de904d3

Please sign in to comment.