diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
index 82691ccf29..393365fb9e 100644
--- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -69,47 +69,65 @@ class LockAnalysis {
 };
 
 //===----------------------------------------------------------------------===//
-// TileDMA Channel Analysis
+// DMA Channel Analysis
 //===----------------------------------------------------------------------===//
 class DMAChannelAnalysis {
-  DenseMap<Value, int> masterChannelsPerTile;
-  DenseMap<Value, int> slaveChannelsPerTile;
+  DenseMap<std::tuple<Value, DMAChannelDir, int>, int> channelsPerTile;
 
 public:
   DMAChannelAnalysis(DeviceOp &device) {
-    // go over the channels used for each tile and update the master/slave
-    // channel maps
+    // go over the channels used for each tile and update the channel map
     for (auto memOp : device.getOps<MemOp>()) {
       Region &r = memOp.getBody();
       for (auto &bl : r.getBlocks()) {
         for (auto op : bl.getOps<DMAStartOp>()) {
-          if (op.isSend())
-            getMasterDMAChannel(memOp.getTile());
-          else
-            getSlaveDMAChannel(memOp.getTile());
+          channelsPerTile[{memOp.getTile(), op.getChannelDir(),
+                           op.getChannelIndex()}] = 1;
+        }
+      }
+    }
+    for (auto memOp : device.getOps<MemTileDMAOp>()) {
+      Region &r = memOp.getBody();
+      for (auto &bl : r.getBlocks()) {
+        for (auto op : bl.getOps<DMAStartOp>()) {
+          channelsPerTile[{memOp.getTile(), op.getChannelDir(),
+                           op.getChannelIndex()}] = 1;
+        }
+      }
+    }
+    for (auto memOp : device.getOps<ShimDMAOp>()) {
+      Region &r = memOp.getBody();
+      for (auto &bl : r.getBlocks()) {
+        for (auto op : bl.getOps<DMAStartOp>()) {
+          channelsPerTile[{memOp.getTile(), op.getChannelDir(),
+                           op.getChannelIndex()}] = 1;
         }
       }
     }
   }
 
-  /// Given an AIE tile, returns its next usable master channel.
-  DMAChannel getMasterDMAChannel(Value tile) {
-    if (masterChannelsPerTile.find(tile) == masterChannelsPerTile.end())
-      masterChannelsPerTile[tile] = 0;
-    else
-      masterChannelsPerTile[tile]++;
-    DMAChannel dmaChan = {DMAChannelDir::MM2S, masterChannelsPerTile[tile]};
-    return dmaChan;
-  }
-
-  /// Given an AIE tile, returns its next usable slave channel.
-  DMAChannel getSlaveDMAChannel(Value tile) {
-    if (slaveChannelsPerTile.find(tile) == slaveChannelsPerTile.end())
-      slaveChannelsPerTile[tile] = 0;
-    else
-      slaveChannelsPerTile[tile]++;
-    DMAChannel dmaChan = {DMAChannelDir::S2MM, slaveChannelsPerTile[tile]};
-    return dmaChan;
+  /// Given a tile and DMAChannelDir, returns the next usable channel index
+  /// for that tile.
+  int getDMAChannelIndex(TileOp tileOp, DMAChannelDir dir) {
+    const auto &targetModel = getTargetModel(tileOp);
+    int maxChannelNum = 0;
+    if (tileOp.isShimTile())
+      maxChannelNum = 2;
+    else {
+      if (dir == DMAChannelDir::MM2S)
+        maxChannelNum = targetModel.getNumSourceSwitchboxConnections(
+            tileOp.getCol(), tileOp.getRow(), WireBundle::DMA);
+      else
+        maxChannelNum = targetModel.getNumDestSwitchboxConnections(
+            tileOp.getCol(), tileOp.getRow(), WireBundle::DMA);
+    }
+    for (int i = 0; i < maxChannelNum; i++)
+      if (int usageCnt = channelsPerTile[{tileOp.getResult(), dir, i}];
+          usageCnt == 0) {
+        channelsPerTile[{tileOp.getResult(), dir, i}] = 1;
+        return i;
+      }
+    return -1;
  }
};
@@ -1518,8 +1536,12 @@ struct AIEObjectFifoStatefulTransformPass
       // rely on shared memory and share the same buffers.
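For orientation: the new analysis replaces the two monotonically growing per-direction counters with a single occupancy map keyed by (tile, direction, channel index), and getDMAChannelIndex does a first-fit scan over that map, so channels already claimed by user-written aie.mem, aie.memtile_dma, or aie.shim_dma regions are skipped instead of being handed out twice. A minimal standalone C++ sketch of the same policy (plain int tile ids stand in for mlir::Value and std::map for DenseMap; this is an illustration, not the pass's actual types):

#include <map>
#include <tuple>

enum class DMAChannelDir { MM2S, S2MM };

// Key: (tile id, direction, channel index); value 1 marks the channel taken.
using ChannelKey = std::tuple<int, DMAChannelDir, int>;

// First-fit scan mirroring getDMAChannelIndex: return the lowest free channel
// index below maxChannelNum and reserve it, or -1 if every channel is taken.
int getDMAChannelIndex(std::map<ChannelKey, int> &channelsPerTile, int tile,
                       DMAChannelDir dir, int maxChannelNum) {
  for (int i = 0; i < maxChannelNum; i++)
    if (channelsPerTile[{tile, dir, i}] == 0) {
      channelsPerTile[{tile, dir, i}] = 1;
      return i;
    }
  return -1;
}

With every channel of a tile pre-marked as taken, the scan returns -1, which is exactly the condition the new tileDMA_test_bad*.mlir tests below exercise.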
      for (auto &[producer, consumers] : splitFifos) {
        // create producer tile DMA
-        DMAChannel producerChan =
-            dmaAnalysis.getMasterDMAChannel(producer.getProducerTile());
+        int producerChanIndex = dmaAnalysis.getDMAChannelIndex(
+            producer.getProducerTileOp(), DMAChannelDir::MM2S);
+        if (producerChanIndex == -1)
+          producer.getProducerTileOp().emitOpError(
+              "number of output DMA channels exceeded!");
+        DMAChannel producerChan = {DMAChannelDir::MM2S, producerChanIndex};
        createDMA(device, builder, producer, producerChan.direction,
                  producerChan.channel, 0, producer.getDimensionsToStreamAttr(),
                  producer.getPadDimensionsAttr());
@@ -1535,8 +1557,12 @@ struct AIEObjectFifoStatefulTransformPass
        for (auto consumer : consumers) {
          // create consumer tile DMA
-          DMAChannel consumerChan =
-              dmaAnalysis.getSlaveDMAChannel(consumer.getProducerTile());
+          int consumerChanIndex = dmaAnalysis.getDMAChannelIndex(
+              consumer.getProducerTileOp(), DMAChannelDir::S2MM);
+          if (consumerChanIndex == -1)
+            consumer.getProducerTileOp().emitOpError(
+                "number of input DMA channels exceeded!");
+          DMAChannel consumerChan = {DMAChannelDir::S2MM, consumerChanIndex};
          BDDimLayoutArrayAttr consumerDims =
              consumer.getDimensionsFromStreamPerConsumer()[0];
          createDMA(device, builder, consumer, consumerChan.direction,
diff --git a/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie.mlir b/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie.mlir
deleted file mode 100644
index 52bc3e1af6..0000000000
--- a/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie.mlir
+++ /dev/null
@@ -1,240 +0,0 @@
-//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// (c) Copyright 2024 Advanced Micro Devices, Inc.
or its affiliates -// -//===----------------------------------------------------------------------===// - -module { - aie.device(npu1_4col) { - memref.global "public" @out_cons : memref<16xi32> - memref.global "public" @out : memref<16xi32> - memref.global "public" @in2_mem_cons : memref<256xi32> - memref.global "public" @in2_mem : memref<256xi32> - memref.global "public" @in1_cons : memref<16xi32> - memref.global "public" @in1 : memref<16xi32> - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_1_1 = aie.tile(1, 1) - %tile_2_1 = aie.tile(2, 1) - %tile_0_2 = aie.tile(0, 2) - %out_buff_0 = aie.buffer(%tile_0_2) {sym_name = "out_buff_0"} : memref<16xi32> - %out_buff_1 = aie.buffer(%tile_0_2) {sym_name = "out_buff_1"} : memref<16xi32> - %out_prod_lock = aie.lock(%tile_0_2, 4) {init = 2 : i32, sym_name = "out_prod_lock"} - %out_cons_lock = aie.lock(%tile_0_2, 5) {init = 0 : i32, sym_name = "out_cons_lock"} - %in2_mem_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "in2_mem_cons_buff_0"} : memref<256xi32> - %in2_mem_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "in2_mem_cons_buff_1"} : memref<256xi32> - %in2_mem_cons_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "in2_mem_cons_prod_lock"} - %in2_mem_cons_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "in2_mem_cons_cons_lock"} - %in2_mem_buff_0 = aie.buffer(%tile_0_1) {sym_name = "in2_mem_buff_0"} : memref<64xi32> = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]> - %in2_mem_buff_1 = aie.buffer(%tile_1_1) {sym_name = "in2_mem_buff_1"} : memref<64xi32> = dense<[64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127]> - %in2_mem_buff_2 = aie.buffer(%tile_2_1) {sym_name = "in2_mem_buff_2"} : memref<64xi32> = dense<[128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191]> - %in2_mem_buff_3 = aie.buffer(%tile_2_1) {sym_name = "in2_mem_buff_3"} : memref<64xi32> = dense<[192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]> - %in2_mem_prod_lock = aie.lock(%tile_0_1, 0) {init = 0 : i32, sym_name = "in2_mem_prod_lock"} - %in2_mem_cons_lock = aie.lock(%tile_0_1, 1) {init = 1 : i32, sym_name = "in2_mem_cons_lock"} - %in3_mem_prod_lock = aie.lock(%tile_1_1, 0) {init = 0 : i32, sym_name = "in3_mem_prod_lock"} - %in3_mem_cons_lock = aie.lock(%tile_1_1, 1) {init = 1 : i32, sym_name = "in3_mem_cons_lock"} - %in4_mem_prod_lock = aie.lock(%tile_2_1, 0) {init = 0 : i32, sym_name = "in4_mem_prod_lock"} - %in4_mem_cons_lock = aie.lock(%tile_2_1, 1) {init = 2 : i32, sym_name = "in4_mem_cons_lock"} - %in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "in1_cons_buff_0"} : 
memref<16xi32> - %in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "in1_cons_buff_1"} : memref<16xi32> - %in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "in1_cons_prod_lock"} - %in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "in1_cons_cons_lock"} - aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%tile_1_1, DMA : 0, %tile_0_2, DMA : 1) - aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0) - %core_0_2 = aie.core(%tile_0_2) { - %c16 = arith.constant 16 : index - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c9223372036854775806 = arith.constant 9223372036854775806 : index - %c2 = arith.constant 2 : index - scf.for %arg0 = %c0 to %c9223372036854775806 step %c2 { - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg2] : memref<16xi32> - %2 = arith.muli %arg1, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg2] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg1, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg2] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg2] : memref<16xi32> - %6 = arith.addi %arg2, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_0[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg2] : memref<16xi32> - %2 = arith.muli %arg1, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_1[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg2] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg1, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg2] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_1[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg2] : memref<16xi32> - %6 = arith.addi %arg2, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : 
index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_1[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg1] : memref<16xi32> - %2 = arith.muli %arg0, %c16 : index - %3 = arith.addi %arg1, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg1] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg0, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg1] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg1, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg1] : memref<16xi32> - %6 = arith.addi %arg1, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_0[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - aie.end - } - aie.shim_dma_allocation @in1(MM2S, 0, 0) - aiex.runtime_sequence(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<256xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 1 : i64, metadata = @in1} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<256xi32> - aiex.npu.dma_wait {symbol = @out} - } - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb2 - aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in1_cons_buff_0 : memref<16xi32>, 0, 16) - aie.use_lock(%in1_cons_cons_lock, Release, 1) - aie.next_bd ^bb2 - ^bb2: // pred: ^bb1 - aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in1_cons_buff_1 : memref<16xi32>, 0, 16) - aie.use_lock(%in1_cons_cons_lock, Release, 1) - aie.next_bd ^bb1 - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb6) - ^bb4: // 2 preds: ^bb3, ^bb5 - aie.use_lock(%in2_mem_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_cons_buff_0 : memref<256xi32>, 0, 256) - aie.use_lock(%in2_mem_cons_cons_lock, Release, 1) - aie.next_bd ^bb5 - ^bb5: // pred: ^bb4 - aie.use_lock(%in2_mem_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_cons_buff_1 : memref<256xi32>, 0, 256) - aie.use_lock(%in2_mem_cons_cons_lock, Release, 1) - aie.next_bd ^bb4 - ^bb6: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 0, ^bb7, ^bb9) - ^bb7: // 2 preds: ^bb6, ^bb8 - aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%out_buff_0 : 
memref<16xi32>, 0, 16) - aie.use_lock(%out_prod_lock, Release, 1) - aie.next_bd ^bb8 - ^bb8: // pred: ^bb7 - aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%out_buff_1 : memref<16xi32>, 0, 16) - aie.use_lock(%out_prod_lock, Release, 1) - aie.next_bd ^bb7 - ^bb9: // pred: ^bb6 - aie.end - } - aie.shim_dma_allocation @out(S2MM, 0, 0) - %memtile_dma_1_1 = aie.memtile_dma(%tile_1_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb5) - ^bb1: - aie.use_lock(%in2_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_0 : memref<64xi32>, 0, 64) - aie.use_lock(%in2_mem_prod_lock, Release, 1) - aie.next_bd ^bb2 - ^bb2: - aie.use_lock(%in3_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_1 : memref<64xi32>, 0, 64) - aie.use_lock(%in3_mem_prod_lock, Release, 1) - aie.next_bd ^bb3 - ^bb3: - aie.use_lock(%in4_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_2 : memref<64xi32>, 0, 64) - aie.use_lock(%in4_mem_prod_lock, Release, 1) - aie.next_bd ^bb4 - ^bb4: - aie.use_lock(%in4_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_3 : memref<64xi32>, 0, 64) - aie.use_lock(%in4_mem_prod_lock, Release, 1) - aie.next_bd ^bb5 - ^bb5: - aie.end - } - } -} - diff --git a/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie2.py b/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie2.py new file mode 100644 index 0000000000..4e5bd39bbc --- /dev/null +++ b/test/npu-xrt/adjacent_memtile_access/three_memtiles/aie2.py @@ -0,0 +1,206 @@ +# vector_vector_add/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# REQUIRES: ryzen_ai +# +# RUN: %python %S/aie2.py > ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +# RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +# CHECK: PASS! 
+import numpy as np
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.extras.context import mlir_mod_ctx
+from aie.helpers.dialects.ext.scf import _for as range_
+from aie.dialects import memref
+from aie.extras import types as T  # assumed import path for the T.* type helpers used below
+
+
+def my_vector_add():
+    N = 256
+    n = 16
+    N_div_n = N // n
+
+    @device(AIEDevice.npu1_4col)
+    def device_body():
+        # AIE Core Function declarations
+        tensor_ty_c = np.ndarray[(N,), np.dtype[np.int32]]
+        tensor_ty = np.ndarray[(N // 4,), np.dtype[np.int32]]
+        tensor_ty_s = np.ndarray[(n,), np.dtype[np.int32]]
+
+        memref.global_("out", T.memref(16, T.i32()), sym_visibility="public")
+        memref.global_("in1", T.memref(16, T.i32()), sym_visibility="public")
+
+        # Tile declarations
+        ShimTile = tile(0, 0)
+        MemTile = tile(0, 1)
+        MemTile2 = tile(1, 1)
+        MemTile3 = tile(2, 1)
+        ComputeTile2 = tile(0, 2)
+
+        # MemTile elements
+        in2_mem_prod_lock = lock(MemTile, lock_id=0, init=0)
+        in2_mem_cons_lock = lock(MemTile, lock_id=1, init=1)
+        in2_mem_buff_0 = buffer(
+            tile=MemTile,
+            datatype=tensor_ty,
+            name="in2_mem_buff_0",
+            initial_value=np.arange(N // 4, dtype=np.int32),
+        )
+
+        # MemTile2 elements
+        in3_mem_prod_lock = lock(MemTile2, lock_id=0, init=0)
+        in3_mem_cons_lock = lock(MemTile2, lock_id=1, init=1)
+        in3_mem_buff_0 = buffer(
+            tile=MemTile2,
+            datatype=tensor_ty,
+            name="in3_mem_buff_0",
+            initial_value=np.arange(N // 4, (N // 4) * 2, dtype=np.int32),
+        )
+
+        # MemTile3 elements
+        in4_mem_prod_lock = lock(MemTile3, lock_id=0, init=0)
+        in4_mem_cons_lock = lock(MemTile3, lock_id=1, init=2)
+        in4_mem_buff_0 = buffer(
+            tile=MemTile3,
+            datatype=tensor_ty,
+            name="in4_mem_buff_0",
+            initial_value=np.arange((N // 4) * 2, (N // 4) * 3, dtype=np.int32),
+        )
+        in4_mem_buff_1 = buffer(
+            tile=MemTile3,
+            datatype=tensor_ty,
+            name="in4_mem_buff_1",
+            initial_value=np.arange((N // 4) * 3, N, dtype=np.int32),
+        )
+
+        # ComputeTile2 elements
+        # Input from ShimTile
+        in1_cons_prod_lock = lock(ComputeTile2, lock_id=0, init=1)
+        in1_cons_cons_lock = lock(ComputeTile2, lock_id=1, init=0)
+        in1_cons_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_s,
+            name="in1_cons_buff_0",
+            initial_value=np.arange(n, dtype=np.int32),
+        )
+        # Input from MemTile
+        in2_mem_cons_prod_lock = lock(ComputeTile2, lock_id=2, init=1)
+        in2_mem_cons_cons_lock = lock(ComputeTile2, lock_id=3, init=0)
+        in2_mem_cons_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_c,
+            name="in2_mem_cons_buff_0",
+            initial_value=np.arange(N, dtype=np.int32),
+        )
+        # Output to ShimTile
+        out_prod_lock = lock(ComputeTile2, lock_id=4, init=1)
+        out_cons_lock = lock(ComputeTile2, lock_id=5, init=0)
+        out_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_s,
+            name="out_buff_0",
+            initial_value=np.arange(n, dtype=np.int32),
+        )
+
+        flow(ShimTile, WireBundle.DMA, 0, ComputeTile2, WireBundle.DMA, 0)
+        flow(MemTile2, WireBundle.DMA, 0, ComputeTile2, WireBundle.DMA, 1)
+        flow(ComputeTile2, WireBundle.DMA, 0, ShimTile, WireBundle.DMA, 0)
+
+        # AIE-array data movement
+        shim_dma_allocation("in1", DMAChannelDir.MM2S, 0, 0)
+        shim_dma_allocation("out", DMAChannelDir.S2MM, 0, 0)
+
+        @memtile_dma(MemTile2)
+        def m(block):
+            s0 = dma_start(DMAChannelDir.MM2S, 0, dest=block[1], chain=block[5])
+            with block[1]:
+                use_lock(in2_mem_cons_lock, LockAction.AcquireGreaterEqual)
+                dma_bd(in2_mem_buff_0)
+                use_lock(in2_mem_prod_lock, LockAction.Release)
+                next_bd(block[2])
+            with block[2]:
+                use_lock(in3_mem_cons_lock, LockAction.AcquireGreaterEqual)
+                dma_bd(in3_mem_buff_0)
+                use_lock(in3_mem_prod_lock,
LockAction.Release) + next_bd(block[3]) + with block[3]: + use_lock(in4_mem_cons_lock, LockAction.AcquireGreaterEqual) + dma_bd(in4_mem_buff_0) + use_lock(in4_mem_prod_lock, LockAction.Release) + next_bd(block[4]) + with block[4]: + use_lock(in4_mem_cons_lock, LockAction.AcquireGreaterEqual) + dma_bd(in4_mem_buff_1) + use_lock(in4_mem_prod_lock, LockAction.Release) + next_bd(block[1]) + with block[5]: + EndOp() + + @mem(ComputeTile2) + def m(block): + s0 = dma_start(DMAChannelDir.S2MM, 0, dest=block[1], chain=block[2]) + with block[1]: + use_lock(in1_cons_prod_lock, LockAction.AcquireGreaterEqual) + dma_bd(in1_cons_buff_0) + use_lock(in1_cons_cons_lock, LockAction.Release) + next_bd(block[1]) + with block[2]: + s1 = dma_start(DMAChannelDir.S2MM, 1, dest=block[3], chain=block[4]) + with block[3]: + use_lock(in2_mem_cons_prod_lock, LockAction.AcquireGreaterEqual) + dma_bd(in2_mem_cons_buff_0) + use_lock(in2_mem_cons_cons_lock, LockAction.Release) + next_bd(block[3]) + with block[4]: + s2 = dma_start(DMAChannelDir.MM2S, 0, dest=block[5], chain=block[6]) + with block[5]: + use_lock(out_cons_lock, LockAction.AcquireGreaterEqual) + dma_bd(out_buff_0) + use_lock(out_prod_lock, LockAction.Release) + next_bd(block[5]) + with block[6]: + EndOp() + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + # Effective while(1) + for _ in range_(sys.maxsize): + # Number of sub-vector "tile" iterations + use_lock(in2_mem_cons_cons_lock, LockAction.AcquireGreaterEqual) + for j in range_(N_div_n): + use_lock(in1_cons_cons_lock, LockAction.AcquireGreaterEqual) + use_lock(out_prod_lock, LockAction.AcquireGreaterEqual) + for i in range_(n): + out_buff_0[i] = ( + in2_mem_cons_buff_0[j * N_div_n + i] + in1_cons_buff_0[i] + ) + use_lock(in1_cons_prod_lock, LockAction.Release) + use_lock(out_cons_lock, LockAction.Release) + use_lock(in2_mem_cons_prod_lock, LockAction.Release) + + # To/from AIE-array data movement + @runtime_sequence(tensor_ty_c, tensor_ty_c, tensor_ty_c) + def sequence(A, B, C): + npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_wait("out") + + +with mlir_mod_ctx() as ctx: + my_vector_add() + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) diff --git a/test/npu-xrt/adjacent_memtile_access/three_memtiles/run.lit b/test/npu-xrt/adjacent_memtile_access/three_memtiles/run.lit deleted file mode 100644 index 81d333bfd4..0000000000 --- a/test/npu-xrt/adjacent_memtile_access/three_memtiles/run.lit +++ /dev/null @@ -1,10 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai -// -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir -// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags -// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s -// CHECK: PASS! 
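Aside on the capacity half of the check in getDMAChannelIndex: shim tiles are capped at two channels per direction, while core and mem tiles ask the target model how many DMA switchbox connections the tile has, counting source connections for MM2S and destination connections for S2MM (the real calls also pass WireBundle::DMA). A hedged C++ sketch of that dispatch; ITargetModel here is an illustrative stand-in, not the real AIETargetModel interface:

// Illustrative stand-in for the target-model queries the pass relies on.
struct ITargetModel {
  virtual int getNumSourceSwitchboxConnections(int col, int row) const = 0;
  virtual int getNumDestSwitchboxConnections(int col, int row) const = 0;
  virtual ~ITargetModel() = default;
};

enum class DMAChannelDir { MM2S, S2MM };

int maxDMAChannels(const ITargetModel &tm, bool isShimTile, int col, int row,
                   DMAChannelDir dir) {
  if (isShimTile)
    return 2; // shim DMAs: two channels each way, hardcoded as in the pass
  return dir == DMAChannelDir::MM2S
             ? tm.getNumSourceSwitchboxConnections(col, row) // tile outputs
             : tm.getNumDestSwitchboxConnections(col, row);  // tile inputs
}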
- diff --git a/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie.mlir b/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie.mlir deleted file mode 100644 index fcffc71e9a..0000000000 --- a/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie.mlir +++ /dev/null @@ -1,225 +0,0 @@ -//===- aie.mlir ------------------------------------------------*- MLIR -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates -// -//===----------------------------------------------------------------------===// - -module { - aie.device(npu1_4col) { - memref.global "public" @out_cons : memref<16xi32> - memref.global "public" @out : memref<16xi32> - memref.global "public" @in2_mem_cons : memref<256xi32> - memref.global "public" @in2_mem : memref<256xi32> - memref.global "public" @in1_cons : memref<16xi32> - memref.global "public" @in1 : memref<16xi32> - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_1_1 = aie.tile(1, 1) - %tile_0_2 = aie.tile(0, 2) - %out_buff_0 = aie.buffer(%tile_0_2) {sym_name = "out_buff_0"} : memref<16xi32> - %out_buff_1 = aie.buffer(%tile_0_2) {sym_name = "out_buff_1"} : memref<16xi32> - %out_prod_lock = aie.lock(%tile_0_2, 4) {init = 2 : i32, sym_name = "out_prod_lock"} - %out_cons_lock = aie.lock(%tile_0_2, 5) {init = 0 : i32, sym_name = "out_cons_lock"} - %in2_mem_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "in2_mem_cons_buff_0"} : memref<256xi32> - %in2_mem_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "in2_mem_cons_buff_1"} : memref<256xi32> - %in2_mem_cons_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "in2_mem_cons_prod_lock"} - %in2_mem_cons_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "in2_mem_cons_cons_lock"} - %in2_mem_buff_0 = aie.buffer(%tile_0_1) {sym_name = "in2_mem_buff_0"} : memref<128xi32> = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127]> - %in2_mem_buff_1 = aie.buffer(%tile_1_1) {sym_name = "in2_mem_buff_1"} : memref<128xi32> = dense<[128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255]> - %in2_mem_prod_lock = aie.lock(%tile_0_1, 0) {init = 0 : i32, sym_name = "in2_mem_prod_lock"} - %in2_mem_cons_lock = aie.lock(%tile_0_1, 1) {init = 1 : i32, sym_name = "in2_mem_cons_lock"} - %in3_mem_prod_lock = aie.lock(%tile_1_1, 0) {init = 0 
: i32, sym_name = "in3_mem_prod_lock"} - %in3_mem_cons_lock = aie.lock(%tile_1_1, 1) {init = 1 : i32, sym_name = "in3_mem_cons_lock"} - %in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "in1_cons_buff_0"} : memref<16xi32> - %in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "in1_cons_buff_1"} : memref<16xi32> - %in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "in1_cons_prod_lock"} - %in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "in1_cons_cons_lock"} - aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 1) - aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0) - %core_0_2 = aie.core(%tile_0_2) { - %c16 = arith.constant 16 : index - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c9223372036854775806 = arith.constant 9223372036854775806 : index - %c2 = arith.constant 2 : index - scf.for %arg0 = %c0 to %c9223372036854775806 step %c2 { - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg2] : memref<16xi32> - %2 = arith.muli %arg1, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg2] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg1, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg2] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg2] : memref<16xi32> - %6 = arith.addi %arg2, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_0[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg2] : memref<16xi32> - %2 = arith.muli %arg1, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_1[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg2] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg1, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg2 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg2] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg2, %2 : index - %4 = memref.load %in2_mem_cons_buff_1[%3] : 
memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg2] : memref<16xi32> - %6 = arith.addi %arg2, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_1[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_cons_lock, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c16 step %c2 { - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c1 { - %1 = memref.load %in1_cons_buff_0[%arg1] : memref<16xi32> - %2 = arith.muli %arg0, %c16 : index - %3 = arith.addi %arg1, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_0[%arg1] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - %0 = arith.addi %arg0, %c1 : index - aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1) - aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1) - scf.for %arg1 = %c0 to %c16 step %c2 { - %1 = memref.load %in1_cons_buff_1[%arg1] : memref<16xi32> - %2 = arith.muli %0, %c16 : index - %3 = arith.addi %arg1, %2 : index - %4 = memref.load %in2_mem_cons_buff_0[%3] : memref<256xi32> - %5 = arith.addi %1, %4 : i32 - memref.store %5, %out_buff_1[%arg1] : memref<16xi32> - %6 = arith.addi %arg1, %c1 : index - %7 = memref.load %in1_cons_buff_1[%6] : memref<16xi32> - %8 = arith.muli %0, %c16 : index - %9 = arith.addi %6, %8 : index - %10 = memref.load %in2_mem_cons_buff_0[%9] : memref<256xi32> - %11 = arith.addi %7, %10 : i32 - memref.store %11, %out_buff_1[%6] : memref<16xi32> - } - aie.use_lock(%in1_cons_prod_lock, Release, 1) - aie.use_lock(%out_cons_lock, Release, 1) - } - aie.use_lock(%in2_mem_cons_prod_lock, Release, 1) - aie.end - } - aie.shim_dma_allocation @in1(MM2S, 0, 0) - aiex.runtime_sequence(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<256xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 1 : i64, metadata = @in1} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<256xi32> - aiex.npu.dma_wait {symbol = @out} - } - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb2 - aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in1_cons_buff_0 : memref<16xi32>, 0, 16) - aie.use_lock(%in1_cons_cons_lock, Release, 1) - aie.next_bd ^bb2 - ^bb2: // pred: ^bb1 - aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in1_cons_buff_1 : memref<16xi32>, 0, 16) - aie.use_lock(%in1_cons_cons_lock, Release, 1) - aie.next_bd ^bb1 - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb6) - ^bb4: // 2 preds: ^bb3, ^bb5 - aie.use_lock(%in2_mem_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_cons_buff_0 : memref<256xi32>, 0, 256) - aie.use_lock(%in2_mem_cons_cons_lock, Release, 1) - aie.next_bd ^bb5 - ^bb5: // pred: ^bb4 - aie.use_lock(%in2_mem_cons_prod_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_cons_buff_1 : memref<256xi32>, 0, 256) - 
aie.use_lock(%in2_mem_cons_cons_lock, Release, 1) - aie.next_bd ^bb4 - ^bb6: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 0, ^bb7, ^bb9) - ^bb7: // 2 preds: ^bb6, ^bb8 - aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%out_buff_0 : memref<16xi32>, 0, 16) - aie.use_lock(%out_prod_lock, Release, 1) - aie.next_bd ^bb8 - ^bb8: // pred: ^bb7 - aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%out_buff_1 : memref<16xi32>, 0, 16) - aie.use_lock(%out_prod_lock, Release, 1) - aie.next_bd ^bb7 - ^bb9: // pred: ^bb6 - aie.end - } - aie.shim_dma_allocation @out(S2MM, 0, 0) - %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: - aie.use_lock(%in2_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_0 : memref<128xi32>, 0, 128) - aie.use_lock(%in2_mem_prod_lock, Release, 1) - aie.next_bd ^bb2 - ^bb2: - aie.use_lock(%in3_mem_cons_lock, AcquireGreaterEqual, 1) - aie.dma_bd(%in2_mem_buff_1 : memref<128xi32>, 0, 128) - aie.use_lock(%in3_mem_prod_lock, Release, 1) - aie.next_bd ^bb1 - ^bb3: - aie.end - } - } -} - diff --git a/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie2.py b/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie2.py new file mode 100644 index 0000000000..3092d94e37 --- /dev/null +++ b/test/npu-xrt/adjacent_memtile_access/two_memtiles/aie2.py @@ -0,0 +1,179 @@ +# vector_vector_add/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# REQUIRES: ryzen_ai +# +# RUN: %python %S/aie2.py > ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +# RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +# CHECK: PASS! 
+import numpy as np
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.extras.context import mlir_mod_ctx
+from aie.helpers.dialects.ext.scf import _for as range_
+from aie.dialects import memref
+from aie.extras import types as T  # assumed import path for the T.* type helpers used below
+
+
+def my_vector_add():
+    N = 256
+    n = 16
+    N_div_n = N // n
+
+    @device(AIEDevice.npu1_4col)
+    def device_body():
+        # AIE Core Function declarations
+        tensor_ty_c = np.ndarray[(N,), np.dtype[np.int32]]
+        tensor_ty = np.ndarray[(N // 2,), np.dtype[np.int32]]
+        tensor_ty_s = np.ndarray[(n,), np.dtype[np.int32]]
+
+        memref.global_("out", T.memref(16, T.i32()), sym_visibility="public")
+        memref.global_("in1", T.memref(16, T.i32()), sym_visibility="public")
+
+        # Tile declarations
+        ShimTile = tile(0, 0)
+        MemTile = tile(0, 1)
+        MemTile2 = tile(1, 1)
+        ComputeTile2 = tile(0, 2)
+
+        # MemTile elements
+        in2_mem_prod_lock = lock(MemTile, lock_id=0, init=0)
+        in2_mem_cons_lock = lock(MemTile, lock_id=1, init=1)
+        in2_mem_buff_0 = buffer(
+            tile=MemTile,
+            datatype=tensor_ty,
+            name="in2_mem_buff_0",
+            initial_value=np.arange(N // 2, dtype=np.int32),
+        )
+
+        # MemTile2 elements
+        in3_mem_prod_lock = lock(MemTile2, lock_id=0, init=0)
+        in3_mem_cons_lock = lock(MemTile2, lock_id=1, init=1)
+        in3_mem_buff_0 = buffer(
+            tile=MemTile2,
+            datatype=tensor_ty,
+            name="in3_mem_buff_0",
+            initial_value=np.arange(N // 2, N, dtype=np.int32),
+        )
+
+        # ComputeTile2 elements
+        # Input from ShimTile
+        in1_cons_prod_lock = lock(ComputeTile2, lock_id=0, init=1)
+        in1_cons_cons_lock = lock(ComputeTile2, lock_id=1, init=0)
+        in1_cons_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_s,
+            name="in1_cons_buff_0",
+            initial_value=np.arange(n, dtype=np.int32),
+        )
+        # Input from MemTile
+        in2_mem_cons_prod_lock = lock(ComputeTile2, lock_id=2, init=1)
+        in2_mem_cons_cons_lock = lock(ComputeTile2, lock_id=3, init=0)
+        in2_mem_cons_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_c,
+            name="in2_mem_cons_buff_0",
+            initial_value=np.arange(N, dtype=np.int32),
+        )
+        # Output to ShimTile
+        out_prod_lock = lock(ComputeTile2, lock_id=4, init=1)
+        out_cons_lock = lock(ComputeTile2, lock_id=5, init=0)
+        out_buff_0 = buffer(
+            tile=ComputeTile2,
+            datatype=tensor_ty_s,
+            name="out_buff_0",
+            initial_value=np.arange(n, dtype=np.int32),
+        )
+
+        flow(ShimTile, WireBundle.DMA, 0, ComputeTile2, WireBundle.DMA, 0)
+        flow(MemTile, WireBundle.DMA, 0, ComputeTile2, WireBundle.DMA, 1)
+        flow(ComputeTile2, WireBundle.DMA, 0, ShimTile, WireBundle.DMA, 0)
+
+        # AIE-array data movement
+        shim_dma_allocation("in1", DMAChannelDir.MM2S, 0, 0)
+        shim_dma_allocation("out", DMAChannelDir.S2MM, 0, 0)
+
+        @memtile_dma(MemTile)
+        def m(block):
+            s0 = dma_start(DMAChannelDir.MM2S, 0, dest=block[1], chain=block[3])
+            with block[1]:
+                use_lock(in2_mem_cons_lock, LockAction.AcquireGreaterEqual)
+                dma_bd(in2_mem_buff_0)
+                use_lock(in2_mem_prod_lock, LockAction.Release)
+                next_bd(block[2])
+            with block[2]:
+                use_lock(in3_mem_cons_lock, LockAction.AcquireGreaterEqual)
+                dma_bd(in3_mem_buff_0)
+                use_lock(in3_mem_prod_lock, LockAction.Release)
+                next_bd(block[1])
+            with block[3]:
+                EndOp()
+
+        @mem(ComputeTile2)
+        def m(block):
+            s0 = dma_start(DMAChannelDir.S2MM, 0, dest=block[1], chain=block[2])
+            with block[1]:
+                use_lock(in1_cons_prod_lock, LockAction.AcquireGreaterEqual)
+                dma_bd(in1_cons_buff_0)
+                use_lock(in1_cons_cons_lock, LockAction.Release)
+                next_bd(block[1])
+            with block[2]:
+                s1 = dma_start(DMAChannelDir.S2MM, 1, dest=block[3], chain=block[4])
+            with block[3]:
+                use_lock(in2_mem_cons_prod_lock,
LockAction.AcquireGreaterEqual) + dma_bd(in2_mem_cons_buff_0) + use_lock(in2_mem_cons_cons_lock, LockAction.Release) + next_bd(block[3]) + with block[4]: + s2 = dma_start(DMAChannelDir.MM2S, 0, dest=block[5], chain=block[6]) + with block[5]: + use_lock(out_cons_lock, LockAction.AcquireGreaterEqual) + dma_bd(out_buff_0) + use_lock(out_prod_lock, LockAction.Release) + next_bd(block[5]) + with block[6]: + EndOp() + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + # Effective while(1) + for _ in range_(sys.maxsize): + # Number of sub-vector "tile" iterations + use_lock(in2_mem_cons_cons_lock, LockAction.AcquireGreaterEqual) + for j in range_(N_div_n): + use_lock(in1_cons_cons_lock, LockAction.AcquireGreaterEqual) + use_lock(out_prod_lock, LockAction.AcquireGreaterEqual) + for i in range_(n): + out_buff_0[i] = ( + in2_mem_cons_buff_0[j * N_div_n + i] + in1_cons_buff_0[i] + ) + use_lock(in1_cons_prod_lock, LockAction.Release) + use_lock(out_cons_lock, LockAction.Release) + use_lock(in2_mem_cons_prod_lock, LockAction.Release) + + # To/from AIE-array data movement + @runtime_sequence(tensor_ty_c, tensor_ty_c, tensor_ty_c) + def sequence(A, B, C): + npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_wait("out") + + +with mlir_mod_ctx() as ctx: + my_vector_add() + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) diff --git a/test/npu-xrt/adjacent_memtile_access/two_memtiles/run.lit b/test/npu-xrt/adjacent_memtile_access/two_memtiles/run.lit deleted file mode 100644 index 81d333bfd4..0000000000 --- a/test/npu-xrt/adjacent_memtile_access/two_memtiles/run.lit +++ /dev/null @@ -1,10 +0,0 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// REQUIRES: ryzen_ai -// -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir -// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags -// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s -// CHECK: PASS! - diff --git a/test/objectFifo-stateful-transform/tileDMA_test.mlir b/test/objectFifo-stateful-transform/tileDMA_test.mlir index bea2793512..931972fd24 100644 --- a/test/objectFifo-stateful-transform/tileDMA_test.mlir +++ b/test/objectFifo-stateful-transform/tileDMA_test.mlir @@ -1,4 +1,4 @@ -//===- tileDMA_test.mlir --------------------------*- MLIR -*-===// +//===- tileDMA_test.mlir ---------------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -30,7 +30,7 @@
 // CHECK: %[[VAL_13:.*]] = aie.lock(%[[VAL_0]], 1)
 // CHECK: %[[VAL_14:.*]] = aie.buffer(%[[VAL_0]]) : memref<16xi32>
 // CHECK: %[[VAL_15:.*]] = aie.lock(%[[VAL_0]], 2)
-// CHECK: aie.flow(%[[VAL_0]], DMA : 1, %[[VAL_1]], DMA : 0)
+// CHECK: aie.flow(%[[VAL_0]], DMA : 0, %[[VAL_1]], DMA : 0)
 // CHECK: func.func @some_work(%[[VAL_16:.*]]: memref<16xi32>) {
 // CHECK: return
 // CHECK: }
@@ -50,7 +50,7 @@
 // CHECK: aie.end
 // CHECK: }
 // CHECK: %[[VAL_23:.*]] = aie.mem(%[[VAL_0]]) {
-// CHECK: %[[VAL_24:.*]] = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+// CHECK: %[[VAL_24:.*]] = aie.dma_start(MM2S, 1, ^bb1, ^bb3)
 // CHECK: ^bb1: // 2 preds: ^bb0, ^bb2
 // CHECK: aie.use_lock(%[[VAL_11]], Acquire, 1)
 // CHECK: aie.dma_bd(%[[VAL_10]] : memref<16xi32>, 0, 16)
@@ -69,7 +69,7 @@
 // CHECK: aie.use_lock(%[[VAL_15]], Release, 1)
 // CHECK: aie.next_bd ^bb4
 // CHECK: ^bb5: // pred: ^bb3
-// CHECK: %[[VAL_26:.*]] = aie.dma_start(MM2S, 1, ^bb6, ^bb8)
+// CHECK: %[[VAL_26:.*]] = aie.dma_start(MM2S, 0, ^bb6, ^bb8)
 // CHECK: ^bb6: // 2 preds: ^bb5, ^bb7
 // CHECK: aie.use_lock(%[[VAL_8]], Acquire, 1)
 // CHECK: aie.dma_bd(%[[VAL_6]] : memref<16xi32>, 0, 16)
@@ -134,7 +134,7 @@ module @tileDMA_channels {
     }
     %mem12 = aie.mem(%tile12) {
-      %dma1 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+      %dma1 = aie.dma_start(MM2S, 1, ^bb1, ^bb3)
    ^bb1:
      aie.use_lock(%lock0, Acquire, 1)
      aie.dma_bd(%buff0 : memref<16xi32>, 0, 16)
diff --git a/test/objectFifo-stateful-transform/tileDMA_test_bad.mlir b/test/objectFifo-stateful-transform/tileDMA_test_bad.mlir
new file mode 100644
index 0000000000..af95ba4abb
--- /dev/null
+++ b/test/objectFifo-stateful-transform/tileDMA_test_bad.mlir
@@ -0,0 +1,52 @@
+//===- tileDMA_test_bad.mlir -----------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: not aie-opt --aie-objectFifo-stateful-transform %s 2>&1 | FileCheck %s
+
+// CHECK: error: 'aie.tile' op number of output DMA channels exceeded!
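In the test below, the user-written aie.mem on %tile12 claims both MM2S channels (0 and 1) of the xcvc1902 core tile, so the objectfifo lowering finds no free output channel. In terms of the first sketch above, reusing its ChannelKey and getDMAChannelIndex (tile id 12 is just an illustrative encoding of tile (1, 2)):

// Both MM2S channels of the core tile are taken before the pass runs.
std::map<ChannelKey, int> used = {{{12, DMAChannelDir::MM2S, 0}, 1},
                                  {{12, DMAChannelDir::MM2S, 1}, 1}};
int idx = getDMAChannelIndex(used, /*tile=*/12, DMAChannelDir::MM2S,
                             /*maxChannelNum=*/2);
// idx == -1, so the pass emits "number of output DMA channels exceeded!"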
+
+module @tileDMA_channels {
+  aie.device(xcvc1902) {
+    %tile12 = aie.tile(1, 2)
+    %tile33 = aie.tile(3, 3)
+
+    %buff0 = aie.buffer(%tile12) : memref<16xi32>
+    %lock0 = aie.lock(%tile12, 0)
+    %buff1 = aie.buffer(%tile12) : memref<16xi32>
+    %lock1 = aie.lock(%tile12, 1)
+    %buff2 = aie.buffer(%tile12) : memref<16xi32>
+    %lock2 = aie.lock(%tile12, 2)
+
+    aie.objectfifo @objfifo (%tile12, {%tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    %mem12 = aie.mem(%tile12) {
+      %dma1 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:
+      aie.use_lock(%lock0, Acquire, 1)
+      aie.dma_bd(%buff0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock0, Release, 0)
+      aie.next_bd ^bb2
+    ^bb2:
+      aie.use_lock(%lock1, Acquire, 1)
+      aie.dma_bd(%buff1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock1, Release, 0)
+      aie.next_bd ^bb1
+    ^bb3:
+      %dma2 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
+    ^bb4:
+      aie.use_lock(%lock2, Acquire, 0)
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock2, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:
+      aie.end
+    }
+  }
+}
diff --git a/test/objectFifo-stateful-transform/tileDMA_test_bad2.mlir b/test/objectFifo-stateful-transform/tileDMA_test_bad2.mlir
new file mode 100644
index 0000000000..0c7eca1763
--- /dev/null
+++ b/test/objectFifo-stateful-transform/tileDMA_test_bad2.mlir
@@ -0,0 +1,52 @@
+//===- tileDMA_test_bad2.mlir ----------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: not aie-opt --aie-objectFifo-stateful-transform %s 2>&1 | FileCheck %s
+
+// CHECK: error: 'aie.tile' op number of input DMA channels exceeded!
+
+module @tileDMA_channels {
+  aie.device(xcvc1902) {
+    %tile12 = aie.tile(1, 2)
+    %tile33 = aie.tile(3, 3)
+
+    %buff0 = aie.buffer(%tile12) : memref<16xi32>
+    %lock0 = aie.lock(%tile12, 0)
+    %buff1 = aie.buffer(%tile12) : memref<16xi32>
+    %lock1 = aie.lock(%tile12, 1)
+    %buff2 = aie.buffer(%tile12) : memref<16xi32>
+    %lock2 = aie.lock(%tile12, 2)
+
+    aie.objectfifo @objfifo (%tile33, {%tile12}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    %mem12 = aie.mem(%tile12) {
+      %dma1 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+    ^bb1:
+      aie.use_lock(%lock0, Acquire, 1)
+      aie.dma_bd(%buff0 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock0, Release, 0)
+      aie.next_bd ^bb2
+    ^bb2:
+      aie.use_lock(%lock1, Acquire, 1)
+      aie.dma_bd(%buff1 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock1, Release, 0)
+      aie.next_bd ^bb1
+    ^bb3:
+      %dma2 = aie.dma_start(S2MM, 1, ^bb4, ^bb5)
+    ^bb4:
+      aie.use_lock(%lock2, Acquire, 0)
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.use_lock(%lock2, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:
+      aie.end
+    }
+  }
+}
diff --git a/test/objectFifo-stateful-transform/tileDMA_test_bad3.mlir b/test/objectFifo-stateful-transform/tileDMA_test_bad3.mlir
new file mode 100644
index 0000000000..b84cc3e243
--- /dev/null
+++ b/test/objectFifo-stateful-transform/tileDMA_test_bad3.mlir
@@ -0,0 +1,63 @@
+//===- tileDMA_test_bad3.mlir ----------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: not aie-opt --aie-objectFifo-stateful-transform %s 2>&1 | FileCheck %s
+
+// CHECK: error: 'aie.tile' op number of input DMA channels exceeded!
+
+module @tileDMA_channels {
+  aie.device(xcve2302) {
+    %tile11 = aie.tile(1, 1)
+    %tile33 = aie.tile(3, 3)
+
+    %buff0 = aie.buffer(%tile11) : memref<16xi32>
+    %buff1 = aie.buffer(%tile11) : memref<16xi32>
+    %buff2 = aie.buffer(%tile11) : memref<16xi32>
+    %buff3 = aie.buffer(%tile11) : memref<16xi32>
+    %buff4 = aie.buffer(%tile11) : memref<16xi32>
+    %buff5 = aie.buffer(%tile11) : memref<16xi32>
+
+    aie.objectfifo @objfifo (%tile33, {%tile11}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    %mem11 = aie.memtile_dma(%tile11) {
+      %dma1 = aie.dma_start(S2MM, 0, ^bb1, ^bb2)
+    ^bb1:
+      aie.dma_bd(%buff0 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb1
+    ^bb2:
+      %dma2 = aie.dma_start(S2MM, 1, ^bb3, ^bb4)
+    ^bb3:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb3
+    ^bb4:
+      %dma3 = aie.dma_start(S2MM, 2, ^bb5, ^bb6)
+    ^bb5:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb5
+    ^bb6:
+      %dma4 = aie.dma_start(S2MM, 3, ^bb7, ^bb8)
+    ^bb7:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb7
+    ^bb8:
+      %dma5 = aie.dma_start(S2MM, 4, ^bb9, ^bb10)
+    ^bb9:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb9
+    ^bb10:
+      %dma6 = aie.dma_start(S2MM, 5, ^bb11, ^bb12)
+    ^bb11:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb11
+    ^bb12:
+      aie.end
+    }
+  }
+}
diff --git a/test/objectFifo-stateful-transform/tileDMA_test_bad4.mlir b/test/objectFifo-stateful-transform/tileDMA_test_bad4.mlir
new file mode 100644
index 0000000000..65bb4bfb60
--- /dev/null
+++ b/test/objectFifo-stateful-transform/tileDMA_test_bad4.mlir
@@ -0,0 +1,63 @@
+//===- tileDMA_test_bad4.mlir ----------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: not aie-opt --aie-objectFifo-stateful-transform %s 2>&1 | FileCheck %s
+
+// CHECK: error: 'aie.tile' op number of output DMA channels exceeded!
+
+module @tileDMA_channels {
+  aie.device(xcve2302) {
+    %tile11 = aie.tile(1, 1)
+    %tile33 = aie.tile(3, 3)
+
+    %buff0 = aie.buffer(%tile11) : memref<16xi32>
+    %buff1 = aie.buffer(%tile11) : memref<16xi32>
+    %buff2 = aie.buffer(%tile11) : memref<16xi32>
+    %buff3 = aie.buffer(%tile11) : memref<16xi32>
+    %buff4 = aie.buffer(%tile11) : memref<16xi32>
+    %buff5 = aie.buffer(%tile11) : memref<16xi32>
+
+    aie.objectfifo @objfifo (%tile11, {%tile33}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    %mem11 = aie.memtile_dma(%tile11) {
+      %dma1 = aie.dma_start(MM2S, 0, ^bb1, ^bb2)
+    ^bb1:
+      aie.dma_bd(%buff0 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb1
+    ^bb2:
+      %dma2 = aie.dma_start(MM2S, 1, ^bb3, ^bb4)
+    ^bb3:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb3
+    ^bb4:
+      %dma3 = aie.dma_start(MM2S, 2, ^bb5, ^bb6)
+    ^bb5:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb5
+    ^bb6:
+      %dma4 = aie.dma_start(MM2S, 3, ^bb7, ^bb8)
+    ^bb7:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb7
+    ^bb8:
+      %dma5 = aie.dma_start(MM2S, 4, ^bb9, ^bb10)
+    ^bb9:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb9
+    ^bb10:
+      %dma6 = aie.dma_start(MM2S, 5, ^bb11, ^bb12)
+    ^bb11:
+      aie.dma_bd(%buff2 : memref<16xi32>, 0, 16)
+      aie.next_bd ^bb11
+    ^bb12:
+      aie.end
+    }
+  }
+}
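The mem-tile variants bad3 and bad4 exercise the same exhaustion on xcve2302: the user-written aie.memtile_dma occupies DMA channels 0 through 5 of tile (1, 1), i.e. all six connections the mem-tile switchbox exposes per direction, so the objectfifo's request for one more channel must fail. A runnable toy combining the earlier sketches (the capacity of 6 is what these two tests imply, not a value taken from the target-model code):

#include <iostream>
#include <map>
#include <tuple>

enum class DMAChannelDir { MM2S, S2MM };
using ChannelKey = std::tuple<int, DMAChannelDir, int>;

int getDMAChannelIndex(std::map<ChannelKey, int> &channelsPerTile, int tile,
                       DMAChannelDir dir, int maxChannelNum) {
  for (int i = 0; i < maxChannelNum; i++)
    if (channelsPerTile[{tile, dir, i}] == 0) {
      channelsPerTile[{tile, dir, i}] = 1;
      return i;
    }
  return -1;
}

int main() {
  std::map<ChannelKey, int> channels;
  const int kMemTileChannels = 6; // per direction, as tileDMA_test_bad3/4 imply
  // S2MM 0..5 are claimed by the user-written aie.memtile_dma, as in bad3:
  for (int i = 0; i < kMemTileChannels; i++)
    channels[{11, DMAChannelDir::S2MM, i}] = 1;
  int idx = getDMAChannelIndex(channels, 11, DMAChannelDir::S2MM,
                               kMemTileChannels);
  std::cout << "next S2MM channel: " << idx << "\n"; // prints -1
}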