diff --git a/programming_examples/experimental/Makefile b/programming_examples/experimental/Makefile
new file mode 100644
index 0000000000..76d9b27789
--- /dev/null
+++ b/programming_examples/experimental/Makefile
@@ -0,0 +1,27 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# 
+##===----------------------------------------------------------------------===##
+
+# Directory containing this Makefile, so `make -f <path>/Makefile` works from any cwd.
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+# These targets are commands, not files; declare them phony so a stray file
+# named `all`, `run` or `clean` in the working directory cannot mask them.
+.PHONY: all run clean
+
+all: run
+
+# Run the example from a fresh ./build directory (relative to the cwd).
+run: ${srcdir}/example.py
+	rm -rf build
+	mkdir -p build
+	cd build && python3 $<
+
+clean:
+	rm -rf build
\ No newline at end of file
diff --git a/programming_examples/experimental/README.md b/programming_examples/experimental/README.md
new file mode 100644
index 0000000000..73613a7b34
--- /dev/null
+++ b/programming_examples/experimental/README.md
@@ -0,0 +1,9 @@
+# Experimental Example
+
+This example showcases what a higher-level (above IRON) syntax might look like one day for programming NPUs. It is designed to be at a similar level of abstraction to cupy.
+Run the design with:
+```bash
+make run
+```
+
+Warning: This example is very brittle. The support for this syntax is strictly proof-of-concept and any changes to the design will likely be beyond what the current implementation supports.
\ No newline at end of file
diff --git a/programming_examples/experimental/example.py b/programming_examples/experimental/example.py
new file mode 100644
index 0000000000..bfdf59346b
--- /dev/null
+++ b/programming_examples/experimental/example.py
@@ -0,0 +1,35 @@
+import numpy as np
+import aie.iron.experimental as iron
+
+MATRIX_DIMS = (8, 16)
+TILE_DIMS = (2, 4)
+MATRIX_DTYPE = np.int32
+NUM_WORKERS = 2
+
+# Input is filled with ones; the output array is created without initial values.
+A = iron.asarray(np.full(fill_value=1, shape=MATRIX_DIMS, dtype=MATRIX_DTYPE))
+B = iron.array(MATRIX_DIMS, MATRIX_DTYPE)
+
+
+def task_fn(a, b):
+    """Write `a + 1` into `b`, element by element, over the 2D extent of `a`."""
+    dim0, dim1 = a.shape
+    for i in iron.range(dim0):
+        for j in iron.range(dim1):
+            b[i, j] = a[i, j] + 1
+
+
+# Build the design: A is the tiled input, B the tiled output (both TILE_DIMS),
+# with task_fn applied by NUM_WORKERS workers.
+task_runner = iron.task_runner(task_fn, [(A, TILE_DIMS)], [(B, TILE_DIMS)], NUM_WORKERS)
+task_runner.run()
+
+# Check the result on the host: every element of B must equal A + 1.
+npB = B.asnumpy()
+npA = A.asnumpy()
+if (npB == npA + 1).all():
+    print("PASS!")
+else:
+    print(f"Failed: {npB}")
+    # Exit non-zero so `make` sees the failure, not just a missing "PASS!".
+    raise SystemExit(1)
diff --git a/programming_examples/experimental/lit.local.cfg b/programming_examples/experimental/lit.local.cfg
new file mode 100644
index 0000000000..361bee56e5
--- /dev/null
+++ b/programming_examples/experimental/lit.local.cfg
@@ -0,0 +1,11 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+config.suffixes = ['.lit']  # only files ending in .lit are collected as tests here
+# NOTE(review): vitis_components is presumably set by a parent lit config - confirm.
+if 'AIE2' not in config.vitis_components:
+    config.unsupported = True
\ No newline at end of file
diff --git a/programming_examples/experimental/run_makefile.lit b/programming_examples/experimental/run_makefile.lit
new file mode 100644
index 0000000000..d6a861757e
--- /dev/null
+++ b/programming_examples/experimental/run_makefile.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, peano 
+//
+// RUN: make -f %S/Makefile clean
+// RUN: %run_on_npu make -f %S/Makefile | FileCheck %s
+// CHECK: PASS!
\ No newline at end of file
diff --git a/python/iron/device/device.py b/python/iron/device/device.py
index 7290d2a885..28b9b16af7 100644
--- a/python/iron/device/device.py
+++ b/python/iron/device/device.py
@@ -12,8 +12,6 @@
 from ..resolvable import Resolvable
 from .tile import Tile
 
-# TODO: we need an NPU2 implementation.
-
 
 class Device(Resolvable):
     """
diff --git a/python/iron/experimental/README.md b/python/iron/experimental/README.md
new file mode 100644
index 0000000000..2c51725412
--- /dev/null
+++ b/python/iron/experimental/README.md
@@ -0,0 +1,3 @@
+# Experimental Backend
+
+These files are the backend for the proof-of-concept example found in `programming_examples/experimental`. The implementation here is narrow (and brittle). A more complete backend of this experimental syntax should likely be backed by something similar to a compiler, to avoid an undisciplined soup of heuristics.
\ No newline at end of file
diff --git a/python/iron/experimental/__init__.py b/python/iron/experimental/__init__.py
new file mode 100644
index 0000000000..4c697444c4
--- /dev/null
+++ b/python/iron/experimental/__init__.py
@@ -0,0 +1,3 @@
+from .array import array, asarray
+from .task_runner import task_runner
+from aie.helpers.dialects.ext.scf import _for as range
diff --git a/python/iron/experimental/array.py b/python/iron/experimental/array.py
new file mode 100644
index 0000000000..5c18b19a29
--- /dev/null
+++ b/python/iron/experimental/array.py
@@ -0,0 +1,71 @@
+# array.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+
+import numpy as np
+from typing import Sequence
+
+
+class array:
+    """
+    Host-side array description: a shape and dtype, plus numpy data that is
+    either supplied up front or created (zero-filled) on first access.
+    """
+
+    def __init__(
+        self,
+        shape: Sequence[int],
+        dtype: np.dtype,
+        initial_values: np.ndarray | None = None,
+        num_buffs: int | None = 2,
+    ):
+        """
+        Args:
+            shape: Dimensions of the array.
+            dtype: Element type of the array.
+            initial_values: Optional data to back the array; its dtype and
+                shape must match `dtype` and `shape`.
+            num_buffs: Buffer count stored with the array (must be >= 1);
+                `None` selects the default of 2.
+
+        Raises:
+            ValueError: If `num_buffs` is less than 1, or `initial_values`
+                does not match `dtype` or `shape`.
+        """
+        if num_buffs is None:
+            num_buffs = 2
+        elif num_buffs < 1:
+            raise ValueError(f"num_buffs must be >= 1, but got {num_buffs}")
+
+        self._array = None
+        self._dtype = dtype
+        self._shape = shape
+        self._num_buffs = num_buffs
+
+        if initial_values is not None:
+            if self._dtype != initial_values.dtype:
+                raise ValueError(
+                    f"Initial values dtype {initial_values.dtype} does not match given dtype {self._dtype}"
+                )
+            # Compare as tuples: ndarray.shape is always a tuple, but `shape` may
+            # be any sequence (e.g. a list), and `[8, 16] != (8, 16)` is True.
+            if tuple(self._shape) != tuple(initial_values.shape):
+                raise ValueError(
+                    f"Initial values shape {initial_values.shape} does not match given shape {self._shape}"
+                )
+            self._array = initial_values
+
+    def asnumpy(self):
+        """Return the backing numpy array, creating a zero-filled one if none is set."""
+        if self._array is None:
+            self._array = np.zeros(self._shape, self._dtype)
+        return self._array
+
+
+def asarray(arr: np.ndarray, num_buffs: int | None = None):
+    """Wrap an existing numpy array in an `array`, taking shape and dtype from it."""
+    return array(arr.shape, arr.dtype, arr, num_buffs)
diff --git a/python/iron/experimental/task_runner.py b/python/iron/experimental/task_runner.py
new file mode 100644
index 0000000000..0abd63b58f
--- /dev/null
+++ b/python/iron/experimental/task_runner.py
@@ -0,0 +1,216 @@
+# task_runner.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+
+import numpy as np
+from typing import Sequence
+
+from ...compiler.aiecc.main import run as aiecc_run
+from ...utils.xrt import setup_aie, execute as execute_on_aie
+from ...helpers.taplib import TensorTiler2D
+from ..dataflow import ObjectFifo
+from ..device import NPU1Col4
+from ..placers import SequentialPlacer
+from ..program import Program
+from ..runtime import Runtime
+from ..worker import Worker
+
+from .array import array
+
+
+class TaskRunner:
+    """
+    Holds a resolved program module together with its input and output arrays,
+    and compiles and executes that module on request (see `run`).
+    """
+
+    # File names handed to aiecc (as option values) and then to setup_aie.
+    _INSTS = "npu_insts.txt"
+    _XCLBIN = "final.xclbin"
+
+    def __init__(
+        self, module, input_arrs: Sequence[array], output_arrs: Sequence[array]
+    ):
+        self._module = module
+        self._input_arrs = input_arrs
+        self._output_arrs = output_arrs
+
+    @classmethod
+    def _aiecc_args(cls, xclbin, insts):
+        """
+        Return the aiecc option list, with `xclbin` and `insts` substituted into
+        the `--xclbin-name` and `--npu-insts-name` options.
+        """
+        return [
+            "--aie-generate-cdo",
+            f"--xclbin-name={xclbin}",
+            "--no-xchesscc",
+            "--no-xbridge",
+            "--aie-generate-npu",
+            f"--npu-insts-name={insts}",
+        ]
+
+    def run(self):
+        """
+        Compile the module, execute it and store the result in the first output array.
+
+        Raises:
+            NotImplementedError: If there are more inputs or outputs than supported.
+            ValueError: If there is no output array to receive the result.
+        """
+        MAX_INPUTS = 2
+        MAX_OUTPUTS = 1
+
+        # Validate before compiling so an unsupported configuration fails fast
+        # instead of after a full aiecc run.
+        if len(self._input_arrs) > MAX_INPUTS:
+            raise NotImplementedError(
+                f"setup_aie XRT wrapper can only handle {MAX_INPUTS} inputs as present, but got {len(self._input_arrs)}"
+            )
+        if len(self._output_arrs) > MAX_OUTPUTS:
+            raise NotImplementedError(
+                f"setup_aie XRT wrapper can only handle {MAX_OUTPUTS} outputs as present, but got {len(self._output_arrs)}"
+            )
+        if not self._output_arrs:
+            raise ValueError("TaskRunner.run requires an output array, but got none")
+
+        # Compile
+        aiecc_run(self._module, self._aiecc_args(self._XCLBIN, self._INSTS))
+
+        # Setup input/output
+        kwargs = {}
+        for i, arr in enumerate(self._input_arrs):
+            kwargs[f"in_{i}_shape"] = arr._shape
+            kwargs[f"in_{i}_dtype"] = arr._dtype
+        # Input slots that are not used are passed explicitly as None.
+        for i in range(len(self._input_arrs), MAX_INPUTS):
+            kwargs[f"in_{i}_shape"] = None
+            kwargs[f"in_{i}_dtype"] = None
+
+        kwargs["out_buf_shape"] = self._output_arrs[0]._shape
+        kwargs["out_buf_dtype"] = self._output_arrs[0]._dtype
+
+        app = setup_aie(
+            self._XCLBIN,
+            self._INSTS,
+            **kwargs,
+        )
+
+        # Execute program and collect output
+        aie_output = execute_on_aie(app, *[arr.asnumpy() for arr in self._input_arrs])
+        self._output_arrs[0]._array = aie_output
+
+
+def task_runner(
+    task_fn,
+    tiled_inputs: Sequence[tuple[array, Sequence[int]]],
+    tiled_outputs: Sequence[tuple[array, Sequence[int]]],
+    num_workers: int = 1,
+) -> TaskRunner:
+    """
+    Build a program that applies `task_fn` tile by tile and wrap it in a TaskRunner.
+
+    Every array is split into tiles of its paired tile shape with
+    `TensorTiler2D.simple_tiler`, and successive tiles are handed to the workers
+    in round-robin order.
+
+    Args:
+        task_fn: Function run by each worker; it receives one acquired object per
+            input fifo followed by one per output fifo.
+        tiled_inputs: `(array, tile_shape)` pairs describing the inputs.
+        tiled_outputs: `(array, tile_shape)` pairs describing the outputs.
+        num_workers: Number of workers to create (must be >= 1).
+
+    Returns:
+        A TaskRunner holding the resolved module and the input and output arrays.
+
+    Raises:
+        ValueError: If `num_workers` is less than 1 or `tiled_inputs` is empty.
+    """
+    if num_workers < 1:
+        raise ValueError(f"num_workers must be >= 1, but got {num_workers}")
+    if len(tiled_inputs) == 0:
+        raise ValueError(
+            "tiled_inputs must contain at least one (array, tile_shape) pair"
+        )
+
+    # Per input: the tiling from simple_tiler, a runtime type and one fifo per worker.
+    tas_ins = []
+    of_ins = [[] for _ in range(num_workers)]
+    rt_types = []
+    input_arrs = []
+    for i, tiles_param in enumerate(tiled_inputs):
+        arr, tile_shape = tiles_param
+        input_arrs.append(arr)
+        tile_type = np.ndarray[tile_shape, np.dtype[arr._dtype]]
+        tas_ins.append(TensorTiler2D.simple_tiler(arr._shape, tile_shape))
+        rt_types.append(np.ndarray[arr._shape, np.dtype[arr._dtype]])
+        for w in range(num_workers):
+            of_ins[w].append(
+                ObjectFifo(tile_type, default_depth=arr._num_buffs, name=f"in{i}_{w}")
+            )
+
+    # Same for the outputs; their entries in rt_types come after the inputs'.
+    tas_outs = []
+    of_outs = [[] for _ in range(num_workers)]
+    output_arrs = []
+    for i, tiles_param in enumerate(tiled_outputs):
+        arr, tile_shape = tiles_param
+        output_arrs.append(arr)
+        tile_type = np.ndarray[tile_shape, np.dtype[arr._dtype]]
+        tas_outs.append(TensorTiler2D.simple_tiler(arr._shape, tile_shape))
+        rt_types.append(np.ndarray[arr._shape, np.dtype[arr._dtype]])
+        for w in range(num_workers):
+            of_outs[w].append(
+                ObjectFifo(tile_type, default_depth=arr._num_buffs, name=f"out{i}_{w}")
+            )
+
+    def worker_wrapper(*args):
+        # Acquire one object per fifo handle, run the task, then release each one.
+        datas = []
+        for of in args:
+            datas.append(of.acquire(1))
+        task_fn(*datas)
+        for of in args:
+            of.release(1)
+
+    # Each worker gets the consumer side of its input fifos and the producer side
+    # of its output fifos, in that order.
+    workers = []
+    for w in range(num_workers):
+        args = [of_in.cons() for of_in in of_ins[w]]
+        args += [of_out.prod() for of_out in of_outs[w]]
+        workers.append(Worker(worker_wrapper, args))
+
+    # The runtime sequence drains the outputs, so swap in their consumer handles.
+    for i in range(num_workers):
+        of_outs[i] = [of.cons() for of in of_outs[i]]
+
+    rt = Runtime()
+    with rt.sequence(*rt_types) as rt_buffers:
+        rt.start(*workers)
+
+        # The first input's tile count drives the loop; every other array is indexed
+        # with the same tile index, so all arrays are assumed to yield that many tiles.
+        taps_idx = 0
+        worker_idx = 0
+        while taps_idx < len(tas_ins[0]):
+            for i, tas in enumerate(tas_ins):
+                rt.fill(of_ins[worker_idx][i].prod(), rt_buffers[i], tas[taps_idx])
+            for i, tas in enumerate(tas_outs):
+                rt.drain(
+                    of_outs[worker_idx][i],
+                    rt_buffers[i + len(tas_ins)],
+                    tas[taps_idx],
+                    wait=(i == len(tas_outs) - 1),
+                )
+            taps_idx += 1
+            worker_idx = (worker_idx + 1) % num_workers
+
+    my_program = Program(NPU1Col4(), rt)
+    module = my_program.resolve_program(SequentialPlacer())
+    return TaskRunner(module, input_arrs, output_arrs)