diff --git a/programming_examples/experimental/Makefile b/programming_examples/experimental/Makefile new file mode 100644 index 0000000000..76d9b27789 --- /dev/null +++ b/programming_examples/experimental/Makefile @@ -0,0 +1,21 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +all: run + +run: ${srcdir}/example.py + rm -rf build + mkdir -p build + cd build && python3 $< + +clean: + rm -rf build \ No newline at end of file diff --git a/programming_examples/experimental/README.md b/programming_examples/experimental/README.md new file mode 100644 index 0000000000..73613a7b34 --- /dev/null +++ b/programming_examples/experimental/README.md @@ -0,0 +1,9 @@ +# Experimental Example + +This example showcases what a higher-level (above IRON) syntax might look like one day for programming NPUs. It is designed to be at a similar level of abstraction to CuPy. +Run the design with: +```bash +make run +``` + +Warning: This example is very brittle. The support for this syntax is strictly proof-of-concept and any changes to the design will likely be beyond what the current implementation supports.
"""Proof-of-concept: add 1 to every element of a matrix with the experimental IRON syntax."""

import numpy as np
import aie.iron.experimental as iron

MATRIX_DIMS = (8, 16)
TILE_DIMS = (2, 4)
MATRIX_DTYPE = np.int32
NUM_WORKERS = 2

# A is created from an all-ones NumPy matrix; B is created with a shape and
# dtype only, with no initial values.
A = iron.asarray(np.full(fill_value=1, shape=MATRIX_DIMS, dtype=MATRIX_DTYPE))
B = iron.array(MATRIX_DIMS, MATRIX_DTYPE)


def task_fn(a, b):
    """Store ``a[i, j] + 1`` into ``b[i, j]`` for every index of ``a``.

    ``a`` and ``b`` are whatever the task runner hands to the task; the only
    requirements visible here are a 2-D ``shape`` on ``a`` and 2-D indexing on
    both arguments.
    """
    dim0, dim1 = a.shape
    for i in iron.range(dim0):
        for j in iron.range(dim1):
            b[i, j] = a[i, j] + 1


task_runner = iron.task_runner(task_fn, [(A, TILE_DIMS)], [(B, TILE_DIMS)], NUM_WORKERS)
task_runner.run()

# Check the result on the host: every output element must be input + 1.
npB = B.asnumpy()
npA = A.asnumpy()
if (npB == npA + 1).all():
    print("PASS!")
else:
    # Show the computed output so the mismatch can be diagnosed.
    print(f"Failed: {npB}")
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, peano +// +// RUN: make -f %S/Makefile clean +// RUN: %run_on_npu make -f %S/Makefile | FileCheck %s +// CHECK: PASS! \ No newline at end of file diff --git a/python/iron/device/device.py b/python/iron/device/device.py index 7290d2a885..28b9b16af7 100644 --- a/python/iron/device/device.py +++ b/python/iron/device/device.py @@ -12,8 +12,6 @@ from ..resolvable import Resolvable from .tile import Tile -# TODO: we need an NPU2 implementation. - class Device(Resolvable): """ diff --git a/python/iron/experimental/README.md b/python/iron/experimental/README.md new file mode 100644 index 0000000000..2c51725412 --- /dev/null +++ b/python/iron/experimental/README.md @@ -0,0 +1,3 @@ +# Experimental Backend + +These files are the backend for the proof-of-concept example found in `programming_examples/experimental`. The implementation here is narrow (and brittle). A more complete backend of this experimental syntax should likely be backed by something similar to a compiler, to avoid an undisciplined soup of heuristics. \ No newline at end of file diff --git a/python/iron/experimental/__init__.py b/python/iron/experimental/__init__.py new file mode 100644 index 0000000000..4c697444c4 --- /dev/null +++ b/python/iron/experimental/__init__.py @@ -0,0 +1,3 @@ +from .array import array, asarray +from .task_runner import task_runner +from aie.helpers.dialects.ext.scf import _for as range diff --git a/python/iron/experimental/array.py b/python/iron/experimental/array.py new file mode 100644 index 0000000000..5c18b19a29 --- /dev/null +++ b/python/iron/experimental/array.py @@ -0,0 +1,47 @@ +# array.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
+ +import numpy as np +from typing import Sequence + + +class array: + def __init__( + self, + shape: Sequence[int], + dtype: np.dtype, + initial_values: np.ndarray | None = None, + num_buffs: int | None = 2, + ): + self._array = None + self._dtype = dtype + self._shape = shape + self._num_buffs = 2 + if not (num_buffs is None): + if num_buffs < 1: + raise ValueError(f"num_buffs must be >= 1, but got {num_buffs}") + self._num_buffs = num_buffs + if not (initial_values is None): + if self._dtype != initial_values.dtype: + raise ValueError( + f"Initial values dtype {initial_values.dtype} does not match given dtype {self._dtype}" + ) + if self._shape != initial_values.shape: + raise ValueError( + f"Initial values shape {initial_values.shape} does not match given shape {self._shape}" + ) + self._array = initial_values + + def asnumpy(self): + if self._array is None: + self._array = np.zeros(self._shape, self._dtype) + return self._array + + +def asarray(arr: np.ndarray, num_buffs: int | None = None): + return array(arr.shape, arr.dtype, arr, num_buffs) diff --git a/python/iron/experimental/task_runner.py b/python/iron/experimental/task_runner.py new file mode 100644 index 0000000000..0abd63b58f --- /dev/null +++ b/python/iron/experimental/task_runner.py @@ -0,0 +1,160 @@ +# task_runner.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
class TaskRunner:
    """Compile a resolved design and execute it once through the XRT helper wrappers.

    Holds the module produced by ``task_runner`` together with the host-side
    input and output ``array`` objects. ``run`` compiles the module with aiecc,
    executes it with the contents of the input arrays and stores the result in
    the first output array.
    """

    # File names handed to aiecc and afterwards to setup_aie.
    _INSTS = "npu_insts.txt"
    _XCLBIN = "final.xclbin"

    # Number of buffers run() is able to describe to setup_aie.
    _MAX_INPUTS = 2
    _MAX_OUTPUTS = 1

    def __init__(
        self, module, input_arrs: Sequence["array"], output_arrs: Sequence["array"]
    ):
        """Store the module to compile and the arrays to use as inputs and outputs."""
        self._module = module
        self._input_arrs = input_arrs
        self._output_arrs = output_arrs

    @classmethod
    def _aiecc_args(cls, xclbin, insts):
        """Return the aiecc argument list that names the xclbin and NPU instruction outputs."""
        return [
            "--aie-generate-cdo",
            f"--xclbin-name={xclbin}",
            "--no-xchesscc",
            "--no-xbridge",
            "--aie-generate-npu",
            f"--npu-insts-name={insts}",
        ]

    def run(self):
        """Compile the module, execute it, and store the result in the first output array.

        Raises:
            NotImplementedError: If there are more inputs or outputs than
                this method can pass to ``setup_aie``.
            ValueError: If there is no output array to receive the result.
        """
        num_inputs = len(self._input_arrs)
        num_outputs = len(self._output_arrs)

        # Validate before compiling so an unsupported configuration fails
        # immediately instead of after the compile step.
        if num_inputs > self._MAX_INPUTS:
            raise NotImplementedError(
                f"setup_aie XRT wrapper can only handle {self._MAX_INPUTS} inputs as present, but got {num_inputs}"
            )
        if num_outputs > self._MAX_OUTPUTS:
            raise NotImplementedError(
                f"setup_aie XRT wrapper can only handle {self._MAX_OUTPUTS} outputs as present, but got {num_outputs}"
            )
        if num_outputs == 0:
            raise ValueError("TaskRunner.run requires one output array, but got none")

        # Compile
        aiecc_run(self._module, self._aiecc_args(self._XCLBIN, self._INSTS))

        # Setup input/output; unused input slots are passed explicitly as None.
        kwargs = {}
        for i, arr in enumerate(self._input_arrs):
            kwargs[f"in_{i}_shape"] = arr._shape
            kwargs[f"in_{i}_dtype"] = arr._dtype
        for i in range(num_inputs, self._MAX_INPUTS):
            kwargs[f"in_{i}_shape"] = None
            kwargs[f"in_{i}_dtype"] = None

        output_arr = self._output_arrs[0]
        kwargs["out_buf_shape"] = output_arr._shape
        kwargs["out_buf_dtype"] = output_arr._dtype

        app = setup_aie(
            self._XCLBIN,
            self._INSTS,
            **kwargs,
        )

        # Execute program and collect output
        aie_output = execute_on_aie(app, *[arr.asnumpy() for arr in self._input_arrs])
        output_arr._array = aie_output


def _build_tiled_fifos(tiled_arrs, prefix, num_workers):
    """Create the tilers, runtime types and per-worker ObjectFifos for one direction.

    Returns ``(arrs, tilers, rt_types, fifos)``, where ``fifos[w][i]`` is the
    ObjectFifo created for array ``i`` and worker ``w``, named ``{prefix}{i}_{w}``.
    """
    arrs = []
    tilers = []
    rt_types = []
    fifos = [[] for _ in range(num_workers)]
    for i, (arr, tile_shape) in enumerate(tiled_arrs):
        arrs.append(arr)
        tile_type = np.ndarray[tile_shape, np.dtype[arr._dtype]]
        tilers.append(TensorTiler2D.simple_tiler(arr._shape, tile_shape))
        rt_types.append(np.ndarray[arr._shape, np.dtype[arr._dtype]])
        for w in range(num_workers):
            fifos[w].append(
                ObjectFifo(
                    tile_type, default_depth=arr._num_buffs, name=f"{prefix}{i}_{w}"
                )
            )
    return arrs, tilers, rt_types, fifos


def task_runner(
    task_fn,
    tiled_inputs: Sequence[tuple["array", Sequence[int]]],
    tiled_outputs: Sequence[tuple["array", Sequence[int]]],
    num_workers: int = 1,
) -> TaskRunner:
    """Build a design that applies ``task_fn`` tile by tile and wrap it in a TaskRunner.

    Every array gets one ObjectFifo per worker. Tiles are handed to the
    workers round-robin: tile ``k`` is filled into and drained from the fifos
    of worker ``k % num_workers``.

    Args:
        task_fn: Callable invoked by each worker with one acquired element per
            input followed by one acquired element per output.
        tiled_inputs: ``(array, tile_shape)`` pairs for the inputs.
        tiled_outputs: ``(array, tile_shape)`` pairs for the outputs.
        num_workers: Number of workers to create.

    Returns:
        A TaskRunner holding the resolved module and the input/output arrays.

    Raises:
        ValueError: If ``num_workers`` is less than 1, if no input is given,
            or if the tilers do not all produce the same number of tiles.
    """
    if num_workers < 1:
        raise ValueError(f"num_workers must be >= 1, but got {num_workers}")
    if not tiled_inputs:
        # The number of tiles to process is taken from the first input.
        raise ValueError("task_runner requires at least one tiled input")

    input_arrs, tas_ins, in_types, of_ins = _build_tiled_fifos(
        tiled_inputs, "in", num_workers
    )
    output_arrs, tas_outs, out_types, of_outs = _build_tiled_fifos(
        tiled_outputs, "out", num_workers
    )
    rt_types = in_types + out_types

    # All tilers are indexed in lock-step below, so a mismatch would either
    # fail part-way through the sequence or leave tiles unprocessed.
    num_tiles = len(tas_ins[0])
    for tas in (*tas_ins, *tas_outs):
        if len(tas) != num_tiles:
            raise ValueError(
                f"All inputs and outputs must have the same number of tiles, but got {len(tas)} and {num_tiles}"
            )

    def worker_wrapper(*args):
        # Acquire one element from every fifo handle, run the task on them,
        # then release them again.
        datas = [of.acquire(1) for of in args]
        task_fn(*datas)
        for of in args:
            of.release(1)

    workers = []
    for w in range(num_workers):
        args = [of_in.cons() for of_in in of_ins[w]]
        args += [of_out.prod() for of_out in of_outs[w]]
        workers.append(Worker(worker_wrapper, args))

    # The runtime drains from the consumer side of each output fifo.
    out_handles = [[of.cons() for of in of_outs[w]] for w in range(num_workers)]

    rt = Runtime()
    with rt.sequence(*rt_types) as rt_buffers:
        rt.start(*workers)

        for tile_idx in range(num_tiles):
            worker_idx = tile_idx % num_workers
            for i, tas in enumerate(tas_ins):
                rt.fill(of_ins[worker_idx][i].prod(), rt_buffers[i], tas[tile_idx])
            for i, tas in enumerate(tas_outs):
                rt.drain(
                    out_handles[worker_idx][i],
                    rt_buffers[i + len(tas_ins)],
                    tas[tile_idx],
                    # Only the last drain of each tile is issued with wait=True.
                    wait=(i == len(tas_outs) - 1),
                )

    my_program = Program(NPU1Col4(), rt)
    module = my_program.resolve_program(SequentialPlacer())
    return TaskRunner(module, input_arrs, output_arrs)