diff --git a/include/aie-c/Translation.h b/include/aie-c/Translation.h index 0bb56cac7e..c1256160a5 100644 --- a/include/aie-c/Translation.h +++ b/include/aie-c/Translation.h @@ -30,6 +30,10 @@ aieTranslateToCDODirect(MlirOperation moduleOp, MlirStringRef workDirPath, bool bigEndian, bool emitUnified, bool cdoDebug, bool aieSim, bool xaieDebug, bool enableCores); +MLIR_CAPI_EXPORTED MlirLogicalResult +aieTranslateToTxn(MlirOperation moduleOp, MlirStringRef workDirPath, + bool aieSim, bool xaieDebug, bool enableCores); + #ifdef __cplusplus } #endif diff --git a/include/aie/Targets/AIETargets.h b/include/aie/Targets/AIETargets.h index 65907c89ab..0ee9f7cbec 100644 --- a/include/aie/Targets/AIETargets.h +++ b/include/aie/Targets/AIETargets.h @@ -51,6 +51,12 @@ AIETranslateToCDODirect(mlir::ModuleOp m, llvm::StringRef workDirPath, bool bigEndian = false, bool emitUnified = false, bool cdoDebug = false, bool aieSim = false, bool xaieDebug = false, bool enableCores = true); +mlir::LogicalResult AIETranslateToTxn(mlir::ModuleOp m, + llvm::StringRef workDirPath, + bool aieSim = false, + bool xaieDebug = false, + bool enableCores = true); + #ifdef AIE_ENABLE_AIRBIN mlir::LogicalResult AIETranslateToAirbin(mlir::ModuleOp module, const std::string &outputFilename, diff --git a/lib/CAPI/Translation.cpp b/lib/CAPI/Translation.cpp index 7111469b88..7d63784de8 100644 --- a/lib/CAPI/Translation.cpp +++ b/lib/CAPI/Translation.cpp @@ -79,6 +79,25 @@ MlirLogicalResult aieTranslateToCDODirect(MlirOperation moduleOp, return wrap(status); } +MlirLogicalResult aieTranslateToTxn(MlirOperation moduleOp, + MlirStringRef workDirPath, bool aieSim, + bool xaieDebug, bool enableCores) { + ModuleOp mod = llvm::cast(unwrap(moduleOp)); + auto status = AIETranslateToTxn( + mod, llvm::StringRef(workDirPath.data, workDirPath.length), aieSim, + xaieDebug, enableCores); + std::vector diagnostics; + ScopedDiagnosticHandler handler(mod.getContext(), [&](Diagnostic &d) { + 
llvm::raw_string_ostream(diagnostics.emplace_back()) + << d.getLocation() << ": " << d; + }); + if (failed(status)) + for (const auto &diagnostic : diagnostics) + std::cerr << diagnostic << "\n"; + + return wrap(status); +} + MlirStringRef aieTranslateToNPU(MlirOperation moduleOp) { std::string npu; llvm::raw_string_ostream os(npu); diff --git a/lib/Targets/AIETargetCDODirect.cpp b/lib/Targets/AIETargetCDODirect.cpp index a71c05a94b..cd4321eb8d 100644 --- a/lib/Targets/AIETargetCDODirect.cpp +++ b/lib/Targets/AIETargetCDODirect.cpp @@ -19,6 +19,7 @@ extern "C" { #include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Region.h" +#include "mlir/Support/FileUtilities.h" #include "mlir/Support/LLVM.h" #include "mlir/Support/LogicalResult.h" @@ -27,6 +28,7 @@ extern "C" { #include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ToolOutputFile.h" #include #include @@ -109,13 +111,13 @@ static const std::map // https://stackoverflow.com/a/32230306 template -raw_ostream &showArgs(raw_ostream &out, const char *label, H1 &&value) { +static raw_ostream &showArgs(raw_ostream &out, const char *label, H1 &&value) { return out << label << "=" << std::forward

(value); } template -raw_ostream &showArgs(raw_ostream &out, const char *label, H1 &&value, - T &&...rest) { +static raw_ostream &showArgs(raw_ostream &out, const char *label, H1 &&value, + T &&...rest) { const char *pcomma = strchr(label, ','); return showArgs(out.write(label, pcomma - label) << "=" << std::forward

(value) << ',', @@ -124,19 +126,19 @@ raw_ostream &showArgs(raw_ostream &out, const char *label, H1 &&value, #define SHOW_ARGS(os, ...) showArgs(os, #__VA_ARGS__, __VA_ARGS__) -raw_ostream &operator<<(raw_ostream &os, const XAie_LocType &loc) { +static raw_ostream &operator<<(raw_ostream &os, const XAie_LocType &loc) { os << "XAie_LocType(col: " << std::to_string(loc.Col) << ", row: " << std::to_string(loc.Row) << ")"; return os; } -raw_ostream &operator<<(raw_ostream &os, const XAie_Lock &lock) { +static raw_ostream &operator<<(raw_ostream &os, const XAie_Lock &lock) { os << "XAie_Lock(id: " << std::to_string(lock.LockId) << ", val: " << std::to_string(lock.LockVal) << ")"; return os; } -raw_ostream &operator<<(raw_ostream &os, const XAie_Packet &packet) { +static raw_ostream &operator<<(raw_ostream &os, const XAie_Packet &packet) { os << "XAie_Packet(id: " << std::to_string(packet.PktId) << ", type: " << std::to_string(packet.PktType) << ")"; return os; @@ -190,11 +192,10 @@ auto ps = std::filesystem::path::preferred_separator; #define MEM_TILE_LOCK_ID_INCR 64 #define BASE_ADDR_A_INCR 0x80000 -namespace xilinx::AIE { - -LogicalResult configureLocksInBdBlock(XAie_DmaDesc &dmaTileBd, Block &block, - const AIETargetModel &targetModel, - XAie_LocType &tileLoc) { +static LogicalResult configureLocksInBdBlock(XAie_DmaDesc &dmaTileBd, + Block &block, + const AIETargetModel &targetModel, + XAie_LocType &tileLoc) { LLVM_DEBUG(llvm::dbgs() << "\nstart configuring bds\n"); std::optional acqValue, relValue, acqLockId, relLockId; bool acqEn; @@ -241,11 +242,11 @@ LogicalResult configureLocksInBdBlock(XAie_DmaDesc &dmaTileBd, Block &block, return success(); } -LogicalResult configureBdInBlock(XAie_DevInst &devInst, XAie_DmaDesc &dmaTileBd, - Block &block, - const AIETargetModel &targetModel, - XAie_LocType &tileLoc, int bdId, - std::optional nextBdId) { +static LogicalResult configureBdInBlock(XAie_DevInst &devInst, + XAie_DmaDesc &dmaTileBd, Block &block, + const 
AIETargetModel &targetModel, + XAie_LocType &tileLoc, int bdId, + std::optional nextBdId) { std::optional packetType; std::optional packetID; @@ -388,10 +389,11 @@ LogicalResult configureBdInBlock(XAie_DevInst &devInst, XAie_DmaDesc &dmaTileBd, return success(); }; -LogicalResult pushToBdQueueAndEnable(XAie_DevInst &devInst, Operation &op, - XAie_LocType &tileLoc, int chNum, - const DMAChannelDir &channelDir, int bdId, - int repeatCount) { +static LogicalResult pushToBdQueueAndEnable(XAie_DevInst &devInst, + Operation &op, + XAie_LocType &tileLoc, int chNum, + const DMAChannelDir &channelDir, + int bdId, int repeatCount) { XAie_DmaDirection direction = channelDir == DMAChannelDir::S2MM ? DMA_S2MM : DMA_MM2S; auto enTokenIssue = tileLoc.Row == 0 && direction == DMA_S2MM; @@ -405,9 +407,9 @@ LogicalResult pushToBdQueueAndEnable(XAie_DevInst &devInst, Operation &op, return success(); }; -LogicalResult configureLocksAndBd(XAie_DevInst &devInst, Block &block, - XAie_LocType tileLoc, - const AIETargetModel &targetModel) { +static LogicalResult configureLocksAndBd(XAie_DevInst &devInst, Block &block, + XAie_LocType tileLoc, + const AIETargetModel &targetModel) { DMABDOp bd = *block.getOps().begin(); assert(bd.getBdId().has_value() && "DMABDOp must have assigned bd_id; did you forget to run " @@ -424,6 +426,7 @@ LogicalResult configureLocksAndBd(XAie_DevInst &devInst, Block &block, return success(); }; +namespace { struct AIEControl { XAie_Config configPtr; XAie_DevInst devInst; @@ -485,17 +488,28 @@ struct AIEControl { TRY_XAIE_API_FATAL_ERROR(XAie_UpdateNpiAddr, &devInst, NPI_ADDR); } - LogicalResult addAieElfToCDO(uint8_t col, uint8_t row, - const StringRef elfPath, bool aieSim) { + LogicalResult addAieElf(uint8_t col, uint8_t row, const StringRef elfPath, + bool aieSim) { + TRY_XAIE_API_LOGICAL_RESULT(XAie_CoreDisable, &devInst, + XAie_TileLoc(col, row)); + TRY_XAIE_API_LOGICAL_RESULT(XAie_DmaChannelResetAll, &devInst, + XAie_TileLoc(col, row), + 
XAie_DmaChReset::DMA_CHANNEL_RESET); + // loadSym: Load symbols from .map file. This argument is not used when // __AIESIM__ is not defined. TRY_XAIE_API_LOGICAL_RESULT(XAie_LoadElf, &devInst, XAie_TileLoc(col, row), elfPath.str().c_str(), /*loadSym*/ aieSim); + + TRY_XAIE_API_LOGICAL_RESULT(XAie_DmaChannelResetAll, &devInst, + XAie_TileLoc(col, row), + XAie_DmaChReset::DMA_CHANNEL_UNRESET); + return success(); } - LogicalResult addAieElfsToCDO(DeviceOp &targetOp, const StringRef workDirPath, - bool aieSim) { + LogicalResult addAieElfs(DeviceOp &targetOp, const StringRef workDirPath, + bool aieSim) { for (auto tileOp : targetOp.getOps()) if (tileOp.isShimNOCorPLTile()) { // Resets no needed with V2 kernel driver @@ -510,7 +524,7 @@ struct AIEControl { fileName = (llvm::Twine("core_") + std::to_string(col) + "_" + std::to_string(row) + ".elf") .str(); - if (failed(addAieElfToCDO( + if (failed(addAieElf( col, row, (llvm::Twine(workDirPath) + std::string(1, ps) + fileName) .str(), @@ -521,7 +535,7 @@ struct AIEControl { return success(); } - LogicalResult addInitConfigToCDO(DeviceOp &targetOp) { + LogicalResult addInitConfig(DeviceOp &targetOp) { for (auto tileOp : targetOp.getOps()) { auto tileLoc = XAie_TileLoc(tileOp.colIndex(), tileOp.rowIndex()); if (!tileOp.isShimTile() && tileOp.getCoreOp()) { @@ -734,7 +748,7 @@ struct AIEControl { return success(); } - LogicalResult addCoreEnableToCDO(DeviceOp &targetOp) { + LogicalResult addCoreEnable(DeviceOp &targetOp) { // Start execution of all the cores. 
for (auto tileOp : targetOp.getOps()) { auto tileLoc = XAie_TileLoc(tileOp.colIndex(), tileOp.rowIndex()); @@ -743,26 +757,20 @@ struct AIEControl { } return success(); } - - void dmaUpdateBdAddr(DeviceOp &targetOp, int col, int row, size_t addr, - size_t bdId) { - auto tileLoc = XAie_TileLoc(col, row); - TRY_XAIE_API_FATAL_ERROR(XAie_DmaUpdateBdAddr, &devInst, tileLoc, addr, - bdId); - } }; -} // namespace xilinx::AIE +} // namespace -void initializeCDOGenerator(byte_ordering endianness, bool cdoDebug) { +static void initializeCDOGenerator(byte_ordering endianness, bool cdoDebug) { // Enables AXI-MM prints for configs being added in CDO if (cdoDebug) EnAXIdebug(); setEndianness(endianness); }; -LogicalResult generateCDOBinary(const StringRef outputPath, - const std::function &cb) { +static LogicalResult +generateCDOBinary(const StringRef outputPath, + const std::function &cb) { // TODO(newling): Get bootgen team to remove print statement in this function. startCDOFileStream(outputPath.str().c_str()); @@ -777,58 +785,59 @@ LogicalResult generateCDOBinary(const StringRef outputPath, return success(); } -LogicalResult generateCDOBinariesSeparately(AIEControl &ctl, - const StringRef workDirPath, - DeviceOp &targetOp, bool aieSim, - bool enableCores) { +static LogicalResult generateCDOBinariesSeparately(AIEControl &ctl, + const StringRef workDirPath, + DeviceOp &targetOp, + bool aieSim, + bool enableCores) { if (failed(generateCDOBinary( (llvm::Twine(workDirPath) + std::string(1, ps) + "aie_cdo_elfs.bin") .str(), [&ctl, &targetOp, &workDirPath, &aieSim] { - return ctl.addAieElfsToCDO(targetOp, workDirPath, aieSim); + return ctl.addAieElfs(targetOp, workDirPath, aieSim); }))) return failure(); if (failed(generateCDOBinary( (llvm::Twine(workDirPath) + std::string(1, ps) + "aie_cdo_init.bin") .str(), - [&ctl, &targetOp] { return ctl.addInitConfigToCDO(targetOp); }))) + [&ctl, &targetOp] { return ctl.addInitConfig(targetOp); }))) return failure(); if (enableCores && 
failed(generateCDOBinary( (llvm::Twine(workDirPath) + std::string(1, ps) + "aie_cdo_enable.bin") .str(), - [&ctl, &targetOp] { return ctl.addCoreEnableToCDO(targetOp); }))) + [&ctl, &targetOp] { return ctl.addCoreEnable(targetOp); }))) return failure(); return success(); } -LogicalResult generateCDOUnified(AIEControl &ctl, const StringRef workDirPath, - DeviceOp &targetOp, bool aieSim, - bool enableCores) { +static LogicalResult generateCDOUnified(AIEControl &ctl, + const StringRef workDirPath, + DeviceOp &targetOp, bool aieSim, + bool enableCores) { return generateCDOBinary( (llvm::Twine(workDirPath) + std::string(1, ps) + "aie_cdo.bin").str(), [&ctl, &targetOp, &workDirPath, &aieSim, &enableCores] { if (!targetOp.getOps().empty() && - failed(ctl.addAieElfsToCDO(targetOp, workDirPath, aieSim))) + failed(ctl.addAieElfs(targetOp, workDirPath, aieSim))) return failure(); - if (failed(ctl.addInitConfigToCDO(targetOp))) + if (failed(ctl.addInitConfig(targetOp))) return failure(); if (enableCores && !targetOp.getOps().empty() && - failed(ctl.addCoreEnableToCDO(targetOp))) + failed(ctl.addCoreEnable(targetOp))) return failure(); return success(); }); } -LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath, - byte_ordering endianness, - bool emitUnified, bool cdoDebug, - bool aieSim, bool xaieDebug, - bool enableCores) { +static LogicalResult +translateToCDODirect(ModuleOp m, llvm::StringRef workDirPath, + byte_ordering endianness, bool emitUnified, bool cdoDebug, + bool aieSim, bool xaieDebug, bool enableCores) { auto devOps = m.getOps(); assert(llvm::range_size(devOps) == 1 && @@ -854,16 +863,76 @@ LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath, }(); return result; } -// Not sure why but defining this with xilinx::AIE will create a duplicate -// symbol in libAIETargets.a that then doesn't actually match the header? 
-namespace xilinx::AIE {
-LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath,
-                                      bool bigEndian, bool emitUnified,
-                                      bool cdoDebug, bool aieSim,
-                                      bool xaieDebug, bool enableCores) {
+
+/// Runs the selected configuration stages against `ctl`, in a fixed order:
+/// ELF loading (`enableElfs`), initial configuration (`enableInit`), then
+/// core enable (`enableCores`).
+///
+/// The ELF-loading and core-enable stages are additionally skipped when
+/// `targetOp` contains no core ops.
+///
+/// \param ctl         Control object the stages are issued on.
+/// \param workDirPath Directory passed to the ELF-loading stage.
+/// \param targetOp    Device whose configuration is emitted.
+/// \param aieSim      Forwarded to the ELF-loading stage.
+/// \returns failure() as soon as an enabled stage fails, success() otherwise.
+static LogicalResult generateTxn(AIEControl &ctl, const StringRef workDirPath,
+                                 DeviceOp &targetOp, bool aieSim,
+                                 bool enableElfs, bool enableInit,
+                                 bool enableCores) {
+  const bool hasCores = !targetOp.getOps<CoreOp>().empty();
+
+  if (enableElfs && hasCores &&
+      failed(ctl.addAieElfs(targetOp, workDirPath, aieSim)))
+    return failure();
+  if (enableInit && failed(ctl.addInitConfig(targetOp)))
+    return failure();
+  if (enableCores && hasCores && failed(ctl.addCoreEnable(targetOp)))
+    return failure();
+  return success();
+}
+
+/// Serializes the configuration of the single device in `m` as a transaction
+/// and writes it to the file `txn.bin` inside `workDirPath`.
+///
+/// \param m           Module that must contain exactly one device op.
+/// \param workDirPath Directory holding the core ELFs; also receives txn.bin.
+/// \param aieSim      Forwarded to AIEControl and to ELF loading.
+/// \param xaieDebug   Forwarded to AIEControl.
+/// \param enableCores When false, the core-enable stage is not recorded.
+/// \returns failure() if the module does not hold exactly one NPU device, if
+///          any configuration stage fails, if the transaction cannot be
+///          exported, or if the output file cannot be opened. Nothing is
+///          written unless every preceding step succeeded.
+static LogicalResult translateToTxn(ModuleOp m, llvm::StringRef workDirPath,
+                                    bool aieSim, bool xaieDebug,
+                                    bool enableCores) {
+  auto devOps = m.getOps<DeviceOp>();
+  // Reject "no device" as well as "several devices": dereferencing begin() of
+  // an empty range below would be undefined behavior.
+  if (llvm::range_size(devOps) != 1)
+    return m.emitError("only exactly 1 device op supported.");
+
+  DeviceOp targetOp = *devOps.begin();
+  // NOTE(review): as before, the model is downcast before isNPU() is
+  // consulted; confirm isNPU() is safe to call through this reference when the
+  // device is not an NPU.
+  const auto &targetModel =
+      static_cast<const BaseNPUTargetModel &>(targetOp.getTargetModel());
+  if (!targetModel.isNPU())
+    return targetOp.emitError(
+        "transaction generation is only supported for NPU devices");
+
+  AIEControl ctl(aieSim, xaieDebug, targetModel);
+
+  // Start collecting transactions.
+  XAie_StartTransaction(&ctl.devInst, XAIE_TRANSACTION_DISABLE_AUTO_FLUSH);
+
+  // ELF loading and initial configuration are always recorded; enabling the
+  // cores follows the caller's request instead of being forced on.
+  if (failed(generateTxn(ctl, workDirPath, targetOp, aieSim,
+                         /*enableElfs=*/true, /*enableInit=*/true,
+                         enableCores)))
+    return failure();
+
+  // Export the transactions to a buffer.
+  // NOTE(review): ownership of the returned buffer is not visible here;
+  // confirm against aie-rt whether the caller is expected to release it.
+  uint8_t *txnPtr = XAie_ExportSerializedTransaction(&ctl.devInst, 0, 0);
+  if (!txnPtr)
+    return m.emitError("failed to export serialized transaction");
+  const auto *hdr = reinterpret_cast<const XAie_TxnHeader *>(txnPtr);
+
+  // Write the transactions to file.
+  std::string filename =
+      (llvm::Twine(workDirPath) + std::string(1, ps) + "txn.bin").str();
+
+  std::string errorMessage;
+  auto output = openOutputFile(filename, &errorMessage);
+  if (!output) {
+    llvm::errs() << errorMessage << "\n";
+    return failure();
+  }
+  output->os().write(reinterpret_cast<const char *>(txnPtr), hdr->TxnSize);
+  output->keep();
+  return success();
+}
+
+LogicalResult xilinx::AIE::AIETranslateToCDODirect( + ModuleOp m, llvm::StringRef workDirPath, bool bigEndian, bool emitUnified, + bool cdoDebug, bool aieSim, bool xaieDebug, bool enableCores) { byte_ordering endianness = bigEndian ? byte_ordering::Big_Endian : byte_ordering::Little_Endian; - return AIETranslateToCDODirect(m, workDirPath, endianness, emitUnified, - cdoDebug, aieSim, xaieDebug, enableCores); + return translateToCDODirect(m, workDirPath, endianness, emitUnified, cdoDebug, + aieSim, xaieDebug, enableCores); +} + +LogicalResult xilinx::AIE::AIETranslateToTxn(ModuleOp m, + llvm::StringRef workDirPath, + bool aieSim, bool xaieDebug, + bool enableCores) { + return translateToTxn(m, workDirPath, aieSim, xaieDebug, enableCores); } -} // namespace xilinx::AIE diff --git a/lib/Targets/AIETargetNPU.cpp b/lib/Targets/AIETargetNPU.cpp index 58d8ef175e..1514d22687 100644 --- a/lib/Targets/AIETargetNPU.cpp +++ b/lib/Targets/AIETargetNPU.cpp @@ -191,18 +191,20 @@ std::vector xilinx::AIE::AIETranslateToNPU(ModuleOp module) { auto words = reserveAndGetTail(instructions, 4); + DeviceOp deviceOp = *module.getOps().begin(); + const AIETargetModel &tm = deviceOp.getTargetModel(); + // setup txn header uint8_t major = 1; uint8_t minor = 0; uint8_t devGen = 3; - uint8_t numRows = 6; - uint8_t numCols = 5; - uint8_t numMemTileRows = 1; + uint8_t numRows = tm.rows(); + uint8_t numCols = tm.columns(); + uint8_t numMemTileRows = tm.getNumMemTileRows(); uint32_t count = 0; words[0] = (numRows << 24) | (devGen << 16) | (minor << 8) | major; words[1] = (numMemTileRows << 8) | numCols; - DeviceOp deviceOp = *module.getOps().begin(); auto sequenceOps = deviceOp.getOps(); for (auto f : sequenceOps) { Block &entry = f.getBody().front(); diff --git a/lib/Targets/AIETargets.cpp b/lib/Targets/AIETargets.cpp index 4214a20023..2c107d2aad 100644 --- a/lib/Targets/AIETargets.cpp +++ b/lib/Targets/AIETargets.cpp @@ -153,6 +153,10 @@ void registerAIETranslations() { "cdo-enable-cores", 
llvm::cl::init(true), llvm::cl::desc("Enable cores in CDO")); + static llvm::cl::opt npuInstGenBinary( + "aie-npu-instgen-binary", llvm::cl::init(false), + llvm::cl::desc("Emit binary (true) or text (false) NPU instructions")); + TranslateFromMLIRRegistration registrationMMap( "aie-generate-mmap", "Generate AIE memory map", [](ModuleOp module, raw_ostream &output) { @@ -332,9 +336,30 @@ void registerAIETranslations() { cdoXaieDebug, cdoEnableCores); }, registerDialects); + TranslateFromMLIRRegistration registrationCDOWithTxn( + "aie-generate-txn", "Generate TXN configuration", + [](ModuleOp module, raw_ostream &) { + SmallString<128> workDirPath_; + if (workDirPath.getNumOccurrences() == 0) { + if (llvm::sys::fs::current_path(workDirPath_)) + llvm::report_fatal_error( + "couldn't get cwd to use as work-dir-path"); + } else + workDirPath_ = workDirPath.getValue(); + LLVM_DEBUG(llvm::dbgs() << "work-dir-path: " << workDirPath_ << "\n"); + return AIETranslateToTxn(module, workDirPath_.c_str(), cdoAieSim, + cdoXaieDebug, cdoEnableCores); + }, + registerDialects); TranslateFromMLIRRegistration registrationNPU( "aie-npu-instgen", "Generate instructions for NPU", [](ModuleOp module, raw_ostream &output) { + if (npuInstGenBinary == true) { + auto instructions = AIETranslateToNPU(module); + output.write(reinterpret_cast(instructions.data()), + instructions.size() * sizeof(uint32_t)); + return success(); + } return AIETranslateToNPU(module, output); }, registerDialects); diff --git a/python/AIEMLIRModule.cpp b/python/AIEMLIRModule.cpp index 111a372d95..7ae267fe72 100644 --- a/python/AIEMLIRModule.cpp +++ b/python/AIEMLIRModule.cpp @@ -115,6 +115,21 @@ PYBIND11_MODULE(_aie, m) { "emit_unified"_a = false, "cdo_debug"_a = false, "aiesim"_a = false, "xaie_debug"_a = false, "enable_cores"_a = true); + m.def( + "generate_txn", + [](MlirOperation op, const std::string &workDirPath, bool aieSim, + bool xaieDebug, bool enableCores) { + mlir::python::CollectDiagnosticsToStringScope 
scope( + mlirOperationGetContext(op)); + if (mlirLogicalResultIsFailure( + aieTranslateToTxn(op, {workDirPath.data(), workDirPath.size()}, + aieSim, xaieDebug, enableCores))) + throw py::value_error("Failed to generate txn binary because: " + + scope.takeMessage()); + }, + "module"_a, "work_dir_path"_a, "aiesim"_a = false, "xaie_debug"_a = false, + "enable_cores"_a = true); + m.def( "npu_instgen", [&stealCStr](MlirOperation op) { diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 3bdd9d38b0..31d32f6102 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -388,8 +388,9 @@ add_custom_command( TARGET AIEPythonModules PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/compiler/aiecc.py + ${CMAKE_CURRENT_SOURCE_DIR}/compiler/txn2mlir.py ${CMAKE_BINARY_DIR}/bin ) # during install -install(PROGRAMS compiler/aiecc.py DESTINATION bin) +install(PROGRAMS compiler/aiecc.py compiler/txn2mlir.py DESTINATION bin) diff --git a/python/compiler/aiecc/cl_arguments.py b/python/compiler/aiecc/cl_arguments.py index f09575f40b..dc7637e8b0 100644 --- a/python/compiler/aiecc/cl_arguments.py +++ b/python/compiler/aiecc/cl_arguments.py @@ -237,6 +237,14 @@ def parse_args(args=None): const=True, help="Generate libxaie v2 for CDO", ) + parser.add_argument( + "--aie-generate-txn", + dest="txn", + default=False, + action="store_const", + const=True, + help="Generate txn binary for configuration", + ) parser.add_argument( "--aie-generate-xclbin", dest="xcl", diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py index 2ee1d5a132..4d3f969363 100644 --- a/python/compiler/aiecc/main.py +++ b/python/compiler/aiecc/main.py @@ -278,10 +278,15 @@ def generate_cores_list(mlir_module_str): ] -def emit_design_bif(root_path, has_cores=True, enable_cores=True): - cdo_elfs_file = f"file={root_path}/aie_cdo_elfs.bin" - cdo_init_file = f"file={root_path}/aie_cdo_init.bin" - cdo_enable_file = f"file={root_path}/aie_cdo_enable.bin" if enable_cores 
else "" +def emit_design_bif(root_path, has_cores=True, enable_cores=True, unified=False): + if unified: + cdo_unified_file = f"file={root_path}/aie_cdo.bin" if unified else "" + files = f"{cdo_unified_file}" + else: + cdo_elfs_file = f"file={root_path}/aie_cdo_elfs.bin" + cdo_init_file = f"file={root_path}/aie_cdo_init.bin" + cdo_enable_file = f"file={root_path}/aie_cdo_enable.bin" if enable_cores else "" + files = f"{cdo_elfs_file} {cdo_init_file} {cdo_enable_file}" return dedent( f"""\ all: @@ -291,11 +296,7 @@ def emit_design_bif(root_path, has_cores=True, enable_cores=True): image {{ name=aie_image, id=0x1c000000 - {{ type=cdo - {cdo_elfs_file} - {cdo_init_file} - {cdo_enable_file} - }} + {{ type=cdo {files} }} }} }} """ @@ -552,6 +553,25 @@ async def process_cdo(self): ) generate_cdo(input_physical.operation, self.tmpdirname) + async def process_txn(self): + from aie.dialects.aie import generate_txn + + with Context(), Location.unknown(): + for elf in glob.glob("*.elf"): + try: + shutil.copy(elf, self.tmpdirname) + except shutil.SameFileError: + pass + for elf_map in glob.glob("*.elf.map"): + try: + shutil.copy(elf_map, self.tmpdirname) + except shutil.SameFileError: + pass + input_physical = Module.parse( + await read_file_async(self.prepend_tmp("input_physical.mlir")) + ) + generate_txn(input_physical.operation, self.tmpdirname) + async def process_xclbin_gen(self): if opts.progress: task = self.progress_bar.add_task( @@ -1090,9 +1110,13 @@ async def run_flow(self): # Must have elfs, before we build the final binary assembly if opts.cdo and opts.execute: await self.process_cdo() + if opts.cdo or opts.xcl: await self.process_xclbin_gen() + if opts.txn and opts.execute: + await self.process_txn() + def dumpprofile(self): sortedruntimes = sorted( self.runtimes.items(), key=lambda item: item[1], reverse=True diff --git a/python/compiler/txn2mlir.py b/python/compiler/txn2mlir.py new file mode 100755 index 0000000000..402adc8c23 --- /dev/null +++ 
b/python/compiler/txn2mlir.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. + +import aie +from aie.ir import * +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.dialects.ext import memref + +import sys +import struct + + +def print_none(*args): + pass + + +def print_log_(*args): + print(*args) + + +print_log = print_none + + +def parse_txn(data, verbose=False): + print_log = print_log_ if verbose else print_none + + header_format = "BBBBBBII" + major, minor, dev_gen, num_rows, num_cols, num_mem_tile_rows, num_ops, txn_size = ( + struct.unpack(header_format, data[:16]) + ) + print(f"// Major: {major}") + print(f"// Minor: {minor}") + print(f"// DevGen: {dev_gen}") + print(f"// NumRows: {num_rows}") + print(f"// NumCols: {num_cols}") + print(f"// NumMemTileRows: {num_mem_tile_rows}") + print(f"// NumOps: {num_ops}") + print(f"// TxnSize: {txn_size} bytes") + operations = [] + i = 16 + # v0.1 + if major == 0 and minor == 1: + while i < len(data): + opc, _, _, _ = struct.unpack("BBBB", data[i : i + 4]) + print_log(f"opcode: {opc:#x}") + if opc == 0x00: + print_log("opcode: WRITE (0x00)") + addr0, addr1, value, size = struct.unpack("IIII", data[i + 8 : i + 24]) + addr = addr1 << 32 | addr0 + print_log(f"addr: {addr:#x}") + print_log(f"value: {value:#x}") + print_log(f"size: {size}") + operations.append((opc, addr, value)) + i = i + size + elif opc == 0x01: + print_log("opcode: BLOCKWRITE (0x01)") + _, addr, size = struct.unpack("III", data[i + 4 : i + 16]) + print_log(f"addr: {addr:#x}") + print_log(f"size: {size}") + operations.append((opc, addr, data[i + 16 : i + size - 16])) + i = i + size + elif opc == 0x03: + print_log("opcode: MASKWRITE (0x03)") + addr0, addr1, 
value, mask, size = struct.unpack( + "IIIII", data[i + 8 : i + 28] + ) + addr = addr1 << 32 | addr0 + print_log(f"addr: {addr:#x}") + print_log(f"value: {value:#x}") + print_log(f"mask: {mask:#x}") + print_log(f"size: {size}") + operations.append((opc, addr, value, mask)) + i = i + size + else: + value = struct.unpack("I", data[i : i + 4])[0] + raise Exception(f"Unhandled header: {value:#x}") + # v1.0 + if major == 1 and minor == 0: + while i < len(data): + opc, _, _, _ = struct.unpack("BBBB", data[i : i + 4]) + print_log(f"opcode: {opc:#x}") + if opc == 0x00: + print_log("opcode: WRITE (0x00)") + addr, value = struct.unpack("II", data[i + 4 : i + 12]) + print_log(f"addr: {addr:#x}") + print_log(f"value: {value:#x}") + operations.append((opc, addr, value)) + i = i + 12 + elif opc == 0x01: + print_log("opcode: BLOCKWRITE (0x01)") + addr, size = struct.unpack("II", data[i + 4 : i + 12]) + print_log(f"addr: {addr:#x}") + print_log(f"size: {size}") + operations.append((opc, addr, data[i + 12 : i + size])) + i = i + size + elif opc == 0x03: + print_log("opcode: MASKWRITE (0x03)") + addr, value, mask = struct.unpack("III", data[i + 4 : i + 16]) + print_log(f"addr: {addr:#x}") + print_log(f"value: {value:#x}") + print_log(f"mask: {mask:#x}") + operations.append((opc, addr, value, mask)) + i = i + 16 + else: + value = struct.unpack("I", data[i : i + 4])[0] + raise Exception(f"Unhandled header: {value:#x}") + return num_cols, operations + + +def operations_to_mlir(operations, columns=5): + with Context(), Location.unknown(): + module = Module.create() + global_data = [] + with InsertionPoint(module.body): + + devs = { + 1: AIEDevice.npu1_1col, + 2: AIEDevice.npu1_2col, + 3: AIEDevice.npu1_3col, + 4: AIEDevice.npu1_4col, + 5: AIEDevice.npu1, + } + + @device(devs[columns]) + def device_body(): + for op in operations: + if op[0] == 0x01: + d = np.frombuffer(op[2], dtype=np.int32) + blockwrite_data = memref.global_(initial_value=d) + global_data.append(blockwrite_data) + else: 
+ global_data.append(None) + + @runtime_sequence() + def sequence(): + for op, payload in zip(operations, global_data): + if op[0] == 0x00: + addr = op[1] + value = op[2] + npu_write32(addr, value) + elif op[0] == 0x01: + addr = op[1] + d = memref.get_global( + payload.type_.value, payload.sym_name.value + ) + npu_blockwrite(addr, d) + elif op[0] == 0x03: + addr = op[1] + value = op[2] + mask = op[3] + npu_maskwrite32(addr, value, mask) + else: + raise Exception(f"Unhandled op: {op:#x}") + + return module + + +if __name__ == "__main__": + # Check if command line arguments are provided + if len(sys.argv) == 1: + # Read data from standard input + data = sys.stdin.buffer.read() + # Parse the TXN data + columns, operations = parse_txn(data) + else: + # Process each file provided as command line argument + operations = [] + for filename in sys.argv[1:]: + # Open the file in binary mode + with open(filename, "rb") as f: + # Read the data from the file + data = f.read() + # Parse the TXN data + columns, ops = parse_txn(data) + operations = operations + ops + + module = operations_to_mlir(operations, columns) + + print(str(module)) diff --git a/python/dialects/aie.py b/python/dialects/aie.py index 2c08a5a78e..64f4967879 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -22,6 +22,7 @@ aie_llvm_link, generate_bcf, generate_cdo, + generate_txn, generate_xaie, npu_instgen, register_dialect, diff --git a/test/Targets/NPU/npu_blockwrite_instgen.mlir b/test/Targets/NPU/npu_blockwrite_instgen.mlir index 06eceab6e3..4ba0b41342 100644 --- a/test/Targets/NPU/npu_blockwrite_instgen.mlir +++ b/test/Targets/NPU/npu_blockwrite_instgen.mlir @@ -15,7 +15,7 @@ module { // TXN header // CHECK: 06030001 - // CHECK: 00000105 + // CHECK: 00000104 // CHECK: 00000003 // CHECK: 00000058 diff --git a/test/Targets/NPU/npu_instgen.mlir b/test/Targets/NPU/npu_instgen.mlir index 5b2b9a3ec2..1eadf3d7eb 100644 --- a/test/Targets/NPU/npu_instgen.mlir +++ 
b/test/Targets/NPU/npu_instgen.mlir @@ -10,7 +10,7 @@ // RUN: aie-translate --aie-npu-instgen %s | FileCheck %s module { - aie.device(npu1_4col) { + aie.device(npu1) { memref.global "private" constant @write_data : memref<8xi32> = dense<[100, 101, 102, 103, 104 ,105, 106, 107]> aiex.runtime_sequence(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { diff --git a/test/lit.cfg.py b/test/lit.cfg.py index a13fa3774e..b7fbfc2e1f 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -291,6 +291,7 @@ def prepend_path(path): "llvm-objdump", "opt", "xchesscc_wrapper", + "txn2mlir.py", ] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/test/npu-xrt/add_one_two_txn/aie1.mlir b/test/npu-xrt/add_one_two_txn/aie1.mlir new file mode 100644 index 0000000000..5213729485 --- /dev/null +++ b/test/npu-xrt/add_one_two_txn/aie1.mlir @@ -0,0 +1,55 @@ +//===- aie1.mlir -----------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + %t00 = aie.tile(0, 0) + %t01 = aie.tile(0, 1) + %t02 = aie.tile(0, 2) + + aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] () + + aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] () + + aie.core(%t02) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1_32 = arith.constant 1 : i32 + + scf.for %niter = %c0 to %c1 step %c1 { + scf.for %steps = %c0 to %c8 step %c1 { + %subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview> + %elem0 =
aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref<8xi32> + %subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview> + %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<8xi32> + scf.for %arg3 = %c0 to %c8 step %c1 { + %0 = memref.load %elem0[%arg3] : memref<8xi32> + %1 = arith.addi %0, %c1_32 : i32 + memref.store %1, %elem1[%arg3] : memref<8xi32> + } + aie.objectfifo.release @objFifo_in1(Consume, 1) + aie.objectfifo.release @objFifo_out1(Produce, 1) + } + } + aie.end + } + + aiex.runtime_sequence(%in : memref<64xi32>, %out : memref<64xi32>) { + %c0 = arith.constant 0 : i64 + %c1 = arith.constant 1 : i64 + %c64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> + aiex.npu.dma_wait { symbol = @objFifo_out0 } + } + } +} diff --git a/test/npu-xrt/add_one_two_txn/aie2.mlir b/test/npu-xrt/add_one_two_txn/aie2.mlir new file mode 100644 index 0000000000..0f6fb728b0 --- /dev/null +++ b/test/npu-xrt/add_one_two_txn/aie2.mlir @@ -0,0 +1,55 @@ +//===- aie2.mlir -----------------------------------------------*- MLIR -*-===// +// +// Copyright (C) 2024, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + %t00 = aie.tile(0, 0) + %t01 = aie.tile(0, 1) + %t02 = aie.tile(0, 2) + + aie.objectfifo @objFifo_in0(%t00, {%t01}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_in1(%t01, {%t02}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] () + + aie.objectfifo @objFifo_out1(%t02, {%t01}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @objFifo_out0(%t01, {%t00}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] () + + aie.core(%t02) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + // NOTE: despite its name, %c2_32 holds 102; test.cpp checks this design's output against input + 102. + %c2_32 = arith.constant 102 : i32 + + scf.for %niter = %c0 to %c1 step %c1 { + scf.for %steps = %c0 to %c8 step %c1 { + %subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview> + %elem0 = aie.objectfifo.subview.access %subview0[0] : !aie.objectfifosubview> -> memref<8xi32> + %subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview> + %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview> -> memref<8xi32> + scf.for %arg3 = %c0 to %c8 step %c1 { + %0 = memref.load %elem0[%arg3] : memref<8xi32> + %1 = arith.addi %0, %c2_32 : i32 + memref.store %1, %elem1[%arg3] : memref<8xi32> + } + aie.objectfifo.release @objFifo_in1(Consume, 1) + aie.objectfifo.release @objFifo_out1(Produce, 1) + } + } + aie.end + } + + aiex.runtime_sequence(%in : memref<64xi32>, %out : memref<64xi32>) { + %c0 = arith.constant 0 : i64 + %c1 = arith.constant 1 : i64 + %c64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0,
%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> + aiex.npu.dma_wait { symbol = @objFifo_out0 } + } + } +} diff --git a/test/npu-xrt/add_one_two_txn/run.lit b/test/npu-xrt/add_one_two_txn/run.lit new file mode 100644 index 0000000000..6d6213b235 --- /dev/null +++ b/test/npu-xrt/add_one_two_txn/run.lit @@ -0,0 +1,12 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// Design 1 (aie1.mlir) is built into add_one.xclbin. Design 2 (aie2.mlir) is +// built as a transaction binary (txn.bin), converted to MLIR by txn2mlir.py, +// and re-emitted as add_two_cfg.bin, which test.exe receives through -c. +// +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %python aiecc.py --xclbin-kernel-name=ADDONE --xclbin-kernel-id=0x901 --xclbin-instance-name=ADDONEINST --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=add_one.xclbin --npu-insts-name=add_one_insts.txt %S/aie1.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-txn --aie-generate-npu --no-compile-host --npu-insts-name=add_two_insts.txt %S/aie2.mlir +// RUN: %python txn2mlir.py aie2.mlir.prj/txn.bin > add_two_cfg.mlir +// RUN: aie-translate -aie-npu-instgen -aie-npu-instgen-binary=true add_two_cfg.mlir -o add_two_cfg.bin +// RUN: %run_on_npu ./test.exe -x add_one.xclbin -i add_one_insts.txt -c add_two_cfg.bin -j add_two_insts.txt | FileCheck %s +// CHECK: PASS! diff --git a/test/npu-xrt/add_one_two_txn/test.cpp b/test/npu-xrt/add_one_two_txn/test.cpp new file mode 100644 index 0000000000..f014cebc7f --- /dev/null +++ b/test/npu-xrt/add_one_two_txn/test.cpp @@ -0,0 +1,279 @@ +//===- test.cpp -------------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <boost/program_options.hpp>
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "experimental/xrt_kernel.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+// Host program for the add_one_two_txn test. It queues three runs on one
+// kernel: design 1's instruction stream, a configuration stream passed with
+// -c, and design 2's instruction stream. It then checks both output buffers
+// and prints "PASS!" when every element matches.
+
+// Number of 32-bit words in each input and output buffer.
+constexpr int IN_SIZE = 64;
+constexpr int OUT_SIZE = 64;
+
+// First input value and expected per-element increment for each design.
+// Inputs are FIRST_INPUT + i; expected outputs are input + ADDEND.
+constexpr uint32_t FIRST_INPUT_0 = 1;
+constexpr uint32_t ADDEND_0 = 1;
+constexpr uint32_t FIRST_INPUT_1 = 2;
+constexpr uint32_t ADDEND_1 = 102;
+
+namespace po = boost::program_options;
+
+// Throws std::runtime_error unless option `name` was supplied on the command
+// line and names a file that can be opened for reading.
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (!vm_in.count(name))
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+
+  const std::string path = vm_in[name].as<std::string>();
+  std::ifstream test(path);
+  if (!test)
+    throw std::runtime_error("The " + name + " file " + path +
+                             " does not exist.\n");
+}
+
+// Reads a text instruction file holding one hexadecimal 32-bit word per line.
+// Throws std::runtime_error if the file cannot be opened or a line does not
+// parse as hex.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  if (!instr_file)
+    throw std::runtime_error("Unable to open instruction file " + instr_path +
+                             "\n");
+
+  std::vector<uint32_t> instr_v;
+  std::string line;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a))
+      throw std::runtime_error("Unable to parse instruction file\n");
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+// Reads a binary file as a sequence of 32-bit words in host byte order.
+// Throws std::runtime_error if the file cannot be opened, its size is not a
+// whole number of words, or the read comes up short.
+std::vector<uint32_t> load_instr_binary(std::string instr_path) {
+  // Binary mode avoids newline translation; `ate` puts the read position at
+  // the end so tellg() reports the file size.
+  std::ifstream instr_file(instr_path, std::ios::binary | std::ios::ate);
+  if (!instr_file)
+    throw std::runtime_error("Unable to open binary file " + instr_path +
+                             "\n");
+
+  const std::streamsize word_bytes = sizeof(uint32_t);
+  const std::streamsize size = instr_file.tellg();
+  // A trailing partial word would make read() write past the vector's end.
+  if (size < 0 || size % word_bytes != 0)
+    throw std::runtime_error("Binary file " + instr_path +
+                             " is not a whole number of 32-bit words\n");
+
+  instr_file.seekg(0, std::ios::beg);
+  std::vector<uint32_t> instr_v(static_cast<size_t>(size / word_bytes));
+  if (!instr_file.read(reinterpret_cast<char *>(instr_v.data()), size))
+    throw std::runtime_error("Unable to read binary file " + instr_path +
+                             "\n");
+  return instr_v;
+}
+
+// Compares OUT_SIZE words of `out` against (first_input + i) + addend, prints
+// one line per element, and returns the number of mismatches.
+static int count_output_errors(const uint32_t *out, uint32_t first_input,
+                               uint32_t addend) {
+  int errors = 0;
+  for (int i = 0; i < OUT_SIZE; i++) {
+    const uint32_t ref = (first_input + i) + addend;
+    if (out[i] != ref) {
+      std::cout << "Error in output " << out[i] << " != " << ref << std::endl;
+      errors++;
+    } else {
+      std::cout << "Correct output " << out[i] << " == " << ref << std::endl;
+    }
+  }
+  return errors;
+}
+
+int main(int argc, const char *argv[]) {
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")("verbosity,v",
+                               po::value<int>()->default_value(0),
+                               "the verbosity of the output")(
+      "instr0,i", po::value<std::string>()->required(),
+      "path to instructions for kernel0")("instr1,j",
+                                          po::value<std::string>()->required(),
+                                          "path to instructions for kernel1")(
+      "cfg,c", po::value<std::string>()->required(), "txn binary path");
+  po::variables_map vm;
+
+  std::vector<uint32_t> instr_0_v;
+  std::vector<uint32_t> instr_1_v;
+  std::vector<uint32_t> cfg_1_v;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+
+    // Handle --help before notify(): notify() throws when a required option
+    // is missing, which would otherwise hide the help text behind an error.
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 1;
+    }
+    po::notify(vm);
+
+    // Report missing input files here, inside the try block, so the user
+    // gets the usage text instead of an uncaught exception.
+    const char *const file_opts[] = {"xclbin", "instr0", "instr1", "cfg"};
+    for (const char *file_opt : file_opts)
+      check_arg_file_exists(vm, file_opt);
+
+    instr_0_v = load_instr_sequence(vm["instr0"].as<std::string>());
+    instr_1_v = load_instr_sequence(vm["instr1"].as<std::string>());
+    cfg_1_v = load_instr_binary(vm["cfg"].as<std::string>());
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    return 1;
+  }
+
+  const int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1) {
+    std::cout << "Sequence instr 0 count: " << instr_0_v.size() << "\n";
+    std::cout << "Sequence instr 1 count: " << instr_1_v.size() << "\n";
+    std::cout << "Sequence cfg count: " << cfg_1_v.size() << "\n";
+  }
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  const std::string xclbin_path = vm["xclbin"].as<std::string>();
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << xclbin_path << "\n";
+  auto xclbin = xrt::xclbin(xclbin_path);
+
+  // Get the kernel from the xclbin. Each inspected kernel name is printed.
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                 [](xrt::xclbin::kernel &k) {
+                                   auto name = k.get_name();
+                                   std::cout << "Name: " << name << std::endl;
+                                   return name == "ADDONE";
+                                 });
+  // Dereferencing end() is undefined behaviour; fail with a message instead.
+  if (xkernel_it == xkernels.end()) {
+    std::cerr << "Kernel ADDONE not found in " << xclbin_path << "\n";
+    return 1;
+  }
+  auto kernelName0 = xkernel_it->get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << xclbin_path << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  auto kernel0 = xrt::kernel(context, kernelName0);
+
+  // NOTE(review): group_id(1), (3) and (4) presumably select the memory banks
+  // of kernel arguments 1, 3 and 4 -- confirm against the XRT documentation.
+  auto bo_instr_0 = xrt::bo(device, instr_0_v.size() * sizeof(uint32_t),
+                            XCL_BO_FLAGS_CACHEABLE, kernel0.group_id(1));
+  auto bo_inA_0 = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                          XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(3));
+  auto bo_out_0 = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                          XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(4));
+
+  auto bo_instr_1 = xrt::bo(device, instr_1_v.size() * sizeof(uint32_t),
+                            XCL_BO_FLAGS_CACHEABLE, kernel0.group_id(1));
+  auto bo_cfg_1 = xrt::bo(device, cfg_1_v.size() * sizeof(uint32_t),
+                          XCL_BO_FLAGS_CACHEABLE, kernel0.group_id(1));
+  auto bo_inA_1 = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                          XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(3));
+  auto bo_out_1 = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                          XRT_BO_FLAGS_HOST_ONLY, kernel0.group_id(4));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Fill the input BOs through their host mappings.
+  uint32_t *bufInA_0 = bo_inA_0.map<uint32_t *>();
+  uint32_t *bufInA_1 = bo_inA_1.map<uint32_t *>();
+  for (int i = 0; i < IN_SIZE; i++) {
+    bufInA_0[i] = FIRST_INPUT_0 + i;
+    bufInA_1[i] = FIRST_INPUT_1 + i;
+  }
+
+  // Getting handles to the instruction sequence BOs and copy data to them
+  void *bufInstr_0 = bo_instr_0.map<void *>();
+  void *bufInstr_1 = bo_instr_1.map<void *>();
+  void *bufCfg_1 = bo_cfg_1.map<void *>();
+  std::memcpy(bufInstr_0, instr_0_v.data(),
+              instr_0_v.size() * sizeof(uint32_t));
+  std::memcpy(bufInstr_1, instr_1_v.data(),
+              instr_1_v.size() * sizeof(uint32_t));
+  std::memcpy(bufCfg_1, cfg_1_v.data(), cfg_1_v.size() * sizeof(uint32_t));
+
+  // Synchronizing BOs
+  bo_instr_0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_cfg_1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_instr_1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA_0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA_1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // NOTE(review): the meaning of opcode 3 is defined by the kernel interface
+  // and is not visible in this file.
+  unsigned int opcode = 3;
+
+  // Creating a runlist to contain the separate runs
+  xrt::runlist runlist = xrt::runlist(context);
+
+  // Creating the first run: design 1's instruction stream
+  xrt::run run0 = xrt::run(kernel0);
+  run0.set_arg(0, opcode);
+  run0.set_arg(1, bo_instr_0);
+  run0.set_arg(2, instr_0_v.size());
+  run0.set_arg(3, bo_inA_0);
+  run0.set_arg(4, bo_out_0);
+  run0.set_arg(5, 0);
+  run0.set_arg(6, 0);
+  run0.set_arg(7, 0);
+
+  // Configuration run: submits the -c stream with no data buffers attached.
+  xrt::run run1_cfg = xrt::run(kernel0);
+  run1_cfg.set_arg(0, opcode);
+  run1_cfg.set_arg(1, bo_cfg_1);
+  run1_cfg.set_arg(2, cfg_1_v.size());
+  run1_cfg.set_arg(3, 0);
+  run1_cfg.set_arg(4, 0);
+  run1_cfg.set_arg(5, 0);
+  run1_cfg.set_arg(6, 0);
+  run1_cfg.set_arg(7, 0);
+
+  // Creating the second run: design 2's instruction stream
+  xrt::run run1 = xrt::run(kernel0);
+  run1.set_arg(0, opcode);
+  run1.set_arg(1, bo_instr_1);
+  run1.set_arg(2, instr_1_v.size());
+  run1.set_arg(3, bo_inA_1);
+  run1.set_arg(4, bo_out_1);
+  run1.set_arg(5, 0);
+  run1.set_arg(6, 0);
+  run1.set_arg(7, 0);
+
+  // Executing and waiting on the runlist. The configuration run is added
+  // between the two data runs.
+  runlist.add(run0);
+  runlist.add(run1_cfg);
+  runlist.add(run1);
+  runlist.execute();
+  runlist.wait();
+
+  // Synchronizing the output BOs
+  bo_out_0.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  bo_out_1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  const uint32_t *bufOut_0 = bo_out_0.map<uint32_t *>();
+  const uint32_t *bufOut_1 = bo_out_1.map<uint32_t *>();
+
+  int errors = 0;
+  errors += count_output_errors(bufOut_0, FIRST_INPUT_0, ADDEND_0);
+  errors += count_output_errors(bufOut_1, FIRST_INPUT_1, ADDEND_1);
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nfailed with " << errors << " errors \n\n";
+    return 1;
+  }
+}
diff --git a/test/txn2mlir/roundtrip_npu1_1col.mlir
b/test/txn2mlir/roundtrip_npu1_1col.mlir new file mode 100644 index 0000000000..0cb875a9d9 --- /dev/null +++ b/test/txn2mlir/roundtrip_npu1_1col.mlir @@ -0,0 +1,28 @@ +//===- roundtrip_npu1_1col.mlir --------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// Round-trip test: aie-translate emits the runtime sequence below in binary +// form, txn2mlir.py converts that binary back to MLIR, and FileCheck expects +// the device, the global and the three write ops to reappear. +// RUN: aie-translate -aie-npu-instgen -aie-npu-instgen-binary=true %s | %python txn2mlir.py | FileCheck %s + +// CHECK: aie.device(npu1_1col) +// CHECK: memref.global "private" constant @blockwrite_data : memref<2xi32> = dense<[4195328, 0]> +// CHECK: aiex.npu.maskwrite32 {address = 2301952 : ui32, mask = 2 : ui32, value = 2 : ui32} +// CHECK: aiex.npu.write32 {address = 2224128 : ui32, value = 2 : ui32} +// CHECK: aiex.npu.blockwrite(%0) {address = 2215936 : ui32} : memref<2xi32> +module { + aie.device(npu1_1col) { + memref.global "private" constant @blockwrite_data : memref<2xi32> = dense<[4195328, 0]> + aiex.runtime_sequence() { + aiex.npu.maskwrite32 {address = 2301952 : ui32, mask = 2 : ui32, value = 2 : ui32} + aiex.npu.write32 {address = 2224128 : ui32, value = 2 : ui32} + %0 = memref.get_global @blockwrite_data : memref<2xi32> + aiex.npu.blockwrite(%0) {address = 2215936 : ui32} : memref<2xi32> + } + } +} diff --git a/test/txn2mlir/roundtrip_npu1_4col.mlir b/test/txn2mlir/roundtrip_npu1_4col.mlir new file mode 100644 index 0000000000..15c0e0c093 --- /dev/null +++ b/test/txn2mlir/roundtrip_npu1_4col.mlir @@ -0,0 +1,27 @@ +//===- roundtrip_npu1_4col.mlir --------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// Round-trip test for the npu1_4col device: the four maskwrite32 ops below are +// emitted in binary form, converted back to MLIR by txn2mlir.py, and must +// reappear with the same address, mask and value. +// RUN: aie-translate -aie-npu-instgen -aie-npu-instgen-binary=true %s | %python txn2mlir.py | FileCheck %s + +// CHECK: aie.device(npu1_4col) +// CHECK: aiex.npu.maskwrite32 {address = 2301952 : ui32, mask = 1 : ui32, value = 1 : ui32} +// CHECK: aiex.npu.maskwrite32 {address = 35856384 : ui32, mask = 1 : ui32, value = 1 : ui32} +// CHECK: aiex.npu.maskwrite32 {address = 69410816 : ui32, mask = 1 : ui32, value = 1 : ui32} +// CHECK: aiex.npu.maskwrite32 {address = 102965248 : ui32, mask = 1 : ui32, value = 1 : ui32} +module { + aie.device(npu1_4col) { + aiex.runtime_sequence() { + aiex.npu.maskwrite32 {address = 2301952 : ui32, mask = 1 : ui32, value = 1 : ui32} + aiex.npu.maskwrite32 {address = 35856384 : ui32, mask = 1 : ui32, value = 1 : ui32} + aiex.npu.maskwrite32 {address = 69410816 : ui32, mask = 1 : ui32, value = 1 : ui32} + aiex.npu.maskwrite32 {address = 102965248 : ui32, mask = 1 : ui32, value = 1 : ui32} + } + } +}