From 7f0f6045f27a496dc8f22ec958255b411d3ea667 Mon Sep 17 00:00:00 2001 From: Unai Sainz de la Maza Date: Tue, 29 Oct 2024 13:36:08 +0100 Subject: [PATCH] continue with the pass, annotate some errors --- .../Affine/AffineDistributeToMPI.cpp | 100 +++++++++++++++--- 1 file changed, 86 insertions(+), 14 deletions(-) diff --git a/lib/Transform/Affine/AffineDistributeToMPI.cpp b/lib/Transform/Affine/AffineDistributeToMPI.cpp index 6bbb7d9..68abadc 100644 --- a/lib/Transform/Affine/AffineDistributeToMPI.cpp +++ b/lib/Transform/Affine/AffineDistributeToMPI.cpp @@ -20,11 +20,11 @@ struct AffineDistributeToMPI : impl::AffineDistributeToMPIBase { using AffineDistributeToMPIBase::AffineDistributeToMPIBase; - // TODO: change to work with funcOp instead of affineForOp? + // NOTE: change to work with funcOp instead of affineForOp? void runOnOperation() { // print number of ranks llvm::errs() << "n_ranks=" << n_ranks << "\n"; - + // capture affineForOp and walk the IR getOperation()->walk([&](AffineForOp op) { OpBuilder builder(op.getContext()); @@ -40,7 +40,7 @@ struct AffineDistributeToMPI auto rankOp = builder.create(op.getLoc(), retvalType, i32Type); - // create constants + // create constants for 0 and 1 auto c0 = builder.create(op.getLoc(), i32Type, builder.getI32IntegerAttr(0)); auto c1 = builder.create(op.getLoc(), i32Type, @@ -59,7 +59,7 @@ struct AffineDistributeToMPI // process rank 1 builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - processRankOne(builder, op); + processRankOne(builder, op, c0, c1); // remove original loop op.erase(); @@ -68,15 +68,13 @@ struct AffineDistributeToMPI void processRankZero(OpBuilder &builder, affine::AffineForOp forOp, Value dest, Value tag) { - // send first half of data to rank 1 auto loc = forOp.getLoc(); auto retvalType = builder.getType(); - auto i32Type = builder.getI32Type(); // TODO: for (auto arg : funcOp.getArguments()) { mpi_send } - // send first half of data to rank 1 // get all memref operands from the loop body + // TODO: only send what is used by the other node? SmallVector memrefOperands; forOp.walk([&](Operation *op) { if (auto loadOp = dyn_cast(op)) { @@ -95,25 +93,99 @@ struct AffineDistributeToMPI } // create affine loop for the second half - auto upperBound = forOp.getUpperBound(); - auto lowerBound = getHalfPoint(builder, forOp); + // new bound for the new loop + auto upperBoundMap = forOp.getUpperBoundMap(); + auto upperBoundOperands = forOp.getUpperBoundOperands(); + auto lowerBoundMap = getHalfPoint(builder, forOp); + auto lowerBoundOperands = forOp.getLowerBoundOperands(); // insert new loop - + auto newLoop = builder.create( + loc, lowerBoundOperands, lowerBoundMap, upperBoundOperands, + upperBoundMap); + + // clone the original loop body into the new loop + IRMapping mapping; + mapping.map(forOp.getInductionVar(), newLoop.getInductionVar()); + + // get the original loop body + Block &originalBody = forOp.getRegion().front(); + + // clone operations from original body to new loop body, excluding the + // terminator + builder.setInsertionPointToStart(newLoop.getBody()); + for (auto &op : originalBody.without_terminator()) { + builder.clone(op, mapping); + } + // receive processed first half // only receive the result memref (assumed to be the last operand) + builder.setInsertionPointAfter(newLoop); if (!memrefOperands.empty()) { auto resultMemref = memrefOperands.back(); builder.create(loc, retvalType, resultMemref, dest, tag); } } - void processRankOne(OpBuilder &builder, affine::AffineForOp forOp) { - // allocate local buffers - // receive data from rank 0 + void processRankOne(OpBuilder &builder, affine::AffineForOp forOp, Value dest, + Value tag) { + auto loc = forOp.getLoc(); + auto retvalType = builder.getType(); + + // collect all memref operands from the loop body + /*SmallVector memrefOperands;*/ + /*SmallVector allocatedMemrefs;*/ + /*forOp.walk([&](Operation *op) {*/ + /* if (auto loadOp = dyn_cast(op)) {*/ + /* if (!llvm::is_contained(memrefOperands, loadOp.getMemref()))*/ + /* memrefOperands.push_back(loadOp.getMemref());*/ + /* }*/ + /* if (auto storeOp = dyn_cast(op)) {*/ + /* if (!llvm::is_contained(memrefOperands, storeOp.getMemref()))*/ + /* memrefOperands.push_back(storeOp.getMemref());*/ + /* }*/ + /*});*/ + + // allocate local buffers with same types as original memrefs + // FIXME:crashes here! + /*for (auto memref : memrefOperands) {*/ + /* auto memrefType = mlir::cast(memref.getType());*/ + /* auto allocated = builder.create(loc, memrefType);*/ + /* allocatedMemrefs.push_back(allocated);*/ + /*}*/ + /**/ + /*// receive data from rank 0*/ + /*for (auto localMemref : allocatedMemrefs) {*/ + /* builder.create(loc, retvalType, localMemref, dest, tag);*/ + /*}*/ + + // get bounds for the first half + auto upperBoundMap = getHalfPoint(builder, forOp); + auto upperBoundOperands = forOp.getUpperBoundOperands(); + auto lowerBoundMap = forOp.getLowerBoundMap(); + auto lowerBoundOperands = forOp.getLowerBoundOperands(); + // create affine loop for the first half + // FIXME:crashes here! + auto newLoop = builder.create( + loc, lowerBoundOperands, lowerBoundMap, upperBoundOperands, + upperBoundMap); + + // clone the original loop body into the new loop + IRMapping mapping; + mapping.map(forOp.getInductionVar(), newLoop.getInductionVar()); + // send result back to rank 0 - // cleanup local buffers (memrefs dealloc) + /*builder.setInsertionPointAfter(newLoop);*/ + /*if (!allocatedMemrefs.empty()) {*/ + /* auto resultMemref = allocatedMemrefs.back();*/ + /* builder.create(loc, retvalType, resultMemref, dest, tag);*/ + /*}*/ + + // cleanup: deallocate all local buffers + /*for (auto memref : allocatedMemrefs) {*/ + /* builder.create(loc, memref);*/ + /*}*/ } // helper function to get the midpoint of the loop range