From f1120d083f7066d38b35dfd20ae27e24e27beb86 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 8 Oct 2024 15:46:25 +0100 Subject: [PATCH] [Flang][OpenMP] Add host_eval clause lowering support This patch updates Flang lowering to use the `host_eval` clause in `omp.target` operations to pass host information into the applicable clauses inside of the target region, instead of the previous approach where these clauses were attached to the `omp.target` operation itself. --- flang/include/flang/Lower/OpenMP/Utils.h | 9 - flang/lib/Lower/OpenMP/OpenMP.cpp | 221 +++++++++--------- flang/lib/Lower/OpenMP/Utils.cpp | 78 ------- .../OpenMP/DoConcurrentConversion.cpp | 210 ++++++----------- .../Optimizer/OpenMP/MapInfoFinalization.cpp | 55 +++-- .../OpenMP/FIR/mismatched-bound-types.f90 | 14 +- .../test/Lower/OpenMP/eval-outside-target.f90 | 85 ++----- flang/test/Lower/OpenMP/target-spmd.f90 | 28 +-- .../Transforms/DoConcurrent/basic_device.f90 | 22 +- .../multiple_iteration_ranges.f90 | 77 ++++-- 10 files changed, 332 insertions(+), 467 deletions(-) diff --git a/flang/include/flang/Lower/OpenMP/Utils.h b/flang/include/flang/Lower/OpenMP/Utils.h index 7a622e1cb74ee2d..f342481781fdd3c 100644 --- a/flang/include/flang/Lower/OpenMP/Utils.h +++ b/flang/include/flang/Lower/OpenMP/Utils.h @@ -174,15 +174,6 @@ void genObjectList(const ObjectList &objects, void lastprivateModifierNotSupported(const omp::clause::Lastprivate &lastp, mlir::Location loc); -// TODO: consider moving this to the `omp.loop_nest` op. 
Would be something like -// this: -// -// ``` -// mlir::Value LoopNestOp::calculateTripCount(mlir::OpBuilder &builder, -// mlir::OpBuilder::InsertPoint ip) -// ``` -mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc, - const mlir::omp::LoopRelatedClauseOps &ops); } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 8938a104998b37b..aed65543789ca81 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -46,6 +46,25 @@ using namespace Fortran::lower::omp; // Code generation helper functions //===----------------------------------------------------------------------===// +/// Add to the given target operation a host_eval argument, which must be +/// defined outside. +/// +/// \return the entry block argument to represent \c hostVar inside of the +/// target region. +static mlir::Value addHostEvalVar(mlir::omp::TargetOp targetOp, + mlir::Value hostVar) { + assert(!targetOp.getRegion().isAncestor(hostVar.getParentRegion()) && + "variable must be defined outside of the target region"); + + auto argIface = llvm::cast(*targetOp); + unsigned insertIndex = + argIface.getHostEvalBlockArgsStart() + argIface.numHostEvalBlockArgs(); + + targetOp.getHostEvalVarsMutable().append(hostVar); + return targetOp.getRegion().insertArgument(insertIndex, hostVar.getType(), + hostVar.getLoc()); +} + namespace { /// Structure holding the information needed to create and bind entry block /// arguments associated to a single clause. @@ -64,6 +83,7 @@ struct EntryBlockArgsEntry { /// Structure holding the information needed to create and bind entry block /// arguments associated to all clauses that can define them. 
struct EntryBlockArgs { + EntryBlockArgsEntry hostEval; EntryBlockArgsEntry inReduction; EntryBlockArgsEntry map; EntryBlockArgsEntry priv; @@ -73,8 +93,8 @@ struct EntryBlockArgs { EntryBlockArgsEntry useDevicePtr; bool isValid() const { - return inReduction.isValid() && map.isValid() && priv.isValid() && - reduction.isValid() && taskReduction.isValid() && + return hostEval.isValid() && inReduction.isValid() && map.isValid() && + priv.isValid() && reduction.isValid() && taskReduction.isValid() && useDeviceAddr.isValid() && useDevicePtr.isValid(); } }; @@ -162,6 +182,18 @@ static bool evalHasSiblings(const lower::pft::Evaluation &eval) { }}); } +/// Check whether the given omp.target operation exists and we're compiling for +/// the host device. +static bool isHostTarget(mlir::omp::TargetOp targetOp) { + if (!targetOp) + return false; + + auto offloadModOp = llvm::cast( + *targetOp->getParentOfType()); + + return !offloadModOp.getIsTargetDevice(); +} + /// Check whether a given evaluation points to an OpenMP loop construct that /// represents a target SPMD kernel. For this to be true, it must be a `target /// teams distribute parallel do [simd]` or equivalent construct. @@ -169,7 +201,7 @@ static bool evalHasSiblings(const lower::pft::Evaluation &eval) { /// Currently, this is limited to cases where all relevant OpenMP constructs are /// either combined or directly nested within the same function. Also, the /// composite `distribute parallel do` is not identified if split into two -/// explicit nested loops (a `distribute` loop and a `parallel do` loop). +/// explicit nested loops (i.e. a `distribute` loop and a `parallel do` loop). static bool isTargetSPMDLoop(const lower::pft::Evaluation &eval) { using namespace llvm::omp; @@ -376,6 +408,8 @@ static void bindEntryBlockArgs(lower::AbstractConverter &converter, }; // Process in clause name alphabetical order to match block arguments order. 
+ bindPrivateLike(args.hostEval.syms, args.hostEval.vars, + op.getHostEvalBlockArgs()); bindPrivateLike(args.inReduction.syms, args.inReduction.vars, op.getInReductionBlockArgs()); bindMapLike(args.map.syms, op.getMapBlockArgs()); @@ -440,20 +474,23 @@ static void genNestedEvaluations(lower::AbstractConverter &converter, converter.genEval(e); } -static bool -mustEvalTeamsThreadsOutsideTarget(const lower::pft::Evaluation &eval, - mlir::omp::TargetOp targetOp) { - if (!targetOp) - return false; - - auto offloadModOp = llvm::cast( - *targetOp->getParentOfType()); - if (offloadModOp.getIsTargetDevice()) +static bool mustEvalTeamsOutsideTarget(const lower::pft::Evaluation &eval, + mlir::omp::TargetOp targetOp) { + if (!isHostTarget(targetOp)) return false; llvm::omp::Directive dir = extractOmpDirective(eval.get()); - return llvm::omp::allTargetSet.test(dir) || !evalHasSiblings(eval); + return llvm::omp::allTeamsSet.test(dir) && + (llvm::omp::allTargetSet.test(dir) || !evalHasSiblings(eval)); +} + +static bool mustEvalTargetSPMDOutsideTarget(const lower::pft::Evaluation &eval, + mlir::omp::TargetOp targetOp) { + if (!isHostTarget(targetOp)) + return false; + + return isTargetSPMDLoop(eval); } //===----------------------------------------------------------------------===// @@ -482,6 +519,8 @@ class HostClausesInsertionGuard { } } + mlir::omp::TargetOp getTargetOp() const { return targetOp; } + private: mlir::OpBuilder &builder; mlir::OpBuilder::InsertPoint ip; @@ -980,11 +1019,11 @@ static mlir::Block *genEntryBlock(lower::AbstractConverter &converter, llvm::SmallVector types; llvm::SmallVector locs; - unsigned numVars = args.inReduction.vars.size() + args.map.vars.size() + - args.priv.vars.size() + args.reduction.vars.size() + - args.taskReduction.vars.size() + - args.useDeviceAddr.vars.size() + - args.useDevicePtr.vars.size(); + unsigned numVars = + args.hostEval.vars.size() + args.inReduction.vars.size() + + args.map.vars.size() + args.priv.vars.size() + + 
args.reduction.vars.size() + args.taskReduction.vars.size() + + args.useDeviceAddr.vars.size() + args.useDevicePtr.vars.size(); types.reserve(numVars); locs.reserve(numVars); @@ -997,6 +1036,7 @@ static mlir::Block *genEntryBlock(lower::AbstractConverter &converter, // Populate block arguments in clause name alphabetical order to match // expected order by the BlockArgOpenMPOpInterface. + extractTypeLoc(args.hostEval.vars); extractTypeLoc(args.inReduction.vars); extractTypeLoc(args.map.vars); extractTypeLoc(args.priv.vars); @@ -1519,10 +1559,29 @@ static void genLoopNestClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const List &clauses, - mlir::Location loc, mlir::omp::LoopNestOperands &clauseOps, + mlir::Location loc, bool evalOutsideTarget, + mlir::omp::LoopNestOperands &clauseOps, llvm::SmallVectorImpl &iv) { ClauseProcessor cp(converter, semaCtx, clauses); - cp.processCollapse(loc, eval, clauseOps, iv); + + // Evaluate loop bounds on the host device, if the operation is defining part + // of a target SPMD kernel. 
+ if (evalOutsideTarget) { + HostClausesInsertionGuard guard(converter.getFirOpBuilder()); + cp.processCollapse(loc, eval, clauseOps, iv); + + for (unsigned i = 0; i < clauseOps.loopLowerBounds.size(); ++i) { + clauseOps.loopLowerBounds[i] = + addHostEvalVar(guard.getTargetOp(), clauseOps.loopLowerBounds[i]); + clauseOps.loopUpperBounds[i] = + addHostEvalVar(guard.getTargetOp(), clauseOps.loopUpperBounds[i]); + clauseOps.loopSteps[i] = + addHostEvalVar(guard.getTargetOp(), clauseOps.loopSteps[i]); + } + } else { + cp.processCollapse(loc, eval, clauseOps, iv); + } + clauseOps.loopInclusive = converter.getFirOpBuilder().getUnitAttr(); } @@ -1549,20 +1608,20 @@ static void genParallelClauses( lower::StatementContext &stmtCtx, const List &clauses, mlir::Location loc, bool evalOutsideTarget, mlir::omp::ParallelOperands &clauseOps, - mlir::omp::NumThreadsClauseOps &numThreadsClauseOps, llvm::SmallVectorImpl &reductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processIf(llvm::omp::Directive::OMPD_parallel, clauseOps); - // Don't store num_threads clause operators into clauseOps because then they - // would always be added to the omp.parallel operation during its creation. - // We might need to attach them to the parent omp.target. + // Evaluate NUM_THREADS on the host device, if the operation is defining part + // of a target SPMD kernel. 
if (evalOutsideTarget) { HostClausesInsertionGuard guard(converter.getFirOpBuilder()); - cp.processNumThreads(stmtCtx, numThreadsClauseOps); + if (cp.processNumThreads(stmtCtx, clauseOps)) + clauseOps.numThreads = + addHostEvalVar(guard.getTargetOp(), clauseOps.numThreads); } else { - cp.processNumThreads(stmtCtx, numThreadsClauseOps); + cp.processNumThreads(stmtCtx, clauseOps); } cp.processProcBind(clauseOps); @@ -1727,8 +1786,6 @@ static void genTeamsClauses( lower::StatementContext &stmtCtx, const List &clauses, mlir::Location loc, bool evalOutsideTarget, mlir::omp::TeamsOperands &clauseOps, - mlir::omp::NumTeamsClauseOps &numTeamsClauseOps, - mlir::omp::ThreadLimitClauseOps &threadLimitClauseOps, llvm::SmallVectorImpl &reductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); @@ -1736,16 +1793,18 @@ static void genTeamsClauses( // Evaluate NUM_TEAMS and THREAD_LIMIT on the host device, if currently inside // of an omp.target operation. - // Don't store num_teams and thread_limit clause operators into clauseOps - // because then they would always be added to the omp.teams operation during - // its creation. We might need to attach them to the parent omp.target. 
if (evalOutsideTarget) { HostClausesInsertionGuard guard(converter.getFirOpBuilder()); - cp.processNumTeams(stmtCtx, numTeamsClauseOps); - cp.processThreadLimit(stmtCtx, threadLimitClauseOps); + if (cp.processNumTeams(stmtCtx, clauseOps)) + clauseOps.numTeamsUpper = + addHostEvalVar(guard.getTargetOp(), clauseOps.numTeamsUpper); + + if (cp.processThreadLimit(stmtCtx, clauseOps)) + clauseOps.threadLimit = + addHostEvalVar(guard.getTargetOp(), clauseOps.threadLimit); } else { - cp.processNumTeams(stmtCtx, numTeamsClauseOps); - cp.processThreadLimit(stmtCtx, threadLimitClauseOps); + cp.processNumTeams(stmtCtx, clauseOps); + cp.processThreadLimit(stmtCtx, clauseOps); } cp.processReduction(loc, clauseOps, reductionSyms); } @@ -1832,7 +1891,6 @@ static mlir::omp::LoopNestOp genLoopNestOp( std::pair> wrapperArgs, llvm::omp::Directive directive, DataSharingProcessor &dsp) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); auto ivCallback = [&](mlir::Operation *op) { genLoopVars(op, converter, loc, iv, wrapperArgs); @@ -1850,26 +1908,6 @@ static mlir::omp::LoopNestOp genLoopNestOp( .setGenRegionEntryCb(ivCallback), queue, item, clauseOps); - // Create trip_count if inside of omp.target and this is host compilation. - auto offloadMod = llvm::dyn_cast( - firOpBuilder.getModule().getOperation()); - auto targetOp = loopNestOp->getParentOfType(); - - if (offloadMod && !offloadMod.getIsTargetDevice() && isTargetSPMDLoop(eval)) { - assert(targetOp && "must have omp.target parent"); - - // Lower loop bounds and step, and process collapsing again, putting lowered - // values outside of omp.target this time. This enables calculating and - // accessing the trip count in the host, which is needed when lowering to - // LLVM IR via the OMPIRBuilder. 
- HostClausesInsertionGuard guard(firOpBuilder); - mlir::omp::LoopRelatedClauseOps loopRelatedOps; - llvm::SmallVector iv; - ClauseProcessor cp(converter, semaCtx, item->clauses); - cp.processCollapse(loc, eval, loopRelatedOps, iv); - targetOp.getTripCountMutable().assign( - calculateTripCount(firOpBuilder, loc, loopRelatedOps)); - } return loopNestOp; } @@ -1928,7 +1966,6 @@ static mlir::omp::ParallelOp genParallelOp( semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item, mlir::omp::ParallelOperands &clauseOps, - mlir::omp::NumThreadsClauseOps &numThreadsClauseOps, const EntryBlockArgs &args, DataSharingProcessor *dsp, bool isComposite = false, mlir::omp::TargetOp parentTarget = nullptr) { auto genRegionEntryCB = [&](mlir::Operation *op) { @@ -1952,13 +1989,6 @@ static mlir::omp::ParallelOp genParallelOp( auto parallelOp = genOpWithBody(genInfo, queue, item, clauseOps); parallelOp.setComposite(isComposite); - if (numThreadsClauseOps.numThreads) { - if (parentTarget) - parentTarget.getNumThreadsMutable().assign( - numThreadsClauseOps.numThreads); - else - parallelOp.getNumThreadsMutable().assign(numThreadsClauseOps.numThreads); - } return parallelOp; } @@ -2238,6 +2268,7 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, extractMappedBaseValues(clauseOps.mapVars, mapBaseValues); EntryBlockArgs args; + // TODO: Fill hostEval in advance rather than adding to it later on. // TODO: Add in_reduction syms and vars. 
args.map.syms = mapSyms; args.map.vars = mapBaseValues; @@ -2371,15 +2402,12 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::omp::TargetOp targetOp = findParentTargetOp(converter.getFirOpBuilder()); - bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp); + bool evalOutsideTarget = mustEvalTeamsOutsideTarget(eval, targetOp); mlir::omp::TeamsOperands clauseOps; - mlir::omp::NumTeamsClauseOps numTeamsClauseOps; - mlir::omp::ThreadLimitClauseOps threadLimitClauseOps; llvm::SmallVector reductionSyms; genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, - evalOutsideTarget, clauseOps, numTeamsClauseOps, - threadLimitClauseOps, reductionSyms); + evalOutsideTarget, clauseOps, reductionSyms); EntryBlockArgs args; // TODO: Add private syms and vars. @@ -2401,22 +2429,6 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, .setGenRegionEntryCb(genRegionEntryCB), queue, item, clauseOps); - if (numTeamsClauseOps.numTeamsUpper) { - if (evalOutsideTarget) - targetOp.getNumTeamsUpperMutable().assign( - numTeamsClauseOps.numTeamsUpper); - else - teamsOp.getNumTeamsUpperMutable().assign(numTeamsClauseOps.numTeamsUpper); - } - - if (threadLimitClauseOps.threadLimit) { - if (evalOutsideTarget) - targetOp.getTeamsThreadLimitMutable().assign( - threadLimitClauseOps.threadLimit); - else - teamsOp.getThreadLimitMutable().assign(threadLimitClauseOps.threadLimit); - } - return teamsOp; } @@ -2447,7 +2459,7 @@ static void genStandaloneDistribute(lower::AbstractConverter &converter, mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, - loopNestClauseOps, iv); + /*evalOutsideTarget=*/false, loopNestClauseOps, iv); EntryBlockArgs distributeArgs; distributeArgs.priv.syms = dsp.getDelayedPrivSymbols(); @@ -2482,7 +2494,7 @@ static void genStandaloneDo(lower::AbstractConverter &converter, mlir::omp::LoopNestOperands 
loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, - loopNestClauseOps, iv); + /*evalOutsideTarget=*/false, loopNestClauseOps, iv); EntryBlockArgs wsloopArgs; // TODO: Add private syms and vars. @@ -2505,15 +2517,10 @@ static void genStandaloneParallel(lower::AbstractConverter &converter, ConstructQueue::const_iterator item) { lower::StatementContext stmtCtx; - mlir::omp::TargetOp targetOp = - findParentTargetOp(converter.getFirOpBuilder()); - bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp); - mlir::omp::ParallelOperands parallelClauseOps; - mlir::omp::NumThreadsClauseOps numThreadsClauseOps; llvm::SmallVector parallelReductionSyms; genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc, - evalOutsideTarget, parallelClauseOps, numThreadsClauseOps, + /*evalOutsideTarget=*/false, parallelClauseOps, parallelReductionSyms); std::optional dsp; @@ -2532,9 +2539,9 @@ static void genStandaloneParallel(lower::AbstractConverter &converter, parallelArgs.reduction.syms = parallelReductionSyms; parallelArgs.reduction.vars = parallelClauseOps.reductionVars; genParallelOp(converter, symTable, semaCtx, eval, loc, queue, item, - parallelClauseOps, numThreadsClauseOps, parallelArgs, + parallelClauseOps, parallelArgs, enableDelayedPrivatization ? &dsp.value() : nullptr, - /*isComposite=*/false, evalOutsideTarget ? targetOp : nullptr); + /*isComposite=*/false); } static void genStandaloneSimd(lower::AbstractConverter &converter, @@ -2558,7 +2565,7 @@ static void genStandaloneSimd(lower::AbstractConverter &converter, mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, - loopNestClauseOps, iv); + /*evalOutsideTarget=*/false, loopNestClauseOps, iv); EntryBlockArgs simdArgs; // TODO: Add private syms and vars. 
@@ -2600,14 +2607,13 @@ static void genCompositeDistributeParallelDo( mlir::omp::TargetOp targetOp = findParentTargetOp(converter.getFirOpBuilder()); - bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp); + bool evalOutsideTarget = mustEvalTargetSPMDOutsideTarget(eval, targetOp); // Create parent omp.parallel first. mlir::omp::ParallelOperands parallelClauseOps; - mlir::omp::NumThreadsClauseOps numThreadsClauseOps; llvm::SmallVector parallelReductionSyms; genParallelClauses(converter, semaCtx, stmtCtx, parallelItem->clauses, loc, - evalOutsideTarget, parallelClauseOps, numThreadsClauseOps, + evalOutsideTarget, parallelClauseOps, parallelReductionSyms); DataSharingProcessor dsp(converter, semaCtx, doItem->clauses, eval, @@ -2622,7 +2628,7 @@ static void genCompositeDistributeParallelDo( parallelArgs.reduction.syms = parallelReductionSyms; parallelArgs.reduction.vars = parallelClauseOps.reductionVars; genParallelOp(converter, symTable, semaCtx, eval, loc, queue, parallelItem, - parallelClauseOps, numThreadsClauseOps, parallelArgs, &dsp, + parallelClauseOps, parallelArgs, &dsp, /*isComposite=*/true, evalOutsideTarget ? targetOp : nullptr); // Clause processing. @@ -2638,7 +2644,7 @@ static void genCompositeDistributeParallelDo( mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, doItem->clauses, loc, - loopNestClauseOps, iv); + evalOutsideTarget, loopNestClauseOps, iv); // Operation creation. EntryBlockArgs distributeArgs; @@ -2676,14 +2682,13 @@ static void genCompositeDistributeParallelDoSimd( mlir::omp::TargetOp targetOp = findParentTargetOp(converter.getFirOpBuilder()); - bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp); + bool evalOutsideTarget = mustEvalTargetSPMDOutsideTarget(eval, targetOp); // Create parent omp.parallel first. 
mlir::omp::ParallelOperands parallelClauseOps; - mlir::omp::NumThreadsClauseOps numThreadsClauseOps; llvm::SmallVector parallelReductionSyms; genParallelClauses(converter, semaCtx, stmtCtx, parallelItem->clauses, loc, - evalOutsideTarget, parallelClauseOps, numThreadsClauseOps, + evalOutsideTarget, parallelClauseOps, parallelReductionSyms); DataSharingProcessor dsp(converter, semaCtx, simdItem->clauses, eval, @@ -2698,7 +2703,7 @@ static void genCompositeDistributeParallelDoSimd( parallelArgs.reduction.syms = parallelReductionSyms; parallelArgs.reduction.vars = parallelClauseOps.reductionVars; genParallelOp(converter, symTable, semaCtx, eval, loc, queue, parallelItem, - parallelClauseOps, numThreadsClauseOps, parallelArgs, &dsp, + parallelClauseOps, parallelArgs, &dsp, /*isComposite=*/true, evalOutsideTarget ? targetOp : nullptr); // Clause processing. @@ -2725,7 +2730,7 @@ static void genCompositeDistributeParallelDoSimd( mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, simdItem->clauses, loc, - loopNestClauseOps, iv); + evalOutsideTarget, loopNestClauseOps, iv); // Operation creation. EntryBlockArgs distributeArgs; @@ -2791,7 +2796,7 @@ static void genCompositeDistributeSimd(lower::AbstractConverter &converter, mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, simdItem->clauses, loc, - loopNestClauseOps, iv); + /*evalOutsideTarget=*/false, loopNestClauseOps, iv); // Operation creation. EntryBlockArgs distributeArgs; @@ -2855,7 +2860,7 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter, mlir::omp::LoopNestOperands loopNestClauseOps; llvm::SmallVector iv; genLoopNestClauses(converter, semaCtx, eval, simdItem->clauses, loc, - loopNestClauseOps, iv); + /*evalOutsideTarget=*/false, loopNestClauseOps, iv); // Operation creation. 
EntryBlockArgs wsloopArgs; diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index c705275e17ef966..c57cccfea738cb1 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -538,84 +538,6 @@ void lastprivateModifierNotSupported(const omp::clause::Lastprivate &lastp, } } -mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc, - const mlir::omp::LoopRelatedClauseOps &ops) { - using namespace mlir::arith; - assert(ops.loopLowerBounds.size() == ops.loopUpperBounds.size() && - ops.loopLowerBounds.size() == ops.loopSteps.size() && - !ops.loopLowerBounds.empty() && "Invalid bounds or step"); - - // Get the bit width of an integer-like type. - auto widthOf = [](mlir::Type ty) -> unsigned { - if (mlir::isa(ty)) { - return mlir::IndexType::kInternalStorageBitWidth; - } - if (auto tyInt = mlir::dyn_cast(ty)) { - return tyInt.getWidth(); - } - llvm_unreachable("Unexpected type"); - }; - - // For a type that is either IntegerType or IndexType, return the - // equivalent IntegerType. In the former case this is a no-op. - auto asIntTy = [&](mlir::Type ty) -> mlir::IntegerType { - if (ty.isIndex()) { - return mlir::IntegerType::get(ty.getContext(), widthOf(ty)); - } - assert(ty.isIntOrIndex() && "Unexpected type"); - return mlir::cast(ty); - }; - - // For two given values, establish a common signless IntegerType - // that can represent any value of type of x and of type of y, - // and return the pair of x, y converted to the new type. - auto unifyToSignless = - [&](fir::FirOpBuilder &b, mlir::Value x, - mlir::Value y) -> std::pair { - auto tyX = asIntTy(x.getType()), tyY = asIntTy(y.getType()); - unsigned width = std::max(widthOf(tyX), widthOf(tyY)); - auto wideTy = mlir::IntegerType::get(b.getContext(), width, - mlir::IntegerType::Signless); - return std::make_pair(b.createConvert(loc, wideTy, x), - b.createConvert(loc, wideTy, y)); - }; - - // Start with signless i32 by default. 
- auto tripCount = builder.createIntegerConstant(loc, builder.getI32Type(), 1); - - for (auto [origLb, origUb, origStep] : - llvm::zip(ops.loopLowerBounds, ops.loopUpperBounds, ops.loopSteps)) { - auto tmpS0 = builder.createIntegerConstant(loc, origStep.getType(), 0); - auto [step, step0] = unifyToSignless(builder, origStep, tmpS0); - auto reverseCond = - builder.create(loc, CmpIPredicate::slt, step, step0); - auto negStep = builder.create(loc, step0, step); - mlir::Value absStep = - builder.create(loc, reverseCond, negStep, step); - - auto [lb, ub] = unifyToSignless(builder, origLb, origUb); - auto start = builder.create(loc, reverseCond, ub, lb); - auto end = builder.create(loc, reverseCond, lb, ub); - - mlir::Value range = builder.create(loc, end, start); - auto rangeCond = - builder.create(loc, CmpIPredicate::slt, end, start); - std::tie(range, absStep) = unifyToSignless(builder, range, absStep); - // numSteps = (range /u absStep) + 1 - auto numSteps = builder.create( - loc, builder.create(loc, range, absStep), - builder.createIntegerConstant(loc, range.getType(), 1)); - - auto trip0 = builder.createIntegerConstant(loc, numSteps.getType(), 0); - auto loopTripCount = - builder.create(loc, rangeCond, trip0, numSteps); - auto [totalTC, thisTC] = unifyToSignless(builder, tripCount, loopTripCount); - tripCount = builder.create(loc, totalTC, thisTC); - } - - return tripCount; -} - } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index e2a109126810ddc..5d802b50a8c76c3 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -45,14 +45,12 @@ namespace internal { // TODO The following 2 functions are copied from "flang/Lower/OpenMP/Utils.h". 
// This duplication is temporary until we find a solution for a shared location // for these utils that does not introduce circular CMake deps. -mlir::omp::MapInfoOp -createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc, - mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name, - llvm::ArrayRef bounds, - llvm::ArrayRef members, - mlir::ArrayAttr membersIndex, uint64_t mapType, - mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, - bool partialMap = false) { +mlir::omp::MapInfoOp createMapInfoOp( + mlir::OpBuilder &builder, mlir::Location loc, mlir::Value baseAddr, + mlir::Value varPtrPtr, std::string name, llvm::ArrayRef bounds, + llvm::ArrayRef members, mlir::ArrayAttr membersIndex, + uint64_t mapType, mlir::omp::VariableCaptureKind mapCaptureType, + mlir::Type retTy, bool partialMap = false) { if (auto boxTy = llvm::dyn_cast(baseAddr.getType())) { baseAddr = builder.create(loc, baseAddr); retTy = baseAddr.getType(); @@ -77,84 +75,6 @@ createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc, return op; } -mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc, - const mlir::omp::LoopRelatedClauseOps &ops) { - using namespace mlir::arith; - assert(ops.loopLowerBounds.size() == ops.loopUpperBounds.size() && - ops.loopLowerBounds.size() == ops.loopSteps.size() && - !ops.loopLowerBounds.empty() && "Invalid bounds or step"); - - // Get the bit width of an integer-like type. - auto widthOf = [](mlir::Type ty) -> unsigned { - if (mlir::isa(ty)) { - return mlir::IndexType::kInternalStorageBitWidth; - } - if (auto tyInt = mlir::dyn_cast(ty)) { - return tyInt.getWidth(); - } - llvm_unreachable("Unexpected type"); - }; - - // For a type that is either IntegerType or IndexType, return the - // equivalent IntegerType. In the former case this is a no-op. 
- auto asIntTy = [&](mlir::Type ty) -> mlir::IntegerType { - if (ty.isIndex()) { - return mlir::IntegerType::get(ty.getContext(), widthOf(ty)); - } - assert(ty.isIntOrIndex() && "Unexpected type"); - return mlir::cast(ty); - }; - - // For two given values, establish a common signless IntegerType - // that can represent any value of type of x and of type of y, - // and return the pair of x, y converted to the new type. - auto unifyToSignless = - [&](fir::FirOpBuilder &b, mlir::Value x, - mlir::Value y) -> std::pair { - auto tyX = asIntTy(x.getType()), tyY = asIntTy(y.getType()); - unsigned width = std::max(widthOf(tyX), widthOf(tyY)); - auto wideTy = mlir::IntegerType::get(b.getContext(), width, - mlir::IntegerType::Signless); - return std::make_pair(b.createConvert(loc, wideTy, x), - b.createConvert(loc, wideTy, y)); - }; - - // Start with signless i32 by default. - auto tripCount = builder.createIntegerConstant(loc, builder.getI32Type(), 1); - - for (auto [origLb, origUb, origStep] : - llvm::zip(ops.loopLowerBounds, ops.loopUpperBounds, ops.loopSteps)) { - auto tmpS0 = builder.createIntegerConstant(loc, origStep.getType(), 0); - auto [step, step0] = unifyToSignless(builder, origStep, tmpS0); - auto reverseCond = - builder.create(loc, CmpIPredicate::slt, step, step0); - auto negStep = builder.create(loc, step0, step); - mlir::Value absStep = - builder.create(loc, reverseCond, negStep, step); - - auto [lb, ub] = unifyToSignless(builder, origLb, origUb); - auto start = builder.create(loc, reverseCond, ub, lb); - auto end = builder.create(loc, reverseCond, lb, ub); - - mlir::Value range = builder.create(loc, end, start); - auto rangeCond = - builder.create(loc, CmpIPredicate::slt, end, start); - std::tie(range, absStep) = unifyToSignless(builder, range, absStep); - // numSteps = (range /u absStep) + 1 - auto numSteps = builder.create( - loc, builder.create(loc, range, absStep), - builder.createIntegerConstant(loc, range.getType(), 1)); - - auto trip0 = 
builder.createIntegerConstant(loc, numSteps.getType(), 0); - auto loopTripCount = - builder.create(loc, rangeCond, trip0, numSteps); - auto [totalTC, thisTC] = unifyToSignless(builder, tripCount, loopTripCount); - tripCount = builder.create(loc, totalTC, thisTC); - } - - return tripCount; -} - /// Check if cloning the bounds introduced any dependency on the outer region. /// If so, then either clone them as well if they are MemoryEffectFree, or else /// copy them to a new temporary and add them to the map and block_argument @@ -664,7 +584,19 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { mlir::IRMapping mapper; if (mapToDevice) { + // TODO: Currently the loop bounds for the outer loop are duplicated. mlir::omp::TargetOperands targetClauseOps; + genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper, + loopNestClauseOps, &targetClauseOps); + + // Prevent mapping host-evaluated variables. + outermostLoopLiveIns.erase( + llvm::remove_if(outermostLoopLiveIns, + [&](mlir::Value liveIn) { + return llvm::is_contained( + targetClauseOps.hostEvalVars, liveIn); + }), + outermostLoopLiveIns.end()); // The outermost loop will contain all the live-in values in all nested // loops since live-in values are collected recursively for all nested @@ -673,16 +605,21 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { targetClauseOps.mapVars.push_back( genMapInfoOpForLiveIn(rewriter, liveIn)); - targetOp = genTargetOp(doLoop.getLoc(), rewriter, mapper, - outermostLoopLiveIns, targetClauseOps); + targetOp = + genTargetOp(doLoop.getLoc(), rewriter, mapper, outermostLoopLiveIns, + targetClauseOps, loopNestClauseOps); genTeamsOp(doLoop.getLoc(), rewriter); } - mlir::omp::ParallelOp parallelOp = genParallelOp( - doLoop.getLoc(), rewriter, loopNest, mapper, loopNestClauseOps); + mlir::omp::ParallelOp parallelOp = + genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper); // Only set as composite when part of `distribute parallel do`. 
parallelOp.setComposite(mapToDevice); + if (!mapToDevice) + genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper, + loopNestClauseOps); + for (mlir::Value local : locals) looputils::localizeLoopLocalValue(local, parallelOp.getRegion(), rewriter); @@ -694,23 +631,6 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps, /*isComposite=*/mapToDevice); - // Now that we created the nested `ws.loop` op, we set can the `target` op's - // trip count. - if (mapToDevice) { - rewriter.setInsertionPoint(targetOp); - auto parentModule = doLoop->getParentOfType(); - fir::FirOpBuilder firBuilder(rewriter, fir::getKindMapping(parentModule)); - - mlir::omp::LoopRelatedClauseOps loopClauseOps; - loopClauseOps.loopLowerBounds.push_back(lbOp->getResult(0)); - loopClauseOps.loopUpperBounds.push_back(ubOp->getResult(0)); - loopClauseOps.loopSteps.push_back(stepOp->getResult(0)); - - mlir::cast(targetOp).getTripCountMutable().assign( - Fortran::lower::omp::internal::calculateTripCount( - firBuilder, doLoop.getLoc(), loopClauseOps)); - } - rewriter.eraseOp(doLoop); // Mark `unordered` loops that are not perfectly nested to be skipped from @@ -804,27 +724,29 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { captureKind, rawAddr.getType()); } - mlir::omp::TargetOp genTargetOp(mlir::Location loc, - mlir::ConversionPatternRewriter &rewriter, - mlir::IRMapping &mapper, - llvm::ArrayRef liveIns, - mlir::omp::TargetOperands &clauseOps) const { + mlir::omp::TargetOp + genTargetOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, + mlir::IRMapping &mapper, llvm::ArrayRef mappedVars, + mlir::omp::TargetOperands &clauseOps, + mlir::omp::LoopNestOperands &loopNestClauseOps) const { auto targetOp = rewriter.create(loc, clauseOps); + auto argIface = llvm::cast(*targetOp); mlir::Region &region = targetOp.getRegion(); - llvm::SmallVector liveInTypes; - llvm::SmallVector 
liveInLocs; + llvm::SmallVector regionArgTypes; + llvm::SmallVector regionArgLocs; - for (mlir::Value liveIn : liveIns) { - liveInTypes.push_back(liveIn.getType()); - liveInLocs.push_back(liveIn.getLoc()); + for (auto var : + llvm::concat(clauseOps.hostEvalVars, mappedVars)) { + regionArgTypes.push_back(var.getType()); + regionArgLocs.push_back(var.getLoc()); } - rewriter.createBlock(&region, {}, liveInTypes, liveInLocs); + rewriter.createBlock(&region, {}, regionArgTypes, regionArgLocs); for (auto [arg, mapInfoOp] : - llvm::zip_equal(region.getArguments(), clauseOps.mapVars)) { + llvm::zip_equal(argIface.getMapBlockArgs(), clauseOps.mapVars)) { auto miOp = mlir::cast(mapInfoOp.getDefiningOp()); hlfir::DeclareOp liveInDeclare = genLiveInDeclare(rewriter, arg, miOp); mlir::Value miOperand = miOp.getVariableOperand(0); @@ -841,6 +763,19 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { mapper.map(origDeclareOp.getBase(), liveInDeclare.getBase()); } + for (auto [arg, hostEval] : llvm::zip_equal(argIface.getHostEvalBlockArgs(), + clauseOps.hostEvalVars)) + mapper.map(hostEval, arg); + + for (unsigned i = 0; i < loopNestClauseOps.loopLowerBounds.size(); ++i) { + loopNestClauseOps.loopLowerBounds[i] = + mapper.lookup(loopNestClauseOps.loopLowerBounds[i]); + loopNestClauseOps.loopUpperBounds[i] = + mapper.lookup(loopNestClauseOps.loopUpperBounds[i]); + loopNestClauseOps.loopSteps[i] = + mapper.lookup(loopNestClauseOps.loopSteps[i]); + } + fir::FirOpBuilder firBuilder( rewriter, fir::getKindMapping(targetOp->getParentOfType())); @@ -909,7 +844,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { void genLoopNestClauseOps( mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper, - mlir::omp::LoopNestOperands &loopNestClauseOps) const { + mlir::omp::LoopNestOperands &loopNestClauseOps, + mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
assert(loopNestClauseOps.loopLowerBounds.empty() && "Loop nest bounds were already emitted!"); @@ -930,18 +866,21 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { return result; }; - for (auto &[doLoop, _] : loopNest) { - mlir::Operation *lbOp = doLoop.getLowerBound().getDefiningOp(); - loopNestClauseOps.loopLowerBounds.push_back( - cloneBoundOrStepOpChain(lbOp)->getResult(0)); + auto hostEvalCapture = [&](mlir::Value var, + llvm::SmallVectorImpl &bounds) { + var = cloneBoundOrStepOpChain(var.getDefiningOp())->getResult(0); + bounds.push_back(var); - mlir::Operation *ubOp = doLoop.getUpperBound().getDefiningOp(); - loopNestClauseOps.loopUpperBounds.push_back( - cloneBoundOrStepOpChain(ubOp)->getResult(0)); + if (targetClauseOps) + targetClauseOps->hostEvalVars.push_back(var); + }; - mlir::Operation *stepOp = doLoop.getStep().getDefiningOp(); - loopNestClauseOps.loopSteps.push_back( - cloneBoundOrStepOpChain(stepOp)->getResult(0)); + for (auto &[doLoop, _] : loopNest) { + hostEvalCapture(doLoop.getLowerBound(), + loopNestClauseOps.loopLowerBounds); + hostEvalCapture(doLoop.getUpperBound(), + loopNestClauseOps.loopUpperBounds); + hostEvalCapture(doLoop.getStep(), loopNestClauseOps.loopSteps); } loopNestClauseOps.loopInclusive = rewriter.getUnitAttr(); @@ -985,18 +924,15 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { return result; } - mlir::omp::ParallelOp - genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, - looputils::LoopNestToIndVarMap &loopNest, - mlir::IRMapping &mapper, - mlir::omp::LoopNestOperands &loopNestClauseOps) const { + mlir::omp::ParallelOp genParallelOp(mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter, + looputils::LoopNestToIndVarMap &loopNest, + mlir::IRMapping &mapper) const { auto parallelOp = rewriter.create(loc); rewriter.createBlock(&parallelOp.getRegion()); rewriter.setInsertionPoint(rewriter.create(loc)); genLoopNestIndVarAllocs(rewriter, loopNest, mapper); -
genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps); - return parallelOp; } diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index 8faffa20bb942a4..9e980171a0c58bb 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -394,45 +394,44 @@ mlir::omp::MapInfoOp genDescriptorMemberMaps(mlir::omp::MapInfoOp op, if (!mapClauseOwner) return; - auto addOperands = [&](mlir::OperandRange &mapVarsArr, - mlir::MutableOperandRange &mutableOpRange, - auto directiveOp) { + auto addOperands = [&](mlir::MutableOperandRange &mapVarsArr, + mlir::Operation *directiveOp, + unsigned mapArgsStart = 0) { llvm::SmallVector newMapOps; - for (size_t i = 0; i < mapVarsArr.size(); ++i) { - if (mapVarsArr[i] == op) { - for (auto [j, mapMember] : llvm::enumerate(op.getMembers())) { - newMapOps.push_back(mapMember); - // for TargetOp's which have IsolatedFromAbove we must align the - // new additional map operand with an appropriate BlockArgument, - // as the printing and later processing currently requires a 1:1 - // mapping of BlockArgs to MapInfoOp's at the same placement in - // each array (BlockArgs and MapVars). 
- if (directiveOp) { - directiveOp.getRegion().insertArgument(i + j, mapMember.getType(), - directiveOp->getLoc()); - } - } + for (auto [i, mapVar] : llvm::enumerate(mapVarsArr)) { + if (mapVar.get() != op) { + newMapOps.push_back(mapVar.get()); + continue; } - newMapOps.push_back(mapVarsArr[i]); + + for (auto [j, mapMember] : llvm::enumerate(op.getMembers())) { + newMapOps.push_back(mapMember); + if (directiveOp) + directiveOp->getRegion(0).insertArgument( + mapArgsStart + i + j, mapMember.getType(), mapMember.getLoc()); + } + newMapOps.push_back(mapVar.get()); } - mutableOpRange.assign(newMapOps); + mapVarsArr.assign(newMapOps); }; + auto argIface = + llvm::dyn_cast(target); + if (auto mapClauseOwner = llvm::dyn_cast(target)) { - mlir::OperandRange mapVarsArr = mapClauseOwner.getMapVars(); - mlir::MutableOperandRange mapMutableOpRange = - mapClauseOwner.getMapVarsMutable(); - mlir::omp::TargetOp targetOp = - llvm::dyn_cast(target); - addOperands(mapVarsArr, mapMutableOpRange, targetOp); + mlir::MutableOperandRange mapVarsArr = mapClauseOwner.getMapVarsMutable(); + unsigned blockArgInsertIndex = + argIface ? argIface.getMapBlockArgsStart() : 0; + addOperands(mapVarsArr, llvm::dyn_cast(target), + blockArgInsertIndex); } if (auto targetDataOp = llvm::dyn_cast(target)) { - mlir::OperandRange useDevAddrArr = targetDataOp.getUseDeviceAddrVars(); - mlir::MutableOperandRange useDevAddrMutableOpRange = + mlir::MutableOperandRange useDevAddrArr = targetDataOp.getUseDeviceAddrVarsMutable(); - addOperands(useDevAddrArr, useDevAddrMutableOpRange, targetDataOp); + addOperands(useDevAddrArr, target, + argIface.getUseDeviceAddrBlockArgsStart()); } } diff --git a/flang/test/Lower/OpenMP/FIR/mismatched-bound-types.f90 b/flang/test/Lower/OpenMP/FIR/mismatched-bound-types.f90 index e64d5450846ec67..8b24b34cb55b6a2 100644 --- a/flang/test/Lower/OpenMP/FIR/mismatched-bound-types.f90 +++ b/flang/test/Lower/OpenMP/FIR/mismatched-bound-types.f90 @@ -1,7 +1,19 @@ ! 
RUN: %flang_fc1 -fopenmp -emit-fir %s -o - | FileCheck %s ! Check that this testcase is lowered to FIR successfully. -! CHECK: omp.target {{.*}}trip_count + +! CHECK: %[[ONE:.*]] = arith.constant 1 : i32 +! CHECK: %[[DECL_N:.*]] = fir.declare %{{.*}} {uniq_name = "_QMtestEn"} : (!fir.ref) -> !fir.ref +! CHECK: %[[HOST_N:.*]] = fir.load %[[DECL_N]] : !fir.ref +! CHECK: %[[HOST_LB:.*]] = fir.convert %[[ONE]] : (i32) -> i64 +! CHECK: %[[HOST_STEP:.*]] = fir.convert %[[ONE]] : (i32) -> i64 +! CHECK: omp.target +! CHECK-SAME: host_eval(%[[HOST_LB]] -> %[[LB:[[:alnum:]]+]], %[[HOST_N]] -> %[[UB:[[:alnum:]]+]], %[[HOST_STEP]] -> %[[STEP:[[:alnum:]]+]] : i64, i64, i64) +! CHECK: omp.teams +! CHECK: omp.parallel +! CHECK: omp.distribute +! CHECK-NEXT: omp.wsloop +! CHECK-NEXT: omp.loop_nest ({{.*}}) : i64 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) module Test use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64 diff --git a/flang/test/Lower/OpenMP/eval-outside-target.f90 b/flang/test/Lower/OpenMP/eval-outside-target.f90 index ef578610e8e908c..d0925971e4b2bca 100644 --- a/flang/test/Lower/OpenMP/eval-outside-target.f90 +++ b/flang/test/Lower/OpenMP/eval-outside-target.f90 @@ -6,19 +6,15 @@ subroutine teams() ! BOTH: omp.target - ! HOST-SAME: num_teams({{.*}}) teams_thread_limit({{.*}}) - - ! DEVICE-NOT: num_teams({{.*}}) - ! DEVICE-NOT: teams_thread_limit({{.*}}) + ! HOST-SAME: host_eval(%{{.*}} -> %[[NUM_TEAMS:.*]], %{{.*}} -> %[[THREAD_LIMIT:.*]] : i32, i32) + + ! DEVICE-NOT: host_eval({{.*}}) ! DEVICE-SAME: { !$omp target ! BOTH: omp.teams - ! HOST-NOT: num_teams({{.*}}) - ! HOST-NOT: thread_limit({{.*}}) - ! HOST-SAME: { - + ! HOST-SAME: num_teams( to %[[NUM_TEAMS]] : i32) thread_limit(%[[THREAD_LIMIT]] : i32) ! DEVICE-SAME: num_teams({{.*}}) thread_limit({{.*}}) !$omp teams num_teams(1) thread_limit(2) call foo() @@ -27,60 +23,19 @@ subroutine teams() !$omp end target ! BOTH: omp.teams - ! BOTH-SAME: num_teams({{.*}}) thread_limit({{.*}}) + ! 
BOTH-SAME: num_teams({{.*}}) thread_limit({{.*}}) { !$omp teams num_teams(1) thread_limit(2) call foo() !$omp end teams end subroutine teams -! BOTH-LABEL: func.func @_QPparallel -subroutine parallel() - ! BOTH: omp.target - - ! HOST-SAME: num_threads({{.*}}) - - ! DEVICE-NOT: num_threads({{.*}}) - ! DEVICE-SAME: { - !$omp target - - ! BOTH: omp.parallel - - ! HOST-NOT: num_threads({{.*}}) - ! HOST-SAME: { - - ! DEVICE-SAME: num_threads({{.*}}) - !$omp parallel num_threads(1) - call foo() - !$omp end parallel - !$omp end target - - ! BOTH: omp.target - ! BOTH-NOT: num_threads({{.*}}) - ! BOTH-SAME: { - !$omp target - call foo() - - ! BOTH: omp.parallel - ! BOTH-SAME: num_threads({{.*}}) - !$omp parallel num_threads(1) - call foo() - !$omp end parallel - !$omp end target - - ! BOTH: omp.parallel - ! BOTH-SAME: num_threads({{.*}}) - !$omp parallel num_threads(1) - call foo() - !$omp end parallel -end subroutine parallel - ! BOTH-LABEL: func.func @_QPdistribute_parallel_do subroutine distribute_parallel_do() ! BOTH: omp.target - ! HOST-SAME: num_threads({{.*}}) + ! HOST-SAME: host_eval(%{{.*}} -> %[[NUM_THREADS:.*]], %{{.*}} -> %[[LB:.*]], %{{.*}} -> %[[UB:.*]], %{{.*}} -> %[[STEP:.*]] : i32, i32, i32, i32) - ! DEVICE-NOT: num_threads({{.*}}) + ! DEVICE-NOT: host_eval({{.*}}) ! DEVICE-SAME: { ! BOTH: omp.teams @@ -88,13 +43,14 @@ subroutine distribute_parallel_do() ! BOTH: omp.parallel - ! HOST-NOT: num_threads({{.*}}) - ! HOST-SAME: { - + ! HOST-SAME: num_threads(%[[NUM_THREADS]] : i32) ! DEVICE-SAME: num_threads({{.*}}) ! BOTH: omp.distribute ! BOTH-NEXT: omp.wsloop + ! BOTH-NEXT: omp.loop_nest + + ! HOST-SAME: (%{{.*}}) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) !$omp distribute parallel do num_threads(1) do i=1,10 call foo() @@ -103,11 +59,11 @@ subroutine distribute_parallel_do() !$omp end target teams ! BOTH: omp.target - ! BOTH-NOT: num_threads({{.*}}) + ! BOTH-NOT: host_eval({{.*}}) ! BOTH-SAME: { ! 
BOTH: omp.teams !$omp target teams - call foo() + call foo() !< Prevents this from being SPMD. ! BOTH: omp.parallel ! BOTH-SAME: num_threads({{.*}}) @@ -139,9 +95,9 @@ end subroutine distribute_parallel_do subroutine distribute_parallel_do_simd() ! BOTH: omp.target - ! HOST-SAME: num_threads({{.*}}) + ! HOST-SAME: host_eval(%{{.*}} -> %[[NUM_THREADS:.*]], %{{.*}} -> %[[LB:.*]], %{{.*}} -> %[[UB:.*]], %{{.*}} -> %[[STEP:.*]] : i32, i32, i32, i32) - ! DEVICE-NOT: num_threads({{.*}}) + ! DEVICE-NOT: host_eval({{.*}}) ! DEVICE-SAME: { ! BOTH: omp.teams @@ -149,14 +105,15 @@ subroutine distribute_parallel_do_simd() ! BOTH: omp.parallel - ! HOST-NOT: num_threads({{.*}}) - ! HOST-SAME: { - + ! HOST-SAME: num_threads(%[[NUM_THREADS]] : i32) ! DEVICE-SAME: num_threads({{.*}}) ! BOTH: omp.distribute ! BOTH-NEXT: omp.wsloop ! BOTH-NEXT: omp.simd + ! BOTH-NEXT: omp.loop_nest + + ! HOST-SAME: (%{{.*}}) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) !$omp distribute parallel do simd num_threads(1) do i=1,10 call foo() @@ -165,11 +122,11 @@ subroutine distribute_parallel_do_simd() !$omp end target teams ! BOTH: omp.target - ! BOTH-NOT: num_threads({{.*}}) + ! BOTH-NOT: host_eval({{.*}}) ! BOTH-SAME: { ! BOTH: omp.teams !$omp target teams - call foo() + call foo() !< Prevents this from being SPMD. ! BOTH: omp.parallel ! BOTH-SAME: num_threads({{.*}}) diff --git a/flang/test/Lower/OpenMP/target-spmd.f90 b/flang/test/Lower/OpenMP/target-spmd.f90 index acb28a206a6788b..bb90e5b3fc48570 100644 --- a/flang/test/Lower/OpenMP/target-spmd.f90 +++ b/flang/test/Lower/OpenMP/target-spmd.f90 @@ -3,7 +3,7 @@ ! CHECK-LABEL: func.func @_QPdistribute_parallel_do_generic() { subroutine distribute_parallel_do_generic() ! CHECK: omp.target - ! CHECK-NOT: trip_count({{.*}}) + ! CHECK-NOT: host_eval({{.*}}) ! CHECK-SAME: { !$omp target !$omp teams @@ -17,7 +17,7 @@ subroutine distribute_parallel_do_generic() !$omp end target ! CHECK: omp.target - ! CHECK-NOT: trip_count({{.*}}) + ! 
CHECK-NOT: host_eval({{.*}}) ! CHECK-SAME: { !$omp target teams !$omp distribute parallel do @@ -29,7 +29,7 @@ subroutine distribute_parallel_do_generic() !$omp end target teams ! CHECK: omp.target - ! CHECK-NOT: trip_count({{.*}}) + ! CHECK-NOT: host_eval({{.*}}) ! CHECK-SAME: { !$omp target teams !$omp distribute parallel do @@ -49,7 +49,7 @@ end subroutine distribute_parallel_do_generic ! CHECK-LABEL: func.func @_QPdistribute_parallel_do_spmd() { subroutine distribute_parallel_do_spmd() ! CHECK: omp.target - ! CHECK-SAME: trip_count({{.*}}) + ! CHECK-SAME: host_eval({{.*}}) !$omp target !$omp teams !$omp distribute parallel do @@ -61,7 +61,7 @@ subroutine distribute_parallel_do_spmd() !$omp end target ! CHECK: omp.target - ! CHECK-SAME: trip_count({{.*}}) + ! CHECK-SAME: host_eval({{.*}}) !$omp target teams !$omp distribute parallel do do i = 1, 10 @@ -74,7 +74,7 @@ end subroutine distribute_parallel_do_spmd ! CHECK-LABEL: func.func @_QPdistribute_parallel_do_simd_generic() { subroutine distribute_parallel_do_simd_generic() ! CHECK: omp.target - ! CHECK-NOT: trip_count({{.*}}) + ! CHECK-NOT: host_eval({{.*}}) ! CHECK-SAME: { !$omp target !$omp teams @@ -88,7 +88,7 @@ subroutine distribute_parallel_do_simd_generic() !$omp end target ! CHECK: omp.target - ! CHECK-NOT: trip_count({{.*}}) + ! CHECK-NOT: host_eval({{.*}}) ! CHECK-SAME: { !$omp target teams !$omp distribute parallel do simd @@ -100,7 +100,7 @@ subroutine distribute_parallel_do_simd_generic() !$omp end target teams ! CHECK: omp.target - ! CHECK-NOT: trip_count({{.*}}) + ! CHECK-NOT: host_eval({{.*}}) ! CHECK-SAME: { !$omp target teams !$omp distribute parallel do simd @@ -120,7 +120,7 @@ end subroutine distribute_parallel_do_simd_generic ! CHECK-LABEL: func.func @_QPdistribute_parallel_do_simd_spmd() { subroutine distribute_parallel_do_simd_spmd() ! CHECK: omp.target - ! CHECK-SAME: trip_count({{.*}}) + ! 
CHECK-SAME: host_eval({{.*}}) !$omp target !$omp teams !$omp distribute parallel do simd @@ -132,7 +132,7 @@ subroutine distribute_parallel_do_simd_spmd() !$omp end target ! CHECK: omp.target - ! CHECK-SAME: trip_count({{.*}}) + ! CHECK-SAME: host_eval({{.*}}) !$omp target teams !$omp distribute parallel do simd do i = 1, 10 @@ -145,7 +145,7 @@ end subroutine distribute_parallel_do_simd_spmd ! CHECK-LABEL: func.func @_QPteams_distribute_parallel_do_spmd() { subroutine teams_distribute_parallel_do_spmd() ! CHECK: omp.target - ! CHECK-SAME: trip_count({{.*}}) + ! CHECK-SAME: host_eval({{.*}}) !$omp target !$omp teams distribute parallel do do i = 1, 10 @@ -158,7 +158,7 @@ end subroutine teams_distribute_parallel_do_spmd ! CHECK-LABEL: func.func @_QPteams_distribute_parallel_do_simd_spmd() { subroutine teams_distribute_parallel_do_simd_spmd() ! CHECK: omp.target - ! CHECK-SAME: trip_count({{.*}}) + ! CHECK-SAME: host_eval({{.*}}) !$omp target !$omp teams distribute parallel do simd do i = 1, 10 @@ -171,7 +171,7 @@ end subroutine teams_distribute_parallel_do_simd_spmd ! CHECK-LABEL: func.func @_QPtarget_teams_distribute_parallel_do_spmd() { subroutine target_teams_distribute_parallel_do_spmd() ! CHECK: omp.target - ! CHECK-SAME: trip_count({{.*}}) + ! CHECK-SAME: host_eval({{.*}}) !$omp target teams distribute parallel do do i = 1, 10 call foo(i) @@ -182,7 +182,7 @@ end subroutine target_teams_distribute_parallel_do_spmd ! CHECK-LABEL: func.func @_QPtarget_teams_distribute_parallel_do_simd_spmd() { subroutine target_teams_distribute_parallel_do_simd_spmd() ! CHECK: omp.target - ! CHECK-SAME: trip_count({{.*}}) + ! 
CHECK-SAME: host_eval({{.*}}) !$omp target teams distribute parallel do simd do i = 1, 10 call foo(i) diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90 index 2e659895719e581..5b5d1f5ff77c524 100644 --- a/flang/test/Transforms/DoConcurrent/basic_device.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_device.f90 @@ -21,6 +21,18 @@ program do_concurrent_basic ! CHECK-NOT: fir.do_loop + ! CHECK: %[[DUPLICATED_C1:.*]] = arith.constant 1 : i32 + ! CHECK: %[[DUPLICATED_LB:.*]] = fir.convert %[[DUPLICATED_C1]] : (i32) -> index + ! CHECK: %[[DUPLICATED_C10:.*]] = arith.constant 10 : i32 + ! CHECK: %[[DUPLICATED_UB:.*]] = fir.convert %[[DUPLICATED_C10]] : (i32) -> index + ! CHECK: %[[DUPLICATED_STEP:.*]] = arith.constant 1 : index + + ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 + ! CHECK: %[[HOST_LB:.*]] = fir.convert %[[C1]] : (i32) -> index + ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 + ! CHECK: %[[HOST_UB:.*]] = fir.convert %[[C10]] : (i32) -> index + ! CHECK: %[[HOST_STEP:.*]] = arith.constant 1 : index + ! CHECK-DAG: %[[I_MAP_INFO:.*]] = omp.map.info var_ptr(%[[I_ORIG_DECL]]#1 ! CHECK: %[[C0:.*]] = arith.constant 0 : index ! CHECK: %[[UPPER_BOUND:.*]] = arith.subi %[[A_EXTENT]], %[[C0]] : index @@ -32,10 +44,8 @@ program do_concurrent_basic ! CHECK-DAG: %[[A_MAP_INFO:.*]] = omp.map.info var_ptr(%[[A_ORIG_DECL]]#1 : {{[^(]+}}) ! CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[A_BOUNDS]]) - ! CHECK: %[[TRIP_COUNT:.*]] = arith.muli %{{.*}}, %{{.*}} : i64 - ! CHECK: omp.target - ! CHECK-SAME: trip_count(%[[TRIP_COUNT]] : i64) + ! CHECK-SAME: host_eval(%[[HOST_LB]] -> %[[LB:[[:alnum:]]+]], %[[HOST_UB]] -> %[[UB:[[:alnum:]]+]], %[[HOST_STEP]] -> %[[STEP:[[:alnum:]]+]] : index, index, index) ! CHECK-SAME: map_entries(%[[I_MAP_INFO]] -> %[[I_ARG:[[:alnum:]]+]], ! CHECK-SAME: %[[A_MAP_INFO]] -> %[[A_ARG:.[[:alnum:]]+]] @@ -46,12 +56,6 @@ program do_concurrent_basic ! 
CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) - ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 - ! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index - ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 - ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index - ! CHECK: %[[STEP:.*]] = arith.constant 1 : index - ! CHECK-NEXT: omp.distribute { ! CHECK-NEXT: omp.wsloop { diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 index 86dee0206eb87d7..5badd7923019e66 100644 --- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 +++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 @@ -30,7 +30,7 @@ program main do concurrent(i=1:n, j=1:m, k=1:l) a(i,j,k) = i * j + k end do -end +end !--- perfectly_nested.f90 program main @@ -62,9 +62,48 @@ program main end do end -! DEVICE: omp.target +! COMMON: func.func @_QQmain + +! DEVICE: %[[DUPLICATED_C1_1:.*]] = arith.constant 1 : i32 +! DEVICE: %[[DUPLICATED_LB_I:.*]] = fir.convert %[[DUPLICATED_C1_1]] : (i32) -> index +! DEVICE: %[[DUPLICATED_C10:.*]] = arith.constant 10 : i32 +! DEVICE: %[[DUPLICATED_UB_I:.*]] = fir.convert %[[DUPLICATED_C10]] : (i32) -> index +! DEVICE: %[[DUPLICATED_STEP_I:.*]] = arith.constant 1 : index + +! DEVICE: %[[C1_1:.*]] = arith.constant 1 : i32 +! DEVICE: %[[HOST_LB_I:.*]] = fir.convert %[[C1_1]] : (i32) -> index +! DEVICE: %[[C10:.*]] = arith.constant 10 : i32 +! DEVICE: %[[HOST_UB_I:.*]] = fir.convert %[[C10]] : (i32) -> index +! DEVICE: %[[HOST_STEP_I:.*]] = arith.constant 1 : index + +! DEVICE: %[[C1_2:.*]] = arith.constant 1 : i32 +! DEVICE: %[[HOST_LB_J:.*]] = fir.convert %[[C1_2]] : (i32) -> index +! DEVICE: %[[C20:.*]] = arith.constant 20 : i32 +! DEVICE: %[[HOST_UB_J:.*]] = fir.convert %[[C20]] : (i32) -> index +! 
DEVICE: %[[HOST_STEP_J:.*]] = arith.constant 1 : index + +! DEVICE: %[[C1_3:.*]] = arith.constant 1 : i32 +! DEVICE: %[[HOST_LB_K:.*]] = fir.convert %[[C1_3]] : (i32) -> index +! DEVICE: %[[C30:.*]] = arith.constant 30 : i32 +! DEVICE: %[[HOST_UB_K:.*]] = fir.convert %[[C30]] : (i32) -> index +! DEVICE: %[[HOST_STEP_K:.*]] = arith.constant 1 : index + +! DEVICE: omp.target host_eval( +! DEVICE-SAME: %[[HOST_LB_I]] -> %[[LB_I:[[:alnum:]]+]], +! DEVICE-SAME: %[[HOST_UB_I]] -> %[[UB_I:[[:alnum:]]+]], +! DEVICE-SAME: %[[HOST_STEP_I]] -> %[[STEP_I:[[:alnum:]]+]], +! DEVICE-SAME: %[[HOST_LB_J]] -> %[[LB_J:[[:alnum:]]+]], +! DEVICE-SAME: %[[HOST_UB_J]] -> %[[UB_J:[[:alnum:]]+]], +! DEVICE-SAME: %[[HOST_STEP_J]] -> %[[STEP_J:[[:alnum:]]+]], +! DEVICE-SAME: %[[HOST_LB_K]] -> %[[LB_K:[[:alnum:]]+]], +! DEVICE-SAME: %[[HOST_UB_K]] -> %[[UB_K:[[:alnum:]]+]], +! DEVICE-SAME: %[[HOST_STEP_K]] -> %[[STEP_K:[[:alnum:]]+]] : +! DEVICE-SAME: index, index, index, index, index, index, index, index, index) ! DEVICE: omp.teams +! HOST-NOT: omp.target +! HOST-NOT: omp.teams + ! COMMON: omp.parallel { ! COMMON-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"} @@ -76,23 +115,23 @@ program main ! COMMON-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"} ! COMMON-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"} -! COMMON: %[[C1_1:.*]] = arith.constant 1 : i32 -! COMMON: %[[LB_I:.*]] = fir.convert %[[C1_1]] : (i32) -> index -! COMMON: %[[C10:.*]] = arith.constant 10 : i32 -! COMMON: %[[UB_I:.*]] = fir.convert %[[C10]] : (i32) -> index -! COMMON: %[[STEP_I:.*]] = arith.constant 1 : index - -! COMMON: %[[C1_2:.*]] = arith.constant 1 : i32 -! COMMON: %[[LB_J:.*]] = fir.convert %[[C1_2]] : (i32) -> index -! COMMON: %[[C20:.*]] = arith.constant 20 : i32 -! COMMON: %[[UB_J:.*]] = fir.convert %[[C20]] : (i32) -> index -! COMMON: %[[STEP_J:.*]] = arith.constant 1 : index - -! COMMON: %[[C1_3:.*]] = arith.constant 1 : i32 -! 
COMMON: %[[LB_K:.*]] = fir.convert %[[C1_3]] : (i32) -> index -! COMMON: %[[C30:.*]] = arith.constant 30 : i32 -! COMMON: %[[UB_K:.*]] = fir.convert %[[C30]] : (i32) -> index -! COMMON: %[[STEP_K:.*]] = arith.constant 1 : index +! HOST: %[[C1_1:.*]] = arith.constant 1 : i32 +! HOST: %[[LB_I:.*]] = fir.convert %[[C1_1]] : (i32) -> index +! HOST: %[[C10:.*]] = arith.constant 10 : i32 +! HOST: %[[UB_I:.*]] = fir.convert %[[C10]] : (i32) -> index +! HOST: %[[STEP_I:.*]] = arith.constant 1 : index + +! HOST: %[[C1_2:.*]] = arith.constant 1 : i32 +! HOST: %[[LB_J:.*]] = fir.convert %[[C1_2]] : (i32) -> index +! HOST: %[[C20:.*]] = arith.constant 20 : i32 +! HOST: %[[UB_J:.*]] = fir.convert %[[C20]] : (i32) -> index +! HOST: %[[STEP_J:.*]] = arith.constant 1 : index + +! HOST: %[[C1_3:.*]] = arith.constant 1 : i32 +! HOST: %[[LB_K:.*]] = fir.convert %[[C1_3]] : (i32) -> index +! HOST: %[[C30:.*]] = arith.constant 30 : i32 +! HOST: %[[UB_K:.*]] = fir.convert %[[C30]] : (i32) -> index +! HOST: %[[STEP_K:.*]] = arith.constant 1 : index ! DEVICE: omp.distribute