From 2dd4822e9aff74a12576039711dd8e82f091f5f3 Mon Sep 17 00:00:00 2001 From: RevathiJambunathan Date: Thu, 29 Feb 2024 17:55:00 -0800 Subject: [PATCH] Add Andrews SFC using vectorized levels --- Source/Parallelization/WarpXRegrid.cpp | 196 ++++++++++++++++++------- Source/WarpX.H | 2 + Source/WarpX.cpp | 1 + 3 files changed, 147 insertions(+), 52 deletions(-) diff --git a/Source/Parallelization/WarpXRegrid.cpp b/Source/Parallelization/WarpXRegrid.cpp index 17dfd6b93d6..68b9cb88d5f 100644 --- a/Source/Parallelization/WarpXRegrid.cpp +++ b/Source/Parallelization/WarpXRegrid.cpp @@ -71,82 +71,174 @@ WarpX::LoadBalance () int loadBalancedAnyLevel = false; const int nLevels = finestLevel(); - if (do_similar_dm_refpatch) { - for (int lev = nLevels; lev > 0; --lev) { - ablastr::coarsen::sample::Coarsen(*costs[lev-1], *costs[lev],0,0,1,0,WarpX::RefRatio(lev-1)); + if (do_SFC_dm_vectorlevel) { + amrex::Vector<amrex::Real> rcost; + amrex::Vector<int> current_pmap; + for (int lev = 0; lev <= nLevels; ++lev) + { + amrex::Vector<amrex::Real> rcost_lev(costs[lev]->size()); + amrex::ParallelDescriptor::GatherLayoutDataToVector(*costs[lev],rcost_lev, + amrex::ParallelDescriptor::IOProcessorNumber()); + rcost.insert(rcost.end(), rcost_lev.begin(), rcost_lev.end()); + auto& pmap_lev = costs[lev]->DistributionMap().ProcessorMap(); + current_pmap.insert(current_pmap.end(), pmap_lev.begin(), pmap_lev.end()); } - } - - for (int lev = 0; lev <= nLevels; ++lev) - { int doLoadBalance = false; - - // Compute the new distribution mapping - DistributionMapping newdm; - const amrex::Real nboxes = costs[lev]->size(); + amrex::Real currentEfficiency = 0.; + amrex::Real proposedEfficiency = 0.; + const amrex::Real nboxes = rcost.size(); const amrex::Real nprocs = ParallelContext::NProcsSub(); const int nmax = static_cast<int>(std::ceil(nboxes/nprocs*load_balance_knapsack_factor)); - // These store efficiency (meaning, the average 'cost' over all ranks, - // normalized to max cost) for current and proposed distribution mappings - 
amrex::Real currentEfficiency = 0.0; - amrex::Real proposedEfficiency = 0.0; - - if (lev == 0 || !do_similar_dm_refpatch) { - newdm = (load_balance_with_sfc) - ? DistributionMapping::makeSFC(*costs[lev], - currentEfficiency, proposedEfficiency, - false, - ParallelDescriptor::IOProcessorNumber()) - : DistributionMapping::makeKnapSack(*costs[lev], - currentEfficiency, proposedEfficiency, - nmax, - false, - ParallelDescriptor::IOProcessorNumber()); - } else { - amrex::BoxArray coarse_ba = boxArray(lev-1); - amrex::DistributionMapping coarse_dm = DistributionMap(lev-1); - amrex::BoxArray ba = boxArray(lev); - ba.coarsen(WarpX::RefRatio(lev-1)); - newdm = amrex::MakeSimilarDM(ba, coarse_ba, coarse_dm, getngEB()); - } - Print() << "new dm on lev " << lev << ": \n"; - Print() << newdm << std::endl; - // As specified in the above calls to makeSFC and makeKnapSack, the new - // distribution mapping is NOT communicated to all ranks; the loadbalanced - // dm is up-to-date only on root, and we can decide whether to broadcast - if ((load_balance_efficiency_ratio_threshold > 0.0) - && (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) + + amrex::BoxArray refined_ba = boxArray(0); + for (int lev = 1; lev <= nLevels; ++lev) { - doLoadBalance = (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency); + refined_ba.refine(refRatio(lev-1)); + amrex::BoxList refined_bl = refined_ba.boxList(); + refined_bl.join(boxArray(lev).boxList()); + refined_ba = amrex::BoxArray(refined_bl); } + amrex::Vector<amrex::DistributionMapping> newdm(nLevels+1); + amrex::DistributionMapping r; + if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) + { + std::vector<amrex::Long> cost(rcost.size()); + + amrex::Real wmax = *std::max_element(rcost.begin(), rcost.end()); + amrex::Real scale = (wmax == 0) ? 
1.e9_rt : 1.e9_rt/wmax; + + for (int i = 0; i < rcost.size(); ++i) { + cost[i] = amrex::Long(rcost[i]*scale) + 1L; + } + + // `sort` needs to be false here since there's a parallel reduce function + // in the processor map function, but we are executing only on root + int nprocs = ParallelDescriptor::NProcs(); + r.SFCProcessorMap(refined_ba, cost, nprocs, proposedEfficiency, false); + + amrex::DistributionMapping::ComputeDistributionMappingEfficiency(r, + rcost, + &currentEfficiency); + + if ((load_balance_efficiency_ratio_threshold > 0.0)) + { + doLoadBalance = (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency); + } + amrex::Print() << " doloadbalance : " << doLoadBalance << "\n"; + amrex::Print() << proposedEfficiency << "\n"; + amrex::Print() << currentEfficiency << "\n"; + } ParallelDescriptor::Bcast(&doLoadBalance, 1, ParallelDescriptor::IOProcessorNumber()); if (doLoadBalance) { - Vector<int> pmap; + amrex::Vector<int> pmap(rcost.size()); if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) { - pmap = newdm.ProcessorMap(); + pmap = r.ProcessorMap(); } else { pmap.resize(static_cast<int>(nboxes)); } - ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber()); + // Broadcast vector from which to construct new distribution mapping + amrex::ParallelDescriptor::Bcast(&pmap[0], pmap.size(), ParallelDescriptor::IOProcessorNumber()); - if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) + int lev_start = 0; + for (int lev = 0; lev <= nLevels; ++lev) { - newdm = DistributionMapping(pmap); + amrex::Vector<int> pmap_lev(pmap.begin() + lev_start, + pmap.begin() + lev_start + costs[lev]->size()); + newdm[lev] = amrex::DistributionMapping(pmap_lev); + lev_start += costs[lev]->size(); } - RemakeLevel(lev, t_new[lev], boxArray(lev), newdm); - - // Record the load balance efficiency - setLoadBalanceEfficiency(lev, proposedEfficiency); + for (int lev = 0; lev <= nLevels; ++lev) + { + 
RemakeLevel(lev, t_new[lev], boxArray(lev), newdm[lev]); + setLoadBalanceEfficiency(lev, proposedEfficiency); + } } - loadBalancedAnyLevel = loadBalancedAnyLevel || doLoadBalance; + } else { + //if (do_similar_dm_refpatch) { + // for (int lev = nLevels; lev > 0; --lev) { + // ablastr::coarsen::sample::Coarsen(*costs[lev-1], *costs[lev],0,0,1,0,WarpX::RefRatio(lev-1)); + // } + //} + + for (int lev = 0; lev <= nLevels; ++lev) + { + int doLoadBalance = false; + + // Compute the new distribution mapping + DistributionMapping newdm; + const amrex::Real nboxes = costs[lev]->size(); + const amrex::Real nprocs = ParallelContext::NProcsSub(); + const int nmax = static_cast<int>(std::ceil(nboxes/nprocs*load_balance_knapsack_factor)); + // These store efficiency (meaning, the average 'cost' over all ranks, + // normalized to max cost) for current and proposed distribution mappings + amrex::Real currentEfficiency = 0.0; + amrex::Real proposedEfficiency = 0.0; + + if (lev == 0 || !do_similar_dm_refpatch) { + newdm = (load_balance_with_sfc) + ? 
DistributionMapping::makeSFC(*costs[lev], + currentEfficiency, proposedEfficiency, + false, + ParallelDescriptor::IOProcessorNumber()) + : DistributionMapping::makeKnapSack(*costs[lev], + currentEfficiency, proposedEfficiency, + nmax, + false, + ParallelDescriptor::IOProcessorNumber()); + } else { + amrex::BoxArray coarse_ba = boxArray(lev-1); + amrex::DistributionMapping coarse_dm = DistributionMap(lev-1); + amrex::BoxArray ba = boxArray(lev); + ba.coarsen(WarpX::RefRatio(lev-1)); + newdm = amrex::MakeSimilarDM(ba, coarse_ba, coarse_dm, getngEB()); + } + Print() << "new dm on lev " << lev << ": \n"; + Print() << newdm << std::endl; + // As specified in the above calls to makeSFC and makeKnapSack, the new + // distribution mapping is NOT communicated to all ranks; the loadbalanced + // dm is up-to-date only on root, and we can decide whether to broadcast + if ((load_balance_efficiency_ratio_threshold > 0.0) + && (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) + { + doLoadBalance = (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency); + } + + ParallelDescriptor::Bcast(&doLoadBalance, 1, + ParallelDescriptor::IOProcessorNumber()); + + if (doLoadBalance) + { + Vector<int> pmap; + if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) + { + pmap = newdm.ProcessorMap(); + } else + { + pmap.resize(static_cast<int>(nboxes)); + } + ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber()); + + if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) + { + newdm = DistributionMapping(pmap); + } + + RemakeLevel(lev, t_new[lev], boxArray(lev), newdm); + + // Record the load balance efficiency + setLoadBalanceEfficiency(lev, proposedEfficiency); + } + + loadBalancedAnyLevel = loadBalancedAnyLevel || doLoadBalance; + } } if (loadBalancedAnyLevel) { diff --git a/Source/WarpX.H b/Source/WarpX.H index d24ab99e6aa..3b256fdfb13 100644 --- a/Source/WarpX.H +++ 
b/Source/WarpX.H @@ -1643,6 +1643,8 @@ private: amrex::Real costs_heuristic_particles_wt = amrex::Real(0); /** Whether to use MakeSimilarDM when load balancing lev > 1 */ int do_similar_dm_refpatch = 0; + /** Whether to use SFC on vectorized BoxArrays from all levels */ + int do_SFC_dm_vectorlevel = 0; // Determines timesteps for override sync utils::parser::IntervalsParser override_sync_intervals; diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp index 272e40ef0c4..d7b613da8bd 100644 --- a/Source/WarpX.cpp +++ b/Source/WarpX.cpp @@ -1308,6 +1308,7 @@ WarpX::ReadParameters () load_balance_intervals_string_vec); pp_algo.query("load_balance_with_sfc", load_balance_with_sfc); pp_algo.query("do_similar_dm_refpatch", do_similar_dm_refpatch); + pp_algo.query("do_SFC_dm_vectorlevel",do_SFC_dm_vectorlevel); // Knapsack factor only used with non-SFC strategy if (!load_balance_with_sfc) { pp_algo.query("load_balance_knapsack_factor", load_balance_knapsack_factor);