Skip to content

Commit

Permalink
Add Andrews SFC using vectorized levels
Browse files Browse the repository at this point in the history
  • Loading branch information
RevathiJambunathan committed Mar 20, 2024
1 parent 350b959 commit 2dd4822
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 52 deletions.
196 changes: 144 additions & 52 deletions Source/Parallelization/WarpXRegrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,82 +71,174 @@ WarpX::LoadBalance ()
int loadBalancedAnyLevel = false;

const int nLevels = finestLevel();
if (do_similar_dm_refpatch) {
for (int lev = nLevels; lev > 0; --lev) {
ablastr::coarsen::sample::Coarsen(*costs[lev-1], *costs[lev],0,0,1,0,WarpX::RefRatio(lev-1));
if (do_SFC_dm_vectorlevel) {
amrex::Vector<amrex::Real> rcost;
amrex::Vector<int> current_pmap;
for (int lev = 0; lev <= nLevels; ++lev)
{
amrex::Vector<amrex::Real> rcost_lev(costs[lev]->size());
amrex::ParallelDescriptor::GatherLayoutDataToVector<amrex::Real>(*costs[lev],rcost_lev,
amrex::ParallelDescriptor::IOProcessorNumber());
rcost.insert(rcost.end(), rcost_lev.begin(), rcost_lev.end());
auto& pmap_lev = costs[lev]->DistributionMap().ProcessorMap();
current_pmap.insert(current_pmap.end(), pmap_lev.begin(), pmap_lev.end());
}
}

for (int lev = 0; lev <= nLevels; ++lev)
{
int doLoadBalance = false;

// Compute the new distribution mapping
DistributionMapping newdm;
const amrex::Real nboxes = costs[lev]->size();
amrex::Real currentEfficiency = 0.;
amrex::Real proposedEfficiency = 0.;
const amrex::Real nboxes = rcost.size();
const amrex::Real nprocs = ParallelContext::NProcsSub();
const int nmax = static_cast<int>(std::ceil(nboxes/nprocs*load_balance_knapsack_factor));
// These store efficiency (meaning, the average 'cost' over all ranks,
// normalized to max cost) for current and proposed distribution mappings
amrex::Real currentEfficiency = 0.0;
amrex::Real proposedEfficiency = 0.0;

if (lev == 0 || !do_similar_dm_refpatch) {
newdm = (load_balance_with_sfc)
? DistributionMapping::makeSFC(*costs[lev],
currentEfficiency, proposedEfficiency,
false,
ParallelDescriptor::IOProcessorNumber())
: DistributionMapping::makeKnapSack(*costs[lev],
currentEfficiency, proposedEfficiency,
nmax,
false,
ParallelDescriptor::IOProcessorNumber());
} else {
amrex::BoxArray coarse_ba = boxArray(lev-1);
amrex::DistributionMapping coarse_dm = DistributionMap(lev-1);
amrex::BoxArray ba = boxArray(lev);
ba.coarsen(WarpX::RefRatio(lev-1));
newdm = amrex::MakeSimilarDM(ba, coarse_ba, coarse_dm, getngEB());
}
Print() << "new dm on lev " << lev << ": \n";
Print() << newdm << std::endl;
// As specified in the above calls to makeSFC and makeKnapSack, the new
// distribution mapping is NOT communicated to all ranks; the loadbalanced
// dm is up-to-date only on root, and we can decide whether to broadcast
if ((load_balance_efficiency_ratio_threshold > 0.0)
&& (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()))

amrex::BoxArray refined_ba = boxArray(0);
for (int lev = 1; lev <= nLevels; ++lev)
{
doLoadBalance = (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency);
refined_ba.refine(refRatio(lev-1));
amrex::BoxList refined_bl = refined_ba.boxList();
refined_bl.join(boxArray(lev).boxList());
refined_ba = amrex::BoxArray(refined_bl);
}

amrex::Vector<amrex::DistributionMapping> newdm(nLevels+1);
amrex::DistributionMapping r;
if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())
{
std::vector<amrex::Long> cost(rcost.size());

amrex::Real wmax = *std::max_element(rcost.begin(), rcost.end());
amrex::Real scale = (wmax == 0) ? 1.e9_rt : 1.e9_rt/wmax;

for (int i = 0; i < rcost.size(); ++i) {
cost[i] = amrex::Long(rcost[i]*scale) + 1L;
}

// `sort` needs to be false here since there's a parallel reduce function
// in the processor map function, but we are executing only on root
int nprocs = ParallelDescriptor::NProcs();
r.SFCProcessorMap(refined_ba, cost, nprocs, proposedEfficiency, false);

amrex::DistributionMapping::ComputeDistributionMappingEfficiency(r,
rcost,
&currentEfficiency);

if ((load_balance_efficiency_ratio_threshold > 0.0))
{
doLoadBalance = (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency);
}
amrex::Print() << " doloadbalance : " << doLoadBalance << "\n";
amrex::Print() << proposedEfficiency << "\n";
amrex::Print() << currentEfficiency << "\n";
}
ParallelDescriptor::Bcast(&doLoadBalance, 1,
ParallelDescriptor::IOProcessorNumber());

if (doLoadBalance)
{
Vector<int> pmap;
amrex::Vector<int> pmap(rcost.size());
if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())
{
pmap = newdm.ProcessorMap();
pmap = r.ProcessorMap();
} else
{
pmap.resize(static_cast<std::size_t>(nboxes));
}
ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber());
// Broadcast vector from which to construct new distribution mapping
amrex::ParallelDescriptor::Bcast(&pmap[0], pmap.size(), ParallelDescriptor::IOProcessorNumber());

if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber())
int lev_start = 0;
for (int lev = 0; lev <= nLevels; ++lev)
{
newdm = DistributionMapping(pmap);
amrex::Vector<int> pmap_lev(pmap.begin() + lev_start,
pmap.begin() + lev_start + costs[lev]->size());
newdm[lev] = amrex::DistributionMapping(pmap_lev);
lev_start += costs[lev]->size();
}

RemakeLevel(lev, t_new[lev], boxArray(lev), newdm);

// Record the load balance efficiency
setLoadBalanceEfficiency(lev, proposedEfficiency);
for (int lev = 0; lev <= nLevels; ++lev)
{
RemakeLevel(lev, t_new[lev], boxArray(lev), newdm[lev]);
setLoadBalanceEfficiency(lev, proposedEfficiency);
}
}

loadBalancedAnyLevel = loadBalancedAnyLevel || doLoadBalance;
} else {
//if (do_similar_dm_refpatch) {
// for (int lev = nLevels; lev > 0; --lev) {
// ablastr::coarsen::sample::Coarsen(*costs[lev-1], *costs[lev],0,0,1,0,WarpX::RefRatio(lev-1));
// }
//}

for (int lev = 0; lev <= nLevels; ++lev)
{
int doLoadBalance = false;

// Compute the new distribution mapping
DistributionMapping newdm;
const amrex::Real nboxes = costs[lev]->size();
const amrex::Real nprocs = ParallelContext::NProcsSub();
const int nmax = static_cast<int>(std::ceil(nboxes/nprocs*load_balance_knapsack_factor));
// These store efficiency (meaning, the average 'cost' over all ranks,
// normalized to max cost) for current and proposed distribution mappings
amrex::Real currentEfficiency = 0.0;
amrex::Real proposedEfficiency = 0.0;

if (lev == 0 || !do_similar_dm_refpatch) {
newdm = (load_balance_with_sfc)
? DistributionMapping::makeSFC(*costs[lev],
currentEfficiency, proposedEfficiency,
false,
ParallelDescriptor::IOProcessorNumber())
: DistributionMapping::makeKnapSack(*costs[lev],
currentEfficiency, proposedEfficiency,
nmax,
false,
ParallelDescriptor::IOProcessorNumber());
} else {
amrex::BoxArray coarse_ba = boxArray(lev-1);
amrex::DistributionMapping coarse_dm = DistributionMap(lev-1);
amrex::BoxArray ba = boxArray(lev);
ba.coarsen(WarpX::RefRatio(lev-1));
newdm = amrex::MakeSimilarDM(ba, coarse_ba, coarse_dm, getngEB());
}
Print() << "new dm on lev " << lev << ": \n";
Print() << newdm << std::endl;
// As specified in the above calls to makeSFC and makeKnapSack, the new
// distribution mapping is NOT communicated to all ranks; the loadbalanced
// dm is up-to-date only on root, and we can decide whether to broadcast
if ((load_balance_efficiency_ratio_threshold > 0.0)
&& (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()))
{
doLoadBalance = (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency);
}

ParallelDescriptor::Bcast(&doLoadBalance, 1,
ParallelDescriptor::IOProcessorNumber());

if (doLoadBalance)
{
Vector<int> pmap;
if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())
{
pmap = newdm.ProcessorMap();
} else
{
pmap.resize(static_cast<std::size_t>(nboxes));
}
ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber());

if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber())
{
newdm = DistributionMapping(pmap);
}

RemakeLevel(lev, t_new[lev], boxArray(lev), newdm);

// Record the load balance efficiency
setLoadBalanceEfficiency(lev, proposedEfficiency);
}

loadBalancedAnyLevel = loadBalancedAnyLevel || doLoadBalance;
}
}
if (loadBalancedAnyLevel)
{
Expand Down
2 changes: 2 additions & 0 deletions Source/WarpX.H
Original file line number Diff line number Diff line change
Expand Up @@ -1643,6 +1643,8 @@ private:
amrex::Real costs_heuristic_particles_wt = amrex::Real(0);
/** Whether to use MakeSimilarDM when load balancing lev > 1 */
int do_similar_dm_refpatch = 0;
/** Whether to use SFC on vectorized BoxArrays from all levels */
int do_SFC_dm_vectorlevel = 0;

// Determines timesteps for override sync
utils::parser::IntervalsParser override_sync_intervals;
Expand Down
1 change: 1 addition & 0 deletions Source/WarpX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1308,6 +1308,7 @@ WarpX::ReadParameters ()
load_balance_intervals_string_vec);
pp_algo.query("load_balance_with_sfc", load_balance_with_sfc);
pp_algo.query("do_similar_dm_refpatch", do_similar_dm_refpatch);
pp_algo.query("do_SFC_dm_vectorlevel",do_SFC_dm_vectorlevel);
// Knapsack factor only used with non-SFC strategy
if (!load_balance_with_sfc) {
pp_algo.query("load_balance_knapsack_factor", load_balance_knapsack_factor);
Expand Down

0 comments on commit 2dd4822

Please sign in to comment.