From 2dd4822e9aff74a12576039711dd8e82f091f5f3 Mon Sep 17 00:00:00 2001 From: RevathiJambunathan Date: Thu, 29 Feb 2024 17:55:00 -0800 Subject: [PATCH] Add Andrews SFC using vectorized levels --- Source/Parallelization/WarpXRegrid.cpp | 196 ++++++++++++++++++------- Source/WarpX.H | 2 + Source/WarpX.cpp | 1 + 3 files changed, 147 insertions(+), 52 deletions(-) diff --git a/Source/Parallelization/WarpXRegrid.cpp b/Source/Parallelization/WarpXRegrid.cpp index 17dfd6b93d6..68b9cb88d5f 100644 --- a/Source/Parallelization/WarpXRegrid.cpp +++ b/Source/Parallelization/WarpXRegrid.cpp @@ -71,82 +71,174 @@ WarpX::LoadBalance () int loadBalancedAnyLevel = false; const int nLevels = finestLevel(); - if (do_similar_dm_refpatch) { - for (int lev = nLevels; lev > 0; --lev) { - ablastr::coarsen::sample::Coarsen(*costs[lev-1], *costs[lev],0,0,1,0,WarpX::RefRatio(lev-1)); + if (do_SFC_dm_vectorlevel) { + amrex::Vector<amrex::Real> rcost; + amrex::Vector<int> current_pmap; + for (int lev = 0; lev <= nLevels; ++lev) + { + amrex::Vector<amrex::Real> rcost_lev(costs[lev]->size()); + amrex::ParallelDescriptor::GatherLayoutDataToVector(*costs[lev],rcost_lev, + amrex::ParallelDescriptor::IOProcessorNumber()); + rcost.insert(rcost.end(), rcost_lev.begin(), rcost_lev.end()); + auto& pmap_lev = costs[lev]->DistributionMap().ProcessorMap(); + current_pmap.insert(current_pmap.end(), pmap_lev.begin(), pmap_lev.end()); } - } - - for (int lev = 0; lev <= nLevels; ++lev) - { int doLoadBalance = false; - - // Compute the new distribution mapping - DistributionMapping newdm; - const amrex::Real nboxes = costs[lev]->size(); + amrex::Real currentEfficiency = 0.; + amrex::Real proposedEfficiency = 0.; + const amrex::Real nboxes = rcost.size(); const amrex::Real nprocs = ParallelContext::NProcsSub(); const int nmax = static_cast<int>(std::ceil(nboxes/nprocs*load_balance_knapsack_factor)); - // These store efficiency (meaning, the average 'cost' over all ranks, - // normalized to max cost) for current and proposed distribution mappings - 
amrex::Real currentEfficiency = 0.0; - amrex::Real proposedEfficiency = 0.0; - - if (lev == 0 || !do_similar_dm_refpatch) { - newdm = (load_balance_with_sfc) - ? DistributionMapping::makeSFC(*costs[lev], - currentEfficiency, proposedEfficiency, - false, - ParallelDescriptor::IOProcessorNumber()) - : DistributionMapping::makeKnapSack(*costs[lev], - currentEfficiency, proposedEfficiency, - nmax, - false, - ParallelDescriptor::IOProcessorNumber()); - } else { - amrex::BoxArray coarse_ba = boxArray(lev-1); - amrex::DistributionMapping coarse_dm = DistributionMap(lev-1); - amrex::BoxArray ba = boxArray(lev); - ba.coarsen(WarpX::RefRatio(lev-1)); - newdm = amrex::MakeSimilarDM(ba, coarse_ba, coarse_dm, getngEB()); - } - Print() << "new dm on lev " << lev << ": \n"; - Print() << newdm << std::endl; - // As specified in the above calls to makeSFC and makeKnapSack, the new - // distribution mapping is NOT communicated to all ranks; the loadbalanced - // dm is up-to-date only on root, and we can decide whether to broadcast - if ((load_balance_efficiency_ratio_threshold > 0.0) - && (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) + + amrex::BoxArray refined_ba = boxArray(0); + for (int lev = 1; lev <= nLevels; ++lev) { - doLoadBalance = (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency); + refined_ba.refine(refRatio(lev-1)); + amrex::BoxList refined_bl = refined_ba.boxList(); + refined_bl.join(boxArray(lev).boxList()); + refined_ba = amrex::BoxArray(refined_bl); } + amrex::Vector<amrex::DistributionMapping> newdm(nLevels+1); + amrex::DistributionMapping r; + if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) + { + std::vector<amrex::Long> cost(rcost.size()); + + amrex::Real wmax = *std::max_element(rcost.begin(), rcost.end()); + amrex::Real scale = (wmax == 0) ? 
1.e9_rt : 1.e9_rt/wmax; + + for (int i = 0; i < rcost.size(); ++i) { + cost[i] = amrex::Long(rcost[i]*scale) + 1L; + } + + // `sort` needs to be false here since there's a parallel reduce function + // in the processor map function, but we are executing only on root + int nprocs = ParallelDescriptor::NProcs(); + r.SFCProcessorMap(refined_ba, cost, nprocs, proposedEfficiency, false); + + amrex::DistributionMapping::ComputeDistributionMappingEfficiency(r, + rcost, + &currentEfficiency); + + if ((load_balance_efficiency_ratio_threshold > 0.0)) + { + doLoadBalance = (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency); + } + amrex::Print() << " doloadbalance : " << doLoadBalance << "\n"; + amrex::Print() << proposedEfficiency << "\n"; + amrex::Print() << currentEfficiency << "\n"; + } ParallelDescriptor::Bcast(&doLoadBalance, 1, ParallelDescriptor::IOProcessorNumber()); if (doLoadBalance) { - Vector<int> pmap; + amrex::Vector<int> pmap(rcost.size()); if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) { - pmap = newdm.ProcessorMap(); + pmap = r.ProcessorMap(); } else { pmap.resize(static_cast<int>(nboxes)); } - ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber()); + // Broadcast vector from which to construct new distribution mapping + amrex::ParallelDescriptor::Bcast(&pmap[0], pmap.size(), ParallelDescriptor::IOProcessorNumber()); - if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) + int lev_start = 0; + for (int lev = 0; lev <= nLevels; ++lev) { - newdm = DistributionMapping(pmap); + amrex::Vector<int> pmap_lev(pmap.begin() + lev_start, + pmap.begin() + lev_start + costs[lev]->size()); + newdm[lev] = amrex::DistributionMapping(pmap_lev); + lev_start += costs[lev]->size(); } - RemakeLevel(lev, t_new[lev], boxArray(lev), newdm); - - // Record the load balance efficiency - setLoadBalanceEfficiency(lev, proposedEfficiency); + for (int lev = 0; lev <= nLevels; ++lev) + { + 
RemakeLevel(lev, t_new[lev], boxArray(lev), newdm[lev]); + setLoadBalanceEfficiency(lev, proposedEfficiency); + } } - loadBalancedAnyLevel = loadBalancedAnyLevel || doLoadBalance; + } else { + //if (do_similar_dm_refpatch) { + // for (int lev = nLevels; lev > 0; --lev) { + // ablastr::coarsen::sample::Coarsen(*costs[lev-1], *costs[lev],0,0,1,0,WarpX::RefRatio(lev-1)); + // } + //} + + for (int lev = 0; lev <= nLevels; ++lev) + { + int doLoadBalance = false; + + // Compute the new distribution mapping + DistributionMapping newdm; + const amrex::Real nboxes = costs[lev]->size(); + const amrex::Real nprocs = ParallelContext::NProcsSub(); + const int nmax = static_cast<int>(std::ceil(nboxes/nprocs*load_balance_knapsack_factor)); + // These store efficiency (meaning, the average 'cost' over all ranks, + // normalized to max cost) for current and proposed distribution mappings + amrex::Real currentEfficiency = 0.0; + amrex::Real proposedEfficiency = 0.0; + + if (lev == 0 || !do_similar_dm_refpatch) { + newdm = (load_balance_with_sfc) + ? 
DistributionMapping::makeSFC(*costs[lev], + currentEfficiency, proposedEfficiency, + false, + ParallelDescriptor::IOProcessorNumber()) + : DistributionMapping::makeKnapSack(*costs[lev], + currentEfficiency, proposedEfficiency, + nmax, + false, + ParallelDescriptor::IOProcessorNumber()); + } else { + amrex::BoxArray coarse_ba = boxArray(lev-1); + amrex::DistributionMapping coarse_dm = DistributionMap(lev-1); + amrex::BoxArray ba = boxArray(lev); + ba.coarsen(WarpX::RefRatio(lev-1)); + newdm = amrex::MakeSimilarDM(ba, coarse_ba, coarse_dm, getngEB()); + } + Print() << "new dm on lev " << lev << ": \n"; + Print() << newdm << std::endl; + // As specified in the above calls to makeSFC and makeKnapSack, the new + // distribution mapping is NOT communicated to all ranks; the loadbalanced + // dm is up-to-date only on root, and we can decide whether to broadcast + if ((load_balance_efficiency_ratio_threshold > 0.0) + && (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) + { + doLoadBalance = (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency); + } + + ParallelDescriptor::Bcast(&doLoadBalance, 1, + ParallelDescriptor::IOProcessorNumber()); + + if (doLoadBalance) + { + Vector<int> pmap; + if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) + { + pmap = newdm.ProcessorMap(); + } else + { + pmap.resize(static_cast<int>(nboxes)); + } + ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber()); + + if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) + { + newdm = DistributionMapping(pmap); + } + + RemakeLevel(lev, t_new[lev], boxArray(lev), newdm); + + // Record the load balance efficiency + setLoadBalanceEfficiency(lev, proposedEfficiency); + } + + loadBalancedAnyLevel = loadBalancedAnyLevel || doLoadBalance; + } } if (loadBalancedAnyLevel) { diff --git a/Source/WarpX.H b/Source/WarpX.H index d24ab99e6aa..3b256fdfb13 100644 --- a/Source/WarpX.H +++ 
b/Source/WarpX.H @@ -1643,6 +1643,8 @@ private: amrex::Real costs_heuristic_particles_wt = amrex::Real(0); /** Whether to use MakeSimilarDM when load balancing lev > 1 */ int do_similar_dm_refpatch = 0; + /** Whether to use SFC on vectorized BoxArrays from all levels */ + int do_SFC_dm_vectorlevel = 0; // Determines timesteps for override sync utils::parser::IntervalsParser override_sync_intervals; diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp index 272e40ef0c4..d7b613da8bd 100644 --- a/Source/WarpX.cpp +++ b/Source/WarpX.cpp @@ -1308,6 +1308,7 @@ WarpX::ReadParameters () load_balance_intervals_string_vec); pp_algo.query("load_balance_with_sfc", load_balance_with_sfc); pp_algo.query("do_similar_dm_refpatch", do_similar_dm_refpatch); + pp_algo.query("do_SFC_dm_vectorlevel",do_SFC_dm_vectorlevel); // Knapsack factor only used with non-SFC strategy if (!load_balance_with_sfc) { pp_algo.query("load_balance_knapsack_factor", load_balance_knapsack_factor);