-
Notifications
You must be signed in to change notification settings - Fork 29
/
MultistepLB_SFC.cpp
285 lines (243 loc) · 9.62 KB
/
MultistepLB_SFC.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#include <charm++.h>
#include "MultistepLB_SFC.h"
#include "ParallelGravity.h"
#include "Vector3D.h"
#include "formatted_string.h"
CkpvExtern(int, _lb_obj_index);
using namespace std;
#if CHARM_VERSION > 61002
static void lbinit()
{
LBRegisterBalancer<MultistepLB_SFC>("MultistepLB_SFC",
"Works best with multistepped runs; uses SFC distribution");
}
#else
CreateLBFunc_Def(MultistepLB_SFC,
"Works best with multistepped runs; uses SFC distribution");
#endif
void MultistepLB_SFC::init() {
lbname = "MultistepLB_SFC";
if (CkpvAccess(_lb_obj_index) == -1)
CkpvAccess(_lb_obj_index) = LBRegisterObjUserData(sizeof(TaggedVector3D));
}
MultistepLB_SFC::MultistepLB_SFC(const CkLBOptions &opt): CBase_MultistepLB_SFC(opt)
{
init();
if (CkMyPe() == 0){
CkPrintf("[%d] MultistepLB_SFC created\n",CkMyPe());
}
}
bool MultistepLB_SFC::QueryBalanceNow(int step){
if(CkMyPe() == 0) CkPrintf("LB_SFC: Step %d\n", step);
return true;
}
/// @brief Implement load balancing: store loads and determine active
/// processors and objects, sort by SFC, then divide up among processors.
/// @param stats The Load Balancer statistics object.
void MultistepLB_SFC::work(BaseLB::LDStats* stats)
{
#if CMK_LBDB_ON
// find active objects - mark the inactive ones as non-migratable
const auto num_objs = stats->objData.size();
if(_lb_args.debug() >= 2 && step() > 0) {
// Write out "particle file" of measured load balance information
auto achFileName = make_formatted_string("lb_a.%d.sim", step()-1);
write_LB_particles(stats, achFileName.c_str(), true);
}
int numActiveObjects = 0;
int numInactiveObjects = 0;
int minActiveProc = INT_MAX;
int maxActiveProc = 0;
for(int i = 0; i < num_objs; i++){
stats->to_proc[i] = stats->from_proc[i];
}
for(int i = 0; i < num_objs; i++){
if (!stats->objData[i].migratable) continue;
LDObjData &odata = stats->objData[i];
TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));
if(udata->myNumParticles == 0){ // ignore pieces with no particles
stats->objData[i].migratable = 0;
stats->n_migrateobjs--;
continue;
}
if(udata->numActiveParticles == 0){
numInactiveObjects++;
}
else{
numActiveObjects++;
if(minActiveProc > stats->from_proc[i])
minActiveProc = stats->from_proc[i];
if(maxActiveProc < stats->from_proc[i])
maxActiveProc = stats->from_proc[i];
}
}
CkPrintf("numActiveObjects: %d, numInactiveObjects: %d\n", numActiveObjects,
numInactiveObjects);
CkPrintf("active PROC range: %d to %d\n", minActiveProc, maxActiveProc);
if(numActiveObjects < 0.1*numInactiveObjects) {
// only a small number of active objects, only migrate them
for(int i = 0; i < stats->objData.size(); i++){
if (!stats->objData[i].migratable) continue;
LDObjData &odata = stats->objData[i];
TaggedVector3D* udata =
(TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));
if(udata->numActiveParticles == 0) {
stats->objData[i].migratable = 0;
stats->n_migrateobjs--;
}
}
}
else {
CkPrintf("Migrating all: numActiveObjects: %d, numInactiveObjects: %d\n",
numActiveObjects, numInactiveObjects);
}
// let the strategy take over on this modified instrumented data and processor information
work2(stats);
#endif //CMK_LDB_ON
}
/// @brief SFC load balance.
void MultistepLB_SFC::work2(BaseLB::LDStats *stats){
const int numobjs = stats->objData.size();
const int nmig = stats->n_migrateobjs;
// this data structure is used by the SFC strategy
// to balance objects. it is NOT indexed by tree piece index
// there are as many entries in it as there are
// migratable (active) tree pieces
vector<SFCObject> tp_array;
tp_array.resize(nmig);
if (_lb_args.debug()>=2) {
CkPrintf("[work2] ready tp_array data structure\n");
}
int numProcessed = 0;
double dBgLoad = 0.0;
for(int i = 0; i < stats->nprocs(); i++){
dBgLoad += stats->procs[i].bg_walltime;
}
dBgLoad /= numobjs;
dTotalLoad = 0.0;
for(int i = 0; i < numobjs; i++){
if(!stats->objData[i].migratable) continue;
float load;
LDObjData &odata = stats->objData[i];
TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));
if(step() == 0){ // no load information, balance by particle numbers
load = udata->myNumParticles;
}
else{
// give each piece a portion of the background load
load = stats->objData[i].wallTime + dBgLoad;
}
tp_array[numProcessed] = SFCObject(i, load);
tp_array[numProcessed].centroid = udata->vec;
numProcessed++;
dTotalLoad += load;
}
if(verbosity > 0)
CkPrintf("Avg active load %g; Avg bg load %g\n", dTotalLoad/numobjs,
dBgLoad);
CkAssert(numProcessed==nmig);
sfcPrepare(tp_array, nmig, stats);
sfcPartition(stats->nprocs(),tp_array, stats);
// refine(stats, numobjs);
Orb_PrintLBStats(stats, numobjs);
if(_lb_args.debug() >= 2) {
// Write out "particle file" of load balance information
auto achFileName = make_formatted_string("lb.%d.sim", step());
write_LB_particles(stats, achFileName.c_str(), false);
}
}
/// @brief Prepare structures for the ORB partition.
/// @param tp_array Reference to Vector of Objects representing TreePieces.
/// @param nObjs Number of tree pieces to partition.
/// @param stats Data from the load balancing framework.
/// @param node_partition Are we partitioning on nodes.
void MultistepLB_SFC::sfcPrepare(vector<SFCObject> &tp_array,
int nObjs,
BaseLB::LDStats *stats,
bool node_partition){
OrientedBox<float> boundingBox;
int nmig = stats->n_migrateobjs;
if(dMaxBalance < 1.0)
dMaxBalance = 1.0;
// If using node based orb partition, then the maxPieceProc is total
// migratable objs / total number of node.
if (node_partition) {
maxPieceProc = dMaxBalance * nmig / CkNumNodes();
} else {
maxPieceProc = dMaxBalance*nmig/stats->nprocs();
}
if(maxPieceProc < 1.0)
maxPieceProc = 1.01;
CkAssert(tp_array.size() == nObjs);
mapping = &stats->to_proc;
from = &stats->from_proc;
CkPrintf("[LB_SFC] sorting\n");
for(int i = 0; i < nObjs; i++)
boundingBox.grow(tp_array[i].centroid);
// N.B. code below from TreePiece::assignKeys().
// Refactoring is a possibility.
// get longest axis
Vector3D<float> bsize = boundingBox.size();
float max = (bsize.x > bsize.y) ? bsize.x : bsize.y;
max = (max > bsize.z) ? max : bsize.z;
//
// Make the bounding box cubical.
//
Vector3D<float> bcenter = boundingBox.center();
// The magic number below is approximately 2^(-19)
const float fEps = 1.0 + 1.91e-6; // slop to ensure keys fall
// between 0 and 1.
bsize = Vector3D<float>(fEps*0.5*max);
boundingBox = OrientedBox<float>(bcenter-bsize, bcenter+bsize);
if(verbosity > 1)
ckout << "TreePiece: Bounding box now: " << boundingBox << endl;
for(unsigned int i = 0; i < nObjs; ++i) {
tp_array[i].key = SFC::generateKey(tp_array[i].centroid, boundingBox);
}
sort(tp_array.begin(),tp_array.end());
}
/// @brief Partition treepieces among processors by
/// dividing the SFC as evenly as possible.
/// @param nprocs Number of processors over which to partition the
/// pieces. N.B. if node_partition is true, then this is the number of nodes.
/// @param tp Vector of TreePiece data.
/// @param stats Load balance data
void MultistepLB_SFC::sfcPartition(int nProcs, vector<SFCObject> & tp,
BaseLB::LDStats *stats,
bool node_partition){
double loadPrev = 0.0; // load on all lower processors
int iCurrPiece = 0; // Piece under consideration
const int nPieces = tp.size();
for (int iProc = 0; iProc < nProcs && iCurrPiece < nPieces; iProc++) {
if (!stats->procs[iProc].available)
continue;
// always assign one piece to a processor
SFCObject &oSFC = tp[iCurrPiece];
(*mapping)[oSFC.lbindex] = iProc;
double loadCurr = oSFC.load;
iCurrPiece++;
int nCurrPiece = 1; // number of pieces on this processor
double loadTarget = (iProc+1)*dTotalLoad/nProcs;
double dLoadError = fabs(loadPrev + loadCurr - loadTarget);
while ((nCurrPiece < maxPieceProc)
&& fabs(tp[iCurrPiece].load + loadPrev + loadCurr - loadTarget)
< dLoadError
&& iCurrPiece < nPieces) { // add pieces to this
// processor to get the
// closest to the target
// load
oSFC = tp[iCurrPiece];
loadCurr += oSFC.load;
(*mapping)[oSFC.lbindex] = iProc;
dLoadError = fabs(loadPrev + loadCurr - loadTarget);
iCurrPiece++;
nCurrPiece++;
}
loadPrev += loadCurr;
}
CkAssert(iCurrPiece == tp.size());
}
void MultistepLB_SFC::pup(PUP::er &p){
CBase_MultistepLB_SFC::pup(p);
}
#include "MultistepLB_SFC.def.h"