-
Notifications
You must be signed in to change notification settings - Fork 28
/
DataManager.h
368 lines (320 loc) · 12 KB
/
DataManager.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
/** @file DataManager.h
The DataManager holds data that TreePieces need access to, but not
their own copy.
@author Graeme Lufkin ([email protected])
*/
#ifndef DATAMANAGER_H
#define DATAMANAGER_H
#include <vector>
#include <map>
#include <string>
#include "GenericTreeNode.h"
#include "ParallelGravity.decl.h"
#if CHARM_VERSION > 60401 && CMK_BALANCED_INJECTION_API
#include "ckBIconfig.h"
#endif
/// @brief Information about TreePieces on an SMP node.
struct TreePieceDescriptor{
TreePiece *treePiece;
Tree::GenericTreeNode *root;
TreePieceDescriptor(){}
TreePieceDescriptor(TreePiece *tp_, GenericTreeNode *r){
treePiece = tp_;
root = r;
}
};
#ifdef CUDA
/// Argument bundle taken (by pointer) by DataManager::updateParticles() and
/// DataManager::updateParticlesFreeMemory(). Only compiled in CUDA builds.
struct UpdateParticlesStruct{
/// Callback carried along with the request.
/// NOTE(review): who invokes and who frees it is not visible in this header -- confirm.
CkCallback *cb;
/// DataManager associated with the request (presumably the one that issued it -- confirm).
DataManager *dm;
/// Buffer of per-particle variable data.
/// NOTE(review): presumably the host buffer filled from the GPU -- confirm against the .cpp.
VariablePartData *buf;
/// Size associated with buf; TODO confirm whether this is an element count or a byte count.
int size;
};
/// Record of one remote chunk's flattened buffers.
/// Stores pointers to the flattened buffers if !gpuFree, i.e. when the
/// transfer cannot be started right away. DataManager keeps these records in
/// currentChunkBuffers and pendingChunkTransferQ, and
/// DataManager::serializeRemoteChunk() returns one.
struct PendingBuffers {
/// Flattened multipole moments for the chunk.
CkVec<CudaMultipoleMoments> *moments;
/// Flattened particle data for the chunk.
CkVec<CompactPartData> *particles;
/// Chunk identifier this record refers to (presumably the remote chunk index -- confirm).
int chunk;
/// Pointer to callback so it can be freed.
CkCallback *cb;
};
#endif
/** The DataManager is used to store information that all TreePieces will need,
but will not modify. The first example is the list of splitter keys and the
responsible chare for each interval. This data is given to the DataManager by
the Sorter. The DataManager then instructs all the TreePieces on its node
to evaluate the boundary keys.
*/
class DataManager : public CBase_DataManager {
// TreePiece and OctTreeBuildPhaseIWorker are granted direct access to the
// private and protected state declared below.
friend class TreePiece;
friend class OctTreeBuildPhaseIWorker;
/// The array of TreePieces I hold data for.
CProxy_TreePiece treePieces;
protected:
/// The array of splitter keys for the sort.
std::vector<SFC::Key> boundaryKeys;
/// An array identifying which chare is responsible for each interval of keys.
std::vector<int> responsibleIndex;
/// An array with how many particles are held by each TreePiece when sorted.
std::vector<int> particleCounts;
/// A list of roots of the TreePieces in this node.
// Each entry is a TreePieceDescriptor, i.e. a TreePiece pointer paired with
// the root node of its tree.
CkVec<TreePieceDescriptor> registeredTreePieces;
/// Signal whether registeredTreePieces needs to be cleaned
/// when combining local trees
bool cleanupTreePieces;
#ifdef CUDA
//CkVec<int> registeredTreePieceIndices;
/// @brief counter for the number of tree nodes that are
/// replicated by TreePieces that share the same address space.
int cumNumReplicatedNodes;
// Counters of TreePieces that have reached a given stage.
// NOTE(review): the exact stage each counter tracks is inferred from its
// name only; the code that updates them is not in this header -- confirm.
int treePiecesDone;
// Chunk number saved for later use (presumably while waiting on the
// counters above -- confirm).
int savedChunk;
int treePiecesDonePrefetch;
int treePiecesDoneLocalComputation;
// XXX - assumes that only one chunk can be on the gpu
// at a given time
int treePiecesDoneRemoteChunkComputation;
int treePiecesWantParticlesBack;
/// Reference count for Pieces that have finished updating
/// their accelerations.
int treePiecesParticlesUpdated;
// Saved totals of particles and nodes (presumably from the last local
// serialization -- confirm).
int savedNumTotalParticles;
int savedNumTotalNodes;
// keeps track of buckets of particles that were
// received during the prefetch and which were subsequently
// shipped off to the gpu - XXX
// not including cached particles in post-prefetch entities shipped to gpu
// since it is hard to count their number given just the pointer to the cache entry
// * either do not concern yourself with cached particles
// * or for each entry, get key, find bucket node in CM, DM or TPs and get number
// for now, the former
std::map<NodeKey, int> cachedPartsOnGpu;
// local particles that have been copied to the gpu
//std::map<NodeKey, int> localPartsOnGpu;
// TreePiece counter for multi-threaded GPU host buffer copy
int treePiecesBufferFilled;
// can the gpu accept a chunk of remote particles/nodes?
bool gpuFree;
/// Callback pointer to pass to HAPI.
CkCallback *localTransferCallback;
// Buffers of the chunk currently being handled (presumably the one
// occupying the GPU while gpuFree is false -- confirm).
PendingBuffers *currentChunkBuffers;
// queue that stores all pending chunk transfers
CkQ<PendingBuffers *> pendingChunkTransferQ;
// last remote chunk's size in moments and particles
int lastChunkMoments;
int lastChunkParticles;
/// host buffer to transfer remote moments to GPU
CudaMultipoleMoments *bufRemoteMoments;
/// host buffer to transfer remote particles to GPU
CompactPartData *bufRemoteParts;
/// Vector to accumulate localMoments for transferring to GPU
CkVec<CudaMultipoleMoments> localMoments;
/// host buffer to transfer local moments to GPU
CudaMultipoleMoments *bufLocalMoments;
/// host buffer to transfer local particles to GPU
CompactPartData *bufLocalParts;
/// host buffer to transfer initial accelerations to GPU
VariablePartData *bufLocalVars;
// Pointers to particle and tree data on GPU
CudaMultipoleMoments *d_localMoments;
CudaMultipoleMoments *d_remoteMoments;
CompactPartData *d_localParts;
CompactPartData *d_remoteParts;
VariablePartData *d_localVars;
// Sizes that go with the moment, compact-particle and variable-particle
// buffers above. TODO confirm units (bytes vs. element counts).
size_t sMoments;
size_t sCompactParts;
size_t sVarParts;
// Number of entries in streams; the destructor destroys exactly this many.
int numStreams;
// Array of CUDA streams, released with delete[] in the destructor.
cudaStream_t *streams;
#endif
/// The root of the combined trees
Tree::GenericTreeNode * root;
/// Table for nodes created by the DataManager to combine trees.
/// Kept track of here so we can delete them when done.
CkVec<GenericTreeNode *> nodeTable;
/// Number of chunks into which the tree was split during the last combine operation
int oldNumChunks;
/// Nodes currently used as roots for remote computation
Tree::NodeKey *chunkRoots;
/// Lookup table for the chunkRoots
Tree::NodeLookupType chunkRootTable;
public:
/*
** Cooling
*/
// Cooling state; handed to CoolFinalize() in the destructor.
COOL *Cool;
/// @brief log of star formation events.
///
/// Star formation events are stored on the data manager since there
/// is no need to migrate them with the TreePiece.
StarLog *starLog;
/// @brief Lock for accessing starlog from TreePieces
CmiNodeLock lockStarLog;
/// Construct for the TreePiece array identified by treePieceID.
DataManager(const CkArrayID& treePieceID);
/// Migration constructor.
DataManager(CkMigrateMessage *);
void startLocalWalk();
void resumeRemoteChunk();
#ifdef CUDA
void createStreams(int _numStreams, const CkCallback& cb);
void donePrefetch(int chunk); // serialize remote chunk wrapper
void serializeLocalTree();
#ifdef GPU_LOCAL_TREE_WALK
void transformLocalTreeRecursive(GenericTreeNode *node, CkVec<CudaMultipoleMoments>& localMoments);
#endif //GPU_LOCAL_TREE_WALK
// actual serialization methods
PendingBuffers *serializeRemoteChunk(GenericTreeNode *);
void serializeLocal(GenericTreeNode *);
void transferLocalToGPU(int nParts, GenericTreeNode *node);
void freeLocalTreeMemory();
void freeRemoteChunkMemory(int chunk);
void transferParticleVarsBack();
void updateParticles(UpdateParticlesStruct *data);
void updateParticlesFreeMemory(UpdateParticlesStruct *data);
void initiateNextChunkTransfer();
// Default constructor, declared in CUDA builds only. Its body is empty, so
// it initializes no members itself.
DataManager(){}
#endif
private:
// Shared initialization helper; defined outside this header.
void init();
public:
/// Release everything this object tracks: delete every node recorded in
/// nodeTable and empty the table, finalize the cooling state, delete the
/// star log and destroy its lock. In CUDA builds, additionally destroy each
/// of the numStreams streams and free the streams array.
// NOTE(review): in CUDA builds the empty default constructor above leaves
// numStreams and streams unset as far as this header shows; verify they are
// always assigned before an object built that way is destroyed.
~DataManager() {
for (unsigned int i = 0; i < nodeTable.length(); i++) {
delete nodeTable[i];
}
nodeTable.clear();
CoolFinalize(Cool);
delete starLog;
CmiDestroyLock(lockStarLog);
#ifdef CUDA
for (int i = 0; i < numStreams; i++) {
cudaStreamDestroy(streams[i]);
}
delete[] streams;
#endif
}
/// Called by ORB Sorter, save the list of which TreePiece is
/// responsible for which interval.
void acceptResponsibleIndex(const int* responsible, const int n,
const CkCallback& cb);
/// Called by the Sorter, I save these final keys and the list
/// of which TreePiece is responsible for which interval.
/// This routine then calls TreePiece::unshuffleParticles to
/// move the particles around.
/// @param keys vector of boundary keys
/// @param responsible vector of which piece is responsible
/// for which interval
/// @param bins number of particles in each interval.
void acceptFinalKeys(const SFC::Key* keys, const int* responsible, uint64_t* bins, const int n, const CkCallback& cb);
/// Pack/unpack this object through the given PUP::er.
void pup(PUP::er& p);
#ifdef CUDA
/*
std::map<NodeKey, int> &getLocalPartsOnGpuTable(){
return localPartsOnGpu;
}
*/
/// Return a mutable reference to the cachedPartsOnGpu table.
std::map<NodeKey, int> &getCachedPartsOnGpuTable(){
return cachedPartsOnGpu;
}
void unmarkTreePiecesForCleanup(const CkCallback& cb);
#endif
// Functions used to create a tree inside the DataManager comprising
// all the trees in the TreePieces in the local node
private:
Tree::GenericTreeNode *buildProcessorTree(int n, Tree::GenericTreeNode **gtn);
int createLookupRoots(Tree::GenericTreeNode *node, Tree::NodeKey *keys);
public:
/// \brief Collect roots of treepieces on this node.
///
/// The roots are stored in registeredTreePieces to be used by
/// combineLocalTrees.
void notifyPresence(Tree::GenericTreeNode *root, TreePiece *treePiece);
void clearRegisteredPieces(const CkCallback& cb);
void combineLocalTrees(CkReductionMsg *msg);
void getChunks(int &num, Tree::NodeKey *&roots);
/// Look up the node registered in chunkRootTable under key k.
/// @return the stored node, or NULL when k is not in the table.
inline Tree::GenericTreeNode *chunkRootToNode(const Tree::NodeKey k) {
NodeLookupType::iterator iter = chunkRootTable.find(k);
if (iter != chunkRootTable.end()) return iter->second;
else return NULL;
}
/// Return the root of the combined trees.
inline Tree::GenericTreeNode *getRoot() { return root; }
void initCooling(double dGmPerCcUnit, double dComovingGmPerCcUnit,
double dErgPerGmUnit, double dSecUnit, double dKpcUnit,
COOLPARAM inParam, const CkCallback& cb);
void initStarLog(std::string _fileName, const CkCallback &cb);
void dmCoolTableRead(double *dTableData, int nData, const CkCallback& cb);
void CoolingSetTime(double z, // redshift
double dTime, // Time
const CkCallback& cb);
void SetStarCM(double dCenterOfMass[4], const CkCallback& cb);
void memoryStats(const CkCallback& cb);
void resetReadOnly(Parameters param, const CkCallback &cb);
public:
// Static helper operating on a list of n candidate nodes; it reports
// results through nUnresolved and pickedIndex.
// NOTE(review): selection rule is defined outside this header -- see the .cpp.
static Tree::GenericTreeNode *pickNodeFromMergeList(int n, GenericTreeNode **gtn, int &nUnresolved, int &pickedIndex);
};
/// Cap the GNI balanced-injection setting and report it.
///
/// Compiled to an empty function unless the Charm++ version is newer than
/// 60401 and CMK_BALANCED_INJECTION_API is enabled. Otherwise rank 0 lowers
/// the setting to GNI_BI_DEFAULT when the current value exceeds it, and
/// PE 0 prints the value that is in effect afterwards.
inline static void setBIconfig()
{
#if CHARM_VERSION > 60401 && CMK_BALANCED_INJECTION_API
#define GNI_BI_DEFAULT 64
    const bool onFirstRank = (CkMyRank() == 0);
    if (onFirstRank) {
        // Only ever lower the value; a setting already at or below the
        // default is left untouched.
        uint16_t currentLevel = ck_get_GNI_BIConfig();
        const bool aboveDefault = currentLevel > GNI_BI_DEFAULT;
        if (aboveDefault)
            ck_set_GNI_BIConfig(GNI_BI_DEFAULT);
    }

    // Everyone except PE 0 stays silent.
    if (CkMyPe() != 0)
        return;
    CkPrintf("Balanced injection is set to %d.\n", ck_get_GNI_BIConfig());
#endif
}
/** @brief Control recording of Charm++ projections logs
*
* The constructors for this class are also used to set default
* node-wide communication parameters.
*/
class ProjectionsControl : public CBase_ProjectionsControl {
    /// Apply the node-wide communication defaults shared by both
    /// constructors: balanced-injection setup, load-balancer communication
    /// tracking off, and (on older Charm++) a zero LB period.
    void applyNodeDefaults() {
        setBIconfig();
        LBTurnCommOff();
#ifndef LB_MANAGER_VERSION
        // Older Charm++ requires this to avoid excessive delays between successive LBs even
        // when using AtSync mode
        LBSetPeriod(0.0);
#endif
    }

 public:
    /// Regular constructor. In CUDA builds it also binds this process to a
    /// GPU before applying the node-wide defaults.
    ProjectionsControl() {
#ifdef CUDA
        // GPUs are assigned to nodes in a round-robin fashion. This allows the user to define
        // one virtual node per device and utilize multiple GPUs on a single node.
        // Because devices are assigned per-PE, this is a convenient place to call setDevice.
        // Note that this code has nothing to do with initializing projections.
        int numGpus = 0;
        const cudaError_t countStatus = cudaGetDeviceCount(&numGpus);
        // Without this check a failed query or a machine with no device
        // would reach "% numGpus" with zero (or garbage) as the divisor.
        if (countStatus != cudaSuccess || numGpus <= 0) {
            CkAbort("ProjectionsControl: no usable CUDA device found");
        }
        cudaSetDevice(CmiMyNode() % numGpus);
#endif
        applyNodeDefaults();
    }

    /// Migration constructor; re-applies the node-wide defaults.
    ProjectionsControl(CkMigrateMessage *m) : CBase_ProjectionsControl(m) {
        applyNodeDefaults();
    }

    /// Start recording projections logs, then contribute to cb.
    /// PE 0 prints a banner first.
    void on(CkCallback cb) {
        if(CkMyPe() == 0){
            CkPrintf("\n\n**** PROJECTIONS ON *****\n\n");
        }
        traceBegin();
        contribute(cb);
    }

    /// Stop recording projections logs, then contribute to cb.
    /// PE 0 prints a banner first.
    void off(CkCallback cb) {
        if(CkMyPe() == 0){
            CkPrintf("\n\n**** PROJECTIONS OFF *****\n\n");
        }
        traceEnd();
        contribute(cb);
    }

    /// Pack/unpack: this class adds no state of its own, so only the base
    /// class is pupped.
    void pup(PUP::er &p){
        CBase_ProjectionsControl::pup(p);
    }
};
#endif //DATAMANAGER_H