NEURON
cellorder2.cpp
Go to the documentation of this file.
1 /*
2 # =============================================================================
3 # Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
4 #
5 # See top-level LICENSE file for details.
6 # =============================================================================
7 */
8 
9 #include <cstdio>
10 #include <map>
11 #include <set>
12 #include <algorithm>
13 #include <cstring>
14 #include <numeric>
15 
16 #if CORENRN_BUILD
19 #else
20 #include "oc/nrnassrt.h"
21 #endif
22 
25 
26 
27 // experiment starting with identical cell ordering
28 // groupindex aleady defined that keeps identical cells together
29 // begin with leaf to root ordering
30 #if CORENRN_BUILD
31 namespace coreneuron {
32 #else
33 namespace neuron {
34 #endif
35 
36 using VTN = VecTNode; // level of nodes
37 using VVTN = std::vector<VTN>; // group of levels
38 using VVVTN = std::vector<VVTN>; // groups
39 
40 // verify level in groups of nident identical nodes
41 void chklevel(VTN& level, size_t nident = 8) {}
42 
43 // first child before second child, etc
44 // if same parent level, then parent order
45 // if not same parent, then earlier parent (no parent earlier than parent)
46 // if same parents, then children order
47 // if no parents then nodevec_index order.
48 static bool sortlevel_cmp(TNode* a, TNode* b) {
49  // when starting with leaf to root order
50  // note that leaves are at max level and all roots at level 0
51  bool result = false;
52  // since cannot have an index < 0, just add 1 to level
53  size_t palevel = a->parent ? 1 + a->parent->level : 0;
54  size_t pblevel = b->parent ? 1 + b->parent->level : 0;
55  if (palevel < pblevel) { // only used when starting leaf to root order
56  result = true; // earlier level first
57  } else if (palevel == pblevel) { // always true when starting root to leaf
58  if (palevel == 0) { // a and b are roots
59  if (a->nodevec_index < b->nodevec_index) {
60  result = true;
61  }
62  } else { // parent order (already sorted with proper treenode_order)
63  if (a->treenode_order < b->treenode_order) { // children order
64  result = true;
65  } else if (a->treenode_order == b->treenode_order) {
67  result = true;
68  }
69  }
70  }
71  }
72  return result;
73 }
74 
75 static void sortlevel(VTN& level) {
76  std::sort(level.begin(), level.end(), sortlevel_cmp);
77 
78  for (size_t i = 0; i < level.size(); ++i) {
79  level[i]->treenode_order = i;
80  }
81 }
82 
83 // TODO: refactor since sortlevel() is traversing the nodes in same order
84 static void set_treenode_order(VVTN& levels) {
85  size_t order = 0;
86  for (auto& level: levels) {
87  for (auto* nd: level) {
88  nd->treenode_order = order++;
89  }
90  }
91 }
92 
93 #if CORENRN_DEBUG
94 // every level starts out with no race conditions involving both
95 // parent and child in the same level. Can we arrange things so that
96 // every level has at least 32 nodes?
97 static size_t g32(TNode* nd) {
98  return nd->nodevec_index / warpsize;
99 }
100 
101 static bool is_parent_race(TNode* nd) { // vitiating
102  size_t pg = g32(nd);
103  for (const auto& child: nd->children) {
104  if (pg == g32(child)) {
105  return true;
106  }
107  }
108  return false;
109 }
110 #endif
111 
112 // less than 32 apart
113 static bool is_parent_race2(TNode* nd) { // vitiating
114  size_t pi = nd->nodevec_index;
115  for (const auto& child: nd->children) {
116  if (child->nodevec_index - pi < warpsize) {
117  return true;
118  }
119  }
120  return false;
121 }
122 
123 #if CORENRN_DEBUG
124 static bool is_child_race(TNode* nd) { // potentially handleable by atomic
125  if (nd->children.size() < 2) {
126  return false;
127  }
128  if (nd->children.size() == 2) {
129  return g32(nd->children[0]) == g32(nd->children[1]);
130  }
131  std::set<size_t> s;
132  for (const auto& child: nd->children) {
133  std::size_t gc = g32(child);
134  if (s.find(gc) != s.end()) {
135  return true;
136  }
137  s.insert(gc);
138  }
139  return false;
140 }
141 #endif
142 
143 static bool is_child_race2(TNode* nd) { // potentially handleable by atomic
144  if (nd->children.size() < 2) {
145  return false;
146  }
147  if (nd->children.size() == 2) {
148  size_t c0 = nd->children[0]->nodevec_index;
149  size_t c1 = nd->children[1]->nodevec_index;
150  c0 = (c0 < c1) ? (c1 - c0) : (c0 - c1);
151  return c0 < warpsize;
152  }
153  size_t ic0 = nd->children[0]->nodevec_index;
154  for (size_t i = 1; i < nd->children.size(); ++i) {
155  size_t ic = nd->children[i]->nodevec_index;
156  if (ic - ic0 < warpsize) {
157  return true;
158  }
159  ic0 = ic;
160  }
161  return false;
162 }
163 
164 size_t dist2child(TNode* nd) {
165  size_t d = 1000;
166  size_t pi = nd->nodevec_index;
167  for (const auto& child: nd->children) {
168  std::size_t d1 = child->nodevec_index - pi;
169  if (d1 < d) {
170  d = d1;
171  }
172  }
173  return d;
174 }
175 
176 // from stackoverflow.com
177 template <typename T>
178 static void move_range(size_t start, size_t length, size_t dst, std::vector<T>& v) {
179  typename std::vector<T>::iterator first, middle, last;
180  if (start < dst) {
181  first = v.begin() + start;
182  middle = first + length;
183  last = v.begin() + dst;
184  } else {
185  first = v.begin() + dst;
186  middle = v.begin() + start;
187  last = middle + length;
188  }
189  std::rotate(first, middle, last);
190 }
191 
192 static void move_nodes(size_t start, size_t length, size_t dst, VTN& nodes) {
193  nrn_assert(dst <= nodes.size());
194  nrn_assert(start + length <= dst);
195  move_range(start, length, dst, nodes);
196 
197  // check correctness of move
198  for (size_t i = start; i < dst - length; ++i) {
199  nrn_assert(nodes[i]->nodevec_index == i + length);
200  }
201  for (size_t i = dst - length; i < dst; ++i) {
202  nrn_assert(nodes[i]->nodevec_index == start + (i - (dst - length)));
203  }
204 
205  // update nodevec_index
206  for (size_t i = start; i < dst; ++i) {
207  nodes[i]->nodevec_index = i;
208  }
209 }
210 
211 #if CORENRN_DEBUG
212 // least number of nodes to move after nd to eliminate prace
213 static size_t need2move(TNode* nd) {
214  size_t d = dist2child(nd);
215  return warpsize - ((nd->nodevec_index % warpsize) + d);
216 }
217 
218 static void how_many_warpsize_groups_have_only_leaves(VTN& nodes) {
219  size_t n = 0;
220  for (size_t i = 0; i < nodes.size(); i += warpsize) {
221  bool r = true;
222  for (size_t j = 0; j < warpsize; ++j) {
223  if (!nodes[i + j]->children.empty()) {
224  r = false;
225  break;
226  }
227  }
228  if (r) {
229  printf("warpsize group %ld starting at level %ld\n", i / warpsize, nodes[i]->level);
230  ++n;
231  }
232  }
233  printf("number of warpsize groups with only leaves = %ld\n", n);
234 }
235 
236 static void pr_race_situation(VTN& nodes) {
237  size_t prace2 = 0;
238  size_t prace = 0;
239  size_t crace = 0;
240  for (size_t i = nodes.size() - 1; nodes[i]->level != 0; --i) {
241  TNode* nd = nodes[i];
242  if (is_parent_race2(nd)) {
243  ++prace2;
244  }
245  if (is_parent_race(nd)) {
246  printf("level=%ld i=%ld d=%ld n=%ld",
247  nd->level,
248  nd->nodevec_index,
249  dist2child(nd),
250  need2move(nd));
251  for (const auto& cnd: nd->children) {
252  printf(" %ld %ld", cnd->level, cnd->nodevec_index);
253  }
254  printf("\n");
255  ++prace;
256  }
257  if (is_child_race(nd)) {
258  ++crace;
259  }
260  }
261  printf("prace=%ld crace=%ld prace2=%ld\n", prace, crace, prace2);
262 }
263 #endif
264 
265 static size_t next_leaf(TNode* nd, VTN& nodes) {
266  size_t i = 0;
267  for (i = nd->nodevec_index - 1; i > 0; --i) {
268  if (nodes[i]->children.empty()) {
269  return i;
270  }
271  }
272  // nrn_assert(i > 0);
273  return 0;
274 }
275 
276 static void checkrace(TNode* nd, VTN& nodes) {
277  for (size_t i = nd->nodevec_index; i < nodes.size(); ++i) {
278  if (is_parent_race2(nodes[i])) {
279  // printf("checkrace %ld\n", i);
280  }
281  }
282 }
283 
284 static bool eliminate_race(TNode* nd, size_t d, VTN& nodes, TNode* look) {
285  // printf("eliminate_race %ld %ld\n", nd->nodevec_index, d);
286  // opportunistically move that number of leaves
287  // error if no leaves left to move.
288  size_t i = look->nodevec_index;
289  while (d > 0) {
290  i = next_leaf(nodes[i], nodes);
291  if (i == 0) {
292  return false;
293  }
294  size_t n = 1;
295  while (nodes[i - 1]->children.empty() && n < d) {
296  --i;
297  ++n;
298  }
299  // printf(" move_nodes src=%ld len=%ld dest=%ld\n", i, n, nd->nodevec_index);
300  move_nodes(i, n, nd->nodevec_index + 1, nodes);
301  d -= n;
302  }
303  checkrace(nd, nodes);
304  return true;
305 }
306 
307 static void eliminate_prace(TNode* nd, VTN& nodes) {
308  size_t d = warpsize - dist2child(nd);
309  bool b = eliminate_race(nd, d, nodes, nd);
310  if (0 && !b) {
311  printf("could not eliminate prace for g=%ld c=%ld l=%ld o=%ld %ld\n",
312  nd->groupindex,
313  nd->cellindex,
314  nd->level,
315  nd->treenode_order,
316  nd->hash);
317  }
318 }
319 
320 static void eliminate_crace(TNode* nd, VTN& nodes) {
321  size_t c0 = nd->children[0]->nodevec_index;
322  size_t c1 = nd->children[1]->nodevec_index;
323  size_t d = warpsize - ((c0 > c1) ? (c0 - c1) : (c1 - c0));
324  TNode* cnd = nd->children[0];
325  bool b = eliminate_race(cnd, d, nodes, nd);
326  if (0 && !b) {
327  printf("could not eliminate crace for g=%ld c=%ld l=%ld o=%ld %ld\n",
328  nd->groupindex,
329  nd->cellindex,
330  nd->level,
331  nd->treenode_order,
332  nd->hash);
333  }
334 }
335 
336 static void question2(VVTN& levels) {
337  // number of compartments in the group
338  std::size_t nnode = std::accumulate(levels.begin(),
339  levels.end(),
340  0,
341  [](std::size_t s, const VTN& l) { return s + l.size(); });
342  VTN nodes(nnode); // store the sorted nodes from analyze function
343  nnode = 0;
344  for (const auto& level: levels) {
345  for (const auto& l: level) {
346  nodes[nnode++] = l;
347  }
348  }
349  for (size_t i = 0; i < nodes.size(); ++i) {
350  nodes[i]->nodevec_index = i;
351  }
352 
353  // how_many_warpsize_groups_have_only_leaves(nodes);
354 
355  // Here we need to make sure that the dependent nodes
356  // belong to separate warps
357 
358  // work backward and check the distance from parent to children.
359  // if parent in different group (warp?) then there is no vitiating race.
360  // if children in different group (warp?) then ther is no race (satisfied by
361  // atomic).
362  // If there is a vitiating race, then figure out how many nodes
363  // need to be inserted just before the parent to avoid the race.
364  // It is not clear if we should prioritize safe nodes (when moved they
365  // do not introduce a race) and/or contiguous nodes (probably, to keep
366  // the low hanging fruit together).
367  // At least, moved nodes should have proper tree order and not themselves
368  // introduce a race at their new location. Leaves are nice in that there
369  // are no restrictions in movement toward higher indices.
370  // Note that unless groups of 32 are inserted, it may be the case that
371  // races are generated at greater indices since otherwise a portion of
372  // each group is placed into the next group. This would not be an issue
373  // if, in fact, the stronger requirement of every parent having
374  // pi (parent index) + 32 <= ci (child index) is demanded instead of merely being in different
375  // warpsize. One nice thing about adding warpsize nodes is that it does not disturb any
376  // existing contiguous groups except the moved group which gets divided between parent
377  // warpsize and child, where the nodes past the parent get same relative indices in the next
378  // warpsize
379 
380  // let's see how well we can do by opportunistically moving leaves to
381  // separate parents from children by warpsize (ie is_parent_prace2 is false)
382  // Hopefully, we won't run out of leaves before eliminating all
383  // is_parent_prace2
384 
385  if (0 && nodes.size() % warpsize != 0) {
386  size_t nnode = nodes.size() - levels[0].size();
387  printf("warp of %ld cells has %ld nodes in last cycle %ld\n",
388  levels[0].size(),
389  nnode % warpsize,
390  nnode / warpsize + 1);
391  }
392 
393  // pr_race_situation(nodes);
394 
395  // eliminate parent and children races using leaves
396  // traverse all the children (no roots)
397  for (size_t i = nodes.size() - 1; i >= levels[0].size(); --i) {
398  TNode* nd = nodes[i];
399  if (is_child_race2(nd)) {
400  eliminate_crace(nd, nodes);
401  i = nd->nodevec_index;
402  }
403  if (is_parent_race2(nd)) {
404  eliminate_prace(nd, nodes);
405  i = nd->nodevec_index;
406  }
407  }
408  // copy nodes indices to treenode_order
409  for (size_t i = 0; i < nodes.size(); ++i) {
410  nodes[i]->treenode_order = i;
411  }
412 }
413 
414 // analyze each group of cells
415 // the cells are grouped based on warp balance (lpt) algorithm
416 static void analyze(VVTN& levels) {
417  // sort each level with respect to parent level order
418  // earliest parent level first.
419 
420  // treenode order can be anything as long as first children < second
421  // children etc.. After sorting a level, the order will be correct for
422  // that level, ranging from [0:level.size]
423  for (auto& level: levels) {
424  chklevel(level); // does nothing
425  for (const auto& nd: level) {
426  for (size_t k = 0; k < nd->children.size(); ++k) {
427  nd->children[k]->treenode_order = k;
428  }
429  }
430  }
431 
432  for (auto& level: levels) {
433  sortlevel(level);
434  chklevel(level); // does nothing
435  }
436 
437  set_treenode_order(levels);
438 }
439 
440 void prgroupsize(VVVTN& groups) {
441 #if CORENRN_DEBUG
442  for (size_t i = 0; i < groups[0].size(); ++i) {
443  printf("%5ld\n", i);
444  for (const auto& group: groups) {
445  printf(" %5ld", group[i].size());
446  }
447  printf("\n");
448  }
449 #endif
450 }
451 
452 // group index primary, treenode_order secondary
453 static bool final_nodevec_cmp(TNode* a, TNode* b) {
454  bool result = false;
455  if (a->groupindex < b->groupindex) {
456  result = true;
457  } else if (a->groupindex == b->groupindex) {
458  if (a->treenode_order < b->treenode_order) {
459  result = true;
460  }
461  }
462  return result;
463 }
464 
465 static void set_nodeindex(VecTNode& nodevec) {
466  for (size_t i = 0; i < nodevec.size(); ++i) {
467  nodevec[i]->nodevec_index = i;
468  }
469 }
470 
471 void group_order2(VecTNode& nodevec, size_t groupsize, size_t ncell) {
472  size_t maxlevel = level_from_root(nodevec);
473 
474  // reset TNode.groupindex
475  size_t nwarp = warp_balance(ncell, nodevec);
476 
477  // work on a cellgroup as a vector of levels. ie only possible race is
478  // two children in same warpsize
479 
480  // every warp deals with a group of cells
481  // the cell dispatching to the available groups is done through the warp_balance function (lpt
482  // algo)
483  VVVTN groups(nwarp ? nwarp : (ncell / groupsize + ((ncell % groupsize) ? 1 : 0)));
484 
485  for (auto& group: groups) {
486  group.resize(maxlevel + 1);
487  }
488 
489  // group the cells according to their groupindex and according to their level (see
490  // level_from_root)
491  for (const auto& nd: nodevec) {
492  groups[nd->groupindex][nd->level].push_back(nd);
493  }
494 
495  prgroupsize(groups); // debugging
496 
497  // deal with each group
498  for (auto& group: groups) {
499  analyze(group);
500  question2(group);
501  }
502 
503  // final nodevec order according to group_index and treenode_order
504  std::sort(nodevec.begin() + ncell, nodevec.end(), final_nodevec_cmp);
505  set_nodeindex(nodevec);
506 }
507 } // namespace coreneuron
static int maxlevel
Definition: clamp.cpp:36
TNode is the tree node that represents the tree of the compartments.
Definition: tnode.hpp:27
size_t groupindex
Cell ID that this compartment belongs to.
Definition: tnode.hpp:58
size_t nodevec_index
Total number of compartments from the current node and below.
Definition: tnode.hpp:37
VecTNode children
Definition: tnode.hpp:32
size_t level
For cell permute 1 (Interleaved):
Definition: tnode.hpp:56
size_t hash
Hash algorith that generates a hash based on the hash of the children and the number of compartments ...
Definition: tnode.hpp:35
size_t cellindex
level of of this compartment in the tree
Definition: tnode.hpp:57
size_t treenode_order
index in nodevec that is set in check() In cell permute 2 this is set as Breadth First traversal
Definition: tnode.hpp:39
TNode * parent
Definition: tnode.hpp:31
#define v
Definition: md1redef.h:11
#define i
Definition: md1redef.h:19
static double order(void *v)
Definition: cvodeobj.cpp:218
static RNG::key_type k
Definition: nrnran123.cpp:9
printf
Definition: extdef.h:5
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
In mechanism libraries, cannot use auto const token = nrn_ensure_model_data_are_sorted(); because the...
Definition: tnode.hpp:17
icycle< ncycle;++icycle) { int istride=stride[icycle];nrn_pragma_acc(loop vector) nrn_pragma_omp(loop bind(parallel)) for(int icore=0;icore< warpsize;++icore) { int i=ii+icore;if(icore< istride) { int ip=GPU_PARENT(i);GPU_RHS(i) -=GPU_B(i) *GPU_RHS(ip);GPU_RHS(i)/=GPU_D(i);} i+=istride;} ii+=istride;} }}void solve_interleaved2(int ith) { NrnThread *nt=nrn_threads+ith;InterleaveInfo &ii=interleave_info[ith];int nwarp=ii.nwarp;if(nwarp==0) return;int ncore=nwarp *warpsize;int *ncycles=ii.cellsize;int *stridedispl=ii.stridedispl;int *strides=ii.stride;int *rootbegin=ii.firstnode;int *nodebegin=ii.lastnode;if(0) { nrn_pragma_acc(parallel loop gang present(nt[0:1], strides[0:nstride], ncycles[0:nwarp], stridedispl[0:nwarp+1], rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) async(nt->stream_id)) nrn_pragma_omp(target teams loop map(present, alloc:nt[:1], strides[:nstride], ncycles[:nwarp], stridedispl[:nwarp+1], rootbegin[:nwarp+1], nodebegin[:nwarp+1])) for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} nrn_pragma_acc(wait(nt->stream_id)) } else { for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} }}void solve_interleaved1(int ith) { NrnThread *nt=nrn_threads+ith;int ncell=nt-> ncell
Definition: cellorder.cpp:784
static void move_range(size_t start, size_t length, size_t dst, std::vector< T > &v)
Definition: cellorder2.cpp:178
static bool is_parent_race2(TNode *nd)
Definition: cellorder2.cpp:113
void group_order2(VecTNode &, size_t groupsize, size_t ncell)
Implementation of the advanced interleaving strategy (interleave_permute_type == 2)
Definition: cellorder2.cpp:471
static void set_nodeindex(VecTNode &nodevec)
Definition: cellorder2.cpp:465
size_t level_from_root(VecTNode &)
Definition: cellorder1.cpp:221
std::vector< VTN > VVTN
Definition: cellorder2.cpp:37
int ic
Definition: cellorder.cpp:619
size_t warp_balance(size_t ncell, VecTNode &nodevec)
Use of the LPT (Least Processing Time) algorithm to create balanced groups of cells.
Definition: balance.cpp:52
static void analyze(VVTN &levels)
Definition: cellorder2.cpp:416
void chklevel(VTN &level, size_t nident=8)
Definition: cellorder2.cpp:41
static void set_treenode_order(VVTN &levels)
Definition: cellorder2.cpp:84
static bool is_child_race2(TNode *nd)
Definition: cellorder2.cpp:143
static void move_nodes(size_t start, size_t length, size_t dst, VTN &nodes)
Definition: cellorder2.cpp:192
static void checkrace(TNode *nd, VTN &nodes)
Definition: cellorder2.cpp:276
static size_t groupsize
Definition: cellorder1.cpp:43
static void sortlevel(VTN &level)
Definition: cellorder2.cpp:75
std::vector< VVTN > VVVTN
Definition: cellorder2.cpp:38
static size_t next_leaf(TNode *nd, VTN &nodes)
Definition: cellorder2.cpp:265
static bool eliminate_race(TNode *nd, size_t d, VTN &nodes, TNode *look)
Definition: cellorder2.cpp:284
VecTNode VTN
Definition: cellorder2.cpp:36
static void question2(VVTN &levels)
Definition: cellorder2.cpp:336
static bool sortlevel_cmp(TNode *a, TNode *b)
Definition: cellorder2.cpp:48
void prgroupsize(VVVTN &groups)
Definition: cellorder2.cpp:440
static void eliminate_prace(TNode *nd, VTN &nodes)
Definition: cellorder2.cpp:307
size_t dist2child(TNode *nd)
Definition: cellorder2.cpp:164
static bool final_nodevec_cmp(TNode *a, TNode *b)
Definition: cellorder2.cpp:453
static void eliminate_crace(TNode *nd, VTN &nodes)
Definition: cellorder2.cpp:320
std::vector< TNode * > VecTNode
Definition: tnode.hpp:21
#define nrn_assert(x)
assert()-like macro, independent of NDEBUG status
Definition: nrn_assert.h:33
int const size_t const size_t n
Definition: nrngsl.h:10
size_t j
s
Definition: multisend.cpp:521
static double look(void *v)
Definition: ocbbs.cpp:423
static double children(void *v)
Definition: seclist.cpp:96
#define warpsize
Definition: tnode.hpp:89