NEURON
cellorder.cpp
Go to the documentation of this file.
1 /*
2 # =============================================================================
3 # Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
4 #
5 # See top-level LICENSE file for details.
6 # =============================================================================
7 */
#include <limits>
#include <set>
#include <vector>
10 
11 #if CORENRN_BUILD
12 #include "coreneuron/nrnconf.h"
16 #else
17 #include "nrnoc/multicore.h"
18 #include "nrnoc/nrn_ansi.h"
19 #include "nrnoc/nrniv_mf.h"
20 #include "nrnoc/section.h"
21 #include "oc/nrnassrt.h"
23 #endif
24 
27 #include "coreneuron/utils/lpt.hpp"
30 
31 #include "coreneuron/permute/node_permute.h" // for print_quality
32 
33 #ifdef _OPENACC
34 #include <openacc.h>
35 #endif
36 
37 #if CORENRN_BUILD
38 namespace coreneuron {
39 #else
40 namespace neuron {
41 #endif
43 InterleaveInfo* interleave_info; // nrn_nthread array
44 
45 
46 void InterleaveInfo::swap(InterleaveInfo& info) {
47  std::swap(nwarp, info.nwarp);
48  std::swap(nstride, info.nstride);
49 
50  std::swap(stridedispl, info.stridedispl);
51  std::swap(stride, info.stride);
52  std::swap(firstnode, info.firstnode);
53  std::swap(lastnode, info.lastnode);
54  std::swap(cellsize, info.cellsize);
55 
56  std::swap(nnode, info.nnode);
57  std::swap(ncycle, info.ncycle);
58  std::swap(idle, info.idle);
59  std::swap(cache_access, info.cache_access);
60  std::swap(child_race, info.child_race);
61 }
62 
63 InterleaveInfo::InterleaveInfo(const InterleaveInfo& info) {
64  nwarp = info.nwarp;
65  nstride = info.nstride;
66 
67  copy_align_array(stridedispl, info.stridedispl, nwarp + 1);
69  copy_align_array(firstnode, info.firstnode, nwarp + 1);
70  copy_align_array(lastnode, info.lastnode, nwarp + 1);
71  copy_align_array(cellsize, info.cellsize, nwarp);
72 
73  copy_array(nnode, info.nnode, nwarp);
74  copy_array(ncycle, info.ncycle, nwarp);
75  copy_array(idle, info.idle, nwarp);
76  copy_array(cache_access, info.cache_access, nwarp);
77  copy_array(child_race, info.child_race, nwarp);
78 }
79 
80 InterleaveInfo& InterleaveInfo::operator=(const InterleaveInfo& info) {
81  // self assignment
82  if (this == &info)
83  return *this;
84 
85  InterleaveInfo temp(info);
86 
87  this->swap(temp);
88  return *this;
89 }
90 
91 InterleaveInfo::~InterleaveInfo() {
92  if (stride) {
97  }
98  if (stridedispl) {
100  }
101  if (idle) {
102  delete[] nnode;
103  delete[] ncycle;
104  delete[] idle;
105  delete[] cache_access;
106  delete[] child_race;
107  }
108 }
109 
113 }
114 
116  if (interleave_info) {
117  delete[] interleave_info;
118  interleave_info = nullptr;
119  }
120 }
121 
122 // more precise visualization of the warp quality
123 // can be called after admin2
// Classify every (cycle, core) slot of warp iwarp and publish the resulting
// statistics into ii.nnode/ncycle/idle/cache_access/child_race[iwarp].
// p is the permuted parent-index array. For this layout ii.lastnode[] holds
// the first node handled by each warp (solve_interleaved2 reads ii.lastnode
// as its "nodebegin" array).
static void print_quality2(int iwarp, InterleaveInfo& ii, int* p) {
    // Legend for the (normally disabled) per-cycle picture:
    // '.' p[i] == p[i-1] + 1 (but first of cacheline is 'o')
    // 'o' p[i] != p[i-1] + 1 and not a child race
    // 'r' p[i] == p[i-1] + 1 and race
    // 'R' p[i] != p[i-1] + 1 and race
    // 'X' core is unused
    int pc = (iwarp == 0);  // print warp 0
    pc = 0;                 // turn off printing
    int nodebegin = ii.lastnode[iwarp];               // first node of this warp
    int* stride = ii.stride + ii.stridedispl[iwarp];  // this warp's stride slice
    int ncycle = ii.cellsize[iwarp];                  // cycles (tree depth) of this warp

    int inode = nodebegin;

    size_t nn = 0;          // number of nodes in warp. '.oRr'
    size_t nx = 0;          // number of idle cores on all cycles. 'X'
    size_t ncacheline = 0;  // number of parent memory cacheline accesses.
    // assume warpsize is max number in a cacheline so all o
    size_t ncr = 0;  // number of child race. nchild-1 of same parent in same cycle

    for (int icycle = 0; icycle < ncycle; ++icycle) {
        int s = stride[icycle];  // number of active cores this cycle
        int lastp = -2;          // parent seen by previous core; -2 so first slot is 'o'
        if (pc)
            printf(" ");
        std::set<int> crace;  // how many children have same parent in a cycle
        for (int icore = 0; icore < warpsize; ++icore) {
            char ch = '.';
            if (icore < s) {
                int par = p[inode];
                // a second (or later) child of the same parent within one
                // cycle is a race on the parent's D/RHS updates
                if (crace.find(par) != crace.end()) {
                    ch = 'r';
                    ++ncr;
                } else {
                    crace.insert(par);
                }

                // non-contiguous parent index costs a fresh cacheline access
                if (par != lastp + 1) {
                    ch = (ch == 'r') ? 'R' : 'o';
                    ++ncacheline;
                }
                lastp = p[inode++];
                ++nn;
            } else {  // core idle this cycle
                ch = 'X';
                ++nx;
            }
            if (pc)
                printf("%c", ch);
        }
        if (pc)
            printf("\n");
    }

    // publish per-warp statistics (consumed by warp_balance)
    ii.nnode[iwarp] = nn;
    ii.ncycle[iwarp] = size_t(ncycle);
    ii.idle[iwarp] = nx;
    ii.cache_access[iwarp] = ncacheline;
    ii.child_race[iwarp] = ncr;
    if (pc)
        printf("warp %d: %ld nodes, %d cycles, %ld idle, %ld cache access, %ld child races\n",
               iwarp,
               nn,
               ncycle,
               nx,
               ncacheline,
               ncr);
}
192 
// Classify every (cycle, core) slot of warp iwarp for the cell-per-core
// (permute type 1) layout and publish the statistics into
// ii.nnode/ncycle/idle/cache_access[iwarp]. p is the permuted parent-index
// array; each core of the warp works on one cell.
static void print_quality1(int iwarp, InterleaveInfo& ii, int ncell, int* p) {
    int pc = ((iwarp == 0) || iwarp == (ii.nwarp - 1));  // warp not to skip printing
    pc = 0;  // turn off printing.
    int* stride = ii.stride;
    int cellbegin = iwarp * warpsize;  // first cell handled by this warp
    int cellend = cellbegin + warpsize;
    // clamp the last warp; stride[0] is presumably ncell — TODO confirm
    cellend = (cellend < stride[0]) ? cellend : stride[0];

    // ncycle = deepest cell of the warp; the asserts rely on the cells being
    // ordered smallest first, so the warp's last cell is its deepest
    int ncycle = 0;
    for (int i = cellbegin; i < cellend; ++i) {
        if (ncycle < ii.cellsize[i]) {
            ncycle = ii.cellsize[i];
        }
    }
    nrn_assert(ncycle == ii.cellsize[cellend - 1]);
    nrn_assert(ncycle <= ii.nstride);

    int ncell_in_warp = cellend - cellbegin;

    size_t n = 0;           // number of nodes in warp (not including roots)
    size_t nx = 0;          // number of idle cores on all cycles. X
    size_t ncacheline = 0;  // number of parent memory cacheline accesses. assume warpsize is max
    // number in a cacheline so first core has all o

    int inode = ii.firstnode[cellbegin];
    for (int icycle = 0; icycle < ncycle; ++icycle) {
        // cells shorter than the current cycle have finished; sbegin is the
        // first core of this warp that is still active
        int sbegin = ncell - stride[icycle] - cellbegin;
        int lastp = -2;  // so the first active core always counts a cacheline
        if (pc)
            printf(" ");
        for (int icore = 0; icore < warpsize; ++icore) {
            char ch = '.';
            if (icore < ncell_in_warp && icore >= sbegin) {
                int par = p[inode + icore];
                if (par != lastp + 1) {  // non-contiguous parent -> new cacheline
                    ch = 'o';
                    ++ncacheline;
                }
                lastp = par;
                ++n;
            } else {  // idle core this cycle
                ch = 'X';
                ++nx;
            }
            if (pc)
                printf("%c", ch);
        }
        if (pc)
            printf("\n");
        inode += ii.stride[icycle + 1];  // advance to the next cycle's nodes
    }

    // publish per-warp statistics; child races are not tracked for this
    // layout (each core owns a whole cell), so the count is fixed at 0
    ii.nnode[iwarp] = n;
    ii.ncycle[iwarp] = (size_t) ncycle;
    ii.idle[iwarp] = nx;
    ii.cache_access[iwarp] = ncacheline;
    ii.child_race[iwarp] = 0;
    if (pc)
        printf("warp %d: %ld nodes, %d cycles, %ld idle, %ld cache access\n",
               iwarp,
               n,
               ncycle,
               nx,
               ncacheline);
}
258 
259 static void warp_balance(int ith, InterleaveInfo& ii) {
260  size_t nwarp = size_t(ii.nwarp);
261  size_t smm[4][3]; // sum_min_max see cp below
262  for (size_t j = 0; j < 4; ++j) {
263  smm[j][0] = 0;
264  smm[j][1] = 1000000000;
265  smm[j][2] = 0;
266  }
267  double emax = 0.0, emin = 1.0;
268  for (size_t i = 0; i < nwarp; ++i) {
269  size_t n = ii.nnode[i];
270  double e = double(n) / (n + ii.idle[i]);
271  if (emax < e) {
272  emax = e;
273  }
274  if (emin > e) {
275  emin = e;
276  }
277  size_t s[4] = {n, ii.idle[i], ii.cache_access[i], ii.child_race[i]};
278  for (size_t j = 0; j < 4; ++j) {
279  smm[j][0] += s[j];
280  if (smm[j][1] > s[j]) {
281  smm[j][1] = s[j];
282  }
283  if (smm[j][2] < s[j]) {
284  smm[j][2] = s[j];
285  }
286  }
287  }
288  std::vector<size_t> v(nwarp);
289  for (size_t i = 0; i < nwarp; ++i) {
290  v[i] = ii.ncycle[i];
291  }
292  double bal = load_balance(v);
293 #ifdef DEBUG
294  printf(
295  "thread %d nwarp=%ld balance=%g warp_efficiency %g to %g\n", ith, nwarp, bal, emin, emax);
296  const char* cp[4] = {"nodes", "idle", "ca", "cr"};
297  for (size_t i = 0; i < 4; ++i) {
298  printf(" %s=%ld (%ld:%ld)", cp[i], smm[i][0], smm[i][1], smm[i][2]);
299  }
300  printf("\n");
301 #else
302  (void) bal; // Remove warning about unused
303 #endif
304 }
305 
306 #if !CORENRN_BUILD
// Debug dump of one thread's node tables: for every node its Node pointer,
// v_node_index and parent index, then for every mechanism in nt.tml the
// per-instance node/pdata/prop pointers. Referenced only from the
// commented-out "before perm"/"after perm" calls in nrn_permute_node_order.
static void prnode(const char* mes, NrnThread& nt) {
    printf("%s nrnthread %d node info\n", mes, nt.id);
    for (int i = 0; i < nt.end; ++i) {
        printf(
            " _v_node[%2d]->v_node_index=%2d %p"
            " _v_parent[%2d]->v_node_index=%2d parent[%2d]=%2d\n",
            i,
            nt._v_node[i]->v_node_index,
            nt._v_node[i],
            i,
            // the per-cell root nodes have no parent pointer
            nt._v_parent[i] ? nt._v_parent[i]->v_node_index : -1,
            i,
            nt._v_parent_index[i]);
    }
    // per-mechanism membrane lists
    for (auto tml = nt.tml; tml; tml = tml->next) {
        Memb_list* ml = tml->ml;
        printf(" %s %d\n", memb_func[tml->index].sym->name, ml->nodecount);
        for (int i = 0; i < ml->nodecount; ++i) {
            printf(" %2d ndindex=%2d nd=%p [%2d] pdata=%p prop=%p\n",
                   i,
                   ml->nodeindices[i],
                   ml->nodelist[i],
                   ml->nodelist[i]->v_node_index,
                   ml->pdata[i],
                   ml->prop[i]);
        }
    }
}
335 
337  if (type != interleave_permute_type) {
338  tree_changed = 1; // calls setup_topology. v_stucture_change = 1 may be better.
339  }
341  return type;
342 }
343 #endif // !CORENRN_BUILD
344 
345 #if CORENRN_BUILD
346 int* interleave_order(int ith, int ncell, int nnode, int* parent) {
347 #else
348 std::vector<int> interleave_order(int ith, int ncell, int nnode, int* parent) {
349 #endif
350  // return if there are no nodes to permute
351  if (nnode <= 0)
352  return {};
353 
354  // ensure parent of root = -1
355  for (int i = 0; i < ncell; ++i) {
356  if (parent[i] == 0) {
357  parent[i] = -1;
358  }
359  }
360 
361  int nwarp = 0, nstride = 0, *stride = nullptr, *firstnode = nullptr;
362  int *lastnode = nullptr, *cellsize = nullptr, *stridedispl = nullptr;
363 
364  auto order = node_order(
365  ncell, nnode, parent, nwarp, nstride, stride, firstnode, lastnode, cellsize, stridedispl);
366 
367  if (interleave_info) {
369  ii.nwarp = nwarp;
370  ii.nstride = nstride;
371  ii.stridedispl = stridedispl;
372  ii.stride = stride;
373  ii.firstnode = firstnode;
374  ii.lastnode = lastnode;
375  ii.cellsize = cellsize;
376  if (0 && ith == 0 && interleave_permute_type == 1) {
377  printf("ith=%d nstride=%d ncell=%d nnode=%d\n", ith, nstride, ncell, nnode);
378  for (int i = 0; i < ncell; ++i) {
379  printf("icell=%d cellsize=%d first=%d last=%d\n",
380  i,
381  cellsize[i],
382  firstnode[i],
383  lastnode[i]);
384  }
385  for (int i = 0; i < nstride; ++i) {
386  printf("istride=%d stride=%d\n", i, stride[i]);
387  }
388  }
389  if (ith == 0) {
390  // needed for print_quality[12] and done once here to save time
391  std::vector<int> p(nnode);
392 
393  for (int i = 0; i < nnode; ++i) {
394  p[i] = parent[i];
395  }
396 #if CORENRN_BUILD
397  permute_ptr(p.data(), p.size(), order);
398  node_permute(p.data(), p.size(), order);
399 #else
401  update_parent_index(p.data(), p.size(), order);
402 #endif
403 
404  ii.nnode = new size_t[nwarp];
405  ii.ncycle = new size_t[nwarp];
406  ii.idle = new size_t[nwarp];
407  ii.cache_access = new size_t[nwarp];
408  ii.child_race = new size_t[nwarp];
409  for (int i = 0; i < nwarp; ++i) {
410  if (interleave_permute_type == 1) {
411  print_quality1(i, interleave_info[ith], ncell, p.data());
412  }
413  if (interleave_permute_type == 2) {
414  print_quality2(i, interleave_info[ith], p.data());
415  }
416  }
417  warp_balance(ith, interleave_info[ith]);
418  }
419  }
420 
421  return order;
422 }
423 
424 #if !CORENRN_BUILD
427  return;
428  }
429  // printf("enter nrn_permute_node_order\n");
432  for (int tid = 0; tid < nrn_nthread; ++tid) {
433  auto& nt = nrn_threads[tid];
434  auto perm = interleave_order(tid, nt.ncell, nt.end, nt._v_parent_index);
435  auto p = inverse_permute_vector(perm);
436 #if 0
437  for (int i = 0; i < nt.end; ++i) {
438  int x = nt._v_parent_index[p[i]];
439  int par = x >= 0 ? perm[x] : -1;
440  printf("%2d <- %2d parent=%2d\n", i, p[i], par);
441  }
442 #endif
443  // prnode("before perm", nt);
444  forward_permute(nt._v_node, nt.end, p);
445  forward_permute(nt._v_parent, nt.end, p);
446  forward_permute(nt._v_parent_index, nt.end, p);
447  update_parent_index(nt._v_parent_index, nt.end, perm);
448  for (int i = 0; i < nt.end; ++i) {
449  nt._v_node[i]->v_node_index = i;
450  }
451  for (auto tml = nt.tml; tml; tml = tml->next) {
452  Memb_list* ml = tml->ml;
453  for (int i = 0; i < ml->nodecount; ++i) {
454  ml->nodeindices[i] = perm[ml->nodeindices[i]];
455  }
456  sort_ml(ml); // all fields in increasing nodeindex order
457  }
458  // prnode("after perm", nt);
459  }
460  // printf("leave nrn_permute_node_order\n");
461 }
462 #endif // !CORENRN_BUILD
463 
464 #if INTERLEAVE_DEBUG // only the cell per core style
// Build, for each cell of nt, the array of its node indices in interleave
// order, cross-checking consistency against the InterleaveInfo metadata via
// nrn_assert. Returns a freshly allocated ncell-long array of per-cell index
// arrays; the caller owns the result (cellindices[i] has ii.cellsize[i]
// entries). Debug-only helper (compiled under INTERLEAVE_DEBUG).
static int** cell_indices_debug(NrnThread& nt, InterleaveInfo& ii) {
    int ncell = nt.ncell;
    int nnode = nt.end;
    int* parents = nt._v_parent_index;

    // we expect the nodes to be interleave ordered with smallest cell first
    // establish consistency with ii.
    // first ncell parents are -1
    for (int i = 0; i < ncell; ++i) {
        nrn_assert(parents[i] == -1);
    }
    int* sz = new int[ncell];    // per-cell node count (excluding the root)
    int* cell = new int[nnode];  // owning cell id of each node
    for (int i = 0; i < ncell; ++i) {
        sz[i] = 0;
        cell[i] = i;  // the first ncell nodes are the roots, one per cell
    }
    for (int i = ncell; i < nnode; ++i) {
        // a node belongs to the same cell as its parent
        cell[i] = cell[parents[i]];
        sz[cell[i]] += 1;
    }

    // cells are in increasing sz order;
    for (int i = 1; i < ncell; ++i) {
        nrn_assert(sz[i - 1] <= sz[i]);
    }
    // same as ii.cellsize
    for (int i = 0; i < ncell; ++i) {
        nrn_assert(sz[i] == ii.cellsize[i]);
    }

    int** cellindices = new int*[ncell];
    for (int i = 0; i < ncell; ++i) {
        cellindices[i] = new int[sz[i]];
        sz[i] = 0;  // restart sz counts
    }
    // second pass: fill each cell's index list in node order
    for (int i = ncell; i < nnode; ++i) {
        cellindices[cell[i]][sz[cell[i]]] = i;
        sz[cell[i]] += 1;
    }
    // cellindices first and last same as ii first and last
    for (int i = 0; i < ncell; ++i) {
        nrn_assert(cellindices[i][0] == ii.firstnode[i]);
        nrn_assert(cellindices[i][sz[i] - 1] == ii.lastnode[i]);
    }

    delete[] sz;
    delete[] cell;

    return cellindices;
}
516 
517 static int*** cell_indices_threads;
518 void mk_cell_indices() {
519  cell_indices_threads = new int**[nrn_nthread];
520  for (int i = 0; i < nrn_nthread; ++i) {
521  NrnThread& nt = nrn_threads[i];
522  if (nt.ncell) {
523  cell_indices_threads[i] = cell_indices_debug(nt, interleave_info[i]);
524  } else {
525  cell_indices_threads[i] = nullptr;
526  }
527  }
528 }
529 #endif // INTERLEAVE_DEBUG
530 
531 #if CORENRN_BUILD
532 #define GPU_V(i) nt->_actual_v[i]
533 #define GPU_A(i) nt->_actual_a[i]
534 #define GPU_B(i) nt->_actual_b[i]
535 #define GPU_D(i) nt->_actual_d[i]
536 #define GPU_RHS(i) nt->_actual_rhs[i]
537 #else
538 #define GPU_V(i) vec_v[i]
539 #define GPU_A(i) vec_a[i]
540 #define GPU_B(i) vec_b[i]
541 #define GPU_D(i) vec_d[i]
542 #define GPU_RHS(i) vec_rhs[i]
543 #endif
544 #define GPU_PARENT(i) nt->_v_parent_index[i]
545 
546 // How does the interleaved permutation with stride get used in
547 // triagularization?
548 
549 // each cell in parallel regardless of inhomogeneous topology
551  int icell,
552  int icellsize,
553  int nstride,
554  int* stride,
555  int* lastnode) {
556 #if !CORENRN_BUILD
557  auto* const vec_a = nt->node_a_storage();
558  auto* const vec_b = nt->node_b_storage();
559  auto* const vec_d = nt->node_d_storage();
560  auto* const vec_rhs = nt->node_rhs_storage();
561 #endif
562  int i = lastnode[icell];
563  for (int istride = nstride - 1; istride >= 0; --istride) {
564  if (istride < icellsize) { // only first icellsize strides matter
565  // what is the index
566  int ip = GPU_PARENT(i);
567 #ifndef CORENEURON_ENABLE_GPU
568  nrn_assert(ip >= 0); // if (ip < 0) return;
569 #endif
570  double p = GPU_A(i) / GPU_D(i);
571  GPU_D(ip) -= p * GPU_B(i);
572  GPU_RHS(ip) -= p * GPU_RHS(i);
573  i -= stride[istride];
574  }
575  }
576 }
577 
// Back-substitution phase of the Hines solve for a single cell (permute
// type 1): after triangularization, solve the root then walk the cell's
// nodes shallow-to-deep updating RHS from each node's parent. The nstride
// parameter is unused; the walk is bounded by icellsize.
static void bksub_interleaved(NrnThread* nt,
                              int icell,
                              int icellsize,
                              int /* nstride */,
                              int* stride,
                              int* firstnode) {
#if !CORENRN_BUILD
    // NEURON build: fetch the SoA vectors the GPU_* macros index into.
    // NOTE(review): vec_a is fetched but unused here (only GPU_B/D/RHS are
    // referenced) — presumably kept for symmetry with triang_interleaved.
    auto* const vec_a = nt->node_a_storage();
    auto* const vec_b = nt->node_b_storage();
    auto* const vec_d = nt->node_d_storage();
    auto* const vec_rhs = nt->node_rhs_storage();
#endif
    int i = firstnode[icell];
    GPU_RHS(icell) /= GPU_D(icell);  // the root
    for (int istride = 0; istride < icellsize; ++istride) {
        int ip = GPU_PARENT(i);
#ifndef CORENEURON_ENABLE_GPU
        nrn_assert(ip >= 0);  // every non-root node must have a parent
#endif
        GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip);
        GPU_RHS(i) /= GPU_D(i);
        // jump to this cell's node on the next (deeper) level
        i += stride[istride + 1];
    }
}
603 
604 nrn_pragma_acc(routine vector)
605 static void solve_interleaved2_loop_body(NrnThread* nt,
606  int icore,
607  int* ncycles,
608  int* strides,
610  int* rootbegin,
611  int* nodebegin) {
612 #if !CORENRN_BUILD
613  auto* const vec_a = nt->node_a_storage();
614  auto* const vec_b = nt->node_b_storage();
615  auto* const vec_d = nt->node_d_storage();
616  auto* const vec_rhs = nt->node_rhs_storage();
617 #endif
618  int iwarp = icore / warpsize; // figure out the >> value
619  int ic = icore & (warpsize - 1); // figure out the & mask
622  int root = rootbegin[iwarp]; // cell ID -> [0, ncell)
626 
627  // triang_interleaved2
628  {
629  int icycle = ncycle - 1;
630  int istride = stride[icycle];
631  int ii = lastnode - istride + ic;
632 
633  // execute until all tree depths are executed
635 
636  nrn_pragma_acc(loop seq)
637  for (; has_subtrees_to_compute;) { // ncycle loop
638  // serial test, gpu does this in parallel
639  nrn_pragma_acc(loop vector)
640  nrn_pragma_omp(loop bind(parallel))
641  for (int icore = 0; icore < warpsize; ++icore) {
642  int i = ii + icore;
643  if (icore < istride) { // most efficient if istride equal warpsize
644  // what is the index
645  int ip = GPU_PARENT(i);
646  double p = GPU_A(i) / GPU_D(i);
647  nrn_pragma_acc(atomic update)
648  nrn_pragma_omp(atomic update)
649  GPU_D(ip) -= p * GPU_B(i);
650  nrn_pragma_acc(atomic update)
651  nrn_pragma_omp(atomic update)
652  GPU_RHS(ip) -= p * GPU_RHS(i);
653  }
654  }
655  // if finished with all tree depths then ready to break
656  // (note that break is not allowed in OpenACC)
657  if (icycle == 0) {
658  has_subtrees_to_compute = false;
659  continue;
660  }
661  --icycle;
662  istride = stride[icycle];
663  ii -= istride;
664  }
665  }
666  // bksub_interleaved2
667  auto const bksub_root = root + ic;
668  {
669  nrn_pragma_acc(loop seq)
670  for (int i = bksub_root; i < lastroot; i += 1) {
671  GPU_RHS(i) /= GPU_D(i); // the root
672  }
673 
674  int i = firstnode + ic;
675  int ii = i;
676  nrn_pragma_acc(loop seq)
677  for (int icycle = 0; icycle < ncycle; ++icycle) {
678  int istride = stride[icycle];
679  // serial test, gpu does this in parallel
680  nrn_pragma_acc(loop vector)
681  nrn_pragma_omp(loop bind(parallel))
682  for (int icore = 0; icore < warpsize; ++icore) {
683  int i = ii + icore;
684  if (icore < istride) {
685  int ip = GPU_PARENT(i);
686  GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip);
687  GPU_RHS(i) /= GPU_D(i);
688  }
689  i += istride;
690  }
691  ii += istride;
692  }
693  }
694 }
695 
696 /**
697  * \brief Solve Hines matrices/cells with compartment-based granularity.
698  *
699  * The node ordering/permutation guarantees cell interleaving (as much coalesced memory access as
700  * possible) and balanced warps (through the use of lpt algorithm to define the groups/warps). Every
701  * warp deals with a group of cells, therefore multiple compartments (finer level of parallelism).
702  */
703 void solve_interleaved2(int ith) {
704  NrnThread* nt = nrn_threads + ith;
705  InterleaveInfo& ii = interleave_info[ith];
706  int nwarp = ii.nwarp;
707  if (nwarp == 0)
708  return;
709 
710  int ncore = nwarp * warpsize;
711 
712 #ifdef _OPENACC
714  auto* d_nt = static_cast<NrnThread*>(acc_deviceptr(nt));
715  auto* d_info = static_cast<InterleaveInfo*>(acc_deviceptr(interleave_info + ith));
716  solve_interleaved2_launcher(d_nt, d_info, ncore, acc_get_cuda_stream(nt->stream_id));
717  } else {
718 #endif
719  int* ncycles = ii.cellsize; // nwarp of these
720  int* stridedispl = ii.stridedispl; // nwarp+1 of these
721  int* strides = ii.stride; // sum ncycles of these (bad since ncompart/warpsize)
722  int* rootbegin = ii.firstnode; // nwarp+1 of these
723  int* nodebegin = ii.lastnode; // nwarp+1 of these
724 #if defined(CORENEURON_ENABLE_GPU)
725  int nstride = stridedispl[nwarp];
726 #endif
727  // nvc++/22.3 does not respect an if clause inside nrn_pragma_omp...
728 #if CORENRN_BUILD
729  if (nt->compute_gpu) {
730 #else
731  // bad clang-format
732  if (0) { // nt->compute_gpu) {
733 #endif
734  /* If we compare this loop with the one from cellorder.cu (CUDA version), we will
735  * understand that the parallelism here is exposed in steps, while in the CUDA version
736  * all the parallelism is exposed from the very beginning of the loop. In more details,
737  * here we initially distribute the outermost loop, e.g. in the CUDA blocks, and for the
738  * innermost loops we explicitly use multiple threads for the parallelization (see for
739  * example the loop directives in triang/bksub_interleaved2). On the other hand, in the
740  * CUDA version the outermost loop is distributed to all the available threads, and
741  * therefore there is no need to have the innermost loops. Here, the loop/icore jumps
742  * every warpsize, while in the CUDA version the icore increases by one. Other than
743  * this, the two loop versions are equivalent (same results).
744  */
745  nrn_pragma_acc(parallel loop gang present(nt [0:1],
746  strides [0:nstride],
747  ncycles [0:nwarp],
748  stridedispl [0:nwarp + 1],
749  rootbegin [0:nwarp + 1],
750  nodebegin [0:nwarp + 1]) async(nt->stream_id))
751  // clang-format off
752  nrn_pragma_omp(target teams loop map(present, alloc: nt[:1],
753  strides[:nstride],
754  ncycles[:nwarp],
755  stridedispl[:nwarp + 1],
756  rootbegin[:nwarp + 1],
757  nodebegin[:nwarp + 1]))
758  // clang-format on
759  for (int icore = 0; icore < ncore; icore += warpsize) {
760  solve_interleaved2_loop_body(
762  }
763  nrn_pragma_acc(wait(nt->stream_id))
764  } else {
765  for (int icore = 0; icore < ncore; icore += warpsize) {
766  solve_interleaved2_loop_body(
768  }
769  }
770 #ifdef _OPENACC
771  }
772 #endif
773 }
774 
775 /**
776  * \brief Solve Hines matrices/cells with cell-based granularity.
777  *
778  * The node ordering guarantees cell interleaving (as much coalesced memory access as possible),
779  * but parallelism granularity is limited to a per cell basis. Therefore every execution stream
780  * is mapped to a cell/tree.
781  */
void solve_interleaved1(int ith) {
    NrnThread* nt = nrn_threads + ith;
    int ncell = nt->ncell;
    if (ncell == 0) {
        return;  // nothing to solve on this thread
    }
    InterleaveInfo& ii = interleave_info[ith];
    int nstride = ii.nstride;
    int* stride = ii.stride;        // per-cycle active-cell counts
    int* firstnode = ii.firstnode;  // per-cell first node index
    int* lastnode = ii.lastnode;    // per-cell last node index
    int* cellsize = ii.cellsize;    // per-cell tree depth (cycle count)

    // OL211123: can we preserve the error checking behaviour of OpenACC's
    // present clause with OpenMP? It is a bug if these data are not present,
    // so diagnostics are helpful...
    nrn_pragma_acc(parallel loop present(nt [0:1],
                                         stride [0:nstride],
                                         firstnode [0:ncell],
                                         lastnode [0:ncell],
                                         cellsize [0:ncell]) if (nt->compute_gpu)
                       async(nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
    for (int icell = 0; icell < ncell; ++icell) {
        // one execution stream per cell: triangularize, then back-substitute
        int icellsize = cellsize[icell];
        triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
        bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode);
    }
    nrn_pragma_acc(wait(nt->stream_id))
}
812 
813 void solve_interleaved(int ith) {
814  if (interleave_permute_type != 1) {
815  solve_interleaved2(ith);
816  } else {
817  solve_interleaved1(ith);
818  }
819 }
820 } // namespace coreneuron
int tree_changed
Definition: cabcode.cpp:51
#define GPU_RHS(i)
Definition: cellorder.cpp:542
#define GPU_B(i)
Definition: cellorder.cpp:540
#define GPU_D(i)
Definition: cellorder.cpp:541
#define GPU_A(i)
Definition: cellorder.cpp:539
#define GPU_PARENT(i)
Definition: cellorder.cpp:544
#define v
Definition: md1redef.h:11
#define i
Definition: md1redef.h:19
nrn_pragma_acc(routine seq) nrn_pragma_omp(declare target) philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron nrn_pragma_omp(end declare target) namespace coreneuron
Provide a helper function in global namespace that is declared target for OpenMP offloading to functi...
Definition: nrnran123.h:66
static double order(void *v)
Definition: cvodeobj.cpp:218
double load_balance(std::vector< size_t > &v)
Definition: lpt.cpp:74
void free_memory(void *pointer)
Definition: memory.h:213
static double map(void *v)
Definition: mlinedit.cpp:43
printf
Definition: extdef.h:5
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
NrnThread * nrn_threads
Definition: multicore.cpp:56
int nrn_nthread
Definition: multicore.cpp:55
InterleaveInfo * interleave_info
void update(NrnThread *_nt)
int interleave_permute_type
nrn_pragma_acc(routine seq) int vector_capacity(void *v)
Definition: ivocvect.cpp:30
corenrn_parameters corenrn_param
Printing method.
In mechanism libraries, cannot use auto const token = nrn_ensure_model_data_are_sorted(); because the...
Definition: tnode.hpp:17
icycle< ncycle;++icycle) { int istride=stride[icycle];nrn_pragma_acc(loop vector) nrn_pragma_omp(loop bind(parallel)) for(int icore=0;icore< warpsize;++icore) { int i=ii+icore;if(icore< istride) { int ip=GPU_PARENT(i);GPU_RHS(i) -=GPU_B(i) *GPU_RHS(ip);GPU_RHS(i)/=GPU_D(i);} i+=istride;} ii+=istride;} }}void solve_interleaved2(int ith) { NrnThread *nt=nrn_threads+ith;InterleaveInfo &ii=interleave_info[ith];int nwarp=ii.nwarp;if(nwarp==0) return;int ncore=nwarp *warpsize;int *ncycles=ii.cellsize;int *stridedispl=ii.stridedispl;int *strides=ii.stride;int *rootbegin=ii.firstnode;int *nodebegin=ii.lastnode;if(0) { nrn_pragma_acc(parallel loop gang present(nt[0:1], strides[0:nstride], ncycles[0:nwarp], stridedispl[0:nwarp+1], rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) async(nt->stream_id)) nrn_pragma_omp(target teams loop map(present, alloc:nt[:1], strides[:nstride], ncycles[:nwarp], stridedispl[:nwarp+1], rootbegin[:nwarp+1], nodebegin[:nwarp+1])) for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} nrn_pragma_acc(wait(nt->stream_id)) } else { for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} }}void solve_interleaved1(int ith) { NrnThread *nt=nrn_threads+ith;int ncell=nt-> ncell
Definition: cellorder.cpp:784
int ncycle
Definition: cellorder.cpp:620
int lastroot
Definition: cellorder.cpp:623
auto *const vec_d
Definition: cellorder.cpp:615
int ic
Definition: cellorder.cpp:619
void nrn_permute_node_order()
Compute and carry out the permutation for interleave_permute_type.
Definition: cellorder.cpp:425
int firstnode
Definition: cellorder.cpp:624
std::vector< int > interleave_order(int ith, int ncell, int nnode, int *parent)
Function that performs the permutation of the cells such that the execution threads access coalesced ...
Definition: cellorder.cpp:348
int ii
Definition: cellorder.cpp:631
int nstride
Definition: cellorder.cpp:789
int int int * strides
Definition: cellorder.cpp:608
auto const bksub_root
Definition: cellorder.cpp:667
auto *const vec_b
Definition: cellorder.cpp:614
static void print_quality1(int iwarp, InterleaveInfo &ii, int ncell, int *p)
Definition: cellorder.cpp:193
static void triang_interleaved(NrnThread *nt, int icell, int icellsize, int nstride, int *stride, int *lastnode)
Definition: cellorder.cpp:550
int int * ncycles
Definition: cellorder.cpp:607
void copy_array(T *&dest, T *src, size_t n)
Definition: cellorder.hpp:129
std::vector< int > node_order(int ncell, int nnode, int *parents, int &nwarp, int &nstride, int *&stride, int *&firstnode, int *&lastnode, int *&cellsize, int *&stridedispl)
Function that returns a permutation of length nnode.
Definition: cellorder1.cpp:321
void copy_align_array(T *&dest, T *src, size_t n)
Definition: cellorder.hpp:136
int * stride
Definition: cellorder.cpp:621
void create_interleave_info()
Definition: cellorder.cpp:110
static void print_quality2(int iwarp, InterleaveInfo &ii, int *p)
Definition: cellorder.cpp:124
if(ncell==0)
Definition: cellorder.cpp:785
void update_parent_index(int *vec, int vec_size, const std::vector< int > &permute)
auto *const vec_rhs
Definition: cellorder.cpp:616
bool has_subtrees_to_compute
Definition: cellorder.cpp:634
int iwarp
Definition: cellorder.cpp:618
int root
Definition: cellorder.cpp:622
int lastnode
Definition: cellorder.cpp:625
void sort_ml(Memb_list *ml)
void solve_interleaved(int ith)
Solve the Hines matrices based on the interleave_permute_type (1 or 2).
int int int int * stridedispl
Definition: cellorder.cpp:609
int istride
Definition: cellorder.cpp:630
static void prnode(const char *mes, NrnThread &nt)
Definition: cellorder.cpp:307
int icore
Definition: cellorder.cpp:606
static void bksub_interleaved(NrnThread *nt, int icell, int icellsize, int, int *stride, int *firstnode)
Definition: cellorder.cpp:579
void permute_ptr(int *vec, int n, int *p)
int int int int int int * nodebegin
Definition: cellorder.cpp:611
int int int int int * rootbegin
Definition: cellorder.cpp:610
static void warp_balance(int ith, InterleaveInfo &ii)
Definition: cellorder.cpp:259
int * cellsize
Definition: cellorder.cpp:793
void destroy_interleave_info()
Definition: cellorder.cpp:115
int nrn_optimize_node_order(int type)
Select node ordering for optimum gaussian elimination.
Definition: cellorder.cpp:336
static List * info
#define nrn_assert(x)
assert()-like macro, independent of NDEBUG status
Definition: nrn_assert.h:33
int const size_t const size_t n
Definition: nrngsl.h:10
for(i=0;i< n;i++)
size_t p
size_t j
s
Definition: multisend.cpp:521
std::vector< Memb_func > memb_func
Definition: init.cpp:145
short type
Definition: cabvars.h:10
static double cell(void *v)
Definition: ocbbs.cpp:540
std::vector< T > inverse_permute_vector(const std::vector< T > &p)
void forward_permute(std::vector< T > &data, const std::vector< int > &perm)
#define pc
Definition: section.h:37
A view into a set of mechanism instances.
Definition: nrnoc_ml.h:34
int nodecount
Definition: nrnoc_ml.h:78
int * nodeindices
Definition: nrnoc_ml.h:74
Node ** nodelist
Definition: nrnoc_ml.h:68
Datum ** pdata
Definition: nrnoc_ml.h:75
Prop ** prop
Definition: nrnoc_ml.h:76
int v_node_index
Definition: section.h:212
Represent main neuron object computed by single thread.
Definition: multicore.h:58
int * _v_parent_index
Definition: multicore.h:89
NrnThreadMembList * tml
Definition: multicore.h:62
int ncell
Definition: multicore.h:64
double * node_a_storage()
Definition: multicore.cpp:1054
int id
Definition: multicore.h:66
double * node_rhs_storage()
Definition: multicore.cpp:1074
int end
Definition: multicore.h:65
Node ** _v_parent
Definition: multicore.h:91
double * node_d_storage()
Definition: multicore.cpp:1069
Node ** _v_node
Definition: multicore.h:90
double * node_b_storage()
Definition: multicore.cpp:1064
struct NrnThreadMembList * next
Definition: multicore.h:34
bool cuda_interface
Enable GPU computation.
#define warpsize
Definition: tnode.hpp:89