NEURON
multisplit.cpp
Go to the documentation of this file.
1 #include <../../nrnconf.h>
2 
3 #include <InterViews/resource.h>
4 #include "cabcode.h"
5 #include "nrn_ansi.h"
6 #include "nrndae_c.h"
7 #include "nrniv_mf.h"
8 #include <nrnoc2iv.h>
9 #include <nrnmpi.h>
10 #include <multisplit.h>
11 
12 #include <cerrno>
13 #include <cstdio>
14 #include <cstdlib>
15 #include <memory>
16 #include <unordered_map>
17 #include <vector>
18 
19 void nrnmpi_multisplit(Section*, double x, int sid, int backbone_style);
21 
22 extern void (*nrn_multisplit_setup_)();
23 extern void* nrn_multisplit_triang(NrnThread*);
25 extern void* nrn_multisplit_bksub(NrnThread*);
26 extern double t;
27 
28 extern void (*nrn_multisplit_solve_)();
29 static void multisplit_v_setup();
30 static void multisplit_solve();
31 
32 extern double nrnmpi_rtcomp_time_;
33 #if NRNMPI
34 extern double nrnmpi_splitcell_wait_;
35 #else
36 static double nrnmpi_splitcell_wait_;
37 #endif
38 
39 #if NRNMPI
40 #else
41 static int nrnmpi_use;
42 static void nrnmpi_int_allgather(int*, int*, int) {}
43 static void nrnmpi_int_allgatherv(int*, int*, int*, int*) {}
44 static void nrnmpi_postrecv_doubles(double*, int, int, int, void**) {}
45 static void nrnmpi_send_doubles(double*, int, int, int) {}
46 static void nrnmpi_wait(void**) {}
47 static void nrnmpi_barrier() {}
48 static double nrnmpi_wtime() {
49  return 0.0;
50 }
51 #endif
52 
53 class MultiSplit;
54 class MultiSplitControl;
55 
56 #define D(i) vec_d[i]
57 #define RHS(i) vec_rhs[i]
58 #define S1A(i) sid1A[i]
59 #define S1B(i) sid1B[i]
60 
61 // any number of nodes can have the same sid (generally those nodes
62 // will be on different machines).
63 // A node cannot have more than one sid.
64 // A tree cannot have more than two nodes with an sid and those nodes
65 // must be distinct.
66 
67 // NOTE:
68 // For cache efficiency, it would be nice to keep individual cell nodes
69 // together (i.e v_node from 0 to rootnodecount-1 are NOT the root nodes.)
70 // This would allow keeping the backbones contiguous and would simplify the
71 // solution of the sid0,1 2x2 at the end of triangularization that forms
72 // an N shaped matrix. Unfortunately, in several places, e.g. matrix setup,
73 // the first rootnodecount v_node are assumed to be the individual cell roots
74 // and at this time it would be confusing to work on that tangential issue.
75 // However, it does mean that we need some special structure since
76 // sid0 ...backbone interior nodes... sid1 are not contiguous in the v_node
77 // for a single cell.
78 // Presently the node order is
79 // 1) all the roots of cells with 0 or 1 sid (no backbone involved)
80 // 2) all the sid0 when there are two sids in a cell. 1) + 2) = rootnodecount
81 // 3) the interior backbone nodes (does not include sid0 or sid1 nodes)
82 // 4) all the sid1 nodes
83 // 5) all remaining nodes
84 // backbone_begin is the index for 2).
85 // backbone_interior_begin is one more than the last index for 2)
86 // backbone_sid1_begin is the index for 4)
87 // backbone_end is the index for 5.
88 
89 // Modification to handle short backbones
90 // Numerical accuracy/stability suffers when backbones are electrically
91 // short. Unfortunately, it is impossible to divide many 3-d reconstructed
92 // cells with any kind of load balance without using them. Also,
93 // unfortunately, it is probably not possible to solve the problem
94 // in general. However we can extend, without introducing
95 // too much parallel inefficiency, the implementation to allow an
96 // arbitrarily short backbone treated as a single node, as long as it
97 // does not connect to a short backbone on another machine. e.g we can
98 // have alternating short and long sausages but not a sequence of
99 // contiguous short sausages. The idea is to exchange backbone info
100 // in two stages. First receive the info that short backbones need
101 // (by definition only from long backbones), compute the exact D,RHS
102 // backbone end points (only a 2x2 matrix solve), and then send the
103 // exact short backbone info. matrix_exchange carries out the
104 // augmented communication. But we need to distinguish short from
105 // long backbones since for short ones we need to factor out the
106 // first part of bksub_backbone and do it in the middle of
107 // matrix_exchange (as well as avoid doing that twice in bksub_backbone).
108 // We can accomplish that by introducing the index
109 // backbone_long_begin between the backbone_begin and backbone_interior_begin
110 // indices. Then in matrix exchange we do backbone_begin to
111 // backbone_long_begin - 1 , and in bksub_backbone, the first part is
112 // from backbone_long_begin to backbone_interior_begin - 1.
113 
114 // Modification to handle any number of backbones, short or long, exactly.
115 // In this case, it is not necessary to consider numerical stability/accuracy.
116 // However a tree matrix with rank equal to the number of distinct sid for
117 // an entire parallelized cell must be solved between the send and receive
118 // stages for that cell. The main issue to be resolved is: which machine receives
119 // the matrix information. ie. a single sid subtree sends d and rhs to the
120 // (rthost)machine that gaussian eliminates the reduced tree
121 // matrix and a two sid backbone subtree sends the two d, a, b, and two rhs
122 // to rthost. After rthost solves the matrix (triangularize and back substitute)
123 // the answer for each sid is sent back to the machines needing it.
124 // Clearly, some savings results if rthost happens to be one of the machines
125 // with a 2 sid backbone. Also, it is probably best if a machine serves
126 // as rthost to as few cells as possible.
127 
128 // Modification to handle multiple subtrees with same sid on same machine.
129 // This is very valuable for load balancing. It demands a backbone style
130 // of 2 (using reduced tree matrices) and we allow user specified host
131 // for the reduced tree. The exact solution using reduced trees has
132 // proved so much more useful than the long/short backbone methods
133 // that we should consider removing the code
134 // for those others and implement only the reduced tree method.
135 
136 // Area handling prior to ReducedTree extension was and mostly still is:
137 // 1) sending long to short, long, reduced on the send side multiplies
138 // the send buffer (D and RHS components) by area (if non-zero area node)
139 // from 0 to iarea_short_long using area_node_indices and buf_area_indices.
140 // 2) receive what is needed by short backbone (and reduced tree) and the
141 // receive buffer for short backbone divides by area from iarea_short_long
142 // to narea using area_node_indices and buf_area_indices. (not needed by
143 // reduced tree but the local D and RHS will have to be multiplied by the
144 // node area when filling the ReducedTree. Also the sid1A and sid1B terms
145 // will have to be area multiplied both on the ReducedTree host
146 // and when backbones are sent to the ReducedTree host
147 // 3) The short backbone is solved consistent with its original node area
148 // (and the ReducedTree is solved with all its equation having been multiplied
149 // by area.
150 // 4) Note that solving the short and reduced tree gives the answer which
151 // does not have to be scaled but we multiply the D and RHS by 1e30 so
152 // those equation are v = RHS/D regardless of area or coupling. So no need
153 // to divide by area on the short or reduced tree machines and it would not
154 // matter if we did so or not.
155 // 5) all remaining receives take place and what is received gets divided
156 // by the node area (0 to iarea_short_long). As mentioned above, this does
157 // not affect the result that was sent from the short or reduced tree
158 // host (since D and RHS were scaled by 1e30).
159 // So... Anything sent to reduced trees from another machine needs to get its
160 // info multiplied by the node area. This is sid0 D and RHS and if there is
161 // a sid1 then that D and RHS as well as sid1A and sid1B.
162 // Also... Anything rmapped into a reduced tree from the reduced tree machine
163 // needs to get its rmap info multiplied by the node area ( in fact we can
164 // do that in place before calling fillrmap since those values get the result
165 // *1e30 returned into them).
166 // We do the old methods (backbone style 0 and 1) completely segregated from
167 // the new reducedtree method by introducing a vector parallel to
168 // nodeindex_buffer_ called nodeindex_rthost_ that contains the hostid for the
169 // reduced tree host associated with that sid. Then -1 means old style and
170 // >= 0 means reducedtree style.
171 
172 // the implementation so far has turned out to be too brittle with regard
173 // to unimportant topology changes in the sense of new point processes or
174 // anything that ends up calling multisplit_v_setup. The experienced
175 // cause of this is because MultiSplitTransferInfo holds pointers into
176 // the sid1A and sid1B off diag elements and those arrays are freed
177 // and reallocated when multisplit_v_setup is called. Although
178 // exchange_setup cries out for a simplified and
179 // more efficient implementation, for now, opportunistically reset the
180 // pointers in the MultiSplitTransferInfo at the end multisplit_v_setup
181 
182 
183 struct Area2Buf {
184  int inode;
185  int n; // using only for transfer to ReducedTree on another
186  int ibuf[3]; // machine so n is 2 or 3
187  double adjust_rhs_; // for cvode
188  MultiSplit* ms; // for recalculating ibuf pointers after v_setup
189 };
190 struct Area2RT {
191  int inode;
192  int n; // 2 or 3
193  double* pd[3];
194  double adjust_rhs_; // for cvode
195  MultiSplit* ms; // for recalculating pd pointers after v_setup
196 };
197 
198 using Int2IntTable = std::unordered_map<int, int>;
199 
200 // d and rhs keep getting reordered according to thread cache efficiency
201 // so we need to retain some logical info in order to update the
202 // pointer lists in the Reduced tree. Prior to this comment, this was
203 // done only for sid1A and sid1B near the end of v_setup. That is ok
204 // since sid1A and sid1B are allocated by v_setup. But d and rhs are
205 // filled into the Node* after return from v_setup.
206 
207 class ReducedTree {
208  public:
209  ReducedTree(MultiSplitControl*, int rank, int mapsize);
210  virtual ~ReducedTree();
211  void solve();
212  void nocap();
213  // private:
214  public:
215  void gather();
216  void scatter();
217 
219  int n;
220  int* ip;
221  double* rhs;
222  double* d;
223  double* a;
224  double* b;
225  int n2, n4, nmap;
226 
227  public:
228  // next implementation, exploit the fact that send buffer can be
229  // smaller than the receive buffer since we only need to send back
230  // the answer in RHS. Now we are sending back RHS, D,
231  // and even the offdiags.
232  double **smap, **rmap;
233  int *ismap, *irmap;
234  int nsmap, irfill;
235 
236  int* rmap2smap_index; // needed for nocap when cvode used
237  int* nzindex; // needed for nocap when cvode used
238  double* v; // needed for nocap when cvode used
239 
240  void reorder(int j, int nt, int* mark, int* all_bb_relation, int* allsid);
241  std::unique_ptr<Int2IntTable> s2rt{new Int2IntTable()}; // sid2rank table
242  void fillrmap(int sid1, int sid2, double* pd);
243  void fillsmap(int sid, double* prhs, double* pdiag);
244  void pr_map(int, double*);
245 };
246 
247 class MultiSplit {
248  public:
249  Node* nd[2];
250  int sid[2];
251  int backbone_style; // 0 = no, 1 = yes, 2 = reducedtree (exact)
252  int rthost; // nrnmpi_myid where reducedtree will be solved
253  // the problem with using backsid_ to find the index now is that
254  // the value may not be unique since several backbones can be
255  // connected at the same sid on this machine. So store the index into
256  // the backsid) array.
258  // and need the thread id as well
259  int ithread;
260  // reduced tree pointers to d and rhs can become invalid and
261  // need to be restored. According to the exchange_setup that calls
262  // ReducedTree::fillrmap, the order is nd[0]->rhs, nd[0]->d, nd[1]->rhs
263  // nd[1]->d.
267 };
268 
269 // note: the tag_ below was added in case this machine interacts with
270 // another via more than one message type. i.e combinations of short<->long,
271 // reduced<->long, and long<->long backbone interactions.
272 // The user should avoid this
273 // eventuality and in fact we might not allow it.
274 
275 // mpi transfer information
277  int host_; // host id for send-receive
278  int nnode_; // number of nodes involved
279  int* nodeindex_; // v_node indices of those nodes. Pointer into nodeindex_buffer_
280  int* nodeindex_th_; // thread for above
281  int nnode_rt_; // number of off diag elements involved
282  int* nd_rt_index_; // v_node indices of offdiag. Not pointer into nodeindex_buffer. Needed for
283  // area2buf
284  int* nd_rt_index_th_; // thread for above
285  double** offdiag_; // pointers to sid1A or sid1B off diag elements
286  int* ioffdiag_; // indices of above, to recalculate offdiag when freed
287  int size_; // 2*nnode_ + nnode_rt_ doubles needed in buffer
288  int displ_; // displacement into trecvbuf_
289  void* request_; // MPI_Request
290  int tag_; // short<->long, long<->long, subtree->rthost, rthost->subtree
291  int rthost_; // host id where the reduced tree is located (normally -1)
292 };
293 
294 using MultiSplitTable = std::unordered_map<Node*, MultiSplit*>;
295 using MultiSplitList = std::vector<MultiSplit*>;
296 
297 #include <multisplitcontrol.h>
299 
300 void nrnmpi_multisplit(Section* sec, double x, int sid, int backbone_style) {
301  if (!msc_) {
302  msc_ = new MultiSplitControl();
303  }
304  msc_->multisplit(sec, x, sid, backbone_style);
305 }
306 
308  narea2buf_ = narea2rt_ = 0;
309  area2buf_ = 0;
310  area2rt_ = 0;
311  nthost_ = 0;
313  msti_ = 0;
314  tbsize = 0;
315  ndbsize = 0;
316  trecvbuf_ = 0;
317  tsendbuf_ = 0;
318  nodeindex_buffer_ = 0;
320  nodeindex_rthost_ = 0;
321  narea_ = 0;
322  iarea_short_long_ = 0;
323  buf_area_indices_ = 0;
324  area_node_indices_ = 0;
325 
326  nrtree_ = 0;
327  rtree_ = 0;
328 
329 
330  multisplit_list_ = 0;
331  nth_ = 0;
332  mth_ = 0;
333 }
334 
336 
340  sid1A = sid1B = 0;
341  sid0i = 0;
342 
343  nbackrt_ = 0;
344  backsid_ = 0;
345  backAindex_ = 0;
346  backBindex_ = 0;
347  i1 = i2 = i3 = 0;
348 }
349 
351  del_sidA();
352 }
353 
354 void MultiSplitControl::multisplit(Section* sec, double x, int sid, int backbone_style) {
355 #if 0
356  if (sid > 1000) { pexch(); return; }
357  if (sid >= 1000) { pmat(sid>1000); return; }
358 #endif
359  if (sid < 0) {
364  }
365  exchange_setup();
366  return;
367  }
369  if (backbone_style != 2) {
370  hoc_execerror("only backbone_style 2 is now supported", 0);
371  }
374  classical_root_to_multisplit_->reserve(97);
376  }
377  Node* nd = node_exact(sec, x);
378  if (tree_changed) {
379  setup_topology();
380  }
381  // printf("root of %s(%g) ", secname(sec), x);
382  Node* root;
383  for (sec = nd->sec; sec; sec = sec->parentsec) {
384  root = sec->parentnode;
385  }
386  assert(root);
387  // printf("is %s\n", secname(root->sec));
388  MultiSplit* ms;
389  const auto& msiter = classical_root_to_multisplit_->find(root);
390  if (msiter != classical_root_to_multisplit_->end()) {
391  ms = msiter->second;
392  if (backbone_style == 2) {
393  if (ms->backbone_style != 2) {
394  hoc_execerror("earlier call for this cell did not have a backbone style = 2", 0);
395  }
396  } else if (backbone_style == 1) {
397  ms->backbone_style = 1;
398  }
399  ms->nd[1] = nd;
400  ms->sid[1] = sid;
401  if (ms->sid[1] == ms->sid[0]) {
402  char s[100];
403  Sprintf(s, "two sid = %d at same point on tree rooted at", sid);
404  hoc_execerror(s, secname(root->sec));
405  }
406  } else {
407  ms = new MultiSplit();
408  ms->backbone_style = backbone_style;
409  ms->rthost = -1;
410  ms->nd[0] = nd;
411  ms->nd[1] = 0;
412  ms->sid[0] = sid;
413  ms->sid[1] = -1;
414  ms->back_index = -1;
415  ms->ithread = -1;
416  ms->rt_ = 0;
417  ms->rmap_index_ = -1;
418  ms->smap_index_ = -1;
419  (*classical_root_to_multisplit_)[root] = ms;
420  multisplit_list_->push_back(ms);
421  }
422 }
423 
425  if (sid1A) {
426  delete[] sid1A;
427  delete[] sid1B;
428  delete[] sid0i;
429  sid1A = 0;
430  sid1B = 0;
431  sid0i = 0;
432  }
433  if (nbackrt_) {
434  delete[] backsid_;
435  delete[] backAindex_;
436  delete[] backBindex_;
437  nbackrt_ = 0;
438  }
439 }
440 
442  int i;
443  if (nrtree_) {
444  for (i = 0; i < nrtree_; ++i) {
445  delete rtree_[i];
446  }
447  delete[] rtree_;
448  nrtree_ = 0;
449  }
450  if (msti_) {
451  for (i = 0; i < nthost_; ++i) {
452  if (msti_[i].nnode_rt_) {
453  delete[] msti_[i].nd_rt_index_;
454  delete[] msti_[i].nd_rt_index_th_;
455  delete[] msti_[i].offdiag_;
456  delete[] msti_[i].ioffdiag_;
457  }
458  }
459  delete[] msti_;
460  msti_ = 0;
461  if (nodeindex_buffer_) {
462  delete[] nodeindex_buffer_;
463  delete[] nodeindex_buffer_th_;
464  delete[] nodeindex_rthost_;
465  }
466  nodeindex_buffer_ = 0;
468  nodeindex_rthost_ = 0;
469  if (trecvbuf_) {
470  delete[] trecvbuf_;
471  delete[] tsendbuf_;
472  }
473  trecvbuf_ = 0;
474  tsendbuf_ = 0;
475  if (narea_) {
476  delete[] buf_area_indices_;
477  delete[] area_node_indices_;
478  buf_area_indices_ = 0;
479  area_node_indices_ = 0;
480  narea_ = 0;
481  }
482  if (narea2buf_) {
483  // at present, statically 3 since only used for ReducedTree
484  // for (i=0; i < narea2buf_; ++i) {
485  // delete [] area2buf_->ibuf;
486  // }
487  delete[] area2buf_;
488  area2buf_ = 0;
489  narea2buf_ = 0;
490  }
491  if (narea2rt_) {
492  delete[] area2rt_;
493  area2rt_ = 0;
494  narea2rt_ = 0;
495  }
496  }
497 }
498 
500  if (msc_) {
503  }
504 }
505 
507  // printf("nrnmpi_multisplit_clear()\n");
508  int i;
511  for (i = 0; i < nth_; ++i) {
512  mth_[i].del_sidA();
513  }
514  if (mth_) {
515  delete[] mth_;
516  mth_ = 0;
517  }
518  nth_ = 0;
519  del_msti();
521  for (const auto& mspair: *classical_root_to_multisplit_) {
522  delete mspair.second;
523  }
525  delete multisplit_list_;
526  multisplit_list_ = nullptr;
527  }
528 }
529 
530 // in the incremental development of exchange_setup from only long to short
531 // to reduced tree we end up with many iterations over all pc.multisplit
532 // calls. ie. every machines sid info. If the performance becomes
533 // visible relative to the total setup time then one can replace all those
534 // iterations by preserving only the sid graph info that is needed by each
535 // machine. A sid graph, tree, would contain a list of sid's and a list of
536 // machines. Then there would be a map of sid2graph. This list and map
537 // could be constructed with a single iteration. ie. when an edge is seen
538 // the graphs of the two sids would merge. Then one could throw away all
539 // graphs that have no sid on this machine. I would expect an order of
540 // magnitude performance improvement in exchange_setup.
541 
543  int i, j, k;
544  del_msti(); // above recalc_diam so multisplit_v_setup does not
545  // attempt to fill offdiag_
546  if (diam_changed) {
547  recalc_diam();
548  }
549 
550  // how many nodes are involved on this machine
551  // all nodes are distinct. For now we independently assert that
552  // all the sid on this machine are distinct. So when we get an
553  // sid from another machine it does not have to be directed to
554  // more than one place. So, just like splitcell, we cannot test
555  // a multisplit on a single machine.
556  // When sids are not distinct, in the context of reduced trees,
557  // there is no problem with respect to sending sid info to
558  // multiple places since each sid info gets added to the reduced
559  // tree and the reduced tree answer gets sent back to each sid.
560  // So there is not a lot of special code involved with multiple
561  // pieces on the same host, since each piece <-> reduced tree
562  // independently of where the piece and reduced tree are located.
563  int n = 0;
564  int nwc = 0; // number of backbonestyle=2 nodes.
566  for (MultiSplit* ms: *multisplit_list_) {
567  ++n;
568  if (ms->nd[1]) {
569  ++n;
570  }
571  if (ms->backbone_style == 2) {
572  ++nwc;
573  if (ms->nd[1]) {
574  ++nwc;
575  }
576  } else if (ms->backbone_style == 1) {
577  if (!ms->nd[1]) {
578  ms->backbone_style = 0;
579  }
580  }
581  }
582  }
583  // printf("%d n=%d nsbb=%d nwc=%d\n", nrnmpi_myid, n, nsbb, nwc);
584  if (nrnmpi_numprocs == 1 && n == 0) {
585  return;
586  }
587  // how many nodes are involved on each of the other machines
588  int* nn = new int[nrnmpi_numprocs];
589  if (nrnmpi_use) {
590  nrnmpi_int_allgather(&n, nn, 1);
591  } else {
592  nn[0] = n;
593  }
594  // if (nrnmpi_myid==0) for (i=0; i < nrnmpi_numprocs;++i) printf("%d %d nn=%d\n", nrnmpi_myid,
595  // i, nn[i]);
596  // what are the sid's on this machine
597  int* sid = 0;
598  int* inode = 0;
599  int* threadid = 0;
600  // an parallel array back to MultiSplit
601  MultiSplit** vec2ms = new MultiSplit*[n];
602  // following used to be is_ssb (is sid for short backbone)
603  // now we code that as well as enough information for the
604  // backbone_style==2 case to figure out the reduced tree matrix
605  // so 0 means backbone_style == 0, 1 means backbone_style == 1
606  // and > 1 means backbone_style == 2 and the value is
607  // other backbone sid+3 (so a value of 2 means that
608  // it is a single sid subtree and a value of 3 means that the
609  // other backbone sid is 0.
610  int* bb_relation = 0;
611  if (n > 0) {
612  sid = new int[n];
613  inode = new int[n];
614  bb_relation = new int[n];
615  threadid = new int[n];
616  }
618  i = 0;
619  for (MultiSplit* ms: *multisplit_list_) {
620  sid[i] = ms->sid[0];
621  inode[i] = ms->nd[0]->v_node_index;
622  threadid[i] = ms->ithread;
623  bb_relation[i] = ms->backbone_style;
624  vec2ms[i] = ms;
625  ++i;
626  if (ms->nd[1]) {
627  sid[i] = ms->sid[1];
628  inode[i] = ms->nd[1]->v_node_index;
629  threadid[i] = ms->ithread;
630  bb_relation[i] = ms->backbone_style;
631  if (ms->backbone_style == 2) {
632  bb_relation[i - 1] += 1 + sid[i];
633  bb_relation[i] += 1 + sid[i - 1];
634  }
635  vec2ms[i] = ms;
636  ++i;
637  }
638  }
639  }
640  // for (i=0; i < n; ++i) { printf("%d %d sid=%d inode=%d %s %d\n", nrnmpi_myid, i, sid[i],
641  // inode[i], secname(v_node[inode[i]]->sec), v_node[inode[i]]->sec_node_index_);}
642  // what are the sid's and backbone status on each of the other machines
643  // first need a buffer for them and their offsets
644  int* displ = new int[nrnmpi_numprocs + 1];
645  displ[0] = 0;
646  for (i = 0; i < nrnmpi_numprocs; ++i) {
647  displ[i + 1] = displ[i] + nn[i];
648  }
649  int nt = displ[nrnmpi_numprocs]; // total number of (distinct) nodes
650  int* allsid = new int[nt];
651  int* all_bb_relation = new int[nt];
652  if (nrnmpi_use) {
653  nrnmpi_int_allgatherv(sid, allsid, nn, displ);
654  nrnmpi_int_allgatherv(bb_relation, all_bb_relation, nn, displ);
655  } else {
656  for (i = 0; i < n; ++i) {
657  allsid[i] = sid[i];
658  all_bb_relation[i] = bb_relation[i];
659  }
660  }
661  if (!n) {
662  delete[] allsid;
663  delete[] all_bb_relation;
664  delete[] displ;
665  delete[] nn;
666  delete[] vec2ms;
667  errno = 0;
668  return;
669  }
670 
671  // now we need to analyze
672  // how many machines do we need to communicate with
673 
674  // note that when long sid is connected to 1 or more long sid
675  // and also a short backbone sid that there is no need to send
676  // or receive info from the long sid because the short sid sends
677  // back the answer.
678 
679  // A similar communication pattern holds for the backbone_style==2
680  // case. i.e. a star instead of all2all. The only problem is
681  // which host is the center of the star. The following
682  // code for backbone_style < 2 is almost unchanged but
683  // backbone_style >=2 issue handling is inserted as needed.
684 
685  // there should not be that many sid on one machine. so just use
686  // linear search
687 
688  // for an index into the all... vectors, mark[index] is the index
689  // into this hosts vectors of the corresponding sid. It gets slowly
690  // transformed into a communication pattern in the sense that
691  // mark[index] = -1 means no communication will take place between
692  // this host and the host associated with the index.
693  // For reduced tree purposes, we overload the mark semantics so
694  // that mark[index] points to one (of the possibly two) sid on the
695  // same cell. ie. all the sid on other cpus that are part of the
696  // whole cell point to that sid.
697 
698  // For backbone style 2,
699  // when there are several pieces on the same host with the same
700  // sid, then the mark points to the principle pieces sid. i.e all
701  // the sid on this cpu that are part of the whole cell also point
702  // to that sid.
703 
704  int* mark = new int[nt];
705  int* connects2short = new int[n];
706  for (i = 0; i < n; ++i) { // so we know if we will be sending to
707  connects2short[i] = 0; // a short backbone.
708  }
709  for (i = 0; i < nt; ++i) {
710  mark[i] = -1;
711  for (j = 0; j < n; ++j) {
712  if (allsid[i] == sid[j]) {
713  // here we can errcheck the case of 2 on one
714  // machine and a different style for the same
715  // sid on another machine. If this passes on
716  // all machines then a single cell is necessarily
717  // consistent with regard to all sids for it
718  // being backbone_style == 2
719  if ((bb_relation[j] >= 2) != (all_bb_relation[i] >= 2)) {
720  hoc_execerror("backbone_style==2 inconsistent between hosts for same sid", 0);
721  }
722 
723  if (all_bb_relation[i] < 2) {
724  mark[i] = j;
725  if (all_bb_relation[i] == 1) {
726  connects2short[j] = 1;
727  }
728  }
729  }
730  }
731  }
732 
733  // but we do not want to mark allsid on this machine
734  for (i = displ[nrnmpi_myid]; i < displ[nrnmpi_myid] + n; ++i) {
735  mark[i] = -1;
736  }
737  // but note that later when we mark the reduced tree items we
738  // do leave the marks for allsid on this machine (but the marks
739  // have a slightly different meaning
740 
741  // undo all the marks for long to long if there is a long to short
742  // but be careful not to undo the short to long
743  for (i = 0; i < nt; ++i) {
744  if (mark[i] >= 0 && connects2short[mark[i]] && all_bb_relation[i] == 0 &&
745  bb_relation[mark[i]] == 0) {
746  mark[i] = -1;
747  }
748  }
749  // make sure no short to short connections
750  for (i = 0; i < nt; ++i) {
751  if (mark[i] >= 0) {
752  if (bb_relation[mark[i]] == 1 && all_bb_relation[i] == 1) {
753  hoc_execerror("a short to short backbone interprocessor connection exists", 0);
754  }
755  }
756  }
757 
758  // right now, the communication pattern represented by mark is
759  // bogus for backbone_style == 2 and they are all -1.
760  // We want to adjust mark so all the sid on
761  // a single cell point to one of the sid on this cell
762  // The following uses an excessive number of iterations
763  // over the all... vectors (proportional to number of distinct sids
764  // on a cell and should be made more efficient
765  for (j = 0; j < n; ++j) {
766  if (bb_relation[j] >= 2) {
767  reduced_mark(j, sid[j], nt, mark, allsid, all_bb_relation);
768  // note the above will also mark both this host sids
769  // so when the other one comes around here it will
770  // mark nothing.
771  }
772  }
773 
774 #if 0
775 for (i=0; i < nt; ++i) { printf("%d %d allsid=%d all_bb_relation=%d mark=%d\n",
776 nrnmpi_myid, i, allsid[i], all_bb_relation[i], mark[i]);}
777 #endif
778 
779  // it is an error if there are two pieces of the same cell on the
780  // same host. All we need to do is make sure the mark for this
781  // cpu are unique. i.e belong to different cells
782  // This is no longer an error. For backbonestyle 2, allowing
783  // multiple pieces of same cell on same cpu.
784  if (0) {
785  int* mcnt = new int[n];
786  for (j = 0; j < n; ++j) {
787  mcnt[j] = 0;
788  }
789  for (i = displ[nrnmpi_myid]; i < displ[nrnmpi_myid] + n; ++i) {
790  if (all_bb_relation[i] >= 2 && mark[i] >= 0) {
791  mcnt[mark[i]] += 1;
792  if (all_bb_relation[i] > 2) {
793  ++i;
794  }
795  }
796  }
797  for (j = 0; j < n; ++j) {
798  assert(mcnt[j] < 2);
799  }
800  delete[] mcnt;
801  }
802 
803  // for reduced trees, we now need to settle on which host will solve
804  // a reduced tree. If rthost is this one then this receives from
805  // and sends to every host that has a sid on the cell. If the rthost
806  // is another, then this sends and receives to rthost.
807 
808  // For now let it be the first host with two sid, and if there is
809  // no such host (then why are we using style 2?) then the first
810  // host with 1.
811  nrtree_ = 0;
812  int* rthost = new int[n];
813  ReducedTree** rt = new ReducedTree*[n];
814  for (j = 0; j < n; ++j) {
815  rthost[j] = -1;
816  rt[j] = 0;
817  }
818  if (nwc) {
819  for (MultiSplit* ms: *multisplit_list_) {
820  if (ms->backbone_style == 2) {
821  for (j = 0; j < n; ++j) { // find the mark value
822  if (sid[j] == ms->sid[0]) {
823  break;
824  }
825  }
826  ms->rthost = -1;
827  k = -1;
828  for (int ih = 0; ih < nrnmpi_numprocs; ++ih) {
829  for (i = displ[ih]; i < displ[ih + 1]; ++i) {
830  if (mark[i] == j) {
831  if (all_bb_relation[i] > 2) {
832  ms->rthost = ih;
833  break;
834  }
835  if (all_bb_relation[i] == 2) {
836  if (k == -1) {
837  k = ih;
838  }
839  }
840  }
841  }
842  if (ms->rthost != -1) {
843  break;
844  }
845  }
846  if (ms->rthost == -1) {
847  ms->rthost = k;
848  }
849  // only want rthost[j] for sid[0], not sid[1]
850  rthost[j] = ms->rthost;
851  // printf("%d sid0=%d sid1=%d rthost=%d\n", nrnmpi_myid, ms->sid[0], ms->sid[1],
852  // ms->rthost);
853  }
854  }
855  // it is important that only the principal sid has rthost != 0
856  // since we count them below, nrtree_, to allocate rtree_.
857  }
858 #if 0
859 for (j=0; j < n; ++j) {
860 printf("%d j=%d sid=%d bb_relation=%d rthost=%d\n", nrnmpi_myid,j, sid[j],
861 bb_relation[j], rthost[j]);
862 }
863 #endif
864  // there can be a problem when one host sends parts of two cells
865  // to another host due to cells not being in the same order with
866  // respect to NrnHashIterate. So demand that they are in the
867  // same order. Note this can also be used to assert that there
868  // are not two pieces on a host from the same cell.
869  // But we no longer use NrnHashIterate and the order is consistent
870  // because ordering is always with respect to the allgathered
871  // arrays of size nt.
872  if (0) {
873  for (i = 0; i < nrnmpi_numprocs; ++i) {
874  int ix = -1;
875  int nj = displ[i + 1];
876  for (j = displ[i]; j < nj; ++j) {
877  if (all_bb_relation[j] >= 2 && mark[j] >= 0 && rthost[mark[j]] == nrnmpi_myid) {
878  assert(mark[j] > ix);
879  ix = mark[j];
880  if (all_bb_relation[j] > 2) {
881  ++j;
882  }
883  }
884  }
885  }
886  }
887 
888  // allocate rtree_
889  for (j = 0; j < n; ++j) {
890  if (rthost[j] == nrnmpi_myid) {
891  ++nrtree_;
892  }
893  }
894  if (nrtree_) {
895  i = 0;
896  rtree_ = new ReducedTree*[nrtree_];
897  for (j = 0; j < n; ++j) {
898  if (rthost[j] == nrnmpi_myid) {
899  // sid to rank table
900  std::unique_ptr<Int2IntTable> s2rt{new Int2IntTable()};
901  s2rt->reserve(20);
902  int rank = 0;
903  int mapsize = 0;
904  for (k = 0; k < nt; ++k) {
905  if (mark[k] == j && all_bb_relation[k] >= 2) {
906  const auto& result = s2rt->insert({allsid[k], rank});
907  if (result.second) {
908  rank++;
909  }
910  if (all_bb_relation[k] == 2) {
911  mapsize += 2;
912  } else {
913  mapsize += 3;
914  }
915  }
916  }
917  rtree_[i] = new ReducedTree(this, rank, mapsize);
918  rt[j] = rtree_[i]; // needed for third pass below
919  // printf("%d new ReducedTree(%d, %d)\n", nrnmpi_myid, rank, mapsize);
920  // at this point the ReducedTree.s2rt is not in tree order
921  // (in fact we do not even know if it is a tree)
922  // so reorder. For a tree, we know there must be ReducedTree.n - 1
923  // edges.
924  rtree_[i]->s2rt = std::move(s2rt);
925  rtree_[i]->reorder(j, nt, mark, all_bb_relation, allsid);
926  ++i;
927  }
928  }
929  }
930 
931  // fill in the rest of the rthost[], mt->rthost, and rt
932  j = 0;
933  for (MultiSplit* ms: *multisplit_list_) {
934  if (ms->backbone_style == 2) {
935  int jj = mark[displ[nrnmpi_myid] + j];
936  ms->rthost = rthost[jj];
937  rthost[j] = ms->rthost;
938  rt[j] = rt[jj];
939  if (ms->nd[1]) {
940  ++j;
941  }
942  }
943  ++j;
944  }
945 #if 0
946  for (int ii=0; ii < multisplit_list_->count(); ++ii) {
947  MultiSplit* ms = multisplit_list_->item(ii);
948  if (ms->backbone_style == 2) {
949 printf("%d sid0=%d sid1=%d rthost=%d\n", nrnmpi_myid, ms->sid[0], ms->sid[1], ms->rthost);
950  }
951  }
952 for (j=0; j < n; ++j) {
953 printf("%d j=%d sid=%d bb_relation=%d rthost=%d\n", nrnmpi_myid,j, sid[j],
954 bb_relation[j], rthost[j]);
955 }
956 #endif
957 
958  // count the reduced tree related nodes that have non-zero area
959  for (i = 0; i < n; ++i) {
960  // remember rthost >= 0 only for sid[0]
961  if (rthost[i] >= 0)
962  for (j = 0; j < 2; ++j) {
963  NrnThread* _nt = nrn_threads + threadid[i];
964  Node* nd = _nt->_v_node[inode[i + j]];
965  if (nd->_classical_parent && nd->sec_node_index_ < nd->sec->nnode - 1) {
966  if (rthost[i] == nrnmpi_myid) {
967  ++narea2rt_;
968  } else {
969  ++narea2buf_;
970  }
971  }
972  if (bb_relation[i] == 2) { // sid[0] only
973  break;
974  }
975  if (j == 1) {
976  ++i;
977  }
978  }
979  }
980  if (narea2rt_) {
981  area2rt_ = new Area2RT[narea2rt_];
982  }
983  if (narea2buf_) {
985  }
986  // can fill in all of the info for area2rt_ but only some for area2buf_
987  // actually, area2rt is the one used in practice since only the soma ever
988  // has a sid at a non-zero area node and the reduced tree will probably
989  // be on the machine with the soma.
990  narea2rt_ = narea2buf_ = 0;
991  for (i = 0; i < n; ++i) {
992  // remember rthost >= 0 only for sid[0]
993  if (rthost[i] >= 0)
994  for (j = 0; j < 2; ++j) {
995  NrnThread* _nt = nrn_threads + threadid[i];
996  Node* nd = _nt->_v_node[inode[i + j]];
997  auto* const vec_d = _nt->node_d_storage();
998  auto* const vec_rhs = _nt->node_rhs_storage();
999  if (nd->_classical_parent && nd->sec_node_index_ < nd->sec->nnode - 1) {
1000  if (rthost[i] == nrnmpi_myid) {
1001  Area2RT& art = area2rt_[narea2rt_];
1002  art.ms = vec2ms[i];
1003  art.inode = inode[i + j];
1004  art.n = 2;
1005  art.pd[0] = &D(art.inode);
1006  art.pd[1] = &RHS(art.inode);
1007  if (bb_relation[i] > 2) {
1008  art.n = 3;
1009  k = vec2ms[i]->back_index;
1010  MultiSplitThread& t = mth_[vec2ms[i]->ithread];
1011  if (j == 0) {
1012  art.pd[2] = t.sid1A + t.backAindex_[k];
1013  } else {
1014  art.pd[2] = t.sid1B + t.backBindex_[k];
1015  }
1016  }
1017  ++narea2rt_;
1018  } else {
1019  Area2Buf& ab = area2buf_[narea2buf_];
1020  ab.ms = vec2ms[i];
1021  ab.inode = inode[i + j];
1022  // can figure out how many but not
1023  // the indices into the send buffer.
1024  ab.n = 2;
1025  if (bb_relation[i] > 2) {
1026  ab.n = 3;
1027  }
1028  ++narea2buf_;
1029  }
1030  }
1031  if (bb_relation[i] == 2) { // sid[0] only
1032  break;
1033  }
1034  }
1035  }
1036 #if 0
1037 printf("%d narea2rt=%d narea2buf=%d\n", nrnmpi_myid, narea2rt_, narea2buf_);
1038 for (i = 0; i < narea2rt_; ++i) {
1039  Area2RT& art = area2rt_[i];
1040  printf("%d area2rt[%d] inode=%d n=%d thread=%d\n", nrnmpi_myid, i, art.inode, art.n, art.ms->ithread);
1041 }
1042 for (i = 0; i < narea2buf_; ++i) {
1043  Area2Buf& ab = area2buf_[i];
1044  printf("%d area2buf[%d] inode=%d n=%d\n", nrnmpi_myid, i, ab.inode, ab.n);
1045 }
1046 #endif
1047 #if 0
1048 if (nrnmpi_myid == 0) {
1049 for (i=0; i < n; ++i) {
1050  printf("%d %d sid=%d bbrelation=%d rthost=%d rt=%p\n", nrnmpi_myid, i, sid[i], bb_relation[i], rthost[i], rt[i]);
1051 }
1052 for (i=0; i < nrnmpi_numprocs+1; ++i) {
1053  printf("%d displ[%d]=%d\n", nrnmpi_myid, i, displ[i]);
1054 }
1055 for (i=0; i < nt; ++i) {
1056  printf("%d %d allsid=%d mark=%d all_bb_relation=%d\n", nrnmpi_myid, i, allsid[i], mark[i], all_bb_relation[i]);
1057 }}
1058 #endif
1059  // it is best if the user arranges things so that machine -> machine
1060  // has only one type of connection, i.e long->short, long->long,
1061  // or short->long since that means only one message required.
1062  // But for generality we program for the possibility of needing
1063  // two message types (long->short and short->long are the same
1064  // type and distinct from long->long)
1065  // Actually we do need 3 because one cannot mix long->short
1066  // and long->long in the same message because they need
1067  // different tags.
1068  // we also calculate the indices for the types
1069  // we need to have separate tags for long <-> reduced trees
1070  nthost_ = 0;
1071  // int ndbsize, tbsize;
1072  ndbsize = 0;
1073  tbsize = 0;
1074 
1075  for (i = 0; i < nrnmpi_numprocs; ++i) {
1076  int nh = 0;
1077  int type = 0;
1078  int nh1 = 0;
1079  for (j = displ[i]; j < displ[i + 1]; ++j) {
1080  if (mark[j] >= 0) {
1081  // how many distinct all_bb_relation<2: 1,2,3?
1082  if (bb_relation[mark[j]] == 1) { // short<->long
1083  assert(all_bb_relation[j] == 0);
1084  type |= 04;
1085  ++ndbsize; // space for node index
1086  tbsize += 2; // d and rhs
1087  } else if (all_bb_relation[j] == 1) { // long<->short
1088  type |= 01;
1089  ++ndbsize;
1090  tbsize += 2;
1091  } else if (all_bb_relation[j] < 2) { // long<->long
1092  type |= 02;
1093  ++ndbsize;
1094  tbsize += 2;
1095  } else if (all_bb_relation[j] >= 2) {
1096  int rth = rthost[mark[j]];
1097  if (rth == nrnmpi_myid) {
1098  if (i != nrnmpi_myid) {
1099  // this is the rthost
1100  // reducedtree->long
1101  type |= 010;
1102  // no node buffer since all
1103  // info <-> ReducedTree
1104  // ++ndbsize;
1105  tbsize += 2;
1106  if (all_bb_relation[j] > 2) {
1107  tbsize += 1;
1108  }
1109  }
1110  } else if (i == nrnmpi_myid) {
1111  // only this to the rthost
1112  // long->reducedtree
1113  nh1 += 1;
1114  // may not be distinct hosts
1115  // be careful not to count twice
1116  for (int jj = displ[i]; jj < j; ++jj) {
1117  if (rth == rthost[mark[jj]]) {
1118  nh1 -= 1;
1119  break;
1120  }
1121  }
1122  ++ndbsize;
1123  tbsize += 2; // off diag element
1124  if (all_bb_relation[j] > 2) {
1125  tbsize += 1;
1126  }
1127  }
1128  }
1129  }
1130  }
1131  nh = (type & 1) + ((type / 2) & 1) + ((type / 4) & 1) + ((type / 010) & 1);
1132  nthost_ += nh + nh1;
1133  // printf("%d type=%d %d %d %d %d %d nh=%d nthost_=%d\n", nrnmpi_myid, type, type&1,
1134  // (type/2)&1, (type/4)&1, (type/010)&1, (type/020)&1, nh, nthost_);
1135  }
1136  // printf("%d x nthost_=%d\n", nrnmpi_myid, nthost_);
1138  for (i = 0; i < nthost_; ++i) { // two of these needed before rest of fill
1139  msti_[i].nnode_rt_ = 0;
1140  msti_[i].nd_rt_index_ = 0;
1141  msti_[i].nd_rt_index_th_ = 0;
1142  msti_[i].offdiag_ = 0;
1143  msti_[i].ioffdiag_ = 0;
1144  msti_[i].rthost_ = -1;
1145  }
1146  if (ndbsize) { // can be 0 if rthost
1147  nodeindex_buffer_ = new int[ndbsize];
1148  nodeindex_buffer_th_ = new int[ndbsize];
1149  nodeindex_rthost_ = new int[ndbsize];
1150  }
1151  for (i = 0; i < ndbsize; ++i) {
1152  nodeindex_rthost_[i] = -1;
1153  }
1154  // printf("%d nthost_=%d ndbsize=%d tbsize=%d nrtree_=%d\n", nrnmpi_myid, nthost_, ndbsize,
1155  // tbsize, nrtree_); tbsize_=tbsize;
1156  if (tbsize) {
1157  trecvbuf_ = new double[tbsize];
1158  tsendbuf_ = new double[tbsize];
1159  }
1160  // We need to fill the msti_ array
1161  // in the order long<->short long<->reduced
1162  // followed by long<->long followed by reduced<->long
1163  // followed short<->long. // three distinct message tags
1164  nthost_ = 0;
1165  int mdisp = 0;
1166  k = 0;
1167 
1168  // first pass for the long<->short backbones
1169  for (i = 0; i < nrnmpi_numprocs; ++i) {
1170  int b = 0;
1171  for (j = displ[i]; j < displ[i + 1]; ++j) {
1172  if (mark[j] >= 0 && bb_relation[mark[j]] == 0 && all_bb_relation[j] == 1) {
1173  nodeindex_buffer_th_[k] = threadid[mark[j]];
1174  nodeindex_buffer_[k++] = inode[mark[j]];
1175  // printf("%d i=%d j=%d k=%d nthost=%d mark=%d inode=%d\n", nrnmpi_myid, i, j, k,
1176  // nthost_, mark[j], inode[mark[j]]);
1177  ++b;
1178  }
1179  }
1180  if (b) {
1181  msti_[nthost_].displ_ = mdisp;
1182  msti_[nthost_].size_ = 2 * b;
1183  mdisp += 2 * b;
1184  msti_[nthost_].host_ = i;
1185  msti_[nthost_].nnode_ = b;
1186  msti_[nthost_].tag_ = 1;
1187  ++nthost_;
1188  }
1189  }
1190 
1191  // second pass for the long<->reducedtree
1192  // includes the communication of sid <-> reduced tree for the
1193  // non rthost cpus
1194  // first make a list of the rthost values, the size of that
1195  // list will be how many msti we need for this pass.
1196  int* tmphost = new int[n]; // cannot be more than this.
1197  int ntmphost = 0;
1198  i = nrnmpi_myid;
1199  for (j = displ[i]; j < displ[i + 1]; ++j) {
1200  int j1 = mark[j];
1201  if (j1 >= 0 && bb_relation[j1] >= 2 && rthost[j1] != i) {
1202  tmphost[ntmphost++] = rthost[j1];
1203  for (int itmp = 1; itmp < ntmphost; ++itmp) {
1204  if (tmphost[itmp - 1] == rthost[j1]) {
1205  ntmphost--;
1206  break;
1207  }
1208  }
1209  }
1210  }
1211  // printf("%d second pass ntmphost = %d\n", nrnmpi_myid, ntmphost);
1212  // now we can loop over that list and fill the msti
1213  for (int itmp = 0; itmp < ntmphost; ++itmp) {
1214  i = nrnmpi_myid;
1215  int b = 0;
1216  int br = 0;
1217  // only the ones on this machine and rthost not this machine
1218  // Note: rthost generally has different sid value
1219  // and number of sid than this machine
1220  for (j = displ[i]; j < displ[i + 1]; ++j) {
1221  int jj = j - displ[i];
1222  int j1 = mark[j]; // point to sid[0] on this machine
1223  // skip if sid[1] of the rthost
1224  // printf("%d i=%d j=%d j1=%d bb_relation[j1]=%d rthost[j1]=%d\n",
1225  // nrnmpi_myid, i, j, j1, bb_relation[j1], rthost[j1]);
1226  if (j1 >= 0 && bb_relation[j1] >= 2 && rthost[j1] == tmphost[itmp]) { // only the first
1227  // has rthost
1228  // printf("%d sid=%d bb_relation=%d send to rthost=%d k=%d\n", nrnmpi_myid,
1229  // sid[j1], bb_relation[j1], rthost[j1], k);
1230  // mark[j] points to the index for sid0
1231  nodeindex_rthost_[k] = rthost[j1];
1232  nodeindex_buffer_th_[k] = threadid[j - displ[i]];
1233  nodeindex_buffer_[k++] = inode[j - displ[i]];
1234  ++b;
1235  // one or two?
1236  if (bb_relation[jj] > 2) {
1237  // int iii = i;
1238  {
1239  // except that msti_[nthost_].offdiag_ has not been allocated, this is a
1240  // good time to figure out the pointers to sid1A and sid1B and which is
1241  // which
1243  int i;
1244  int* ix;
1245  int* ixth;
1246  double** od;
1247  int* iod;
1248  // start by assuming 2 and then increment by 2
1249  if (m.offdiag_) {
1250  ix = m.nd_rt_index_;
1251  ixth = m.nd_rt_index_th_;
1252  od = m.offdiag_;
1253  iod = m.ioffdiag_;
1254  m.nd_rt_index_ = new int[m.nnode_rt_ + 2];
1255  m.nd_rt_index_th_ = new int[m.nnode_rt_ + 2];
1256  m.offdiag_ = new double*[m.nnode_rt_ + 2];
1257  m.ioffdiag_ = new int[m.nnode_rt_ + 2];
1258  for (i = 0; i < m.nnode_rt_; ++i) {
1259  m.nd_rt_index_[i] = ix[i];
1260  m.nd_rt_index_th_[i] = ixth[i];
1261  m.offdiag_[i] = od[i];
1262  m.ioffdiag_[i] = iod[i];
1263  }
1264  delete[] ix;
1265  delete[] ixth;
1266  delete[] od;
1267  delete[] iod;
1268  ix = m.nd_rt_index_ + m.nnode_rt_;
1269  ixth = m.nd_rt_index_th_;
1270  od = m.offdiag_ + m.nnode_rt_;
1271  iod = m.ioffdiag_ + m.nnode_rt_;
1272  } else {
1273  m.nd_rt_index_ = new int[2];
1274  m.nd_rt_index_th_ = new int[2];
1275  m.offdiag_ = new double*[2];
1276  m.ioffdiag_ = new int[2];
1277  ix = m.nd_rt_index_;
1278  ixth = m.nd_rt_index_th_;
1279  od = m.offdiag_;
1280  iod = m.ioffdiag_;
1281  }
1282  // now fill in
1283  // sid0 is for sid1A[0] , sid1 is for sid1B[???]
1284  // sid0: nodeindex = inode[j1]
1285  // in multisplit_v_setup, we created two parallel vectors of size
1286  // nbackrt_ (number of backbones sending to reduced tree).
1287  // backsid_ is sid0, and
1288  // backAindex_ corresponding sid1A index for sid0
1289  // backBindex_ sid1B index for sid1
1290  // Should not be too many backbones on this machine so linear
1291  // search should be ok. BUG again. i is MultiSplit.back_index and
1292  // can determine MultiSplit from j1.
1293  i = vec2ms[jj]->back_index;
1294  MultiSplitThread& t = mth_[vec2ms[jj]->ithread];
1295  assert(sid[jj] == t.backsid_[i]);
1296  ix[0] = inode[jj];
1297  ix[1] = inode[jj + 1];
1298  ixth[0] = vec2ms[jj]->ithread;
1299  ixth[1] = vec2ms[jj]->ithread;
1300  od[0] = t.sid1A + t.backAindex_[i];
1301  od[1] = t.sid1B + t.backBindex_[i];
1302  iod[0] = t.backAindex_[i];
1303  iod[1] = t.backBindex_[i];
1304 #if 0
1305 printf("%d offdiag nbrt=%d iii=%d i=%d j1=%d jj=%d sid=%d ix = %d %d back = %d %d\n",
1306 nrnmpi_myid, nbackrt_, iii, i, j1, jj, sid[j1], ix[0], ix[1], t.backAindex_[i], t.backBindex_[i]);
1307 #endif
1308  m.nnode_rt_ += 2;
1309  }
1310  nodeindex_rthost_[k] = rthost[j1];
1311  ++j;
1312  ++jj;
1313  nodeindex_buffer_th_[k] = threadid[jj];
1314  nodeindex_buffer_[k++] = inode[jj];
1315  ++b;
1316  br += 2;
1317  }
1318  }
1319  }
1320  if (b || br) {
1321  msti_[nthost_].displ_ = mdisp;
1322  msti_[nthost_].size_ = 2 * b + br;
1323  mdisp += 2 * b + br;
1324  msti_[nthost_].host_ = tmphost[itmp];
1325  msti_[nthost_].rthost_ = tmphost[itmp];
1326  msti_[nthost_].nnode_ = b;
1327  msti_[nthost_].tag_ = 3;
1328  ++nthost_;
1329  }
1330  }
1331  delete[] tmphost;
1332 
1333  // third pass for the long->long backbones
1334  for (i = 0; i < nrnmpi_numprocs; ++i) {
1335  int b = 0;
1336  for (j = displ[i]; j < displ[i + 1]; ++j) {
1337  if (mark[j] >= 0 && bb_relation[mark[j]] == 0 && all_bb_relation[j] == 0) {
1338  nodeindex_buffer_th_[k] = threadid[mark[j]];
1339  nodeindex_buffer_[k++] = inode[mark[j]];
1340  // printf("%d i=%d j=%d k=%d nthost=%d mark=%d inode=%d\n", nrnmpi_myid, i, j, k,
1341  // nthost_, mark[j], inode[mark[j]]);
1342  ++b;
1343  }
1344  }
1345  if (b) {
1346  msti_[nthost_].displ_ = mdisp;
1347  msti_[nthost_].size_ = 2 * b;
1348  mdisp += 2 * b;
1349  msti_[nthost_].host_ = i;
1350  msti_[nthost_].nnode_ = b;
1351  msti_[nthost_].tag_ = 2;
1352  ++nthost_;
1353  }
1354  }
1355 
1356  // fourth pass for the reducedtree->long backbones
1357  // the reduced tree <-> sids not on this host
1359  for (i = 0; i < nrnmpi_numprocs; ++i) {
1360  int b = 0;
1361  int br = 0;
1362  // not the ones on this machine and rthost this machine
1363  for (j = displ[i]; j < displ[i + 1]; ++j) {
1364  int j1 = mark[j];
1365  if (j1 >= 0 && all_bb_relation[j] >= 2 && rthost[j1] == nrnmpi_myid &&
1366  rthost[j1] != i) {
1367  // printf("%d i=%d sid=%d bb_relation=%d send to rthost=%d rt=%p\n", nrnmpi_myid, i,
1368  // allsid[j], all_bb_relation[j], rthost[j1], rt[j1]);
1369  // don't care about the nodeindex, only
1370  // the ReducedTree.
1371  // fill in the ReducedTree map with RHS and D pointers to the sid[0] related receive
1372  // buffer and next iteration the sid[1]
1373  int ib = mdisp + 2 * b;
1374  // printf("%d fill buf %d\n", nrnmpi_myid, ib);
1375  rt[j1]->fillrmap(allsid[j], -1, trecvbuf_ + ib + 1); // exchange order is d, rhs
1376  rt[j1]->fillrmap(allsid[j], allsid[j], trecvbuf_ + ib);
1377  rt[j1]->fillsmap(allsid[j], tsendbuf_ + ib + 1, tsendbuf_ + ib);
1378  ++b; // at least D and RHS received and sent back
1379  if (all_bb_relation[j] > 2) { // two of these
1380  // need to fill in off diag after this iteration is done and then re-iterate
1381  // because they are at the end of the buffer section for machine i.
1382  ++br; // an offdiag received
1383  // though I suppose it does not
1384  // need to be sent back
1385  }
1386  }
1387  }
1388  // reiterate and fill in off diag
1389  br = 0;
1390  for (j = displ[i]; j < displ[i + 1]; ++j) {
1391  int j1 = mark[j];
1392  if (j1 >= 0 && all_bb_relation[j] >= 2 && rthost[j1] == nrnmpi_myid &&
1393  rthost[j1] != i) {
1394  if (all_bb_relation[j] > 2) { // two of these
1395  int ib = mdisp + 2 * b + br;
1396  // printf("%d offdiag fill buf %d j1=%d rt=%p\n", nrnmpi_myid, ib, j1, rt[j1]);
1397  rt[j1]->fillrmap(allsid[j + 1], allsid[j], trecvbuf_ + ib);
1398  rt[j1]->fillrmap(allsid[j], allsid[j + 1], trecvbuf_ + ib + 1);
1399  br += 2;
1400  j += 1;
1401  }
1402  }
1403  }
1404 
1405  if (b || br) {
1406  msti_[nthost_].displ_ = mdisp;
1407  msti_[nthost_].size_ = 2 * b + br;
1408  mdisp += 2 * b + br;
1409  msti_[nthost_].host_ = i;
1411  msti_[nthost_].nnode_ = b;
1412  // no nnode_rt_ for this phase
1413  // msti_[nthost_].nnode_rt_ = br;
1414  // msti_[nthost_].offdiag_ = new double*[br];
1415  msti_[nthost_].tag_ = 3;
1416  ++nthost_;
1417  }
1418  }
1419 
1420  // fifth pass for the short->long backbones
1421  int tmp_index = k;
1423  for (i = 0; i < nrnmpi_numprocs; ++i) {
1424  int b = 0;
1425  for (j = displ[i]; j < displ[i + 1]; ++j) {
1426  if (mark[j] >= 0 && bb_relation[mark[j]] == 1 && all_bb_relation[j] == 0) {
1427  nodeindex_buffer_th_[k] = threadid[mark[j]];
1428  nodeindex_buffer_[k++] = inode[mark[j]];
1429  // printf("%d i=%d j=%d k=%d nthost=%d mark=%d inode=%d\n", nrnmpi_myid, i, j, k,
1430  // nthost_, mark[j], inode[mark[j]]);
1431  ++b;
1432  }
1433  }
1434  if (b) {
1435  msti_[nthost_].displ_ = mdisp;
1436  msti_[nthost_].size_ = 2 * b;
1437  mdisp += 2 * b;
1438  msti_[nthost_].host_ = i;
1439  msti_[nthost_].nnode_ = b;
1440  msti_[nthost_].tag_ = 1;
1441  ++nthost_;
1442  }
1443  }
1444 
1445 #if 0
1446 printf("%d nthost_ = %d\n", nrnmpi_myid, nthost_);
1447 for (i=0; i < nthost_; ++i) {
1449 printf("%d i=%d displ_=%d host_=%d nnode_=%d tag_=%d\n", nrnmpi_myid, i, m.displ_,
1450 m.host_, m.nnode_, m.tag_);
1451 }
1452 #endif
1453 
1454 #if 0
1455 for (i=0; i < k; ++i) {
1457 printf("%d %d nodeindex_buffer %d %s %d\n", nrnmpi_myid, i, j,
1458 secname(v_node[j]->sec), v_node[j]->sec_node_index_);
1459 }
1460 #endif
1461 
1462  // the sids on this machine with rthost on this machine
1463  // go into the ReducedTree directly
1464  if (nrtree_) {
1465  i = 0;
1466  int ib = 0;
1467  for (MultiSplit* ms: *multisplit_list_) {
1468  NrnThread* _nt = nrn_threads + ms->ithread;
1469  MultiSplitThread& t = mth_[ms->ithread];
1470  auto* const vec_d = _nt->node_d_storage();
1471  auto* const vec_rhs = _nt->node_rhs_storage();
1472  if (ms->rthost == nrnmpi_myid) {
1473  // printf("%d nrtree_=%d i=%d rt=%p\n", nrnmpi_myid, nrtree_, i, rt[i]);
1474  int j = ms->nd[0]->v_node_index;
1475  // printf("%d call fillrmap sid %d,%d %d node=%d\n", nrnmpi_myid, ms->sid[0],
1476  // ms->sid[0], j);
1477  ms->rt_ = rt[i];
1478  ms->rmap_index_ = rt[i]->irfill;
1479  ms->smap_index_ = rt[i]->nsmap;
1480  rt[i]->fillrmap(ms->sid[0], -1, &RHS(j));
1481  rt[i]->fillrmap(ms->sid[0], ms->sid[0], &D(j));
1482  rt[i]->fillsmap(ms->sid[0], &RHS(j), &D(j));
1483  if (ms->nd[1]) {
1484  ib = ms->back_index;
1485  assert(t.backsid_[ib] == ms->sid[0]);
1486  // fill sid1 row
1487  // note: for cvode need to keep fillrmap as pairs so following
1488  // moved from between sid1A,sid1B fillrmap.
1489  j = ms->nd[1]->v_node_index;
1490  // printf("%d call fillrmap sid1 %d %d node=%d\n", nrnmpi_myid, ms->sid[1],
1491  // ms->sid[1], j);
1492  rt[i]->fillrmap(ms->sid[1], -1, &RHS(j));
1493  rt[i]->fillrmap(ms->sid[1], ms->sid[1], &D(j));
1494  rt[i]->fillsmap(ms->sid[1], &RHS(j), &D(j));
1495 
1496  // fill sid1A for sid0 row
1497  // printf("%d call fillrmap sid %d,%d %d ib=%d t.backsid=%d t.backAindex=%d\n",
1498  // nrnmpi_myid, ms->sid[1], ms->sid[0], ib, t.backsid_[ib], t.backAindex_[ib]);
1499  rt[i]->fillrmap(ms->sid[1], ms->sid[0], t.sid1A + t.backAindex_[ib]);
1500 
1501  // fill sid1B
1502  // printf("%d call fillrmap sid %d,%d ib=%d backBindex=%d\n", nrnmpi_myid,
1503  // ms->sid[0], ms->sid[1], ib, backBindex_[ib]);
1504  rt[i]->fillrmap(ms->sid[0], ms->sid[1], t.sid1B + t.backBindex_[ib]);
1505  }
1506  }
1507  ++i;
1508  if (ms->nd[1]) {
1509  ++i;
1510  }
1511  }
1512  }
1513 
1514  if (nthost_) {
1517  for (i = 1; i < nthost_; ++i) {
1519  msti_[i].nodeindex_ = msti_[i - 1].nodeindex_ + msti_[i - 1].nnode_;
1520  }
1521  }
1522  // count how many nodeindex_buffer are non-zero area nodes
1523  // iarea_short_long_ and narea_ only for ones
1524  // not related to the ReducedTree method
1525  // printf("%d ndbsize=%d\n", nrnmpi_myid, ndbsize);
1526  for (i = 0; i < ndbsize; ++i) {
1528  Node* nd = _nt->_v_node[nodeindex_buffer_[i]];
1529  if (i == tmp_index) {
1531  }
1532  if (nd->_classical_parent && nd->sec_node_index_ < nd->sec->nnode - 1) {
1533  if (nodeindex_rthost_[i] < 0) {
1534  ++narea_;
1535  }
1536  }
1537  }
1538  // printf("%d narea=%d iarea_short_long=%d\n", nrnmpi_myid, narea_, iarea_short_long_);
1539  if (narea_) {
1540  buf_area_indices_ = new int[narea_];
1541  area_node_indices_ = new int[narea_];
1542  k = 0;
1543  for (i = 0; i < ndbsize; ++i) {
1545  Node* nd = _nt->_v_node[nodeindex_buffer_[i]];
1546  if (nd->_classical_parent && nd->sec_node_index_ < nd->sec->nnode - 1) {
1547  buf_area_indices_[k] = 2 * i;
1549  ++k;
1550  }
1551  }
1552  }
1553 
1554  // look through the nthost list and fill in the area2buf receive buffer
1555  // indices for info sent to reducetrees (by definition,not on this machine)
1556  for (i = 0; i < nthost_; ++i) {
1557  MultiSplitTransferInfo& msti = msti_[i];
1558  if (msti.tag_ == 3) { // reduced tree related
1559  // but we only want to, not from, the reduced tree machine
1560  // any nodes nonzero area?
1561  // printf("%d i=%d nnode=%d nodeindex=%p host=%d rthost=%d\n", nrnmpi_myid, i,
1562  // msti.nnode_, msti.nodeindex_, msti.host_, msti.rthost_);
1563  if (msti.rthost_ != nrnmpi_myid)
1564  for (j = 0; j < msti.nnode_; ++j) {
1565  NrnThread* _nt = nrn_threads + msti.nodeindex_th_[j];
1566  int in = msti.nodeindex_[j];
1567  Node* nd = _nt->_v_node[in];
1568  if (nd->_classical_parent && nd->sec_node_index_ < nd->sec->nnode - 1) {
1569  // non-zero area
1570  // search the area2buf list
1571  for (k = 0; k < narea2buf_; ++k) {
1572  if (area2buf_[k].inode == in && area2buf_[k].ms->ithread == _nt->id) {
1573  break;
1574  }
1575  }
1576  assert(k < narea2buf_);
1577  Area2Buf& ab = area2buf_[k];
1578  // ok. fill in the ibuf info
1579  ab.ibuf[0] = msti.displ_ + 2 * j;
1580  ab.ibuf[1] = ab.ibuf[0] + 1;
1581  if (ab.n == 3) {
1582  // which offdiag item?
1583  int ioff;
1584  for (ioff = 0; ioff < msti.nnode_rt_; ++ioff) {
1585  if (msti.nd_rt_index_[ioff] == in &&
1586  msti.nd_rt_index_th_[ioff] == _nt->id) {
1587  break;
1588  }
1589  }
1590  assert(ioff < msti.nnode_rt_);
1591  ab.ibuf[2] = msti.displ_ + 2 * msti.nnode_ + ioff;
1592  }
1593 #if 0
1594 printf("%d in=%d n=%d ibuf %d %d %d\n", nrnmpi_myid, in, ab.n,
1595 ab.ibuf[0], ab.ibuf[1], (ab.n == 3) ? ab.ibuf[2] : -1);
1596 #endif
1597  }
1598  }
1599  }
1600  }
1601 
1602  // printf("%d leave exchange_setup\n", nrnmpi_myid);
1603  // nrnmpi_int_allgather(&n, nn, 1);
1604 
1605  delete[] nn;
1606  delete[] sid;
1607  delete[] inode;
1608  delete[] threadid;
1609  delete[] displ;
1610  delete[] allsid;
1611  delete[] vec2ms;
1612  delete[] mark;
1613  delete[] bb_relation;
1614  delete[] all_bb_relation;
1615  delete[] connects2short;
1616  delete[] rthost;
1617  delete[] rt;
1618 
1619  errno = 0;
1620 }
1621 
1622 // when d and rhs pointers are updated in the Node, they must be updated in
1623 // the reducedtree rmap and smap as well
1625  msc_->rt_map_update();
1626 }
1627 
1629  for (MultiSplit* ms: *multisplit_list_) {
1630  if (ms->rthost == nrnmpi_myid) { // reduced tree on this host
1631  assert(ms->rt_);
1632  assert(ms->rmap_index_ >= 0);
1633  assert(ms->smap_index_ >= 0);
1634  MultiSplitThread& t = mth_[ms->ithread];
1635  double** r = ms->rt_->rmap + ms->rmap_index_;
1636  double** s = ms->rt_->smap + ms->smap_index_;
1637  for (int j = 0; j < 2; ++j)
1638  if (ms->nd[j]) {
1639  *s++ = *r++ = &NODERHS(ms->nd[j]);
1640  *s++ = *r++ = &NODED(ms->nd[j]);
1641  }
1642  if (ms->nd[1]) { // do sid1a and sid1b as well
1643  assert(ms->back_index >= 0);
1644  *r++ = t.sid1A + t.backAindex_[ms->back_index];
1645  *r++ = t.sid1B + t.backBindex_[ms->back_index];
1646  }
1647  }
1648  }
1649  // also need to do the Area2RT.pd[3] where the
1650  // order is d, rhs, and possibly sid1A or sid1B
1651  for (int i = 0; i < narea2rt_; ++i) {
1652  Area2RT& art = area2rt_[i];
1653  MultiSplit& ms = *art.ms;
1654  NrnThread* _nt = nrn_threads + ms.ithread;
1655  art.pd[0] = _nt->node_d_storage() + art.inode;
1656  art.pd[1] = _nt->node_rhs_storage() + art.inode;
1657  if (art.n == 3) {
1658  MultiSplitThread& t = mth_[ms.ithread];
1659  if (art.inode == ms.nd[0]->v_node_index) {
1660  art.pd[2] = t.sid1A + t.backAindex_[ms.back_index];
1661  } else if (art.inode == ms.nd[1]->v_node_index) {
1662  art.pd[2] = t.sid1B + t.backBindex_[ms.back_index];
1663  } else {
1664  assert(0);
1665  }
1666  }
1667  }
1668 }
1669 
1671  int i, j, k, id;
1672  id = nrnmpi_myid;
1673  // assume only one thread when there is transfer info
1674  NrnThread* nt = nrn_threads;
1675  Printf("%d nthost_=%d\n", id, nthost_);
1676  for (i = 0; i < nthost_; ++i) {
1678  Printf("%d %d host=%d nnode=%d displ=%d\n", id, i, ms.host_, ms.nnode_, ms.displ_);
1679  for (j = 0; j < ms.nnode_; ++j) {
1680  k = ms.nodeindex_[j];
1681  Printf("%d %d %d %d %s %d\n",
1682  id,
1683  i,
1684  j,
1685  k,
1686  secname(nt->_v_node[k]->sec),
1687  nt->_v_node[k]->sec_node_index_);
1688  }
1689  }
1690 }
1691 
1693  int sid,
1694  int nt,
1695  int* mark,
1696  int* allsid,
1697  int* all_bb_relation) {
1698  int i;
1699  for (i = 0; i < nt; ++i) {
1700  if (mark[i] == -1 && allsid[i] == sid) {
1701  mark[i] = m;
1702  if (all_bb_relation[i] > 2) {
1703  int sid2 = all_bb_relation[i] - 3;
1704  reduced_mark(m, sid2, nt, mark, allsid, all_bb_relation);
1705  }
1706  }
1707  }
1708 }
1709 
1710 /*
1711 section ParallelContext.multisplit_connect(x, sid) is a generalization of
1712 root ParallelContext.splitcell_connect(adjacent_host).
1713 sid refers to a point and all points with the same sid are connected
1714 together with 0 resistance wires.
1715 If a connected set of sections has only 1 sid then that point will be the
1716 root and triangularization can proceed normally.
1717 If a connected set of sections has 2 sid then triangularization proceeds
1718 from the leaves to the backbone between the 2 sid which forms a cable.
1719 Then special triangularization with fill-in of the sid1A column
1720 proceeds from the backbone element adjacent to sid1 and then with fill-in
1721 of the sid1B column (which modifies the sid1A column) from the backbone end
1722 adjacent to sid0
1723 in the hope that the fill-in elements at the sid are small enough
 1724 to ignore in the interprocessor Jacobian element matrix exchange. Note
 1725 that Gaussian elimination in the backbone is more complex than normal
 1726 tree elimination with the same number of nodes.
1727 We defer the implementation of more than 2 sid in a subtree.
1728 */
1729 
1730 /* Gaussian elimination strategy for the backbone.
1731 There are two different strategies that optimize either minimum
1732 number of matrix operations that is best for long backbones, or
1733 numerical stability by minimizing the far off diagonal just before
1734 exchange which would be best for short (tightly coupled) backbones.
1735 
1736 For long backbones, it is simplest to begin triangularization
1737 using the equations adjacent to the center as pivot equations.
1738 This fills in a single column. But in the trivial case of the center
1739 being the only interior point, there are no triangularization operations
1740 before exchange.
1741 1 xx
1742 2 xxx
1743 3 xxx
1744 4 xxx
1745 5 xxx
1746 6 xxx
1747 7 xx
1748 
1749 
1750 1 x x
1751 2 xx x
1752 3 xxx
1753 4 xxx
1754 5 xxx
1755 6 x xx
1756 7 x x
1757 
1758 exchange occurs
1759 
1760 1 x x
1761 2 x x
1762 3 xx
1763 4 x
1764 5 xx
1765 6 x x
1766 7 x x
1767 
1768 For short backbones and greatest stability/accuracy and still simple but
1769 many more operations, one begins adjacent to the sids. Now,
1770 in the trivial case of the center being the only interior point, it is
1771 used to reduce the off diagonal element. If there is no interior point
1772 nothing happens.
1773 
1774 1 xx
1775 2 xxx triangularize starting with equation 6 (cannot start with
1776 3 xxx equation 1 or 7 because we do not know the correct value for d)
1777 4 xxx this eliminates the a elements but at the cost of introducing
1778 5 xxx A elements in column 7
1779 6 xxx
1780 7 xx
1781 
1782 1 x x cannot use 1, instead do the same starting at 2
1783 2 xx x this eliminates b elements but at the cost of introducing
1784 3 xx x B elements in column 1. Also A elements are modified.
1785 4 xx x
1786 5 xx x
1787 6 xxx
1788 7 xx
1789 
1790 1 x x
1791 2 xx x
1792 3 x x x
1793 4 x x x
1794 5 x x x
1795 6 x xx
1796 7 x x
1797 
 1798 exchange occurs (modified d for 1 and 7)
1799 solve the 1 and 7 2x2 equations then back substitute
1800 
1801 */
1802 
1804  int id, i, it;
1805  for (id = 0; id < nrnmpi_numprocs; ++id) {
1806  nrnmpi_barrier();
1807  if (id == nrnmpi_myid) {
1808  Printf("myid=%d\n", id);
1809  Printf(" MultiSplit %ld\n", multisplit_list_->size());
1810  for (i = 0; i < multisplit_list_->size(); ++i) {
1811  MultiSplit* ms = (*multisplit_list_)[i];
1812  Printf(" %2d bbs=%d bi=%-2d rthost=%-4d %-4d %s{%d}",
1813  i,
1814  ms->backbone_style,
1815  ms->back_index,
1816  ms->rthost,
1817  ms->sid[0],
1818  secname(ms->nd[0]->sec),
1819  ms->nd[0]->sec_node_index_);
1820  if (ms->nd[1]) {
1821  Printf(" %-4d %s{%d}",
1822  ms->sid[1],
1823  secname(ms->nd[1]->sec),
1824  ms->nd[1]->sec_node_index_);
1825  }
1826  Printf("\n");
1827  }
1828  for (it = 0; it < nrn_nthread; ++it) {
1829  NrnThread* _nt = nrn_threads + it;
1830  MultiSplitThread& t = mth_[it];
1831  Printf(" backbone_begin=%d backbone_long_begin=%d backbone_interior_begin=%d\n",
1832  t.backbone_begin,
1833  t.backbone_long_begin,
1834  t.backbone_interior_begin);
1835  Printf(" backbone_sid1_begin=%d backbone_long_sid1_begin=%d backbone_end=%d\n",
1836  t.backbone_sid1_begin,
1837  t.backbone_long_sid1_begin,
1838  t.backbone_end);
1839  Printf(" nbackrt_=%d i, backsid_[i], backAindex_[i], backBindex_[i]\n",
1840  t.nbackrt_);
1841  if (t.nbackrt_) {
1842  for (int i = 0; i < t.nbackrt_; ++i) {
1843  Printf(" %2d %2d %5d %5d",
1844  i,
1845  t.backsid_[i],
1846  t.backAindex_[i],
1847  t.backBindex_[i]);
1848  Node* nd = _nt->_v_node[t.backbone_begin + t.backAindex_[i]];
1849  Printf(" %s{%d}", secname(nd->sec), nd->sec_node_index_);
1850  nd = _nt->_v_node[t.backbone_begin + t.backBindex_[i]];
1851  Printf(" %s{%d}", secname(nd->sec), nd->sec_node_index_);
1852  Printf("\n");
1853  }
1854  }
1855  }
1856  Printf(" ReducedTree %d\n", nrtree_);
1857  for (i = 0; i < nrtree_; ++i) {
1858  ReducedTree* rt = rtree_[i];
1859  Printf(" %d n=%d nmap=%d\n", i, rt->n, rt->nmap);
1860  rt->pr_map(tbsize, trecvbuf_);
1861  }
1862  Printf(" MultiSplitTransferInfo %d\n", nthost_);
1863  for (i = 0; i < nthost_; ++i) {
1865  Printf(" %d host=%d rthost=%d nnode=%d nnode_rt=%d size=%d tag=%d\n",
1866  i,
1867  m.host_,
1868  m.rthost_,
1869  m.nnode_,
1870  m.nnode_rt_,
1871  m.size_,
1872  m.tag_);
1873  if (m.nnode_) {
1874  Printf(" nodeindex=%p nodeindex_buffer = %p\n",
1875  fmt::ptr(m.nodeindex_),
1876  fmt::ptr(nodeindex_buffer_));
1877  }
1878  }
1879  Printf(" ndbsize=%d i nodeindex_buffer_=%p nodeindex_rthost_=%p\n",
1880  ndbsize,
1881  fmt::ptr(nodeindex_buffer_),
1882  fmt::ptr(nodeindex_rthost_));
1883  if (ndbsize) {
1884  for (int i = 0; i < ndbsize; ++i) {
1885  Printf(" %d %d %d\n", i, nodeindex_buffer_[i], nodeindex_rthost_[i]);
1886  }
1887  }
1888  Printf(" tbsize=%d trecvbuf_=%p tsendbuf_=%p\n",
1889  tbsize,
1890  fmt::ptr(trecvbuf_),
1891  fmt::ptr(tsendbuf_));
1892  Printf("\n");
1893  }
1894  }
1895  nrnmpi_barrier();
1896 }
1897 
1898 // following uses the short backbone method (N form of the matrix)
1899 
1900 
1901 // What is a good order for the above? We already have a tree order structure
1902 // with a root. If there is a single sid involved then that should be the
1903 // root and we are conceptually the same as splitcell.cpp
 1904  // For two sids, the tree is ordered as described above (see the comment at the NOTE)
1905 
1907  msc_->solve();
1908 }
1909 
1911  msc_->mth_[nt->id].triang(nt);
1912  return nullptr;
1913 }
1915  if (nt->id == 0) {
1916  msc_->reduce_solve();
1917  }
1918  return nullptr;
1919 }
1921  msc_->mth_[nt->id].bksub(nt);
1922  return nullptr;
1923 }
1924 
    // Body of the multisplit solve step (signature line lost in extraction;
    // presumably MultiSplitControl::solve -- confirm against upstream source,
    // cf. the msc_->solve() trampoline above).
    // Triangularize subtrees onto the backbone, exchange backbone rows
    // between hosts, then back-substitute.  Only mth_[0] is used here.
    // if (t < .025) prstruct();
    // if (nrnmpi_myid == 0) pmat(true);
    NrnThread* nt = nrn_threads;
    MultiSplitThread& t = mth_[0];
    // eliminate subtree rows into their parents, leaving a tridiagonal backbone
    t.triang_subtree2backbone(nt);
    // reduce the backbone to the N-shaped form (D, RHS, sid1A/sid1B columns)
    t.triang_backbone(nt);
    // if (nrnmpi_myid == 4) pmat(true);
    // pmat1("t");
    // printf("%d enter matrix exchange\n", nrnmpi_myid);
    // MPI exchange of D/RHS (and off-diagonal) entries for shared-sid nodes
    matrix_exchange();
    // printf("%d leave matrix exchange\n", nrnmpi_myid);
    // pmat1("e");
    t.bksub_backbone(nt);
    t.bksub_subtrees(nt);
    // pmat(true);
    // nrnmpi_barrier(); // only possible if everyone is actually multisplit
    // abort();
}
1944 
1947  triang_backbone(_nt);
1948 }
1950  matrix_exchange();
1951 }
1953  bksub_backbone(_nt);
1954  bksub_subtrees(_nt);
1955 }
1956 
1957 // In the typical case, all nodes connected to the same sids have 0 area
1958 // and thus the weighted average voltage is just sum RHS / sum D.
1959 // However, nrn_multisplit_nocap_v is somewhat of a misnomer because one or
1960 // more compartments (but not connected together with the same sid) may have
1961 // non-zero area and thus that voltage is correct and must be transferred
1962 // to all the connecting (with same sid) zero-area nodes. Furthermore
1963 // the sum RHS (of the zero area nodes) must be added to the RHS of non-zero
1964 // area node (after that non-zero RHS has been computed back in the caller).
1965 // Lastly, that sum of RHS has to be adjusted using the sum of D of the
1966 // zero area nodes so that the RHS is proper with respect to the correct
1967 // value of V.
1968 // So. When there is no non-zero area node for a sid, the voltage is
1969 // sum rhs / sum d. For a non-zero area node the V must be sent and the
1970 // sum rhs and sum d from the others must be received and the true rhs
1971 // put into adjust_rhs_ using rhs - d*v. For a zero-area
1972 // node (with same sid as non-zero area node), rhs and d must be sent, and
1973 // voltage can be computed after receiving the changed rhs and d by
1974 // rhs/d.
1975 
1980 }
1983 }
1986 }
1989 }
1990 
    // Pre-exchange encoding for the no-capacitance voltage averaging step
    // (signature line lost in extraction -- confirm against upstream source).
    // Encodes the already-correct voltage of non-zero-area nodes as
    // D = 1e50, RHS = v * 1e50 so receivers can recognize them (see
    // ReducedTree::nocap, which decodes the 1e50 sentinel).
    int i;
    // scale the non-zero area node elements since those already
    // have the answer.

    // the problem here is that non-zero area v is correct,
    // but rhs ends up wrong for
    // non-zero area nodes (because current from zero area not added)
    // so encode v into D and sum of zero-area rhs will end up in
    // rhs.
    auto* const vec_d = _nt->node_d_storage();
    auto* const vec_rhs = _nt->node_rhs_storage();
    auto* const vec_v = _nt->node_voltage_storage();
    if (_nt->id == 0)
        for (i = 0; i < narea2buf_; ++i) {
            Area2Buf& ab = area2buf_[i];
            vec_d[ab.inode] = 1e50;  // sentinel
            vec_rhs[ab.inode] = vec_v[ab.inode] * 1e50;
        }
    // also scale the non-zero area elements on this host
    for (i = 0; i < narea2rt_; ++i) {
        Area2RT& ar = area2rt_[i];
        if (_nt->id == ar.ms->ithread) {
            vec_d[ar.inode] = 1e50;
            vec_rhs[ar.inode] = vec_v[ar.inode] * 1e50;
        }
    }
}
2020  if (_nt->id == 0) {
2022  }
2023 }
    // Post-exchange adjustment computation (signature line lost in
    // extraction -- confirm against upstream source): derive adjust_rhs_ for
    // each non-zero-area node from the summed zero-area d and rhs that were
    // just received; afac inverts the 0.01*area scaling applied on the
    // sending side (see matrix_exchange).
    // at this point, for zero area nodes, D is
    // 1.0, and the voltage is RHS/D).
    // So zero-area node information is fine.
    // But for non-zero area nodes, D is the sum of all zero-area
    // node d, and RHS is the sum of all zero-area node rhs.
    int i;
    auto* const vec_area = _nt->node_area_storage();
    auto* const vec_d = _nt->node_d_storage();
    auto* const vec_rhs = _nt->node_rhs_storage();
    auto* const vec_v = _nt->node_voltage_storage();
    if (_nt->id == 0)
        for (i = 0; i < narea2buf_; ++i) {
            Area2Buf& ab = area2buf_[i];
            int j = ab.inode;
            double afac = 100. / vec_area[j];
            // rhs - d*v: remove the contribution already accounted for by
            // the correct voltage v (see the block comment above this group)
            ab.adjust_rhs_ = (vec_rhs[j] - vec_d[j] * vec_v[j]) * afac;
            // printf("%d nz1 %d D=%g RHS=%g V=%g afac=%g adjust=%g\n",
            // nrnmpi_myid, i, D(i), RHS(i), vec_v[j], afac, ab.adjust_rhs_);
        }
    for (i = 0; i < narea2rt_; ++i) {
        Area2RT& ar = area2rt_[i];
        if (_nt->id == ar.ms->ithread) {
            int j = ar.inode;
            double afac = 100. / vec_area[j];
            ar.adjust_rhs_ = (vec_rhs[j] - vec_d[j] * vec_v[j]) * afac;
            // printf("%d nz2 %d D=%g RHS=%g V=%g afac=%g adjust=%g\n",
            // nrnmpi_myid, i, D(i), RHS(i), vec_v[j], afac, ar.adjust_rhs_);
        }
    }
}
2055 
2058 }
2059 
    // Final pass (signature line lost in extraction -- confirm against
    // upstream source): add the previously computed adjust_rhs_ corrections
    // into the rhs of the non-zero-area nodes.
    int i;
    auto* const vec_rhs = _nt->node_rhs_storage();
    if (_nt->id == 0)
        for (i = 0; i < narea2buf_; ++i) {
            Area2Buf& ab = area2buf_[i];
            vec_rhs[ab.inode] += ab.adjust_rhs_;
        }
    // also scale the non-zero area elements on this host
    for (i = 0; i < narea2rt_; ++i) {
        Area2RT& ar = area2rt_[i];
        if (_nt->id == ar.ms->ithread) {
            // printf("%d adjust %d %g %g\n",
            // nrnmpi_myid, ar.inode, ar.adjust_rhs_, VEC_RHS(ar.inode));
            vec_rhs[ar.inode] += ar.adjust_rhs_;
        }
    }
}
2078 
2079 // Two phases: Send all the info from long backbones and receive
2080 // all the short backbone info for this machine.
2081 // Then process the 2x2 short backbone and send remaining (short)
2082 // backbone and receive the rest (long) of the backbone info.
2083 
2084 // The ReducedTree solve fits into this strategy receiving from long at the
2085 // same time as receiving from short and
2086 // sending to long at the same time as sending from short
2087 
2088 // The MultiSplitTransferInfo order is
2089 // this -> other
2090 // long -> short 0, ihost_long_long-1
2091 // long -> long ihost_long_long, ihost_long_reduced-1
2092 // long -> reduced ihost_long_reduced, ihost_reduced_long-1
2093 // reduced -> long ihost_reduced_long, ihost_short_long-1
2094 // short -> long ihost_short_long, nthost_
2095 
2097  int i, j, jj, k;
2098  double* tbuf;
2099  double rttime;
2100  double wt = nrnmpi_wtime();
2101  NrnThread* _nt;
2102  // the mpi strategy is copied from the
2103  // cvode/examples_par/pvkxb.cpp exchange strategy
2104 
2105 #define EXCHANGE_ON 1
2106 #if EXCHANGE_ON
2107  // post all the receives
2108  for (i = 0; i < nthost_; ++i) {
2109  int tag;
2111  tag = mt.tag_;
2112  // receiving the result from a reduced tree is tag 4.
2113  if (tag == 3 && nrnmpi_myid != mt.rthost_) {
2114  tag = 4;
2115  }
2116  nrnmpi_postrecv_doubles(trecvbuf_ + mt.displ_, mt.size_, mt.host_, tag, &mt.request_);
2117 #if 0
2118 printf("%d post receive %d displ=%d size=%d host=%d tag=%d\n",
2119 nrnmpi_myid, i, mt.displ_, mt.size_, mt.host_, tag);
2120 #endif
2121  }
2122 
2123  // Note that we are assuming only one thread (_nt) when
2124  // mpi is used. (In order for D and RHS to make sense)
2125  // fill the send buffer with the long backbone info
2126  // i.e. long->short, long -> long, long -> reduced
2127  for (i = 0; i < ihost_reduced_long_; ++i) {
2129  j = 0;
2130  tbuf = tsendbuf_ + mt.displ_;
2131  for (jj = 0; jj < mt.nnode_; ++jj) {
2132  k = mt.nodeindex_[jj];
2133  _nt = nrn_threads + mt.nodeindex_th_[jj];
2134  tbuf[j++] = _nt->actual_d(k);
2135  tbuf[j++] = _nt->actual_rhs(k);
2136  }
2137  // each sent backbone will have added 2 to mt.nnode_rt_
2138  for (jj = 0; jj < mt.nnode_rt_; ++jj) {
2139  tbuf[j++] = *mt.offdiag_[jj];
2140  }
2141 #if 0
2142 //if (nrnmpi_myid == 4) {
2143 printf("%d send to %d nnode=%d nnode_rt=%d size=%d tag=%d\n",
2144 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_);
2145 //}
2146 #endif
2147 #if 0
2148 //if (nrnmpi_myid == 4) {
2149  for (j = 0; j < mt.nnode_; ++j) {
2150 printf("%d send to %d tbuf[%d] = %g tbuf[%d] = %g from node %d\n",
2151 nrnmpi_myid, mt.host_, 2*j, tbuf[2*j], 2*j+1, tbuf[2*j+1], mt.nodeindex_[j]);
2152  }
2153  for (j = 0; j < mt.nnode_rt_; ++j) {
2154  int jj = 2*mt.nnode_ + j;
2155 printf("%d send to %d offdiag tbuf[%d] = %g\n",
2156 nrnmpi_myid, mt.host_, jj, tbuf[jj]);
2157  }
2158 //)
2159 #endif
2160  }
2161 
2162  // adjust area for any D, RHS, sid1A, sid1B going to a ReducedTree host
2163  for (i = 0; i < narea2buf_; ++i) {
2164  Area2Buf& ab = area2buf_[i];
2165  _nt = nrn_threads + ab.ms->ithread;
2166  double afac = 0.01 * _nt->node_area_storage()[ab.inode];
2167  tbuf = tsendbuf_;
2168  for (j = 0; j < ab.n; ++j) {
2169  tbuf[ab.ibuf[j]] *= afac;
2170 #if 0
2171 printf("%d area2buf * afac=%g i=%d j=%d node=%d ibuf=%d buf=%g\n", nrnmpi_myid, afac,
2172 i, j, ab.inode, ab.ibuf[j], tbuf[ab.ibuf[j]]);
2173 #endif
2174  }
2175  }
2176 
2177  // send long backbone info (long->short, long->long)
2178  for (i = 0; i < ihost_reduced_long_; ++i) {
2180  tbuf = tsendbuf_ + mt.displ_;
2181  nrnmpi_send_doubles(tbuf, mt.size_, mt.host_, mt.tag_);
2182 #if 0
2183 printf("%d post send %d displ=%d size=%d host=%d tag=%d\n",
2184 nrnmpi_myid, i, mt.displ_, mt.size_, mt.host_, mt.tag_);
2185 #endif
2186  }
2187 
2188 #if 0
2189  for (i=ihost_reduced_long_; i < nthost_; ++i) {
2191 printf("%d will send result to %d size=%d tag=%d\n", nrnmpi_myid, mt.host_, mt.size_, mt.tag_);
2192  }
2193 #endif
2194  // handle the reduced trees and short backbones
2195 
2196  // wait for receives from the long backbones needed by reduced tree
2197  // and short backbones to complete (reduced <- long, short <- long)
2198  for (i = ihost_reduced_long_; i < nthost_; ++i) {
2200 #if 0
2201 printf("%d wait one %d\n", nrnmpi_myid, mt.host_);
2202 #endif
2203  nrnmpi_wait(&mt.request_);
2204 #if 0
2205 printf("%d receive from %d nnode=%d nnode_rt=%d size=%d tag=%d displ=%d\n",
2206 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_, mt.displ_);
2207 //for (j=0; j < mt.size_; ++j) { printf("%d receive tbuf[%d]=%g\n", nrnmpi_myid, j, trecvbuf_[mt.displ_ + j]);}
2208 #endif
2209  }
2210 #if 0
2211 for (i=0; i < tbsize_; ++i) { printf("%d trecvbuf[%d] = %g\n", nrnmpi_myid, i, trecvbuf_[i]);}
2212 #endif
2213 
2214  // measure reducedtree,short backbone computation time
2215  rttime = nrnmpi_wtime();
2216 
2217  // adjust area in place for any D, RHS, sid1A, sid1B on this host
2218  // going to ReducedTree on this host
2219  // if (narea2rt_) printf("%d adjust area in place\n", nrnmpi_myid);
2220  for (i = 0; i < narea2rt_; ++i) {
2221  Area2RT& ar = area2rt_[i];
2222  NrnThread* _nt = nrn_threads + ar.ms->ithread;
2223  double afac = 0.01 * _nt->node_area_storage()[ar.inode];
2224  for (j = 0; j < ar.n; ++j) {
2225  *ar.pd[j] *= afac;
2226  }
2227  }
2228 #endif // EXCHANGE_ON
2229 
2230  for (i = 0; i < nrtree_; ++i) {
2231  rtree_[i]->solve();
2232  }
2233 
2234 #if EXCHANGE_ON
2235  // measure reducedtree,short backbone computation time
2236  nrnmpi_rtcomp_time_ += nrnmpi_wtime() - rttime;
2237 
2238  // send reduced and short backbone info (reduced -> long, short -> long)
2239  for (i = ihost_reduced_long_; i < nthost_; ++i) {
2240  int tag;
2242  tbuf = tsendbuf_ + mt.displ_;
2243  // printf("%d send result to %d size=%d tag=%d\n", nrnmpi_myid, mt.host_, mt.size_,
2244  // mt.tag_);
2245  tag = mt.tag_;
2246  // sending result from reduced tree means tag is 4
2247  if (tag == 3) {
2248  tag = 4;
2249  }
2250  nrnmpi_send_doubles(tbuf, mt.size_, mt.host_, tag);
2251  // only moderately wasteful (50%) in sending back irrelevant
2252  // off diag info. Required is only 2*mt.nnode_. But these
2253  // messages are pretty short, anyway.
2254  }
2255 
2256  // handle the long backbones
2257 
2258  // wait for all remaining receives to complete
2259  // long <- short, long <- long, long <- reduced
2260  for (i = 0; i < ihost_reduced_long_; ++i) {
2262 #if 0
2263 printf("%d wait two %d\n", nrnmpi_myid, mt.host_);
2264 #endif
2265  nrnmpi_wait(&mt.request_);
2266 #if 0
2267 printf("%d receive from %d nnode=%d nnode_rt=%d size=%d tag=%d\n",
2268 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_);
2269 #endif
2270  }
2271 
2272  // add to matrix (long <- long)
2273  // and also (long <- short) even though in principle should
2274  // not add those since already solved on short backbone machine
2275  // we can add since the ones sent were scaled by 1e30 which
2276  // will eliminate the effect of existing D, RHS, and S1A or S1B
2277  // as well as making the possible area scaling irrelevant
2278  // being able to handle them together means we do not need
2279  // 0 to ihost_long_long - 1 to refer to long <-> short
2280  for (i = 0; i < ihost_reduced_long_; ++i) {
2282  j = 0;
2283  tbuf = trecvbuf_ + mt.displ_;
2284  for (jj = 0; jj < mt.nnode_; ++jj) {
2285  k = mt.nodeindex_[jj];
2286  _nt = nrn_threads + mt.nodeindex_th_[jj];
2287  _nt->actual_d(k) += tbuf[j++];
2288  _nt->actual_rhs(k) += tbuf[j++];
2289  }
2290 #if 0
2291 if (nrnmpi_myid == 4) {
2292  for (j = 0; j < mt.nnode_; ++j) {
2293 printf("%d received from %d tbuf[%d] = %g tbuf[%d] = %g added to node %d\n",
2294 nrnmpi_myid, mt.host_, 2*j, tbuf[2*j], 2*j+1, tbuf[2*j+1], mt.nodeindex_[j]);
2295  }
2296 }
2297 #endif
2298  }
2299 #endif // EXCHANGE_ON
2300 
2301 #if NRNMPI
2303 #endif
2304  errno = 0;
2305 }
2306 
2307 void MultiSplitControl::matrix_exchange_nocap() { // copy of matrix_exchange() and modified
2308  int i, j, jj, k;
2309  double* tbuf;
2310  double rttime;
2311  double wt = nrnmpi_wtime();
2312  NrnThread* _nt;
2313  // the mpi strategy is copied from the
2314  // cvode/examples_par/pvkxb.cpp exchange strategy
2315 
2316 #define EXCHANGE_ON 1
2317 #if EXCHANGE_ON
2318  // post all the receives
2319  for (i = 0; i < nthost_; ++i) {
2320  int tag;
2322  tag = mt.tag_;
2323  // receiving the result from a reduced tree is tag 4.
2324  if (tag == 3 && nrnmpi_myid != mt.rthost_) {
2325  tag = 4;
2326  }
2327  nrnmpi_postrecv_doubles(trecvbuf_ + mt.displ_, mt.size_, mt.host_, tag, &mt.request_);
2328 #if 0
2329 printf("%d post receive %d displ=%d size=%d host=%d tag=%d\n",
2330 nrnmpi_myid, i, mt.displ_, mt.size_, mt.host_, tag);
2331 #endif
2332  }
2333 
2334  // fill the send buffer with the long backbone info
2335  // i.e. long->short, long -> long, long -> reduced
2336  for (i = 0; i < ihost_reduced_long_; ++i) {
2338  j = 0;
2339  tbuf = tsendbuf_ + mt.displ_;
2340  for (jj = 0; jj < mt.nnode_; ++jj) {
2341  k = mt.nodeindex_[jj];
2342  _nt = nrn_threads + mt.nodeindex_th_[jj];
2343  tbuf[j++] = _nt->actual_d(k);
2344  tbuf[j++] = _nt->actual_rhs(k);
2345  }
2346  // each sent backbone will have added 2 to mt.nnode_rt_
2347  for (jj = 0; jj < mt.nnode_rt_; ++jj) {
2348  tbuf[j++] = *mt.offdiag_[jj];
2349  }
2350 #if 0
2351 //if (nrnmpi_myid == 4) {
2352 printf("%d send to %d nnode=%d nnode_rt=%d size=%d tag=%d\n",
2353 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_);
2354 //}
2355 #endif
2356 #if 0
2357 //if (nrnmpi_myid == 4) {
2358  for (j = 0; j < mt.nnode_; ++j) {
2359 printf("%d send to %d tbuf[%d] = %g tbuf[%d] = %g from node %d\n",
2360 nrnmpi_myid, mt.host_, 2*j, tbuf[2*j], 2*j+1, tbuf[2*j+1], mt.nodeindex_[j]);
2361  }
2362  for (j = 0; j < mt.nnode_rt_; ++j) {
2363  int jj = 2*mt.nnode_ + j;
2364 printf("%d send to %d offdiag tbuf[%d] = %g\n",
2365 nrnmpi_myid, mt.host_, jj, tbuf[jj]);
2366  }
2367 //}
2368 #endif
2369  }
2370 
2371  // send long backbone info (long->short, long->long)
2372  for (i = 0; i < ihost_reduced_long_; ++i) {
2374  tbuf = tsendbuf_ + mt.displ_;
2375  nrnmpi_send_doubles(tbuf, mt.size_, mt.host_, mt.tag_);
2376 #if 0
2377 printf("%d post send %d displ=%d size=%d host=%d tag=%d\n",
2378 nrnmpi_myid, i, mt.displ_, mt.size_, mt.host_, mt.tag_);
2379 #endif
2380  }
2381 
2382 #if 0
2383  for (i=ihost_reduced_long_; i < nthost_; ++i) {
2385 printf("%d will send result to %d size=%d tag=%d\n", nrnmpi_myid, mt.host_, mt.size_, mt.tag_);
2386  }
2387 #endif
2388  // handle the reduced trees and short backbones
2389 
2390  // wait for receives from the long backbones needed by reduced tree
2391  // and short backbones to complete (reduced <- long, short <- long)
2392  for (i = ihost_reduced_long_; i < nthost_; ++i) {
2394 #if 0
2395 printf("%d wait one %d\n", nrnmpi_myid, mt.host_);
2396 #endif
2397  nrnmpi_wait(&mt.request_);
2398 #if 0
2399 printf("%d receive from %d nnode=%d nnode_rt=%d size=%d tag=%d displ=%d\n",
2400 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_, mt.displ_);
2401 //for (j=0; j < mt.size_; ++j) { printf("%d receive tbuf[%d]=%g\n", nrnmpi_myid, j, trecvbuf_[mt.displ_ + j]);}
2402 #endif
2403  }
2404 #if 0
2405 for (i=0; i < tbsize_; ++i) { printf("%d trecvbuf[%d] = %g\n", nrnmpi_myid, i, trecvbuf_[i]);}
2406 #endif
2407 
2408  // measure reducedtree,short backbone computation time
2409  rttime = nrnmpi_wtime();
2410 
2411 #endif // EXCHANGE_ON
2412 
2413  // remember V may be encoded in D
2414  for (i = 0; i < nrtree_; ++i) {
2415  rtree_[i]->nocap();
2416  }
2417 
2418 #if EXCHANGE_ON
2419  // replace in matrix
2420  // for zero area nodes, D is 1.0 and RHS is V from zero area node.
2421  // for non-zero area nodes, they are sum from zero area.
2422  for (i = ihost_short_long_; i < nthost_; ++i) {
2424  j = 0;
2425  tbuf = trecvbuf_ + mt.displ_;
2426  for (jj = 0; jj < mt.nnode_; ++jj) {
2427  k = mt.nodeindex_[jj];
2428  _nt = nrn_threads + mt.nodeindex_th_[jj];
2429  _nt->actual_d(k) = tbuf[j++];
2430  _nt->actual_rhs(k) = tbuf[j++];
2431  }
2432 #if 0
2433  for (j = 0; j < mt.nnode_; ++j) {
2434 printf("%d received from %d tbuf[%d] = %g tbuf[%d] = %g added to node %d\n",
2435 nrnmpi_myid, mt.host_, 2*j, tbuf[2*j], 2*j+1, tbuf[2*j+1], mt.nodeindex_[j]);
2436  }
2437 #endif
2438  }
2439  // measure reducedtree,short backbone computation time
2440  nrnmpi_rtcomp_time_ += nrnmpi_wtime() - rttime;
2441 
2442  // send reduced and short backbone info (reduced -> long, short -> long)
2443  for (i = ihost_reduced_long_; i < nthost_; ++i) {
2444  int tag;
2446  tbuf = tsendbuf_ + mt.displ_;
2447  // printf("%d send result to %d size=%d tag=%d\n", nrnmpi_myid, mt.host_, mt.size_,
2448  // mt.tag_);
2449  tag = mt.tag_;
2450  // sending result from reduced tree means tag is 4
2451  if (tag == 3) {
2452  tag = 4;
2453  }
2454  nrnmpi_send_doubles(tbuf, mt.size_, mt.host_, tag);
2455  // only moderately wasteful (50%) in sending back irrelevant
2456  // off diag info. Required is only 2*mt.nnode_. But these
2457  // messages are pretty short, anyway.
2458  }
2459 
2460  // handle the long backbones
2461 
2462  // wait for all remaining receives to complete
2463  // long <- short, long <- long, long <- reduced
2464  for (i = 0; i < ihost_reduced_long_; ++i) {
2466 #if 0
2467 printf("%d wait two %d\n", nrnmpi_myid, mt.host_);
2468 #endif
2469  nrnmpi_wait(&mt.request_);
2470 #if 0
2471 printf("%d receive from %d nnode=%d nnode_rt=%d size=%d tag=%d\n",
2472 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_);
2473 #endif
2474  }
2475 
2476  // replace in matrix
2477  // for zero area nodes, D is 1.0 and RHS is V from zero area node.
2478  // for non-zero area nodes, they are sum from zero area.
2479  for (i = 0; i < ihost_reduced_long_; ++i) {
2481  j = 0;
2482  tbuf = trecvbuf_ + mt.displ_;
2483  for (jj = 0; jj < mt.nnode_; ++jj) {
2484  k = mt.nodeindex_[jj];
2485  _nt = nrn_threads + mt.nodeindex_th_[jj];
2486  _nt->actual_d(k) = tbuf[j++];
2487  _nt->actual_rhs(k) = tbuf[j++];
2488  }
2489 #if 0
2490 if (nrnmpi_myid == 4) {
2491  for (j = 0; j < mt.nnode_; ++j) {
2492 printf("%d received from %d tbuf[%d] = %g tbuf[%d] = %g added to node %d\n",
2493 nrnmpi_myid, mt.host_, 2*j, tbuf[2*j], 2*j+1, tbuf[2*j+1], mt.nodeindex_[j]);
2494  }
2495 }
2496 #endif
2497  }
2498 #endif // EXCHANGE_ON
2499 
2500 #if NRNMPI
2502 #endif
2503  errno = 0;
2504 }
2505 
    // Body of the ReducedTree constructor (signature line lost in
    // extraction; per the trace below it takes the owning MultiSplitControl,
    // the matrix rank, and the map size -- confirm against upstream source).
    // printf("ReducedTree::ReducedTree(%d, %d)\n", rank, mapsize);
    int i;
    msc = ms;
    n = rank;
    assert(n > 0);
    assert(mapsize > 0);
    ip = new int[n];
    // one 4n block partitioned into the four coefficient vectors
    // rhs | d | a | b; only rhs is deleted (see the destructor)
    rhs = new double[4 * n];
    d = rhs + n;
    a = d + n;
    b = a + n;

    n2 = 2 * n;
    n4 = 4 * n;
    nmap = mapsize;
    smap = new double*[nmap];
    rmap = new double*[nmap];
    ismap = new int[nmap];
    irmap = new int[nmap];
    nzindex = new int[n];
    rmap2smap_index = new int[nmap];
    v = new double[n];
    nsmap = 0;
    irfill = 0;
    // null / -1 sentinels mark map slots not yet filled by fillsmap/fillrmap
    for (i = 0; i < nmap; ++i) {
        smap[i] = 0;
        ismap[i] = -1;
        rmap[i] = 0;
        irmap[i] = -1;
        rmap2smap_index[i] = -1;
    }
}
2539 
    // ReducedTree destructor body (signature line lost in extraction).
    delete[] ip;
    // d, a and b point into the rhs block (see the constructor), so freeing
    // rhs releases all four coefficient vectors
    delete[] rhs;
    delete[] smap;
    delete[] ismap;
    delete[] rmap;
    delete[] irmap;
    delete[] nzindex;
    delete[] v;
    delete[] rmap2smap_index;
}
2551 
    // ReducedTree::nocap body (signature line lost in extraction): combine
    // zero-area and non-zero-area contributions row by row, then write the
    // results back through the send map.
    // remember that info from zero area nodes is D=d and RHS=rhs
    // and from non-zero area is D=1e50 and RHS=v*1e50
    // on return, the info for zero area if no nonzero should be
    // D=sum d and RHS=sum rhs
    // the info for zero area if nonzero should be D=1.0 and RHS=v
    // and the info for nonzero should be D=sum d and RHS=sum rhs.
    int i;
    for (i = 0; i < n; ++i) {
        rhs[i] = 0.0;
        d[i] = 0.0;
        nzindex[i] = -1;  // -1: no non-zero-area node seen for this row yet
    }
    // not supposed to add when D=1e50. Note that order in rmap is
    // rhs followed by d
    for (i = 0; i < nmap; i += 2) {
        int j = irmap[i];  // reduced matrix row index for rhs
        if (*rmap[i + 1] == 1e50) {
            // 1e50 sentinel: entry is an encoded voltage from a
            // non-zero-area node (see the encoding on the send side)
            v[j] = *rmap[i] * 1e-50;
            nzindex[j] = rmap2smap_index[i];
        } else {
            rhs[j] += *rmap[i];
            d[j] += *rmap[i + 1];
        }
    }
#if 0
    for (i=0; i < n; ++i) {
    printf("%d ReducedTree %2d %12.5g %12.5g %d %12.5g\n",
    nrnmpi_myid, i, d[i], rhs[i], nzindex[i], v[i]);
    }
#endif
    for (i = 0; i < nsmap; i += 2) {
        int j = ismap[i];  // reduced matrix row index for rhs
        if (nzindex[j] == -1 || i == nzindex[j]) {
            // for the non-zero area nodes and zero area nodes
            // when no non-zero area node involved
            *smap[i] = rhs[j];
            *smap[i + 1] = d[j];
        } else {
            // for the zero area nodes
            *smap[i] = v[j];
            *smap[i + 1] = 1.0;
        }
    }
}
2597 
    // ReducedTree::solve body (signature line lost in extraction): gather
    // the mapped contributions, solve the rank-n tree-structured system by
    // elimination toward the root (ip[i] is row i's parent, ip[0] == -1),
    // then scatter the results back into the send map.
    int i;
    double p;
    gather();
#if 0
    for (i=0; i < n; ++i) {
    printf("%d ReducedTree %2d %12.5g %12.5g %2d %12.5g %12.5g\n",
    nrnmpi_myid, i, b[i], d[i], ip[i], ip[i] >= 0?a[i]:0., rhs[i]);
    }
#endif
    // triangularization
    for (i = n - 1; i > 0; --i) {
        p = a[i] / d[i];
        d[ip[i]] -= p * b[i];
        rhs[ip[i]] -= p * rhs[i];
    }
    // back substitution
    rhs[0] /= d[0];
    for (i = 1; i < n; ++i) {
        rhs[i] -= b[i] * rhs[ip[i]];
        rhs[i] /= d[i];
    }
#if 0
    for (i=0; i < n; ++i) {
    printf("%d Result %2d %12.5g\n", nrnmpi_myid, i, rhs[i]);
    }
#endif
    scatter();
}
2627 
    // ReducedTree::gather body (signature line lost in extraction): zero all
    // four coefficient vectors (rhs/d/a/b share one block of length n4, so a
    // single loop over rhs clears them all), then accumulate every mapped
    // source value into its reduced-matrix slot.
    int i;
    for (i = 0; i < n4; ++i) {
        rhs[i] = 0.0;
    }
    for (i = 0; i < nmap; ++i) {
        rhs[irmap[i]] += *rmap[i];
    }
#if 0
for (i=0; i < nmap; ++i){
printf("%d %p gather i=%d ie=%d %g\n", nrnmpi_myid, this, i, irmap[i], *rmap[i]);
}
#endif
}
2642 
    // ReducedTree::scatter body (signature line lost in extraction): write
    // the solved rhs back through the send map, scaled by 1e30 so that when
    // receivers add these entries to their rows the existing D and RHS are
    // swamped (see the "scaled by 1e30" note in matrix_exchange).
    int i;
    // scatter the result for this host
    // fill the send buffer for other hosts
#if 0
for (i=0; i < nsmap; i += 2) {
printf("%d enter scatter %d %p %g %p %g\n", nrnmpi_myid, i,
smap[i], *smap[i], smap[i+1], *smap[i+1]);
}
#endif
    // printf("nsmap=%d\n", nsmap);
    for (i = 0; i < nsmap; i += 2) {
        // printf("i=%d ismap[i]=%d\n", i, ismap[i]);
        *smap[i] = 1e30 * rhs[ismap[i]];
        *smap[i + 1] = 1e30;
    }
#if 0
for (i=0; i < nsmap; i += 2) if (i > 10){
printf("%d leave scatter %d %p %g %p %g\n", nrnmpi_myid, i,
smap[i], *smap[i], smap[i+1], *smap[i+1]);
}
#endif
}
2666 
// Debug helper: for every slot of the receive map (rmap), print which
// reduced-matrix row it accumulates into and which source array the mapped
// pointer lies in -- the transfer buffer, a thread's rhs or d storage, or a
// thread's sid1A/sid1B backbone columns -- identified by pointer-range tests.
// tsize/trbuf describe the transfer buffer so its pointers can be recognized.
void ReducedTree::pr_map(int tsize, double* trbuf) {
    int i;
    Printf(" rmap\n");
    for (i = 0; i < nmap; ++i) {
        for (int it = 0; it < nrn_nthread; ++it) {
            NrnThread* nt = nrn_threads + it;
            MultiSplitThread& t = msc_->mth_[it];
            int nb = t.backbone_end - t.backbone_begin;  // backbone node count
            // transfer-buffer entry
            if (rmap[i] >= trbuf && rmap[i] < (trbuf + tsize)) {
                Printf("  %2d rhs[%2d] += tbuf[%ld]\n", i, irmap[i], rmap[i] - trbuf);
            }
            // entry in this thread's rhs storage
            if (rmap[i] >= nt->node_rhs_storage() && rmap[i] < (nt->node_rhs_storage() + nt->end)) {
                Node* nd = nt->_v_node[rmap[i] - nt->node_rhs_storage()];
                Printf("  %2d rhs[%2d] rhs[%d] += rhs[%ld] \t%s{%d}\n",
                       i,
                       irmap[i],
                       irmap[i],
                       rmap[i] - nt->node_rhs_storage(),
                       secname(nd->sec),
                       nd->sec_node_index_);
            }
            // entry in this thread's d storage (rows n..2n-1 of the reduced system)
            if (rmap[i] >= nt->node_d_storage() && rmap[i] < (nt->node_d_storage() + nt->end)) {
                Printf("  %2d rhs[%2d] d[%d] += d[%ld]\n",
                       i,
                       irmap[i],
                       irmap[i] - n,
                       rmap[i] - nt->node_d_storage());
            }
            // entry in the sid1A off-diagonal column (rows 2n..3n-1)
            if (rmap[i] >= t.sid1A && rmap[i] < (t.sid1A + nb)) {
                Printf("  %2d rhs[%2d] a[%d] += sid1A[%ld]",
                       i,
                       irmap[i],
                       irmap[i] - 2 * n,
                       rmap[i] - t.sid1A);
                int j = (rmap[i] - t.sid1A) + t.backbone_begin;
                Node* nd = nt->_v_node[j];
                Printf(" \tA(%d) %s{%d}", j, secname(nd->sec), nd->sec_node_index_);
                Printf("\n");
            }
            // entry in the sid1B off-diagonal column (rows 3n..4n-1)
            if (rmap[i] >= t.sid1B && rmap[i] < (t.sid1B + nb)) {
                Printf("  %2d rhs[%2d] b[%d] += sid1B[%ld]",
                       i,
                       irmap[i],
                       irmap[i] - 3 * n,
                       rmap[i] - t.sid1B);
                int j = (rmap[i] - t.sid1B) + t.backbone_begin;
                Node* nd = nt->_v_node[j];
                Printf("\tB(%d) %s{%d}", j, secname(nd->sec), nd->sec_node_index_);
                Printf("\n");
            }
        }
    }
}
2720 
2721 void ReducedTree::reorder(int j, int nt, int* mark, int* allbbr, int* allsid) {
2722  // specify ip and modify s2rt so they have tree order consistency
2723  // there should be n-1 edges, so
2724  if (n == 1) {
2725  ip[0] = -1;
2726  return;
2727  }
2728  int* e1 = new int[n - 1];
2729  int* e2 = new int[n - 1];
2730  int* order = new int[n];
2731  int* sid = new int[n];
2732  int singlesid = -1;
2733  int i;
2734  for (i = 0; i < n; ++i) {
2735  order[i] = -1;
2736  }
2737  // fill in the edges.
2738  int ne = n - 1;
2739  int ie = 0;
2740  for (i = 0; i < nt; ++i) {
2741  if (mark[i] == j && allbbr[i] == 2) {
2742  singlesid = allsid[i];
2743  }
2744  if (mark[i] == j && allbbr[i] > 2 && allsid[i] < allbbr[i] - 3) {
2745  // printf("i=%d ie=%d ne=%d mark=%d allsid=%d allbbr=%d\n", i, ie, ne, mark[i],
2746  // allsid[i], allbbr[i]-3);
2747  assert(ie < ne);
2748  const auto& e1ieiter = s2rt->find(allsid[i]);
2749  nrn_assert(e1ieiter != s2rt->end());
2750  e1[ie] = e1ieiter->second;
2751  sid[e1[ie]] = allsid[i];
2752  const auto& e2ieiter = s2rt->find(allbbr[i] - 3);
2753  nrn_assert(e2ieiter != s2rt->end());
2754  e2[ie] = e2ieiter->second;
2755  sid[e2[ie]] = allbbr[i] - 3;
2756  ++ie;
2757  }
2758  }
2759  assert(ie == ne);
2760  if (ne == 0) {
2761  assert(singlesid >= 0);
2762  sid[0] = singlesid;
2763  }
2764  // order the elements starting at 0 // also fill in the ip vector
2765  // should replace by a O(n) or at worst a O(nlogn) algorithm
2766  ip[0] = -1;
2767  order[0] = 0;
2768  int ordered = 1;
2769  while (ordered < n) {
2770  int old = ordered;
2771  for (i = 0; i < ne; ++i) {
2772  if (e1[i] >= 0) {
2773  if (order[e1[i]] >= 0) {
2774  assert(order[e2[i]] == -1);
2775  ip[ordered] = order[e1[i]];
2776  order[e2[i]] = ordered++;
2777  e1[i] = -1; // edge handled
2778  e2[i] = -1;
2779  } else if (order[e2[i]] >= 0) {
2780  assert(order[e1[i]] == -1);
2781  ip[ordered] = order[e2[i]];
2782  order[e1[i]] = ordered++;
2783  e1[i] = -1; // edge handled
2784  e2[i] = -1;
2785  }
2786  }
2787  }
2788  assert(ordered > old); // if tree then accomplished something
2789  }
2790 
2791  // reset the s2rt ranks for the sids
2792  for (i = 0; i < n; ++i) {
2793  (*s2rt)[sid[i]] = order[i];
2794  }
2795 
2796 #if 0
2797  printf("%d ReducedTree n=%d nmap=%d\n", nrnmpi_myid, n, nmap);
2798  for (i=0; i < n; ++i) {
2799  printf("%d i=%d sid=%d sid2i=%d\n", nrnmpi_myid, i, sid[i], (*s2rt)[sid[i]]);
2800  }
2801  for (i=0; i < n; ++i) {
2802  printf("%d i=%d ip=%d\n", nrnmpi_myid, i, ip[i]);
2803  }
2804 #endif
2805 
2806  delete[] e1;
2807  delete[] e2;
2808  delete[] order;
2809  delete[] sid;
2810 }
2811 
// Record one receive-map entry: pd is the source location whose value
// gather() will accumulate into the reduced-matrix slot selected by the
// (sid1, sid2) pair.  Slot layout per row i is RHS (i), D (i+n), A (i+2n),
// B (i+3n); the parent relation in ip[] decides between A and B.
void ReducedTree::fillrmap(int sid1, int sid2, double* pd) {
    const auto& sid1_iter = s2rt->find(sid1);
    nrn_assert(sid1_iter != s2rt->end());
    const int i = sid1_iter->second;
    int j;

    // type order is RHS, D, A, B
    if (sid2 < 0) {  // RHS
        j = i;
    } else if (sid2 == sid1) {  // D
        j = i + n;
    } else {  // A or B?
        const auto& sid2_iter = s2rt->find(sid2);
        nrn_assert(sid2_iter != s2rt->end());
        j = sid2_iter->second;
        if (ip[i] == j) {  // A: sid2 is sid1's parent
            j = i + 2 * n;
        } else if (ip[j] == i) {  // B: sid1 is sid2's parent
            j = j + 3 * n;
        } else {
            assert(0);
        }
    }
    // in most cases we could have done RHS and D together since they
    // are adjacent in the receive buffer. However they are not adjacent
    // for the sids on this machine.
    // printf("%d fillrmap sid1=%d sid2=%d i=%d j=%d irfill=%d\n", nrnmpi_myid, sid1, sid2, i, j,
    // irfill);
    irmap[irfill] = j;
    rmap[irfill] = pd;
    // NOTE(review): the extracted listing appears to drop a line here
    // (presumably the rmap2smap_index[irfill] assignment, which nocap()
    // reads) -- confirm against the upstream source.
    irfill += 1;
}
2845 
2846 void ReducedTree::fillsmap(int sid, double* prhs, double* pd) {
2847  const auto& sid_iter = s2rt->find(sid);
2848  nrn_assert(sid_iter != s2rt->end());
2849  const int i = sid_iter->second;
2850 
2851  // printf("%d fillsmap sid=%d i=%d nsmap=%d\n", nrnmpi_myid, sid, i, nsmap);
2852  ismap[nsmap] = i;
2853  smap[nsmap] = prhs;
2854  ismap[nsmap + 1] = i;
2855  smap[nsmap + 1] = pd;
2856  nsmap += 2;
2857 }
2858 
2859 // triangularize all subtrees, results in a backbone tri-diagonal
    // MultiSplitThread::triang_subtree2backbone body (signature line lost in
    // extraction): Gaussian elimination of the subtree rows, sweeping from
    // the last node (i3 - 1) down to backbone_end and folding each row into
    // its parent, so that only a tridiagonal backbone remains.
    int i, ip;
    double p;
    auto* const vec_a = _nt->node_a_storage();
    auto* const vec_b = _nt->node_b_storage();
    auto* const vec_d = _nt->node_d_storage();
    auto* const vec_rhs = _nt->node_rhs_storage();
    // eliminate a of the subtrees
    for (i = i3 - 1; i >= backbone_end; --i) {
        ip = _nt->_v_parent_index[i];
        p = vec_a[i] / vec_d[i];
        vec_d[ip] -= p * vec_b[i];
        RHS(ip) -= p * RHS(i);
    }
#if 0
printf("end of triang_subtree2backbone\n");
for (i=i1; i < backbone_end; ++i) {
 printf("i=%d D=%g RHS=%g\n", i, D(i), RHS(i));
}
#endif
}
2881 
2882 // backbone starts as tridiagonal, ends up in form of N.
2884  int i, j, ip, jp;
2885  double p;
2886  // begin the backbone triangularization. This eliminates a and fills in
2887  // sid1A column. Begin with pivot equation adjacent to sid1.
2888  auto* const vec_a = _nt->node_a_storage();
2889  for (i = backbone_sid1_begin; i < backbone_end; ++i) {
2890  // what is the equation index for A(i)
2891  j = _nt->_v_parent_index[i] - backbone_begin;
2892  S1A(j) = vec_a[i];
2893  }
2894  auto* const vec_b = _nt->node_b_storage();
2895  auto* const vec_d = _nt->node_d_storage();
2896  auto* const vec_rhs = _nt->node_rhs_storage();
2897  for (i = backbone_sid1_begin - 1; i >= backbone_interior_begin; --i) {
2898  ip = _nt->_v_parent_index[i];
2899  j = i - backbone_begin;
2900  jp = ip - backbone_begin;
2901  p = vec_a[i] / D(i);
2902  D(ip) -= p * vec_b[i];
2903  RHS(ip) -= p * RHS(i);
2904  S1A(jp) = -p * S1A(j);
2905  // printf("iter i=%d ip=%d j=%d jp=%d D(ip)=%g RHS(ip)=%g S1A(ip)=%g\n",
2906  // i, ip, j, jp, D(ip), RHS(ip), S1A(jp));
2907  }
2908 
2909  // complete the backbone triangularization to form the N shaped matrix
2910  // This eliminates b and fills in the sid1B column. Begin with
2911  // pivot equation adjacent to sid0 (the root)
2913  ip = _nt->_v_parent_index[i];
2914  j = i - backbone_begin;
2915  if (ip < backbone_interior_begin) {
2916  S1B(j) = vec_b[i];
2917  continue;
2918  }
2919  jp = ip - backbone_begin;
2920  p = vec_b[i] / D(ip);
2921  RHS(i) -= p * RHS(ip);
2922  S1A(j) -= p * S1A(jp);
2923  S1B(j) = -p * S1B(jp);
2924  // printf("iter2 i=%d j=%d D(i)=%g RHS(i)=%g S1A(j)=%g S1B(j)=%g\n",
2925  // i, j, D(i), RHS(i), S1A(j), S1B(j));
2926  }
2927  // exactly like above but S1A happens to be D
2928  for (i = backbone_sid1_begin; i < backbone_end; ++i) {
2929  ip = _nt->_v_parent_index[i];
2930  j = i - backbone_begin;
2931  if (ip < backbone_interior_begin) {
2932  S1B(j) = vec_b[i];
2933  continue;
2934  }
2935  jp = ip - backbone_begin;
2936  p = vec_b[i] / D(ip);
2937  RHS(i) -= p * RHS(ip);
2938  D(i) -= p * S1A(jp);
2939  S1B(j) = -p * S1B(jp);
2940  // printf("iter3 i=%d j=%d D(i)=%g RHS(i)=%g S1B(j)=%g\n",
2941  // i, j, D(i), RHS(i), S1B(j));
2942  }
2943 #if 0
2944 printf("%d end of triang_backbone\n", nrnmpi_myid);
2945 for (i=i1; i < backbone_end; ++i) {
2946  printf("%d i=%d D=%g RHS=%g\n", nrnmpi_myid, i, D(i), RHS(i));
2947 }
2948 #endif
2949 }
2950 
2951 // exchange of d and rhs of sids has taken place and we can solve for the
2952 // backbone nodes
2954  auto* const vec_d = _nt->node_d_storage();
2955  auto* const vec_rhs = _nt->node_rhs_storage();
2956  int i, j;
2957  double a, b, p, vsid1;
2958  // need to solve the 2x2 consisting of sid0 and sid1 points
2959  // this part has already been done for short backbones
2961  // printf("%d, backbone %d %d %d %d %d %d\n", nrnmpi_myid,
2962  // backbone_begin, backbone_long_begin, backbone_interior_begin,
2963  // backbone_sid1_begin, backbone_long_sid1_begin, backbone_end);
2965  a = S1A(i - backbone_begin);
2966  b = S1B(j - backbone_begin);
2967  p = b / D(i);
2968  D(j) -= p * a;
2969  RHS(j) -= p * RHS(i);
2970  RHS(j) /= D(j);
2971  RHS(i) -= a * RHS(j);
2972  RHS(i) /= D(i);
2973  ++j;
2974  }
2975  // now sid0 and sid1 values give us the column values
2976  // do in two sweeps. Wish we could keep a single cell backbone
2977  // contiguous.
2978  // Do the S1A sweep on a per cell basis.
2979  for (i = backbone_sid1_begin; i < backbone_end; ++i) {
2980  vsid1 = RHS(i);
2981  for (j = _nt->_v_parent_index[i]; j >= backbone_interior_begin;
2982  j = _nt->_v_parent_index[j]) {
2983  RHS(j) -= S1A(j - backbone_begin) * vsid1;
2984  }
2985  }
2986  // For the S1B sweep we use sid0 root info for each interior node
2988  j = i - backbone_begin;
2989  RHS(i) -= S1B(j) * RHS(sid0i[j]);
2990  RHS(i) /= D(i);
2991  }
2992 #if 0
2993 printf("%d end of bksub_backbone\n", nrnmpi_myid);
2994 for (i=i1; i < backbone_end; ++i) {
2995  printf("%d i=%d RHS=%g\n", nrnmpi_myid, i, RHS(i));
2996 }
2997 #endif
2998 }
2999 
3001  auto* const vec_d = _nt->node_d_storage();
3002  auto* const vec_rhs = _nt->node_rhs_storage();
3003  int i, j;
3004  double a, b, p;
3005  // solve the 2x2 consisting of sid0 and sid1 points.
3007  for (i = backbone_begin; i < backbone_long_begin; ++i) {
3008  a = S1A(i - backbone_begin);
3009  b = S1B(j - backbone_begin);
3010 #if 0
3011 printf("%d part1 i=%d j=%d\n",
3012 nrnmpi_myid, i, j);
3013 printf("%d part1 d=%12.5g a=%12.5g rhs=%12.5g\n",
3014 nrnmpi_myid,D(i), a, RHS(i));
3015 printf("%d part1 b=%12.5g d=%12.5g rhs=%12.5g\n",
3016 nrnmpi_myid, b, D(j), RHS(j));
3017 #endif
3018  p = b / D(i);
3019  D(j) -= p * a;
3020  RHS(j) -= p * RHS(i);
3021  RHS(j) /= D(j);
3022  RHS(i) -= a * RHS(j);
3023  RHS(i) /= D(i);
3024 #if 0
3025 printf("%d part1 result %12.5g %12.5g\n",
3026 nrnmpi_myid, RHS(i), RHS(j));
3027 #endif
3028  ++j;
3029  }
3030  // Note that it no longer makes sense to add D and RHS
3031  // to the other machines since RHS is the solution itself
3032  // at the other machine corresponding points. In fact
3033  // we don't need to send D at all. Just replace the
3034  // RHS, skip that equation, and go on. But for
3035  // code uniformity and since skipping is more complex
3036  // it seems simpler to just setup the right equation
3037  // on the other machine. i.e. 1 * v = rhs
3038 }
3039 
3040 // solve the subtrees, rhs on the backbone is already solved
3042  auto* const vec_b = _nt->node_b_storage();
3043  auto* const vec_d = _nt->node_d_storage();
3044  auto* const vec_rhs = _nt->node_rhs_storage();
3045  int i, ip;
3046  // solve all rootnodes not part of a backbone
3047  for (i = i1; i < backbone_begin; ++i) {
3048  RHS(i) /= D(i);
3049  }
3050  // solve the subtrees
3051  for (i = backbone_end; i < i3; ++i) {
3052  ip = _nt->_v_parent_index[i];
3053  RHS(i) -= vec_b[i] * RHS(ip);
3054  RHS(i) /= D(i);
3055  }
3056 #if 0
3057 printf("%d end of bksub_subtrees (just the backbone)\n", nrnmpi_myid);
3058 for (i=i1; i < backbone_end; ++i) {
3059  printf("%d i=%d RHS=%g\n", nrnmpi_myid, i, RHS(i));
3060 }
3061 #endif
3062 }
3063 
3064 // fill the v_node, v_parent node vectors in the proper order and
3065 // determine backbone index values. See the NOTE above. The relevant order
3066 // is:
3067 // 1) all the roots of cells with 0 or 1 sid (no backbone involved)
3068 // 2) all the sid0 when there are two sids in a cell. 1) + 2) = rootnodecount
3069 // 3) the interior backbone nodes (does not include sid0 or sid1 nodes)
3070 // 4) all the sid1 nodes
3071 // 5) all remaining nodes
3072 // backbone_begin is the index for 2).
3073 // backbone_long_begin ; earlier are short backbones, later are long
3074 // backbone_interior_begin is one more than the last index for 2)
3075 // backbone_sid1_begin is the index for 4)
// backbone_long_sid1_begin ; earlier are short, later are long
3077 // backbone_end is the index for 5.
3078 // We allow the single or pair sid nodes to be different from a rootnode.
3079 // Note: generally speaking, the first sid will probably always be
3080 // a classical root node. However, we do want to handle the case where
3081 // many branches connect to soma(0.5), so do the whole tree ordering
3082 // as the general case.
3083 
3085  msc_->v_setup();
3086 }
3087 
3090  return;
3091  }
3092  // the typical case is that this comes from the beginning of exchange_setup
3093  // through recalc_diam since exhange_setup needs the proper
3094  // thread nt->_v_node and ms->ithread. Hence anything that
3095  // changes the overall structure
3096  // requires a complete start over from the point prior to splitting.
3097  assert(!use_sparse13);
3098  int i;
3099  // first time through, nth_ = 0
3100  if (nth_ == 0) {
3101  // printf("MultiSplitControl::v_setup due to multisplit()\n");
3102  assert(mth_ == 0);
3103  nth_ = nrn_nthread;
3104  mth_ = new MultiSplitThread[nth_];
3105  for (i = 0; i < nrn_nthread; ++i) {
3106  mth_[i].v_setup(nrn_threads + i);
3107  }
3108  } else {
3109  if (nth_ != nrn_nthread) {
3110  hoc_execerror(
3111  "ParallelContext.nthread() was changed after ParallelContext.multisplit()", 0);
3112  }
3113  // pointers have changed and need to reorder and update reduced tree maps
3114  // printf("MultiSplitControl::v_setup due to pointer change\n");
3115  for (i = 0; i < nrn_nthread; ++i) {
3116  mth_[i].v_setup(nrn_threads + i);
3117  }
3118  }
3119 }
3120 
// Reorder this thread's v_node/v_parent vectors into multisplit tree order
// and compute the backbone index boundaries (see the ordering NOTE above).
// NOTE(review): the defining line `void MultiSplitThread::v_setup(NrnThread* nt) {`
// was lost in the doxygen extraction; the body below begins at its first
// statement. Several other interior lines were also lost — each gap is
// flagged with a NOTE(review) comment; restore them from upstream NEURON.
    // Precondition:
    // v_node and v_parent node vectors are in their classical tree order
    // and Node.v_node_index also is consistent with that.
    // calls to pc.multisplit() define the implicit connections
    // between splitcell
    // Postcondition: v_node and v_parent node vectors are in their multisplit
    // tree order. That info can be subsequently used to re-allocate
    // the actual v
    i1 = 0;
    i2 = i1 + nt->ncell;  // classical root nodes occupy [i1, i2)
    i3 = nt->end;
    int nnode = i3 - i1;

    // Local aliases for the controller's tables.
    MultiSplitTable* classical_root_to_multisplit_ = msc_->classical_root_to_multisplit_.get();
    MultiSplitList* multisplit_list_ = msc_->multisplit_list_;

    // node vs v_node relation is always with an i1 offset
    Node** node = new Node*[nnode];
    Node** parent = new Node*[nnode];
    Node* nd;
    int i, j, i0, ii, in, ip, nback, ib, ibl, ibs;
    int is0, is1, k0, k1, iss0, iss1, isl0, isl1;

#if 0
printf("multisplit_v_setup %d\n", nnode);
    for (i=i1; i < i3; ++i) {
        assert(nt->_v_node[i]->v_node_index == i);
        assert(nt->_v_parent[i] == 0 || nt->_v_parent[i]->v_node_index < i);
    }
#endif
#if 0
printf("multisplit_v_setup %d\n", nnode);
printf("\nclassical order\n");
for (i=i1; i < i3; ++i) {
    printf("%d %d %d %s %d\n", i, nt->_v_node[i]->v_node_index,
    nt->_v_parent[i] ? nt->_v_parent[i]->v_node_index : -1, secname(nt->v_node[i]->sec),
    nt->_v_node[i]->sec_node_index_);
}
#endif
    // Release previous sid1A/sid1B/sid0i allocations (presumably also resets
    // nbackrt_, which is only incremented below — TODO confirm in del_sidA).
    del_sidA();

    // the first phase is to transform from the classical tree order
    // to a tree order where the sid0 are roots. In splitcell, this
    // was done at the hoc level (rerooting a tree involves reversing
    // the parent child relationship on the line between the old and
    // new roots). However at the hoc level it is only possible to
    // have split points at section boundaries and
    // that meant that soma(0.5) could not
    // be a root. We want to allow that. It is also much simpler
    // to deal with the backbone between sid0 and sid1
    // if sid0 is already a root.

    // how many sid0 backbone (long and short) roots are there
    backbone_begin = i2;
    // NOTE(review): one line lost in extraction here — backbone_long_begin is
    // read below before any visible assignment (likely
    // "backbone_long_begin = backbone_begin;"); confirm against upstream.
    if (classical_root_to_multisplit_) {
        for (MultiSplit* ms: *multisplit_list_) {
            if (ms->nd[1]) {
                // two-sid cell owned by another thread: not ours to count
                if (ms->nd[1]->_nt != nt) {
                    continue;
                }
                --backbone_begin;
                if (ms->backbone_style != 1) {
                    // NOTE(review): one line lost in extraction inside this
                    // branch; confirm against upstream.
                }
                if (ms->backbone_style == 2) {
                    ++nbackrt_;
                }
            }
        }
    }
    // NOTE(review): one line lost in extraction here — it sets
    // backbone_interior_begin ("one more than the last index for 2)" per the
    // ordering comment, i.e. i2); confirm against upstream.
    if (nbackrt_) {
        backsid_ = new int[nbackrt_];
        backAindex_ = new int[nbackrt_];
        backBindex_ = new int[nbackrt_];
    }

    // reorder the trees so sid0 is the root, at least with respect to v_parent
    for (i = i1; i < i2; ++i) {
        Node* oldroot = nt->_v_node[i];
        if (classical_root_to_multisplit_) {
            const auto& msiter = classical_root_to_multisplit_->find(oldroot);
            if (msiter != classical_root_to_multisplit_->end()) {
                nd = msiter->second->nd[0];
                if (nd == oldroot) {  // the cell tree is fine
                    // the way it is (the usual case)
                    nt->_v_parent[nd->v_node_index] = 0;
                } else {
                    // need to reverse the parent/child relationship
                    // on the path from nd to oldroot
                    in = nd->v_node_index;
                    nd = 0;
                    while (in > i) {
                        ip = nt->_v_parent[in]->v_node_index;
                        nt->_v_parent[in] = nd;
                        nd = nt->_v_node[in];
                        in = ip;
                    }
                    nt->_v_parent[in] = nd;
                }
            }
        }
    }

    // Now the only problem is that assert(v_parent[i] < v_node[i])
    // is false. But we can take care of that at the end.
    // the sid0 are now root nodes with respect to v_parent

#if 0
printf("\nsid0 is a root\n");
for (i=i1; i < i3; ++i) {
    printf("%d %d %d %s %d\n", i, nt->_v_node[i]->v_node_index,
    nt->_v_parent[i] ? nt->_v_parent[i]->v_node_index : -1,
    secname(nt->_v_node[i]->sec), nt->_v_node[i]->sec_node_index_);
}
#endif
    // Index all the classical rootnodes and sid rootnodes
    // If the classical rootnode has only 1 sid, it
    // replaces it as a rootnode. If there are 2 the first sid node
    // gets shifted to the backbone_begin indices in the proper
    // short or long section.
    backbone_end = i2;
    ii = i1;                     // next plain (non-backbone) root slot
    ibl = backbone_long_begin;   // next long-backbone sid0 slot
    ibs = backbone_begin;        // next short-backbone sid0 slot
    for (i = i1; i < i2; ++i) {
        nd = nt->_v_node[i];
        if (classical_root_to_multisplit_ &&
            classical_root_to_multisplit_->find(nd) != classical_root_to_multisplit_->end()) {
            MultiSplit* ms = classical_root_to_multisplit_->operator[](nd);
            if (ms->nd[1]) {
                ib = ms->backbone_style >= 1 ? ibs : ibl;
                node[ib - i1] = ms->nd[0];
                parent[ib - i1] = 0;
                if (ms->backbone_style >= 1) {
                    ++ibs;
                } else {
                    ++ibl;
                }
                // here we can get a bit greedy and
                // start counting how many indices are
                // needed for the backbones
                // how many nodes between nd[0] and nd[1]
                // note that nd[0] IS a root.
                i0 = ms->nd[0]->v_node_index;
                int iii = ms->nd[1]->v_node_index;
                while (i0 != iii) {
                    iii = nt->_v_parent[iii]->v_node_index;
                    ++backbone_end;
                }
            } else {
                // single-sid cell: the sid node simply becomes the root
                node[ii - i1] = ms->nd[0];
                parent[ii - i1] = 0;
                ++ii;
            }
        } else {
            // no sid on this cell: classical root stays a root
            node[ii - i1] = nd;
            parent[ii - i1] = 0;
            ++ii;
        }
    }
    // NOTE(review): two lines lost in extraction here — they assign
    // backbone_sid1_begin and backbone_long_sid1_begin (both are read below
    // but never otherwise assigned in this function); restore from upstream.
// printf("backbone begin=%d interior=%d sid1=%d end=%d\n",
// backbone_begin, backbone_interior_begin, backbone_sid1_begin, backbone_end);
#if 0
    for (i=i1; i < backbone_interior_begin; ++i) {
        assert(parent[i-i1] == 0);
    }
#endif
    nback = backbone_end - backbone_begin;
    if (nback) {
        // allocate the N-shape off-diagonal columns and sid0 lookup
        sid0i = new int[nback];
        sid1A = new double[nback];
        sid1B = new double[nback];
        for (i = 0; i < nback; ++i) {
            sid1A[i] = sid1B[i] = 0.;
            sid0i[i] = -1;
        }
    }
    iss0 = backbone_begin;            // next short sid0 equation index
    iss1 = backbone_sid1_begin;       // next short sid1 equation index
    isl0 = backbone_long_begin;       // next long sid0 equation index
    isl1 = backbone_long_sid1_begin;  // next long sid1 equation index
    ib = backbone_sid1_begin;         // interior nodes filled downward from here
    // order backbone_sid1_begin to backbone_begin (same order as
    // backbone_begin to backbone_interior)
    // again, short are before long
    // the backbone interior can be any order
    int ibrt = 0;
    for (i = i1; i < i2; ++i) {
        nd = nt->_v_node[i];
        if (classical_root_to_multisplit_) {
            MultiSplit* ms;
            const auto& msiter = classical_root_to_multisplit_->find(nd);
            if (msiter != classical_root_to_multisplit_->end()) {
                ms = msiter->second;
                ms->ithread = nt->id;
                if (ms->nd[1]) {
                    if (ms->backbone_style >= 1) {
                        is0 = iss0++;
                        is1 = iss1++;
                    } else {
                        is0 = isl0++;
                        is1 = isl1++;
                    }
                    i0 = ms->nd[0]->v_node_index;
                    ii = ms->nd[1]->v_node_index;
                    node[is1 - i1] = ms->nd[1];
                    parent[is1 - i1] = nt->_v_parent[ii];
                    // walk sid1 -> sid0, placing interior nodes downward
                    ii = nt->_v_parent[ii]->v_node_index;
                    while (i0 != ii) {
                        --ib;
                        node[ib - i1] = nt->_v_node[ii];
                        parent[ib - i1] = nt->_v_parent[ii];
                        // printf("sid0i[%d] = %d\n", ib-backbone_begin, is0);
                        sid0i[ib - backbone_begin] = is0;
                        ii = nt->_v_parent[ii]->v_node_index;
                    }
                    if (ms->backbone_style == 2) {
                        // reduced-tree backbone: record sid and A/B offsets
                        backsid_[ibrt] = ms->sid[0];
                        ms->back_index = ibrt;
                        backAindex_[ibrt] = is0 - backbone_begin;
                        backBindex_[ibrt] = is1 - backbone_begin;
                        // printf("backAindex[%d] = %d sid=%d Bindex=%d\n",
                        // ibrt, backAindex_[ibrt], backsid_[ibrt], backBindex_[ibrt]);
                        ++ibrt;
                    }
                }
            }
        }
    }
    // printf("backbone order complete\n");

    // most of the rest are already in tree order and have the correct
    // parents. They just need to be copied into node and parent
    // so that the parent index is always < the node index
    // However, sid0 becoming root can cause many places where
    // parent_index > node_index and these need to be relocated
    for (i = i1; i < i3; ++i) {
        // can exploit this variable because it will be set later
        nt->_v_node[i]->eqn_index_ = -1;
    }
    for (i = i1; i < backbone_end; ++i) {
#if 0
printf("backbone i=%d %d %s %d", i, node[i]->v_node_index, secname(node[i]->sec), node[i]->sec_node_index_);
printf(" -> %s %d\n", parent[i]?secname(parent[i]->sec):"root",
parent[i]?parent[i]->sec_node_index_:-1);
#endif
        node[i - i1]->eqn_index_ = i;
    }
    // the rest are in order
    j = backbone_end;
    for (i = i1; i < i3; ++i) {
        // count the unplaced nodes on the path from i up to a placed node
        k0 = 0;
        k1 = i;
        // printf("i=%d\n", i);
        while (nt->_v_node[k1]->eqn_index_ < 0) {
            // printf("counting i=%d k1=%d k0=%d\n", i, k1, k0);
            ++k0;
            if (!nt->_v_parent[k1]) {
                break;
            }
            k1 = nt->_v_parent[k1]->v_node_index;
        }
        // printf("i=%d k0=%d\n", i, k0);
        // place that path so parents end up before children
        k1 = i;
        j += k0;
        k0 = j - 1;
        while (nt->_v_node[k1]->eqn_index_ < 0) {
            // printf("ordering i=%d k1=%d k0=%d parent=%p\n", i, k1, k0, nt->_v_parent[k1]);
            node[k0 - i1] = nt->_v_node[k1];
            parent[k0 - i1] = nt->_v_parent[k1];
            node[k0 - i1]->eqn_index_ = k0;
            --k0;
            if (!nt->_v_parent[k1]) {
                break;
            }
            k1 = nt->_v_parent[k1]->v_node_index;
        }
    }
    // commit the new ordering back into the thread's vectors
    for (i = i1; i < i3; ++i) {
        assert(i == node[i - i1]->eqn_index_);
        nt->_v_node[i] = node[i - i1];
        nt->_v_parent[i] = parent[i - i1];
        nt->_v_node[i]->v_node_index = i;
    }
    delete[] node;
    delete[] parent;

#if 0
printf("\nmultisplit reordering\n");
printf("backbone begin=%d long=%d interior=%d sid1=%d long=%d end=%d\n",
// NOTE(review): the two argument lines of this disabled printf were lost in
// extraction (harmless inside #if 0).
for (i=i1; i < i3; ++i) {
    printf("%d %d %s %d ", nt->_v_node[i]->v_node_index,
    nt->_v_parent[i] ? nt->_v_parent[i]->v_node_index : -1,
    secname(nt->_v_node[i]->sec), nt->_v_node[i]->sec_node_index_);
    if (nt->_v_parent[i]) {
        printf(" -> %s %d\n", secname(nt->_v_parent[i]->sec), nt->_v_parent[i]->sec_node_index_);
    }else{
        printf(" root\n");
    }
}
#endif
    // sanity check: every parent must now precede its child
    for (i = i1; i < i3; ++i)
        if (nt->_v_parent[i] && nt->_v_parent[i]->v_node_index >= i) {
            printf("i=%d parent=%d\n", i, nt->_v_parent[i]->v_node_index);
        }
    for (i = i1; i < i3; ++i) {
        assert(nt->_v_node[i]->v_node_index == i);
        assert(nt->_v_parent[i] == 0 || nt->_v_parent[i]->v_node_index < i);
    }

    // freeing sid1A... invalidated any offdiag_ pointers.
    for (i = 0; i < msc_->nthost_; ++i) {
        MultiSplitTransferInfo& msti = msc_->msti_[i];
        for (j = 0; j < msti.nnode_rt_; j += 2) {
            msti.offdiag_[j] = sid1A + msti.ioffdiag_[j];
            msti.offdiag_[j + 1] = sid1B + msti.ioffdiag_[j + 1];
        }
    }
    // sid1A, sid1B pointers in reducedtree map will be updated by
    // rt_map_update along with d and rhs
}
3449 
3450 void MultiSplitControl::pmat(bool full) {
3451  int it, i, is;
3452  Printf("\n");
3453  for (it = 0; it < nrn_nthread; ++it) {
3454  NrnThread* _nt = nrn_threads + it;
3455  MultiSplitThread& t = mth_[it];
3456  for (i = 0; i < _nt->end; ++i) {
3457  is = _nt->_v_node[i]->_classical_parent ? _nt->_v_node[i]->sec_node_index_ : -1;
3458  Printf("%d %d %s %d",
3459  _nt->_v_node[i]->v_node_index,
3460  _nt->_v_parent[i] ? _nt->_v_parent[i]->v_node_index : -1,
3461  secname(_nt->_v_node[i]->sec),
3462  is);
3463  if (_nt->_v_parent[i]) {
3464  is = _nt->_v_parent[i]->_classical_parent ? _nt->_v_parent[i]->sec_node_index_ : -1;
3465  Printf(" -> %s %d", secname(_nt->_v_parent[i]->sec), is);
3466  Printf("\t %10.5g %10.5g", NODEB(_nt->_v_node[i]), NODEA(_nt->_v_node[i]));
3467  } else {
3468  Printf(" root\t\t %10.5g %10.5g", 0., 0.);
3469  }
3470 
3471  if (full) {
3472  Printf(" %10.5g %10.5g", NODED(_nt->_v_node[i]), NODERHS(_nt->_v_node[i]));
3473  if (t.sid0i && i >= t.backbone_begin && i < t.backbone_end) {
3474  Printf(" %10.5g %10.5g",
3475  t.S1B(i - t.backbone_begin),
3476  t.S1A(i - t.backbone_begin));
3477  }
3478  }
3479  Printf("\n");
3480  }
3481  }
3482 }
3483 
3484 void MultiSplitControl::pmatf(bool full) {
3485  int it, i, is;
3486  FILE* f;
3487  char fname[100];
3488 
3489  Sprintf(fname, "pmat.%04d", nrnmpi_myid);
3490  f = fopen(fname, "w");
3491  for (it = 0; it < nrn_nthread; ++it) {
3492  NrnThread* _nt = nrn_threads + it;
3493  MultiSplitThread& t = mth_[it];
3494  fprintf(f, "%d %d\n", it, _nt->end);
3495  for (i = 0; i < _nt->end; ++i) {
3496  is = _nt->_v_node[i]->_classical_parent ? _nt->_v_node[i]->sec_node_index_ : -1;
3497  fprintf(f,
3498  "%d %d %s %d",
3499  _nt->_v_node[i]->v_node_index,
3500  _nt->_v_parent[i] ? _nt->_v_parent[i]->v_node_index : -1,
3501  secname(_nt->_v_node[i]->sec),
3502  is);
3503  if (_nt->_v_parent[i]) {
3504  is = _nt->_v_parent[i]->_classical_parent ? _nt->_v_parent[i]->sec_node_index_ : -1;
3505  fprintf(f, " -> %s %d", secname(_nt->_v_parent[i]->sec), is);
3506  fprintf(f, "\t %10.5g %10.5g", NODEB(_nt->_v_node[i]), NODEA(_nt->_v_node[i]));
3507  } else {
3508  fprintf(f, " root\t\t %10.5g %10.5g", 0., 0.);
3509  }
3510 
3511  if (full) {
3512  fprintf(f, " %10.5g %10.5g", NODED(_nt->_v_node[i]), NODERHS(_nt->_v_node[i]));
3513  if (t.sid0i && i >= t.backbone_begin && i < t.backbone_end) {
3514  fprintf(f,
3515  " %10.5g %10.5g",
3516  t.S1B(i - t.backbone_begin),
3517  t.S1A(i - t.backbone_begin));
3518  }
3519  }
3520  fprintf(f, "\n");
3521  }
3522  }
3523  fclose(f);
3524 }
3525 
3526 void MultiSplitControl::pmat1(const char* s) {
3527  int it;
3528  double a, b, d, rhs;
3529  for (it = 0; it < nrn_nthread; ++it) {
3530  NrnThread* _nt = nrn_threads + it;
3531  auto* const vec_d = _nt->node_d_storage();
3532  auto* const vec_rhs = _nt->node_rhs_storage();
3533  MultiSplitThread& t = mth_[it];
3534  int i1 = 0;
3535  int i3 = _nt->end;
3536  for (MultiSplit* ms: *multisplit_list_) {
3537  int i = ms->nd[0]->v_node_index;
3538  if (i < i1 || i >= i3) {
3539  continue;
3540  }
3541  d = D(i);
3542  rhs = RHS(i);
3543  a = b = 0.;
3544  if (ms->nd[1]) {
3545  a = mth_[it].S1A(0);
3546  }
3547  Printf("%2d %s sid=%d %12.5g %12.5g %12.5g %12.5g\n",
3548  nrnmpi_myid,
3549  s,
3550  ms->sid[0],
3551  b,
3552  d,
3553  a,
3554  rhs);
3555  if (ms->nd[1]) {
3556  d = D(ms->nd[1]->v_node_index);
3557  rhs = RHS(ms->nd[1]->v_node_index);
3558  a = 0.;
3559  b = t.S1B(t.backbone_sid1_begin - t.backbone_begin);
3560  Printf("%2d %s sid=%d %12.5g %12.5g %12.5g %12.5g\n",
3561  nrnmpi_myid,
3562  s,
3563  ms->sid[1],
3564  b,
3565  d,
3566  a,
3567  rhs);
3568  }
3569  }
3570  }
3571 }
3572 
3573 double* nrn_classicalNodeA(Node* nd) {
3574  int i = nd->v_node_index;
3575  Node* pnd = nd->_classical_parent;
3576  NrnThread* _nt = nd->_nt;
3577  if (_nt->_v_parent[i] == pnd) {
3578  return &NODEA(nd);
3579  } else if (pnd == 0) {
3580  return 0;
3581  } else if (_nt->_v_parent[pnd->v_node_index] == nd) {
3582  return &NODEB(pnd);
3583  } else {
3584  assert(0);
3585  }
3586  return 0;
3587 }
3588 
3589 double* nrn_classicalNodeB(Node* nd) {
3590  int i = nd->v_node_index;
3591  Node* pnd = nd->_classical_parent;
3592  NrnThread* _nt = nd->_nt;
3593  if (_nt->_v_parent[i] == pnd) {
3594  return &NODEB(nd);
3595  } else if (pnd == 0) {
3596  return 0;
3597  } else if (_nt->_v_parent[pnd->v_node_index] == nd) {
3598  return &NODEA(pnd);
3599  } else {
3600  assert(0);
3601  }
3602  return 0;
3603 }
const char * secname(Section *sec)
name of section (for use in error messages)
Definition: cabcode.cpp:1674
void setup_topology(void)
Definition: cabcode.cpp:1635
int tree_changed
Definition: cabcode.cpp:51
Node * node_exact(Section *sec, double x)
like node_index but give proper node when x is 0 or 1 as well as in between
Definition: cabcode.cpp:1800
static Node * pnd
Definition: clamp.cpp:33
MultiSplitList * multisplit_list_
MultiSplitThread * mth_
void multisplit(Section *, double, int, int)
Definition: multisplit.cpp:354
void multisplit_nocap_v_part1(NrnThread *)
void multisplit_adjust_rhs(NrnThread *)
void pmatf(bool full=false)
void pmat1(const char *)
void matrix_exchange_nocap()
virtual ~MultiSplitControl()
Definition: multisplit.cpp:335
MultiSplitTransferInfo * msti_
std::unique_ptr< MultiSplitTable > classical_root_to_multisplit_
void multisplit_nocap_v_part3(NrnThread *)
void pmat(bool full=false)
ReducedTree ** rtree_
void reduced_mark(int, int, int, int *, int *, int *)
void multisplit_nocap_v_part2(NrnThread *)
ReducedTree * rt_
Definition: multisplit.cpp:264
int backbone_style
Definition: multisplit.cpp:251
Node * nd[2]
Definition: multisplit.cpp:249
int sid[2]
Definition: multisplit.cpp:250
void triang_backbone(NrnThread *)
void triang(NrnThread *)
void bksub_short_backbone_part1(NrnThread *)
void bksub(NrnThread *)
virtual ~MultiSplitThread()
Definition: multisplit.cpp:350
void bksub_backbone(NrnThread *)
void v_setup(NrnThread *)
void bksub_subtrees(NrnThread *)
void triang_subtree2backbone(NrnThread *)
double ** rmap
Definition: multisplit.cpp:232
int * rmap2smap_index
Definition: multisplit.cpp:236
double ** smap
Definition: multisplit.cpp:232
void fillrmap(int sid1, int sid2, double *pd)
virtual ~ReducedTree()
double * d
Definition: multisplit.cpp:222
double * b
Definition: multisplit.cpp:224
void fillsmap(int sid, double *prhs, double *pdiag)
void scatter()
double * v
Definition: multisplit.cpp:238
MultiSplitControl * msc
Definition: multisplit.cpp:218
void pr_map(int, double *)
ReducedTree(MultiSplitControl *, int rank, int mapsize)
std::unique_ptr< Int2IntTable > s2rt
Definition: multisplit.cpp:241
void reorder(int j, int nt, int *mark, int *all_bb_relation, int *allsid)
double * a
Definition: multisplit.cpp:223
double * rhs
Definition: multisplit.cpp:221
#define sec
Definition: md1redef.h:20
#define id
Definition: md1redef.h:41
#define i
Definition: md1redef.h:19
static double order(void *v)
Definition: cvodeobj.cpp:218
ms
Definition: extargs.h:1
static RNG::key_type k
Definition: nrnran123.cpp:9
#define assert(ex)
Definition: hocassrt.h:24
#define rhs
Definition: lineq.h:6
printf
Definition: extdef.h:5
void move(Item *q1, Item *q2, Item *q3)
Definition: list.cpp:200
char * nh
Definition: mos2nrn.cpp:11
void(* nrn_multisplit_setup_)()
Definition: treeset.cpp:50
std::vector< MultiSplit * > MultiSplitList
Definition: multisplit.cpp:295
static void nrnmpi_postrecv_doubles(double *, int, int, int, void **)
Definition: multisplit.cpp:44
#define D(i)
Definition: multisplit.cpp:56
void * nrn_multisplit_reduce_solve(NrnThread *)
#define S1A(i)
Definition: multisplit.cpp:58
double * nrn_classicalNodeA(Node *nd)
static void nrnmpi_int_allgatherv(int *, int *, int *, int *)
Definition: multisplit.cpp:43
int nrn_multisplit_active_
Definition: multisplit.cpp:20
double * nrn_classicalNodeB(Node *nd)
static double nrnmpi_splitcell_wait_
Definition: multisplit.cpp:36
std::unordered_map< Node *, MultiSplit * > MultiSplitTable
Definition: multisplit.cpp:294
void nrn_multisplit_nocap_v_part1(NrnThread *nt)
#define RHS(i)
Definition: multisplit.cpp:57
void(* nrn_multisplit_solve_)()
Definition: solve.cpp:76
void nrn_multisplit_ptr_update()
static MultiSplitControl * msc_
Definition: multisplit.cpp:298
void nrn_multisplit_nocap_v_part3(NrnThread *nt)
void nrn_multisplit_nocap_v()
static void multisplit_v_setup()
void nrnmpi_multisplit_clear()
Definition: multisplit.cpp:499
void nrn_multisplit_adjust_rhs(NrnThread *nt)
double t
Definition: cvodeobj.cpp:57
static void nrnmpi_wait(void **)
Definition: multisplit.cpp:46
void nrn_multisplit_nocap_v_part2(NrnThread *nt)
static double nrnmpi_wtime()
Definition: multisplit.cpp:48
void * nrn_multisplit_triang(NrnThread *)
double nrnmpi_rtcomp_time_
Definition: ocbbs.cpp:31
void * nrn_multisplit_bksub(NrnThread *)
static void nrnmpi_send_doubles(double *, int, int, int)
Definition: multisplit.cpp:45
void nrnmpi_multisplit(Section *, double x, int sid, int backbone_style)
Definition: multisplit.cpp:300
static void nrnmpi_int_allgather(int *, int *, int)
Definition: multisplit.cpp:42
#define S1B(i)
Definition: multisplit.cpp:59
static void multisplit_solve()
static int nrnmpi_use
Definition: multisplit.cpp:41
static void nrnmpi_barrier()
Definition: multisplit.cpp:47
std::unordered_map< int, int > Int2IntTable
Definition: multisplit.cpp:198
NrnThread * nrn_threads
Definition: multicore.cpp:56
int nrn_nthread
Definition: multicore.cpp:55
void hoc_execerror(const char *s1, const char *s2)
Definition: nrnoc_aux.cpp:39
int diam_changed
Definition: nrnoc_aux.cpp:21
auto *const vec_d
Definition: cellorder.cpp:615
int Sprintf(char(&buf)[N], const char *fmt, Args &&... args)
Redirect sprintf to snprintf if the buffer size can be deduced.
Definition: wrap_sprintf.h:14
int ii
Definition: cellorder.cpp:631
auto *const vec_b
Definition: cellorder.cpp:614
auto *const vec_rhs
Definition: cellorder.cpp:616
int root
Definition: cellorder.cpp:622
void recalc_diam(void)
Definition: treeset.cpp:923
#define nrn_assert(x)
assert()-like macro, independent of NDEBUG status
Definition: nrn_assert.h:33
static Node * node(Object *)
Definition: netcvode.cpp:291
void nrn_matrix_node_free()
Definition: treeset.cpp:1771
int const size_t const size_t n
Definition: nrngsl.h:10
size_t p
size_t j
s
Definition: multisend.cpp:521
short type
Definition: cabvars.h:10
int nrnmpi_myid
#define NODEA(n)
Definition: section_fwd.hpp:52
#define NODEB(n)
Definition: section_fwd.hpp:53
int use_sparse13
Definition: treeset.cpp:58
#define NODERHS(n)
Definition: section_fwd.hpp:55
#define NODED(n)
Definition: section_fwd.hpp:54
int ibuf[3]
Definition: multisplit.cpp:186
MultiSplit * ms
Definition: multisplit.cpp:188
double adjust_rhs_
Definition: multisplit.cpp:187
MultiSplit * ms
Definition: multisplit.cpp:195
double * pd[3]
Definition: multisplit.cpp:193
double adjust_rhs_
Definition: multisplit.cpp:194
Definition: section.h:105
struct NrnThread * _nt
Definition: section.h:196
struct Node * _classical_parent
Definition: section.h:195
int eqn_index_
Definition: section.h:187
Section * sec
Definition: section.h:193
int v_node_index
Definition: section.h:212
int sec_node_index_
Definition: section.h:213
Represent main neuron object computed by single thread.
Definition: multicore.h:58
int * _v_parent_index
Definition: multicore.h:89
double & actual_d(std::size_t row)
Definition: multicore.h:82
int ncell
Definition: multicore.h:64
double * node_a_storage()
Definition: multicore.cpp:1054
int id
Definition: multicore.h:66
double & actual_rhs(std::size_t row)
Definition: multicore.h:85
double * node_rhs_storage()
Definition: multicore.cpp:1074
double * node_area_storage()
Definition: multicore.cpp:1059
int end
Definition: multicore.h:65
Node ** _v_parent
Definition: multicore.h:91
double * node_d_storage()
Definition: multicore.cpp:1069
Node ** _v_node
Definition: multicore.h:90
double * node_voltage_storage()
Definition: multicore.cpp:1098
double * node_b_storage()
Definition: multicore.cpp:1064
short nnode
Definition: section.h:52
int Printf(const char *fmt, Args... args)
Definition: logger.hpp:18