multisend_setup.cpp
/*
For very large numbers of processors, cells, and fanout, it takes
a long time to figure out each cell's target list given the input gids
(gid2in) on each host: e.g. 240 seconds for 2^25 cells, 1k connections
per cell, and 128K cores; and 340 seconds for two-phase exchange.
To reduce this setup time we experiment with a very different algorithm in which
we construct a gid target host list on host gid%nhost and copy that list to
the source host owning the gid.
*/
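/*
Illustrative sketch of the rendezvous pattern used below (values made up):
with nhost = 4, every rank that needs input gid 10 sends that gid to the
intermediate rank 10 % 4 = 2. If ranks 1 and 3 both need gid 10, rank 2
assembles the target list {1, 3} and then copies it to the rank that owns
(sources) gid 10.
*/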
#include "oc_ansi.h"

#include <cmath>
#include <memory>
#include <unordered_map>

#include <nrnran123.h>

static void del(int* a) {
    if (a) {
        delete[] a;
    }
}

static int* newintval(int val, int size) {
    if (size == 0) {
        return 0;
    }
    int* x = new int[size];
    for (int i = 0; i < size; ++i) {
        x[i] = val;
    }
    return x;
}

static int* newoffset(int* acnt, int size) {
    int* aoff = new int[size + 1];
    aoff[0] = 0;
    for (int i = 0; i < size; ++i) {
        aoff[i + 1] = aoff[i] + acnt[i];
    }
    return aoff;
}
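// Example (illustrative): newoffset turns per-rank counts into exclusive
// prefix-sum displacements, e.g. acnt = {2, 0, 3} with size 3 yields
// aoff = {0, 2, 2, 5}; aoff[size] is the total element count.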


// input scnt, sdispl ; output, newly allocated rcnt, rdispl
static void all2allv_helper(int* scnt, int* sdispl, int*& rcnt, int*& rdispl) {
    int np = nrnmpi_numprocs;
    int* c = newintval(1, np);
    rdispl = newoffset(c, np);
    rcnt = newintval(0, np);
    nrnmpi_int_alltoallv(scnt, c, rdispl, rcnt, c, rdispl);
    del(c);
    del(rdispl);
    rdispl = newoffset(rcnt, np);
}

// input s, scnt, sdispl ; output, newly allocated r, rcnt, rdispl
static void
all2allv_int(int* s, int* scnt, int* sdispl, int*& r, int*& rcnt, int*& rdispl, const char* dmes) {
    int np = nrnmpi_numprocs;
    all2allv_helper(scnt, sdispl, rcnt, rdispl);
    r = newintval(0, rdispl[np]);

    nrnmpi_int_alltoallv(s, scnt, sdispl, r, rcnt, rdispl);
}
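// Example (illustrative): the exchange is done in two steps. all2allv_helper
// first alltoallv's the per-rank send counts (one int per rank) so each rank
// learns rcnt, from which rdispl is derived; all2allv_int then alltoallv's
// the actual payload. E.g. with 2 ranks, if rank 0 sends scnt = {0, 2},
// rank 1 receives rcnt[0] = 2 and allocates r accordingly.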

class TarList {
  public:
    TarList();
    virtual ~TarList();
    virtual void alloc();
    int size;
    int* list;
    int rank;
    int* indices; // indices of list for groups of phase2 targets.
    // If indices is not null, then size is one less than
    // the size of the indices list, where indices[size] = the size of
    // the list. indices[0] is 0 and list[indices[i]] is the rank
    // to send the ith group of phase2 targets.
};
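// Example (illustrative) of the indices encoding: for 6 phase 2 targets
// grouped in threes, indices = {0, 3, 6} and size = 2 (one less than the
// indices length), with list = {r0, r1, r2, r3, r4, r5}; list[indices[0]] = r0
// and list[indices[1]] = r3 are the ranks that send the two groups of
// phase 2 targets.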

using Int2TarList = std::unordered_map<int, std::unique_ptr<TarList>>;

TarList::TarList() {
    size = 0;
    list = 0;
    rank = -1;
    indices = 0;
}

TarList::~TarList() {
    del(list);
    del(indices);
}

void TarList::alloc() {
    if (size) {
        list = new int[size];
    }
}

static nrnran123_State* ranstate;

static void random_init(int i) {
    if (!ranstate) {
        // Reconstructed line: lazily create the Random123 stream; the
        // (i, 0) stream ids are an assumption.
        ranstate = nrnran123_newstream(i, 0);
    }
}

static unsigned int get_random() {
    return nrnran123_ipick(ranstate);
}

static int iran(int i1, int i2) {
    // discrete uniform random integer from i1 to i2 inclusive. Must
    // work if i1 == i2
    if (i1 == i2) {
        return i1;
    }
    int i3 = i1 + get_random() % (i2 - i1 + 1);
    return i3;
}
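// Example: iran(3, 7) returns a member of {3, 4, 5, 6, 7} (approximately
// uniformly; the modulo introduces negligible bias for small ranges), and
// iran(5, 5) returns 5 without consuming a random draw.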

static void phase2organize(TarList* tl) {
    // copied and modified from old specify_phase2_distribution of multisend.cpp
    int n, nt;
    nt = tl->size;
    n = int(sqrt(double(nt)));
    // change to about 20
    if (n > 1) { // do not bother if not many connections
        // group sizes as equal as possible
        tl->indices = new int[n + 1];
        tl->indices[n] = tl->size;
        tl->size = n;
        for (int i = 0; i < n; ++i) {
            tl->indices[i] = (i * nt) / n;
        }
        // Note: not sure the following is true anymore but it could be.
        // This distribution is very biased (if 0 is a phase1 target,
        // it is always a phase2 sender). So now choose a random
        // target in the subset and make that the phase2 sender
        // (need to switch the indices[i] target and the one chosen)
        for (int i = 0; i < n; ++i) {
            int i1 = tl->indices[i];
            int i2 = tl->indices[i + 1] - 1;
            // need discrete uniform random integer from i1 to i2
            int i3 = iran(i1, i2);
            int itar = tl->list[i1];
            tl->list[i1] = tl->list[i3];
            tl->list[i3] = itar;
        }
    }
}
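// Example (illustrative): nt = 9 targets gives n = int(sqrt(9)) = 3 groups,
// so indices = {0, 3, 6, 9} and tl->size becomes 3. Within each group
// [indices[i], indices[i+1]) a uniformly chosen member is swapped to the
// front, and that front member (list[indices[i]]) becomes the group's
// phase 2 sender.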

/*
Setting up target lists uses a lot of temporary memory. It is conceivable
that this could be done prior to creating any cells or connections. I.e.
gid2out is presently known from pc.set_gid2node(gid,...), and gid2in is
presently known from NetCon = pc.gid_connect(gid, target). It would be quite
a change in style and hoc network programming to use something like
pc.need_gid(gid) before cells with their synapses are created, since one
would have to imagine that the hoc network setup code would be executed in a
virtual or 'abstract' fashion without actually creating cells, targets, or
NetCons. Anyway, to potentially support this in the future, we write
setup_target_lists to not use any PreSyn information.
*/

static int setup_target_lists(int**);
static void fill_multisend_send_lists(int, int*);

static void setup_presyn_multisend_lists() {
    // Create and attach Multisend_Send instances to output Presyn
    for (const auto& iter: gid2out_) {
        PreSyn* ps = iter.second;
        // only ones that generate spikes. e.g. multisplit
        // registers a gid and even associates with a cell piece, but
        // that piece may not send spikes.
        if (ps->output_index_ >= 0) {
            // Reconstructed line: plausibly frees any previously attached
            // send structures before creating a fresh one.
            nrn_multisend_cleanup_presyn(ps);
            ps->bgp.multisend_send_ = new Multisend_Send();
        }
    }

    // Need to use the bgp union slot for multisend_send_phase2_.
    // Only will be non-NULL if the input is a phase 2 sender.
    for (const auto& iter: gid2in_) {
        PreSyn* ps = iter.second;
        ps->bgp.srchost_ = 0;
    }

    int* r;
    int sz = setup_target_lists(&r);
    fill_multisend_send_lists(sz, r);
    del(r);

    // phase1debug();
    // phase2debug();
}

static void fill_multisend_send_lists(int sz, int* r) {
    // r is a sequence of (gid, size, [totalsize], list) records.
    // Note that totalsize is present only for output gids when use_phase2_.
    // Using this sequence, copy lists to the proper phase 1 and phase 2
    // lists. (Phase 1 lists are found in gid2out_ and phase 2 lists in
    // gid2in_.)
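    // Example (illustrative values): with use_phase2_ == 0 a record for an
    // output gid looks like {7, 3, 1, 4, 6}: gid 7, 3 targets, ranks 1, 4, 6.
    // With use_phase2_ == 1 the phase 1 record gains a totalsize field,
    // e.g. {7, 2, 5, 1, 4}: gid 7, 2 phase 1 targets, 5 targets overall,
    // ranks 1 and 4. Records for phase 2 senders ({gid, size, list}) are
    // addressed to input gids and are found via gid2in_.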
    for (int i = 0; i < sz;) {
        int gid = r[i++];
        int size = r[i++];
        PreSyn* ps{nullptr};
        if (use_phase2_) { // look in gid2in first
            auto iter = gid2in_.find(gid);
            if (iter != gid2in_.end()) {
                ps = iter->second;
                Multisend_Send_Phase2* bsp = new Multisend_Send_Phase2();
                ps->bgp.multisend_send_phase2_ = bsp;
                bsp->ntarget_hosts_phase2_ = size;
                int* p = newintval(0, size);
                bsp->target_hosts_phase2_ = p;
                // printf("%d %d phase2 size=%d\n", nrnmpi_myid, gid, bsp->ntarget_hosts_phase2_);
                for (int j = 0; j < size; ++j) {
                    p[j] = r[i++];
                    assert(p[j] != nrnmpi_myid);
                }
            }
        }
        if (!ps) { // phase 1 target list (or whole list if use_phase2 is 0)
            auto iter = gid2out_.find(gid);
            nrn_assert(iter != gid2out_.end());
            ps = iter->second;
            Multisend_Send* bs = ps->bgp.multisend_send_;
            bs->ntarget_hosts_phase1_ = size;
            if (use_phase2_ == 0) {
                bs->ntarget_hosts_ = size;
            } else {
                bs->ntarget_hosts_ = r[i++];
            }
            int* p = newintval(0, size);
            bs->target_hosts_ = p;
            // printf("%d %d phase1 size=%d\n", nrnmpi_myid, gid, bs->ntarget_hosts_);
            for (int j = 0; j < size; ++j) {
                p[j] = r[i++];
                // There never was a possibility of send2self
                // because an output presyn is never in gid2in_.
                assert(p[j] != nrnmpi_myid);
            }
        }
    }
    // compute max_ntarget_host and max_multisend_targets
    max_ntarget_host = 0;
    max_multisend_targets = 0;
    for (const auto& iter: gid2out_) {
        PreSyn* ps = iter.second;
        if (ps->output_index_ >= 0) { // only ones that generate spikes
            Multisend_Send* bs = ps->bgp.multisend_send_;
            if (max_ntarget_host < bs->ntarget_hosts_) {
                max_ntarget_host = bs->ntarget_hosts_;
            }
            if (max_multisend_targets < bs->ntarget_hosts_phase1_) {
                max_multisend_targets = bs->ntarget_hosts_phase1_;
            }
        }
    }
    if (use_phase2_) {
        for (const auto& iter: gid2in_) {
            PreSyn* ps = iter.second;
            Multisend_Send_Phase2* bsp = ps->bgp.multisend_send_phase2_;
            if (bsp && max_multisend_targets < bsp->ntarget_hosts_phase2_) {
                max_multisend_targets = bsp->ntarget_hosts_phase2_;
            }
        }
    }
}

// The return value is the size of the vector, which is returned via
// r_return. The vector encodes a sequence of gid, target list size,
// and target list.
static int setup_target_lists(int** r_return) {
    int *s, *r, *scnt, *rcnt, *sdispl, *rdispl;
    int nhost = nrnmpi_numprocs;

    // What are the target ranks for a given input gid? All the ranks
    // with the same input gid send that gid to the intermediate
    // gid%nhost rank. The intermediate rank can then construct the
    // list of target ranks for the gids it gets.

    // scnt[i] is the number of input gids this rank sends to intermediate i.
    scnt = newintval(0, nhost);
    for (const auto& iter: gid2in_) {
        int gid = iter.first;
        ++scnt[gid % nhost];
    }

    // s holds the input gids to be sent to the various intermediates.
    sdispl = newoffset(scnt, nhost);
    s = newintval(0, sdispl[nhost]);
    for (const auto& iter: gid2in_) {
        int gid = iter.first;
        s[sdispl[gid % nhost]++] = gid;
    }
    // Restore sdispl for the message.
    del(sdispl);
    sdispl = newoffset(scnt, nhost);

    all2allv_int(s, scnt, sdispl, r, rcnt, rdispl, "gidin to intermediate");
    del(s);
    del(scnt);
    del(sdispl);
    // r is the gids received by this intermediate rank from all other ranks.

    // Construct hash table for finding the target rank list for a given gid.
    Int2TarList gid2tarlist;
    // Now figure out the size of the target list for each distinct gid in r.
    for (int i = 0; i < rdispl[nhost]; ++i) {
        const int gid = r[i];
        auto& tl = gid2tarlist[gid]; // default-constructs an empty std::unique_ptr<TarList> if needed
        if (!tl) {
            tl.reset(new TarList()); // the constructor initialises `size` to zero
        }
        ++(tl->size);
    }
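    // Example (illustrative): if r = {10, 42, 10} was received with the
    // first two elements from rank 0 and the third from rank 3, then
    // gid2tarlist maps 10 -> size 2 and 42 -> size 1; the fill loop below
    // stores the target rank lists {0, 3} and {0} respectively.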

    // Conceptually, now the intermediate is the mpi source and the gid
    // sources are the mpi destination in regard to target lists.
    // It would be possible at this point, but confusing,
    // to allocate a s[rdispl[nhost]] and figure out scnt and sdispl
    // by getting the counts and gids from the ranks that own the source
    // gids. In this way we could organize s without having to allocate
    // individual target lists on the intermediate and then allocate
    // another large s buffer to receive a copy of them. However, for
    // this processing we already require two large buffers for input
    // gids, so there is no real savings of space.
    // So let's do the simple obvious sequence and now complete the
    // target lists.

    // Allocate the target lists (and set size to 0; we will recount when filling).
    for (auto& iter: gid2tarlist) {
        TarList* tl = iter.second.get();
        tl->alloc();
        tl->size = 0;
    }

    // fill the target lists
    for (int rank = 0; rank < nhost; ++rank) {
        int b = rdispl[rank];
        int e = rdispl[rank + 1];
        for (int i = b; i < e; ++i) {
            const auto iter = gid2tarlist.find(r[i]);
            if (iter != gid2tarlist.end()) {
                TarList* tl = iter->second.get();
                tl->list[tl->size] = rank;
                tl->size++;
            }
        }
    }
    del(r);
    del(rcnt);
    del(rdispl);

    // Now the intermediate hosts have complete target lists and
    // the sources know the intermediate host from the gid2out_ map.
    // We could potentially organize here for two-phase exchange as well.

    // Which target lists are desired by the source rank?

    // Ironically, for round robin distributions, the target lists are
    // already on the proper source rank, so the following code should
    // be tested for random distributions of gids.
    // How many on the source rank?
    scnt = newintval(0, nhost);
    for (const auto& iter: gid2out_) {
        PreSyn* ps = iter.second;
        int gid = iter.first;
        if (ps->output_index_ >= 0) { // only ones that generate spikes
            ++scnt[gid % nhost];
        }
    }
    sdispl = newoffset(scnt, nhost);

    // what are the gids of those target lists
    s = newintval(0, sdispl[nhost]);
    for (const auto& iter: gid2out_) {
        PreSyn* ps = iter.second;
        int gid = iter.first;
        if (ps->output_index_ >= 0) { // only ones that generate spikes
            s[sdispl[gid % nhost]++] = gid;
        }
    }
    // Restore sdispl for the message.
    del(sdispl);
    sdispl = newoffset(scnt, nhost);
    all2allv_int(s, scnt, sdispl, r, rcnt, rdispl, "gidout");

    // fill in the tl->rank for phase 1 target lists
    // r is an array of source spiking gids
    // gid2tarlist associates input gids with lists of target ranks
    for (int rank = 0; rank < nhost; ++rank) {
        int b = rdispl[rank];
        int e = rdispl[rank + 1];
        for (int i = b; i < e; ++i) {
            // note that there may be input gids with no corresponding
            // output gid, so the find may fail, and in
            // that case the tl->rank remains -1.
            // For example, multisplit gids or simulation of a subset of
            // cells.
            const auto iter = gid2tarlist.find(r[i]);
            if (iter != gid2tarlist.end()) {
                TarList* tl = iter->second.get();
                tl->rank = rank;
            }
        }
    }
    del(s);
    del(scnt);
    del(sdispl);
    del(r);
    del(rcnt);
    del(rdispl);

    if (use_phase2_) {
        // Reconstructed line: seed the rank-local Random123 stream used by
        // iran() in phase2organize (the exact seed argument is an assumption).
        random_init(nrnmpi_myid + 1);
        for (const auto& iter: gid2tarlist) {
            TarList* tl = iter.second.get();
            if (tl->rank >= 0) { // only if output gid is spike generating
                phase2organize(tl);
            }
        }
    }

    // For clarity, use the all2allv_int style of information flow
    // from source to destination as above (instead of the obsolete
    // section, which was difficult to understand; it was last present at
    // 672:544c61a730ec), and also use a uniform code
    // for copying one- and two-phase information from a TarList to
    // develop the s, scnt, and sdispl buffers. That is, a buffer list
    // section in s, for either a one-phase list or the much shorter
    // (individually) lists for first and second phases, has a
    // gid, size, totalsize header for each list, where totalsize
    // is present only if the gid is an output gid (for
    // Multisend_Send.ntarget_host used for conservation).
    // Note that totalsize is tl->indices[tl->size].
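    // Example (illustrative): a TarList for output gid 7 with
    // indices = {0, 3, 6} and list = {2, 5, 8, 4, 1, 9} contributes, to the
    // source rank (tl->rank), the phase 1 section {7, 2, 6, 2, 4} (gid,
    // 2 phase 2 senders, 6 total targets, senders 2 and 4) and, to each
    // sender, a phase 2 section: {7, 2, 5, 8} to rank 2 and {7, 2, 1, 9}
    // to rank 4.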

    // how much to send to each rank
    scnt = newintval(0, nhost);
    for (const auto& iter: gid2tarlist) {
        TarList* tl = iter.second.get();
        if (tl->rank < 0) {
            // When the output gid does not generate spikes, that rank
            // is not interested in whether there is a target list for it.
            // If the output gid does not exist, there is no rank.
            // In either case, ignore this target list.
            continue;
        }
        if (tl->indices) {
            // indices[size] is the length of list, but size of those
            // entries are the sublist phase 2 destination ranks, which
            // don't get sent as part of the phase 2 target lists.
            // Also there is a phase 1 target list of size, so there
            // are altogether size+1 target lists.
            // (one phase 1 list and size phase 2 lists)
            scnt[tl->rank] += tl->size + 2; // gid, size, list
            for (int i = 0; i < tl->size; ++i) {
                scnt[tl->list[tl->indices[i]]] += tl->indices[i + 1] - tl->indices[i] + 1;
                // gid, size, list
            }
        } else {
            // gid, list size, list
            scnt[tl->rank] += tl->size + 2;
        }
        if (use_phase2_) {
            // The phase 1 header has, as its third element, the
            // total list size (needed for conservation).
            scnt[tl->rank] += 1;
        }
    }
    sdispl = newoffset(scnt, nhost);
    s = newintval(0, sdispl[nhost]);
    // what to send to each rank
    for (const auto& iter: gid2tarlist) {
        int gid = iter.first;
        TarList* tl = iter.second.get();
        if (tl->rank < 0) {
            continue;
        }
        if (tl->indices) {
            s[sdispl[tl->rank]++] = gid;
            s[sdispl[tl->rank]++] = tl->size;
            if (use_phase2_) {
                s[sdispl[tl->rank]++] = tl->indices[tl->size];
            }
            for (int i = 0; i < tl->size; ++i) {
                s[sdispl[tl->rank]++] = tl->list[tl->indices[i]];
            }
            for (int i = 0; i < tl->size; ++i) {
                int rank = tl->list[tl->indices[i]];
                s[sdispl[rank]++] = gid;
                assert(tl->indices[i + 1] > tl->indices[i]);
                s[sdispl[rank]++] = tl->indices[i + 1] - tl->indices[i] - 1;
                for (int j = tl->indices[i] + 1; j < tl->indices[i + 1]; ++j) {
                    s[sdispl[rank]++] = tl->list[j];
                }
            }
        } else {
            // gid, list size, list
            s[sdispl[tl->rank]++] = gid;
            s[sdispl[tl->rank]++] = tl->size;
            if (use_phase2_) {
                s[sdispl[tl->rank]++] = tl->size;
            }
            for (int i = 0; i < tl->size; ++i) {
                s[sdispl[tl->rank]++] = tl->list[i];
            }
        }
    }
    del(sdispl);
    sdispl = newoffset(scnt, nhost);
    all2allv_int(s, scnt, sdispl, r, rcnt, rdispl, "lists");
    del(s);
    del(scnt);
    del(sdispl);

    del(rcnt);
    int sz = rdispl[nhost];
    del(rdispl);
    *r_return = r;
    return sz;
}