NEURON
test_solver.cpp
Go to the documentation of this file.
1 /*
2 # =============================================================================
3 # Copyright (c) 2022 Blue Brain Project/EPFL
4 #
5 # See top-level LICENSE file for details.
6 # =============================================================================.
7 */
13 
14 #include <catch2/catch_test_macros.hpp>
15 #include <catch2/matchers/catch_matchers_vector.hpp>
16 
17 #include <iostream>
18 #include <functional>
19 #include <map>
20 #include <random>
21 #include <utility>
22 #include <vector>
23 
24 using namespace coreneuron;
25 
26 struct SolverData {
27  std::vector<double> d, rhs;
28  std::vector<int> parent_index;
29 };
30 
31 constexpr auto magic_index_value = -2;
32 constexpr auto magic_double_value = std::numeric_limits<double>::lowest();
33 
34 enum struct SolverImplementation {
42 };
43 
44 std::ostream& operator<<(std::ostream& os, SolverImplementation impl) {
46  return os << "SolverImplementation::CellPermute0_CPU";
47  } else if (impl == SolverImplementation::CellPermute0_GPU) {
48  return os << "SolverImplementation::CellPermute0_GPU";
49  } else if (impl == SolverImplementation::CellPermute1_CPU) {
50  return os << "SolverImplementation::CellPermute1_CPU";
51  } else if (impl == SolverImplementation::CellPermute1_GPU) {
52  return os << "SolverImplementation::CellPermute1_GPU";
53  } else if (impl == SolverImplementation::CellPermute2_CPU) {
54  return os << "SolverImplementation::CellPermute2_CPU";
55  } else if (impl == SolverImplementation::CellPermute2_GPU) {
56  return os << "SolverImplementation::CellPermute2_GPU";
57  } else if (impl == SolverImplementation::CellPermute2_CUDA) {
58  return os << "SolverImplementation::CellPermute2_CUDA";
59  } else {
60  throw std::runtime_error("Invalid SolverImplementation");
61  }
62 }
63 
65  int num_threads{1};
66  int num_cells{1};
67  int num_segments_per_cell{3};
68  std::function<double(int, int)> produce_a{[](auto, auto) { return 3.14159; }},
69  produce_b{[](auto, auto) { return 42.0; }}, produce_d{[](auto, auto) { return 7.0; }},
70  produce_rhs{[](auto, auto) { return -16.0; }};
71 };
72 
73 // TODO include some global lock as a sanity check (only one instance of
74 // SetupThreads should exist at any given time)
75 struct SetupThreads {
78  corenrn_param.gpu = false;
79  switch (impl) {
81  corenrn_param.gpu = true;
82  [[fallthrough]];
85  break;
87  corenrn_param.gpu = true;
88  [[fallthrough]];
91  break;
94  [[fallthrough]];
96  corenrn_param.gpu = true;
97  [[fallthrough]];
100  break;
101  }
103  nrn_threads_create(config.num_threads);
105  int num_cells_remaining{config.num_cells}, total_cells{};
106  for (auto ithread = 0; ithread < nrn_nthread; ++ithread) {
107  auto& nt = nrn_threads[ithread];
108  // How many cells to distribute on this thread, trying to get the right
109  // total even if num_threads does not exactly divide num_cells.
110  nt.ncell = num_cells_remaining / (nrn_nthread - ithread);
111  total_cells += nt.ncell;
112  num_cells_remaining -= nt.ncell;
113  // How many segments are there in this thread?
114  nt.end = nt.ncell * config.num_segments_per_cell;
115  auto const padded_size = nrn_soa_padded_size(nt.end, 0);
116  // Allocate one big block because the GPU data transfer code assumes this.
117  nt._ndata = padded_size * 4;
118  nt._data = static_cast<double*>(emalloc_align(nt._ndata * sizeof(double)));
119  auto* vec_rhs = (nt._actual_rhs = nt._data + 0 * padded_size);
120  auto* vec_d = (nt._actual_d = nt._data + 1 * padded_size);
121  auto* vec_a = (nt._actual_a = nt._data + 2 * padded_size);
122  auto* vec_b = (nt._actual_b = nt._data + 3 * padded_size);
123  auto* parent_indices =
124  (nt._v_parent_index = static_cast<int*>(emalloc_align(padded_size * sizeof(int))));
125  // Magic value to check against later.
126  std::fill(parent_indices, parent_indices + nt.end, magic_index_value);
127  // Put all the root nodes first, then put the other segments
128  // in blocks. i.e. ABCDAAAABBBBCCCCDDDD
129  auto const get_index = [ncell = nt.ncell,
130  nseg = config.num_segments_per_cell](auto icell, auto iseg) {
131  if (iseg == 0) {
132  return icell;
133  } else {
134  return ncell + icell * (nseg - 1) + iseg - 1;
135  }
136  };
137  for (auto icell = 0; icell < nt.ncell; ++icell) {
138  for (auto iseg = 0; iseg < config.num_segments_per_cell; ++iseg) {
139  auto const global_index = get_index(icell, iseg);
140  vec_a[global_index] = config.produce_a(icell, iseg);
141  vec_b[global_index] = config.produce_b(icell, iseg);
142  vec_d[global_index] = config.produce_d(icell, iseg);
143  vec_rhs[global_index] = config.produce_rhs(icell, iseg);
144  // 0th element is the root node, which has no parent
145  // other elements are attached in a binary tree configuration
146  // | 0 |
147  // | / \ |
148  // | 1 2 |
149  // | / \ / \ |
150  // | 3 4 5 6 |
151  // TODO: include some other topologies, e.g. a long straight line, or
152  // an unbalanced tree.
153  auto const parent_id = iseg ? get_index(icell, (iseg - 1) / 2) : -1;
154  parent_indices[global_index] = parent_id;
155  }
156  }
157  // Check we didn't mess up populating any parent indices
158  for (auto i = 0; i < nt.end; ++i) {
159  REQUIRE(parent_indices[i] != magic_index_value);
160  // Root nodes should come first for --cell-permute=0
161  if (i < nt.ncell) {
162  REQUIRE(parent_indices[i] == -1);
163  }
164  }
166  nt._permute = interleave_order(nt.id, nt.ncell, nt.end, parent_indices);
167  REQUIRE(nt._permute);
168  permute_data(vec_a, nt.end, nt._permute);
169  permute_data(vec_b, nt.end, nt._permute);
170  // This isn't done in CoreNEURON because these are reset every
171  // time step, but permute d/rhs here so that the initial values
172  // set by produce_d and produce_rhs are propagated consistently
173  // to all of the solver implementations.
174  permute_data(vec_d, nt.end, nt._permute);
175  permute_data(vec_rhs, nt.end, nt._permute);
176  // index values change as well as ordering
177  permute_ptr(parent_indices, nt.end, nt._permute);
178  node_permute(parent_indices, nt.end, nt._permute);
179  }
180  }
182  std::cout << "CellPermute0_GPU is a nonstandard configuration, copying data to the "
183  "device may produce warnings:";
184  }
185  if (corenrn_param.gpu) {
187  }
189  std::cout << "\n...no more warnings expected" << std::endl;
190  }
191  // Make sure we produced the number of cells we were aiming for
192  REQUIRE(total_cells == config.num_cells);
193  REQUIRE(num_cells_remaining == 0);
194  }
195 
197  if (corenrn_param.gpu) {
199  }
200  for (auto& nt: *this) {
201  free_memory(std::exchange(nt._data, nullptr));
202  delete[] std::exchange(nt._permute, nullptr);
203  free_memory(std::exchange(nt._v_parent_index, nullptr));
204  }
207  }
208 
210  std::vector<SolverData> ret{static_cast<std::size_t>(nrn_nthread)};
211  // Sync the solver data from GPU to host
213  // Un-permute the data in and store it in ret.{d,parent_index,rhs}
214  for (auto i = 0; i < nrn_nthread; ++i) {
215  auto& nt = nrn_threads[i];
216  auto& sd = ret[i];
217  sd.d.resize(nt.end, magic_double_value);
218  sd.parent_index.resize(nt.end, magic_index_value);
219  sd.rhs.resize(nt.end, magic_double_value);
220  auto* inv_permute = nt._permute ? inverse_permute(nt._permute, nt.end) : nullptr;
221  for (auto i = 0; i < nt.end; ++i) {
222  // index in permuted vectors
223  auto const p_i = nt._permute ? nt._permute[i] : i;
224  // parent index in permuted vectors
225  auto const p_parent = nt._v_parent_index[p_i];
226  // parent index in unpermuted vectors (i.e. on the same scale as `i`)
227  auto const parent = p_parent == -1
228  ? -1
229  : (inv_permute ? inv_permute[p_parent] : p_parent);
230  // Save the values to the de-permuted return structure
231  sd.d[i] = nt._actual_d[p_i];
232  sd.parent_index[i] = parent;
233  sd.rhs[i] = nt._actual_rhs[p_i];
234  }
235  delete[] inv_permute;
236  for (auto i = 0; i < nt.end; ++i) {
237  REQUIRE(sd.d[i] != magic_double_value);
238  REQUIRE(sd.parent_index[i] != magic_index_value);
239  REQUIRE(sd.rhs[i] != magic_double_value);
240  }
241  }
242  return ret;
243  }
244 
245  void solve() {
246  for (auto& thread: *this) {
247  nrn_solve_minimal(&thread);
248  }
249  }
250 
251  NrnThread* begin() const {
252  return nrn_threads;
253  }
254  NrnThread* end() const {
255  return nrn_threads + nrn_nthread;
256  }
257 };
258 
259 template <typename... Args>
260 auto solve_and_dump(Args&&... args) {
261  SetupThreads threads{std::forward<Args>(args)...};
262  threads.solve();
263  return threads.dump_solver_data();
264 }
265 
267  // These are always available
268  std::vector<SolverImplementation> ret{SolverImplementation::CellPermute0_CPU,
271 #ifdef CORENEURON_ENABLE_GPU
272  // Consider making these steerable via a runtime switch in GPU builds
277 #endif
278  return ret;
279 }
280 
282  std::map<SolverImplementation, std::vector<SolverData>> const& solver_data) {
283  // CellPermute0_CPU is the simplest version of the solver, it should always
284  // be present and it's a good reference to use
285  constexpr auto ref_impl = SolverImplementation::CellPermute0_CPU;
286  REQUIRE(solver_data.find(ref_impl) != solver_data.end());
287  auto const& ref_data = solver_data.at(ref_impl);
288  for (auto const& [impl, impl_data]: solver_data) {
289  // Must have compatible numbers of threads.
290  REQUIRE(impl_data.size() == ref_data.size());
291  std::cout << "Comparing " << impl << " to " << ref_impl << std::endl;
292  for (auto n_thread = 0ul; n_thread < impl_data.size(); ++n_thread) {
293  // Must have compatible numbers of segments/data entries
294  REQUIRE(impl_data[n_thread].d.size() == ref_data[n_thread].d.size());
295  REQUIRE(impl_data[n_thread].parent_index.size() ==
296  ref_data[n_thread].parent_index.size());
297  REQUIRE(impl_data[n_thread].rhs.size() == ref_data[n_thread].rhs.size());
298  CHECK_THAT(impl_data[n_thread].d, Catch::Matchers::Approx(ref_data[n_thread].d));
299  REQUIRE(impl_data[n_thread].parent_index == ref_data[n_thread].parent_index);
300  CHECK_THAT(impl_data[n_thread].rhs, Catch::Matchers::Approx(ref_data[n_thread].rhs));
301  }
302  }
303 }
304 
305 template <typename... Args>
307  std::map<SolverImplementation, std::vector<SolverData>> solver_data;
308  for (auto impl: active_implementations()) {
309  solver_data[impl] = solve_and_dump(impl, std::forward<Args>(args)...);
310  }
311  compare_solver_data(solver_data);
312  return solver_data;
313 }
314 
315 // *Roughly* tuned to accomodate NVHPC 22.3 at -O0; the largest differences come
316 // from the pseudorandom seeded tests.
317 constexpr double default_tolerance = 2e-11;
318 
319 TEST_CASE("SingleCellAndThread", "[solver][single-thread]") {
320  constexpr std::size_t segments = 32;
321  ToyModelConfig config{};
322  config.num_segments_per_cell = segments;
323  auto const solver_data = compare_all_active_implementations(config);
324  for (auto const& [impl, data]: solver_data) {
325  REQUIRE(data.size() == 1); // nthreads
326  REQUIRE(data[0].d.size() == segments);
327  REQUIRE(data[0].parent_index.size() == segments);
328  REQUIRE(data[0].rhs.size() == segments);
329  }
330 }
331 
332 TEST_CASE("UnbalancedCellSingleThread", "[solver][single-thread]") {
333  ToyModelConfig config{};
334  config.num_segments_per_cell = 19; // not a nice round number
336 }
337 
338 TEST_CASE("LargeCellSingleThread", "[solver][single-thread]") {
339  ToyModelConfig config{};
340  config.num_segments_per_cell = 4096;
342 }
343 
344 TEST_CASE("ManySmallCellsSingleThread", "[solver][single-thread]") {
345  ToyModelConfig config{};
346  config.num_cells = 1024;
348 }
349 
350 TEST_CASE("ManySmallCellsMultiThread", "[solver][multi-thread]") {
351  ToyModelConfig config{};
352  config.num_cells = 1024;
353  config.num_threads = 2;
355 }
356 
358  std::mt19937_64 gen{42};
359  ToyModelConfig config{};
360  config.produce_a = [g = gen, d = std::normal_distribution{1.0, 0.1}](int icell,
361  int iseg) mutable {
362  return d(g);
363  };
364  config.produce_b = [g = gen, d = std::normal_distribution{7.0, 0.2}](int, int) mutable {
365  return d(g);
366  };
367  config.produce_d = [g = gen, d = std::normal_distribution{-0.1, 0.01}](int, int) mutable {
368  return d(g);
369  };
370  config.produce_rhs = [g = gen, d = std::normal_distribution{-15.0, 2.0}](int, int) mutable {
371  return d(g);
372  };
373  return config;
374 }
375 
376 TEST_CASE("LargeCellSingleThreadRandom", "[solver][single-thread][random]") {
377  auto config = random_config();
378  config.num_segments_per_cell = 4096;
380 }
381 
382 TEST_CASE("ManySmallCellsSingleThreadRandom", "[solver][single-thread][random]") {
383  auto config = random_config();
384  config.num_cells = 1024;
386 }
#define i
Definition: md1redef.h:19
#define rhs
Definition: lineq.h:6
void free_memory(void *pointer)
Definition: memory.h:213
static double map(void *v)
Definition: mlinedit.cpp:43
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
NrnThread * nrn_threads
Definition: multicore.cpp:56
void nrn_threads_create(int n)
Definition: multicore.cpp:102
void * emalloc_align(size_t size, size_t alignment)
int nrn_nthread
Definition: multicore.cpp:55
static int inv_permute(int i, NrnThread &nt)
Definition: prcellstate.cpp:32
void update_nrnthreads_on_host(NrnThread *threads, int nthreads)
int * inverse_permute(int *p, int n)
int interleave_permute_type
bool use_solve_interleave
Definition: solve_core.cpp:13
void nrn_solve_minimal(NrnThread *)
Definition: solve_core.cpp:18
void nrn_threads_free()
Definition: multicore.cpp:125
void delete_nrnthreads_on_device(NrnThread *threads, int nthreads)
Cleanup device memory that is being tracked by the OpenACC runtime.
void setup_nrnthreads_on_device(NrnThread *threads, int nthreads)
int nrn_soa_padded_size(int cnt, int layout)
calculate size after padding for specific memory layout
corenrn_parameters corenrn_param
Printing method.
std::ostream & operator<<(std::ostream &os, const corenrn_parameters &corenrn_param)
icycle< ncycle;++icycle) { int istride=stride[icycle];nrn_pragma_acc(loop vector) nrn_pragma_omp(loop bind(parallel)) for(int icore=0;icore< warpsize;++icore) { int i=ii+icore;if(icore< istride) { int ip=GPU_PARENT(i);GPU_RHS(i) -=GPU_B(i) *GPU_RHS(ip);GPU_RHS(i)/=GPU_D(i);} i+=istride;} ii+=istride;} }}void solve_interleaved2(int ith) { NrnThread *nt=nrn_threads+ith;InterleaveInfo &ii=interleave_info[ith];int nwarp=ii.nwarp;if(nwarp==0) return;int ncore=nwarp *warpsize;int *ncycles=ii.cellsize;int *stridedispl=ii.stridedispl;int *strides=ii.stride;int *rootbegin=ii.firstnode;int *nodebegin=ii.lastnode;if(0) { nrn_pragma_acc(parallel loop gang present(nt[0:1], strides[0:nstride], ncycles[0:nwarp], stridedispl[0:nwarp+1], rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) async(nt->stream_id)) nrn_pragma_omp(target teams loop map(present, alloc:nt[:1], strides[:nstride], ncycles[:nwarp], stridedispl[:nwarp+1], rootbegin[:nwarp+1], nodebegin[:nwarp+1])) for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} nrn_pragma_acc(wait(nt->stream_id)) } else { for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} }}void solve_interleaved1(int ith) { NrnThread *nt=nrn_threads+ith;int ncell=nt-> ncell
Definition: cellorder.cpp:784
void permute_data(double *vec, int n, int *p)
auto *const vec_d
Definition: cellorder.cpp:615
std::vector< int > interleave_order(int ith, int ncell, int nnode, int *parent)
Function that performs the permutation of the cells such that the execution threads access coalesced ...
Definition: cellorder.cpp:348
auto *const vec_b
Definition: cellorder.cpp:614
void create_interleave_info()
Definition: cellorder.cpp:110
auto *const vec_rhs
Definition: cellorder.cpp:616
void permute_ptr(int *vec, int n, int *p)
void destroy_interleave_info()
Definition: cellorder.cpp:115
static int get_index(const ast::IndexedName &node)
SetupThreads(SolverImplementation impl, ToyModelConfig config={})
Definition: test_solver.cpp:76
NrnThread * begin() const
NrnThread * end() const
auto dump_solver_data()
std::vector< double > d
Definition: test_solver.cpp:27
std::vector< int > parent_index
Definition: test_solver.cpp:28
int num_segments_per_cell
Definition: test_solver.cpp:67
std::function< double(int, int)> produce_a
Definition: test_solver.cpp:68
bool cuda_interface
Enable GPU computation.
constexpr auto magic_index_value
Definition: test_solver.cpp:31
auto random_config()
auto active_implementations()
void compare_solver_data(std::map< SolverImplementation, std::vector< SolverData >> const &solver_data)
constexpr auto magic_double_value
Definition: test_solver.cpp:32
auto solve_and_dump(Args &&... args)
TEST_CASE("SingleCellAndThread", "[solver][single-thread]")
auto compare_all_active_implementations(Args &&... args)
SolverImplementation
Definition: test_solver.cpp:34
constexpr double default_tolerance