#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_vector.hpp>
std::vector<double> d, rhs;
return os << "SolverImplementation::CellPermute0_CPU";
return os << "SolverImplementation::CellPermute0_GPU";
return os << "SolverImplementation::CellPermute1_CPU";
return os << "SolverImplementation::CellPermute1_GPU";
return os << "SolverImplementation::CellPermute2_CPU";
return os << "SolverImplementation::CellPermute2_GPU";
return os << "SolverImplementation::CellPermute2_CUDA";
throw std::runtime_error("Invalid SolverImplementation");
int num_segments_per_cell{3};
std::function<double(int, int)> produce_a{[](auto, auto) { return 3.14159; }},
    produce_b{[](auto, auto) { return 42.0; }}, produce_d{[](auto, auto) { return 7.0; }},
    produce_rhs{[](auto, auto) { return -16.0; }};
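Because every producer is a defaulted std::function member, a test only needs to override the inputs it cares about. A small usage sketch, assuming the ToyModelConfig members and the compare_all_active_implementations helper named later in this listing:

ToyModelConfig config{};
config.num_cells = 4;              // a handful of cells
config.num_segments_per_cell = 8;  // a few segments per cell
// Override one coefficient; the other producers keep their constant defaults.
config.produce_d = [](int icell, int iseg) { return 7.0 + 0.1 * icell + 0.01 * iseg; };
auto const solver_data = compare_all_active_implementations(config);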
int num_cells_remaining{config.num_cells}, total_cells{};
for (auto ithread = 0; ithread < nrn_nthread; ++ithread) {
    total_cells += nt.ncell;
    num_cells_remaining -= nt.ncell;
    nt.end = nt.ncell * config.num_segments_per_cell;
    nt._ndata = padded_size * 4;
    nt._data = static_cast<double*>(emalloc_align(nt._ndata * sizeof(double)));
    auto* vec_rhs = (nt._actual_rhs = nt._data + 0 * padded_size);
    auto* vec_d = (nt._actual_d = nt._data + 1 * padded_size);
    auto* vec_a = (nt._actual_a = nt._data + 2 * padded_size);
    auto* vec_b = (nt._actual_b = nt._data + 3 * padded_size);
    auto* parent_indices =
        (nt._v_parent_index = static_cast<int*>(emalloc_align(padded_size * sizeof(int))));
    nseg = config.num_segments_per_cell](auto icell, auto iseg) {
        // (the iseg == 0 root case is handled on lines not shown in this excerpt)
        return ncell + icell * (nseg - 1) + iseg - 1;
    for (auto icell = 0; icell < nt.ncell; ++icell) {
        for (auto iseg = 0; iseg < config.num_segments_per_cell; ++iseg) {
            auto const global_index = get_index(icell, iseg);
            vec_a[global_index] = config.produce_a(icell, iseg);
            vec_b[global_index] = config.produce_b(icell, iseg);
            vec_d[global_index] = config.produce_d(icell, iseg);
            vec_rhs[global_index] = config.produce_rhs(icell, iseg);
            auto const parent_id = iseg ? get_index(icell, (iseg - 1) / 2) : -1;
            parent_indices[global_index] = parent_id;
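In this layout the non-root segments of cell icell occupy indices ncell + icell * (nseg - 1) + iseg - 1, and each segment's parent is segment (iseg - 1) / 2, so every cell forms a small tree rooted at its first segment. A self-contained check of those formulas for 2 cells of 3 segments; the iseg == 0 branch returning icell is an assumption based on the root checks further down, not something shown in this excerpt:

#include <cassert>

int main() {
    constexpr int ncell = 2, nseg = 3;
    // Assumed index map: roots first (index icell), then the remaining segments of each cell.
    auto const get_index = [&](int icell, int iseg) {
        return iseg == 0 ? icell : ncell + icell * (nseg - 1) + iseg - 1;
    };
    assert(get_index(0, 0) == 0 && get_index(0, 1) == 2 && get_index(0, 2) == 3);
    assert(get_index(1, 0) == 1 && get_index(1, 1) == 4 && get_index(1, 2) == 5);
    // Parent of segment iseg is segment (iseg - 1) / 2: segments 1 and 2 both attach to the root.
    assert(get_index(0, (1 - 1) / 2) == 0 && get_index(0, (2 - 1) / 2) == 0);
}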
for (auto i = 0; i < nt.end; ++i) {
    REQUIRE(parent_indices[i] == -1);
REQUIRE(nt._permute);
node_permute(parent_indices, nt.end, nt._permute);
std::cout << "CellPermute0_GPU is a nonstandard configuration, copying data to the "
             "device may produce warnings:";
std::cout << "\n...no more warnings expected" << std::endl;
REQUIRE(total_cells == config.num_cells);
REQUIRE(num_cells_remaining == 0);
for (auto& nt: *this) {
    delete[] std::exchange(nt._permute, nullptr);
    free_memory(std::exchange(nt._v_parent_index, nullptr));
std::vector<SolverData> ret{static_cast<std::size_t>(nrn_nthread)};
for (auto i = 0; i < nt.end; ++i) {
    auto const p_i = nt._permute ? nt._permute[i] : i;
    auto const p_parent = nt._v_parent_index[p_i];
    auto const parent = p_parent == -1
    sd.d[i] = nt._actual_d[p_i];
    sd.parent_index[i] = parent;
    sd.rhs[i] = nt._actual_rhs[p_i];
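This dump reads the permuted arrays back into the original node ordering via p_i = nt._permute[i]; translating the stored parent index, which refers to the permuted numbering, back to the original numbering needs the inverse permutation. The listing below names an inverse_permute(int*, int) helper; a minimal sketch of the same idea, assuming _permute maps original index to permuted index:

#include <vector>

// Given a permutation p of size n with p[original] == permuted,
// build q such that q[permuted] == original.
std::vector<int> make_inverse(const int* p, int n) {
    std::vector<int> q(n);
    for (int i = 0; i < n; ++i) {
        q[p[i]] = i;
    }
    return q;
}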
for (auto i = 0; i < nt.end; ++i) {
for (auto& thread: *this) {
template <typename... Args>
return threads.dump_solver_data();
#ifdef CORENEURON_ENABLE_GPU
REQUIRE(solver_data.find(ref_impl) != solver_data.end());
auto const& ref_data = solver_data.at(ref_impl);
for (auto const& [impl, impl_data]: solver_data) {
    REQUIRE(impl_data.size() == ref_data.size());
    std::cout << "Comparing " << impl << " to " << ref_impl << std::endl;
    for (auto n_thread = 0ul; n_thread < impl_data.size(); ++n_thread) {
        REQUIRE(impl_data[n_thread].d.size() == ref_data[n_thread].d.size());
        REQUIRE(impl_data[n_thread].parent_index.size() ==
                ref_data[n_thread].parent_index.size());
        REQUIRE(impl_data[n_thread].rhs.size() == ref_data[n_thread].rhs.size());
        CHECK_THAT(impl_data[n_thread].d, Catch::Matchers::Approx(ref_data[n_thread].d));
        REQUIRE(impl_data[n_thread].parent_index == ref_data[n_thread].parent_index);
        CHECK_THAT(impl_data[n_thread].rhs, Catch::Matchers::Approx(ref_data[n_thread].rhs));
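Catch::Matchers::Approx compares the vectors element-wise with a floating-point tolerance. If one solver variant needs a looser or tighter bound, the matcher's epsilon (or margin) can be set explicitly, for example using the default_tolerance constant named at the end of this listing; whether the test actually does so here is not shown in this excerpt:

CHECK_THAT(impl_data[n_thread].rhs,
           Catch::Matchers::Approx(ref_data[n_thread].rhs).epsilon(default_tolerance));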
template <typename... Args>
std::map<SolverImplementation, std::vector<SolverData>> solver_data;
solver_data[impl] = solve_and_dump(impl, std::forward<Args>(args)...);
TEST_CASE("SingleCellAndThread", "[solver][single-thread]") {
constexpr std::size_t segments = 32;
for (auto const& [impl, data]: solver_data) {
    REQUIRE(data.size() == 1);
    REQUIRE(data[0].d.size() == segments);
    REQUIRE(data[0].parent_index.size() == segments);
    REQUIRE(data[0].rhs.size() == segments);
TEST_CASE("UnbalancedCellSingleThread", "[solver][single-thread]") {
TEST_CASE("LargeCellSingleThread", "[solver][single-thread]") {
TEST_CASE("ManySmallCellsSingleThread", "[solver][single-thread]") {
TEST_CASE("ManySmallCellsMultiThread", "[solver][multi-thread]") {
config.num_threads = 2;
std::mt19937_64 gen{42};
config.produce_a = [g = gen, d = std::normal_distribution{1.0, 0.1}](int icell,
config.produce_b = [g = gen, d = std::normal_distribution{7.0, 0.2}](int, int) mutable {
config.produce_d = [g = gen, d = std::normal_distribution{-0.1, 0.01}](int, int) mutable {
config.produce_rhs = [g = gen, d = std::normal_distribution{-15.0, 2.0}](int, int) mutable {
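The randomised producers above are truncated in this excerpt. A complete producer in the same style, capturing a seeded engine and a normal distribution by value so each call advances the lambda's own copies, would look roughly like this (the cell and segment indices are ignored for brevity):

#include <random>

std::mt19937_64 gen{42};
config.produce_a = [g = gen, d = std::normal_distribution{1.0, 0.1}](int, int) mutable {
    return d(g);  // reproducible values thanks to the fixed seed above
};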
TEST_CASE("LargeCellSingleThreadRandom", "[solver][single-thread][random]") {
config.num_segments_per_cell = 4096;
TEST_CASE("ManySmallCellsSingleThreadRandom", "[solver][single-thread][random]") {
config.num_cells = 1024;
void free_memory(void *pointer)
static double map(void *v)
void nrn_threads_create(int n)
void * emalloc_align(size_t size, size_t alignment)
static int inv_permute(int i, NrnThread &nt)
void update_nrnthreads_on_host(NrnThread *threads, int nthreads)
int * inverse_permute(int *p, int n)
int interleave_permute_type
bool use_solve_interleave
void nrn_solve_minimal(NrnThread *)
void delete_nrnthreads_on_device(NrnThread *threads, int nthreads)
Cleanup device memory that is being tracked by the OpenACC runtime.
void setup_nrnthreads_on_device(NrnThread *threads, int nthreads)
int nrn_soa_padded_size(int cnt, int layout)
Calculate the size after padding for a specific memory layout.
corenrn_parameters corenrn_param
Printing method.
std::ostream & operator<<(std::ostream &os, const corenrn_parameters &corenrn_param)
void solve_interleaved2(int ith)
void solve_interleaved1(int ith)
void permute_data(double *vec, int n, int *p)
std::vector< int > interleave_order(int ith, int ncell, int nnode, int *parent)
Function that performs the permutation of the cells such that the execution threads access coalesced ...
void create_interleave_info()
void permute_ptr(int *vec, int n, int *p)
void destroy_interleave_info()
static int get_index(const ast::IndexedName &node)
SetupThreads(SolverImplementation impl, ToyModelConfig config={})
NrnThread * begin() const
std::vector< int > parent_index
int num_segments_per_cell
std::function< double(int, int)> produce_a
bool cuda_interface
Enable GPU computation.
bool gpu
Enable pthread/OpenMP.
constexpr auto magic_index_value
auto active_implementations()
void compare_solver_data(std::map< SolverImplementation, std::vector< SolverData >> const &solver_data)
constexpr auto magic_double_value
auto solve_and_dump(Args &&... args)
TEST_CASE("SingleCellAndThread", "[solver][single-thread]")
auto compare_all_active_implementations(Args &&... args)
constexpr double default_tolerance