37 using VVTN = std::vector<VTN>;
38 using VVVTN = std::vector<VVTN>;
55 if (palevel < pblevel) {
57 }
else if (palevel == pblevel) {
78 for (
size_t i = 0;
i < level.size(); ++
i) {
79 level[
i]->treenode_order =
i;
86 for (
auto& level: levels) {
87 for (
auto* nd: level) {
88 nd->treenode_order =
order++;
97 static size_t g32(TNode* nd) {
101 static bool is_parent_race(TNode* nd) {
103 for (
const auto& child: nd->children) {
104 if (pg == g32(child)) {
115 for (
const auto& child: nd->
children) {
124 static bool is_child_race(TNode* nd) {
125 if (nd->children.size() < 2) {
128 if (nd->children.size() == 2) {
129 return g32(nd->children[0]) == g32(nd->children[1]);
132 for (
const auto& child: nd->children) {
133 std::size_t gc = g32(child);
134 if (
s.find(gc) !=
s.end()) {
148 size_t c0 = nd->
children[0]->nodevec_index;
149 size_t c1 = nd->
children[1]->nodevec_index;
150 c0 = (c0 < c1) ? (c1 - c0) : (c0 - c1);
153 size_t ic0 = nd->
children[0]->nodevec_index;
167 for (
const auto& child: nd->
children) {
168 std::size_t d1 = child->nodevec_index -
pi;
177 template <
typename T>
178 static void move_range(
size_t start,
size_t length,
size_t dst, std::vector<T>&
v) {
179 typename std::vector<T>::iterator first, middle, last;
181 first =
v.begin() + start;
182 middle = first + length;
183 last =
v.begin() + dst;
185 first =
v.begin() + dst;
186 middle =
v.begin() + start;
187 last = middle + length;
189 std::rotate(first, middle, last);
192 static void move_nodes(
size_t start,
size_t length,
size_t dst,
VTN& nodes) {
198 for (
size_t i = start;
i < dst - length; ++
i) {
201 for (
size_t i = dst - length;
i < dst; ++
i) {
202 nrn_assert(nodes[
i]->nodevec_index == start + (
i - (dst - length)));
206 for (
size_t i = start;
i < dst; ++
i) {
207 nodes[
i]->nodevec_index =
i;
213 static size_t need2move(TNode* nd) {
218 static void how_many_warpsize_groups_have_only_leaves(
VTN& nodes) {
220 for (
size_t i = 0;
i < nodes.size();
i +=
warpsize) {
229 printf(
"warpsize group %ld starting at level %ld\n",
i /
warpsize, nodes[
i]->level);
233 printf(
"number of warpsize groups with only leaves = %ld\n",
n);
236 static void pr_race_situation(
VTN& nodes) {
240 for (
size_t i = nodes.size() - 1; nodes[
i]->level != 0; --
i) {
241 TNode* nd = nodes[
i];
245 if (is_parent_race(nd)) {
246 printf(
"level=%ld i=%ld d=%ld n=%ld",
251 for (
const auto& cnd: nd->children) {
252 printf(
" %ld %ld", cnd->level, cnd->nodevec_index);
257 if (is_child_race(nd)) {
261 printf(
"prace=%ld crace=%ld prace2=%ld\n", prace, crace, prace2);
288 size_t i =
look->nodevec_index;
295 while (nodes[
i - 1]->
children.empty() &&
n < d) {
311 printf(
"could not eliminate prace for g=%ld c=%ld l=%ld o=%ld %ld\n",
321 size_t c0 = nd->
children[0]->nodevec_index;
322 size_t c1 = nd->
children[1]->nodevec_index;
323 size_t d =
warpsize - ((c0 > c1) ? (c0 - c1) : (c1 - c0));
327 printf(
"could not eliminate crace for g=%ld c=%ld l=%ld o=%ld %ld\n",
338 std::size_t nnode = std::accumulate(levels.begin(),
341 [](std::size_t
s,
const VTN& l) { return s + l.size(); });
344 for (
const auto& level: levels) {
345 for (
const auto& l: level) {
349 for (
size_t i = 0;
i < nodes.size(); ++
i) {
350 nodes[
i]->nodevec_index =
i;
385 if (0 && nodes.size() %
warpsize != 0) {
386 size_t nnode = nodes.size() - levels[0].size();
387 printf(
"warp of %ld cells has %ld nodes in last cycle %ld\n",
397 for (
size_t i = nodes.size() - 1;
i >= levels[0].size(); --
i) {
409 for (
size_t i = 0;
i < nodes.size(); ++
i) {
410 nodes[
i]->treenode_order =
i;
423 for (
auto& level: levels) {
425 for (
const auto& nd: level) {
426 for (
size_t k = 0;
k < nd->children.size(); ++
k) {
427 nd->children[
k]->treenode_order =
k;
432 for (
auto& level: levels) {
442 for (
size_t i = 0;
i < groups[0].size(); ++
i) {
444 for (
const auto& group: groups) {
445 printf(
" %5ld", group[
i].size());
466 for (
size_t i = 0;
i < nodevec.size(); ++
i) {
467 nodevec[
i]->nodevec_index =
i;
485 for (
auto& group: groups) {
491 for (
const auto& nd: nodevec) {
492 groups[nd->groupindex][nd->level].push_back(nd);
498 for (
auto& group: groups) {
TNode is the tree node that represents the tree of the compartments.
size_t groupindex
Cell ID that this compartment belongs to.
size_t nodevec_index
Total number of compartments from the current node and below.
size_t level
For cell permute 1 (Interleaved):
size_t hash
Hash algorithm that generates a hash based on the hash of the children and the number of compartments ...
size_t cellindex
level of this compartment in the tree
size_t treenode_order
index in nodevec that is set in check(). In cell permute 2 this is set as Breadth First traversal
static double order(void *v)
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
In mechanism libraries, cannot use auto const token = nrn_ensure_model_data_are_sorted(); because the...
icycle< ncycle;++icycle) { int istride=stride[icycle];nrn_pragma_acc(loop vector) nrn_pragma_omp(loop bind(parallel)) for(int icore=0;icore< warpsize;++icore) { int i=ii+icore;if(icore< istride) { int ip=GPU_PARENT(i);GPU_RHS(i) -=GPU_B(i) *GPU_RHS(ip);GPU_RHS(i)/=GPU_D(i);} i+=istride;} ii+=istride;} }}void solve_interleaved2(int ith) { NrnThread *nt=nrn_threads+ith;InterleaveInfo &ii=interleave_info[ith];int nwarp=ii.nwarp;if(nwarp==0) return;int ncore=nwarp *warpsize;int *ncycles=ii.cellsize;int *stridedispl=ii.stridedispl;int *strides=ii.stride;int *rootbegin=ii.firstnode;int *nodebegin=ii.lastnode;if(0) { nrn_pragma_acc(parallel loop gang present(nt[0:1], strides[0:nstride], ncycles[0:nwarp], stridedispl[0:nwarp+1], rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) async(nt->stream_id)) nrn_pragma_omp(target teams loop map(present, alloc:nt[:1], strides[:nstride], ncycles[:nwarp], stridedispl[:nwarp+1], rootbegin[:nwarp+1], nodebegin[:nwarp+1])) for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} nrn_pragma_acc(wait(nt->stream_id)) } else { for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} }}void solve_interleaved1(int ith) { NrnThread *nt=nrn_threads+ith;int ncell=nt-> ncell
static void move_range(size_t start, size_t length, size_t dst, std::vector< T > &v)
static bool is_parent_race2(TNode *nd)
void group_order2(VecTNode &, size_t groupsize, size_t ncell)
Implementation of the advanced interleaving strategy (interleave_permute_type == 2)
static void set_nodeindex(VecTNode &nodevec)
size_t level_from_root(VecTNode &)
size_t warp_balance(size_t ncell, VecTNode &nodevec)
Use of the LPT (Least Processing Time) algorithm to create balanced groups of cells.
static void analyze(VVTN &levels)
void chklevel(VTN &level, size_t nident=8)
static void set_treenode_order(VVTN &levels)
static bool is_child_race2(TNode *nd)
static void move_nodes(size_t start, size_t length, size_t dst, VTN &nodes)
static void checkrace(TNode *nd, VTN &nodes)
static void sortlevel(VTN &level)
std::vector< VVTN > VVVTN
static size_t next_leaf(TNode *nd, VTN &nodes)
static bool eliminate_race(TNode *nd, size_t d, VTN &nodes, TNode *look)
static void question2(VVTN &levels)
static bool sortlevel_cmp(TNode *a, TNode *b)
void prgroupsize(VVVTN &groups)
static void eliminate_prace(TNode *nd, VTN &nodes)
size_t dist2child(TNode *nd)
static bool final_nodevec_cmp(TNode *a, TNode *b)
static void eliminate_crace(TNode *nd, VTN &nodes)
std::vector< TNode * > VecTNode
#define nrn_assert(x)
assert()-like macro, independent of NDEBUG status
int const size_t const size_t n
static double look(void *v)
static double children(void *v)