62 nwarp = std::min(
ncell, nwarp);
66 std::vector<size_t> typedispl;
67 size_t total_compart = 0;
68 typedispl.push_back(0);
72 if (
i == 0 || nodevec[
i]->hash != nodevec[
i - 1]->hash) {
73 typedispl.push_back(typedispl.back() + 1);
75 typedispl.back() += 1;
79 size_t ideal_compart_per_warp = total_compart / nwarp;
81 size_t min_cells_per_warp = 0;
82 for (
size_t i = 0, sz = 0; sz < ideal_compart_per_warp; ++
i) {
89 double best_balance = 0.0;
91 printf(
"best_balance=%g ncell=%ld ntype=%ld nwarp=%ld\n",
102 std::sort(nodevec.begin(), nodevec.begin() +
ncell,
warpcmp);
103 for (
size_t i = 0;
i < nodevec.size(); ++
i) {
TNode is the tree node that represents the tree of the compartments.
size_t groupindex
Cell ID that this compartment belongs to.
size_t nodevec_index
Total number of compartments from the current node and below.
std::vector< std::size_t > lpt(std::size_t nbag, std::vector< std::size_t > &pieces, double *bal)
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
In mechanism libraries, cannot use auto const token = nrn_ensure_model_data_are_sorted(); because the...
icycle< ncycle;++icycle) { int istride=stride[icycle];nrn_pragma_acc(loop vector) nrn_pragma_omp(loop bind(parallel)) for(int icore=0;icore< warpsize;++icore) { int i=ii+icore;if(icore< istride) { int ip=GPU_PARENT(i);GPU_RHS(i) -=GPU_B(i) *GPU_RHS(ip);GPU_RHS(i)/=GPU_D(i);} i+=istride;} ii+=istride;} }}void solve_interleaved2(int ith) { NrnThread *nt=nrn_threads+ith;InterleaveInfo &ii=interleave_info[ith];int nwarp=ii.nwarp;if(nwarp==0) return;int ncore=nwarp *warpsize;int *ncycles=ii.cellsize;int *stridedispl=ii.stridedispl;int *strides=ii.stride;int *rootbegin=ii.firstnode;int *nodebegin=ii.lastnode;if(0) { nrn_pragma_acc(parallel loop gang present(nt[0:1], strides[0:nstride], ncycles[0:nwarp], stridedispl[0:nwarp+1], rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) async(nt->stream_id)) nrn_pragma_omp(target teams loop map(present, alloc:nt[:1], strides[:nstride], ncycles[:nwarp], stridedispl[:nwarp+1], rootbegin[:nwarp+1], nodebegin[:nwarp+1])) for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} nrn_pragma_acc(wait(nt->stream_id)) } else { for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} }}void solve_interleaved1(int ith) { NrnThread *nt=nrn_threads+ith;int ncell=nt-> ncell
size_t warp_balance(size_t ncell, VecTNode &nodevec)
Use of the LPT (Least Processing Time) algorithm to create balanced groups of cells.
bool warpcmp(const TNode *a, const TNode *b)
std::vector< TNode * > VecTNode