47 std::swap(nwarp,
info.nwarp);
56 std::swap(nnode,
info.nnode);
58 std::swap(idle,
info.idle);
59 std::swap(cache_access,
info.cache_access);
60 std::swap(child_race,
info.child_race);
91 InterleaveInfo::~InterleaveInfo() {
105 delete[] cache_access;
140 size_t ncacheline = 0;
144 for (
int icycle = 0; icycle <
ncycle; ++icycle) {
154 if (crace.find(par) != crace.end()) {
161 if (par != lastp + 1) {
162 ch = (ch ==
'r') ?
'R' :
'o';
181 ii.cache_access[
iwarp] = ncacheline;
184 printf(
"warp %d: %ld nodes, %d cycles, %ld idle, %ld cache access, %ld child races\n",
202 for (
int i = cellbegin;
i < cellend; ++
i) {
210 int ncell_in_warp = cellend - cellbegin;
214 size_t ncacheline = 0;
217 int inode =
ii.firstnode[cellbegin];
218 for (
int icycle = 0; icycle <
ncycle; ++icycle) {
225 if (icore < ncell_in_warp && icore >= sbegin) {
226 int par =
p[inode +
icore];
227 if (par != lastp + 1) {
242 inode +=
ii.stride[icycle + 1];
248 ii.cache_access[
iwarp] = ncacheline;
251 printf(
"warp %d: %ld nodes, %d cycles, %ld idle, %ld cache access\n",
260 size_t nwarp = size_t(
ii.nwarp);
262 for (
size_t j = 0;
j < 4; ++
j) {
264 smm[
j][1] = 1000000000;
267 double emax = 0.0, emin = 1.0;
268 for (
size_t i = 0;
i < nwarp; ++
i) {
269 size_t n =
ii.nnode[
i];
270 double e = double(
n) / (
n +
ii.idle[
i]);
277 size_t s[4] = {
n,
ii.idle[
i],
ii.cache_access[
i],
ii.child_race[
i]};
278 for (
size_t j = 0;
j < 4; ++
j) {
280 if (smm[
j][1] >
s[
j]) {
283 if (smm[
j][2] <
s[
j]) {
288 std::vector<size_t>
v(nwarp);
289 for (
size_t i = 0;
i < nwarp; ++
i) {
295 "thread %d nwarp=%ld balance=%g warp_efficiency %g to %g\n", ith, nwarp, bal, emin, emax);
296 const char* cp[4] = {
"nodes",
"idle",
"ca",
"cr"};
297 for (
size_t i = 0;
i < 4; ++
i) {
298 printf(
" %s=%ld (%ld:%ld)", cp[
i], smm[
i][0], smm[
i][1], smm[
i][2]);
308 printf(
"%s nrnthread %d node info\n", mes, nt.
id);
309 for (
int i = 0;
i < nt.
end; ++
i) {
311 " _v_node[%2d]->v_node_index=%2d %p"
312 " _v_parent[%2d]->v_node_index=%2d parent[%2d]=%2d\n",
321 for (
auto tml = nt.
tml; tml; tml = tml->
next) {
325 printf(
" %2d ndindex=%2d nd=%p [%2d] pdata=%p prop=%p\n",
356 if (parent[
i] == 0) {
379 printf(
"icell=%d cellsize=%d first=%d last=%d\n",
391 std::vector<int>
p(nnode);
393 for (
int i = 0;
i < nnode; ++
i) {
398 node_permute(
p.data(),
p.size(),
order);
404 ii.nnode =
new size_t[nwarp];
405 ii.ncycle =
new size_t[nwarp];
406 ii.idle =
new size_t[nwarp];
407 ii.cache_access =
new size_t[nwarp];
408 ii.child_race =
new size_t[nwarp];
409 for (
int i = 0;
i < nwarp; ++
i) {
437 for (
int i = 0;
i < nt.end; ++
i) {
438 int x = nt._v_parent_index[
p[
i]];
439 int par = x >= 0 ? perm[x] : -1;
440 printf(
"%2d <- %2d parent=%2d\n",
i,
p[
i], par);
448 for (
int i = 0;
i < nt.end; ++
i) {
449 nt._v_node[
i]->v_node_index =
i;
451 for (
auto tml = nt.tml; tml; tml = tml->next) {
465 static int** cell_indices_debug(
NrnThread& nt, InterleaveInfo&
ii) {
476 int* sz =
new int[
ncell];
477 int*
cell =
new int[nnode];
482 for (
int i =
ncell;
i < nnode; ++
i) {
496 int** cellindices =
new int*[
ncell];
498 cellindices[
i] =
new int[sz[
i]];
501 for (
int i =
ncell;
i < nnode; ++
i) {
517 static int*** cell_indices_threads;
518 void mk_cell_indices() {
525 cell_indices_threads[
i] =
nullptr;
532 #define GPU_V(i) nt->_actual_v[i]
533 #define GPU_A(i) nt->_actual_a[i]
534 #define GPU_B(i) nt->_actual_b[i]
535 #define GPU_D(i) nt->_actual_d[i]
536 #define GPU_RHS(i) nt->_actual_rhs[i]
538 #define GPU_V(i) vec_v[i]
539 #define GPU_A(i) vec_a[i]
540 #define GPU_B(i) vec_b[i]
541 #define GPU_D(i) vec_d[i]
542 #define GPU_RHS(i) vec_rhs[i]
544 #define GPU_PARENT(i) nt->_v_parent_index[i]
567 #ifndef CORENEURON_ENABLE_GPU
595 #ifndef CORENEURON_ENABLE_GPU
605 static void solve_interleaved2_loop_body(
NrnThread* nt,
677 for (
int icycle = 0; icycle <
ncycle; ++icycle) {
703 void solve_interleaved2(
int ith) {
706 int nwarp =
ii.nwarp;
714 auto* d_nt =
static_cast<NrnThread*
>(acc_deviceptr(nt));
715 auto* d_info =
static_cast<InterleaveInfo*
>(acc_deviceptr(
interleave_info + ith));
716 solve_interleaved2_launcher(d_nt, d_info, ncore, acc_get_cuda_stream(nt->stream_id));
724 #if defined(CORENEURON_ENABLE_GPU)
729 if (nt->compute_gpu) {
750 nodebegin [0:nwarp + 1]) async(nt->stream_id))
760 solve_interleaved2_loop_body(
766 solve_interleaved2_loop_body(
782 void solve_interleaved1(
int ith) {
803 async(nt->stream_id))
805 for (
int icell = 0; icell <
ncell; ++icell) {
815 solve_interleaved2(ith);
817 solve_interleaved1(ith);
nrn_pragma_acc(routine seq) nrn_pragma_omp(declare target) philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron nrn_pragma_omp(end declare target) namespace coreneuron
Provide a helper function in global namespace that is declared target for OpenMP offloading to functi...
static double order(void *v)
double load_balance(std::vector< size_t > &v)
void free_memory(void *pointer)
static double map(void *v)
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
InterleaveInfo * interleave_info
void update(NrnThread *_nt)
int interleave_permute_type
nrn_pragma_acc(routine seq) int vector_capacity(void *v)
corenrn_parameters corenrn_param
Printing method.
In mechanism libraries, cannot use auto const token = nrn_ensure_model_data_are_sorted(); because the...
icycle< ncycle;++icycle) { int istride=stride[icycle];nrn_pragma_acc(loop vector) nrn_pragma_omp(loop bind(parallel)) for(int icore=0;icore< warpsize;++icore) { int i=ii+icore;if(icore< istride) { int ip=GPU_PARENT(i);GPU_RHS(i) -=GPU_B(i) *GPU_RHS(ip);GPU_RHS(i)/=GPU_D(i);} i+=istride;} ii+=istride;} }}void solve_interleaved2(int ith) { NrnThread *nt=nrn_threads+ith;InterleaveInfo &ii=interleave_info[ith];int nwarp=ii.nwarp;if(nwarp==0) return;int ncore=nwarp *warpsize;int *ncycles=ii.cellsize;int *stridedispl=ii.stridedispl;int *strides=ii.stride;int *rootbegin=ii.firstnode;int *nodebegin=ii.lastnode;if(0) { nrn_pragma_acc(parallel loop gang present(nt[0:1], strides[0:nstride], ncycles[0:nwarp], stridedispl[0:nwarp+1], rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) async(nt->stream_id)) nrn_pragma_omp(target teams loop map(present, alloc:nt[:1], strides[:nstride], ncycles[:nwarp], stridedispl[:nwarp+1], rootbegin[:nwarp+1], nodebegin[:nwarp+1])) for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} nrn_pragma_acc(wait(nt->stream_id)) } else { for(int icore=0;icore< ncore;icore+=warpsize) { solve_interleaved2_loop_body(nt, icore, ncycles, strides, stridedispl, rootbegin, nodebegin);} }}void solve_interleaved1(int ith) { NrnThread *nt=nrn_threads+ith;int ncell=nt-> ncell
void nrn_permute_node_order()
Compute and carry out the permutation for interleave_permute_type.
std::vector< int > interleave_order(int ith, int ncell, int nnode, int *parent)
Function that performs the permutation of the cells such that the execution threads access coalesced ...
static void print_quality1(int iwarp, InterleaveInfo &ii, int ncell, int *p)
static void triang_interleaved(NrnThread *nt, int icell, int icellsize, int nstride, int *stride, int *lastnode)
void copy_array(T *&dest, T *src, size_t n)
std::vector< int > node_order(int ncell, int nnode, int *parents, int &nwarp, int &nstride, int *&stride, int *&firstnode, int *&lastnode, int *&cellsize, int *&stridedispl)
Function that returns a permutation of length nnode.
void copy_align_array(T *&dest, T *src, size_t n)
void create_interleave_info()
static void print_quality2(int iwarp, InterleaveInfo &ii, int *p)
void update_parent_index(int *vec, int vec_size, const std::vector< int > &permute)
bool has_subtrees_to_compute
void sort_ml(Memb_list *ml)
void solve_interleaved(int ith)
Solve the Hines matrices based on the interleave_permute_type (1 or 2).
int int int int * stridedispl
static void prnode(const char *mes, NrnThread &nt)
static void bksub_interleaved(NrnThread *nt, int icell, int icellsize, int, int *stride, int *firstnode)
void permute_ptr(int *vec, int n, int *p)
int int int int int int * nodebegin
int int int int int * rootbegin
static void warp_balance(int ith, InterleaveInfo &ii)
void destroy_interleave_info()
int nrn_optimize_node_order(int type)
Select node ordering for optimum gaussian elimination.
#define nrn_assert(x)
assert()-like macro, independent of NDEBUG status
int const size_t const size_t n
std::vector< Memb_func > memb_func
static double cell(void *v)
std::vector< T > inverse_permute_vector(const std::vector< T > &p)
void forward_permute(std::vector< T > &data, const std::vector< int > &perm)
A view into a set of mechanism instances.
Represent main neuron object computed by single thread.
double * node_a_storage()
double * node_rhs_storage()
double * node_d_storage()
double * node_b_storage()
struct NrnThreadMembList * next
bool cuda_interface
Enable GPU computation.
bool gpu
Enable pthread/openmp.