#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#include <cuda_runtime_api.h>
#endif
#if __has_include(<cxxabi.h>)
#include <cxxabi.h>
#endif

#ifdef CORENEURON_ENABLE_PRESENT_TABLE
#include <shared_mutex>
struct present_table_value {
    std::size_t ref_count{}, size{};
    std::byte* dev_ptr{};
};
// Host start address -> {reference count, block size in bytes, device start address}
// for every block that has been mapped to the device.
std::map<std::byte const*, present_table_value> present_table;
std::shared_mutex present_table_mutex;
#endif  // CORENEURON_ENABLE_PRESENT_TABLE
std::string cxx_demangle(const char* mangled) {
    int status{};
    // abi::__cxa_demangle allocates the result with malloc, so hand it straight to a
    // unique_ptr that frees it.
    std::unique_ptr<char, decltype(free)*> demangled{
        abi::__cxa_demangle(mangled, nullptr, nullptr, &status), free};
    // A non-zero status means demangling failed; fall back to the mangled name.
    return status ? mangled : demangled.get();
}
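// Illustrative usage (added, not part of the original file): the debug helpers below
// feed typeid(T).name() through this function so that log lines show readable type
// names, e.g. on an Itanium-ABI compiler something like
//
//   cxx_demangle(typeid(std::pair<int, int>).name())   // -> "std::pair<int, int>"
//
// while a name that fails to demangle is returned unchanged.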
bool cnrn_target_debug_output_enabled() {
    const char* env = std::getenv("CORENEURON_GPU_DEBUG");
    if (!env) {
        return false;
    }
    std::string env_s{env};
    if (env_s == "1") {
        return true;
    } else if (env_s == "0") {
        return false;
    }
    throw std::runtime_error("CORENEURON_GPU_DEBUG must be set to 0 or 1 (got " + env_s + ")");
}
bool cnrn_target_enable_debug{cnrn_target_debug_output_enabled()};
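// Illustrative usage (added): cnrn_target_enable_debug is initialised once, before
// main() runs, so the environment variable has to be set when the process starts, e.g.
//
//   CORENEURON_GPU_DEBUG=1 ./your-coreneuron-binary ...   # log every device transfer
//   CORENEURON_GPU_DEBUG=0 ./your-coreneuron-binary ...   # same as leaving it unset
//
// Any other value makes the initialiser above throw. The binary name is a placeholder.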
void cnrn_target_copyin_debug(std::string_view file,
                              int line,
                              std::size_t sizeof_T,
                              std::type_info const& typeid_T,
                              void const* h_ptr,
                              std::size_t len,
                              void* d_ptr) {
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_copyin<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T
              << ") -> " << d_ptr << std::endl;
}
void cnrn_target_delete_debug(std::string_view file,
                              int line,
                              std::size_t sizeof_T,
                              std::type_info const& typeid_T,
                              void const* h_ptr,
                              std::size_t len) {
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_delete<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T << ')'
              << std::endl;
}
void cnrn_target_deviceptr_debug(std::string_view file,
                                 int line,
                                 std::type_info const& typeid_T,
                                 void const* h_ptr,
                                 void* d_ptr) {
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_deviceptr<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
}
void cnrn_target_is_present_debug(std::string_view file,
                                  int line,
                                  std::type_info const& typeid_T,
                                  void const* h_ptr,
                                  void* d_ptr) {
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_is_present<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
}
void cnrn_target_memcpy_to_device_debug(std::string_view file,
                                        int line,
                                        std::size_t sizeof_T,
                                        std::type_info const& typeid_T,
                                        void const* h_ptr,
                                        std::size_t len,
                                        void* d_ptr) {
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_memcpy_to_device<"
              << cxx_demangle(typeid_T.name()) << ">(" << d_ptr << ", " << h_ptr << ", " << len
              << " * " << sizeof_T << " = " << len * sizeof_T << ')' << std::endl;
}
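// Illustrative sketch (added, an assumption about code outside this fragment): the
// helpers above are meant to be called from the templated wrappers declared alongside
// them, e.g. a cnrn_target_copyin<T>(file, line, h_ptr, len) wrapper would allocate and
// copy len * sizeof(T) bytes with the OpenACC or OpenMP runtime and then call
//
//   cnrn_target_copyin_debug(file, line, sizeof(T), typeid(T), h_ptr, len, d_ptr);
//
// so that every transfer can be traced back to its call site and element type.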
#ifdef CORENEURON_ENABLE_PRESENT_TABLE
std::pair<void*, bool> cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr) {
    if (!h_ptr) {
        return {nullptr, false};
    }
    // Lookups only need a shared (read) lock; the update functions below take an
    // exclusive lock.
    std::shared_lock _{present_table_mutex};
    if (present_table.empty()) {
        return {nullptr, must_be_present_or_null};
    }
    // Find the last entry whose host start address is not greater than h_ptr.
    auto const iter = std::prev(std::upper_bound(
        present_table.begin(), present_table.end(), h_ptr, [](void const* hp, auto const& entry) {
            return hp < entry.first;
        }));
    if (iter == present_table.end()) {
        return {nullptr, must_be_present_or_null};
    }
    std::byte const* const h_byte_ptr{static_cast<std::byte const*>(h_ptr)};
    std::byte const* const h_start_of_block{iter->first};
    std::size_t const block_size{iter->second.size};
    std::byte* const d_start_of_block{iter->second.dev_ptr};
    bool const is_present{h_byte_ptr < h_start_of_block + block_size};
    if (!is_present) {
        return {nullptr, must_be_present_or_null};
    }
    // Translate the (possibly interior) host pointer to the same offset inside the
    // device block.
    return {d_start_of_block + (h_byte_ptr - h_start_of_block), false};
}
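// Added note: given a block registered as [h, h + size) -> d, a lookup of h + k with
// k < size returns {d + k, false}; a pointer that falls inside no registered block
// returns {nullptr, must_be_present_or_null}, so the caller decides whether a miss is
// an error.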
void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) {
    if (!h_ptr) {
        return;
    }
    // Exclusive lock: the table is being modified.
    std::lock_guard _{present_table_mutex};
    // Prepare the new entry before trying to insert it.
    present_table_value new_val{};
    new_val.size = len;
    new_val.ref_count = 1;
    new_val.dev_ptr = static_cast<std::byte*>(d_ptr);
    auto const [iter, inserted] = present_table.emplace(static_cast<std::byte const*>(h_ptr),
                                                        new_val);
    if (!inserted) {
        // The block was already registered; it must describe the same mapping, so just
        // bump the reference count.
        assert(iter->second.size == len);
        assert(iter->second.dev_ptr == new_val.dev_ptr);
        ++(iter->second.ref_count);
    }
}
void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) {
    if (!h_ptr) {
        return;
    }
    std::lock_guard _{present_table_mutex};
    auto const iter = present_table.find(static_cast<std::byte const*>(h_ptr));
    assert(iter != present_table.end());
    assert(iter->second.size == len);
    // Drop one reference; remove the entry once the last reference is gone.
    --(iter->second.ref_count);
    if (iter->second.ref_count == 0) {
        present_table.erase(iter);
    }
}
#endif  // CORENEURON_ENABLE_PRESENT_TABLE
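// Added note: the pair of update functions above implements plain reference counting, so
// registering the same host block twice is safe:
//
//   cnrn_target_copyin_update_present_table(h, d, n);  // ref_count == 1
//   cnrn_target_copyin_update_present_table(h, d, n);  // ref_count == 2
//   cnrn_target_delete_update_present_table(h, n);     // ref_count == 1, still mapped
//   cnrn_target_delete_update_present_table(h, n);     // entry removed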
int cnrn_target_get_num_devices() {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    // OpenACC backend: count NVIDIA devices.
    acc_device_t device_type = acc_device_nvidia;
    return acc_get_num_devices(device_type);
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    return omp_get_num_devices();
#else
    throw std::runtime_error(
        "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build");
#endif
}
void cnrn_target_set_default_device(int device_num) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    acc_set_device_num(device_num, acc_device_nvidia);
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    omp_set_default_device(device_num);
    // Keep the CUDA runtime's notion of the current device in sync with OpenMP.
    auto const cuda_code = cudaSetDevice(device_num);
    assert(cuda_code == cudaSuccess);
#else
    throw std::runtime_error(
        "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build");
#endif
}
#ifdef CORENEURON_ENABLE_GPU
#ifndef CORENEURON_UNIFIED_MEMORY
// ... (fragment of copy_ml_to_device, which mirrors one Memb_list on the device)
    if (ml->global_variables) {
        assert(ml->global_variables_size);
        // ... (copy the ml->global_variables block, ml->global_variables_size bytes)
    }
    // ...
    NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
    // ...
    NetSendBuffer_t* nsb = ml->_net_send_buffer;
    NetSendBuffer_t* d_nsb;
static void update_ml_on_host(const Memb_list* ml, int type) {
    // ...
    auto nrb = ml->_net_receive_buffer;
    // The same NetReceiveBuffer array sections appear twice below: once in the OpenACC
    // update directive and once in the equivalent OpenMP target-update directive.
    // ... (OpenACC variant)
                nrb->_pnt_index[:nrb->_size],
                nrb->_weight_index[:nrb->_size],
                nrb->_displ[:nrb->_size + 1],
                nrb->_nrb_index[:nrb->_size])
    // ... (OpenMP variant)
                nrb->_pnt_index[:nrb->_size],
                nrb->_weight_index[:nrb->_size],
                nrb->_displ[:nrb->_size + 1],
                nrb->_nrb_index[:nrb->_size])
    // ...
    NetSendBuffer_t* nsb{ml->_net_send_buffer};
    // ...

// ... (a separate fragment: these declarations re-appear in another function in the original file)
    NetReceiveBuffer_t* nrb{ml->_net_receive_buffer};
    // ...
    int n = ml->nodecount;
    // ...
    if (ml->global_variables) {
        assert(ml->global_variables_size);
        // ... (handle the ml->global_variables block, ml->global_variables_size bytes)
    }
#ifdef CORENEURON_ENABLE_GPU
    // ...
    for (int i = 0; i < nthreads; i++) {
        // ...
#ifdef CORENEURON_UNIFIED_MEMORY
    for (int i = 0; i < nthreads; i++) {
        // ...
            printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
        // ...
    printf("\n Warning: No permutation data? Required for linear algebra!");
    // ...
    for (int i = 0; i < nthreads; i++) {
        // ...
        // Pointers into consecutive ne-sized sections of d__data:
        dptr = d__data + 0 * ne;
        // ...
        dptr = d__data + 1 * ne;
        // ...
        dptr = d__data + 2 * ne;
        // ...
        dptr = d__data + 3 * ne;
        // ...
        dptr = d__data + 4 * ne;
        // ...
        dptr = d__data + 5 * ne;
        // ...
        dptr = d__data + 6 * ne;
        // ...
        bool first_tml = true;
        // ...
        for (auto tml = nt->tml; tml; tml = tml->next) {
            // ...
            Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index);
            // ...
            double* d_shadow_ptr;
            // ...
        int* d_ptr = nullptr;
        // ...
        int* d_ptr = nullptr;
        // ...
            printf("\n ERROR: only --cell_permute = [12] implemented");
        // ...
            printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
        // ...
        double** d_tr_varrays{nullptr};
        // ...
                                &d_fornetcon_perm_indices);
        // ...
void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
#ifdef CORENEURON_ENABLE_GPU
    size_t n = from.size();
    // ...
#endif
}

void delete_ivoc_vect_from_device(IvocVect& vec) {
#ifdef CORENEURON_ENABLE_GPU
    auto const n = vec.size();
    // ...
#else
    static_cast<void>(vec);
#endif
}
#ifdef CORENEURON_ENABLE_GPU
// ... (fragment of realloc_net_receive_buffer)
    // Grow one NetReceiveBuffer array: allocate nrb->_size (+ extra) elements of the
    // deduced element type, copy the old contents across, and let the caller say how
    // many extra trailing elements are needed.
    auto const realloc = [old_size = nrb->_size, nrb](auto*& ptr, std::size_t extra_size = 0) {
        using T = std::remove_pointer_t<std::remove_reference_t<decltype(ptr)>>;
        static_assert(std::is_trivially_constructible<T>::value &&
                          std::is_trivially_copyable<T>::value,
                      "Only trivially constructible and copiable types are supported.");
        static_assert(std::is_same<decltype(ptr), T*&>::value,
                      "ptr should be reference-to-pointer");
        auto* const new_data = static_cast<T*>(ecalloc_align((nrb->_size + extra_size), sizeof(T)));
        std::memcpy(new_data, ptr, (old_size + extra_size) * sizeof(T));
        // ... (release the old buffer and store new_data back through ptr)
    };
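    // Illustrative sketch (added, an assumption about the elided call sites): the generic
    // lambda lets one helper handle buffers of different element types, e.g.
    //
    //   realloc(nrb->_pnt_index);      // int array, no extra element
    //   realloc(nrb->_displ, 1);       // keeps the extra trailing entry used by _displ
    //
    // Each call deduces T from the pointer that is passed in, so the static_asserts above
    // reject element types that cannot safely be moved with memcpy.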
#ifdef CORENEURON_ENABLE_GPU
// ...
// Comparator for NRB_P pairs: order by the first element, then by the second.
struct comp {
    bool operator()(const NRB_P& a, const NRB_P& b) {
        if (a.first == b.first) {
            return a.second > b.second;
        }
        return a.first > b.first;
    }
};

static void net_receive_buffer_order(NetReceiveBuffer_t* nrb) {
    if (nrb->_cnt == 0) {
        // ... (nothing buffered, nothing to order)
    }
    // ...
    std::priority_queue<NRB_P, std::vector<NRB_P>, comp> nrbq;
    for (int i = 0; i < nrb->_cnt; ++i) {
        // ... (push one NRB_P pair per buffered event)
    }
    int last_instance_index = -1;
    // ...
    while (!nrbq.empty()) {
        const NRB_P& p = nrbq.top();
        // ...
        if (p.first != last_instance_index) {
            // ...
            nrb->_displ[displ_cnt] = index_cnt;
            last_instance_index = p.first;
            // ...
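// Added note: because comp implements "greater than", the priority queue above acts as a
// min-heap, so pairs come out ordered by ascending first element (and ascending second
// element within a tie); _displ records the boundary positions each time the first
// element changes.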
    // ...
    for (auto tml = nt->tml; tml; tml = tml->next) {
        // ...
        if (nrb && nrb->_cnt) {
            // ...
#ifdef CORENEURON_ENABLE_GPU
    // ...
        printf("ERROR: NetSendBuffer exceeded during GPU execution (rank %d)\n", nrnmpi_myid);
    // ...
#ifdef CORENEURON_ENABLE_GPU
    // ...
    for (int i = 0; i < nthreads; i++) {
        // ...
        for (auto tml = nt->tml; tml; tml = tml->next) {
            // ...
            update_ml_on_host(tml->ml, tml->index);
            // ...

#ifdef CORENEURON_ENABLE_GPU
    // ...
    for (int i = 0; i < nthreads; i++) {
        // ...

#ifdef CORENEURON_ENABLE_GPU
    // ...
    for (int i = 0; i < nthreads; i++) {
        // ...
        for (auto tml = nt->tml; tml; tml = tml->next) {
            // ...
            delete_ml_from_device(tml->ml, tml->index);
            // ...
#ifdef CORENEURON_ENABLE_GPU
    // ...
    for (int i = 0; i < ns->n; ++i) {
        pd = d_jacdat + i * n;
        // ...
#ifdef CORENEURON_ENABLE_GPU
    // ...
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
    // ...
    unsigned n1 = so->neqn + 1;
    // ...
    for (unsigned irow = 1; irow < n1; ++irow) {
        // ...
    for (unsigned irow = 1; irow < n1; ++irow) {
        // ...
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
    // ...
    unsigned n1 = so->neqn + 1;
    for (unsigned irow = 1; irow < n1; ++irow) {
        // ...
#ifdef CORENEURON_ENABLE_GPU
    // ...
    if (num_devices_per_node == 0) {
        nrn_fatal_error("\n ERROR : Enabled GPU execution but couldn't find NVIDIA GPU!\n");
    }
    // ...
        nrn_fatal_error("Fatal error: asking for '%d' GPUs per node but only '%d' available\n",
                        // ... (requested GPU count)
                        num_devices_per_node);
    // ...
        local_rank = nrnmpi_local_rank();
        local_size = nrnmpi_local_size();
    // ...
        std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
                  << " ranks per node\n";
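    // Illustrative sketch (added, an assumption about the elided lines that follow): with
    // the node-local rank and size known, ranks are typically spread over the available
    // devices round-robin before any data is copied, e.g.
    //
    //   cnrn_target_set_default_device(local_rank % num_devices_per_node);
    //
    // so that each rank binds to one GPU and ranks on the same node share GPUs evenly.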
    for (int i = 0; i < nt->n_vecplay; i++) {
        // ...
                                     *(d_vecplay_instance->discon_indices_));
        // ...
    }
    // ...
    for (int i = 0; i < nt->n_vecplay; i++) {
        // ...