NEURON
nrnran123.cpp
/*
# =============================================================================
# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================
*/

#ifdef CORENEURON_USE_BOOST_POOL
#include <boost/pool/pool_alloc.hpp>
#include <unordered_map>
#endif

#include <cmath>
#include <iostream>
#include <memory>
#include <mutex>

// Defining these attributes seems to help nvc++ in OpenMP target offload mode.
#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP) && defined(__CUDACC__)
#define CORENRN_HOST_DEVICE __host__ __device__
#else
#define CORENRN_HOST_DEVICE
#endif

namespace {
#ifdef CORENEURON_USE_BOOST_POOL
/** Tag type for use with boost::fast_pool_allocator that forwards to
 * coreneuron::[de]allocate_unified(). Using a Random123-specific tag type here
 * ensures that these allocations do not come from the same global pool as
 * other uses of Boost pools for objects with sizeof == sizeof(nrnran123_State).
 *
 * The messy m_block_sizes map is needed because deallocate_unified expects
 * sized deallocations, but the Boost pool allocators do not provide the size.
 * Because this is hidden behind the pool mechanism, these methods are called
 * rarely and the overhead is minimal.
 */
struct random123_allocate_unified {
    using size_type = std::size_t;
    using difference_type = std::size_t;
    static char* malloc(const size_type bytes) {
        std::lock_guard<std::mutex> const lock{m_mutex};
        static_cast<void>(lock);
        auto* buffer = coreneuron::allocate_unified(bytes);
        m_block_sizes[buffer] = bytes;
        return reinterpret_cast<char*>(buffer);
    }
    static void free(char* const block) {
        std::lock_guard<std::mutex> const lock{m_mutex};
        static_cast<void>(lock);
        auto const iter = m_block_sizes.find(block);
        assert(iter != m_block_sizes.end());
        auto const size = iter->second;
        m_block_sizes.erase(iter);
        return coreneuron::deallocate_unified(block, size);
    }
    static std::mutex m_mutex;
    static std::unordered_map<void*, std::size_t> m_block_sizes;
};

std::mutex random123_allocate_unified::m_mutex{};
std::unordered_map<void*, std::size_t> random123_allocate_unified::m_block_sizes{};

using random123_allocator =
    boost::fast_pool_allocator<coreneuron::nrnran123_State, random123_allocate_unified>;
#else
using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_State>;
#endif
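
// Illustrative sketch, not part of the original file: either branch above
// yields a standard-conforming allocator, so a stream object can be obtained
// and returned through std::allocator_traits without caring whether the Boost
// pool or the plain unified allocator was selected. The function name is
// hypothetical.
[[maybe_unused]] void example_alloc_roundtrip() {
    using traits = std::allocator_traits<random123_allocator>;
    random123_allocator alloc{};
    auto* p = traits::allocate(alloc, 1);  // one object's worth of (unified) memory
    traits::construct(alloc, p);           // placement-construct the state
    traits::destroy(alloc, p);
    traits::deallocate(alloc, p, 1);       // back to the pool / deallocate_unified
}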
/* Global data structure per process. Using a unique_ptr here causes [minor]
 * problems because its destructor can be called very late during application
 * shutdown. If the destructor calls cudaFree and the CUDA runtime has already
 * been shut down then tools like cuda-memcheck report errors.
 */
OMP_Mutex g_instance_count_mutex;
std::size_t g_instance_count{};

} // namespace

namespace random123_global {
#ifdef __CUDACC__
#define g_k_qualifiers __device__ __constant__
#else
#define g_k_qualifiers
#endif
g_k_qualifiers philox4x32_key_t g_k{{0, 0}};

// Cannot refer to g_k directly from a nrn_pragma_acc(routine seq) method like
// coreneuron_random123_philox4x32_helper, and cannot have this inlined there
// at higher optimisation levels.
__attribute__((noinline)) philox4x32_key_t& global_state() {
    return random123_global::g_k;
}
} // namespace random123_global

CORENRN_HOST_DEVICE philox4x32_ctr_t
coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s) {
    return philox4x32(s->c, random123_global::global_state());
}
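
// Illustrative sketch, not part of the original file: what the helper above
// computes. philox4x32 is a pure function from a 4x32-bit counter and a
// 2x32-bit key to four independent 32-bit values; identical inputs always
// reproduce identical outputs. The counter layout mirrors the one set up in
// nrnran123_newstream3 below, and `demo_draw` is a hypothetical name.
[[maybe_unused]] static philox4x32_ctr_t demo_draw(uint32_t id1, uint32_t id2, uint32_t id3) {
    philox4x32_ctr_t c{{0u, id3, id1, id2}};  // same word order as nrnran123_newstream3
    philox4x32_key_t k{{42u, 0u}};            // k.v[0] plays the role of the global index
    return philox4x32(c, k);                  // four fresh 32-bit values
}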

namespace coreneuron {
std::size_t nrnran123_instance_count() {
    return g_instance_count;
}

/* if one sets the global, one should reset all the stream sequences. */
uint32_t nrnran123_get_globalindex() {
    return random123_global::global_state().v[0];
}

/* nrn123 streams are created from the CPU launcher routine */
void nrnran123_set_globalindex(uint32_t gix) {
    // If the global seed is changing then we shouldn't have any active streams.
    auto& g_k = random123_global::global_state();
    {
        std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
        if (g_instance_count != 0 && nrnmpi_myid == 0) {
            std::cout
                << "nrnran123_set_globalindex(" << gix
                << ") called when a non-zero number of Random123 streams (" << g_instance_count
                << ") were active. This is not safe; some streams will remember the old value ("
                << g_k.v[0] << ')' << std::endl;
        }
    }
    if (g_k.v[0] != gix) {
        g_k.v[0] = gix;
        if (coreneuron::gpu_enabled()) {
#ifdef __CUDACC__
            {
                auto const code = cudaMemcpyToSymbol(g_k, &g_k, sizeof(g_k));
                assert(code == cudaSuccess);
            }
            {
                auto const code = cudaDeviceSynchronize();
                assert(code == cudaSuccess);
            }
#else
            nrn_pragma_acc(update device(g_k))
            nrn_pragma_omp(target update to(g_k))
#endif
        }
    }
}
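
// Illustrative sketch, not part of the original file: the call order that the
// warning above protects. The global index must be fixed before any stream is
// created; streams made afterwards all share the new key.
// `demo_seed_then_stream` is a hypothetical name.
[[maybe_unused]] static void demo_seed_then_stream() {
    nrnran123_set_globalindex(20220101u);               // seed first, while no streams exist
    auto* s = nrnran123_newstream3(1u, 2u, 3u, false);  // then create streams
    // ... draw from *s via the philox helper ...
    nrnran123_deletestream(s, false);                   // matched teardown (see below)
}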

void nrnran123_initialise_global_state_on_device() {
    if (coreneuron::gpu_enabled()) {
#ifndef __CUDACC__
        cnrn_target_copyin(&random123_global::g_k);
#endif
    }
}

void nrnran123_destroy_global_state_on_device() {
    if (coreneuron::gpu_enabled()) {
#ifndef __CUDACC__
        cnrn_target_delete(&random123_global::g_k);
#endif
    }
}

/** @brief Allocate a new Random123 stream.
 * @todo It would be nicer if the API return type was
 * std::unique_ptr<nrnran123_State, ...not specified...>, so we could use a
 * custom allocator/deleter and avoid the (fragile) need for matching
 * nrnran123_deletestream calls.
 */
nrnran123_State* nrnran123_newstream3(uint32_t id1,
                                      uint32_t id2,
                                      uint32_t id3,
                                      bool use_unified_memory) {
    // The `use_unified_memory` argument is an implementation detail that keeps
    // the old behaviour whereby Random123 streams known to be used only from
    // the CPU are allocated with new/delete instead of unified memory.
    // See OPENACC_EXCLUDED_FILES in coreneuron/CMakeLists.txt. If we dropped
    // this feature then we could always use coreneuron::unified_allocator.
#ifndef CORENEURON_ENABLE_GPU
    if (use_unified_memory) {
        throw std::runtime_error("Tried to use CUDA unified memory in a non-GPU build.");
    }
#endif
    nrnran123_State* s{nullptr};
    if (use_unified_memory) {
        s = coreneuron::allocate_unique<nrnran123_State>(random123_allocator{}).release();
    } else {
        s = new nrnran123_State{};
    }
    s->c.v[0] = 0;
    s->c.v[1] = id3;
    s->c.v[2] = id1;
    s->c.v[3] = id2;
    nrnran123_setseq(s, 0, 0);
    {
        std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
        ++g_instance_count;
    }
    return s;
}

/* nrn123 streams are destroyed from the CPU launcher routine */
void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) {
#ifndef CORENEURON_ENABLE_GPU
    if (use_unified_memory) {
        throw std::runtime_error("Tried to use CUDA unified memory in a non-GPU build.");
    }
#endif
    {
        std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
        --g_instance_count;
    }
    if (use_unified_memory) {
        std::unique_ptr<nrnran123_State, coreneuron::alloc_deleter<random123_allocator>> _{s};
    } else {
        delete s;
    }
}
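
// Illustrative sketch, not part of the original file: one way to realise the
// @todo above nrnran123_newstream3. A deleter that remembers the allocation
// mode makes the matching nrnran123_deletestream call automatic.
// `stream_deleter`, `stream_ptr` and `make_stream` are hypothetical names.
struct stream_deleter {
    bool use_unified_memory{false};
    void operator()(nrnran123_State* s) const {
        nrnran123_deletestream(s, use_unified_memory);
    }
};
using stream_ptr = std::unique_ptr<nrnran123_State, stream_deleter>;
[[maybe_unused]] static stream_ptr make_stream(uint32_t id1,
                                               uint32_t id2,
                                               uint32_t id3,
                                               bool use_unified_memory = false) {
    return stream_ptr{nrnran123_newstream3(id1, id2, id3, use_unified_memory),
                      stream_deleter{use_unified_memory}};
}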
} // namespace coreneuron