diff --git a/arbor/backends/gpu/matrix_fine.cu b/arbor/backends/gpu/matrix_fine.cu
index b8d883dc1c2c2d04130b5cb2d87f0982db7ccdd8..9e785676c41ba7981231b2c6de780ec72e9bb84c 100644
--- a/arbor/backends/gpu/matrix_fine.cu
+++ b/arbor/backends/gpu/matrix_fine.cu
@@ -17,7 +17,10 @@ namespace kernels {
 // to[i] = from[p[i]]
 template <typename T, typename I>
 __global__
-void gather(const T* from, T* to, const I* p, unsigned n) {
+void gather(const T* __restrict__ const from,
+            T* __restrict__ const to,
+            const I* __restrict__ const p,
+            unsigned n) {
     unsigned i = threadIdx.x + blockDim.x*blockIdx.x;
 
     if (i<n) {
@@ -28,7 +31,10 @@ void gather(const T* from, T* to, const I* p, unsigned n) {
 // to[p[i]] = from[i]
 template <typename T, typename I>
 __global__
-void scatter(const T* from, T* to, const I* p, unsigned n) {
+void scatter(const T* __restrict__ const from,
+             T* __restrict__ const to,
+             const I* __restrict__ const p,
+             unsigned n) {
     unsigned i = threadIdx.x + blockDim.x*blockIdx.x;
 
     if (i<n) {
@@ -45,18 +51,18 @@ void scatter(const T* from, T* to, const I* p, unsigned n) {
 template <typename T, typename I>
 __global__
 void assemble_matrix_fine(
-        T* d,
-        T* rhs,
-        const T* invariant_d,
-        const T* voltage,
-        const T* current,
-        const T* conductivity,
-        const T* cv_capacitance,
-        const T* area,
-        const I* cv_to_cell,
-        const T* dt_intdom,
-        const I* cell_to_intdom,
-        const I* perm,
+        T* __restrict__ const d,
+        T* __restrict__ const rhs,
+        const T* __restrict__ const invariant_d,
+        const T* __restrict__ const voltage,
+        const T* __restrict__ const current,
+        const T* __restrict__ const conductivity,
+        const T* __restrict__ const cv_capacitance,
+        const T* __restrict__ const area,
+        const I* __restrict__ const cv_to_cell,
+        const T* __restrict__ const dt_intdom,
+        const I* __restrict__ const cell_to_intdom,
+        const I* __restrict__ const perm,
         unsigned n)
 {
     const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
@@ -97,15 +103,15 @@ void assemble_matrix_fine(
 template <typename T>
 __global__
 void solve_matrix_fine(
-    T* rhs,
-    T* d,
-    const T* u,
-    const level_metadata* level_meta,
-    const fvm_index_type* level_lengths,
-    const fvm_index_type* level_parents,
-    const fvm_index_type* block_index,
-    fvm_index_type* num_matrix, // number of packed matrices = number of cells
-    fvm_index_type* padded_size)
+    T* __restrict__ const rhs,
+    T* __restrict__ const d,
+    const T* __restrict__ const u,
+    const level_metadata* __restrict__ const level_meta,
+    const fvm_index_type* __restrict__ const level_lengths,
+    const fvm_index_type* __restrict__ const level_parents,
+    const fvm_index_type* __restrict__ const block_index,
+    fvm_index_type* __restrict__ const num_matrix, // number of packed matrices = number of cells
+    fvm_index_type* __restrict__ const padded_size)
 {
     const auto tid = threadIdx.x;
     const auto bid = blockIdx.x;
diff --git a/arbor/backends/gpu/multi_event_stream.cu b/arbor/backends/gpu/multi_event_stream.cu
index 931e6a1a1994ae548b9815dc7dd8f0431850e50c..b136f8dcd63d6fb87e4542508b83a3184d9e1ad2 100644
--- a/arbor/backends/gpu/multi_event_stream.cu
+++ b/arbor/backends/gpu/multi_event_stream.cu
@@ -11,10 +11,10 @@ namespace kernels {
     template <typename T, typename I>
     __global__ void mark_until_after(
         unsigned n,
-        I* mark,
-        const I* span_end,
-        const T* ev_time,
-        const T* t_until)
+        I* __restrict__ const mark,
+        const I* __restrict__ const span_end,
+        const T* __restrict__ const ev_time,
+        const T* __restrict__ const t_until)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
@@ -31,10 +31,10 @@ namespace kernels {
     template <typename T, typename I>
     __global__ void mark_until(
         unsigned n,
-        I* mark,
-        const I* span_end,
-        const T* ev_time,
-        const T* t_until)
+        I* __restrict__ const mark,
+        const I* __restrict__ const span_end,
+        const T* __restrict__ const ev_time,
+        const T* __restrict__ const t_until)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
@@ -51,10 +51,10 @@ namespace kernels {
     template <typename I>
     __global__ void drop_marked_events(
         unsigned n,
-        I* n_nonempty,
-        I* span_begin,
-        const I* span_end,
-        const I* mark)
+        I* __restrict__ const n_nonempty,
+        I* __restrict__ const span_begin,
+        const I* __restrict__ const span_end,
+        const I* __restrict__ const mark)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
@@ -69,10 +69,10 @@ namespace kernels {
     template <typename T, typename I>
     __global__ void event_time_if_before(
         unsigned n,
-        const I* span_begin,
-        const I* span_end,
-        const T* ev_time,
-        T* t_until)
+        const I* __restrict__ const span_begin,
+        const I* __restrict__ const span_end,
+        const T* __restrict__ const ev_time,
+        T* __restrict__ const t_until)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
diff --git a/arbor/backends/gpu/shared_state.cu b/arbor/backends/gpu/shared_state.cu
index 57989e642132f00fe3da7826862ba6e27e128feb..62e7c4d450c3a34c868076e9bbe3cdd52ea0d418 100644
--- a/arbor/backends/gpu/shared_state.cu
+++ b/arbor/backends/gpu/shared_state.cu
@@ -14,7 +14,11 @@ namespace gpu {
 namespace kernel {
 
 template <typename T>
-__global__ void update_time_to_impl(unsigned n, T* time_to, const T* time, T dt, T tmax) {
+__global__ void update_time_to_impl(unsigned n,
+                                    T* __restrict__ const time_to,
+                                    const T* __restrict__ const time,
+                                    T dt,
+                                    T tmax) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         auto t = time[i]+dt;
@@ -23,7 +27,10 @@ __global__ void update_time_to_impl(unsigned n, T* time_to, const T* time, T dt,
 }
 
 template <typename T, typename I>
-__global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* voltage, I* current_density) {
+__global__ void add_gj_current_impl(unsigned n,
+                                    const T* __restrict__ const gj_info,
+                                    const I* __restrict__ const voltage,
+                                    I* __restrict__ const current_density) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         auto gj = gj_info[i];
@@ -35,7 +42,9 @@ __global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* volta
 
 // Vector/scalar addition: x[i] += v ∀i
 template <typename T>
-__global__ void add_scalar(unsigned n, T* x, fvm_value_type v) {
+__global__ void add_scalar(unsigned n,
+                           T* __restrict__ const x,
+                           fvm_value_type v) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         x[i] += v;
@@ -44,11 +53,11 @@ __global__ void add_scalar(unsigned n, T* x, fvm_value_type v) {
 
 template <typename T, typename I>
 __global__ void set_dt_impl( T* __restrict__ dt_intdom,
-                             const T* time_to,
-                             const T* time,
+                             const T* __restrict__ time_to,
+                             const T* __restrict__ time,
                              const unsigned ncomp,
                              T* __restrict__ dt_comp,
-                             const I* cv_to_intdom) {
+                             const I* __restrict__ cv_to_intdom) {
     auto idx = blockIdx.x*blockDim.x + threadIdx.x;
     if (idx < ncomp) {
         const auto ind = cv_to_intdom[idx];
@@ -60,7 +69,9 @@ __global__ void set_dt_impl( T* __restrict__ dt_intdom,
 
 __global__ void take_samples_impl(
     multi_event_stream_state<raw_probe_info> s,
-    const fvm_value_type* time, fvm_value_type* sample_time, fvm_value_type* sample_value)
+    const fvm_value_type* __restrict__ const time,
+    fvm_value_type* __restrict__ const sample_time,
+    fvm_value_type* __restrict__ const sample_value)
 {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<s.n) {
diff --git a/arbor/backends/gpu/threshold_watcher.cu b/arbor/backends/gpu/threshold_watcher.cu
index ea6b42185e4efef899a83f60f6bbf14a6f0c5a27..28180608ca466369e7dc171983b3216df0ae035e 100644
--- a/arbor/backends/gpu/threshold_watcher.cu
+++ b/arbor/backends/gpu/threshold_watcher.cu
@@ -30,10 +30,15 @@ inline T lerp(T a, T b, T u) {
 __global__
 void test_thresholds_impl(
     int size,
-    const fvm_index_type* cv_to_intdom, const fvm_value_type* t_after, const fvm_value_type* t_before,
+    const fvm_index_type* __restrict__ const cv_to_intdom,
+    const fvm_value_type* __restrict__ const t_after,
+    const fvm_value_type* __restrict__ const t_before,
     stack_storage<threshold_crossing>& stack,
-    fvm_index_type* is_crossed, fvm_value_type* prev_values,
-    const fvm_index_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds)
+    fvm_index_type* __restrict__ const is_crossed,
+    fvm_value_type* __restrict__ const prev_values,
+    const fvm_index_type* __restrict__ const cv_index,
+    const fvm_value_type* __restrict__ const values,
+    const fvm_value_type* __restrict__ const thresholds)
 {
     int i = threadIdx.x + blockIdx.x*blockDim.x;
 
@@ -73,8 +78,11 @@ void test_thresholds_impl(
 
 __global__
 extern void reset_crossed_impl(
-    int size, fvm_index_type* is_crossed,
-    const fvm_index_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds)
+    int size,
+    fvm_index_type* __restrict__ const is_crossed,
+    const fvm_index_type* __restrict__ const cv_index,
+    const fvm_value_type* __restrict__ const values,
+    const fvm_value_type* __restrict__ const thresholds)
 {
     int i = threadIdx.x + blockIdx.x*blockDim.x;
     if (i<size) {
diff --git a/arbor/memory/fill.cu b/arbor/memory/fill.cu
index 59db57ab58053d423d221217b8fd29e5f1f0a6a3..fc16e9cc838c79a7bc0e49986f8923aaad16c842 100644
--- a/arbor/memory/fill.cu
+++ b/arbor/memory/fill.cu
@@ -7,7 +7,7 @@ namespace gpu {
 
 template <typename T, typename I>
 __global__
-void fill_kernel(T* v, T value, I n) {
+void fill_kernel(T* __restrict__ const v, T value, I n) {
     auto tid = threadIdx.x + blockDim.x*blockIdx.x;
 
     if(tid < n) {
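Every hunk above applies the same transformation: a raw kernel pointer parameter T* p becomes T* __restrict__ const p. The __restrict__ qualifier promises the compiler that the buffers do not alias one another, and the trailing const fixes the pointer value itself; together they allow loads to be kept in registers or routed through the read-only data path instead of being re-read after every store. The stand-alone kernel below is a minimal sketch of that parameter style for reference only; it is not part of the diff, and the kernel name, sizes, and launch configuration are illustrative.

#include <cstdio>
#include <cuda_runtime.h>

// Same qualifier pattern as the Arbor kernels: __restrict__ asserts no
// aliasing between x and y, and the trailing const fixes the pointer itself.
__global__
void saxpy_restrict(float* __restrict__ const y,
                    const float* __restrict__ const x,
                    float a,
                    unsigned n) {
    unsigned i = threadIdx.x + blockDim.x*blockIdx.x;
    if (i<n) {
        y[i] += a*x[i];
    }
}

int main() {
    constexpr unsigned n = 1024;
    float *x, *y;
    cudaMallocManaged(&x, n*sizeof(float));
    cudaMallocManaged(&y, n*sizeof(float));
    for (unsigned i = 0; i<n; ++i) { x[i] = 1.f; y[i] = 2.f; }

    saxpy_restrict<<<(n+127)/128, 128>>>(y, x, 3.f, n);
    cudaDeviceSynchronize();

    std::printf("y[0] = %f\n", y[0]); // expect 5.0
    cudaFree(x);
    cudaFree(y);
}

Note that set_dt_impl in shared_state.cu already carried __restrict__ on some parameters (dt_intdom, dt_comp) without the trailing const; the diff only extends __restrict__ to the remaining pointers there, keeping the existing style of that function.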