From 5ace929e962ea9aa7d836684341f2dffc0cded59 Mon Sep 17 00:00:00 2001
From: thorstenhater <24411438+thorstenhater@users.noreply.github.com>
Date: Thu, 23 Jul 2020 11:03:53 +0200
Subject: [PATCH] Gpu/restrict all the things (#1026)

Mark all pointer arguments to GPU kernels as `__restrict__`, so the compiler
may assume they do not alias and can avoid unnecessary reloads.

Effect on the busyring benchmark (with `pas` swapped for `hh`) with 8192 cells
on a V100 GPU; time for model-run in seconds over five runs, the last row being
the best (minimum) of each column:

```
|----------+-------|
| Baseline | After |
|----------+-------|
|    2.347 | 2.268 |
|    2.345 | 2.262 |
|    2.321 | 2.276 |
|    2.323 | 2.267 |
|    2.330 | 2.249 |
|----------+-------|
|    2.321 | 2.249 |
|----------+-------|
```
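The pattern applied throughout the diff below is sketched here on a small
stand-alone kernel; the kernel (`axpby`) is illustrative only and is not part
of this patch:

```
// Illustrative sketch only, not part of the patch: a toy kernel in the same
// style as the ones changed below. Declaring every pointer parameter
// __restrict__ (and const, since the pointer itself never changes) tells the
// compiler the buffers do not alias, so a store through `out` cannot be
// assumed to invalidate values already loaded from `a` or `b`.
template <typename T>
__global__
void axpby(const T* __restrict__ const a,
           const T* __restrict__ const b,
           T* __restrict__ const out,
           T alpha,
           unsigned n) {
    unsigned i = threadIdx.x + blockDim.x*blockIdx.x;
    if (i<n) {
        out[i] = alpha*a[i] + b[i];
    }
}
```

Only the qualifiers change; the kernel bodies themselves are untouched.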
---
 arbor/backends/gpu/matrix_fine.cu        | 52 +++++++++++++-----------
 arbor/backends/gpu/multi_event_stream.cu | 32 +++++++--------
 arbor/backends/gpu/shared_state.cu       | 25 ++++++++----
 arbor/backends/gpu/threshold_watcher.cu  | 18 +++++---
 arbor/memory/fill.cu                     |  2 +-
 5 files changed, 77 insertions(+), 52 deletions(-)

diff --git a/arbor/backends/gpu/matrix_fine.cu b/arbor/backends/gpu/matrix_fine.cu
index b8d883dc..9e785676 100644
--- a/arbor/backends/gpu/matrix_fine.cu
+++ b/arbor/backends/gpu/matrix_fine.cu
@@ -17,7 +17,10 @@ namespace kernels {
 // to[i] = from[p[i]]
 template <typename T, typename I>
 __global__
-void gather(const T* from, T* to, const I* p, unsigned n) {
+void gather(const T* __restrict__ const from,
+            T* __restrict__ const to,
+            const I* __restrict__ const p,
+            unsigned n) {
     unsigned i = threadIdx.x + blockDim.x*blockIdx.x;

     if (i<n) {
@@ -28,7 +31,10 @@ void gather(const T* from, T* to, const I* p, unsigned n) {
 // to[p[i]] = from[i]
 template <typename T, typename I>
 __global__
-void scatter(const T* from, T* to, const I* p, unsigned n) {
+void scatter(const T* __restrict__ const from,
+             T* __restrict__ const to,
+             const I* __restrict__ const p,
+             unsigned n) {
     unsigned i = threadIdx.x + blockDim.x*blockIdx.x;

     if (i<n) {
@@ -45,18 +51,18 @@ void scatter(const T* from, T* to, const I* p, unsigned n) {
 template <typename T, typename I>
 __global__
 void assemble_matrix_fine(
-        T* d,
-        T* rhs,
-        const T* invariant_d,
-        const T* voltage,
-        const T* current,
-        const T* conductivity,
-        const T* cv_capacitance,
-        const T* area,
-        const I* cv_to_cell,
-        const T* dt_intdom,
-        const I* cell_to_intdom,
-        const I* perm,
+        T* __restrict__ const d,
+        T* __restrict__ const rhs,
+        const T* __restrict__ const invariant_d,
+        const T* __restrict__ const voltage,
+        const T* __restrict__ const current,
+        const T* __restrict__ const conductivity,
+        const T* __restrict__ const cv_capacitance,
+        const T* __restrict__ const area,
+        const I* __restrict__ const cv_to_cell,
+        const T* __restrict__ const dt_intdom,
+        const I* __restrict__ const cell_to_intdom,
+        const I* __restrict__ const perm,
         unsigned n)
 {
     const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
@@ -97,15 +103,15 @@ void assemble_matrix_fine(
 template <typename T>
 __global__
 void solve_matrix_fine(
-    T* rhs,
-    T* d,
-    const T* u,
-    const level_metadata* level_meta,
-    const fvm_index_type* level_lengths,
-    const fvm_index_type* level_parents,
-    const fvm_index_type* block_index,
-    fvm_index_type* num_matrix, // number of packed matrices = number of cells
-    fvm_index_type* padded_size)
+    T* __restrict__ const rhs,
+    T* __restrict__ const d,
+    const T* __restrict__ const u,
+    const level_metadata* __restrict__ const level_meta,
+    const fvm_index_type* __restrict__ const level_lengths,
+    const fvm_index_type* __restrict__ const level_parents,
+    const fvm_index_type* __restrict__ const block_index,
+    fvm_index_type* __restrict__ const num_matrix, // number of packed matrices = number of cells
+    fvm_index_type* __restrict__ const padded_size)
 {
     const auto tid = threadIdx.x;
     const auto bid = blockIdx.x;
diff --git a/arbor/backends/gpu/multi_event_stream.cu b/arbor/backends/gpu/multi_event_stream.cu
index 931e6a1a..b136f8dc 100644
--- a/arbor/backends/gpu/multi_event_stream.cu
+++ b/arbor/backends/gpu/multi_event_stream.cu
@@ -11,10 +11,10 @@ namespace kernels {
     template <typename T, typename I>
     __global__ void mark_until_after(
         unsigned n,
-        I* mark,
-        const I* span_end,
-        const T* ev_time,
-        const T* t_until)
+        I* __restrict__ const mark,
+        const I* __restrict__ const span_end,
+        const T* __restrict__ const ev_time,
+        const T* __restrict__ const t_until)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
@@ -31,10 +31,10 @@ namespace kernels {
     template <typename T, typename I>
     __global__ void mark_until(
         unsigned n,
-        I* mark,
-        const I* span_end,
-        const T* ev_time,
-        const T* t_until)
+        I* __restrict__ const mark,
+        const I* __restrict__ const span_end,
+        const T* __restrict__ const ev_time,
+        const T* __restrict__ const t_until)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
@@ -51,10 +51,10 @@ namespace kernels {
     template <typename I>
     __global__ void drop_marked_events(
         unsigned n,
-        I* n_nonempty,
-        I* span_begin,
-        const I* span_end,
-        const I* mark)
+        I* __restrict__ const n_nonempty,
+        I* __restrict__ const span_begin,
+        const I* __restrict__ const span_end,
+        const I* __restrict__ const mark)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
@@ -69,10 +69,10 @@ namespace kernels {
     template <typename T, typename I>
     __global__ void event_time_if_before(
         unsigned n,
-        const I* span_begin,
-        const I* span_end,
-        const T* ev_time,
-        T* t_until)
+        const I* __restrict__ const span_begin,
+        const I* __restrict__ const span_end,
+        const T* __restrict__ const ev_time,
+        T* __restrict__ const t_until)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
diff --git a/arbor/backends/gpu/shared_state.cu b/arbor/backends/gpu/shared_state.cu
index 57989e64..62e7c4d4 100644
--- a/arbor/backends/gpu/shared_state.cu
+++ b/arbor/backends/gpu/shared_state.cu
@@ -14,7 +14,11 @@ namespace gpu {
 namespace kernel {

 template <typename T>
-__global__ void update_time_to_impl(unsigned n, T* time_to, const T* time, T dt, T tmax) {
+__global__ void update_time_to_impl(unsigned n,
+                                    T* __restrict__ const time_to,
+                                    const T* __restrict__ const time,
+                                    T dt,
+                                    T tmax) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         auto t = time[i]+dt;
@@ -23,7 +27,10 @@ __global__ void update_time_to_impl(unsigned n, T* time_to, const T* time, T dt,
 }

 template <typename T, typename I>
-__global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* voltage, I* current_density) {
+__global__ void add_gj_current_impl(unsigned n,
+                                    const T* __restrict__ const gj_info,
+                                    const I* __restrict__ const voltage,
+                                    I* __restrict__ const current_density) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         auto gj = gj_info[i];
@@ -35,7 +42,9 @@ __global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* volta
 // Vector/scalar addition: x[i] += v ∀i
 template <typename T>
-__global__ void add_scalar(unsigned n, T* x, fvm_value_type v) {
+__global__ void add_scalar(unsigned n,
+                           T* __restrict__ const x,
+                           fvm_value_type v) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         x[i] += v;
     }
@@ -44,11 +53,11 @@ __global__ void add_scalar(unsigned n, T* x, fvm_value_type v) {

 template <typename T, typename I>
 __global__ void set_dt_impl( T* __restrict__ dt_intdom,
-                             const T* time_to,
-                             const T* time,
+                             const T* __restrict__ time_to,
+                             const T* __restrict__ time,
                              const unsigned ncomp,
                              T* __restrict__ dt_comp,
-                             const I* cv_to_intdom) {
+                             const I* __restrict__ cv_to_intdom) {
     auto idx = blockIdx.x*blockDim.x + threadIdx.x;
     if (idx < ncomp) {
         const auto ind = cv_to_intdom[idx];
@@ -60,7 +69,9 @@ __global__ void set_dt_impl( T* __restrict__ dt_intdom,

 __global__ void take_samples_impl(
     multi_event_stream_state<raw_probe_info> s,
-    const fvm_value_type* time, fvm_value_type* sample_time, fvm_value_type* sample_value)
+    const fvm_value_type* __restrict__ const time,
+    fvm_value_type* __restrict__ const sample_time,
+    fvm_value_type* __restrict__ const sample_value)
 {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<s.n) {
diff --git a/arbor/backends/gpu/threshold_watcher.cu b/arbor/backends/gpu/threshold_watcher.cu
index ea6b4218..28180608 100644
--- a/arbor/backends/gpu/threshold_watcher.cu
+++ b/arbor/backends/gpu/threshold_watcher.cu
@@ -30,10 +30,15 @@ inline T lerp(T a, T b, T u) {
 __global__
 void test_thresholds_impl(
     int size,
-    const fvm_index_type* cv_to_intdom, const fvm_value_type* t_after, const fvm_value_type* t_before,
+    const fvm_index_type* __restrict__ const cv_to_intdom,
+    const fvm_value_type* __restrict__ const t_after,
+    const fvm_value_type* __restrict__ const t_before,
     stack_storage<threshold_crossing>& stack,
-    fvm_index_type* is_crossed, fvm_value_type* prev_values,
-    const fvm_index_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds)
+    fvm_index_type* __restrict__ const is_crossed,
+    fvm_value_type* __restrict__ const prev_values,
+    const fvm_index_type* __restrict__ const cv_index,
+    const fvm_value_type* __restrict__ const values,
+    const fvm_value_type* __restrict__ const thresholds)
 {
     int i = threadIdx.x + blockIdx.x*blockDim.x;

@@ -73,8 +78,11 @@ void test_thresholds_impl(

 __global__
 extern void reset_crossed_impl(
-    int size, fvm_index_type* is_crossed,
-    const fvm_index_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds)
+    int size,
+    fvm_index_type* __restrict__ const is_crossed,
+    const fvm_index_type* __restrict__ const cv_index,
+    const fvm_value_type* __restrict__ const values,
+    const fvm_value_type* __restrict__ const thresholds)
 {
     int i = threadIdx.x + blockIdx.x*blockDim.x;
     if (i<size) {
diff --git a/arbor/memory/fill.cu b/arbor/memory/fill.cu
index 59db57ab..fc16e9cc 100644
--- a/arbor/memory/fill.cu
+++ b/arbor/memory/fill.cu
@@ -7,7 +7,7 @@ namespace gpu {

 template <typename T, typename I>
 __global__
-void fill_kernel(T* v, T value, I n) {
+void fill_kernel(T* __restrict__ const v, T value, I n) {
     auto tid = threadIdx.x + blockDim.x*blockIdx.x;
     if(tid < n) {
         v[tid] = value;
--
GitLab