From 5ace929e962ea9aa7d836684341f2dffc0cded59 Mon Sep 17 00:00:00 2001
From: thorstenhater <24411438+thorstenhater@users.noreply.github.com>
Date: Thu, 23 Jul 2020 11:03:53 +0200
Subject: [PATCH] Gpu/restrict all the things (#1026)

Mark all pointer arguments to GPU kernels as `__restrict__`, so the compiler
may assume they do not alias and can avoid unnecessary reloads.

Effect on the busyring benchmark (with `pas` swapped for `hh`) with 8192 cells
on a V100 GPU; time for model-run in seconds over five runs, the last row being
the best (minimum) of each column:

```
|----------+-------|
| Baseline | After |
|----------+-------|
|    2.347 | 2.268 |
|    2.345 | 2.262 |
|    2.321 | 2.276 |
|    2.323 | 2.267 |
|    2.330 | 2.249 |
|----------+-------|
|    2.321 | 2.249 |
|----------+-------|
```
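The pattern applied throughout the diff below is sketched here on a small
stand-alone kernel; the kernel (`axpby`) is illustrative only and is not part
of this patch:

```
// Illustrative sketch only, not part of the patch: a toy kernel in the same
// style as the ones changed below. Declaring every pointer parameter
// __restrict__ (and const, since the pointer itself never changes) tells the
// compiler the buffers do not alias, so a store through `out` cannot be
// assumed to invalidate values already loaded from `a` or `b`.
template <typename T>
__global__
void axpby(const T* __restrict__ const a,
           const T* __restrict__ const b,
           T* __restrict__ const out,
           T alpha,
           unsigned n) {
    unsigned i = threadIdx.x + blockDim.x*blockIdx.x;
    if (i<n) {
        out[i] = alpha*a[i] + b[i];
    }
}
```

Only the qualifiers change; the kernel bodies themselves are untouched.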
---
 arbor/backends/gpu/matrix_fine.cu        | 52 +++++++++++++-----------
 arbor/backends/gpu/multi_event_stream.cu | 32 +++++++--------
 arbor/backends/gpu/shared_state.cu       | 25 ++++++++----
 arbor/backends/gpu/threshold_watcher.cu  | 18 +++++---
 arbor/memory/fill.cu                     |  2 +-
 5 files changed, 77 insertions(+), 52 deletions(-)

diff --git a/arbor/backends/gpu/matrix_fine.cu b/arbor/backends/gpu/matrix_fine.cu
index b8d883dc..9e785676 100644
--- a/arbor/backends/gpu/matrix_fine.cu
+++ b/arbor/backends/gpu/matrix_fine.cu
@@ -17,7 +17,10 @@ namespace kernels {
 // to[i] = from[p[i]]
 template <typename T, typename I>
 __global__
-void gather(const T* from, T* to, const I* p, unsigned n) {
+void gather(const T* __restrict__ const from,
+            T* __restrict__ const to,
+            const I* __restrict__ const p,
+            unsigned n) {
     unsigned i = threadIdx.x + blockDim.x*blockIdx.x;

     if (i<n) {
@@ -28,7 +31,10 @@ void gather(const T* from, T* to, const I* p, unsigned n) {
 // to[p[i]] = from[i]
 template <typename T, typename I>
 __global__
-void scatter(const T* from, T* to, const I* p, unsigned n) {
+void scatter(const T* __restrict__ const from,
+             T* __restrict__ const to,
+             const I* __restrict__ const p,
+             unsigned n) {
     unsigned i = threadIdx.x + blockDim.x*blockIdx.x;

     if (i<n) {
@@ -45,18 +51,18 @@ void scatter(const T* from, T* to, const I* p, unsigned n) {
 template <typename T, typename I>
 __global__
 void assemble_matrix_fine(
-        T* d,
-        T* rhs,
-        const T* invariant_d,
-        const T* voltage,
-        const T* current,
-        const T* conductivity,
-        const T* cv_capacitance,
-        const T* area,
-        const I* cv_to_cell,
-        const T* dt_intdom,
-        const I* cell_to_intdom,
-        const I* perm,
+        T* __restrict__ const d,
+        T* __restrict__ const rhs,
+        const T* __restrict__ const invariant_d,
+        const T* __restrict__ const voltage,
+        const T* __restrict__ const current,
+        const T* __restrict__ const conductivity,
+        const T* __restrict__ const cv_capacitance,
+        const T* __restrict__ const area,
+        const I* __restrict__ const cv_to_cell,
+        const T* __restrict__ const dt_intdom,
+        const I* __restrict__ const cell_to_intdom,
+        const I* __restrict__ const perm,
         unsigned n)
 {
     const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
@@ -97,15 +103,15 @@ void assemble_matrix_fine(
 template <typename T>
 __global__
 void solve_matrix_fine(
-    T* rhs,
-    T* d,
-    const T* u,
-    const level_metadata* level_meta,
-    const fvm_index_type* level_lengths,
-    const fvm_index_type* level_parents,
-    const fvm_index_type* block_index,
-    fvm_index_type* num_matrix, // number of packed matrices = number of cells
-    fvm_index_type* padded_size)
+    T* __restrict__ const rhs,
+    T* __restrict__ const d,
+    const T* __restrict__ const u,
+    const level_metadata* __restrict__ const level_meta,
+    const fvm_index_type* __restrict__ const level_lengths,
+    const fvm_index_type* __restrict__ const level_parents,
+    const fvm_index_type* __restrict__ const block_index,
+    fvm_index_type* __restrict__ const num_matrix, // number of packed matrices = number of cells
+    fvm_index_type* __restrict__ const padded_size)
 {
     const auto tid = threadIdx.x;
     const auto bid = blockIdx.x;
diff --git a/arbor/backends/gpu/multi_event_stream.cu b/arbor/backends/gpu/multi_event_stream.cu
index 931e6a1a..b136f8dc 100644
--- a/arbor/backends/gpu/multi_event_stream.cu
+++ b/arbor/backends/gpu/multi_event_stream.cu
@@ -11,10 +11,10 @@ namespace kernels {
     template <typename T, typename I>
     __global__ void mark_until_after(
         unsigned n,
-        I* mark,
-        const I* span_end,
-        const T* ev_time,
-        const T* t_until)
+        I* __restrict__ const mark,
+        const I* __restrict__ const span_end,
+        const T* __restrict__ const ev_time,
+        const T* __restrict__ const t_until)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
@@ -31,10 +31,10 @@ namespace kernels {
     template <typename T, typename I>
     __global__ void mark_until(
         unsigned n,
-        I* mark,
-        const I* span_end,
-        const T* ev_time,
-        const T* t_until)
+        I* __restrict__ const mark,
+        const I* __restrict__ const span_end,
+        const T* __restrict__ const ev_time,
+        const T* __restrict__ const t_until)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
@@ -51,10 +51,10 @@ namespace kernels {
     template <typename I>
     __global__ void drop_marked_events(
         unsigned n,
-        I* n_nonempty,
-        I* span_begin,
-        const I* span_end,
-        const I* mark)
+        I* __restrict__ const n_nonempty,
+        I* __restrict__ const span_begin,
+        const I* __restrict__ const span_end,
+        const I* __restrict__ const mark)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
@@ -69,10 +69,10 @@ namespace kernels {
     template <typename T, typename I>
     __global__ void event_time_if_before(
         unsigned n,
-        const I* span_begin,
-        const I* span_end,
-        const T* ev_time,
-        T* t_until)
+        const I* __restrict__ const span_begin,
+        const I* __restrict__ const span_end,
+        const T* __restrict__ const ev_time,
+        T* __restrict__ const t_until)
     {
         unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
         if (i<n) {
diff --git a/arbor/backends/gpu/shared_state.cu b/arbor/backends/gpu/shared_state.cu
index 57989e64..62e7c4d4 100644
--- a/arbor/backends/gpu/shared_state.cu
+++ b/arbor/backends/gpu/shared_state.cu
@@ -14,7 +14,11 @@ namespace gpu {
 namespace kernel {

 template <typename T>
-__global__ void update_time_to_impl(unsigned n, T* time_to, const T* time, T dt, T tmax) {
+__global__ void update_time_to_impl(unsigned n,
+                                    T* __restrict__ const time_to,
+                                    const T* __restrict__ const time,
+                                    T dt,
+                                    T tmax) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         auto t = time[i]+dt;
@@ -23,7 +27,10 @@ __global__ void update_time_to_impl(unsigned n, T* time_to, const T* time, T dt,
 }

 template <typename T, typename I>
-__global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* voltage, I* current_density) {
+__global__ void add_gj_current_impl(unsigned n,
+                                    const T* __restrict__ const gj_info,
+                                    const I* __restrict__ const voltage,
+                                    I* __restrict__ const current_density) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         auto gj = gj_info[i];
@@ -35,7 +42,9 @@ __global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* volta
 // Vector/scalar addition: x[i] += v ∀i
 template <typename T>
-__global__ void add_scalar(unsigned n, T* x, fvm_value_type v) {
+__global__ void add_scalar(unsigned n,
+                           T* __restrict__ const x,
+                           fvm_value_type v) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         x[i] += v;
     }
@@ -44,11 +53,11 @@ __global__ void add_scalar(unsigned n, T* x, fvm_value_type v) {

 template <typename T, typename I>
 __global__ void set_dt_impl( T* __restrict__ dt_intdom,
-                             const T* time_to,
-                             const T* time,
+                             const T* __restrict__ time_to,
+                             const T* __restrict__ time,
                              const unsigned ncomp,
                              T* __restrict__ dt_comp,
-                             const I* cv_to_intdom) {
+                             const I* __restrict__ cv_to_intdom) {
     auto idx = blockIdx.x*blockDim.x + threadIdx.x;
     if (idx < ncomp) {
         const auto ind = cv_to_intdom[idx];
@@ -60,7 +69,9 @@ __global__ void set_dt_impl( T* __restrict__ dt_intdom,

 __global__ void take_samples_impl(
     multi_event_stream_state<raw_probe_info> s,
-    const fvm_value_type* time, fvm_value_type* sample_time, fvm_value_type* sample_value)
+    const fvm_value_type* __restrict__ const time,
+    fvm_value_type* __restrict__ const sample_time,
+    fvm_value_type* __restrict__ const sample_value)
 {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<s.n) {
diff --git a/arbor/backends/gpu/threshold_watcher.cu b/arbor/backends/gpu/threshold_watcher.cu
index ea6b4218..28180608 100644
--- a/arbor/backends/gpu/threshold_watcher.cu
+++ b/arbor/backends/gpu/threshold_watcher.cu
@@ -30,10 +30,15 @@ inline T lerp(T a, T b, T u) {
 __global__
 void test_thresholds_impl(
     int size,
-    const fvm_index_type* cv_to_intdom, const fvm_value_type* t_after, const fvm_value_type* t_before,
+    const fvm_index_type* __restrict__ const cv_to_intdom,
+    const fvm_value_type* __restrict__ const t_after,
+    const fvm_value_type* __restrict__ const t_before,
     stack_storage<threshold_crossing>& stack,
-    fvm_index_type* is_crossed, fvm_value_type* prev_values,
-    const fvm_index_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds)
+    fvm_index_type* __restrict__ const is_crossed,
+    fvm_value_type* __restrict__ const prev_values,
+    const fvm_index_type* __restrict__ const cv_index,
+    const fvm_value_type* __restrict__ const values,
+    const fvm_value_type* __restrict__ const thresholds)
 {
     int i = threadIdx.x + blockIdx.x*blockDim.x;

@@ -73,8 +78,11 @@ void test_thresholds_impl(

 __global__
 extern void reset_crossed_impl(
-    int size, fvm_index_type* is_crossed,
-    const fvm_index_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds)
+    int size,
+    fvm_index_type* __restrict__ const is_crossed,
+    const fvm_index_type* __restrict__ const cv_index,
+    const fvm_value_type* __restrict__ const values,
+    const fvm_value_type* __restrict__ const thresholds)
 {
     int i = threadIdx.x + blockIdx.x*blockDim.x;
     if (i<size) {
diff --git a/arbor/memory/fill.cu b/arbor/memory/fill.cu
index 59db57ab..fc16e9cc 100644
--- a/arbor/memory/fill.cu
+++ b/arbor/memory/fill.cu
@@ -7,7 +7,7 @@ namespace gpu {

 template <typename T, typename I>
 __global__
-void fill_kernel(T* v, T value, I n) {
+void fill_kernel(T* __restrict__ const v, T value, I n) {
     auto tid = threadIdx.x + blockDim.x*blockIdx.x;
     if(tid < n) {
         v[tid] = value;
--
GitLab