From a88f9c0201b0d72e1945e152ec51b48d503427c4 Mon Sep 17 00:00:00 2001
From: thorstenhater <24411438+thorstenhater@users.noreply.github.com>
Date: Wed, 14 Oct 2020 10:53:08 +0200
Subject: [PATCH] Gpu/optimise matrix fine (#1028)

Memory access optimization for the fine matrix GPU solver.
---
 arbor/backends/gpu/matrix_fine.cu        | 92 ++++++++++--------------
 arbor/backends/gpu/matrix_fine.hpp       |  3 +-
 arbor/backends/gpu/matrix_state_fine.hpp | 20 +++---
 3 files changed, 52 insertions(+), 63 deletions(-)

diff --git a/arbor/backends/gpu/matrix_fine.cu b/arbor/backends/gpu/matrix_fine.cu
index 9e785676..228d9642 100644
--- a/arbor/backends/gpu/matrix_fine.cu
+++ b/arbor/backends/gpu/matrix_fine.cu
@@ -42,7 +42,7 @@ void scatter(const T* __restrict__ const from,
     }
 }
 
-/// GPU implementatin of Hines matrix assembly.
+/// GPU implementation of Hines matrix assembly.
 /// Fine layout.
 /// For a given time step size dt:
 ///     - use the precomputed alpha and alpha_d values to construct the diagonal
@@ -59,35 +59,26 @@ void assemble_matrix_fine(
     const T* __restrict__ const conductivity,
     const T* __restrict__ const cv_capacitance,
     const T* __restrict__ const area,
-    const I* __restrict__ const cv_to_cell,
+    const I* __restrict__ const cv_to_intdom,
     const T* __restrict__ const dt_intdom,
-    const I* __restrict__ const cell_to_intdom,
     const I* __restrict__ const perm,
     unsigned n)
 {
     const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
-
-    if (tid<n) {
-        auto cid = cv_to_cell[tid];
-        auto dt = dt_intdom[cell_to_intdom[cid]];
-
-        if (dt>0) {
-            // The 1e-3 is a constant of proportionality required to ensure that the
-            // conductance (gi) values have units μS (micro-Siemens).
-            // See the model documentation in docs/model for more information.
-            T oodt_factor = T(1e-3)/dt;
-            T area_factor = T(1e-3)*area[tid];
-
-            const auto gi = oodt_factor*cv_capacitance[tid] + area_factor*conductivity[tid];
-            const auto pid = perm[tid];
-            d[pid] = gi + invariant_d[tid];
-            rhs[pid] = gi*voltage[tid] - area_factor*current[tid];
-        }
-        else {
-            const auto pid = perm[tid];
-            d[pid] = 0;
-            rhs[pid] = voltage[tid];
-        }
+    if (tid < n) {
+        // The 1e-3 is a constant of proportionality required to ensure that the
+        // conductance (gi) values have units μS (micro-Siemens).
+        // See the model documentation in docs/model for more information.
+        const auto dt = dt_intdom[cv_to_intdom[tid]];
+        const auto p = dt > 0;
+        const auto pid = perm[tid];
+        const auto area_factor = T(1e-3)*area[tid];
+        const auto gi = T(1e-3)*cv_capacitance[tid]/dt + area_factor*conductivity[tid];
+        const auto r_d = gi + invariant_d[tid];
+        const auto r_rhs = gi*voltage[tid] - area_factor*current[tid];
+
+        d[pid] = p ? r_d : 0;
+        rhs[pid] = p ? r_rhs : voltage[tid];
     }
 }
 
@@ -110,8 +101,7 @@ void solve_matrix_fine(
     const fvm_index_type* __restrict__ const level_lengths,
     const fvm_index_type* __restrict__ const level_parents,
     const fvm_index_type* __restrict__ const block_index,
-    fvm_index_type* __restrict__ const num_matrix, // number of packed matrices = number of cells
-    fvm_index_type* __restrict__ const padded_size)
+    const fvm_index_type* __restrict__ const num_matrix) // number of packed matrices = number of cells
 {
     const auto tid = threadIdx.x;
     const auto bid = blockIdx.x;
@@ -135,6 +125,7 @@ void solve_matrix_fine(
 
         const unsigned width = lvl_meta.num_branches;
 
+        // Perform backward substitution for each branch on this level.
         // One thread per branch.
         if (tid < width) {
@@ -150,32 +141,31 @@ void solve_matrix_fine(
             // all the matrices at the same time. When a cell finishes, we put a
             // `0` on the diagonal to mark that it should not be solved for.
             if (d[pos]!=0) {
-                // each branch perform substitution
-                T factor = u[pos] / d[pos];
                 for (unsigned i=0; i<len-1; ++i) {
                     const unsigned next_pos = pos + width;
-                    d[next_pos] -= factor * u[pos];
-                    rhs[next_pos] -= factor * rhs[pos];
-
-                    factor = u[next_pos] / d[next_pos];
+                    const auto d_next = d[next_pos];
+                    const auto rhs_next = rhs[next_pos];
+                    const T factor = -u[pos]/d[pos];
+                    d[next_pos] = fma(factor, u[pos], d_next);
+                    rhs[next_pos] = fma(factor, rhs[pos], rhs_next);
                     pos = next_pos;
                 }
-
                 // Update d and rhs at the parent node of this branch.
                 // A parent may have more than one contributing to it, so we use
                 // atomic updates to avoid races conditions.
                 const unsigned parent_index = next_lvl_meta.matrix_data_index;
                 const unsigned p = parent_index + lvl_parents[tid];
-                //d[p] -= factor * u[pos];
-                gpu_atomic_add(d +p, -factor*u[pos]);
-                //rhs[p] -= factor * rhs[pos];
-                gpu_atomic_add(rhs+p, -factor*rhs[pos]);
+
+                const T factor = -u[pos] / d[pos];
+                gpu_atomic_add(d + p, factor*u[pos]);
+                gpu_atomic_add(rhs + p, factor*rhs[pos]);
             }
         }
         __syncthreads();
     }
 
+    // Solve the root
     {
         // The levels are sorted such that the root is the last level
         const auto& last_lvl_meta = block_level_meta[num_levels-1];
@@ -188,14 +178,14 @@ void solve_matrix_fine(
         unsigned pos = last_lvl_meta.matrix_data_index + tid;
 
         if (d[pos]!=0) {
-            // backward
             for (unsigned i=0; i<len-1; ++i) {
-                T factor = u[pos] / d[pos];
                 const unsigned next_pos = pos + width;
-                d[next_pos] -= factor * u[pos];
-                rhs[next_pos] -= factor * rhs[pos];
-
+                const T factor = -u[pos] / d[pos];
+                const auto rhs_next = rhs[next_pos];
+                const auto d_next = d[next_pos];
+                d[next_pos] = fma(factor, u[pos], d_next);
+                rhs[next_pos] = fma(factor, rhs[pos], rhs_next);
                 pos = next_pos;
             }
 
@@ -233,15 +223,14 @@ void solve_matrix_fine(
         // Perform forward-substitution for each branch on this level.
         // One thread per branch.
        if (tid < width) {
-            // Load the rhs value for the parent node of this branch.
-            const unsigned p = parent_index + lvl_parents[tid];
-            T rhsp = rhs[p];
-
             // Find the index of the first node in this branch.
             const unsigned len = lvl_lengths[tid];
             unsigned pos = lvl_meta.matrix_data_index + (len-1)*width + tid;
 
             if (d[pos]!=0) {
+                // Load the rhs value for the parent node of this branch.
+                const unsigned p = parent_index + lvl_parents[tid];
+                T rhsp = rhs[p];
                 // each branch perform substitution
                 for (unsigned i=0; i<len; ++i) {
                     rhsp = rhs[pos] - u[pos]*rhsp;
@@ -280,7 +269,6 @@ void scatter(
     kernels::scatter<<<griddim, blockdim>>>(from, to, p, n);
 }
 
-
 void assemble_matrix_fine(
     fvm_value_type* d,
     fvm_value_type* rhs,
@@ -290,9 +278,8 @@ void assemble_matrix_fine(
     const fvm_value_type* conductivity,
     const fvm_value_type* cv_capacitance,
     const fvm_value_type* area,
-    const fvm_index_type* cv_to_cell,
+    const fvm_index_type* cv_to_intdom,
     const fvm_value_type* dt_intdom,
-    const fvm_index_type* cell_to_intdom,
     const fvm_index_type* perm,
     unsigned n)
 {
@@ -301,8 +288,7 @@ void assemble_matrix_fine(
     kernels::assemble_matrix_fine<<<num_blocks, block_dim>>>(
         d, rhs, invariant_d, voltage, current, conductivity,
         cv_capacitance, area,
-        cv_to_cell, dt_intdom, cell_to_intdom,
-        perm, n);
+        cv_to_intdom, dt_intdom, perm, n);
 }
 
 // Example:
@@ -337,7 +323,7 @@ void solve_matrix_fine(
 {
     kernels::solve_matrix_fine<<<num_blocks, blocksize>>>(
         rhs, d, u, level_meta, level_lengths, level_parents, block_index,
-        num_cells, padded_size);
+        num_cells);
 }
 
 } // namespace gpu
diff --git a/arbor/backends/gpu/matrix_fine.hpp b/arbor/backends/gpu/matrix_fine.hpp
index a892ce5b..74b34906 100644
--- a/arbor/backends/gpu/matrix_fine.hpp
+++ b/arbor/backends/gpu/matrix_fine.hpp
@@ -34,9 +34,8 @@ void assemble_matrix_fine(
     const fvm_value_type* conductivity,
     const fvm_value_type* cv_capacitance,
     const fvm_value_type* area,
-    const fvm_index_type* cv_to_cell,
+    const fvm_index_type* cv_to_intdom,
     const fvm_value_type* dt_intdom,
-    const fvm_index_type* cell_to_intdom,
     const fvm_index_type* perm,
     unsigned n);
 
diff --git a/arbor/backends/gpu/matrix_state_fine.hpp b/arbor/backends/gpu/matrix_state_fine.hpp
index 3b393a85..4d0d46c5 100644
--- a/arbor/backends/gpu/matrix_state_fine.hpp
+++ b/arbor/backends/gpu/matrix_state_fine.hpp
@@ -85,7 +85,8 @@ public:
 
     using metadata_array = memory::device_vector<level_metadata>;
 
-    iarray cv_to_cell;
+    // Maps control volume to integration domain
+    iarray cv_to_intdom;
 
     array d; // [μS]
     array u; // [μS]
@@ -98,8 +99,8 @@ public:
     // Invariant part of the matrix diagonal
     array invariant_d; // [μS]
 
-    // Maps cell to integration domain
-    iarray cell_to_intdom;
+    // Solution in unpacked format
+    array solution_;
 
     // Maximum number of branches in each level per block
     unsigned max_branches_per_level;
@@ -449,16 +450,20 @@ public:
         // to be stored in flat format
         cv_capacitance = memory::make_const_view(cap);
         invariant_d = memory::make_const_view(invariant_d_tmp);
-        cell_to_intdom = memory::make_const_view(cell_intdom);
 
-        // calculte the cv -> cell mappings
+        // calculate the cv -> cell mappings
        std::vector<size_type> cv_to_cell_tmp(matrix_size);
         size_type ci = 0;
         for (auto cv_span: util::partition_view(cell_cv_divs)) {
             util::fill(util::subrange_view(cv_to_cell_tmp, cv_span), ci);
             ++ci;
         }
-        cv_to_cell = memory::make_const_view(cv_to_cell_tmp);
+
+        std::vector<size_type> cv_to_intdom_tmp(matrix_size);
+        for (auto i = 0ul; i < cv_to_cell_tmp.size(); ++i) {
+            cv_to_intdom_tmp[i] = cell_intdom[cv_to_cell_tmp[i]];
+        }
+        cv_to_intdom = memory::make_const_view(cv_to_intdom_tmp);
     }
 
     // Assemble the matrix
@@ -477,9 +482,8 @@ public:
            conductivity.data(),
             cv_capacitance.data(),
             cv_area.data(),
-            cv_to_cell.data(),
+            cv_to_intdom.data(),
             dt_intdom.data(),
-            cell_to_intdom.data(),
             perm.data(),
             size());
     }
-- 
GitLab
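The assembly kernel in this patch replaces the old divergent dt>0 if/else with straight-line code and a single predicated store per output (d[pid] = p ? r_d : 0), which keeps all threads of a warp on the same instruction stream and issues exactly one write per element. Below is a minimal standalone sketch of that idiom; the kernel name predicated_store_demo and its toy inputs are hypothetical and not part of Arbor or of this patch, and the sketch only illustrates the branch-free store pattern, not the real matrix assembly.

// Minimal sketch (hypothetical, for illustration only): compute both candidate
// results unconditionally, then let a per-thread predicate select the value
// that is written, so there is no divergent else-branch and only one store.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void predicated_store_demo(double* out,
                                      const double* x,
                                      const double* dt,
                                      unsigned n) {
    const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
    if (tid < n) {
        const bool active = dt[tid] > 0;        // predicate, analogous to the dt>0 test
        const double r = x[tid]/dt[tid] + 1.0;  // computed even when inactive; result is simply discarded
        out[tid] = active ? r : x[tid];         // one predicated store per element
    }
}

int main() {
    constexpr unsigned n = 4;
    double hx[n]  = {1.0, 2.0, 3.0, 4.0};
    double hdt[n] = {0.025, 0.0, 0.025, 0.0};   // dt == 0 marks an inactive entry
    double hout[n];

    double *dx, *ddt, *dout;
    cudaMalloc(&dx,   n*sizeof(double));
    cudaMalloc(&ddt,  n*sizeof(double));
    cudaMalloc(&dout, n*sizeof(double));
    cudaMemcpy(dx,  hx,  n*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(ddt, hdt, n*sizeof(double), cudaMemcpyHostToDevice);

    predicated_store_demo<<<1, 128>>>(dout, dx, ddt, n);
    cudaMemcpy(hout, dout, n*sizeof(double), cudaMemcpyDeviceToHost);

    for (unsigned i = 0; i < n; ++i) {
        printf("out[%u] = %g\n", i, hout[i]);
    }
    cudaFree(dx); cudaFree(ddt); cudaFree(dout);
    return 0;
}

Entries with dt == 0 simply get their input value written back, mirroring how the patched assemble_matrix_fine falls back to rhs[pid] = voltage[tid] and d[pid] = 0 when the time step is zero.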