From c8b2e7854be2bf979434db3f160b3a01bf9950e6 Mon Sep 17 00:00:00 2001
From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com>
Date: Tue, 4 Oct 2022 13:07:07 +0200
Subject: [PATCH] Optimize CPU-side solvers (#1992)

* remove storage of RHS and face-conductance in solvers
* elide copy RHS->U and RHS->Xd respectively
  -> solve will now directly mangly U and Xd
* fix tests accordingly
* Introduce wrapper macro for __restrict__ to Isolate against compiler specifics.
---
 arbor/backends/multicore/cable_solver.hpp     | 129 +++++++++---------
 arbor/backends/multicore/diffusion_solver.hpp |  49 +++----
 arbor/backends/multicore/shared_state.cpp     |  17 ++-
 arbor/include/arbor/mechanism_abi.h           |  13 ++
 test/unit/test_matrix.cpp                     |  30 ++--
 5 files changed, 115 insertions(+), 123 deletions(-)

diff --git a/arbor/backends/multicore/cable_solver.hpp b/arbor/backends/multicore/cable_solver.hpp
index 53d6f789..24dff425 100644
--- a/arbor/backends/multicore/cable_solver.hpp
+++ b/arbor/backends/multicore/cable_solver.hpp
@@ -19,19 +19,13 @@ struct cable_solver {
 
     iarray parent_index;
     iarray cell_cv_divs;
-
-    array d;     // [Î¼S]
-    array u;     // [Î¼S]
-    array rhs;   // [nA]
-
-    array cv_capacitance;      // [pF]
-    array face_conductance;    // [Î¼S]
-    array cv_area;             // [Î¼m^2]
-
     iarray cell_to_intdom;
 
-    // the invariant part of the matrix diagonal
-    array invariant_d;         // [Î¼S]
+    array d;              // [Î¼S]
+    array u;              // [Î¼S]
+    array cv_capacitance; // [pF]
+    array cv_area;        // [Î¼m^2]
+    array invariant_d;    // [Î¼S] invariant part of matrix diagonal
 
     cable_solver() = default;
     cable_solver(const cable_solver&) = default;
@@ -48,11 +42,11 @@ struct cable_solver {
                  const std::vector<index_type>& cell_to_intdom):
         parent_index(p.begin(), p.end()),
         cell_cv_divs(cell_cv_divs.begin(), cell_cv_divs.end()),
-        d(size(), 0), u(size(), 0), rhs(size()),
+        cell_to_intdom(cell_to_intdom.begin(), cell_to_intdom.end()),
+        d(size(), 0), u(size(), 0),
         cv_capacitance(cap.begin(), cap.end()),
-        face_conductance(cond.begin(), cond.end()),
         cv_area(area.begin(), area.end()),
-        cell_to_intdom(cell_to_intdom.begin(), cell_to_intdom.end())
+        invariant_d(size(), 0)
     {
         // Sanity check
         arb_assert(cap.size() == size());
@@ -60,11 +54,9 @@ struct cable_solver {
         arb_assert(cell_cv_divs.back() == (index_type)size());
 
         // Build invariant parts
-        const auto n = size();
-        invariant_d = array(n, 0);
-        if (n >= 1) { // skip empty matrix, ie cell with empty morphology
-            for (auto i: util::make_span(1u, n)) {
-                const auto gij = face_conductance[i];
+        if (size() >= 1) {
+            for (auto i: util::make_span(1u, size())) {
+                const auto gij = cond[i];
                 u[i] = -gij;
                 invariant_d[i] += gij;
                 if (p[i]!=-1) { // root
@@ -74,71 +66,82 @@ struct cable_solver {
         }
     }
 
-    const_view solution() const {
-        // In this back end the solution is a simple view of the rhs, which
-        // contains the solution after the matrix_solve is performed.
-        return rhs;
-    }
+    // Setup and solve the cable equation
+    // * expects the voltage from its first argument
+    // * will likewise overwrite the first argument with the solction
+    template<typename T>
+    void solve(T& rhs, const_view dt_intdom, const_view current, const_view conductivity) {
+        value_type * const ARB_NO_ALIAS d_ = d.data();
+        value_type * const ARB_NO_ALIAS r_ = rhs.data();
+
+        const value_type * const ARB_NO_ALIAS i_ = current.data();
+        const value_type * const ARB_NO_ALIAS inv_ = invariant_d.data();
+        const value_type * const ARB_NO_ALIAS c_ = cv_capacitance.data();
+        const value_type * const ARB_NO_ALIAS g_ = conductivity.data();
+        const value_type * const ARB_NO_ALIAS a_ = cv_area.data();
 
-    // Assemble the matrix
-    // Afterwards the diagonal and RHS will have been set given dt, voltage and current.
-    //   dt_intdom       [ms]      (per integration domain)
-    //   voltage         [mV]      (per control volume)
-    //   current density [A.m^-2]  (per control volume)
-    //   conductivity    [kS.m^-2] (per control volume)
-    void assemble(const_view dt_intdom, const_view voltage, const_view current, const_view conductivity) {
         const auto cell_cv_part = util::partition_view(cell_cv_divs);
         const index_type ncells = cell_cv_part.size();
-        // loop over submatrices
+        // Assemble; loop over submatrices
+        // Afterwards the diagonal and RHS will have been set given dt, voltage and current.
+        //   dt_intdom       [ms]      (per integration domain)
+        //   voltage         [mV]      (per control volume)
+        //   current density [A.m^-2]  (per control volume)
+        //   conductivity    [kS.m^-2] (per control volume)
         for (auto m: util::make_span(0, ncells)) {
-            const auto dt = dt_intdom[cell_to_intdom[m]];
-            if (dt>0) {
-                const value_type oodt_factor = 1e-3/dt; // [1/Âµs]
-                for (auto i: util::make_span(cell_cv_part[m])) {
-                    const auto area_factor = 1e-3*cv_area[i]; // [1e-9Â·mÂ²]
-                    const auto gi = oodt_factor*cv_capacitance[i] + area_factor*conductivity[i]; // [Î¼S]
-                    d[i] = gi + invariant_d[i];
-                    // convert current to units nA
-                    rhs[i] = gi*voltage[i] - area_factor*current[i];
+            const auto dt = dt_intdom[cell_to_intdom[m]];    // [ms]
+            if (dt > 0) {
+                const value_type oodt = 1e-3/dt;             // [1/Âµs]
+                const auto& [lo, hi] = cell_cv_part[m];
+                for(int i = lo; i < hi; ++i) {
+                    const auto area = 1e-3*a_[i];            // [1e-9Â·mÂ²]
+                    const auto gi = oodt*c_[i] + area*g_[i]; // [Î¼S]
+                    d_[i] = gi + inv_[i];                    // [Î¼S]
+                    r_[i] = gi*r_[i] - area*i_[i];           // [nA]
                 }
             }
             else {
-                for (auto i: util::make_span(cell_cv_part[m])) {
-                    d[i] = 0;
-                    rhs[i] = voltage[i];
+                const auto& [lo, hi] = cell_cv_part[m];
+                for(int i = lo; i < hi; ++i) {
+                    d_[i] = 0.0;
                 }
             }
         }
+        solve(rhs);
     }
 
-    void solve() {
-        // loop over submatrices
-        for (const auto& [first, last]: util::partition_view(cell_cv_divs)) {
-            if (first >= last) continue; // skip cell with no CVs
-            if (d[first]!=0) {
+    // Solve; loop over submatrices
+    // Afterwards rhs will contain the solution.
+    // NOTE: This exists separately only to cater to the tests
+    template<typename T>
+    void solve(T& rhs) {
+        value_type * const ARB_NO_ALIAS r_ = rhs.data();
+        value_type * const ARB_NO_ALIAS d_ = d.data();
+
+        const value_type * const ARB_NO_ALIAS u_ = u.data();
+        const index_type * const ARB_NO_ALIAS p_ = parent_index.data();
+
+        const auto cell_cv_part = util::partition_view(cell_cv_divs);
+        for (const auto& [first, last]: cell_cv_part) {
+            if (first < last && d_[first] != 0) {  // skip vacuous cells
                 // backward sweep
-                for(auto i=last-1; i>first; --i) {
-                    const auto factor = u[i] / d[i];
-                    d[parent_index[i]]   -= factor * u[i];
-                    rhs[parent_index[i]] -= factor * rhs[i];
+                for(int i = last - 1; i > first; --i) {
+                    const auto factor = u_[i] / d_[i];
+                    const auto pi = p_[i];
+                    d_[pi] -= factor * u_[i];
+                    r_[pi] -= factor * r_[i];
                 }
                 // solve root
-                rhs[first] /= d[first];
+                r_[first] /= d_[first];
                 // forward sweep
-                for(auto i=first+1; i<last; ++i) {
-                    rhs[i] -= u[i] * rhs[parent_index[i]];
-                    rhs[i] /= d[i];
+                for(int i = first + 1; i < last; ++i) {
+                    r_[i] -= u_[i] * r_[p_[i]];
+                    r_[i] /= d_[i];
                 }
             }
         }
     }
 
-    template<typename VTo>
-    void solve(VTo& to) {
-        solve();
-        memory::copy(rhs, to);
-    }
-
     std::size_t num_cells() const { return cell_cv_divs.size() - 1; }
     std::size_t size() const { return parent_index.size(); }
 };
diff --git a/arbor/backends/multicore/diffusion_solver.hpp b/arbor/backends/multicore/diffusion_solver.hpp
index 3df25a6d..05b5d2f5 100644
--- a/arbor/backends/multicore/diffusion_solver.hpp
+++ b/arbor/backends/multicore/diffusion_solver.hpp
@@ -19,17 +19,12 @@ struct diffusion_solver {
 
     iarray parent_index;
     iarray cell_cv_divs;
-
-    array d;     // [Î¼S]
-    array u;     // [Î¼S]
-    array rhs;   // [nA]
-
-    array cv_area;             // [Î¼m^2]
-
     iarray cell_to_intdom;
 
-    // the invariant part of the matrix diagonal
-    array invariant_d;         // [Î¼S]
+    array d;           // [Î¼S]
+    array u;           // [Î¼S]
+    array cv_area;     // [Î¼m^2]
+    array invariant_d; // [Î¼S] invariant part of matrix diagonal
 
     diffusion_solver() = default;
     diffusion_solver(const diffusion_solver&) = default;
@@ -45,9 +40,10 @@ struct diffusion_solver {
                      const std::vector<index_type>& cell_to_intdom):
         parent_index(p.begin(), p.end()),
         cell_cv_divs(cell_cv_divs.begin(), cell_cv_divs.end()),
-        d(size(), 0), u(size(), 0), rhs(size()),
+        cell_to_intdom(cell_to_intdom.begin(), cell_to_intdom.end()),
+        d(size(), 0), u(size(), 0),
         cv_area(area.begin(), area.end()),
-        cell_to_intdom(cell_to_intdom.begin(), cell_to_intdom.end())
+        invariant_d(size(), 0)
     {
         // Sanity check
         arb_assert(diff.size() == size());
@@ -55,7 +51,6 @@ struct diffusion_solver {
 
         // Build invariant parts
         const auto n = size();
-        invariant_d = array(n, 0);
         if (n >= 1) { // skip empty matrix, ie cell with empty morphology
             for (auto i: util::make_span(1u, n)) {
                 auto gij = diff[i];
@@ -67,20 +62,18 @@ struct diffusion_solver {
         }
     }
 
-    const_view solution() const {
-        // In this back end the solution is a simple view of the rhs, which
-        // contains the solution after the matrix_solve is performed.
-        return rhs;
-    }
 
-    // Assemble the matrix
+    // Assemble and solve the matrix
     //   dt_intdom       [ms]      (per integration domain)
     //   internal conc   [mM]      (per control volume)
     //   voltage         [mV]      (per control volume)
     //   current density [A.m^-2]  (per control volume and species)
-    //   diffusivity     [???]     (per control volume)
+    //   diffusivity     [m^2/s]   (per control volume)
     //   charge          [e]
-    void assemble(const_view dt_intdom, const_view concentration, const_view voltage, const_view current, const_view conductivity, arb_value_type q) {
+    //   diff. concentration
+    //    * will be overwritten by the solution
+    template<typename T>
+    void solve(T& concentration, const_view dt_intdom, const_view voltage, const_view current, const_view conductivity, arb_value_type q) {
         auto cell_cv_part = util::partition_view(cell_cv_divs);
         index_type ncells = cell_cv_part.size();
         // loop over submatrices
@@ -98,19 +91,21 @@ struct diffusion_solver {
                     // using Faraday's constant
                     auto F = A/(q*96.485332);
                     d[i]   = _1_dt   + F*g + invariant_d[i];
-                    rhs[i] = _1_dt*X + F*(u*g - J);
+                    concentration[i] = _1_dt*X + F*(u*g - J);
                 }
             }
             else {
                 for (auto i: util::make_span(cell_cv_part[m])) {
-                    d[i]   = 0;
-                    rhs[i] = concentration[i];
+                    d[i] = 0;
                 }
             }
         }
+        solve(concentration);
     }
 
-    void solve() {
+    // Separate solver; analoguos with cable solver
+    template<typename T>
+    void solve(T& rhs) {
         // loop over submatrices
         for (const auto& [first, last]: util::partition_view(cell_cv_divs)) {
             if (first >= last) continue; // skip cell with no CVs
@@ -132,12 +127,6 @@ struct diffusion_solver {
         }
     }
 
-    template<typename VTo>
-    void solve(VTo& to) {
-        solve();
-        memory::copy(rhs, to);
-    }
-
     std::size_t size() const { return parent_index.size(); }
 };
 
diff --git a/arbor/backends/multicore/shared_state.cpp b/arbor/backends/multicore/shared_state.cpp
index 28578a25..4ebab08c 100644
--- a/arbor/backends/multicore/shared_state.cpp
+++ b/arbor/backends/multicore/shared_state.cpp
@@ -243,20 +243,19 @@ shared_state::shared_state(
 }
 
 void shared_state::integrate_voltage() {
-    solver.assemble(dt_intdom, voltage, current_density, conductivity);
-    solver.solve(voltage);
+    solver.solve(voltage, dt_intdom, current_density, conductivity);
 }
 
 void shared_state::integrate_diffusion() {
     for (auto& [ion, data]: ion_data) {
         if (data.solver) {
-            data.solver->assemble(dt_intdom,
-                                  data.Xd_,
-                                  voltage,
-                                  data.iX_,
-                                  data.gX_,
-                                  data.charge[0]);
-            data.solver->solve(data.Xd_);
+            data.solver->solve(data.Xd_,
+                               dt_intdom,
+                               voltage,
+                               data.iX_,
+                               data.gX_,
+                               data.charge[0]);
+
         }
     }
 }
diff --git a/arbor/include/arbor/mechanism_abi.h b/arbor/include/arbor/mechanism_abi.h
index 6280c4ca..ce32ce3b 100644
--- a/arbor/include/arbor/mechanism_abi.h
+++ b/arbor/include/arbor/mechanism_abi.h
@@ -7,6 +7,19 @@
 extern "C" {
 #endif
 
+// Marker for non-overlapping arrays/pointers
+#ifdef __cplusplus
+#if defined ( __GNUC__ ) || defined ( __clang__ )
+#define ARB_NO_ALIAS __restrict__
+#else
+#error "Unknown compiler, please add support."
+#endif
+#else
+#define ARB_NO_ALIAS restrict
+#endif
+
+
+
 // Version
 #define ARB_MECH_ABI_VERSION_MAJOR 0
 #define ARB_MECH_ABI_VERSION_MINOR 2
diff --git a/test/unit/test_matrix.cpp b/test/unit/test_matrix.cpp
index 12959eb6..f5e21fe3 100644
--- a/test/unit/test_matrix.cpp
+++ b/test/unit/test_matrix.cpp
@@ -45,9 +45,7 @@ TEST(matrix, solve_host)
         solver_type m({0}, {0,1}, vvec(1), vvec(1), vvec(1), {0});
         fill(m.d,  2);
         fill(m.u, -1);
-        fill(m.rhs,1);
-
-        auto x = array({0});
+        array x({1});
         m.solve(x);
 
         EXPECT_EQ(x[0], 0.5);
@@ -65,11 +63,8 @@ TEST(matrix, solve_host)
 
             fill(m.d,  2);
             fill(m.u, -1);
-            fill(m.rhs,1);
-
+            auto x = array(n, 1);
 
-            auto x = array();
-            x.resize(n);
             m.solve(x);
 
             auto err = math::square(std::fabs(2.*x[0] - x[1] - 1.));
@@ -103,12 +98,11 @@ TEST(matrix, zero_diagonal)
 
     assign(m.d,   vvec({2,  3,  2, 0,  0,  4,  5}));
     assign(m.u,   vvec({0, -1, -1, 0, -1,  0, -2}));
-    assign(m.rhs, vvec({3,  5,  7, 7,  8, 16, 32}));
 
     // Expected solution:
     std::vector<value_type> expected = {4, 5, 6, 7, 8, 9, 10};
 
-    auto x = array({0, 0, 0, 0, 0, 0, 0});
+    auto x = vvec({3,  5,  7, 7,  8, 16, 32});
     m.solve(x);
 
     EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
@@ -142,7 +136,7 @@ TEST(matrix, zero_diagonal_assembled)
     vvec Cm = {1, 1, 1, 1, 1, 2, 3};
 
     // Initial voltage of zero; currents alone determine rhs.
-    array v(7, 0.0);
+    auto v = vvec{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
     vvec area(7, 1.0);
 
     // (Scaled) membrane conductances contribute to diagonal.
@@ -158,24 +152,18 @@ TEST(matrix, zero_diagonal_assembled)
     // x = [ 4 5 6 7 8 9 10 ]
 
     solver_type m(p, c, Cm, g, area, s);
-    m.assemble(dt, v, i, mg);
-
-    auto x = array({0, 0, 0, 0, 0, 0, 0});
     std::vector<value_type> expected = {4, 5, 6, 7, 8, 9, 10};
 
-    m.solve(x);
-    EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
+    m.solve(v, dt, i, mg);
+    EXPECT_TRUE(testing::seq_almost_eq<double>(expected, v));
 
     // Set dt of 2nd (middle) submatrix to zero. Solution
     // should then return voltage values for that submatrix.
 
     dt[1] = 0;
-    v[3] = -20;
-    v[4] = -30;
-    m.assemble(dt, v, i, mg);
-    m.solve(x);
-
+    v = vvec{0.0, 0.0, 0.0, -20.0, -30.0, 0.0, 0.0};
+    m.solve(v, dt, i, mg);
     expected = {4, 5, 6, -20, -30, 9, 10};
 
-    EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
+    EXPECT_TRUE(testing::seq_almost_eq<double>(expected, v));
 }
-- 
GitLab