From a316dd872077cfec9ebcd8daa0f462d84dfd0f77 Mon Sep 17 00:00:00 2001
From: thorstenhater <24411438+thorstenhater@users.noreply.github.com>
Date: Wed, 24 Jun 2020 17:03:08 +0200
Subject: [PATCH] Performance/copy to swap (#1027)

Remove a redundant copy in favor of a swap operation to gain performance,
especially on the GPU, where copies are synchronous. Similarly, instead of
solving the linear system into an intermediate array and copying it out, write
the output directly into the target.
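
Condensed, the two call-site changes in fvm_lowered_cell_impl.hpp look like this:

```
// Before: solve into internal storage, then copy the result out
// (an extra device-to-device copy that synchronizes on the GPU).
matrix_.solve();
memory::copy(matrix_.solution(), state_->voltage);

// After: write the solution directly into the target array.
matrix_.solve(state_->voltage);

// Before: copy the advanced integration times back.
memory::copy(state_->time_to, state_->time);

// After: the old contents of time are dead after the step,
// so swapping the two buffers avoids the copy entirely.
std::swap(state_->time_to, state_->time);
```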

Here is the effect on the busyring benchmark (with the pas mechanism swapped
for hh) for 8192 cells on a V100 GPU. Each column lists five model-run times
in seconds; the last row gives the best (minimum) time per column.
```
|----------+--------------------------------+------------------------------------|
| Baseline | fvm_lowered_cell: copy -> swap | matrix: solve + copy -> solve_into |
|----------+--------------------------------+------------------------------------|
|    2.230 |                          2.199 |                              2.129 |
|    2.231 |                          2.209 |                              2.132 |
|    2.225 |                          2.209 |                              2.136 |
|    2.227 |                          2.186 |                              2.130 |
|    2.220 |                          2.204 |                              2.133 |
|----------+--------------------------------+------------------------------------|
|     2.22 |                          2.186 |                              2.129 |
|----------+--------------------------------+------------------------------------|
```
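
For backend implementors, the change replaces the solve()/solution() pair with
a single solve(array& to) that receives the destination. A minimal usage
sketch (variable names are illustrative, not from the patch):

```
array voltage(m.size());   // destination, sized to the number of CVs
m.assemble(dt, v, i, g);   // set diagonal and RHS for this time step
m.solve(voltage);          // solution is written straight into voltage
```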
---
 arbor/backends/gpu/matrix_state_fine.hpp  | 46 ++++++++++-------------
 arbor/backends/gpu/matrix_state_flat.hpp  | 10 ++---
 arbor/backends/multicore/matrix_state.hpp |  8 ++++
 arbor/fvm_lowered_cell_impl.hpp           |  5 +--
 arbor/matrix.hpp                          | 11 ++----
 test/unit/test_fvm_lowered.cpp            |  3 +-
 test/unit/test_matrix.cpp                 | 31 ++++++++-------
 test/unit/test_matrix.cu                  | 11 ++++--
 test/unit/test_matrix_cpuvsgpu.cpp        | 11 +++---
 9 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/arbor/backends/gpu/matrix_state_fine.hpp b/arbor/backends/gpu/matrix_state_fine.hpp
index f2fa08db..3b393a85 100644
--- a/arbor/backends/gpu/matrix_state_fine.hpp
+++ b/arbor/backends/gpu/matrix_state_fine.hpp
@@ -98,9 +98,6 @@ public:
     // Invariant part of the matrix diagonal
     array invariant_d;         // [μS]
 
-    // Solution in unpacked format
-    array solution_;
-
     // Maps cell to integration domain
     iarray cell_to_intdom;
 
@@ -418,7 +415,6 @@ public:
         // d, u, rhs        : packed
         // cv_capacitance   : flat
         // invariant_d      : flat
-        // solution_        : flat
         // cv_to_cell       : flat
         // area             : flat
 
@@ -446,8 +442,7 @@ public:
         // transform u_shuffled values into packed u vector.
         flat_to_packed(u_shuffled, u);
 
-        // the invariant part of d, cv_area and the solution are in flat form
-        solution_ = array(matrix_size, 0);
+        // the invariant part of d and cv_area are in flat form
         cv_area = memory::make_const_view(area);
 
         // the cv_capacitance can be copied directly because it is
@@ -489,43 +484,40 @@ public:
             size());
     }
 
-    void solve() {
-        solve_matrix_fine(
-            rhs.data(), d.data(), u.data(),
-            level_meta.data(), level_lengths.data(), level_parents.data(),
-            block_index.data(),
-            num_cells_in_block.data(),
-            data_partition.data(),
-            num_cells_in_block.size(), max_branches_per_level);
-
+    void solve(array& to) {
+        solve_matrix_fine(rhs.data(),
+                          d.data(),
+                          u.data(),
+                          level_meta.data(),
+                          level_lengths.data(),
+                          level_parents.data(),
+                          block_index.data(),
+                          num_cells_in_block.data(),
+                          data_partition.data(),
+                          num_cells_in_block.size(),
+                          max_branches_per_level);
         // unpermute the solution
-        packed_to_flat(rhs, solution_);
+        packed_to_flat(rhs, to);
     }
 
-    const_view solution() const {
-        return solution_;
+private:
+    std::size_t size() const {
+        return matrix_size;
     }
 
-    template <typename VFrom, typename VTo>
-    void flat_to_packed(const VFrom& from, VTo& to ) {
+    void flat_to_packed(const array& from, array& to ) {
         arb_assert(from.size()==matrix_size);
         arb_assert(to.size()==data_size);
 
         scatter(from.data(), to.data(), perm.data(), perm.size());
     }
 
-    template <typename VFrom, typename VTo>
-    void packed_to_flat(const VFrom& from, VTo& to ) {
+    void packed_to_flat(const array& from, array& to ) {
         arb_assert(from.size()==data_size);
         arb_assert(to.size()==matrix_size);
 
         gather(from.data(), to.data(), perm.data(), perm.size());
     }
-
-private:
-    std::size_t size() const {
-        return matrix_size;
-    }
 };
 
 } // namespace gpu
diff --git a/arbor/backends/gpu/matrix_state_flat.hpp b/arbor/backends/gpu/matrix_state_flat.hpp
index df1c8c84..c5a6d98c 100644
--- a/arbor/backends/gpu/matrix_state_flat.hpp
+++ b/arbor/backends/gpu/matrix_state_flat.hpp
@@ -115,9 +115,6 @@ struct matrix_state_flat {
     }
 
     // interface for exposing the solution to the outside world
-    const_view solution() const {
-        return memory::make_view(rhs);
-    }
 
     // Assemble the matrix
     // Afterwards the diagonal and RHS will have been set given dt, voltage and current.
@@ -132,10 +129,11 @@ struct matrix_state_flat {
             cv_to_cell.data(), dt_intdom.data(), cell_to_intdom.data(), size());
     }
 
-    void solve() {
+    void solve(array& to) {
         // perform solve on gpu
-        solve_matrix_flat(rhs.data(), d.data(), u.data(), parent_index.data(),
-                          cell_cv_divs.data(), num_matrices());
+        arb_assert(to.size() == rhs.size());
+        solve_matrix_flat(rhs.data(), d.data(), u.data(), parent_index.data(), cell_cv_divs.data(), num_matrices());
+        memory::copy(rhs, to);
     }
 
     std::size_t size() const {
diff --git a/arbor/backends/multicore/matrix_state.hpp b/arbor/backends/multicore/matrix_state.hpp
index 84e3f041..24a21c1b 100644
--- a/arbor/backends/multicore/matrix_state.hpp
+++ b/arbor/backends/multicore/matrix_state.hpp
@@ -3,6 +3,8 @@
 #include <util/partition.hpp>
 #include <util/span.hpp>
 
+#include <memory/memory.hpp>
+
 #include "multicore_common.hpp"
 
 namespace arb {
@@ -133,6 +135,12 @@ public:
         }
     }
 
+    template<typename VTo>
+    void solve(VTo& to) {
+        solve();
+        memory::copy(rhs, to);
+    }
+
 private:
 
     std::size_t size() const {
diff --git a/arbor/fvm_lowered_cell_impl.hpp b/arbor/fvm_lowered_cell_impl.hpp
index 173cad5e..81ba0068 100644
--- a/arbor/fvm_lowered_cell_impl.hpp
+++ b/arbor/fvm_lowered_cell_impl.hpp
@@ -276,8 +276,7 @@ fvm_integration_result fvm_lowered_cell_impl<Backend>::integrate(
         matrix_.assemble(state_->dt_intdom, state_->voltage, state_->current_density, state_->conductivity);
         PL();
         PE(advance_integrate_matrix_solve);
-        matrix_.solve();
-        memory::copy(matrix_.solution(), state_->voltage);
+        matrix_.solve(state_->voltage);
         PL();
 
         // Integrate mechanism state.
@@ -296,7 +295,7 @@ fvm_integration_result fvm_lowered_cell_impl<Backend>::integrate(
 
         PE(advance_integrate_threshold);
         threshold_watcher_.test();
-        memory::copy(state_->time_to, state_->time);
+        std::swap(state_->time_to, state_->time);
         PL();
 
         // Check for non-physical solutions:
diff --git a/arbor/matrix.hpp b/arbor/matrix.hpp
index 8609cb7c..3bc7a54b 100644
--- a/arbor/matrix.hpp
+++ b/arbor/matrix.hpp
@@ -61,9 +61,9 @@ public:
     /// the partition of the parent index over the cells
     const iarray& cell_index() const { return cell_index_; }
 
-    /// Solve the linear system.
-    void solve() {
-        state_.solve();
+    /// Solve the linear system into a given solution storage.
+    void solve(array& to) {
+        state_.solve(to);
     }
 
     /// Assemble the matrix for given dt
@@ -71,11 +71,6 @@ public:
         state_.assemble(dt_cell, voltage, current, conductivity);
     }
 
-    /// Get a view of the solution
-    typename State::const_view solution() const {
-        return state_.solution();
-    }
-
 private:
     /// the parent indice that describe matrix structure
     iarray parent_index_;
diff --git a/test/unit/test_fvm_lowered.cpp b/test/unit/test_fvm_lowered.cpp
index 09cfe347..4110db15 100644
--- a/test/unit/test_fvm_lowered.cpp
+++ b/test/unit/test_fvm_lowered.cpp
@@ -231,6 +231,7 @@ TEST(fvm_lowered, matrix_init)
     fvcell.initialize({0}, cable1d_recipe(cell), cell_to_intdom, targets, probe_map);
 
     auto& J = fvcell.*private_matrix_ptr;
+    auto& S = fvcell.*private_state_ptr;
     EXPECT_EQ(J.size(), 12u);
 
     // Test that the matrix is initialized with sensible values
@@ -242,7 +243,7 @@ TEST(fvm_lowered, matrix_init)
 
     EXPECT_FALSE(util::any_of(util::subrange_view(mat.u, 1, n), isnan));
     EXPECT_FALSE(util::any_of(mat.d, isnan));
-    EXPECT_FALSE(util::any_of(J.solution(), isnan));
+    EXPECT_FALSE(util::any_of(S->voltage, isnan));
 
     EXPECT_FALSE(util::any_of(util::subrange_view(mat.u, 1, n), ispos));
     EXPECT_FALSE(util::any_of(mat.d, isneg));
diff --git a/test/unit/test_matrix.cpp b/test/unit/test_matrix.cpp
index 6ead914f..e3b17d8e 100644
--- a/test/unit/test_matrix.cpp
+++ b/test/unit/test_matrix.cpp
@@ -14,9 +14,11 @@
 
 using namespace arb;
 
-using matrix_type = matrix<arb::multicore::backend>;
-using index_type = matrix_type::index_type;
-using value_type = matrix_type::value_type;
+using backend     = multicore::backend;
+using array       = backend::array;
+using matrix_type = matrix<backend>;
+using index_type  = matrix_type::index_type;
+using value_type  = matrix_type::value_type;
 
 using vvec = std::vector<value_type>;
 
@@ -47,9 +49,10 @@ TEST(matrix, solve_host)
         fill(state.u, -1);
         fill(state.rhs,1);
 
-        m.solve();
+        auto x = array({0});
+        m.solve(x);
 
-        EXPECT_EQ(m.solution()[0], 0.5);
+        EXPECT_EQ(x[0], 0.5);
     }
 
     // matrices in the range of 2x2 to 1000x1000
@@ -68,9 +71,11 @@ TEST(matrix, solve_host)
             fill(A.u, -1);
             fill(A.rhs,1);
 
-            m.solve();
 
-            auto x = m.solution();
+            auto x = array();
+            x.resize(n);
+            m.solve(x);
+
             auto err = math::square(std::fabs(2.*x[0] - x[1] - 1.));
             for(auto i : make_span(1,n-1)) {
                 err += math::square(std::fabs(2.*x[i] - x[i-1] - x[i+1] - 1.));
@@ -108,8 +113,8 @@ TEST(matrix, zero_diagonal)
     // Expected solution:
     std::vector<value_type> expected = {4, 5, 6, 7, 8, 9, 10};
 
-    m.solve();
-    auto x = m.solution();
+    auto x = array({0, 0, 0, 0, 0, 0, 0});
+    m.solve(x);
 
     EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
 }
@@ -159,12 +164,11 @@ TEST(matrix, zero_diagonal_assembled)
 
     matrix_type m(p, c, Cm, g, area, s);
     m.assemble(dt, v, i, mg);
-    m.solve();
 
-    vvec x;
-    assign(x, m.solution());
+    auto x = array({0, 0, 0, 0, 0, 0, 0});
     std::vector<value_type> expected = {4, 5, 6, 7, 8, 9, 10};
 
+    m.solve(x);
     EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
 
     // Set dt of 2nd (middle) submatrix to zero. Solution
@@ -174,9 +178,8 @@ TEST(matrix, zero_diagonal_assembled)
     v[3] = -20;
     v[4] = -30;
     m.assemble(dt, v, i, mg);
-    m.solve();
+    m.solve(x);
 
-    assign(x, m.solution());
     expected = {4, 5, 6, -20, -30, 9, 10};
 
     EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
diff --git a/test/unit/test_matrix.cu b/test/unit/test_matrix.cu
index b46e8315..66499801 100644
--- a/test/unit/test_matrix.cu
+++ b/test/unit/test_matrix.cu
@@ -134,19 +134,22 @@ TEST(matrix, backends)
     auto gpu_i = on_gpu(i);
     auto gpu_mg = on_gpu(mg);
 
+    auto x_flat_d = gpu_array(group_size);
+    auto x_fine_d = gpu_array(group_size);
+
     flat.assemble(gpu_dt, gpu_v, gpu_i, gpu_mg);
     fine.assemble(gpu_dt, gpu_v, gpu_i, gpu_mg);
 
-    flat.solve();
-    fine.solve();
+    flat.solve(x_flat_d);
+    fine.solve(x_fine_d);
 
     // Compare the results.
     // We expect exact equality for the two gpu matrix implementations because both
     // perform the same operations in the same order on the same inputs.
-    std::vector<double> x_flat = assign_from(on_host(flat.solution()));
+    auto x_flat = on_host(x_flat_d);
     // as the fine algorithm contains atomics the solution might be slightly
     // different from flat and interleaved
-    std::vector<double> x_fine = assign_from(on_host(fine.solution()));
+    auto x_fine = on_host(x_fine_d);
 
     auto max_diff_fine =
         util::max_value(
diff --git a/test/unit/test_matrix_cpuvsgpu.cpp b/test/unit/test_matrix_cpuvsgpu.cpp
index f89bb900..07601e1a 100644
--- a/test/unit/test_matrix_cpuvsgpu.cpp
+++ b/test/unit/test_matrix_cpuvsgpu.cpp
@@ -124,15 +124,16 @@ TEST(matrix, assemble)
     std::generate(dt.begin(), dt.end(), [&](){return dt_dist(gen);});
 
     // Voltage, current, and conductance values
+    auto result_h = host_array(group_size);
+    auto x_d = gpu_array(group_size);
     m_mc.assemble(host_array(dt.begin(), dt.end()), host_array(group_size, -64), host_array(group_size, 10), host_array(group_size, 3));
-    m_mc.solve();
+    m_mc.solve(result_h);
     m_gpu.assemble(on_gpu(dt), gpu_array(group_size, -64), gpu_array(group_size, 10), gpu_array(group_size, 3));
-    m_gpu.solve();
-
+    m_gpu.solve(x_d);
+    auto result_g = on_host(x_d);
+
     // Compare the GPU and CPU results.
     // Cast result to float, because we are happy to ignore small differencs
-    std::vector<float> result_h = util::assign_from(m_mc.solution());
-    std::vector<float> result_g = util::assign_from(on_host(m_gpu.solution()));
     EXPECT_TRUE(seq_almost_eq<float>(result_h, result_g));
 }
 
-- 
GitLab