From d9839df01899456e8b2a09dd313aa66e4b9ab2ed Mon Sep 17 00:00:00 2001
From: Ben Cumming <louncharf@gmail.com>
Date: Wed, 8 Mar 2017 12:03:08 +0100
Subject: [PATCH] remove small cudaMemcpys (#175)

This patch removes the many small `cudaMemcpy` calls for single values, except for those from calling `net_receive` in event delivery.

The small copies during initialization arose because the upper diagonal and the time-invariant component of the diagonal were computed on the host. This resulted in many small reads/writes to device memory when accessing the `p` and `u` vectors.

* Remove many small device copies in matrix setup by copying the required data to the host, computing there, and then copying the result back to the device in a single copy.
* Add a `constexpr` function `is_debug_mode()` that reports whether the code was compiled in debug mode (i.e. whether `NDEBUG` is not defined).
* Only perform `is_physical_solution` test if `is_debug_mode()` is true. (The `is_physical_solution` test triggers a single copy from device to host on each time step to test whether the voltage has exceeded some "reasonable" physical bounds.)
---
 src/backends/fvm_gpu.hpp | 7 +++++--
 src/cell_group.hpp       | 2 +-
 src/util/debug.hpp       | 7 +++++++
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/backends/fvm_gpu.hpp b/src/backends/fvm_gpu.hpp
index 5e3e7a14..9933dd77 100644
--- a/src/backends/fvm_gpu.hpp
+++ b/src/backends/fvm_gpu.hpp
@@ -118,17 +118,20 @@ struct backend {
         {
             auto n = d.size();
             host_array invariant_d_tmp(n, 0);
+            host_array u_tmp(n, 0);
 
             // make a copy of the conductance on the host
             host_array face_conductance_tmp = face_conductance;
+            auto p_tmp = memory::on_host(p);
             for(auto i: util::make_span(1u, n)) {
                 auto gij = face_conductance_tmp[i];
 
-                u[i] = -gij;
+                u_tmp[i] = -gij;
                 invariant_d_tmp[i] += gij;
-                invariant_d_tmp[p[i]] += gij;
+                invariant_d_tmp[p_tmp[i]] += gij;
             }
             invariant_d = invariant_d_tmp;
+            memory::copy(u_tmp, u);
 
             params = {
                 d.data(), u.data(), rhs.data(),
diff --git a/src/cell_group.hpp b/src/cell_group.hpp
index 807238a4..ef4c71b2 100644
--- a/src/cell_group.hpp
+++ b/src/cell_group.hpp
@@ -106,7 +106,7 @@ public:
             time_type tnext = next ? next->time: tstep;
             cell_.advance(tnext - cell_.time());
 
-            if (!cell_.is_physical_solution()) {
+            if (util::is_debug_mode() && !cell_.is_physical_solution()) {
                 std::cerr << "warning: solution out of bounds for cell "
                           << gid_base_ << " at t " << cell_.time() << " ms\n";
             }
diff --git a/src/util/debug.hpp b/src/util/debug.hpp
index c2945e29..3c3eaa85 100644
--- a/src/util/debug.hpp
+++ b/src/util/debug.hpp
@@ -11,6 +11,13 @@ namespace nest {
 namespace mc {
 namespace util {
 
+constexpr inline bool is_debug_mode() { // Compile-time flag: true iff NDEBUG is not defined where this is compiled.
+#ifndef NDEBUG
+    return true;  // NDEBUG not defined: report debug mode
+#else
+    return false; // NDEBUG defined: report non-debug mode
+#endif
+}
 using failed_assertion_handler_t =
     bool (*)(const char* assertion, const char* file, int line, const char* func);
 
-- 
GitLab