From 180a7ace75d338dec2794e7058e5447158c99758 Mon Sep 17 00:00:00 2001
From: Ben Cumming <louncharf@gmail.com>
Date: Wed, 20 Sep 2017 15:17:46 +0200
Subject: [PATCH] Stand alone CUDA compilation for threshold_watcher in gpu
 backend (#345)

Refactor the threshold_watcher and stack data structures in the gpu backend so that they are amenable to separate CUDA compilation.

* Make `gpu::stack<T>` a host-only interface that wraps a POD type `gpu::stack_storage<T>` (see the sketch below).
* Implement a `push_back(stack_storage, value)` free function in `backends/gpu/kernels/stack.hpp` that is visible only to device code.
* Move the `test_thresholds` kernel to a .cu file, replacing its template parameters with the concrete types provided by `backends/fvm_types.hpp`.
* Add a plain function interface, callable from host-side code, declared in `backends/gpu/kernels/test_thresholds.hpp`.
* Simplify the `gpu::impl::padded_size` function, making it both easier to read and more efficient.
* Use `typeid` as the default for pretty-printing types in the memory backend.
* Update the `test_gpu_stack` unit test to use the new gpu stack interface.
* Fix a bug in the `test_spikes` unit test, which was not exercising the GPU backend in the CUDA unit tests.
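
The split follows the usual pattern for separate CUDA compilation: a POD struct shared by host and device code, device-only functions in headers included only from .cu files, and a plain host-callable function that hides the kernel launch. Below is a minimal self-contained sketch of that pattern, not the patch sources: the names `storage`, `fill`, and `run`, and the sizes, are illustrative only.

    #include <cstdio>
    #include <cuda_runtime.h>

    // POD storage, visible to both host and device code
    // (cf. gpu::stack_storage<T> in stack_common.hpp).
    struct storage {
        unsigned capacity;  // allocated slots
        unsigned stores;    // attempted pushes; may exceed capacity
        int* data;          // buffer in managed memory
    };

    // Device-only push, included from .cu files only
    // (cf. push_back in kernels/stack.hpp).
    __device__ void push_back(storage& s, int value) {
        unsigned pos = atomicAdd(&s.stores, 1u); // claim a slot
        if (pos<s.capacity) {
            s.data[pos] = value;                 // overflowing values are dropped
        }
    }

    __global__ void fill(storage& s) {
        push_back(s, int(threadIdx.x));
    }

    // Host-callable wrapper: the only place that needs nvcc's <<<...>>>
    // launch syntax (cf. test_thresholds in test_thresholds.cu).
    void run(storage& s, unsigned n) {
        fill<<<1, n>>>(s);
        cudaDeviceSynchronize();
    }

    int main() {
        storage* s;
        cudaMallocManaged(&s, sizeof(storage));  // host- and device-visible
        s->capacity = 8;
        s->stores = 0;
        cudaMallocManaged(&s->data, 8*sizeof(int));
        run(*s, 16);                             // 16 pushes into 8 slots
        std::printf("stores=%u capacity=%u\n", s->stores, s->capacity);
        cudaFree(s->data);
        cudaFree(s);
    }

With this layout, host code that includes only `stack.hpp` and `test_thresholds.hpp` compiles with a plain C++ compiler; only the .cu files need nvcc.
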
---
 CMakeLists.txt                               |   4 +-
 src/CMakeLists.txt                           |   1 +
 src/backends/gpu/fvm.hpp                     |   3 +-
 src/backends/gpu/kernels/detail.hpp          |   4 +-
 src/backends/gpu/kernels/stack.hpp           |  31 ++++++
 src/backends/gpu/kernels/test_thresholds.cu  |  79 ++++++++++++++
 src/backends/gpu/kernels/test_thresholds.hpp |  62 ++---------
 src/backends/gpu/stack.hpp                   | 105 ++++++++++---------
 src/backends/gpu/stack_common.hpp            |  42 ++++++++
 src/backends/gpu/threshold_watcher.hpp       |  45 ++++----
 src/communication/communicator.hpp           |   9 +-
 src/memory/allocator.hpp                     |  29 +++--
 src/memory/definitions.hpp                   |   3 +-
 src/util/compat.hpp                          |  11 +-
 tests/unit/test_gpu_stack.cu                 |  65 +++++++++---
 tests/unit/test_spikes.cpp                   |  11 +-
 16 files changed, 332 insertions(+), 172 deletions(-)
 create mode 100644 src/backends/gpu/kernels/stack.hpp
 create mode 100644 src/backends/gpu/kernels/test_thresholds.cu
 create mode 100644 src/backends/gpu/stack_common.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4220519f..57d4bdfe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,9 +7,9 @@ enable_language(CXX)
 # Hide warnings about mixing old and new signatures for target_link_libraries.
 # These can't be avoided, because the FindCUDA package provided by CMake before
 # version 3.9 uses the old signature, while other packages use the new signature.
-if ("${CMAKE_VERSION}" MATCHES "^3.[0-8].")
+#if ("${CMAKE_VERSION}" MATCHES "^3.[0-8].")
     cmake_policy(SET CMP0023 OLD)
-endif()
+#endif()
 
 # save incoming CXX flags for forwarding to modcc external project
 set(SAVED_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4a0c5ac1..db49ff64 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -28,6 +28,7 @@ set(CUDA_SOURCES
     backends/gpu/fvm.cu
     backends/gpu/multi_event_stream.cu
     backends/gpu/fill.cu
+    backends/gpu/kernels/test_thresholds.cu
     backends/gpu/kernels/take_samples.cu
 )
 
diff --git a/src/backends/gpu/fvm.hpp b/src/backends/gpu/fvm.hpp
index 7002c7ae..35be89af 100644
--- a/src/backends/gpu/fvm.hpp
+++ b/src/backends/gpu/fvm.hpp
@@ -92,8 +92,7 @@ struct backend {
         return mech_map_.count(name)>0;
     }
 
-    using threshold_watcher =
-        nest::mc::gpu::threshold_watcher<value_type, size_type>;
+    using threshold_watcher = nest::mc::gpu::threshold_watcher;
 
     // perform min/max reductions on 'array' type
     template <typename V>
diff --git a/src/backends/gpu/kernels/detail.hpp b/src/backends/gpu/kernels/detail.hpp
index e33e933b..d2c1be1f 100644
--- a/src/backends/gpu/kernels/detail.hpp
+++ b/src/backends/gpu/kernels/detail.hpp
@@ -48,8 +48,8 @@ constexpr inline unsigned block_count(unsigned n, unsigned block_size) {
 
 // The smallest size of a buffer required to store n items, such that the
 // buffer size is a multiple of block_dim.
-constexpr inline unsigned padded_size (unsigned n, unsigned block_dim) {
-    return n%block_dim ? n+block_dim-(n%block_dim): n;
+constexpr inline unsigned padded_size(unsigned n, unsigned block_dim) {
+    return block_dim*block_count(n, block_dim);
 }
 
 // Placeholders to use to mark padded locations in data structures that use
diff --git a/src/backends/gpu/kernels/stack.hpp b/src/backends/gpu/kernels/stack.hpp
new file mode 100644
index 00000000..d303144b
--- /dev/null
+++ b/src/backends/gpu/kernels/stack.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "../stack_common.hpp"
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+template <typename T>
+__device__
+void push_back(stack_storage<T>& s, const T& value) {
+    // Atomically increment the stores counter. The atomicAdd returns
+    // the value of stores before the increment, which is the location
+    // at which this thread can store value.
+    unsigned position = atomicAdd(&(s.stores), 1u);
+
+    // It is possible that stores>capacity. In this case, only capacity
+    // entries are stored, and additional values are lost. The stores
+    // field records the total number of attempted pushes.
+    if (position<s.capacity) {
+        s.data[position] = value;
+    }
+
+    // Note: there are no guards against s.stores overflowing; if it
+    // wraps around, the counter restarts from 0 and values are
+    // overwritten from the front of the stack.
+}
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/kernels/test_thresholds.cu b/src/backends/gpu/kernels/test_thresholds.cu
new file mode 100644
index 00000000..946a0e29
--- /dev/null
+++ b/src/backends/gpu/kernels/test_thresholds.cu
@@ -0,0 +1,79 @@
+#include <backends/fvm_types.hpp>
+
+#include "detail.hpp"
+#include "stack.hpp"
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+/// Kernel that tests watched values for threshold crossings.
+/// params:
+///     cv_to_cell  : map from CV index to cell index
+///     t_after     : per-cell time at the end of the step (ms)
+///     t_before    : per-cell time at the start of the step (ms)
+///     size        : number of values to test
+///     stack       : stack on which detected crossings are pushed
+///     is_crossed  : crossing state at t_before (true or false)
+///     prev_values : watched values sampled at t_before
+///     cv_index, values, thresholds : locations, current values, and thresholds to test
+__global__
+void test_thresholds_kernel(
+    const fvm_size_type* cv_to_cell, const fvm_value_type* t_after, const fvm_value_type* t_before,
+    int size,
+    stack_storage<threshold_crossing>& stack,
+    fvm_size_type* is_crossed, fvm_value_type* prev_values,
+    const fvm_size_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds)
+{
+    int i = threadIdx.x + blockIdx.x*blockDim.x;
+
+    bool crossed = false;
+    float crossing_time;
+
+    if (i<size) {
+        // Test for threshold crossing
+        const auto cv     = cv_index[i];
+        const auto cell   = cv_to_cell[cv];
+        const auto v_prev = prev_values[i];
+        const auto v      = values[cv];
+        const auto thresh = thresholds[i];
+
+        if (!is_crossed[i]) {
+            if (v>=thresh) {
+                // The threshold has been passed, so estimate the time using
+                // linear interpolation
+                auto pos = (thresh - v_prev)/(v - v_prev);
+                crossing_time = impl::lerp(t_before[cell], t_after[cell], pos);
+
+                is_crossed[i] = 1;
+                crossed = true;
+            }
+        }
+        else if (v<thresh) {
+            is_crossed[i]=0;
+        }
+
+        prev_values[i] = v;
+    }
+
+    if (crossed) {
+        push_back(stack, {fvm_size_type(i), crossing_time});
+    }
+}
+
+void test_thresholds(
+    const fvm_size_type* cv_to_cell, const fvm_value_type* t_after, const fvm_value_type* t_before,
+    int size,
+    stack_storage<threshold_crossing>& stack,
+    fvm_size_type* is_crossed, fvm_value_type* prev_values,
+    const fvm_size_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds)
+{
+    constexpr int block_dim = 128;
+    const int grid_dim = impl::block_count(size, block_dim);
+    test_thresholds_kernel<<<grid_dim, block_dim>>>(
+        cv_to_cell, t_after, t_before, size, stack, is_crossed, prev_values, cv_index, values, thresholds);
+}
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/kernels/test_thresholds.hpp b/src/backends/gpu/kernels/test_thresholds.hpp
index 4c23d13d..c7b8526c 100644
--- a/src/backends/gpu/kernels/test_thresholds.hpp
+++ b/src/backends/gpu/kernels/test_thresholds.hpp
@@ -1,63 +1,19 @@
 #pragma once
 
+#include <backends/fvm_types.hpp>
+
+#include "stack.hpp"
+
 namespace nest {
 namespace mc {
 namespace gpu {
 
-/// kernel used to test for threshold crossing test code.
-/// params:
-///     t       : current time (ms)
-///     t_prev  : time of last test (ms)
-///     size    : number of values to test
-///     is_crossed  : crossing state at time t_prev (true or false)
-///     prev_values : values at sample points (see index) sampled at t_prev
-///     index      : index with locations in values to test for crossing
-///     values     : values at t_prev
-///     thresholds : threshold values to watch for crossings
-template <typename T, typename I, typename Stack>
-__global__
-void test_thresholds(
-    const I* cv_to_cell, const T* t_after, const T* t_before,
+extern void test_thresholds(
+    const fvm_size_type* cv_to_cell, const fvm_value_type* t_after, const fvm_value_type* t_before,
     int size,
-    Stack& stack,
-    I* is_crossed, T* prev_values,
-    const I* cv_index, const T* values, const T* thresholds)
-{
-    int i = threadIdx.x + blockIdx.x*blockDim.x;
-
-    bool crossed = false;
-    float crossing_time;
-
-    if (i<size) {
-        // Test for threshold crossing
-        const auto cv     = cv_index[i];
-        const auto cell   = cv_to_cell[cv];
-        const auto v_prev = prev_values[i];
-        const auto v      = values[cv];
-        const auto thresh = thresholds[i];
-
-        if (!is_crossed[i]) {
-            if (v>=thresh) {
-                // The threshold has been passed, so estimate the time using
-                // linear interpolation
-                auto pos = (thresh - v_prev)/(v - v_prev);
-                crossing_time = impl::lerp(t_before[cell], t_after[cell], pos);
-
-                is_crossed[i] = 1;
-                crossed = true;
-            }
-        }
-        else if (v<thresh) {
-            is_crossed[i]=0;
-        }
-
-        prev_values[i] = v;
-    }
-
-    if (crossed) {
-        stack.push_back({I(i), crossing_time});
-    }
-}
+    stack_storage<threshold_crossing>& stack,
+    fvm_size_type* is_crossed, fvm_value_type* prev_values,
+    const fvm_size_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds);
 
 } // namespace gpu
 } // namespace mc
diff --git a/src/backends/gpu/stack.hpp b/src/backends/gpu/stack.hpp
index 2cad1d10..ce00d9bc 100644
--- a/src/backends/gpu/stack.hpp
+++ b/src/backends/gpu/stack.hpp
@@ -1,6 +1,9 @@
 #pragma once
 
+#include <algorithm>
+
 #include <memory/allocator.hpp>
+#include "stack_common.hpp"
 
 namespace nest {
 namespace mc {
@@ -22,91 +25,95 @@ namespace gpu {
 template <typename T>
 class stack {
     using value_type = T;
-    using allocator = memory::managed_allocator<value_type>;
-
-    // The number of items of type value_type that can be stored in the stack
-    unsigned capacity_;
-
-    // The number of items that have been stored
-    unsigned size_;
-
-    // Memory containing the value buffer
-    // Stored in managed memory to facilitate host-side access of values
-    // pushed from kernels on the device.
-    value_type* data_;
+    template <typename U>
+    using allocator = memory::managed_allocator<U>;
+
+    using storage_type = stack_storage<value_type>;
+    storage_type* storage_;
+
+    storage_type* create_storage(unsigned n) {
+        auto p = allocator<storage_type>().allocate(1);
+        p->capacity = n;
+        p->stores = 0;
+        p->data = allocator<value_type>().allocate(n);
+        return p;
+    }
 
 public:
+    stack& operator=(const stack& other) = delete;
+    stack(const stack& other) = delete;
+
+    stack(): storage_(create_storage(0)) {}
 
-    stack(unsigned capacity):
-        capacity_(capacity), size_(0u)
-    {
-        data_ = allocator().allocate(capacity_);
+    stack(stack&& other): storage_(create_storage(0)) {
+        std::swap(storage_, other.storage_);
     }
 
-    ~stack() {
-        allocator().deallocate(data_, capacity_);
+    stack& operator=(stack&& other) {
+        std::swap(storage_, other.storage_);
+        return *this;
     }
 
-    // Append a new value to the stack.
-    // The value will only be appended if do_push is true.
-    __device__
-    void push_back(const value_type& value) {
-        // Atomically increment the size_ counter. The atomicAdd returns
-        // the value of size_ before the increment, which is the location
-        // at which this thread can store value.
-        unsigned position = atomicAdd(&size_, 1u);
+    explicit stack(unsigned capacity): storage_(create_storage(capacity)) {}
 
-        // It is possible that size_>capacity_. In this case, only capacity_
-        // entries are stored, and additional values are lost. The size_
-        // will contain the total number of attempts to push,
-        if (position<capacity_) {
-            data_[position] = value;
-        }
+    ~stack() {
+        allocator<value_type>().deallocate(storage_->data, storage_->capacity);
+        allocator<storage_type>().deallocate(storage_, 1);
     }
 
-    __host__
     void clear() {
-        size_ = 0;
+        storage_->stores = 0u;
     }
 
     // The number of items that have been pushed back on the stack.
-    // size may exceed capacity, which indicates that the caller attempted
+    // This may exceed capacity, which indicates that the caller attempted
     // to push back more values than there was space to store.
-    __host__ __device__
+    unsigned pushes() const {
+        return storage_->stores;
+    }
+
+    bool overflow() const {
+        return storage_->stores>capacity();
+    }
+
+    // The number of values stored in the stack.
     unsigned size() const {
-        return size_;
+        return std::min(storage_->stores, storage_->capacity);
     }
 
     // The maximum number of items that can be stored in the stack.
-    __host__ __device__
     unsigned capacity() const {
-        return capacity_;
+        return storage_->capacity;
+    }
+
+    storage_type& storage() {
+        return *storage_;
     }
 
     value_type& operator[](unsigned i) {
-        EXPECTS(i<size_ && i<capacity_);
-        return data_[i];
+        EXPECTS(i<size());
+        return storage_->data[i];
     }
 
     value_type& operator[](unsigned i) const {
-        EXPECTS(i<size_ && i<capacity_);
-        return data_[i];
+        EXPECTS(i<size());
+        return storage_->data[i];
     }
 
     value_type* begin() {
-        return data_;
+        return storage_->data;
     }
     const value_type* begin() const {
-        return data_;
+        return storage_->data;
     }
 
     value_type* end() {
-        // Take care of the case where size_>capacity_.
-        return data_ + (size_>capacity_? capacity_: size_);
+        // size() accounts for the case where stores>capacity.
+        return storage_->data + size();
     }
     const value_type* end() const {
-        // Take care of the case where size_>capacity_.
-        return data_ + (size_>capacity_? capacity_: size_);
+        // size() accounts for the case where stores>capacity.
+        return storage_->data + size();
     }
 };
 
diff --git a/src/backends/gpu/stack_common.hpp b/src/backends/gpu/stack_common.hpp
new file mode 100644
index 00000000..74730079
--- /dev/null
+++ b/src/backends/gpu/stack_common.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <backends/fvm_types.hpp>
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+// stores a single crossing event
+struct threshold_crossing {
+    fvm_size_type index;    // index of variable
+    fvm_value_type time;    // time of crossing
+
+    friend bool operator==(threshold_crossing l, threshold_crossing r) {
+        return l.index==r.index && l.time==r.time;
+    }
+};
+
+// Concrete storage of gpu stack datatype.
+// The stack datatype resides in host memory, and holds a pointer to the
+// stack_storage in managed memory, which can be accessed by both host and
+// gpu code.
+template <typename T>
+struct stack_storage {
+    using value_type = T;
+
+    // The number of items of type value_type that can be stored in the stack
+    unsigned capacity;
+
+    // The number of items that have been stored.
+    // This may exceed capacity if more stores were attempted than it is
+    // possible to store, in which case only the first capacity values are valid.
+    unsigned stores;
+
+    // Memory containing the value buffer
+    value_type* data;
+};
+
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp
index 39a2a9e8..b63ec72e 100644
--- a/src/backends/gpu/threshold_watcher.hpp
+++ b/src/backends/gpu/threshold_watcher.hpp
@@ -6,6 +6,7 @@
 
 #include "managed_ptr.hpp"
 #include "stack.hpp"
+#include "backends/fvm_types.hpp"
 #include "kernels/test_thresholds.hpp"
 
 namespace nest {
@@ -14,27 +15,23 @@ namespace gpu {
 
 /// threshold crossing logic
 /// used as part of spike detection back end
-template <typename T, typename I>
 class threshold_watcher {
 public:
-    using value_type = T;
-    using size_type = I;
+    using value_type = fvm_value_type;
+    using size_type = fvm_size_type;
 
-    using array = memory::device_vector<T>;
-    using iarray = memory::device_vector<I>;
+    using array = memory::device_vector<value_type>;
+    using iarray = memory::device_vector<size_type>;
     using const_view = typename array::const_view_type;
     using const_iview = typename iarray::const_view_type;
 
-    /// stores a single crossing event
-    struct threshold_crossing {
-        size_type index;    // index of variable
-        value_type time;    // time of crossing
-    };
-
     using stack_type = stack<threshold_crossing>;
 
     threshold_watcher() = default;
 
+    threshold_watcher(threshold_watcher&& other) = default;
+    threshold_watcher& operator=(threshold_watcher&& other) = default;
+
     threshold_watcher(
             const_iview vec_ci,
             const_view vec_t_before,
@@ -51,7 +48,9 @@ public:
         thresholds_(memory::make_const_view(thresh)),
         prev_values_(values),
         is_crossed_(size()),
-        stack_(make_managed_ptr<stack_type>(10*size()))
+        // TODO: allocates enough space for 10 spikes per watch.
+        // A more robust approach might be needed to avoid overflows.
+        stack_(10*size())
     {
         reset();
     }
@@ -59,7 +58,7 @@ public:
     /// Remove all stored crossings that were detected in previous calls
     /// to test()
     void clear_crossings() {
-        stack_->clear();
+        stack_.clear();
     }
 
     /// Reset state machine for each detector.
@@ -89,7 +88,10 @@ public:
     }
 
     const std::vector<threshold_crossing> crossings() const {
-        return std::vector<threshold_crossing>(stack_->begin(), stack_->end());
+        if (stack_.overflow()) {
+            throw std::runtime_error("GPU spike buffer overflow.");
+        }
+        return std::vector<threshold_crossing>(stack_.begin(), stack_.end());
     }
 
     /// Tests each target for changed threshold state.
@@ -97,18 +99,17 @@ public:
     /// crossed since current time t, and the last time the test was
     /// performed.
     void test() {
-        constexpr int block_dim = 128;
-        const int grid_dim = (size()+block_dim-1)/block_dim;
-        test_thresholds<<<grid_dim, block_dim>>>(
+        test_thresholds(
             cv_to_cell_.data(), t_after_.data(), t_before_.data(),
             size(),
-            *stack_,
+            stack_.storage(),
             is_crossed_.data(), prev_values_.data(),
             cv_index_.data(), values_.data(), thresholds_.data());
 
-        // Check that the number of spikes has not exceeded
-        // the capacity of the stack.
-        EXPECTS(stack_->size() <= stack_->capacity());
+        // Check that the number of spikes has not exceeded capacity.
+        // ATTENTION: requires cudaDeviceSynchronize to avoid simultaneous
+        // host-device managed memory access.
+        EXPECTS((cudaDeviceSynchronize(), !stack_.overflow()));
     }
 
     /// the number of threshold values that are being monitored
@@ -131,7 +132,7 @@ private:
     array prev_values_;         // values at previous sample time: on gpu
     iarray is_crossed_;         // bool flag for state of each watch: on gpu
 
-    managed_ptr<stack_type> stack_;
+    stack_type stack_;
 };
 
 } // namespace gpu
diff --git a/src/communication/communicator.hpp b/src/communication/communicator.hpp
index e6a50739..8a974d85 100644
--- a/src/communication/communicator.hpp
+++ b/src/communication/communicator.hpp
@@ -1,15 +1,16 @@
 #pragma once
 
 #include <algorithm>
+#include <functional>
 #include <iostream>
-#include <vector>
 #include <random>
-#include <functional>
+#include <utility>
+#include <vector>
 
 #include <algorithms.hpp>
 #include <common_types.hpp>
-#include <connection.hpp>
 #include <communication/gathered_vector.hpp>
+#include <connection.hpp>
 #include <domain_decomposition.hpp>
 #include <event_queue.hpp>
 #include <recipe.hpp>
@@ -55,7 +56,7 @@ public:
 
         // For caching information about each cell
         struct gid_info {
-            using connection_list = decltype(rec.connections_on(0));
+            using connection_list = decltype(std::declval<recipe>().connections_on(0));
             cell_gid_type gid;
             cell_gid_type local_group;
             connection_list conns;
diff --git a/src/memory/allocator.hpp b/src/memory/allocator.hpp
index 9add1a67..3f77c08f 100644
--- a/src/memory/allocator.hpp
+++ b/src/memory/allocator.hpp
@@ -148,7 +148,7 @@ namespace impl {
                 void* ptr = reinterpret_cast<void *>
                                 (aligned_malloc<char, Alignment>(size));
 
-                if(ptr == nullptr) {
+                if (!ptr) {
                     return nullptr;
                 }
 
@@ -166,7 +166,7 @@ namespace impl {
             }
 
             void free_policy(void *ptr) {
-                if(ptr == nullptr) {
+                if (!ptr) {
                     return;
                 }
                 cudaHostUnregister(ptr);
@@ -189,9 +189,12 @@ namespace impl {
             static_assert(1024%Alignment==0, "CUDA managed memory is always aligned on 1024 byte boundaries");
 
             void* allocate_policy(std::size_t n) {
+                if (!n) {
+                    return nullptr;
+                }
                 void* ptr;
                 auto status = cudaMallocManaged(&ptr, n);
-                if(status != cudaSuccess) {
+                if (status != cudaSuccess) {
                     LOG_ERROR("memory:: unable to allocate managed memory");
                     ptr = nullptr;
                 }
@@ -208,7 +211,9 @@ namespace impl {
             }
 
             void free_policy(void* p) {
-                cudaFree(p);
+                if (p) {
+                    cudaFree(p);
+                }
             }
         };
 
@@ -278,11 +283,14 @@ public:
     }
 
     pointer allocate(size_type cnt, typename std::allocator<void>::const_pointer = 0) {
-        return reinterpret_cast<T*>(allocate_policy(cnt*sizeof(T)));
+        if (cnt) {
+            return reinterpret_cast<T*>(allocate_policy(cnt*sizeof(T)));
+        }
+        return nullptr;
     }
 
-    void deallocate(pointer p, size_type) {
-        if( p!=nullptr ) {
+    void deallocate(pointer p, size_type cnt) {
+        if (p) {
             free_policy(p);
         }
     }
@@ -334,6 +342,13 @@ namespace util {
             return std::string("device_policy");
         }
     };
+
+    template <>
+    struct type_printer<impl::cuda::managed_policy<>>{
+        static std::string print() {
+            return std::string("managed_policy");
+        }
+    };
 #endif
 
     template <typename T, typename Policy>
diff --git a/src/memory/definitions.hpp b/src/memory/definitions.hpp
index 6254315d..3b0b3277 100644
--- a/src/memory/definitions.hpp
+++ b/src/memory/definitions.hpp
@@ -2,6 +2,7 @@
 
 #include <cstddef>
 #include <sstream>
+#include <typeinfo>
 
 namespace nest {
 namespace mc {
@@ -67,7 +68,7 @@ namespace util {
     template <typename T>
     struct type_printer{
         static std::string print() {
-            return std::string("T");
+            return typeid(T).name();
         }
     };
 
diff --git a/src/util/compat.hpp b/src/util/compat.hpp
index 85d54ad2..977bce12 100644
--- a/src/util/compat.hpp
+++ b/src/util/compat.hpp
@@ -7,8 +7,7 @@
 
 namespace compat {
 
-template<int major=0, int minor=0, int patchlevel=0>
-constexpr bool using_intel_compiler() {
+constexpr bool using_intel_compiler(int major=0, int minor=0, int patchlevel=0) {
 #if defined(__INTEL_COMPILER)
     return __INTEL_COMPILER >= major*100 + minor &&
            __INTEL_COMPILER_UPDATE >= patchlevel;
@@ -17,12 +16,10 @@ constexpr bool using_intel_compiler() {
 #endif
 }
 
-template<int major=0, int minor=0, int patchlevel=0>
-constexpr bool using_gnu_compiler() {
+constexpr bool using_gnu_compiler(int major=0, int minor=0, int patchlevel=0) {
 #if defined(__GNUC__)
-    constexpr int available = __GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__;
-    constexpr int required = major*10000 + minor*100 + patchlevel;
-    return available >= required;
+    return (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+            >= (major*10000 + minor*100 + patchlevel);
 #else
     return false;
 #endif
diff --git a/tests/unit/test_gpu_stack.cu b/tests/unit/test_gpu_stack.cu
index ad20a4d5..343d4142 100644
--- a/tests/unit/test_gpu_stack.cu
+++ b/tests/unit/test_gpu_stack.cu
@@ -1,5 +1,6 @@
 #include "../gtest.h"
 
+#include <backends/gpu/kernels/stack.hpp>
 #include <backends/gpu/stack.hpp>
 #include <backends/gpu/managed_ptr.hpp>
 
@@ -18,9 +19,9 @@ TEST(stack, construction) {
 namespace kernels {
     template <typename F>
     __global__
-    void push_back(gpu::stack<int>& s, F f) {
+    void push_back(gpu::stack_storage<int>& s, F f) {
         if (f(threadIdx.x)) {
-            s.push_back(threadIdx.x);
+            nest::mc::gpu::push_back(s, int(threadIdx.x));
         }
     }
 
@@ -52,28 +53,58 @@ TEST(stack, push_back) {
 
     const unsigned n = 10;
     EXPECT_TRUE(n%2 == 0); // require n is even for tests to work
-    auto s = gpu::make_managed_ptr<stack>(n);
+    auto s = stack(n);
+    auto& sstorage = s.storage();
 
-    kernels::push_back<<<1, n>>>(*s, kernels::all_ftor());
+    kernels::push_back<<<1, n>>>(sstorage, kernels::all_ftor());
     cudaDeviceSynchronize();
-    EXPECT_EQ(n, s->size());
-    for (auto i=0; i<int(s->size()); ++i) {
-        EXPECT_EQ(i, (*s)[i]);
+    EXPECT_EQ(n, s.size());
+    for (auto i=0; i<int(s.size()); ++i) {
+        EXPECT_EQ(i, s[i]);
     }
 
-    s->clear();
-    kernels::push_back<<<1, n>>>(*s, kernels::even_ftor());
+    s.clear();
+    kernels::push_back<<<1, n>>>(sstorage, kernels::even_ftor());
     cudaDeviceSynchronize();
-    EXPECT_EQ(n/2, s->size());
-    for (auto i=0; i<int(s->size())/2; ++i) {
-        EXPECT_EQ(2*i, (*s)[i]);
+    EXPECT_EQ(n/2, s.size());
+    for (auto i=0; i<int(s.size())/2; ++i) {
+        EXPECT_EQ(2*i, s[i]);
     }
 
-    s->clear();
-    kernels::push_back<<<1, n>>>(*s, kernels::odd_ftor());
+    s.clear();
+    kernels::push_back<<<1, n>>>(sstorage, kernels::odd_ftor());
     cudaDeviceSynchronize();
-    EXPECT_EQ(n/2, s->size());
-    for (auto i=0; i<int(s->size())/2; ++i) {
-        EXPECT_EQ(2*i+1, (*s)[i]);
+    EXPECT_EQ(n/2, s.size());
+    for (auto i=0; i<int(s.size())/2; ++i) {
+        EXPECT_EQ(2*i+1, s[i]);
     }
 }
+
+TEST(stack, overflow) {
+    using T = int;
+    using stack = gpu::stack<T>;
+
+    const unsigned n = 10;
+    auto s = stack(n);
+    auto& sstorage = s.storage();
+    EXPECT_FALSE(s.overflow());
+
+    // push 2n items into a stack of size n
+    kernels::push_back<<<1, 2*n>>>(sstorage, kernels::all_ftor());
+    cudaDeviceSynchronize();
+    EXPECT_EQ(n, s.size());
+    EXPECT_EQ(2*n, s.pushes());
+    EXPECT_TRUE(s.overflow());
+}
+
+TEST(stack, empty) {
+    using T = int;
+    using stack = gpu::stack<T>;
+
+    stack s(0u);
+
+    EXPECT_EQ(s.size(), 0u);
+    EXPECT_EQ(s.capacity(), 0u);
+
+    EXPECT_EQ(s.storage().data, nullptr);
+}
diff --git a/tests/unit/test_spikes.cpp b/tests/unit/test_spikes.cpp
index 5e759025..cd992984 100644
--- a/tests/unit/test_spikes.cpp
+++ b/tests/unit/test_spikes.cpp
@@ -18,7 +18,6 @@ using backend = USE_BACKEND;
 #endif
 
 TEST(spikes, threshold_watcher) {
-    using backend = multicore::backend;
     using size_type = backend::size_type;
     using value_type = backend::value_type;
     using array = backend::array;
@@ -61,7 +60,7 @@ TEST(spikes, threshold_watcher) {
 
     // test again at t=1, with unchanged values
     //  - nothing should change
-    util::fill(time_after, 1.);
+    memory::fill(time_after, 1.);
     watch.test();
     EXPECT_FALSE(watch.is_crossed(0));
     EXPECT_TRUE(watch.is_crossed(1));
@@ -72,7 +71,7 @@ TEST(spikes, threshold_watcher) {
     //  - 2nd watch should now stop spiking
     memory::fill(values, 0.);
     memory::copy(time_after, time_before);
-    util::fill(time_after, 2.);
+    memory::fill(time_after, 2.);
     watch.test();
     EXPECT_FALSE(watch.is_crossed(0));
     EXPECT_FALSE(watch.is_crossed(1));
@@ -100,7 +99,7 @@ TEST(spikes, threshold_watcher) {
     //  - all watches should stop spiking
     memory::fill(values, 0.);
     memory::copy(time_after, time_before);
-    util::fill(time_after, 4.);
+    memory::fill(time_after, 4.);
     watch.test();
     EXPECT_FALSE(watch.is_crossed(0));
     EXPECT_FALSE(watch.is_crossed(1));
@@ -111,7 +110,7 @@ TEST(spikes, threshold_watcher) {
     //  - watch 3 should be spiking
     values[index[2]] = 6.;
     memory::copy(time_after, time_before);
-    util::fill(time_after, 5.);
+    memory::fill(time_after, 5.);
     watch.test();
     EXPECT_FALSE(watch.is_crossed(0));
     EXPECT_FALSE(watch.is_crossed(1));
@@ -144,7 +143,7 @@ TEST(spikes, threshold_watcher) {
     //
     memory::fill(values, 0);
     values[index[0]] = 10.; // first watch should be initialized to spiking state
-    util::fill(time_before, 0.);
+    memory::fill(time_before, 0.);
     watch.reset();
     EXPECT_EQ(watch.crossings().size(), 0u);
     EXPECT_TRUE(watch.is_crossed(0));
-- 
GitLab