diff --git a/arbor/backends/gpu/stack.hpp b/arbor/backends/gpu/stack.hpp
index b8b417ec15e53892eaf19eabacd0073fbd9e1f6e..942be1ee0f21d4c27529d10629b240a36e55dd83 100644
--- a/arbor/backends/gpu/stack.hpp
+++ b/arbor/backends/gpu/stack.hpp
@@ -5,9 +5,9 @@
 
 #include <arbor/assert.hpp>
 
-#include "backends/gpu/managed_ptr.hpp"
 #include "gpu_context.hpp"
 #include "memory/allocator.hpp"
+#include "memory/cuda_wrappers.hpp"
 #include "stack_storage.hpp"
 
 namespace arb {
@@ -29,110 +29,140 @@ namespace gpu {
 template <typename T>
 class stack {
     using value_type = T;
+
     template <typename U>
-    using allocator = memory::managed_allocator<U>;
+    using allocator = memory::cuda_allocator<U>;
 
     using storage_type = stack_storage<value_type>;
     using gpu_context_handle = std::shared_ptr<arb::gpu_context>;
 
-    managed_ptr<storage_type> storage_;
+private:
+    // pointer in GPU memory
+    storage_type* device_storage_;
+
+    // host-side copy of device_storage
+    storage_type host_storage_;
 
     gpu_context_handle gpu_context_;
 
-    managed_ptr<storage_type> create_storage(unsigned n) {
-        auto p = make_managed_ptr<storage_type>();
-        p->capacity = n;
-        p->stores = 0;
-        p->data = n? allocator<value_type>().allocate(n): nullptr;
-        return p;
+    // copy of data from GPU memory, to be manually refreshed before access
+    std::vector<T> data_;
+
+    void create_storage(unsigned n) {
+        data_.reserve(n);
+
+        host_storage_.capacity = n;
+        host_storage_.stores = 0u;
+        host_storage_.data = n>0u ? allocator<value_type>().allocate(n): nullptr;
+
+        device_storage_ = allocator<storage_type>().allocate(1);
+        memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
     }
 
 public:
+    stack& operator=(const stack& other) = delete;
     stack(const stack& other) = delete;
+    stack() = delete;
 
-    stack(const gpu_context_handle& gpu_ctx):
-        storage_(create_storage(0)), gpu_context_(gpu_ctx) {}
-
-    stack(stack&& other): storage_(create_storage(0)), gpu_context_(other.gpu_context_) {
-        std::swap(storage_, other.storage_);
+    stack(gpu_context_handle h): gpu_context_(h) {
+        host_storage_.data = nullptr;
+        device_storage_ = nullptr;
     }
 
     stack& operator=(stack&& other) {
-        std::swap(storage_, other.storage_);
+        gpu_context_ = other.gpu_context_;
+        std::swap(device_storage_, other.device_storage_);
+        std::swap(host_storage_, other.host_storage_);
+        std::swap(data_, other.data_);
         return *this;
     }
 
-    explicit stack(unsigned capacity, const gpu_context_handle& gpu_ctx):
-        storage_(create_storage(capacity)), gpu_context_(gpu_ctx) {}
+    stack(stack&& other) {
+        *this = std::move(other);
+    }
+
+    explicit stack(unsigned capacity, const gpu_context_handle& gpu_ctx): gpu_context_(gpu_ctx) {
+        create_storage(capacity);
+    }
 
     ~stack() {
-        storage_.synchronize();
-        if (storage_->data) {
-            allocator<value_type>().deallocate(storage_->data, storage_->capacity);
+        if (host_storage_.data) {
+            allocator<value_type>().deallocate(host_storage_.data, host_storage_.capacity);
         }
+        allocator<storage_type>().deallocate(device_storage_, sizeof(storage_type));
     }
 
-    // Perform any required synchronization if concurrent host-side access is not supported.
-    // (Correctness still requires that GPU operations on this stack are complete.)
-    void host_access() const {
-        gpu_context_->synchronize_for_managed_access();
+    // After this call both the host and device storage are synchronized to the
+    // GPU state at the point of the call.
+    void update_host() {
+        memory::cuda_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type));
+
+        auto num = size();
+        data_.resize(num);
+        auto bytes = num*sizeof(T);
+        memory::cuda_memcpy_d2h(data_.data(), host_storage_.data, bytes);
     }
 
+    // After this call both host and device storage are synchronized to the empty state.
     void clear() {
-        storage_->stores = 0u;
+        host_storage_.stores = 0u;
+        memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
+        data_.clear();
+    }
+
+    // The information returned by the calls below may be out of sync with the
+    // version on the GPU if the GPU storage has been modified since the last
+    // call to update_host().
+    storage_type get_storage_copy() const {
+        return host_storage_;
+    }
+
+    const std::vector<value_type>& data() const {
+        return data_;
     }
 
     // The number of items that have been pushed back on the stack.
     // This may exceed capacity, which indicates that the caller attempted
     // to push back more values than there was space to store.
     unsigned pushes() const {
-        return storage_->stores;
+        return host_storage_.stores;
     }
 
     bool overflow() const {
-        return storage_->stores>capacity();
+        return host_storage_.stores>host_storage_.capacity;
     }
 
     // The number of values stored in the stack.
     unsigned size() const {
-        return std::min(storage_->stores, storage_->capacity);
+        return std::min(host_storage_.stores, host_storage_.capacity);
    }
 
     // The maximum number of items that can be stored in the stack.
     unsigned capacity() const {
-        return storage_->capacity;
+        return host_storage_.capacity;
     }
 
+    // This returns a non-const reference to the underlying device storage so
+    // that it can be passed to GPU kernels that need to modify the stack.
     storage_type& storage() {
-        return *storage_;
+        return *device_storage_;
     }
 
-    value_type& operator[](unsigned i) {
+    const value_type& operator[](unsigned i) const {
         arb_assert(i<size());
-        return storage_->data[i];
+        return data_[i];
     }
 
-    value_type& operator[](unsigned i) const {
-        arb_assert(i<size());
-        return storage_->data[i];
-    }
-
-    value_type* begin() {
-        return storage_->data;
-    }
     const value_type* begin() const {
-        return storage_->data;
+        return data_.data();
     }
 
-    value_type* end() {
-        // Take care of the case where size>capacity.
-        return storage_->data + size();
-    }
     const value_type* end() const {
         // Take care of the case where size>capacity.
-        return storage_->data + size();
+        return data_.data() + size();
     }
+
 };
 
 } // namespace gpu
diff --git a/arbor/backends/gpu/threshold_watcher.hpp b/arbor/backends/gpu/threshold_watcher.hpp
index 7b1ad17b9f73d497178e003a80d07802cd95e85f..eca942b9264841a69771c48391453dc86222d896 100644
--- a/arbor/backends/gpu/threshold_watcher.hpp
+++ b/arbor/backends/gpu/threshold_watcher.hpp
@@ -70,7 +70,7 @@ public:
 
     /// Remove all stored crossings that were detected in previous calls to test()
     void clear_crossings() {
-        stack_.host_access();
+        stack_.update_host();
         stack_.clear();
     }
 
@@ -90,7 +90,7 @@
     }
 
     const std::vector<threshold_crossing>& crossings() const {
-        stack_.host_access();
+        stack_.update_host();
 
         if (stack_.overflow()) {
            throw arbor_internal_error("gpu/threshold_watcher: gpu spike buffer overflow");
@@ -141,7 +141,7 @@ private:
     array v_prev_;          // Values at previous sample time.
 
     // Hybrid host/gpu data structure for accumulating threshold crossings.
-    stack_type stack_;
+    mutable stack_type stack_;
 
     // host side storage for the crossings
     mutable std::vector<threshold_crossing> crossings_;
diff --git a/test/unit/test_gpu_stack.cu b/test/unit/test_gpu_stack.cu
index 10af85c4305ff0d641252aecd04f8f77cb963627..7f05e324a61193185ec71d931511d7e5d5226696 100644
--- a/test/unit/test_gpu_stack.cu
+++ b/test/unit/test_gpu_stack.cu
@@ -63,30 +63,46 @@ TEST(stack, push_back) {
     auto s = stack(n, context);
     auto& sstorage = s.storage();
 
+    EXPECT_EQ(0u, s.size());     // sanity checks on the empty stack
+    EXPECT_EQ(n, s.capacity());
+
+
     kernels::push_back<<<1, n>>>(sstorage, kernels::all_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n, s.size());
-    std::sort(sstorage.data, sstorage.data+s.size());
-    for (auto i=0; i<int(s.size()); ++i) {
-        EXPECT_EQ(i, s[i]);
+    {
+        auto d = s.data();
+        EXPECT_EQ(s.size(), d.size());
+        std::sort(d.begin(), d.end());
+        for (unsigned i=0; i<n; ++i) {
+            EXPECT_EQ(i, d[i]);
+        }
     }
 
     s.clear();
     kernels::push_back<<<1, n>>>(sstorage, kernels::even_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n/2, s.size());
-    std::sort(sstorage.data, sstorage.data+s.size());
-    for (auto i=0; i<int(s.size())/2; ++i) {
-        EXPECT_EQ(2*i, s[i]);
+    {
+        auto d = s.data();
+        EXPECT_EQ(s.size(), d.size());
+        std::sort(d.begin(), d.end());
+        for (unsigned i=0; i<n/2; ++i) {
+            EXPECT_EQ(2*i, d[i]);
+        }
     }
 
     s.clear();
     kernels::push_back<<<1, n>>>(sstorage, kernels::odd_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n/2, s.size());
-    std::sort(sstorage.data, sstorage.data+s.size());
-    for (auto i=0; i<int(s.size())/2; ++i) {
-        EXPECT_EQ(2*i+1, s[i]);
+    {
+        auto d = s.data();
+        EXPECT_EQ(s.size(), d.size());
+        std::sort(d.begin(), d.end());
+        for (unsigned i=0; i<n/2; ++i) {
+            EXPECT_EQ(2*i+1, d[i]);
+        }
     }
 }
 
@@ -104,7 +120,7 @@
 
     // push 2n items into a stack of size n
     kernels::push_back<<<1, 2*n>>>(sstorage, kernels::all_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n, s.size());
     EXPECT_EQ(2*n, s.pushes());
     EXPECT_TRUE(s.overflow());
@@ -122,5 +138,7 @@
 
     EXPECT_EQ(s.size(), 0u);
     EXPECT_EQ(s.capacity(), 0u);
-    EXPECT_EQ(s.storage().data, nullptr);
+    auto device_storage = s.get_storage_copy();
+
+    EXPECT_EQ(device_storage.data, nullptr);
 }
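
Usage note (editorial, not part of the patch): with managed memory gone, host code must call update_host() before any host-side read of the stack, which is exactly what the threshold_watcher and unit-test changes above do. The sketch below shows that read-back protocol in isolation. It is a minimal example, assuming an int element type, an include path that reaches the arbor sources, and a stack that some device kernel has already filled through the reference returned by storage(); the read_back() helper name is illustrative only.

    #include <vector>

    #include "backends/gpu/stack.hpp"   // the class modified by this patch

    // Read back a stack that a device kernel has already filled via s.storage().
    std::vector<int> read_back(arb::gpu::stack<int>& s) {
        // Refresh the host mirror: without managed memory, size(), data(),
        // begin()/end() and operator[] are stale until this call.
        s.update_host();

        // pushes() counts attempted push_backs; on overflow only capacity()
        // values were actually stored.
        if (s.overflow()) {
            // handle dropped values here
        }

        std::vector<int> out(s.begin(), s.end());

        // clear() resets both the host copy and the device-side storage.
        s.clear();
        return out;
    }

The same pattern appears in threshold_watcher::crossings(): update_host(), then the overflow check, then the copy out. Making stack_ mutable is what allows that const accessor to refresh the host mirror.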