diff --git a/arbor/backends/gpu/stack.hpp b/arbor/backends/gpu/stack.hpp
index b8b417ec15e53892eaf19eabacd0073fbd9e1f6e..942be1ee0f21d4c27529d10629b240a36e55dd83 100644
--- a/arbor/backends/gpu/stack.hpp
+++ b/arbor/backends/gpu/stack.hpp
@@ -5,9 +5,9 @@
 
 #include <arbor/assert.hpp>
 
-#include "backends/gpu/managed_ptr.hpp"
 #include "gpu_context.hpp"
 #include "memory/allocator.hpp"
+#include "memory/cuda_wrappers.hpp"
 #include "stack_storage.hpp"
 
 namespace arb {
@@ -29,110 +29,140 @@ namespace gpu {
 template <typename T>
 class stack {
     using value_type = T;
+
     template <typename U>
-    using allocator = memory::managed_allocator<U>;
+    using allocator = memory::cuda_allocator<U>;
 
     using storage_type = stack_storage<value_type>;
     using gpu_context_handle = std::shared_ptr<arb::gpu_context>;
 
-    managed_ptr<storage_type> storage_;
+private:
+    // pointer in GPU memory
+    storage_type* device_storage_;
+
+    // host-side copy of device_storage
+    storage_type host_storage_;
 
     gpu_context_handle gpu_context_;
 
-    managed_ptr<storage_type> create_storage(unsigned n) {
-        auto p = make_managed_ptr<storage_type>();
-        p->capacity = n;
-        p->stores = 0;
-        p->data = n? allocator<value_type>().allocate(n): nullptr;
-        return p;
+    // copy of data from GPU memory, to be manually refreshed before access
+    std::vector<T> data_;
+
+    void create_storage(unsigned n) {
+        data_.reserve(n);
+
+        host_storage_.capacity = n;
+        host_storage_.stores = 0u;
+        host_storage_.data = n>0u ? allocator<value_type>().allocate(n): nullptr;
+
+        device_storage_ = allocator<storage_type>().allocate(1);
+        memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
     }
 
 public:
+    stack& operator=(const stack& other) = delete;
     stack(const stack& other) = delete;
+    stack() = delete;
 
-    stack(const gpu_context_handle& gpu_ctx):
-        storage_(create_storage(0)), gpu_context_(gpu_ctx) {}
-
-    stack(stack&& other): storage_(create_storage(0)), gpu_context_(other.gpu_context_) {
-        std::swap(storage_, other.storage_);
+    stack(gpu_context_handle h): gpu_context_(h) {
+        host_storage_.data = nullptr;
+        device_storage_ = nullptr;
     }
 
     stack& operator=(stack&& other) {
-        std::swap(storage_, other.storage_);
+        gpu_context_ = other.gpu_context_;
+        std::swap(device_storage_, other.device_storage_);
+        std::swap(host_storage_, other.host_storage_);
+        std::swap(data_, other.data_);
         return *this;
     }
 
-    explicit stack(unsigned capacity, const gpu_context_handle& gpu_ctx):
-        storage_(create_storage(capacity)), gpu_context_(gpu_ctx) {}
+    stack(stack&& other) {
+        *this = std::move(other);
+    }
+
+    explicit stack(unsigned capacity, const gpu_context_handle& gpu_ctx): gpu_context_(gpu_ctx) {
+        create_storage(capacity);
+    }
 
     ~stack() {
-        storage_.synchronize();
-        if (storage_->data) {
-            allocator<value_type>().deallocate(storage_->data, storage_->capacity);
+        if (host_storage_.data) {
+            allocator<value_type>().deallocate(host_storage_.data, host_storage_.capacity);
         }
+        allocator<storage_type>().deallocate(device_storage_, sizeof(storage_type));
     }
 
-    // Perform any required synchronization if concurrent host-side access is not supported.
-    // (Correctness still requires that GPU operations on this stack are complete.)
-    void host_access() const {
-        gpu_context_->synchronize_for_managed_access();
+    // After this call both the host and device storage are synchronized to the
+    // GPU state at the point of the call.
+    void update_host() {
+        memory::cuda_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type));
+
+        auto num = size();
+        data_.resize(num);
+        auto bytes = num*sizeof(T);
+        memory::cuda_memcpy_d2h(data_.data(), host_storage_.data, bytes);
     }
 
+    // After this call both host and device storage are synchronized to the empty state.
     void clear() {
-        storage_->stores = 0u;
+        host_storage_.stores = 0u;
+        memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
+        data_.clear();
+    }
+
+    // The information returned by the calls below may be out of sync with the
+    // version on the GPU if the GPU storage has been modified since the last
+    // call to update_host().
+    storage_type get_storage_copy() const {
+        return host_storage_;
+    }
+
+    const std::vector<value_type>& data() const {
+        return data_;
     }
 
     // The number of items that have been pushed back on the stack.
     // This may exceed capacity, which indicates that the caller attempted
     // to push back more values than there was space to store.
     unsigned pushes() const {
-        return storage_->stores;
+        return host_storage_.stores;
     }
 
     bool overflow() const {
-        return storage_->stores>capacity();
+        return host_storage_.stores>host_storage_.capacity;
     }
 
     // The number of values stored in the stack.
     unsigned size() const {
-        return std::min(storage_->stores, storage_->capacity);
+        return std::min(host_storage_.stores, host_storage_.capacity);
    }
 
     // The maximum number of items that can be stored in the stack.
     unsigned capacity() const {
-        return storage_->capacity;
+        return host_storage_.capacity;
     }
 
+    // This returns a non-const reference to the underlying device storage so
+    // that it can be passed to GPU kernels that need to modify the stack.
     storage_type& storage() {
-        return *storage_;
+        return *device_storage_;
     }
 
-    value_type& operator[](unsigned i) {
+    const value_type& operator[](unsigned i) const {
         arb_assert(i<size());
-        return storage_->data[i];
+        return data_[i];
     }
 
-    value_type& operator[](unsigned i) const {
-        arb_assert(i<size());
-        return storage_->data[i];
-    }
-
-    value_type* begin() {
-        return storage_->data;
-    }
     const value_type* begin() const {
-        return storage_->data;
+        return data_.data();
     }
 
-    value_type* end() {
-        // Take care of the case where size>capacity.
-        return storage_->data + size();
-    }
     const value_type* end() const {
         // Take care of the case where size>capacity.
-        return storage_->data + size();
+        return data_.data() + size();
     }
+
 };
 
 } // namespace gpu
diff --git a/arbor/backends/gpu/threshold_watcher.hpp b/arbor/backends/gpu/threshold_watcher.hpp
index 7b1ad17b9f73d497178e003a80d07802cd95e85f..eca942b9264841a69771c48391453dc86222d896 100644
--- a/arbor/backends/gpu/threshold_watcher.hpp
+++ b/arbor/backends/gpu/threshold_watcher.hpp
@@ -70,7 +70,7 @@ public:
 
     /// Remove all stored crossings that were detected in previous calls to test()
     void clear_crossings() {
-        stack_.host_access();
+        stack_.update_host();
         stack_.clear();
     }
 
@@ -90,7 +90,7 @@
     }
 
     const std::vector<threshold_crossing>& crossings() const {
-        stack_.host_access();
+        stack_.update_host();
 
         if (stack_.overflow()) {
            throw arbor_internal_error("gpu/threshold_watcher: gpu spike buffer overflow");
@@ -141,7 +141,7 @@ private:
     array v_prev_;          // Values at previous sample time.
 
     // Hybrid host/gpu data structure for accumulating threshold crossings.
-    stack_type stack_;
+    mutable stack_type stack_;
 
     // host side storage for the crossings
     mutable std::vector<threshold_crossing> crossings_;
diff --git a/test/unit/test_gpu_stack.cu b/test/unit/test_gpu_stack.cu
index 10af85c4305ff0d641252aecd04f8f77cb963627..7f05e324a61193185ec71d931511d7e5d5226696 100644
--- a/test/unit/test_gpu_stack.cu
+++ b/test/unit/test_gpu_stack.cu
@@ -63,30 +63,46 @@ TEST(stack, push_back) {
     auto s = stack(n, context);
     auto& sstorage = s.storage();
 
+    EXPECT_EQ(0u, s.size());     // sanity checks on the empty stack
+    EXPECT_EQ(n, s.capacity());
+
+
     kernels::push_back<<<1, n>>>(sstorage, kernels::all_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n, s.size());
-    std::sort(sstorage.data, sstorage.data+s.size());
-    for (auto i=0; i<int(s.size()); ++i) {
-        EXPECT_EQ(i, s[i]);
+    {
+        auto d = s.data();
+        EXPECT_EQ(s.size(), d.size());
+        std::sort(d.begin(), d.end());
+        for (unsigned i=0; i<n; ++i) {
+            EXPECT_EQ(i, d[i]);
+        }
     }
 
     s.clear();
     kernels::push_back<<<1, n>>>(sstorage, kernels::even_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n/2, s.size());
-    std::sort(sstorage.data, sstorage.data+s.size());
-    for (auto i=0; i<int(s.size())/2; ++i) {
-        EXPECT_EQ(2*i, s[i]);
+    {
+        auto d = s.data();
+        EXPECT_EQ(s.size(), d.size());
+        std::sort(d.begin(), d.end());
+        for (unsigned i=0; i<n/2; ++i) {
+            EXPECT_EQ(2*i, d[i]);
+        }
     }
 
     s.clear();
     kernels::push_back<<<1, n>>>(sstorage, kernels::odd_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n/2, s.size());
-    std::sort(sstorage.data, sstorage.data+s.size());
-    for (auto i=0; i<int(s.size())/2; ++i) {
-        EXPECT_EQ(2*i+1, s[i]);
+    {
+        auto d = s.data();
+        EXPECT_EQ(s.size(), d.size());
+        std::sort(d.begin(), d.end());
+        for (unsigned i=0; i<n/2; ++i) {
+            EXPECT_EQ(2*i+1, d[i]);
+        }
     }
 }
 
@@ -104,7 +120,7 @@
 
     // push 2n items into a stack of size n
     kernels::push_back<<<1, 2*n>>>(sstorage, kernels::all_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n, s.size());
     EXPECT_EQ(2*n, s.pushes());
     EXPECT_TRUE(s.overflow());
@@ -122,5 +138,7 @@
 
     EXPECT_EQ(s.size(), 0u);
     EXPECT_EQ(s.capacity(), 0u);
-    EXPECT_EQ(s.storage().data, nullptr);
+    auto device_storage = s.get_storage_copy();
+
+    EXPECT_EQ(device_storage.data, nullptr);
 }
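
Usage note (editorial, not part of the patch): with managed memory gone, host code must call update_host() before any host-side read of the stack, which is exactly what the threshold_watcher and unit-test changes above do. The sketch below shows that read-back protocol in isolation. It is a minimal example, assuming an int element type, an include path that reaches the arbor sources, and a stack that some device kernel has already filled through the reference returned by storage(); the read_back() helper name is illustrative only.

    #include <vector>

    #include "backends/gpu/stack.hpp"   // the class modified by this patch

    // Read back a stack that a device kernel has already filled via s.storage().
    std::vector<int> read_back(arb::gpu::stack<int>& s) {
        // Refresh the host mirror: without managed memory, size(), data(),
        // begin()/end() and operator[] are stale until this call.
        s.update_host();

        // pushes() counts attempted push_backs; on overflow only capacity()
        // values were actually stored.
        if (s.overflow()) {
            // handle dropped values here
        }

        std::vector<int> out(s.begin(), s.end());

        // clear() resets both the host copy and the device-side storage.
        s.clear();
        return out;
    }

The same pattern appears in threshold_watcher::crossings(): update_host(), then the overflow check, then the copy out. Making stack_ mutable is what allows that const accessor to refresh the host mirror.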