Commit 51f83b42 authored by kanduri, committed by Benjamin Cumming

Removed the use of CUDA Managed Memory from GPU-Stack (#821)

Remove managed memory from the GPU stack data type used for spike collection on the GPU back end. The stack now has explicit synchronization points, called from the host, for:
* copying the stack state from GPU to host memory, where it can be interrogated.
* resetting the stack state on the GPU to empty.

Fixes  #810.
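
For orientation, the host-side pattern this introduces looks roughly like the sketch below, mirroring the updated unit tests further down. The kernel name record_values and the wrapper function are placeholders, not part of this commit; the stack member functions (storage(), update_host(), overflow(), begin()/end(), clear()) are the ones defined in the new header.

// Sketch only: assumes Arbor's internal GPU headers; record_values is a
// hypothetical kernel that calls the device-side push on s.storage().
#include <memory>
#include <vector>
#include "backends/gpu/stack.hpp"

void collect_values(std::shared_ptr<arb::gpu_context> ctx) {
    arb::gpu::stack<int> s(64, ctx);             // capacity fixed at construction

    record_values<<<1, 128>>>(s.storage());      // device code fills the stack

    s.update_host();                             // explicit GPU -> host copy
    if (s.overflow()) {
        // more pushes than capacity: only the first capacity() values were kept
    }
    std::vector<int> values(s.begin(), s.end()); // iterators view the host copy

    s.clear();                                   // explicit reset of the GPU state
}

The key change is that nothing on the host reads GPU memory directly any more: every read goes through the host-side copy refreshed by update_host().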
parent 677f96df
@@ -5,9 +5,9 @@
 #include <arbor/assert.hpp>
 
-#include "backends/gpu/managed_ptr.hpp"
 #include "gpu_context.hpp"
 #include "memory/allocator.hpp"
+#include "memory/cuda_wrappers.hpp"
 #include "stack_storage.hpp"
 
 namespace arb {
@@ -29,110 +29,140 @@ namespace gpu {
 template <typename T>
 class stack {
     using value_type = T;
 
     template <typename U>
-    using allocator = memory::managed_allocator<U>;
+    using allocator = memory::cuda_allocator<U>;
 
     using storage_type = stack_storage<value_type>;
     using gpu_context_handle = std::shared_ptr<arb::gpu_context>;
 
-    managed_ptr<storage_type> storage_;
+private:
+    // pointer in GPU memory
+    storage_type* device_storage_;
+
+    // copy of the device_storage in host
+    storage_type host_storage_;
 
     gpu_context_handle gpu_context_;
 
-    managed_ptr<storage_type> create_storage(unsigned n) {
-        auto p = make_managed_ptr<storage_type>();
-        p->capacity = n;
-        p->stores = 0;
-        p->data = n? allocator<value_type>().allocate(n): nullptr;
-        return p;
+    // copy of data from GPU memory, to be manually refreshed before access
+    std::vector<T> data_;
+
+    void create_storage(unsigned n) {
+        data_.reserve(n);
+
+        host_storage_.capacity = n;
+        host_storage_.stores = 0u;
+        host_storage_.data = n>0u ? allocator<value_type>().allocate(n): nullptr;
+
+        device_storage_ = allocator<storage_type>().allocate(1);
+        memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
     }
 
 public:
     stack& operator=(const stack& other) = delete;
     stack(const stack& other) = delete;
+    stack() = delete;
 
-    stack(const gpu_context_handle& gpu_ctx):
-        storage_(create_storage(0)), gpu_context_(gpu_ctx) {}
-
-    stack(stack&& other): storage_(create_storage(0)), gpu_context_(other.gpu_context_) {
-        std::swap(storage_, other.storage_);
+    stack(gpu_context_handle h): gpu_context_(h) {
+        host_storage_.data = nullptr;
+        device_storage_ = nullptr;
     }
 
     stack& operator=(stack&& other) {
-        std::swap(storage_, other.storage_);
+        gpu_context_ = other.gpu_context_;
+        std::swap(device_storage_, other.device_storage_);
+        std::swap(host_storage_, other.host_storage_);
+        std::swap(data_, other.data_);
         return *this;
     }
 
-    explicit stack(unsigned capacity, const gpu_context_handle& gpu_ctx):
-        storage_(create_storage(capacity)), gpu_context_(gpu_ctx) {}
+    stack(stack&& other) {
+        *this = std::move(other);
+    }
+
+    explicit stack(unsigned capacity, const gpu_context_handle& gpu_ctx): gpu_context_(gpu_ctx) {
+        create_storage(capacity);
+    }
 
     ~stack() {
-        storage_.synchronize();
-        if (storage_->data) {
-            allocator<value_type>().deallocate(storage_->data, storage_->capacity);
+        if (host_storage_.data) {
+            allocator<value_type>().deallocate(host_storage_.data, host_storage_.capacity);
         }
+        allocator<storage_type>().deallocate(device_storage_, sizeof(storage_type));
    }
 
-    // Perform any required synchronization if concurrent host-side access is not supported.
-    // (Correctness still requires that GPU operations on this stack are complete.)
-    void host_access() const {
-        gpu_context_->synchronize_for_managed_access();
+    // After this call both host and device storage are synchronized to the GPU
+    // state before the call.
+    void update_host() {
+        memory::cuda_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type));
+
+        auto num = size();
+        data_.resize(num);
+        auto bytes = num*sizeof(T);
+        memory::cuda_memcpy_d2h(data_.data(), host_storage_.data, bytes);
    }
 
+    // After this call both host and device storage are synchronized to empty state.
     void clear() {
-        storage_->stores = 0u;
+        host_storage_.stores = 0u;
+        memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
+        data_.clear();
+    }
+
+    // The information returned by the calls below may be out of sync with the
+    // version on the GPU if the GPU storage has been modified since the last
+    // call to update_host().
+
+    storage_type get_storage_copy() const {
+        return host_storage_;
+    }
+
+    const std::vector<value_type>& data() const {
+        return data_;
    }
 
     // The number of items that have been pushed back on the stack.
     // This may exceed capacity, which indicates that the caller attempted
     // to push back more values than there was space to store.
     unsigned pushes() const {
-        return storage_->stores;
+        return host_storage_.stores;
    }
 
     bool overflow() const {
-        return storage_->stores>capacity();
+        return host_storage_.stores>host_storage_.capacity;
    }
 
     // The number of values stored in the stack.
     unsigned size() const {
-        return std::min(storage_->stores, storage_->capacity);
+        return std::min(host_storage_.stores, host_storage_.capacity);
    }
 
     // The maximum number of items that can be stored in the stack.
     unsigned capacity() const {
-        return storage_->capacity;
+        return host_storage_.capacity;
    }
 
+    // This returns a non-const reference to the underlying device storage so
+    // that it can be passed to GPU kernels that need to modify the stack.
     storage_type& storage() {
-        return *storage_;
+        return *device_storage_;
    }
 
-    value_type& operator[](unsigned i) {
+    const value_type& operator[](unsigned i) const {
         arb_assert(i<size());
-        return storage_->data[i];
-    }
-
-    value_type& operator[](unsigned i) const {
-        arb_assert(i<size());
-        return storage_->data[i];
-    }
-
-    value_type* begin() {
-        return storage_->data;
+        return data_[i];
    }
 
     const value_type* begin() const {
-        return storage_->data;
-    }
-
-    value_type* end() {
-        // Take care of the case where size>capacity.
-        return storage_->data + size();
+        return data_.data();
    }
 
     const value_type* end() const {
         // Take care of the case where size>capacity.
-        return storage_->data + size();
+        return data_.data() + size();
    }
 };
 
 } // namespace gpu
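
The pushes()/overflow()/size() accounting in the new header only works if the device-side push increments stores for every attempted push and writes the value only when the claimed slot is still within capacity, so stores records attempts rather than stored values. That device code is not part of this diff; the following is a minimal sketch of a push consistent with that accounting, assuming the capacity/stores/data layout of stack_storage used above.

// Sketch, not the committed implementation: a device-side push consistent
// with the stores/capacity accounting relied on by pushes(), size() and overflow().
// Assumes "stack_storage.hpp" has been included and the code is compiled as CUDA.
template <typename T>
__device__ void push_back(arb::gpu::stack_storage<T>& s, const T& value) {
    // Claim a slot; stores counts every attempted push, so it may end up
    // larger than capacity.
    unsigned position = atomicAdd(&s.stores, 1u);

    // Only write when the claimed slot exists; overflowing pushes are
    // dropped but still counted.
    if (position < s.capacity) {
        s.data[position] = value;
    }
}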
@@ -70,7 +70,7 @@ public:
     /// Remove all stored crossings that were detected in previous calls to test()
     void clear_crossings() {
-        stack_.host_access();
+        stack_.update_host();
         stack_.clear();
     }
 
@@ -90,7 +90,7 @@ public:
     }
 
     const std::vector<threshold_crossing>& crossings() const {
-        stack_.host_access();
+        stack_.update_host();
         if (stack_.overflow()) {
             throw arbor_internal_error("gpu/threshold_watcher: gpu spike buffer overflow");
 
@@ -141,7 +141,7 @@ private:
     array v_prev_;        // Values at previous sample time.
 
     // Hybrid host/gpu data structure for accumulating threshold crossings.
-    stack_type stack_;
+    mutable stack_type stack_;
 
     // host side storage for the crossings
     mutable std::vector<threshold_crossing> crossings_;
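
A side effect visible in the hunks above: crossings() is a const member function, but it now has to refresh the host-side copy via stack_.update_host(), which mutates the stack's cached state; hence stack_ becomes mutable, matching the already-mutable crossings_ vector. A stand-alone illustration of that pattern (stand-in types, not Arbor code):

#include <vector>

struct device_backed_values {
    // cache refreshed on demand, like stack_'s host-side copy
    mutable std::vector<int> host_cache_;

    const std::vector<int>& values() const {
        refresh();            // allowed because host_cache_ is mutable
        return host_cache_;
    }

private:
    void refresh() const {
        // stands in for stack_.update_host(): copy device state into the cache
        host_cache_.assign({1, 2, 3});
    }
};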
@@ -63,30 +63,46 @@ TEST(stack, push_back) {
     auto s = stack(n, context);
     auto& sstorage = s.storage();
 
-    EXPECT_EQ(0u, s.size());
-    EXPECT_EQ(n, s.capacity());
+    // dummy tests
 
     kernels::push_back<<<1, n>>>(sstorage, kernels::all_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n, s.size());
-    std::sort(sstorage.data, sstorage.data+s.size());
-    for (auto i=0; i<int(s.size()); ++i) {
-        EXPECT_EQ(i, s[i]);
+    {
+        auto d = s.data();
+        EXPECT_EQ(s.size(), d.size());
+        std::sort(d.begin(), d.end());
+        for (unsigned i=0; i<n; ++i) {
+            EXPECT_EQ(i, d[i]);
+        }
     }
 
     s.clear();
     kernels::push_back<<<1, n>>>(sstorage, kernels::even_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n/2, s.size());
-    std::sort(sstorage.data, sstorage.data+s.size());
-    for (auto i=0; i<int(s.size())/2; ++i) {
-        EXPECT_EQ(2*i, s[i]);
+    {
+        auto d = s.data();
+        EXPECT_EQ(s.size(), d.size());
+        std::sort(d.begin(), d.end());
+        for (unsigned i=0; i<n/2; ++i) {
+            EXPECT_EQ(2*i, d[i]);
+        }
     }
 
     s.clear();
     kernels::push_back<<<1, n>>>(sstorage, kernels::odd_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n/2, s.size());
-    std::sort(sstorage.data, sstorage.data+s.size());
-    for (auto i=0; i<int(s.size())/2; ++i) {
-        EXPECT_EQ(2*i+1, s[i]);
+    {
+        auto d = s.data();
+        EXPECT_EQ(s.size(), d.size());
+        std::sort(d.begin(), d.end());
+        for (unsigned i=0; i<n/2; ++i) {
+            EXPECT_EQ(2*i+1, d[i]);
+        }
     }
 }
 
@@ -104,7 +120,7 @@ TEST(stack, overflow) {
 
     // push 2n items into a stack of size n
     kernels::push_back<<<1, 2*n>>>(sstorage, kernels::all_ftor());
-    cudaDeviceSynchronize();
+    s.update_host();
     EXPECT_EQ(n, s.size());
     EXPECT_EQ(2*n, s.pushes());
     EXPECT_TRUE(s.overflow());
 
@@ -122,5 +138,7 @@ TEST(stack, empty) {
     EXPECT_EQ(s.size(), 0u);
     EXPECT_EQ(s.capacity(), 0u);
 
-    EXPECT_EQ(s.storage().data, nullptr);
+    auto device_storage = s.get_storage_copy();
+    EXPECT_EQ(device_storage.data, nullptr);
 }