Commit 51f83b42 authored by kanduri, committed by Benjamin Cumming

Removed the use of CUDA Managed Memory from GPU-Stack (#821)

Remove managed memory from the GPU stack data type used for spike collection on the GPU back end. The stack now has explicit synchronization points, called from the host, for:
* copying stack state from GPU to host memory where it can be interrogated.
* resetting the stack state on the GPU to empty.

Fixes #810.
parent 677f96df
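A reader's note, not part of the commit: the host-side pattern implied by the new interface is sketched below, assembled from this diff and the unit tests further down. The kernels::push_back kernel and kernels::all_ftor functor are the helpers from the test file, the include paths follow the first hunk, and the gpu_context construction is assumed, so treat this as an illustration rather than a drop-in example.

// Sketch only: explicit synchronization between the GPU-resident stack and its host mirror.
#include <memory>
#include <vector>
#include "backends/gpu/stack.hpp"
#include "gpu_context.hpp"

void collect_example(const std::shared_ptr<arb::gpu_context>& ctx) {
    arb::gpu::stack<int> s(16, ctx);               // capacity 16; the stack's storage lives in GPU memory
    kernels::push_back<<<1, 16>>>(s.storage(), kernels::all_ftor());
    cudaDeviceSynchronize();                       // the kernel must finish before the state is copied
    s.update_host();                               // copy counters and values into the host mirror
    const std::vector<int>& values = s.data();     // size(), data() and operator[] now read that mirror
    unsigned n_collected = values.size();          // equals s.size() after update_host()
    s.clear();                                     // reset to empty on both host and device
    (void)n_collected;
}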
@@ -5,9 +5,9 @@
#include <arbor/assert.hpp>
#include "backends/gpu/managed_ptr.hpp"
#include "gpu_context.hpp"
#include "memory/allocator.hpp"
#include "memory/cuda_wrappers.hpp"
#include "stack_storage.hpp"
namespace arb {
@@ -29,110 +29,140 @@ namespace gpu {
template <typename T>
class stack {
using value_type = T;
template <typename U>
using allocator = memory::managed_allocator<U>;
using allocator = memory::cuda_allocator<U>;
using storage_type = stack_storage<value_type>;
using gpu_context_handle = std::shared_ptr<arb::gpu_context>;
managed_ptr<storage_type> storage_;
private:
// pointer in GPU memory
storage_type* device_storage_;
// copy of the device_storage in host
storage_type host_storage_;
gpu_context_handle gpu_context_;
managed_ptr<storage_type> create_storage(unsigned n) {
auto p = make_managed_ptr<storage_type>();
p->capacity = n;
p->stores = 0;
p->data = n? allocator<value_type>().allocate(n): nullptr;
return p;
// copy of data from GPU memory, to be manually refreshed before access
std::vector<T> data_;
void create_storage(unsigned n) {
data_.reserve(n);
host_storage_.capacity = n;
host_storage_.stores = 0u;
host_storage_.data = n>0u ? allocator<value_type>().allocate(n): nullptr;
device_storage_ = allocator<storage_type>().allocate(1);
memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
}
public:
stack& operator=(const stack& other) = delete;
stack(const stack& other) = delete;
stack() = delete;
stack(const gpu_context_handle& gpu_ctx):
storage_(create_storage(0)), gpu_context_(gpu_ctx) {}
stack(stack&& other): storage_(create_storage(0)), gpu_context_(other.gpu_context_) {
std::swap(storage_, other.storage_);
stack(gpu_context_handle h): gpu_context_(h) {
host_storage_.data = nullptr;
device_storage_ = nullptr;
}
stack& operator=(stack&& other) {
std::swap(storage_, other.storage_);
gpu_context_ = other.gpu_context_;
std::swap(device_storage_, other.device_storage_);
std::swap(host_storage_, other.host_storage_);
std::swap(data_, other.data_);
return *this;
}
explicit stack(unsigned capacity, const gpu_context_handle& gpu_ctx):
storage_(create_storage(capacity)), gpu_context_(gpu_ctx) {}
stack(stack&& other) {
*this = std::move(other);
}
explicit stack(unsigned capacity, const gpu_context_handle& gpu_ctx): gpu_context_(gpu_ctx) {
create_storage(capacity);
}
~stack() {
storage_.synchronize();
if (storage_->data) {
allocator<value_type>().deallocate(storage_->data, storage_->capacity);
if (host_storage_.data) {
allocator<value_type>().deallocate(host_storage_.data, host_storage_.capacity);
}
allocator<storage_type>().deallocate(device_storage_, sizeof(storage_type));
}
// Perform any required synchronization if concurrent host-side access is not supported.
// (Correctness still requires that GPU operations on this stack are complete.)
void host_access() const {
gpu_context_->synchronize_for_managed_access();
// After this call both host and device storage are synchronized to the GPU
// state before the call.
void update_host() {
memory::cuda_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type));
auto num = size();
data_.resize(num);
auto bytes = num*sizeof(T);
memory::cuda_memcpy_d2h(data_.data(), host_storage_.data, bytes);
}
// After this call both host and device storage are synchronized to empty state.
void clear() {
storage_->stores = 0u;
host_storage_.stores = 0u;
memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
data_.clear();
}
// The information returned by the calls below may be out of sync with the
// version on the GPU if the GPU storage has been modified since the last
// call to update_host().
storage_type get_storage_copy() const {
return host_storage_;
}
const std::vector<value_type>& data() const {
return data_;
}
// The number of items that have been pushed back on the stack.
// This may exceed capacity, which indicates that the caller attempted
// to push back more values than there was space to store.
unsigned pushes() const {
return storage_->stores;
return host_storage_.stores;
}
bool overflow() const {
return storage_->stores>capacity();
return host_storage_.stores>host_storage_.capacity;
}
// The number of values stored in the stack.
unsigned size() const {
return std::min(storage_->stores, storage_->capacity);
return std::min(host_storage_.stores, host_storage_.capacity);
}
// The maximum number of items that can be stored in the stack.
unsigned capacity() const {
return storage_->capacity;
return host_storage_.capacity;
}
// This returns a non-const reference to the underlying device storage so
// that it can be passed to GPU kernels that need to modify the stack.
storage_type& storage() {
return *storage_;
return *device_storage_;
}
value_type& operator[](unsigned i) {
const value_type& operator[](unsigned i) const {
arb_assert(i<size());
return storage_->data[i];
return data_[i];
}
value_type& operator[](unsigned i) const {
arb_assert(i<size());
return storage_->data[i];
}
value_type* begin() {
return storage_->data;
}
const value_type* begin() const {
return storage_->data;
return data_.data();
}
value_type* end() {
// Take care of the case where size>capacity.
return storage_->data + size();
}
const value_type* end() const {
// Take care of the case where size>capacity.
return storage_->data + size();
return data_.data() + size();
}
};
} // namespace gpu
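The device-side half of the interface is unchanged by this commit and does not appear in the diff: storage() hands kernels the stack_storage<T> that device_storage_ points to, a plain struct holding a store counter, a capacity and a data pointer, and pushes claim slots with an atomic increment that is allowed to run past capacity, which is why pushes() can exceed capacity() and why overflow() exists. A rough sketch follows; the field order and the push implementation are assumptions rather than quotes from stack_storage.hpp or the back-end kernels.

// Sketch of the device-side pieces the class above wraps (see stack_storage.hpp and the
// GPU kernels in this back end for the real definitions).
template <typename T>
struct stack_storage {
    unsigned stores;     // number of attempted pushes; may exceed capacity
    unsigned capacity;   // size of the allocation that data points to
    T* data;             // array in GPU memory
};

template <typename T>
__device__
void push_back(stack_storage<T>& s, const T& value) {
    unsigned i = atomicAdd(&s.stores, 1u);   // claim a slot; the counter keeps counting on overflow
    if (i < s.capacity) {
        s.data[i] = value;                   // store only if the claimed slot actually exists
    }
}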
@@ -70,7 +70,7 @@ public:
/// Remove all stored crossings that were detected in previous calls to test()
void clear_crossings() {
stack_.host_access();
stack_.update_host();
stack_.clear();
}
@@ -90,7 +90,7 @@
}
const std::vector<threshold_crossing>& crossings() const {
stack_.host_access();
stack_.update_host();
if (stack_.overflow()) {
throw arbor_internal_error("gpu/threshold_watcher: gpu spike buffer overflow");
@@ -141,7 +141,7 @@ private:
array v_prev_; // Values at previous sample time.
// Hybrid host/gpu data structure for accumulating threshold crossings.
stack_type stack_;
mutable stack_type stack_;
// host side storage for the crossings
mutable std::vector<threshold_crossing> crossings_;
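The stack_ member is declared mutable because crossings() keeps its const signature but must now refresh the stack's host mirror before reading it. A condensed sketch of the resulting accessor follows; the copy into crossings_ is an assumption, the rest matches the hunk above.

// Sketch only: a const accessor that refreshes a mutable, device-backed cache.
const std::vector<threshold_crossing>& crossings() const {
    stack_.update_host();                      // allowed on a const watcher because stack_ is mutable
    if (stack_.overflow()) {
        throw arbor_internal_error("gpu/threshold_watcher: gpu spike buffer overflow");
    }
    crossings_.assign(stack_.begin(), stack_.end());   // crossings_ is the mutable host-side buffer
    return crossings_;
}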
@@ -63,30 +63,46 @@ TEST(stack, push_back) {
auto s = stack(n, context);
auto& sstorage = s.storage();
EXPECT_EQ(0u, s.size()); // dummy tests
EXPECT_EQ(n, s.capacity());
kernels::push_back<<<1, n>>>(sstorage, kernels::all_ftor());
cudaDeviceSynchronize();
s.update_host();
EXPECT_EQ(n, s.size());
std::sort(sstorage.data, sstorage.data+s.size());
for (auto i=0; i<int(s.size()); ++i) {
EXPECT_EQ(i, s[i]);
{
auto d = s.data();
EXPECT_EQ(s.size(), d.size());
std::sort(d.begin(), d.end());
for (unsigned i=0; i<n; ++i) {
EXPECT_EQ(i, d[i]);
}
}
s.clear();
kernels::push_back<<<1, n>>>(sstorage, kernels::even_ftor());
cudaDeviceSynchronize();
s.update_host();
EXPECT_EQ(n/2, s.size());
std::sort(sstorage.data, sstorage.data+s.size());
for (auto i=0; i<int(s.size())/2; ++i) {
EXPECT_EQ(2*i, s[i]);
{
auto d = s.data();
EXPECT_EQ(s.size(), d.size());
std::sort(d.begin(), d.end());
for (unsigned i=0; i<n/2; ++i) {
EXPECT_EQ(2*i, d[i]);
}
}
s.clear();
kernels::push_back<<<1, n>>>(sstorage, kernels::odd_ftor());
cudaDeviceSynchronize();
s.update_host();
EXPECT_EQ(n/2, s.size());
std::sort(sstorage.data, sstorage.data+s.size());
for (auto i=0; i<int(s.size())/2; ++i) {
EXPECT_EQ(2*i+1, s[i]);
{
auto d = s.data();
EXPECT_EQ(s.size(), d.size());
std::sort(d.begin(), d.end());
for (unsigned i=0; i<n/2; ++i) {
EXPECT_EQ(2*i+1, d[i]);
}
}
}
@@ -104,7 +120,7 @@ TEST(stack, overflow) {
// push 2n items into a stack of size n
kernels::push_back<<<1, 2*n>>>(sstorage, kernels::all_ftor());
cudaDeviceSynchronize();
s.update_host();
EXPECT_EQ(n, s.size());
EXPECT_EQ(2*n, s.pushes());
EXPECT_TRUE(s.overflow());
@@ -122,5 +138,7 @@ TEST(stack, empty) {
EXPECT_EQ(s.size(), 0u);
EXPECT_EQ(s.capacity(), 0u);
EXPECT_EQ(s.storage().data, nullptr);
auto device_storage = s.get_storage_copy();
EXPECT_EQ(device_storage.data, nullptr);
}