From 2c135d753e90ca4abbe027ef06063cbdcc673ef0 Mon Sep 17 00:00:00 2001 From: noraabiakar <nora.abiakar@gmail.com> Date: Wed, 22 Aug 2018 18:53:44 +0200 Subject: [PATCH] Create gpu_context and manage it as part of execution_context (#566) * Add gpu_context as part of the execution context, containing information about GPU availability, managed memory synchronization, and atomic double availability. * Choose between ON and OFF for ARB_WITH_GPU in CMake. If ON, compile for K20, K80, and P100. Note that we still need compile-time information about the GPU in cuda_atomic.hpp for atomicAdd(double*, double). This is because the function is only defined when the program is compiled for sm_60 or higher. (Sketches illustrating both points are appended after the patch.) --- CMakeLists.txt | 28 ++------- arbor/CMakeLists.txt | 2 +- arbor/backends/gpu/fvm.hpp | 6 +- arbor/backends/gpu/managed_ptr.cpp | 12 ---- arbor/backends/gpu/managed_ptr.hpp | 35 +----------- arbor/backends/gpu/stack.hpp | 15 +++-- arbor/backends/gpu/threshold_watcher.hpp | 7 ++- arbor/backends/multicore/fvm.hpp | 7 ++- .../backends/multicore/threshold_watcher.hpp | 6 +- arbor/cell_group_factory.cpp | 6 +- arbor/cell_group_factory.hpp | 7 ++- arbor/fvm_lowered_cell.hpp | 3 +- arbor/fvm_lowered_cell_impl.cpp | 6 +- arbor/fvm_lowered_cell_impl.hpp | 7 ++- arbor/gpu_context.cpp | 11 ++++ arbor/gpu_context.hpp | 57 +++++++++++++++++++ arbor/partition_load_balance.cpp | 4 +- arbor/simulation.cpp | 2 +- include/arbor/execution_context.hpp | 9 ++- test/unit/CMakeLists.txt | 1 + test/unit/test_backend.cpp | 5 +- test/unit/test_fvm_lowered.cpp | 20 +++++-- test/unit/test_gpu_stack.cu | 16 ++++-- test/unit/test_mc_cell_group.cpp | 4 +- test/unit/test_mc_cell_group_gpu.cpp | 3 +- test/unit/test_probe.cpp | 4 +- test/unit/test_spikes.cpp | 3 +- 27 files changed, 176 insertions(+), 110 deletions(-) delete mode 100644 arbor/backends/gpu/managed_ptr.cpp create mode 100644 arbor/gpu_context.cpp create mode 100644 arbor/gpu_context.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b4fc22b8..f21e36d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,11 +8,6 @@ enable_language(CXX) # Configure-time build options for Arbor: #---------------------------------------------------------- -# Enable CUDA support with ARB_GPU_MODEL. - -set(ARB_GPU_MODEL "none" CACHE STRING "The target GPU architecture: one of {none,K20,K80,P100}") -set_property(CACHE ARB_GPU_MODEL PROPERTY STRINGS none K20 K80 P100 ) - # Specify target archiecture. set(ARB_ARCH "" CACHE STRING "Target architecture for arbor libraries") @@ -37,6 +32,7 @@ set(ARB_VALIDATION_DATA_DIR "${PROJECT_SOURCE_DIR}/validation/data" CACHE PATH #---------------------------------------------------------- # Configure-time features for Arbor: #---------------------------------------------------------- +option(ARB_WITH_GPU "build with GPU support" OFF) option(ARB_WITH_MPI "build with MPI support" OFF) @@ -72,7 +68,7 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) # Add CUDA as a language if GPU support requested. # (This has to be set early so as to enable CUDA tests in generator # expressions.)
-if(NOT ARB_GPU_MODEL MATCHES "none") +if(ARB_WITH_GPU) enable_language(CUDA) endif() @@ -169,7 +165,7 @@ endif() # CUDA support #-------------- -if(NOT ARB_GPU_MODEL MATCHES "none") +if(ARB_WITH_GPU) set(ARB_WITH_CUDA TRUE) add_compile_options( @@ -177,22 +173,10 @@ if(NOT ARB_GPU_MODEL MATCHES "none") "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=unsigned_compare_with_zero>") target_compile_definitions(arbor-private-deps INTERFACE ARB_HAVE_GPU) - if(ARB_GPU_MODEL MATCHES "K20") - set(cuda_arch sm_35) - set(arb_cuda_arch 350) - elseif(ARB_GPU_MODEL MATCHES "K80") - set(cuda_arch sm_37) - set(arb_cuda_arch 370) - elseif(ARB_GPU_MODEL MATCHES "P100") - set(cuda_arch sm_60) - set(arb_cuda_arch 600) - else() - message(FATAL_ERROR "-- GPU architecture '${ARB_GPU_MODEL}' not supported. Use one of {none, K20, K80, P100}") - endif() + target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>) + target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>) + target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_60,code=sm_60>) - target_compile_options(arbor-private-deps INTERFACE - $<$<COMPILE_LANGUAGE:CUDA>:-arch=${cuda_arch}>) - target_compile_definitions(arbor-private-deps INTERFACE ARB_CUDA_ARCH=${arb_cuda_arch}) endif() # Use libunwind if available for pretty printing stack traces diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt index b07fddd6..37917fb7 100644 --- a/arbor/CMakeLists.txt +++ b/arbor/CMakeLists.txt @@ -10,6 +10,7 @@ set(arbor_sources builtin_mechanisms.cpp cell_group_factory.cpp common_types_io.cpp + gpu_context.cpp local_alloc.cpp event_binner.cpp fvm_layout.cpp @@ -54,7 +55,6 @@ if(ARB_WITH_CUDA) backends/gpu/stimulus.cpp backends/gpu/stimulus.cu backends/gpu/threshold_watcher.cu - backends/gpu/managed_ptr.cpp backends/gpu/matrix_assemble.cu backends/gpu/matrix_interleave.cu backends/gpu/matrix_solve.cu diff --git a/arbor/backends/gpu/fvm.hpp b/arbor/backends/gpu/fvm.hpp index 262d9e3c..9d02eea3 100644 --- a/arbor/backends/gpu/fvm.hpp +++ b/arbor/backends/gpu/fvm.hpp @@ -50,7 +50,8 @@ struct backend { static threshold_watcher voltage_watcher( const shared_state& state, const std::vector<index_type>& cv, - const std::vector<value_type>& thresholds) + const std::vector<value_type>& thresholds, + const execution_context& context) { return threshold_watcher( state.cv_to_cell.data(), @@ -58,7 +59,8 @@ struct backend { state.time_to.data(), state.voltage.data(), cv, - thresholds); + thresholds, + context); } }; diff --git a/arbor/backends/gpu/managed_ptr.cpp b/arbor/backends/gpu/managed_ptr.cpp deleted file mode 100644 index 90cd37a9..00000000 --- a/arbor/backends/gpu/managed_ptr.cpp +++ /dev/null @@ -1,12 +0,0 @@ -namespace arb { -namespace gpu { - -// TODO: make this a runtime check - -bool device_concurrent_managed_access() { - return (ARB_CUDA_ARCH >= 600); // all GPUs from P100 -} - -} // namespace gpu -} // namespace arb - diff --git a/arbor/backends/gpu/managed_ptr.hpp b/arbor/backends/gpu/managed_ptr.hpp index 9d977662..98202d32 100644 --- a/arbor/backends/gpu/managed_ptr.hpp +++ b/arbor/backends/gpu/managed_ptr.hpp @@ -8,18 +8,6 @@ namespace arb { namespace gpu { -// Pre-pascal NVIDIA GPUs don't support page faulting for GPU reads of managed -// memory, so when a kernel is launched, all managed memory is copied to the -// GPU. 
The upshot of this is that no CPU-side reads can be made of _any_ -// managed memory can be made whe _any_ kernel is running. -// -// The following helper function can be used to determine whether -// synchronization is required before CPU-side reads of managed memory: if the -// device concurrentManagedAccess property is zero, then safe host-side requires -// a synchronization. - -bool device_concurrent_managed_access(); - // used to indicate that the type pointed to by the managed_ptr is to be // constructed in the managed_ptr constructor struct construct_in_place_tag {}; @@ -41,11 +29,7 @@ public: using element_type = T; using pointer = element_type*; using reference = element_type&; - - managed_ptr(): - concurrent_managed_access(device_concurrent_managed_access()) - {} - + managed_ptr(const managed_ptr& other) = delete; // Allocate memory and construct in place using args. @@ -53,18 +37,14 @@ public: // point of the wrapper is to hide the complexity of allocating managed // memory and constructing a type in place. template <typename... Args> - managed_ptr(construct_in_place_tag, Args&&... args): - concurrent_managed_access(device_concurrent_managed_access()) - { + managed_ptr(construct_in_place_tag, Args&&... args) { memory::managed_allocator<element_type> allocator; data_ = allocator.allocate(1u); synchronize(); data_ = new (data_) element_type(std::forward<Args>(args)...); } - managed_ptr(managed_ptr&& other): - concurrent_managed_access(other.concurrent_managed_access) - { + managed_ptr(managed_ptr&& other) { std::swap(other.data_, data_); } @@ -113,16 +93,7 @@ public: cudaDeviceSynchronize(); } - // Synchronize if concurrent host-side access is not supported. - void host_access() const { - if (!concurrent_managed_access) { - cudaDeviceSynchronize(); - } - } - private: - const bool concurrent_managed_access; - __host__ __device__ bool is_allocated() const { return data_!=nullptr; diff --git a/arbor/backends/gpu/stack.hpp b/arbor/backends/gpu/stack.hpp index 53293463..1d177199 100644 --- a/arbor/backends/gpu/stack.hpp +++ b/arbor/backends/gpu/stack.hpp @@ -5,6 +5,7 @@ #include <arbor/assert.hpp> #include "backends/gpu/managed_ptr.hpp" +#include "gpu_context.hpp" #include "memory/allocator.hpp" #include "stack_storage.hpp" @@ -31,7 +32,11 @@ class stack { using allocator = memory::managed_allocator<U>; using storage_type = stack_storage<value_type>; + + using gpu_context_handle = std::shared_ptr<arb::gpu_context>; + managed_ptr<storage_type> storage_; + gpu_context_handle gpu_context_; managed_ptr<storage_type> create_storage(unsigned n) { auto p = make_managed_ptr<storage_type>(); @@ -45,9 +50,10 @@ public: stack& operator=(const stack& other) = delete; stack(const stack& other) = delete; - stack(): storage_(create_storage(0)) {} + stack(const gpu_context_handle& gpu_ctx): + storage_(create_storage(0)), gpu_context_(gpu_ctx) {} - stack(stack&& other): storage_(create_storage(0)) { + stack(stack&& other): storage_(create_storage(0)), gpu_context_(other.gpu_context_) { std::swap(storage_, other.storage_); } @@ -56,7 +62,8 @@ public: return *this; } - explicit stack(unsigned capacity): storage_(create_storage(capacity)) {} + explicit stack(unsigned capacity, const gpu_context_handle& gpu_ctx): + storage_(create_storage(capacity)), gpu_context_(gpu_ctx) {} ~stack() { storage_.synchronize(); @@ -68,7 +75,7 @@ public: // Perform any required synchronization if concurrent host-side access is not supported. // (Correctness still requires that GPU operations on this stack are complete.) 
void host_access() const { - storage_.host_access(); + gpu_context_->synchronize_for_managed_access(); } void clear() { diff --git a/arbor/backends/gpu/threshold_watcher.hpp b/arbor/backends/gpu/threshold_watcher.hpp index 7cc62f8a..68942b71 100644 --- a/arbor/backends/gpu/threshold_watcher.hpp +++ b/arbor/backends/gpu/threshold_watcher.hpp @@ -40,13 +40,16 @@ public: threshold_watcher(threshold_watcher&& other) = default; threshold_watcher& operator=(threshold_watcher&& other) = default; + threshold_watcher(const execution_context& ctx): stack_(ctx.gpu) {} + threshold_watcher( const fvm_index_type* cv_to_cell, const fvm_value_type* t_before, const fvm_value_type* t_after, const fvm_value_type* values, const std::vector<fvm_index_type>& cv_index, - const std::vector<fvm_value_type>& thresholds + const std::vector<fvm_value_type>& thresholds, + const execution_context& ctx ): cv_to_cell_(cv_to_cell), t_before_(t_before), @@ -58,7 +61,7 @@ public: v_prev_(memory::const_host_view<fvm_value_type>(values, cv_index.size())), // TODO: allocates enough space for 10 spikes per watch. // A more robust approach might be needed to avoid overflows. - stack_(10*size()) + stack_(10*size(), ctx.gpu) { crossings_.reserve(stack_.capacity()); reset(); diff --git a/arbor/backends/multicore/fvm.hpp b/arbor/backends/multicore/fvm.hpp index c4318763..99dcd2b6 100644 --- a/arbor/backends/multicore/fvm.hpp +++ b/arbor/backends/multicore/fvm.hpp @@ -2,6 +2,7 @@ #include <string> #include <vector> +#include <arbor/execution_context.hpp> #include "backends/event.hpp" #include "backends/multicore/matrix_state.hpp" @@ -46,7 +47,8 @@ struct backend { static threshold_watcher voltage_watcher( const shared_state& state, const std::vector<index_type>& cv, - const std::vector<value_type>& thresholds) + const std::vector<value_type>& thresholds, + const execution_context& context) { return threshold_watcher( state.cv_to_cell.data(), @@ -54,7 +56,8 @@ struct backend { state.time_to.data(), state.voltage.data(), cv, - thresholds); + thresholds, + context); } }; diff --git a/arbor/backends/multicore/threshold_watcher.hpp b/arbor/backends/multicore/threshold_watcher.hpp index dcb38168..13e55257 100644 --- a/arbor/backends/multicore/threshold_watcher.hpp +++ b/arbor/backends/multicore/threshold_watcher.hpp @@ -1,6 +1,7 @@ #pragma once #include <arbor/assert.hpp> +#include <arbor/execution_context.hpp> #include <arbor/fvm_types.hpp> #include <arbor/math.hpp> @@ -14,13 +15,16 @@ class threshold_watcher { public: threshold_watcher() = default; + threshold_watcher(const execution_context& ctx) {} + threshold_watcher( const fvm_index_type* cv_to_cell, const fvm_value_type* t_before, const fvm_value_type* t_after, const fvm_value_type* values, const std::vector<fvm_index_type>& cv_index, - const std::vector<fvm_value_type>& thresholds + const std::vector<fvm_value_type>& thresholds, + const execution_context& context ): cv_to_cell_(cv_to_cell), t_before_(t_before), diff --git a/arbor/cell_group_factory.cpp b/arbor/cell_group_factory.cpp index d82bada0..0fc0a08b 100644 --- a/arbor/cell_group_factory.cpp +++ b/arbor/cell_group_factory.cpp @@ -18,13 +18,13 @@ cell_group_ptr make_cell_group(Args&&... 
args) { return cell_group_ptr(new Impl(std::forward<Args>(args)...)); } -cell_group_factory cell_kind_implementation(cell_kind ck, backend_kind bk) { +cell_group_factory cell_kind_implementation(cell_kind ck, backend_kind bk, const execution_context& ctx) { using gid_vector = std::vector<cell_gid_type>; switch (ck) { case cell_kind::cable1d_neuron: - return [bk](const gid_vector& gids, const recipe& rec) { - return make_cell_group<mc_cell_group>(gids, rec, make_fvm_lowered_cell(bk)); + return [bk, ctx](const gid_vector& gids, const recipe& rec) { + return make_cell_group<mc_cell_group>(gids, rec, make_fvm_lowered_cell(bk, ctx)); }; case cell_kind::spike_source: diff --git a/arbor/cell_group_factory.hpp b/arbor/cell_group_factory.hpp index 47c476e5..0cd72de5 100644 --- a/arbor/cell_group_factory.hpp +++ b/arbor/cell_group_factory.hpp @@ -11,6 +11,7 @@ #include <arbor/common_types.hpp> #include <arbor/recipe.hpp> +#include <arbor/execution_context.hpp> #include "cell_group.hpp" @@ -18,10 +19,10 @@ namespace arb { using cell_group_factory = std::function<cell_group_ptr (const std::vector<cell_gid_type>&, const recipe&)>; -cell_group_factory cell_kind_implementation(cell_kind, backend_kind); +cell_group_factory cell_kind_implementation(cell_kind, backend_kind, const execution_context&); -inline bool cell_kind_supported(cell_kind c, backend_kind b) { - return static_cast<bool>(cell_kind_implementation(c, b)); +inline bool cell_kind_supported(cell_kind c, backend_kind b, const execution_context& ctx) { + return static_cast<bool>(cell_kind_implementation(c, b, ctx)); } } // namespace arb diff --git a/arbor/fvm_lowered_cell.hpp b/arbor/fvm_lowered_cell.hpp index 81a26887..bb4184fc 100644 --- a/arbor/fvm_lowered_cell.hpp +++ b/arbor/fvm_lowered_cell.hpp @@ -4,6 +4,7 @@ #include <vector> #include <arbor/common_types.hpp> +#include <arbor/execution_context.hpp> #include <arbor/fvm_types.hpp> #include <arbor/recipe.hpp> @@ -44,6 +45,6 @@ struct fvm_lowered_cell { using fvm_lowered_cell_ptr = std::unique_ptr<fvm_lowered_cell>; -fvm_lowered_cell_ptr make_fvm_lowered_cell(backend_kind p); +fvm_lowered_cell_ptr make_fvm_lowered_cell(backend_kind p, const execution_context& ctx); } // namespace arb diff --git a/arbor/fvm_lowered_cell_impl.cpp b/arbor/fvm_lowered_cell_impl.cpp index 1136f64e..01b0ee0f 100644 --- a/arbor/fvm_lowered_cell_impl.cpp +++ b/arbor/fvm_lowered_cell_impl.cpp @@ -12,13 +12,13 @@ namespace arb { -fvm_lowered_cell_ptr make_fvm_lowered_cell(backend_kind p) { +fvm_lowered_cell_ptr make_fvm_lowered_cell(backend_kind p, const execution_context& ctx) { switch (p) { case backend_kind::multicore: - return fvm_lowered_cell_ptr(new fvm_lowered_cell_impl<multicore::backend>); + return fvm_lowered_cell_ptr(new fvm_lowered_cell_impl<multicore::backend>(ctx)); case backend_kind::gpu: #ifdef ARB_HAVE_GPU - return fvm_lowered_cell_ptr(new fvm_lowered_cell_impl<gpu::backend>); + return fvm_lowered_cell_ptr(new fvm_lowered_cell_impl<gpu::backend>(ctx)); #endif ; // fall through default: diff --git a/arbor/fvm_lowered_cell_impl.hpp b/arbor/fvm_lowered_cell_impl.hpp index 82ecbba6..fcc9670d 100644 --- a/arbor/fvm_lowered_cell_impl.hpp +++ b/arbor/fvm_lowered_cell_impl.hpp @@ -43,6 +43,8 @@ public: using index_type = fvm_index_type; using size_type = fvm_size_type; + fvm_lowered_cell_impl(execution_context ctx): context_(ctx), threshold_watcher_(ctx) {}; + void reset() override; void initialize( @@ -71,6 +73,8 @@ private: using sample_event_stream = typename backend::sample_event_stream; using 
threshold_watcher = typename backend::threshold_watcher; + execution_context context_; + std::unique_ptr<shared_state> state_; // Cell state shared across mechanisms. // TODO: Can we move the backend-dependent data structures below into state_? @@ -446,8 +450,7 @@ void fvm_lowered_cell_impl<B>::initialize( } } - threshold_watcher_ = threshold_watcher(state_->cv_to_cell.data(), state_->time.data(), - state_->time_to.data(), state_->voltage.data(), detector_cv, detector_threshold); + threshold_watcher_ = backend::voltage_watcher(*state_, detector_cv, detector_threshold, context_); reset(); } diff --git a/arbor/gpu_context.cpp b/arbor/gpu_context.cpp new file mode 100644 index 00000000..9e2042b3 --- /dev/null +++ b/arbor/gpu_context.cpp @@ -0,0 +1,11 @@ +#include <memory> + +#include "gpu_context.hpp" + +namespace arb { + +std::shared_ptr<gpu_context> make_gpu_context() { + return std::make_shared<gpu_context>(); +} + +} diff --git a/arbor/gpu_context.hpp b/arbor/gpu_context.hpp new file mode 100644 index 00000000..fea7fc25 --- /dev/null +++ b/arbor/gpu_context.hpp @@ -0,0 +1,57 @@ +#include <memory> + +#ifdef ARB_HAVE_GPU +#include <cuda.h> +#include <cuda_runtime.h> +#endif + +namespace arb { + +#ifndef ARB_HAVE_GPU +struct gpu_context { + bool has_gpu_; + size_t attributes_; + + gpu_context(): has_gpu_(false), attributes_(0) {} +}; + +#else + +enum gpu_flags { + has_concurrent_managed_access = 1, + has_atomic_double = 2 +}; + +struct gpu_context { + bool has_gpu_; + size_t attributes_; + + gpu_context() : has_gpu_(true) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + attributes_ = 0; + if (prop.concurrentManagedAccess) { + attributes_ |= gpu_flags::has_concurrent_managed_access; + } + if (prop.major*100 + prop.minor >= 600) { + attributes_ |= gpu_flags::has_atomic_double; + } + }; + + bool has_concurrent_managed_access() { + return attributes_ & gpu_flags::has_concurrent_managed_access; + } + + bool has_atomic_double() { + return attributes_ & gpu_flags::has_atomic_double; + } + + void synchronize_for_managed_access() { + if(!has_concurrent_managed_access()) { + cudaDeviceSynchronize(); + } + } +}; + +#endif +} diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 60dd3946..0e4ffca3 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -65,8 +65,8 @@ domain_decomposition partition_load_balance( // of cell group updates according to rules such as the back end on // which the cell group is running. 
- auto has_gpu_backend = [](cell_kind c) { - return cell_kind_supported(c, backend_kind::gpu); + auto has_gpu_backend = [ctx](cell_kind c) { + return cell_kind_supported(c, backend_kind::gpu, ctx); }; std::vector<cell_kind> kinds; diff --git a/arbor/simulation.cpp b/arbor/simulation.cpp index be20b6da..87d88c28 100644 --- a/arbor/simulation.cpp +++ b/arbor/simulation.cpp @@ -160,7 +160,7 @@ simulation_state::simulation_state( foreach_group_index( [&](cell_group_ptr& group, int i) { const auto& group_info = decomp.groups[i]; - auto factory = cell_kind_implementation(group_info.kind, group_info.backend); + auto factory = cell_kind_implementation(group_info.kind, group_info.backend, ctx); group = factory(group_info.gids, rec); }); diff --git a/include/arbor/execution_context.hpp b/include/arbor/execution_context.hpp index fdc9ea9f..3a457a2a 100644 --- a/include/arbor/execution_context.hpp +++ b/include/arbor/execution_context.hpp @@ -11,20 +11,25 @@ namespace arb { namespace threading { class task_system; } +struct gpu_context; + using task_system_handle = std::shared_ptr<threading::task_system>; using distributed_context_handle = std::shared_ptr<distributed_context>; +using gpu_context_handle = std::shared_ptr<gpu_context>; task_system_handle make_thread_pool(); task_system_handle make_thread_pool(int nthreads); +gpu_context_handle make_gpu_context(); struct execution_context { distributed_context_handle distributed; task_system_handle thread_pool; + gpu_context_handle gpu; execution_context(): distributed(std::make_shared<distributed_context>()), - thread_pool(arb::make_thread_pool()) {}; - + thread_pool(arb::make_thread_pool()), + gpu(arb::make_gpu_context()) {}; }; } diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 2945330a..d12ff7bd 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -140,5 +140,6 @@ add_executable(unit ${unit_sources} ${test_mech_sources}) add_dependencies(unit build_test_mods) target_compile_options(unit PRIVATE ${ARB_CXXOPT_ARCH}) target_compile_definitions(unit PRIVATE "-DDATADIR=\"${CMAKE_CURRENT_SOURCE_DIR}/swc\"") +target_compile_definitions(unit PRIVATE ARB_HAVE_GPU) target_include_directories(unit PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") target_link_libraries(unit PRIVATE gtest arbor arbor-private-headers arbor-aux) diff --git a/test/unit/test_backend.cpp b/test/unit/test_backend.cpp index 74db35c2..48f5836c 100644 --- a/test/unit/test_backend.cpp +++ b/test/unit/test_backend.cpp @@ -8,9 +8,10 @@ using namespace arb; TEST(backends, gpu_test) { + execution_context context; #ifdef ARB_GPU_ENABLED - EXPECT_NO_THROW(make_fvm_lowered_cell(backend_kind::gpu)); + EXPECT_NO_THROW(make_fvm_lowered_cell(backend_kind::gpu, context)); #else - EXPECT_ANY_THROW(make_fvm_lowered_cell(backend_kind::gpu)); + EXPECT_ANY_THROW(make_fvm_lowered_cell(backend_kind::gpu, context)); #endif } diff --git a/test/unit/test_fvm_lowered.cpp b/test/unit/test_fvm_lowered.cpp index d6631dde..9001ef0c 100644 --- a/test/unit/test_fvm_lowered.cpp +++ b/test/unit/test_fvm_lowered.cpp @@ -80,6 +80,8 @@ using namespace arb; TEST(fvm_lowered, matrix_init) { + execution_context context; + auto isnan = [](auto v) { return std::isnan(v); }; auto ispos = [](auto v) { return v>0; }; auto isneg = [](auto v) { return v<0; }; @@ -92,7 +94,7 @@ TEST(fvm_lowered, matrix_init) std::vector<target_handle> targets; probe_association_map<probe_handle> probe_map; - fvm_cell fvcell; + fvm_cell fvcell(context); fvcell.initialize({0}, cable1d_recipe(cell), targets, probe_map); 
auto& J = fvcell.*private_matrix_ptr; @@ -116,6 +118,8 @@ TEST(fvm_lowered, matrix_init) TEST(fvm_lowered, target_handles) { using namespace arb; + execution_context context; + mc_cell cells[] = { make_cell_ball_and_stick(), make_cell_ball_and_3stick() @@ -135,7 +139,7 @@ TEST(fvm_lowered, target_handles) { std::vector<target_handle> targets; probe_association_map<probe_handle> probe_map; - fvm_cell fvcell; + fvm_cell fvcell(context); fvcell.initialize({0, 1}, cable1d_recipe(cells), targets, probe_map); mechanism* expsyn = find_mechanism(fvcell, "expsyn"); @@ -175,6 +179,8 @@ TEST(fvm_lowered, stimulus) { // amplitude | 0.3 | 0.1 // CV | 4 | 0 + execution_context context; + std::vector<mc_cell> cells; cells.push_back(make_cell_ball_and_stick(false)); @@ -197,7 +203,7 @@ TEST(fvm_lowered, stimulus) { std::vector<target_handle> targets; probe_association_map<probe_handle> probe_map; - fvm_cell fvcell; + fvm_cell fvcell(context); fvcell.initialize({0}, cable1d_recipe(cells), targets, probe_map); mechanism* stim = find_mechanism(fvcell, "_builtin_stimulus"); @@ -250,6 +256,8 @@ TEST(fvm_lowered, derived_mechs) { // // 3. Cell with both test_kin1 and custom_kin1. + execution_context context; + std::vector<mc_cell> cells(3); for (int i = 0; i<3; ++i) { mc_cell& c = cells[i]; @@ -290,7 +298,7 @@ TEST(fvm_lowered, derived_mechs) { std::vector<target_handle> targets; probe_association_map<probe_handle> probe_map; - fvm_cell fvcell; + fvm_cell fvcell(context); fvcell.initialize({0, 1, 2}, rec, targets, probe_map); // Both mechanisms will have the same internal name, "test_kin1". @@ -373,6 +381,8 @@ TEST(fvm_lowered, weighted_write_ion) { // the same as a 100µm dendrite, which makes it easier to describe the // expected weights. + execution_context context; + mc_cell c; c.add_soma(5); @@ -394,7 +404,7 @@ TEST(fvm_lowered, weighted_write_ion) { std::vector<target_handle> targets; probe_association_map<probe_handle> probe_map; - fvm_cell fvcell; + fvm_cell fvcell(context); fvcell.initialize({0}, cable1d_recipe(c), targets, probe_map); auto& state = *(fvcell.*private_state_ptr).get(); diff --git a/test/unit/test_gpu_stack.cu b/test/unit/test_gpu_stack.cu index 18d9c180..9e0235dd 100644 --- a/test/unit/test_gpu_stack.cu +++ b/test/unit/test_gpu_stack.cu @@ -3,13 +3,15 @@ #include <backends/gpu/stack.hpp> #include <backends/gpu/stack_cu.hpp> #include <backends/gpu/managed_ptr.hpp> +#include <arbor/execution_context.hpp> using namespace arb; TEST(stack, construction) { using T = int; - gpu::stack<T> s(10); + execution_context context; + gpu::stack<T> s(10, context.gpu); EXPECT_EQ(0u, s.size()); EXPECT_EQ(10u, s.capacity()); @@ -51,9 +53,11 @@ TEST(stack, push_back) { using T = int; using stack = gpu::stack<T>; + execution_context context; + const unsigned n = 10; EXPECT_TRUE(n%2 == 0); // require n is even for tests to work - auto s = stack(n); + auto s = stack(n, context.gpu); auto& sstorage = s.storage(); kernels::push_back<<<1, n>>>(sstorage, kernels::all_ftor()); @@ -84,8 +88,10 @@ TEST(stack, overflow) { using T = int; using stack = gpu::stack<T>; + execution_context context; + const unsigned n = 10; - auto s = stack(n); + auto s = stack(n, context.gpu); auto& sstorage = s.storage(); EXPECT_FALSE(s.overflow()); @@ -101,7 +107,9 @@ TEST(stack, empty) { using T = int; using stack = gpu::stack<T>; - stack s(0u); + execution_context context; + + stack s(0u, context.gpu); EXPECT_EQ(s.size(), 0u); EXPECT_EQ(s.capacity(), 0u); diff --git a/test/unit/test_mc_cell_group.cpp b/test/unit/test_mc_cell_group.cpp 
index a9873cbc..66326c22 100644 --- a/test/unit/test_mc_cell_group.cpp +++ b/test/unit/test_mc_cell_group.cpp @@ -14,8 +14,10 @@ using namespace arb; namespace { + execution_context context; + fvm_lowered_cell_ptr lowered_cell() { - return make_fvm_lowered_cell(backend_kind::multicore); + return make_fvm_lowered_cell(backend_kind::multicore, context); } mc_cell make_cell() { diff --git a/test/unit/test_mc_cell_group_gpu.cpp b/test/unit/test_mc_cell_group_gpu.cpp index b1431454..ad15d437 100644 --- a/test/unit/test_mc_cell_group_gpu.cpp +++ b/test/unit/test_mc_cell_group_gpu.cpp @@ -13,7 +13,8 @@ using namespace arb; namespace { fvm_lowered_cell_ptr lowered_cell() { - return make_fvm_lowered_cell(backend_kind::gpu); + execution_context context; + return make_fvm_lowered_cell(backend_kind::gpu, context); } mc_cell make_cell() { diff --git a/test/unit/test_probe.cpp b/test/unit/test_probe.cpp index b22df63b..42326f35 100644 --- a/test/unit/test_probe.cpp +++ b/test/unit/test_probe.cpp @@ -19,6 +19,8 @@ using shared_state = multicore::backend::shared_state; ACCESS_BIND(std::unique_ptr<shared_state> fvm_cell::*, fvm_state_ptr, &fvm_cell::state_); TEST(probe, fvm_lowered_cell) { + execution_context context; + mc_cell bs = make_cell_ball_and_stick(false); i_clamp stim(0, 100, 0.3); @@ -37,7 +39,7 @@ TEST(probe, fvm_lowered_cell) { std::vector<target_handle> targets; probe_association_map<probe_handle> probe_map; - fvm_cell lcell; + fvm_cell lcell(context); lcell.initialize({0}, rec, targets, probe_map); EXPECT_EQ(3u, rec.num_probes(0)); diff --git a/test/unit/test_spikes.cpp b/test/unit/test_spikes.cpp index 656f0500..7be54ece 100644 --- a/test/unit/test_spikes.cpp +++ b/test/unit/test_spikes.cpp @@ -29,6 +29,7 @@ TEST(SPIKES_TEST_CLASS, threshold_watcher) { // the test creates a watch on 3 values in the array values (which has 10 // elements in total). + execution_context context; const auto n = 10; const std::vector<index_type> index{0, 5, 7}; @@ -53,7 +54,7 @@ TEST(SPIKES_TEST_CLASS, threshold_watcher) { list expected; // create the watch - backend::threshold_watcher watch(cell_index.data(), time_before.data(), time_after.data(), values.data(), index, thresh); + backend::threshold_watcher watch(cell_index.data(), time_before.data(), time_after.data(), values.data(), index, thresh, context); // initially the first and third watch should not be spiking // the second is spiking -- GitLab
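
The first bullet of the commit message concerns run-time GPU attributes: gpu_context queries cudaGetDeviceProperties() once and records whether the device supports concurrent managed access and double-precision atomics, and host-side readers of managed memory ask the context to synchronize when needed, as stack::host_access() now does in this patch. Below is a minimal sketch of that usage inside Arbor, assuming a GPU-enabled build (ARB_HAVE_GPU); the function and variable names are illustrative and are not part of the patch.

// Sketch only: how code inside Arbor consults the run-time gpu_context before
// host-side reads of managed memory (mirrors stack::host_access() above).
// Assumes a build with ARB_HAVE_GPU, where gpu_context defines
// synchronize_for_managed_access(); the names below are illustrative.
#include <arbor/execution_context.hpp>

#include "gpu_context.hpp"

double read_first_result(const arb::execution_context& ctx, const double* managed_results) {
    // Pre-Pascal devices copy all managed memory to the GPU on kernel launch and
    // do not allow concurrent host access, so a device synchronization is needed
    // before reading. On devices reporting concurrentManagedAccess (P100 and
    // later) the call returns immediately.
    ctx.gpu->synchronize_for_managed_access();
    return managed_results[0]; // safe to read on the host from here on
}

This replaces the per-pointer concurrent_managed_access flag that managed_ptr previously carried; the stack now holds a shared_ptr to the gpu_context and defers the decision to it.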
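The note about cuda_atomic.hpp refers to the fact that the atomicAdd(double*, double) intrinsic only exists for compute capability 6.0 and above, so that header must still be specialized at compile time even though other GPU attributes are now detected at run time. The fragment below is the usual emulation pattern from the CUDA C programming guide, shown for illustration; it is not claimed to be the exact contents of Arbor's cuda_atomic.hpp, and the name emulated_atomic_add is hypothetical.

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
// Emulate atomicAdd on double via a compare-and-swap loop on the bit pattern.
__device__ inline double emulated_atomic_add(double* address, double val) {
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
        // Retry if another thread updated the value between the load and the CAS.
    } while (assumed != old);
    return __longlong_as_double(old);
}
#else
// From sm_60 onwards the hardware provides the double-precision atomicAdd directly.
__device__ inline double emulated_atomic_add(double* address, double val) {
    return atomicAdd(address, val);
}
#endif

Because __CUDA_ARCH__ is a compile-time macro, the CMake change above now compiles device code for sm_35, sm_37, and sm_60, so each architecture picks up the appropriate branch.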