From 7a6c10314f1244381b4d30370f3e9e87878610f1 Mon Sep 17 00:00:00 2001
From: noraabiakar <nora.abiakar@gmail.com>
Date: Tue, 24 Jul 2018 15:27:31 +0200
Subject: [PATCH] task_system as part of an execution_context  (#537)

- Task system is no longer a single system private to the implementation of the threading backend and used everywhere. A separate task_system can be used (with a specified number of threads) for every simulation.
- arb::execution_context is the interface to task_system  and the previously defined distributed_context
- TBB and serial support has been removed. Cthreads is the only threading backend available.
---
 .gitmodules                                   |   3 -
 .travis.yml                                   |  11 +-
 .ycm_extra_conf.py                            |   1 -
 CMakeLists.txt                                |  27 +---
 arbor/CMakeLists.txt                          |   6 +-
 arbor/communication/communicator.hpp          |  19 +--
 arbor/partition_load_balance.cpp              |   8 +-
 arbor/profile/meter_manager.cpp               |   1 -
 arbor/simulation.cpp                          |  25 ++--
 arbor/thread_private_spike_store.cpp          |  11 +-
 arbor/thread_private_spike_store.hpp          |   4 +
 arbor/threading/cthread.cpp                   |  14 +-
 arbor/threading/cthread_impl.hpp              |  49 ++++---
 arbor/threading/serial.hpp                    | 122 ------------------
 arbor/threading/tbb.hpp                       |  61 ---------
 arbor/threading/threading.cpp                 |   4 -
 arbor/threading/threading.hpp                 |  13 --
 arbor/util/double_buffer.hpp                  |  11 +-
 arbor/util/range.hpp                          |  40 ------
 cmake/FindTBB.cmake                           |  60 ---------
 example/bench/bench.cpp                       |  12 +-
 example/brunel/brunel_miniapp.cpp             |  24 ++--
 example/generators/event_gen.cpp              |   2 +-
 example/miniapp/miniapp.cpp                   |  26 ++--
 ext/CMakeLists.txt                            |  34 -----
 ext/tbb                                       |   1 -
 include/CMakeLists.txt                        |   5 -
 include/arbor/execution_context.hpp           |  29 +++++
 include/arbor/load_balance.hpp                |   4 +-
 include/arbor/simulation.hpp                  |   4 +-
 scripts/travis/build.sh                       |   2 +-
 test/ubench/task_system.cpp                   |  13 +-
 test/unit-distributed/test.cpp                |  13 +-
 test/unit-distributed/test.hpp                |   4 +-
 test/unit-distributed/test_communicator.cpp   |  39 +++---
 .../test_domain_decomposition.cpp             |   8 +-
 test/unit/test_algorithms.cpp                 |  11 +-
 test/unit/test_domain_decomposition.cpp       |   8 +-
 test/unit/test_fvm_lowered.cpp                |   2 +-
 test/unit/test_lif_cell_group.cpp             |   5 +-
 test/unit/test_range.cpp                      |  51 --------
 test/unit/test_spike_store.cpp                |  10 +-
 test/unit/test_thread.cpp                     |  50 +++----
 test/validation/validate_ball_and_stick.cpp   |   2 +-
 test/validation/validate_kinetic.cpp          |   2 +-
 test/validation/validate_soma.cpp             |   2 +-
 test/validation/validate_synapses.cpp         |   2 +-
 47 files changed, 242 insertions(+), 613 deletions(-)
 delete mode 100644 arbor/threading/serial.hpp
 delete mode 100644 arbor/threading/tbb.hpp
 delete mode 100644 cmake/FindTBB.cmake
 delete mode 160000 ext/tbb
 create mode 100644 include/arbor/execution_context.hpp

diff --git a/.gitmodules b/.gitmodules
index 3809b6b0..177f10db 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,6 +4,3 @@
 [submodule "sphinx_rtd_theme"]
 	path = ext/sphinx_rtd_theme
 	url = https://github.com/rtfd/sphinx_rtd_theme.git
-[submodule "tbb"]
-	path = ext/tbb
-	url = https://github.com/wjakob/tbb.git
diff --git a/.travis.yml b/.travis.yml
index dfb339db..baa6baaa 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,13 +14,10 @@ addons:
             - libopenmpi-dev
 
 env:
-    # test single node/rank with different threading back ends
-    - BUILD_NAME=serial   WITH_THREAD=serial   WITH_DISTRIBUTED=serial
-    - BUILD_NAME=cthread  WITH_THREAD=cthread  WITH_DISTRIBUTED=serial
-    - BUILD_NAME=tbb      WITH_THREAD=tbb      WITH_DISTRIBUTED=serial
-    # test mpi
-    - BUILD_NAME=mpi      WITH_THREAD=cthread  WITH_DISTRIBUTED=mpi
-    - BUILD_NAME=mpitbb   WITH_THREAD=tbb      WITH_DISTRIBUTED=mpi
+    # test single node/rank with threading backend
+    - BUILD_NAME=cthread  WITH_DISTRIBUTED=serial
+    # test mpi with threading backend
+    - BUILD_NAME=mpi      WITH_DISTRIBUTED=mpi
 
 before_install:
     - CC=gcc-6
diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py
index adb3c3ea..4c01f7ca 100644
--- a/.ycm_extra_conf.py
+++ b/.ycm_extra_conf.py
@@ -36,7 +36,6 @@ import ycm_core
 # CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
 flags = [
     '-DNDEBUG',
-    '-DARB_HAVE_CTHREAD',
     '-std=c++11',
     '-x',
     'c++',
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7307476e..6ff82e3a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,10 +21,6 @@ set(ARB_ARCH "" CACHE STRING "Target architecture for arbor libraries")
 
 option(ARB_VECTORIZE "use explicit SIMD code in generated mechanisms" OFF)
 
-# Use in-tree TBB?
-
-option(ARB_PRIVATE_TBBLIB "build and link against in-tree TBB build" OFF)
-
 # Use externally built modcc?
 
 set(ARB_MODCC "" CACHE STRING "path to external modcc NMODL compiler")
@@ -42,9 +38,6 @@ set(ARB_VALIDATION_DATA_DIR "${PROJECT_SOURCE_DIR}/validation/data" CACHE PATH
 # Configure-time features for Arbor:
 #----------------------------------------------------------
 
-set(ARB_THREADING_MODEL "cthread" CACHE STRING "set the threading model, one of cthread/tbb/serial")
-set_property(CACHE ARB_THREADING_MODEL PROPERTY STRINGS cthread tbb serial )
-
 option(ARB_WITH_MPI "build with MPI support" OFF)
 
 option(ARB_WITH_PROFILING "use built-in profiling" OFF)
@@ -115,13 +108,13 @@ set(CMAKE_CXX_STANDARD 14)
 add_library(arbor-private-deps INTERFACE)
 
 # Interface library `arbor-public-deps` collects requirements for the
-# users of the arbor library (e.g. tbb, mpi) that will become part
+# users of the arbor library (e.g. mpi) that will become part
 # of arbor's PUBLIC interface.
 
 add_library(arbor-public-deps INTERFACE)
 
-# External libraries in `ext` sub-directory: json, tclap and tbb.
-# Creates interface libraries `ext-json`, `ext-tclap` and `ext-tbb`.
+# External libraries in `ext` sub-directory: json and tclap.
+# Creates interface libraries `ext-json` and `ext-tclap`.
 
 add_subdirectory(ext)
 
@@ -150,17 +143,9 @@ endif()
 # Threading model
 #-----------------
 
-if(ARB_THREADING_MODEL MATCHES "tbb")
-    set(ARB_WITH_TBB TRUE)
-    target_link_libraries(arbor-public-deps INTERFACE ext-tbb)
-    target_compile_definitions(arbor-private-deps INTERFACE ARB_HAVE_TBB)
-elseif(ARB_THREADING_MODEL MATCHES "cthread")
-    set(ARB_WITH_CTHREAD TRUE)
-    find_package(Threads REQUIRED)
-    find_threads_cuda_fix()
-    target_compile_definitions(arbor-private-deps INTERFACE ARB_HAVE_CTHREAD)
-    target_link_libraries(arbor-private-deps INTERFACE Threads::Threads)
-endif()
+find_package(Threads REQUIRED)
+find_threads_cuda_fix()
+target_link_libraries(arbor-private-deps INTERFACE Threads::Threads)
 
 # MPI support
 #-------------------
diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt
index aef98060..ec612bbc 100644
--- a/arbor/CMakeLists.txt
+++ b/arbor/CMakeLists.txt
@@ -40,6 +40,7 @@ set(arbor_sources
     spike_source_cell_group.cpp
     swcio.cpp
     threadinfo.cpp
+    threading/cthread.cpp
     threading/threading.cpp
     thread_private_spike_store.cpp
     util/hostname.cpp
@@ -73,11 +74,6 @@ if(ARB_WITH_MPI)
         communication/mpi_context.cpp)
 endif()
 
-if(ARB_WITH_CTHREAD)
-    list(APPEND arbor_sources
-        threading/cthread.cpp)
-endif()
-
 # Add special target for private include directory, for use by arbor target
 # and arbor unit tests. Private headers are also used for the other binaries
 # until the process of splitting our private and public headers is complete.
diff --git a/arbor/communication/communicator.hpp b/arbor/communication/communicator.hpp
index 78aa7e4c..70b79ad7 100644
--- a/arbor/communication/communicator.hpp
+++ b/arbor/communication/communicator.hpp
@@ -44,10 +44,12 @@ public:
 
     explicit communicator(const recipe& rec,
                           const domain_decomposition& dom_dec,
-                          const distributed_context* ctx)
+                          const execution_context* ctx)
     {
-        context_ = ctx;
-        num_domains_ = context_->size();
+        distributed_ = &ctx->distributed;
+        thread_pool_ = ctx->thread_pool;
+
+        num_domains_ = distributed_->size();
         num_local_groups_ = dom_dec.groups.size();
         num_local_cells_ = dom_dec.num_local_cells;
 
@@ -82,7 +84,7 @@ public:
         // Build the connection information for local cells in parallel.
         std::vector<gid_info> gid_infos;
         gid_infos.resize(num_local_cells_);
-        threading::parallel_for::apply(0, gids.size(),
+        threading::parallel_for::apply(0, gids.size(), thread_pool_.get(),
             [&](cell_size_type i) {
                 auto gid = gids[i];
                 gid_infos[i] = gid_info(gid, i, rec.connections_on(gid));
@@ -125,7 +127,7 @@ public:
         // Sort the connections for each domain.
         // This is num_domains_ independent sorts, so it can be parallelized trivially.
         const auto& cp = connection_part_;
-        threading::parallel_for::apply(0, num_domains_,
+        threading::parallel_for::apply(0, num_domains_, thread_pool_.get(),
             [&](cell_size_type i) {
                 util::sort(util::subrange_view(connections_, cp[i], cp[i+1]));
             });
@@ -144,7 +146,7 @@ public:
             local_min = std::min(local_min, con.delay());
         }
 
-        return context_->min(local_min);
+        return distributed_->min(local_min);
     }
 
     /// Perform exchange of spikes.
@@ -159,7 +161,7 @@ public:
 
         PE(communication_exchange_gather);
         // global all-to-all to gather a local copy of the global spike list on each node.
-        auto global_spikes = context_->gather_spikes(local_spikes);
+        auto global_spikes = distributed_->gather_spikes(local_spikes);
         num_spikes_ += global_spikes.size();
         PL();
 
@@ -259,7 +261,8 @@ private:
     std::vector<cell_size_type> index_divisions_;
     util::partition_view_type<std::vector<cell_size_type>> index_part_;
 
-    const distributed_context* context_;
+    const distributed_context* distributed_;
+    task_system_handle thread_pool_;
     std::uint64_t num_spikes_ = 0u;
 };
 
diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp
index e6a80892..f9198140 100644
--- a/arbor/partition_load_balance.cpp
+++ b/arbor/partition_load_balance.cpp
@@ -1,7 +1,7 @@
-#include <arbor/distributed_context.hpp>
 #include <arbor/domain_decomposition.hpp>
 #include <arbor/load_balance.hpp>
 #include <arbor/recipe.hpp>
+#include <arbor/execution_context.hpp>
 
 #include "cell_group_factory.hpp"
 #include "util/maputil.hpp"
@@ -13,7 +13,7 @@ namespace arb {
 domain_decomposition partition_load_balance(
     const recipe& rec,
     proc_allocation nd,
-    const distributed_context* ctx,
+    const execution_context* ctx,
     partition_hint_map hint_map)
 {
     struct partition_gid_domain {
@@ -31,8 +31,8 @@ domain_decomposition partition_load_balance(
 
     using util::make_span;
 
-    unsigned num_domains = ctx->size();
-    unsigned domain_id = ctx->id();
+    unsigned num_domains = ctx->distributed.size();
+    unsigned domain_id = ctx->distributed.id();
     auto num_global_cells = rec.num_cells();
 
     auto dom_size = [&](unsigned dom) -> cell_gid_type {
diff --git a/arbor/profile/meter_manager.cpp b/arbor/profile/meter_manager.cpp
index 52542035..c5663700 100644
--- a/arbor/profile/meter_manager.cpp
+++ b/arbor/profile/meter_manager.cpp
@@ -1,6 +1,5 @@
 #include <arbor/profile/timer.hpp>
 
-#include <arbor/distributed_context.hpp>
 #include <arbor/profile/meter_manager.hpp>
 
 #include "memory_meter.hpp"
diff --git a/arbor/simulation.cpp b/arbor/simulation.cpp
index f0c90f77..dcef80c4 100644
--- a/arbor/simulation.cpp
+++ b/arbor/simulation.cpp
@@ -37,6 +37,9 @@ public:
     //      current:  spikes generated in the current interval
     //      previous: spikes generated in the preceding interval
 
+    spike_double_buffer(thread_private_spike_store l, thread_private_spike_store r):
+            buffer_(std::move(l), std::move(r)) {}
+
     thread_private_spike_store& current()  { return buffer_.get(); }
     thread_private_spike_store& previous() { return buffer_.other(); }
     void exchange() { buffer_.exchange(); }
@@ -44,7 +47,7 @@ public:
 
 class simulation_state {
 public:
-    simulation_state(const recipe& rec, const domain_decomposition& decomp, const distributed_context* ctx);
+    simulation_state(const recipe& rec, const domain_decomposition& decomp, const execution_context* ctx);
 
     void reset();
 
@@ -96,6 +99,8 @@ private:
 
     communicator communicator_;
 
+    task_system_handle task_system_;
+
     // Pending events to be delivered.
     std::array<std::vector<pse_vector>, 2> event_lanes_;
     std::vector<pse_vector> pending_events_;
@@ -106,7 +111,7 @@ private:
     // Apply a functional to each cell group in parallel.
     template <typename L>
     void foreach_group(L&& fn) {
-        threading::parallel_for::apply(0, cell_groups_.size(),
+        threading::parallel_for::apply(0, cell_groups_.size(), task_system_.get(),
             [&, fn = std::forward<L>(fn)](int i) { fn(cell_groups_[i]); });
     }
 
@@ -114,7 +119,7 @@ private:
     // the cell group pointer reference and index.
     template <typename L>
     void foreach_group_index(L&& fn) {
-        threading::parallel_for::apply(0, cell_groups_.size(),
+        threading::parallel_for::apply(0, cell_groups_.size(), task_system_.get(),
             [&, fn = std::forward<L>(fn)](int i) { fn(cell_groups_[i], i); });
     }
 };
@@ -122,10 +127,12 @@ private:
 simulation_state::simulation_state(
         const recipe& rec,
         const domain_decomposition& decomp,
-        const distributed_context* ctx
+        const execution_context* ctx
     ):
-    local_spikes_(new spike_double_buffer{}),
-    communicator_(rec, decomp, ctx)
+    local_spikes_(new spike_double_buffer(thread_private_spike_store(ctx->thread_pool),
+                                          thread_private_spike_store(ctx->thread_pool))),
+    communicator_(rec, decomp, ctx),
+    task_system_(ctx->thread_pool)
 {
     const auto num_local_cells = communicator_.num_local_cells();
 
@@ -269,7 +276,7 @@ time_type simulation_state::run(time_type tfinal, time_type dt) {
 
         // run the tasks, overlapping if the threading model and number of
         // available threads permits it.
-        threading::task_group g;
+        threading::task_group g(task_system_.get());
         g.run(exchange);
         g.run(update_cells);
         g.wait();
@@ -297,7 +304,7 @@ time_type simulation_state::run(time_type tfinal, time_type dt) {
 //      pending_events    : take all events
 void simulation_state::setup_events(time_type t_from, time_type t_to, std::size_t epoch) {
     const auto n = communicator_.num_local_cells();
-    threading::parallel_for::apply(0, n,
+    threading::parallel_for::apply(0, n, task_system_.get(),
         [&](cell_size_type i) {
             merge_events(
                 t_from, t_to,
@@ -361,7 +368,7 @@ void simulation_state::inject_events(const pse_vector& events) {
 simulation::simulation(
     const recipe& rec,
     const domain_decomposition& decomp,
-    const distributed_context* ctx)
+    const execution_context* ctx)
 {
     impl_.reset(new simulation_state(rec, decomp, ctx));
 }
diff --git a/arbor/thread_private_spike_store.cpp b/arbor/thread_private_spike_store.cpp
index 883d234c..e6a1f92e 100644
--- a/arbor/thread_private_spike_store.cpp
+++ b/arbor/thread_private_spike_store.cpp
@@ -9,11 +9,15 @@ namespace arb {
 
 struct local_spike_store_type {
     threading::enumerable_thread_specific<std::vector<spike>> buffers_;
+
+    local_spike_store_type(const task_system_handle& ts): buffers_(ts) {};
 };
 
-thread_private_spike_store::thread_private_spike_store():
-    impl_(new local_spike_store_type)
-{}
+thread_private_spike_store::thread_private_spike_store(thread_private_spike_store&& t): impl_(std::move(t.impl_)) {};
+
+thread_private_spike_store::thread_private_spike_store(const task_system_handle& ts):
+    impl_(new local_spike_store_type(ts)) {
+}
 
 thread_private_spike_store::~thread_private_spike_store() {}
 
@@ -41,5 +45,4 @@ void thread_private_spike_store::clear() {
         b.clear();
     }
 }
-
 } // namespace arb
diff --git a/arbor/thread_private_spike_store.hpp b/arbor/thread_private_spike_store.hpp
index 9e1829ef..eec50709 100644
--- a/arbor/thread_private_spike_store.hpp
+++ b/arbor/thread_private_spike_store.hpp
@@ -5,6 +5,7 @@
 
 #include <arbor/common_types.hpp>
 #include <arbor/spike.hpp>
+#include <arbor/execution_context.hpp>
 
 #include "threading/threading.hpp"
 
@@ -23,6 +24,9 @@ public :
     thread_private_spike_store();
     ~thread_private_spike_store();
 
+    thread_private_spike_store(thread_private_spike_store&& t);
+    thread_private_spike_store(const task_system_handle& ts);
+
     /// Collate all of the individual buffers into a single vector of spikes.
     /// Does not modify the buffer contents.
     std::vector<spike> gather() const;
diff --git a/arbor/threading/cthread.cpp b/arbor/threading/cthread.cpp
index ae7c9d56..e613beac 100644
--- a/arbor/threading/cthread.cpp
+++ b/arbor/threading/cthread.cpp
@@ -7,6 +7,7 @@
 
 #include "cthread.hpp"
 #include "threading.hpp"
+#include "arbor/execution_context.hpp"
 
 using namespace arb::threading::impl;
 using namespace arb::threading;
@@ -119,15 +120,12 @@ int task_system::get_num_threads() {
     return threads_.size() + 1;
 }
 
-std::size_t task_system::get_current_thread() {
-    std::thread::id tid = std::this_thread::get_id();
-    return thread_ids_[tid];
-}
+std::unordered_map<std::thread::id, std::size_t> task_system::get_thread_ids() {
+    return thread_ids_;
+};
 
-task_system& task_system::get_global_task_system() {
-    auto num_threads = threading::num_threads();
-    static task_system global_task_system(num_threads);
-    return global_task_system;
+task_system_handle arb::make_thread_pool(int nthreads) {
+    return task_system_handle(new task_system(nthreads));
 }
 
 
diff --git a/arbor/threading/cthread_impl.hpp b/arbor/threading/cthread_impl.hpp
index ba4fe009..6899691d 100644
--- a/arbor/threading/cthread_impl.hpp
+++ b/arbor/threading/cthread_impl.hpp
@@ -20,6 +20,7 @@
 #include <type_traits>
 
 #include <cstdlib>
+#include "arbor/execution_context.hpp"
 
 namespace arb {
 namespace threading {
@@ -97,19 +98,17 @@ public:
     // Includes master thread.
     int get_num_threads();
 
-    // Get a stable integer for the current thread that is [0, nthreads).
-    std::size_t get_current_thread();
-
-    // Singleton constructor - needed to order construction with other singletons. TODO
-    static task_system& get_global_task_system();
+    // Returns the thread_id map
+    std::unordered_map<std::thread::id, std::size_t> get_thread_ids();
 };
 
 ///////////////////////////////////////////////////////////////////////
 // types
 ///////////////////////////////////////////////////////////////////////
+
 template <typename T>
 class enumerable_thread_specific {
-    task_system& global_task_system;
+    std::unordered_map<std::thread::id, std::size_t> thread_ids_;
 
     using storage_class = std::vector<T>;
     storage_class data;
@@ -118,21 +117,21 @@ public:
     using iterator = typename storage_class::iterator;
     using const_iterator = typename storage_class::const_iterator;
 
-    enumerable_thread_specific():
-        global_task_system{task_system::get_global_task_system()},
-        data{std::vector<T>(global_task_system.get_num_threads())}
+    enumerable_thread_specific(const task_system_handle& ts):
+        thread_ids_{ts.get()->get_thread_ids()},
+        data{std::vector<T>(ts.get()->get_num_threads())}
     {}
 
-    enumerable_thread_specific(const T& init):
-        global_task_system{task_system::get_global_task_system()},
-        data{std::vector<T>(global_task_system.get_num_threads(), init)}
+    enumerable_thread_specific(const T& init, const task_system_handle& ts):
+        thread_ids_{ts.get()->get_thread_ids()},
+        data{std::vector<T>(ts.get()->get_num_threads(), init)}
     {}
 
     T& local() {
-        return data[global_task_system.get_current_thread()];
+        return data[thread_ids_.at(std::this_thread::get_id())];
     }
     const T& local() const {
-        return data[global_task_system.get_current_thread()];
+        return data[thread_ids_.at(std::this_thread::get_id())];
     }
 
     auto size() const { return data.size(); }
@@ -156,11 +155,13 @@ constexpr bool multithreaded() { return true; }
 class task_group {
 private:
     std::atomic<std::size_t> in_flight_{0};
-    task_system& task_system_;
+    /// We use a raw pointer here instead of a shared_ptr to avoid a race condition
+    /// on the destruction of a task_system that would lead to a thread trying to join itself
+    task_system* task_system_;
 
 public:
-    task_group():
-        task_system_{task_system::get_global_task_system()}
+    task_group(task_system* ts):
+        task_system_{ts}
     {}
 
     task_group(const task_group&) = delete;
@@ -209,14 +210,13 @@ public:
     template<typename F>
     void run(F&& f) {
         ++in_flight_;
-
-        task_system_.async(make_wrapped_function(std::forward<F>(f), in_flight_));
+        task_system_->async(make_wrapped_function(std::forward<F>(f), in_flight_));
     }
 
     // wait till all tasks in this group are done
     void wait() {
         while (in_flight_) {
-            task_system_.try_run_task();
+            task_system_->try_run_task();
         }
     }
 
@@ -231,18 +231,13 @@ public:
 ///////////////////////////////////////////////////////////////////////
 struct parallel_for {
     template <typename F>
-    static void apply(int left, int right, F f) {
-        task_group g;
+    static void apply(int left, int right, task_system* ts, F f) {
+        task_group g(ts);
         for (int i = left; i < right; ++i) {
           g.run([=] {f(i);});
         }
         g.wait();
     }
 };
-
-inline std::size_t thread_id() {
-    return task_system::get_global_task_system().get_current_thread();
-}
-
 } // namespace threading
 } // namespace arb
diff --git a/arbor/threading/serial.hpp b/arbor/threading/serial.hpp
deleted file mode 100644
index af3b0594..00000000
--- a/arbor/threading/serial.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-#pragma once
-
-#include <algorithm>
-#include <array>
-#include <chrono>
-#include <string>
-#include <vector>
-
-namespace arb {
-namespace threading {
-inline namespace serial {
-
-///////////////////////////////////////////////////////////////////////
-// types
-///////////////////////////////////////////////////////////////////////
-template <typename T>
-class enumerable_thread_specific {
-    std::array<T, 1> data;
-
-public :
-    using iterator = typename std::array<T, 1>::iterator;
-    using const_iterator = typename std::array<T, 1>::const_iterator;
-
-    enumerable_thread_specific() = default;
-
-    enumerable_thread_specific(const T& init) :
-        data{init}
-    {}
-
-    enumerable_thread_specific(T&& init) :
-        data{std::move(init)}
-    {}
-
-    T& local() { return data[0]; }
-    const T& local() const { return data[0]; }
-
-    auto size() const { return data.size(); }
-
-    iterator begin() { return data.begin(); }
-    iterator end()   { return data.end(); }
-
-    const_iterator begin() const { return data.begin(); }
-    const_iterator end()   const { return data.end(); }
-
-    const_iterator cbegin() const { return data.cbegin(); }
-    const_iterator cend()   const { return data.cend(); }
-};
-
-
-///////////////////////////////////////////////////////////////////////
-// algorithms
-///////////////////////////////////////////////////////////////////////
-struct parallel_for {
-    template <typename F>
-    static void apply(int left, int right, F f) {
-        for(int i=left; i<right; ++i) {
-            f(i);
-        }
-    }
-};
-
-template <typename RandomIt>
-void sort(RandomIt begin, RandomIt end) {
-    std::sort(begin, end);
-}
-
-template <typename RandomIt, typename Compare>
-void sort(RandomIt begin, RandomIt end, Compare comp) {
-    std::sort(begin, end, comp);
-}
-
-template <typename Container>
-void sort(Container& c) {
-    std::sort(c.begin(), c.end());
-}
-
-template <typename T>
-using parallel_vector = std::vector<T>;
-
-inline std::string description() {
-    return "serial";
-}
-
-constexpr bool multithreaded() { return false; }
-
-inline std::size_t thread_id() {
-    return 0;
-}
-
-/// Proxy for tbb task group.
-/// The tbb version launches tasks asynchronously, returning control to the
-/// caller. The serial version implemented here simply runs the task, before
-/// returning control, effectively serializing all asynchronous calls.
-class task_group {
-public:
-    task_group() = default;
-
-    template<typename Func>
-    void run(const Func& f) {
-        f();
-    }
-
-    template<typename Func>
-    void run_and_wait(const Func& f) {
-        f();
-    }
-
-    void wait()
-    {}
-
-    bool is_canceling() {
-        return false;
-    }
-
-    void cancel()
-    {}
-};
-
-} // namespace serial
-} // namespace threading
-} // namespace arb
-
diff --git a/arbor/threading/tbb.hpp b/arbor/threading/tbb.hpp
deleted file mode 100644
index b4c2af94..00000000
--- a/arbor/threading/tbb.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-#pragma once
-
-#include <atomic>
-#include <string>
-
-#include <tbb/tbb.h>
-#include <tbb/tbb_stddef.h>
-#include <tbb/compat/thread>
-#include <tbb/enumerable_thread_specific.h>
-
-namespace arb {
-namespace threading {
-inline namespace tbb {
-
-template <typename T>
-using enumerable_thread_specific = ::tbb::enumerable_thread_specific<T>;
-
-struct parallel_for {
-    template <typename F>
-    static void apply(int left, int right, F f) {
-        ::tbb::parallel_for(left, right, f);
-    }
-};
-
-inline std::string description() {
-    return "TBBv" + std::to_string(::tbb::TBB_runtime_interface_version());
-}
-
-constexpr bool multithreaded() { return true; }
-
-template <typename T>
-using parallel_vector = ::tbb::concurrent_vector<T>;
-
-using task_group = ::tbb::task_group;
-
-inline
-std::size_t thread_id() {
-    static std::atomic<std::size_t> num_threads(0);
-    thread_local std::size_t thread_id = num_threads++;
-    return thread_id;
-}
-
-template <typename RandomIt>
-void sort(RandomIt begin, RandomIt end) {
-    ::tbb::parallel_sort(begin, end);
-}
-
-template <typename RandomIt, typename Compare>
-void sort(RandomIt begin, RandomIt end, Compare comp) {
-    ::tbb::parallel_sort(begin, end, comp);
-}
-
-template <typename Container>
-void sort(Container& c) {
-    ::tbb::parallel_sort(c.begin(), c.end());
-}
-
-} // namespace tbb
-} // namespace threading
-} // namespace arb
-
diff --git a/arbor/threading/threading.cpp b/arbor/threading/threading.cpp
index a05bf1bd..54857415 100644
--- a/arbor/threading/threading.cpp
+++ b/arbor/threading/threading.cpp
@@ -78,12 +78,8 @@ std::size_t num_threads_init() {
 //      number of threads.
 size_t num_threads() {
     // TODO: this is a bit of a hack until we have user-configurable threading.
-#if defined(ARB_HAVE_SERIAL)
-    return 1;
-#else
     static size_t num_threads_cached = num_threads_init();
     return num_threads_cached;
-#endif
 }
 
 } // namespace threading
diff --git a/arbor/threading/threading.hpp b/arbor/threading/threading.hpp
index 8150ca6a..d6fe7b93 100644
--- a/arbor/threading/threading.hpp
+++ b/arbor/threading/threading.hpp
@@ -25,17 +25,4 @@ size_t num_threads();
 } // namespace threading
 } // namespace arb
 
-#if defined(ARB_HAVE_TBB)
-
-#include "tbb.hpp"
-
-#elif defined(ARB_HAVE_CTHREAD)
-
 #include "cthread.hpp"
-
-#else
-
-#define ARB_HAVE_SERIAL
-#include "serial.hpp"
-
-#endif
diff --git a/arbor/util/double_buffer.hpp b/arbor/util/double_buffer.hpp
index f4520098..67afecde 100644
--- a/arbor/util/double_buffer.hpp
+++ b/arbor/util/double_buffer.hpp
@@ -4,6 +4,7 @@
 #include <atomic>
 
 #include <arbor/assert.hpp>
+#include <arbor/execution_context.hpp>
 
 namespace arb {
 namespace util {
@@ -13,7 +14,7 @@ template <typename T>
 class double_buffer {
 private:
     std::atomic<int> index_;
-    std::array<T, 2> buffers_;
+    std::vector<T> buffers_;
 
     int other_index() {
         return index_ ? 0 : 1;
@@ -23,9 +24,15 @@ public:
     using value_type = T;
 
     double_buffer() :
-        index_(0)
+        index_(0), buffers_(2)
     {}
 
+    double_buffer(T l, T r): index_(0) {
+        buffers_.reserve(2);
+        buffers_.push_back(std::move(l));
+        buffers_.push_back(std::move(r));
+    }
+
     /// remove the copy and move constructors which won't work with std::atomic
     double_buffer(double_buffer&&) = delete;
     double_buffer(const double_buffer&) = delete;
diff --git a/arbor/util/range.hpp b/arbor/util/range.hpp
index fbec6108..3f692016 100644
--- a/arbor/util/range.hpp
+++ b/arbor/util/range.hpp
@@ -26,10 +26,6 @@
 #include <type_traits>
 #include <utility>
 
-#ifdef ARB_HAVE_TBB
-#include <tbb/tbb_stddef.h>
-#endif
-
 #include <arbor/assert.hpp>
 
 #include <util/counter.hpp>
@@ -133,42 +129,6 @@ struct range {
     data() const {
         return left;
     }
-
-#ifdef ARB_HAVE_TBB
-    template <
-        typename V = iterator,
-        typename = std::enable_if_t<is_forward_iterator<V>::value>
-    >
-    range(range& r, tbb::split):
-        left(r.left), right(r.right)
-    {
-        std::advance(left, r.size()/2u);
-        r.right = left;
-    }
-
-    template <
-        typename V = iterator,
-        typename = std::enable_if_t<is_forward_iterator<V>::value>
-    >
-    range(range& r, tbb::proportional_split p):
-        left(r.left), right(r.right)
-    {
-        size_type i = (r.size()*p.left())/(p.left()+p.right());
-        if (i<1) {
-            i = 1;
-        }
-        std::advance(left, i);
-        r.right = left;
-    }
-
-    bool is_divisible() const {
-        return is_forward_iterator<U>::value && left != right && std::next(left) != right;
-    }
-
-    static constexpr bool is_splittable_in_proportion() {
-        return is_forward_iterator<U>::value;
-    }
-#endif
 };
 
 template <typename U, typename V>
diff --git a/cmake/FindTBB.cmake b/cmake/FindTBB.cmake
deleted file mode 100644
index 1b4b85fa..00000000
--- a/cmake/FindTBB.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Find the Intel Thread Building Blocks library
-#
-# Sets the following variables:
-#
-#  TBB_FOUND               - True if libtbb and libtbb_malloc found.
-#  TBB_LIBRARIES           - Paths to libtbb and libtbbmalloc.
-#  TBB_INCLUDE_DIR         - Base directory for tbb/ includes.
-# 
-# Generates the import library target TBB:tbb if found.
-#
-# The default search path can be overriden by setting the
-# CMake variable TBB_ROOT_DIR or the environment variables
-# TBBROOT or TBB_ROOT.
-
-if(NOT TBB_FOUND)
-    find_package(Threads REQUIRED)
-
-    set(_tbb_search_path ${TBB_ROOT_DIR} $ENV{TBBROOT} $ENV{TBB_ROOT})
-    set(_tbb_lib_suffixes lib/intel64/gcc4.7 lib/intel64/gcc4.4 lib/gcc4.7 lib/gcc4.4 lib/android lib/mic lib)
-
-    macro(_tbb_findlib libname)
-        find_library(_lib${libname} ${libname}
-            PATHS ${_tbb_search_path} NO_DEFAULT_PATH
-            PATH_SUFFIXES ${_tbb_lib_suffixes})
-        find_library(_lib${libname} ${libname}
-            PATH_SUFFIXES ${_tbb_lib_suffixes})
-    endmacro()
-
-    _tbb_findlib(tbb)
-    _tbb_findlib(tbbmalloc)
-
-    find_path(TBB_INCLUDE_DIR tbb/tbb.h PATHS ${_tbb_search_path} NO_DEFAULT_PATH PATH_SUFFIXES include)
-    find_path(TBB_INCLUDE_DIR tbb/tbb.h)
-
-    include(FindPackageHandleStandardArgs)
-    find_package_handle_standard_args(TBB DEFAULT_MSG TBB_INCLUDE_DIR _libtbb _libtbbmalloc)
-
-    if(TBB_FOUND)
-        set(TBB_INCLUDE_DIRS ${TBB_INCLUDE_DIR})
-        set(TBB_LIBRARIES ${_libtbb} ${_libtbbmalloc})
-        if(NOT TARGET TBB::tbb)
-            if("${_libtbb}" MATCHES "\.a$")
-                add_library(TBB::tbb STATIC IMPORTED GLOBAL)
-            else()
-                add_library(TBB::tbb SHARED IMPORTED GLOBAL)
-            endif()
-            set_target_properties(TBB::tbb PROPERTIES
-                    IMPORTED_LOCATION "${_libtbb}"
-                    INTERFACE_LINK_LIBRARIES "${_libtbbmalloc}" Threads::Threads ${CMAKE_DL_LIBS}
-                    INTERFACE_INCLUDE_DIRECTORIES "${TBB_INCLUDE_DIR}"
-            )
-        endif()
-    endif()
-    mark_as_advanced(TBB_INCLUDE_DIR)
-
-    unset(_tbb_search_path)
-    unset(_tbb_lib_suffixes)
-    unset(_libtbb)
-    unset(_libtbbmalloc)
-endif()
diff --git a/example/bench/bench.cpp b/example/bench/bench.cpp
index 40414176..52e11c20 100644
--- a/example/bench/bench.cpp
+++ b/example/bench/bench.cpp
@@ -10,12 +10,14 @@
 
 #include <arbor/profile/meter_manager.hpp>
 #include <arbor/common_types.hpp>
-#include <arbor/distributed_context.hpp>
+#include <arbor/execution_context.hpp>
 #include <arbor/domain_decomposition.hpp>
 #include <arbor/load_balance.hpp>
 #include <arbor/profile/profiler.hpp>
 #include <arbor/recipe.hpp>
 #include <arbor/simulation.hpp>
+#include <arbor/threadinfo.hpp>
+
 
 #include <aux/ioutil.hpp>
 #include <aux/json_meter.hpp>
@@ -30,12 +32,12 @@ namespace profile = arb::profile;
 
 int main(int argc, char** argv) {
     try {
-        arb::distributed_context context;
+        arb::execution_context context;
 #ifdef ARB_HAVE_MPI
         aux::with_mpi guard(&argc, &argv);
-        context = mpi_context(MPI_COMM_WORLD);
+        context.distributed = mpi_context(MPI_COMM_WORLD);
 #endif
-        const bool is_root =  context.id()==0;
+        const bool is_root =  context.distributed.id()==0;
 
         std::cout << aux::mask_stream(is_root);
 
@@ -43,7 +45,7 @@ int main(int argc, char** argv) {
 
         std::cout << params << "\n";
 
-        profile::meter_manager meters(&context);
+        profile::meter_manager meters(&context.distributed);
         meters.start();
 
         // Create an instance of our recipe.
diff --git a/example/brunel/brunel_miniapp.cpp b/example/brunel/brunel_miniapp.cpp
index fb1195f1..463be264 100644
--- a/example/brunel/brunel_miniapp.cpp
+++ b/example/brunel/brunel_miniapp.cpp
@@ -32,7 +32,7 @@
 
 using namespace arb;
 
-void banner(proc_allocation, const distributed_context*);
+void banner(proc_allocation, const execution_context*);
 
 // Samples m unique values in interval [start, end) - gid.
 // We exclude gid because we don't want self-loops.
@@ -187,18 +187,18 @@ private:
 };
 
 int main(int argc, char** argv) {
-    distributed_context context;
+    execution_context context;
 
     try {
 #ifdef ARB_MPI_ENABLED
         with_mpi guard(argc, argv, false);
-        context = mpi_context(MPI_COMM_WORLD);
+        context.distributed = mpi_context(MPI_COMM_WORLD);
 #endif
-        arb::profile::meter_manager meters(&context);
+        arb::profile::meter_manager meters(&context.distributed);
         meters.start();
-        std::cout << aux::mask_stream(context.id()==0);
+        std::cout << aux::mask_stream(context.distributed.id()==0);
         // read parameters
-        io::cl_options options = io::read_options(argc, argv, context.id()==0);
+        io::cl_options options = io::read_options(argc, argv, context.distributed.id()==0);
         proc_allocation nd = local_allocation();
         banner(nd, &context);
 
@@ -246,7 +246,7 @@ int main(int argc, char** argv) {
         if (options.spike_file_output) {
             using std::ios_base;
 
-            auto rank = context.id();
+            auto rank = context.distributed.id();
             aux::path p = options.output_path;
             p /= aux::strsub("%_%.%", options.file_name, rank, options.file_extension);
 
@@ -273,7 +273,7 @@ int main(int argc, char** argv) {
 
         auto report = profile::make_meter_report(meters);
         std::cout << report;
-        if (context.id()==0) {
+        if (context.distributed.id()==0) {
             std::ofstream fid;
             fid.exceptions(std::ios_base::badbit | std::ios_base::failbit);
             fid.open("meters.json");
@@ -282,7 +282,7 @@ int main(int argc, char** argv) {
     }
     catch (io::usage_error& e) {
         // only print usage/startup errors on master
-        std::cerr << aux::mask_stream(context.id()==0);
+        std::cerr << aux::mask_stream(context.distributed.id()==0);
         std::cerr << e.what() << "\n";
         return 1;
     }
@@ -293,11 +293,11 @@ int main(int argc, char** argv) {
     return 0;
 }
 
-void banner(proc_allocation nd, const distributed_context* ctx) {
+void banner(proc_allocation nd, const execution_context* ctx) {
     std::cout << "==========================================\n";
     std::cout << "  Arbor miniapp\n";
-    std::cout << "  - distributed : " << ctx->size()
-              << " (" << ctx->name() << ")\n";
+    std::cout << "  - distributed : " << ctx->distributed.size()
+              << " (" << ctx->distributed.name() << ")\n";
     std::cout << "  - threads     : " << nd.num_threads
               << " (" << arb::thread_implementation() << ")\n";
     std::cout << "  - gpus        : " << nd.num_gpus << "\n";
diff --git a/example/generators/event_gen.cpp b/example/generators/event_gen.cpp
index f3c577c3..fd7423d4 100644
--- a/example/generators/event_gen.cpp
+++ b/example/generators/event_gen.cpp
@@ -128,7 +128,7 @@ int main() {
     // A distributed_context is required for distributed computation (e.g. MPI).
     // For this simple one-cell example, non-distributed context is suitable,
     // which is what we get with a default-constructed distributed_context.
-    arb::distributed_context context;
+    arb::execution_context context;
 
     // Create an instance of our recipe.
     generator_recipe recipe;
diff --git a/example/miniapp/miniapp.cpp b/example/miniapp/miniapp.cpp
index e384a015..edbc220f 100644
--- a/example/miniapp/miniapp.cpp
+++ b/example/miniapp/miniapp.cpp
@@ -6,7 +6,7 @@
 #include <vector>
 
 #include <arbor/common_types.hpp>
-#include <arbor/distributed_context.hpp>
+#include <arbor/execution_context.hpp>
 #include <arbor/load_balance.hpp>
 #include <arbor/mc_cell.hpp>
 #include <arbor/profile/meter_manager.hpp>
@@ -36,7 +36,7 @@ using namespace arb;
 
 using util::any_cast;
 
-void banner(proc_allocation, const distributed_context*);
+void banner(proc_allocation, const execution_context*);
 std::unique_ptr<recipe> make_recipe(const io::cl_options&, const probe_distribution&);
 sample_trace make_trace(const probe_info& probe);
 std::fstream& open_or_throw(std::fstream& file, const aux::path& p, bool exclusive = false);
@@ -44,20 +44,20 @@ void report_compartment_stats(const recipe&);
 
 int main(int argc, char** argv) {
     // default serial context
-    distributed_context context;
+    execution_context context;
 
     try {
 #ifdef ARB_MPI_ENABLED
         with_mpi guard(argc, argv, false);
-        context = mpi_context(MPI_COMM_WORLD);
+        context.distributed = mpi_context(MPI_COMM_WORLD);
 #endif
 
-        profile::meter_manager meters(&context);
+        profile::meter_manager meters(&context.distributed);
         meters.start();
 
-        std::cout << aux::mask_stream(context.id()==0);
+        std::cout << aux::mask_stream(context.distributed.id()==0);
         // read parameters
-        io::cl_options options = io::read_options(argc, argv, context.id()==0);
+        io::cl_options options = io::read_options(argc, argv, context.distributed.id()==0);
 
         // TODO: add dry run mode
 
@@ -117,7 +117,7 @@ int main(int argc, char** argv) {
         if (options.spike_file_output) {
             using std::ios_base;
 
-            auto rank = context.id();
+            auto rank = context.distributed.id();
             aux::path p = options.output_path;
             p /= aux::strsub("%_%.%", options.file_name, rank, options.file_extension);
 
@@ -151,7 +151,7 @@ int main(int argc, char** argv) {
 
         auto report = profile::make_meter_report(meters);
         std::cout << report;
-        if (context.id()==0) {
+        if (context.distributed.id()==0) {
             std::ofstream fid;
             fid.exceptions(std::ios_base::badbit | std::ios_base::failbit);
             fid.open("meters.json");
@@ -160,7 +160,7 @@ int main(int argc, char** argv) {
     }
     catch (io::usage_error& e) {
         // only print usage/startup errors on master
-        std::cerr << aux::mask_stream(context.id()==0);
+        std::cerr << aux::mask_stream(context.distributed.id()==0);
         std::cerr << e.what() << "\n";
         return 1;
     }
@@ -171,11 +171,11 @@ int main(int argc, char** argv) {
     return 0;
 }
 
-void banner(proc_allocation nd, const distributed_context* ctx) {
+void banner(proc_allocation nd, const execution_context* ctx) {
     std::cout << "==========================================\n";
     std::cout << "  Arbor miniapp\n";
-    std::cout << "  - distributed : " << ctx->size()
-              << " (" << ctx->name() << ")\n";
+    std::cout << "  - distributed : " << ctx->distributed.size()
+              << " (" << ctx->distributed.name() << ")\n";
     std::cout << "  - threads     : " << nd.num_threads
               << " (" << arb::thread_implementation() << ")\n";
     std::cout << "  - gpus        : " << nd.num_gpus << "\n";
diff --git a/ext/CMakeLists.txt b/ext/CMakeLists.txt
index 05f93fa1..49aa903e 100644
--- a/ext/CMakeLists.txt
+++ b/ext/CMakeLists.txt
@@ -8,40 +8,6 @@ target_include_directories(ext-json INTERFACE json/single_include)
 add_library(ext-tclap INTERFACE)
 target_include_directories(ext-tclap INTERFACE tclap/include)
 
-# Intel TBB:
-# Alias system TBB or build locally and export that, according
-# to ARB_PRIVATE_TBBLIB setting.
-
-find_package(TBB)
-if(ARB_PRIVATE_TBBLIB OR NOT TBB_FOUND)
-    check_git_submodule(tbb tbb)
-    add_target_if(tbb_avail check-tbb-submodule "Checking TBB submodule" "TBB git submodule required")
-
-    # Turn off proxy malloc library and test compilation.
-    option(TBB_BUILD_TBBMALLOC_PROXY "" OFF)
-    option(TBB_BUILD_TESTS           "" OFF)
-    # Only make static libraries.
-    option(TBB_BUILD_SHARED          "" OFF)
-    option(TBB_BUILD_STATIC          "" ON)
-
-    add_subdirectory(tbb EXCLUDE_FROM_ALL)
-
-    add_library(ext-tbb INTERFACE)
-    add_dependencies(ext-tbb check-tbb-submodule)
-    target_link_libraries(ext-tbb INTERFACE tbb_static tbbmalloc_static)
-    target_include_directories(ext-tbb SYSTEM INTERFACE tbb/include)
-
-    # Can't use install(TARGETS) because 1) tbb targets are defined in
-    # a subdirectory and 2) having been excluded-from-all, the behaviour
-    # might have been undefined anyway. Seriously.
-
-    install(FILES "$<TARGET_FILE:tbb_static>" "$<TARGET_FILE:tbbmalloc_static>" DESTINATION ${CMAKE_INSTALL_LIBDIR} OPTIONAL)
-else()
-    add_library(ext-tbb INTERFACE)
-    target_link_libraries(ext-tbb INTERFACE TBB::tbb)
-endif()
-
-
 # Google benchmark for microbenchmarks:
 
 check_git_submodule(gbench google-benchmark)
diff --git a/ext/tbb b/ext/tbb
deleted file mode 160000
index a0dc9bf7..00000000
--- a/ext/tbb
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit a0dc9bf76d0120f917b641ed095360448cabc85b
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 9476c74c..23e4a59b 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -38,11 +38,6 @@ endif()
 if(ARB_WITH_PROFILING)
     list(APPEND arb_features PROFILE)
 endif()
-if(ARB_WITH_TBB)
-    list(APPEND arb_features TBB)
-elseif(ARB_WITH_CTHREAD)
-    list(APPEND arb_features CTHREAD)
-endif()
 
 add_custom_command(
     OUTPUT version.hpp-test
diff --git a/include/arbor/execution_context.hpp b/include/arbor/execution_context.hpp
new file mode 100644
index 00000000..fac08b82
--- /dev/null
+++ b/include/arbor/execution_context.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include <arbor/domain_decomposition.hpp>
+#include <arbor/distributed_context.hpp>
+#include <arbor/util/pp_util.hpp>
+#include <arbor/threadinfo.hpp>
+
+
+namespace arb {
+namespace threading {
+    class task_system;
+}
+using task_system_handle = std::shared_ptr<threading::task_system>;
+
+task_system_handle make_thread_pool (int nthreads);
+
+struct execution_context {
+    // TODO: use a shared_ptr for distributed_context
+    distributed_context distributed;
+    task_system_handle thread_pool;
+
+    execution_context(): thread_pool(arb::make_thread_pool(arb::num_threads())) {};
+    execution_context(proc_allocation nd): thread_pool(arb::make_thread_pool(nd.num_threads)) {};
+};
+
+}
diff --git a/include/arbor/load_balance.hpp b/include/arbor/load_balance.hpp
index 8235da03..5866f4a8 100644
--- a/include/arbor/load_balance.hpp
+++ b/include/arbor/load_balance.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <arbor/distributed_context.hpp>
+#include <arbor/execution_context.hpp>
 #include <arbor/domain_decomposition.hpp>
 #include <arbor/recipe.hpp>
 
@@ -19,7 +19,7 @@ using partition_hint_map = std::unordered_map<cell_kind, partition_hint>;
 domain_decomposition partition_load_balance(
     const recipe& rec,
     proc_allocation nd,
-    const distributed_context* ctx,
+    const execution_context* ctx,
     partition_hint_map hint_map = {});
 
 } // namespace arb
diff --git a/include/arbor/simulation.hpp b/include/arbor/simulation.hpp
index 844380c6..d652108e 100644
--- a/include/arbor/simulation.hpp
+++ b/include/arbor/simulation.hpp
@@ -6,7 +6,7 @@
 #include <vector>
 
 #include <arbor/common_types.hpp>
-#include <arbor/distributed_context.hpp>
+#include <arbor/execution_context.hpp>
 #include <arbor/domain_decomposition.hpp>
 #include <arbor/recipe.hpp>
 #include <arbor/sampling.hpp>
@@ -22,7 +22,7 @@ class simulation_state;
 
 class simulation {
 public:
-    simulation(const recipe& rec, const domain_decomposition& decomp, const distributed_context* ctx);
+    simulation(const recipe& rec, const domain_decomposition& decomp, const execution_context* ctx);
 
     void reset();
 
diff --git a/scripts/travis/build.sh b/scripts/travis/build.sh
index 3a6e3dbc..583daec5 100755
--- a/scripts/travis/build.sh
+++ b/scripts/travis/build.sh
@@ -46,7 +46,7 @@ cd $build_path
 #
 progress "Configuring with cmake"
 
-cmake_flags="-DARB_WITH_ASSERTIONS=on -DARB_THREADING_MODEL=${WITH_THREAD} -DARB_WITH_MPI=${WITH_MPI} ${CXX_FLAGS}"
+cmake_flags="-DARB_WITH_ASSERTIONS=on -DARB_WITH_MPI=${WITH_MPI} ${CXX_FLAGS}"
 echo "cmake flags: ${cmake_flags}"
 cmake .. ${cmake_flags} || error "unable to configure cmake"
 
diff --git a/test/ubench/task_system.cpp b/test/ubench/task_system.cpp
index 17f5486b..e0a7303d 100644
--- a/test/ubench/task_system.cpp
+++ b/test/ubench/task_system.cpp
@@ -7,24 +7,19 @@
 #include <thread>
 
 #include <arbor/threadinfo.hpp>
-
 #include <arbor/version.hpp>
-#if defined(ARB_TBB_ENABLED)
-    #include "threading/tbb.hpp"
-#elif defined(ARB_CTHREAD_ENABLED)
-    #include "threading/cthread.hpp"
-#else
-    #include "threading/serial.hpp"
-#endif
+
+#include "threading/cthread.hpp"
 
 #include <benchmark/benchmark.h>
 
 using namespace arb;
 
 void run(unsigned long us_per_task, unsigned tasks) {
+    arb::threading::task_system ts(arb::num_threads());
     auto duration = std::chrono::microseconds(us_per_task);
     arb::threading::parallel_for::apply(
-            0, tasks,
+            0, tasks, &ts,
             [&](unsigned i){std::this_thread::sleep_for(duration);});
 }
 
diff --git a/test/unit-distributed/test.cpp b/test/unit-distributed/test.cpp
index f5be0bdd..4db82042 100644
--- a/test/unit-distributed/test.cpp
+++ b/test/unit-distributed/test.cpp
@@ -5,7 +5,8 @@
 
 #include "../gtest.h"
 
-#include <arbor/distributed_context.hpp>
+#include <arbor/execution_context.hpp>
+#include "arbor/threadinfo.hpp"
 
 #include <aux/ioutil.hpp>
 #include <aux/tinyopt.hpp>
@@ -17,7 +18,7 @@
 
 using namespace arb;
 
-distributed_context g_context;
+execution_context g_context;
 
 const char* usage_str =
 "[OPTION]...\n"
@@ -28,9 +29,9 @@ const char* usage_str =
 int main(int argc, char **argv) {
 #ifdef TEST_MPI
     with_mpi guard(argc, argv, false);
-    g_context = mpi_context(MPI_COMM_WORLD);
+    g_context.distributed = mpi_context(MPI_COMM_WORLD);
 #elif defined(TEST_LOCAL)
-    g_context = local_context();
+    g_context.distributed = local_context();
 #else
 #error "define TEST_MPI or TEST_LOCAL for distributed test"
 #endif
@@ -42,7 +43,7 @@ int main(int argc, char **argv) {
     auto& listeners = testing::UnitTest::GetInstance()->listeners();
     // replace original printer with our custom printer
     delete listeners.Release(listeners.default_result_printer());
-    listeners.Append(new distributed_listener("run_"+g_context.name(), &g_context));
+    listeners.Append(new distributed_listener("run_"+g_context.distributed.name(), &g_context.distributed));
 
     int return_value = 0;
     try {
@@ -84,5 +85,5 @@ int main(int argc, char **argv) {
 
     // perform global collective, to ensure that all ranks return
     // the same exit code
-    return g_context.max(return_value);
+    return g_context.distributed.max(return_value);
 }
diff --git a/test/unit-distributed/test.hpp b/test/unit-distributed/test.hpp
index b7d4679a..630bd188 100644
--- a/test/unit-distributed/test.hpp
+++ b/test/unit-distributed/test.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <arbor/distributed_context.hpp>
+#include <arbor/execution_context.hpp>
 
 // Global context is a global variable, set in the main() funtion of the main
 // test driver test.cpp.
-extern arb::distributed_context g_context;
+extern arb::execution_context g_context;
diff --git a/test/unit-distributed/test_communicator.cpp b/test/unit-distributed/test_communicator.cpp
index 6e2907d2..4e7daef3 100644
--- a/test/unit-distributed/test_communicator.cpp
+++ b/test/unit-distributed/test_communicator.cpp
@@ -8,6 +8,7 @@
 #include <arbor/domain_decomposition.hpp>
 #include <arbor/load_balance.hpp>
 #include <arbor/spike_event.hpp>
+#include <threading/cthread.hpp>
 
 #include "communication/communicator.hpp"
 #include "util/filter.hpp"
@@ -22,12 +23,12 @@ static bool is_dry_run() {
 }
 
 TEST(communicator, policy_basics) {
-    const auto num_domains = g_context.size();
-    const auto rank = g_context.id();
+    const auto num_domains = g_context.distributed.size();
+    const auto rank = g_context.distributed.id();
 
-    EXPECT_EQ(g_context.min(rank), 0);
+    EXPECT_EQ(g_context.distributed.min(rank), 0);
     if (!is_dry_run()) {
-        EXPECT_EQ(g_context.max(rank), num_domains-1);
+        EXPECT_EQ(g_context.distributed.max(rank), num_domains-1);
     }
 }
 
@@ -51,8 +52,8 @@ int get_value(const arb::spike& s) {
 // Test low level spike_gather function when each domain produces the same
 // number of spikes in the pattern used by dry run mode.
 TEST(communicator, gather_spikes_equal) {
-    const auto num_domains = g_context.size();
-    const auto rank = g_context.id();
+    const auto num_domains = g_context.distributed.size();
+    const auto rank = g_context.distributed.id();
 
     const auto n_local_spikes = 10;
 
@@ -71,7 +72,7 @@ TEST(communicator, gather_spikes_equal) {
     }
 
     // Perform exchange
-    const auto global_spikes = g_context.gather_spikes(local_spikes);
+    const auto global_spikes = g_context.distributed.gather_spikes(local_spikes);
 
     // Test that partition information is correct
     const auto& part = global_spikes.partition();
@@ -91,7 +92,7 @@ TEST(communicator, gather_spikes_equal) {
     // is a list of num_domains*n_local_spikes spikes that have
     // contiguous source gid
     const auto& spikes = global_spikes.values();
-    EXPECT_EQ(n_local_spikes*g_context.size(), int(spikes.size()));
+    EXPECT_EQ(n_local_spikes*g_context.distributed.size(), int(spikes.size()));
     for (auto i=0u; i<spikes.size(); ++i) {
         const auto s = spikes[i];
         EXPECT_EQ(i, unsigned(s.source.gid));
@@ -112,8 +113,8 @@ TEST(communicator, gather_spikes_variant) {
     // number of spikes.
     if (is_dry_run()) return;
 
-    const auto num_domains = g_context.size();
-    const auto rank = g_context.id();
+    const auto num_domains = g_context.distributed.size();
+    const auto rank = g_context.distributed.id();
 
     // Parameter used to scale the number of spikes generated on successive
     // ranks.
@@ -137,7 +138,7 @@ TEST(communicator, gather_spikes_variant) {
     }
 
     // Perform exchange
-    const auto global_spikes = g_context.gather_spikes(local_spikes);
+    const auto global_spikes = g_context.distributed.gather_spikes(local_spikes);
 
     // Test that partition information is correct
     const auto& part =global_spikes.partition();
@@ -167,7 +168,7 @@ namespace {
     public:
         ring_recipe(cell_size_type s):
             size_(s),
-            ranks_(g_context.size())
+            ranks_(g_context.distributed.size())
         {}
 
         cell_size_type num_cells() const override {
@@ -231,7 +232,7 @@ namespace {
     public:
         all2all_recipe(cell_size_type s):
             size_(s),
-            ranks_(g_context.size())
+            ranks_(g_context.distributed.size())
         {}
 
         cell_size_type num_cells() const override {
@@ -314,10 +315,10 @@ test_ring(const domain_decomposition& D, communicator& C, F&& f) {
 
     // gather the global set of spikes
     auto global_spikes = C.exchange(local_spikes);
-    if (global_spikes.size()!=g_context.sum(local_spikes.size())) {
+    if (global_spikes.size()!=g_context.distributed.sum(local_spikes.size())) {
         return ::testing::AssertionFailure() << "the number of gathered spikes "
             << global_spikes.size() << " doesn't match the expected "
-            << g_context.sum(local_spikes.size());
+            << g_context.distributed.sum(local_spikes.size());
     }
 
     // generate the events
@@ -363,7 +364,7 @@ TEST(communicator, ring)
     using util::make_span;
 
     // construct a homogeneous network of 10*n_domain identical cells in a ring
-    unsigned N = g_context.size();
+    unsigned N = g_context.distributed.size();
 
     unsigned n_local = 10u;
     unsigned n_global = n_local*N;
@@ -405,10 +406,10 @@ test_all2all(const domain_decomposition& D, communicator& C, F&& f) {
 
     // gather the global set of spikes
     auto global_spikes = C.exchange(local_spikes);
-    if (global_spikes.size()!=g_context.sum(local_spikes.size())) {
+    if (global_spikes.size()!=g_context.distributed.sum(local_spikes.size())) {
         return ::testing::AssertionFailure() << "the number of gathered spikes "
             << global_spikes.size() << " doesn't match the expected "
-            << g_context.sum(local_spikes.size());
+            << g_context.distributed.sum(local_spikes.size());
     }
 
     // generate the events
@@ -458,7 +459,7 @@ TEST(communicator, all2all)
     using util::make_span;
 
     // construct a homogeneous network of 10*n_domain identical cells in a ring
-    unsigned N = g_context.size();
+    unsigned N = g_context.distributed.size();
 
     unsigned n_local = 10u;
     unsigned n_global = n_local*N;
diff --git a/test/unit-distributed/test_domain_decomposition.cpp b/test/unit-distributed/test_domain_decomposition.cpp
index bade370f..db3c1f1b 100644
--- a/test/unit-distributed/test_domain_decomposition.cpp
+++ b/test/unit-distributed/test_domain_decomposition.cpp
@@ -65,8 +65,8 @@ namespace {
 }
 
 TEST(domain_decomposition, homogeneous_population) {
-    const auto N = g_context.size();
-    const auto I = g_context.id();
+    const auto N = g_context.distributed.size();
+    const auto I = g_context.distributed.id();
 
     {   // Test on a node with 1 cpu core and no gpus.
         // We assume that all cells will be put into cell groups of size 1.
@@ -134,8 +134,8 @@ TEST(domain_decomposition, homogeneous_population) {
 }
 
 TEST(domain_decomposition, heterogeneous_population) {
-    const auto N = g_context.size();
-    const auto I = g_context.id();
+    const auto N = g_context.distributed.size();
+    const auto I = g_context.distributed.id();
 
     {   // Test on a node with 1 cpu core and no gpus.
         // We assume that all cells will be put into cell groups of size 1.
diff --git a/test/unit/test_algorithms.cpp b/test/unit/test_algorithms.cpp
index 8a2aef6d..a238565e 100644
--- a/test/unit/test_algorithms.cpp
+++ b/test/unit/test_algorithms.cpp
@@ -12,18 +12,11 @@
 
 // (Pending abstraction of threading interface)
 #include <arbor/version.hpp>
-#if defined(ARB_TBB_ENABLED)
-    #include "threading/tbb.hpp"
-#elif defined(ARB_CTHREAD_ENABLED)
-    #include "threading/cthread.hpp"
-#else
-    #include "threading/serial.hpp"
-#endif
-
+#include "threading/cthread.hpp"
 #include "common.hpp"
 
 /// tests the sort implementation in threading
-/// is only parallel if TBB is being used
+/// Not parallel
 TEST(algorithms, parallel_sort)
 {
     auto n = 10000;
diff --git a/test/unit/test_domain_decomposition.cpp b/test/unit/test_domain_decomposition.cpp
index effb760b..1e630d97 100644
--- a/test/unit/test_domain_decomposition.cpp
+++ b/test/unit/test_domain_decomposition.cpp
@@ -2,7 +2,7 @@
 
 #include <stdexcept>
 
-#include <arbor/distributed_context.hpp>
+#include <arbor/execution_context.hpp>
 #include <arbor/domain_decomposition.hpp>
 #include <arbor/load_balance.hpp>
 
@@ -48,7 +48,7 @@ namespace {
 // test assumes one domain
 TEST(domain_decomposition, homogenous_population)
 {
-    distributed_context context;
+    execution_context context;
 
     {   // Test on a node with 1 cpu core and no gpus.
         // We assume that all cells will be put into cell groups of size 1.
@@ -108,7 +108,7 @@ TEST(domain_decomposition, homogenous_population)
 
 TEST(domain_decomposition, heterogenous_population)
 {
-    distributed_context context;
+    execution_context context;
 
     {   // Test on a node with 1 cpu core and no gpus.
         // We assume that all cells will be put into cell groups of size 1.
@@ -193,7 +193,7 @@ TEST(domain_decomposition, hints) {
     // Check that we can provide group size hint and gpu/cpu preference
     // by cell kind.
 
-    distributed_context context;
+    execution_context context;
 
     partition_hint_map hints;
     hints[cell_kind::cable1d_neuron].cpu_group_size = 3;
diff --git a/test/unit/test_fvm_lowered.cpp b/test/unit/test_fvm_lowered.cpp
index f81069f6..b93d8195 100644
--- a/test/unit/test_fvm_lowered.cpp
+++ b/test/unit/test_fvm_lowered.cpp
@@ -328,7 +328,7 @@ TEST(fvm_lowered, derived_mechs) {
 
         float times[] = {10.f, 20.f};
 
-        distributed_context context;
+        execution_context context;
         auto decomp = partition_load_balance(rec, proc_allocation{1, 0}, &context);
         simulation sim(rec, decomp, &context);
         sim.add_sampler(all_probes, explicit_schedule(times), sampler);
diff --git a/test/unit/test_lif_cell_group.cpp b/test/unit/test_lif_cell_group.cpp
index 450226b0..80b1909b 100644
--- a/test/unit/test_lif_cell_group.cpp
+++ b/test/unit/test_lif_cell_group.cpp
@@ -155,7 +155,7 @@ TEST(lif_cell_group, spikes) {
     // make two lif cells
     path_recipe recipe(2, 1000, 0.1);
 
-    distributed_context context;
+    execution_context context;
     proc_allocation nd = local_allocation();
 
     auto decomp = partition_load_balance(recipe, nd, &context);
@@ -194,9 +194,8 @@ TEST(lif_cell_group, ring)
     // Total simulation time.
     time_type simulation_time = 100;
 
-    distributed_context context;
+    execution_context context;
     proc_allocation nd = local_allocation();
-
     auto recipe = ring_recipe(num_lif_cells, weight, delay);
     auto decomp = partition_load_balance(recipe, nd, &context);
 
diff --git a/test/unit/test_range.cpp b/test/unit/test_range.cpp
index 12049ba1..c8c0a82e 100644
--- a/test/unit/test_range.cpp
+++ b/test/unit/test_range.cpp
@@ -9,10 +9,6 @@
 #include <type_traits>
 #include <unordered_map>
 
-#ifdef ARB_HAVE_TBB
-#include <tbb/tbb_stddef.h>
-#endif
-
 #include <util/counter.hpp>
 #include <util/meta.hpp>
 #include <util/range.hpp>
@@ -661,50 +657,3 @@ TEST(range, reverse) {
 
     EXPECT_EQ("olleh"s, rev);
 }
-
-
-#ifdef ARB_HAVE_TBB
-
-TEST(range, tbb_split) {
-    constexpr std::size_t N = 20;
-    int xs[N];
-
-    for (unsigned i = 0; i<N; ++i) {
-        xs[i] = i;
-    }
-
-    auto s = util::make_range(&xs[0], &xs[0]+N);
-
-    while (s.size()>1) {
-        auto ssize = s.size();
-        auto r = decltype(s){s, tbb::split{}};
-        EXPECT_GT(r.size(), 0u);
-        EXPECT_GT(s.size(), 0u);
-        EXPECT_EQ(ssize, r.size()+s.size());
-        EXPECT_EQ(s.end(), r.begin());
-
-        EXPECT_TRUE(r.size()>1 || !r.is_divisible());
-        EXPECT_TRUE(s.size()>1 || !s.is_divisible());
-    }
-
-    for (unsigned i = 1; i<N-1; ++i) {
-        s = util::make_range(&xs[0], &xs[0]+N);
-        // expect exact splitting by proportion in this instance
-
-        auto r = decltype(s){s, tbb::proportional_split{i, N-i}};
-        EXPECT_EQ(&xs[0], s.left);
-        EXPECT_EQ(&xs[0]+i, s.right);
-        EXPECT_EQ(&xs[0]+i, r.left);
-        EXPECT_EQ(&xs[0]+N, r.right);
-    }
-}
-
-TEST(range, tbb_no_split) {
-    std::istringstream sin("10 9 8 7 6");
-    auto s = util::make_range(std::istream_iterator<int>(sin), std::istream_iterator<int>());
-
-    EXPECT_FALSE(decltype(s)::is_splittable_in_proportion());
-    EXPECT_FALSE(s.is_divisible());
-}
-
-#endif
diff --git a/test/unit/test_spike_store.cpp b/test/unit/test_spike_store.cpp
index 9a526cc2..11bd1123 100644
--- a/test/unit/test_spike_store.cpp
+++ b/test/unit/test_spike_store.cpp
@@ -1,6 +1,7 @@
 #include "../gtest.h"
 
 #include <arbor/spike.hpp>
+#include <arbor/execution_context.hpp>
 
 #include "thread_private_spike_store.hpp"
 
@@ -10,7 +11,8 @@ TEST(spike_store, insert)
 {
     using store_type = arb::thread_private_spike_store;
 
-    store_type store;
+    arb::execution_context context;
+    store_type store(context.thread_pool);
 
     // insert 3 spike events and check that they were inserted correctly
     store.insert({
@@ -54,7 +56,8 @@ TEST(spike_store, clear)
 {
     using store_type = arb::thread_private_spike_store;
 
-    store_type store;
+    arb::execution_context context;
+    store_type store(context.thread_pool);
 
     // insert 3 spike events
     store.insert({
@@ -69,7 +72,8 @@ TEST(spike_store, gather)
 {
     using store_type = arb::thread_private_spike_store;
 
-    store_type store;
+    arb::execution_context context;
+    store_type store(context.thread_pool);
 
     std::vector<spike> spikes =
         { {{0,0}, 0.0f}, {{1,2}, 0.5f}, {{2,4}, 1.0f} };
diff --git a/test/unit/test_thread.cpp b/test/unit/test_thread.cpp
index 2e19b1b8..298e4c51 100644
--- a/test/unit/test_thread.cpp
+++ b/test/unit/test_thread.cpp
@@ -1,13 +1,13 @@
 #include "../gtest.h"
 #include "common.hpp"
 #include <arbor/threadinfo.hpp>
+#include <arbor/execution_context.hpp>
 
 #include <iostream>
 #include <ostream>
 // (Pending abstraction of threading interface)
 #include <arbor/version.hpp>
 
-#if defined(ARB_CTHREAD_ENABLED)
 #include "threading/cthread.hpp"
 
 using namespace arb::threading::impl;
@@ -43,26 +43,28 @@ struct ftor_wait {
     ftor_wait() {}
 
     void operator()() const {
-        auto duration = std::chrono::microseconds(500);
+        auto duration = std::chrono::microseconds(100);
         std::this_thread::sleep_for(duration);
     }
 };
 
 struct ftor_parallel_wait {
 
-    ftor_parallel_wait() {}
+    ftor_parallel_wait(task_system* ts): ts{ts} {}
 
     void operator()() const {
         auto nthreads = num_threads();
-        auto duration = std::chrono::microseconds(500);
-        parallel_for::apply(0, nthreads, [=](int i){ std::this_thread::sleep_for(duration);});
+        auto duration = std::chrono::microseconds(100);
+        parallel_for::apply(0, nthreads, ts, [=](int i){ std::this_thread::sleep_for(duration);});
     }
+
+    task_system* ts;
 };
 
 }
 
 TEST(task_system, test_copy) {
-    task_system &ts = task_system::get_global_task_system();
+    task_system ts(num_threads());
 
     ftor f;
     ts.async(f);
@@ -74,10 +76,10 @@ TEST(task_system, test_copy) {
 }
 
 TEST(task_system, test_move) {
-    task_system &s = task_system::get_global_task_system();
+    task_system ts(num_threads());
 
     ftor f;
-    s.async(std::move(f));
+    ts.async(std::move(f));
 
     // Move into new ftor and move ftor into a task (std::function<void()>)
     EXPECT_LE(nmove, 2);
@@ -110,7 +112,8 @@ TEST(notification_queue, test_move) {
 }
 
 TEST(task_group, test_copy) {
-    task_group g;
+    task_system ts(num_threads());
+    task_group g(&ts);
 
     ftor f;
     g.run(f);
@@ -123,7 +126,8 @@ TEST(task_group, test_copy) {
 }
 
 TEST(task_group, test_move) {
-    task_group g;
+    task_system ts(num_threads());
+    task_group g(&ts);
 
     ftor f;
     g.run(std::move(f));
@@ -137,7 +141,9 @@ TEST(task_group, test_move) {
 
 TEST(task_group, individual_tasks) {
     // Simple check for deadlock
-    task_group g;
+    task_system ts(num_threads());
+    task_group g(&ts);
+
     auto nthreads = num_threads();
 
     ftor_wait f;
@@ -149,10 +155,11 @@ TEST(task_group, individual_tasks) {
 
 TEST(task_group, parallel_for_sleep) {
     // Simple check for deadlock for nested parallelism
-    task_group g;
     auto nthreads = num_threads();
+    task_system ts(nthreads);
+    task_group g(&ts);
 
-    ftor_parallel_wait f;
+    ftor_parallel_wait f(&ts);
     for (int i = 0; i < nthreads; i++) {
         g.run(f);
     }
@@ -160,10 +167,10 @@ TEST(task_group, parallel_for_sleep) {
 }
 
 TEST(task_group, parallel_for) {
-
+    task_system ts(num_threads());
     for (int n = 0; n < 10000; n=!n?1:2*n) {
         std::vector<int> v(n, -1);
-        parallel_for::apply(0, n, [&](int i) {v[i] = i;});
+        parallel_for::apply(0, n, &ts, [&](int i) {v[i] = i;});
         for (int i = 0; i< n; i++) {
             EXPECT_EQ(i, v[i]);
         }
@@ -171,13 +178,13 @@ TEST(task_group, parallel_for) {
 }
 
 TEST(task_group, nested_parallel_for) {
-
+    task_system ts(num_threads());
     for (int m = 1; m < 512; m*=2) {
         for (int n = 0; n < 1000; n=!n?1:2*n) {
             std::vector<std::vector<int>> v(n, std::vector<int>(m, -1));
-            parallel_for::apply(0, n, [&](int i) {
+            parallel_for::apply(0, n, &ts, [&](int i) {
                 auto &w = v[i];
-                parallel_for::apply(0, m, [&](int j) { w[j] = i + j; });
+                parallel_for::apply(0, m, &ts, [&](int j) { w[j] = i + j; });
             });
             for (int i = 0; i < n; i++) {
                 for (int j = 0; j < m; j++) {
@@ -189,8 +196,9 @@ TEST(task_group, nested_parallel_for) {
 }
 
 TEST(enumerable_thread_specific, test) {
-    enumerable_thread_specific<int> buffers(0);
-    task_group g;
+    task_system_handle ts = task_system_handle(new task_system(num_threads()));
+    enumerable_thread_specific<int> buffers(ts);
+    task_group g(ts.get());
 
     for (int i = 0; i < 100000; i++) {
         g.run([&](){
@@ -207,5 +215,3 @@ TEST(enumerable_thread_specific, test) {
 
     EXPECT_EQ(100000, sum);
 }
-
-#endif
diff --git a/test/validation/validate_ball_and_stick.cpp b/test/validation/validate_ball_and_stick.cpp
index afc769fe..95257ab7 100644
--- a/test/validation/validate_ball_and_stick.cpp
+++ b/test/validation/validate_ball_and_stick.cpp
@@ -64,7 +64,7 @@ void run_ncomp_convergence_test(
     convergence_test_runner<int> runner("ncomp", plabels, meta);
     runner.load_reference_data(ref_data_path);
 
-    distributed_context context;
+    execution_context context;
     proc_allocation nd;
     nd.num_gpus = (backend==backend_kind::gpu);
 
diff --git a/test/validation/validate_kinetic.cpp b/test/validation/validate_kinetic.cpp
index 3bd9f796..c2994eb3 100644
--- a/test/validation/validate_kinetic.cpp
+++ b/test/validation/validate_kinetic.cpp
@@ -43,7 +43,7 @@ void run_kinetic_dt(
     convergence_test_runner<float> runner("dt", plabels, meta);
     runner.load_reference_data(ref_file);
 
-    distributed_context context;
+    execution_context context;
     proc_allocation nd;
     nd.num_gpus = (backend==backend_kind::gpu);
 
diff --git a/test/validation/validate_soma.cpp b/test/validation/validate_soma.cpp
index 6c1b17bd..a3e6460f 100644
--- a/test/validation/validate_soma.cpp
+++ b/test/validation/validate_soma.cpp
@@ -29,7 +29,7 @@ void validate_soma(backend_kind backend) {
     rec.add_probe(0, 0, cell_probe_address{{0, 0.5}, cell_probe_address::membrane_voltage});
     probe_label plabels[1] = {{"soma.mid", {0u, 0u}}};
 
-    distributed_context context;
+    execution_context context;
     proc_allocation nd;
     nd.num_gpus = (backend==backend_kind::gpu);
 
diff --git a/test/validation/validate_synapses.cpp b/test/validation/validate_synapses.cpp
index 354dab94..57c926c2 100644
--- a/test/validation/validate_synapses.cpp
+++ b/test/validation/validate_synapses.cpp
@@ -61,7 +61,7 @@ void run_synapse_test(
     convergence_test_runner<int> runner("ncomp", plabels, meta);
     runner.load_reference_data(ref_data_path);
 
-    distributed_context context;
+    execution_context context;
     proc_allocation nd;
     nd.num_gpus = (backend==backend_kind::gpu);
 
-- 
GitLab