From 712070f19f29242483c13be22ba1900bbb18adea Mon Sep 17 00:00:00 2001
From: Ben Cumming <bcumming@cscs.ch>
Date: Wed, 5 Dec 2018 16:54:33 +0100
Subject: [PATCH] Refactor hardware detection to sup (#654)

Refactoring that moves the logic for determining available concurrency and available GPUs from the core Arbor library to the sup library. This also constitutes work towards providing functionality for allocating GPUs to particular ranks when multiple GPUs are visible per rank.

* Move core/thread estimation code to sup library.
* Change default resource behaviour to use one thread and no GPU.
* Provide an interface in the sup library for acquiring a default GPU and for coordinating an allocation of GPUs across multiple MPI ranks.
---
 .ycm_extra_conf.py                            |  2 +-
 arbor/CMakeLists.txt                          |  4 -
 arbor/hardware/node_info.cpp                  | 53 ------------
 arbor/hardware/node_info.hpp                  | 14 ----
 arbor/local_alloc.cpp                         | 16 ----
 arbor/threading/thread_info.cpp               | 77 ------------------
 arbor/threading/threading.cpp                 |  4 +-
 example/bench/bench.cpp                       | 22 +++--
 example/brunel/brunel_miniapp.cpp             | 24 ++++--
 example/miniapp/miniapp.cpp                   | 22 +++--
 example/ring/ring.cpp                         | 23 ++++--
 include/arbor/context.hpp                     | 27 +------
 sup/CMakeLists.txt                            | 20 ++++-
 {arbor/hardware => sup}/affinity.cpp          | 12 +--
 sup/concurrency.cpp                           | 81 +++++++++++++++++++
 sup/default_gpu.cpp                           | 32 ++++++++
 .../hardware => sup/include/sup}/affinity.hpp |  6 +-
 .../include/sup/concurrency.hpp               | 17 ++--
 sup/include/sup/gpu.hpp                       | 13 +++
 sup/private_gpu.cpp                           | 14 ++++
 20 files changed, 243 insertions(+), 240 deletions(-)
 delete mode 100644 arbor/hardware/node_info.cpp
 delete mode 100644 arbor/hardware/node_info.hpp
 delete mode 100644 arbor/local_alloc.cpp
 delete mode 100644 arbor/threading/thread_info.cpp
 rename {arbor/hardware => sup}/affinity.cpp (83%)
 create mode 100644 sup/concurrency.cpp
 create mode 100644 sup/default_gpu.cpp
 rename {arbor/hardware => sup/include/sup}/affinity.hpp (87%)
 rename arbor/threading/thread_info.hpp => sup/include/sup/concurrency.hpp (62%)
 create mode 100644 sup/include/sup/gpu.hpp
 create mode 100644 sup/private_gpu.cpp

diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py
index 0e73398a..7f04ca2e 100644
--- a/.ycm_extra_conf.py
+++ b/.ycm_extra_conf.py
@@ -55,7 +55,7 @@ flags = [
     '-I',
     'build/include',
     '-I',
-    'aux/include',
+    'sup/include',
 ]
 
 # Set this to the absolute path to the folder (NOT the file!) containing the
diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt
index b8d199bc..33d410b1 100644
--- a/arbor/CMakeLists.txt
+++ b/arbor/CMakeLists.txt
@@ -13,13 +13,10 @@ set(arbor_sources
     common_types_io.cpp
     execution_context.cpp
     gpu_context.cpp
-    local_alloc.cpp
     event_binner.cpp
     fvm_layout.cpp
     fvm_lowered_cell_impl.cpp
-    hardware/affinity.cpp
     hardware/memory.cpp
-    hardware/node_info.cpp
     hardware/power.cpp
     io/locked_ostream.cpp
     io/serialize_hex.cpp
@@ -43,7 +40,6 @@ set(arbor_sources
     spike_source_cell_group.cpp
     swcio.cpp
     threading/threading.cpp
-    threading/thread_info.cpp
     thread_private_spike_store.cpp
     tree.cpp
     util/hostname.cpp
diff --git a/arbor/hardware/node_info.cpp b/arbor/hardware/node_info.cpp
deleted file mode 100644
index 095c26b0..00000000
--- a/arbor/hardware/node_info.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-#include <thread>
-
-#ifdef ARB_HAVE_GPU
-#include <cuda_runtime.h>
-#endif
-
-// TODO: C++17 use __has_include(<unistd.h>)
-#if defined(__unix__) || defined(__APPLE__) && defined(__MACH__)
-#include <unistd.h>
-#endif
-
-#include "affinity.hpp"
-#include "node_info.hpp"
-
-namespace arb {
-namespace hw {
-
-
-unsigned node_gpus() {
-#ifdef ARB_HAVE_GPU
-    int n;
-    if (cudaGetDeviceCount(&n)==cudaSuccess) {
-        return (unsigned)(n);
-    }
-#endif
-
-    return 0;
-}
-
-unsigned node_processors() {
-    // Attempt to get count first from affinity information if available.
-    unsigned n = get_affinity().size();
-
-    // If no luck, try sysconf.
-#ifdef _SC_NPROCESSORS_ONLN
-    if (!n) {
-        long r = sysconf(_SC_NPROCESSORS_ONLN);
-        if (r>0) {
-            n = (unsigned)r;
-        }
-    }
-#endif
-
-    // If still zero, try the hint from the library.
-    if (!n) {
-        n = std::thread::hardware_concurrency();
-    }
-
-    return n;
-}
-
-} // namespace util
-} // namespace arb
diff --git a/arbor/hardware/node_info.hpp b/arbor/hardware/node_info.hpp
deleted file mode 100644
index 0452bdd4..00000000
--- a/arbor/hardware/node_info.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-namespace arb {
-namespace hw {
-
-// Number of GPUs detected on the node.
-unsigned node_gpus();
-
-// Number of visible logical processors on the node.
-// 0 => unable to determine.
-unsigned node_processors();
-
-} // namespace hw
-} // namespace arb
diff --git a/arbor/local_alloc.cpp b/arbor/local_alloc.cpp
deleted file mode 100644
index 3320e22e..00000000
--- a/arbor/local_alloc.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-#include <arbor/context.hpp>
-
-#include "hardware/node_info.hpp"
-#include "threading/thread_info.hpp"
-#include "threading/threading.hpp"
-
-namespace arb {
-
-local_resources get_local_resources() {
-    auto avail_threads = threading::num_threads_init();
-    auto avail_gpus = arb::hw::node_gpus();
-
-    return local_resources(avail_threads, avail_gpus);
-}
-
-} // namespace arb
diff --git a/arbor/threading/thread_info.cpp b/arbor/threading/thread_info.cpp
deleted file mode 100644
index dce15dbb..00000000
--- a/arbor/threading/thread_info.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <cstdlib>
-#include <exception>
-#include <regex>
-#include <string>
-
-#include <arbor/arbexcept.hpp>
-#include <arbor/util/optional.hpp>
-#include <hardware/node_info.hpp>
-
-#include "thread_info.hpp"
-#include "util/strprintf.hpp"
-
-namespace arb {
-namespace threading {
-
-// Test environment variables for user-specified count of threads.
-//
-// ARB_NUM_THREADS is used if set, otherwise OMP_NUM_THREADS is used.
-//
-// If neither variable is set, returns no value.
-//
-// Valid values for the environment variable are:
-//  0 : Arbor is responsible for picking the number of threads.
-//  >0: The number of threads to use.
-//
-// Throws std::runtime_error:
-//  ARB_NUM_THREADS or OMP_NUM_THREADS is set with invalid value.
-util::optional<size_t> get_env_num_threads() {
-    const char* str;
-
-    // select variable to use:
-    //   If ARB_NUM_THREADS_VAR is set, use $ARB_NUM_THREADS_VAR
-    //   else if ARB_NUM_THREAD set, use it
-    //   else if OMP_NUM_THREADS set, use it
-    if (auto nthreads_var_name = std::getenv("ARB_NUM_THREADS_VAR")) {
-        str = std::getenv(nthreads_var_name);
-    }
-    else if (! (str = std::getenv("ARB_NUM_THREADS"))) {
-        str = std::getenv("OMP_NUM_THREADS");
-    }
-
-    // If the selected var is unset set the number of threads to
-    // the hint given by the standard library
-    if (!str) {
-        return util::nullopt;
-    }
-
-    errno = 0;
-    auto nthreads = std::strtoul(str, nullptr, 10);
-
-    // check that the environment variable string describes a non-negative integer
-    if (errno==ERANGE ||
-        !std::regex_match(str, std::regex("\\s*\\d*[0-9]\\d*\\s*")))
-    {
-        throw arbor_exception(util::pprintf(
-            "requested number of threads \"{}\" is not a valid value", str));
-    }
-
-    return nthreads;
-}
-
-std::size_t num_threads_init() {
-    std::size_t n = 0;
-
-    if (auto env_threads = get_env_num_threads()) {
-        n = env_threads.value();
-    }
-
-    if (!n) {
-        n = hw::node_processors();
-    }
-
-    return n? n: 1;
-}
-
-} // namespace threading
-} // namespace arb
diff --git a/arbor/threading/threading.cpp b/arbor/threading/threading.cpp
index ec116fad..ae251d6e 100644
--- a/arbor/threading/threading.cpp
+++ b/arbor/threading/threading.cpp
@@ -1,7 +1,6 @@
 #include <atomic>
 
 #include "threading.hpp"
-#include "thread_info.hpp"
 
 using namespace arb::threading::impl;
 using namespace arb::threading;
@@ -82,7 +81,8 @@ void task_system::try_run_task() {
     }
 }
 
-task_system::task_system(): task_system(num_threads_init()) {}
+// Default construct with one thread.
+task_system::task_system(): task_system(1) {}
 
 task_system::task_system(int nthreads): count_(nthreads), q_(nthreads) {
     if (nthreads <= 0)
diff --git a/example/bench/bench.cpp b/example/bench/bench.cpp
index 1bae7acf..4ab37a5e 100644
--- a/example/bench/bench.cpp
+++ b/example/bench/bench.cpp
@@ -18,6 +18,8 @@
 #include <arbor/version.hpp>
 
 
+#include <sup/concurrency.hpp>
+#include <sup/gpu.hpp>
 #include <sup/ioutil.hpp>
 #include <sup/json_meter.hpp>
 #ifdef ARB_MPI_ENABLED
@@ -33,16 +35,22 @@ int main(int argc, char** argv) {
     bool is_root = true;
 
     try {
+        arb::proc_allocation resources;
+        if (auto nt = sup::get_env_num_threads()) {
+            resources.num_threads = nt;
+        }
+        else {
+            resources.num_threads = sup::thread_concurrency();
+        }
+
 #ifdef ARB_MPI_ENABLED
         sup::with_mpi guard(argc, argv, false);
-        auto context = arb::make_context(arb::proc_allocation(), MPI_COMM_WORLD);
-        {
-            int rank = 0;
-            MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-            is_root = rank==0;
-        }
+        resources.gpu_id = sup::find_private_gpu(MPI_COMM_WORLD);
+        auto context = arb::make_context(resources, MPI_COMM_WORLD);
+        is_root = arb::rank(context) == 0;
 #else
-        auto context = arb::make_context();
+        resources.gpu_id = sup::default_gpu();
+        auto context = arb::make_context(resources);
 #endif
 #ifdef ARB_PROFILE_ENABLED
         profile::profiler_initialize(context);
diff --git a/example/brunel/brunel_miniapp.cpp b/example/brunel/brunel_miniapp.cpp
index 3bb10041..b988c6df 100644
--- a/example/brunel/brunel_miniapp.cpp
+++ b/example/brunel/brunel_miniapp.cpp
@@ -18,6 +18,8 @@
 #include <arbor/simulation.hpp>
 #include <arbor/version.hpp>
 
+#include <sup/concurrency.hpp>
+#include <sup/gpu.hpp>
 #include <sup/ioutil.hpp>
 #include <sup/json_meter.hpp>
 #include <sup/path.hpp>
@@ -186,15 +188,23 @@ int main(int argc, char** argv) {
     int rank = 0;
 
     try {
+        arb::proc_allocation resources;
+        if (auto nt = sup::get_env_num_threads()) {
+            resources.num_threads = nt;
+        }
+        else {
+            resources.num_threads = sup::thread_concurrency();
+        }
+
 #ifdef ARB_MPI_ENABLED
         sup::with_mpi guard(argc, argv, false);
-        auto context = arb::make_context(arb::proc_allocation(), MPI_COMM_WORLD);
-        {
-            MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-            root = rank==0;
-        }
+        resources.gpu_id = sup::find_private_gpu(MPI_COMM_WORLD);
+        auto context = arb::make_context(resources, MPI_COMM_WORLD);
+        rank = arb::rank(context);
+        root = rank==0;
 #else
-        auto context = arb::make_context();
+        resources.gpu_id = sup::default_gpu();
+        auto context = arb::make_context(resources);
 #endif
 
         std::cout << sup::mask_stream(root);
@@ -257,7 +267,7 @@ int main(int argc, char** argv) {
                 spike_out = sup::open_or_throw(p, ios_base::out, !options.over_write);
                 sim.set_local_spike_callback(sup::spike_emitter(spike_out));
             }
-            else if (rank==0) {
+            else if (root) {
                 spike_out = sup::open_or_throw(p, ios_base::out, !options.over_write);
                 sim.set_global_spike_callback(sup::spike_emitter(spike_out));
             }
diff --git a/example/miniapp/miniapp.cpp b/example/miniapp/miniapp.cpp
index 3e34dd1c..a85a408c 100644
--- a/example/miniapp/miniapp.cpp
+++ b/example/miniapp/miniapp.cpp
@@ -17,6 +17,8 @@
 #include <arbor/version.hpp>
 
 
+#include <sup/concurrency.hpp>
+#include <sup/gpu.hpp>
 #include <sup/ioutil.hpp>
 #include <sup/json_meter.hpp>
 #include <sup/path.hpp>
@@ -46,16 +48,24 @@ int main(int argc, char** argv) {
     int rank = 0;
 
     try {
+        arb::proc_allocation resources;
+        if (auto nt = sup::get_env_num_threads()) {
+            resources.num_threads = nt;
+        }
+        else {
+            resources.num_threads = sup::thread_concurrency();
+        }
+
 #ifdef ARB_MPI_ENABLED
         sup::with_mpi guard(argc, argv, false);
-        auto context = arb::make_context(arb::proc_allocation(), MPI_COMM_WORLD);
-        {
-            MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-            root = rank==0;
-        }
+        resources.gpu_id = sup::find_private_gpu(MPI_COMM_WORLD);
+        auto context = arb::make_context(resources, MPI_COMM_WORLD);
+        root = arb::rank(context) == 0;
 #else
-        auto context = arb::make_context();
+        resources.gpu_id = sup::default_gpu();
+        auto context = arb::make_context(resources);
 #endif
+
 #ifdef ARB_PROFILE_ENABLED
         profile::profiler_initialize(context);
 #endif
diff --git a/example/ring/ring.cpp b/example/ring/ring.cpp
index f92950b9..a82abbc4 100644
--- a/example/ring/ring.cpp
+++ b/example/ring/ring.cpp
@@ -21,6 +21,8 @@
 #include <arbor/recipe.hpp>
 #include <arbor/version.hpp>
 
+#include <sup/concurrency.hpp>
+#include <sup/gpu.hpp>
 #include <sup/ioutil.hpp>
 #include <sup/json_meter.hpp>
 
@@ -154,21 +156,26 @@ struct cell_stats {
     }
 };
 
-
 int main(int argc, char** argv) {
     try {
         bool root = true;
 
+        arb::proc_allocation resources;
+        if (auto nt = sup::get_env_num_threads()) {
+            resources.num_threads = nt;
+        }
+        else {
+            resources.num_threads = sup::thread_concurrency();
+        }
+
 #ifdef ARB_MPI_ENABLED
         sup::with_mpi guard(argc, argv, false);
-        auto context = arb::make_context(arb::proc_allocation(), MPI_COMM_WORLD);
-        {
-            int rank;
-            MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-            root = rank==0;
-        }
+        resources.gpu_id = sup::find_private_gpu(MPI_COMM_WORLD);
+        auto context = arb::make_context(resources, MPI_COMM_WORLD);
+        root = arb::rank(context) == 0;
 #else
-        auto context = arb::make_context();
+        resources.gpu_id = sup::default_gpu();
+        auto context = arb::make_context(resources);
 #endif
 
 #ifdef ARB_PROFILE_ENABLED
diff --git a/include/arbor/context.hpp b/include/arbor/context.hpp
index bfaa43e4..d28b8659 100644
--- a/include/arbor/context.hpp
+++ b/include/arbor/context.hpp
@@ -4,17 +4,6 @@
 
 namespace arb {
 
-/// Summary of all available local computation resource.
-struct local_resources {
-    const unsigned num_threads;
-    const unsigned num_gpus;
-
-    local_resources(unsigned threads, unsigned gpus):
-        num_threads(threads),
-        num_gpus(gpus)
-    {}
-};
-
 /// Requested dry-run parameters
 struct dry_run_info {
     unsigned num_ranks;
@@ -24,9 +13,6 @@ struct dry_run_info {
             num_cells_per_rank(cells_per_rank) {}
 };
 
-/// Determine available local domain resources.
-local_resources get_local_resources();
-
 /// A subset of local computation resources to use in a computation.
 struct proc_allocation {
     unsigned num_threads;
@@ -37,17 +23,8 @@ struct proc_allocation {
     // see CUDA documenation for cudaSetDevice and cudaDeviceGetAttribute 
     int gpu_id;
 
-    // By default a proc_allocation will take all available threads and the
-    // GPU with id 0, if available.
-    proc_allocation() {
-        auto avail = get_local_resources();
-
-        // By default take all available threads.
-        num_threads = avail.num_threads;
-
-        // Take the first GPU, if available.
-        gpu_id = avail.num_gpus>0? 0: -1;
-    }
+    // By default use one thread and no GPU.
+    proc_allocation(): proc_allocation(1, -1) {}
 
     proc_allocation(unsigned threads, int gpu):
         num_threads(threads),
diff --git a/sup/CMakeLists.txt b/sup/CMakeLists.txt
index cdd4ecb0..83887be6 100644
--- a/sup/CMakeLists.txt
+++ b/sup/CMakeLists.txt
@@ -1,14 +1,32 @@
 set(sup-sources
-
+    affinity.cpp
+    concurrency.cpp
     glob.cpp
+    default_gpu.cpp
     ioutil.cpp
     json_meter.cpp
     path.cpp
     spike_emitter.cpp
 )
 
+if(ARB_WITH_MPI)
+    list(APPEND sup-sources
+        private_gpu.cpp)
+endif()
+
 add_library(arbor-sup ${sup-sources})
+
 target_compile_options(arbor-sup PRIVATE ${ARB_CXXOPT_ARCH})
 target_link_libraries(arbor-sup PUBLIC ext-json arbor)
 target_include_directories(arbor-sup PUBLIC include)
+
+if(ARB_WITH_MPI)
+    target_compile_definitions(arbor-sup PRIVATE ARB_HAVE_MPI)
+endif()
+if(ARB_WITH_GPU)
+    target_include_directories(arbor-sup PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    target_compile_definitions(arbor-sup PRIVATE ARB_HAVE_GPU)
+endif()
+
 set_target_properties(arbor-sup PROPERTIES OUTPUT_NAME arborsup)
+
diff --git a/arbor/hardware/affinity.cpp b/sup/affinity.cpp
similarity index 83%
rename from arbor/hardware/affinity.cpp
rename to sup/affinity.cpp
index 73e2762f..7454b30d 100644
--- a/arbor/hardware/affinity.cpp
+++ b/sup/affinity.cpp
@@ -12,8 +12,7 @@ extern "C" {
 #include <sched.h>
 }
 
-namespace arb {
-namespace hw {
+namespace sup {
 
 std::vector<int> get_affinity() {
     std::vector<int> cores;
@@ -33,20 +32,17 @@ std::vector<int> get_affinity() {
     return cores;
 }
 
-} // namespace hw
-} // namespace arb
+} // namespace sup
 
 #else // def __linux__
 
 // No support for non-linux systems.
-namespace arb {
-namespace hw {
+namespace sup {
 
 std::vector<int> get_affinity() {
     return {};
 }
 
-} // namespace hw
-} // namespace arb
+} // namespace sup
 
 #endif // def __linux__
diff --git a/sup/concurrency.cpp b/sup/concurrency.cpp
new file mode 100644
index 00000000..6635ae37
--- /dev/null
+++ b/sup/concurrency.cpp
@@ -0,0 +1,81 @@
+#include <cstdlib>
+#include <regex>
+#include <string>
+#include <thread>
+
+#include <arbor/arbexcept.hpp>
+
+#include <sup/affinity.hpp>
+#include <sup/concurrency.hpp>
+
+// TODO: C++17 use __has_include(<unistd.h>)
+#if defined(__unix__) || defined(__APPLE__) && defined(__MACH__)
+#include <unistd.h>
+#endif
+
+namespace sup {
+
+// Read the user-requested thread count from the environment; returns 0 if no variable is set.
+unsigned get_env_num_threads() {
+    const char* str;
+
+    // Select the variable to use:
+    //   if ARB_NUM_THREADS_VAR is set, use the variable that it names
+    //   else if ARB_NUM_THREADS is set, use it
+    //   else use OMP_NUM_THREADS (str is null if that is unset too)
+    if (auto nthreads_var_name = std::getenv("ARB_NUM_THREADS_VAR")) {
+        str = std::getenv(nthreads_var_name); // null if the named variable is unset
+    }
+    else if (! (str = std::getenv("ARB_NUM_THREADS"))) {
+        str = std::getenv("OMP_NUM_THREADS");
+    }
+
+    // The selected environment variable is not set, so return 0.
+    if (!str) {
+        return 0;
+    }
+
+    errno = 0; // cleared so that ERANGE below can only come from strtoul
+    auto nthreads = std::strtoul(str, nullptr, 10);
+
+    // Reject out-of-range values and any string that is not one or more decimal digits with optional surrounding whitespace.
+    if (errno==ERANGE ||
+        !std::regex_match(str, std::regex("\\s*\\d*[0-9]\\d*\\s*")))
+    {
+        errno = 0;
+        throw arb::arbor_exception(
+            std::string("Requested number of threads \"") + str + "\" is not a valid value");
+    }
+    errno = 0; // NOTE(review): errno/ERANGE are used without a direct <cerrno> include in this file
+
+    return nthreads; // NOTE(review): unsigned long from strtoul is narrowed to unsigned here
+}
+
+// Estimate how many threads can run concurrently, trying in order: the affinity
+// list, sysconf, and std::thread::hardware_concurrency(). Returns at least 1.
+unsigned thread_concurrency() {
+    // First choice: the number of entries returned by get_affinity().
+    unsigned n = get_affinity().size();
+
+    // If that gave zero, ask sysconf (only compiled where _SC_NPROCESSORS_ONLN is defined).
+#ifdef _SC_NPROCESSORS_ONLN
+    if (!n) {
+        long r = sysconf(_SC_NPROCESSORS_ONLN);
+        if (r>0) {
+            n = (unsigned)r;
+        }
+    }
+#endif
+
+    // If still zero, fall back to the standard library hint.
+    if (!n) {
+        n = std::thread::hardware_concurrency();
+    }
+
+    // If every source reported zero, use one thread.
+    n = n? n: 1;
+
+    return n;
+}
+
+} // namespace sup
diff --git a/sup/default_gpu.cpp b/sup/default_gpu.cpp
new file mode 100644
index 00000000..80f3f4b9
--- /dev/null
+++ b/sup/default_gpu.cpp
@@ -0,0 +1,32 @@
+#ifdef ARB_HAVE_GPU
+
+#include <cuda_runtime.h>
+
+namespace sup {
+
+// Return the id of the GPU to use by default: 0 if the CUDA runtime reports
+// one or more devices, otherwise -1, which indicates that no GPU is available.
+int default_gpu() {
+    int n;
+    if (cudaGetDeviceCount(&n)==cudaSuccess) {
+        // if 1 or more GPUs, take the first one.
+        // else return -1 -> no gpu.
+        return n? 0: -1;
+    }
+    return -1; // the device count query failed: report no gpu.
+}
+
+} // namespace sup
+
+#else // ifdef ARB_HAVE_GPU
+
+namespace sup {
+
+int default_gpu() {
+    return -1; // ARB_HAVE_GPU is not defined: always report that no GPU is available.
+}
+
+} // namespace sup
+
+#endif // ifdef ARB_HAVE_GPU
+
diff --git a/arbor/hardware/affinity.hpp b/sup/include/sup/affinity.hpp
similarity index 87%
rename from arbor/hardware/affinity.hpp
rename to sup/include/sup/affinity.hpp
index db6c8f6b..49707c86 100644
--- a/arbor/hardware/affinity.hpp
+++ b/sup/include/sup/affinity.hpp
@@ -3,8 +3,7 @@
 #include <cstdint>
 #include <vector>
 
-namespace arb {
-namespace hw {
+namespace sup {
 
 // The list of logical processors for which the calling thread has affinity.
 // If calling from the main thread at application start up, before
@@ -17,5 +16,4 @@ namespace hw {
 // available cores.
 std::vector<int> get_affinity();
 
-} // namespace util
-} // namespace arb
+} // namespace sup
diff --git a/arbor/threading/thread_info.hpp b/sup/include/sup/concurrency.hpp
similarity index 62%
rename from arbor/threading/thread_info.hpp
rename to sup/include/sup/concurrency.hpp
index 42195fe7..c7a082f3 100644
--- a/arbor/threading/thread_info.hpp
+++ b/sup/include/sup/concurrency.hpp
@@ -2,25 +2,28 @@
 
 #include <arbor/util/optional.hpp>
 
-namespace arb {
-namespace threading {
+namespace sup {
 
 // Test environment variables for user-specified count of threads.
 // Potential environment variables are tested in this order:
 //   1. use the environment variable specified by ARB_NUM_THREADS_VAR
 //   2. use ARB_NUM_THREADS
 //   3. use OMP_NUM_THREADS
-//   4. If no variable is set, returns no value.
 //
 // Valid values for the environment variable are:
 //      0 : Arbor is responsible for picking the number of threads.
 //     >0 : The number of threads to use.
 //
+// Returns:
+//   >0 : the number of threads set by environment variable.
+//    0 : no variable is set, or the selected variable is set to 0.
+//
 // Throws std::runtime_error:
 //      Environment variable is set with invalid value.
-util::optional<size_t> get_env_num_threads();
+unsigned get_env_num_threads();
 
-size_t num_threads_init();
+// Take a best guess at the number of threads that can be run concurrently.
+// Will return at least 1.
+unsigned thread_concurrency();
 
-} // namespace threading
-} // namespace arb
+} // namespace sup
diff --git a/sup/include/sup/gpu.hpp b/sup/include/sup/gpu.hpp
new file mode 100644
index 00000000..21b95a89
--- /dev/null
+++ b/sup/include/sup/gpu.hpp
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <arbor/version.hpp>
+
+namespace sup {
+
+int default_gpu(); // returns 0 when a GPU is available, -1 when none is.
+
+template <typename Comm>
+int find_private_gpu(Comm comm); // declared only; private_gpu.cpp specializes it for MPI_Comm.
+
+} // namespace sup
+
diff --git a/sup/private_gpu.cpp b/sup/private_gpu.cpp
new file mode 100644
index 00000000..434d1d61
--- /dev/null
+++ b/sup/private_gpu.cpp
@@ -0,0 +1,14 @@
+#include <mpi.h>
+
+#include <sup/gpu.hpp>
+
+namespace sup {
+
+// Currently a placeholder: the communicator argument is not used, and the
+// result is whatever default_gpu() returns, as for serial simulations.
+template <>
+int find_private_gpu(MPI_Comm comm) {
+    return default_gpu();
+}
+
+} // namespace sup
-- 
GitLab