From 74411404ad20d68832ae3fb9f0654e6ed614aeec Mon Sep 17 00:00:00 2001
From: Ben Cumming <bcumming@cscs.ch>
Date: Mon, 14 Sep 2020 13:13:07 +0200
Subject: [PATCH] Update to C++17 (#1141)

* Update CMake CXX version.
* Make CUDA 10 a minimum requirement and remove CUDA 9 support code.
* Set up g++ Travis tests to use g++ 8.
* Explicitly set C++14 version for nvcc.
* Properly split CUDA compilation, including in unit tests.
* Remove unnecessary modcc `SOLVE` warning.
* Update pybind11 module to tag v2.5.0.
* Replace `util::size` and `util::data` with `std::` equivalents.
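
For reference, a minimal sketch of the C++17 replacements (illustrative only,
not part of the diff): `std::size`, `std::data` and `std::empty` live in
`<iterator>` and work for containers and C arrays alike.

    #include <iterator>

    int a[4] = {1, 2, 3, 4};
    static_assert(std::size(a) == 4);            // containers and C arrays alike
    int* first() { return std::data(a); }        // pointer to first element
    bool never_empty() { return std::empty(a); } // false: array extent is nonzero
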
---
 .travis.yml                                   |  38 ++---
 CMakeLists.txt                                |  57 ++-----
 arbor/algorithms.hpp                          |   2 +-
 arbor/backends/gpu/multi_event_stream.cpp     |  47 ++++++
 arbor/backends/gpu/multi_event_stream.cu      |  77 ++++-----
 arbor/backends/multicore/mechanism.cpp        |   4 +-
 .../backends/multicore/multi_event_stream.hpp |   4 +-
 arbor/fvm_lowered_cell.hpp                    |   5 +-
 arbor/fvm_lowered_cell_impl.hpp               |   2 -
 arbor/include/arbor/util/any_visitor.hpp      |   6 +-
 arbor/memory/array_view.hpp                   |   2 +-
 arbor/profile/profiler.cpp                    |   6 +-
 arbor/util/maputil.hpp                        |   2 +-
 arbor/util/mergeview.hpp                      |   4 +-
 arbor/util/meta.hpp                           |  92 ++---------
 arbor/util/ordered_forest.hpp                 |   3 +-
 arbor/util/partition.hpp                      |   8 +-
 arbor/util/piecewise.hpp                      |   2 +-
 arbor/util/rangeutil.hpp                      |   6 +-
 arbor/util/span.hpp                           |   2 +-
 arbor/util/strprintf.hpp                      |   3 +-
 arborenv/gpu_uuid.cpp                         | 156 +-----------------
 arborenv/include/arborenv/with_mpi.hpp        |   2 +-
 doc/install.rst                               |  55 +++---
 modcc/module.cpp                              |   9 +-
 python/CMakeLists.txt                         |   2 +-
 python/pybind11                               |   2 +-
 test/unit-distributed/test_mpi.cpp            |   4 +-
 test/unit/CMakeLists.txt                      |   9 +-
 test/unit/gpu_vector.hpp                      | 108 ++++++++++++
 test/unit/stats.hpp                           |   2 +-
 test/unit/test_algorithms.cpp                 |   8 +-
 test/unit/test_cycle.cpp                      |   4 +-
 test/unit/test_fvm_layout.cpp                 |   4 +-
 test/unit/test_intrin.cpp                     |   2 +-
 test/unit/test_intrin.cu                      |  48 +++---
 test/unit/test_matrix_cpuvsgpu.cpp            |   2 +-
 .../{test_matrix.cu => test_matrix_gpu.cpp}   |   0
 test/unit/test_partition.cpp                  |   2 +-
 test/unit/test_piecewise.cpp                  |  14 +-
 test/unit/test_range.cpp                      |   4 +-
 test/unit/test_reduce_by_key.cu               |  52 +++---
 test/unit/test_scope_exit.cpp                 |   3 +-
 test/unit/test_transform.cpp                  |   6 +-
 .../{test_vector.cu => test_vector_gpu.cpp}   |  30 ----
 45 files changed, 387 insertions(+), 513 deletions(-)
 create mode 100644 test/unit/gpu_vector.hpp
 rename test/unit/{test_matrix.cu => test_matrix_gpu.cpp} (100%)
 rename test/unit/{test_vector.cu => test_vector_gpu.cpp} (76%)

diff --git a/.travis.yml b/.travis.yml
index 0d555b54..87a734c5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,5 @@
 ######## Testing minimal compiler requirements ########
-# GCC          6.4.0
+# GCC          8.1.0
 # Clang        7.0
 # Apple Clang  1100.0.33.16
 #######################################################
@@ -13,27 +13,27 @@ sudo: false
 matrix:
   include:
 ########################## OS X #########################
-## test gcc6 - single node/rank with threading backend ##
+## test gcc8 - single node/rank with threading backend ##
   - name: "osx, gcc, serial, py"
     os: osx
     osx_image: xcode11.3
     python: 3.6
     env:
-      - MATRIX_EVAL="brew install gcc@6 && brew link --force --overwrite gcc@6 && brew install cmake && CC=gcc-6 && CXX=g++-6"
+      - MATRIX_EVAL="brew install gcc@8 && brew link --force --overwrite gcc@8 && brew install cmake && CC=gcc-8 && CXX=g++-8"
       - BUILD_NAME=cthread-osx-gcc-py
       - WITH_DISTRIBUTED=serial WITH_PYTHON=true PY=3 ARCH=native
-    compiler: gcc-6
+    compiler: gcc-8
 
-## test gcc6 - mpi with threading backend ##
+## test gcc8 - mpi with threading backend ##
   - name: "osx, gcc, mpi, py"
     os: osx
     osx_image: xcode11.3
     python: 3.6
     env:
-      - MATRIX_EVAL="brew install gcc@6 && brew link --force --overwrite gcc@6 && brew install cmake && CC=gcc-6 && CXX=g++-6"
+      - MATRIX_EVAL="brew install gcc@8 && brew link --force --overwrite gcc@8 && brew install cmake && CC=gcc-8 && CXX=g++-8"
       - BUILD_NAME=mpi-osx-gcc-py
       - WITH_DISTRIBUTED=mpi WITH_PYTHON=true PY=3 ARCH=native
-    compiler: gcc-6
+    compiler: gcc-8
 
 ## test clang9 - single node/rank with threading backend ##
   - name: "osx, apple clang, serial, py"
@@ -58,7 +58,7 @@ matrix:
     compiler: clang
 
 ######################### LINUX #########################
-## test gcc7 - single node/rank with threading backend ##
+## test gcc8 - single node/rank with threading backend ##
   - name: "linux, gcc, serial, py"
     os: linux
     dist: bionic
@@ -66,16 +66,16 @@ matrix:
       apt:
         sources:
         packages:
-          - g++-7
+          - g++-8
           - openmpi-bin
           - libopenmpi-dev
     env:
-      - MATRIX_EVAL="CC=gcc-7 && CXX=g++-7"
+      - MATRIX_EVAL="CC=gcc-8 && CXX=g++-8"
       - BUILD_NAME=cthread-linux-gcc-py
       - WITH_DISTRIBUTED=serial WITH_PYTHON=true PY=3 ARCH=haswell
-    compiler: gcc-7
+    compiler: gcc-8
 
-## test gcc7 - mpi with threading backend ##
+## test gcc8 - mpi with threading backend ##
   - name: "linux, gcc, mpi, py"
     os: linux
     dist: bionic
@@ -83,16 +83,16 @@ matrix:
       apt:
         sources:
         packages:
-          - g++-6
+          - g++-8
           - openmpi-bin
           - libopenmpi-dev
     env:
-      - MATRIX_EVAL="CC=gcc-7 && CXX=g++-7"
+      - MATRIX_EVAL="CC=gcc-8 && CXX=g++-8"
       - BUILD_NAME=mpi-linux-gcc-py
       - WITH_DISTRIBUTED=mpi WITH_PYTHON=true PY=3 ARCH=haswell
-    compiler: gcc-7
+    compiler: gcc-8
 
-## test clang4 - single node/rank with threading backend ##
+## test clang7 - single node/rank with threading backend ##
   - name: "linux, clang, serial, py"
     os: linux
     dist: bionic
@@ -106,9 +106,9 @@ matrix:
       - MATRIX_EVAL="CC=clang && CXX=clang++"
       - BUILD_NAME=cthread-linux-clang-py
       - WITH_DISTRIBUTED=serial WITH_PYTHON=true PY=3 ARCH=native
-    compiler: clang-4.0
+    compiler: clang-7.0
 
-## test clang4 - mpi with threading backend ##
+## test clang7 - mpi with threading backend ##
   - name: "linux, clang, mpi, py"
     os: linux
     dist: bionic
@@ -122,7 +122,7 @@ matrix:
       - MATRIX_EVAL="CC=clang && CXX=clang++"
       - BUILD_NAME=mpi-linux-clang-py
       - WITH_DISTRIBUTED=mpi WITH_PYTHON=true PY=3 ARCH=native
-    compiler: clang-4.0
+    compiler: clang-7.0
 
 before_install:
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export HOMEBREW_NO_AUTO_UPDATE=1; brew cask uninstall --force oclint; fi
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b307941..bed72ed2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,20 +92,24 @@ set(THREADS_PREFER_PTHREAD_FLAG OFF)
 if(ARB_GPU STREQUAL "cuda")
     set(ARB_WITH_NVCC TRUE)
 
+    # CMake 3.18 and later set the default CUDA architectures for
+    # each target according to CMAKE_CUDA_ARCHITECTURES.
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
+    endif()
+
     enable_language(CUDA)
 
-    # Despite native CUDA support, the CUDA package is still required to find
-    # the NVML library and to export the cuda library dependencies from the
-    # installed target.
-    find_package(CUDA REQUIRED)
+    # Despite native CUDA support, the CUDA package is still required to export
+    # the cuda library dependencies from the installed target.
+    find_package(CUDA 10 REQUIRED)
 
 elseif(ARB_GPU STREQUAL "cuda-clang")
     set(ARB_WITH_CUDA_CLANG TRUE)
 
-    # The CUDA package is needed for clang compilation for the same reasons as
-    # above.
+    # The CUDA package is needed for clang compilation for the same reasons as above.
     # enable_langaue(CUDA) has a bug with clang
-    find_package(CUDA REQUIRED)
+    find_package(CUDA 10 REQUIRED)
 elseif(ARB_GPU STREQUAL "hip")
     set(ARB_WITH_HIP_CLANG TRUE)
 endif()
@@ -132,7 +136,8 @@ include("CheckCompilerXLC")
 
 include("CompilerOptions")
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_WALL}>")
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
@@ -256,28 +261,6 @@ if(ARB_WITH_GPU)
     if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG)
         target_include_directories(arborenv-private-deps INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 
-        # The arborenv lib needs to use the CUDA NVML library for CUDA 9.
-
-        if (${CUDA_VERSION_MAJOR} LESS 10)
-            set(arborenv_requires_nvml TRUE)
-        endif()
-
-        if(arborenv_requires_nvml)
-            set(nvml_names nvidia-ml) # Note: platform dependent? e.g. nvml.lib on Windows.
-            find_library(CUDA_NVML
-                    NAMES ${nvml_names}
-                    NO_DEFAULT_PATH
-                    PATHS ${CMAKE_CUDA_IMPLICIT_DIRECTORIES} ${CUDA_TOOLKIT_ROOT_DIR}
-                    PATH_SUFFIXES lib64/stubs lib/stubs)
-
-            if (NOT CUDA_NVML)
-                message(FATAL_ERROR "Unable to find CUDA NVML library by: ${nvml_names}")
-            endif()
-
-            target_link_libraries(arborenv-private-deps INTERFACE ${CUDA_NVML})
-            target_compile_definitions(arborenv-private-deps INTERFACE ARBENV_USE_NVML)
-        endif()
-
         add_compile_options(
                 "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=integer_sign_change>"
                 "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=unsigned_compare_with_zero>")
@@ -285,20 +268,20 @@ if(ARB_WITH_GPU)
 
 
     if(ARB_WITH_NVCC)
-        target_compile_options(arbor-private-deps INTERFACE
-            $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>)
-        target_compile_options(arbor-private-deps INTERFACE
-            $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>)
         target_compile_options(arbor-private-deps INTERFACE
             $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_60,code=sm_60>)
         target_compile_options(arbor-private-deps INTERFACE
             $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_70,code=sm_70>)
+        if (${CUDA_VERSION_MAJOR} GREATER 10)
+            target_compile_options(arbor-private-deps INTERFACE
+                $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_80,code=sm_80>)
+        endif()
 
         target_compile_definitions(arbor-private-deps INTERFACE ARB_CUDA)
         target_compile_definitions(arborenv-private-deps INTERFACE ARB_CUDA)
 
     elseif(ARB_WITH_CUDA_CLANG)
-        set(clang_options_ -DARB_CUDA -xcuda --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_37 --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_70 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
+        set(clang_options_ -DARB_CUDA -xcuda --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_70 --cuda-gpu-arch=sm_80 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
         target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
         target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
 
@@ -456,10 +439,6 @@ if(ARB_WITH_GPU)
     set(arbor_override_import_lang CXX)
     set(arbor_add_import_libs ${CUDA_LIBRARIES})
     set(arborenv_add_import_libs ${CUDA_LIBRARIES})
-
-    if(arborenv_requires_nvml)
-        list(APPEND arborenv_add_import_libs ${CUDA_NVML})
-    endif()
 endif()
 
 configure_file(
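
Note on the standard settings above: host translation units are now C++17 while
nvcc still compiles CUDA sources as C++14, so headers that nvcc parses must
avoid C++17-only constructs. A hedged illustration (hypothetical function, not
from this patch):

    #include <utility>

    int host_only() {
        std::pair<int, int> p{1, 2};
        auto [lo, hi] = p;                 // structured bindings (C++17)
        if constexpr (sizeof(int) >= 4) {  // constexpr if (C++17)
            return hi - lo;
        }
        return 0;
    }
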
diff --git a/arbor/algorithms.hpp b/arbor/algorithms.hpp
index 722edd07..bcc3b0f1 100644
--- a/arbor/algorithms.hpp
+++ b/arbor/algorithms.hpp
@@ -30,7 +30,7 @@ template <typename C>
 typename util::sequence_traits<C>::value_type
 mean(C const& c)
 {
-    return util::sum(c)/util::size(c);
+    return util::sum(c)/std::size(c);
 }
 
 // returns the prefix sum of c in the form `[0, c[0], c[0]+c[1], ..., sum(c)]`.
diff --git a/arbor/backends/gpu/multi_event_stream.cpp b/arbor/backends/gpu/multi_event_stream.cpp
index 858c9978..c96c47d5 100644
--- a/arbor/backends/gpu/multi_event_stream.cpp
+++ b/arbor/backends/gpu/multi_event_stream.cpp
@@ -6,6 +6,29 @@
 namespace arb {
 namespace gpu {
 
+// These wrappers are implemented in the multi_event_stream.cu file, which
+// is separately compiled by nvcc, to protect nvcc from having to parse C++17.
+void mark_until_after_w(unsigned n,
+        fvm_index_type* mark,
+        fvm_index_type* span_end,
+        fvm_value_type* ev_time,
+        const fvm_value_type* t_until);
+void mark_until_w(unsigned n,
+        fvm_index_type* mark,
+        fvm_index_type* span_end,
+        fvm_value_type* ev_time,
+        const fvm_value_type* t_until);
+void drop_marked_events_w(unsigned n,
+        fvm_index_type* n_nonempty_stream,
+        fvm_index_type* span_begin,
+        fvm_index_type* span_end,
+        fvm_index_type* mark);
+void event_time_if_before_w(unsigned n,
+        fvm_index_type* span_begin,
+        fvm_index_type* span_end,
+        fvm_value_type* ev_time,
+        fvm_value_type* t_until);
+
 void multi_event_stream_base::clear() {
     memory::fill(span_begin_, 0u);
     memory::fill(span_end_, 0u);
@@ -13,5 +36,29 @@ void multi_event_stream_base::clear() {
     n_nonempty_stream_[0] = 0;
 }
 
+// Designate for processing events `ev` at head of each event stream `i`
+// until `event_time(ev)` > `t_until[i]`.
+void multi_event_stream_base::mark_until_after(const_view t_until) {
+    arb_assert(n_streams()==t_until.size());
+    mark_until_after_w(n_stream_, mark_.data(), span_end_.data(), ev_time_.data(), t_until.data());
+}
+
+// Designate for processing events `ev` at head of each event stream `i`
+// while `t_until[i]` > `event_time(ev)`.
+void multi_event_stream_base::mark_until(const_view t_until) {
+    mark_until_w(n_stream_, mark_.data(), span_end_.data(), ev_time_.data(), t_until.data());
+}
+
+// Remove marked events from front of each event stream.
+void multi_event_stream_base::drop_marked_events() {
+    drop_marked_events_w(n_stream_, n_nonempty_stream_.data(), span_begin_.data(), span_end_.data(), mark_.data());
+}
+
+// If the head of `i`th event stream exists and has time less than `t_until[i]`, set
+// `t_until[i]` to the event time.
+void multi_event_stream_base::event_time_if_before(view t_until) {
+    event_time_if_before_w(n_stream_, span_begin_.data(), span_end_.data(), ev_time_.data(), t_until.data());
+}
+
 } // namespace gpu
 } // namespace arb
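
The pattern above keeps kernel launches behind plain C++ wrapper functions: the
.cpp file, compiled as C++17 by the host compiler, sees only declarations, and
the .cu file, compiled by nvcc as C++14, provides the definitions. A minimal
sketch with hypothetical names (`launch_w`, `fill_iota`):

    // wrapper.cpp -- host compiler, C++17; no <<<...>>> syntax here.
    void launch_w(unsigned n, int* out);   // defined in wrapper.cu
    void run(unsigned n, int* out) { launch_w(n, out); }

    // wrapper.cu -- nvcc, C++14.
    __global__ void fill_iota(unsigned n, int* out) {
        unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) out[i] = static_cast<int>(i);
    }
    void launch_w(unsigned n, int* out) {
        fill_iota<<<(n + 127)/128, 128>>>(n, out);
    }
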
diff --git a/arbor/backends/gpu/multi_event_stream.cu b/arbor/backends/gpu/multi_event_stream.cu
index b136f8dc..7e3e90c7 100644
--- a/arbor/backends/gpu/multi_event_stream.cu
+++ b/arbor/backends/gpu/multi_event_stream.cu
@@ -1,7 +1,6 @@
 #include <arbor/common_types.hpp>
 
 #include "backends/event.hpp"
-#include "backends/gpu/multi_event_stream.hpp"
 #include "gpu_common.hpp"
 
 namespace arb {
@@ -86,50 +85,54 @@ namespace kernels {
     }
 } // namespace kernels
 
-// Designate for processing events `ev` at head of each event stream `i`
-// until `event_time(ev)` > `t_until[i]`.
-void multi_event_stream_base::mark_until_after(const_view t_until) {
-    arb_assert(n_streams()==t_until.size());
-
-    constexpr int block_dim = 128;
-
-    unsigned n = n_stream_;
-    int nblock = impl::block_count(n, block_dim);
-    kernels::mark_until_after<<<nblock, block_dim>>>(
-        n, mark_.data(), span_end_.data(), ev_time_.data(), t_until.data());
+void mark_until_after_w(unsigned n,
+        fvm_index_type* mark,
+        fvm_index_type* span_end,
+        fvm_value_type* ev_time,
+        const fvm_value_type* t_until)
+{
+    const int nblock = impl::block_count(n, 128);
+    kernels::mark_until_after
+        <<<nblock, 128>>>
+        (n, mark, span_end, ev_time, t_until);
 }
 
-// Designate for processing events `ev` at head of each event stream `i`
-// while `t_until[i]` > `event_time(ev)`.
-void multi_event_stream_base::mark_until(const_view t_until) {
-    arb_assert(n_streams()==t_until.size());
-    constexpr int block_dim = 128;
-
-    unsigned n = n_stream_;
-    int nblock = impl::block_count(n, block_dim);
-    kernels::mark_until<<<nblock, block_dim>>>(
-        n, mark_.data(), span_end_.data(), ev_time_.data(), t_until.data());
+void mark_until_w(unsigned n,
+        fvm_index_type* mark,
+        fvm_index_type* span_end,
+        fvm_value_type* ev_time,
+        const fvm_value_type* t_until)
+{
+    const int nblock = impl::block_count(n, 128);
+    kernels::mark_until
+        <<<nblock, 128>>>
+        (n, mark, span_end, ev_time, t_until);
 }
 
-// Remove marked events from front of each event stream.
-void multi_event_stream_base::drop_marked_events() {
-    constexpr int block_dim = 128;
+void drop_marked_events_w(unsigned n,
+        fvm_index_type* n_nonempty_stream,
+        fvm_index_type* span_begin,
+        fvm_index_type* span_end,
+        fvm_index_type* mark)
+{
+    const int nblock = impl::block_count(n, 128);
+    kernels::drop_marked_events
+        <<<nblock, 128>>>
+        (n, n_nonempty_stream, span_begin, span_end, mark);
 
-    unsigned n = n_stream_;
-    int nblock = impl::block_count(n, block_dim);
-    kernels::drop_marked_events<<<nblock, block_dim>>>(
-        n, n_nonempty_stream_.data(), span_begin_.data(), span_end_.data(), mark_.data());
 }
 
-// If the head of `i`th event stream exists and has time less than `t_until[i]`, set
-// `t_until[i]` to the event time.
-void multi_event_stream_base::event_time_if_before(view t_until) {
-    constexpr int block_dim = 128;
-    int nblock = impl::block_count(n_stream_, block_dim);
-    kernels::event_time_if_before<<<nblock, block_dim>>>(
-        n_stream_, span_begin_.data(), span_end_.data(), ev_time_.data(), t_until.data());
+void event_time_if_before_w(unsigned n,
+        fvm_index_type* span_begin,
+        fvm_index_type* span_end,
+        fvm_value_type* ev_time,
+        fvm_value_type* t_until)
+{
+    const int nblock = impl::block_count(n, 128);
+    kernels::event_time_if_before
+        <<<nblock, 128>>>
+        (n, span_begin, span_end, ev_time, t_until);
 }
 
-
 } // namespace gpu
 } // namespace arb
diff --git a/arbor/backends/multicore/mechanism.cpp b/arbor/backends/multicore/mechanism.cpp
index a38be492..63f50a36 100644
--- a/arbor/backends/multicore/mechanism.cpp
+++ b/arbor/backends/multicore/mechanism.cpp
@@ -40,8 +40,8 @@ void copy_extend(const Source& source, Dest&& dest, const Fill& fill) {
     using std::begin;
     using std::end;
 
-    auto dest_n = util::size(dest);
-    auto source_n = util::size(source);
+    auto dest_n = std::size(dest);
+    auto source_n = std::size(source);
 
     auto n = source_n<dest_n? source_n: dest_n;
     auto tail = std::copy_n(begin(source), n, begin(dest));
diff --git a/arbor/backends/multicore/multi_event_stream.hpp b/arbor/backends/multicore/multi_event_stream.hpp
index 18b42aa5..8b4c283e 100644
--- a/arbor/backends/multicore/multi_event_stream.hpp
+++ b/arbor/backends/multicore/multi_event_stream.hpp
@@ -97,7 +97,7 @@ public:
     void mark_until_after(const TimeSeq& t_until) {
         using ::arb::event_time;
 
-        arb_assert(n_streams()==util::size(t_until));
+        arb_assert(n_streams()==std::size(t_until));
 
         // note: operation on each `i` is independent.
         for (size_type i = 0; i<n_streams(); ++i) {
@@ -118,7 +118,7 @@ public:
     void mark_until(const TimeSeq& t_until) {
         using ::arb::event_time;
 
-        arb_assert(n_streams()==util::size(t_until));
+        arb_assert(n_streams()==std::size(t_until));
 
         // note: operation on each `i` is independent.
         for (size_type i = 0; i<n_streams(); ++i) {
diff --git a/arbor/fvm_lowered_cell.hpp b/arbor/fvm_lowered_cell.hpp
index 9a4f3972..a30e19d2 100644
--- a/arbor/fvm_lowered_cell.hpp
+++ b/arbor/fvm_lowered_cell.hpp
@@ -2,6 +2,7 @@
 
 #include <cstddef>
 #include <memory>
+#include <type_traits>
 #include <unordered_map>
 #include <vector>
 
@@ -134,8 +135,8 @@ struct fvm_probe_data {
         return util::make_range(
             util::visit(
                 [](auto& i) -> std::pair<const probe_handle*, const probe_handle*> {
-                    using util::data;
-                    using util::size;
+                    using std::data;
+                    using std::size;
                     return {data(i.raw_handles), data(i.raw_handles)+size(i.raw_handles)};
                 },
                 info));
diff --git a/arbor/fvm_lowered_cell_impl.hpp b/arbor/fvm_lowered_cell_impl.hpp
index 81ba0068..ebb430a5 100644
--- a/arbor/fvm_lowered_cell_impl.hpp
+++ b/arbor/fvm_lowered_cell_impl.hpp
@@ -202,8 +202,6 @@ fvm_integration_result fvm_lowered_cell_impl<Backend>::integrate(
     std::vector<deliverable_event> staged_events,
     std::vector<sample_event> staged_samples)
 {
-    using util::as_const;
-
     set_gpu();
 
     // Integration setup
diff --git a/arbor/include/arbor/util/any_visitor.hpp b/arbor/include/arbor/util/any_visitor.hpp
index 348bdaa8..48e5dfc0 100644
--- a/arbor/include/arbor/util/any_visitor.hpp
+++ b/arbor/include/arbor/util/any_visitor.hpp
@@ -17,8 +17,6 @@ namespace util {
 
 namespace impl {
 
-template <typename> using void_t = void; // TODO: C++17 use std::void_t.
-
 template <typename X, typename Y>
 struct propagate_qualifier { using type = Y; };
 
@@ -64,7 +62,7 @@ struct any_visitor<T> {
     };
 
     template <typename F>
-    struct invoke_or_throw<F, impl::void_t<decltype(std::declval<F>()())>> {
+    struct invoke_or_throw<F, std::void_t<decltype(std::declval<F>()())>> {
         template <typename A>
         static auto visit(F&& f, A&& a) {
             using Q = impl::propagate_qualifier_t<A, T>;
@@ -103,7 +101,7 @@ struct invocable_impl {
     struct test: std::false_type {};
 
     template <typename G>
-    struct test<G, void_t<decltype(std::declval<G>()(std::declval<A>()...))>>: std::true_type {};
+    struct test<G, std::void_t<decltype(std::declval<G>()(std::declval<A>()...))>>: std::true_type {};
 
     using type = typename test<F>::type;
 };
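
`std::void_t` (C++17) replaces the local `void_t` alias in the detection idiom
used here and in the `util` headers below. A self-contained sketch of the idiom
with a hypothetical trait:

    #include <type_traits>
    #include <utility>
    #include <vector>

    template <typename T, typename = void>
    struct has_size: std::false_type {};

    template <typename T>
    struct has_size<T, std::void_t<decltype(std::declval<T>().size())>>:
        std::true_type {};

    static_assert(has_size<std::vector<int>>::value);
    static_assert(!has_size<int>::value);
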
diff --git a/arbor/memory/array_view.hpp b/arbor/memory/array_view.hpp
index cbdf1f5c..53a69dfd 100644
--- a/arbor/memory/array_view.hpp
+++ b/arbor/memory/array_view.hpp
@@ -228,7 +228,7 @@ public:
         return size_;
     }
 
-    bool is_empty() const {
+    bool empty() const {
         return size_==0;
     }
 
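
Renaming `is_empty()` to `empty()` matches the standard container interface, so
generic code can call `std::empty` on an `array_view`. A sketch with a
hypothetical stand-in type:

    #include <cstddef>
    #include <iterator>

    struct view_like {
        std::size_t n = 0;
        std::size_t size() const { return n; }
        bool empty() const { return n == 0; }  // std::empty dispatches here
    };

    bool check(const view_like& v) { return std::empty(v); }
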
diff --git a/arbor/profile/profiler.cpp b/arbor/profile/profiler.cpp
index 4ecd1efc..9320a784 100644
--- a/arbor/profile/profiler.cpp
+++ b/arbor/profile/profiler.cpp
@@ -311,11 +311,11 @@ void print(std::ostream& o,
     if (proportion<thresh) return;
 
     if (n.count==profile_node::npos) {
-        snprintf(buf, util::size(buf), "_p_ %-20s%12s%12.3f%12.3f%8.1f",
+        snprintf(buf, std::size(buf), "_p_ %-20s%12s%12.3f%12.3f%8.1f",
                name.c_str(), "-", float(n.time), per_thread_time, proportion);
     }
     else {
-        snprintf(buf, util::size(buf), "_p_ %-20s%12lu%12.3f%12.3f%8.1f",
+        snprintf(buf, std::size(buf), "_p_ %-20s%12lu%12.3f%12.3f%8.1f",
                name.c_str(), n.count, float(n.time), per_thread_time, proportion);
     }
     o << "\n" << buf;
@@ -353,7 +353,7 @@ std::ostream& operator<<(std::ostream& o, const profile& prof) {
 
     auto tree = make_profile_tree(prof);
 
-    snprintf(buf, util::size(buf), "_p_ %-20s%12s%12s%12s%8s", "REGION", "CALLS", "THREAD", "WALL", "\%");
+    snprintf(buf, std::size(buf), "_p_ %-20s%12s%12s%12s%8s", "REGION", "CALLS", "THREAD", "WALL", "\%");
     o << buf;
     print(o, tree, tree.time, prof.num_threads, 0, "");
     return o;
diff --git a/arbor/util/maputil.hpp b/arbor/util/maputil.hpp
index 44956499..bcecd481 100644
--- a/arbor/util/maputil.hpp
+++ b/arbor/util/maputil.hpp
@@ -39,7 +39,7 @@ template <typename Seq, typename = void>
 struct is_associative_container: std::false_type {};
 
 template <typename Seq>
-struct is_associative_container<Seq, void_t<maputil_impl::assoc_test<Seq>>>: maputil_impl::assoc_test<Seq> {};
+struct is_associative_container<Seq, std::void_t<maputil_impl::assoc_test<Seq>>>: maputil_impl::assoc_test<Seq> {};
 
 // Find value in a sequence of key-value pairs or in a key-value assocation map, with
 // optional explicit comparator.
diff --git a/arbor/util/mergeview.hpp b/arbor/util/mergeview.hpp
index 86943b54..4af3ab21 100644
--- a/arbor/util/mergeview.hpp
+++ b/arbor/util/mergeview.hpp
@@ -16,13 +16,13 @@ template <typename A, typename B, typename = void>
 struct has_common_type: std::false_type {};
 
 template <typename A, typename B>
-struct has_common_type<A, B, util::void_t<std::common_type_t<A, B>>>: std::true_type {};
+struct has_common_type<A, B, std::void_t<std::common_type_t<A, B>>>: std::true_type {};
 
 template <typename A, typename B, typename X, typename = void>
 struct common_type_or_else { using type = X; };
 
 template <typename A, typename B, typename X>
-struct common_type_or_else<A, B, X, util::void_t<std::common_type_t<A, B>>> {
+struct common_type_or_else<A, B, X, std::void_t<std::common_type_t<A, B>>> {
     using type = std::common_type_t<A, B>;
 };
 
diff --git a/arbor/util/meta.hpp b/arbor/util/meta.hpp
index 4ab20318..e86a0a04 100644
--- a/arbor/util/meta.hpp
+++ b/arbor/util/meta.hpp
@@ -12,76 +12,6 @@
 namespace arb {
 namespace util {
 
-// The following classes and functions can be replaced
-// with std functions when we migrate to later versions of C++.
-//
-// C++17:
-// void_t, empty, data, as_const
-
-template <class...>
-using void_t = void;
-
-template <typename X>
-constexpr std::size_t size(const X& x) { return x.size(); }
-
-template <typename X, std::size_t N>
-constexpr std::size_t size(X (&)[N]) noexcept { return N; }
-
-template <typename C>
-constexpr auto data(C& c) { return c.data(); }
-
-template <typename C>
-constexpr auto data(const C& c) { return c.data(); }
-
-template <typename T, std::size_t N>
-constexpr T* data(T (&a)[N]) noexcept { return a; }
-
-template <typename T>
-void as_const(T&& t) = delete;
-
-template <typename T>
-constexpr std::add_const_t<T>& as_const(T& t) {
-    return t;
-}
-
-// Use sequence `empty() const` method if exists, otherwise
-// compare begin and end.
-
-namespace impl_empty {
-    template <typename C>
-    struct has_const_empty_method {
-        template <typename T>
-        static decltype(std::declval<const T>().empty(), std::true_type{}) test(int);
-        template <typename T>
-        static std::false_type test(...);
-
-        using type = decltype(test<C>(0));
-    };
-
-    using std::begin;
-    using std::end;
-
-    template <typename Seq>
-    constexpr bool empty(const Seq& seq, std::false_type) {
-        return begin(seq)==end(seq);
-    }
-
-    template <typename Seq>
-    constexpr bool empty(const Seq& seq, std::true_type) {
-        return seq.empty();
-    }
-}
-
-template <typename Seq>
-constexpr bool empty(const Seq& seq) {
-    return impl_empty::empty(seq, typename impl_empty::has_const_empty_method<Seq>::type{});
-}
-
-template <typename T, std::size_t N>
-constexpr bool empty(const T (& c)[N]) noexcept {
-    return false; // N cannot be zero
-}
-
 // Types associated with a container or sequence
 
 namespace impl_seqtrait {
@@ -95,9 +25,19 @@ namespace impl_seqtrait {
     struct data_returns_pointer<T (&)[N], void>: public std::true_type {};
 
     template <typename T>
-    struct data_returns_pointer<T, void_t<decltype(std::declval<T>().data())>>:
+    struct data_returns_pointer<T, std::void_t<decltype(std::declval<T>().data())>>:
         public std::is_pointer<decltype(std::declval<T>().data())>::type {};
 
+    template <typename Seq, typename=void>
+    struct size_type_ {
+        using type = void;
+    };
+
+    template <typename Seq>
+    struct size_type_<Seq, std::void_t<decltype(std::size(std::declval<Seq&>()))>> {
+        using type = decltype(std::size(std::declval<Seq&>()));
+    };
+
     template <typename Seq>
     struct sequence_traits {
         using iterator = decltype(begin(std::declval<Seq&>()));
@@ -105,7 +45,7 @@ namespace impl_seqtrait {
         using value_type = typename std::iterator_traits<iterator>::value_type;
         using reference = typename std::iterator_traits<iterator>::reference;
         using difference_type = typename std::iterator_traits<iterator>::difference_type;
-        using size_type = decltype(size(std::declval<Seq&>()));
+        using size_type = typename size_type_<Seq>::type;
         // For use with heterogeneous ranges:
         using sentinel = decltype(end(std::declval<Seq&>()));
         using const_sentinel = decltype(end(std::declval<const Seq&>()));
@@ -119,7 +59,7 @@ namespace impl_seqtrait {
         std::false_type {};
 
     template<typename T>
-    struct is_sequence<T, void_t<decltype(begin(std::declval<T>()))>>:
+    struct is_sequence<T, std::void_t<decltype(begin(std::declval<T>()))>>:
         std::true_type {};
 
 }
@@ -178,7 +118,7 @@ template <typename T, typename = void>
 struct is_iterator: public std::false_type {};
 
 template <typename T>
-struct is_iterator<T, void_t<typename std::iterator_traits<T>::iterator_category>>:
+struct is_iterator<T, std::void_t<typename std::iterator_traits<T>::iterator_category>>:
     public std::true_type {};
 
 template <typename T>
@@ -249,7 +189,7 @@ template <typename I, typename E>
 struct common_random_access_iterator<
     I,
     E,
-    void_t<decltype(false? std::declval<I>(): std::declval<E>())>,
+    std::void_t<decltype(false? std::declval<I>(): std::declval<E>())>,
     std::enable_if_t<
         is_random_access_iterator<
             std::decay_t<decltype(false? std::declval<I>(): std::declval<E>())>
@@ -269,7 +209,7 @@ struct has_common_random_access_iterator:
     std::false_type {};
 
 template <typename I, typename E>
-struct has_common_random_access_iterator<I, E, void_t<util::common_random_access_iterator_t<I, E>>>:
+struct has_common_random_access_iterator<I, E, std::void_t<util::common_random_access_iterator_t<I, E>>>:
     std::true_type {};
 
 // Generic accessors:
diff --git a/arbor/util/ordered_forest.hpp b/arbor/util/ordered_forest.hpp
index 567226c4..38a8bc09 100644
--- a/arbor/util/ordered_forest.hpp
+++ b/arbor/util/ordered_forest.hpp
@@ -73,8 +73,7 @@ template <typename V, typename Allocator>
 struct ordered_forest_builder;
 
 template <typename V, typename Allocator = std::allocator<V>>
-struct ordered_forest {
-private:
+class ordered_forest {
     struct node {
         V* item_ = nullptr;
         node* parent_ = nullptr;
diff --git a/arbor/util/partition.hpp b/arbor/util/partition.hpp
index e4190b26..f6d4f67c 100644
--- a/arbor/util/partition.hpp
+++ b/arbor/util/partition.hpp
@@ -112,11 +112,11 @@ partition_range<SeqIter> partition_view(const Seq& r) {
  *
  * If the first parameter is `partition_in_place`, the provided
  * container `divisions` will not be resized, and the partition will 
- * be of length `util::size(divisions)-1` or zero if `divisions` is
+ * be of length `std::size(divisions)-1` or zero if `divisions` is
  * empty.
  *
- * Otherwise, `divisions` will be be resized to `util::size(sizes)+1`
- * and represent a partition of length `util::size(sizes)`.
+ * Otherwise, `divisions` will be resized to `std::size(sizes)+1`
+ * and represent a partition of length `std::size(sizes)`.
  *
  * Returns a partition view over `divisions`.
  */
@@ -159,7 +159,7 @@ template <
 >
 partition_range<typename sequence_traits<Part>::const_iterator>
 make_partition(Part& divisions, const Sizes& sizes, T from=T{}) {
-    divisions.resize(size(sizes)+1);
+    divisions.resize(std::size(sizes)+1);
 
     // (would use std::inclusive_scan in C++17)
     auto pi = std::begin(divisions);
diff --git a/arbor/util/piecewise.hpp b/arbor/util/piecewise.hpp
index ef067d62..9f157989 100644
--- a/arbor/util/piecewise.hpp
+++ b/arbor/util/piecewise.hpp
@@ -24,7 +24,7 @@ constexpr pw_size_type pw_npos = -1;
 
 template <typename T>
 struct indexed_const_iterator {
-    using size_type = decltype(util::size(std::declval<T>()));
+    using size_type = decltype(std::size(std::declval<T>()));
     using difference_type = std::make_signed_t<size_type>;
 
     using value_type = decltype(std::declval<T>()[0]);
diff --git a/arbor/util/rangeutil.hpp b/arbor/util/rangeutil.hpp
index 7e6a5fda..c8db7115 100644
--- a/arbor/util/rangeutil.hpp
+++ b/arbor/util/rangeutil.hpp
@@ -39,7 +39,7 @@ range_view(Seq&& seq) {
 
 template <typename Seq, typename = std::enable_if_t<sequence_traits<Seq&&>::is_contiguous>>
 auto range_pointer_view(Seq&& seq) {
-    return make_range(util::data(seq), util::data(seq)+util::size(seq));
+    return make_range(std::data(seq), std::data(seq)+std::size(seq));
 }
 
 template <
@@ -250,7 +250,7 @@ Value max_value(const Seq& seq, Compare cmp = Compare{}) {
     using std::begin;
     using std::end;
 
-    if (util::empty(seq)) {
+    if (std::empty(seq)) {
         return Value{};
     }
 
@@ -277,7 +277,7 @@ std::pair<Value, Value> minmax_value(const Seq& seq, Compare cmp = Compare{}) {
     using std::begin;
     using std::end;
 
-    if (util::empty(seq)) {
+    if (std::empty(seq)) {
         return {Value{}, Value{}};
     }
 
diff --git a/arbor/util/span.hpp b/arbor/util/span.hpp
index 817e1acf..c6599e16 100644
--- a/arbor/util/span.hpp
+++ b/arbor/util/span.hpp
@@ -39,7 +39,7 @@ span<I> make_span(I right) {
 
 template <typename Seq>
 auto count_along(const Seq& s) {
-    return util::make_span(util::size(s));
+    return util::make_span(std::size(s));
 }
 
 } // namespace util
diff --git a/arbor/util/strprintf.hpp b/arbor/util/strprintf.hpp
index f548e79b..eacd5c49 100644
--- a/arbor/util/strprintf.hpp
+++ b/arbor/util/strprintf.hpp
@@ -10,6 +10,7 @@
 #include <string>
 #include <sstream>
 #include <system_error>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -33,7 +34,7 @@ namespace impl_to_string {
     };
 
     template <typename T>
-    struct select<T, util::void_t<decltype(to_string(std::declval<T>()))>> {
+    struct select<T, std::void_t<decltype(to_string(std::declval<T>()))>> {
         static std::string str(const T& v) {
             return to_string(v);
         }
diff --git a/arborenv/gpu_uuid.cpp b/arborenv/gpu_uuid.cpp
index ec64249b..3d2c55d0 100644
--- a/arborenv/gpu_uuid.cpp
+++ b/arborenv/gpu_uuid.cpp
@@ -15,16 +15,6 @@
 #include "gpu_api.hpp"
 
 
-// CUDA 10 allows GPU uuid to be queried via cudaGetDeviceProperties.
-// Previous versions require the CUDA NVML library to get uuid.
-//
-// ARBENV_USE_NVML will be defined at configuration time if using
-// CUDA version 9.
-
-#ifdef ARBENV_USE_NVML
-    #include <nvml.h>
-#endif
-
 #ifdef __linux__
 extern "C" {
     #include <unistd.h>
@@ -92,8 +82,6 @@ std::runtime_error make_runtime_error(api_error_type error_code) {
         + error_code.name() + ": " + error_code.description());
 }
 
-#ifndef ARBENV_USE_NVML
-
 // For CUDA 10 and later the uuid of all available GPUs is straightforward
 // to obtain by querying cudaGetDeviceProperties for each visible device.
 std::vector<uuid> get_gpu_uuids() {
@@ -119,15 +107,16 @@ std::vector<uuid> get_gpu_uuids() {
             throw make_runtime_error(status);
         }
 
-        // Copy the bytes from props.uuid to uuids[i].
-
 #ifdef ARB_HIP
+        // Build a unique string for the device and hash it, then
+        // copy the bytes of the hash to uuids[i].
         auto host = get_hostname();
         if (!host) throw std::runtime_error("Can't uniquely identify GPUs on the system");
         auto uid = std::hash<std::string>{} (*host + '-' + std::to_string(props.pciBusID) + '-' + std::to_string(props.pciDeviceID));
         auto b = reinterpret_cast<const unsigned char*>(&uid);
         std::copy(b, b+sizeof(std::size_t), uuids[i].bytes.begin());
 #else
+        // Copy the bytes from props.uuid to uuids[i].
         auto b = reinterpret_cast<const unsigned char*>(&props.uuid);
         std::copy(b, b+sizeof(uuid), uuids[i].bytes.begin());
 #endif
@@ -136,145 +125,6 @@ std::vector<uuid> get_gpu_uuids() {
     return uuids;
 }
 
-#else
-
-std::runtime_error make_runtime_error(nvmlReturn_t error_code) {
-    return std::runtime_error(
-        std::string("cuda nvml runtime error: ") + nvmlErrorString(error_code));
-}
-
-// Split CUDA_VISIBLE_DEVICES variable string into a list of integers.
-// The environment variable can have spaces, and the order is important:
-// i.e. "0,1" is not the same as "1,0".
-//      CUDA_VISIBLE_DEVICES="1,0"
-//      CUDA_VISIBLE_DEVICES="0, 1"
-// The CUDA run time parses the list until it finds an error, then returns
-// the partial list.
-// i.e.
-//      CUDA_VISIBLE_DEVICES="1, 0, hello" -> {1,0}
-//      CUDA_VISIBLE_DEVICES="hello, 1" -> {}
-// All non-numeric characters at end of a value appear to be ignored:
-//      CUDA_VISIBLE_DEVICES="0a,1" -> {0,1}
-//      CUDA_VISIBLE_DEVICES="a0,1" -> {}
-// This doesn't try too hard to check for all possible errors.
-std::vector<int> parse_visible_devices(std::string str, int ngpu) {
-    std::vector<int> values;
-    std::istringstream ss(str);
-    while (ss) {
-        int v;
-        if (ss >> v) {
-            if (v<0 || v>=ngpu) break;
-            values.push_back(v);
-            while (ss && ss.get()!=',');
-        }
-    }
-    return values;
-}
-
-// Take a uuid string with the format:
-//      GPU-f1fd7811-e4d3-4d54-abb7-efc579fb1e28
-// And convert to a 16 byte sequence
-//
-// Assume that the intput string is correctly formatted.
-uuid string_to_uuid(char* str) {
-    uuid result;
-    unsigned n = std::strlen(str);
-
-    // Remove the "GPU" from front of string, and the '-' hyphens, e.g.:
-    //      GPU-f1fd7811-e4d3-4d54-abb7-efc579fb1e28
-    // becomes
-    //      f1fd7811e4d34d54abb7efc579fb1e28
-    std::remove_if(str, str+n, [](char c){return !std::isxdigit(c);});
-
-    // Converts a single hex character, i.e. 0123456789abcdef, to int
-    // Assumes that input is a valid hex character.
-    auto hex_c2i = [](unsigned char c) -> unsigned char {
-        c = std::tolower(c);
-        return std::isalpha(c)? c-'a'+10: c-'0';
-    };
-
-    // Convert pairs of characters into single bytes.
-    for (int i=0; i<16; ++i) {
-        const char* s = str+2*i;
-        result.bytes[i] = (hex_c2i(s[0])<<4) + hex_c2i(s[1]);
-    }
-
-    return result;
-}
-
-// For CUDA 9 the only way to get gpu uuid is via NVML.
-// NVML can be used to query all GPU devices, not just the
-// devices that have been made visible to the calling process.
-// Hence, there are two steps to finding the uuid of visible devices:
-// 1. Query the environment variable CUDA_VISIBLE_DEVICES to
-//    determine which devices are locally visible, and to enumerate
-//    them correctly.
-// 2. Query NVML for the uuid of each visible device.
-std::vector<uuid> get_gpu_uuids() {
-    // Get number of devices.
-    int ngpus = 0;
-    auto status = get_device_count(&ngpus);
-    if (status.no_device_found()) return {};
-    else if (!status) throw make_runtime_error(status);
-
-    // Attempt to initialize nvml
-    auto nvml_status = nvmlInit();
-    const bool nvml_init = (nvml_status==NVML_ERROR_ALREADY_INITIALIZED);
-    if (!nvml_init && nvml_status!=NVML_SUCCESS) {
-        throw make_runtime_error(nvml_status);
-    }
-    auto nvml_guard = on_scope_exit([nvml_init](){if (!nvml_init) nvmlShutdown();});
-
-    // store the uuids
-    std::vector<uuid> uuids;
-
-    // find the number of available GPUs
-    unsigned count = -1;
-    nvml_status = nvmlDeviceGetCount(&count);
-    if (nvml_status!=NVML_SUCCESS) throw make_runtime_error(nvml_status);
-
-    // Indexes of GPUs available on this rank
-    std::vector<int> device_ids;
-
-    // Test if the environment variable CUDA_VISIBLE_DEVICES has been set.
-    const char* visible_device_env = std::getenv("CUDA_VISIBLE_DEVICES");
-    // If set, attempt to parse the device ids from it.
-    if (visible_device_env) {
-        // Parse the gpu ids from the environment variable
-        device_ids = parse_visible_devices(visible_device_env, count);
-        if ((unsigned)ngpus != device_ids.size()) {
-            // Mismatch between device count detected by cuda runtime
-            // and that set in environment variable.
-            throw std::runtime_error(
-                "Mismatch between the number of devices in CUDA_VISIBLE_DEVICES"
-                " and the number of devices detected by cudaGetDeviceCount.");
-        }
-    }
-    // Not set, so all devices must be available.
-    else {
-        device_ids.resize(count);
-        std::iota(device_ids.begin(), device_ids.end(), 0);
-    }
-
-    // For each device id, query NVML for the device's uuid.
-    for (int i: device_ids) {
-        char buffer[NVML_DEVICE_UUID_BUFFER_SIZE];
-        // get handle of gpu with index i
-        nvmlDevice_t handle;
-        nvml_status = nvmlDeviceGetHandleByIndex(i, &handle);
-        if (nvml_status!=NVML_SUCCESS) throw make_runtime_error(nvml_status);
-
-        // get uuid as a string with format GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
-        nvml_status = nvmlDeviceGetUUID(handle, buffer, sizeof(buffer));
-        if (nvml_status!=NVML_SUCCESS) throw make_runtime_error(nvml_status);
-
-        uuids.push_back(string_to_uuid(buffer));
-    }
-
-    return uuids;
-}
-#endif // ndef ARBENV_USE_NVML
-
 // Compare two sets of uuids
 //   1: both sets are identical
 //  -1: some common elements
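
With the NVML path removed, uuid discovery relies solely on
`cudaGetDeviceProperties`, whose `uuid` field is available from CUDA 10. A
minimal standalone query (a sketch against the CUDA runtime API):

    #include <cstdio>
    #include <cuda_runtime.h>

    void print_uuid_of_device0() {
        cudaDeviceProp props;
        if (cudaGetDeviceProperties(&props, 0) == cudaSuccess) {
            for (unsigned char b: props.uuid.bytes) // cudaUUID_t: 16 bytes
                std::printf("%02x", b);
            std::printf("\n");
        }
    }
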
diff --git a/arborenv/include/arborenv/with_mpi.hpp b/arborenv/include/arborenv/with_mpi.hpp
index ee9e5f15..53ef6a92 100644
--- a/arborenv/include/arborenv/with_mpi.hpp
+++ b/arborenv/include/arborenv/with_mpi.hpp
@@ -26,7 +26,7 @@ struct with_mpi {
         // force exit the application before the exception that is unwinding
         // the stack has been caught, which would deny the opportunity to print
         // an error message explaining the cause of the exception.
-        if (!std::uncaught_exception()) {
+        if (std::uncaught_exceptions()==0) {
             MPI_Finalize();
         }
     }
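
`std::uncaught_exception()` (singular) is deprecated in C++17 and removed in
C++20; the plural form returns a count, so comparing with zero preserves the
old boolean meaning. A sketch of the destructor pattern (hypothetical guard
type):

    #include <exception>

    struct finalize_guard {
        ~finalize_guard() {
            // Only finalize when no exception is propagating through us.
            if (std::uncaught_exceptions() == 0) {
                // e.g. MPI_Finalize();
            }
        }
    };
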
diff --git a/doc/install.rst b/doc/install.rst
index 87ba36c7..97f22e04 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -31,7 +31,7 @@ with very few tools.
     =========== ============================================
     Git         To check out the code, minimum version 2.0.
     CMake       To set up the build, minimum version 3.12.
-    compiler    A C++14 compiler. See `compilers <install-compilers_>`_.
+    compiler    A C++17 compiler. See `compilers <install-compilers_>`_.
     =========== ============================================
 
 .. _install-compilers:
@@ -39,7 +39,7 @@ with very few tools.
 Compilers
 ~~~~~~~~~
 
-Arbor requires a C++ compiler that fully supports C++14.
+Arbor requires a C++ compiler that fully supports C++17.
 We recommend using GCC or Clang, for which Arbor has been tested and optimised.
 
 .. table:: Supported Compilers
@@ -47,10 +47,10 @@ We recommend using GCC or Clang, for which Arbor has been tested and optimised.
     =========== ============ ============================================
     Compiler    Min version  Notes
     =========== ============ ============================================
-    GCC         6.1.0
-    Clang       4.0          Needs GCC 6 or later for standard library.
+    GCC         8.4.0
+    Clang       7.0          Needs GCC 8 or later for standard library.
     Apple Clang 9            Apple LLVM version 9.0.0 (clang-900.0.39.2)
-    Hip Clang                Unofficial Release
+    Hip Clang   ROCm 3.6     HIP support is currently experimental.
     =========== ============ ============================================
 
 .. _note_CC:
@@ -60,7 +60,7 @@ We recommend using GCC or Clang, for which Arbor has been tested and optimised.
     CMake should use. If these are not set, CMake will attempt to automatically choose a compiler,
     which may be too old to compile Arbor.
     For example, the default compiler chosen below by CMake was GCC 4.8.5 at ``/usr/bin/c++``,
-    so the ``CC`` and ``CXX`` variables were used to specify GCC 6.1.0 before calling ``cmake``.
+    so the ``CC`` and ``CXX`` variables were used to specify GCC 10.2.0 before calling ``cmake``.
 
     .. code-block:: bash
 
@@ -70,8 +70,8 @@ We recommend using GCC or Clang, for which Arbor has been tested and optimised.
 
         # check which version of GCC is available
         $ g++ --version
-        g++ (GCC) 6.1.0
-        Copyright (C) 2015 Free Software Foundation, Inc.
+        g++ (GCC) 10.2.0
+        Copyright (C) 2020 Free Software Foundation, Inc.
 
         # set environment variables for compilers
         $ export CC=`which gcc`; export CXX=`which g++`;
@@ -79,10 +79,10 @@ We recommend using GCC or Clang, for which Arbor has been tested and optimised.
         # launch CMake
         # the compiler version and path is given in the CMake output
         $ cmake ..
-        -- The C compiler identification is GNU 6.1.0
-        -- The CXX compiler identification is GNU 6.1.0
-        -- Check for working C compiler: /cm/local/apps/gcc/6.1.0/bin/gcc
-        -- Check for working C compiler: /cm/local/apps/gcc/6.1.0/bin/gcc -- works
+        -- The C compiler identification is GNU 10.2.0
+        -- The CXX compiler identification is GNU 10.2.0
+        -- Check for working C compiler: /cm/local/apps/gcc/10.2.0/bin/gcc
+        -- Check for working C compiler: /cm/local/apps/gcc/10.2.0/bin/gcc -- works
         ...
 
 .. Note::
@@ -108,7 +108,7 @@ Optional Requirements
 GPU Support
 ~~~~~~~~~~~
 
-Arbor has full support for NVIDIA GPUs, for which the NVIDIA CUDA toolkit version 9 is required.
+Arbor has full support for NVIDIA GPUs, for which the NVIDIA CUDA toolkit version 10 is required.
 And experimental support for AMD GPUs when compiled with hip-clang (non-release compiler).
 
 Distributed
@@ -220,13 +220,6 @@ CMake parameters and flags, follow links to the more detailed descriptions below
 
         cmake -DARB_WITH_ASSERTIONS=ON -DCMAKE_BUILD_TYPE=debug
 
-.. topic:: `Release <buildtarget_>`_ mode (compiler optimizations enabled) with the default
-           compiler, optimized for the local `system architecture <install-architecture_>`_.
-
-    .. code-block:: bash
-
-        cmake -DARB_ARCH=native
-
 .. topic:: `Release <buildtarget_>`_ mode with `Clang <install-compilers_>`_.
 
     .. code-block:: bash
@@ -241,12 +234,12 @@ CMake parameters and flags, follow links to the more detailed descriptions below
 
         cmake -DARB_VECTORIZE=ON -DARB_ARCH=haswell
 
-.. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, targeting the `Broadwell architecture <install-vectorize_>`_, with support for `Nvidia GPUs <install-gpu_>`_, and building with `GCC 6 <install-compilers_>`_.
+.. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, targeting the `Broadwell architecture <install-vectorize_>`_, with support for `Nvidia GPUs <install-gpu_>`_, and building with `GCC 9 <install-compilers_>`_.
 
     .. code-block:: bash
 
-        export CC=gcc-6
-        export CXX=g++-6
+        export CC=gcc-9
+        export CXX=g++-9
         cmake -DARB_VECTORIZE=ON -DARB_ARCH=broadwell -DARB_GPU=cuda
 
 .. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, targeting the `Broadwell architecture <install-vectorize_>`_, with support for `AMD GPUs <install-gpu_>`_, and building with `hipcc <install-compilers_>`_.
@@ -258,11 +251,11 @@ CMake parameters and flags, follow links to the more detailed descriptions below
         cmake -DARB_VECTORIZE=ON -DARB_ARCH=broadwell -DARB_GPU=hip
 
 
-.. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, optimized for the `local system architecture <install-architecture_>`_ and `install <install_>`_ in ``/opt/arbor``
+.. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, optimized for the local system architecture and `install <install_>`_ in ``/opt/arbor``
 
     .. code-block:: bash
 
-        cmake -DARB_VECTORIZE=ON -DARB_ARCH=native -DCMAKE_INSTALL_PREFIX=/opt/arbor
+        cmake -DARB_VECTORIZE=ON -DCMAKE_INSTALL_PREFIX=/opt/arbor
 
 .. _buildtarget:
 
@@ -392,8 +385,8 @@ Arbor has experimental support for AMD GPUs using HIP. The only compiler current
 Arbor is built for all supported AMD GPUs and the available GPU will be used at runtime.
 
 .. Note::
-    Arbor supports and has been tested on the Kepler (K20 & K80), Pascal (P100) and Volta (V100) GPUs
-    as well as Vega10 and Vega20 GPUs
+    Arbor supports and has been tested on Pascal (P100) and Volta (V100) NVIDIA GPUs,
+    as well as MI50 and MI60 AMD GPUs.
 
 
 .. _install-python:
@@ -411,12 +404,12 @@ CMake ``ARB_WITH_PYTHON`` option:
 By default ``ARB_WITH_PYTHON=OFF``. When this option is turned on, a Python module called :py:mod:`arbor` is built.
 
 A specific version of Python can be set when configuring with CMake using the
-``PYTHON_EXECUTABLE`` variable. For example, to use Python 3.7 installed on a Linux
-system with the executable in ``/usr/bin/python3.7``:
+``PYTHON_EXECUTABLE`` variable. For example, to use Python 3.8 installed on a Linux
+system with the executable in ``/usr/bin/python3.8``:
 
 .. code-block:: bash
 
-    cmake .. -DARB_WITH_PYTHON=ON -DPYTHON_EXECUTABLE=/usr/bin/python3.7
+    cmake .. -DARB_WITH_PYTHON=ON -DPYTHON_EXECUTABLE=/usr/bin/python3.8
 
 By default the Python module will be installed in the standard ``CMAKE_INSTALL_PREFIX``
 location. To install the module in a different location, for example as a
@@ -437,7 +430,7 @@ On the target LINUX system, the Arbor package was installed in
     By default CMake sets ``CMAKE_INSTALL_PREFIX`` to ``/usr/local`` on Linux and OS X.
     The compiled libraries are installed in ``/usr/local/lib``, headers are installed in
     ``/usr/local/include``, and the Python module will be installed in a path like
-    ``/usr/local/lib/python3.7/site-packages``.
+    ``/usr/local/lib/python3.8/site-packages``.
     Because ``/usr/local`` is a system path, the installation phase needs to be run as root,
     i.e. ``sudo make install``, even if ``ARB_PYTHON_PREFIX`` is set to a user path
     that does not require root to install.
diff --git a/modcc/module.cpp b/modcc/module.cpp
index 16597ab0..aaf9fcc1 100644
--- a/modcc/module.cpp
+++ b/modcc/module.cpp
@@ -460,13 +460,8 @@ bool Module::semantic() {
             }
         }
 
-        // handle the case where there is no SOLVE in BREAKPOINT
-        if(!found_solve) {
-            warning(" there is no SOLVE statement, required to update the"
-                    " state variables, in the BREAKPOINT block",
-                    breakpoint->location());
-        }
-        else {
+        // handle the case where there is a SOLVE in BREAKPOINT (which is the typical case)
+        if (found_solve) {
+            // Redo semantic pass in order to eliminate any removed local symbols.
             api_state->semantic(symbols_);
         }
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 21fe5be9..64f7cb6d 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -11,7 +11,7 @@ endif()
 # Set up pybind11, which is used to generate Python bindings.
 # Pybind11 has good cmake support, so just add the pybind11 directory,
 # instead of using find_package.
-set(PYBIND11_CPP_STANDARD -std=c++14)
+set(PYBIND11_CPP_STANDARD -std=c++17)
 add_subdirectory(pybind11)
 
 set(pyarb_source
diff --git a/python/pybind11 b/python/pybind11
index e2b884c3..3b1dbeba 160000
--- a/python/pybind11
+++ b/python/pybind11
@@ -1 +1 @@
-Subproject commit e2b884c33bcde70b2ea562ffa52dd7ebee276d50
+Subproject commit 3b1dbebabc801c9cf6f0953a4c20b904d444f879
diff --git a/test/unit-distributed/test_mpi.cpp b/test/unit-distributed/test_mpi.cpp
index 935242c2..9dfa81d1 100644
--- a/test/unit-distributed/test_mpi.cpp
+++ b/test/unit-distributed/test_mpi.cpp
@@ -79,12 +79,12 @@ TEST(mpi, gather_all_with_partition) {
         if (i%2) {
             int rank_data[] = { i, i+7, i+8 };
             util::append(expected_values, rank_data);
-            expected_divisions.push_back(expected_divisions.back()+util::size(rank_data));
+            expected_divisions.push_back(expected_divisions.back()+std::size(rank_data));
         }
         else {
             int rank_data[] = { i };
             util::append(expected_values, rank_data);
-            expected_divisions.push_back(expected_divisions.back()+util::size(rank_data));
+            expected_divisions.push_back(expected_divisions.back()+std::size(rank_data));
         }
     }
 
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index c7dfc382..f988f32e 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -174,18 +174,17 @@ set(unit_sources
 
 if(ARB_WITH_GPU)
     list(APPEND unit_sources
-
         test_intrin.cu
         test_gpu_stack.cu
-        test_matrix.cu
-        test_matrix_cpuvsgpu.cpp
+        test_multi_event_stream_gpu.cu
         test_reduce_by_key.cu
-        test_vector.cu
 
+        test_matrix_cpuvsgpu.cpp
+        test_matrix_gpu.cpp
         test_mc_cell_group_gpu.cpp
         test_multi_event_stream_gpu.cpp
-        test_multi_event_stream_gpu.cu
         test_spikes_gpu.cpp
+        test_vector_gpu.cpp
     )
 endif()
 
diff --git a/test/unit/gpu_vector.hpp b/test/unit/gpu_vector.hpp
new file mode 100644
index 00000000..02f08181
--- /dev/null
+++ b/test/unit/gpu_vector.hpp
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <vector>
+
+#include "backends/gpu/gpu_api.hpp"
+
+/*
+ * Helpers for using GPU memory in unit tests.
+ *
+ * The memory helpers can't be used in .cu files, because we don't let nvcc
+ * compile most of our headers to avoid compiler bugs and C++ version issues.
+ */
+
+template <typename T>
+struct gpu_ref_proxy {
+    T* ptr;
+
+    gpu_ref_proxy(T* p): ptr(p) {}
+
+    gpu_ref_proxy& operator=(const T& value) {
+        arb::gpu::device_memcpy(ptr, &value, sizeof(T), arb::gpu::gpuMemcpyHostToDevice);
+        return *this;
+    }
+
+    operator T() const {
+        T tmp;
+        arb::gpu::device_memcpy(&tmp, ptr, sizeof(T), arb::gpu::gpuMemcpyDeviceToHost);
+        return tmp;
+    }
+};
+
+template <typename T>
+class gpu_vector {
+    using value_type = T;
+    using size_type = std::size_t;
+
+public:
+    gpu_vector() = default;
+
+    gpu_vector(size_type n) {
+        allocate(n);
+    }
+
+    gpu_vector(const std::vector<T>& other) {
+        allocate(other.size());
+        to_device(other.data());
+    }
+
+    ~gpu_vector() {
+        if (data_) arb::gpu::device_free(data_);
+    }
+
+    std::vector<T> host_vector() const {
+        std::vector<T> v(size());
+        to_host(v.data());
+        return v;
+    }
+
+    value_type* data() {
+        return data_;
+    }
+
+    const value_type* data() const {
+        return data_;
+    }
+
+    size_type size() const {
+        return size_;
+    }
+
+    value_type operator[](size_type i) const {
+        value_type tmp;
+        arb::gpu::device_memcpy(&tmp, data_+i, sizeof(value_type), arb::gpu::gpuMemcpyDeviceToHost);
+        return tmp;
+    }
+
+    gpu_ref_proxy<value_type> operator[](size_type i) {
+        return gpu_ref_proxy<value_type>(data_+i);
+    }
+
+private:
+
+    void allocate(size_type n) {
+        size_ = n;
+        arb::gpu::device_malloc(&data_, n*sizeof(T));
+    }
+
+    void to_device(const value_type* other) {
+        arb::gpu::device_memcpy(data_, other, size_in_bytes(), arb::gpu::gpuMemcpyHostToDevice);
+    }
+
+    void to_host(value_type* other) const {
+        arb::gpu::device_memcpy(other, data_, size_in_bytes(), arb::gpu::gpuMemcpyDeviceToHost);
+    }
+
+    size_type size_in_bytes() const {
+        return size_*sizeof(value_type);
+    }
+
+    void free() {
+        arb::gpu::device_free(data_);
+        size_ = 0;
+        data_ = nullptr;
+    }
+
+    size_type size_ = 0;
+    value_type* data_ = nullptr;
+};
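
A quick usage sketch for the helper above, assuming a CUDA-enabled build.
Element access goes through one device_memcpy per element (a gpu_ref_proxy
write is a host-to-device copy, a read a device-to-host copy), which is slow
but perfectly adequate for unit tests:

    #include <vector>
    #include "gpu_vector.hpp"

    void example() {
        // note: gpu_vector owns its buffer; avoid copies of the vector itself.
        gpu_vector<double> v(std::vector<double>{1., 2., 3.});

        v[1] = 20.;        // gpu_ref_proxy::operator= -> one H2D memcpy
        double x = v[2];   // proxy conversion to T  -> one D2H memcpy

        std::vector<double> host = v.host_vector(); // copy whole buffer back
        // host == {1., 20., 3.}, x == 3.
    }
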
diff --git a/test/unit/stats.hpp b/test/unit/stats.hpp
index 1e26caf7..c453aebf 100644
--- a/test/unit/stats.hpp
+++ b/test/unit/stats.hpp
@@ -57,7 +57,7 @@ namespace ks {
 
 template <typename Seq>
 double dn_statistic(const Seq& qs) {
-    double n = static_cast<double>(arb::util::size(qs));
+    double n = static_cast<double>(std::size(qs));
     double d = 0;
     int j = 0;
     for (auto q: qs) {
diff --git a/test/unit/test_algorithms.cpp b/test/unit/test_algorithms.cpp
index ab975e0c..303996df 100644
--- a/test/unit/test_algorithms.cpp
+++ b/test/unit/test_algorithms.cpp
@@ -483,9 +483,9 @@ template <typename Sub, typename Sup>
     using namespace arb;
 
     auto indices = util::index_into(sub, sup);
-    auto n_indices = util::size(indices);
-    auto n_sub  = util::size(sub);
-    if (util::size(indices)!=util::size(sub)) {
+    auto n_indices = std::size(indices);
+    auto n_sub  = std::size(sub);
+    if (std::size(indices)!=std::size(sub)) {
         return ::testing::AssertionFailure()
              << "index_into size " << n_indices << " does not equal sub-sequence size " << n_sub;
     }
@@ -530,7 +530,7 @@ arb::util::range<std::reverse_iterator<I>> reverse_range(arb::util::range<I> r)
 TEST(algorithms, index_into)
 {
     using ivector = std::vector<std::ptrdiff_t>;
-    using arb::util::size;
+    using std::size;
     using arb::util::index_into;
     using arb::util::assign_from;
     using arb::util::make_range;
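
As the size assertion above implies, index_into(sub, sup) yields one entry
per element of the sub-sequence: its position in the super-sequence. A small
illustration of that contract (my reading, not code from this patch):

    std::vector<int> sup = {10, 20, 30, 40, 50};
    std::vector<int> sub = {20, 40};
    // index_into(sub, sup) -> {1, 3}: sub[k] == sup[indices[k]],
    // hence std::size(indices) == std::size(sub).
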
diff --git a/test/unit/test_cycle.cpp b/test/unit/test_cycle.cpp
index ec935ae1..a9d8b5e0 100644
--- a/test/unit/test_cycle.cpp
+++ b/test/unit/test_cycle.cpp
@@ -113,7 +113,7 @@ TEST(cycle_iterator, carray) {
     int values[] = { 4, 2, 3 };
     auto cycle_iter = util::make_cyclic_iterator(std::cbegin(values),
                                                  std::cend(values));
-    auto values_size = util::size(values);
+    auto values_size = std::size(values);
     for (auto i = 0u; i < 2*values_size; ++i) {
         EXPECT_EQ(values[i % values_size], *cycle_iter++);
     }
@@ -201,7 +201,7 @@ TEST(cycle_iterator, order) {
     EXPECT_TRUE(c1 <= c2);
     EXPECT_TRUE(c1 >= c2);
 
-    c2 += util::size(values);
+    c2 += std::size(values);
 
     EXPECT_TRUE(c1 < c2);
     EXPECT_FALSE(c2 < c1);
diff --git a/test/unit/test_fvm_layout.cpp b/test/unit/test_fvm_layout.cpp
index dc5f9527..0f55d429 100644
--- a/test/unit/test_fvm_layout.cpp
+++ b/test/unit/test_fvm_layout.cpp
@@ -176,9 +176,9 @@ struct exp_instance {
 
     template <typename Seq>
     exp_instance(int cv, const Seq& tgts, double e, double tau):
-        cv(cv), multiplicity(util::size(tgts)), e(e), tau(tau)
+        cv(cv), multiplicity(std::size(tgts)), e(e), tau(tau)
     {
-        targets.reserve(util::size(tgts));
+        targets.reserve(std::size(tgts));
         for (auto t: tgts) targets.push_back(t);
         util::sort(targets);
     }
diff --git a/test/unit/test_intrin.cpp b/test/unit/test_intrin.cpp
index e0b3d385..4910cdb9 100644
--- a/test/unit/test_intrin.cpp
+++ b/test/unit/test_intrin.cpp
@@ -10,7 +10,7 @@
 using namespace arb::multicore;
 
 using arb::util::make_span;
-using arb::util::size;
+using std::size;
 
 constexpr double dqnan = std::numeric_limits<double>::quiet_NaN();
 constexpr double dmax = std::numeric_limits<double>::max();
diff --git a/test/unit/test_intrin.cu b/test/unit/test_intrin.cu
index 50ae74a7..b8d1da9c 100644
--- a/test/unit/test_intrin.cu
+++ b/test/unit/test_intrin.cu
@@ -4,9 +4,8 @@
 
 #include "backends/gpu/gpu_api.hpp"
 #include "backends/gpu/math_cu.hpp"
-#include "memory/memory.hpp"
-#include "util/rangeutil.hpp"
-#include "util/span.hpp"
+
+#include "gpu_vector.hpp"
 
 namespace kernels {
     template <typename T>
@@ -45,15 +44,13 @@ namespace kernels {
 TEST(gpu_intrinsics, gpu_atomic_add) {
     int expected = (128*129)/2;
 
-    arb::memory::device_vector<float> f(1);
-    f[0] = 0.f;
+    gpu_vector<float> f(std::vector<float>{0.f});
 
     kernels::test_atomic_add<<<1, 128>>>(f.data());
 
     EXPECT_EQ(float(expected), f[0]);
 
-    arb::memory::device_vector<double> d(1);
-    d[0] = 0.f;
+    gpu_vector<double> d(std::vector<double>{0.});
 
     kernels::test_atomic_add<<<1, 128>>>(d.data());
 
@@ -64,15 +61,13 @@ TEST(gpu_intrinsics, gpu_atomic_add) {
 TEST(gpu_intrinsics, gpu_atomic_sub) {
     int expected = -(128*129)/2;
 
-    arb::memory::device_vector<float> f(1);
-    f[0] = 0.f;
+    gpu_vector<float> f(std::vector<float>{0.f});
 
     kernels::test_atomic_sub<<<1, 128>>>(f.data());
 
     EXPECT_EQ(float(expected), f[0]);
 
-    arb::memory::device_vector<double> d(1);
-    d[0] = 0.f;
+    gpu_vector<double> d(std::vector<double>{0.});
 
     kernels::test_atomic_sub<<<1, 128>>>(d.data());
 
@@ -97,38 +92,36 @@ TEST(gpu_intrinsics, minmax) {
         {  0, -inf, -inf,   0},
     };
 
-    const auto n = arb::util::size(inputs);
-
-    arb::memory::device_vector<double> lhs(n);
-    arb::memory::device_vector<double> rhs(n);
-    arb::memory::device_vector<double> result(n);
+    const int n = inputs.size();
 
-    using arb::util::make_span;
+    gpu_vector<double> lhs(n);
+    gpu_vector<double> rhs(n);
+    gpu_vector<double> result(n);
 
-    for (auto i: make_span(0, n)) {
+    for (int i=0; i<n; ++i) {
         lhs[i] = inputs[i].lhs;
         rhs[i] = inputs[i].rhs;
     }
 
     // test min
     kernels::test_min<<<1, n>>>(lhs.data(), rhs.data(), result.data());
-    for (auto i: make_span(0, n)) {
+    for (int i=0; i<n; ++i) {
         EXPECT_EQ(double(result[i]), inputs[i].expected_min);
     }
 
     kernels::test_min<<<1, n>>>(rhs.data(), lhs.data(), result.data());
-    for (auto i: make_span(0, n)) {
+    for (int i=0; i<n; ++i) {
         EXPECT_EQ(double(result[i]), inputs[i].expected_min);
     }
 
     // test max
     kernels::test_max<<<1, n>>>(lhs.data(), rhs.data(), result.data());
-    for (auto i: make_span(0, n)) {
+    for (int i=0; i<n; ++i) {
         EXPECT_EQ(double(result[i]), inputs[i].expected_max);
     }
 
     kernels::test_max<<<1, n>>>(rhs.data(), lhs.data(), result.data());
-    for (auto i: make_span(0, n)) {
+    for (int i=0; i<n; ++i) {
         EXPECT_EQ(double(result[i]), inputs[i].expected_max);
     }
 }
@@ -137,16 +130,15 @@ TEST(gpu_intrinsics, exprelr) {
     constexpr double dmin = std::numeric_limits<double>::min();
     constexpr double dmax = std::numeric_limits<double>::max();
     constexpr double deps = std::numeric_limits<double>::epsilon();
-    double inputs[] = {-1.,  -0.,  0.,  1., -dmax,  -dmin,  dmin,  dmax, -deps, deps, 10*deps, 100*deps, 1000*deps};
+    std::vector<double> inputs{-1.,  -0.,  0.,  1., -dmax,  -dmin,  dmin,  dmax, -deps, deps, 10*deps, 100*deps, 1000*deps};
 
-    auto n = arb::util::size(inputs);
-    arb::memory::device_vector<double> x(arb::memory::host_view<double>(inputs, n));
-    arb::memory::device_vector<double> result(n);
+    auto n = inputs.size();
+    gpu_vector<double> x(inputs);
+    gpu_vector<double> result(n);
 
     kernels::test_exprelr<<<1,n>>>(x.data(), result.data());
 
-    auto index = arb::util::make_span(0, n);
-    for (auto i: index) {
+    for (unsigned i=0; i<n; ++i) {
         auto x = inputs[i];
         double expected = std::fabs(x)<deps? 1.0: x/std::expm1(x);
         double error = std::fabs(expected-double(result[i]));
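
For context, the reference value computed here is exprelr(x) = x/(e^x - 1),
with the removable singularity at x = 0 patched to 1. A host-side restatement
of the same reference the test uses:

    #include <cmath>
    #include <limits>

    double exprelr_ref(double x) {
        constexpr double eps = std::numeric_limits<double>::epsilon();
        // expm1 keeps precision for small |x|; near zero, x/expm1(x) -> 1.
        return std::fabs(x)<eps? 1.0: x/std::expm1(x);
    }
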
diff --git a/test/unit/test_matrix_cpuvsgpu.cpp b/test/unit/test_matrix_cpuvsgpu.cpp
index 07601e1a..ed751b21 100644
--- a/test/unit/test_matrix_cpuvsgpu.cpp
+++ b/test/unit/test_matrix_cpuvsgpu.cpp
@@ -131,7 +131,7 @@ TEST(matrix, assemble)
     m_gpu.assemble(on_gpu(dt), gpu_array(group_size, -64), gpu_array(group_size, 10), gpu_array(group_size, 3));
     m_gpu.solve(x_d);
     auto result_g = on_host(x_d);
-    
+
     // Compare the GPU and CPU results.
     // Cast result to float, because we are happy to ignore small differences
     EXPECT_TRUE(seq_almost_eq<float>(result_h, result_g));
diff --git a/test/unit/test_matrix.cu b/test/unit/test_matrix_gpu.cpp
similarity index 100%
rename from test/unit/test_matrix.cu
rename to test/unit/test_matrix_gpu.cpp
diff --git a/test/unit/test_partition.cpp b/test/unit/test_partition.cpp
index d8c160c4..cd67bc2f 100644
--- a/test/unit/test_partition.cpp
+++ b/test/unit/test_partition.cpp
@@ -83,7 +83,7 @@ TEST(partition, partition_view_non_numeric) {
 
 TEST(partition, make_partition_in_place) {
     unsigned sizes[] = { 7, 3, 0, 2 };
-    unsigned part_store[util::size(sizes)+1];
+    unsigned part_store[std::size(sizes)+1];
 
     auto p = util::make_partition(util::partition_in_place, part_store, sizes, 10u);
     ASSERT_EQ(4u, p.size());
diff --git a/test/unit/test_piecewise.cpp b/test/unit/test_piecewise.cpp
index 65b74361..cac24581 100644
--- a/test/unit/test_piecewise.cpp
+++ b/test/unit/test_piecewise.cpp
@@ -239,21 +239,21 @@ TEST(piecewise, equal_range) {
         ASSERT_EQ(er0.first, er0.second);
 
         auto er1 = p.equal_range(1.0);
-        ASSERT_EQ(1u, er1.second-er1.first);
+        ASSERT_EQ(1, er1.second-er1.first);
         EXPECT_EQ(10, er1.first->second);
 
         auto er2 = p.equal_range(2.0);
-        ASSERT_EQ(2u, er2.second-er2.first);
+        ASSERT_EQ(2, er2.second-er2.first);
         auto iter = er2.first;
         EXPECT_EQ(10, iter++->second);
         EXPECT_EQ(9, iter->second);
 
         auto er3_5 = p.equal_range(3.5);
-        ASSERT_EQ(1u, er3_5.second-er3_5.first);
+        ASSERT_EQ(1, er3_5.second-er3_5.first);
         EXPECT_EQ(8, er3_5.first->second);
 
         auto er4 = p.equal_range(4.0);
-        ASSERT_EQ(1u, er4.second-er4.first);
+        ASSERT_EQ(1, er4.second-er4.first);
         EXPECT_EQ(8, er4.first->second);
 
         auto er5 = p.equal_range(5.0);
@@ -267,13 +267,13 @@ TEST(piecewise, equal_range) {
         ASSERT_EQ(er0.first, er0.second);
 
         auto er1 = p.equal_range(1.0);
-        ASSERT_EQ(2u, er1.second-er1.first);
+        ASSERT_EQ(2, er1.second-er1.first);
         auto iter = er1.first;
         EXPECT_EQ(10, iter++->second);
         EXPECT_EQ(11, iter++->second);
 
         auto er2 = p.equal_range(2.0);
-        ASSERT_EQ(4u, er2.second-er2.first);
+        ASSERT_EQ(4, er2.second-er2.first);
         iter = er2.first;
         EXPECT_EQ(11, iter++->second);
         EXPECT_EQ(12, iter++->second);
@@ -281,7 +281,7 @@ TEST(piecewise, equal_range) {
         EXPECT_EQ(14, iter++->second);
 
         auto er3 = p.equal_range(3.0);
-        ASSERT_EQ(2u, er3.second-er3.first);
+        ASSERT_EQ(2, er3.second-er3.first);
         iter = er3.first;
         EXPECT_EQ(14, iter++->second);
         EXPECT_EQ(15, iter++->second);
diff --git a/test/unit/test_range.cpp b/test/unit/test_range.cpp
index ff1e2276..9b6a14f1 100644
--- a/test/unit/test_range.cpp
+++ b/test/unit/test_range.cpp
@@ -421,7 +421,7 @@ TEST(range, assign_from) {
 
     {
         std::vector<int> copy = util::assign_from(in);
-        for (auto i=0u; i<util::size(in); ++i) {
+        for (auto i=0u; i<std::size(in); ++i) {
             EXPECT_EQ(in[i], copy[i]);
         }
     }
@@ -429,7 +429,7 @@ TEST(range, assign_from) {
     {
         std::vector<int> copy = util::assign_from(
             util::transform_view(in, [](int i) {return 2*i;}));
-        for (auto i=0u; i<util::size(in); ++i) {
+        for (auto i=0u; i<std::size(in); ++i) {
             EXPECT_EQ(2*in[i], copy[i]);
         }
     }
diff --git a/test/unit/test_reduce_by_key.cu b/test/unit/test_reduce_by_key.cu
index 5d164a25..30998cd1 100644
--- a/test/unit/test_reduce_by_key.cu
+++ b/test/unit/test_reduce_by_key.cu
@@ -1,11 +1,11 @@
 #include "../gtest.h"
 
+#include <algorithm>
 #include <vector>
 
 #include <backends/gpu/reduce_by_key.hpp>
-#include <memory/memory.hpp>
-#include <util/span.hpp>
-#include <util/rangeutil.hpp>
+
+#include "gpu_vector.hpp"
 
 using namespace arb;
 
@@ -25,37 +25,35 @@ std::vector<T> reduce(const std::vector<T>& in, size_t n_out, const std::vector<
     EXPECT_EQ(in.size(), index.size());
     EXPECT_TRUE(std::is_sorted(index.begin(), index.end()));
 
-    using array = memory::device_vector<T>;
-    using iarray = memory::device_vector<int>;
+    using array  = gpu_vector<T>;
+    using iarray = gpu_vector<int>;
 
     int n = in.size();
 
-    array  src = memory::make_const_view(in);
-    iarray idx = memory::make_const_view(index);
-    array  dst(n_out, 0);
+    array  src(in);
+    iarray idx(index);
+    array  dst(std::vector<T>(n_out, 0));
 
     unsigned grid_dim = (n-1)/block_dim + 1;
     reduce_kernel<<<grid_dim, block_dim>>>(src.data(), dst.data(), idx.data(), n);
 
-    std::vector<T> out(n_out);
-    memory::copy(dst, memory::make_view(out));
-
-    return out;
+    return dst.host_vector();
 }
 
 TEST(reduce_by_key, no_repetitions)
 {
     int n = 64;
+    std::vector<int> index(n);
+    for (int i=0; i<n; ++i) index[i] = i;
+
     {
         std::vector<float> in(n, 1);
-        std::vector<int> index = util::assign_from(util::make_span(0, n));
 
         auto out = reduce(in, n, index);
         for (auto o: out) EXPECT_EQ(o, 1.0f);
     }
     {
         std::vector<double> in(n, 1);
-        std::vector<int> index = util::assign_from(util::make_span(0, n));
 
         auto out = reduce(in, n, index);
         for (auto o: out) EXPECT_EQ(o, 1.0);
@@ -76,7 +74,8 @@ TEST(reduce_by_key, single_repeated_index)
     // Perform reduction of an ascending sequence of {1,2,3,...,n}
     // The expected result is n*(n+1)/2
     for (auto n: {1, 2, 7, 31, 32, 33, 63, 64, 65, 128}) {
-        std::vector<double> in = util::assign_from(util::make_span(1, n+1));
+        std::vector<double> in(n);
+        for (int i=0; i<n; ++i) in[i] = i+1;
         std::vector<int> index(n, 0);
 
         auto out = reduce(in, 1, index);
@@ -86,8 +85,10 @@ TEST(reduce_by_key, single_repeated_index)
 
 TEST(reduce_by_key, scatter)
 {
+    // A monotonic sequence of keys with repetitions and gaps, for a reduction
+    // onto an array of length 12.
+    std::size_t n = 12;
     std::vector<int> index = {0,0,0,1,2,2,2,2,3,3,7,7,7,7,7,11};
-    unsigned n = util::max_value(index)+1;
     std::vector<double> in(index.size(), 1);
     std::vector<double> expected = {3., 1., 4., 2., 0., 0., 0., 5., 0., 0., 0., 1.};
 
@@ -125,28 +126,27 @@ std::vector<T> reduce_twice(const std::vector<T>& in, size_t n_out, const std::v
     EXPECT_EQ(in.size(), index.size());
     EXPECT_TRUE(std::is_sorted(index.begin(), index.end()));
 
-    using array = memory::device_vector<T>;
-    using iarray = memory::device_vector<int>;
+    using array  = gpu_vector<T>;
+    using iarray = gpu_vector<int>;
 
     int n = in.size();
 
-    array  src = memory::make_const_view(in);
-    iarray idx = memory::make_const_view(index);
-    array  dst(n_out, 0);
+    array  src(in);
+    iarray idx(index);
+    array  dst(std::vector<T>(n_out, 0));
 
     unsigned grid_dim = (n-1)/block_dim + 1;
     reduce_twice_kernel<<<grid_dim, block_dim>>>(src.data(), dst.data(), idx.data(), n);
 
-    std::vector<T> out(n_out);
-    memory::copy(dst, memory::make_view(out));
-
-    return out;
+    return dst.host_vector();
 }
 
 TEST(reduce_by_key, scatter_twice)
 {
+    // A monotonic sequence of keys with repetitions and gaps, for a reduction
+    // onto an array of length 12.
+    std::size_t n = 12;
     std::vector<int> index = {0,0,0,1,2,2,3,7,7,7,11};
-    unsigned n = util::max_value(index)+1;
     std::vector<double> in(index.size(), 1);
     std::vector<double> expected = {6., 2., 4., 2., 0., 0., 0., 6., 0., 0., 0., 2.};
 
diff --git a/test/unit/test_scope_exit.cpp b/test/unit/test_scope_exit.cpp
index 7f1890f0..ec65fad3 100644
--- a/test/unit/test_scope_exit.cpp
+++ b/test/unit/test_scope_exit.cpp
@@ -16,7 +16,8 @@ TEST(scope_exit, basic) {
 }
 
 TEST(scope_exit, noexceptcall) {
-    auto guard1 = on_scope_exit([] {});
+    bool chill = true;
+    auto guard1 = on_scope_exit([&] { if (!chill) throw 0; });
     using G1 = decltype(guard1);
     EXPECT_FALSE(noexcept(guard1.~G1()));
 
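The throwing branch makes invoking the stored callable genuinely
potentially-throwing: the guard's destructor should be noexcept exactly when
the callable is. A generic sketch of that mechanism (not Arbor's
on_scope_exit itself):

    template <typename F>
    struct scope_exit_sketch {
        F f;
        ~scope_exit_sketch() noexcept(noexcept(f())) { f(); }
    };
    // With the lambda above, noexcept(guard.~G()) is false; with a callable
    // declared noexcept, it would be true.
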
diff --git a/test/unit/test_transform.cpp b/test/unit/test_transform.cpp
index 105c07a5..eaf008be 100644
--- a/test/unit/test_transform.cpp
+++ b/test/unit/test_transform.cpp
@@ -19,7 +19,7 @@ TEST(transform, transform_view) {
 
     auto r = util::transform_view(fl, [](int i) { return i*i+0.5; });
 
-    EXPECT_EQ(5u, util::size(r));
+    EXPECT_EQ(5u, std::size(r));
     EXPECT_EQ(16.5, *(std::next(std::begin(r), 1)));
 
     std::copy(r.begin(), r.end(), std::back_inserter(result));
@@ -122,7 +122,7 @@ TEST(indirect, nocopy) {
 
 TEST(indirect, nomove) {
     testing::nomove<double> data[6];
-    for (unsigned i=0; i<util::size(data); ++i) data[i].value = 10.+i;
+    for (unsigned i=0; i<std::size(data); ++i) data[i].value = 10.+i;
     unsigned map_reverse[6] = {5, 4, 3, 2, 1, 0};
     auto reversed = util::indirect_view(data, map_reverse);
 
@@ -156,7 +156,7 @@ TEST(indirect, modifying) {
     // permuted[4] = data[1]
     // permuted[5] = data[0]
 
-    for (unsigned i = 0; i<util::size(permuted); ++i) {
+    for (unsigned i = 0; i<std::size(permuted); ++i) {
         permuted[i] = 10.+i;
     }
     std::vector<double> expected = {15., 14., 12.};
diff --git a/test/unit/test_vector.cu b/test/unit/test_vector_gpu.cpp
similarity index 76%
rename from test/unit/test_vector.cu
rename to test/unit/test_vector_gpu.cpp
index 5a195e48..54a4fd8a 100644
--- a/test/unit/test_vector.cu
+++ b/test/unit/test_vector_gpu.cpp
@@ -38,21 +38,6 @@ TEST(vector, make_host_devicevector) {
     EXPECT_TRUE((std::is_same<int, target_type::value_type>::value));
 }
 
-// test that memory::on_gpu correctly makes a view of a device vector
-/*
-TEST(vector, make_gpu_devicevector) {
-    memory::device_vector<int> dvec(10);
-    auto view = memory::on_gpu(dvec);
-    using target_type = std::decay_t<decltype(view)>;
-    EXPECT_EQ(view.size(), dvec.size());
-    EXPECT_EQ(view.data(), dvec.data());
-    EXPECT_TRUE(memory::util::is_on_gpu<target_type>());
-    EXPECT_FALSE(memory::util::is_on_host<target_type>());
-
-    EXPECT_TRUE((std::is_same<int, target_type::value_type>::value));
-}
-*/
-
 //
 //  fill
 //
@@ -72,21 +57,6 @@ TEST(vector, fill_gpu) {
             EXPECT_EQ(value, double(v[i]));
         }
     }
-
-    // fill an array view
-    /*
-    memory::device_vector<float> ubervec(N);
-    for (auto n : make_span(0u, N)) {
-        float value = float((n+1)/2.f);
-        // make a view of a sub-range of the std::vector ubervec
-        auto v = ubervec(0, n);
-        memory::fill(v, value);
-
-        for (auto i: make_span(0u, n)) {
-            EXPECT_EQ(float(v[i]), value);
-        }
-    }
-    */
 }
 
 //
-- 
GitLab