From ddbece13ba9df2cd1a30b4c5fdfdffce4de31e08 Mon Sep 17 00:00:00 2001
From: Ben Cumming <louncharf@gmail.com>
Date: Mon, 25 Sep 2017 13:15:44 +0200
Subject: [PATCH] Finish Separable CUDA compilation (#356)

Separate compilation for all CUDA code.

* Move all CUDA kernels to their own .cu files, together with C++ function wrappers.
* Compile all CUDA .cu files to a single static library.
* Merge gpu and multicore backend validation tests.
* Simplify and clean up cruft from CMakeLists.txt files.
---
 mechanisms/CMakeLists.txt | 5 +-
 miniapp/CMakeLists.txt | 18 +-
 miniapp/miniapp.cu | 1 -
 modcc/cudaprinter.cpp | 7 +-
 modcc/modcc.cpp | 8 +-
 src/CMakeLists.txt | 27 +--
 src/backends/fvm.hpp | 9 +-
 src/backends/gpu/{fvm.cu => fvm.cpp} | 0
 src/backends/gpu/fvm.hpp | 20 +-
 ...assemble_matrix.hpp => assemble_matrix.cu} | 54 ++++-
 src/backends/gpu/kernels/detail.hpp | 49 ++--
 src/backends/gpu/kernels/interleave.cu | 45 ++++
 src/backends/gpu/kernels/interleave.hpp | 31 ++-
 .../{solve_matrix.hpp => solve_matrix.cu} | 38 ++-
 src/backends/gpu/kernels/stim_current.cu | 48 ++++
 src/backends/gpu/kernels/time_ops.cu | 80 +++++++
 src/backends/gpu/kernels/time_ops.hpp | 113 ---------
 src/backends/gpu/matrix_state_flat.hpp | 40 ++--
 src/backends/gpu/matrix_state_interleaved.hpp | 66 ++++--
 src/backends/gpu/stim_current.hpp | 16 ++
 src/backends/gpu/stimulus.hpp | 39 +---
 src/backends/gpu/time_ops.hpp | 25 ++
 tests/unit/CMakeLists.txt | 10 +-
 tests/unit/test_matrix.cu | 4 +
 tests/unit/test_multi_event_stream.cu | 45 +---
 tests/validation/CMakeLists.txt | 56 ++---
 tests/validation/validate_ball_and_stick.cpp | 218 +++++++++++++++++-
 tests/validation/validate_ball_and_stick.cu | 26 ---
 tests/validation/validate_ball_and_stick.hpp | 191 ---------------
 tests/validation/validate_kinetic.cpp | 115 ++++++++-
 tests/validation/validate_kinetic.cu | 13 --
 tests/validation/validate_kinetic.hpp | 99 --------
 tests/validation/validate_soma.cpp | 70 +++++-
 tests/validation/validate_soma.cu | 9 -
 tests/validation/validate_soma.hpp | 64 -----
 tests/validation/validate_synapses.cpp | 98 +++++++-
 tests/validation/validate_synapses.cu | 16 --
 tests/validation/validate_synapses.hpp | 84 ------
 38 files changed, 978 insertions(+), 879 deletions(-)
 delete mode 100644 miniapp/miniapp.cu
 rename src/backends/gpu/{fvm.cu => fvm.cpp} (100%)
 rename src/backends/gpu/kernels/{assemble_matrix.hpp => assemble_matrix.cu} (71%)
 create mode 100644 src/backends/gpu/kernels/interleave.cu
 rename src/backends/gpu/kernels/{solve_matrix.hpp => solve_matrix.cu} (70%)
 create mode 100644 src/backends/gpu/kernels/stim_current.cu
 create mode 100644 src/backends/gpu/kernels/time_ops.cu
 delete mode 100644 src/backends/gpu/kernels/time_ops.hpp
 create mode 100644 src/backends/gpu/stim_current.hpp
 create mode 100644 src/backends/gpu/time_ops.hpp
 delete mode 100644 tests/validation/validate_ball_and_stick.cu
 delete mode 100644 tests/validation/validate_ball_and_stick.hpp
 delete mode 100644 tests/validation/validate_kinetic.cu
 delete mode 100644 tests/validation/validate_kinetic.hpp
 delete mode 100644 tests/validation/validate_soma.cu
 delete mode 100644 tests/validation/validate_soma.hpp
 delete mode 100644 tests/validation/validate_synapses.cu
 delete mode 100644 tests/validation/validate_synapses.hpp

diff --git a/mechanisms/CMakeLists.txt b/mechanisms/CMakeLists.txt
index bed251ad..dbd1e1a9 100644
--- a/mechanisms/CMakeLists.txt
+++ b/mechanisms/CMakeLists.txt
@@ -47,10 +47,11 @@ build_modules(
 # Make a library with the implementations of the mechanism kernels
if(NMC_WITH_CUDA) - # make list of the .cu files that + # make list of the .cu files that implement the mechanism kernels foreach(mech ${mechanisms}) - set(cuda_mech_sources ${cuda_mech_sources} ${mech_dir}/${mech}_impl.cu) + list(APPEND cuda_mech_sources ${mech_dir}/${mech}_impl.cu) endforeach() + # compile the .cu files into a library cuda_add_library(gpu_mechanisms ${cuda_mech_sources}) diff --git a/miniapp/CMakeLists.txt b/miniapp/CMakeLists.txt index 95c42b1f..c309edd9 100644 --- a/miniapp/CMakeLists.txt +++ b/miniapp/CMakeLists.txt @@ -1,5 +1,3 @@ -set(HEADERS -) set(MINIAPP_SOURCES miniapp.cpp io.cpp @@ -7,15 +5,8 @@ set(MINIAPP_SOURCES morphology_pool.cpp trace.cpp ) -set(MINIAPP_SOURCES_CUDA - miniapp.cu - io.cpp - miniapp_recipes.cpp - morphology_pool.cpp - trace.cpp -) -add_executable(miniapp.exe ${MINIAPP_SOURCES} ${HEADERS}) +add_executable(miniapp.exe ${MINIAPP_SOURCES}) target_link_libraries(miniapp.exe LINK_PUBLIC ${NESTMC_LIBRARIES}) target_link_libraries(miniapp.exe LINK_PUBLIC ${EXTERNAL_LIBRARIES}) @@ -25,7 +16,8 @@ if(NMC_WITH_MPI) set_property(TARGET miniapp.exe APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}") endif() -set_target_properties(miniapp.exe - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/miniapp" +set_target_properties( + miniapp.exe + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/miniapp" ) diff --git a/miniapp/miniapp.cu b/miniapp/miniapp.cu deleted file mode 100644 index 53bd563e..00000000 --- a/miniapp/miniapp.cu +++ /dev/null @@ -1 +0,0 @@ -#include "miniapp.cpp" diff --git a/modcc/cudaprinter.cpp b/modcc/cudaprinter.cpp index 16decd8c..08e5f090 100644 --- a/modcc/cudaprinter.cpp +++ b/modcc/cudaprinter.cpp @@ -44,9 +44,8 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o) buffer().add_line("#pragma once"); buffer().add_line("#include <backends/event.hpp>"); buffer().add_line("#include <backends/fvm_types.hpp>"); - buffer().add_line("#include <backends/gpu/kernels/detail.hpp>"); - buffer().add_line("#include <backends/gpu/kernels/reduce_by_key.hpp>"); buffer().add_line("#include <backends/multi_event_stream_state.hpp>"); + buffer().add_line("#include <backends/gpu/kernels/detail.hpp>"); buffer().add_line(); buffer().add_line("namespace nest{ namespace mc{ namespace gpu{"); @@ -132,6 +131,9 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o) // kernels buffer().add_line("#include \"" + module_name_ + "_impl.hpp\""); buffer().add_line(); + buffer().add_line("#include <backends/gpu/intrinsics.hpp>"); + buffer().add_line("#include <backends/gpu/kernels/reduce_by_key.hpp>"); + buffer().add_line(); buffer().add_line("namespace nest{ namespace mc{ namespace gpu{"); buffer().add_line("namespace kernels {"); buffer().increase_indentation(); @@ -209,7 +211,6 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o) buffer().add_line("#include <algorithms.hpp>"); buffer().add_line("#include <backends/event.hpp>"); buffer().add_line("#include <backends/fvm_types.hpp>"); - buffer().add_line("#include <backends/gpu/intrinsics.hpp>"); buffer().add_line("#include <backends/gpu/multi_event_stream.hpp>"); buffer().add_line("#include <util/pprintf.hpp>"); buffer().add_line(); diff --git a/modcc/modcc.cpp b/modcc/modcc.cpp index 904c09ce..a588638b 100644 --- a/modcc/modcc.cpp +++ b/modcc/modcc.cpp @@ -208,18 +208,14 @@ int main(int argc, char **argv) { } } - catch(compiler_exception e) { + catch(compiler_exception& e) { std::cerr << red("internal compiler error: ") << white("this means a bug in the compiler," " please report to modcc developers\n") << 
e.what() << " @ " << e.location() << "\n"; exit(1); } - catch(std::runtime_error e) { - std::cerr << red("error: ") << e.what() << "\n"; - exit(1); - } - catch(std::exception e) { + catch(std::exception& e) { std::cerr << red("internal compiler error: ") << white("this means a bug in the compiler," " please report to modcc developers\n") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 69c0192e..2e06e3ab 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,6 @@ set(BASE_SOURCES backends/multicore/fvm.cpp + cell_group_factory.cpp common_types_io.cpp cell.cpp event_binner.cpp @@ -25,21 +26,18 @@ set(BASE_SOURCES util/unwind.cpp ) set(CUDA_SOURCES - backends/gpu/fvm.cu - backends/gpu/multi_event_stream.cu + backends/gpu/fvm.cpp backends/gpu/fill.cu - backends/gpu/kernels/test_thresholds.cu + backends/gpu/multi_event_stream.cu + backends/gpu/kernels/assemble_matrix.cu + backends/gpu/kernels/interleave.cu + backends/gpu/kernels/solve_matrix.cu + backends/gpu/kernels/stim_current.cu backends/gpu/kernels/take_samples.cu + backends/gpu/kernels/test_thresholds.cu + backends/gpu/kernels/time_ops.cu ) -# The cell_group_factory acts like an interface between the -# front end and back end. -if(NMC_WITH_CUDA) - set(CUDA_SOURCES ${CUDA_SOURCES} cell_group_factory.cu) -else() - set(BASE_SOURCES ${BASE_SOURCES} cell_group_factory.cpp) -endif() - if(NMC_WITH_MPI) set(BASE_SOURCES ${BASE_SOURCES} communication/mpi.cpp) elseif(NMC_WITH_DRYRUN) @@ -55,12 +53,7 @@ list(APPEND NESTMC_LIBRARIES nestmc) if(NMC_WITH_CUDA) cuda_add_library(gpu ${CUDA_SOURCES}) - # FIXME - # The gpu library uses symbols fron nestmc, so we have to - # add nestmc to the end. This is a temporary hack that will - # go away when we have properly seperable front and back end - # compilation. - list(APPEND NESTMC_LIBRARIES gpu nestmc) + list(APPEND NESTMC_LIBRARIES gpu) endif() if (NMC_AUTO_RUN_MODCC_ON_CHANGES) diff --git a/src/backends/fvm.hpp b/src/backends/fvm.hpp index a12ce30e..7b94edb5 100644 --- a/src/backends/fvm.hpp +++ b/src/backends/fvm.hpp @@ -36,13 +36,8 @@ struct null_backend: public multicore::backend { } // namespace mc } // namespace nest -// FIXME: This include is where cuda-specific code leaks into the main application. -// e.g.: CUDA kernels, functions marked __host__ __device__, etc. -// Hence why it is guarded with NMC_HAVE_CUDA, and not, NMC_HAVE_GPU, like elsewhere in -// the code. When we implement separate compilation of CUDA, this should be guarded with -// NMC_HAVE_GPU, and the NMC_HAVE_CUDA flag depricated. 
-#ifdef NMC_HAVE_CUDA - #include <backends/gpu/fvm.hpp> +#ifdef NMC_HAVE_GPU +#include <backends/gpu/fvm.hpp> #else namespace nest { namespace mc { namespace gpu { using backend = null_backend; diff --git a/src/backends/gpu/fvm.cu b/src/backends/gpu/fvm.cpp similarity index 100% rename from src/backends/gpu/fvm.cu rename to src/backends/gpu/fvm.cpp diff --git a/src/backends/gpu/fvm.hpp b/src/backends/gpu/fvm.hpp index b9ca2254..ac439785 100644 --- a/src/backends/gpu/fvm.hpp +++ b/src/backends/gpu/fvm.hpp @@ -10,13 +10,13 @@ #include <memory/memory.hpp> #include <util/rangeutil.hpp> -#include "kernels/time_ops.hpp" #include "kernels/take_samples.hpp" #include "matrix_state_interleaved.hpp" -#include "matrix_state_flat.hpp" +//#include "matrix_state_flat.hpp" #include "multi_event_stream.hpp" #include "stimulus.hpp" #include "threshold_watcher.hpp" +#include "time_ops.hpp" namespace nest { namespace mc { @@ -105,20 +105,17 @@ struct backend { // perform element-wise comparison on 'array' type against `t_test`. template <typename V> static bool any_time_before(const memory::device_vector<V>& t, V t_test) { - // Note: benchmarking (on a P100) indicates that using the gpu::any_time_before - // function is slower than the copy, unless we're running over ten thousands of - // cells per cell group. - // - // Commenting out for now, but consider a size-dependent test or adaptive choice. - - // return gpu::any_time_before(t.size(), t.data(), t_test); + // Note: ubbench benchmarking (on a P100) indicates that copying the + // time vectors to the host is faster than a device side + // implementation unless we're running over ten thousands of cells per + // cell group. auto v_copy = memory::on_host(t); return util::minmax_value(v_copy).first<t_test; } static void update_time_to(array& time_to, const_view time, value_type dt, value_type tmax) { - nest::mc::gpu::update_time_to<value_type, size_type>(time_to.size(), time_to.data(), time.data(), dt, tmax); + nest::mc::gpu::update_time_to(time_to.size(), time_to.data(), time.data(), dt, tmax); } // set the per-cell and per-compartment dt_ from time_to_ - time_. 
@@ -126,7 +123,8 @@ struct backend { size_type ncell = util::size(dt_cell); size_type ncomp = util::size(dt_comp); - nest::mc::gpu::set_dt<value_type, size_type>(ncell, ncomp, dt_cell.data(), dt_comp.data(), time_to.data(), time.data(), cv_to_cell.data()); + nest::mc::gpu::set_dt( + ncell, ncomp, dt_cell.data(), dt_comp.data(), time_to.data(), time.data(), cv_to_cell.data()); } // perform sampling as described by marked events in a sample_event_stream diff --git a/src/backends/gpu/kernels/assemble_matrix.hpp b/src/backends/gpu/kernels/assemble_matrix.cu similarity index 71% rename from src/backends/gpu/kernels/assemble_matrix.hpp rename to src/backends/gpu/kernels/assemble_matrix.cu index 1a4d6a45..c6bd144d 100644 --- a/src/backends/gpu/kernels/assemble_matrix.hpp +++ b/src/backends/gpu/kernels/assemble_matrix.cu @@ -1,4 +1,4 @@ -#pragma once +#include <backends/fvm_types.hpp> #include "detail.hpp" @@ -6,6 +6,7 @@ namespace nest { namespace mc { namespace gpu { +namespace kernels { /// GPU implementatin of Hines matrix assembly /// Flat layout /// For a given time step size dt @@ -141,6 +142,57 @@ void assemble_matrix_interleaved( } } +} // namespace kernels + +void assemble_matrix_flat( + fvm_value_type* d, + fvm_value_type* rhs, + const fvm_value_type* invariant_d, + const fvm_value_type* voltage, + const fvm_value_type* current, + const fvm_value_type* cv_capacitance, + const fvm_size_type* cv_to_cell, + const fvm_value_type* dt_cell, + unsigned n) +{ + constexpr unsigned block_dim = 128; + const unsigned grid_dim = impl::block_count(n, block_dim); + + kernels::assemble_matrix_flat + <fvm_value_type, fvm_size_type> + <<<grid_dim, block_dim>>> + (d, rhs, invariant_d, voltage, current, cv_capacitance, cv_to_cell, dt_cell, n); +} + +//template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned Threads> +void assemble_matrix_interleaved( + fvm_value_type* d, + fvm_value_type* rhs, + const fvm_value_type* invariant_d, + const fvm_value_type* voltage, + const fvm_value_type* current, + const fvm_value_type* cv_capacitance, + const fvm_size_type* sizes, + const fvm_size_type* starts, + const fvm_size_type* matrix_to_cell, + const fvm_value_type* dt_cell, + unsigned padded_size, unsigned num_mtx) +{ + constexpr unsigned bd = impl::block_dim(); + constexpr unsigned lw = impl::load_width(); + constexpr unsigned block_dim = bd*lw; + + // The number of threads is threads_per_matrix*num_mtx + const unsigned grid_dim = impl::block_count(num_mtx*lw, block_dim); + + kernels::assemble_matrix_interleaved + <fvm_value_type, fvm_size_type, bd, lw, block_dim> + <<<grid_dim, block_dim>>> + (d, rhs, invariant_d, voltage, current, cv_capacitance, + sizes, starts, matrix_to_cell, + dt_cell, padded_size, num_mtx); +} + } // namespace gpu } // namespace mc } // namespace nest diff --git a/src/backends/gpu/kernels/detail.hpp b/src/backends/gpu/kernels/detail.hpp index 35b40727..15d08a6f 100644 --- a/src/backends/gpu/kernels/detail.hpp +++ b/src/backends/gpu/kernels/detail.hpp @@ -1,7 +1,15 @@ #pragma once -#include <cstdint> #include <cfloat> +#include <cmath> +#include <cstdint> +#include <climits> + +#ifdef __CUDACC__ +# define HOST_DEVICE_IF_CUDA __host__ __device__ +#else +# define HOST_DEVICE_IF_CUDA +#endif namespace nest { namespace mc { @@ -9,34 +17,34 @@ namespace gpu { namespace impl { // Number of matrices per block in block-interleaved storage -__host__ __device__ +HOST_DEVICE_IF_CUDA constexpr inline unsigned block_dim() { return 32u; } // The number of threads per matrix in 
the interleave and reverse-interleave // operations. -__host__ __device__ +HOST_DEVICE_IF_CUDA constexpr inline unsigned load_width() { return 32u; } // The alignment of matrices inside the block-interleaved storage. -__host__ __device__ +HOST_DEVICE_IF_CUDA constexpr inline unsigned matrix_padding() { return load_width(); } // Number of threads per warp // This has always been 32, however it may change in future NVIDIA gpus -__host__ __device__ +HOST_DEVICE_IF_CUDA constexpr inline unsigned threads_per_warp() { return 32u; } // The minimum number of bins required to store n values where the bins have // dimension of block_size. -__host__ __device__ +HOST_DEVICE_IF_CUDA constexpr inline unsigned block_count(unsigned n, unsigned block_size) { return (n+block_size-1)/block_size; } @@ -50,29 +58,30 @@ constexpr inline unsigned padded_size(unsigned n, unsigned block_dim) { // Placeholders to use for mark padded locations in data structures that use // padding. Using such markers makes it easier to test that padding is // performed correctly. -template <typename T> __host__ __device__ constexpr T npos(); -template <> __host__ __device__ constexpr char npos<char>() { return CHAR_MAX; } -template <> __host__ __device__ constexpr unsigned char npos<unsigned char>() { return UCHAR_MAX; } -template <> __host__ __device__ constexpr short npos<short>() { return SHRT_MAX; } -template <> __host__ __device__ constexpr int npos<int>() { return INT_MAX; } -template <> __host__ __device__ constexpr long npos<long>() { return LONG_MAX; } -template <> __host__ __device__ constexpr float npos<float>() { return FLT_MAX; } -template <> __host__ __device__ constexpr double npos<double>() { return DBL_MAX; } -template <> __host__ __device__ constexpr unsigned short npos<unsigned short>() { return USHRT_MAX; } -template <> __host__ __device__ constexpr unsigned int npos<unsigned int>() { return UINT_MAX; } -template <> __host__ __device__ constexpr unsigned long npos<unsigned long>() { return ULONG_MAX; } -template <> __host__ __device__ constexpr long long npos<long long>() { return LLONG_MAX; } +#define NPOS_SPEC(type, cint) template <> HOST_DEVICE_IF_CUDA constexpr type npos<type>() { return cint; } +template <typename T> HOST_DEVICE_IF_CUDA constexpr T npos(); +NPOS_SPEC(char, CHAR_MAX) +NPOS_SPEC(unsigned char, UCHAR_MAX) +NPOS_SPEC(short, SHRT_MAX) +NPOS_SPEC(int, INT_MAX) +NPOS_SPEC(long, LONG_MAX) +NPOS_SPEC(float, FLT_MAX) +NPOS_SPEC(double, DBL_MAX) +NPOS_SPEC(unsigned short, USHRT_MAX) +NPOS_SPEC(unsigned int, UINT_MAX) +NPOS_SPEC(unsigned long, ULONG_MAX) +NPOS_SPEC(long long, LLONG_MAX) // test if value v is npos template <typename T> -__host__ __device__ +HOST_DEVICE_IF_CUDA constexpr bool is_npos(T v) { return v == npos<T>(); } /// Cuda lerp by u on [a,b]: (1-u)*a + u*b. 
template <typename T> -__host__ __device__ +HOST_DEVICE_IF_CUDA inline T lerp(T a, T b, T u) { return std::fma(u, b, std::fma(-u, a, a)); } diff --git a/src/backends/gpu/kernels/interleave.cu b/src/backends/gpu/kernels/interleave.cu new file mode 100644 index 00000000..cea3c723 --- /dev/null +++ b/src/backends/gpu/kernels/interleave.cu @@ -0,0 +1,45 @@ +#include <backends/fvm_types.hpp> +#include "detail.hpp" +#include "interleave.hpp" + +namespace nest { +namespace mc { +namespace gpu { + +// host side wrapper for the flat to interleaved operation +void flat_to_interleaved( + const fvm_value_type* in, + fvm_value_type* out, + const fvm_size_type* sizes, + const fvm_size_type* starts, + unsigned padded_size, + unsigned num_vec) +{ + constexpr unsigned BlockWidth = impl::block_dim(); + constexpr unsigned LoadWidth = impl::load_width(); + + flat_to_interleaved + <fvm_value_type, fvm_size_type, BlockWidth, LoadWidth> + (in, out, sizes, starts, padded_size, num_vec); +} + +// host side wrapper for the interleave to flat operation +void interleaved_to_flat( + const fvm_value_type* in, + fvm_value_type* out, + const fvm_size_type* sizes, + const fvm_size_type* starts, + unsigned padded_size, + unsigned num_vec) +{ + constexpr unsigned BlockWidth = impl::block_dim(); + constexpr unsigned LoadWidth = impl::load_width(); + + interleaved_to_flat + <fvm_value_type, fvm_size_type, BlockWidth, LoadWidth> + (in, out, sizes, starts, padded_size, num_vec); +} + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu/kernels/interleave.hpp b/src/backends/gpu/kernels/interleave.hpp index 7488f38b..e6c85593 100644 --- a/src/backends/gpu/kernels/interleave.hpp +++ b/src/backends/gpu/kernels/interleave.hpp @@ -11,7 +11,7 @@ namespace gpu { // Hines matrices, see src/backends/matrix_storage.md /////////////////////////////////////////////////////////////////////////////// - +namespace kernels { // Data in a vector is to be interleaved into blocks of width BlockWidth. // The kernel assigns LoadWidth threads to each lane in the block. 
// @@ -107,30 +107,47 @@ void interleaved_to_flat( } } +} // namespace kernels + // host side wrapper for the flat to interleaved operation template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth> void flat_to_interleaved( - const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec) + const T* in, + T* out, + const I* sizes, + const I* starts, + unsigned padded_size, + unsigned num_vec) { constexpr unsigned Threads = BlockWidth*LoadWidth; const unsigned blocks = impl::block_count(num_vec, BlockWidth); - flat_to_interleaved<T, I, BlockWidth, LoadWidth, Threads> - <<<blocks, Threads>>> (in, out, sizes, starts, padded_size, num_vec); + kernels::flat_to_interleaved + <T, I, BlockWidth, LoadWidth, Threads> + <<<blocks, Threads>>> + (in, out, sizes, starts, padded_size, num_vec); } // host side wrapper for the interleave to flat operation template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth> void interleaved_to_flat( - const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec) + const T* in, + T* out, + const I* sizes, + const I* starts, + unsigned padded_size, + unsigned num_vec) { constexpr unsigned Threads = BlockWidth*LoadWidth; const unsigned blocks = impl::block_count(num_vec, BlockWidth); - interleaved_to_flat<T, I, BlockWidth, LoadWidth, Threads> - <<<blocks, Threads>>> (in, out, sizes, starts, padded_size, num_vec); + kernels::interleaved_to_flat + <T, I, BlockWidth, LoadWidth, Threads> + <<<blocks, Threads>>> + (in, out, sizes, starts, padded_size, num_vec); } } // namespace gpu } // namespace mc } // namespace nest + diff --git a/src/backends/gpu/kernels/solve_matrix.hpp b/src/backends/gpu/kernels/solve_matrix.cu similarity index 70% rename from src/backends/gpu/kernels/solve_matrix.hpp rename to src/backends/gpu/kernels/solve_matrix.cu index 48e4a8b9..45558fa0 100644 --- a/src/backends/gpu/kernels/solve_matrix.hpp +++ b/src/backends/gpu/kernels/solve_matrix.cu @@ -1,11 +1,14 @@ -#pragma once +#include <cassert> #include "detail.hpp" +#include <backends/fvm_types.hpp> namespace nest { namespace mc { namespace gpu { +namespace kernels { + /// GPU implementation of Hines Matrix solver. 
/// Flat format template <typename T, typename I> @@ -81,6 +84,39 @@ void solve_matrix_interleaved( } } +} // namespace kernels + +void solve_matrix_flat( + fvm_value_type* rhs, + fvm_value_type* d, + const fvm_value_type* u, + const fvm_size_type* p, + const fvm_size_type* cell_cv_divs, + int num_mtx) +{ + constexpr unsigned block_dim = 128; + const unsigned grid_dim = impl::block_count(num_mtx, block_dim); + kernels::solve_matrix_flat + <fvm_value_type, fvm_size_type> + <<<grid_dim, block_dim>>> + (rhs, d, u, p, cell_cv_divs, num_mtx); +} + +void solve_matrix_interleaved( + fvm_value_type* rhs, + fvm_value_type* d, + const fvm_value_type* u, + const fvm_size_type* p, + const fvm_size_type* sizes, + int padded_size, + int num_mtx) +{ + const unsigned grid_dim = impl::block_count(num_mtx, impl::block_dim()); + kernels::solve_matrix_interleaved<fvm_value_type, fvm_size_type, impl::block_dim()> + <<<grid_dim, impl::block_dim()>>> + (rhs, d, u, p, sizes, padded_size, num_mtx); +} + } // namespace gpu } // namespace mc } // namespace nest diff --git a/src/backends/gpu/kernels/stim_current.cu b/src/backends/gpu/kernels/stim_current.cu new file mode 100644 index 00000000..7e5340bc --- /dev/null +++ b/src/backends/gpu/kernels/stim_current.cu @@ -0,0 +1,48 @@ +#include <backends/fvm_types.hpp> +#include <backends/gpu/intrinsics.hpp> + +namespace nest{ +namespace mc{ +namespace gpu { + +namespace kernels { + template <typename T, typename I> + __global__ + void stim_current( + const T* delay, const T* duration, const T* amplitude, + const I* node_index, int n, const I* cell_index, const T* time, T* current) + { + using value_type = T; + using iarray = I; + + auto i = threadIdx.x + blockDim.x*blockIdx.x; + + if (i<n) { + auto t = time[cell_index[i]]; + if (t>=delay[i] && t<delay[i]+duration[i]) { + // use subtraction because the electrode currents are specified + // in terms of current into the compartment + cuda_atomic_add(current+node_index[i], -amplitude[i]); + } + } + } +} // namespace kernels + + +void stim_current( + const fvm_value_type* delay, const fvm_value_type* duration, const fvm_value_type* amplitude, + const fvm_size_type* node_index, int n, + const fvm_size_type* cell_index, const fvm_value_type* time, fvm_value_type* current) +{ + constexpr unsigned thread_dim = 192; + dim3 dim_block(thread_dim); + dim3 dim_grid((n+thread_dim-1)/thread_dim); + + kernels::stim_current<fvm_value_type, fvm_size_type><<<dim_grid, dim_block>>> + (delay, duration, amplitude, node_index, n, cell_index, time, current); + +} + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu/kernels/time_ops.cu b/src/backends/gpu/kernels/time_ops.cu new file mode 100644 index 00000000..c2a108a8 --- /dev/null +++ b/src/backends/gpu/kernels/time_ops.cu @@ -0,0 +1,80 @@ +#include <backends/fvm_types.hpp> + +namespace nest { +namespace mc { +namespace gpu { + +namespace kernels { + template <typename T, typename I> + __global__ void update_time_to(I n, T* time_to, const T* time, T dt, T tmax) { + int i = threadIdx.x+blockIdx.x*blockDim.x; + if (i<n) { + auto t = time[i]+dt; + time_to[i] = t<tmax? 
t: tmax; + } + } + + template <typename T> + struct less { + __device__ __host__ + bool operator()(const T& a, const T& b) const { return a<b; } + }; + + // vector minus: x = y - z + template <typename T, typename I> + __global__ void vec_minus(I n, T* x, const T* y, const T* z) { + int i = threadIdx.x+blockIdx.x*blockDim.x; + if (i<n) { + x[i] = y[i]-z[i]; + } + } + + // vector gather: x[i] = y[index[i]] + template <typename T, typename I> + __global__ void gather(I n, T* x, const T* y, const I* index) { + int i = threadIdx.x+blockIdx.x*blockDim.x; + if (i<n) { + x[i] = y[index[i]]; + } + } +} + +void update_time_to(fvm_size_type n, + fvm_value_type* time_to, + const fvm_value_type* time, + fvm_value_type dt, + fvm_value_type tmax) +{ + if (!n) { + return; + } + + constexpr int blockwidth = 128; + int nblock = 1+(n-1)/blockwidth; + kernels::update_time_to<<<nblock, blockwidth>>> + (n, time_to, time, dt, tmax); +} + +void set_dt(fvm_size_type ncell, + fvm_size_type ncomp, + fvm_value_type* dt_cell, + fvm_value_type* dt_comp, + const fvm_value_type* time_to, + const fvm_value_type* time, + const fvm_size_type* cv_to_cell) +{ + if (!ncell || !ncomp) { + return; + } + + constexpr int blockwidth = 128; + int nblock = 1+(ncell-1)/blockwidth; + kernels::vec_minus<<<nblock, blockwidth>>>(ncell, dt_cell, time_to, time); + + nblock = 1+(ncomp-1)/blockwidth; + kernels::gather<<<nblock, blockwidth>>>(ncomp, dt_comp, dt_cell, cv_to_cell); +} + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu/kernels/time_ops.hpp b/src/backends/gpu/kernels/time_ops.hpp deleted file mode 100644 index 94271034..00000000 --- a/src/backends/gpu/kernels/time_ops.hpp +++ /dev/null @@ -1,113 +0,0 @@ -#pragma once - -#include <type_traits> - -#include <cuda.h> - -#include <memory/wrappers.hpp> - -namespace nest { -namespace mc { -namespace gpu { - -namespace kernels { - template <typename T, typename I> - __global__ void update_time_to(I n, T* time_to, const T* time, T dt, T tmax) { - int i = threadIdx.x+blockIdx.x*blockDim.x; - if (i<n) { - auto t = time[i]+dt; - time_to[i] = t<tmax? t: tmax; - } - } - - // array-array comparison - template <typename T, typename I, typename Pred> - __global__ void array_reduce_any(I n, const T* x, const T* y, Pred p, int* rptr) { - int i = threadIdx.x+blockIdx.x*blockDim.x; - int cmp = i<n? p(x[i], y[i]): 0; - if (__syncthreads_or(cmp) && !threadIdx.x) { - *rptr=1; - } - } - - // array-scalar comparison - template <typename T, typename I, typename Pred> - __global__ void array_reduce_any(I n, const T* x, T y, Pred p, int* rptr) { - int i = threadIdx.x+blockIdx.x*blockDim.x; - int cmp = i<n? 
p(x[i], y): 0; - if (__syncthreads_or(cmp) && !threadIdx.x) { - *rptr=1; - } - } - - template <typename T> - struct less { - __device__ __host__ - bool operator()(const T& a, const T& b) const { return a<b; } - }; - - // vector minus: x = y - z - template <typename T, typename I> - __global__ void vec_minus(I n, T* x, const T* y, const T* z) { - int i = threadIdx.x+blockIdx.x*blockDim.x; - if (i<n) { - x[i] = y[i]-z[i]; - } - } - - // vector gather: x[i] = y[index[i]] - template <typename T, typename I> - __global__ void gather(I n, T* x, const T* y, const I* index) { - int i = threadIdx.x+blockIdx.x*blockDim.x; - if (i<n) { - x[i] = y[index[i]]; - } - } -} - -template <typename T, typename I> -void update_time_to(I n, T* time_to, const T* time, T dt, T tmax) { - if (!n) { - return; - } - - constexpr int blockwidth = 128; - int nblock = 1+(n-1)/blockwidth; - kernels::update_time_to<<<nblock, blockwidth>>>(n, time_to, time, dt, tmax); -} - -template <typename T, typename U, typename I> -bool any_time_before(I n, T* t0, U t1) { - static_assert(std::is_convertible<T*, U>::value || std::is_convertible<T, U>::value, - "third-argument must be a compatible scalar or pointer type"); - - static thread_local auto r = memory::device_vector<int>(1); - if (!n) { - return false; - } - - constexpr int blockwidth = 128; - int nblock = 1+(n-1)/blockwidth; - - r[0] = 0; - kernels::array_reduce_any<<<nblock, blockwidth>>>(n, t0, t1, kernels::less<T>(), r.data()); - return r[0]; -} - -template <typename T, typename I> -void set_dt(I ncell, I ncomp, T* dt_cell, T* dt_comp, const T* time_to, const T* time, const I* cv_to_cell) { - if (!ncell || !ncomp) { - return; - } - - constexpr int blockwidth = 128; - int nblock = 1+(ncell-1)/blockwidth; - kernels::vec_minus<<<nblock, blockwidth>>>(ncell, dt_cell, time_to, time); - - nblock = 1+(ncomp-1)/blockwidth; - kernels::gather<<<nblock, blockwidth>>>(ncomp, dt_comp, dt_cell, cv_to_cell); -} - -} // namespace gpu -} // namespace mc -} // namespace nest diff --git a/src/backends/gpu/matrix_state_flat.hpp b/src/backends/gpu/matrix_state_flat.hpp index dc7f9a40..a68ecad8 100644 --- a/src/backends/gpu/matrix_state_flat.hpp +++ b/src/backends/gpu/matrix_state_flat.hpp @@ -1,18 +1,35 @@ #pragma once +#include <backends/fvm_types.hpp> #include <memory/memory.hpp> #include <memory/wrappers.hpp> #include <util/span.hpp> #include <util/partition.hpp> #include <util/rangeutil.hpp> -#include "kernels/solve_matrix.hpp" -#include "kernels/assemble_matrix.hpp" - namespace nest { namespace mc { namespace gpu { +void solve_matrix_flat( + fvm_value_type* rhs, + fvm_value_type* d, + const fvm_value_type* u, + const fvm_size_type* p, + const fvm_size_type* cell_cv_divs, + int num_mtx); + +void assemble_matrix_flat( + fvm_value_type* d, + fvm_value_type* rhs, + const fvm_value_type* invariant_d, + const fvm_value_type* voltage, + const fvm_value_type* current, + const fvm_value_type* cv_capacitance, + const fvm_size_type* cv_to_cell, + const fvm_value_type* dt_cell, + unsigned n); + /// matrix state template <typename T, typename I> struct matrix_state_flat { @@ -95,25 +112,16 @@ struct matrix_state_flat { // voltage [mV] // current [nA] void assemble(const_view dt_cell, const_view voltage, const_view current) { - // determine the grid dimensions for the kernel - auto const n = voltage.size(); - auto const block_dim = 128; - auto const grid_dim = impl::block_count(n, block_dim); - - assemble_matrix_flat<value_type, size_type><<<grid_dim, block_dim>>> ( + // perform assembly on the gpu + 
assemble_matrix_flat( d.data(), rhs.data(), invariant_d.data(), voltage.data(), current.data(), cv_capacitance.data(), cv_to_cell.data(), dt_cell.data(), size()); } void solve() { - // determine the grid dimensions for the kernel - auto const block_dim = 128; - auto const grid_dim = impl::block_count(num_matrices(), block_dim); - // perform solve on gpu - solve_matrix_flat<value_type, size_type><<<grid_dim, block_dim>>> ( - rhs.data(), d.data(), u.data(), parent_index.data(), - cell_cv_divs.data(), num_matrices()); + solve_matrix_flat(rhs.data(), d.data(), u.data(), parent_index.data(), + cell_cv_divs.data(), num_matrices()); } std::size_t size() const { diff --git a/src/backends/gpu/matrix_state_interleaved.hpp b/src/backends/gpu/matrix_state_interleaved.hpp index a6a127e8..054af9e5 100644 --- a/src/backends/gpu/matrix_state_interleaved.hpp +++ b/src/backends/gpu/matrix_state_interleaved.hpp @@ -1,19 +1,61 @@ #pragma once +#include <backends/fvm_types.hpp> #include <memory/memory.hpp> #include <util/debug.hpp> #include <util/span.hpp> #include <util/partition.hpp> #include <util/rangeutil.hpp> +#include <util/indirect.hpp> -#include "kernels/solve_matrix.hpp" -#include "kernels/assemble_matrix.hpp" -#include "kernels/interleave.hpp" +#include "kernels/detail.hpp" namespace nest { namespace mc { namespace gpu { +// host side wrapper for interleaved matrix assembly kernel +void assemble_matrix_interleaved( + fvm_value_type* d, + fvm_value_type* rhs, + const fvm_value_type* invariant_d, + const fvm_value_type* voltage, + const fvm_value_type* current, + const fvm_value_type* cv_capacitance, + const fvm_size_type* sizes, + const fvm_size_type* starts, + const fvm_size_type* matrix_to_cell, + const fvm_value_type* dt_cell, + unsigned padded_size, unsigned num_mtx); + +// host side wrapper for interleaved matrix solver kernel +void solve_matrix_interleaved( + fvm_value_type* rhs, + fvm_value_type* d, + const fvm_value_type* u, + const fvm_size_type* p, + const fvm_size_type* sizes, + int padded_size, + int num_mtx); + +// host side wrapper for the flat to interleaved operation +void flat_to_interleaved( + const fvm_value_type* in, + fvm_value_type* out, + const fvm_size_type* sizes, + const fvm_size_type* starts, + unsigned padded_size, + unsigned num_vec); + +// host side wrapper for the interleave to flat operation +void interleaved_to_flat( + const fvm_value_type* in, + fvm_value_type* out, + const fvm_size_type* sizes, + const fvm_size_type* starts, + unsigned padded_size, + unsigned num_vec); + // A helper that performs the interleave operation on host memory. template <typename T, typename I> std::vector<T> flat_to_interleaved( @@ -213,15 +255,7 @@ struct matrix_state_interleaved { // voltage [mV] // current [nA] void assemble(const_view dt_cell, const_view voltage, const_view current) { - constexpr auto bd = impl::block_dim(); - constexpr auto lw = impl::load_width(); - constexpr auto block_dim = bd*lw; - - // The number of threads is threads_per_matrix*num_mtx - const auto num_blocks = impl::block_count(num_matrices()*lw, block_dim); - - assemble_matrix_interleaved<value_type, size_type, bd, lw, block_dim> - <<<num_blocks, block_dim>>> + assemble_matrix_interleaved (d.data(), rhs.data(), invariant_d.data(), voltage.data(), current.data(), cv_capacitance.data(), matrix_sizes.data(), matrix_index.data(), @@ -232,14 +266,12 @@ struct matrix_state_interleaved { void solve() { // Perform the Hines solve. 
- auto const grid_dim = impl::block_count(num_matrices(), impl::block_dim()); - solve_matrix_interleaved<value_type, size_type, impl::block_dim()> - <<<grid_dim, impl::block_dim()>>> - (rhs.data(), d.data(), u.data(), parent_index.data(), matrix_sizes.data(), + solve_matrix_interleaved( + rhs.data(), d.data(), u.data(), parent_index.data(), matrix_sizes.data(), padded_matrix_size(), num_matrices()); // Copy the solution from interleaved to front end storage. - interleaved_to_flat<value_type, size_type, impl::block_dim(), impl::load_width()> + interleaved_to_flat (rhs.data(), solution_.data(), matrix_sizes.data(), matrix_index.data(), padded_matrix_size(), num_matrices()); } diff --git a/src/backends/gpu/stim_current.hpp b/src/backends/gpu/stim_current.hpp new file mode 100644 index 00000000..6d339755 --- /dev/null +++ b/src/backends/gpu/stim_current.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include <backends/fvm_types.hpp> + +namespace nest{ +namespace mc{ +namespace gpu { + +void stim_current( + const fvm_value_type* delay, const fvm_value_type* duration, const fvm_value_type* amplitude, + const fvm_size_type* node_index, int n, + const fvm_size_type* cell_index, const fvm_value_type* time, fvm_value_type* current); + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu/stimulus.hpp b/src/backends/gpu/stimulus.hpp index 9b2da514..651e903c 100644 --- a/src/backends/gpu/stimulus.hpp +++ b/src/backends/gpu/stimulus.hpp @@ -5,37 +5,13 @@ #include <mechanism.hpp> #include <algorithms.hpp> +#include "stim_current.hpp" #include <util/pprintf.hpp> -#include "intrinsics.hpp" - namespace nest{ namespace mc{ namespace gpu { -namespace kernels { - template <typename T, typename I> - __global__ - void stim_current( - const T* delay, const T* duration, const T* amplitude, - const I* node_index, int n, const I* cell_index, const T* time, T* current) - { - using value_type = T; - using iarray = I; - - auto i = threadIdx.x + blockDim.x*blockIdx.x; - - if (i<n) { - auto t = time[cell_index[i]]; - if (t>=delay[i] && t<delay[i]+duration[i]) { - // use subtraction because the electrode currents are specified - // in terms of current into the compartment - cuda_atomic_add(current+node_index[i], -amplitude[i]); - } - } - } -} // namespace kernels - template<class Backend> class stimulus : public mechanism<Backend> { public: @@ -107,16 +83,9 @@ public: // don't launch a kernel if there are no stimuli if (!size()) return; - auto n = size(); - auto thread_dim = 192; - dim3 dim_block(thread_dim); - dim3 dim_grid((n+thread_dim-1)/thread_dim ); - - kernels::stim_current<value_type, size_type><<<dim_grid, dim_block>>>( - delay.data(), duration.data(), amplitude.data(), - node_index_.data(), n, vec_ci_.data(), vec_t_.data(), - vec_i_.data() - ); + stim_current(delay.data(), duration.data(), amplitude.data(), + node_index_.data(), size(), vec_ci_.data(), vec_t_.data(), + vec_i_.data()); } diff --git a/src/backends/gpu/time_ops.hpp b/src/backends/gpu/time_ops.hpp new file mode 100644 index 00000000..a851a7c4 --- /dev/null +++ b/src/backends/gpu/time_ops.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include <backends/fvm_types.hpp> + +namespace nest { +namespace mc { +namespace gpu { + +void update_time_to(fvm_size_type n, + fvm_value_type* time_to, + const fvm_value_type* time, + fvm_value_type dt, + fvm_value_type tmax); + +void set_dt(fvm_size_type ncell, + fvm_size_type ncomp, + fvm_value_type* dt_cell, + fvm_value_type* dt_comp, + const fvm_value_type* time_to, + const fvm_value_type* time, 
+ const fvm_size_type* cv_to_cell); + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 600cfcd6..5d58b80a 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -93,7 +93,7 @@ endif() set(TARGETS test.exe) -add_executable(test.exe ${TEST_SOURCES} ${HEADERS}) +add_executable(test.exe ${TEST_SOURCES}) target_compile_definitions(test.exe PUBLIC "-DDATADIR=\"${PROJECT_SOURCE_DIR}/data\"") if (NMC_AUTO_RUN_MODCC_ON_CHANGES) @@ -104,7 +104,7 @@ target_include_directories(test.exe PRIVATE "${mech_proto_dir}/..") if(NMC_WITH_CUDA) set(TARGETS ${TARGETS} test_cuda.exe) - cuda_add_executable(test_cuda.exe ${TEST_CUDA_SOURCES} ${HEADERS}) + cuda_add_executable(test_cuda.exe ${TEST_CUDA_SOURCES}) endif() foreach(target ${TARGETS}) @@ -122,9 +122,3 @@ foreach(target ${TARGETS}) RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests" ) endforeach() - -# Temporary hack (pending PR #266) -if(NMC_WITH_CUDA) - target_link_libraries(test_cuda.exe LINK_PUBLIC nestmc) -endif() - diff --git a/tests/unit/test_matrix.cu b/tests/unit/test_matrix.cu index 3e7a730d..7e5bf6cc 100644 --- a/tests/unit/test_matrix.cu +++ b/tests/unit/test_matrix.cu @@ -12,6 +12,10 @@ #include <memory/memory.hpp> #include <util/span.hpp> +#include <backends/gpu/matrix_state_flat.hpp> +#include <backends/gpu/matrix_state_interleaved.hpp> +#include <backends/gpu/kernels/interleave.hpp> + #include <cuda.h> using namespace nest::mc; diff --git a/tests/unit/test_multi_event_stream.cu b/tests/unit/test_multi_event_stream.cu index cb1f83e7..bf0177e9 100644 --- a/tests/unit/test_multi_event_stream.cu +++ b/tests/unit/test_multi_event_stream.cu @@ -7,7 +7,7 @@ #include <backends/event.hpp> #include <backends/gpu/multi_event_stream.hpp> -#include <backends/gpu/kernels/time_ops.hpp> +#include <backends/gpu/time_ops.hpp> #include <memory/wrappers.hpp> #include <util/rangeutil.hpp> @@ -217,7 +217,7 @@ TEST(multi_event_stream, time_if_before) { std::vector<double> after; for (unsigned i = 0; i<n_cell; ++i) { - before[i] = 0.1+i/(double)n_cell; + before[i] = 0.1+i/(double)n_cell; } memory::device_vector<double> t = memory::on_gpu(before); @@ -230,45 +230,18 @@ TEST(multi_event_stream, time_if_before) { // on cell_2 to restrict corresponding element of t. 
for (unsigned i = 0; i<n_cell; ++i) { - before[i] = 2.1+0.5*i/(double)n_cell; + before[i] = 2.1+0.5*i/(double)n_cell; } t = memory::make_view(before); m.event_time_if_before(t); util::assign(after, memory::on_host(t)); for (unsigned i = 0; i<n_cell; ++i) { - if (i==cell_2) { - EXPECT_EQ(2., after[i]); - } - else { - EXPECT_EQ(before[i], after[i]); - } + if (i==cell_2) { + EXPECT_EQ(2., after[i]); + } + else { + EXPECT_EQ(before[i], after[i]); + } } } - -TEST(multi_event_stream, any_time_before) { - constexpr std::size_t n = 10000; - std::minstd_rand R; - std::uniform_real_distribution<float> g(0, 10); - - std::vector<double> t(n); - std::generate(t.begin(), t.end(), [&]{ return g(R); }); - - memory::device_vector<double> t0 = memory::on_gpu(t); - - double tmin = *std::min_element(t.begin(), t.end()); - EXPECT_TRUE(gpu::any_time_before(n, t0.data(), tmin+0.01)); - EXPECT_FALSE(gpu::any_time_before(n, t0.data(), tmin)); - - memory::device_vector<double> t1 = memory::on_gpu(t); - EXPECT_FALSE(gpu::any_time_before(n, t0.data(), t1.data())); - - t[2*n/3] += 20; - t1 = memory::on_gpu(t); - EXPECT_TRUE(gpu::any_time_before(n, t0.data(), t1.data())); - - t[2*n/3] -= 30; - t1 = memory::on_gpu(t); - EXPECT_FALSE(gpu::any_time_before(n, t0.data(), t1.data())); -} - diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt index e013a998..45773d26 100644 --- a/tests/validation/CMakeLists.txt +++ b/tests/validation/CMakeLists.txt @@ -14,21 +14,6 @@ set(VALIDATION_SOURCES validate.cpp ) -set(VALIDATION_CUDA_SOURCES - # unit tests - validate_soma.cu - validate_ball_and_stick.cu - validate_kinetic.cu - validate_synapses.cu - - # support code - validation_data.cpp - trace_analysis.cpp - - # unit test driver - validate.cpp -) - if(NMC_VALIDATION_DATA_DIR) if ("${CMAKE_VERSION}" MATCHES "^3.[789].") message(WARNING "CMake ${CMAKE_VERSION} has broken FindCUDA; omitting NMC_DATADIR define.") @@ -38,33 +23,22 @@ if(NMC_VALIDATION_DATA_DIR) endif() add_executable(validate.exe ${VALIDATION_SOURCES}) -set(TARGETS validate.exe) -if(NMC_WITH_CUDA) - cuda_add_executable(validate_cuda.exe ${VALIDATION_CUDA_SOURCES}) - list(APPEND TARGETS validate_cuda.exe) - target_link_libraries(validate_cuda.exe LINK_PUBLIC gpu) -endif() - - -foreach(target ${TARGETS}) - target_link_libraries(${target} LINK_PUBLIC gtest) - target_link_libraries(${target} LINK_PUBLIC ${NESTMC_LIBRARIES}) - target_link_libraries(${target} LINK_PUBLIC ${EXTERNAL_LIBRARIES}) - - if(NMC_WITH_MPI) - target_link_libraries(${target} LINK_PUBLIC ${MPI_C_LIBRARIES}) - set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}") - endif() +target_link_libraries(validate.exe LINK_PUBLIC gtest) +target_link_libraries(validate.exe LINK_PUBLIC ${NESTMC_LIBRARIES}) +target_link_libraries(validate.exe LINK_PUBLIC ${EXTERNAL_LIBRARIES}) - set_target_properties( - ${target} - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests" - ) +if(NMC_WITH_MPI) + target_link_libraries(validate.exe LINK_PUBLIC ${MPI_C_LIBRARIES}) + set_property(TARGET validate.exe APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}") +endif() - if(NMC_BUILD_VALIDATION_DATA) - add_dependencies(${target} validation_data) - endif() -endforeach() +set_target_properties( + validate.exe + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests" +) +if(NMC_BUILD_VALIDATION_DATA) + add_dependencies(validate.exe validation_data) +endif() diff --git a/tests/validation/validate_ball_and_stick.cpp 
b/tests/validation/validate_ball_and_stick.cpp index e4c22efa..9d1a99dc 100644 --- a/tests/validation/validate_ball_and_stick.cpp +++ b/tests/validation/validate_ball_and_stick.cpp @@ -1,26 +1,228 @@ -#include "../gtest.h" -#include "validate_ball_and_stick.hpp" +#include <iostream> +#include <cell.hpp> +#include <common_types.hpp> #include <fvm_multicell.hpp> +#include <load_balance.hpp> +#include <hardware/node_info.hpp> +#include <hardware/gpu.hpp> +#include <model.hpp> +#include <recipe.hpp> +#include <segment.hpp> +#include <simple_sampler.hpp> +#include <util/meta.hpp> +#include <util/path.hpp> +#include <json/json.hpp> + +#include "../common_cells.hpp" +#include "../simple_recipes.hpp" +#include "convergence_test.hpp" +#include "trace_analysis.hpp" +#include "validation_data.hpp" + +#include "../gtest.h" + +using namespace nest::mc; + +struct probe_point { + const char* label; + segment_location where; +}; + +template <typename ProbePointSeq> +void run_ncomp_convergence_test( + const char* model_name, + const util::path& ref_data_path, + backend_kind backend, + const cell& c, + ProbePointSeq& probe_points, + float t_end=100.f) +{ + using namespace nest::mc; + + auto max_ncomp = g_trace_io.max_ncomp(); + auto dt = g_trace_io.min_dt(); + auto sample_dt = g_trace_io.sample_dt(); + + nlohmann::json meta = { + {"name", "membrane voltage"}, + {"model", model_name}, + {"dt", dt}, + {"sim", "nestmc"}, + {"units", "mV"}, + {"backend_kind", to_string(backend)} + }; + + auto exclude = stimulus_ends(c); -const auto backend = nest::mc::backend_kind::multicore; + auto n_probe = util::size(probe_points); + std::vector<probe_label> plabels; + plabels.reserve(n_probe); + for (unsigned i = 0; i<n_probe; ++i) { + plabels.push_back(probe_label{probe_points[i].label, {0u, i}}); + } + + convergence_test_runner<int> runner("ncomp", plabels, meta); + runner.load_reference_data(ref_data_path); + + for (int ncomp = 10; ncomp<max_ncomp; ncomp*=2) { + for (auto& seg: c.segments()) { + if (!seg->is_soma()) { + seg->set_compartments(ncomp); + } + } + cable1d_recipe rec{c}; + for (const auto& p: probe_points) { + rec.add_probe(0, 0, cell_probe_address{p.where, cell_probe_address::membrane_voltage}); + } + + hw::node_info nd(1, backend==backend_kind::gpu? 
1: 0); + auto decomp = partition_load_balance(rec, nd); + model m(rec, decomp); + + runner.run(m, ncomp, sample_dt, t_end, dt, exclude); + } + runner.report(); + runner.assert_all_convergence(); +} + +void validate_ball_and_stick(nest::mc::backend_kind backend) { + using namespace nest::mc; + + cell c = make_cell_ball_and_stick(); + probe_point points[] = { + {"soma.mid", {0u, 0.5}}, + {"dend.mid", {1u, 0.5}}, + {"dend.end", {1u, 1.0}} + }; + + run_ncomp_convergence_test( + "ball_and_stick", + "neuron_ball_and_stick.json", + backend, + c, + points); +} + +void validate_ball_and_taper(nest::mc::backend_kind backend) { + using namespace nest::mc; + + cell c = make_cell_ball_and_taper(); + probe_point points[] = { + {"soma.mid", {0u, 0.5}}, + {"taper.mid", {1u, 0.5}}, + {"taper.end", {1u, 1.0}} + }; + + run_ncomp_convergence_test( + "ball_and_taper", + "neuron_ball_and_taper.json", + backend, + c, + points); +} + +void validate_ball_and_3stick(nest::mc::backend_kind backend) { + using namespace nest::mc; + + cell c = make_cell_ball_and_3stick(); + probe_point points[] = { + {"soma.mid", {0u, 0.5}}, + {"dend1.mid", {1u, 0.5}}, + {"dend1.end", {1u, 1.0}}, + {"dend2.mid", {2u, 0.5}}, + {"dend2.end", {2u, 1.0}}, + {"dend3.mid", {3u, 0.5}}, + {"dend3.end", {3u, 1.0}} + }; + + run_ncomp_convergence_test( + "ball_and_3stick", + "neuron_ball_and_3stick.json", + backend, + c, + points); +} + +void validate_rallpack1(nest::mc::backend_kind backend) { + using namespace nest::mc; + + cell c = make_cell_simple_cable(); + probe_point points[] = { + {"cable.x0.0", {1u, 0.0}}, + {"cable.x0.3", {1u, 0.3}}, + {"cable.x1.0", {1u, 1.0}} + }; + + run_ncomp_convergence_test( + "rallpack1", + "numeric_rallpack1.json", + backend, + c, + points, + 250.f); +} + +void validate_ball_and_squiggle(nest::mc::backend_kind backend) { + using namespace nest::mc; + + cell c = make_cell_ball_and_squiggle(); + probe_point points[] = { + {"soma.mid", {0u, 0.5}}, + {"dend.mid", {1u, 0.5}}, + {"dend.end", {1u, 1.0}} + }; + +#if 0 + // *temporarily* disabled: compartment division policy will + // be moved into backend policy classes. 
+ + run_ncomp_convergence_test<lowered_cell_div<div_compartment_sampler>>( + "ball_and_squiggle_sampler", + "neuron_ball_and_squiggle.json", + c, + samplers); +#endif + + run_ncomp_convergence_test( + "ball_and_squiggle_integrator", + "neuron_ball_and_squiggle.json", + backend, + c, + points); +} TEST(ball_and_stick, neuron_ref) { - validate_ball_and_stick(backend); + validate_ball_and_stick(backend_kind::multicore); + if (hw::num_gpus()) { + validate_ball_and_stick(backend_kind::gpu); + } } TEST(ball_and_taper, neuron_ref) { - validate_ball_and_taper(backend); + validate_ball_and_taper(backend_kind::multicore); + if (hw::num_gpus()) { + validate_ball_and_taper(backend_kind::gpu); + } } TEST(ball_and_3stick, neuron_ref) { - validate_ball_and_3stick(backend); + validate_ball_and_3stick(backend_kind::multicore); + if (hw::num_gpus()) { + validate_ball_and_3stick(backend_kind::gpu); + } } TEST(rallpack1, numeric_ref) { - validate_rallpack1(backend); + validate_rallpack1(backend_kind::multicore); + if (hw::num_gpus()) { + validate_rallpack1(backend_kind::gpu); + } } TEST(ball_and_squiggle, neuron_ref) { - validate_ball_and_squiggle(backend); + validate_ball_and_squiggle(backend_kind::multicore); + if (hw::num_gpus()) { + validate_ball_and_squiggle(backend_kind::gpu); + } } diff --git a/tests/validation/validate_ball_and_stick.cu b/tests/validation/validate_ball_and_stick.cu deleted file mode 100644 index 1ad415c8..00000000 --- a/tests/validation/validate_ball_and_stick.cu +++ /dev/null @@ -1,26 +0,0 @@ -#include "../gtest.h" -#include "validate_ball_and_stick.hpp" - -#include <fvm_multicell.hpp> - -const auto backend = nest::mc::backend_kind::gpu; - -TEST(ball_and_stick, neuron_ref) { - validate_ball_and_stick(backend); -} - -TEST(ball_and_taper, neuron_ref) { - validate_ball_and_taper(backend); -} - -TEST(ball_and_3stick, neuron_ref) { - validate_ball_and_3stick(backend); -} - -TEST(rallpack1, numeric_ref) { - validate_rallpack1(backend); -} - -TEST(ball_and_squiggle, neuron_ref) { - validate_ball_and_squiggle(backend); -} diff --git a/tests/validation/validate_ball_and_stick.hpp b/tests/validation/validate_ball_and_stick.hpp deleted file mode 100644 index 516d82c7..00000000 --- a/tests/validation/validate_ball_and_stick.hpp +++ /dev/null @@ -1,191 +0,0 @@ -#include <json/json.hpp> - -#include <cell.hpp> -#include <common_types.hpp> -#include <fvm_multicell.hpp> -#include <load_balance.hpp> -#include <hardware/node_info.hpp> -#include <model.hpp> -#include <recipe.hpp> -#include <segment.hpp> -#include <simple_sampler.hpp> -#include <util/meta.hpp> -#include <util/path.hpp> - -#include "../gtest.h" - -#include "../common_cells.hpp" -#include "../simple_recipes.hpp" -#include "convergence_test.hpp" -#include "trace_analysis.hpp" -#include "validation_data.hpp" - -#include <iostream> - -struct probe_point { - const char* label; - nest::mc::segment_location where; -}; - -template <typename ProbePointSeq> -void run_ncomp_convergence_test( - const char* model_name, - const nest::mc::util::path& ref_data_path, - nest::mc::backend_kind backend, - const nest::mc::cell& c, - ProbePointSeq& probe_points, - float t_end=100.f) -{ - using namespace nest::mc; - - auto max_ncomp = g_trace_io.max_ncomp(); - auto dt = g_trace_io.min_dt(); - auto sample_dt = g_trace_io.sample_dt(); - - nlohmann::json meta = { - {"name", "membrane voltage"}, - {"model", model_name}, - {"dt", dt}, - {"sim", "nestmc"}, - {"units", "mV"}, - {"backend_kind", to_string(backend)} - }; - - auto exclude = stimulus_ends(c); - - auto 
n_probe = util::size(probe_points); - std::vector<probe_label> plabels; - plabels.reserve(n_probe); - for (unsigned i = 0; i<n_probe; ++i) { - plabels.push_back(probe_label{probe_points[i].label, {0u, i}}); - } - - convergence_test_runner<int> runner("ncomp", plabels, meta); - runner.load_reference_data(ref_data_path); - - for (int ncomp = 10; ncomp<max_ncomp; ncomp*=2) { - for (auto& seg: c.segments()) { - if (!seg->is_soma()) { - seg->set_compartments(ncomp); - } - } - cable1d_recipe rec{c}; - for (const auto& p: probe_points) { - rec.add_probe(0, 0, cell_probe_address{p.where, cell_probe_address::membrane_voltage}); - } - - hw::node_info nd(1, backend==backend_kind::gpu? 1: 0); - auto decomp = partition_load_balance(rec, nd); - model m(rec, decomp); - - runner.run(m, ncomp, sample_dt, t_end, dt, exclude); - } - runner.report(); - runner.assert_all_convergence(); -} - -void validate_ball_and_stick(nest::mc::backend_kind backend) { - using namespace nest::mc; - - cell c = make_cell_ball_and_stick(); - probe_point points[] = { - {"soma.mid", {0u, 0.5}}, - {"dend.mid", {1u, 0.5}}, - {"dend.end", {1u, 1.0}} - }; - - run_ncomp_convergence_test( - "ball_and_stick", - "neuron_ball_and_stick.json", - backend, - c, - points); -} - -void validate_ball_and_taper(nest::mc::backend_kind backend) { - using namespace nest::mc; - - cell c = make_cell_ball_and_taper(); - probe_point points[] = { - {"soma.mid", {0u, 0.5}}, - {"taper.mid", {1u, 0.5}}, - {"taper.end", {1u, 1.0}} - }; - - run_ncomp_convergence_test( - "ball_and_taper", - "neuron_ball_and_taper.json", - backend, - c, - points); -} - -void validate_ball_and_3stick(nest::mc::backend_kind backend) { - using namespace nest::mc; - - cell c = make_cell_ball_and_3stick(); - probe_point points[] = { - {"soma.mid", {0u, 0.5}}, - {"dend1.mid", {1u, 0.5}}, - {"dend1.end", {1u, 1.0}}, - {"dend2.mid", {2u, 0.5}}, - {"dend2.end", {2u, 1.0}}, - {"dend3.mid", {3u, 0.5}}, - {"dend3.end", {3u, 1.0}} - }; - - run_ncomp_convergence_test( - "ball_and_3stick", - "neuron_ball_and_3stick.json", - backend, - c, - points); -} - -void validate_rallpack1(nest::mc::backend_kind backend) { - using namespace nest::mc; - - cell c = make_cell_simple_cable(); - probe_point points[] = { - {"cable.x0.0", {1u, 0.0}}, - {"cable.x0.3", {1u, 0.3}}, - {"cable.x1.0", {1u, 1.0}} - }; - - run_ncomp_convergence_test( - "rallpack1", - "numeric_rallpack1.json", - backend, - c, - points, - 250.f); -} - -void validate_ball_and_squiggle(nest::mc::backend_kind backend) { - using namespace nest::mc; - - cell c = make_cell_ball_and_squiggle(); - probe_point points[] = { - {"soma.mid", {0u, 0.5}}, - {"dend.mid", {1u, 0.5}}, - {"dend.end", {1u, 1.0}} - }; - -#if 0 - // *temporarily* disabled: compartment division policy will - // be moved into backend policy classes. 
- - run_ncomp_convergence_test<lowered_cell_div<div_compartment_sampler>>( - "ball_and_squiggle_sampler", - "neuron_ball_and_squiggle.json", - c, - samplers); -#endif - - run_ncomp_convergence_test( - "ball_and_squiggle_integrator", - "neuron_ball_and_squiggle.json", - backend, - c, - points); -} diff --git a/tests/validation/validate_kinetic.cpp b/tests/validation/validate_kinetic.cpp index a9d95faa..6815a91c 100644 --- a/tests/validation/validate_kinetic.cpp +++ b/tests/validation/validate_kinetic.cpp @@ -1,13 +1,118 @@ -#include "validate_kinetic.hpp" - #include "../gtest.h" -const auto backend = nest::mc::backend_kind::multicore; +#include <json/json.hpp> + +#include <common_types.hpp> +#include <cell.hpp> +#include <fvm_multicell.hpp> +#include <hardware/node_info.hpp> +#include <hardware/gpu.hpp> +#include <load_balance.hpp> +#include <model.hpp> +#include <recipe.hpp> +#include <simple_sampler.hpp> +#include <util/rangeutil.hpp> + +#include "../common_cells.hpp" +#include "../simple_recipes.hpp" + +#include "convergence_test.hpp" +#include "trace_analysis.hpp" +#include "validation_data.hpp" + +void run_kinetic_dt( + nest::mc::backend_kind backend, + nest::mc::cell& c, + float t_end, + nlohmann::json meta, + const std::string& ref_file) +{ + using namespace nest::mc; + + float sample_dt = g_trace_io.sample_dt(); + + cable1d_recipe rec{c}; + rec.add_probe(0, 0, cell_probe_address{{0, 0.5}, cell_probe_address::membrane_voltage}); + probe_label plabels[1] = {"soma.mid", {0u, 0u}}; + + meta["sim"] = "nestmc"; + meta["backend_kind"] = to_string(backend); + + convergence_test_runner<float> runner("dt", plabels, meta); + runner.load_reference_data(ref_file); + + hw::node_info nd(1, backend==backend_kind::gpu? 1: 0); + auto decomp = partition_load_balance(rec, nd); + model model(rec, decomp); + + auto exclude = stimulus_ends(c); + + // use dt = 0.05, 0.02, 0.01, 0.005, 0.002, ... 
+ double max_oo_dt = std::round(1.0/g_trace_io.min_dt());
+ for (double base = 100; ; base *= 10) {
+ for (double multiple: {5., 2., 1.}) {
+ double oo_dt = base/multiple;
+ if (oo_dt>max_oo_dt) goto end;
+
+ model.reset();
+ float dt = float(1./oo_dt);
+ runner.run(model, dt, sample_dt, t_end, dt, exclude);
+ }
+ }
+
+end:
+ runner.report();
+ runner.assert_all_convergence();
+}
+
+void validate_kinetic_kin1(nest::mc::backend_kind backend) {
+ using namespace nest::mc;
+
+ // 20 µm diameter soma with single mechanism, current probe
+ cell c;
+ auto soma = c.add_soma(10);
+ soma->add_mechanism(std::string("test_kin1"));
+
+ nlohmann::json meta = {
+ {"model", "test_kin1"},
+ {"name", "membrane current"},
+ {"units", "nA"}
+ };
+
+ run_kinetic_dt(backend, c, 100.f, meta, "numeric_kin1.json");
+}
+
+void validate_kinetic_kinlva(nest::mc::backend_kind backend) {
+ using namespace nest::mc;
+
+ // 20 µm diameter soma with single mechanism, voltage probe
+ cell c;
+ auto soma = c.add_soma(10);
+ c.add_stimulus({0,0.5}, {20., 130., -0.025});
+ soma->add_mechanism(std::string("test_kinlva"));
+
+ nlohmann::json meta = {
+ {"model", "test_kinlva"},
+ {"name", "membrane voltage"},
+ {"units", "mV"}
+ };
+
+ run_kinetic_dt(backend, c, 300.f, meta, "numeric_kinlva.json");
+}
+
+
+using namespace nest::mc;
 
 TEST(kinetic, kin1_numeric_ref) {
- validate_kinetic_kin1(backend);
+ validate_kinetic_kin1(backend_kind::multicore);
+ if (hw::num_gpus()) {
+ validate_kinetic_kin1(nest::mc::backend_kind::gpu);
+ }
 }
 
 TEST(kinetic, kinlva_numeric_ref) {
- validate_kinetic_kinlva(backend);
+ validate_kinetic_kinlva(backend_kind::multicore);
+ if (hw::num_gpus()) {
+ validate_kinetic_kinlva(nest::mc::backend_kind::gpu);
+ }
 }
diff --git a/tests/validation/validate_kinetic.cu b/tests/validation/validate_kinetic.cu
deleted file mode 100644
index afc51031..00000000
--- a/tests/validation/validate_kinetic.cu
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "validate_kinetic.hpp"
-
-#include "../gtest.h"
-
-const auto backend = nest::mc::backend_kind::gpu;
-
-TEST(kinetic, kin1_numeric_ref) {
- validate_kinetic_kin1(backend);
-}
-
-TEST(kinetic, kinlva_numeric_ref) {
- validate_kinetic_kinlva(backend);
-}
diff --git a/tests/validation/validate_kinetic.hpp b/tests/validation/validate_kinetic.hpp
deleted file mode 100644
index 91932493..00000000
--- a/tests/validation/validate_kinetic.hpp
+++ /dev/null
@@ -1,99 +0,0 @@
-#include <json/json.hpp>
-
-#include <common_types.hpp>
-#include <cell.hpp>
-#include <fvm_multicell.hpp>
-#include <hardware/node_info.hpp>
-#include <load_balance.hpp>
-#include <model.hpp>
-#include <recipe.hpp>
-#include <simple_sampler.hpp>
-#include <util/rangeutil.hpp>
-
-#include "../common_cells.hpp"
-#include "../simple_recipes.hpp"
-
-#include "convergence_test.hpp"
-#include "trace_analysis.hpp"
-#include "validation_data.hpp"
-
-void run_kinetic_dt(
- nest::mc::backend_kind backend,
- nest::mc::cell& c,
- float t_end,
- nlohmann::json meta,
- const std::string& ref_file)
-{
- using namespace nest::mc;
-
- float sample_dt = g_trace_io.sample_dt();
-
- cable1d_recipe rec{c};
- rec.add_probe(0, 0, cell_probe_address{{0, 0.5}, cell_probe_address::membrane_voltage});
- probe_label plabels[1] = {"soma.mid", {0u, 0u}};
-
- meta["sim"] = "nestmc";
- meta["backend_kind"] = to_string(backend);
-
- convergence_test_runner<float> runner("dt", plabels, meta);
- runner.load_reference_data(ref_file);
-
- hw::node_info nd(1, backend==backend_kind::gpu?
1: 0); - auto decomp = partition_load_balance(rec, nd); - model model(rec, decomp); - - auto exclude = stimulus_ends(c); - - // use dt = 0.05, 0.02, 0.01, 0.005, 0.002, ... - double max_oo_dt = std::round(1.0/g_trace_io.min_dt()); - for (double base = 100; ; base *= 10) { - for (double multiple: {5., 2., 1.}) { - double oo_dt = base/multiple; - if (oo_dt>max_oo_dt) goto end; - - model.reset(); - float dt = float(1./oo_dt); - runner.run(model, dt, sample_dt, t_end, dt, exclude); - } - } - -end: - runner.report(); - runner.assert_all_convergence(); -} - -void validate_kinetic_kin1(nest::mc::backend_kind backend) { - using namespace nest::mc; - - // 20 µm diameter soma with single mechanism, current probe - cell c; - auto soma = c.add_soma(10); - soma->add_mechanism(std::string("test_kin1")); - - nlohmann::json meta = { - {"model", "test_kin1"}, - {"name", "membrane current"}, - {"units", "nA"} - }; - - run_kinetic_dt(backend, c, 100.f, meta, "numeric_kin1.json"); -} - -void validate_kinetic_kinlva(nest::mc::backend_kind backend) { - using namespace nest::mc; - - // 20 µm diameter soma with single mechanism, current probe - cell c; - auto soma = c.add_soma(10); - c.add_stimulus({0,0.5}, {20., 130., -0.025}); - soma->add_mechanism(std::string("test_kinlva")); - - nlohmann::json meta = { - {"model", "test_kinlva"}, - {"name", "membrane voltage"}, - {"units", "mV"} - }; - - run_kinetic_dt(backend, c, 300.f, meta, "numeric_kinlva.json"); -} - diff --git a/tests/validation/validate_soma.cpp b/tests/validation/validate_soma.cpp index acb74861..a41f4353 100644 --- a/tests/validation/validate_soma.cpp +++ b/tests/validation/validate_soma.cpp @@ -1,10 +1,74 @@ -#include "validate_soma.hpp" +#include <json/json.hpp> + +#include <common_types.hpp> +#include <cell.hpp> +#include <fvm_multicell.hpp> +#include <hardware/gpu.hpp> +#include <hardware/node_info.hpp> +#include <load_balance.hpp> +#include <model.hpp> +#include <recipe.hpp> +#include <simple_sampler.hpp> +#include <util/rangeutil.hpp> + +#include "../common_cells.hpp" +#include "../simple_recipes.hpp" + +#include "convergence_test.hpp" +#include "trace_analysis.hpp" +#include "validation_data.hpp" #include "../gtest.h" +using namespace nest::mc; + +void validate_soma(backend_kind backend) { + float sample_dt = g_trace_io.sample_dt(); + + cell c = make_cell_soma_only(); + + cable1d_recipe rec{c}; + rec.add_probe(0, 0, cell_probe_address{{0, 0.5}, cell_probe_address::membrane_voltage}); + probe_label plabels[1] = {"soma.mid", {0u, 0u}}; + + hw::node_info nd(1, backend==backend_kind::gpu? 1: 0); + auto decomp = partition_load_balance(rec, nd); + model m(rec, decomp); -const auto backend = nest::mc::backend_kind::multicore; + nlohmann::json meta = { + {"name", "membrane voltage"}, + {"model", "soma"}, + {"sim", "nestmc"}, + {"units", "mV"}, + {"backend_kind", to_string(backend)} + }; + + convergence_test_runner<float> runner("dt", plabels, meta); + runner.load_reference_data("numeric_soma.json"); + + float t_end = 100.f; + + // use dt = 0.05, 0.02, 0.01, 0.005, 0.002, ... 
+ double max_oo_dt = std::round(1.0/g_trace_io.min_dt()); + for (double base = 100; ; base *= 10) { + for (double multiple: {5., 2., 1.}) { + double oo_dt = base/multiple; + if (oo_dt>max_oo_dt) goto end; + + m.reset(); + float dt = float(1./oo_dt); + runner.run(m, dt, sample_dt, t_end, dt, {}); + } + } +end: + + runner.report(); + runner.assert_all_convergence(); +} TEST(soma, numeric_ref) { - validate_soma(backend); + validate_soma(backend_kind::multicore); + if (hw::num_gpus()) { + validate_soma(backend_kind::gpu); + } } diff --git a/tests/validation/validate_soma.cu b/tests/validation/validate_soma.cu deleted file mode 100644 index f726155d..00000000 --- a/tests/validation/validate_soma.cu +++ /dev/null @@ -1,9 +0,0 @@ -#include "validate_soma.hpp" - -#include "../gtest.h" - -const auto backend = nest::mc::backend_kind::gpu; - -TEST(soma, numeric_ref) { - validate_soma(backend); -} diff --git a/tests/validation/validate_soma.hpp b/tests/validation/validate_soma.hpp deleted file mode 100644 index edc985b8..00000000 --- a/tests/validation/validate_soma.hpp +++ /dev/null @@ -1,64 +0,0 @@ -#include <json/json.hpp> - -#include <common_types.hpp> -#include <cell.hpp> -#include <fvm_multicell.hpp> -#include <hardware/node_info.hpp> -#include <load_balance.hpp> -#include <model.hpp> -#include <recipe.hpp> -#include <simple_sampler.hpp> -#include <util/rangeutil.hpp> - -#include "../common_cells.hpp" -#include "../simple_recipes.hpp" - -#include "convergence_test.hpp" -#include "trace_analysis.hpp" -#include "validation_data.hpp" - -void validate_soma(nest::mc::backend_kind backend) { - using namespace nest::mc; - - float sample_dt = g_trace_io.sample_dt(); - - cell c = make_cell_soma_only(); - - cable1d_recipe rec{c}; - rec.add_probe(0, 0, cell_probe_address{{0, 0.5}, cell_probe_address::membrane_voltage}); - probe_label plabels[1] = {"soma.mid", {0u, 0u}}; - - hw::node_info nd(1, backend==backend_kind::gpu? 1: 0); - auto decomp = partition_load_balance(rec, nd); - model m(rec, decomp); - - nlohmann::json meta = { - {"name", "membrane voltage"}, - {"model", "soma"}, - {"sim", "nestmc"}, - {"units", "mV"}, - {"backend_kind", to_string(backend)} - }; - - convergence_test_runner<float> runner("dt", plabels, meta); - runner.load_reference_data("numeric_soma.json"); - - float t_end = 100.f; - - // use dt = 0.05, 0.02, 0.01, 0.005, 0.002, ... 
- double max_oo_dt = std::round(1.0/g_trace_io.min_dt()); - for (double base = 100; ; base *= 10) { - for (double multiple: {5., 2., 1.}) { - double oo_dt = base/multiple; - if (oo_dt>max_oo_dt) goto end; - - m.reset(); - float dt = float(1./oo_dt); - runner.run(m, dt, sample_dt, t_end, dt, {}); - } - } -end: - - runner.report(); - runner.assert_all_convergence(); -} diff --git a/tests/validation/validate_synapses.cpp b/tests/validation/validate_synapses.cpp index 1a8bbf03..557a7470 100644 --- a/tests/validation/validate_synapses.cpp +++ b/tests/validation/validate_synapses.cpp @@ -1,16 +1,102 @@ +#include <cell.hpp> +#include <cell_group.hpp> #include <fvm_multicell.hpp> +#include <hardware/node_info.hpp> +#include <hardware/gpu.hpp> +#include <json/json.hpp> +#include <load_balance.hpp> +#include <model.hpp> +#include <recipe.hpp> +#include <simple_sampler.hpp> +#include <util/path.hpp> #include "../gtest.h" -#include "validate_synapses.hpp" -const auto backend = nest::mc::backend_kind::multicore; +#include "../common_cells.hpp" +#include "../simple_recipes.hpp" + +#include "convergence_test.hpp" +#include "trace_analysis.hpp" +#include "validation_data.hpp" + +using namespace nest::mc; + +void run_synapse_test( + const char* syn_type, + const util::path& ref_data_path, + backend_kind backend, + float t_end=70.f, + float dt=0.001) +{ + auto max_ncomp = g_trace_io.max_ncomp(); + nlohmann::json meta = { + {"name", "membrane voltage"}, + {"model", syn_type}, + {"sim", "nestmc"}, + {"units", "mV"}, + {"backend_kind", to_string(backend)} + }; + + cell c = make_cell_ball_and_stick(false); // no stimuli + parameter_list syn_default(syn_type); + c.add_synapse({1, 0.5}, syn_default); + + // injected spike events + std::vector<postsynaptic_spike_event> synthetic_events = { + {{0u, 0u}, 10.0, 0.04}, + {{0u, 0u}, 20.0, 0.04}, + {{0u, 0u}, 40.0, 0.04} + }; + + // exclude points of discontinuity from linf analysis + std::vector<float> exclude = {10.f, 20.f, 40.f}; + + float sample_dt = g_trace_io.sample_dt(); + probe_label plabels[3] = { + {"soma.mid", {0u, 0u}}, + {"dend.mid", {0u, 1u}}, + {"dend.end", {0u, 2u}} + }; + + convergence_test_runner<int> runner("ncomp", plabels, meta); + runner.load_reference_data(ref_data_path); + + hw::node_info nd(1, backend==backend_kind::gpu? 
1: 0); + for (int ncomp = 10; ncomp<max_ncomp; ncomp*=2) { + c.cable(1)->set_compartments(ncomp); + + cable1d_recipe rec{c}; + // soma.mid + rec.add_probe(0, 0, cell_probe_address{{0, 0.5}, cell_probe_address::membrane_voltage}); + // dend.mid + rec.add_probe(0, 0, cell_probe_address{{1, 0.5}, cell_probe_address::membrane_voltage}); + // dend.end + rec.add_probe(0, 0, cell_probe_address{{1, 1.0}, cell_probe_address::membrane_voltage}); + + auto decomp = partition_load_balance(rec, nd); + model m(rec, decomp); + m.group(0).enqueue_events(synthetic_events); + + runner.run(m, ncomp, sample_dt, t_end, dt, exclude); + } + runner.report(); + runner.assert_all_convergence(); +} TEST(simple_synapse, expsyn_neuron_ref) { - SCOPED_TRACE("expsyn"); - run_synapse_test("expsyn", "neuron_simple_exp_synapse.json", backend); + SCOPED_TRACE("expsyn-multicore"); + run_synapse_test("expsyn", "neuron_simple_exp_synapse.json", backend_kind::multicore); + if (hw::num_gpus()) { + SCOPED_TRACE("expsyn-gpu"); + run_synapse_test("expsyn", "neuron_simple_exp_synapse.json", backend_kind::gpu); + } } TEST(simple_synapse, exp2syn_neuron_ref) { - SCOPED_TRACE("exp2syn"); - run_synapse_test("exp2syn", "neuron_simple_exp2_synapse.json", backend); + SCOPED_TRACE("exp2syn-multicore"); + run_synapse_test("exp2syn", "neuron_simple_exp2_synapse.json", backend_kind::multicore); + if (hw::num_gpus()) { + SCOPED_TRACE("exp2syn-gpu"); + run_synapse_test("exp2syn", "neuron_simple_exp2_synapse.json", backend_kind::gpu); + } } diff --git a/tests/validation/validate_synapses.cu b/tests/validation/validate_synapses.cu deleted file mode 100644 index 886bbdfb..00000000 --- a/tests/validation/validate_synapses.cu +++ /dev/null @@ -1,16 +0,0 @@ -#include <fvm_multicell.hpp> - -#include "../gtest.h" -#include "validate_synapses.hpp" - -const auto backend = nest::mc::backend_kind::gpu; - -TEST(simple_synapse, expsyn_neuron_ref) { - SCOPED_TRACE("expsyn"); - run_synapse_test("expsyn", "neuron_simple_exp_synapse.json", backend); -} - -TEST(simple_synapse, exp2syn_neuron_ref) { - SCOPED_TRACE("exp2syn"); - run_synapse_test("exp2syn", "neuron_simple_exp2_synapse.json", backend); -} diff --git a/tests/validation/validate_synapses.hpp b/tests/validation/validate_synapses.hpp deleted file mode 100644 index 429da370..00000000 --- a/tests/validation/validate_synapses.hpp +++ /dev/null @@ -1,84 +0,0 @@ -#include <json/json.hpp> - -#include <cell.hpp> -#include <cell_group.hpp> -#include <fvm_multicell.hpp> -#include <hardware/node_info.hpp> -#include <load_balance.hpp> -#include <model.hpp> -#include <recipe.hpp> -#include <simple_sampler.hpp> -#include <util/path.hpp> - -#include "../gtest.h" - -#include "../common_cells.hpp" -#include "../simple_recipes.hpp" - -#include "convergence_test.hpp" -#include "trace_analysis.hpp" -#include "validation_data.hpp" - -void run_synapse_test( - const char* syn_type, - const nest::mc::util::path& ref_data_path, - nest::mc::backend_kind backend, - float t_end=70.f, - float dt=0.001) -{ - using namespace nest::mc; - - auto max_ncomp = g_trace_io.max_ncomp(); - nlohmann::json meta = { - {"name", "membrane voltage"}, - {"model", syn_type}, - {"sim", "nestmc"}, - {"units", "mV"}, - {"backend_kind", to_string(backend)} - }; - - cell c = make_cell_ball_and_stick(false); // no stimuli - parameter_list syn_default(syn_type); - c.add_synapse({1, 0.5}, syn_default); - - // injected spike events - std::vector<postsynaptic_spike_event> synthetic_events = { - {{0u, 0u}, 10.0, 0.04}, - {{0u, 0u}, 20.0, 0.04}, - {{0u, 0u}, 40.0, 
0.04} - }; - - // exclude points of discontinuity from linf analysis - std::vector<float> exclude = {10.f, 20.f, 40.f}; - - float sample_dt = g_trace_io.sample_dt(); - probe_label plabels[3] = { - {"soma.mid", {0u, 0u}}, - {"dend.mid", {0u, 1u}}, - {"dend.end", {0u, 2u}} - }; - - convergence_test_runner<int> runner("ncomp", plabels, meta); - runner.load_reference_data(ref_data_path); - - hw::node_info nd(1, backend==backend_kind::gpu? 1: 0); - for (int ncomp = 10; ncomp<max_ncomp; ncomp*=2) { - c.cable(1)->set_compartments(ncomp); - - cable1d_recipe rec{c}; - // soma.mid - rec.add_probe(0, 0, cell_probe_address{{0, 0.5}, cell_probe_address::membrane_voltage}); - // dend.mid - rec.add_probe(0, 0, cell_probe_address{{1, 0.5}, cell_probe_address::membrane_voltage}); - // dend.end - rec.add_probe(0, 0, cell_probe_address{{1, 1.0}, cell_probe_address::membrane_voltage}); - - auto decomp = partition_load_balance(rec, nd); - model m(rec, decomp); - m.group(0).enqueue_events(synthetic_events); - - runner.run(m, ncomp, sample_dt, t_end, dt, exclude); - } - runner.report(); - runner.assert_all_convergence(); -} -- GitLab
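
The merged validation tests above all follow the same backend-gating pattern: run on the multicore backend unconditionally, then repeat on the GPU backend only when hw::num_gpus() reports at least one device, so a single test binary works on both GPU and CPU-only nodes. A minimal sketch of how that repetition could be factored out; for_each_backend is hypothetical, and the enum and num_gpus() below are stand-ins for the project's nest::mc::backend_kind and nest::mc::hw::num_gpus():

    #include <cstdio>

    enum class backend_kind { multicore, gpu };

    // Stand-in for nest::mc::hw::num_gpus().
    int num_gpus() { return 0; }

    // Hypothetical helper: invoke f once per backend available on this
    // node, mirroring the multicore-then-gpu pattern in the tests above.
    template <typename F>
    void for_each_backend(F f) {
        f(backend_kind::multicore);
        if (num_gpus() > 0) {
            f(backend_kind::gpu);
        }
    }

    int main() {
        // In the test suite the body would be e.g. validate_soma(b).
        for_each_backend([](backend_kind b) {
            std::printf("running on backend %d\n", static_cast<int>(b));
        });
    }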
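The timestep sweeps in validate_kinetic.cpp and validate_soma.cpp iterate over the inverse timestep oo_dt ("one over dt") rather than over dt itself: scaling an integral base through the decades and dividing by 5, 2 and 1 reproduces the exact sequence dt = 0.05, 0.02, 0.01, 0.005, 0.002, ... down to g_trace_io.min_dt(), which repeated floating-point division of dt could not guarantee. A self-contained sketch of just that loop; min_dt here is an assumed stand-in for the g_trace_io setting:

    #include <cmath>
    #include <cstdio>
    #include <initializer_list>

    int main() {
        double min_dt = 0.001;                     // stand-in for g_trace_io.min_dt()
        double max_oo_dt = std::round(1.0/min_dt); // largest admissible 1/dt

        // base walks 100, 1000, 10000, ...; each decade yields
        // dt = 5/base, 2/base, 1/base, i.e. 0.05, 0.02, 0.01, then
        // 0.005, 0.002, 0.001, and so on until 1/dt exceeds max_oo_dt.
        for (double base = 100; ; base *= 10) {
            for (double multiple: {5., 2., 1.}) {
                double oo_dt = base/multiple;
                if (oo_dt > max_oo_dt) return 0;
                float dt = float(1./oo_dt);
                std::printf("dt = %g\n", dt);
            }
        }
    }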
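run_synapse_test and the ball-and-stick tests drive convergence in space instead of time: the compartment count doubles from 10 until it reaches g_trace_io.max_ncomp(), and the recipe, probes and model are rebuilt for every refinement so no simulator state carries over between runs. The injected events at t = 10, 20 and 40 ms produce slope discontinuities in the voltage traces, which is why exactly those times make up the exclude list passed to the linf analysis. Condensed control flow, with run_case standing in for the rebuild-and-run body and max_ncomp an assumed value:

    #include <cstdio>

    // Stand-in for the per-refinement body of run_synapse_test(): build
    // a cable1d_recipe with ncomp compartments per cable, re-add the
    // probes, partition, run the model, and record the trace.
    void run_case(int ncomp) {
        std::printf("running with %d compartments\n", ncomp);
    }

    int main() {
        int max_ncomp = 100; // stand-in for g_trace_io.max_ncomp()

        // Geometric refinement: 10, 20, 40, 80 compartments.
        for (int ncomp = 10; ncomp < max_ncomp; ncomp *= 2) {
            run_case(ncomp);
        }
    }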