diff --git a/modcc/cudaprinter.cpp b/modcc/cudaprinter.cpp
index 2885a1c5be2df231df570f0f731e985c66c371ae..5672dd48fb152b23b67bd2fdbe7c65756bec8d89 100644
--- a/modcc/cudaprinter.cpp
+++ b/modcc/cudaprinter.cpp
@@ -41,7 +41,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     text_.add_line();
     text_.add_line("#include <mechanism.hpp>");
     text_.add_line("#include <algorithms.hpp>");
-    text_.add_line("#include <backends/gpu_intrinsics.hpp>");
+    text_.add_line("#include <backends/gpu/intrinsics.hpp>");
     text_.add_line("#include <util/pprintf.hpp>");
     text_.add_line();
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7e6e6f160ef3f1fcc552fc54c2643636327727f1..d11ca436ed2aeb51c50d8fa11bf4e891c3e421df 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -9,10 +9,10 @@ set(BASE_SOURCES
     util/debug.cpp
     util/path.cpp
     util/unwind.cpp
-    backends/fvm_multicore.cpp
+    backends/multicore/fvm.cpp
 )
 set(CUDA_SOURCES
-    backends/fvm_gpu.cu
+    backends/gpu/fvm.cu
     memory/fill.cu
 )
 
diff --git a/src/backends/fvm.hpp b/src/backends/fvm.hpp
index 1cdc16d62593770dee3f0aed71e93a6e6a51dd53..a8124da0815f85dbb674ed5e81c31d5cbf2328b9 100644
--- a/src/backends/fvm.hpp
+++ b/src/backends/fvm.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "fvm_multicore.hpp"
+#include <backends/multicore/fvm.hpp>
 
 #ifdef NMC_HAVE_CUDA
-    #include "fvm_gpu.hpp"
+    #include <backends/gpu/fvm.hpp>
 #endif
diff --git a/src/backends/fvm_gpu.hpp b/src/backends/fvm_gpu.hpp
deleted file mode 100644
index 5683efbf528e721d33253cbbd5433ad68e958af4..0000000000000000000000000000000000000000
--- a/src/backends/fvm_gpu.hpp
+++ /dev/null
@@ -1,458 +0,0 @@
-#pragma once
-
-#include <map>
-#include <string>
-
-#include <common_types.hpp>
-#include <mechanism.hpp>
-#include <memory/memory.hpp>
-#include <memory/managed_ptr.hpp>
-#include <util/span.hpp>
-
-#include "stimulus_gpu.hpp"
-#include "gpu_stack.hpp"
-
-namespace nest {
-namespace mc {
-namespace gpu {
-
-/// Parameter pack for passing matrix fields and dimensions to the
-/// Hines matrix solver implemented on the GPU backend.
-template <typename T, typename I>
-struct matrix_solve_param_pack {
-    T* d;
-    const T* u;
-    T* rhs;
-    const I* p;
-    const I* cell_index;
-    I n;
-    I ncells;
-};
-
-/// Parameter pack for passing matrix and fvm fields and dimensions to the
-/// FVM matrix generator implemented on the GPU
-template <typename T, typename I>
-struct matrix_update_param_pack {
-    T* d;
-    const T* u;
-    T* rhs;
-    const T* invariant_d;
-    const T* cv_capacitance;
-    const T* face_conductance;
-    const T* voltage;
-    const T* current;
-    I n;
-};
-
-// forward declarations of the matrix solver implementation
-// see the bottom of the file for implementation
-
-template <typename T, typename I>
-__global__ void matrix_solve(matrix_solve_param_pack<T, I> params);
-
-template <typename T, typename I>
-__global__ void assemble_matrix(matrix_update_param_pack<T, I> params, T dt);
-
-/// kernel used to test for threshold crossing test code.
-/// params:
-///     t       : current time (ms)
-///     t_prev  : time of last test (ms)
-///     size    : number of values to test
-///     is_crossed  : crossing state at time t_prev (true or false)
-///     prev_values : values at sample points (see index) sampled at t_prev
-///     index      : index with locations in values to test for crossing
-///     values     : values at t_prev
-///     thresholds : threshold values to watch for crossings
-template <typename T, typename I, typename Stack>
-__global__
-void test_thresholds(
-    float t, float t_prev, int size,
-    Stack& stack,
-    I* is_crossed, T* prev_values,
-    const I* index, const T* values, const T* thresholds);
-
-struct backend {
-    /// define the real and index types
-    using value_type = double;
-    using size_type  = nest::mc::cell_lid_type;
-
-    /// define storage types
-    using array  = memory::device_vector<value_type>;
-    using iarray = memory::device_vector<size_type>;
-
-    using view       = typename array::view_type;
-    using const_view = typename array::const_view_type;
-
-    using iview       = typename iarray::view_type;
-    using const_iview = typename iarray::const_view_type;
-
-    using host_array  = typename memory::host_vector<value_type>;
-    using host_iarray = typename memory::host_vector<size_type>;
-
-    using host_view   = typename host_array::view_type;
-    using host_iview  = typename host_iarray::const_view_type;
-
-    static std::string name() {
-        return "gpu";
-    }
-
-    //
-    // matrix infrastructure
-    //
-
-    /// matrix state
-    struct matrix_state {
-        const_iview p;
-        const_iview cell_index;
-
-        array d;     // [μS]
-        array u;     // [μS]
-        array rhs;   // [nA]
-
-        array cv_capacitance;      // [pF]
-        array face_conductance;    // [μS]
-
-        // the invariant part of the matrix diagonal
-        array invariant_d;         // [μS]
-
-        std::size_t size() const { return p.size(); }
-
-        matrix_state() = default;
-
-        matrix_state(const_iview p, const_iview cell_index):
-            p(p), cell_index(cell_index),
-            d(size()), u(size()), rhs(size())
-        {}
-
-        matrix_state(const_iview p, const_iview cell_index, array cap, array cond):
-            p(p), cell_index(cell_index),
-            d(size()), u(size()), rhs(size()),
-            cv_capacitance(std::move(cap)),
-            face_conductance(std::move(cond))
-        {
-            auto n = d.size();
-            host_array invariant_d_tmp(n, 0);
-            host_array u_tmp(n, 0);
-
-            // make a copy of the conductance on the host
-            host_array face_conductance_tmp = face_conductance;
-            auto p_tmp = memory::on_host(p);
-            for(auto i: util::make_span(1u, n)) {
-                auto gij = face_conductance_tmp[i];
-
-                u_tmp[i] = -gij;
-                invariant_d_tmp[i] += gij;
-                invariant_d_tmp[p_tmp[i]] += gij;
-            }
-            invariant_d = invariant_d_tmp;
-            memory::copy(u_tmp, u);
-        }
-
-        // Assemble the matrix
-        // Afterwards the diagonal and RHS will have been set given dt, voltage and current
-        //   dt      [ms]
-        //   voltage [mV]
-        //   current [nA]
-        void assemble(value_type dt, const_view voltage, const_view current) {
-            EXPECTS(has_fvm_state());
-
-            // determine the grid dimensions for the kernel
-            auto const n = voltage.size();
-            auto const block_dim = 128;
-            auto const grid_dim = (n+block_dim-1)/block_dim;
-
-            auto params = matrix_update_param_pack<value_type, size_type> {
-                d.data(), u.data(), rhs.data(),
-                invariant_d.data(), cv_capacitance.data(), face_conductance.data(),
-                voltage.data(), current.data(), size_type(n)};
-
-            assemble_matrix<value_type, size_type><<<grid_dim, block_dim>>>
-                (params, dt);
-
-        }
-
-        void solve() {
-            using solve_param_pack = matrix_solve_param_pack<value_type, size_type>;
-
-            // pack the parameters into a single struct for kernel launch
-            auto params = solve_param_pack{
-                 d.data(), u.data(), rhs.data(),
-                 p.data(), cell_index.data(),
-                 size_type(d.size()), size_type(cell_index.size()-1)
-            };
-
-            // determine the grid dimensions for the kernel
-            auto const n = params.ncells;
-            auto const block_dim = 96;
-            auto const grid_dim = (n+block_dim-1)/block_dim;
-
-            // perform solve on gpu
-            matrix_solve<value_type, size_type><<<grid_dim, block_dim>>>(params);
-        }
-
-        // Test if the matrix has the full state required to assemble the
-        // matrix in the fvm scheme.
-        bool has_fvm_state() const {
-            return cv_capacitance.size()>0;
-        }
-    };
-
-    //
-    // mechanism infrastructure
-    //
-    using ion = mechanisms::ion<backend>;
-
-    using mechanism = mechanisms::mechanism_ptr<backend>;
-
-    using stimulus = mechanisms::gpu::stimulus<backend>;
-
-    static mechanism make_mechanism(
-        const std::string& name,
-        view vec_v, view vec_i,
-        const std::vector<value_type>& weights,
-        const std::vector<size_type>& node_indices)
-    {
-        if (!has_mechanism(name)) {
-            throw std::out_of_range("no mechanism in database : " + name);
-        }
-
-        return mech_map_.find(name)->
-            second(vec_v, vec_i, memory::make_const_view(weights), memory::make_const_view(node_indices));
-    }
-
-    static bool has_mechanism(const std::string& name) { return mech_map_.count(name)>0; }
-
-    /// threshold crossing logic
-    /// used as part of spike detection back end
-    class threshold_watcher {
-    public:
-        /// stores a single crossing event
-        struct threshold_crossing {
-            size_type index;    // index of variable
-            value_type time;    // time of crossing
-            __host__ __device__
-            friend bool operator==
-                (const threshold_crossing& lhs, const threshold_crossing& rhs)
-            {
-                return lhs.index==rhs.index && lhs.time==rhs.time;
-            }
-        };
-
-        using stack_type = gpu_stack<threshold_crossing>;
-
-        threshold_watcher() = default;
-
-        threshold_watcher(
-                const_view values,
-                const std::vector<size_type>& index,
-                const std::vector<value_type>& thresh,
-                value_type t=0):
-            values_(values),
-            index_(memory::make_const_view(index)),
-            thresholds_(memory::make_const_view(thresh)),
-            prev_values_(values),
-            is_crossed_(size()),
-            stack_(memory::make_managed_ptr<stack_type>(10*size()))
-        {
-            reset(t);
-        }
-
-        /// Remove all stored crossings that were detected in previous calls
-        /// to test()
-        void clear_crossings() {
-            stack_->clear();
-        }
-
-        /// Reset state machine for each detector.
-        /// Assume that the values in values_ have been set correctly before
-        /// calling, because the values are used to determine the initial state
-        void reset(value_type t=0) {
-            clear_crossings();
-
-            // Make host-side copies of the information needed to calculate
-            // the initial crossed state
-            auto values = memory::on_host(values_);
-            auto thresholds = memory::on_host(thresholds_);
-            auto index = memory::on_host(index_);
-
-            // calculate the initial crossed state in host memory
-            auto crossed = std::vector<size_type>(size());
-            for (auto i: util::make_span(0u, size())) {
-                crossed[i] = values[index[i]] < thresholds[i] ? 0 : 1;
-            }
-
-            // copy the initial crossed state to device memory
-            is_crossed_ = memory::on_gpu(crossed);
-
-            // reset time of last test
-            t_prev_ = t;
-        }
-
-        bool is_crossed(size_type i) const {
-            return is_crossed_[i];
-        }
-
-        const std::vector<threshold_crossing> crossings() const {
-            return std::vector<threshold_crossing>(stack_->begin(), stack_->end());
-        }
-
-        /// The time at which the last test was performed
-        value_type last_test_time() const {
-            return t_prev_;
-        }
-
-        /// Tests each target for changed threshold state.
-        /// Crossing events are recorded for each threshold that has been
-        /// crossed since current time t, and the last time the test was
-        /// performed.
-        void test(value_type t) {
-            EXPECTS(t_prev_<t);
-
-            constexpr int block_dim = 128;
-            const int grid_dim = (size()+block_dim-1)/block_dim;
-            test_thresholds<<<grid_dim, block_dim>>>(
-                t, t_prev_, size(),
-                *stack_,
-                is_crossed_.data(), prev_values_.data(),
-                index_.data(), values_.data(), thresholds_.data());
-
-            // Check that the number of spikes has not exceeded
-            // the capacity of the stack.
-            EXPECTS(stack_->size() <= stack_->capacity());
-
-            t_prev_ = t;
-        }
-
-        /// the number of threashold values that are being monitored
-        std::size_t size() const {
-            return index_.size();
-        }
-
-        /// Data type used to store the crossings.
-        /// Provided to make type-generic calling code.
-        using crossing_list =  std::vector<threshold_crossing>;
-
-    private:
-
-        const_view values_;         // values to watch: on gpu
-        iarray index_;              // indexes of values to watch: on gpu
-
-        array thresholds_;          // threshold for each watch: on gpu
-        value_type t_prev_;         // time of previous sample: on host
-        array prev_values_;         // values at previous sample time: on host
-        iarray is_crossed_;         // bool flag for state of each watch: on gpu
-
-        memory::managed_ptr<stack_type> stack_;
-    };
-
-private:
-
-    using maker_type = mechanism (*)(view, view, array&&, iarray&&);
-    static std::map<std::string, maker_type> mech_map_;
-
-    template <template <typename> class Mech>
-    static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
-        return mechanisms::make_mechanism<Mech<backend>>
-            (vec_v, vec_i, std::move(weights), std::move(node_indices));
-    }
-};
-
-/// GPU implementation of Hines Matrix solver.
-/// Naive implementation with one CUDA thread per matrix.
-template <typename T, typename I>
-__global__
-void matrix_solve(matrix_solve_param_pack<T, I> params) {
-    auto tid = threadIdx.x + blockDim.x*blockIdx.x;
-    auto d   = params.d;
-    auto u   = params.u;
-    auto rhs = params.rhs;
-    auto p   = params.p;
-
-    if(tid < params.ncells) {
-        // get range of this thread's cell matrix
-        auto first = params.cell_index[tid];
-        auto last  = params.cell_index[tid+1];
-
-        // backward sweep
-        for(auto i=last-1; i>first; --i) {
-            auto factor = u[i] / d[i];
-            d[p[i]]   -= factor * u[i];
-            rhs[p[i]] -= factor * rhs[i];
-        }
-
-        __syncthreads();
-        rhs[first] /= d[first];
-
-        // forward sweep
-        for(auto i=first+1; i<last; ++i) {
-            rhs[i] -= u[i] * rhs[p[i]];
-            rhs[i] /= d[i];
-        }
-    }
-}
-
-/// GPU implementatin of Hines matrix assembly
-/// For a given time step size dt
-///     - use the precomputed alpha and alpha_d values to construct the diagonal
-///       and off diagonal of the symmetric Hines matrix.
-///     - compute the RHS of the linear system to solve
-template <typename T, typename I>
-__global__
-void assemble_matrix(matrix_update_param_pack<T, I> params, T dt) {
-    auto tid = threadIdx.x + blockDim.x*blockIdx.x;
-
-    T factor = 1e-3/dt;
-    if(tid < params.n) {
-        auto gi = factor * params.cv_capacitance[tid];
-
-        params.d[tid] = gi + params.invariant_d[tid];
-
-        params.rhs[tid] = gi*params.voltage[tid] - params.current[tid];
-    }
-}
-
-template <typename T, typename I, typename Stack>
-__global__
-void test_thresholds(
-    float t, float t_prev, int size,
-    Stack& stack,
-    I* is_crossed, T* prev_values,
-    const I* index, const T* values, const T* thresholds)
-{
-    int i = threadIdx.x + blockIdx.x*blockDim.x;
-
-    bool crossed = false;
-    float crossing_time;
-
-    if (i<size) {
-        // Test for threshold crossing
-        const auto v_prev = prev_values[i];
-        const auto v      = values[index[i]];
-        const auto thresh = thresholds[i];
-
-        if (!is_crossed[i]) {
-            if (v>=thresh) {
-                // The threshold has been passed, so estimate the time using
-                // linear interpolation
-                auto pos = (thresh - v_prev)/(v - v_prev);
-                crossing_time = t_prev + pos*(t - t_prev);
-
-                is_crossed[i] = 1;
-                crossed = true;
-            }
-        }
-        else if (v<thresh) {
-            is_crossed[i]=0;
-        }
-
-        prev_values[i] = v;
-    }
-
-    if (crossed) {
-        stack.push_back({I(i), crossing_time});
-    }
-}
-
-} // namespace multicore
-} // namespace mc
-} // namespace nest
diff --git a/src/backends/fvm_multicore.hpp b/src/backends/fvm_multicore.hpp
deleted file mode 100644
index 8bddb9566490690856d54aa436409d8224d4bc4c..0000000000000000000000000000000000000000
--- a/src/backends/fvm_multicore.hpp
+++ /dev/null
@@ -1,286 +0,0 @@
-
-#include <map>
-#include <string>
-
-#include <common_types.hpp>
-#include <mechanism.hpp>
-#include <memory/memory.hpp>
-#include <memory/wrappers.hpp>
-#include <util/span.hpp>
-
-#include "stimulus_multicore.hpp"
-
-namespace nest {
-namespace mc {
-namespace multicore {
-
-struct backend {
-    /// define the real and index types
-    using value_type = double;
-    using size_type  = nest::mc::cell_lid_type;
-
-    /// define storage types
-    using array  = memory::host_vector<value_type>;
-    using iarray = memory::host_vector<size_type>;
-
-    using view       = typename array::view_type;
-    using const_view = typename array::const_view_type;
-
-    using iview       = typename iarray::view_type;
-    using const_iview = typename iarray::const_view_type;
-
-    using host_array  = array;
-    using host_iarray = iarray;
-
-    using host_view   = view;
-    using host_iview  = iview;
-
-    /// matrix state
-    struct matrix_state {
-        const_iview p;
-        const_iview cell_index;
-
-        array d;     // [μS]
-        array u;     // [μS]
-        array rhs;   // [nA]
-
-        array cv_capacitance;      // [pF]
-        array face_conductance;    // [μS]
-
-        // the invariant part of the matrix diagonal
-        array invariant_d;         // [μS]
-
-        std::size_t size() const { return p.size(); }
-
-        matrix_state() = default;
-
-        matrix_state(const_iview p, const_iview cell_index):
-            p(p), cell_index(cell_index),
-            d(size()), u(size()), rhs(size())
-        {}
-
-        matrix_state(const_iview p, const_iview cell_index, array cap, array cond):
-            p(p), cell_index(cell_index),
-            d(size()), u(size()), rhs(size()),
-            cv_capacitance(std::move(cap)),
-            face_conductance(std::move(cond))
-        {
-            auto n = d.size();
-            invariant_d = array(n, 0);
-            for (auto i: util::make_span(1u, n)) {
-                auto gij = face_conductance[i];
-
-                u[i] = -gij;
-                invariant_d[i] += gij;
-                invariant_d[p[i]] += gij;
-            }
-        }
-
-        // Assemble the matrix
-        // Afterwards the diagonal and RHS will have been set given dt, voltage and current
-        //   dt      [ms]
-        //   voltage [mV]
-        //   current [nA]
-        void assemble(value_type dt, const_view voltage, const_view current) {
-            EXPECTS(has_fvm_state());
-
-            auto n = d.size();
-            value_type factor = 1e-3/dt;
-            for (auto i: util::make_span(0u, n)) {
-                auto gi = factor*cv_capacitance[i];
-
-                d[i] = gi + invariant_d[i];
-
-                rhs[i] = gi*voltage[i] - current[i];
-            }
-        }
-
-        void solve() {
-            const size_type ncells = cell_index.size()-1;
-
-            // loop over submatrices
-            for (auto m: util::make_span(0, ncells)) {
-                auto first = cell_index[m];
-                auto last = cell_index[m+1];
-
-                // backward sweep
-                for(auto i=last-1; i>first; --i) {
-                    auto factor = u[i] / d[i];
-                    d[p[i]]   -= factor * u[i];
-                    rhs[p[i]] -= factor * rhs[i];
-                }
-                rhs[first] /= d[first];
-
-                // forward sweep
-                for(auto i=first+1; i<last; ++i) {
-                    rhs[i] -= u[i] * rhs[p[i]];
-                    rhs[i] /= d[i];
-                }
-            }
-        }
-
-        // Test if the matrix has the full state required to assemble the
-        // matrix in the fvm scheme.
-        bool has_fvm_state() const {
-            return cv_capacitance.size()>0;
-        }
-    };
-
-    //
-    // mechanism infrastructure
-    //
-    using ion = mechanisms::ion<backend>;
-
-    using mechanism = mechanisms::mechanism_ptr<backend>;
-
-    using stimulus = mechanisms::multicore::stimulus<backend>;
-
-    static mechanism make_mechanism(
-        const std::string& name,
-        view vec_v, view vec_i,
-        const std::vector<value_type>& weights,
-        const std::vector<size_type>& node_indices)
-    {
-        if (!has_mechanism(name)) {
-            throw std::out_of_range("no mechanism in database : " + name);
-        }
-
-        return mech_map_.find(name)->second(vec_v, vec_i, array(weights), iarray(node_indices));
-    }
-
-    static bool has_mechanism(const std::string& name) {
-        return mech_map_.count(name)>0;
-    }
-
-    static std::string name() {
-        return "cpu";
-    }
-
-    /// threshold crossing logic
-    /// used as part of spike detection back end
-    class threshold_watcher {
-    public:
-        /// stores a single crossing event
-        struct threshold_crossing {
-            size_type index;    // index of variable
-            value_type time;    // time of crossing
-            friend bool operator== (
-                const threshold_crossing& lhs, const threshold_crossing& rhs)
-            {
-                return lhs.index==rhs.index && lhs.time==rhs.time;
-            }
-        };
-
-        threshold_watcher() = default;
-
-        threshold_watcher(
-                const_view vals,
-                const std::vector<size_type>& indxs,
-                const std::vector<value_type>& thresh,
-                value_type t=0):
-            values_(vals),
-            index_(memory::make_const_view(indxs)),
-            thresholds_(memory::make_const_view(thresh)),
-            v_prev_(vals)
-        {
-            is_crossed_ = iarray(size());
-            reset(t);
-        }
-
-        /// Remove all stored crossings that were detected in previous calls
-        /// to the test() member function.
-        void clear_crossings() {
-            crossings_.clear();
-        }
-
-        /// Reset state machine for each detector.
-        /// Assume that the values in values_ have been set correctly before
-        /// calling, because the values are used to determine the initial state
-        void reset(value_type t=0) {
-            clear_crossings();
-            for (auto i=0u; i<size(); ++i) {
-                is_crossed_[i] = values_[index_[i]]>=thresholds_[i];
-            }
-            t_prev_ = t;
-        }
-
-        const std::vector<threshold_crossing>& crossings() const {
-            return crossings_;
-        }
-
-        /// The time at which the last test was performed
-        value_type last_test_time() const {
-            return t_prev_;
-        }
-
-        /// Tests each target for changed threshold state
-        /// Crossing events are recorded for each threshold that
-        /// is crossed since the last call to test
-        void test(value_type t) {
-            for (auto i=0u; i<size(); ++i) {
-                auto v_prev = v_prev_[i];
-                auto v      = values_[index_[i]];
-                auto thresh = thresholds_[i];
-                if (!is_crossed_[i]) {
-                    if (v>=thresh) {
-                        // the threshold has been passed, so estimate the time using
-                        // linear interpolation
-                        auto pos = (thresh - v_prev)/(v - v_prev);
-                        auto crossing_time = t_prev_ + pos*(t - t_prev_);
-                        crossings_.push_back({i, crossing_time});
-
-                        is_crossed_[i] = true;
-                    }
-                }
-                else {
-                    if (v<thresh) {
-                        is_crossed_[i] = false;
-                    }
-                }
-
-                v_prev_[i] = v;
-            }
-            t_prev_ = t;
-        }
-
-        bool is_crossed(size_type i) const {
-            return is_crossed_[i];
-        }
-
-        /// the number of threashold values that are being monitored
-        std::size_t size() const {
-            return index_.size();
-        }
-
-        /// Data type used to store the crossings.
-        /// Provided to make type-generic calling code.
-        using crossing_list =  std::vector<threshold_crossing>;
-
-    private:
-        const_view values_;
-        iarray index_;
-
-        array thresholds_;
-        value_type t_prev_;
-        array v_prev_;
-        crossing_list crossings_;
-        iarray is_crossed_;
-    };
-
-
-private:
-
-    using maker_type = mechanism (*)(view, view, array&&, iarray&&);
-    static std::map<std::string, maker_type> mech_map_;
-
-    template <template <typename> class Mech>
-    static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
-        return mechanisms::make_mechanism<Mech<backend>>
-            (vec_v, vec_i, std::move(weights), std::move(node_indices));
-    }
-};
-
-} // namespace multicore
-} // namespace mc
-} // namespace nest
-
diff --git a/src/backends/fvm_gpu.cu b/src/backends/gpu/fvm.cu
similarity index 68%
rename from src/backends/fvm_gpu.cu
rename to src/backends/gpu/fvm.cu
index d83705c64c10787ad628d5c765c3181775cc282a..d5c6debf0faf4e84852265e138a1eb8f290a33d3 100644
--- a/src/backends/fvm_gpu.cu
+++ b/src/backends/gpu/fvm.cu
@@ -1,9 +1,11 @@
-#include "fvm_gpu.hpp"
+#include "fvm.hpp"
 
 #include <mechanisms/gpu/hh.hpp>
 #include <mechanisms/gpu/pas.hpp>
 #include <mechanisms/gpu/expsyn.hpp>
 #include <mechanisms/gpu/exp2syn.hpp>
+#include <mechanisms/gpu/test_kin1.hpp>
+#include <mechanisms/gpu/test_kinlva.hpp>
 
 namespace nest {
 namespace mc {
@@ -14,7 +16,9 @@ backend::mech_map_ = {
     { "pas",     maker<mechanisms::gpu::pas::mechanism_pas> },
     { "hh",      maker<mechanisms::gpu::hh::mechanism_hh> },
     { "expsyn",  maker<mechanisms::gpu::expsyn::mechanism_expsyn> },
-    { "exp2syn", maker<mechanisms::gpu::exp2syn::mechanism_exp2syn> }
+    { "exp2syn", maker<mechanisms::gpu::exp2syn::mechanism_exp2syn> },
+    { "test_kin1", maker<mechanisms::gpu::test_kin1::mechanism_test_kin1> },
+    { "test_kinlva", maker<mechanisms::gpu::test_kinlva::mechanism_test_kinlva> }
 };
 
-} // namespace multicore
+} // namespace gpu
diff --git a/src/backends/gpu/fvm.hpp b/src/backends/gpu/fvm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..80a8de85f8a1ba59e821fbe0cfd3e46637ea6386
--- /dev/null
+++ b/src/backends/gpu/fvm.hpp
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <map>
+#include <string>
+
+#include <common_types.hpp>
+#include <mechanism.hpp>
+#include <memory/memory.hpp>
+
+#include "matrix_state_interleaved.hpp"
+#include "matrix_state_flat.hpp"
+#include "stimulus.hpp"
+#include "threshold_watcher.hpp"
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+struct backend {
+    /// define the real and index types
+    using value_type = double;
+    using size_type  = nest::mc::cell_lid_type;
+
+    /// define storage types
+    using array  = memory::device_vector<value_type>;
+    using iarray = memory::device_vector<size_type>;
+
+    using view       = typename array::view_type;
+    using const_view = typename array::const_view_type;
+
+    using iview       = typename iarray::view_type;
+    using const_iview = typename iarray::const_view_type;
+
+    using host_array  = typename memory::host_vector<value_type>;
+    using host_iarray = typename memory::host_vector<size_type>;
+
+    using host_view   = typename host_array::view_type;
+    using host_iview  = typename host_iarray::const_view_type;
+
+    static std::string name() {
+        return "gpu";
+    }
+
+    // matrix back end implementation
+    using matrix_state = matrix_state_interleaved<value_type, size_type>;
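+    // (A flat matrix state implementation, matrix_state_flat, is included
+    // above as well; the interleaved layout is the one used by this backend.)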
+
+    // mechanism infrastructure
+    using ion = mechanisms::ion<backend>;
+
+    using mechanism = mechanisms::mechanism_ptr<backend>;
+
+    using stimulus = mechanisms::gpu::stimulus<backend>;
+
+    static mechanism make_mechanism(
+        const std::string& name,
+        view vec_v, view vec_i,
+        const std::vector<value_type>& weights,
+        const std::vector<size_type>& node_indices)
+    {
+        if (!has_mechanism(name)) {
+            throw std::out_of_range("no mechanism in database : " + name);
+        }
+
+        return mech_map_.find(name)->
+            second(vec_v, vec_i, memory::make_const_view(weights), memory::make_const_view(node_indices));
+    }
+
+    static bool has_mechanism(const std::string& name) {
+        return mech_map_.count(name)>0;
+    }
+
+    using threshold_watcher =
+        nest::mc::gpu::threshold_watcher<value_type, size_type>;
+
+private:
+
+    using maker_type = mechanism (*)(view, view, array&&, iarray&&);
+    static std::map<std::string, maker_type> mech_map_;
+
+    template <template <typename> class Mech>
+    static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
+        return mechanisms::make_mechanism<Mech<backend>>
+            (vec_v, vec_i, std::move(weights), std::move(node_indices));
+    }
+};
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu_intrinsics.hpp b/src/backends/gpu/intrinsics.hpp
similarity index 100%
rename from src/backends/gpu_intrinsics.hpp
rename to src/backends/gpu/intrinsics.hpp
diff --git a/src/backends/gpu/kernels/assemble_matrix.hpp b/src/backends/gpu/kernels/assemble_matrix.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8524846859e3e657a6de13cfa5d71beccac35946
--- /dev/null
+++ b/src/backends/gpu/kernels/assemble_matrix.hpp
@@ -0,0 +1,104 @@
+#pragma once
+
+#include "detail.hpp"
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+/// GPU implementation of Hines matrix assembly
+/// Flat layout
+/// For a given time step size dt
+///     - use the precomputed invariant_d and cv_capacitance values to
+///       construct the diagonal of the symmetric Hines matrix (the
+///       off-diagonal u is dt-invariant and precomputed elsewhere).
+///     - compute the RHS of the linear system to solve
+template <typename T, typename I>
+__global__
+void assemble_matrix_flat(
+        T* d, T* rhs, const T* invariant_d,
+        const T* voltage, const T* current, const T* cv_capacitance,
+        T dt, unsigned n)
+{
+    const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
+
+    // The 1e-3 is a constant of proportionality required to ensure that the
+    // conductance (gi) values have units μS (micro-Siemens).
+    // See the model documentation in docs/model for more information.
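+    // Specifically: 1 pF/ms = 1e-9 S = 1e-3 μS, so gi = 1e-3*cv_capacitance/dt
+    // below is expressed in μS.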
+    T factor = 1e-3/dt;
+    if (tid<n) {
+        auto gi = factor * cv_capacitance[tid];
+        d[tid] = gi + invariant_d[tid];
+        rhs[tid] = gi*voltage[tid] - current[tid];
+    }
+}
+
+/// GPU implementation of Hines matrix assembly
+/// Interleaved layout
+/// For a given time step size dt
+///     - use the precomputed invariant_d and cv_capacitance values to
+///       construct the diagonal of the symmetric Hines matrix (the
+///       off-diagonal u is dt-invariant and precomputed elsewhere).
+///     - compute the RHS of the linear system to solve
+template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned Threads>
+__global__
+void assemble_matrix_interleaved(
+        T* d,
+        T* rhs,
+        const T* invariant_d,
+        const T* voltage,
+        const T* current,
+        const T* cv_capacitance,
+        const I* sizes,
+        const I* starts,
+        T dt, unsigned padded_size, unsigned num_mtx)
+{
+    static_assert(BlockWidth*LoadWidth==Threads,
+        "number of threads must equal number of values to process per block");
+    __shared__ T buffer_v[Threads];
+    __shared__ T buffer_i[Threads];
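+    // Stage voltage and current through shared memory: each group of LoadWidth
+    // threads reads one matrix's values contiguously from the flat arrays, and
+    // the block then consumes them in interleaved order via blk_pos.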
+
+    const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
+    const unsigned lid = threadIdx.x;
+
+    const unsigned mtx_id   = tid/LoadWidth;
+    const unsigned mtx_lane = tid - mtx_id*LoadWidth;
+
+    const unsigned blk_id   = tid/(BlockWidth*LoadWidth);
+    const unsigned blk_row  = lid/BlockWidth;
+    const unsigned blk_lane = lid - blk_row*BlockWidth;
+
+    const unsigned blk_pos  = LoadWidth*blk_lane + blk_row;
+
+    const bool do_load  = mtx_id<num_mtx;
+
+    unsigned load_pos  = do_load? starts[mtx_id] + mtx_lane     : 0;
+    const unsigned end = do_load? starts[mtx_id] + sizes[mtx_id]: 0;
+    unsigned store_pos = blk_id*BlockWidth*padded_size + (blk_row*BlockWidth + blk_lane);
+
+    const unsigned max_size = sizes[0];
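+    // sizes is expected to be sorted in descending order (see
+    // matrix_state_interleaved), so sizes[0] is the largest matrix size.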
+
+    // The 1e-3 is a constant of proportionality required to ensure that the
+    // conductance (gi) values have units μS (micro-Siemens).
+    // See the model documentation in docs/model for more information.
+    T factor = 1e-3/dt;
+    for (unsigned j=0u; j<max_size; j+=LoadWidth) {
+        if (do_load && load_pos<end) {
+            buffer_v[lid] = voltage[load_pos];
+            buffer_i[lid] = current[load_pos];
+        }
+
+        __syncthreads();
+
+        if (j+blk_row<padded_size) {
+            const auto gi = factor * cv_capacitance[store_pos];
+            d[store_pos]   = gi + invariant_d[store_pos];
+            rhs[store_pos] = gi*buffer_v[blk_pos] - buffer_i[blk_pos];
+        }
+
+        store_pos += LoadWidth*BlockWidth;
+        load_pos  += LoadWidth;
+    }
+}
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/kernels/detail.hpp b/src/backends/gpu/kernels/detail.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8f3993faee6d47af106d4f05a1e33783b1befc18
--- /dev/null
+++ b/src/backends/gpu/kernels/detail.hpp
@@ -0,0 +1,82 @@
+#pragma once
+
+#include <cstdint>
+#include <cfloat>
+#include <climits>
+
+#include <iostream>
+#include <limits>
+#include <string>
+#include <vector>
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+namespace impl {
+// Number of matrices per block in block-interleaved storage
+__host__ __device__
+constexpr inline unsigned block_dim() {
+    return 32u;
+}
+
+// The number of threads per matrix in the interleave and reverse-interleave
+// operations.
+__host__ __device__
+constexpr inline unsigned load_width() {
+    return 32u;
+}
+
+// The alignment of matrices inside the block-interleaved storage.
+__host__ __device__
+constexpr inline unsigned matrix_padding() {
+    return load_width();
+}
+
+// Number of threads per warp
+// This has always been 32; however, it may change in future NVIDIA GPUs.
+__host__ __device__
+constexpr inline unsigned threads_per_warp() {
+    return 32u;
+}
+
+// The minimum number of bins of dimension block_size required to store n
+// values.
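+// e.g. block_count(129, 128) == 2: 129 values require two blocks of 128.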
+__host__ __device__
+constexpr inline unsigned block_count(unsigned n, unsigned block_size) {
+    return (n+block_size-1)/block_size;
+}
+
+// The smallest size of a buffer required to store n items such that the
+// buffer size is a multiple of block_dim.
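+// e.g. padded_size(5, 4) == 8 and padded_size(8, 4) == 8.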
+constexpr inline unsigned padded_size(unsigned n, unsigned block_dim) {
+    return n%block_dim ? n+block_dim-(n%block_dim): n;
+}
+
+// Placeholder values used to mark padded locations in data structures that
+// use padding. Using such markers makes it easier to test that padding is
+// performed correctly.
+template <typename T> __host__ __device__ constexpr T npos();
+template <> __host__ __device__ constexpr char npos<char>() { return CHAR_MAX; }
+template <> __host__ __device__ constexpr unsigned char npos<unsigned char>() { return UCHAR_MAX; }
+template <> __host__ __device__ constexpr short npos<short>() { return SHRT_MAX; }
+template <> __host__ __device__ constexpr int npos<int>() { return INT_MAX; }
+template <> __host__ __device__ constexpr long npos<long>() { return LONG_MAX; }
+template <> __host__ __device__ constexpr float npos<float>() { return FLT_MAX; }
+template <> __host__ __device__ constexpr double npos<double>() { return DBL_MAX; }
+template <> __host__ __device__ constexpr unsigned short npos<unsigned short>() { return USHRT_MAX; }
+template <> __host__ __device__ constexpr unsigned int npos<unsigned int>() { return UINT_MAX; }
+template <> __host__ __device__ constexpr unsigned long npos<unsigned long>() { return ULONG_MAX; }
+template <> __host__ __device__ constexpr long long npos<long long>() { return LLONG_MAX; }
+
+// test if value v is npos
+template <typename T>
+__host__ __device__
+constexpr bool is_npos(T v) {
+    return v == npos<T>();
+}
+
+} // namespace impl
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/kernels/interleave.hpp b/src/backends/gpu/kernels/interleave.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3762368af4c2575f3e7e25b11467837cf3264872
--- /dev/null
+++ b/src/backends/gpu/kernels/interleave.hpp
@@ -0,0 +1,134 @@
+#pragma once
+
+#include "detail.hpp"
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+///////////////////////////////////////////////////////////////////////////////
+// For more information about the interleaved and flat storage formats for
+// Hines matrices, see src/backends/matrix_storage.md
+///////////////////////////////////////////////////////////////////////////////
+
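+// Illustration with BlockWidth = 2 and padded_size = 3 (production dimensions
+// are larger; see kernels/detail.hpp): matrices A = [a0 a1 a2] and B = [b0 b1]
+// are stored interleaved as
+//
+//     a0 b0 a1 b1 a2 *
+//
+// where * marks a padding entry (impl::npos). Entry i+1 of a matrix lies
+// BlockWidth positions after entry i, so threads working on consecutive
+// matrices touch consecutive memory locations.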
+
+// Data in a vector is to be interleaved into blocks of width BlockWidth.
+// The kernel assigns LoadWidth threads to each lane in the block.
+//
+// Note that all indexes can reasonably be represented by an unsigned 32-bit
+// integer, so we use unsigned explicitly.
+template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned Threads>
+__global__
+void flat_to_interleaved(
+    const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec)
+{
+    static_assert(BlockWidth*LoadWidth==Threads,
+        "number of threads must equal number of values to process per block");
+
+    __shared__ T buffer[Threads];
+
+    const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
+    const unsigned lid = threadIdx.x;
+
+    const unsigned mtx_id   = tid/LoadWidth;
+    const unsigned mtx_lane = tid - mtx_id*LoadWidth;
+
+    const unsigned blk_id   = tid/(BlockWidth*LoadWidth);
+    const unsigned blk_row  = lid/BlockWidth;
+    const unsigned blk_lane = lid - blk_row*BlockWidth;
+
+    const unsigned blk_pos  = LoadWidth*blk_lane + blk_row;
+
+    const bool do_load  = mtx_id<num_vec;
+
+    // only threads that participate in loading access starts and sizes arrays
+    unsigned load_pos  = do_load? starts[mtx_id] + mtx_lane     : 0u;
+    const unsigned end = do_load? starts[mtx_id] + sizes[mtx_id]: 0u;
+    unsigned store_pos = blk_id*BlockWidth*padded_size + (blk_row*BlockWidth + blk_lane);
+
+    for (unsigned i=0u; i<padded_size; i+=LoadWidth) {
+        auto loaded = impl::npos<T>();
+        if (do_load && load_pos<end) {
+            loaded = in[load_pos];
+        }
+        buffer[lid] = loaded;
+        __syncthreads();
+        if (i+blk_row<padded_size) {
+            out[store_pos] = buffer[blk_pos];
+        }
+        load_pos  += LoadWidth;
+        store_pos += LoadWidth*BlockWidth;
+    }
+}
+
+// Note that all indexes can reasonably be represented by an unsigned 32-bit
+// integer, so we use unsigned explicitly.
+template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned Threads>
+__global__
+void interleaved_to_flat(
+    const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec)
+{
+    static_assert(BlockWidth*LoadWidth==Threads,
+        "number of threads must equal number of values to process per block");
+
+    __shared__ T buffer[Threads];
+
+    const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
+    const unsigned lid = threadIdx.x;
+
+    const unsigned mtx_id   = tid/LoadWidth;
+    const unsigned mtx_lane = tid - mtx_id*LoadWidth;
+
+    const unsigned blk_id   = tid/(BlockWidth*LoadWidth);
+    const unsigned blk_row  = lid/BlockWidth;
+    const unsigned blk_lane = lid - blk_row*BlockWidth;
+
+    const unsigned blk_pos  = LoadWidth*blk_lane + blk_row;
+
+    const bool do_store = mtx_id<num_vec;
+
+    // only threads that participate in storing access starts and sizes arrays
+    unsigned store_pos = do_store? starts[mtx_id] + mtx_lane     : 0u;
+    const unsigned end = do_store? starts[mtx_id] + sizes[mtx_id]: 0u;
+    unsigned load_pos  = blk_id*BlockWidth*padded_size + (blk_row*BlockWidth + blk_lane);
+
+    for (unsigned i=0u; i<padded_size; i+=LoadWidth) {
+        auto loaded = impl::npos<T>();
+        if (i+blk_row<padded_size) {
+            loaded = in[load_pos];
+        }
+        buffer[blk_pos] = loaded;
+        __syncthreads();
+        if (do_store && store_pos<end) {
+            out[store_pos] = buffer[lid];
+        }
+        load_pos  += LoadWidth*BlockWidth;
+        store_pos += LoadWidth;
+    }
+}
+
+// host side wrapper for the flat to interleaved operation
+template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth>
+void flat_to_interleaved(
+    const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec)
+{
+    constexpr unsigned Threads = BlockWidth*LoadWidth;
+    const unsigned blocks = impl::block_count(num_vec, BlockWidth);
+
+    flat_to_interleaved<T, I, BlockWidth, LoadWidth, Threads>
+        <<<blocks, Threads>>> (in, out, sizes, starts, padded_size, num_vec);
+}
+
+// host side wrapper for the interleave to flat operation
+template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth>
+void interleaved_to_flat(
+    const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec)
+{
+    constexpr unsigned Threads = BlockWidth*LoadWidth;
+    const unsigned blocks = impl::block_count(num_vec, BlockWidth);
+
+    interleaved_to_flat<T, I, BlockWidth, LoadWidth, Threads>
+        <<<blocks, Threads>>> (in, out, sizes, starts, padded_size, num_vec);
+}
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/kernels/solve_matrix.hpp b/src/backends/gpu/kernels/solve_matrix.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2cfa5f105f54f3a0e0e9c357f8dc3ad6ea7ee4b5
--- /dev/null
+++ b/src/backends/gpu/kernels/solve_matrix.hpp
@@ -0,0 +1,78 @@
+#pragma once
+
+#include "detail.hpp"
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+/// GPU implementation of Hines Matrix solver.
+/// Flat format
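+/// One CUDA thread per matrix.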
+template <typename T, typename I>
+__global__
+void solve_matrix_flat(
+    T* rhs, T* d, const T* u, const I* p, const I* cell_index, int num_mtx)
+{
+    auto tid = threadIdx.x + blockDim.x*blockIdx.x;
+
+    if (tid<num_mtx) {
+        // get range of this thread's cell matrix
+        const auto first = cell_index[tid];
+        const auto last  = cell_index[tid+1];
+
+        // backward sweep
+        for(auto i=last-1; i>first; --i) {
+            auto factor = u[i] / d[i];
+            d[p[i]]   -= factor * u[i];
+            rhs[p[i]] -= factor * rhs[i];
+        }
+        rhs[first] /= d[first];
+
+        // forward sweep
+        for(auto i=first+1; i<last; ++i) {
+            rhs[i] -= u[i] * rhs[p[i]];
+            rhs[i] /= d[i];
+        }
+    }
+}
+
+/// GPU implementation of Hines Matrix solver.
+/// Block-interleaved format
+template <typename T, typename I, int BlockWidth>
+__global__
+void solve_matrix_interleaved(
+    T* rhs, T* d, const T* u, const I* p, const I* sizes, int padded_size, int num_mtx)
+{
+    auto tid = threadIdx.x + blockDim.x*blockIdx.x;
+
+    if (tid<num_mtx) {
+        const auto block       = tid/BlockWidth;
+        const auto block_start = block*BlockWidth;
+        const auto block_lane  = tid - block_start;
+
+        // get range of this thread's cell matrix
+        const auto first    = block_start*padded_size + block_lane;
+        const auto last     = first + BlockWidth*(sizes[tid]-1);
+        const auto last_max = first + BlockWidth*(sizes[block_start]-1);
+
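+        // The loop bound is taken from the first matrix in the block, which
+        // is the largest (sizes are sorted in descending order); the i<=last
+        // guard masks lanes whose matrix is shorter. Stepping by BlockWidth
+        // keeps each iteration's accesses contiguous across the block's lanes.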
+        // backward sweep
+        for(auto i=last_max; i>first; i-=BlockWidth) {
+            if (i<=last) {
+                auto factor = u[i] / d[i];
+                d[p[i]]   -= factor * u[i];
+                rhs[p[i]] -= factor * rhs[i];
+            }
+        }
+        rhs[first] /= d[first];
+
+        // forward sweep
+        for(auto i=first+BlockWidth; i<=last; i+=BlockWidth) {
+            rhs[i] -= u[i] * rhs[p[i]];
+            rhs[i] /= d[i];
+        }
+    }
+}
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/kernels/test_thresholds.hpp b/src/backends/gpu/kernels/test_thresholds.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..946030704607b87bad5531c8530f671fe52a48d8
--- /dev/null
+++ b/src/backends/gpu/kernels/test_thresholds.hpp
@@ -0,0 +1,61 @@
+#pragma once
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+/// Kernel used to test for threshold crossings.
+/// params:
+///     t       : current time (ms)
+///     t_prev  : time of last test (ms)
+///     size    : number of values to test
+///     is_crossed  : crossing state at time t_prev (true or false)
+///     prev_values : values at sample points (see index) sampled at t_prev
+///     index      : index with locations in values to test for crossing
+///     values     : values at t_prev
+///     thresholds : threshold values to watch for crossings
+template <typename T, typename I, typename Stack>
+__global__
+void test_thresholds(
+    float t, float t_prev, int size,
+    Stack& stack,
+    I* is_crossed, T* prev_values,
+    const I* index, const T* values, const T* thresholds)
+{
+    int i = threadIdx.x + blockIdx.x*blockDim.x;
+
+    bool crossed = false;
+    float crossing_time;
+
+    if (i<size) {
+        // Test for threshold crossing
+        const auto v_prev = prev_values[i];
+        const auto v      = values[index[i]];
+        const auto thresh = thresholds[i];
+
+        if (!is_crossed[i]) {
+            if (v>=thresh) {
+                // The threshold has been passed, so estimate the time using
+                // linear interpolation
+                auto pos = (thresh - v_prev)/(v - v_prev);
+                crossing_time = t_prev + pos*(t - t_prev);
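+                // e.g. v_prev=-50, v=-30, thresh=-40 gives pos=0.5, placing
+                // the crossing halfway between t_prev and t.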
+
+                is_crossed[i] = 1;
+                crossed = true;
+            }
+        }
+        else if (v<thresh) {
+            is_crossed[i]=0;
+        }
+
+        prev_values[i] = v;
+    }
+
+    if (crossed) {
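+    // Record the crossing. Stack is assumed to provide a push_back that is
+    // safe for concurrent calls from multiple threads (cf. gpu_stack).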
+        stack.push_back({I(i), crossing_time});
+    }
+}
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/matrix_state_flat.hpp b/src/backends/gpu/matrix_state_flat.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d214079f7f5841a43c6f6c52cf9992e3e0a635e3
--- /dev/null
+++ b/src/backends/gpu/matrix_state_flat.hpp
@@ -0,0 +1,118 @@
+#pragma once
+
+#include <vector>
+
+#include <memory/memory.hpp>
+#include <util/span.hpp>
+#include <util/rangeutil.hpp>
+
+#include "kernels/solve_matrix.hpp"
+#include "kernels/assemble_matrix.hpp"
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+/// matrix state
+template <typename T, typename I>
+struct matrix_state_flat {
+    using value_type = T;
+    using size_type = I;
+
+    using array  = memory::device_vector<value_type>;
+    using iarray = memory::device_vector<size_type>;
+
+    using view = typename array::view_type;
+    using const_view = typename array::const_view_type;
+
+    iarray parent_index;
+    iarray cell_index;
+
+    array d;     // [μS]
+    array u;     // [μS]
+    array rhs;   // [nA]
+
+    array cv_capacitance;      // [pF]
+    array face_conductance;    // [μS]
+
+    // the invariant part of the matrix diagonal
+    array invariant_d;         // [μS]
+
+    // interface for exposing the solution to the outside world
+    view solution;
+
+    matrix_state_flat() = default;
+
+    matrix_state_flat(const std::vector<size_type>& p,
+                 const std::vector<size_type>& cell_idx,
+                 const std::vector<value_type>& cv_cap,
+                 const std::vector<value_type>& face_cond):
+        parent_index(memory::make_const_view(p)),
+        cell_index(memory::make_const_view(cell_idx)),
+        d(p.size()),
+        u(p.size()),
+        rhs(p.size()),
+        cv_capacitance(memory::make_const_view(cv_cap))
+    {
+        EXPECTS(cv_cap.size() == size());
+        EXPECTS(face_cond.size() == size());
+        EXPECTS(cell_idx.back() == size());
+        EXPECTS(cell_idx.size() > 2u);
+
+        using memory::make_const_view;
+
+        auto n = d.size();
+        std::vector<value_type> invariant_d_tmp(n, 0);
+        std::vector<value_type> u_tmp(n, 0);
+
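+        // Each face conductance gij contributes -gij to the off-diagonal
+        // entry u[i], and +gij to the diagonal entries of both CV i and its
+        // parent p[i]; this contribution is independent of dt.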
+        for (auto i: util::make_span(1u, n)) {
+            auto gij = face_cond[i];
+
+            u_tmp[i] = -gij;
+            invariant_d_tmp[i] += gij;
+            invariant_d_tmp[p[i]] += gij;
+        }
+        invariant_d = make_const_view(invariant_d_tmp);
+        u = make_const_view(u_tmp);
+
+        solution = rhs;
+    }
+
+    // Assemble the matrix
+    // Afterwards the diagonal and RHS will have been set given dt, voltage and current
+    //   dt      [ms]
+    //   voltage [mV]
+    //   current [nA]
+    void assemble(value_type dt, const_view voltage, const_view current) {
+        // determine the grid dimensions for the kernel
+        auto const n = voltage.size();
+        auto const block_dim = 128;
+        auto const grid_dim = impl::block_count(n, block_dim);
+
+        assemble_matrix_flat<value_type, size_type><<<grid_dim, block_dim>>> (
+            d.data(), rhs.data(), invariant_d.data(), voltage.data(),
+            current.data(), cv_capacitance.data(), dt, size());
+    }
+
+    void solve() {
+        // determine the grid dimensions for the kernel
+        auto const block_dim = 128;
+        auto const grid_dim = impl::block_count(num_matrices(), block_dim);
+
+        // perform solve on gpu
+        solve_matrix_flat<value_type, size_type><<<grid_dim, block_dim>>> (
+            rhs.data(), d.data(), u.data(), parent_index.data(),
+            cell_index.data(), num_matrices());
+    }
+
+    std::size_t size() const {
+        return parent_index.size();
+    }
+
+private:
+    unsigned num_matrices() const {
+        return cell_index.size()-1;
+    }
+};
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/matrix_state_interleaved.hpp b/src/backends/gpu/matrix_state_interleaved.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2f1888395dbaf7624edef3813ad1a9984170e697
--- /dev/null
+++ b/src/backends/gpu/matrix_state_interleaved.hpp
@@ -0,0 +1,256 @@
+#pragma once
+
+#include <algorithm>
+#include <climits>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+#include <memory/memory.hpp>
+#include <util/span.hpp>
+#include <util/rangeutil.hpp>
+
+#include "kernels/solve_matrix.hpp"
+#include "kernels/assemble_matrix.hpp"
+#include "kernels/interleave.hpp"
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+// A helper that performs the interleave operation on host memory.
+template <typename T, typename I>
+std::vector<T> flat_to_interleaved(
+        const std::vector<T>& in,
+        const std::vector<I>& sizes,
+        const std::vector<I>& starts,
+        unsigned block_width, unsigned num_vec, unsigned padded_length)
+{
+    auto num_blocks = impl::block_count(num_vec, block_width);
+    std::vector<T> out(num_blocks*block_width*padded_length, impl::npos<T>());
+    for (auto mtx: util::make_span(0u, num_vec)) {
+        auto block = mtx/block_width;
+        auto lane  = mtx%block_width;
+
+        auto len = sizes[mtx];
+        auto src = starts[mtx];
+        auto dst = block*(block_width*padded_length) + lane;
+        for (auto i: util::make_span(0, len)) {
+            out[dst] = in[src+i];
+            dst += block_width;
+        }
+    }
+    return out;
+}
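+
+// flat_to_interleaved above is used by matrix_state_interleaved to interleave
+// the dt-invariant matrix components on the host before copying to the device.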
+
+/// matrix state
+template <typename T, typename I>
+struct matrix_state_interleaved {
+    using value_type = T;
+    using size_type = I;
+
+    using array  = memory::device_vector<value_type>;
+    using iarray = memory::device_vector<size_type>;
+
+    using const_view = typename array::const_view_type;
+
+    // Permutation and index information required for forward and backward
+    // interleave-permutation of vectors.
+
+    // size of each matrix (after permutation, in descending size order)
+    iarray matrix_sizes;
+    // start values corresponding to matrix i in the external storage
+    iarray matrix_index;
+
+    // Storage for the matrix and parent index in interleaved format.
+    // Includes the cv_capacitance, which is required for matrix assembly.
+
+    iarray parent_index;
+    array d;   // [μS]
+    array u;   // [μS]
+    array rhs; // [nA]
+
+    // required for matrix assembly
+    array cv_capacitance; // [pF]
+
+    // the invariant part of the matrix diagonal
+    array invariant_d;    // [μS]
+
+    // the length of a vector required to store values for one
+    // matrix with padding
+    unsigned padded_size;
+
+    //  Storage for the solution in un-interleaved (flat) format.
+    //  Held for passing back to the caller; it must be refreshed after each
+    //  call to the solve() method.
+    array solution;
+
+    // default constructor
+    matrix_state_interleaved() = default;
+
+    // Construct the matrix state for a set of matrices defined by parent_index p.
+    // The matrix solver stores the matrices in an "interleaved" structure for
+    // an efficient solve, which requires a significant amount of precomputing
+    // of indexes and data structures in the constructor.
+    //  cv_cap      // [pF]
+    //  face_cond   // [μS]
+    matrix_state_interleaved(const std::vector<size_type>& p,
+                 const std::vector<size_type>& cell_idx,
+                 const std::vector<value_type>& cv_cap,
+                 const std::vector<value_type>& face_cond)
+    {
+        EXPECTS(cv_cap.size()    == p.size());
+        EXPECTS(face_cond.size() == p.size());
+        EXPECTS(cell_idx.back()  == p.size());
+
+        // Just because you never know.
+        EXPECTS(cell_idx.size() <= UINT_MAX);
+
+        using util::make_span;
+
+        // Convenience for commonly used type in this routine.
+        using svec = std::vector<size_type>;
+
+        //
+        // Sort matrices in descending order of size.
+        //
+
+        // Find the size of each matrix.
+        const auto num_mtx = cell_idx.size()-1;
+        svec sizes;
+        for (auto it=cell_idx.begin()+1; it!=cell_idx.end(); ++it) {
+            sizes.push_back(*it - *(it-1));
+        }
+
+        // Find permutations and sort indexes/sizes.
+        svec perm(num_mtx);
+        std::iota(perm.begin(), perm.end(), 0);
+        // Calculate the permutation that sorts the matrices by ascending
+        // size, then reverse it to put them in descending order.
+        util::stable_sort_by(perm, [&sizes](size_type i){ return sizes[i]; });
+        std::reverse(perm.begin(), perm.end());
+
+        // TODO: refactor to be less verbose with permutation_view
+        svec sizes_p;
+        for (auto i: make_span(0, num_mtx)) {
+            sizes_p.push_back(sizes[perm[i]]);
+        }
+        svec cell_index_p;
+        for (auto i: make_span(0, num_mtx)) {
+            cell_index_p.push_back(cell_idx[perm[i]]);
+        }
+
+        //
+        // Calculate dimensions required to store matrices.
+        //
+        using impl::block_dim;
+        using impl::matrix_padding;
+
+        // To start, take the simplest approach of assuming that all matrices
+        // are stored in blocks of the same dimension, padded_size.
+        padded_size = impl::padded_size(sizes_p[0], matrix_padding());
+        const auto num_blocks = impl::block_count(num_mtx, block_dim());
+
+        const auto total_storage = num_blocks*block_dim()*padded_size;
+
+        // calculate the interleaved and permuted p vector
+        constexpr auto npos = std::numeric_limits<size_type>::max();
+        std::vector<size_type> p_tmp(total_storage, npos);
+        for (auto mtx: make_span(0, num_mtx)) {
+            auto block = mtx/block_dim();
+            auto lane  = mtx%block_dim();
+
+            auto len = sizes_p[mtx];
+            auto src = cell_index_p[mtx];
+            auto dst = block*(block_dim()*padded_size) + lane;
+            for (auto i: make_span(0, len)) {
+                // The p indexes are always relative to the start of the p vector.
+                // Adding dst and subtracting src converts the original offset
+                // to the new padded and permuted offset.
+                p_tmp[dst+block_dim()*i] = dst + block_dim()*(p[src+i]-src);
+            }
+        }
+
+        d   = array(total_storage);
+        u   = array(total_storage);
+        rhs = array(total_storage);
+        parent_index = memory::make_const_view(p_tmp);
+
+        //
+        //  Calculate the invariant part of the matrix diagonal and the
+        //  upper diagonal on the host, then copy to the device.
+        //
+
+        std::vector<value_type> invariant_d_tmp(p.size(), 0);
+        std::vector<value_type> u_tmp(p.size(), 0);
+        auto face_conductance_tmp = memory::on_host(face_cond);
+        for (auto i: util::make_span(1u, p.size())) {
+            auto gij = face_conductance_tmp[i];
+
+            u_tmp[i] = -gij;
+            invariant_d_tmp[i] += gij;
+            invariant_d_tmp[p[i]] += gij;
+        }
+
+        // Helper that converts to interleaved format on the host, then copies to
+        // device memory, for use as an rvalue in an assignment to a device vector.
+        auto interleave = [&] (const std::vector<T>& x) {
+            return memory::on_gpu(
+                flat_to_interleaved(x, sizes_p, cell_index_p, block_dim(), num_mtx, padded_size));
+        };
+        u           = interleave(u_tmp);
+        invariant_d = interleave(invariant_d_tmp);
+        cv_capacitance = interleave(cv_cap);
+
+        matrix_sizes = memory::make_const_view(sizes_p);
+        matrix_index = memory::make_const_view(cell_index_p);
+
+        // Allocate space for storing the un-interleaved solution.
+        solution = array(p.size());
+    }
+
+    // Assemble the matrix
+    // Afterwards the diagonal and RHS will have been set given dt, voltage and current
+    //   dt      [ms]
+    //   voltage [mV]
+    //   current [nA]
+    void assemble(value_type dt, const_view voltage, const_view current) {
+        constexpr auto bd = impl::block_dim();
+        constexpr auto lw = impl::load_width();
+        constexpr auto block_dim = bd*lw;
+
+        // Launch one thread per matrix per load-width lane: num_matrices()*lw in total.
+        const auto num_blocks = impl::block_count(num_matrices()*lw, block_dim);
+
+        assemble_matrix_interleaved<value_type, size_type, bd, lw, block_dim>
+            <<<num_blocks, block_dim>>>
+            (d.data(), rhs.data(), invariant_d.data(),
+             voltage.data(), current.data(), cv_capacitance.data(),
+             matrix_sizes.data(), matrix_index.data(),
+             dt, padded_matrix_size(), num_matrices());
+    }
+
+    void solve() {
+        // Perform the Hines solve.
+        auto const grid_dim = impl::block_count(num_matrices(), impl::block_dim());
+        solve_matrix_interleaved<value_type, size_type, impl::block_dim()>
+            <<<grid_dim, impl::block_dim()>>>
+            (rhs.data(), d.data(), u.data(), parent_index.data(), matrix_sizes.data(),
+             padded_matrix_size(), num_matrices());
+
+        // Copy the solution from interleaved to front end storage.
+        interleaved_to_flat<value_type, size_type, impl::block_dim(), impl::load_width()>
+            (rhs.data(), solution.data(), matrix_sizes.data(), matrix_index.data(),
+             padded_matrix_size(), num_matrices());
+    }
+
+private:
+
+    // The number of matrices stored in the matrix state.
+    unsigned num_matrices() const {
+        return matrix_sizes.size();
+    }
+
+    // The full padded matrix size
+    unsigned padded_matrix_size() const {
+        return padded_size;
+    }
+};
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu_stack.hpp b/src/backends/gpu/stack.hpp
similarity index 97%
rename from src/backends/gpu_stack.hpp
rename to src/backends/gpu/stack.hpp
index 7767b9e535d22aa05d23b7c7bd18690028f95ca0..2cad1d10ba55b4189c7a438e3bed4401d88da2c4 100644
--- a/src/backends/gpu_stack.hpp
+++ b/src/backends/gpu/stack.hpp
@@ -20,7 +20,7 @@ namespace gpu {
 // It is designed to be initialized empty with a given capacity on the host,
 // updated by device kernels, and periodically read and reset from the host side.
 template <typename T>
-class gpu_stack {
+class stack {
     using value_type = T;
     using allocator = memory::managed_allocator<value_type>;
 
@@ -37,13 +37,13 @@ class gpu_stack {
 
 public:
 
-    gpu_stack(unsigned capacity):
+    stack(unsigned capacity):
         capacity_(capacity), size_(0u)
     {
         data_ = allocator().allocate(capacity_);
     }
 
-    ~gpu_stack() {
+    ~stack() {
         allocator().deallocate(data_, capacity_);
     }
 
diff --git a/src/backends/stimulus_gpu.hpp b/src/backends/gpu/stimulus.hpp
similarity index 87%
rename from src/backends/stimulus_gpu.hpp
rename to src/backends/gpu/stimulus.hpp
index 3d2ac907fc89e9d2bd0ac188c980418c870b2706..6cc41703dba92be14fd0a87cad59ccf20da0691f 100644
--- a/src/backends/stimulus_gpu.hpp
+++ b/src/backends/gpu/stimulus.hpp
@@ -7,24 +7,14 @@
 #include <algorithms.hpp>
 #include <util/pprintf.hpp>
 
+#include "intrinsics.hpp"
+
 namespace nest{
 namespace mc{
 namespace mechanisms {
 namespace gpu {
 
 namespace kernels {
-    __device__
-    inline double atomicAdd(double* address, double val) {
-        using I = unsigned long long int;
-        I* address_as_ull = (I*)address;
-        I old = *address_as_ull, assumed;
-        do {
-            assumed = old;
-            old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
-        } while (assumed != old);
-        return __longlong_as_double(old);
-    }
-
     template <typename T, typename I>
     __global__
     void stim_current(
@@ -40,7 +30,7 @@ namespace kernels {
             if (t>=delay[i] && t<(delay[i]+duration[i])) {
                 // use subtraction because the electrode currents are specified
                 // in terms of current into the compartment
-                atomicAdd(current+node_index[i], -amplitude[i]);
+                cuda_atomic_add(current+node_index[i], -amplitude[i]);
             }
         }
     }
diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e4624c077239eff3a5a74cda42c60939b1ea8ae8
--- /dev/null
+++ b/src/backends/gpu/threshold_watcher.hpp
@@ -0,0 +1,148 @@
+#pragma once
+
+#include <vector>
+
+#include <common_types.hpp>
+#include <memory/memory.hpp>
+#include <memory/managed_ptr.hpp>
+#include <util/span.hpp>
+
+#include "stack.hpp"
+#include "kernels/test_thresholds.hpp"
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+/// threshold crossing logic
+/// used as part of spike detection back end
+template <typename T, typename I>
+class threshold_watcher {
+public:
+    using value_type = T;
+    using size_type = I;
+
+    using array = memory::device_vector<T>;
+    using iarray = memory::device_vector<I>;
+    using const_view = typename array::const_view_type;
+
+    /// stores a single crossing event
+    struct threshold_crossing {
+        size_type index;    // index of variable
+        value_type time;    // time of crossing
+        __host__ __device__
+        friend bool operator==
+            (const threshold_crossing& lhs, const threshold_crossing& rhs)
+        {
+            return lhs.index==rhs.index && lhs.time==rhs.time;
+        }
+    };
+
+    using stack_type = stack<threshold_crossing>;
+
+    threshold_watcher() = default;
+
+    threshold_watcher(
+            const_view values,
+            const std::vector<size_type>& index,
+            const std::vector<value_type>& thresh,
+            value_type t=0):
+        values_(values),
+        index_(memory::make_const_view(index)),
+        thresholds_(memory::make_const_view(thresh)),
+        prev_values_(values),
+        is_crossed_(size()),
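+        // Allocate the stack with capacity for up to 10 crossings per
+        // watched value between calls to clear_crossings().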
+        stack_(memory::make_managed_ptr<stack_type>(10*size()))
+    {
+        reset(t);
+    }
+
+    /// Remove all stored crossings that were detected in previous calls
+    /// to test()
+    void clear_crossings() {
+        stack_->clear();
+    }
+
+    /// Reset state machine for each detector.
+    /// Assume that the values in values_ have been set correctly before
+    /// calling, because the values are used to determine the initial state
+    void reset(value_type t=0) {
+        clear_crossings();
+
+        // Make host-side copies of the information needed to calculate
+        // the initial crossed state
+        auto values = memory::on_host(values_);
+        auto thresholds = memory::on_host(thresholds_);
+        auto index = memory::on_host(index_);
+
+        // calculate the initial crossed state in host memory
+        auto crossed = std::vector<size_type>(size());
+        for (auto i: util::make_span(0u, size())) {
+            crossed[i] = values[index[i]] < thresholds[i] ? 0 : 1;
+        }
+
+        // copy the initial crossed state to device memory
+        is_crossed_ = memory::on_gpu(crossed);
+
+        // reset time of last test
+        t_prev_ = t;
+    }
+
+    bool is_crossed(size_type i) const {
+        return is_crossed_[i];
+    }
+
+    const std::vector<threshold_crossing> crossings() const {
+        return std::vector<threshold_crossing>(stack_->begin(), stack_->end());
+    }
+
+    /// The time at which the last test was performed
+    value_type last_test_time() const {
+        return t_prev_;
+    }
+
+    /// Tests each target for changed threshold state.
+    /// Crossing events are recorded for each threshold that has been
+    /// crossed since current time t, and the last time the test was
+    /// performed.
+    void test(value_type t) {
+        EXPECTS(t_prev_<t);
+
+        constexpr int block_dim = 128;
+        const int grid_dim = (size()+block_dim-1)/block_dim;
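+        // launch one thread per watched value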
+        test_thresholds<<<grid_dim, block_dim>>>(
+            t, t_prev_, size(),
+            *stack_,
+            is_crossed_.data(), prev_values_.data(),
+            index_.data(), values_.data(), thresholds_.data());
+
+        // Check that the number of spikes has not exceeded
+        // the capacity of the stack.
+        EXPECTS(stack_->size() <= stack_->capacity());
+
+        t_prev_ = t;
+    }
+
+    /// the number of threshold values that are being monitored
+    std::size_t size() const {
+        return index_.size();
+    }
+
+    /// Data type used to store the crossings.
+    /// Provided to make type-generic calling code.
+    using crossing_list =  std::vector<threshold_crossing>;
+
+private:
+
+    const_view values_;         // values to watch: on gpu
+    iarray index_;              // indexes of values to watch: on gpu
+
+    array thresholds_;          // threshold for each watch: on gpu
+    value_type t_prev_;         // time of previous sample: on host
+    array prev_values_;         // values at previous sample time: on gpu
+    iarray is_crossed_;         // bool flag for state of each watch: on gpu
+
+    memory::managed_ptr<stack_type> stack_;
+};
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/matrix_storage.md b/src/backends/matrix_storage.md
new file mode 100644
index 0000000000000000000000000000000000000000..d6ed40c2cf9121dc480f1a8604906e2e96b18166
--- /dev/null
+++ b/src/backends/matrix_storage.md
@@ -0,0 +1,108 @@
+# Flat and Interleaved Matrix Storage
+
+This document describes the layout of the different storage schemes used for matrices in the GPU back end.
+
+An NxN Hines matrix can be stored compactly with 3 vectors of length N:
+  * `d`: the diagonal of the matrix
+  * `u`: the upper/lower part of the matrix (referred to somewhat casually as the super diagonal)
+  * `p`: the parent index
+
+Additionally, we often store N×1 vectors that have one value per compartment, e.g. voltage, solution or rhs vectors.
+
+In NestMC a single multicompartment cell has a matrix structure associated with it, derived directly from the connections between its constituent compartments. NestMC gathers cells into `cell_groups`, and the matrices for all the cells in a group are packed together into a single structure.
+
+The matrix packing applies the same packing operation to each vector associated with a matrix, i.e. the `u`, `d`, `p` and solution, voltage vectors.
+
+In this discussion we use a simple example group of matrices to illustrate the storage methods, because an example is more illustrative than a formal description:
+  * 7 vectors labelled `{a, b, c, d, e, f, g}`
+  * the vectors have respective lengths `{8, 7, 6, 6, 5, 5, 3}`
+  * the `i`th value in vector `a` is labelled `ai`
+
+## Flat storage
+
+Take a vector `vals` containing the values:
+
+```
+vals = [a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 | c0 c1 c2 c3 c4 c5 | d0 d1 d2 d3 d4 d5 | e0 e1 e2 e3 e4 | f0 f1 f2 f3 f4 | g0 g1 g2 ]
+```
+
+To fully describe the set of matrices we need an index vector of length `#matrices+1`, where entry `m` gives the offset of matrix `m` in `vals` (so the size of matrix `m` is `indx[m+1]-indx[m]`):
+
+```
+indx = [0, 8, 15, 21, 27, 32, 37, 40]
+```
+
+To look up the value of the `i`th entry in the vector `m`, we use the following formula to calculate the index into `vals`:
+
+```
+lookup_flt(i,m) = indx[m] + i
+```
+
+## Interleaved storage
+
+To store the matrices with a block width of 4 and a padded matrix size of 8, two arrays are required:
+
+```
+vals =
+[ a0 b0 c0 d0 | a1 b1 c1 d1 | a2 b2 c2 d2 | a3 b3 c3 d3 | a4 b4 c4 d4 | a5 b5 c5 d5 | a6 b6  *  * | a7  *  *  * |
+  e0 f0 g0  * | e1 f1 g1  * | e2 f2 g2  * | e3 f3  *  * | e4 f4  *  * |  *  *  *  * |  *  *  *  * |  *  *  *  * ]
+sizes = [8, 7, 6, 6, 5, 5, 3]
+```
+
+where `*` indicates padding, or a location in `vals` that does not hold a value that is part of one of the packed vectors.
+
+To look up the value of the `i`th entry in the vector `m`, we use the following formula to calculate the index into `vals`, where `BW` is the block width and `N` is the padded matrix size:
+
+```
+lookup_int(i,m) = floor(m/BW)*BW*N + m-floor(m/BW)*BW + i*BW
+```
+
+The `block` and `lane` (i.e. the block-local index) of a matrix `m` can be computed as
+
+```
+block = floor(m/BW)
+lane = m-block*BW
+```
+
+so that the index calculation can be expressed more succinctly and clearly:
+
+```
+lookup_int(i,m) = block*BW*N + lane + i*BW
+```
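+
+As a concrete illustration, the two lookups can be written as small helper
+functions (a hypothetical sketch, not part of the code base):
+
+```
+// index of entry i of matrix m in flat storage;
+// indx is the vector of matrix start offsets
+int lookup_flt(const int* indx, int i, int m) {
+    return indx[m] + i;
+}
+
+// index of entry i of matrix m in interleaved storage,
+// with block width BW and padded matrix size N
+int lookup_int(int i, int m, int BW, int N) {
+    int block = m/BW;
+    int lane  = m - block*BW;
+    return block*BW*N + lane + i*BW;
+}
+```
+
+For the example above, `lookup_int(0, 4, 4, 8)` returns 32, the position of
+`e0` in the interleaved `vals`.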
+
+## On parent indexes
+
+Parent index vectors are packed in the same format as the other vectors; however, the index values must also be modified, because parent indexes are relative to the start of their own matrix. Writing `p_lcl(i,m)` for the cell-local parent index of entry `i` in matrix `m`:
+
+```
+p_flt(i,m) = indx[m] + p_lcl(i, m)
+p_int(i,m) = lookup_int(0, m) + BW*p_lcl(i, m)
+```
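+
+A sketch of the interleaved parent-index conversion, following the formulas
+above (hypothetical helper, not part of the code base):
+
+```
+// interleaved parent index for entry i of matrix m, given its
+// cell-local parent index p_lcl, block width BW and padded size N
+int p_int(int p_lcl, int m, int BW, int N) {
+    int block = m/BW;
+    int lane  = m - block*BW;
+    return block*BW*N + lane + BW*p_lcl;  // lookup_int(0,m) + BW*p_lcl
+}
+```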
+
+For example, the following two cells
+
+```
+cell 1, 6 nodes:
+
+0--1--2--3
+   \
+    4--5
+
+cell 2, 8 nodes:
+
+0--1--2--3
+   \
+    4--5--6
+     \
+      7
+```
+
+have the following packed structures
+
+```
+p_lcl = [0 0 1 2 1 4 | 0 0 1 2 1 4 5 4]
+p_flt = [0 0 1 2 1 4 | 6 6 7 8 7 10 11 10]
+p_int = [0 1 * *| 0 1 * * | 4 5 * * | 8 9 * * | 4 5 * * | 16 17 * * | 20 * * * | 16 * * * ]
+```
+
+where the interleaved storage uses a block width of 4 and a padded matrix size of 8, as in the earlier example. Note that the interleaved representation stores the matrices sorted in descending order of size, so the 8-node cell occupies lane 0 and the 6-node cell lane 1.
+
diff --git a/src/backends/fvm_multicore.cpp b/src/backends/multicore/fvm.cpp
similarity index 96%
rename from src/backends/fvm_multicore.cpp
rename to src/backends/multicore/fvm.cpp
index 8d20f2ffe7161832bbdd66fd77dd83255766f8b4..06f45a40d19a76b4fdcf910676896cf960303715 100644
--- a/src/backends/fvm_multicore.cpp
+++ b/src/backends/multicore/fvm.cpp
@@ -1,4 +1,4 @@
-#include "fvm_multicore.hpp"
+#include "fvm.hpp"
 
 #include <mechanisms/multicore/hh.hpp>
 #include <mechanisms/multicore/pas.hpp>
diff --git a/src/backends/multicore/fvm.hpp b/src/backends/multicore/fvm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d04928cf00a64978a1ae076957a8c7e4695b9099
--- /dev/null
+++ b/src/backends/multicore/fvm.hpp
@@ -0,0 +1,95 @@
+#pragma once
+
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <common_types.hpp>
+#include <mechanism.hpp>
+#include <memory/memory.hpp>
+#include <memory/wrappers.hpp>
+#include <util/span.hpp>
+
+#include "matrix_state.hpp"
+#include "stimulus.hpp"
+#include "threshold_watcher.hpp"
+
+namespace nest {
+namespace mc {
+namespace multicore {
+
+struct backend {
+    /// define the real and index types
+    using value_type = double;
+    using size_type  = nest::mc::cell_lid_type;
+
+    /// define storage types
+    using array  = memory::host_vector<value_type>;
+    using iarray = memory::host_vector<size_type>;
+
+    using view       = typename array::view_type;
+    using const_view = typename array::const_view_type;
+
+    using iview       = typename iarray::view_type;
+    using const_iview = typename iarray::const_view_type;
+
+    using host_array  = array;
+    using host_iarray = iarray;
+
+    using host_view   = view;
+    using host_iview  = iview;
+
+    /// matrix state
+    using matrix_state =
+        nest::mc::multicore::matrix_state<value_type, size_type>;
+
+    //
+    // mechanism infrastructure
+    //
+    using ion = mechanisms::ion<backend>;
+
+    using mechanism = mechanisms::mechanism_ptr<backend>;
+
+    using stimulus = mechanisms::multicore::stimulus<backend>;
+
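+    // Look up a mechanism by name in the database and construct an instance
+    // bound to the given voltage and current views, e.g.
+    // make_mechanism("pas", vec_v, vec_i, weights, node_indices).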
+    static mechanism make_mechanism(
+        const std::string& name,
+        view vec_v, view vec_i,
+        const std::vector<value_type>& weights,
+        const std::vector<size_type>& node_indices)
+    {
+        if (!has_mechanism(name)) {
+            throw std::out_of_range("no mechanism in database : " + name);
+        }
+
+        return mech_map_.find(name)->second(vec_v, vec_i, array(weights), iarray(node_indices));
+    }
+
+    static bool has_mechanism(const std::string& name) {
+        return mech_map_.count(name)>0;
+    }
+
+    static std::string name() {
+        return "cpu";
+    }
+
+    /// threshold crossing logic
+    /// used as part of spike detection back end
+    using threshold_watcher =
+        nest::mc::multicore::threshold_watcher<value_type, size_type>;
+
+
+private:
+
+    using maker_type = mechanism (*)(view, view, array&&, iarray&&);
+    static std::map<std::string, maker_type> mech_map_;
+
+    template <template <typename> class Mech>
+    static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
+        return mechanisms::make_mechanism<Mech<backend>>
+            (vec_v, vec_i, std::move(weights), std::move(node_indices));
+    }
+};
+
+} // namespace multicore
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/multicore/matrix_state.hpp b/src/backends/multicore/matrix_state.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0414a9d02ce7f6d30d9535b3c5ca7ab0553c9c12
--- /dev/null
+++ b/src/backends/multicore/matrix_state.hpp
@@ -0,0 +1,115 @@
+#pragma once
+
+#include <vector>
+
+#include <memory/memory.hpp>
+#include <util/debug.hpp>
+#include <util/span.hpp>
+
+namespace nest {
+namespace mc {
+namespace multicore {
+
+template <typename T, typename I>
+struct matrix_state {
+public:
+    using value_type = T;
+    using size_type = I;
+
+    using array = memory::host_vector<value_type>;
+    using const_view = typename array::const_view_type;
+    using iarray = memory::host_vector<size_type>;
+    iarray parent_index;
+    iarray cell_index;
+
+    array d;     // [μS]
+    array u;     // [μS]
+    array rhs;   // [nA]
+
+    array cv_capacitance;      // [pF]
+    array face_conductance;    // [μS]
+
+    // the invariant part of the matrix diagonal
+    array invariant_d;         // [μS]
+
+    const_view solution;
+
+    matrix_state() = default;
+
+    matrix_state(const std::vector<size_type>& p,
+                 const std::vector<size_type>& cell_idx,
+                 const std::vector<value_type>& cap,
+                 const std::vector<value_type>& cond):
+        parent_index(memory::make_const_view(p)),
+        cell_index(memory::make_const_view(cell_idx)),
+        d(size(), 0), u(size(), 0), rhs(size()),
+        cv_capacitance(memory::make_const_view(cap)),
+        face_conductance(memory::make_const_view(cond))
+    {
+        EXPECTS(cap.size() == size());
+        EXPECTS(cond.size() == size());
+        EXPECTS(cell_idx.back() == size());
+
+        auto n = size();
+        invariant_d = array(n, 0);
+        for (auto i: util::make_span(1u, n)) {
+            auto gij = face_conductance[i];
+
+            u[i] = -gij;
+            invariant_d[i] += gij;
+            invariant_d[p[i]] += gij;
+        }
+
+        // In this back end the solution is a simple view of the rhs, which
+        // contains the solution after solve() has been performed.
+        solution = rhs;
+    }
+
+    // Assemble the matrix
+    // Afterwards the diagonal and RHS will have been set given dt, voltage and current
+    //   dt      [ms]
+    //   voltage [mV]
+    //   current [nA]
+    void assemble(value_type dt, const_view voltage, const_view current) {
+        auto n = size();
+        value_type factor = 1e-3/dt;
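+        // With dt in [ms] and cv_capacitance in [pF], pF/ms = nS, so the
+        // 1e-3 factor rescales gi to [μS], matching the units of d and u.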
+        for (auto i: util::make_span(0u, n)) {
+            auto gi = factor*cv_capacitance[i];
+
+            d[i] = gi + invariant_d[i];
+
+            rhs[i] = gi*voltage[i] - current[i];
+        }
+    }
+
+    void solve() {
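+        // Hines solve: eliminate against the parent index in a backward
+        // sweep, then substitute forward down each branch.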
+        const size_type ncells = cell_index.size()-1;
+
+        // loop over submatrices
+        for (auto m: util::make_span(0, ncells)) {
+            auto first = cell_index[m];
+            auto last = cell_index[m+1];
+
+            // backward sweep
+            for(auto i=last-1; i>first; --i) {
+                auto factor = u[i] / d[i];
+                d[parent_index[i]]   -= factor * u[i];
+                rhs[parent_index[i]] -= factor * rhs[i];
+            }
+            rhs[first] /= d[first];
+
+            // forward sweep
+            for(auto i=first+1; i<last; ++i) {
+                rhs[i] -= u[i] * rhs[parent_index[i]];
+                rhs[i] /= d[i];
+            }
+        }
+    }
+
+private:
+
+    std::size_t size() const {
+        return parent_index.size();
+    }
+};
+
+} // namespace multicore
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/stimulus_multicore.hpp b/src/backends/multicore/stimulus.hpp
similarity index 100%
rename from src/backends/stimulus_multicore.hpp
rename to src/backends/multicore/stimulus.hpp
diff --git a/src/backends/multicore/threshold_watcher.hpp b/src/backends/multicore/threshold_watcher.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7cfb3ab5db0abca8662e6685edbaabb25183088
--- /dev/null
+++ b/src/backends/multicore/threshold_watcher.hpp
@@ -0,0 +1,128 @@
+#pragma once
+
+#include <vector>
+
+#include <memory/memory.hpp>
+
+namespace nest {
+namespace mc {
+namespace multicore {
+
+template <typename T, typename I>
+class threshold_watcher {
+public:
+    using value_type = T;
+    using size_type = I;
+
+    using array = memory::host_vector<value_type>;
+    using const_view = typename array::const_view_type;
+    using iarray = memory::host_vector<size_type>;
+
+    /// stores a single crossing event
+    struct threshold_crossing {
+        size_type index;    // index of variable
+        value_type time;    // time of crossing
+        friend bool operator== (
+            const threshold_crossing& lhs, const threshold_crossing& rhs)
+        {
+            return lhs.index==rhs.index && lhs.time==rhs.time;
+        }
+    };
+
+    threshold_watcher() = default;
+
+    threshold_watcher(
+            const_view vals,
+            const std::vector<size_type>& indxs,
+            const std::vector<value_type>& thresh,
+            value_type t=0):
+        values_(vals),
+        index_(memory::make_const_view(indxs)),
+        thresholds_(memory::make_const_view(thresh)),
+        v_prev_(vals)
+    {
+        is_crossed_ = iarray(size());
+        reset(t);
+    }
+
+    /// Remove all stored crossings that were detected in previous calls
+    /// to the test() member function.
+    void clear_crossings() {
+        crossings_.clear();
+    }
+
+    /// Reset state machine for each detector.
+    /// Assume that the values in values_ have been set correctly before
+    /// calling, because the values are used to determine the initial state
+    void reset(value_type t=0) {
+        clear_crossings();
+        for (auto i=0u; i<size(); ++i) {
+            is_crossed_[i] = values_[index_[i]]>=thresholds_[i];
+        }
+        t_prev_ = t;
+    }
+
+    const std::vector<threshold_crossing>& crossings() const {
+        return crossings_;
+    }
+
+    /// The time at which the last test was performed
+    value_type last_test_time() const {
+        return t_prev_;
+    }
+
+    /// Tests each target for changed threshold state
+    /// Crossing events are recorded for each threshold that
+    /// is crossed since the last call to test
+    void test(value_type t) {
+        for (auto i=0u; i<size(); ++i) {
+            auto v_prev = v_prev_[i];
+            auto v      = values_[index_[i]];
+            auto thresh = thresholds_[i];
+            if (!is_crossed_[i]) {
+                if (v>=thresh) {
+                    // the threshold has been passed, so estimate the time using
+                    // linear interpolation
+                    auto pos = (thresh - v_prev)/(v - v_prev);
+                    auto crossing_time = t_prev_ + pos*(t - t_prev_);
+                    crossings_.push_back({i, crossing_time});
+
+                    is_crossed_[i] = true;
+                }
+            }
+            else {
+                if (v<thresh) {
+                    is_crossed_[i] = false;
+                }
+            }
+
+            v_prev_[i] = v;
+        }
+        t_prev_ = t;
+    }
+
+    bool is_crossed(size_type i) const {
+        return is_crossed_[i];
+    }
+
+    /// the number of threshold values that are being monitored
+    std::size_t size() const {
+        return index_.size();
+    }
+
+    /// Data type used to store the crossings.
+    /// Provided to make type-generic calling code.
+    using crossing_list =  std::vector<threshold_crossing>;
+
+private:
+    const_view values_;
+    iarray index_;
+
+    array thresholds_;
+    value_type t_prev_;
+    array v_prev_;
+    crossing_list crossings_;
+    iarray is_crossed_;
+};
+
+} // namespace multicore
+} // namespace mc
+} // namespace nest
diff --git a/src/matrix.hpp b/src/matrix.hpp
index f37e54a834316552f3fdc12f0a939b50e2d901c6..8f010763cce427011f191b36c821fe7c0167e9df 100644
--- a/src/matrix.hpp
+++ b/src/matrix.hpp
@@ -11,8 +11,9 @@ namespace nest {
 namespace mc {
 
 /// Hines matrix
-/// the TargetPolicy defines the backend specific data types and solver
-template<class Backend>
+/// Make the back end state implementation optional to allow for
+/// testing different implementations in the same code.
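+/// For example, matrix<Backend, gpu::matrix_state_flat<T, I>> could be
+/// instantiated to select the flat GPU storage implementation explicitly.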
+template<class Backend, class State=typename Backend::matrix_state>
 class matrix {
 public:
     using backend = Backend;
@@ -25,37 +26,23 @@ public:
     using array = typename backend::array;
     using iarray = typename backend::iarray;
 
-    using view = typename backend::view;
-    using iview = typename backend::iview;
     using const_view = typename backend::const_view;
     using const_iview = typename backend::const_iview;
 
     using host_array = typename backend::host_array;
 
     // back end specific storage for matrix state
-    using state = typename backend::matrix_state;
+    using state = State;
 
     matrix() = default;
 
-    /// construct matrix for one or more cells, described by a parent index and
-    /// a cell index.
-    matrix(const std::vector<size_type>& pi, const std::vector<size_type>& ci):
-        parent_index_(memory::make_const_view(pi)),
-        cell_index_(memory::make_const_view(ci)),
-        state_(parent_index_, cell_index_)
-    {
-        EXPECTS(cell_index_[num_cells()] == parent_index_.size());
-    }
-
     matrix( const std::vector<size_type>& pi,
             const std::vector<size_type>& ci,
             const std::vector<value_type>& cv_capacitance,
             const std::vector<value_type>& face_conductance):
         parent_index_(memory::make_const_view(pi)),
         cell_index_(memory::make_const_view(ci)),
-        state_( parent_index_, cell_index_,
-                memory::make_const_view(cv_capacitance),
-                memory::make_const_view(face_conductance))
+        state_(pi, ci, cv_capacitance, face_conductance)
     {
         EXPECTS(cell_index_[num_cells()] == parent_index_.size());
     }
@@ -88,7 +75,7 @@ public:
 
     /// Get a view of the solution
     const_view solution() const {
-        return state_.rhs;
+        return state_.solution;
     }
 
     private:
diff --git a/tests/unit/common.hpp b/tests/unit/common.hpp
index dc7b5a357fdbc8e0a1aa25ec993a36e3d9c6427f..af9833f5840b64fb8e21e72d2831640fc580c621 100644
--- a/tests/unit/common.hpp
+++ b/tests/unit/common.hpp
@@ -126,6 +126,8 @@ template <typename FPType, typename Seq1, typename Seq2>
     for (std::size_t j = 0; i1!=e1 && i2!=e2; ++i1, ++i2, ++j) {
         using FP = testing::internal::FloatingPoint<FPType>;
 
+        // Cast to FPType to avoid warnings about narrowing conversions
+        // when FPType has lower precision than the sequences' value types.
         auto v1 = *i1;
         auto v2 = *i2;
 
diff --git a/tests/unit/test_atomics.cu b/tests/unit/test_atomics.cu
index 694300e6239ed6c95ff0a90e72740e4e7014c60f..fb9877c9a7fd6de6258c31bf7c40fcc062357ab3 100644
--- a/tests/unit/test_atomics.cu
+++ b/tests/unit/test_atomics.cu
@@ -1,6 +1,6 @@
 #include "../gtest.h"
 
-#include <backends/gpu_intrinsics.hpp>
+#include <backends/gpu/intrinsics.hpp>
 #include <memory/managed_ptr.hpp>
 
 namespace kernels {
diff --git a/tests/unit/test_gpu_stack.cu b/tests/unit/test_gpu_stack.cu
index 38087a9570ad975f8a3028966a56a0b47842ac61..2756078dca09d4bfebf14adb992bf05cf61a4d28 100644
--- a/tests/unit/test_gpu_stack.cu
+++ b/tests/unit/test_gpu_stack.cu
@@ -1,14 +1,14 @@
 #include "../gtest.h"
 
-#include <backends/gpu_stack.hpp>
+#include <backends/gpu/stack.hpp>
 #include <memory/managed_ptr.hpp>
 
 using namespace nest::mc;
 
-TEST(gpu_stack, construction) {
+TEST(stack, construction) {
     using T = int;
 
-    gpu::gpu_stack<T> s(10);
+    gpu::stack<T> s(10);
 
     EXPECT_EQ(0u, s.size());
     EXPECT_EQ(10u, s.capacity());
@@ -18,7 +18,7 @@ TEST(gpu_stack, construction) {
 namespace kernels {
     template <typename F>
     __global__
-    void push_back(gpu::gpu_stack<int>& s, F f) {
+    void push_back(gpu::stack<int>& s, F f) {
         if (f(threadIdx.x)) {
             s.push_back(threadIdx.x);
         }
@@ -46,9 +46,9 @@ namespace kernels {
     };
 }
 
-TEST(gpu_stack, push_back) {
+TEST(stack, push_back) {
     using T = int;
-    using stack = gpu::gpu_stack<T>;
+    using stack = gpu::stack<T>;
 
     const unsigned n = 10;
     EXPECT_TRUE(n%2 == 0); // require n is even for tests to work
diff --git a/tests/unit/test_matrix.cpp b/tests/unit/test_matrix.cpp
index 7bd9911140324db1b7d909434dc66e47cafe0d90..f74a6760047119678ef8192d1bca26c1ab15350c 100644
--- a/tests/unit/test_matrix.cpp
+++ b/tests/unit/test_matrix.cpp
@@ -5,31 +5,29 @@
 
 #include <math.hpp>
 #include <matrix.hpp>
-#include <backends/fvm_multicore.hpp>
+#include <backends/multicore/fvm.hpp>
 #include <util/span.hpp>
 
 using namespace nest::mc;
 
 using matrix_type = matrix<nest::mc::multicore::backend>;
-using size_type = matrix_type::size_type;
+using size_type  = matrix_type::size_type;
+using value_type = matrix_type::value_type;
+
+using vvec = std::vector<value_type>;
 
 TEST(matrix, construct_from_parent_only)
 {
-    using util::make_span;
-
-    // pass parent index as a std::vector cast to host data
-    {
-        std::vector<size_type> p = {0,0,1};
-        matrix_type m(p, {0, 3});
-        EXPECT_EQ(m.num_cells(), 1u);
-        EXPECT_EQ(m.size(), 3u);
-        EXPECT_EQ(p.size(), 3u);
-
-        auto mp = m.p();
-        EXPECT_EQ(mp[0], 0u);
-        EXPECT_EQ(mp[1], 0u);
-        EXPECT_EQ(mp[2], 1u);
-    }
+    std::vector<size_type> p = {0,0,1};
+    matrix_type m(p, {0, 3}, vvec(3), vvec(3));
+    EXPECT_EQ(m.num_cells(), 1u);
+    EXPECT_EQ(m.size(), 3u);
+    EXPECT_EQ(p.size(), 3u);
+
+    auto mp = m.p();
+    EXPECT_EQ(mp[0], 0u);
+    EXPECT_EQ(mp[1], 0u);
+    EXPECT_EQ(mp[2], 1u);
 }
 
 TEST(matrix, solve_host)
@@ -39,7 +37,7 @@ TEST(matrix, solve_host)
 
     // trivial case : 1x1 matrix
     {
-        matrix_type m({0}, {0,1});
+        matrix_type m({0}, {0,1}, vvec(1), vvec(1));
         auto& state = m.state_;
         fill(state.d,  2);
         fill(state.u, -1);
@@ -55,7 +53,7 @@ TEST(matrix, solve_host)
         for(auto n : make_span(2u,1001u)) {
             auto p = std::vector<size_type>(n);
             std::iota(p.begin()+1, p.end(), 0);
-            matrix_type m(p, {0, n});
+            matrix_type m(p, {0, n}, vvec(n), vvec(n));
 
             EXPECT_EQ(m.size(), n);
             EXPECT_EQ(m.num_cells(), 1u);
diff --git a/tests/unit/test_matrix.cu b/tests/unit/test_matrix.cu
index 3f13a86f636748585191d85e9bef2d6e04e2e27e..dbf39f034fc42fd6243c5fa098648d6eb41ad54f 100644
--- a/tests/unit/test_matrix.cu
+++ b/tests/unit/test_matrix.cu
@@ -2,64 +2,387 @@
 #include <vector>
+#include <algorithm>
+#include <numeric>
+#include <random>
 
 #include "../gtest.h"
+#include "common.hpp"
 
 #include <math.hpp>
 #include <matrix.hpp>
-#include <backends/fvm_gpu.hpp>
+#include <backends/gpu/fvm.hpp>
+#include <backends/multicore/fvm.hpp>
 #include <memory/memory.hpp>
 #include <util/span.hpp>
 
-using matrix_type = nest::mc::matrix<nest::mc::gpu::backend>;
-using index_type = matrix_type::size_type;
+using namespace nest::mc;
 
-TEST(matrix, solve_gpu)
+using gpu::impl::npos;
+using util::make_span;
+using util::assign_from;
+using memory::on_gpu;
+using memory::on_host;
+
+using testing::seq_almost_eq;
+
+using std::begin;
+using std::end;
+
+// will test the flat_to_interleaved and interleaved_to_flat operations for the
+// set of matrices defined by sizes and starts.
+// Applies the interleave to the vector in values, and checks this against
+// a reference result generated using a host side reference implementation.
+// Then the interleave result is reverse_interleaved, and the result is
+// compared to the original input.
+//
+// This is implemented in a separate function to facilitate testing on a
+// broad range of BlockWidth and LoadWidth compile time parameters.
+template <typename T, typename I, int BlockWidth, int LoadWidth>
+::testing::AssertionResult test_interleave(
+        std::vector<I> sizes,
+        std::vector<I> starts,
+        std::vector<T> values,
+        int padded_size)
+{
+    auto num_mtx = sizes.size();
+
+    auto in  = on_gpu(memory::make_const_view(values));
+    auto sizes_d = on_gpu(memory::make_const_view(sizes));
+    auto starts_d = on_gpu(memory::make_const_view(starts));
+
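+    // Total interleaved storage: ceil(num_mtx/BlockWidth) blocks, each of
+    // BlockWidth*padded_size entries.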
+    int packed_size = padded_size * BlockWidth * gpu::impl::block_count(num_mtx, BlockWidth);
+
+    // forward will hold the result of the interleave operation on the GPU
+    auto forward = memory::device_vector<T>(packed_size, npos<T>());
+
+
+    // find the interleaved values on gpu
+    gpu::flat_to_interleaved<T, I, BlockWidth, LoadWidth>(in.data(), forward.data(), sizes_d.data(), starts_d.data(), padded_size, num_mtx);
+
+    std::vector<T> result_f = assign_from(on_host(forward));
+    std::vector<T> expected = gpu::flat_to_interleaved(values, sizes, starts, BlockWidth, num_mtx, padded_size);
+    const auto forward_success = (result_f==expected);
+    if (!forward_success) {
+        return ::testing::AssertionFailure() << "flat to interleaved failed: BlockWidth "
+            << BlockWidth << ", LoadWidth " << LoadWidth << "\n";
+    }
+
+    // backward will hold the result of reverse interleave on the GPU
+    auto backward = memory::device_vector<T>(values.size(), npos<T>());
+    gpu::interleaved_to_flat<T, I, BlockWidth, LoadWidth>(forward.data(), backward.data(), sizes_d.data(), starts_d.data(), padded_size, num_mtx);
+
+    std::vector<T> result_b = assign_from(on_host(backward));
+
+    // we expect that the result of the reverse permutation is the original input vector
+    const auto backward_success = (result_b==values);
+    if (!backward_success) {
+        return ::testing::AssertionFailure() << "interleaved to flat failed: BlockWidth "
+            << BlockWidth << ", LoadWidth " << LoadWidth << "\n";
+    }
+
+    return ::testing::AssertionSuccess();
+}
+
+// test conversion to and from interleaved back end storage format
+TEST(matrix, interleave)
 {
-    using namespace nest::mc;
+    using I = int;
+    using T = int;
+    using ivec = std::vector<I>;
+    using tvec = std::vector<T>;
+
+    // simple case with 4 matrices of length 2
+    {
+        const int padded_size = 2;
+        const int num_mtx = 4;
+        ivec sizes(num_mtx, padded_size);
+
+        // find the start position of each matrix in the flat storage
+        // we are assuming that the matrices are unpermuted
+        ivec starts(num_mtx, 0);
+        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
+
+        tvec values(padded_size*num_mtx);
+        std::iota(values.begin(), values.end(), 0);
 
-    using nest::mc::util::make_span;
+        EXPECT_TRUE((test_interleave<T, I, 1, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 2, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 3, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 4, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 5, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 6, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 7, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 8, 1>(sizes, starts, values, padded_size)));
 
-    // trivial case : 1x1 matrix
+        EXPECT_TRUE((test_interleave<T, I, 1, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 2, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 3, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 4, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 5, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 6, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 7, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 8, 2>(sizes, starts, values, padded_size)));
+
+        EXPECT_TRUE((test_interleave<T, I, 1, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 2, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 3, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 4, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 5, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 6, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 7, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 8, 3>(sizes, starts, values, padded_size)));
+    }
+
+    // another small example with matrices of differing lengths
     {
-        matrix_type m({0}, {0,1});
+        const int padded_size = 8;
+        const int num_mtx = 8;
+        ivec sizes = {6, 5, 4, 4, 3, 2, 2, 1};
+
+        // find the start position of each matrix in the flat storage
+        // we are assuming that the matrices are unpermuted
+        ivec starts(num_mtx, 0);
+        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
 
-        auto& state = m.state_;
-        memory::fill(state.d,  2);
-        memory::fill(state.u, -1);
-        memory::fill(state.rhs,1);
+        tvec values(algorithms::sum(sizes));
+        std::iota(values.begin(), values.end(), 0);
 
-        m.solve();
+        EXPECT_TRUE((test_interleave<T, I, 1, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 2, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 3, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 4, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 5, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 6, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 7, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 8, 1>(sizes, starts, values, padded_size)));
 
-        auto rhs = memory::on_host(m.solution());
+        EXPECT_TRUE((test_interleave<T, I, 1, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 2, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 3, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 4, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 5, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 6, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 7, 2>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 8, 2>(sizes, starts, values, padded_size)));
 
-        EXPECT_EQ(rhs[0], 0.5);
+        EXPECT_TRUE((test_interleave<T, I, 1, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 2, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 3, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 4, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 5, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 6, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 7, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 8, 3>(sizes, starts, values, padded_size)));
     }
 
-    // matrices in the range of 2x2 to 100x100
+    // a larger case with many matrices of differing sizes
     {
-        using namespace nest::mc;
-        for(auto n : make_span(2u,101u)) {
-            auto p = std::vector<index_type>(n);
-            std::iota(p.begin()+1, p.end(), 0);
-            matrix_type m{p, {0, n}};
-
-            EXPECT_EQ(m.size(), n);
-            EXPECT_EQ(m.num_cells(), 1u);
-
-            auto& state = m.state_;
-            memory::fill(state.d,  2);
-            memory::fill(state.u, -1);
-            memory::fill(state.rhs,1);
-
-            m.solve();
-
-            auto x = memory::on_host(m.solution());
-            auto err = math::square(std::fabs(2.*x[0] - x[1] - 1.));
-            for(auto i : make_span(1,n-1)) {
-                err += math::square(std::fabs(2.*x[i] - x[i-1] - x[i+1] - 1.));
-            }
-            err += math::square(std::fabs(2.*x[n-1] - x[n-2] - 1.));
-
-            EXPECT_NEAR(0., std::sqrt(err), 1e-8);
+        const int padded_size = 256;
+        const int num_mtx = 1000;
+        ivec sizes(num_mtx);
+        for (auto i: make_span(  0, 100)) sizes[i] = 250;
+        for (auto i: make_span(100, 103)) sizes[i] = 213;
+        for (auto i: make_span(103, 150)) sizes[i] = 200;
+        for (auto i: make_span(150, 500)) sizes[i] = 178;
+        for (auto i: make_span(500, 1000)) sizes[i] = 6;
+
+        // we are assuming that the matrices are unpermuted
+        ivec starts(num_mtx, 0);
+        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
+
+        tvec values(algorithms::sum(sizes));
+        std::iota(values.begin(), values.end(), 0);
+
+        // test in "full" 1024 thread configuration with 32 threads per matrix
+        EXPECT_TRUE((test_interleave<T, I, 32, 32>(sizes, starts, values, padded_size)));
+    }
+}
+
+// Test that matrix assembly works.
+// The test proceeds by assembling a reference matrix on the host and
+// device backends, then performs solve, and compares solution.
+//
+// limitations of test
+//  * matrices all have same size and structure
+TEST(matrix, assemble)
+{
+    using gpu_state = gpu::backend::matrix_state;
+    using mc_state  = multicore::backend::matrix_state;
+
+    using T = typename gpu::backend::value_type;
+    using I = typename gpu::backend::size_type;
+
+    using gpu_array  = typename gpu::backend::array;
+    using host_array = typename multicore::backend::array;
+
+    // There are two matrix structures:
+    //
+    // p_1: 3 branches, 6 compartments
+    //
+    //           3
+    //          /.
+    // 0 - 1 - 2
+    //          \.
+    //           4
+    //            \.
+    //             5
+    //
+    // p_2: 5 branches, 8 compartments
+    //
+    //             4
+    //            /.
+    //           3
+    //          / \.
+    // 0 - 1 - 2   5
+    //          \.
+    //           6
+    //            \.
+    //             7
+
+    // The parent indexes that define the two matrix structures
+    std::vector<std::vector<I>>
+        p_base = { {0,0,1,2,2,4}, {0,0,1,2,3,3,2,6} };
+
+    // Make a set of matrices based on repeating this pattern.
+    // We assign the patterns round-robin, i.e. so that the input
+    // matrices will have alternating sizes of 6 and 8, which will
+    // test the solver with variable matrix size, and exercise
+    // solvers that reorder matrices according to size.
+    const int num_mtx = 8;
+
+    std::vector<I> p;
+    std::vector<I> cell_index;
+    for (auto m=0; m<num_mtx; ++m) {
+        auto &p_ref = p_base[m%2];
+        auto first = p.size();
+        for (auto i: p_ref) {
+            p.push_back(i + first);
         }
+        cell_index.push_back(first);
     }
+    cell_index.push_back(p.size());
+
+    auto group_size = cell_index.back();
+
+    // Build the capacitance and conductance vectors and
+    // populate with nonzero random values.
+
+    auto gen  = std::mt19937();
+    auto dist = std::uniform_real_distribution<T>(1, 2);
+
+    std::vector<T> Cm(group_size);
+    std::generate(Cm.begin(), Cm.end(), [&](){return dist(gen);});
+
+    std::vector<T> g(group_size);
+    std::generate(g.begin(), g.end(), [&](){return dist(gen);});
+
+    // Make the reference matrix and the gpu matrix
+    auto m_mc  = mc_state( p, cell_index, Cm, g); // on host
+    auto m_gpu = gpu_state(p, cell_index, Cm, g); // on gpu
+
+    // Voltage and current values
+    m_mc.assemble( 0.2, host_array(group_size, -64), host_array(group_size, 10));
+    m_mc.solve();
+    m_gpu.assemble(0.2, gpu_array(group_size, -64),  gpu_array(group_size, 10));
+    m_gpu.solve();
+
+    // Compare the GPU and CPU results.
+    // Cast results to float, because we are happy to ignore small differences
+    std::vector<float> result_h = util::assign_from(m_mc.solution);
+    std::vector<float> result_g = util::assign_from(on_host(m_gpu.solution));
+    EXPECT_TRUE(seq_almost_eq<float>(result_h, result_g));
+}
+
+// test that the flat and interleaved storage back ends produce identical results
+TEST(matrix, backends)
+{
+    using T = typename gpu::backend::value_type;
+    using I = typename gpu::backend::size_type;
+
+    using state_flat = gpu::matrix_state_flat<T, I>;
+    using state_intl = gpu::matrix_state_interleaved<T, I>;
+
+    using gpu_array  = typename gpu::backend::array;
+
+    // There are two matrix structures:
+    //
+    // p_1: 3 branches, 6 compartments
+    //
+    //           3
+    //          /.
+    // 0 - 1 - 2
+    //          \.
+    //           4
+    //            \.
+    //             5
+    //
+    // p_2: 5 branches, 8 compartments
+    //
+    //             4
+    //            /.
+    //           3
+    //          / \.
+    // 0 - 1 - 2   5
+    //          \.
+    //           6
+    //            \.
+    //             7
+
+    // The parent indexes that define the two matrix structures
+    std::vector<std::vector<I>>
+        p_base = { {0,0,1,2,2,4}, {0,0,1,2,3,3,2,6} };
+
+    // Make a set of matrices based on repeating this pattern.
+    // We assign the patterns round-robin, i.e. so that the input
+    // matrices will have alternating sizes of 6 and 8, which will
+    // test the solver with variable matrix size, and exercise
+    // solvers that reorder matrices according to size.
+    const int num_mtx = 200;
+
+    std::vector<I> p;
+    std::vector<I> cell_index;
+    for (auto m=0; m<num_mtx; ++m) {
+        auto &p_ref = p_base[m%2];
+        auto first = p.size();
+        for (auto i: p_ref) {
+            p.push_back(i + first);
+        }
+        cell_index.push_back(first);
+    }
+    cell_index.push_back(p.size());
+
+    auto group_size = cell_index.back();
+
+    // Build the capacitance and conductance vectors and
+    // populate with nonzero random values
+
+    auto gen  = std::mt19937();
+    gen.seed(100);
+    auto dist = std::uniform_real_distribution<T>(1, 200);
+
+    std::vector<T> Cm(group_size);
+    std::vector<T> g(group_size);
+    std::vector<T> v(group_size);
+    std::vector<T> i(group_size);
+
+    std::generate(Cm.begin(), Cm.end(), [&](){return dist(gen);});
+    std::generate(g.begin(), g.end(), [&](){return dist(gen);});
+    std::generate(v.begin(), v.end(), [&](){return dist(gen);});
+    std::generate(i.begin(), i.end(), [&](){return dist(gen);});
+
+    // Make the reference matrix and the gpu matrix
+    auto flat = state_flat(p, cell_index, Cm, g); // flat
+    auto intl = state_intl(p, cell_index, Cm, g); // interleaved
+
+    // voltage and current values
+    flat.assemble(0.02, on_gpu(v), on_gpu(i));
+    intl.assemble(0.02, on_gpu(v), on_gpu(i));
+
+    flat.solve();
+    intl.solve();
+
+    // Compare the results.
+    // We expect exact equality for the two gpu matrix implementations because both
+    // perform the same operations in the same order on the same inputs.
+    std::vector<double> x_flat = assign_from(on_host(flat.solution));
+    std::vector<double> x_intl = assign_from(on_host(intl.solution));
+    EXPECT_EQ(x_flat, x_intl);
 }
diff --git a/tests/unit/test_mechanisms.cpp b/tests/unit/test_mechanisms.cpp
index 7a4f57da152afcea0649254cb1456491efece61f..f91d3bf592a518d8c64c5dfbdb6d893ae0386e0e 100644
--- a/tests/unit/test_mechanisms.cpp
+++ b/tests/unit/test_mechanisms.cpp
@@ -17,7 +17,7 @@
 #include "mechanisms/multicore/test_kinlva.hpp"
 
 #include <initializer_list>
-#include <backends/fvm_multicore.hpp>
+#include <backends/multicore/fvm.hpp>
 #include <ion.hpp>
 #include <matrix.hpp>
 #include <memory/wrappers.hpp>
diff --git a/tests/unit/test_spikes.cpp b/tests/unit/test_spikes.cpp
index 2941698cc5b47cc2739e19f3eaca7f911001f0f1..35ef18e7d7b2574b79097f06d2a87aff7c429451 100644
--- a/tests/unit/test_spikes.cpp
+++ b/tests/unit/test_spikes.cpp
@@ -1,7 +1,7 @@
 #include "../gtest.h"
 
 #include <spike.hpp>
-#include <backends/fvm_multicore.hpp>
+#include <backends/multicore/fvm.hpp>
 
 using namespace nest::mc;
 
diff --git a/tests/unit/test_spikes.cu b/tests/unit/test_spikes.cu
index 01daf1562b7ffde7dfd4700f041ac63dacdae2d7..12bb89ee975b90020337b91b3acf4a4e5d482780 100644
--- a/tests/unit/test_spikes.cu
+++ b/tests/unit/test_spikes.cu
@@ -1,7 +1,7 @@
 #include "../gtest.h"
 
 #include <spike.hpp>
-#include <backends/fvm_gpu.hpp>
+#include <backends/gpu/fvm.hpp>
 
 using namespace nest::mc;
 
diff --git a/tests/unit/test_synapses.cpp b/tests/unit/test_synapses.cpp
index cd899a5b0fd4890cce0b8c14860340facada0055..3e9b8ef4aa5b903fbf494ab5e79d4a12581d5927 100644
--- a/tests/unit/test_synapses.cpp
+++ b/tests/unit/test_synapses.cpp
@@ -2,7 +2,7 @@
 #include "../test_util.hpp"
 
 #include <cell.hpp>
-#include <backends/fvm_multicore.hpp>
+#include <backends/multicore/fvm.hpp>
 
 #include <mechanisms/multicore/expsyn.hpp>
 #include <mechanisms/multicore/exp2syn.hpp>
diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt
index a42fc4ddd65cb0ad6fce7fe8376bac4caa25d48e..cea12373b6d9850ab82a2a92af510b5f0122aef1 100644
--- a/tests/validation/CMakeLists.txt
+++ b/tests/validation/CMakeLists.txt
@@ -18,6 +18,7 @@ set(VALIDATION_CUDA_SOURCES
     # unit tests
     validate_soma.cu
     validate_ball_and_stick.cu
+    validate_kinetic.cu
     validate_synapses.cu
 
     # support code
diff --git a/tests/validation/validate_kinetic.cu b/tests/validation/validate_kinetic.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4c32cd9345582fa0d141fb6e82b099ddf8f7b0ce
--- /dev/null
+++ b/tests/validation/validate_kinetic.cu
@@ -0,0 +1,13 @@
+#include "validate_kinetic.hpp"
+
+#include "../gtest.h"
+
+using lowered_cell = nest::mc::fvm::fvm_multicell<nest::mc::gpu::backend>;
+
+TEST(kinetic, kin1_numeric_ref) {
+    validate_kinetic_kin1<lowered_cell>();
+}
+
+TEST(kinetic, kinlva_numeric_ref) {
+    validate_kinetic_kinlva<lowered_cell>();
+}