diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4576c0622c0a98cf7ffda6e95535653fb1a14677..66e801105c5d83020583d14caad1346477366754 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,7 +38,6 @@ option(ARB_UNWIND "Use libunwind for stack trace printing if available" OFF)
 #----------------------------------------------------------
 
 option(ARB_WITH_GPU "build with GPU support" OFF)
-option(ARB_WITH_GPU_FINE_MATRIX "use optimized fine matrix solver" ON)
 
 option(ARB_WITH_MPI "build with MPI support" OFF)
 
@@ -234,9 +233,6 @@ endif()
 if(ARB_WITH_GPU)
     set(ARB_WITH_CUDA TRUE)
     target_compile_definitions(arbor-config-defs INTERFACE ARB_HAVE_GPU)
-    if(ARB_WITH_GPU_FINE_MATRIX)
-        target_compile_definitions(arbor-config-defs INTERFACE ARB_HAVE_GPU_FINE_MATRIX)
-    endif()
 
     target_include_directories(arborenv-private-deps INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 
diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt
index f0d0924be86d7c48b4e15fea08c81d80fb51bcf4..2e20955cacefaf6309117d3745a45668bc367e91 100644
--- a/arbor/CMakeLists.txt
+++ b/arbor/CMakeLists.txt
@@ -69,7 +69,6 @@ if(ARB_WITH_CUDA)
         backends/gpu/stimulus.cu
         backends/gpu/threshold_watcher.cu
         backends/gpu/matrix_assemble.cu
-        backends/gpu/matrix_interleave.cu
         backends/gpu/matrix_fine.cu
         backends/gpu/matrix_solve.cu
         backends/gpu/multi_event_stream.cpp
diff --git a/arbor/backends/gpu/forest.cpp b/arbor/backends/gpu/forest.cpp
index 6fe4de8544ff376176487186fa055f33f1516a16..e920147cb0fb117fe5a812a7ab5aff821fae3968 100644
--- a/arbor/backends/gpu/forest.cpp
+++ b/arbor/backends/gpu/forest.cpp
@@ -19,7 +19,7 @@ forest::forest(const std::vector<size_type>& p, const std::vector<size_type>& ce
             util::assign_from(
                 util::transform_view(
                     util::subrange_view(p, cell_cv_divs[c], cell_cv_divs[c+1]),
-                    [cell_start](unsigned i) {return i == -1 ? i : i - cell_start;}));
+                    [cell_start](size_type i) -> unsigned {return i == size_type(-1) ? i : i - cell_start;}));
 
         auto fine_tree = tree(cell_p);
 
diff --git a/arbor/backends/gpu/fvm.hpp b/arbor/backends/gpu/fvm.hpp
index b40b0a63a77c3f9d4b744148a9174708a303a8d3..0a56bf3cc536d16de072b5aa7e51b6418bfc4928 100644
--- a/arbor/backends/gpu/fvm.hpp
+++ b/arbor/backends/gpu/fvm.hpp
@@ -16,12 +16,7 @@
 
 #include "threshold_watcher.hpp"
 
-
-#ifdef ARB_HAVE_GPU_FINE_MATRIX
-    #include "matrix_state_fine.hpp"
-#else
-    #include "matrix_state_interleaved.hpp"
-#endif
+#include "matrix_state_fine.hpp"
 
 namespace arb {
 namespace gpu {
@@ -45,11 +40,7 @@ struct backend {
         return memory::on_host(v);
     }
 
-#ifdef ARB_HAVE_GPU_FINE_MATRIX
     using matrix_state = arb::gpu::matrix_state_fine<value_type, index_type>;
-#else
-    using matrix_state = arb::gpu::matrix_state_interleaved<value_type, index_type>;
-#endif
     using threshold_watcher = arb::gpu::threshold_watcher;
 
     using deliverable_event_stream = arb::gpu::deliverable_event_stream;
diff --git a/arbor/backends/gpu/matrix_interleave.cu b/arbor/backends/gpu/matrix_interleave.cu
deleted file mode 100644
index ae7fbdefb04a49b6b29ea871f7631d97c2bf58e1..0000000000000000000000000000000000000000
--- a/arbor/backends/gpu/matrix_interleave.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-#include <arbor/fvm_types.hpp>
-
-#include "matrix_common.hpp"
-#include "matrix_interleave.hpp"
-
-namespace arb {
-namespace gpu {
-
-// host side wrapper for the flat to interleaved operation
-void flat_to_interleaved(
-    const fvm_value_type* in,
-    fvm_value_type* out,
-    const fvm_index_type* sizes,
-    const fvm_index_type* starts,
-    unsigned padded_size,
-    unsigned num_vec)
-{
-    constexpr unsigned BlockWidth = impl::matrices_per_block();
-    constexpr unsigned LoadWidth  = impl::load_width();
-
-    flat_to_interleaved
-        <fvm_value_type, fvm_index_type, BlockWidth, LoadWidth>
-        (in, out, sizes, starts, padded_size, num_vec);
-}
-
-// host side wrapper for the interleave to flat operation
-void interleaved_to_flat(
-    const fvm_value_type* in,
-    fvm_value_type* out,
-    const fvm_index_type* sizes,
-    const fvm_index_type* starts,
-    unsigned padded_size,
-    unsigned num_vec)
-{
-    constexpr unsigned BlockWidth = impl::matrices_per_block();
-    constexpr unsigned LoadWidth  = impl::load_width();
-
-    interleaved_to_flat
-        <fvm_value_type, fvm_index_type, BlockWidth, LoadWidth>
-        (in, out, sizes, starts, padded_size, num_vec);
-}
-
-} // namespace gpu
-} // namespace arb
diff --git a/arbor/backends/gpu/matrix_interleave.hpp b/arbor/backends/gpu/matrix_interleave.hpp
deleted file mode 100644
index dc9c5674e4797f0732648ac5559518c2d62840ff..0000000000000000000000000000000000000000
--- a/arbor/backends/gpu/matrix_interleave.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-#pragma once
-
-#include "cuda_common.hpp"
-#include "matrix_common.hpp"
-
-namespace arb {
-namespace gpu {
-
-///////////////////////////////////////////////////////////////////////////////
-// For more information about the interleaved and flat storage formats for
-// Hines matrices, see src/backends/matrix_storage.md
-///////////////////////////////////////////////////////////////////////////////
-
-namespace kernels {
-// Data in a vector is to be interleaved into blocks of width BlockWidth.
-// The kernel assigns LoadWidth threads to each lane in the block.
-//
-// Note that all indexes can reasonably be represented by an unsigned 32-bit
-// integer, so we use unsigned explicitly.
-template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned Threads>
-__global__
-void flat_to_interleaved(
-    const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec)
-{
-    static_assert(BlockWidth*LoadWidth==Threads, "");
-
-    __shared__ T buffer[Threads];
-
-    const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
-    const unsigned lid = threadIdx.x;
-
-    const unsigned mtx_id   = tid/LoadWidth;
-    const unsigned mtx_lane = tid - mtx_id*LoadWidth;
-
-    const unsigned blk_id   = tid/(BlockWidth*LoadWidth);
-    const unsigned blk_row  = lid/BlockWidth;
-    const unsigned blk_lane = lid - blk_row*BlockWidth;
-
-    const unsigned blk_pos  = LoadWidth*blk_lane + blk_row;
-
-    const bool do_load  = mtx_id<num_vec;
-
-    // only threads that participate in loading access starts and sizes arrays
-    unsigned load_pos  = do_load? starts[mtx_id] + mtx_lane     : 0u;
-    const unsigned end = do_load? starts[mtx_id] + sizes[mtx_id]: 0u;
-    unsigned store_pos = blk_id*BlockWidth*padded_size + (blk_row*BlockWidth + blk_lane);
-
-    for (unsigned i=0u; i<padded_size; i+=LoadWidth) {
-        auto loaded = impl::npos<T>();
-        if (do_load && load_pos<end) {
-            loaded = in[load_pos];
-        }
-        buffer[lid] = loaded;
-        __syncthreads();
-        if (i+blk_row<padded_size) {
-            out[store_pos] = buffer[blk_pos];
-        }
-        __syncthreads();
-        load_pos  += LoadWidth;
-        store_pos += LoadWidth*BlockWidth;
-    }
-}
-
-// Note that all indexes can reasonably be represented by an unsigned 32-bit
-// integer, so we use unsigned explicitly.
-template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned THREADS>
-__global__
-void interleaved_to_flat(
-    const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec)
-{
-    static_assert(BlockWidth*LoadWidth==THREADS, "");
-
-    __shared__ T buffer[THREADS];
-
-    const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
-    const unsigned lid = threadIdx.x;
-
-    const unsigned mtx_id   = tid/LoadWidth;
-    const unsigned mtx_lane = tid - mtx_id*LoadWidth;
-
-    const unsigned blk_id   = tid/(BlockWidth*LoadWidth);
-    const unsigned blk_row  = lid/BlockWidth;
-    const unsigned blk_lane = lid - blk_row*BlockWidth;
-
-    const unsigned blk_pos  = LoadWidth*blk_lane + blk_row;
-
-    const bool do_store = mtx_id<num_vec;
-
-    // only threads that participate in storing access starts and sizes arrays
-    unsigned store_pos = do_store? starts[mtx_id] + mtx_lane     : 0u;
-    const unsigned end = do_store? starts[mtx_id] + sizes[mtx_id]: 0u;
-    unsigned load_pos  = blk_id*BlockWidth*padded_size + (blk_row*BlockWidth + blk_lane);
-
-    for (unsigned i=0u; i<padded_size; i+=LoadWidth) {
-        auto loaded = impl::npos<T>();
-        if (i+blk_row<padded_size) {
-            loaded = in[load_pos];
-        }
-        buffer[blk_pos] = loaded;
-        __syncthreads();
-        if (do_store && store_pos<end) {
-            out[store_pos] = buffer[lid];
-        }
-        __syncthreads();
-        load_pos  += LoadWidth*BlockWidth;
-        store_pos += LoadWidth;
-    }
-}
-
-} // namespace kernels
-
-// host side wrapper for the flat to interleaved operation
-template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth>
-void flat_to_interleaved(
-    const T* in,
-    T* out,
-    const I* sizes,
-    const I* starts,
-    unsigned padded_size,
-    unsigned num_vec)
-{
-    constexpr unsigned Threads = BlockWidth*LoadWidth;
-    const unsigned blocks = impl::block_count(num_vec, BlockWidth);
-
-    kernels::flat_to_interleaved
-        <T, I, BlockWidth, LoadWidth, Threads>
-        <<<blocks, Threads>>>
-        (in, out, sizes, starts, padded_size, num_vec);
-}
-
-// host side wrapper for the interleave to flat operation
-template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth>
-void interleaved_to_flat(
-    const T* in,
-    T* out,
-    const I* sizes,
-    const I* starts,
-    unsigned padded_size,
-    unsigned num_vec)
-{
-    constexpr unsigned Threads = BlockWidth*LoadWidth;
-    const unsigned blocks = impl::block_count(num_vec, BlockWidth);
-
-    kernels::interleaved_to_flat
-        <T, I, BlockWidth, LoadWidth, Threads>
-        <<<blocks, Threads>>>
-        (in, out, sizes, starts, padded_size, num_vec);
-}
-
-} // namespace gpu
-} // namespace arb
-
diff --git a/arbor/backends/gpu/matrix_state_fine.hpp b/arbor/backends/gpu/matrix_state_fine.hpp
index 88264c1ddd0f8b28da69c875c2156c4b1e88eb64..17c92471028fd888d63b2405ef600bf16f09a761 100644
--- a/arbor/backends/gpu/matrix_state_fine.hpp
+++ b/arbor/backends/gpu/matrix_state_fine.hpp
@@ -431,7 +431,9 @@ public:
 
             temp_u_shuffled[i] = -gij;
             invariant_d_tmp[i] += gij;
-            invariant_d_tmp[p[i]] += gij;
+            if (p[i]!=-1) {
+                invariant_d_tmp[p[i]] += gij;
+            }
         }
         u_shuffled = memory::make_const_view(temp_u_shuffled);
 
diff --git a/arbor/backends/gpu/matrix_state_interleaved.hpp b/arbor/backends/gpu/matrix_state_interleaved.hpp
deleted file mode 100644
index 89e63cb74b48fecd78501a4fa1d04725113be593..0000000000000000000000000000000000000000
--- a/arbor/backends/gpu/matrix_state_interleaved.hpp
+++ /dev/null
@@ -1,308 +0,0 @@
-#pragma once
-
-#include <arbor/assert.hpp>
-#include <arbor/fvm_types.hpp>
-#include <arbor/math.hpp>
-
-#include "memory/memory.hpp"
-#include "util/span.hpp"
-#include "util/partition.hpp"
-#include "util/rangeutil.hpp"
-#include "util/indirect.hpp"
-
-#include "cuda_common.hpp"
-#include "matrix_common.hpp"
-
-namespace arb {
-namespace gpu {
-
-// host side wrapper for interleaved matrix assembly kernel
-void assemble_matrix_interleaved(
-    fvm_value_type* d,
-    fvm_value_type* rhs,
-    const fvm_value_type* invariant_d,
-    const fvm_value_type* voltage,
-    const fvm_value_type* current,
-    const fvm_value_type* conductivity,
-    const fvm_value_type* cv_capacitance,
-    const fvm_value_type* area,
-    const fvm_index_type* sizes,
-    const fvm_index_type* starts,
-    const fvm_index_type* matrix_to_cell,
-    const fvm_value_type* dt_intdom,
-    const fvm_index_type* cell_to_intdom,
-    unsigned padded_size, unsigned num_mtx);
-
-// host side wrapper for interleaved matrix solver kernel
-void solve_matrix_interleaved(
-    fvm_value_type* rhs,
-    fvm_value_type* d,
-    const fvm_value_type* u,
-    const fvm_index_type* p,
-    const fvm_index_type* sizes,
-    int padded_size,
-    int num_mtx);
-
-// host side wrapper for the flat to interleaved operation
-void flat_to_interleaved(
-    const fvm_value_type* in,
-    fvm_value_type* out,
-    const fvm_index_type* sizes,
-    const fvm_index_type* starts,
-    unsigned padded_size,
-    unsigned num_vec);
-
-// host side wrapper for the interleave to flat operation
-void interleaved_to_flat(
-    const fvm_value_type* in,
-    fvm_value_type* out,
-    const fvm_index_type* sizes,
-    const fvm_index_type* starts,
-    unsigned padded_size,
-    unsigned num_vec);
-
-// A helper that performs the interleave operation on host memory.
-template <typename T, typename I>
-std::vector<T> flat_to_interleaved(
-        const std::vector<T>& in,
-        const std::vector<I>& sizes,
-        const std::vector<I>& starts,
-        unsigned block_width, unsigned num_vec, unsigned padded_length)
-{
-    auto num_blocks = impl::block_count(num_vec, block_width);
-    std::vector<T> out(num_blocks*block_width*padded_length, impl::npos<T>());
-    for (auto mtx: util::make_span(0u, num_vec)) {
-        auto block = mtx/block_width;
-        auto lane  = mtx%block_width;
-
-        auto len = sizes[mtx];
-        auto src = starts[mtx];
-        auto dst = block*(block_width*padded_length) + lane;
-        for (auto i: util::make_span(0, len)) {
-            out[dst] = in[src+i];
-            dst += block_width;
-        }
-    }
-    return out;
-};
-
-/// matrix state
-template <typename T, typename I>
-struct matrix_state_interleaved {
-    using value_type = T;
-    using index_type = I;
-
-    using array  = memory::device_vector<value_type>;
-    using iarray = memory::device_vector<index_type>;
-
-    using const_view = typename array::const_view_type;
-
-    // Permutation and index information required for forward and backward
-    // interleave-permutation of vectors.
-
-    // size of each matrix (after permutation in ascending size)
-    iarray matrix_sizes;
-    // start values corresponding to matrix i in the external storage
-    iarray matrix_index;
-    // map from matrix index (after permutation) to original cell index
-    iarray matrix_to_cell_index;
-
-    // Storage for the matrix and parent index in interleaved format.
-    // Includes the cv_capacitance, which is required for matrix assembly.
-
-    iarray parent_index;
-    array d;   // [μS]
-    array u;   // [μS]
-    array rhs; // [nA]
-
-    // required for matrix assembly
-    array cv_capacitance; // [pF]
-
-    // required for matrix assembly
-    array cv_area; // [μm^2]
-
-    // the invariant part of the matrix diagonal
-    array invariant_d;    // [μS]
-
-    iarray cell_to_intdom;
-
-    // the length of a vector required to store values for one
-    // matrix with padding
-    unsigned padded_size;
-
-    //  Storage for solution in uninterleaved format.
-    //  Used to hold the storage for passing to caller, and must be updated
-    //  after each call to the ::solve() method.
-    array solution_;
-
-    // default constructor
-    matrix_state_interleaved() = default;
-
-    // Construct matrix state for a set of matrices defined by parent_index p
-    // The matrix solver stores the matrix in an "interleaved" structure for
-    // optimal solution, which requires a significant amount of precomputing
-    // of indexes and data structures in the constructor.
-    //  cv_cap      // [pF]
-    //  face_cond   // [μS]
-    matrix_state_interleaved(const std::vector<index_type>& p,
-                 const std::vector<index_type>& cell_cv_divs,
-                 const std::vector<value_type>& cv_cap,
-                 const std::vector<value_type>& face_cond,
-                 const std::vector<value_type>& area,
-                 const std::vector<index_type>& cell_intdom)
-    {
-        arb_assert(cv_cap.size()    == p.size());
-        arb_assert(face_cond.size() == p.size());
-        arb_assert(cell_cv_divs.back()  == (index_type)p.size());
-
-        // Just because you never know.
-        arb_assert(cell_cv_divs.size() <= UINT_MAX);
-
-        using util::make_span;
-        using util::indirect_view;
-
-        // Convenience for commonly used type in this routine.
-        using svec = std::vector<index_type>;
-
-        //
-        // Sort matrices in descending order of size.
-        //
-
-        // Find the size of each matrix.
-        svec sizes;
-        for (auto cv_span: util::partition_view(cell_cv_divs)) {
-            sizes.push_back(cv_span.second-cv_span.first);
-        }
-        const auto num_mtx = sizes.size();
-
-        // Find permutations and sort indexes/sizes.
-        svec perm(num_mtx);
-        std::iota(perm.begin(), perm.end(), 0);
-        // calculate the permutation of matrices to put the in ascending size
-        util::stable_sort_by(perm, [&sizes](index_type i){ return sizes[i]; });
-        std::reverse(perm.begin(), perm.end());
-
-        svec sizes_p = util::assign_from(indirect_view(sizes, perm));
-
-        auto cell_to_cv = util::subrange_view(cell_cv_divs, 0, num_mtx);
-        svec cell_to_cv_p = util::assign_from(indirect_view(cell_to_cv, perm));
-
-        //
-        // Calculate dimensions required to store matrices.
-        //
-        constexpr unsigned block_dim = impl::matrices_per_block();
-        using impl::matrix_padding;
-
-        // To start, take simplest approach of assuming all matrices stored
-        // in blocks of the same dimension: padded_size
-        padded_size = math::round_up(sizes_p[0], matrix_padding());
-        const auto num_blocks = impl::block_count(num_mtx, block_dim);
-
-        const auto total_storage = num_blocks*block_dim*padded_size;
-
-        // calculate the interleaved and permuted p vector
-        constexpr auto npos = std::numeric_limits<index_type>::max();
-        std::vector<index_type> p_tmp(total_storage, npos);
-        for (auto mtx: make_span(0, num_mtx)) {
-            auto block = mtx/block_dim;
-            auto lane  = mtx%block_dim;
-
-            auto len = sizes_p[mtx];
-            auto src = cell_to_cv_p[mtx];
-            auto dst = block*(block_dim*padded_size) + lane;
-            for (auto i: make_span(0, len)) {
-                // the p indexes are always relative to the start of the p vector.
-                // the addition and subtraction of dst and src respectively is to convert from
-                // the original offset to the new padded and permuted offset.
-                p_tmp[dst+block_dim*i] = dst + block_dim*(p[src+i]-src);
-            }
-        }
-
-        d   = array(total_storage);
-        u   = array(total_storage);
-        rhs = array(total_storage);
-        parent_index = memory::make_const_view(p_tmp);
-
-        //
-        //  Calculate the invariant part of the matrix diagonal and the
-        //  upper diagonal on the host, then copy to the device.
-        //
-
-        std::vector<value_type> invariant_d_tmp(p.size(), 0);
-        std::vector<value_type> u_tmp(p.size(), 0);
-        auto face_conductance_tmp = memory::on_host(face_cond);
-        for (auto i: util::make_span(1u, p.size())) {
-            auto gij = face_conductance_tmp[i];
-
-            u_tmp[i] = -gij;
-            invariant_d_tmp[i] += gij;
-            invariant_d_tmp[p[i]] += gij;
-        }
-
-        // Helper that converts to interleaved format on the host, then copies to device
-        // memory, for use as an rvalue in an assignemt to a device vector.
-        auto interleave = [&] (std::vector<T>const& x) {
-            return memory::on_gpu(
-                flat_to_interleaved(x, sizes_p, cell_to_cv_p, block_dim, num_mtx, padded_size));
-        };
-        u              = interleave(u_tmp);
-        invariant_d    = interleave(invariant_d_tmp);
-        cv_area        = interleave(area);
-        cv_capacitance = interleave(cv_cap);
-
-        matrix_sizes = memory::make_const_view(sizes_p);
-        matrix_index = memory::make_const_view(cell_to_cv_p);
-        matrix_to_cell_index = memory::make_const_view(perm);
-        cell_to_intdom = memory::make_const_view(cell_intdom);
-
-        // Allocate space for storing the un-interleaved solution.
-        solution_ = array(p.size());
-    }
-
-    const_view solution() const {
-        return solution_;
-    }
-
-    // Assemble the matrix
-    // Afterwards the diagonal and RHS will have been set given dt, voltage and current.
-    //   dt_intdom         [ms]     (per integration domain)
-    //   voltage           [mV]     (per compartment)
-    //   current density   [A/m²]   (per compartment)
-    //   conductivity      [kS/m²]  (per compartment)
-    void assemble(const_view dt_intdom, const_view voltage, const_view current, const_view conductivity) {
-        assemble_matrix_interleaved
-            (d.data(), rhs.data(), invariant_d.data(),
-             voltage.data(), current.data(), conductivity.data(), cv_capacitance.data(), cv_area.data(),
-             matrix_sizes.data(), matrix_index.data(),
-             matrix_to_cell_index.data(),
-             dt_intdom.data(), cell_to_intdom.data(), padded_matrix_size(), num_matrices());
-
-    }
-
-    void solve() {
-        // Perform the Hines solve.
-        solve_matrix_interleaved(
-             rhs.data(), d.data(), u.data(), parent_index.data(), matrix_sizes.data(),
-             padded_matrix_size(), num_matrices());
-
-        // Copy the solution from interleaved to front end storage.
-        interleaved_to_flat
-            (rhs.data(), solution_.data(), matrix_sizes.data(), matrix_index.data(),
-             padded_matrix_size(), num_matrices());
-    }
-
-private:
-
-    // The number of matrices stored in the matrix state.
-    unsigned num_matrices() const {
-        return matrix_sizes.size();
-    }
-
-    // The full padded matrix size
-    unsigned padded_matrix_size() const {
-        return padded_size;
-    }
-};
-
-} // namespace gpu
-} // namespace arb
diff --git a/arbor/backends/multicore/matrix_state.hpp b/arbor/backends/multicore/matrix_state.hpp
index 7cba8e961a5fd1bcb8f1ce92f1f8897df8aabe92..84e3f041c3f25a2c41c6e50b827c7747fa2f00b2 100644
--- a/arbor/backends/multicore/matrix_state.hpp
+++ b/arbor/backends/multicore/matrix_state.hpp
@@ -61,7 +61,9 @@ public:
 
             u[i] = -gij;
             invariant_d[i] += gij;
-            invariant_d[p[i]] += gij;
+            if (p[i]!=-1) {
+                invariant_d[p[i]] += gij;
+            }
         }
     }
 
diff --git a/test/unit/test_matrix.cu b/test/unit/test_matrix.cu
index a4a763772cf7e6bd4f78fe23c658513c38b2e303..16f74a8c813bcf79c354b4c73d36dc2a387299a9 100644
--- a/test/unit/test_matrix.cu
+++ b/test/unit/test_matrix.cu
@@ -13,8 +13,6 @@
 
 #include "backends/gpu/cuda_common.hpp"
 #include "backends/gpu/matrix_state_flat.hpp"
-#include "backends/gpu/matrix_state_interleaved.hpp"
-#include "backends/gpu/matrix_interleave.hpp"
 #include "backends/gpu/matrix_state_fine.hpp"
 
 #include "../gtest.h"
@@ -23,7 +21,6 @@
 
 using namespace arb;
 
-using gpu::impl::npos;
 using util::make_span;
 using util::assign_from;
 using memory::on_gpu;
@@ -34,179 +31,6 @@ using testing::seq_almost_eq;
 using std::begin;
 using std::end;
 
-
-// Test the flat_to_interleaved and interleaved_to_flat operations for the
-// set of matrices defined by sizes and starts.
-// Applies the interleave to the vector in values, and checks this against
-// a reference result generated using a host side reference implementation.
-// Then the interleave result is reverse_interleaved, and the result is
-// compared to the original input.
-//
-// This is implemented in a separate function to facilitate testing on a
-// broad range of BlockWidth and LoadWidth compile time parameters.
-template <typename T, typename I, int BlockWidth, int LoadWidth>
-::testing::AssertionResult test_interleave(
-        std::vector<I> sizes,
-        std::vector<I> starts,
-        std::vector<T> values,
-        int padded_size)
-{
-    auto num_mtx = sizes.size();
-
-    auto in  = on_gpu(memory::make_const_view(values));
-    auto sizes_d = on_gpu(memory::make_const_view(sizes));
-    auto starts_d = on_gpu(memory::make_const_view(starts));
-
-    int packed_size = padded_size * BlockWidth * gpu::impl::block_count(num_mtx, BlockWidth);
-
-    // forward will hold the result of the interleave operation on the GPU
-    auto forward = memory::device_vector<T>(packed_size, npos<T>());
-
-    // find the reference interleaved values using host side implementation
-    auto baseline = gpu::flat_to_interleaved(values, sizes, starts, BlockWidth, num_mtx, padded_size);
-
-    // find the interleaved values on gpu
-    gpu::flat_to_interleaved<T, I, BlockWidth, LoadWidth>(in.data(), forward.data(), sizes_d.data(), starts_d.data(), padded_size, num_mtx);
-
-    std::vector<T> result_f = assign_from(on_host(forward));
-    std::vector<T> expected = gpu::flat_to_interleaved(values, sizes, starts, BlockWidth, num_mtx, padded_size);
-    const auto forward_success = (result_f==expected);
-    if (!forward_success) {
-        return ::testing::AssertionFailure() << "interleave to flat failed: BlockWidth "
-            << BlockWidth << ", LoadWidth " << LoadWidth << "\n";
-    }
-
-    // backward will hold the result of reverse interleave on the GPU
-    auto backward = memory::device_vector<T>(values.size(), npos<T>());
-    gpu::interleaved_to_flat<T, I, BlockWidth, LoadWidth>(forward.data(), backward.data(), sizes_d.data(), starts_d.data(), padded_size, num_mtx);
-
-    std::vector<T> result_b = assign_from(on_host(backward));
-
-    // we expect that the result of the reverse permutation is the original input vector
-    const auto backward_success = (result_b==values);
-    if (!backward_success) {
-        return ::testing::AssertionFailure() << "flat to interleave failed: BlockWidth "
-            << BlockWidth << ", LoadWidth " << LoadWidth << "\n";
-    }
-
-    return ::testing::AssertionSuccess();
-}
-
-// test conversion to and from interleaved back end storage format
-TEST(matrix, interleave)
-{
-    using I = int;
-    using T = int;
-    using ivec = std::vector<I>;
-    using tvec = std::vector<T>;
-
-    // simple case with 4 matrices of length 2
-    {
-        const int padded_size = 2;
-        const int num_mtx = 4;
-        ivec sizes(num_mtx, padded_size);
-
-        // find the start position of each matrix in the flat storage
-        // we are assuming that the matrices are unpermuted
-        ivec starts(num_mtx, 0);
-        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
-
-        tvec values(padded_size*num_mtx);
-        std::iota(values.begin(), values.end(), 0);
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 1>(sizes, starts, values, padded_size)));
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 2>(sizes, starts, values, padded_size)));
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 3>(sizes, starts, values, padded_size)));
-    }
-
-    // another small example with matrices of differing lengths
-    {
-        const int padded_size = 8;
-        const int num_mtx = 8;
-        ivec sizes = {6, 5, 4, 4, 3, 2, 2, 1};
-
-        // find the start position of each matrix in the flat storage
-        // we are assuming that the matrices are unpermuted
-        ivec starts(num_mtx, 0);
-        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
-
-        tvec values(util::sum(sizes));
-        std::iota(values.begin(), values.end(), 0);
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 1>(sizes, starts, values, padded_size)));
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 2>(sizes, starts, values, padded_size)));
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 3>(sizes, starts, values, padded_size)));
-    }
-
-    // more interesting case...
-    {
-        const int padded_size = 256;
-        const int num_mtx = 1000;
-        ivec sizes(num_mtx);
-        for (auto i: make_span(  0, 100)) sizes[i] = 250;
-        for (auto i: make_span(100, 103)) sizes[i] = 213;
-        for (auto i: make_span(103, 150)) sizes[i] = 200;
-        for (auto i: make_span(150, 500)) sizes[i] = 178;
-        for (auto i: make_span(500, 999)) sizes[i] = 6;
-
-        // we are assuming that the matrices are unpermuted
-        ivec starts(num_mtx, 0);
-        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
-
-        tvec values(util::sum(sizes));
-        std::iota(values.begin(), values.end(), 0);
-
-        // test in "full" 1024 thread configuration with 32 threads per matrix
-        EXPECT_TRUE((test_interleave<T, I, 32, 32>(sizes, starts, values, padded_size)));
-    }
-}
-
-// test that the flat and interleaved storage back ends produce identical results
+// test that the flat and fine gpu matrix back ends produce equivalent results
 TEST(matrix, backends)
 {
@@ -214,7 +38,6 @@ TEST(matrix, backends)
     using I = fvm_index_type;
 
     using state_flat = gpu::matrix_state_flat<T, I>;
-    using state_intl = gpu::matrix_state_interleaved<T, I>;
     using state_fine = gpu::matrix_state_fine<T, I>;
 
     using gpu_array  = memory::device_vector<T>;
@@ -291,7 +114,6 @@ TEST(matrix, backends)
 
     // Make the reference matrix and the gpu matrix
     auto flat = state_flat(p, cell_cv_divs, Cm, g, area, cell_to_intdom); // flat
-    auto intl = state_intl(p, cell_cv_divs, Cm, g, area, cell_to_intdom); // interleaved
-    auto fine = state_fine(p, cell_cv_divs, Cm, g, area, cell_to_intdom); // interleaved
+    auto fine = state_fine(p, cell_cv_divs, Cm, g, area, cell_to_intdom); // fine
 
     // Set the integration times for the cells to be between 0.01 and 0.02 ms.
@@ -307,18 +129,15 @@ TEST(matrix, backends)
     auto gpu_mg = on_gpu(mg);
 
     flat.assemble(gpu_dt, gpu_v, gpu_i, gpu_mg);
-    intl.assemble(gpu_dt, gpu_v, gpu_i, gpu_mg);
     fine.assemble(gpu_dt, gpu_v, gpu_i, gpu_mg);
 
     flat.solve();
-    intl.solve();
     fine.solve();
 
     // Compare the results.
-    // We expect exact equality for the two gpu matrix implementations because both
-    // perform the same operations in the same order on the same inputs.
+    // The flat and fine back ends should agree closely, but exact equality is
+    // not expected between them (see below).
     std::vector<double> x_flat = assign_from(on_host(flat.solution()));
-    std::vector<double> x_intl = assign_from(on_host(intl.solution()));
     // as the fine algorithm contains atomics the solution might be slightly
-    // different from flat and interleaved
+    // different from the flat solution
     std::vector<double> x_fine = assign_from(on_host(fine.solution()));
@@ -329,6 +148,5 @@ TEST(matrix, backends)
                 util::count_along(x_flat),
                 [&](unsigned i) {return std::abs(x_flat[i] - x_fine[i]);}));
 
-    EXPECT_EQ(x_flat, x_intl);
     EXPECT_LE(max_diff_fine, 1e-12);
 }