diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt index c8e7c339d6ad689183977fa2e6660fdcce7afd82..2257032d3f4754a461d15d9c837a9ea95c48c05c 100644 --- a/arbor/CMakeLists.txt +++ b/arbor/CMakeLists.txt @@ -67,7 +67,6 @@ if(ARB_WITH_CUDA) backends/gpu/matrix_assemble.cu backends/gpu/matrix_interleave.cu backends/gpu/matrix_fine.cu - backends/gpu/matrix_fine.cpp backends/gpu/matrix_solve.cu backends/gpu/multi_event_stream.cpp backends/gpu/multi_event_stream.cu diff --git a/arbor/backends/gpu/managed_ptr.hpp b/arbor/backends/gpu/managed_ptr.hpp deleted file mode 100644 index 98202d3234645bbed8d389c20df4ea8c9ec1d0af..0000000000000000000000000000000000000000 --- a/arbor/backends/gpu/managed_ptr.hpp +++ /dev/null @@ -1,114 +0,0 @@ -#pragma once - -#include <cuda.h> -#include <cuda_runtime.h> - -#include <memory/allocator.hpp> - -namespace arb { -namespace gpu { - -// used to indicate that the type pointed to by the managed_ptr is to be -// constructed in the managed_ptr constructor -struct construct_in_place_tag {}; - -// Like std::unique_ptr, but for CUDA managed memory. -// Handles memory allocation and freeing, and the construction and destruction -// of the type being stored in the allocated memory. -// Implemented as a stand alone type instead of as a std::unique_ptr with -// custom desctructor so that __device__ annotation can be added to members -// like ::get, ::operator*, etc., which enables the use of the smart pointer -// in device side code. -// -// It is very strongly recommended that the helper make_managed_ptr be used -// instead of directly constructing the managed_ptr. -template <typename T> -class managed_ptr { -public: - - using element_type = T; - using pointer = element_type*; - using reference = element_type&; - - managed_ptr(const managed_ptr& other) = delete; - - // Allocate memory and construct in place using args. - // This is an extension over the std::unique_ptr interface, because the - // point of the wrapper is to hide the complexity of allocating managed - // memory and constructing a type in place. - template <typename... Args> - managed_ptr(construct_in_place_tag, Args&&... args) { - memory::managed_allocator<element_type> allocator; - data_ = allocator.allocate(1u); - synchronize(); - data_ = new (data_) element_type(std::forward<Args>(args)...); - } - - managed_ptr(managed_ptr&& other) { - std::swap(other.data_, data_); - } - - // pointer to the managed object - __host__ __device__ - pointer get() const { - return data_; - } - - // return a reference to the managed object - __host__ __device__ - reference operator *() const { - return *data_; - } - - // return a reference to the managed object - __host__ __device__ - pointer operator->() const { - return get(); - } - - managed_ptr& operator=(managed_ptr&& other) { - swap(std::move(other)); - return *this; - } - - ~managed_ptr() { - if (is_allocated()) { - memory::managed_allocator<element_type> allocator; - synchronize(); // required to ensure that memory is not in use on GPU - data_->~element_type(); - allocator.deallocate(data_, 1u); - } - } - - void swap(managed_ptr&& other) { - std::swap(other.data_, data_); - } - - __host__ __device__ - operator bool() const { - return is_allocated(); - } - - void synchronize() const { - cudaDeviceSynchronize(); - } - -private: - __host__ __device__ - bool is_allocated() const { - return data_!=nullptr; - } - - pointer data_ = nullptr; -}; - -// The correct way to construct a type in managed memory. 
-// Equivalent to std::make_unique_ptr for std::unique_ptr -template <typename T, typename... Args> -managed_ptr<T> make_managed_ptr(Args&&... args) { - return managed_ptr<T>(construct_in_place_tag(), std::forward<Args>(args)...); -} - -} // namespace gpu -} // namespace arb - diff --git a/arbor/backends/gpu/matrix_fine.cpp b/arbor/backends/gpu/matrix_fine.cpp deleted file mode 100644 index b1d7a2279454ebdb89c3b484bc5c9e4b950f8b5e..0000000000000000000000000000000000000000 --- a/arbor/backends/gpu/matrix_fine.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include <ostream> - -#include <cuda_runtime.h> - -#include "memory/cuda_wrappers.hpp" -#include "util/span.hpp" - -#include "matrix_fine.hpp" - -namespace arb { -namespace gpu { - -level::level(unsigned branches): - num_branches(branches) -{ - using memory::cuda_malloc_managed; - - using arb::memory::cuda_malloc_managed; - if (num_branches!=0) { - lengths = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - parents = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - cudaDeviceSynchronize(); - } -} - -level::level(level&& other) { - std::swap(other.lengths, this->lengths); - std::swap(other.parents, this->parents); - std::swap(other.num_branches, this->num_branches); - std::swap(other.max_length, this->max_length); - std::swap(other.data_index, this->data_index); -} - -level::level(const level& other) { - using memory::cuda_malloc_managed; - - num_branches = other.num_branches; - max_length = other.max_length; - data_index = other.data_index; - if (num_branches!=0) { - lengths = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - parents = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - cudaDeviceSynchronize(); - std::copy(other.lengths, other.lengths+num_branches, lengths); - std::copy(other.parents, other.parents+num_branches, parents); - } -} - -level::~level() { - if (num_branches!=0) { - cudaDeviceSynchronize(); // to ensure that managed memory has been freed - if (lengths) arb::memory::cuda_free(lengths); - if (parents) arb::memory::cuda_free(parents); - } -} - -std::ostream& operator<<(std::ostream& o, const level& l) { - cudaDeviceSynchronize(); - o << "branches:" << l.num_branches - << " max_len:" << l.max_length - << " data_idx:" << l.data_index - << " lengths:["; - for (auto i: util::make_span(l.num_branches)) o << l.lengths[i] << " "; - o << "] parents:["; - for (auto i: util::make_span(l.num_branches)) o << l.parents[i] << " "; - o << "]"; - return o; -} - -} // namespace gpu -} // namespace arb diff --git a/arbor/backends/gpu/matrix_fine.cu b/arbor/backends/gpu/matrix_fine.cu index 6fe430dfe3640e0346884674323e2db13d59c1ee..68bf0bcd075b5b81908566084944e54fae43a074 100644 --- a/arbor/backends/gpu/matrix_fine.cu +++ b/arbor/backends/gpu/matrix_fine.cu @@ -70,7 +70,7 @@ void assemble_matrix_fine( // conductance (gi) values have units μS (micro-Siemens). // See the model documentation in docs/model for more information. 
T oodt_factor = T(1e-3)/dt; - T area_factor = T(1e-3)*area[tid]; + T area_factor = T(1e-3)*area[tid]; const auto gi = oodt_factor*cv_capacitance[tid] + area_factor*conductivity[tid]; const auto pid = perm[tid]; @@ -100,30 +100,40 @@ void solve_matrix_fine( T* rhs, T* d, const T* u, - const level* levels, - const unsigned* levels_start, - unsigned* num_matrix, // number of packed matrices = number of cells - unsigned* padded_size) + const level_metadata* level_meta, + const fvm_index_type* level_lengths, + const fvm_index_type* level_parents, + const fvm_index_type* block_index, + fvm_index_type* num_matrix, // number of packed matrices = number of cells + fvm_index_type* padded_size) { const auto tid = threadIdx.x; const auto bid = blockIdx.x; - const auto first_level = levels_start[bid]; - const auto num_levels = levels_start[bid + 1] - first_level; - const auto block_levels = &levels[first_level]; + const auto first_level = block_index[bid]; + const auto num_levels = block_index[bid + 1] - first_level; + + const auto block_level_meta = &level_meta[first_level]; // backward substitution for (unsigned l=0; l<num_levels-1; ++l) { - const auto& lvl = block_levels[l]; + // Metadata for this level and the next level + const auto& lvl_meta = block_level_meta[l]; + const auto& next_lvl_meta = block_level_meta[l+1]; + + // Addresses of the first elements of level_lengths and level_parents + // that belong to this level + const auto lvl_lengths = level_lengths + lvl_meta.level_data_index; + const auto lvl_parents = level_parents + lvl_meta.level_data_index; - const unsigned width = lvl.num_branches; + const unsigned width = lvl_meta.num_branches; // Perform backward substitution for each branch on this level. // One thread per branch. if (tid < width) { - const unsigned len = lvl.lengths[tid]; - unsigned pos = lvl.data_index + tid; + const unsigned len = lvl_lengths[tid]; + unsigned pos = lvl_meta.matrix_data_index + tid; // Zero diagonal term implies dt==0; just leave rhs (for whole matrix) // alone in that case. @@ -149,8 +159,8 @@ void solve_matrix_fine( // Update d and rhs at the parent node of this branch. // A parent may have more than one contributing to it, so we use // atomic updates to avoid races conditions. 
- const unsigned parent_index = block_levels[l+1].data_index; - const unsigned p = parent_index + lvl.parents[tid]; + const unsigned parent_index = next_lvl_meta.matrix_data_index; + const unsigned p = parent_index + lvl_parents[tid]; //d[p] -= factor * u[pos]; cuda_atomic_add(d +p, -factor*u[pos]); //rhs[p] -= factor * rhs[pos]; @@ -161,13 +171,15 @@ void solve_matrix_fine( } { - // the levels are sorted such that the root is the last level - const auto& lvl = block_levels[num_levels-1]; + // The levels are sorted such that the root is the last level + const auto& last_lvl_meta = block_level_meta[num_levels-1]; + const auto lvl_lengths = level_lengths + last_lvl_meta.level_data_index; + const unsigned width = num_matrix[bid]; if (tid < width) { - const unsigned len = lvl.lengths[tid]; - unsigned pos = lvl.data_index + tid; + const unsigned len = lvl_lengths[tid]; + unsigned pos = last_lvl_meta.matrix_data_index + tid; if (d[pos]!=0) { @@ -200,10 +212,15 @@ void solve_matrix_fine( // take great care with loop limits decrementing unsigned counter l for (unsigned l=num_levels-1; l>0; --l) { - const auto& lvl = block_levels[l-1]; + const auto& lvl_meta = block_level_meta[l-1]; + + // Addresses of the first elements of level_lengths and level_parents + // that belong to this level + const auto lvl_lengths = level_lengths + lvl_meta.level_data_index; + const auto lvl_parents = level_parents + lvl_meta.level_data_index; - const unsigned width = lvl.num_branches; - const unsigned parent_index = block_levels[l].data_index; + const unsigned width = lvl_meta.num_branches; + const unsigned parent_index = block_level_meta[l].matrix_data_index; __syncthreads(); @@ -211,12 +228,12 @@ void solve_matrix_fine( // One thread per branch. if (tid < width) { // Load the rhs value for the parent node of this branch. - const unsigned p = parent_index + lvl.parents[tid]; + const unsigned p = parent_index + lvl_parents[tid]; T rhsp = rhs[p]; // Find the index of the first node in this branch. - const unsigned len = lvl.lengths[tid]; - unsigned pos = lvl.data_index + (len-1)*width + tid; + const unsigned len = lvl_lengths[tid]; + unsigned pos = lvl_meta.matrix_data_index + (len-1)*width + tid; if (d[pos]!=0) { // each branch perform substitution @@ -295,23 +312,25 @@ void assemble_matrix_fine( // | | | | | | // // levels = [L0, L1, L2, L3, L4, L5, L6, L7, ... ] -// levels_start = [0, 3, 5, 8, ...] +// block_index = [0, 3, 5, 8, ...] // num_levels = [3, 2, 3, ...] // num_cells = [2, 3, ...] 
// num_blocks = level_start.size() - 1 = num_levels.size() = num_cells.size() void solve_matrix_fine( fvm_value_type* rhs, - fvm_value_type* d, // diagonal values - const fvm_value_type* u, // upper diagonal (and lower diagonal as the matrix is SPD) - const level* levels, // pointer to an array containing level meta-data for all blocks - const unsigned* levels_start, // start index into levels for each cuda block - unsigned* num_cells, // he number of cells packed into this single matrix - unsigned* padded_size, // length of rhs, d, u, including padding - unsigned num_blocks, // nuber of blocks - unsigned blocksize) // size of each block + fvm_value_type* d, // diagonal values + const fvm_value_type* u, // upper diagonal (and lower diagonal as the matrix is SPD) + const level_metadata* level_meta, // information pertaining to each level + const fvm_index_type* level_lengths, // lengths of branches of every level concatenated + const fvm_index_type* level_parents, // parents of branches of every level concatenated + const fvm_index_type* block_index, // start index into levels for each cuda block + fvm_index_type* num_cells, // the number of cells packed into this single matrix + fvm_index_type* padded_size, // length of rhs, d, u, including padding + unsigned num_blocks, // number of blocks + unsigned blocksize) // size of each block { kernels::solve_matrix_fine<<<num_blocks, blocksize>>>( - rhs, d, u, levels, levels_start, + rhs, d, u, level_meta, level_lengths, level_parents, block_index, num_cells, padded_size); } diff --git a/arbor/backends/gpu/matrix_fine.hpp b/arbor/backends/gpu/matrix_fine.hpp index 1a15bd7fc7244a1cad8418d673c8d198715bd54f..77a112bec70545b03193d619788ed099202f0070 100644 --- a/arbor/backends/gpu/matrix_fine.hpp +++ b/arbor/backends/gpu/matrix_fine.hpp @@ -5,37 +5,13 @@ namespace arb { namespace gpu { -struct level { - level() = default; - - level(unsigned branches); - level(level&& other); - level(const level& other); - - ~level(); - - unsigned num_branches = 0; // Number of branches +struct level_metadata { + unsigned num_branches = 0; // Number of branches in a level unsigned max_length = 0; // Length of the longest branch - unsigned data_index = 0; // Index into data values of the first branch - - // The lengths and parents vectors are raw pointers to managed memory, - // so there is need for tricksy deep copy of this type to GPU. - - // An array holding the length of each branch in the level. - // length: num_branches. - unsigned* lengths = nullptr; - - // An array with the index of the parent branch for each branch on this level. - // length: num_branches. - // When performing backward/forward substitution we need to update/read - // data values for the parent node for each branch. - // This can be done easily if we know where the parent branch is located - // on the next level. 
- unsigned* parents = nullptr; + unsigned matrix_data_index = 0; // Index into data values (d, u, rhs) of the first branch + unsigned level_data_index = 0; // Index into the level data (lengths, parents) of the first branch in this level }; -std::ostream& operator<<(std::ostream& o, const level& l); - // C wrappers around kernels void gather( const fvm_value_type* from, @@ -66,14 +42,16 @@ void assemble_matrix_fine( void solve_matrix_fine( fvm_value_type* rhs, - fvm_value_type* d, // diagonal values - const fvm_value_type* u, // upper diagonal (and lower diagonal as the matrix is SPD) - const level* levels, // pointer to an array containing level meta-data for all blocks - const unsigned* levels_end, // end index (exclusive) into levels for each cuda block - unsigned* num_cells, // he number of cells packed into this single matrix - unsigned* padded_size, // length of rhs, d, u, including padding - unsigned num_blocks, // nuber of blocks - unsigned blocksize); // size of each block + fvm_value_type* d, // diagonal values + const fvm_value_type* u, // upper diagonal (and lower diagonal as the matrix is SPD) + const level_metadata* level_meta, // information pertaining to each level + const fvm_index_type* level_lengths, // lengths of branches of every level concatenated + const fvm_index_type* level_parents, // parents of branches of every level concatenated + const fvm_index_type* block_index, // start index into levels for each cuda block + fvm_index_type* num_cells, // the number of cells packed into this single matrix + fvm_index_type* padded_size, // length of rhs, d, u, including padding + unsigned num_blocks, // number of blocks + unsigned blocksize); // size of each block } // namespace gpu } // namespace arb diff --git a/arbor/backends/gpu/matrix_state_fine.hpp b/arbor/backends/gpu/matrix_state_fine.hpp index e2459166eda64531cba0d3079c55a670697db8fb..88264c1ddd0f8b28da69c875c2156c4b1e88eb64 100644 --- a/arbor/backends/gpu/matrix_state_fine.hpp +++ b/arbor/backends/gpu/matrix_state_fine.hpp @@ -83,8 +83,7 @@ public: using const_view = typename array::const_view_type; using iarray = memory::device_vector<size_type>; - template <typename ValueType> - using managed_vector = std::vector<ValueType, memory::managed_allocator<ValueType>>; + using metadata_array = memory::device_vector<level_metadata>; iarray cv_to_cell; @@ -92,43 +91,52 @@ public: array u; // [μS] array rhs; // [nA] - // required for matrix assembly - - array cv_area; // [μm^2] - + // Required for matrix assembly + array cv_area; // [μm^2] array cv_capacitance; // [pF] - // the invariant part of the matrix diagonal + // Invariant part of the matrix diagonal array invariant_d; // [μS] - // for storing the solution in unpacked format + // Solution in unpacked format array solution_; + // Maps cell to integration domain iarray cell_to_intdom; - // the maximum nuber of branches in each level per block + // Maximum number of branches in each level per block unsigned max_branches_per_level; - // number of rows in matrix + // Number of rows in matrix unsigned matrix_size; - // number of cells + // Number of cells unsigned num_cells; - managed_vector<unsigned> num_cells_in_block; + iarray num_cells_in_block; - // end of the data of each level - managed_vector<unsigned> data_partition; + // End of the data of each level + iarray data_partition; std::size_t data_size; - // the meta data for each level for each block layed out linearly in memory - managed_vector<level> levels; - // the start of the levels of each block - // block b owns {
leves[level_start[b]], ..., leves[level_start[b+1] - 1] } + // Metadata for each level + // Includes indices into d, u, rhs and level_lengths, level_parents + metadata_array level_meta; + + // Stores the lengths (number of compartments) of the branches of each + // level sequentially in memory. Indexed by level_metadata::level_data_index + iarray level_lengths; + + // Stores the indices of the parent of each of the branches in each + // level sequentially in memory. Indexed by level_metadata::level_data_index + iarray level_parents; + + // Stores the indices to the first level belonging to each block + // block b owns { levels[block_index[b]], ..., levels[block_index[b+1] - 1] } // there is an additional entry at the end of the vector to make the above - // compuation save - managed_vector<unsigned> levels_start; + // computation safe + iarray block_index; - // permutation from front end storage to packed storage + // Permutation from front end storage to packed storage // `solver_format[perm[i]] = external_format[i]` iarray perm; @@ -161,7 +169,10 @@ public: unsigned current_block = 0; std::vector<unsigned> block_num_branches_per_depth; std::vector<unsigned> block_ix(num_cells); - num_cells_in_block.resize(1, 0); + + // Accumulate num cells in block in a temporary vector to be copied to the device + std::vector<size_type> temp_ncells_in_block; + temp_ncells_in_block.resize(1, 0); // branch_map = branch_maps[block] is a branch map for each cuda block // branch_map[depth] is list of branches is this level @@ -209,7 +220,7 @@ public: if (fits_current_block) { // put the cell into current block block_ix[c] = current_block; - num_cells_in_block[block_ix[c]] += 1; + temp_ncells_in_block[block_ix[c]] += 1; // and increment counter for (auto i: make_span(cell_num_levels)) { block_num_branches_per_depth[i] += cell_num_branches_per_depth[i]; @@ -217,7 +228,7 @@ public: } else { // otherwise start a new block block_ix[c] = current_block + 1; - num_cells_in_block.push_back(1); + temp_ncells_in_block.push_back(1); branch_maps.resize(branch_maps.size()+1); current_block += 1; // and reset counter @@ -231,6 +242,7 @@ public: } } } + num_cells_in_block = memory::make_const_view(temp_ncells_in_block); // the branch map for the block in which we put the cell @@ -300,53 +312,63 @@ public: } } - unsigned total_num_levels = std::accumulate( - branch_maps.begin(), branch_maps.end(), 0, - [](unsigned value, decltype(branch_maps[0])& l) { - return value + l.size();}); - - // construct description for the set of branches on each level for each + // Construct description for the set of branches on each level for each // block. This is later used to sort the branches in each block in each // level into conineous chunks which are easier to read for the cuda // kernel. 
- levels.reserve(total_num_levels); - levels_start.reserve(branch_maps.size() + 1); - levels_start.push_back(0); - data_partition.reserve(branch_maps.size()); - // offset into the packed data format, used to apply permutation on data + + // Accumulate metadata about the levels, level lengths, level parents, + // data_partition and block indices in temporary vectors to be copied to the device + std::vector<level_metadata> temp_meta; + std::vector<size_type> temp_lengths, temp_parents, temp_data_part, temp_block_index; + + temp_block_index.reserve(branch_maps.size() + 1); + temp_block_index.push_back(0); + temp_data_part.reserve(branch_maps.size()); + + // Offset into the packed data format, used to apply permutation on data auto pos = 0u; + // Offset into the packed data format, used to access level_lengths and level_parents + auto data_start = 0u; for (const auto& branch_map: branch_maps) { for (const auto& lvl_branches: branch_map) { - level lvl(lvl_branches.size()); + level_metadata lvl_meta; + std::vector<size_type> lvl_lengths(lvl_branches.size()), lvl_parents(lvl_branches.size()); + + lvl_meta.num_branches = lvl_branches.size(); + lvl_meta.matrix_data_index = pos; + lvl_meta.level_data_index = data_start; // The length of the first branch is the upper bound on branch // length as they are sorted in descending order of length. - lvl.max_length = lvl_branches.front().length; - lvl.data_index = pos; + lvl_meta.max_length = lvl_branches.front().length; unsigned bi = 0u; for (const auto& b: lvl_branches) { // Set the length of the branch. - lvl.lengths[bi] = b.length; + lvl_lengths[bi] = b.length; // Set the parent indexes. During the forward and backward // substitution phases each branch accesses the last node in // its parent branch. auto index = b.parent_id==npos? 
npos: branch_locs[b.parent_id].index; - lvl.parents[bi] = index; + lvl_parents[bi] = index; ++bi; } - pos += lvl.max_length*lvl.num_branches; + data_start+= lvl_meta.num_branches; + pos += lvl_meta.max_length*lvl_meta.num_branches; - levels.push_back(std::move(lvl)); + temp_meta.push_back(std::move(lvl_meta)); + util::append(temp_lengths, lvl_lengths); + util::append(temp_parents, lvl_parents); } - auto prev_end = levels_start.back(); - levels_start.push_back(prev_end + branch_map.size()); - data_partition.push_back(pos); + auto prev_end = temp_block_index.back(); + temp_block_index.push_back(prev_end + branch_map.size()); + temp_data_part.push_back(pos); } - data_size = pos; + data_size = pos; // set matrix state matrix_size = p.size(); @@ -356,13 +378,14 @@ public: std::vector<size_type> perm_tmp(matrix_size); for (auto block: make_span(branch_maps.size())) { const auto& branch_map = branch_maps[block]; - const auto first_level = levels_start[block]; + const auto first_level = temp_block_index[block]; - for (auto i: make_span(levels_start[block + 1] - first_level)) { - const auto& l = levels[first_level + i]; + for (auto i: make_span(temp_block_index[block + 1] - first_level)) { + const auto& l = temp_meta[first_level + i]; for (auto j: make_span(l.num_branches)) { const auto& b = branch_map[i][j]; - auto to = l.data_index + j + l.num_branches*(l.lengths[j]-1); + auto j_lvl_length = temp_lengths[l.level_data_index + j]; + auto to = l.matrix_data_index + j + l.num_branches*(j_lvl_length-1); auto from = b.start_idx; for (auto k: make_span(b.length)) { perm_tmp[from + k] = to - k*l.num_branches; @@ -371,6 +394,12 @@ public: } } + level_meta = memory::make_const_view(temp_meta); + level_lengths = memory::make_const_view(temp_lengths); + level_parents = memory::make_const_view(temp_parents); + data_partition = memory::make_const_view(temp_data_part); + block_index = memory::make_const_view(temp_block_index); + auto perm_balancing = trees.permutation(); // apppy permutation form balancing @@ -395,14 +424,16 @@ public: // the invariant part of d is stored in in flat form std::vector<value_type> invariant_d_tmp(matrix_size, 0); - managed_vector<value_type> u_tmp(matrix_size, 0); + std::vector<value_type> temp_u_shuffled(matrix_size, 0); + array u_shuffled; for (auto i: make_span(1u, matrix_size)) { auto gij = face_conductance[i]; - u_tmp[i] = -gij; + temp_u_shuffled[i] = -gij; invariant_d_tmp[i] += gij; invariant_d_tmp[p[i]] += gij; } + u_shuffled = memory::make_const_view(temp_u_shuffled); // the matrix components u, d and rhs are stored in packed form auto nan = std::numeric_limits<double>::quiet_NaN(); @@ -410,8 +441,8 @@ public: u = array(data_size, nan); rhs = array(data_size, nan); - // transform u_tmp values into packed u vector. - flat_to_packed(u_tmp, u); + // transform u_shuffled values into packed u vector. 
+ flat_to_packed(u_shuffled, u); // the invariant part of d, cv_area and the solution are in flat form solution_ = array(matrix_size, 0); @@ -459,7 +490,8 @@ public: void solve() { solve_matrix_fine( rhs.data(), d.data(), u.data(), - levels.data(), levels_start.data(), + level_meta.data(), level_lengths.data(), level_parents.data(), + block_index.data(), num_cells_in_block.data(), data_partition.data(), num_cells_in_block.size(), max_branches_per_level); diff --git a/arbor/backends/gpu/stack_storage.hpp b/arbor/backends/gpu/stack_storage.hpp index 412baebeda0d2eb17a33f2e88f3cf42ddcc57e6a..58cee60b96638f3e9684d02534ab663a768a0e7c 100644 --- a/arbor/backends/gpu/stack_storage.hpp +++ b/arbor/backends/gpu/stack_storage.hpp @@ -7,8 +7,7 @@ namespace gpu { // Concrete storage of gpu stack datatype. // The stack datatype resides in host memory, and holds a pointer to the -// stack_storage in managed memory, which can be accessed by both host and -// gpu code. +// stack_storage in device memory. template <typename T> struct stack_storage { using value_type = T; diff --git a/arbor/backends/gpu/threshold_watcher.hpp b/arbor/backends/gpu/threshold_watcher.hpp index eca942b9264841a69771c48391453dc86222d896..0a1a4351ccd13127f1295c307528d19cec3933be 100644 --- a/arbor/backends/gpu/threshold_watcher.hpp +++ b/arbor/backends/gpu/threshold_watcher.hpp @@ -1,5 +1,8 @@ #pragma once +#include <cuda.h> +#include <cuda_runtime.h> + #include <arbor/arbexcept.hpp> #include <arbor/common_types.hpp> #include <arbor/fvm_types.hpp> @@ -10,7 +13,6 @@ #include "backends/threshold_crossing.hpp" #include "backends/gpu/gpu_store_types.hpp" -#include "backends/gpu/managed_ptr.hpp" #include "backends/gpu/stack.hpp" #include "stack.hpp" @@ -115,9 +117,7 @@ public: cv_index_.data(), values_, thresholds_.data()); // Check that the number of spikes has not exceeded capacity. - // ATTENTION: requires cudaDeviceSynchronize to avoid simultaneous - // host-device managed memory access. 
- arb_assert((cudaDeviceSynchronize(), !stack_.overflow())); + arb_assert(!stack_.overflow()); } } diff --git a/arbor/gpu_context.cpp b/arbor/gpu_context.cpp index a71b5a902773a3c29a89a80a0ffc4cbf8831d33d..2a21cfe167d8e92e892ed1f9ee1fe183eaad1abd 100644 --- a/arbor/gpu_context.cpp +++ b/arbor/gpu_context.cpp @@ -12,8 +12,7 @@ namespace arb { enum gpu_flags { - has_concurrent_managed_access = 1, - has_atomic_double = 2 + has_atomic_double = 1 }; gpu_context_handle make_gpu_context(int id) { @@ -24,10 +23,6 @@ bool gpu_context_has_gpu(const gpu_context& ctx) { return ctx.has_gpu(); } -bool gpu_context::has_concurrent_managed_access() const { - return attributes_ & gpu_flags::has_concurrent_managed_access; -} - bool gpu_context::has_atomic_double() const { return attributes_ & gpu_flags::has_atomic_double; } @@ -42,8 +37,6 @@ void gpu_context::set_gpu() const { throw arbor_exception("Arbor must be compiled with CUDA support to set a GPU."); } -void gpu_context::synchronize_for_managed_access() const {} - gpu_context::gpu_context(int) { throw arbor_exception("Arbor must be compiled with CUDA support to select a GPU."); } @@ -66,20 +59,11 @@ gpu_context::gpu_context(int gpu_id) { // Record the device attributes attributes_ = 0; - if (prop.concurrentManagedAccess) { - attributes_ |= gpu_flags::has_concurrent_managed_access; - } if (prop.major*100 + prop.minor >= 600) { attributes_ |= gpu_flags::has_atomic_double; } } -void gpu_context::synchronize_for_managed_access() const { - if(!has_concurrent_managed_access()) { - cudaDeviceSynchronize(); - } -} - void gpu_context::set_gpu() const { if (!has_gpu()) { throw arbor_exception( diff --git a/arbor/gpu_context.hpp b/arbor/gpu_context.hpp index 44de47493c42614ce82f3c4d9ef4e6dec8585a8d..32f70ee10f56e5201c9a2898e5ebc6260b257ae5 100644 --- a/arbor/gpu_context.hpp +++ b/arbor/gpu_context.hpp @@ -13,9 +13,7 @@ public: gpu_context() = default; gpu_context(int id); - bool has_concurrent_managed_access() const; bool has_atomic_double() const; - void synchronize_for_managed_access() const; bool has_gpu() const; // Calls cudaSetDevice(id), so that GPU calls from the calling thread will // be executed on the GPU. diff --git a/arbor/memory/allocator.hpp b/arbor/memory/allocator.hpp index 67983d7da588547cc326eed06c3ff16a25bb1ca1..34e2cf746a68fd068eceeaad9d6a66acd7604711 100644 --- a/arbor/memory/allocator.hpp +++ b/arbor/memory/allocator.hpp @@ -127,34 +127,6 @@ namespace impl { } }; - // bare bones implementation of standard compliant allocator for managed memory - template <size_type Alignment=1024> - struct managed_policy { - // Managed memory is aligned on 1024 byte boundaries. 
- // So the Alignment parameter must be a factor of 1024 - static_assert(1024%Alignment==0, "CUDA managed memory is always aligned on 1024 byte boundaries"); - - void* allocate_policy(std::size_t n) { - if (!n) { - return nullptr; - } - return cuda_malloc_managed(n); - } - - static constexpr size_type alignment() { - return Alignment; - } - - // managed memory can be used with standard memcpy - static constexpr bool is_malloc_compatible() { - return true; - } - - void free_policy(void* p) { - cuda_free(p); - } - }; - class device_policy { public: void *allocate_policy(size_type size) { @@ -267,13 +239,6 @@ namespace util { } }; - template <> - struct type_printer<impl::cuda::managed_policy<>>{ - static std::string print() { - return std::string("managed_policy"); - } - }; - template <typename T, typename Policy> struct type_printer<allocator<T,Policy>>{ static std::string print() { @@ -296,9 +261,6 @@ using aligned_allocator = allocator<T, impl::aligned_policy<alignment>>; template <class T, size_t alignment=1024> using pinned_allocator = allocator<T, impl::cuda::pinned_policy<alignment>>; -template <class T, size_t alignment=1024> -using managed_allocator = allocator<T, impl::cuda::managed_policy<alignment>>; - // use 256 as default alignment, because that is the default for cudaMalloc template <class T, size_t alignment=256> using cuda_allocator = allocator<T, impl::cuda::device_policy>; diff --git a/arbor/memory/cuda_wrappers.cpp b/arbor/memory/cuda_wrappers.cpp index d31c82b40cdb3c44b9827a3f79596c443535018f..5091e8657733139043c85d931cf6981465fb161a 100644 --- a/arbor/memory/cuda_wrappers.cpp +++ b/arbor/memory/cuda_wrappers.cpp @@ -56,15 +56,6 @@ void* cuda_malloc(std::size_t n) { return ptr; } -void* cuda_malloc_managed(std::size_t n) { - void* ptr; - - if (auto error = cudaMallocManaged(&ptr, n)) { - HANDLE_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes of managed memory"); - } - return ptr; -} - void cuda_free(void* ptr) { if (auto error = cudaFree(ptr)) { HANDLE_CUDA_ERROR(error, ""); @@ -108,11 +99,6 @@ void* cuda_malloc(std::size_t n) { return 0; } -void* cuda_malloc_managed(std::size_t n) { - NOCUDA; - return 0; -} - void cuda_free(void* ptr) { NOCUDA; } diff --git a/arbor/memory/cuda_wrappers.hpp b/arbor/memory/cuda_wrappers.hpp index 04e014acb774bea0084b3e7340fa4cf12bd55983..7bd2dbd38b712539ac53df329786b571d0c53d31 100644 --- a/arbor/memory/cuda_wrappers.hpp +++ b/arbor/memory/cuda_wrappers.hpp @@ -9,7 +9,6 @@ void cuda_memcpy_h2d(void* dest, const void* src, std::size_t n); void* cuda_host_register(void* ptr, std::size_t size); void cuda_host_unregister(void* ptr); void* cuda_malloc(std::size_t n); -void* cuda_malloc_managed(std::size_t n); void cuda_free(void* ptr); } // namespace memory diff --git a/test/unit/mech_private_field_access.cpp b/test/unit/mech_private_field_access.cpp index 3653e87b64546251cdde57128f249043c1b7e610..f201a6b860c1247b4f750bf017863766593fe67b 100644 --- a/test/unit/mech_private_field_access.cpp +++ b/test/unit/mech_private_field_access.cpp @@ -40,7 +40,6 @@ std::vector<fvm_value_type> mechanism_field(gpu::mechanism* m, const std::string const fvm_value_type* field_data = *opt_ptr.value(); std::vector<fvm_value_type> values(m->size()); - cudaDeviceSynchronize(); memory::cuda_memcpy_d2h(values.data(), field_data, sizeof(fvm_value_type)*m->size()); return values; } diff --git a/test/unit/test_gpu_stack.cu b/test/unit/test_gpu_stack.cu index 7f05e324a61193185ec71d931511d7e5d5226696..d6050122dade8945db80c28690849b86ea89f0e4 100644 --- 
a/test/unit/test_gpu_stack.cu +++ b/test/unit/test_gpu_stack.cu @@ -2,7 +2,6 @@ #include "backends/gpu/stack.hpp" #include "backends/gpu/stack_cu.hpp" -#include "backends/gpu/managed_ptr.hpp" #include "gpu_context.hpp" using namespace arb; diff --git a/test/unit/test_intrin.cu b/test/unit/test_intrin.cu index e8d7aa34a043ab5b2cb03be35524ebb212993694..e6b36e876db192c71f7c680c629f622749319a20 100644 --- a/test/unit/test_intrin.cu +++ b/test/unit/test_intrin.cu @@ -4,7 +4,6 @@ #include "backends/gpu/cuda_atomic.hpp" #include "backends/gpu/math_cu.hpp" -#include "backends/gpu/managed_ptr.hpp" #include "memory/memory.hpp" #include "util/rangeutil.hpp" #include "util/span.hpp" @@ -46,34 +45,38 @@ namespace kernels { TEST(gpu_intrinsics, cuda_atomic_add) { int expected = (128*129)/2; - auto f = arb::gpu::make_managed_ptr<float>(0.f); - kernels::test_atomic_add<<<1, 128>>>(f.get()); - cudaDeviceSynchronize(); + arb::memory::device_vector<float> f(1); + f[0] = 0.f; - EXPECT_EQ(float(expected), *f); + kernels::test_atomic_add<<<1, 128>>>(f.data()); - auto d = arb::gpu::make_managed_ptr<double>(0.); - kernels::test_atomic_add<<<1, 128>>>(d.get()); - cudaDeviceSynchronize(); + EXPECT_EQ(float(expected), f[0]); - EXPECT_EQ(double(expected), *d); + arb::memory::device_vector<double> d(1); + d[0] = 0.f; + + kernels::test_atomic_add<<<1, 128>>>(d.data()); + + EXPECT_EQ(double(expected), d[0]); } // test atomic subtraction wrapper for single and double precision TEST(gpu_intrinsics, cuda_atomic_sub) { int expected = -(128*129)/2; - auto f = arb::gpu::make_managed_ptr<float>(0.f); - kernels::test_atomic_sub<<<1, 128>>>(f.get()); - cudaDeviceSynchronize(); + arb::memory::device_vector<float> f(1); + f[0] = 0.f; + + kernels::test_atomic_sub<<<1, 128>>>(f.data()); + + EXPECT_EQ(float(expected), f[0]); - EXPECT_EQ(float(expected), *f); + arb::memory::device_vector<double> d(1); + d[0] = 0.f; - auto d = arb::gpu::make_managed_ptr<double>(0.); - kernels::test_atomic_sub<<<1, 128>>>(d.get()); - cudaDeviceSynchronize(); + kernels::test_atomic_sub<<<1, 128>>>(d.data()); - EXPECT_EQ(double(expected), *d); + EXPECT_EQ(double(expected), d[0]); } TEST(gpu_intrinsics, minmax) { @@ -109,26 +112,22 @@ TEST(gpu_intrinsics, minmax) { // test min kernels::test_min<<<1, n>>>(lhs.data(), rhs.data(), result.data()); - cudaDeviceSynchronize(); for (auto i: make_span(0, n)) { EXPECT_EQ(double(result[i]), inputs[i].expected_min); } kernels::test_min<<<1, n>>>(rhs.data(), lhs.data(), result.data()); - cudaDeviceSynchronize(); for (auto i: make_span(0, n)) { EXPECT_EQ(double(result[i]), inputs[i].expected_min); } // test max kernels::test_max<<<1, n>>>(lhs.data(), rhs.data(), result.data()); - cudaDeviceSynchronize(); for (auto i: make_span(0, n)) { EXPECT_EQ(double(result[i]), inputs[i].expected_max); } kernels::test_max<<<1, n>>>(rhs.data(), lhs.data(), result.data()); - cudaDeviceSynchronize(); for (auto i: make_span(0, n)) { EXPECT_EQ(double(result[i]), inputs[i].expected_max); } @@ -145,7 +144,6 @@ TEST(gpu_intrinsics, exprelr) { arb::memory::device_vector<double> result(n); kernels::test_exprelr<<<1,n>>>(x.data(), result.data()); - cudaDeviceSynchronize(); auto index = arb::util::make_span(0, n); for (auto i: index) { diff --git a/test/unit/test_kinetic_linear.cpp b/test/unit/test_kinetic_linear.cpp index 5a9d00965d1e6740da9c89cf3974af686df87f51..b9d5707f82cdad9eb14f07dc9fa404c70d9bb311 100644 --- a/test/unit/test_kinetic_linear.cpp +++ b/test/unit/test_kinetic_linear.cpp @@ -189,7 +189,7 @@ TEST(mech_kinetic_gpu, 
kintetic_linear_2_conserve) { run_test<gpu::backend>("test1_kin_steadystate", state_variables, {}, t0_values, t1_1_values, 0.5); } -TEST(mech_kinetic, kintetic_nonlinear) { +TEST(mech_kinetic_gpu, kintetic_nonlinear) { std::vector<std::string> state_variables = {"a", "b", "c"}; std::vector<fvm_value_type> t0_values = {0.2, 0.3, 0.5}; std::vector<fvm_value_type> t1_0_values = {0.222881, 0.31144, 0.48856}; @@ -199,7 +199,7 @@ TEST(mech_kinetic, kintetic_nonlinear) { run_test<gpu::backend>("test3_kin_diff", state_variables, {}, t0_values, t1_1_values, 0.025); } -TEST(mech_kinetic, kintetic_nonlinear_scaled) { +TEST(mech_kinetic_gpu, kintetic_nonlinear_scaled) { std::vector<std::string> state_variables = {"A", "B", "C", "d", "e"}; std::vector<fvm_value_type> t0_values = {4.5, 6.6, 0.28, 2, 0}; std::vector<fvm_value_type> t1_values = {4.087281958014442,