diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8429bac998cf01963b596fb50bda7a0efd70d101..5bbabc608b4e20f952146c31c0a8697996cf3c0c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -27,7 +27,7 @@ set(BASE_SOURCES set(CUDA_SOURCES backends/gpu/fvm.cu backends/gpu/multi_event_stream.cu - memory/fill.cu + backends/gpu/fill.cu ) # The cell_group_factory acts like an interface between the diff --git a/src/backends/gpu/fill.cu b/src/backends/gpu/fill.cu new file mode 100644 index 0000000000000000000000000000000000000000..f82f1d8eb809c0340ac1050f67a69bba101adc6c --- /dev/null +++ b/src/backends/gpu/fill.cu @@ -0,0 +1,43 @@ +#include <cstdint> + +namespace nest { +namespace mc { +namespace gpu { + +template <typename T, typename I> +__global__ +void fill_kernel(T* v, T value, I n) { + auto tid = threadIdx.x + blockDim.x*blockIdx.x; + + if(tid < n) { + v[tid] = value; + } +} + +inline unsigned grid_dim(std::size_t n, unsigned block_dim) { + return (n+block_dim-1)/block_dim; +} + +void fill8(uint8_t* v, uint8_t value, std::size_t n) { + unsigned block_dim = 192; + fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); +}; + +void fill16(uint16_t* v, uint16_t value, std::size_t n) { + unsigned block_dim = 192; + fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); +}; + +void fill32(uint32_t* v, uint32_t value, std::size_t n) { + unsigned block_dim = 192; + fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); +}; + +void fill64(uint64_t* v, uint64_t value, std::size_t n) { + unsigned block_dim = 192; + fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); +}; + +} // namespace gpu +} // namespace nest +} // namespace mc diff --git a/src/backends/gpu/fill.hpp b/src/backends/gpu/fill.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4b1447fcee449557eb4e32926294da6ae0d440de --- /dev/null +++ b/src/backends/gpu/fill.hpp @@ -0,0 +1,56 @@ +#include <algorithm> +#include <cstdint> +#include 
<type_traits> + +// +// prototypes for compiled wrappers around fill kernels for GPU memory +// + +namespace nest { +namespace mc { +namespace gpu { + +void fill8(uint8_t* v, uint8_t value, std::size_t n); +void fill16(uint16_t* v, uint16_t value, std::size_t n); +void fill32(uint32_t* v, uint32_t value, std::size_t n); +void fill64(uint64_t* v, uint64_t value, std::size_t n); + +// Brief: +// Perform type punning to pass arbitrary POD types to the GPU backend +// without polluting the library front end with CUDA kernels that would +// require compilation with nvcc. +// +// Detail: +// The implementation takes advantage of 4 fill functions that fill GPU +// memory with a {8, 16, 32, 64} bit unsigned integer. These functions +// are used to fill a block of GPU memory with _any_ 8, 16, 32 or 64 bit POD +// value. e.g. for a 64-bit double, first convert the double into a 64-bit +// unsigned integer (with the same bits, not the same value), then call the +// 64-bit fill kernel precompiled using nvcc in the gpu library. This +// technique of converting from one type to another is called type-punning. +// There are some subtle challenges, due to C++'s strict aliasing rules, +// that require memcpy of single bytes if alignment of the two types does +// not match. 
+ +#define FILL(N) \ +template <typename T> \ +typename std::enable_if<sizeof(T)==sizeof(uint ## N ## _t)>::type \ +fill(T* ptr, T value, std::size_t n) { \ + using I = uint ## N ## _t; \ + I v; \ + std::copy_n( \ + reinterpret_cast<char*>(&value), \ + sizeof(T), \ + reinterpret_cast<char*>(&v) \ + ); \ + nest::mc::gpu::fill ## N(reinterpret_cast<I*>(ptr), v, n); \ +} + +FILL(8) +FILL(16) +FILL(32) +FILL(64) + +} // namespace gpu +} // namespace nest +} // namespace mc diff --git a/src/memory/managed_ptr.hpp b/src/backends/gpu/managed_ptr.hpp similarity index 95% rename from src/memory/managed_ptr.hpp rename to src/backends/gpu/managed_ptr.hpp index 944424bb6f0960bf3a15f62d5935729e4747832b..ca42638e649b55c83dd43b61b52ca9f0f43ba41f 100644 --- a/src/memory/managed_ptr.hpp +++ b/src/backends/gpu/managed_ptr.hpp @@ -6,7 +6,7 @@ namespace nest { namespace mc { -namespace memory { +namespace gpu { // used to indicate that the type pointed to by the managed_ptr is to be // constructed in the managed_ptr constructor @@ -40,7 +40,7 @@ class managed_ptr { // memory and constructing a type in place. template <typename... Args> managed_ptr(construct_in_place_tag, Args&&... args) { - managed_allocator<element_type> allocator; + memory::managed_allocator<element_type> allocator; data_ = allocator.allocate(1u); synchronize(); data_ = new (data_) element_type(std::forward<Args>(args)...); @@ -75,7 +75,7 @@ class managed_ptr { ~managed_ptr() { if (is_allocated()) { - managed_allocator<element_type> allocator; + memory::managed_allocator<element_type> allocator; synchronize(); // required to ensure that memory is not in use on GPU data_->~element_type(); allocator.deallocate(data_, 1u); @@ -112,7 +112,7 @@ managed_ptr<T> make_managed_ptr(Args&&... 
args) { return managed_ptr<T>(construct_in_place_tag(), std::forward<Args>(args)...); } -} // namespace memory +} // namespace gpu } // namespace mc } // namespace nest diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp index 1d28bb329d01e587ee3bdaae5dc2f5efbffeedf7..39a2a9e8dd5df308cf6db13799fdcb1579a92f85 100644 --- a/src/backends/gpu/threshold_watcher.hpp +++ b/src/backends/gpu/threshold_watcher.hpp @@ -2,9 +2,9 @@ #include <common_types.hpp> #include <memory/memory.hpp> -#include <memory/managed_ptr.hpp> #include <util/span.hpp> +#include "managed_ptr.hpp" #include "stack.hpp" #include "kernels/test_thresholds.hpp" @@ -29,12 +29,6 @@ public: struct threshold_crossing { size_type index; // index of variable value_type time; // time of crossing - __host__ __device__ - friend bool operator== - (const threshold_crossing& lhs, const threshold_crossing& rhs) - { - return lhs.index==rhs.index && lhs.time==rhs.time; - } }; using stack_type = stack<threshold_crossing>; @@ -57,7 +51,7 @@ public: thresholds_(memory::make_const_view(thresh)), prev_values_(values), is_crossed_(size()), - stack_(memory::make_managed_ptr<stack_type>(10*size())) + stack_(make_managed_ptr<stack_type>(10*size())) { reset(); } @@ -137,7 +131,7 @@ private: array prev_values_; // values at previous sample time: on gpu iarray is_crossed_; // bool flag for state of each watch: on gpu - memory::managed_ptr<stack_type> stack_; + managed_ptr<stack_type> stack_; }; } // namespace gpu diff --git a/src/memory/device_coordinator.hpp b/src/memory/device_coordinator.hpp index 05fa27b931be3cf7e50320b25c369e45724ab2aa..5d273da336c04dd53ae7b44d8f3fabe2d3c869c6 100644 --- a/src/memory/device_coordinator.hpp +++ b/src/memory/device_coordinator.hpp @@ -1,10 +1,10 @@ #pragma once -#include <algorithm> #include <cstdint> #include <exception> #include <util/debug.hpp> +#include <backends/gpu/fill.hpp> #include "allocator.hpp" #include "array.hpp" @@ -47,44 +47,6 @@ namespace 
util { }; } // namespace util -namespace gpu { - // brief: - // We have to perform some type punning to pass arbitrary POD types to the - // GPU backend without polluting the library front end with CUDA kernels - // that would require compilation with nvcc. - // - // detail: - // The implementation takes advantage of 4 fill functions that fill GPU - // memory with a {8, 16, 32, 64} bit unsigned integer. We want to use these - // functions to fill a block of GPU memory with _any_ 8, 16, 32 or 64 bit POD - // value. The technique to do this with a 64-bit double, is to first convert - // the double into a 64-bit unsigned integer (with the same bits, not the - // same value), then call the 64-bit fill kernel precompiled using nvcc in - // the gpu library. This technique of converting from one type to another - // is called type-punning. There are some subtle challenges, due to C++'s - // strict aliasing rules, that require memcpy of single bytes if alignment - // of the two types does not match. 
- - #define FILL(N) \ - template <typename T> \ - typename std::enable_if<sizeof(T)==sizeof(uint ## N ## _t)>::type \ - fill(T* ptr, T value, size_t n) { \ - using I = uint ## N ## _t; \ - I v; \ - std::copy_n( \ - reinterpret_cast<char*>(&value), \ - sizeof(T), \ - reinterpret_cast<char*>(&v) \ - ); \ - fill ## N(reinterpret_cast<I*>(ptr), v, n); \ - } - - FILL(8) - FILL(16) - FILL(32) - FILL(64) -} - template <typename T> class const_device_reference { public: @@ -291,7 +253,7 @@ public: // fill memory void set(view_type &rng, value_type value) { if (rng.size()) { - gpu::fill<value_type>(rng.data(), value, rng.size()); + nest::mc::gpu::fill<value_type>(rng.data(), value, rng.size()); } } diff --git a/src/memory/fill.cu b/src/memory/fill.cu deleted file mode 100644 index 917b03d42768670ab1d6166961c7c58093472e08..0000000000000000000000000000000000000000 --- a/src/memory/fill.cu +++ /dev/null @@ -1,45 +0,0 @@ -#include <iostream> -#include <cstdlib> -#include <cstdint> - -namespace nest { -namespace mc { -namespace memory { -namespace gpu { - template <typename T, typename I> - __global__ - void fill_kernel(T* v, T value, I n) { - auto tid = threadIdx.x + blockDim.x*blockIdx.x; - - if(tid < n) { - v[tid] = value; - } - } - - unsigned grid_dim(std::size_t n, unsigned block_dim) { - return (n+block_dim-1)/block_dim; - } - - void fill8(uint8_t* v, uint8_t value, std::size_t n) { - unsigned block_dim = 192; - fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); - }; - - void fill16(uint16_t* v, uint16_t value, std::size_t n) { - unsigned block_dim = 192; - fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); - }; - - void fill32(uint32_t* v, uint32_t value, std::size_t n) { - unsigned block_dim = 192; - fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); - }; - - void fill64(uint64_t* v, uint64_t value, std::size_t n) { - unsigned block_dim = 192; - fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); - }; -} // namespace 
gpu -} // namespace memory -} // namespace nest -} // namespace mc diff --git a/src/memory/gpu.hpp b/src/memory/gpu.hpp index c5955eac21d908519e511351fbf4f56f22f09a26..895882729280cf88ba3021acd84724f1f28d2f87 100644 --- a/src/memory/gpu.hpp +++ b/src/memory/gpu.hpp @@ -15,14 +15,6 @@ namespace mc { namespace memory { namespace gpu { -// -// prototypes for compiled wrappers around fill kernels for GPU memory -// -void fill8(uint8_t* v, uint8_t value, std::size_t n); -void fill16(uint16_t* v, uint16_t value, std::size_t n); -void fill32(uint32_t* v, uint32_t value, std::size_t n); -void fill64(uint64_t* v, uint64_t value, std::size_t n); - // // helpers for memory where at least one of the target or source is on the gpu // diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 61b36aa2a2b79acc7a7ec36e44d206873807dafc..3016564404cf1a8d8087a90e5997826fb6e4c204 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -99,8 +99,6 @@ endif() target_include_directories(test.exe PRIVATE "${mech_proto_dir}/..") if(NMC_WITH_CUDA) - # Omit -DDATADIR for cuda target because of CMake 3.7/3.8 FindCUDA quoting bug. 
- set(TARGETS ${TARGETS} test_cuda.exe) cuda_add_executable(test_cuda.exe ${TEST_CUDA_SOURCES} ${HEADERS}) endif() diff --git a/tests/unit/test_atomics.cu b/tests/unit/test_atomics.cu index fb9877c9a7fd6de6258c31bf7c40fcc062357ab3..e92004f54afe6ae5e429272af6c5f15a8af0c058 100644 --- a/tests/unit/test_atomics.cu +++ b/tests/unit/test_atomics.cu @@ -1,7 +1,7 @@ #include "../gtest.h" #include <backends/gpu/intrinsics.hpp> -#include <memory/managed_ptr.hpp> +#include <backends/gpu/managed_ptr.hpp> namespace kernels { template <typename T> @@ -21,13 +21,13 @@ namespace kernels { TEST(gpu_intrinsics, cuda_atomic_add) { int expected = (128*129)/2; - auto f = nest::mc::memory::make_managed_ptr<float>(0.f); + auto f = nest::mc::gpu::make_managed_ptr<float>(0.f); kernels::test_atomic_add<<<1, 128>>>(f.get()); cudaDeviceSynchronize(); EXPECT_EQ(float(expected), *f); - auto d = nest::mc::memory::make_managed_ptr<double>(0.); + auto d = nest::mc::gpu::make_managed_ptr<double>(0.); kernels::test_atomic_add<<<1, 128>>>(d.get()); cudaDeviceSynchronize(); @@ -38,13 +38,13 @@ TEST(gpu_intrinsics, cuda_atomic_add) { TEST(gpu_intrinsics, cuda_atomic_sub) { int expected = -(128*129)/2; - auto f = nest::mc::memory::make_managed_ptr<float>(0.f); + auto f = nest::mc::gpu::make_managed_ptr<float>(0.f); kernels::test_atomic_sub<<<1, 128>>>(f.get()); cudaDeviceSynchronize(); EXPECT_EQ(float(expected), *f); - auto d = nest::mc::memory::make_managed_ptr<double>(0.); + auto d = nest::mc::gpu::make_managed_ptr<double>(0.); kernels::test_atomic_sub<<<1, 128>>>(d.get()); cudaDeviceSynchronize(); diff --git a/tests/unit/test_gpu_stack.cu b/tests/unit/test_gpu_stack.cu index 2756078dca09d4bfebf14adb992bf05cf61a4d28..ad20a4d5a1e2732f333041fac793e175c67505bc 100644 --- a/tests/unit/test_gpu_stack.cu +++ b/tests/unit/test_gpu_stack.cu @@ -1,7 +1,7 @@ #include "../gtest.h" #include <backends/gpu/stack.hpp> -#include <memory/managed_ptr.hpp> +#include <backends/gpu/managed_ptr.hpp> using namespace 
nest::mc; @@ -52,7 +52,7 @@ TEST(stack, push_back) { const unsigned n = 10; EXPECT_TRUE(n%2 == 0); // require n is even for tests to work - auto s = memory::make_managed_ptr<stack>(n); + auto s = gpu::make_managed_ptr<stack>(n); kernels::push_back<<<1, n>>>(*s, kernels::all_ftor()); cudaDeviceSynchronize(); diff --git a/tests/unit/test_mc_cell_group.cu b/tests/unit/test_mc_cell_group.cu index 118b97033ec4c6370dc265c2dc5782fc76da04ad..1abdbda3c908cb8cca22c2371ea8490b7d04d4fd 100644 --- a/tests/unit/test_mc_cell_group.cu +++ b/tests/unit/test_mc_cell_group.cu @@ -6,7 +6,8 @@ #include <fvm_multicell.hpp> #include <util/rangeutil.hpp> -#include "../test_common_cells.hpp" +#include "../common_cells.hpp" +#include "../simple_recipes.hpp" using namespace nest::mc; @@ -26,8 +27,7 @@ nest::mc::cell make_cell() { TEST(cell_group, test) { - using cell_group_type = mc_cell_group<fvm_cell>; - auto group = cell_group_type({0u}, util::singleton_view(make_cell())); + mc_cell_group<fvm_cell> group({0u}, cable1d_recipe(make_cell())); group.advance(50, 0.01); diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt index 288f7ded6b51607a05d11d0258ae1fc08c76e922..e013a9981fd81e0dafb3e53eeea274d525d03d2c 100644 --- a/tests/validation/CMakeLists.txt +++ b/tests/validation/CMakeLists.txt @@ -30,7 +30,7 @@ set(VALIDATION_CUDA_SOURCES ) if(NMC_VALIDATION_DATA_DIR) - if ("${CMAKE_VERSION}" MATCHES "^3.[78].") + if ("${CMAKE_VERSION}" MATCHES "^3.[789].") message(WARNING "CMake ${CMAKE_VERSION} has broken FindCUDA; omitting NMC_DATADIR define.") else() add_definitions("-DNMC_DATADIR=\"${NMC_VALIDATION_DATA_DIR}\"")