From 40e2f52349a4b7a10722787ba4f735f8331614fd Mon Sep 17 00:00:00 2001
From: Ben Cumming <louncharf@gmail.com>
Date: Mon, 4 Sep 2017 13:41:19 +0200
Subject: [PATCH] Move nvcc-only code from memory to backends::gpu (#342)

* Move gpu-kernel code from memory to backends/gpu

A small step towards separate back-end compilation for CUDA.
Move the following code to the gpu backend:
- the memory::fill* wrappers and fill kernel (usage sketch below)
- the managed_ptr type
  - only used in backends::gpu
  - has `__device__ __host__` members for dual host-device use.
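
A condensed sketch of the managed_ptr pattern in its new namespace,
mirroring the updated tests/unit/test_atomics.cu (the add_one kernel and
managed_ptr_example wrapper are illustrative, not part of the patch):

    #include <backends/gpu/managed_ptr.hpp>

    __global__ void add_one(double* x) { *x += 1.0; }  // stand-in kernel

    void managed_ptr_example() {
        // construct a double in CUDA managed memory, usable from host and device
        auto d = nest::mc::gpu::make_managed_ptr<double>(0.);
        add_one<<<1, 1>>>(d.get());  // raw pointer is valid on the device
        cudaDeviceSynchronize();     // synchronize before touching it on the host
        double value = *d;           // host-side read through the managed pointer
        (void)value;
    }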

* update unit and validation tests
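
For reference, the relocated fill wrappers are reached from host-compiled
code roughly as sketched below (fill_device_doubles is an illustrative
caller, not added by this patch). For a double, fill<T> type-puns the value
into a uint64_t with the same bit pattern and forwards it to the
nvcc-compiled fill64() in backends/gpu/fill.cu:

    #include <cstddef>
    #include <backends/gpu/fill.hpp>  // fill8/16/32/64 prototypes + punning fill<T>

    // illustrative helper: fill n doubles on the device without this
    // translation unit needing to be compiled by nvcc
    void fill_device_doubles(double* device_ptr, double value, std::size_t n) {
        nest::mc::gpu::fill<double>(device_ptr, value, n);
    }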
---
 src/CMakeLists.txt                           |  2 +-
 src/backends/gpu/fill.cu                     | 43 +++++++++++++++
 src/backends/gpu/fill.hpp                    | 56 ++++++++++++++++++++
 src/{memory => backends/gpu}/managed_ptr.hpp |  8 +--
 src/backends/gpu/threshold_watcher.hpp       | 12 ++---
 src/memory/device_coordinator.hpp            | 42 +--------------
 src/memory/fill.cu                           | 45 ----------------
 src/memory/gpu.hpp                           |  8 ---
 tests/unit/CMakeLists.txt                    |  2 -
 tests/unit/test_atomics.cu                   | 10 ++--
 tests/unit/test_gpu_stack.cu                 |  4 +-
 tests/unit/test_mc_cell_group.cu             |  6 +--
 tests/validation/CMakeLists.txt              |  2 +-
 13 files changed, 120 insertions(+), 120 deletions(-)
 create mode 100644 src/backends/gpu/fill.cu
 create mode 100644 src/backends/gpu/fill.hpp
 rename src/{memory => backends/gpu}/managed_ptr.hpp (95%)
 delete mode 100644 src/memory/fill.cu

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8429bac9..5bbabc60 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -27,7 +27,7 @@ set(BASE_SOURCES
 set(CUDA_SOURCES
     backends/gpu/fvm.cu
     backends/gpu/multi_event_stream.cu
-    memory/fill.cu
+    backends/gpu/fill.cu
 )
 
 # The cell_group_factory acts like an interface between the
diff --git a/src/backends/gpu/fill.cu b/src/backends/gpu/fill.cu
new file mode 100644
index 00000000..f82f1d8e
--- /dev/null
+++ b/src/backends/gpu/fill.cu
@@ -0,0 +1,43 @@
+#include <cstdint>
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+template <typename T, typename I>
+__global__
+void fill_kernel(T* v, T value, I n) {
+    auto tid = threadIdx.x + blockDim.x*blockIdx.x;
+
+    if(tid < n) {
+        v[tid] = value;
+    }
+}
+
+inline unsigned grid_dim(std::size_t n, unsigned block_dim) {
+    return (n+block_dim-1)/block_dim;
+}
+
+void fill8(uint8_t* v, uint8_t value, std::size_t n) {
+    unsigned block_dim = 192;
+    fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n);
+};
+
+void fill16(uint16_t* v, uint16_t value, std::size_t n) {
+    unsigned block_dim = 192;
+    fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n);
+};
+
+void fill32(uint32_t* v, uint32_t value, std::size_t n) {
+    unsigned block_dim = 192;
+    fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n);
+};
+
+void fill64(uint64_t* v, uint64_t value, std::size_t n) {
+    unsigned block_dim = 192;
+    fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n);
+};
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/fill.hpp b/src/backends/gpu/fill.hpp
new file mode 100644
index 00000000..4b1447fc
--- /dev/null
+++ b/src/backends/gpu/fill.hpp
@@ -0,0 +1,56 @@
+#include <algorithm>
+#include <cstdint>
+#include <type_traits>
+
+//
+// prototypes for compiled wrappers around fill kernels for GPU memory
+//
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+void fill8(uint8_t* v, uint8_t value, std::size_t n);
+void fill16(uint16_t* v, uint16_t value, std::size_t n);
+void fill32(uint32_t* v, uint32_t value, std::size_t n);
+void fill64(uint64_t* v, uint64_t value, std::size_t n);
+
+// Brief:
+// Perform type punning to pass arbitrary POD types to the GPU backend
+// without polluting the library front end with CUDA kernels that would
+// require compilation with nvcc.
+//
+// Detail:
+// The implementation takes advantage of 4 fill functions that fill GPU
+// memory with a {8, 16, 32, 64} bit unsigned integer. These functions
+// are used to fill a block of GPU memory with _any_ 8, 16, 32 or 64 bit POD
+// value. For example, for a 64-bit double, first convert the double into a 64-bit
+// unsigned integer (with the same bits, not the same value), then call the
+// 64-bit fill kernel precompiled using nvcc in the gpu library. This
+// technique of converting from one type to another is called type-punning.
+// There are some subtle challenges, due to C++'s strict aliasing rules,
+// that require memcpy of single bytes if alignment of the two types does
+// not match.
+
+#define FILL(N) \
+template <typename T> \
+typename std::enable_if<sizeof(T)==sizeof(uint ## N ## _t)>::type \
+fill(T* ptr, T value, std::size_t n) { \
+    using I = uint ## N ## _t; \
+    I v; \
+    std::copy_n( \
+        reinterpret_cast<char*>(&value), \
+        sizeof(T), \
+        reinterpret_cast<char*>(&v) \
+    ); \
+    nest::mc::gpu::fill ## N(reinterpret_cast<I*>(ptr), v, n); \
+}
+
+FILL(8)
+FILL(16)
+FILL(32)
+FILL(64)
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/memory/managed_ptr.hpp b/src/backends/gpu/managed_ptr.hpp
similarity index 95%
rename from src/memory/managed_ptr.hpp
rename to src/backends/gpu/managed_ptr.hpp
index 944424bb..ca42638e 100644
--- a/src/memory/managed_ptr.hpp
+++ b/src/backends/gpu/managed_ptr.hpp
@@ -6,7 +6,7 @@
 
 namespace nest {
 namespace mc {
-namespace memory {
+namespace gpu {
 
 // used to indicate that the type pointed to by the managed_ptr is to be
 // constructed in the managed_ptr constructor
@@ -40,7 +40,7 @@ class managed_ptr {
     // memory and constructing a type in place.
     template <typename... Args>
     managed_ptr(construct_in_place_tag, Args&&... args) {
-        managed_allocator<element_type> allocator;
+        memory::managed_allocator<element_type> allocator;
         data_ = allocator.allocate(1u);
         synchronize();
         data_ = new (data_) element_type(std::forward<Args>(args)...);
@@ -75,7 +75,7 @@ class managed_ptr {
 
     ~managed_ptr() {
         if (is_allocated()) {
-            managed_allocator<element_type> allocator;
+            memory::managed_allocator<element_type> allocator;
             synchronize(); // required to ensure that memory is not in use on GPU
             data_->~element_type();
             allocator.deallocate(data_, 1u);
@@ -112,7 +112,7 @@ managed_ptr<T> make_managed_ptr(Args&&... args) {
     return managed_ptr<T>(construct_in_place_tag(), std::forward<Args>(args)...);
 }
 
-} // namespace memory
+} // namespace gpu
 } // namespace mc
 } // namespace nest
 
diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp
index 1d28bb32..39a2a9e8 100644
--- a/src/backends/gpu/threshold_watcher.hpp
+++ b/src/backends/gpu/threshold_watcher.hpp
@@ -2,9 +2,9 @@
 
 #include <common_types.hpp>
 #include <memory/memory.hpp>
-#include <memory/managed_ptr.hpp>
 #include <util/span.hpp>
 
+#include "managed_ptr.hpp"
 #include "stack.hpp"
 #include "kernels/test_thresholds.hpp"
 
@@ -29,12 +29,6 @@ public:
     struct threshold_crossing {
         size_type index;    // index of variable
         value_type time;    // time of crossing
-        __host__ __device__
-        friend bool operator==
-            (const threshold_crossing& lhs, const threshold_crossing& rhs)
-        {
-            return lhs.index==rhs.index && lhs.time==rhs.time;
-        }
     };
 
     using stack_type = stack<threshold_crossing>;
@@ -57,7 +51,7 @@ public:
         thresholds_(memory::make_const_view(thresh)),
         prev_values_(values),
         is_crossed_(size()),
-        stack_(memory::make_managed_ptr<stack_type>(10*size()))
+        stack_(make_managed_ptr<stack_type>(10*size()))
     {
         reset();
     }
@@ -137,7 +131,7 @@ private:
     array prev_values_;         // values at previous sample time: on gpu
     iarray is_crossed_;         // bool flag for state of each watch: on gpu
 
-    memory::managed_ptr<stack_type> stack_;
+    managed_ptr<stack_type> stack_;
 };
 
 } // namespace gpu
diff --git a/src/memory/device_coordinator.hpp b/src/memory/device_coordinator.hpp
index 05fa27b9..5d273da3 100644
--- a/src/memory/device_coordinator.hpp
+++ b/src/memory/device_coordinator.hpp
@@ -1,10 +1,10 @@
 #pragma once
 
-#include <algorithm>
 #include <cstdint>
 #include <exception>
 
 #include <util/debug.hpp>
+#include <backends/gpu/fill.hpp>
 
 #include "allocator.hpp"
 #include "array.hpp"
@@ -47,44 +47,6 @@ namespace util {
     };
 } // namespace util
 
-namespace gpu {
-    // brief:
-    // We have to perform some type punning to pass arbitrary POD types to the
-    // GPU backend without polluting the library front end with CUDA kernels
-    // that would require compilation with nvcc.
-    //
-    // detail:
-    // The implementation takes advantage of 4 fill functions that fill GPU
-    // memory with a {8, 16, 32, 64} bit unsigned integer. We want to use these
-    // functions to fill a block of GPU memory with _any_ 8, 16, 32 or 64 bit POD
-    // value. The technique to do this with a 64-bit double, is to first convert
-    // the double into a 64-bit unsigned integer (with the same bits, not the
-    // same value), then call the 64-bit fill kernel precompiled using nvcc in
-    // the gpu library. This technique of converting from one type to another
-    // is called type-punning. There are some subtle challenges, due to C++'s
-    // strict aliasing rules, that require memcpy of single bytes if alignment
-    // of the two types does not match.
-
-    #define FILL(N) \
-    template <typename T> \
-    typename std::enable_if<sizeof(T)==sizeof(uint ## N ## _t)>::type \
-    fill(T* ptr, T value, size_t n) { \
-        using I = uint ## N ## _t; \
-        I v; \
-        std::copy_n( \
-            reinterpret_cast<char*>(&value), \
-            sizeof(T), \
-            reinterpret_cast<char*>(&v) \
-        ); \
-        fill ## N(reinterpret_cast<I*>(ptr), v, n); \
-    }
-
-    FILL(8)
-    FILL(16)
-    FILL(32)
-    FILL(64)
-}
-
 template <typename T>
 class const_device_reference {
 public:
@@ -291,7 +253,7 @@ public:
     // fill memory
     void set(view_type &rng, value_type value) {
         if (rng.size()) {
-            gpu::fill<value_type>(rng.data(), value, rng.size());
+            nest::mc::gpu::fill<value_type>(rng.data(), value, rng.size());
         }
     }
 
diff --git a/src/memory/fill.cu b/src/memory/fill.cu
deleted file mode 100644
index 917b03d4..00000000
--- a/src/memory/fill.cu
+++ /dev/null
@@ -1,45 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <cstdint>
-
-namespace nest {
-namespace mc {
-namespace memory {
-namespace gpu {
-    template <typename T, typename I>
-    __global__
-    void fill_kernel(T* v, T value, I n) {
-        auto tid = threadIdx.x + blockDim.x*blockIdx.x;
-
-        if(tid < n) {
-            v[tid] = value;
-        }
-    }
-
-    unsigned grid_dim(std::size_t n, unsigned block_dim) {
-        return (n+block_dim-1)/block_dim;
-    }
-
-    void fill8(uint8_t* v, uint8_t value, std::size_t n) {
-        unsigned block_dim = 192;
-        fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n);
-    };
-
-    void fill16(uint16_t* v, uint16_t value, std::size_t n) {
-        unsigned block_dim = 192;
-        fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n);
-    };
-
-    void fill32(uint32_t* v, uint32_t value, std::size_t n) {
-        unsigned block_dim = 192;
-        fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n);
-    };
-
-    void fill64(uint64_t* v, uint64_t value, std::size_t n) {
-        unsigned block_dim = 192;
-        fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n);
-    };
-} // namespace gpu
-} // namespace memory
-} // namespace nest
-} // namespace mc
diff --git a/src/memory/gpu.hpp b/src/memory/gpu.hpp
index c5955eac..89588272 100644
--- a/src/memory/gpu.hpp
+++ b/src/memory/gpu.hpp
@@ -15,14 +15,6 @@ namespace mc {
 namespace memory {
 namespace gpu {
 
-//
-// prototypes for compiled wrappers around fill kernels for GPU memory
-//
-void fill8(uint8_t* v, uint8_t value, std::size_t n);
-void fill16(uint16_t* v, uint16_t value, std::size_t n);
-void fill32(uint32_t* v, uint32_t value, std::size_t n);
-void fill64(uint64_t* v, uint64_t value, std::size_t n);
-
 //
 // helpers for memory where at least one of the target or source is on the gpu
 //
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 61b36aa2..30165644 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -99,8 +99,6 @@ endif()
 target_include_directories(test.exe PRIVATE "${mech_proto_dir}/..")
 
 if(NMC_WITH_CUDA)
-    # Omit -DDATADIR for cuda target because of CMake 3.7/3.8 FindCUDA quoting bug.
-
     set(TARGETS ${TARGETS} test_cuda.exe)
     cuda_add_executable(test_cuda.exe ${TEST_CUDA_SOURCES} ${HEADERS})
 endif()
diff --git a/tests/unit/test_atomics.cu b/tests/unit/test_atomics.cu
index fb9877c9..e92004f5 100644
--- a/tests/unit/test_atomics.cu
+++ b/tests/unit/test_atomics.cu
@@ -1,7 +1,7 @@
 #include "../gtest.h"
 
 #include <backends/gpu/intrinsics.hpp>
-#include <memory/managed_ptr.hpp>
+#include <backends/gpu/managed_ptr.hpp>
 
 namespace kernels {
     template <typename T>
@@ -21,13 +21,13 @@ namespace kernels {
 TEST(gpu_intrinsics, cuda_atomic_add) {
     int expected = (128*129)/2;
 
-    auto f = nest::mc::memory::make_managed_ptr<float>(0.f);
+    auto f = nest::mc::gpu::make_managed_ptr<float>(0.f);
     kernels::test_atomic_add<<<1, 128>>>(f.get());
     cudaDeviceSynchronize();
 
     EXPECT_EQ(float(expected), *f);
 
-    auto d = nest::mc::memory::make_managed_ptr<double>(0.);
+    auto d = nest::mc::gpu::make_managed_ptr<double>(0.);
     kernels::test_atomic_add<<<1, 128>>>(d.get());
     cudaDeviceSynchronize();
 
@@ -38,13 +38,13 @@ TEST(gpu_intrinsics, cuda_atomic_add) {
 TEST(gpu_intrinsics, cuda_atomic_sub) {
     int expected = -(128*129)/2;
 
-    auto f = nest::mc::memory::make_managed_ptr<float>(0.f);
+    auto f = nest::mc::gpu::make_managed_ptr<float>(0.f);
     kernels::test_atomic_sub<<<1, 128>>>(f.get());
     cudaDeviceSynchronize();
 
     EXPECT_EQ(float(expected), *f);
 
-    auto d = nest::mc::memory::make_managed_ptr<double>(0.);
+    auto d = nest::mc::gpu::make_managed_ptr<double>(0.);
     kernels::test_atomic_sub<<<1, 128>>>(d.get());
     cudaDeviceSynchronize();
 
diff --git a/tests/unit/test_gpu_stack.cu b/tests/unit/test_gpu_stack.cu
index 2756078d..ad20a4d5 100644
--- a/tests/unit/test_gpu_stack.cu
+++ b/tests/unit/test_gpu_stack.cu
@@ -1,7 +1,7 @@
 #include "../gtest.h"
 
 #include <backends/gpu/stack.hpp>
-#include <memory/managed_ptr.hpp>
+#include <backends/gpu/managed_ptr.hpp>
 
 using namespace nest::mc;
 
@@ -52,7 +52,7 @@ TEST(stack, push_back) {
 
     const unsigned n = 10;
     EXPECT_TRUE(n%2 == 0); // require n is even for tests to work
-    auto s = memory::make_managed_ptr<stack>(n);
+    auto s = gpu::make_managed_ptr<stack>(n);
 
     kernels::push_back<<<1, n>>>(*s, kernels::all_ftor());
     cudaDeviceSynchronize();
diff --git a/tests/unit/test_mc_cell_group.cu b/tests/unit/test_mc_cell_group.cu
index 118b9703..1abdbda3 100644
--- a/tests/unit/test_mc_cell_group.cu
+++ b/tests/unit/test_mc_cell_group.cu
@@ -6,7 +6,8 @@
 #include <fvm_multicell.hpp>
 #include <util/rangeutil.hpp>
 
-#include "../test_common_cells.hpp"
+#include "../common_cells.hpp"
+#include "../simple_recipes.hpp"
 
 using namespace nest::mc;
 
@@ -26,8 +27,7 @@ nest::mc::cell make_cell() {
 
 TEST(cell_group, test)
 {
-    using cell_group_type = mc_cell_group<fvm_cell>;
-    auto group = cell_group_type({0u}, util::singleton_view(make_cell()));
+    mc_cell_group<fvm_cell> group({0u}, cable1d_recipe(make_cell()));
 
     group.advance(50, 0.01);
 
diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt
index 288f7ded..e013a998 100644
--- a/tests/validation/CMakeLists.txt
+++ b/tests/validation/CMakeLists.txt
@@ -30,7 +30,7 @@ set(VALIDATION_CUDA_SOURCES
 )
 
 if(NMC_VALIDATION_DATA_DIR)
-    if ("${CMAKE_VERSION}" MATCHES "^3.[78].")
+    if ("${CMAKE_VERSION}" MATCHES "^3.[789].")
         message(WARNING "CMake ${CMAKE_VERSION} has broken FindCUDA; omitting NMC_DATADIR define.")
     else()
         add_definitions("-DNMC_DATADIR=\"${NMC_VALIDATION_DATA_DIR}\"")
-- 
GitLab