diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8429bac998cf01963b596fb50bda7a0efd70d101..5bbabc608b4e20f952146c31c0a8697996cf3c0c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -27,7 +27,7 @@ set(BASE_SOURCES set(CUDA_SOURCES backends/gpu/fvm.cu backends/gpu/multi_event_stream.cu - memory/fill.cu + backends/gpu/fill.cu ) # The cell_group_factory acts like an interface between the diff --git a/src/backends/gpu/fill.cu b/src/backends/gpu/fill.cu new file mode 100644 index 0000000000000000000000000000000000000000..f82f1d8eb809c0340ac1050f67a69bba101adc6c --- /dev/null +++ b/src/backends/gpu/fill.cu @@ -0,0 +1,43 @@ +#include <cstdint> + +namespace nest { +namespace mc { +namespace gpu { + +template <typename T, typename I> +__global__ +void fill_kernel(T* v, T value, I n) { + auto tid = threadIdx.x + blockDim.x*blockIdx.x; + + if(tid < n) { + v[tid] = value; + } +} + +inline unsigned grid_dim(std::size_t n, unsigned block_dim) { + return (n+block_dim-1)/block_dim; +} + +void fill8(uint8_t* v, uint8_t value, std::size_t n) { + unsigned block_dim = 192; + fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); +}; + +void fill16(uint16_t* v, uint16_t value, std::size_t n) { + unsigned block_dim = 192; + fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); +}; + +void fill32(uint32_t* v, uint32_t value, std::size_t n) { + unsigned block_dim = 192; + fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); +}; + +void fill64(uint64_t* v, uint64_t value, std::size_t n) { + unsigned block_dim = 192; + fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); +}; + +} // namespace gpu +} // namespace nest +} // namespace mc diff --git a/src/backends/gpu/fill.hpp b/src/backends/gpu/fill.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4b1447fcee449557eb4e32926294da6ae0d440de --- /dev/null +++ b/src/backends/gpu/fill.hpp @@ -0,0 +1,56 @@ +#include <algorithm> +#include <cstdint> +#include 
<type_traits> + +// +// prototypes for compiled wrappers around fill kernels for GPU memory +// + +namespace nest { +namespace mc { +namespace gpu { + +void fill8(uint8_t* v, uint8_t value, std::size_t n); +void fill16(uint16_t* v, uint16_t value, std::size_t n); +void fill32(uint32_t* v, uint32_t value, std::size_t n); +void fill64(uint64_t* v, uint64_t value, std::size_t n); + +// Brief: +// Perform type punning to pass arbitrary POD types to the GPU backend +// without polluting the library front end with CUDA kernels that would +// require compilation with nvcc. +// +// Detail: +// The implementation takes advantage of 4 fill functions that fill GPU +// memory with a {8, 16, 32, 64} bit unsigned integer. These functions +// are used to fill a block of GPU memory with _any_ 8, 16, 32 or 64 bit POD +// value. e.g. for a 64-bit double, first convert the double into a 64-bit +// unsigned integer (with the same bits, not the same value), then call the +// 64-bit fill kernel precompiled using nvcc in the gpu library. This +// technique of converting from one type to another is called type-punning. +// There are some subtle challenges, due to C++'s strict aliasing rules, +// that require memcpy of single bytes if alignment of the two types does +// not match. 
+ +#define FILL(N) \ +template <typename T> \ +typename std::enable_if<sizeof(T)==sizeof(uint ## N ## _t)>::type \ +fill(T* ptr, T value, std::size_t n) { \ + using I = uint ## N ## _t; \ + I v; \ + std::copy_n( \ + reinterpret_cast<char*>(&value), \ + sizeof(T), \ + reinterpret_cast<char*>(&v) \ + ); \ + nest::mc::gpu::fill ## N(reinterpret_cast<I*>(ptr), v, n); \ +} + +FILL(8) +FILL(16) +FILL(32) +FILL(64) + +} // namespace gpu +} // namespace nest +} // namespace mc diff --git a/src/memory/managed_ptr.hpp b/src/backends/gpu/managed_ptr.hpp similarity index 95% rename from src/memory/managed_ptr.hpp rename to src/backends/gpu/managed_ptr.hpp index 944424bb6f0960bf3a15f62d5935729e4747832b..ca42638e649b55c83dd43b61b52ca9f0f43ba41f 100644 --- a/src/memory/managed_ptr.hpp +++ b/src/backends/gpu/managed_ptr.hpp @@ -6,7 +6,7 @@ namespace nest { namespace mc { -namespace memory { +namespace gpu { // used to indicate that the type pointed to by the managed_ptr is to be // constructed in the managed_ptr constructor @@ -40,7 +40,7 @@ class managed_ptr { // memory and constructing a type in place. template <typename... Args> managed_ptr(construct_in_place_tag, Args&&... args) { - managed_allocator<element_type> allocator; + memory::managed_allocator<element_type> allocator; data_ = allocator.allocate(1u); synchronize(); data_ = new (data_) element_type(std::forward<Args>(args)...); @@ -75,7 +75,7 @@ class managed_ptr { ~managed_ptr() { if (is_allocated()) { - managed_allocator<element_type> allocator; + memory::managed_allocator<element_type> allocator; synchronize(); // required to ensure that memory is not in use on GPU data_->~element_type(); allocator.deallocate(data_, 1u); @@ -112,7 +112,7 @@ managed_ptr<T> make_managed_ptr(Args&&... 
args) { return managed_ptr<T>(construct_in_place_tag(), std::forward<Args>(args)...); } -} // namespace memory +} // namespace gpu } // namespace mc } // namespace nest diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp index 1d28bb329d01e587ee3bdaae5dc2f5efbffeedf7..39a2a9e8dd5df308cf6db13799fdcb1579a92f85 100644 --- a/src/backends/gpu/threshold_watcher.hpp +++ b/src/backends/gpu/threshold_watcher.hpp @@ -2,9 +2,9 @@ #include <common_types.hpp> #include <memory/memory.hpp> -#include <memory/managed_ptr.hpp> #include <util/span.hpp> +#include "managed_ptr.hpp" #include "stack.hpp" #include "kernels/test_thresholds.hpp" @@ -29,12 +29,6 @@ public: struct threshold_crossing { size_type index; // index of variable value_type time; // time of crossing - __host__ __device__ - friend bool operator== - (const threshold_crossing& lhs, const threshold_crossing& rhs) - { - return lhs.index==rhs.index && lhs.time==rhs.time; - } }; using stack_type = stack<threshold_crossing>; @@ -57,7 +51,7 @@ public: thresholds_(memory::make_const_view(thresh)), prev_values_(values), is_crossed_(size()), - stack_(memory::make_managed_ptr<stack_type>(10*size())) + stack_(make_managed_ptr<stack_type>(10*size())) { reset(); } @@ -137,7 +131,7 @@ private: array prev_values_; // values at previous sample time: on gpu iarray is_crossed_; // bool flag for state of each watch: on gpu - memory::managed_ptr<stack_type> stack_; + managed_ptr<stack_type> stack_; }; } // namespace gpu diff --git a/src/memory/device_coordinator.hpp b/src/memory/device_coordinator.hpp index 05fa27b931be3cf7e50320b25c369e45724ab2aa..5d273da336c04dd53ae7b44d8f3fabe2d3c869c6 100644 --- a/src/memory/device_coordinator.hpp +++ b/src/memory/device_coordinator.hpp @@ -1,10 +1,10 @@ #pragma once -#include <algorithm> #include <cstdint> #include <exception> #include <util/debug.hpp> +#include <backends/gpu/fill.hpp> #include "allocator.hpp" #include "array.hpp" @@ -47,44 +47,6 @@ namespace 
util { }; } // namespace util -namespace gpu { - // brief: - // We have to perform some type punning to pass arbitrary POD types to the - // GPU backend without polluting the library front end with CUDA kernels - // that would require compilation with nvcc. - // - // detail: - // The implementation takes advantage of 4 fill functions that fill GPU - // memory with a {8, 16, 32, 64} bit unsigned integer. We want to use these - // functions to fill a block of GPU memory with _any_ 8, 16, 32 or 64 bit POD - // value. The technique to do this with a 64-bit double, is to first convert - // the double into a 64-bit unsigned integer (with the same bits, not the - // same value), then call the 64-bit fill kernel precompiled using nvcc in - // the gpu library. This technique of converting from one type to another - // is called type-punning. There are some subtle challenges, due to C++'s - // strict aliasing rules, that require memcpy of single bytes if alignment - // of the two types does not match. 
- - #define FILL(N) \ - template <typename T> \ - typename std::enable_if<sizeof(T)==sizeof(uint ## N ## _t)>::type \ - fill(T* ptr, T value, size_t n) { \ - using I = uint ## N ## _t; \ - I v; \ - std::copy_n( \ - reinterpret_cast<char*>(&value), \ - sizeof(T), \ - reinterpret_cast<char*>(&v) \ - ); \ - fill ## N(reinterpret_cast<I*>(ptr), v, n); \ - } - - FILL(8) - FILL(16) - FILL(32) - FILL(64) -} - template <typename T> class const_device_reference { public: @@ -291,7 +253,7 @@ public: // fill memory void set(view_type &rng, value_type value) { if (rng.size()) { - gpu::fill<value_type>(rng.data(), value, rng.size()); + nest::mc::gpu::fill<value_type>(rng.data(), value, rng.size()); } } diff --git a/src/memory/fill.cu b/src/memory/fill.cu deleted file mode 100644 index 917b03d42768670ab1d6166961c7c58093472e08..0000000000000000000000000000000000000000 --- a/src/memory/fill.cu +++ /dev/null @@ -1,45 +0,0 @@ -#include <iostream> -#include <cstdlib> -#include <cstdint> - -namespace nest { -namespace mc { -namespace memory { -namespace gpu { - template <typename T, typename I> - __global__ - void fill_kernel(T* v, T value, I n) { - auto tid = threadIdx.x + blockDim.x*blockIdx.x; - - if(tid < n) { - v[tid] = value; - } - } - - unsigned grid_dim(std::size_t n, unsigned block_dim) { - return (n+block_dim-1)/block_dim; - } - - void fill8(uint8_t* v, uint8_t value, std::size_t n) { - unsigned block_dim = 192; - fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); - }; - - void fill16(uint16_t* v, uint16_t value, std::size_t n) { - unsigned block_dim = 192; - fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); - }; - - void fill32(uint32_t* v, uint32_t value, std::size_t n) { - unsigned block_dim = 192; - fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); - }; - - void fill64(uint64_t* v, uint64_t value, std::size_t n) { - unsigned block_dim = 192; - fill_kernel<<<grid_dim(n, block_dim), block_dim>>>(v, value, n); - }; -} // namespace 
gpu -} // namespace memory -} // namespace nest -} // namespace mc diff --git a/src/memory/gpu.hpp b/src/memory/gpu.hpp index c5955eac21d908519e511351fbf4f56f22f09a26..895882729280cf88ba3021acd84724f1f28d2f87 100644 --- a/src/memory/gpu.hpp +++ b/src/memory/gpu.hpp @@ -15,14 +15,6 @@ namespace mc { namespace memory { namespace gpu { -// -// prototypes for compiled wrappers around fill kernels for GPU memory -// -void fill8(uint8_t* v, uint8_t value, std::size_t n); -void fill16(uint16_t* v, uint16_t value, std::size_t n); -void fill32(uint32_t* v, uint32_t value, std::size_t n); -void fill64(uint64_t* v, uint64_t value, std::size_t n); - // // helpers for memory where at least one of the target or source is on the gpu // diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 61b36aa2a2b79acc7a7ec36e44d206873807dafc..3016564404cf1a8d8087a90e5997826fb6e4c204 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -99,8 +99,6 @@ endif() target_include_directories(test.exe PRIVATE "${mech_proto_dir}/..") if(NMC_WITH_CUDA) - # Omit -DDATADIR for cuda target because of CMake 3.7/3.8 FindCUDA quoting bug. 
- set(TARGETS ${TARGETS} test_cuda.exe) cuda_add_executable(test_cuda.exe ${TEST_CUDA_SOURCES} ${HEADERS}) endif() diff --git a/tests/unit/test_atomics.cu b/tests/unit/test_atomics.cu index fb9877c9a7fd6de6258c31bf7c40fcc062357ab3..e92004f54afe6ae5e429272af6c5f15a8af0c058 100644 --- a/tests/unit/test_atomics.cu +++ b/tests/unit/test_atomics.cu @@ -1,7 +1,7 @@ #include "../gtest.h" #include <backends/gpu/intrinsics.hpp> -#include <memory/managed_ptr.hpp> +#include <backends/gpu/managed_ptr.hpp> namespace kernels { template <typename T> @@ -21,13 +21,13 @@ namespace kernels { TEST(gpu_intrinsics, cuda_atomic_add) { int expected = (128*129)/2; - auto f = nest::mc::memory::make_managed_ptr<float>(0.f); + auto f = nest::mc::gpu::make_managed_ptr<float>(0.f); kernels::test_atomic_add<<<1, 128>>>(f.get()); cudaDeviceSynchronize(); EXPECT_EQ(float(expected), *f); - auto d = nest::mc::memory::make_managed_ptr<double>(0.); + auto d = nest::mc::gpu::make_managed_ptr<double>(0.); kernels::test_atomic_add<<<1, 128>>>(d.get()); cudaDeviceSynchronize(); @@ -38,13 +38,13 @@ TEST(gpu_intrinsics, cuda_atomic_add) { TEST(gpu_intrinsics, cuda_atomic_sub) { int expected = -(128*129)/2; - auto f = nest::mc::memory::make_managed_ptr<float>(0.f); + auto f = nest::mc::gpu::make_managed_ptr<float>(0.f); kernels::test_atomic_sub<<<1, 128>>>(f.get()); cudaDeviceSynchronize(); EXPECT_EQ(float(expected), *f); - auto d = nest::mc::memory::make_managed_ptr<double>(0.); + auto d = nest::mc::gpu::make_managed_ptr<double>(0.); kernels::test_atomic_sub<<<1, 128>>>(d.get()); cudaDeviceSynchronize(); diff --git a/tests/unit/test_gpu_stack.cu b/tests/unit/test_gpu_stack.cu index 2756078dca09d4bfebf14adb992bf05cf61a4d28..ad20a4d5a1e2732f333041fac793e175c67505bc 100644 --- a/tests/unit/test_gpu_stack.cu +++ b/tests/unit/test_gpu_stack.cu @@ -1,7 +1,7 @@ #include "../gtest.h" #include <backends/gpu/stack.hpp> -#include <memory/managed_ptr.hpp> +#include <backends/gpu/managed_ptr.hpp> using namespace 
nest::mc; @@ -52,7 +52,7 @@ TEST(stack, push_back) { const unsigned n = 10; EXPECT_TRUE(n%2 == 0); // require n is even for tests to work - auto s = memory::make_managed_ptr<stack>(n); + auto s = gpu::make_managed_ptr<stack>(n); kernels::push_back<<<1, n>>>(*s, kernels::all_ftor()); cudaDeviceSynchronize(); diff --git a/tests/unit/test_mc_cell_group.cu b/tests/unit/test_mc_cell_group.cu index 118b97033ec4c6370dc265c2dc5782fc76da04ad..1abdbda3c908cb8cca22c2371ea8490b7d04d4fd 100644 --- a/tests/unit/test_mc_cell_group.cu +++ b/tests/unit/test_mc_cell_group.cu @@ -6,7 +6,8 @@ #include <fvm_multicell.hpp> #include <util/rangeutil.hpp> -#include "../test_common_cells.hpp" +#include "../common_cells.hpp" +#include "../simple_recipes.hpp" using namespace nest::mc; @@ -26,8 +27,7 @@ nest::mc::cell make_cell() { TEST(cell_group, test) { - using cell_group_type = mc_cell_group<fvm_cell>; - auto group = cell_group_type({0u}, util::singleton_view(make_cell())); + mc_cell_group<fvm_cell> group({0u}, cable1d_recipe(make_cell())); group.advance(50, 0.01); diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt index 288f7ded6b51607a05d11d0258ae1fc08c76e922..e013a9981fd81e0dafb3e53eeea274d525d03d2c 100644 --- a/tests/validation/CMakeLists.txt +++ b/tests/validation/CMakeLists.txt @@ -30,7 +30,7 @@ set(VALIDATION_CUDA_SOURCES ) if(NMC_VALIDATION_DATA_DIR) - if ("${CMAKE_VERSION}" MATCHES "^3.[78].") + if ("${CMAKE_VERSION}" MATCHES "^3.[789].") message(WARNING "CMake ${CMAKE_VERSION} has broken FindCUDA; omitting NMC_DATADIR define.") else() add_definitions("-DNMC_DATADIR=\"${NMC_VALIDATION_DATA_DIR}\"")