From 6b659a39a8ac68007f740d7944622d54ac2c3298 Mon Sep 17 00:00:00 2001
From: Ben Cumming <louncharf@gmail.com>
Date: Wed, 11 Apr 2018 15:48:24 +0200
Subject: [PATCH] Fix support for Kepler (K20 & K80) GPUs. (#470)

Fixes issue #467

* Add GPU synchronization points where required for Kepler to coordinate
  CPU access of managed memory.
* Use hand-rolled double precision atomic addition for Kepler targets.
* Replace `ARB_WITH_CUDA` build option with `ARB_GPU_MODEL` option that
  takes one of 'none', 'K20', 'K80' or 'P100', and set up source-code
  defines accordingly.
* Clean up redundant compiler flags and defines that are no longer
  required now that the project uses separate compilation for CUDA
  sources.
---
 CMakeLists.txt                             | 33 +++++++++++++---------
 doc/install.rst                            | 18 +++++++-----
 src/backends/gpu/kernels/ions.cu           | 20 +++++++------
 src/backends/gpu/kernels/reduce_by_key.hpp |  3 +-
 src/backends/gpu/managed_ptr.hpp           | 11 ++++++++
 src/backends/gpu/stack.hpp                 | 14 +++++----
 src/backends/gpu/threshold_watcher.hpp     |  7 +++--
 7 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c05fdcf..02d70e06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,27 +125,32 @@ endif()
 #----------------------------------------------------------
 # CUDA support
 #----------------------------------------------------------
-option(ARB_WITH_CUDA "use CUDA for GPU offload" OFF)
-if(ARB_WITH_CUDA)
-    find_package(CUDA REQUIRED)
-
-    # Turn off annoying and incorrect warnings generated in the JSON file.
-    # We also work around the same issue with the intel compiler.
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-Xcudafe \"--diag_suppress=not_used_in_template_function_params\";-Xcudafe \"--diag_suppress=cast_to_qualified_type\")
+set(ARB_GPU_MODEL "none" CACHE STRING "The target GPU architecture: one of {none,K20,K80,P100}")
+set_property(CACHE ARB_GPU_MODEL PROPERTY STRINGS none K20 K80 P100)
 
-    # set the CUDA target specfic flags
-    # code regions protected by ARB_HAVE_CUDA should only be available to the CUDA
-    # compiler, which regions protected by ARB_HAVE_GPU are visible to both host
-    # and device compiler when targetting GPU.
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DARB_HAVE_CUDA)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DARB_HAVE_GPU)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60) # minimum target P100 GPUs
+set(ARB_WITH_CUDA FALSE)
+if(NOT ARB_GPU_MODEL MATCHES "none")
+    find_package(CUDA REQUIRED)
+    set(ARB_WITH_CUDA TRUE)
     add_definitions(-DARB_HAVE_GPU)
     include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
     list(APPEND EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
 endif()
 
+if(ARB_GPU_MODEL MATCHES "K20")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_35)
+    add_definitions(-DARB_CUDA_ARCH=350)
+elseif(ARB_GPU_MODEL MATCHES "K80")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_37)
+    add_definitions(-DARB_CUDA_ARCH=370)
+elseif(ARB_GPU_MODEL MATCHES "P100")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
+    add_definitions(-DARB_CUDA_ARCH=600)
+elseif(NOT ARB_GPU_MODEL MATCHES "none")
+    message(FATAL_ERROR "-- GPU architecture '${ARB_GPU_MODEL}' not supported. Use one of {none, K20, K80, P100}")
+endif()
+
 #----------------------------------------------------------
 # Cray/BGQ/Generic Linux/other flag?
 #----------------------------------------------------------
diff --git a/doc/install.rst b/doc/install.rst
index 8cd30e75..0a5375a5 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -228,13 +228,13 @@ CMake parameters and flags, follow links to the more detailed descriptions below
 
         cmake .. -DARB_THREADING_MODEL=tbb -DARB_VECTORIZE_TARGET=KNL
 
-.. topic:: `Release <buildtarget_>`_ mode with `CUDA <gpu_>`_ and `AVX2 <vectorize_>`_ and `GCC 5 <compilers_>`_
+.. topic:: `Release <buildtarget_>`_ mode with support for: `P100 GPUs <gpu_>`_; `AVX2 <vectorize_>`_; and `GCC 5 <compilers_>`_
 
     .. code-block:: bash
 
         export CC=gcc-5
         export CXX=g++-5
-        cmake .. -DARB_VECTORIZE_TARGET=AVX2 -DARB_WITH_CUDA=ON
+        cmake .. -DARB_VECTORIZE_TARGET=AVX2 -DARB_GPU_MODEL=P100
 
 .. _buildtarget:
 
@@ -350,16 +350,20 @@ which is implemented in the Arbor source code.
 GPU Backend
 -----------
 
-Arbor supports NVIDIA GPUs using CUDA. The CUDA back end is enabled by setting the CMake ``ARB_WITH_CUDA`` option.
+Arbor supports NVIDIA GPUs using CUDA. The CUDA back end is enabled by setting the
+CMake ``ARB_GPU_MODEL`` option to the GPU model to target:
 
 .. code-block:: bash
 
-    cmake .. -DARB_WITH_CUDA=ON
+    cmake .. -DARB_GPU_MODEL={none, K20, K80, P100}
+
+By default ``ARB_GPU_MODEL=none``; a GPU model must be set explicitly to
+build for and run on GPUs.
 
 .. Note::
-    Abor requires:
-    * CUDA version >= 8
-    * P100 or more recent GPU (``-arch=sm_60``)
+    The main differences between the Kepler (K20 & K80) and Pascal (P100) GPUs are
+    the latter's hardware support for double precision atomics and its reduced
+    need for GPU synchronization when accessing managed memory.
 
 .. _cluster:
diff --git a/src/backends/gpu/kernels/ions.cu b/src/backends/gpu/kernels/ions.cu
index 3631315a..a5061c5f 100644
--- a/src/backends/gpu/kernels/ions.cu
+++ b/src/backends/gpu/kernels/ions.cu
@@ -41,10 +41,12 @@ void nernst(std::size_t n,
     const fvm_value_type* Xi,
     fvm_value_type* eX)
 {
-    constexpr int block_dim = 128;
-    const int grid_dim = impl::block_count(n, block_dim);
-    kernels::nernst<<<grid_dim, block_dim>>>
-        (n, valency, temperature, Xo, Xi, eX);
+    if (n>0) {
+        constexpr int block_dim = 128;
+        const int grid_dim = impl::block_count(n, block_dim);
+        kernels::nernst<<<grid_dim, block_dim>>>
+            (n, valency, temperature, Xo, Xi, eX);
+    }
 }
 
 void init_concentration(
@@ -53,10 +55,12 @@
     const fvm_value_type* weight_Xi, const fvm_value_type* weight_Xo,
     fvm_value_type c_int, fvm_value_type c_ext)
 {
-    constexpr int block_dim = 128;
-    const int grid_dim = impl::block_count(n, block_dim);
-    kernels::init_concentration<<<grid_dim, block_dim>>>
-        (n, Xi, Xo, weight_Xi, weight_Xo, c_int, c_ext);
+    if (n>0) {
+        constexpr int block_dim = 128;
+        const int grid_dim = impl::block_count(n, block_dim);
+        kernels::init_concentration<<<grid_dim, block_dim>>>
+            (n, Xi, Xo, weight_Xi, weight_Xo, c_int, c_ext);
+    }
 }
 
 } // namespace gpu
diff --git a/src/backends/gpu/kernels/reduce_by_key.hpp b/src/backends/gpu/kernels/reduce_by_key.hpp
index a0ad5e39..29b054b1 100644
--- a/src/backends/gpu/kernels/reduce_by_key.hpp
+++ b/src/backends/gpu/kernels/reduce_by_key.hpp
@@ -2,6 +2,7 @@
 #include <cstdint>
 
 #include "detail.hpp"
+#include <backends/gpu/intrinsics.hpp>
 
 namespace arb {
 namespace gpu {
@@ -162,7 +163,7 @@ void reduce_by_key(T contribution, T* target, I idx) {
 
     if(run.is_root()) {
         // Update atomically in case the run spans multiple warps.
-        atomicAdd(target+idx, contribution);
+        cuda_atomic_add(target+idx, contribution);
     }
 }
 
diff --git a/src/backends/gpu/managed_ptr.hpp b/src/backends/gpu/managed_ptr.hpp
index 8343470f..ba0d99de 100644
--- a/src/backends/gpu/managed_ptr.hpp
+++ b/src/backends/gpu/managed_ptr.hpp
@@ -7,6 +7,17 @@
 namespace arb {
 namespace gpu {
 
+// Pre-Pascal NVIDIA GPUs don't support page faulting on GPU reads of managed
+// memory, so when a kernel is launched, all managed memory is copied to the
+// GPU. The upshot of this is that no CPU-side reads of _any_ managed memory
+// can be made while _any_ kernel is running. The following helper function
+// can be used to determine whether synchronization is required before
+// CPU-side reads of managed memory.
+constexpr
+bool managed_synch_required() {
+    return (ARB_CUDA_ARCH < 600); // all GPUs before P100
+}
+
 // used to indicate that the type pointed to by the managed_ptr is to be
 // constructed in the managed_ptr constructor
 struct construct_in_place_tag {};
diff --git a/src/backends/gpu/stack.hpp b/src/backends/gpu/stack.hpp
index 591bda9f..3c901f2e 100644
--- a/src/backends/gpu/stack.hpp
+++ b/src/backends/gpu/stack.hpp
@@ -2,6 +2,7 @@
 
 #include <algorithm>
 
+#include <backends/gpu/managed_ptr.hpp>
 #include <memory/allocator.hpp>
 
 #include "stack_common.hpp"
@@ -28,13 +29,13 @@ class stack {
     using allocator = memory::managed_allocator<U>;
 
     using storage_type = stack_storage<value_type>;
-    storage_type* storage_;
+    managed_ptr<storage_type> storage_;
 
-    storage_type* create_storage(unsigned n) {
-        auto p = allocator<storage_type>().allocate(1);
+    managed_ptr<storage_type> create_storage(unsigned n) {
+        auto p = make_managed_ptr<storage_type>();
         p->capacity = n;
         p->stores = 0;
-        p->data = allocator<value_type>().allocate(n);
+        p->data = n? allocator<value_type>().allocate(n): nullptr;
         return p;
     }
 
@@ -56,8 +57,9 @@ public:
     explicit stack(unsigned capacity): storage_(create_storage(capacity)) {}
 
     ~stack() {
-        allocator<value_type>().deallocate(storage_->data, storage_->capacity);
-        allocator<storage_type>().deallocate(storage_, 1);
+        if (storage_->data) {
+            allocator<value_type>().deallocate(storage_->data, storage_->capacity);
+        }
     }
 
     void clear() {
diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp
index ebb4ed6b..6315fe32 100644
--- a/src/backends/gpu/threshold_watcher.hpp
+++ b/src/backends/gpu/threshold_watcher.hpp
@@ -54,9 +54,11 @@ public:
         reset();
     }
 
-    /// Remove all stored crossings that were detected in previous calls
-    /// to test()
+    /// Remove all stored crossings that were detected in previous calls to test()
     void clear_crossings() {
+        if (managed_synch_required()) {
+            cudaDeviceSynchronize();
+        }
         stack_.clear();
     }
 
@@ -90,6 +92,7 @@ public:
         if (stack_.overflow()) {
             throw std::runtime_error("GPU spike buffer overflow.");
         }
+
         return std::vector<threshold_crossing>(stack_.begin(), stack_.end());
     }
 
-- 
GitLab
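
Note on the hand-rolled atomic: reduce_by_key.hpp now routes the reduction
through cuda_atomic_add, declared in src/backends/gpu/intrinsics.hpp, a file
this diff does not touch. Kepler (sm_35/sm_37) has no hardware
atomicAdd(double*, double), so the usual hand-rolled replacement is a
compare-and-swap loop over the 64-bit bit pattern. The sketch below
illustrates that standard technique; it is an assumption about the
intrinsic's shape, not a copy of Arbor's actual definition.

// Sketch of a hand-rolled double-precision atomic add for pre-Pascal devices.
// The name mirrors the intrinsic called by the patch, but the body is the
// generic atomicCAS pattern, not Arbor's actual implementation.
__device__ inline double cuda_atomic_add(double* address, double val) {
    unsigned long long int* p = reinterpret_cast<unsigned long long int*>(address);
    unsigned long long int old = *p, assumed;
    do {
        assumed = old;
        // Add in double precision to the value last observed, then try to
        // install the result; atomicCAS returns the value it actually found.
        old = atomicCAS(p, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);     // retry if another thread won the race
    return __longlong_as_double(old);
}

On sm_60 and later the intrinsic can simply forward to the hardware
atomicAdd(double*, double), which is one reason the build now records the
target architecture in ARB_CUDA_ARCH.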
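
Note on the managed-memory synchronization: the comment added to
managed_ptr.hpp is the crux of the Kepler fix. On devices with
ARB_CUDA_ARCH < 600 the driver makes the whole managed pool GPU-resident at
every kernel launch, so the host must synchronize before touching any managed
allocation. The standalone example below is hypothetical (not taken from the
Arbor sources) and shows the failure mode the managed_synch_required() guard
protects against.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void bump(int* p) { ++*p; }

int main() {
    int* p = nullptr;
    cudaMallocManaged(&p, sizeof(int));
    *p = 41;                  // host write before any kernel launch: fine everywhere

    bump<<<1, 1>>>(p);

    // On Kepler, a host read of *any* managed allocation while a kernel may
    // still be running is invalid, so this synchronization is mandatory.
    // On Pascal and later the read would merely page-fault and migrate the
    // data back, which is why the guard becomes a no-op for sm_60+.
    cudaDeviceSynchronize();

    std::printf("%d\n", *p);  // prints 42
    cudaFree(p);
    return 0;
}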