diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c05fdcf4a7927ca221050222ab25596631146db..02d70e06b34d688b4f8751ede06d6fdf673aa720 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,27 +125,32 @@ endif()
 #----------------------------------------------------------
 # CUDA support
 #----------------------------------------------------------
-option(ARB_WITH_CUDA "use CUDA for GPU offload" OFF)
-if(ARB_WITH_CUDA)
-    find_package(CUDA REQUIRED)
-
-    # Turn off annoying and incorrect warnings generated in the JSON file.
-    # We also work around the same issue with the intel compiler.
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-Xcudafe \"--diag_suppress=not_used_in_template_function_params\";-Xcudafe \"--diag_suppress=cast_to_qualified_type\")
+set(ARB_GPU_MODEL "none" CACHE STRING "The target GPU architecture: one of {none,K20,K80,P100}")
+set_property(CACHE ARB_GPU_MODEL PROPERTY STRINGS none K20 K80 P100 )
 
-    # set the CUDA target specfic flags
-    # code regions protected by ARB_HAVE_CUDA should only be available to the CUDA
-    # compiler, which regions protected by ARB_HAVE_GPU are visible to both host
-    # and device compiler when targetting GPU.
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DARB_HAVE_CUDA)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DARB_HAVE_GPU)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60) # minimum target P100 GPUs
+set(ARB_WITH_CUDA FALSE)
+if(NOT ARB_GPU_MODEL MATCHES "none")
+    find_package(CUDA REQUIRED)
+    set(ARB_WITH_CUDA TRUE)
 
     add_definitions(-DARB_HAVE_GPU)
     include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
     list(APPEND EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
 endif()
 
+if(ARB_GPU_MODEL MATCHES "K20")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_35)
+    add_definitions(-DARB_CUDA_ARCH=350)
+elseif(ARB_GPU_MODEL MATCHES "K80")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_37)
+    add_definitions(-DARB_CUDA_ARCH=370)
+elseif(ARB_GPU_MODEL MATCHES "P100")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
+    add_definitions(-DARB_CUDA_ARCH=600)
+elseif(NOT ARB_GPU_MODEL MATCHES "none")
+    message( FATAL_ERROR "-- GPU architecture '${ARB_GPU_MODEL}' not supported. Use one of {none, K20, K80, P100}")
+endif()
+
 #----------------------------------------------------------
 # Cray/BGQ/Generic Linux/other flag?
 #----------------------------------------------------------
diff --git a/doc/install.rst b/doc/install.rst
index 8cd30e7587ae56b40c82863ac1781aa443a8f66a..0a5375a5766ed6267540393a1e017aacd6daaad1 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -228,13 +228,13 @@ CMake parameters and flags, follow links to the more detailed descriptions below
 
         cmake .. -DARB_THREADING_MODEL=tbb -DARB_VECTORIZE_TARGET=KNL
 
-.. topic:: `Release <buildtarget_>`_ mode with `CUDA <gpu_>`_ and `AVX2 <vectorize_>`_ and `GCC 5 <compilers_>`_
+.. topic:: `Release <buildtarget_>`_ mode with support for: `P100 GPUs <gpu_>`_; `AVX2 <vectorize_>`_; and `GCC 5 <compilers_>`_
 
     .. code-block:: bash
 
         export CC=gcc-5
         export CXX=g++-5
-        cmake .. -DARB_VECTORIZE_TARGET=AVX2 -DARB_WITH_CUDA=ON
+        cmake .. -DARB_VECTORIZE_TARGET=AVX2 -DARB_GPU_MODEL=P100
 
 .. _buildtarget:
 
@@ -350,16 +350,20 @@ which is implemented in the Arbor source code.
 
 GPU Backend
 -----------
 
-Arbor supports NVIDIA GPUs using CUDA. The CUDA back end is enabled by setting the CMake ``ARB_WITH_CUDA`` option.
+Arbor supports NVIDIA GPUs using CUDA. The CUDA back end is enabled by setting the
+CMake ``ARB_GPU_MODEL`` option to match the GPU model to target:
 
 .. code-block:: bash
 
-    cmake .. -DARB_WITH_CUDA=ON
+    cmake .. -DARB_GPU_MODEL={none, K20, K80, P100}
+
+By default ``ARB_GPU_MODEL=none``; a GPU target must be set explicitly to
+build for and run on GPUs.
 
 .. Note::
-    Abor requires:
-    * CUDA version >= 8
-    * P100 or more recent GPU (``-arch=sm_60``)
+    The main differences between the Kepler (K20 & K80) and Pascal (P100) GPUs are
+    the latter's built-in support for double precision atomics and its need for
+    fewer synchronizations when accessing managed memory.
 
 .. _cluster:
diff --git a/src/backends/gpu/kernels/ions.cu b/src/backends/gpu/kernels/ions.cu
index 3631315a796e20b0bca28751978745a6e61cd366..a5061c5fdf13800644f809e75897f23ddffcff4d 100644
--- a/src/backends/gpu/kernels/ions.cu
+++ b/src/backends/gpu/kernels/ions.cu
@@ -41,10 +41,12 @@ void nernst(std::size_t n,
             const fvm_value_type* Xi,
             fvm_value_type* eX)
 {
-    constexpr int block_dim = 128;
-    const int grid_dim = impl::block_count(n, block_dim);
-    kernels::nernst<<<grid_dim, block_dim>>>
-        (n, valency, temperature, Xo, Xi, eX);
+    if (n>0) {
+        constexpr int block_dim = 128;
+        const int grid_dim = impl::block_count(n, block_dim);
+        kernels::nernst<<<grid_dim, block_dim>>>
+            (n, valency, temperature, Xo, Xi, eX);
+    }
 }
 
 void init_concentration(
@@ -53,10 +55,12 @@ void init_concentration(
     const fvm_value_type* weight_Xi, const fvm_value_type* weight_Xo,
     fvm_value_type c_int, fvm_value_type c_ext)
 {
-    constexpr int block_dim = 128;
-    const int grid_dim = impl::block_count(n, block_dim);
-    kernels::init_concentration<<<grid_dim, block_dim>>>
-        (n, Xi, Xo, weight_Xi, weight_Xo, c_int, c_ext);
+    if (n>0) {
+        constexpr int block_dim = 128;
+        const int grid_dim = impl::block_count(n, block_dim);
+        kernels::init_concentration<<<grid_dim, block_dim>>>
+            (n, Xi, Xo, weight_Xi, weight_Xo, c_int, c_ext);
+    }
 }
 
 } // namespace gpu
diff --git a/src/backends/gpu/kernels/reduce_by_key.hpp b/src/backends/gpu/kernels/reduce_by_key.hpp
index a0ad5e39e9f5852e90d83d4b98f15c474c4b321e..29b054b1c16c2f767b325c8e3145e426b2fc8c7d 100644
--- a/src/backends/gpu/kernels/reduce_by_key.hpp
+++ b/src/backends/gpu/kernels/reduce_by_key.hpp
@@ -2,6 +2,7 @@
 #include <cstdint>
 
 #include "detail.hpp"
+#include <backends/gpu/intrinsics.hpp>
 
 namespace arb {
 namespace gpu {
@@ -162,7 +163,7 @@ void reduce_by_key(T contribution, T* target, I idx) {
 
     if(run.is_root()) {
         // Update atomically in case the run spans multiple warps.
-        atomicAdd(target+idx, contribution);
+        cuda_atomic_add(target+idx, contribution);
     }
 }
 
diff --git a/src/backends/gpu/managed_ptr.hpp b/src/backends/gpu/managed_ptr.hpp
index 8343470fab5afa66169179fde3948a8a36f5d36d..ba0d99deeef8b860efb6987b2222315618e7b6da 100644
--- a/src/backends/gpu/managed_ptr.hpp
+++ b/src/backends/gpu/managed_ptr.hpp
@@ -7,6 +7,17 @@
 namespace arb {
 namespace gpu {
 
+// Pre-Pascal NVIDIA GPUs don't support page faulting for GPU reads of managed
+// memory, so when a kernel is launched, all managed memory is copied to the
+// GPU. The upshot of this is that no CPU-side reads of _any_ managed memory
+// can be made while _any_ kernel is running. The following helper function can
+// be used to determine whether synchronization is required before CPU-side
+// reads of managed memory.
+constexpr
+bool managed_synch_required() {
+    return (ARB_CUDA_ARCH < 600); // all GPUs before P100
+}
+
 // used to indicate that the type pointed to by the managed_ptr is to be
 // constructed in the managed_ptr constructor
 struct construct_in_place_tag {};
diff --git a/src/backends/gpu/stack.hpp b/src/backends/gpu/stack.hpp
index 591bda9fbe443a220fb93606ca1ee611fdafb7ca..3c901f2ef0ae924d12da8b0d4de5815dc50764b4 100644
--- a/src/backends/gpu/stack.hpp
+++ b/src/backends/gpu/stack.hpp
@@ -2,6 +2,7 @@
 
 #include <algorithm>
 
+#include <backends/gpu/managed_ptr.hpp>
 #include <memory/allocator.hpp>
 #include "stack_common.hpp"
 
@@ -28,13 +29,13 @@ class stack {
     using allocator = memory::managed_allocator<U>;
 
     using storage_type = stack_storage<value_type>;
 
-    storage_type* storage_;
+    managed_ptr<storage_type> storage_;
 
-    storage_type* create_storage(unsigned n) {
-        auto p = allocator<storage_type>().allocate(1);
+    managed_ptr<storage_type> create_storage(unsigned n) {
+        auto p = make_managed_ptr<storage_type>();
         p->capacity = n;
         p->stores = 0;
-        p->data = allocator<value_type>().allocate(n);
+        p->data = n? allocator<value_type>().allocate(n): nullptr;
         return p;
     }
@@ -56,8 +57,9 @@ public:
     explicit stack(unsigned capacity): storage_(create_storage(capacity)) {}
 
     ~stack() {
-        allocator<value_type>().deallocate(storage_->data, storage_->capacity);
-        allocator<storage_type>().deallocate(storage_, 1);
+        if (storage_->data) {
+            allocator<value_type>().deallocate(storage_->data, storage_->capacity);
+        }
     }
 
     void clear() {
diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp
index ebb4ed6bd573c65ef9449d3a9a49492652246e6c..6315fe32571a3dec0e242f2c031c6a4136b97405 100644
--- a/src/backends/gpu/threshold_watcher.hpp
+++ b/src/backends/gpu/threshold_watcher.hpp
@@ -54,9 +54,11 @@ public:
         reset();
     }
 
-    /// Remove all stored crossings that were detected in previous calls
-    /// to test()
+    /// Remove all stored crossings that were detected in previous calls to test()
     void clear_crossings() {
+        if (managed_synch_required()) {
+            cudaDeviceSynchronize();
+        }
         stack_.clear();
     }
 
@@ -90,6 +92,7 @@ public:
         if (stack_.overflow()) {
             throw std::runtime_error("GPU spike buffer overflow.");
         }
+
        return std::vector<threshold_crossing>(stack_.begin(), stack_.end());
     }
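
Note on the reduce_by_key.hpp change: cuda_atomic_add comes from backends/gpu/intrinsics.hpp, whose contents are not part of this diff. A minimal sketch of what such an intrinsic could look like is given below, assuming the standard CAS-loop emulation of double-precision atomicAdd from the CUDA C Programming Guide for the Kepler targets (sm_35/sm_37) enabled by this change; the actual header may differ. This is the same hardware limitation referenced by the install.rst note above: native double-precision atomicAdd only exists from sm_60 (P100) onwards.

// Hypothetical sketch only: the real backends/gpu/intrinsics.hpp is not shown in this diff.
#pragma once

namespace arb {
namespace gpu {

// Single-precision atomic addition is available natively on all targeted architectures.
__device__
inline float cuda_atomic_add(float* address, float contribution) {
    return atomicAdd(address, contribution);
}

// Double-precision atomicAdd is only available natively from sm_60 (P100) onwards.
// On Kepler (sm_35/sm_37) it is emulated with an atomicCAS loop on the bit
// representation of the value, following the CUDA C Programming Guide pattern.
__device__
inline double cuda_atomic_add(double* address, double contribution) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
    using ull = unsigned long long int;
    ull* p = reinterpret_cast<ull*>(address);
    ull old = *p;
    ull assumed;
    do {
        assumed = old;
        old = atomicCAS(p, assumed,
                        __double_as_longlong(contribution + __longlong_as_double(assumed)));
    } while (assumed != old); // compares bit patterns, so the loop also terminates for NaN
    return __longlong_as_double(old);
#else
    return atomicAdd(address, contribution);
#endif
}

} // namespace gpu
} // namespace arb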