From 6b659a39a8ac68007f740d7944622d54ac2c3298 Mon Sep 17 00:00:00 2001
From: Ben Cumming <louncharf@gmail.com>
Date: Wed, 11 Apr 2018 15:48:24 +0200
Subject: [PATCH] Fix support for Kepler (K20 & K80) GPUs. (#470)
Fixes issue #467
* Add GPU synchronization points where required for Kepler to coordinate CPU access of managed memory (the guard pattern is sketched below).
* Use a hand-rolled double precision atomic addition for Kepler targets (see the sketch below).
* Replace the `ARB_WITH_CUDA` build option with an `ARB_GPU_MODEL` option that takes one of 'none', 'K20', 'K80' or 'P100', and set up source-code defines accordingly.
* Clean up redundant compiler flags and defines that are no longer required now that the project uses separate compilation for CUDA sources.
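
For reference, the synchronization guard placed in front of CPU-side reads of
managed memory follows this pattern (a minimal sketch; the real call sites are
in the diff below, e.g. threshold_watcher::clear_crossings):

    // Pre-Pascal (sm < 60): wait for outstanding kernels before the host
    // touches managed memory. On P100, managed_synch_required() is a
    // constexpr false, so the branch disappears at compile time.
    if (managed_synch_required()) {
        cudaDeviceSynchronize();
    }
    // CPU-side reads of managed memory are now safe.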
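
The hand-rolled double precision atomic add for Kepler uses the standard
atomicCAS loop (a sketch of the approach only; the actual helper is
cuda_atomic_add in src/backends/gpu/intrinsics.hpp, which is not part of this
diff and may differ in detail):

    __device__ double atomic_add_double(double* address, double val) {
        // Reinterpret the double as a 64-bit integer so atomicCAS can be used.
        unsigned long long int* p = (unsigned long long int*)address;
        unsigned long long int old = *p, assumed;
        do {
            assumed = old;
            old = atomicCAS(p, assumed,
                            __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old); // retry if another thread updated the value
        return __longlong_as_double(old);
    }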
---
CMakeLists.txt | 33 +++++++++++++---------
doc/install.rst | 18 +++++++-----
src/backends/gpu/kernels/ions.cu | 20 +++++++------
src/backends/gpu/kernels/reduce_by_key.hpp | 3 +-
src/backends/gpu/managed_ptr.hpp | 11 ++++++++
src/backends/gpu/stack.hpp | 14 +++++----
src/backends/gpu/threshold_watcher.hpp | 7 +++--
7 files changed, 68 insertions(+), 38 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c05fdcf..02d70e06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,27 +125,32 @@ endif()
#----------------------------------------------------------
# CUDA support
#----------------------------------------------------------
-option(ARB_WITH_CUDA "use CUDA for GPU offload" OFF)
-if(ARB_WITH_CUDA)
- find_package(CUDA REQUIRED)
-
- # Turn off annoying and incorrect warnings generated in the JSON file.
- # We also work around the same issue with the intel compiler.
- set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-Xcudafe \"--diag_suppress=not_used_in_template_function_params\";-Xcudafe \"--diag_suppress=cast_to_qualified_type\")
+set(ARB_GPU_MODEL "none" CACHE STRING "The target GPU architecture: one of {none,K20,K80,P100}")
+set_property(CACHE ARB_GPU_MODEL PROPERTY STRINGS none K20 K80 P100 )
- # set the CUDA target specfic flags
- # code regions protected by ARB_HAVE_CUDA should only be available to the CUDA
- # compiler, which regions protected by ARB_HAVE_GPU are visible to both host
- # and device compiler when targetting GPU.
- set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DARB_HAVE_CUDA)
- set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DARB_HAVE_GPU)
- set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60) # minimum target P100 GPUs
+set(ARB_WITH_CUDA FALSE)
+if(NOT ARB_GPU_MODEL MATCHES "none")
+ find_package(CUDA REQUIRED)
+ set(ARB_WITH_CUDA TRUE)
add_definitions(-DARB_HAVE_GPU)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
endif()
+if(ARB_GPU_MODEL MATCHES "K20")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_35)
+ add_definitions(-DARB_CUDA_ARCH=350)
+elseif(ARB_GPU_MODEL MATCHES "K80")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_37)
+ add_definitions(-DARB_CUDA_ARCH=370)
+elseif(ARB_GPU_MODEL MATCHES "P100")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
+ add_definitions(-DARB_CUDA_ARCH=600)
+elseif(NOT ARB_GPU_MODEL MATCHES "none")
+ message( FATAL_ERROR "-- GPU architecture '${ARB_GPU_MODEL}' not supported. Use one of {none, K20, K80, P100}")
+endif()
+
#----------------------------------------------------------
# Cray/BGQ/Generic Linux/other flag?
#----------------------------------------------------------
diff --git a/doc/install.rst b/doc/install.rst
index 8cd30e75..0a5375a5 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -228,13 +228,13 @@ CMake parameters and flags, follow links to the more detailed descriptions below
cmake .. -DARB_THREADING_MODEL=tbb -DARB_VECTORIZE_TARGET=KNL
-.. topic:: `Release <buildtarget_>`_ mode with `CUDA <gpu_>`_ and `AVX2 <vectorize_>`_ and `GCC 5 <compilers_>`_
+.. topic:: `Release <buildtarget_>`_ mode with support for: `P100 GPUs <gpu_>`_; `AVX2 <vectorize_>`_; and `GCC 5 <compilers_>`_
.. code-block:: bash
export CC=gcc-5
export CXX=g++-5
- cmake .. -DARB_VECTORIZE_TARGET=AVX2 -DARB_WITH_CUDA=ON
+ cmake .. -DARB_VECTORIZE_TARGET=AVX2 -DARB_GPU_MODEL=P100
.. _buildtarget:
@@ -350,16 +350,20 @@ which is implemented in the Arbor source code.
GPU Backend
-----------
-Arbor supports NVIDIA GPUs using CUDA. The CUDA back end is enabled by setting the CMake ``ARB_WITH_CUDA`` option.
+Arbor supports NVIDIA GPUs using CUDA. The CUDA back end is enabled by setting the
+CMake ``ARB_GPU_MODEL`` option to the GPU model that will be targeted:
.. code-block:: bash
- cmake .. -DARB_WITH_CUDA=ON
+ cmake .. -DARB_GPU_MODEL={none, K20, K80, P100}
+
+By default ``ARB_GPU_MODEL=none``; a GPU model must be set explicitly to
+build for and run on GPUs.
.. Note::
- Abor requires:
- * CUDA version >= 8
- * P100 or more recent GPU (``-arch=sm_60``)
+ The main differences between the Kepler (K20 & K80) and Pascal (P100) GPUs are
+ the latter's built-in support for double precision atomics and its reduced need
+ for GPU synchronization when the host accesses managed memory.
.. _cluster:
diff --git a/src/backends/gpu/kernels/ions.cu b/src/backends/gpu/kernels/ions.cu
index 3631315a..a5061c5f 100644
--- a/src/backends/gpu/kernels/ions.cu
+++ b/src/backends/gpu/kernels/ions.cu
@@ -41,10 +41,12 @@ void nernst(std::size_t n,
const fvm_value_type* Xi,
fvm_value_type* eX)
{
- constexpr int block_dim = 128;
- const int grid_dim = impl::block_count(n, block_dim);
- kernels::nernst<<<grid_dim, block_dim>>>
- (n, valency, temperature, Xo, Xi, eX);
+ if (n>0) {
+ constexpr int block_dim = 128;
+ const int grid_dim = impl::block_count(n, block_dim);
+ kernels::nernst<<<grid_dim, block_dim>>>
+ (n, valency, temperature, Xo, Xi, eX);
+ }
}
void init_concentration(
@@ -53,10 +55,12 @@ void init_concentration(
const fvm_value_type* weight_Xi, const fvm_value_type* weight_Xo,
fvm_value_type c_int, fvm_value_type c_ext)
{
- constexpr int block_dim = 128;
- const int grid_dim = impl::block_count(n, block_dim);
- kernels::init_concentration<<<grid_dim, block_dim>>>
- (n, Xi, Xo, weight_Xi, weight_Xo, c_int, c_ext);
+ if (n>0) {
+ constexpr int block_dim = 128;
+ const int grid_dim = impl::block_count(n, block_dim);
+ kernels::init_concentration<<<grid_dim, block_dim>>>
+ (n, Xi, Xo, weight_Xi, weight_Xo, c_int, c_ext);
+ }
}
} // namespace gpu
diff --git a/src/backends/gpu/kernels/reduce_by_key.hpp b/src/backends/gpu/kernels/reduce_by_key.hpp
index a0ad5e39..29b054b1 100644
--- a/src/backends/gpu/kernels/reduce_by_key.hpp
+++ b/src/backends/gpu/kernels/reduce_by_key.hpp
@@ -2,6 +2,7 @@
#include <cstdint>
#include "detail.hpp"
+#include <backends/gpu/intrinsics.hpp>
namespace arb {
namespace gpu {
@@ -162,7 +163,7 @@ void reduce_by_key(T contribution, T* target, I idx) {
if(run.is_root()) {
// Update atomically in case the run spans multiple warps.
- atomicAdd(target+idx, contribution);
+ cuda_atomic_add(target+idx, contribution);
}
}
diff --git a/src/backends/gpu/managed_ptr.hpp b/src/backends/gpu/managed_ptr.hpp
index 8343470f..ba0d99de 100644
--- a/src/backends/gpu/managed_ptr.hpp
+++ b/src/backends/gpu/managed_ptr.hpp
@@ -7,6 +7,17 @@
namespace arb {
namespace gpu {
+// Pre-Pascal NVIDIA GPUs don't support page faulting for GPU reads of managed
+// memory, so when a kernel is launched, all managed memory is copied to the
+// GPU. The upshot of this is that no CPU-side reads of _any_ managed memory
+// can be made while _any_ kernel is running. The following helper function
+// can be used to determine whether synchronization is required before
+// CPU-side reads of managed memory.
+constexpr
+bool managed_synch_required() {
+ return (ARB_CUDA_ARCH < 600); // all GPUs before P100
+}
+
// used to indicate that the type pointed to by the managed_ptr is to be
// constructed in the managed_ptr constructor
struct construct_in_place_tag {};
diff --git a/src/backends/gpu/stack.hpp b/src/backends/gpu/stack.hpp
index 591bda9f..3c901f2e 100644
--- a/src/backends/gpu/stack.hpp
+++ b/src/backends/gpu/stack.hpp
@@ -2,6 +2,7 @@
#include <algorithm>
+#include <backends/gpu/managed_ptr.hpp>
#include <memory/allocator.hpp>
#include "stack_common.hpp"
@@ -28,13 +29,13 @@ class stack {
using allocator = memory::managed_allocator<U>;
using storage_type = stack_storage<value_type>;
- storage_type* storage_;
+ managed_ptr<storage_type> storage_;
- storage_type* create_storage(unsigned n) {
- auto p = allocator<storage_type>().allocate(1);
+ managed_ptr<storage_type> create_storage(unsigned n) {
+ auto p = make_managed_ptr<storage_type>();
p->capacity = n;
p->stores = 0;
- p->data = allocator<value_type>().allocate(n);
+ p->data = n? allocator<value_type>().allocate(n): nullptr;
return p;
}
@@ -56,8 +57,9 @@ public:
explicit stack(unsigned capacity): storage_(create_storage(capacity)) {}
~stack() {
- allocator<value_type>().deallocate(storage_->data, storage_->capacity);
- allocator<storage_type>().deallocate(storage_, 1);
+ if (storage_->data) {
+ allocator<value_type>().deallocate(storage_->data, storage_->capacity);
+ }
}
void clear() {
diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp
index ebb4ed6b..6315fe32 100644
--- a/src/backends/gpu/threshold_watcher.hpp
+++ b/src/backends/gpu/threshold_watcher.hpp
@@ -54,9 +54,11 @@ public:
reset();
}
- /// Remove all stored crossings that were detected in previous calls
- /// to test()
+ /// Remove all stored crossings that were detected in previous calls to test()
void clear_crossings() {
+ if (managed_synch_required()) {
+ cudaDeviceSynchronize();
+ }
stack_.clear();
}
@@ -90,6 +92,7 @@ public:
if (stack_.overflow()) {
throw std::runtime_error("GPU spike buffer overflow.");
}
+
return std::vector<threshold_crossing>(stack_.begin(), stack_.end());
}
--
GitLab