From 6b659a39a8ac68007f740d7944622d54ac2c3298 Mon Sep 17 00:00:00 2001
From: Ben Cumming <louncharf@gmail.com>
Date: Wed, 11 Apr 2018 15:48:24 +0200
Subject: [PATCH] Fix support for Kepler (K20 & K80) GPUs. (#470)

Fixes issue #467

* Add GPU synchronization points where required for Kepler to coordinate
  CPU access of managed memory.
* Use hand-rolled double precision atomic addition for Kepler targets.
* Replace `ARB_WITH_CUDA` build option with `ARB_GPU_MODEL` option that
  takes one of 'none', 'K20', 'K80' or 'P100', and set up source-code
  defines accordingly.
* Clean up redundant compiler flags and defines that are no longer
  required now that the project uses separate compilation for CUDA
  sources.
---
 CMakeLists.txt                             | 33 +++++++++++++---------
 doc/install.rst                            | 18 +++++++-----
 src/backends/gpu/kernels/ions.cu           | 20 +++++++------
 src/backends/gpu/kernels/reduce_by_key.hpp |  3 +-
 src/backends/gpu/managed_ptr.hpp           | 11 ++++++++
 src/backends/gpu/stack.hpp                 | 14 +++++----
 src/backends/gpu/threshold_watcher.hpp     |  7 +++--
 7 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c05fdcf..02d70e06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,27 +125,32 @@ endif()
 #----------------------------------------------------------
 # CUDA support
 #----------------------------------------------------------
-option(ARB_WITH_CUDA "use CUDA for GPU offload" OFF)
-if(ARB_WITH_CUDA)
-    find_package(CUDA REQUIRED)
-
-    # Turn off annoying and incorrect warnings generated in the JSON file.
-    # We also work around the same issue with the intel compiler.
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-Xcudafe \"--diag_suppress=not_used_in_template_function_params\";-Xcudafe \"--diag_suppress=cast_to_qualified_type\")
+set(ARB_GPU_MODEL "none" CACHE STRING "The target GPU architecture: one of {none,K20,K80,P100}")
+set_property(CACHE ARB_GPU_MODEL PROPERTY STRINGS none K20 K80 P100)
 
-    # set the CUDA target specfic flags
-    # code regions protected by ARB_HAVE_CUDA should only be available to the CUDA
-    # compiler, which regions protected by ARB_HAVE_GPU are visible to both host
-    # and device compiler when targetting GPU.
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DARB_HAVE_CUDA)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DARB_HAVE_GPU)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60) # minimum target P100 GPUs
+set(ARB_WITH_CUDA FALSE)
+if(NOT ARB_GPU_MODEL MATCHES "none")
+    find_package(CUDA REQUIRED)
+    set(ARB_WITH_CUDA TRUE)
     add_definitions(-DARB_HAVE_GPU)
     include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
     list(APPEND EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
 endif()
 
+if(ARB_GPU_MODEL MATCHES "K20")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_35)
+    add_definitions(-DARB_CUDA_ARCH=350)
+elseif(ARB_GPU_MODEL MATCHES "K80")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_37)
+    add_definitions(-DARB_CUDA_ARCH=370)
+elseif(ARB_GPU_MODEL MATCHES "P100")
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
+    add_definitions(-DARB_CUDA_ARCH=600)
+elseif(NOT ARB_GPU_MODEL MATCHES "none")
+    message(FATAL_ERROR "-- GPU architecture '${ARB_GPU_MODEL}' not supported. Use one of {none, K20, K80, P100}")
+endif()
+
 #----------------------------------------------------------
 # Cray/BGQ/Generic Linux/other flag?
 #----------------------------------------------------------
diff --git a/doc/install.rst b/doc/install.rst
index 8cd30e75..0a5375a5 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -228,13 +228,13 @@ CMake parameters and flags, follow links to the more detailed descriptions below
 
         cmake .. -DARB_THREADING_MODEL=tbb -DARB_VECTORIZE_TARGET=KNL
 
-.. topic:: `Release <buildtarget_>`_ mode with `CUDA <gpu_>`_ and `AVX2 <vectorize_>`_ and `GCC 5 <compilers_>`_
+.. topic:: `Release <buildtarget_>`_ mode with support for: `P100 GPUs <gpu_>`_; `AVX2 <vectorize_>`_; and `GCC 5 <compilers_>`_
 
     .. code-block:: bash
 
         export CC=gcc-5
         export CXX=g++-5
-        cmake .. -DARB_VECTORIZE_TARGET=AVX2 -DARB_WITH_CUDA=ON
+        cmake .. -DARB_VECTORIZE_TARGET=AVX2 -DARB_GPU_MODEL=P100
 
 .. _buildtarget:
 
@@ -350,16 +350,20 @@ which is implemented in the Arbor source code.
 GPU Backend
 -----------
 
-Arbor supports NVIDIA GPUs using CUDA. The CUDA back end is enabled by setting the CMake ``ARB_WITH_CUDA`` option.
+Arbor supports NVIDIA GPUs using CUDA. The CUDA back end is enabled by setting the
+CMake ``ARB_GPU_MODEL`` option to the GPU model to target:
 
 .. code-block:: bash
 
-    cmake .. -DARB_WITH_CUDA=ON
+    cmake .. -DARB_GPU_MODEL={none, K20, K80, P100}
+
+By default ``ARB_GPU_MODEL=none``; a GPU model must be set explicitly to
+build for and run on GPUs.
 
 .. Note::
-    Abor requires:
-    * CUDA version >= 8
-    * P100 or more recent GPU (``-arch=sm_60``)
+    The main differences between the Kepler (K20 & K80) and Pascal (P100) GPUs are
+    the latter's hardware support for double precision atomics and its reduced
+    need for GPU synchronization when accessing managed memory.
 
 .. _cluster:
diff --git a/src/backends/gpu/kernels/ions.cu b/src/backends/gpu/kernels/ions.cu
index 3631315a..a5061c5f 100644
--- a/src/backends/gpu/kernels/ions.cu
+++ b/src/backends/gpu/kernels/ions.cu
@@ -41,10 +41,12 @@ void nernst(std::size_t n,
     const fvm_value_type* Xi,
     fvm_value_type* eX)
 {
-    constexpr int block_dim = 128;
-    const int grid_dim = impl::block_count(n, block_dim);
-    kernels::nernst<<<grid_dim, block_dim>>>
-        (n, valency, temperature, Xo, Xi, eX);
+    if (n>0) {
+        constexpr int block_dim = 128;
+        const int grid_dim = impl::block_count(n, block_dim);
+        kernels::nernst<<<grid_dim, block_dim>>>
+            (n, valency, temperature, Xo, Xi, eX);
+    }
 }
 
 void init_concentration(
@@ -53,10 +55,12 @@
     const fvm_value_type* weight_Xi, const fvm_value_type* weight_Xo,
     fvm_value_type c_int, fvm_value_type c_ext)
 {
-    constexpr int block_dim = 128;
-    const int grid_dim = impl::block_count(n, block_dim);
-    kernels::init_concentration<<<grid_dim, block_dim>>>
-        (n, Xi, Xo, weight_Xi, weight_Xo, c_int, c_ext);
+    if (n>0) {
+        constexpr int block_dim = 128;
+        const int grid_dim = impl::block_count(n, block_dim);
+        kernels::init_concentration<<<grid_dim, block_dim>>>
+            (n, Xi, Xo, weight_Xi, weight_Xo, c_int, c_ext);
+    }
 }
 
 } // namespace gpu
diff --git a/src/backends/gpu/kernels/reduce_by_key.hpp b/src/backends/gpu/kernels/reduce_by_key.hpp
index a0ad5e39..29b054b1 100644
--- a/src/backends/gpu/kernels/reduce_by_key.hpp
+++ b/src/backends/gpu/kernels/reduce_by_key.hpp
@@ -2,6 +2,7 @@
 #include <cstdint>
 
 #include "detail.hpp"
+#include <backends/gpu/intrinsics.hpp>
 
 namespace arb {
 namespace gpu {
@@ -162,7 +163,7 @@ void reduce_by_key(T contribution, T* target, I idx) {
 
     if(run.is_root()) {
         // Update atomically in case the run spans multiple warps.
-        atomicAdd(target+idx, contribution);
+        cuda_atomic_add(target+idx, contribution);
     }
 }
 
diff --git a/src/backends/gpu/managed_ptr.hpp b/src/backends/gpu/managed_ptr.hpp
index 8343470f..ba0d99de 100644
--- a/src/backends/gpu/managed_ptr.hpp
+++ b/src/backends/gpu/managed_ptr.hpp
@@ -7,6 +7,17 @@
 namespace arb {
 namespace gpu {
 
+// Pre-Pascal NVIDIA GPUs don't support page faulting on GPU reads of managed
+// memory, so when a kernel is launched, all managed memory is copied to the
+// GPU. The upshot of this is that no CPU-side reads of _any_ managed memory
+// can be made while _any_ kernel is running. The following helper function
+// can be used to determine whether synchronization is required before
+// CPU-side reads of managed memory.
+constexpr
+bool managed_synch_required() {
+    return (ARB_CUDA_ARCH < 600); // all GPUs before P100
+}
+
 // used to indicate that the type pointed to by the managed_ptr is to be
 // constructed in the managed_ptr constructor
 struct construct_in_place_tag {};
diff --git a/src/backends/gpu/stack.hpp b/src/backends/gpu/stack.hpp
index 591bda9f..3c901f2e 100644
--- a/src/backends/gpu/stack.hpp
+++ b/src/backends/gpu/stack.hpp
@@ -2,6 +2,7 @@
 
 #include <algorithm>
 
+#include <backends/gpu/managed_ptr.hpp>
 #include <memory/allocator.hpp>
 
 #include "stack_common.hpp"
@@ -28,13 +29,13 @@ class stack {
     using allocator = memory::managed_allocator<U>;
 
     using storage_type = stack_storage<value_type>;
-    storage_type* storage_;
+    managed_ptr<storage_type> storage_;
 
-    storage_type* create_storage(unsigned n) {
-        auto p = allocator<storage_type>().allocate(1);
+    managed_ptr<storage_type> create_storage(unsigned n) {
+        auto p = make_managed_ptr<storage_type>();
         p->capacity = n;
         p->stores = 0;
-        p->data = allocator<value_type>().allocate(n);
+        p->data = n? allocator<value_type>().allocate(n): nullptr;
         return p;
     }
 
@@ -56,8 +57,9 @@ public:
     explicit stack(unsigned capacity): storage_(create_storage(capacity)) {}
 
     ~stack() {
-        allocator<value_type>().deallocate(storage_->data, storage_->capacity);
-        allocator<storage_type>().deallocate(storage_, 1);
+        if (storage_->data) {
+            allocator<value_type>().deallocate(storage_->data, storage_->capacity);
+        }
     }
 
     void clear() {
diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp
index ebb4ed6b..6315fe32 100644
--- a/src/backends/gpu/threshold_watcher.hpp
+++ b/src/backends/gpu/threshold_watcher.hpp
@@ -54,9 +54,11 @@ public:
         reset();
     }
 
-    /// Remove all stored crossings that were detected in previous calls
-    /// to test()
+    /// Remove all stored crossings that were detected in previous calls to test()
     void clear_crossings() {
+        if (managed_synch_required()) {
+            cudaDeviceSynchronize();
+        }
         stack_.clear();
     }
 
@@ -90,6 +92,7 @@ public:
         if (stack_.overflow()) {
             throw std::runtime_error("GPU spike buffer overflow.");
         }
+
         return std::vector<threshold_crossing>(stack_.begin(), stack_.end());
     }
 
-- 
GitLab
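
Note on the hand-rolled atomic: reduce_by_key.hpp now routes the reduction
through cuda_atomic_add, declared in src/backends/gpu/intrinsics.hpp, a file
this diff does not touch. Kepler (sm_35/sm_37) has no hardware
atomicAdd(double*, double), so the usual hand-rolled replacement is a
compare-and-swap loop over the 64-bit bit pattern. The sketch below
illustrates that standard technique; it is an assumption about the
intrinsic's shape, not a copy of Arbor's actual definition.

// Sketch of a hand-rolled double-precision atomic add for pre-Pascal devices.
// The name mirrors the intrinsic called by the patch, but the body is the
// generic atomicCAS pattern, not Arbor's actual implementation.
__device__ inline double cuda_atomic_add(double* address, double val) {
    unsigned long long int* p = reinterpret_cast<unsigned long long int*>(address);
    unsigned long long int old = *p, assumed;
    do {
        assumed = old;
        // Add in double precision to the value last observed, then try to
        // install the result; atomicCAS returns the value it actually found.
        old = atomicCAS(p, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);     // retry if another thread won the race
    return __longlong_as_double(old);
}

On sm_60 and later the intrinsic can simply forward to the hardware
atomicAdd(double*, double), which is one reason the build now records the
target architecture in ARB_CUDA_ARCH.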
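
Note on the managed-memory synchronization: the comment added to
managed_ptr.hpp is the crux of the Kepler fix. On devices with
ARB_CUDA_ARCH < 600 the driver makes the whole managed pool GPU-resident at
every kernel launch, so the host must synchronize before touching any managed
allocation. The standalone example below is hypothetical (not taken from the
Arbor sources) and shows the failure mode the managed_synch_required() guard
protects against.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void bump(int* p) { ++*p; }

int main() {
    int* p = nullptr;
    cudaMallocManaged(&p, sizeof(int));
    *p = 41;                  // host write before any kernel launch: fine everywhere

    bump<<<1, 1>>>(p);

    // On Kepler, a host read of *any* managed allocation while a kernel may
    // still be running is invalid, so this synchronization is mandatory.
    // On Pascal and later the read would merely page-fault and migrate the
    // data back, which is why the guard becomes a no-op for sm_60+.
    cudaDeviceSynchronize();

    std::printf("%d\n", *p);  // prints 42
    cudaFree(p);
    return 0;
}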