From 6ad689fae552e92845dc7163e5f22d33a4ad810e Mon Sep 17 00:00:00 2001
From: bcumming <bcumming@cscs.ch>
Date: Mon, 15 Apr 2019 22:49:48 +0200
Subject: [PATCH] Fix reduce-by-key CUDA (#737)

Fix a bug in the CUDA reduce_by_key implementation on V100 and later GPUs.
The bug is not triggered by the current uses of the algorithm in Arbor, but it
becomes a problem when more than one reduction is performed in a single kernel
invocation, which is required for accumulating both current and conductance
values.
* Use warp-synchronous (_sync) intrinsics to avoid problems on V100.
* Simplify reduction kernel.
* Rename `run_length` ancillary data structure to `key_set_pos`.
* Add unit tests that trigger the incorrect behaviour observed in #736.

Fixes #736.
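
The fix changes the interface of reduce_by_key: callers now pass a lane mask
that must be computed by every thread in the warp before any divergent branch.
The sketch below mirrors the updated unit-test kernel; the kernel name and
argument names are illustrative only.

    template <typename T, typename I>
    __global__
    void example_kernel(const T* src, T* dst, const I* index, int n) {
        unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;

        // Build the participation mask while the whole warp is still
        // converged, i.e. before branching on tid<n.
        unsigned mask = __ballot_sync(0xffffffff, tid<n);
        if (tid<n) {
            // The mask keeps the _sync shuffles and ballots inside
            // reduce_by_key well defined under Volta's independent thread
            // scheduling, and the same mask can be reused for several
            // reductions in one kernel invocation.
            arb::gpu::reduce_by_key(src[tid], dst, index[tid], mask);
        }
    }
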
---
 arbor/backends/gpu/reduce_by_key.hpp | 185 +++++++--------------------
 modcc/printer/cudaprinter.cpp        |  13 +-
 test/unit/test_reduce_by_key.cu      |  78 ++++++++++-
 3 files changed, 129 insertions(+), 147 deletions(-)

diff --git a/arbor/backends/gpu/reduce_by_key.hpp b/arbor/backends/gpu/reduce_by_key.hpp
index f7e8e4c8..5bf73bf7 100644
--- a/arbor/backends/gpu/reduce_by_key.hpp
+++ b/arbor/backends/gpu/reduce_by_key.hpp
@@ -7,163 +7,70 @@
 namespace arb {
 namespace gpu {
 
-namespace impl{
-
-constexpr unsigned mask_all = 0xFFFFFFFF;
-
-// Wrappers around the CUDA warp intrinsics used in this file.
-// CUDA 9 replaced the warp intrinsics with _sync variants, and
-// depricated the old symbols.
-// These wrappers can be removed CUDA 9 becomnse the minimum
-// version used by Arbor.
-
-template <typename T>
-static __device__ __inline__
-T arb_shfl(T x, unsigned lane) {
-#if __CUDACC_VER_MAJOR__ < 9
-    return __shfl(x, lane);
-#else
-    return __shfl_sync(mask_all, x, lane);
-#endif
-}
-
-template <typename T>
-static __device__ __inline__
-unsigned arb_shfl_up(T x, unsigned delta) {
-#if __CUDACC_VER_MAJOR__ < 9
-    return __shfl_up(x, delta);
-#else
-    return __shfl_up_sync(mask_all, x, delta);
-#endif
-}
-
-static __device__ __inline__
-unsigned arb_ballot(int pred) {
-#if __CUDACC_VER_MAJOR__ < 9
-    return __ballot(pred);
-#else
-    return __ballot_sync(mask_all, pred);
-#endif
-}
-
-// return the power of 2 that is less than or equal to i
-__device__ __inline__
-unsigned rounddown_power_of_2(std::uint32_t i) {
-    // handle power of 2 and zero
-    if(__popc(i)<2) return i;
-
-    return 1u<<(31u - __clz(i));
-}
-
-// The __shfl warp intrinsic is only implemented for 32 bit values.
-// To shuffle a 64 bit value (usually a double), the value must be copied
-// with two 32 bit shuffles.
-// get_from_lane is a wrapper around __shfl() for both single and double
-// precision.
-// TODO: CUDA 9 provides a double precision version of __shfl that
-// implements this hack. When we make CUDA 9 the minimum version, use
-// the CUDA implementation instead.
-
-__device__ __inline__
-double get_from_lane(double x, unsigned lane) {
-    // split the double number into two 32b registers.
-    int lo, hi;
-
-    asm volatile( "mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(x) );
-
-    // shuffle the two 32b registers.
-    lo = arb_shfl(lo, lane);
-    hi = arb_shfl(hi, lane);
-
-    // return the recombined 64 bit value
-    return __hiloint2double(hi, lo);
-}
-
-__device__ __inline__
-float get_from_lane(float value, unsigned lane) {
-    return arb_shfl(value, lane);
-}
-
-// run_length_type Stores information about a run length.
+// key_set_pos stores information required by a thread to calculate its
+// contribution to a reduce by key operation.
 //
-// A run length is a set of identical adjacent indexes in an index array.
+// In reduce_by_key each thread performs a reduction with threads that have the same
+// index in a sorted index array.
+// The algorithm for reduce_by_key implemented here requires that each thread
+// knows how far it is from the end of the key set, and whether it is the first
+// thread in the warp with the key.
 //
-// When doing a parallel reduction by index each thread must know about
-// which of its neighbour threads are contributiing to the same global
-// location (i.e. which neighbours have the same index).
-//
-// The constructor takes the thread id and index of each thread
-// and the threads work cooperatively using warp shuffles to determine
-// their run length information, so that each thread will have unique
-// information that describes its run length and its position therein.
-struct run_length_type {
-    unsigned left;
-    unsigned right;
-    unsigned width;
-    unsigned lane_id;
-
-    __device__ __inline__
-    bool is_root() const {
-        return left == lane_id;
-    }
-
-    __device__ __inline__
-    bool may_cross_warp() const {
-        return left==0 || right==threads_per_warp();
-    }
-
-    template <typename I1>
+// As currently implemented, this could be performed inline inside the reduce_by_key
+// function; however, it is kept in a separate data type as a first step towards reusing
+// the same key set information for multiple reduction kernel calls.
+struct key_set_pos {
+    unsigned width;         // distance to one past the end of this run
+    unsigned lane_id;       // id of this warp lane
+    unsigned key_mask;      // warp mask of threads participating in reduction
+    unsigned is_root;       // if this lane is the first in the run
+
+    // The constructor takes each thread's key index and a mask of the threads in the
+    // warp that participate in the reduction. The threads work cooperatively, using
+    // warp intrinsics and bit twiddling, so that each thread ends up with unique
+    // information that describes its position in the key set.
     __device__
-    run_length_type(I1 idx) {
-        auto right_limit = [] (unsigned roots, unsigned shift) {
-            unsigned zeros_right  = __ffs(roots>>shift);
-            return zeros_right ? shift -1 + zeros_right : threads_per_warp();
-        };
+    key_set_pos(int idx, unsigned mask) {
+        key_mask = mask;
+        lane_id = threadIdx.x%impl::threads_per_warp();
+        unsigned num_lanes = impl::threads_per_warp()-__clz(key_mask);
 
-        lane_id = threadIdx.x%threads_per_warp();
+        // Determine if this thread is the root (i.e. first thread with this key).
+        int left_idx  = __shfl_up_sync(key_mask, idx, lane_id? 1: 0);
 
-        // determine if this thread is the root
-        int left_idx  = arb_shfl_up(idx, 1);
-        int is_root = 1;
-        if(lane_id>0) {
-            is_root = (left_idx != idx);
-        }
+        is_root = lane_id? left_idx!=idx: 1;
 
-        // determine the range this thread contributes to
-        unsigned roots = arb_ballot(is_root);
+        // Determine the range this thread contributes to.
+        unsigned roots = __ballot_sync(key_mask, is_root);
 
-        right = right_limit(roots, lane_id+1);
-        left  = threads_per_warp()-1-right_limit(__brev(roots), threads_per_warp()-1-lane_id);
-        width = rounddown_power_of_2(right - left);
+        // Find the distance to the lane id one past the end of the run.
+        // Take care if this is the last run in the warp.
+        width = __ffs(roots>>(lane_id+1));
+        if (!width) width = num_lanes-lane_id;
     }
 };
 
-} // namespace impl
-
 template <typename T, typename I>
 __device__ __inline__
-void reduce_by_key(T contribution, T* target, I idx) {
-    impl::run_length_type run(idx);
+void reduce_by_key(T contribution, T* target, I i, unsigned mask) {
+    key_set_pos run(i, mask);
+    unsigned shift = 1;
+    const unsigned width = run.width;
 
-    // get local copies of right and width, which are modified in the reduction loop
-    auto rhs = run.right;
-    auto width = run.width;
+    unsigned w = shift<width? shift: 0;
 
-    while (width) {
-        unsigned source_lane = run.lane_id + width;
+    while (__any_sync(run.key_mask, w)) {
+        T source_value = __shfl_down_sync(run.key_mask, contribution, w);
 
-        auto source_value = impl::get_from_lane(contribution, source_lane);
-        if (source_lane<rhs) {
-            contribution += source_value;
-        }
+        if (w) contribution += source_value;
 
-        rhs = run.left + width;
-        width >>= 1;
+        shift <<= 1;
+        w = shift<width? shift: 0;
     }
 
-    if(run.is_root()) {
-        // Update atomically in case the run spans multiple warps.
-        cuda_atomic_add(target+idx, contribution);
+    if(run.is_root) {
+        // The update must be atomic, because the run may span multiple warps.
+        cuda_atomic_add(target+i, contribution);
     }
 }
 
diff --git a/modcc/printer/cudaprinter.cpp b/modcc/printer/cudaprinter.cpp
index a5304524..e84200f2 100644
--- a/modcc/printer/cudaprinter.cpp
+++ b/modcc/printer/cudaprinter.cpp
@@ -394,6 +394,17 @@ void emit_api_body_cu(std::ostream& out, APIMethod* e, bool is_point_proc) {
 
     if (!body->statements().empty()) {
         out << "int tid_ = threadIdx.x + blockDim.x*blockIdx.x;\n";
+        if (is_point_proc) {
+            // The lane mask is only required if this method will
+            // update an indexed variable, such as current or conductance.
+            // This is the case if one of the external variables "is_write".
+            auto it = std::find_if(indexed_vars.begin(), indexed_vars.end(),
+                      [](auto& sym){return sym->external_variable()->is_write();});
+            if (it!=indexed_vars.end()) {
+                out << "unsigned lane_mask_ = __ballot_sync(0xffffffff, tid_<n_);\n";
+            }
+        }
+
         out << "if (tid_<n_) {\n" << indent;
 
         for (auto& index: indices) {
@@ -445,7 +456,7 @@ void emit_state_update_cu(std::ostream& out, Symbol* from,
         out << "arb::gpu::reduce_by_key(";
         is_minus && out << "-";
         out << from->name()
-            << ", params_." << d.data_var << ", " << index_i_name(d.index_var) << ");\n";
+            << ", params_." << d.data_var << ", " << index_i_name(d.index_var) << ", lane_mask_);\n";
     }
     else {
         out << cuprint(external) << (is_minus? " -= ": " += ") << from->name() << ";\n";
diff --git a/test/unit/test_reduce_by_key.cu b/test/unit/test_reduce_by_key.cu
index ad662fe0..830a883e 100644
--- a/test/unit/test_reduce_by_key.cu
+++ b/test/unit/test_reduce_by_key.cu
@@ -14,8 +14,9 @@ __global__
 void reduce_kernel(const T* src, T* dst, const I* index, int n) {
     unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
 
+    unsigned mask = __ballot_sync(0xffffffff, tid<n);
     if (tid<n) {
-        gpu::reduce_by_key(src[tid], dst, index[tid]);
+        gpu::reduce_by_key(src[tid], dst, index[tid], mask);
     }
 }
 
@@ -69,8 +70,8 @@ TEST(reduce_by_key, single_repeated_index)
         std::vector<double> in(n, 1);
         std::vector<int> index(n, 0);
 
-        auto out = reduce(in, 1, index);
-        EXPECT_EQ(out[0], double(n));
+        auto out = reduce(in, 1, index, 32);
+        EXPECT_EQ(double(n), out[0]);
     }
     // Perform reduction of an ascending sequence of {1,2,3,...,n}
     // The expected result is n*(n+1)/2
@@ -85,21 +86,84 @@ TEST(reduce_by_key, single_repeated_index)
 
 TEST(reduce_by_key, scatter)
 {
-    std::vector<int> index = {0,0,0,1,2,2,3,7,7,7,11};
+    std::vector<int> index = {0,0,0,1,2,2,2,2,3,3,7,7,7,7,7,11};
     unsigned n = util::max_value(index)+1;
     std::vector<double> in(index.size(), 1);
-    std::vector<double> expected = {3., 1., 2., 1., 0., 0., 0., 3., 0., 0., 0., 1.};
+    std::vector<double> expected = {3., 1., 4., 2., 0., 0., 0., 5., 0., 0., 0., 1.};
+
+    unsigned m = index.size();
 
     EXPECT_EQ(n, expected.size());
 
     auto out = reduce(in, n, index);
-    EXPECT_EQ(out, expected);
+    EXPECT_EQ(expected, out);
 
     // rerun with 7 threads per thread block, to test
     //  * using more than one thread block
     //  * thread blocks that are not a multiple of 32
     //  * thread blocks that are less than 32
+
     out = reduce(in, n, index, 7);
+    EXPECT_EQ(expected, out);
+}
+
+// Test kernels that perform more than one reduction in a single invocation.
+// Used to reproduce and test for synchronization issues on V100 GPUs.
+
+template <typename T, typename I>
+__global__
+void reduce_twice_kernel(const T* src, T* dst, const I* index, int n) {
+    unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
+
+    unsigned mask = __ballot_sync(0xffffffff, tid<n);
+    if (tid<n) {
+        gpu::reduce_by_key(src[tid], dst, index[tid], mask);
+        gpu::reduce_by_key(src[tid], dst, index[tid], mask);
+    }
+}
+
+template <typename T>
+std::vector<T> reduce_twice(const std::vector<T>& in, size_t n_out, const std::vector<int>& index, unsigned block_dim=128) {
+    EXPECT_EQ(in.size(), index.size());
+    EXPECT_TRUE(std::is_sorted(index.begin(), index.end()));
+
+    using array = memory::device_vector<T>;
+    using iarray = memory::device_vector<int>;
+
+    int n = in.size();
+
+    array  src = memory::make_const_view(in);
+    iarray idx = memory::make_const_view(index);
+    array  dst(n_out, 0);
+
+    unsigned grid_dim = (n-1)/block_dim + 1;
+    reduce_twice_kernel<<<grid_dim, block_dim>>>(src.data(), dst.data(), idx.data(), n);
+
+    std::vector<T> out(n_out);
+    memory::copy(dst, memory::make_view(out));
+
+    return out;
+}
+
+TEST(reduce_by_key, scatter_twice)
+{
+    std::vector<int> index = {0,0,0,1,2,2,3,7,7,7,11};
+    unsigned n = util::max_value(index)+1;
+    std::vector<double> in(index.size(), 1);
+    std::vector<double> expected = {6., 2., 4., 2., 0., 0., 0., 6., 0., 0., 0., 2.};
+
+    unsigned m = index.size();
+
+    EXPECT_EQ(n, expected.size());
+
+    auto out = reduce_twice(in, n, index);
+    EXPECT_EQ(expected, out);
+
+    // rerun with 7 threads per thread block, to test
+    //  * using more than one thread block
+    //  * thread blocks that are not a multiple of 32
+    //  * thread blocks that are less than 32
 
-    EXPECT_EQ(out, expected);
+    out = reduce_twice(in, n, index, 7);
+    EXPECT_EQ(expected, out);
 }
-- 
GitLab