From 6ad689fae552e92845dc7163e5f22d33a4ad810e Mon Sep 17 00:00:00 2001
From: bcumming <bcumming@cscs.ch>
Date: Mon, 15 Apr 2019 22:49:48 +0200
Subject: [PATCH] Fix reduce-by-key CUDA (#737)

Fix bug in CUDA reduce_by_key implementation on V100 or later GPUs.
The bug was not triggered by current use cases of the algorithm in
Arbor, though it will be a problem when more than one reduction is to
be performed in a single kernel invocation, which is required for
accumulating both current and conductance values.

* Use warp-synchronous aware operations to avoid problems on V100.
* Simplify reduction kernel.
* Rename `run_length` ancillary data structure to `key_set_pos`.
* Add unit tests that trigger the incorrect behaviour observed in #736.

Fixes #736.
---
 arbor/backends/gpu/reduce_by_key.hpp | 185 +++++++--------------------
 modcc/printer/cudaprinter.cpp        |  13 +-
 test/unit/test_reduce_by_key.cu      |  78 ++++++++++-
 3 files changed, 129 insertions(+), 147 deletions(-)

diff --git a/arbor/backends/gpu/reduce_by_key.hpp b/arbor/backends/gpu/reduce_by_key.hpp
index f7e8e4c8..5bf73bf7 100644
--- a/arbor/backends/gpu/reduce_by_key.hpp
+++ b/arbor/backends/gpu/reduce_by_key.hpp
@@ -7,163 +7,70 @@ namespace arb {
 namespace gpu {
 
-namespace impl{
-
-constexpr unsigned mask_all = 0xFFFFFFFF;
-
-// Wrappers around the CUDA warp intrinsics used in this file.
-// CUDA 9 replaced the warp intrinsics with _sync variants, and
-// depricated the old symbols.
-// These wrappers can be removed CUDA 9 becomnse the minimum
-// version used by Arbor.
-
-template <typename T>
-static __device__ __inline__
-T arb_shfl(T x, unsigned lane) {
-#if __CUDACC_VER_MAJOR__ < 9
-    return __shfl(x, lane);
-#else
-    return __shfl_sync(mask_all, x, lane);
-#endif
-}
-
-template <typename T>
-static __device__ __inline__
-unsigned arb_shfl_up(T x, unsigned delta) {
-#if __CUDACC_VER_MAJOR__ < 9
-    return __shfl_up(x, delta);
-#else
-    return __shfl_up_sync(mask_all, x, delta);
-#endif
-}
-
-static __device__ __inline__
-unsigned arb_ballot(int pred) {
-#if __CUDACC_VER_MAJOR__ < 9
-    return __ballot(pred);
-#else
-    return __ballot_sync(mask_all, pred);
-#endif
-}
-
-// return the power of 2 that is less than or equal to i
-__device__ __inline__
-unsigned rounddown_power_of_2(std::uint32_t i) {
-    // handle power of 2 and zero
-    if(__popc(i)<2) return i;
-
-    return 1u<<(31u - __clz(i));
-}
-
-// The __shfl warp intrinsic is only implemented for 32 bit values.
-// To shuffle a 64 bit value (usually a double), the value must be copied
-// with two 32 bit shuffles.
-// get_from_lane is a wrapper around __shfl() for both single and double
-// precision.
-// TODO: CUDA 9 provides a double precision version of __shfl that
-// implements this hack. When we make CUDA 9 the minimum version, use
-// the CUDA implementation instead.
-
-__device__ __inline__
-double get_from_lane(double x, unsigned lane) {
-    // split the double number into two 32b registers.
-    int lo, hi;
-
-    asm volatile( "mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(x) );
-
-    // shuffle the two 32b registers.
-    lo = arb_shfl(lo, lane);
-    hi = arb_shfl(hi, lane);
-
-    // return the recombined 64 bit value
-    return __hiloint2double(hi, lo);
-}
-
-__device__ __inline__
-float get_from_lane(float value, unsigned lane) {
-    return arb_shfl(value, lane);
-}
-
-// run_length_type Stores information about a run length.
+// key_set_pos stores information required by a thread to calculate its
+// contribution to a reduce by key operation.
 //
-// A run length is a set of identical adjacent indexes in an index array.
+// In reduce_by_key each thread performs a reduction with threads that have the same
+// index in a sorted index array.
+// The algorithm for reduce_by_key implemented here requires that each thread
+// knows how far it is from the end of the key set, and whether it is the first
+// thread in the warp with the key.
 //
-// When doing a parallel reduction by index each thread must know about
-// which of its neighbour threads are contributiing to the same global
-// location (i.e. which neighbours have the same index).
-//
-// The constructor takes the thread id and index of each thread
-// and the threads work cooperatively using warp shuffles to determine
-// their run length information, so that each thread will have unique
-// information that describes its run length and its position therein.
-struct run_length_type {
-    unsigned left;
-    unsigned right;
-    unsigned width;
-    unsigned lane_id;
-
-    __device__ __inline__
-    bool is_root() const {
-        return left == lane_id;
-    }
-
-    __device__ __inline__
-    bool may_cross_warp() const {
-        return left==0 || right==threads_per_warp();
-    }
-
-    template <typename I1>
+// As currently implemented, this could be performed inline inside the reduce_by_key
+// function; however, it is kept in a separate data type as a first step towards reusing
+// the same key set information for multiple reduction kernel calls.
+struct key_set_pos {
+    unsigned width;    // distance to one past the end of this run
+    unsigned lane_id;  // id of this warp lane
+    unsigned key_mask; // warp mask of threads participating in reduction
+    unsigned is_root;  // if this lane is the first in the run
+
+    // The constructor takes the index of each thread and a mask of which threads
+    // in the warp are participating in the reduction, and the threads work cooperatively
+    // using warp intrinsics and bit twiddling, so that each thread will have unique
+    // information that describes its position in the key set.
     __device__
-    run_length_type(I1 idx) {
-        auto right_limit = [] (unsigned roots, unsigned shift) {
-            unsigned zeros_right = __ffs(roots>>shift);
-            return zeros_right ? shift -1 + zeros_right : threads_per_warp();
-        };
+    key_set_pos(int idx, unsigned mask) {
+        key_mask = mask;
+        lane_id = threadIdx.x%impl::threads_per_warp();
+        unsigned num_lanes = impl::threads_per_warp()-__clz(key_mask);
 
-        lane_id = threadIdx.x%threads_per_warp();
+        // Determine if this thread is the root (i.e. first thread with this key).
+        int left_idx = __shfl_up_sync(key_mask, idx, lane_id? 1: 0);
 
-        // determine if this thread is the root
-        int left_idx = arb_shfl_up(idx, 1);
-        int is_root = 1;
-        if(lane_id>0) {
-            is_root = (left_idx != idx);
-        }
+        is_root = lane_id? left_idx!=idx: 1;
 
-        // determine the range this thread contributes to
-        unsigned roots = arb_ballot(is_root);
+        // Determine the range this thread contributes to.
+        unsigned roots = __ballot_sync(key_mask, is_root);
 
-        right = right_limit(roots, lane_id+1);
-        left = threads_per_warp()-1-right_limit(__brev(roots), threads_per_warp()-1-lane_id);
-        width = rounddown_power_of_2(right - left);
+        // Find the distance to the lane id one past the end of the run.
+        // Take care if this is the last run in the warp.
+        width = __ffs(roots>>(lane_id+1));
+        if (!width) width = num_lanes-lane_id;
     }
 };
 
-} // namespace impl
-
 template <typename T, typename I>
 __device__ __inline__
-void reduce_by_key(T contribution, T* target, I idx) {
-    impl::run_length_type run(idx);
+void reduce_by_key(T contribution, T* target, I i, unsigned mask) {
+    key_set_pos run(i, mask);
+    unsigned shift = 1;
+    const unsigned width = run.width;
 
-    // get local copies of right and width, which are modified in the reduction loop
-    auto rhs = run.right;
-    auto width = run.width;
+    unsigned w = shift<width? shift: 0;
 
-    while (width) {
-        unsigned source_lane = run.lane_id + width;
+    while (__any_sync(run.key_mask, w)) {
+        T source_value = __shfl_down_sync(run.key_mask, contribution, w);
 
-        auto source_value = impl::get_from_lane(contribution, source_lane);
-        if (source_lane<rhs) {
-            contribution += source_value;
-        }
+        if (w) contribution += source_value;
 
-        rhs = run.left + width;
-        width >>= 1;
+        shift <<= 1;
+        w = shift<width? shift: 0;
     }
 
-    if(run.is_root()) {
-        // Update atomically in case the run spans multiple warps.
-        cuda_atomic_add(target+idx, contribution);
+    if(run.is_root) {
+        // The update must be atomic, because the run may span multiple warps.
+        cuda_atomic_add(target+i, contribution);
     }
 }
diff --git a/modcc/printer/cudaprinter.cpp b/modcc/printer/cudaprinter.cpp
index a5304524..e84200f2 100644
--- a/modcc/printer/cudaprinter.cpp
+++ b/modcc/printer/cudaprinter.cpp
@@ -394,6 +394,17 @@ void emit_api_body_cu(std::ostream& out, APIMethod* e, bool is_point_proc) {
     if (!body->statements().empty()) {
         out << "int tid_ = threadIdx.x + blockDim.x*blockIdx.x;\n";
+        if (is_point_proc) {
+            // The key set information is only required if this method will
+            // update an indexed variable, like current or conductance.
+            // This is the case if one of the external variables "is_write".
+            auto it = std::find_if(indexed_vars.begin(), indexed_vars.end(),
+                      [](auto& sym){return sym->external_variable()->is_write();});
+            if (it!=indexed_vars.end()) {
+                out << "unsigned lane_mask_ = __ballot_sync(0xffffffff, tid_<n_);\n";
+            }
+        }
+
         out << "if (tid_<n_) {\n" << indent;
 
         for (auto& index: indices) {
@@ -445,7 +456,7 @@ void emit_state_update_cu(std::ostream& out, Symbol* from,
         out << "arb::gpu::reduce_by_key(";
         is_minus && out << "-";
         out << from->name()
-            << ", params_." << d.data_var << ", " << index_i_name(d.index_var) << ");\n";
+            << ", params_." << d.data_var << ", " << index_i_name(d.index_var) << ", lane_mask_);\n";
     }
     else {
         out << cuprint(external) << (is_minus? " -= ": " += ") << from->name() << ";\n";
" -= ": " += ") << from->name() << ";\n"; diff --git a/test/unit/test_reduce_by_key.cu b/test/unit/test_reduce_by_key.cu index ad662fe0..830a883e 100644 --- a/test/unit/test_reduce_by_key.cu +++ b/test/unit/test_reduce_by_key.cu @@ -14,8 +14,9 @@ __global__ void reduce_kernel(const T* src, T* dst, const I* index, int n) { unsigned tid = threadIdx.x + blockIdx.x*blockDim.x; + unsigned mask = __ballot_sync(0xffffffff, tid<n); if (tid<n) { - gpu::reduce_by_key(src[tid], dst, index[tid]); + gpu::reduce_by_key(src[tid], dst, index[tid], mask); } } @@ -69,8 +70,8 @@ TEST(reduce_by_key, single_repeated_index) std::vector<double> in(n, 1); std::vector<int> index(n, 0); - auto out = reduce(in, 1, index); - EXPECT_EQ(out[0], double(n)); + auto out = reduce(in, 1, index, 32); + EXPECT_EQ(double(n), out[0]); } // Perform reduction of an ascending sequence of {1,2,3,...,n} // The expected result is n*(n+1)/2 @@ -85,21 +86,84 @@ TEST(reduce_by_key, single_repeated_index) TEST(reduce_by_key, scatter) { - std::vector<int> index = {0,0,0,1,2,2,3,7,7,7,11}; + std::vector<int> index = {0,0,0,1,2,2,2,2,3,3,7,7,7,7,7,11}; unsigned n = util::max_value(index)+1; std::vector<double> in(index.size(), 1); - std::vector<double> expected = {3., 1., 2., 1., 0., 0., 0., 3., 0., 0., 0., 1.}; + std::vector<double> expected = {3., 1., 4., 2., 0., 0., 0., 5., 0., 0., 0., 1.}; + + unsigned m = index.size(); EXPECT_EQ(n, expected.size()); auto out = reduce(in, n, index); - EXPECT_EQ(out, expected); + EXPECT_EQ(expected, out); // rerun with 7 threads per thread block, to test // * using more than one thread block // * thread blocks that are not a multiple of 32 // * thread blocks that are less than 32 + out = reduce(in, n, index, 7); + EXPECT_EQ(expected, out); +} + +// Test kernels that perform more than one reduction in a single invokation. +// Used to reproduce and test for synchronization issues on V100 GPUs. 
+
+template <typename T, typename I>
+__global__
+void reduce_twice_kernel(const T* src, T* dst, const I* index, int n) {
+    unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
+
+    unsigned mask = __ballot_sync(0xffffffff, tid<n);
+    if (tid<n) {
+        gpu::reduce_by_key(src[tid], dst, index[tid], mask);
+        gpu::reduce_by_key(src[tid], dst, index[tid], mask);
+    }
+}
+
+template <typename T>
+std::vector<T> reduce_twice(const std::vector<T>& in, size_t n_out, const std::vector<int>& index, unsigned block_dim=128) {
+    EXPECT_EQ(in.size(), index.size());
+    EXPECT_TRUE(std::is_sorted(index.begin(), index.end()));
+
+    using array = memory::device_vector<T>;
+    using iarray = memory::device_vector<int>;
+
+    int n = in.size();
+
+    array src = memory::make_const_view(in);
+    iarray idx = memory::make_const_view(index);
+    array dst(n_out, 0);
+
+    unsigned grid_dim = (n-1)/block_dim + 1;
+    reduce_twice_kernel<<<grid_dim, block_dim>>>(src.data(), dst.data(), idx.data(), n);
+
+    std::vector<T> out(n_out);
+    memory::copy(dst, memory::make_view(out));
+
+    return out;
+}
+
+TEST(reduce_by_key, scatter_twice)
+{
+    std::vector<int> index = {0,0,0,1,2,2,3,7,7,7,11};
+    unsigned n = util::max_value(index)+1;
+    std::vector<double> in(index.size(), 1);
+    std::vector<double> expected = {6., 2., 4., 2., 0., 0., 0., 6., 0., 0., 0., 2.};
+
+    unsigned m = index.size();
+
+    EXPECT_EQ(n, expected.size());
+
+    auto out = reduce_twice(in, n, index);
+    EXPECT_EQ(expected, out);
+
+    // rerun with 7 threads per thread block, to test
+    // * using more than one thread block
+    // * thread blocks that are not a multiple of 32
+    // * thread blocks that are less than 32
-    EXPECT_EQ(out, expected);
+    out = reduce_twice(in, n, index, 7);
+    EXPECT_EQ(expected, out);
 }
-- 
GitLab
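
A worked example of the key_set_pos bit arithmetic, for reference. The sketch below is not part of the patch: it emulates the device intrinsics on the host, with __builtin_ffs standing in for the device __ffs and a plain loop standing in for __ballot_sync, to show how each lane derives its width for one warp's worth of sorted keys.

    // width_demo.cpp -- host-side emulation of the key_set_pos width computation.
    // Build with: g++ -std=c++11 width_demo.cpp (relies on GCC/Clang __builtin_ffs).
    #include <cstdio>

    int main() {
        // One warp's sorted keys, matching the first lanes of the scatter test.
        int idx[] = {0, 0, 0, 1, 2, 2};
        unsigned num_lanes = 6; // lanes holding valid data: popcount of the ballot mask

        // roots: ballot of "is this lane the first with its key?", as computed
        // by __ballot_sync(key_mask, is_root) in the constructor.
        unsigned roots = 0;
        for (unsigned lane = 0; lane<num_lanes; ++lane) {
            bool is_root = lane==0 || idx[lane]!=idx[lane-1];
            roots |= unsigned(is_root)<<lane;
        }

        // width: distance to one past the end of the lane's run, i.e. the offset
        // of the next root bit above this lane, falling back to the end of the
        // active lanes when the lane lies in the last run of the warp.
        for (unsigned lane = 0; lane<num_lanes; ++lane) {
            unsigned width = __builtin_ffs(roots>>(lane+1)); // __ffs on the device
            if (!width) width = num_lanes-lane;
            std::printf("lane %u: key %d, width %u\n", lane, idx[lane], width);
        }
        return 0;
    }

For the keys {0,0,0,1,2,2} this prints the widths 3 2 1 1 2 1: lane 0 is the root of a run of three, lane 3 the root of a run of one, and lane 4 the root of a run of two.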
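To see the warp-synchronous reduction end to end without building Arbor, the following self-contained CUDA sketch implements the same algorithm. Everything beyond what the patch shows is an assumption for illustration: the file and kernel names, and plain atomicAdd in place of Arbor's cuda_atomic_add (double-precision atomicAdd requires compute capability 6.0 or later, which V100 satisfies).

    // reduce_demo.cu -- standalone sketch of the patched reduce_by_key algorithm.
    // Build with: nvcc -arch=sm_70 reduce_demo.cu
    #include <cstdio>
    #include <vector>

    __global__
    void reduce_demo(const double* src, double* dst, const int* index, int n) {
        int tid = threadIdx.x + blockIdx.x*blockDim.x;
        // Every thread in the warp takes part in the ballot, so the mask
        // records exactly which lanes hold valid data.
        unsigned mask = __ballot_sync(0xffffffff, tid<n);
        if (tid>=n) return;

        unsigned lane = threadIdx.x%32;
        unsigned num_lanes = 32-__clz(mask);
        int idx = index[tid];

        // Root detection and width computation, as in key_set_pos.
        int left = __shfl_up_sync(mask, idx, lane? 1: 0);
        int is_root = lane? left!=idx: 1;
        unsigned roots = __ballot_sync(mask, is_root);
        unsigned width = __ffs(roots>>(lane+1));
        if (!width) width = num_lanes-lane;

        // Shuffle-based tree reduction. Every lane in the mask executes each
        // __shfl_down_sync and __any_sync, even when its own offset w is zero;
        // this is what keeps Volta's independently scheduled threads converged.
        double contribution = src[tid];
        unsigned shift = 1;
        unsigned w = shift<width? shift: 0;
        while (__any_sync(mask, w)) {
            double other = __shfl_down_sync(mask, contribution, w);
            if (w) contribution += other;
            shift <<= 1;
            w = shift<width? shift: 0;
        }

        // Only the root lane writes, atomically, since a key set may span warps.
        if (is_root) atomicAdd(dst+idx, contribution);
    }

    int main() {
        // Keys {0,0,0,1,2,2} with unit values should reduce to {3,1,2}.
        std::vector<int> index = {0, 0, 0, 1, 2, 2};
        std::vector<double> in(index.size(), 1.0);
        int n = (int)in.size();

        double *src, *dst;
        int* idx;
        cudaMalloc(&src, n*sizeof(double));
        cudaMalloc(&idx, n*sizeof(int));
        cudaMalloc(&dst, 3*sizeof(double));
        cudaMemcpy(src, in.data(), n*sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(idx, index.data(), n*sizeof(int), cudaMemcpyHostToDevice);
        cudaMemset(dst, 0, 3*sizeof(double));

        reduce_demo<<<1, 32>>>(src, dst, idx, n);

        double out[3];
        cudaMemcpy(out, dst, sizeof(out), cudaMemcpyDeviceToHost);
        std::printf("%g %g %g\n", out[0], out[1], out[2]); // expect: 3 1 2
        cudaFree(src); cudaFree(idx); cudaFree(dst);
        return 0;
    }

The pre-Volta code relied on implicit warp-synchronous execution. With independent thread scheduling on V100, a second reduction in the same kernel can find the warp's lanes at different points in the previous loop, which is why every shuffle, ballot and vote above carries an explicit member mask.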