diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4d176be0e3a85fd0777c8d4959283d953fee5b2c..0566fffa4bca13258376d3bec36b3a0763fd1755 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -134,7 +134,7 @@ if(ARB_WITH_CUDA)
     # and device compiler when targetting GPU.
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DARB_HAVE_CUDA)
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DARB_HAVE_GPU)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60) # minimum target P100 GPUs
 
     add_definitions(-DARB_HAVE_GPU)
     include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
diff --git a/src/backends/gpu/kernels/reduce_by_key.hpp b/src/backends/gpu/kernels/reduce_by_key.hpp
index 0d4627e5030f067a90bbcbe233a6c6f3739bb67a..a0ad5e39e9f5852e90d83d4b98f15c474c4b321e 100644
--- a/src/backends/gpu/kernels/reduce_by_key.hpp
+++ b/src/backends/gpu/kernels/reduce_by_key.hpp
@@ -8,6 +8,43 @@ namespace gpu {
 
 namespace impl{
 
+constexpr unsigned mask_all = 0xFFFFFFFF;
+
+// Wrappers around the CUDA warp intrinsics used in this file.
+// CUDA 9 replaced the warp intrinsics with _sync variants, and
+// deprecated the old symbols.
+// These wrappers can be removed when CUDA 9 becomes the minimum
+// version used by Arbor.
+
+template <typename T>
+static __device__ __inline__
+T arb_shfl(T x, unsigned lane) {
+#if __CUDACC_VER_MAJOR__ < 9
+    return __shfl(x, lane);
+#else
+    return __shfl_sync(mask_all, x, lane);
+#endif
+}
+
+template <typename T>
+static __device__ __inline__
+T arb_shfl_up(T x, unsigned delta) {
+#if __CUDACC_VER_MAJOR__ < 9
+    return __shfl_up(x, delta);
+#else
+    return __shfl_up_sync(mask_all, x, delta);
+#endif
+}
+
+static __device__ __inline__
+unsigned arb_ballot(int pred) {
+#if __CUDACC_VER_MAJOR__ < 9
+    return __ballot(pred);
+#else
+    return __ballot_sync(mask_all, pred);
+#endif
+}
+
 // return the power of 2 that is less than or equal to i
 __device__ __inline__
 unsigned rounddown_power_of_2(std::uint32_t i) {
@@ -22,6 +59,9 @@ unsigned rounddown_power_of_2(std::uint32_t i) {
 // with two 32 bit shuffles.
 // get_from_lane is a wrapper around __shfl() for both single and double
 // precision.
+// TODO: CUDA 9 provides a double precision version of __shfl that
+// implements this hack. When we make CUDA 9 the minimum version, use
+// the CUDA implementation instead.
 
 __device__ __inline__
 double get_from_lane(double x, unsigned lane) {
@@ -31,8 +71,8 @@ double get_from_lane(double x, unsigned lane) {
     asm volatile( "mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(x) );
 
     // shuffle the two 32b registers.
-    lo = __shfl(lo, lane);
-    hi = __shfl(hi, lane);
+    lo = arb_shfl(lo, lane);
+    hi = arb_shfl(hi, lane);
 
     // return the recombined 64 bit value
     return __hiloint2double(hi, lo);
@@ -40,7 +80,7 @@ double get_from_lane(double x, unsigned lane) {
 
 __device__ __inline__
 float get_from_lane(float value, unsigned lane) {
-    return __shfl(value, lane);
+    return arb_shfl(value, lane);
 }
 
 // run_length_type Stores information about a run length.
@@ -82,14 +122,14 @@ struct run_length_type {
         lane_id = threadIdx.x%threads_per_warp();
 
         // determine if this thread is the root
-        int left_idx  = __shfl_up(idx, 1);
+        int left_idx  = arb_shfl_up(idx, 1);
         int is_root = 1;
         if(lane_id>0) {
             is_root = (left_idx != idx);
         }
 
         // determine the range this thread contributes to
-        unsigned roots = __ballot(is_root);
+        unsigned roots = arb_ballot(is_root);
 
         right = right_limit(roots, lane_id+1);
         left  = threads_per_warp()-1-right_limit(__brev(roots), threads_per_warp()-1-lane_id);