diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp
index 4a54100519fa9468ee13857c8648a29655060176..57a6641246310c6f8c7e3a05978f1836c61d9ddf 100644
--- a/miniapp/miniapp.cpp
+++ b/miniapp/miniapp.cpp
@@ -69,7 +69,7 @@ int main(int argc, char** argv) {
 
         banner();
 
-        meters.checkpoint("global setup");
+        meters.checkpoint("setup");
 
         // determine what to attach probes to
         probe_distribution pdist;
@@ -136,12 +136,12 @@ int main(int argc, char** argv) {
             }
         }
 
-        meters.checkpoint("model initialization");
+        meters.checkpoint("model-init");
 
         // run model
         m.run(options.tfinal, options.dt);
 
-        meters.checkpoint("time stepping");
+        meters.checkpoint("model-simulate");
 
         // output profile and diagnostic feedback
         auto const num_steps = options.tfinal / options.dt;
@@ -154,7 +154,14 @@ int main(int argc, char** argv) {
             write_trace(*trace.get(), options.trace_prefix);
         }
 
-        util::save_to_file(meters, "meters.json");
+        auto report = util::make_meter_report(meters);
+        std::cout << report;
+        if (global_policy::id()==0) {
+            std::ofstream fid;
+            fid.exceptions(std::ios_base::badbit | std::ios_base::failbit);
+            fid.open("meters.json");
+            fid << std::setw(1) << util::to_json(report) << "\n";
+        }
     }
     catch (io::usage_error& e) {
         // only print usage/startup errors on master
diff --git a/modcc/cudaprinter.cpp b/modcc/cudaprinter.cpp
index f4e3db811a20aa0a4e58c50148e4b3f8f147f565..5f54ff0efeaae0f8e67af43bea854228a5daf24b 100644
--- a/modcc/cudaprinter.cpp
+++ b/modcc/cudaprinter.cpp
@@ -43,6 +43,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     text_.add_line("#include <algorithms.hpp>");
     text_.add_line("#include <backends/gpu/intrinsics.hpp>");
     text_.add_line("#include <backends/gpu/multi_event_stream.hpp>");
+    text_.add_line("#include <backends/gpu/kernels/reduce_by_key.hpp>");
     text_.add_line("#include <util/pprintf.hpp>");
     text_.add_line();
 
@@ -878,11 +879,13 @@ void CUDAPrinter::print_APIMethod_body(ProcedureExpression* e) {
             in->accept(this);
         }
         else {
-            text_ << (out->op()==tok::plus ? "cuda_atomic_add" : "cuda_atomic_sub") << "(&";
-            out->accept(this);
-            text_ << ", ";
+            text_ << "nest::mc::gpu::reduce_by_key(";
+            if (out->op()==tok::minus) text_ << "-";
             in->accept(this);
-            text_ << ")";
+            // reduce_by_key() takes a pointer to the start of the target
+            // array as a parameter. This requires writing the index_name of out, which
+            // we can safely assume is an indexed_variable by this point.
+            text_ << ", params_." << out->is_indexed_variable()->index_name() << ", gid_)";
         }
         text_.end_line(";");
     }
diff --git a/src/backends/gpu/kernels/reduce_by_key.hpp b/src/backends/gpu/kernels/reduce_by_key.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1b34d7c19f107ee7c6e47620333d0ef77e363d41
--- /dev/null
+++ b/src/backends/gpu/kernels/reduce_by_key.hpp
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <cstdint>
+#include "detail.hpp"
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+namespace impl {
+
+// return the largest power of 2 that is less than or equal to i
+__device__ __inline__
+unsigned rounddown_power_of_2(std::uint32_t i) {
+    // handle power of 2 and zero
+    if(__popc(i)<2) return i;
+
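+    // otherwise, isolate the most significant set bit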
+    return 1u<<(31u - __clz(i));
+}
+
+// The __shfl warp intrinsic is only implemented for 32 bit values.
+// To shuffle a 64 bit value (usually a double), it must be split into
+// two 32 bit halves and moved with two 32 bit shuffles.
+// get_from_lane is a wrapper around __shfl() for both single and double
+// precision.
+
+__device__ __inline__
+double get_from_lane(double x, unsigned lane) {
+    // split the double number into two 32b registers.
+    int lo, hi;
+
+    asm volatile( "mov.b64 { %0, %1 }, %2;" : "=r"(lo), "=r"(hi) : "d"(x) );
+
+    // shuffle the two 32b registers.
+    lo = __shfl(lo, lane);
+    hi = __shfl(hi, lane);
+
+    // return the recombined 64 bit value
+    return __hiloint2double(hi, lo);
+}
+
+__device__ __inline__
+float get_from_lane(float value, unsigned lane) {
+    return __shfl(value, lane);
+}
+
+// run_length_type stores information about a run length.
+//
+// A run length is a set of identical adjacent indexes in an index array.
+//
+// When doing a parallel reduction by index, each thread must know
+// which of its neighbouring threads are contributing to the same global
+// location (i.e. which neighbours have the same index).
+//
+// The constructor takes the index of each thread, and the threads work
+// cooperatively using warp shuffles to determine their run length
+// information, so that each thread ends up with unique information that
+// describes its run length and its position therein.
+struct run_length_type {
+    unsigned left;
+    unsigned right;
+    unsigned width;
+    unsigned lane_id;
+
+    __device__ __inline__
+    bool is_root() const {
+        return left == lane_id;
+    }
+
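+    // true if this run could continue into a neighbouring warp, i.e. if
+    // it touches either end of this warp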
+    __device__ __inline__
+    bool may_cross_warp() const {
+        return left==0 || right==threads_per_warp();
+    }
+
+    template <typename I1>
+    __device__
+    run_length_type(I1 idx) {
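+        // right_limit(roots, shift) returns the lane index of the first
+        // root at or to the right of lane 'shift', or threads_per_warp()
+        // if there is none.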
+        auto right_limit = [] (unsigned roots, unsigned shift) {
+            unsigned zeros_right  = __ffs(roots>>shift);
+            return zeros_right ? shift - 1 + zeros_right : threads_per_warp();
+        };
+
+        lane_id = threadIdx.x%threads_per_warp();
+
+        // determine if this thread is the root
+        int left_idx  = __shfl_up(idx, 1);
+        int is_root = 1;
+        if(lane_id>0) {
+            is_root = (left_idx != idx);
+        }
+
+        // determine the range this thread contributes to
+        unsigned roots = __ballot(is_root);
+
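+        // right is one past the last lane of this thread's run;
+        // left is the lane of the run's root, found by applying the same
+        // search to the bit-reversed ballot mask.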
+        right = right_limit(roots, lane_id+1);
+        left  = threads_per_warp()-1-right_limit(__brev(roots), threads_per_warp()-1-lane_id);
+        width = rounddown_power_of_2(right - left);
+    }
+};
+
+} // namespace impl
+
+template <typename T, typename I>
+__device__ __inline__
+void reduce_by_key(T contribution, T* target, I idx) {
+    impl::run_length_type run(idx);
+
+    // get local copies of right and width, which are modified in the reduction loop
+    auto rhs = run.right;
+    auto width = run.width;
+
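+    // Tree reduction: each pass adds in the value from the lane 'width'
+    // places to the right, if that lane is below the current right limit,
+    // then halves width. The root of the run ends up with the full sum.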
+    while (width) {
+        unsigned source_lane = run.lane_id + width;
+
+        auto source_value = impl::get_from_lane(contribution, source_lane);
+        if (source_lane<rhs) {
+            contribution += source_value;
+        }
+
+        rhs = run.left + width;
+        width >>= 1;
+    }
+
+    if(run.is_root()) {
+        // Update atomically in case the run spans multiple warps.
+        atomicAdd(target+idx, contribution);
+    }
+}
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/profiling/memory_meter.cpp b/src/profiling/memory_meter.cpp
index c18c8f26e223a7574d7f8887c5dabf8378fbd2aa..5d894167b3014461596188b66f5c09ea9a7bfe71 100644
--- a/src/profiling/memory_meter.cpp
+++ b/src/profiling/memory_meter.cpp
@@ -19,7 +19,7 @@ protected:
 
 public:
     std::string name() override {
-        return "memory-allocated";
+        return "memory";
     }
 
     std::string units() override {
@@ -57,7 +57,7 @@ meter_ptr make_memory_meter() {
 class gpu_memory_meter: public memory_meter {
 public:
     std::string name() override {
-        return "gpu-memory-allocated";
+        return "memory-gpu";
     }
 
     void take_reading() override {
diff --git a/src/profiling/meter_manager.cpp b/src/profiling/meter_manager.cpp
index 54b5bd90a29f1701b609f0d400c2be81aad21c20..eae9d3ed47e2f63d7d9ff7b8319bc026e12851f4 100644
--- a/src/profiling/meter_manager.cpp
+++ b/src/profiling/meter_manager.cpp
@@ -1,5 +1,7 @@
 #include <communication/global_policy.hpp>
 #include <util/hostname.hpp>
+#include <util/strprintf.hpp>
+#include <util/rangeutil.hpp>
 #include <json/json.hpp>
 
 #include "meter_manager.hpp"
@@ -101,45 +103,74 @@ nlohmann::json to_json(const measurement& mnt) {
     };
 }
 
-nlohmann::json to_json(const meter_manager& manager) {
+// Build a report of meters, for use at the end of a simulation
+// for output to file or analysis.
+meter_report make_meter_report(const meter_manager& manager) {
+    meter_report report;
+
     using gcom = communication::global_policy;
 
+    // Add the times to the meter outputs
+    report.meters.push_back(measurement("time", "s", manager.times()));
+
-    // Gather the meter outputs into a json Array
+    // Gather the meter outputs into the report
-    nlohmann::json meter_out;
     for (auto& m: manager.meters()) {
-        meter_out.push_back(
-            to_json(measurement(m->name(), m->units(), m->measurements()))
-        );
+        report.meters.push_back(
+            measurement(m->name(), m->units(), m->measurements()));
     }
-    // Add the times to the meter outputs
-    meter_out.push_back(to_json(measurement("time", "s", manager.times())));
 
     // Gather a vector with the names of the node that each rank is running on.
     auto host = hostname();
-    auto hosts = gcom::gather(host? *host: "unknown", 0);
-
-    // Only the "root" process returns meter information
-    if (gcom::id()==0) {
-        return {
-            {"checkpoints", manager.checkpoint_names()},
-            {"num_domains", gcom::size()},
-            {"global_model", std::to_string(gcom::kind())},
-            {"meters", meter_out},
-            {"hosts", hosts},
-        };
-    }
+    report.hosts = gcom::gather(host? *host: "unknown", 0);
+
+    report.checkpoints = manager.checkpoint_names();
+    report.num_domains = gcom::size();
+    report.communication_policy = gcom::kind();
 
-    return {};
+    return report;
 }
 
-void save_to_file(const meter_manager& manager, const std::string& name) {
-    auto measurements = to_json(manager);
-    if (!communication::global_policy::id()) {
-        std::ofstream fid;
-        fid.exceptions(std::ios_base::badbit | std::ios_base::failbit);
-        fid.open(name);
-        fid << std::setw(1) << measurements << "\n";
+nlohmann::json to_json(const meter_report& report) {
+    return {
+        {"checkpoints", report.checkpoints},
+        {"num_domains", report.num_domains},
+        {"global_model", std::to_string(report.communication_policy)},
+        {"meters", util::transform_view(report.meters, [](measurement const& m){return to_json(m);})},
+        {"hosts", report.hosts},
+    };
+}
+
+// Print an easy-to-read report of the meters to a stream.
+std::ostream& operator<<(std::ostream& o, const meter_report& report) {
+    o << "\n---- meters ------------------------------------------------------------\n";
+    o << strprintf("%21s", "");
+    for (auto const& m: report.meters) {
+        if (m.name=="time") {
+            o << strprintf("%16s", "time (s)");
+        }
+        else if (m.name.find("memory")!=std::string::npos) {
+            o << strprintf("%16s", m.name+" (MB)");
+        }
+    }
+    o << "\n------------------------------------------------------------------------\n";
+    int cp_index = 0;
+    for (auto name: report.checkpoints) {
+        name.resize(20);
+        o << strprintf("%-21s", name);
+        for (const auto& m: report.meters) {
+            if (m.name=="time") {
+                std::vector<double> times = m.measurements[cp_index];
+                o << strprintf("%16.4f", algorithms::mean(times));
+            }
+            else if (m.name.find("memory")!=std::string::npos) {
+                std::vector<double> mem = m.measurements[cp_index];
+                o << strprintf("%16.4f", algorithms::mean(mem)*1e-6);
+            }
+        }
+        o << "\n";
+        ++cp_index;
     }
+    return o;
 }
 
 } // namespace util
diff --git a/src/profiling/meter_manager.hpp b/src/profiling/meter_manager.hpp
index c3164b8106b5a24335f9b498d5023ca08012ddb7..d92789b8e9e938d81500ce112c58e79d04c338a3 100644
--- a/src/profiling/meter_manager.hpp
+++ b/src/profiling/meter_manager.hpp
@@ -3,6 +3,7 @@
 #include <memory>
 #include <vector>
 
+#include <communication/global_policy.hpp>
 #include <json/json.hpp>
 
 #include "meter.hpp"
@@ -29,10 +30,6 @@ struct measurement {
     measurement(std::string, std::string, const std::vector<double>&);
 };
 
-// Converts a measurement to a json type for serialization to file.
-// See src/profiling/meters.md for more information about the json formating.
-nlohmann::json to_json(const measurement& m);
-
 class meter_manager {
 private:
     bool started_ = false;
@@ -53,8 +50,18 @@ public:
     const std::vector<double>& times() const;
 };
 
-nlohmann::json to_json(const meter_manager&);
-void save_to_file(const meter_manager& manager, const std::string& name);
+// Simple type for gathering distributed meter information
+struct meter_report {
+    std::vector<std::string> checkpoints;
+    unsigned num_domains;
+    nest::mc::communication::global_policy_kind communication_policy;
+    std::vector<measurement> meters;
+    std::vector<std::string> hosts;
+};
+
+nlohmann::json to_json(const meter_report&);
+meter_report make_meter_report(const meter_manager& manager);
+std::ostream& operator<<(std::ostream& o, const meter_report& report);
 
 } // namespace util
 } // namespace mc
diff --git a/tests/ubench/CMakeLists.txt b/tests/ubench/CMakeLists.txt
index 8f0fbc13c4b59e072ef9226c65c4da2eff6f732d..94bd3979c223dbd880e0d89625450354943e9fdc 100644
--- a/tests/ubench/CMakeLists.txt
+++ b/tests/ubench/CMakeLists.txt
@@ -3,10 +3,13 @@ include(ExternalProject)
 # List of micro benchmarks to build.
 
 set(bench_sources
-    accumulate_functor_values.cpp)
+    accumulate_functor_values.cpp
+)
 
 set(bench_sources_cuda
-    cuda_compare_and_reduce.cu)
+    cuda_compare_and_reduce.cu
+    cuda_reduce_by_key.cu
+)
 
 # Set up google benchmark as an external project.
 
@@ -67,12 +70,13 @@ endforeach()
 
 
 if(NMC_WITH_CUDA)
+    cuda_include_directories("${gbench_install_dir}/include")
     foreach(bench_src ${bench_sources_cuda})
         string(REGEX REPLACE "\\.[^.]*$" "" bench_exe "${bench_src}")
         cuda_add_executable("${bench_exe}" EXCLUDE_FROM_ALL "${bench_src}")
         add_dependencies("${bench_exe}" gbench)
-        target_include_directories("${bench_exe}" PRIVATE "${gbench_install_dir}/include")
         target_link_libraries("${bench_exe}" "${gbench_install_dir}/lib/libbenchmark.a")
+        target_link_libraries("${bench_exe}" LINK_PUBLIC ${NESTMC_LIBRARIES})
 
         list(APPEND bench_exe_list ${bench_exe})
     endforeach()
diff --git a/tests/ubench/README.md b/tests/ubench/README.md
index 1d3fc53c146a4b501b4f71c756ead91ffe953b2c..80a7c538a18862ea65a5b30d86c84b01471a09d9 100644
--- a/tests/ubench/README.md
+++ b/tests/ubench/README.md
@@ -120,3 +120,61 @@ Platform:
 | 32768  | 102880 ns | 296836 ns | 265169 ns | 16758 ns |
 | 262144 | 661724 ns | 305209 ns | 269095 ns | 19792 ns |
 
+---
+
+### `cuda_reduce_by_key`
+
+#### Motivation
+
+The reduction by key pattern with repeated keys is used when "point process" mechanism contributions to currents are collected.
+Multiple point processes, typically synapses, can be attached to a single compartment,
+and when their contributions are computed and added to the per-compartment current in parallel, care must be taken to avoid race conditions.
+Early versions of NestMC used CUDA atomic operations to perform the accumulation, which works well up to a point.
+However, performance with atomics decreases as the number of synapses per compartment increases, i.e. as the number of threads performing simultaneous atomic updates on the same location increases.
+
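+In a serial implementation the operation is simply the following loop, as noted at the top of the benchmark source (`u`, `v` and `key` are illustrative names):
+
+```c++
+// key is sorted in ascending order and may contain repeated values
+for (std::size_t i=0; i<v.size(); ++i) {
+    u[key[i]] += v[i];
+}
+```
+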
+#### Implementations
+
+Two implementations are considered:
+
+1. Perform reductions inside each warp, taking advantage of the keys being sorted in ascending order. This is a multi-step process:
+    1. threads inside each warp determine which other threads they must perform a reduction with
+    2. threads perform a binary reduction tree operation using warp shuffle intrinsics (a sketch of this step is shown below)
+    3. one thread performs a CUDA atomic update for each key
+
+2. The naive (but simple) use of CUDA atomics.
+
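+Below is a minimal sketch of the warp shuffle reduction step, under the simplifying assumption that all 32 lanes in a warp share the same key; the helper name `warp_reduce_sum` is illustrative, and the full implementation in `src/backends/gpu/kernels/reduce_by_key.hpp` also handles runs that cover only part of a warp.
+
+```c++
+// Sum a float across the 32 lanes of a warp; the result lands in lane 0.
+// Uses the pre-Volta __shfl_down intrinsic, as elsewhere in this code base.
+// 64 bit values would need two 32 bit shuffles, as get_from_lane() does.
+__device__ float warp_reduce_sum(float value) {
+    for (unsigned shift=16; shift>0; shift>>=1) {
+        value += __shfl_down(value, shift);
+    }
+    return value;
+}
+```
+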
+#### Results
+
+Platform:
+* Xeon(R) CPU E5-2650 v4 (Broadwell 12 cores @ 2.20 GHz)
+* Tesla P100-PCIE-16GB
+* Linux 3.10.0
+* gcc version 5.2.0
+* nvcc version 8.0.61
+
+Results are presented as speedup for warp intrinsics vs atomics, for both single and double precision.
+Note that the P100 GPU has hardware support for double precision atomics, and we expect a much larger speedup for double precision on Kepler GPUs, which emulate double precision atomics with CAS.
+The benchmark updates `n` locations, each with an average density of `d` keys per location.
+This is equivalent to `n` compartments with `d` synapses per compartment.
+Atomics are faster when both `n` and `d` are small; however, the GPU backend targets throughput simulations, with large cell groups of at least 10k compartments in total.
+
+*float*
+
+| _n_    | d=1  | d=10 | d=100| d=1000|
+|--------|------|------|------|-------|
+| 100    | 0.75 | 0.80 | 1.66 | 10.7  |
+| 1k     | 0.76 | 0.87 | 3.15 | 12.5  |
+| 10k    | 0.87 | 1.14 | 3.52 | 14.0  |
+| 100k   | 0.92 | 1.34 | 3.58 | 15.5  |
+| 1000k  | 1.18 | 1.43 | 3.53 | 15.2  |
+
+*double*
+
+| _n_    | d=1  | d=10 | d=100| d=1000|
+|--------|------|------|------|-------|
+| 100    | 0.91 | 0.94 | 1.82 |  9.0  |
+| 1k     | 0.89 | 0.99 | 2.38 | 10.0  |
+| 10k    | 0.94 | 1.09 | 2.42 | 11.1  |
+| 100k   | 0.98 | 1.59 | 2.36 | 11.4  |
+| 1000k  | 1.13 | 1.63 | 2.36 | 11.4  |
diff --git a/tests/ubench/cuda_compare_and_reduce.cu b/tests/ubench/cuda_compare_and_reduce.cu
index 96ca8e856ec778a353f0276d601e4d840bc04ced..3cf5ce77831debfdc107536cac458fec12db319e 100644
--- a/tests/ubench/cuda_compare_and_reduce.cu
+++ b/tests/ubench/cuda_compare_and_reduce.cu
@@ -91,7 +91,7 @@ bool host_copy_compare(std::size_t n, int* aptr, int* bptr) {
 struct thrust_cmp_pred {
     __device__
     bool operator()(const thrust::tuple<int, int>& p) const {
-	return thrust::get<0>(p) < thrust::get<1>(p);
+        return thrust::get<0>(p) < thrust::get<1>(p);
     }
 };
 
@@ -129,9 +129,9 @@ bool custom_cuda_compare(std::size_t n, int* aptr, int* bptr) {
 template <typename T>
 struct device_store {
     device_store() {
-	void* p;
-	cudaMalloc(&p, sizeof(T));
-	ptr = (T*)p;
+    void* p;
+    cudaMalloc(&p, sizeof(T));
+    ptr = (T*)p;
     }
 
     ~device_store() { if (ptr) cudaFree(ptr); }
@@ -186,10 +186,8 @@ void bench_custom_cuda_compare_noalloc(benchmark::State& state) {
 void run_custom_arguments(benchmark::internal::Benchmark* b) {
     for (int n=1<<8; n<=1<<20; n*=2) {
         for (int oop: {0, 200, 4}) {
-
-// Uncomment to set fixed iteration count (for e.g. profiling):
-//	    b->Iterations(20);
-
+            // Uncomment to set fixed iteration count (for e.g. profiling):
+            //b->Iterations(20);
             b->Args({n, oop});
         }
     }
diff --git a/tests/ubench/cuda_reduce_by_key.cu b/tests/ubench/cuda_reduce_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2863b42aa3413d90c849c605f4713c837e7e4c63
--- /dev/null
+++ b/tests/ubench/cuda_reduce_by_key.cu
@@ -0,0 +1,129 @@
+// Compare implementations of reduce by key
+//   u[key[i]] += v[i]
+// where key is sorted in ascending order and may contain repeated keys
+
+// Explicitly undef NDEBUG for assert below.
+#undef NDEBUG
+
+#include <iostream>
+#include <vector>
+#include <random>
+
+#include <benchmark/benchmark.h>
+
+#include <memory/memory.hpp>
+#include <backends/gpu/kernels/reduce_by_key.hpp>
+#include <backends/gpu/intrinsics.hpp>
+#include <util/rangeutil.hpp>
+
+using namespace nest::mc;
+
+// Run benchmarks
+//  * with between 100 and 1 million entries to update
+//  * with between 1 and 1000 threads updating each entry
+// This corresponds to a range from a single 100-compartment cell with one
+// synapse per compartment, to 100k cells with 10 compartments and 10k
+// synapses each.
+
+void run_custom_arguments(benchmark::internal::Benchmark* b) {
+    for (auto n_comp: {100, 1000, 10000, 100000, 1000000}) {
+        for (auto syn_per_comp: {1, 10, 100, 1000}) {
+            b->Args({n_comp, syn_per_comp});
+        }
+    }
+}
+
+template <typename T, typename I>
+__global__
+void reduce_by_shuffle(const T* src, T* dst, const I* index, int n) {
+    unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
+
+    if (tid<n) {
+        gpu::reduce_by_key(src[tid], dst, index[tid]);
+    }
+}
+
+template <typename T, typename I>
+__global__
+void reduce_by_atomic(const T* src, T* dst, const I* index, int n) {
+    unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
+
+    if (tid<n) {
+        cuda_atomic_add(dst + index[tid], src[tid]);
+    }
+}
+
+template <typename T, typename Impl>
+void bench_runner(benchmark::State& state, const Impl& impl, T) {
+    int n_cmp = state.range(0);
+    int n = n_cmp*state.range(1);
+
+    // find a uniform random assignment of indices to compartments
+    std::uniform_int_distribution<int> U(0, n_cmp-1);
+    std::minstd_rand R;
+
+    std::vector<unsigned> hist(n_cmp);
+    for (int i=0; i<n; ++i) {
+        ++hist[U(R)];
+    }
+
+    std::vector<int> index;
+    index.reserve(n);
+    for (int i=0; i<n_cmp; ++i) {
+        index.insert(index.end(), hist[i], i);
+    }
+
+    using array = memory::device_vector<T>;
+    using iarray = memory::device_vector<int>;
+
+    // copy inputs to the device
+    array out(n_cmp, T(0));
+    array in(n, T(1));
+    iarray idx = memory::make_const_view(index);
+
+    // set up CUDA events for custom timing of kernel execution
+    cudaEvent_t start_e;
+    cudaEvent_t stop_e;
+    cudaEventCreate(&start_e);
+    cudaEventCreate(&stop_e);
+    while (state.KeepRunning()) {
+        int block_dim = 128;
+        int grid_dim = (n-1)/block_dim + 1;
+
+        cudaEventRecord(start_e, 0);
+        // call the kernel
+        impl<<<grid_dim, block_dim>>>(in.data(), out.data(), idx.data(), n);
+        cudaEventRecord(stop_e, 0);
+
+        // wait for kernel call to finish before querying the time taken
+        cudaEventSynchronize(stop_e);
+        float time_taken_ms = 0.0f;
+        cudaEventElapsedTime(&time_taken_ms, start_e, stop_e);
+
+        // pass custom time to the benchmark framework; cudaEventElapsedTime
+        // reports milliseconds, while SetIterationTime expects seconds
+        state.SetIterationTime(1e-3*time_taken_ms);
+    }
+    cudaEventDestroy(start_e);
+    cudaEventDestroy(stop_e);
+}
+
+// runners for single precision tests
+void shuffle_32(benchmark::State& state) {
+    bench_runner(state, reduce_by_shuffle<float, int>, float());
+}
+void atomic_32(benchmark::State& state) {
+    bench_runner(state, reduce_by_atomic<float, int>, float());
+}
+BENCHMARK(shuffle_32)->Apply(run_custom_arguments)->UseManualTime();
+BENCHMARK(atomic_32)->Apply(run_custom_arguments)->UseManualTime();
+
+// runners for double precision tests
+void shuffle_64(benchmark::State& state) {
+    bench_runner(state, reduce_by_shuffle<double, int>, double());
+}
+void atomic_64(benchmark::State& state) {
+    bench_runner(state, reduce_by_atomic<double, int>, double());
+}
+BENCHMARK(shuffle_64)->Apply(run_custom_arguments)->UseManualTime();
+BENCHMARK(atomic_64)->Apply(run_custom_arguments)->UseManualTime();
+
+BENCHMARK_MAIN();
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 789969bd47d4dbb968d5c0a771fcfd7587f23b6a..564f5964c57c1a948b754a03bd24f9a4124634cf 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -22,6 +22,7 @@ set(TEST_CUDA_SOURCES
     test_gpu_stack.cu
     test_matrix.cu
     test_multi_event_stream.cu
+    test_reduce_by_key.cu
     test_spikes.cu
     test_vector.cu
 
diff --git a/tests/unit/test_reduce_by_key.cu b/tests/unit/test_reduce_by_key.cu
new file mode 100644
index 0000000000000000000000000000000000000000..29f359def8521fca9f3469df7a3b2156cb2c711c
--- /dev/null
+++ b/tests/unit/test_reduce_by_key.cu
@@ -0,0 +1,105 @@
+#include "../gtest.h"
+
+#include <vector>
+
+#include <backends/gpu/kernels/reduce_by_key.hpp>
+#include <memory/memory.hpp>
+#include <util/span.hpp>
+#include <util/rangeutil.hpp>
+
+using namespace nest::mc;
+
+template <typename T, typename I>
+__global__
+void reduce_kernel(const T* src, T* dst, const I* index, int n) {
+    unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
+
+    if (tid<n) {
+        gpu::reduce_by_key(src[tid], dst, index[tid]);
+    }
+}
+
+template <typename T>
+std::vector<T> reduce(const std::vector<T>& in, size_t n_out, const std::vector<int>& index, unsigned block_dim=128) {
+    EXPECT_EQ(in.size(), index.size());
+    EXPECT_TRUE(std::is_sorted(index.begin(), index.end()));
+
+    using array = memory::device_vector<T>;
+    using iarray = memory::device_vector<int>;
+
+    int n = in.size();
+
+    array  src = memory::make_const_view(in);
+    iarray idx = memory::make_const_view(index);
+    array  dst(n_out, 0);
+
+    unsigned grid_dim = (n-1)/block_dim + 1;
+    reduce_kernel<<<grid_dim, block_dim>>>(src.data(), dst.data(), idx.data(), n);
+
+    std::vector<T> out(n_out);
+    memory::copy(dst, memory::make_view(out));
+
+    return out;
+}
+
+TEST(reduce_by_key, no_repetitions)
+{
+    int n = 64;
+    {
+        std::vector<float> in(n, 1);
+        std::vector<int> index = util::assign_from(util::make_span(0, n));
+
+        auto out = reduce(in, n, index);
+        for (auto o: out) EXPECT_EQ(o, 1.0f);
+    }
+    {
+        std::vector<double> in(n, 1);
+        std::vector<int> index = util::assign_from(util::make_span(0, n));
+
+        auto out = reduce(in, n, index);
+        for (auto o: out) EXPECT_EQ(o, 1.0);
+    }
+}
+
+TEST(reduce_by_key, single_repeated_index)
+{
+    // Perform reduction of a sequence of 1s of length n
+    // The expected result is n
+    for (auto n: {1, 2, 7, 31, 32, 33, 63, 64, 65, 128}) {
+        std::vector<double> in(n, 1);
+        std::vector<int> index(n, 0);
+
+        auto out = reduce(in, 1, index);
+        EXPECT_EQ(out[0], double(n));
+    }
+    // Perform reduction of an ascending sequence of {1,2,3,...,n}
+    // The expected result is n*(n+1)/2
+    for (auto n: {1, 2, 7, 31, 32, 33, 63, 64, 65, 128}) {
+        std::vector<double> in = util::assign_from(util::make_span(1, n+1));
+        std::vector<int> index(n, 0);
+
+        auto out = reduce(in, 1, index);
+        EXPECT_EQ(out[0], double((n+1)*n/2));
+    }
+}
+
+TEST(reduce_by_key, scatter)
+{
+    std::vector<int> index = {0,0,0,1,2,2,3,7,7,7,11};
+    unsigned n = util::max_value(index)+1;
+    std::vector<double> in(index.size(), 1);
+    std::vector<double> expected = {3., 1., 2., 1., 0., 0., 0., 3., 0., 0., 0., 1.};
+
+    EXPECT_EQ(n, expected.size());
+
+    auto out = reduce(in, n, index);
+    EXPECT_EQ(out, expected);
+
+    // rerun with 7 threads per thread block, to test
+    //  * using more than one thread block
+    //  * thread blocks that are not a multiple of 32
+    //  * thread blocks that are less than 32
+    out = reduce(in, n, index, 7);
+
+    EXPECT_EQ(out, expected);
+}