diff --git a/tests/ubench/CMakeLists.txt b/tests/ubench/CMakeLists.txt
index fca1b3898f238d0a437c635dba2d8b014489dc8f..8f0fbc13c4b59e072ef9226c65c4da2eff6f732d 100644
--- a/tests/ubench/CMakeLists.txt
+++ b/tests/ubench/CMakeLists.txt
@@ -5,6 +5,9 @@ include(ExternalProject)
 set(bench_sources
     accumulate_functor_values.cpp)
 
+set(bench_sources_cuda
+    cuda_compare_and_reduce.cu)
+
 # Set up google benchmark as an external project.
 
 set(gbench_src_dir "${CMAKE_CURRENT_SOURCE_DIR}/google-benchmark")
@@ -62,5 +65,18 @@ foreach(bench_src ${bench_sources})
     list(APPEND bench_exe_list ${bench_exe})
 endforeach()
 
+
+if(NMC_WITH_CUDA)
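+    # Build each CUDA benchmark with nvcc via cuda_add_executable; the
+    # executable name is the source file name with its extension stripped.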
+    foreach(bench_src ${bench_sources_cuda})
+        string(REGEX REPLACE "\\.[^.]*$" "" bench_exe "${bench_src}")
+        cuda_add_executable("${bench_exe}" EXCLUDE_FROM_ALL "${bench_src}")
+        add_dependencies("${bench_exe}" gbench)
+        target_include_directories("${bench_exe}" PRIVATE "${gbench_install_dir}/include")
+        target_link_libraries("${bench_exe}" "${gbench_install_dir}/lib/libbenchmark.a")
+
+        list(APPEND bench_exe_list ${bench_exe})
+    endforeach()
+endif()
+
 add_custom_target(ubenches DEPENDS ${bench_exe_list})
 
diff --git a/tests/ubench/README.md b/tests/ubench/README.md
index 06e6d5afbb6a819414f70b4bc96f84898125a4c2..1d3fc53c146a4b501b4f71c756ead91ffe953b2c 100644
--- a/tests/ubench/README.md
+++ b/tests/ubench/README.md
@@ -63,3 +63,60 @@ Platform:
 | g++ -O3     |  907 ns | 2090 ns |  907 ns | 614 ns |
 | clang++ -O3 | 1063 ns |  533 ns | 1051 ns | 532 ns |
 
+---
+
+### `cuda_compare_and_reduce`
+
+#### Motivation
+
+One possible mechanism for determining whether device-side event delivery has
+exhausted all events is to check whether the start and end of each cell's
+event-span are equal. This is equivalent to the test:
+
+> ∃i: a[i] < b[i]?
+
+for device vectors _a_ and _b_.
+
+How expensive is it simply to copy the vectors over to the host and compare them there?
+Benchmarking indicated that for vectors of up to approximately 10^5 elements, copying to
+the host was significantly faster than running the reduction on the device, despite the
+limited host–device bandwidth.
+
+This counter-intuitive result appears to stem from the high overhead of `cudaMalloc`;
+pre-allocating device memory for the reduction result restores the expected
+performance behaviour.
+
+#### Implementations
+
+Four implementations are considered:
+
+1. Copy both vectors to the host and run a short-circuiting comparison there.
+
+2. Use `thrust::zip_iterator` and `thrust::any_of` to run the reduction on the device.
+
+3. Use a short custom CUDA kernel to run the reduction, using `__syncthreads_or` and
+   a non-atomic write to the result.
+
+4. Use the same CUDA kernel as above, but pre-allocate the device memory that stores
+   the result.
+
+Note: a fairer host-based comparison would interleave comparison with data transfer.
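+
+A minimal sketch of such an interleaved variant (not one of the benchmarked
+implementations; the chunk size, buffer handling, and helper names here are
+illustrative only) might double-buffer pinned host memory and compare each
+chunk while the next is still in flight:
+
+```cpp
+#include <algorithm>
+#include <cstddef>
+#include <cuda_runtime.h>
+
+bool chunked_compare(std::size_t n, const int* aptr, const int* bptr) {
+    constexpr std::size_t chunk = 1<<14;
+
+    // Pinned double-buffers, so device-to-host copies can overlap host work.
+    int *a_buf[2], *b_buf[2];
+    for (int s: {0, 1}) {
+        cudaMallocHost((void**)&a_buf[s], chunk*sizeof(int));
+        cudaMallocHost((void**)&b_buf[s], chunk*sizeof(int));
+    }
+
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    // Start an asynchronous copy of the chunk at `offset` into buffer `p`.
+    auto fetch = [&](std::size_t offset, int p) {
+        std::size_t m = std::min(chunk, n-offset);
+        cudaMemcpyAsync(a_buf[p], aptr+offset, m*sizeof(int), cudaMemcpyDeviceToHost, stream);
+        cudaMemcpyAsync(b_buf[p], bptr+offset, m*sizeof(int), cudaMemcpyDeviceToHost, stream);
+    };
+
+    bool found = false;
+    if (n) fetch(0, 0);
+    for (std::size_t offset = 0; offset<n && !found; offset += chunk) {
+        int p = (offset/chunk)%2;
+        cudaStreamSynchronize(stream);                 // chunk at `offset` has arrived
+        if (offset+chunk<n) fetch(offset+chunk, 1-p);  // overlap the next transfer
+
+        std::size_t m = std::min(chunk, n-offset);
+        for (std::size_t i = 0; i<m && !found; ++i) {
+            found = a_buf[p][i]<b_buf[p][i];
+        }
+    }
+
+    cudaStreamSynchronize(stream);
+    cudaStreamDestroy(stream);
+    for (int s: {0, 1}) {
+        cudaFreeHost(a_buf[s]);
+        cudaFreeHost(b_buf[s]);
+    }
+    return found;
+}
+```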
+
+#### Results
+
+Results here are presented for vector sizes _n_ of 256, 512, 32768, and 262144,
+with the two vectors equal, i.e. with no index _i_ for which a[i] < b[i].
+
+Platform:
+* Xeon(R) CPU E5-2650 v4 with base clock 2.20 GHz and max clock 2.9 GHz.
+* Tesla P100-PCIE-16GB
+* Linux 3.10.0
+* gcc version 5.3.0
+* nvcc version 8.0.61
+
+| _n_ | host copy | thrust | custom CUDA | custom CUDA noalloc |
+|:----|----------:|-------:|------------:|--------------------:|
+| 256    |  18265 ns |  41221 ns |  23255 ns | 16440 ns |
+| 512    |  18466 ns | 286331 ns | 265113 ns | 16335 ns |
+| 32768  | 102880 ns | 296836 ns | 265169 ns | 16758 ns |
+| 262144 | 661724 ns | 305209 ns | 269095 ns | 19792 ns |
+
diff --git a/tests/ubench/cuda_compare_and_reduce.cu b/tests/ubench/cuda_compare_and_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..96ca8e856ec778a353f0276d601e4d840bc04ced
--- /dev/null
+++ b/tests/ubench/cuda_compare_and_reduce.cu
@@ -0,0 +1,203 @@
+/* Compare implementations of the test:
+ *    ∃i: a[i]<b[i]?
+ * for device-side arrays a and b.
+ *
+ * Four implementations are compared:
+ * 1. Copy both vectors to host, compare there.
+ * 2. Use the thrust library.
+ * 3. Use a small custom CUDA kernel.
+ * 4. The same custom CUDA kernel, with a once-off allocation of the result value.
+ */
+
+// Explicitly undef NDEBUG for assert below.
+#undef NDEBUG
+
+#include <algorithm>
+#include <cassert>
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+#include <cuda_profiler_api.h>
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/logical.h>
+
+template <typename Impl>
+void bench_generic(benchmark::State& state, const Impl& impl) {
+    std::size_t n = state.range(0);
+
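+    // The second argument gives the mismatch rate: `a` and `b` differ in
+    // roughly one out of `oop` places (0 means the vectors are equal).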
+    int oop = state.range(1);
+    double p_inclusion = oop? 1.0/oop: 0;
+
+    // Make device vectors `a` and `b` of size n, with probability
+    // `p_inclusion` that `a[i]` < `b[i]` for any given `i`.
+
+    std::bernoulli_distribution B(p_inclusion);
+    std::uniform_int_distribution<int> U(0, 99);
+    std::minstd_rand R;
+
+    std::vector<int> xs(n);
+    std::generate(xs.begin(), xs.end(), [&]() { return U(R); });
+
+    thrust::device_vector<int> a(n);
+    thrust::copy(xs.begin(), xs.end(), a.begin());
+
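+    // Bump selected elements so that b[i] > a[i] exactly where the Bernoulli
+    // draw succeeds, recording whether any element will differ.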
+    bool differ = false;
+    thrust::device_vector<int> b(n);
+    for (std::size_t i = 0; i<n; ++i) {
+        if (B(R)) {
+            xs[i] += 3;
+            differ = true;
+        }
+    }
+    thrust::copy(xs.begin(), xs.end(), b.begin());
+    cudaDeviceSynchronize();
+
+    // Run benchmark.
+
+    bool result = false;
+    int* aptr = thrust::raw_pointer_cast(a.data());
+    int* bptr = thrust::raw_pointer_cast(b.data());
+
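+    // Bracket the timed loop so that profiling tools capture only these iterations.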
+    cudaProfilerStart();
+    while (state.KeepRunning()) {
+        result = impl(n, aptr, bptr);
+        benchmark::DoNotOptimize(result);
+    }
+    cudaProfilerStop();
+
+    // validate result
+    assert(result==differ);
+}
+
+bool host_copy_compare(std::size_t n, int* aptr, int* bptr) {
+    std::vector<int> a(n);
+    std::vector<int> b(n);
+
+    cudaMemcpy(a.data(), aptr, n*sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(b.data(), bptr, n*sizeof(int), cudaMemcpyDeviceToHost);
+
+    for (std::size_t i = 0; i<n; ++i) {
+        if (a[i]<b[i]) return true;
+    }
+    return false;
+}
+
+struct thrust_cmp_pred {
+    __device__
+    bool operator()(const thrust::tuple<int, int>& p) const {
+        return thrust::get<0>(p) < thrust::get<1>(p);
+    }
+};
+
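+// Zip `a` and `b` and evaluate the comparison predicate on the device with
+// a single any_of reduction.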
+bool thrust_compare(std::size_t n, int* aptr, int* bptr) {
+    thrust::device_ptr<int> a(aptr), b(bptr);
+
+    auto zip_begin = thrust::make_zip_iterator(thrust::make_tuple(a, b));
+    auto zip_end = thrust::make_zip_iterator(thrust::make_tuple(a+n, b+n));
+
+    return thrust::any_of(zip_begin, zip_end, thrust_cmp_pred{});
+}
+
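+// Each block computes a block-wide OR of the per-thread comparisons with
+// __syncthreads_or; every thread of a block that saw a[i] < b[i] then writes 1
+// to *rptr. The concurrent writes all store the same value, so the race is benign.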
+__global__ void cuda_cmp_kernel(std::size_t n, int* aptr, int* bptr, int* rptr) {
+    int i = threadIdx.x+blockIdx.x*blockDim.x;
+    int cmp = i<n? aptr[i]<bptr[i]: 0;
+    if (__syncthreads_or(cmp)) *rptr=1;
+}
+
+bool custom_cuda_compare(std::size_t n, int* aptr, int* bptr) {
+    int result;
+
+    constexpr int blockwidth = 128;
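+    // Launch ceil(n/blockwidth) blocks (zero blocks for empty input).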
+    int nblock = n? 1+(n-1)/blockwidth: 0;
+
+    void* rptr;
+    cudaMalloc(&rptr, sizeof(int));
+    cudaMemset(rptr, 0, sizeof(int));
+    cuda_cmp_kernel<<<nblock, blockwidth>>>(n, aptr, bptr, (int *)rptr);
+    cudaMemcpy(&result, rptr, sizeof(int), cudaMemcpyDeviceToHost);
+    cudaFree(rptr);
+
+    return (bool)result;
+}
+
+template <typename T>
+struct device_store {
+    device_store() {
+        void* p;
+        cudaMalloc(&p, sizeof(T));
+        ptr = (T*)p;
+    }
+
+    ~device_store() { if (ptr) cudaFree(ptr); }
+
+    device_store(const device_store&) = delete;
+    device_store(device_store&&) = delete;
+
+    T* get() { return ptr; }
+
+private:
+    T* ptr = nullptr;
+};
+
+bool custom_cuda_compare_noalloc(std::size_t n, int* aptr, int* bptr) {
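+    // Allocate the device-side result once per thread and reuse it across
+    // calls; the cudaFree happens in the destructor at thread exit.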
+    static thread_local device_store<int> state;
+    int result;
+
+    constexpr int blockwidth = 128;
+    int nblock = n? 1+(n-1)/blockwidth: 0;
+
+    cudaMemset(state.get(), 0, sizeof(int));
+    cuda_cmp_kernel<<<nblock, blockwidth>>>(n, aptr, bptr, state.get());
+    cudaMemcpy(&result, state.get(), sizeof(int), cudaMemcpyDeviceToHost);
+
+    return (bool)result;
+}
+
+void bench_host_copy_compare(benchmark::State& state) {
+    bench_generic(state, host_copy_compare);
+}
+
+void bench_thrust_compare(benchmark::State& state) {
+    bench_generic(state, thrust_compare);
+}
+
+void bench_custom_cuda_compare(benchmark::State& state) {
+    bench_generic(state, custom_cuda_compare);
+}
+
+void bench_custom_cuda_compare_noalloc(benchmark::State& state) {
+    bench_generic(state, custom_cuda_compare_noalloc);
+}
+
+// Run benches over 256 to circa 1e6 elements, with three cases:
+//
+// Arg1       Values
+// ---------------------------------------------------
+//    0       `a` and `b` equal
+//  200       `a` and `b` differ circa 1 in 200 places.
+//    4       `a` and `b` differ circa 1 in 4 places.
+
+void run_custom_arguments(benchmark::internal::Benchmark* b) {
+    for (int n=1<<8; n<=1<<20; n*=2) {
+        for (int oop: {0, 200, 4}) {
+
+// Uncomment to fix the iteration count (e.g. for profiling):
+//          b->Iterations(20);
+
+            b->Args({n, oop});
+        }
+    }
+}
+
+BENCHMARK(bench_host_copy_compare)->Apply(run_custom_arguments);
+BENCHMARK(bench_thrust_compare)->Apply(run_custom_arguments);
+BENCHMARK(bench_custom_cuda_compare)->Apply(run_custom_arguments);
+BENCHMARK(bench_custom_cuda_compare_noalloc)->Apply(run_custom_arguments);
+
+BENCHMARK_MAIN();
diff --git a/tests/ubench/google-benchmark b/tests/ubench/google-benchmark
index 9a5072d1bf9187b32ce9a88842dffa31ef416442..cb8a0cc10f8b634fd554251ae086da522b58f50e 160000
--- a/tests/ubench/google-benchmark
+++ b/tests/ubench/google-benchmark
@@ -1 +1 @@
-Subproject commit 9a5072d1bf9187b32ce9a88842dffa31ef416442
+Subproject commit cb8a0cc10f8b634fd554251ae086da522b58f50e