diff --git a/tests/ubench/CMakeLists.txt b/tests/ubench/CMakeLists.txt
index fca1b3898f238d0a437c635dba2d8b014489dc8f..8f0fbc13c4b59e072ef9226c65c4da2eff6f732d 100644
--- a/tests/ubench/CMakeLists.txt
+++ b/tests/ubench/CMakeLists.txt
@@ -5,6 +5,9 @@ include(ExternalProject)
 set(bench_sources
     accumulate_functor_values.cpp)
 
+set(bench_sources_cuda
+    cuda_compare_and_reduce.cu)
+
 # Set up google benchmark as an external project.
 
 set(gbench_src_dir "${CMAKE_CURRENT_SOURCE_DIR}/google-benchmark")
@@ -62,5 +65,18 @@ foreach(bench_src ${bench_sources})
     list(APPEND bench_exe_list ${bench_exe})
 endforeach()
 
+if(NMC_WITH_CUDA)
+    foreach(bench_src ${bench_sources_cuda})
+        string(REGEX REPLACE "\\.[^.]*$" "" bench_exe "${bench_src}")
+        cuda_add_executable("${bench_exe}" EXCLUDE_FROM_ALL "${bench_src}")
+        add_dependencies("${bench_exe}" gbench)
+        target_include_directories("${bench_exe}" PRIVATE "${gbench_install_dir}/include")
+        target_link_libraries("${bench_exe}" "${gbench_install_dir}/lib/libbenchmark.a")
+
+        list(APPEND bench_exe_list ${bench_exe})
+    endforeach()
+endif()
+
 add_custom_target(ubenches DEPENDS ${bench_exe_list})
diff --git a/tests/ubench/README.md b/tests/ubench/README.md
index 06e6d5afbb6a819414f70b4bc96f84898125a4c2..1d3fc53c146a4b501b4f71c756ead91ffe953b2c 100644
--- a/tests/ubench/README.md
+++ b/tests/ubench/README.md
@@ -63,3 +63,60 @@
 | g++ -O3     |  907 ns | 2090 ns |  907 ns | 614 ns |
 | clang++ -O3 | 1063 ns |  533 ns | 1051 ns | 532 ns |
 
+---
+
+### `cuda_compare_and_reduce`
+
+#### Motivation
+
+One possible mechanism for determining whether device-side event delivery has exhausted
+all events is to check whether the start and end of each cell's event-span are equal.
+This is equivalent to the test:
+
+> ∃i: a[i] < b[i]?
+
+for device vectors _a_ and _b_.
+
+How expensive is it simply to copy the vectors over to the host and compare there?
+Benchmarking indicated that for vectors up to approximately 10^5 elements, copying to the
+host was significantly faster than reducing on the device, despite the limited host–device bandwidth.
+
+This counter-intuitive result appears to stem from the high overhead of `cudaMalloc`;
+pre-allocating space on the device for the reduction result restores the expected
+performance behaviour.
+
+#### Implementations
+
+Four implementations are considered:
+
+1. Copy both vectors to host, run short-circuit compare.
+
+2. Use `thrust::zip_iterator` and `thrust::any_of` to run the reduction on the device.
+
+3. Use a short custom CUDA kernel to run the reduction, using `__syncthreads_or` and
+   a non-atomic write to the result.
+
+4. Use the same CUDA kernel as above, but pre-allocate the device memory to store the
+   result.
+
+Note: a fairer host-based comparison would interleave comparison with data transfer.
+
+#### Results
+
+Results here are presented for vector size _n_ equal to 256, 512, 32768 and 262144,
+with the two vectors equal, so the comparison never exits early.
+
+Platform:
+* Xeon(R) CPU E5-2650 v4 with base clock 2.20 GHz and max clock 2.9 GHz.
+* Tesla P100-PCIE-16GB
+* Linux 3.10.0
+* gcc version 5.3.0
+* nvcc version 8.0.61
+
+| _n_    | host copy | thrust    | custom CUDA | custom CUDA noalloc |
+|:-------|----------:|----------:|------------:|--------------------:|
+| 256    | 18265 ns  | 41221 ns  | 23255 ns    | 16440 ns |
+| 512    | 18466 ns  | 286331 ns | 265113 ns   | 16335 ns |
+| 32768  | 102880 ns | 296836 ns | 265169 ns   | 16758 ns |
+| 262144 | 661724 ns | 305209 ns | 269095 ns   | 19792 ns |
+
diff --git a/tests/ubench/cuda_compare_and_reduce.cu b/tests/ubench/cuda_compare_and_reduce.cu
new file mode 100644
index 0000000000000000000000000000000000000000..96ca8e856ec778a353f0276d601e4d840bc04ced
--- /dev/null
+++ b/tests/ubench/cuda_compare_and_reduce.cu
@@ -0,0 +1,206 @@
+/* Compare implementations of the test:
+ *     ∃i: a[i]<b[i]?
+ * for device-side arrays a and b.
+ *
+ * Four implementations are compared:
+ *   1. Copy both vectors to host, compare there.
+ *   2. Use the Thrust library.
+ *   3. Use a small custom CUDA kernel.
+ *   4. Same custom CUDA kernel with once-off allocation of the return value.
+ */
+
+// Explicitly undef NDEBUG for the assert below.
+#undef NDEBUG
+
+#include <algorithm>
+#include <cassert>
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+#include <cuda_profiler_api.h>
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/logical.h>
+
+template <typename Impl>
+void bench_generic(benchmark::State& state, const Impl& impl) {
+    std::size_t n = state.range(0);
+
+    int oop = state.range(1);
+    double p_inclusion = oop? 1.0/oop: 0;
+
+    // Make device vectors `a` and `b` of size n, with probability
+    // `p_inclusion` that `a[i]` < `b[i]` at any given index `i`.
+
+    std::bernoulli_distribution B(p_inclusion);
+    std::uniform_int_distribution<int> U(0, 99);
+    std::minstd_rand R;
+
+    std::vector<int> xs(n);
+    std::generate(xs.begin(), xs.end(), [&]() { return U(R); });
+
+    thrust::device_vector<int> a(n);
+    thrust::copy(xs.begin(), xs.end(), a.begin());
+
+    bool differ = false;
+    thrust::device_vector<int> b(n);
+    for (std::size_t i = 0; i<n; ++i) {
+        if (B(R)) {
+            xs[i] += 3;
+            differ = true;
+        }
+    }
+    thrust::copy(xs.begin(), xs.end(), b.begin());
+    cudaDeviceSynchronize();
+
+    // Run benchmark.
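+    // Each call to `impl` below includes any per-call device allocation
+    // and host–device transfer costs; those overheads are part of what
+    // is being compared (see README.md).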
+
+    bool result = false;
+    int* aptr = thrust::raw_pointer_cast(a.data());
+    int* bptr = thrust::raw_pointer_cast(b.data());
+
+    cudaProfilerStart();
+    while (state.KeepRunning()) {
+        result = impl(n, aptr, bptr);
+        benchmark::DoNotOptimize(result);
+    }
+    cudaProfilerStop();
+
+    // Validate result against the expected answer computed above.
+    assert(result==differ);
+}
+
+bool host_copy_compare(std::size_t n, int* aptr, int* bptr) {
+    std::vector<int> a(n);
+    std::vector<int> b(n);
+
+    cudaMemcpy(a.data(), aptr, n*sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(b.data(), bptr, n*sizeof(int), cudaMemcpyDeviceToHost);
+
+    for (std::size_t i = 0; i<n; ++i) {
+        if (a[i]<b[i]) return true;
+    }
+    return false;
+}
+
+struct thrust_cmp_pred {
+    __device__
+    bool operator()(const thrust::tuple<int, int>& p) const {
+        return thrust::get<0>(p) < thrust::get<1>(p);
+    }
+};
+
+bool thrust_compare(std::size_t n, int* aptr, int* bptr) {
+    thrust::device_ptr<int> a(aptr), b(bptr);
+
+    auto zip_begin = thrust::make_zip_iterator(thrust::make_tuple(a, b));
+    auto zip_end = thrust::make_zip_iterator(thrust::make_tuple(a+n, b+n));
+
+    return thrust::any_of(zip_begin, zip_end, thrust_cmp_pred{});
+}
+
+__global__ void cuda_cmp_kernel(std::size_t n, int* aptr, int* bptr, int* rptr) {
+    int i = threadIdx.x+blockIdx.x*blockDim.x;
+    int cmp = i<n? aptr[i]<bptr[i]: 0;
+    if (__syncthreads_or(cmp)) *rptr=1;
+}
+
+bool custom_cuda_compare(std::size_t n, int* aptr, int* bptr) {
+    int result;
+
+    constexpr int blockwidth = 128;
+    int nblock = n? 1+(n-1)/blockwidth: 0;
+
+    void* rptr;
+    cudaMalloc(&rptr, sizeof(int));
+    cudaMemset(rptr, 0, sizeof(int));
+    cuda_cmp_kernel<<<nblock, blockwidth>>>(n, aptr, bptr, (int *)rptr);
+    cudaMemcpy(&result, rptr, sizeof(int), cudaMemcpyDeviceToHost);
+    cudaFree(rptr);
+
+    return (bool)result;
+}
+
+template <typename T>
+struct device_store {
+    device_store() {
+        void* p;
+        cudaMalloc(&p, sizeof(T));
+        ptr = (T*)p;
+    }
+
+    ~device_store() { if (ptr) cudaFree(ptr); }
+
+    device_store(const device_store&) = delete;
+    device_store(device_store&&) = delete;
+
+    T* get() { return ptr; }
+
+private:
+    T* ptr = nullptr;
+};
+
+bool custom_cuda_compare_noalloc(std::size_t n, int* aptr, int* bptr) {
+    static thread_local device_store<int> state;
+    int result;
+
+    constexpr int blockwidth = 128;
+    int nblock = n? 1+(n-1)/blockwidth: 0;
+
+    cudaMemset(state.get(), 0, sizeof(int));
+    cuda_cmp_kernel<<<nblock, blockwidth>>>(n, aptr, bptr, state.get());
+    cudaMemcpy(&result, state.get(), sizeof(int), cudaMemcpyDeviceToHost);
+
+    return (bool)result;
+}
+
+void bench_host_copy_compare(benchmark::State& state) {
+    bench_generic(state, host_copy_compare);
+}
+
+void bench_thrust_compare(benchmark::State& state) {
+    bench_generic(state, thrust_compare);
+}
+
+void bench_custom_cuda_compare(benchmark::State& state) {
+    bench_generic(state, custom_cuda_compare);
+}
+
+void bench_custom_cuda_compare_noalloc(benchmark::State& state) {
+    bench_generic(state, custom_cuda_compare_noalloc);
+}
+
+// Run benches over 256 to circa 1e6 elements, with three cases:
+//
+//     Arg1    Values
+//     ---------------------------------------------------
+//     0       `a` and `b` equal
+//     200     `a` and `b` differ circa 1 in 200 places.
+//     4       `a` and `b` differ circa 1 in 4 places.
+
+void run_custom_arguments(benchmark::internal::Benchmark* b) {
+    for (int n=1<<8; n<=1<<20; n*=2) {
+        for (int oop: {0, 200, 4}) {
+
+// Uncomment to set a fixed iteration count (e.g. for 
profiling):
+// b->Iterations(20);
+
+            b->Args({n, oop});
+        }
+    }
+}
+
+BENCHMARK(bench_host_copy_compare)->Apply(run_custom_arguments);
+BENCHMARK(bench_thrust_compare)->Apply(run_custom_arguments);
+BENCHMARK(bench_custom_cuda_compare)->Apply(run_custom_arguments);
+BENCHMARK(bench_custom_cuda_compare_noalloc)->Apply(run_custom_arguments);
+
+BENCHMARK_MAIN();
diff --git a/tests/ubench/google-benchmark b/tests/ubench/google-benchmark
index 9a5072d1bf9187b32ce9a88842dffa31ef416442..cb8a0cc10f8b634fd554251ae086da522b58f50e 160000
--- a/tests/ubench/google-benchmark
+++ b/tests/ubench/google-benchmark
@@ -1 +1 @@
-Subproject commit 9a5072d1bf9187b32ce9a88842dffa31ef416442
+Subproject commit cb8a0cc10f8b634fd554251ae086da522b58f50e
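A closing note on the `cudaMalloc` overhead discussed in the README section above: the effect is straightforward to reproduce outside the benchmark harness. The standalone sketch below is hypothetical and not part of this patch (the file name `malloc_overhead.cu` and the iteration count are arbitrary choices); it uses only standard CUDA runtime calls to time a per-iteration `cudaMalloc`/`cudaFree` of a 4-byte result slot against reuse of a single pre-allocated slot, mirroring the difference between `custom_cuda_compare` and `custom_cuda_compare_noalloc`.

```cpp
// malloc_overhead.cu: sketch only, not part of the patch.
// Compares per-iteration alloc/init/free of a 4-byte device slot
// against re-initialising one pre-allocated slot.
#include <chrono>
#include <cstdio>
#include <cuda_runtime.h>

constexpr int iters = 1000;

int main() {
    using clk = std::chrono::steady_clock;

    // Warm up: force CUDA context creation so that cost is not
    // billed to the first allocation in the timed loop.
    void* warm = nullptr;
    cudaMalloc(&warm, sizeof(int));
    cudaFree(warm);

    // Case 1: allocate, clear and free the result slot every iteration,
    // as custom_cuda_compare does.
    auto t0 = clk::now();
    for (int i = 0; i<iters; ++i) {
        void* p = nullptr;
        cudaMalloc(&p, sizeof(int));
        cudaMemset(p, 0, sizeof(int));
        cudaFree(p);
    }
    cudaDeviceSynchronize();
    auto t1 = clk::now();

    // Case 2: allocate once, then only clear per iteration,
    // as custom_cuda_compare_noalloc does.
    void* q = nullptr;
    cudaMalloc(&q, sizeof(int));
    auto t2 = clk::now();
    for (int i = 0; i<iters; ++i) {
        cudaMemset(q, 0, sizeof(int));
    }
    cudaDeviceSynchronize();
    auto t3 = clk::now();
    cudaFree(q);

    auto per_iter = [](clk::duration d) {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(d).count()/iters;
    };
    std::printf("alloc+memset+free: %lld ns/iter\n", (long long)per_iter(t1-t0));
    std::printf("memset only:       %lld ns/iter\n", (long long)per_iter(t3-t2));
}
```

If the `cudaMalloc` explanation is correct, the first figure should dominate the second by a wide margin, consistent with the roughly 250 µs gap between the `custom CUDA` and `custom CUDA noalloc` columns in the results table above.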