Skip to content
Snippets Groups Projects
Commit 0bd89928 authored by Sam Yates's avatar Sam Yates Committed by Ben Cumming
Browse files

Add micro-benchmark for device vector comparison. (#262)

* Add micro-benchmark `cuda_compare_and_reduce`.
* Add support for `.cu` cuda benchmarks in `tests/ubench/CMakeLists.txt`.
* Update `tests/ubench/README.md` with benchmark summary and results.
* Updates the version of google benchmark library, for fixed benchmark iteration support.
parent 18098783
No related branches found
No related tags found
No related merge requests found
...@@ -5,6 +5,9 @@ include(ExternalProject) ...@@ -5,6 +5,9 @@ include(ExternalProject)
set(bench_sources set(bench_sources
accumulate_functor_values.cpp) accumulate_functor_values.cpp)
set(bench_sources_cuda
cuda_compare_and_reduce.cu)
# Set up google benchmark as an external project. # Set up google benchmark as an external project.
set(gbench_src_dir "${CMAKE_CURRENT_SOURCE_DIR}/google-benchmark") set(gbench_src_dir "${CMAKE_CURRENT_SOURCE_DIR}/google-benchmark")
...@@ -62,5 +65,18 @@ foreach(bench_src ${bench_sources}) ...@@ -62,5 +65,18 @@ foreach(bench_src ${bench_sources})
list(APPEND bench_exe_list ${bench_exe}) list(APPEND bench_exe_list ${bench_exe})
endforeach() endforeach()
# Build the .cu micro-benchmarks only when CUDA support is enabled.
if(NMC_WITH_CUDA)
foreach(bench_src ${bench_sources_cuda})
# Derive the executable name by stripping the file extension.
string(REGEX REPLACE "\\.[^.]*$" "" bench_exe "${bench_src}")
cuda_add_executable("${bench_exe}" EXCLUDE_FROM_ALL "${bench_src}")
# Build google benchmark (external project) first, then compile and
# link against its installed headers and static library.
add_dependencies("${bench_exe}" gbench)
target_include_directories("${bench_exe}" PRIVATE "${gbench_install_dir}/include")
target_link_libraries("${bench_exe}" "${gbench_install_dir}/lib/libbenchmark.a")
list(APPEND bench_exe_list ${bench_exe})
endforeach()
endif()
add_custom_target(ubenches DEPENDS ${bench_exe_list}) add_custom_target(ubenches DEPENDS ${bench_exe_list})
...@@ -63,3 +63,60 @@ Platform: ...@@ -63,3 +63,60 @@ Platform:
| g++ -O3 | 907 ns | 2090 ns | 907 ns | 614 ns | | g++ -O3 | 907 ns | 2090 ns | 907 ns | 614 ns |
| clang++ -O3 | 1063 ns | 533 ns | 1051 ns | 532 ns | | clang++ -O3 | 1063 ns | 533 ns | 1051 ns | 532 ns |
---
### `cuda_compare_and_reduce`
#### Motivation
One possible mechanism for determining if device-side event delivery had exhausted all
events is to see if the start and end of each event-span of each cell were equal or not.
This is equivalent to the test:
> ∃i: a[i] < b[i]?
for device vectors _a_ and _b_.
How expensive is it simply to copy the vectors over to the host and compare there?
Benchmarking indicated that for vectors up to approximately 10^5 elements, it was
significantly faster to copy to host, despite the limited host–device bandwidth.
This counter-intuitive result appears to stem from the high overhead of `cudaMalloc`;
pre-allocating space on the device for the reduction result restores expected
performance behaviour.
#### Implementations
Four implementations are considered:
1. Copy both vectors to host, run short-circuit compare.
2. Use `thrust::zip_iterator` and `thrust::any_of` to run the reduction on the device.
3. Use a short custom cuda kernel to run the reduction, using `__syncthreads_or` and
a non-atomic write to the result.
4. Use the same cuda kernel as above, but pre-allocate the device memory to store the
result.
Note: a fairer host-based comparison would interleave comparison with data transfer.
#### Results
Results here are presented for vector size _n_ equal to 256, 512, 32768 and 262144,
with the two vectors equal.
Platform:
* Xeon(R) CPU E5-2650 v4 with base clock 2.20 GHz and max clock 2.9 GHz.
* Tesla P100-PCIE-16GB
* Linux 3.10.0
* gcc version 5.3.0
* nvcc version 8.0.61
| _n_ | host copy | thrust | custom cuda | custom cuda noalloc |
|:----|----------:|-------:|------------:|--------------------:|
| 256 | 18265 ns | 41221 ns | 23255 ns | 16440 ns |
| 512 | 18466 ns | 286331 ns | 265113 ns | 16335 ns |
| 32768 | 102880 ns | 296836 ns | 265169 ns | 16758 ns |
| 262144 | 661724 ns | 305209 ns | 269095 ns | 19792 ns |
/* Compare implementations of the test:
* ∃i: a[i]<b[i]?
* for device-side arrays a and b.
*
* Four implementations are compared:
* 1. Copy both vectors to host, compare there.
* 2. Use the thrust library.
* 3. Use a small custom cuda kernel.
* 4. Same custom cuda kernel with once-off allocation of return value.
*/
// Explicitly undef NDEBUG for assert below.
#undef NDEBUG
#include <algorithm>
#include <cassert>
#include <random>
#include <vector>
#include <benchmark/benchmark.h>
#include <cuda_profiler_api.h>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/logical.h>
// Generic driver shared by all four comparison implementations.
//
// Builds two device vectors `a` and `b` of size state.range(0), where
// each element b[i] exceeds a[i] with probability 1/state.range(1)
// (never, when state.range(1) is 0), then repeatedly times
// impl(n, aptr, bptr) and finally asserts the implementation's answer
// matches the host-side ground truth.
template <typename Impl>
void bench_generic(benchmark::State& state, const Impl& impl) {
    std::size_t n = state.range(0);
    // oop = "one out of": inverse of the per-element inclusion probability.
    int oop = state.range(1);
    double p_inclusion = oop? 1.0/oop: 0;

    // Make device vectors `a` and `b` of size n, and with
    // `p_inclusion` chance of `a[i]` < `b[i]` for any given `i`.
    // NOTE: draw order matters for reproducibility — all U draws happen
    // first (std::generate), then all B draws, from the same engine R.
    std::bernoulli_distribution B(p_inclusion);
    std::uniform_int_distribution<int> U(0, 99);
    std::minstd_rand R;

    std::vector<int> xs(n);
    std::generate(xs.begin(), xs.end(), [&]() { return U(R); });
    thrust::device_vector<int> a(n);
    thrust::copy(xs.begin(), xs.end(), a.begin());

    bool differ = false;
    thrust::device_vector<int> b(n);
    // Bump a random subset of entries so b[i] > a[i] there; `differ`
    // records the ground truth used by the final assertion.
    for (std::size_t i = 0; i<n; ++i) {
        if (B(R)) {
            xs[i] += 3;
            differ = true;
        }
    }
    thrust::copy(xs.begin(), xs.end(), b.begin());
    // Ensure setup copies have completed before timing begins.
    cudaDeviceSynchronize();

    // Run benchmark.
    bool result = false;
    int* aptr = thrust::raw_pointer_cast(a.data());
    int* bptr = thrust::raw_pointer_cast(b.data());

    cudaProfilerStart();
    while (state.KeepRunning()) {
        result = impl(n, aptr, bptr);
        benchmark::DoNotOptimize(result);
    }
    cudaProfilerStop();

    // validate result
    assert(result==differ);
}
// Implementation 1: copy both device arrays to the host and perform a
// short-circuiting element-wise comparison there.
//
// Returns true iff a[i] < b[i] for some i in [0, n).
bool host_copy_compare(std::size_t n, int* aptr, int* bptr) {
    std::vector<int> ha(n), hb(n);
    cudaMemcpy(ha.data(), aptr, n*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(hb.data(), bptr, n*sizeof(int), cudaMemcpyDeviceToHost);

    // First position where the predicate fails, i.e. where ha[i] < hb[i];
    // std::mismatch short-circuits just like the original loop.
    auto mm = std::mismatch(ha.begin(), ha.end(), hb.begin(),
                            [](int x, int y) { return x>=y; });
    return mm.first!=ha.end();
}
// Device-side predicate for the thrust implementation: true when the
// first component of a zipped (a[i], b[i]) pair is less than the second.
struct thrust_cmp_pred {
    __device__
    bool operator()(const thrust::tuple<int, int>& p) const {
        int lhs = thrust::get<0>(p);
        int rhs = thrust::get<1>(p);
        return lhs<rhs;
    }
};
// Implementation 2: zip the two device sequences and let thrust perform
// the any-of reduction on the device.
//
// Returns true iff a[i] < b[i] for some i in [0, n).
bool thrust_compare(std::size_t n, int* aptr, int* bptr) {
    thrust::device_ptr<int> a(aptr);
    thrust::device_ptr<int> b(bptr);

    auto first = thrust::make_zip_iterator(thrust::make_tuple(a, b));
    auto last = first+n;  // zip_iterator is random-access over device_ptr
    return thrust::any_of(first, last, thrust_cmp_pred{});
}
// Kernel: set *rptr to 1 if a[i] < b[i] for any i in [0, n).
//
// Launch with a 1-D grid covering at least n threads; *rptr must be
// zero-initialized before launch. __syncthreads_or() is reached by every
// thread in the block (no divergent branch precedes it), so at most one
// non-atomic write per block occurs; concurrent writes all store the
// same value 1 and are therefore benign.
__global__ void cuda_cmp_kernel(std::size_t n, int* aptr, int* bptr, int* rptr) {
    // Compute the global index in std::size_t: blockIdx.x*blockDim.x can
    // overflow a 32-bit int on very large grids, and this also avoids the
    // signed/unsigned comparison against n.
    std::size_t i = threadIdx.x + std::size_t(blockIdx.x)*blockDim.x;
    int cmp = i<n? aptr[i]<bptr[i]: 0;
    if (__syncthreads_or(cmp)) *rptr=1;
}
// Implementation 3: run the reduction on the device with a custom kernel,
// allocating the device-side result slot on every call.
//
// Returns true iff a[i] < b[i] for some i in [0, n).
// Note: the per-call cudaMalloc/cudaFree is deliberate — measuring its
// overhead is the point of this benchmark variant.
bool custom_cuda_compare(std::size_t n, int* aptr, int* bptr) {
    constexpr int blockwidth = 128;

    void* rptr;
    cudaMalloc(&rptr, sizeof(int));
    cudaMemset(rptr, 0, sizeof(int));

    // Guard the launch: a grid dimension of zero is an invalid launch
    // configuration, so skip the kernel entirely when n is zero (the
    // zeroed result slot already gives the correct "false" answer).
    if (n>0) {
        unsigned nblock = (unsigned)((n-1)/blockwidth+1);  // ceil-div
        cuda_cmp_kernel<<<nblock, blockwidth>>>(n, aptr, bptr, (int *)rptr);
    }

    int result;
    // Blocking copy also synchronizes with the kernel above.
    cudaMemcpy(&result, rptr, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(rptr);
    return (bool)result;
}
// RAII holder for a single device-side object of type T.
//
// Allocates sizeof(T) of device memory on construction and frees it on
// destruction. Non-copyable and non-movable, so the device pointer has
// exactly one owner.
template <typename T>
struct device_store {
    device_store() {
        void* p;
        cudaMalloc(&p, sizeof(T));
        ptr = (T*)p;
    }

    ~device_store() { if (ptr) cudaFree(ptr); }

    // Bug fix: the original deleted a const-rvalue constructor
    // (const device_store&&) rather than the copy constructor; copies
    // were only blocked as a side effect of the deleted move ctor.
    // Delete all four special members explicitly.
    device_store(const device_store&) = delete;
    device_store(device_store&&) = delete;
    device_store& operator=(const device_store&) = delete;
    device_store& operator=(device_store&&) = delete;

    // Raw device pointer to the allocation (do not dereference on host).
    T* get() { return ptr; }

private:
    T* ptr = nullptr;
};
// Implementation 4: same kernel as custom_cuda_compare, but the
// device-side result slot is allocated once per host thread and reused,
// eliminating the per-call cudaMalloc/cudaFree overhead.
//
// Returns true iff a[i] < b[i] for some i in [0, n).
bool custom_cuda_compare_noalloc(std::size_t n, int* aptr, int* bptr) {
    // One-off, thread-local device allocation; freed at thread exit.
    static thread_local device_store<int> state;
    constexpr int blockwidth = 128;

    cudaMemset(state.get(), 0, sizeof(int));

    // Guard against an invalid zero-block launch configuration; the
    // zeroed result slot already encodes the correct "false" answer.
    if (n>0) {
        unsigned nblock = (unsigned)((n-1)/blockwidth+1);  // ceil-div
        cuda_cmp_kernel<<<nblock, blockwidth>>>(n, aptr, bptr, state.get());
    }

    int result;
    // Blocking copy also synchronizes with the kernel above.
    cudaMemcpy(&result, state.get(), sizeof(int), cudaMemcpyDeviceToHost);
    return (bool)result;
}
// Google-benchmark entry points: one thin wrapper per implementation,
// all delegating to bench_generic.
void bench_host_copy_compare(benchmark::State& state) {
    bench_generic(state, host_copy_compare);
}
void bench_thrust_compare(benchmark::State& state) {
    bench_generic(state, thrust_compare);
}
void bench_custom_cuda_compare(benchmark::State& state) {
    bench_generic(state, custom_cuda_compare);
}
void bench_custom_cuda_compare_noalloc(benchmark::State& state) {
    bench_generic(state, custom_cuda_compare_noalloc);
}
// Benchmark argument generator: sweeps vector sizes n = 2^8 .. 2^20
// (doubling each step), and for each n runs three cases:
//
//    Arg1    Values
//    ---------------------------------------------------
//      0     `a` and `b` equal
//    200     `a` and `b` differ circa 1 in 200 places.
//      4     `a` and `b` differ circa 1 in 4 places.
void run_custom_arguments(benchmark::internal::Benchmark* b) {
    const int oop_cases[] = {0, 200, 4};
    for (int shift = 8; shift<=20; ++shift) {
        int n = 1<<shift;
        for (int oop: oop_cases) {
            // Uncomment to set fixed iteration count (for e.g. profiling):
            // b->Iterations(20);
            b->Args({n, oop});
        }
    }
}
// Register each implementation over the full (n, oop) argument sweep.
BENCHMARK(bench_host_copy_compare)->Apply(run_custom_arguments);
BENCHMARK(bench_thrust_compare)->Apply(run_custom_arguments);
BENCHMARK(bench_custom_cuda_compare)->Apply(run_custom_arguments);
BENCHMARK(bench_custom_cuda_compare_noalloc)->Apply(run_custom_arguments);
BENCHMARK_MAIN();
Subproject commit 9a5072d1bf9187b32ce9a88842dffa31ef416442 Subproject commit cb8a0cc10f8b634fd554251ae086da522b58f50e
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment