diff --git a/src/util/padded_alloc.hpp b/src/util/padded_alloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..679e918e152cea168e40b8b99f89f723166c114f
--- /dev/null
+++ b/src/util/padded_alloc.hpp
@@ -0,0 +1,103 @@
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <stdexcept>
+#include <system_error>
+#include <utility>
+
+// Allocator with run-time alignment and padding guarantees.
+//
+// With an alignment value of `n`, any allocations will be
+// aligned to have a starting address of a multiple of `n`,
+// and the size of the allocation will be padded so that the
+// one-past-the-end address is also a multiple of `n`.
+//
+// Any alignment `n` specified must be a power of two.
+//
+// Assignment does not change the alignment property of the
+// allocator on the left hand side of the assignment, so that
+// e.g.
+// ```
+// std::vector<int, padded_allocator<int>> a(100, padded_allocator<int>(32));
+// std::vector<int, padded_allocator<int>> b(50, padded_allocator<int>(64));
+// a = b;
+// assert(a.get_allocator().alignment()==32);
+// ```
+// will pass, and the vector `a` will not require reallocation.
+//
+// For move assignment, this means we cannot allow a simple ownership
+// transfer if the left hand side has a stronger alignment guarantee
+// than the right hand side. Correspondingly, we have to return `false`
+// for the allocator equality test if the alignments differ.
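+//
+// For example, with an alignment of 64, an allocation of 13 doubles
+// (104 bytes) is padded to 128 bytes, so that the one-past-the-end
+// address is again a multiple of 64.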
+
+namespace arb {
+namespace util {
+
+template <typename T>
+struct padded_allocator {
+    using value_type = T;
+    using pointer = T*;
+    using propagate_on_container_copy_assignment = std::false_type;
+    using propagate_on_container_move_assignment = std::false_type;
+    using propagate_on_container_swap = std::false_type;
+    using is_always_equal = std::false_type;
+
+    padded_allocator() noexcept {}
+
+    template <typename U>
+    padded_allocator(const padded_allocator<U>& b) noexcept: alignment_(b.alignment()) {}
+
+    explicit padded_allocator(std::size_t alignment): alignment_(alignment) {
+        if (!alignment_ || (alignment_&(alignment_-1))) {
+            throw std::range_error("alignment must be positive power of two");
+        }
+    }
+
+    padded_allocator select_on_container_copy_construction() const noexcept {
+        return *this;
+    }
+
+    pointer allocate(std::size_t n) {
+        if (n>std::size_t(-1)/sizeof(T)) {
+            throw std::bad_alloc();
+        }
+
+        void* mem = nullptr;
+        std::size_t size = round_up(n*sizeof(T), alignment_);
+        std::size_t pm_align = std::max(alignment_, sizeof(void*));
+
+        if (auto err = posix_memalign(&mem, pm_align, size)) {
+            throw std::system_error(err, std::generic_category(), "posix_memalign");
+        }
+        return static_cast<pointer>(mem);
+    }
+
+    void deallocate(pointer p, std::size_t) {
+        std::free(p);
+    }
+
+    bool operator==(const padded_allocator& a) const { return alignment_==a.alignment_; }
+    bool operator!=(const padded_allocator& a) const { return !(*this==a); }
+
+    std::size_t alignment() const { return alignment_; }
+
+private:
+    // Start address and one-past-the-end address are multiples of the alignment:
+    std::size_t alignment_ = 1;
+
+    static std::size_t round_up(std::size_t v, std::size_t b) {
+        std::size_t m = v%b;
+        return v-m+(m? b: 0);
+    }
+};
+
+} // namespace util
+} // namespace arb
diff --git a/tests/ubench/CMakeLists.txt b/tests/ubench/CMakeLists.txt
index c0486931d4f0b0d6a75699b6fd4cef0299fc85d4..297fb4919ddf2b1065309b71363d0b1b39ed70f3 100644
--- a/tests/ubench/CMakeLists.txt
+++ b/tests/ubench/CMakeLists.txt
@@ -4,6 +4,7 @@ include(ExternalProject)
 
 set(bench_sources
     accumulate_functor_values.cpp
+    default_construct.cpp
     event_setup.cpp
     event_binning.cpp
 )
diff --git a/tests/ubench/README.md b/tests/ubench/README.md
index 2d86750d597724a2d70149ea3572927b13cdd6ba..01bec9ed0bf39738afc3ef15f788d5f14e44821e 100644
--- a/tests/ubench/README.md
+++ b/tests/ubench/README.md
@@ -263,3 +263,111 @@ Overall, maintaining seperate queues for each cell is much faster for more than
 |1Q | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
 |nQ | 1.1 | 1.8 | 2.8 | 3.7 | 5.4 |
 |nV | 2.4 | 2.6 | 3.9 | 5.8 | 7.8 |
+
+---
+
+### `default_construct`
+
+#### Motivation
+
+The `padded_allocator` code allows us to use, for example, a `std::vector` for CPU-side aligned and padded storage (for SIMD)
+instead of the `memory::array` class. The latter, though, does not construct its elements, while a `std::vector` will use the
+allocator's `construct` method.
+
+For scalar types with trivial default constructors, `std::allocator` construction with no arguments will value-initialize,
+which zero-initializes any non-class values. By supplying an alternate `construct` method, we can make an allocator that
+default-initializes instead, skipping any initialization for non-class values and providing semantics similar to that of
+`memory::array`.
+
+Is it worth doing so?
+
+#### Implementation
+
+The microbenchmark uses an adaptor class that replaces the allocator's `construct` methods, so that construction with
+no arguments performs default initialization. The benchmark creates a vector with the standard or adapted allocator,
+fills it with the numbers from 1 to n, and takes the sum.
+
+For comparison, the benchmark also measures both vector types when they are initialized by a pair of iterators that
+provide the same enumeration from 1 to n.
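+
+The essence of the adaptor is a `construct` overload that performs default initialization (`new (p) T`) rather than
+value initialization (`new (p) T()`) when no constructor arguments are supplied. A simplified sketch (the benchmark's
+adaptor wraps an arbitrary allocator; the `default_construct_allocator` here fixes `std::allocator` for brevity):
+
+```c++
+#include <memory>
+#include <new>
+#include <utility>
+
+template <typename T>
+struct default_construct_allocator: std::allocator<T> {
+    // Default-initialize: a no-op for trivially constructible T.
+    void construct(T* p) {
+        ::new (static_cast<void*>(p)) T;
+    }
+
+    template <typename... Args>
+    void construct(T* p, Args&&... args) {
+        ::new (static_cast<void*>(p)) T(std::forward<Args>(args)...);
+    }
+};
+```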
+
+#### Results
+
+With this task's low ratio of computation to memory size, using the default-constructing adaptor gives a significant
+performance benefit. With the iterator-pair construction, however, where we would expect no performance difference,
+GCC (but not Clang) produces markedly slower code.
+
+Note that Clang produces considerably faster code overall.
+
+Platform:
+* Xeon E3-1220 v2 with base clock 3.1 GHz and max clock 3.5 GHz.
+* Linux 4.9.75
+* gcc version 7.3.1
+* clang version 6.0.0
+* optimization options: -O3 -march=ivybridge
+
+##### Create then fill and sum
+
+*GCC*
+
+| size | value-initialized | default-initialized |
+|---------:|------------------:|--------------------:|
+| 1 kiB | 403 ns | 331 ns |
+| 4 kiB | 1 430 ns | 1 142 ns |
+| 32 kiB | 12 377 ns | 8 982 ns |
+| 256 kiB | 114 598 ns | 81 599 ns |
+| 1024 kiB | 455 502 ns | 323 366 ns |
+
+*Clang*
+
+| size | value-initialized | default-initialized |
+|---------:|------------------:|--------------------:|
+| 1 kiB | 228 ns | 147 ns |
+| 4 kiB | 826 ns | 527 ns |
+| 32 kiB | 10 425 ns | 6 823 ns |
+| 256 kiB | 106 497 ns | 72 375 ns |
+| 1024 kiB | 430 561 ns | 293 999 ns |
+
+##### Create directly from counting iterators and sum
+
+*GCC*
+
+| size | value-initialized | default-initialized |
+|---------:|------------------:|--------------------:|
+| 1 kiB | 335 ns | 775 ns |
+| 4 kiB | 1 146 ns | 2 920 ns |
+| 32 kiB | 8 954 ns | 23 197 ns |
+| 256 kiB | 81 609 ns | 193 230 ns |
+| 1024 kiB | 322 947 ns | 763 243 ns |
+
+*Clang*
+
+| size | value-initialized | default-initialized |
+|---------:|------------------:|--------------------:|
+| 1 kiB | 151 ns | 160 ns |
+| 4 kiB | 531 ns | 528 ns |
+| 32 kiB | 6 790 ns | 6 816 ns |
+| 256 kiB | 72 460 ns | 72 687 ns |
+| 1024 kiB | 293 991 ns | 293 746 ns |
diff --git a/tests/ubench/default_construct.cpp b/tests/ubench/default_construct.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dc0614e23d5aa1ff937919108b53ebd54f927730
--- /dev/null
+++ b/tests/ubench/default_construct.cpp
@@ -0,0 +1,116 @@
+// Compare value- vs default-initialized vector performance.
+
+// Explicitly undef NDEBUG for assert below.
+#undef NDEBUG
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+#include <util/span.hpp>
+
+using arb::util::make_span;
+
+template <typename Allocator>
+struct default_construct_adaptor: Allocator {
+private:
+    using traits = typename std::allocator_traits<Allocator>;
+
+public:
+    using pointer = typename traits::pointer;
+    using value_type = typename traits::value_type;
+
+    default_construct_adaptor() noexcept {}
+
+    template <typename... Args>
+    default_construct_adaptor(Args&&... args):
+        Allocator(std::forward<Args>(args)...)
+    {}
+
+    template <typename U>
+    default_construct_adaptor(const default_construct_adaptor<U>& b) noexcept: Allocator(b) {}
+
+    // Default-initialize when no arguments are given; std::allocator
+    // would value-initialize here.
+    void construct(pointer p) {
+        ::new (static_cast<void*>(p)) value_type;
+    }
+
+    template <typename... Args>
+    void construct(pointer p, Args&&... args) {
+        ::new (static_cast<void*>(p)) value_type(std::forward<Args>(args)...);
+    }
+
+    template <typename U>
+    struct rebind {
+        using other = default_construct_adaptor<typename traits::template rebind_alloc<U>>;
+    };
+};
+
+template <typename Container>
+unsigned run_accumulate(std::size_t n) {
+    Container c(n);
+
+    unsigned s = 0;
+    for (unsigned& x: c) {
+        x = ++s;
+    }
+    s = 0;
+    for (unsigned x: c) {
+        s += x;
+    }
+    return s;
+}
+
+template <typename Container>
+unsigned run_accumulate_range_init(std::size_t n) {
+    auto values = make_span(1, n+1);
+    Container c(values.begin(), values.end());
+
+    unsigned s = 0;
+    for (unsigned x: c) {
+        s += x;
+    }
+    return s;
+}
+
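+// Note: `Container c(n)` in run_accumulate above is where the two
+// allocators differ: with std::allocator the n elements are
+// value-initialized (zeroed), while with default_construct_adaptor
+// they are default-initialized (left indeterminate for unsigned),
+// so the subsequent fill is the first write to the memory.
+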
+template <unsigned (*Fn)(std::size_t)>
+void bench_container(benchmark::State& state) {
+    std::size_t n = state.range(0);
+
+    while (state.KeepRunning()) {
+        benchmark::DoNotOptimize(Fn(n));
+    }
+
+    // Check the result against the closed-form sum n(n+1)/2.
+    unsigned s = (n*(n+1))/2;
+    assert(s==Fn(n));
+}
+
+template <typename T>
+using dc_vector = std::vector<T, default_construct_adaptor<std::allocator<T>>>;
+
+auto bench_vector = bench_container<run_accumulate<std::vector<unsigned>>>;
+auto bench_dc_vector = bench_container<run_accumulate<dc_vector<unsigned>>>;
+
+auto bench_vector_range = bench_container<run_accumulate_range_init<std::vector<unsigned>>>;
+auto bench_dc_vector_range = bench_container<run_accumulate_range_init<dc_vector<unsigned>>>;
+
+BENCHMARK(bench_vector)->Range(1<<10, 1<<20);
+BENCHMARK(bench_dc_vector)->Range(1<<10, 1<<20);
+
+BENCHMARK(bench_vector_range)->Range(1<<10, 1<<20);
+BENCHMARK(bench_dc_vector_range)->Range(1<<10, 1<<20);
+
+BENCHMARK_MAIN();
diff --git a/tests/ubench/event_binning.cpp b/tests/ubench/event_binning.cpp
index a807d08bb64acd17551ad8b41d68a16b2048c133..94b58eba1cb16a9e910b784885eaab2e0e96d336 100644
--- a/tests/ubench/event_binning.cpp
+++ b/tests/ubench/event_binning.cpp
@@ -4,6 +4,7 @@
 // Keep this test as a prototype for testing, esp. when looking into binning.
 
 #include <random>
+#include <unordered_map>
 #include <vector>
 
 #include <event_queue.hpp>
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index fe018f935b9692d5bd941765ae657dc5ab34e77c..5133c8963b418d0f083abd0c17fb2ae51fea0692 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -61,6 +61,7 @@ set(TEST_SOURCES
     test_nop.cpp
     test_optional.cpp
     test_mechinfo.cpp
+    test_padded.cpp
     test_partition.cpp
     test_path.cpp
     test_point.cpp
diff --git a/tests/unit/instrument_malloc.hpp b/tests/unit/instrument_malloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b36b8f83ef3e28e120ddbfd40562d3cdcf040691
--- /dev/null
+++ b/tests/unit/instrument_malloc.hpp
@@ -0,0 +1,153 @@
+#pragma once
+
+// Base class for scoped instrumentation of glibc malloc.
+//
+// For the lifetime of a `with_instrumented_malloc` object,
+// global memory allocation hooks will be set so that
+// the virtual `on_malloc`, `on_realloc`, `on_memalign`
+// and `on_free` calls will be invoked before the corresponding
+// `malloc`, `realloc` etc. is executed.
+//
+// Scopes of `with_instrumented_malloc` may be nested, but:
+// * Don't interleave lifetimes of these objects and expect things
+//   to work!
+// * Don't try to create new `with_instrumented_malloc` instances
+//   from within an `on_malloc` callback (or others).
+// * Definitely don't try to use this in a multithreaded context.
+//
+// Calling code should check the CAN_INSTRUMENT_MALLOC preprocessor
+// symbol to see if this functionality is available.
+
+#include <cstddef>
+#include <stdexcept>
+
+#if (__GLIBC__==2)
+#include <malloc.h>
+#define CAN_INSTRUMENT_MALLOC
+#endif
+
+namespace testing {
+
+#ifdef CAN_INSTRUMENT_MALLOC
+
+// For run-time, temporary intervention in the malloc-family calls,
+// there is still no better alternative than to use the
+// deprecated __malloc_hook pointers and friends.
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
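+// A minimal usage sketch (the `count_mallocs` counter is illustrative,
+// not part of this header):
+//
+//     struct count_mallocs: with_instrumented_malloc {
+//         unsigned n = 0;
+//         void on_malloc(std::size_t, const void*) override { ++n; }
+//     };
+//
+//     count_mallocs m;
+//     void* p = malloc(10); // invokes m.on_malloc(10, caller) first
+//     free(p);
+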
+// Totally not thread safe!
+struct with_instrumented_malloc {
+    with_instrumented_malloc() {
+        push();
+    }
+
+    ~with_instrumented_malloc() {
+        pop();
+    }
+
+    virtual void on_malloc(std::size_t, const void*) {}
+    virtual void on_realloc(void*, std::size_t, const void*) {}
+    virtual void on_free(void*, const void*) {}
+    virtual void on_memalign(std::size_t, std::size_t, const void*) {}
+
+private:
+    static with_instrumented_malloc*& instance() {
+        static with_instrumented_malloc* ptr = nullptr;
+        return ptr;
+    }
+
+    with_instrumented_malloc* prev_;
+    decltype(__malloc_hook) saved_malloc_hook_;
+    decltype(__realloc_hook) saved_realloc_hook_;
+    decltype(__free_hook) saved_free_hook_;
+    decltype(__memalign_hook) saved_memalign_hook_;
+
+    void push() {
+        saved_malloc_hook_ = __malloc_hook;
+        saved_realloc_hook_ = __realloc_hook;
+        saved_free_hook_ = __free_hook;
+        saved_memalign_hook_ = __memalign_hook;
+
+        prev_ = instance();
+        instance() = this;
+
+        __malloc_hook = malloc_hook;
+        __realloc_hook = realloc_hook;
+        __free_hook = free_hook;
+        __memalign_hook = memalign_hook;
+    }
+
+    void pop() {
+        instance() = prev_;
+        __malloc_hook = saved_malloc_hook_;
+        __realloc_hook = saved_realloc_hook_;
+        __free_hook = saved_free_hook_;
+        __memalign_hook = saved_memalign_hook_;
+    }
+
+    // Restore the previous hooks for the duration of a callback, so that
+    // allocations made by the callback itself are not re-instrumented.
+    struct windback_guard {
+        with_instrumented_malloc* p;
+
+        windback_guard(): p(instance()) { p->pop(); }
+        ~windback_guard() { p->push(); }
+    };
+
+    static void* malloc_hook(std::size_t size, const void* caller) {
+        windback_guard g;
+        g.p->on_malloc(size, caller);
+        return malloc(size);
+    }
+
+    static void* realloc_hook(void* ptr, std::size_t size, const void* caller) {
+        windback_guard g;
+        g.p->on_realloc(ptr, size, caller);
+        return realloc(ptr, size);
+    }
+
+    static void free_hook(void* ptr, const void* caller) {
+        windback_guard g;
+        g.p->on_free(ptr, caller);
+        free(ptr);
+    }
+
+    static void* memalign_hook(std::size_t alignment, std::size_t size, const void* caller) {
+        windback_guard g;
+        g.p->on_memalign(alignment, size, caller);
+        return memalign(alignment, size);
+    }
+};
+
+#pragma GCC diagnostic pop
+
+#else
+
+struct with_instrumented_malloc {
+    with_instrumented_malloc() {
+        throw std::runtime_error("malloc instrumentation not supported\n");
+    }
+
+    virtual void on_malloc(std::size_t, const void*) {}
+    virtual void on_realloc(void*, std::size_t, const void*) {}
+    virtual void on_free(void*, const void*) {}
+    virtual void on_memalign(std::size_t, std::size_t, const void*) {}
+};
+
+#endif // ifdef CAN_INSTRUMENT_MALLOC
+
+} // namespace testing
diff --git a/tests/unit/test_padded.cpp b/tests/unit/test_padded.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4bdb453d86f9f2128ae6cd72d8c982ecf859afb4
--- /dev/null
+++ b/tests/unit/test_padded.cpp
@@ -0,0 +1,163 @@
+#include <cstdint>
+#include <cstddef>
+#include <vector>
+
+#if (__GLIBC__==2)
+#include <malloc.h>
+#define INSTRUMENT_MALLOC
+#endif
+
+#include <util/padded_alloc.hpp>
+
+#include "../gtest.h"
+#include "common.hpp"
+#include "instrument_malloc.hpp"
+
+using arb::util::padded_allocator;
+
+template <typename T>
+using pvector = std::vector<T, padded_allocator<T>>;
+
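+// (For k a power of two only.) The bitmask test works because for
+// k = 2^m, k-1 has just the low m bits set, so addr&(k-1) is addr mod k:
+// e.g. 24&7 == 0 (aligned to 8), but 28&7 == 4 (not aligned).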
+static bool is_aligned(void* p, std::size_t k) {
+    auto addr = reinterpret_cast<std::uintptr_t>(p);
+    return !(addr&(k-1));
+}
+
+TEST(padded_vector, alignment) {
+    padded_allocator<double> pa(1024);
+    pvector<double> a(101, pa);
+
+    EXPECT_EQ(1024u, a.get_allocator().alignment());
+    EXPECT_TRUE(is_aligned(a.data(), 1024));
+}
+
+TEST(padded_vector, allocator_constraints) {
+    EXPECT_THROW(padded_allocator<char>(7), std::range_error);
+
+    padded_allocator<char> pa(2); // less than sizeof(void*)
+    std::vector<char, padded_allocator<char>> v(7, pa);
+
+    EXPECT_TRUE(is_aligned(v.data(), sizeof(void*)));
+}
+
+TEST(padded_vector, allocator_propagation) {
+    padded_allocator<double> pa(1024);
+    pvector<double> a(101, pa);
+
+    EXPECT_EQ(pa, a.get_allocator());
+
+    pvector<double> b(101);
+    auto pb = b.get_allocator();
+
+    // Differing alignment => allocators compare not-equal.
+    EXPECT_EQ(1u, pb.alignment());
+    EXPECT_NE(pa, pb);
+
+    // Don't propagate on copy- or move-assignment:
+    b = a;
+    EXPECT_EQ(pb.alignment(), b.get_allocator().alignment());
+    EXPECT_NE(pb.alignment(), pa.alignment());
+
+    pvector<double> c;
+    c = std::move(a);
+    EXPECT_NE(c.get_allocator().alignment(), pa.alignment());
+}
+
+#ifdef INSTRUMENT_MALLOC
+
+struct alloc_data {
+    unsigned n_malloc = 0;
+    unsigned n_realloc = 0;
+    unsigned n_memalign = 0;
+
+    std::size_t last_malloc = -1;
+    std::size_t last_realloc = -1;
+    std::size_t last_memalign = -1;
+};
+
+struct count_allocs: testing::with_instrumented_malloc {
+    alloc_data data;
+
+    void on_malloc(std::size_t size, const void*) override {
+        ++data.n_malloc;
+        data.last_malloc = size;
+    }
+
+    void on_realloc(void*, std::size_t size, const void*) override {
+        ++data.n_realloc;
+        data.last_realloc = size;
+    }
+
+    void on_memalign(std::size_t, std::size_t size, const void*) override {
+        ++data.n_memalign;
+        data.last_memalign = size;
+    }
+
+    void reset() {
+        data = alloc_data();
+    }
+};
+
+TEST(padded_vector, instrumented) {
+    count_allocs A;
+
+    padded_allocator<double> pad256(256), pad32(32);
+    pvector<double> v1p256(303, pad256);
+    alloc_data mdata = A.data;
+
+    // 303 doubles = 2424 bytes, padded up to a multiple of 256 = 2560.
+    unsigned expected_v1_alloc = 303*sizeof(double);
+    expected_v1_alloc = expected_v1_alloc%256? 256*(1+expected_v1_alloc/256): expected_v1_alloc;
+
+    EXPECT_EQ(1u, mdata.n_memalign);
+    EXPECT_EQ(0u, mdata.n_malloc);
+    EXPECT_EQ(0u, mdata.n_realloc);
+    EXPECT_EQ(expected_v1_alloc, mdata.last_memalign);
+
+    // Move assignment: v2 has a differing alignment guarantee, so cannot
+    // take ownership of v1's data. We expect that v2 will need to allocate.
+    pvector<double> v2p32(10, pad32);
+    A.reset();
+    v2p32 = std::move(v1p256);
+    mdata = A.data;
+
+    EXPECT_EQ(1u, mdata.n_memalign);
+    EXPECT_EQ(0u, mdata.n_malloc);
+    EXPECT_EQ(0u, mdata.n_realloc);
+
+    pvector<double> v3p256(101, pad256), v4p256(700, pad256);
+
+    A.reset();
+    v4p256 = v3p256; // same alignment, greater capacity => shouldn't need to allocate
+    mdata = A.data;
+
+    EXPECT_EQ(0u, mdata.n_memalign);
+    EXPECT_EQ(0u, mdata.n_malloc);
+    EXPECT_EQ(0u, mdata.n_realloc);
+
+    A.reset();
+    pvector<double> v5p32(701, pad32);
+    mdata = A.data;
+
+    // 701 doubles = 5608 bytes, padded up to a multiple of 32 = 5632.
+    unsigned expected_v5_alloc = 701*sizeof(double);
+    expected_v5_alloc = expected_v5_alloc%32? 32*(1+expected_v5_alloc/32): expected_v5_alloc;
+
+    EXPECT_EQ(1u, mdata.n_memalign);
+    EXPECT_EQ(0u, mdata.n_malloc);
+    EXPECT_EQ(0u, mdata.n_realloc);
+    EXPECT_EQ(expected_v5_alloc, mdata.last_memalign);
+
+    A.reset();
+    v5p32 = v3p256; // different alignment, but enough space, so shouldn't reallocate
+    mdata = A.data;
+
+    EXPECT_EQ(0u, mdata.n_memalign);
+    EXPECT_EQ(0u, mdata.n_malloc);
+    EXPECT_EQ(0u, mdata.n_realloc);
+}
+
+#endif // ifdef INSTRUMENT_MALLOC