Commit 581c4ef3 authored by Sam Yates, committed by Ben Cumming

Add padded allocator for aligned and padded vectors. (#460)

Padded vectors with run-time padding/alignment guarantees will form the basis of the storage class for the new generated CPU and SIMD mechanisms.

* Add `padded_allocator` that aligns and pads allocations.
* Add a microbenchmark for `default_construct_adaptor`, which overrides the allocator `construct()` to default-initialize instead of value-initialize elements.
* Add `with_instrumented_malloc` class for tracking malloc, realloc, etc. calls.
* Add unit tests for `padded_allocator`.
parent 3019ae1e
#pragma once
#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <memory>
#include <new>
#include <stdexcept>
#include <system_error>
#include <type_traits>
#include <utility>
// Allocator with run-time alignment and padding guarantees.
//
// With an alignment value of `n`, any allocations will be
// aligned to have a starting address of a multiple of `n`,
// and the size of the allocation will be padded so that the
// one-past-the-end address is also a multiple of `n`.
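// For example, with an alignment of 64, an allocation of 100 doubles
// (800 bytes) is rounded up to 832 bytes, so that both the start and
// the one-past-the-end addresses are multiples of 64.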
//
// Any alignment `n` specified must be a power of two.
//
// Assignment does not change the alignment property of the
// allocator on the left hand side of the assignment, so that
// e.g.
// ```
// std::vector<int, padded_allocator<int>> a(100, 0, padded_allocator<int>(32));
// std::vector<int, padded_allocator<int>> b(50, 0, padded_allocator<int>(64));
// a = b;
// assert(a.get_allocator().alignment()==32);
// ```
// will pass, and the vector `a` will not require reallocation.
//
// For move assignment, this means we cannot allow a simple ownership
// transfer if the left hand side has a stronger alignment guarantee
// than the right hand side. Correspondingly, we have to return `false`
// for the allocator equality test if the alignments differ.
namespace arb {
namespace util {

template <typename T>
struct padded_allocator {
    using value_type = T;
    using pointer = T*;
    using propagate_on_container_copy_assignment = std::false_type;
    using propagate_on_container_move_assignment = std::false_type;
    using propagate_on_container_swap = std::false_type;
    using is_always_equal = std::false_type;

    padded_allocator() noexcept {}

    template <typename U>
    padded_allocator(const padded_allocator<U>& b) noexcept: alignment_(b.alignment()) {}

    explicit padded_allocator(std::size_t alignment): alignment_(alignment) {
        if (!alignment_ || (alignment_&(alignment_-1))) {
            throw std::range_error("alignment must be a positive power of two");
        }
    }

    padded_allocator select_on_container_copy_construction() const noexcept {
        return *this;
    }

    pointer allocate(std::size_t n) {
        if (n>std::size_t(-1)/sizeof(T)) {
            throw std::bad_alloc();
        }

        void* mem = nullptr;
        std::size_t size = round_up(n*sizeof(T), alignment_);
        std::size_t pm_align = std::max(alignment_, sizeof(void*));

        if (auto err = posix_memalign(&mem, pm_align, size)) {
            throw std::system_error(err, std::generic_category(), "posix_memalign");
        }
        return static_cast<pointer>(mem);
    }

    void deallocate(pointer p, std::size_t) {
        std::free(p);
    }

    bool operator==(const padded_allocator& a) const { return alignment_==a.alignment_; }
    bool operator!=(const padded_allocator& a) const { return !(*this==a); }

    std::size_t alignment() const { return alignment_; }

private:
    // Both the start address and the one-past-the-end address of an
    // allocation are multiples of the alignment:
    std::size_t alignment_ = 1;

    static std::size_t round_up(std::size_t v, std::size_t b) {
        std::size_t m = v%b;
        return v-m+(m? b: 0);
    }
};

} // namespace util
} // namespace arb
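Below is a minimal usage sketch (illustrative only, not part of this commit): it constructs a padded `std::vector` with a 64-byte guarantee and demonstrates the non-propagating move assignment described in the header comment.

```c++
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

#include <util/padded_alloc.hpp>

using arb::util::padded_allocator;

int main() {
    // Request 64-byte alignment and padding, e.g. for SIMD loads and stores.
    padded_allocator<double> pa64(64);
    std::vector<double, padded_allocator<double>> v(100, 0.0, pa64);

    assert(reinterpret_cast<std::uintptr_t>(v.data())%64==0);

    // Move assignment from a vector with the default (weaker) alignment
    // guarantee: v keeps its own allocator, so elements are copied rather
    // than the buffer being adopted.
    std::vector<double, padded_allocator<double>> w(50, 1.0);
    v = std::move(w);
    assert(v.get_allocator().alignment()==64);
}
```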
......@@ -4,6 +4,7 @@ include(ExternalProject)
set(bench_sources
accumulate_functor_values.cpp
default_construct.cpp
event_setup.cpp
event_binning.cpp
)
......
......@@ -263,3 +263,88 @@ Overall, maintaining separate queues for each cell is much faster for more than
|1Q | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
|nQ | 1.1 | 1.8 | 2.8 | 3.7 | 5.4 |
|nV | 2.4 | 2.6 | 3.9 | 5.8 | 7.8 |
---
### `default_construct`
#### Motivation
The `padded_allocator` code allows us to use, for example, a `std::vector` for CPU-side aligned and padded storage (for SIMD)
instead of the `memory::array` class. The latter, though, does not construct its elements, while a `std::vector` will use the allocator's
`construct` method.

For scalar values with trivial default constructors, a `std::allocator` construction with no arguments will value-initialize,
which zero-initializes any non-class values. By supplying an alternate `construct` method, we can make an allocator that
default-initializes instead, skipping any initialization for non-class values and providing semantics similar to those of
`memory::array`.
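As a minimal illustration (not taken from the benchmark source), the two forms of construction differ as follows for a trivially constructible element type:

```c++
#include <new>

void construct_examples(unsigned* storage) {
    // Value-initialization: what std::allocator's construct() performs when
    // given no arguments; non-class types are zero-initialized.
    ::new (static_cast<void*>(storage)) unsigned();    // *storage is 0

    // Default-initialization: what the adaptor's construct() performs
    // instead; non-class types are left uninitialized.
    ::new (static_cast<void*>(storage+1)) unsigned;    // indeterminate value
}
```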
Is it worth doing so?
#### Implementation
The microbenchmark uses an adaptor class that replaces the allocator's `construct` methods so that construction with no
arguments default-initializes. The benchmark creates a vector using the standard or adapted allocator, fills it with the
numbers from 1 to n, and takes their sum.

For comparison, the benchmark also measures both vector types when they are constructed from a pair of iterators that
provide the same enumeration from 1 to n.
#### Results
For this task, with its low ratio of computation to data size, using the default-constructing adaptor gives a significant performance benefit.
With the iterator-pair construction, however, where we would expect no performance difference, GCC (but not Clang) produces
much slower code for the adapted allocator.

Note that Clang produces considerably faster code overall.
Platform:
* Xeon E3-1220 v2 with base clock 3.1 GHz and max clock 3.5 GHz.
* Linux 4.9.75
* gcc version 7.3.1
* clang version 6.0.0
* optimization options: -O3 -march=ivybridge
##### Create then fill and sum
*GCC*
| size | value-initialized | default-initialized |
|---------:|------------------:|--------------------:|
| 1 kiB | 403 ns | 331 ns |
| 4 kiB | 1 430 ns | 1 142 ns |
| 32 kiB | 12 377 ns | 8 982 ns |
| 256 kiB | 114 598 ns | 81 599 ns |
| 1024 kiB | 455 502 ns | 323 366 ns |
*Clang*
| size | value-initialized | default-initialized |
|---------:|------------------:|--------------------:|
| 1 kiB | 228 ns | 147 ns |
| 4 kiB | 826 ns | 527 ns |
| 32 kiB | 10 425 ns | 6 823 ns |
| 256 kiB | 106 497 ns | 72 375 ns |
| 1024 kiB | 430 561 ns | 293 999 ns |
##### Create directly from counting iterators and sum
*GCC*
| size | value-initialized | default-initialized |
|---------:|------------------:|--------------------:|
| 1 kiB | 335 ns | 775 ns |
| 4 kiB | 1 146 ns | 2 920 ns |
| 32 kiB | 8 954 ns | 23 197 ns |
| 256 kiB | 81 609 ns | 193 230 ns |
| 1024 kiB | 322 947 ns | 763 243 ns |
*Clang*
| size | value-initialized | default-initialized |
|---------:|------------------:|--------------------:|
| 1 kiB | 151 ns | 160 ns |
| 4 kiB | 531 ns | 528 ns |
| 32 kiB | 6 790 ns | 6 816 ns |
| 256 kiB | 72 460 ns | 72 687 ns |
| 1024 kiB | 293 991 ns | 293 746 ns |
// Compare value- vs default- initialized vector performance.
// Explicitly undef NDEBUG for assert below.
#undef NDEBUG
#include <cassert>
#include <cstddef>
#include <memory>
#include <vector>
#include <benchmark/benchmark.h>
#include <util/span.hpp>
using arb::util::make_span;
template <typename Allocator>
struct default_construct_adaptor: Allocator {
private:
using traits = typename std::allocator_traits<Allocator>;
public:
using pointer = typename traits::pointer;
using value_type = typename traits::value_type;
default_construct_adaptor() noexcept {}
template <typename... Args>
default_construct_adaptor(Args&&... args):
Allocator(std::forward<Args>(args)...)
{}
template <typename U>
default_construct_adaptor(const default_construct_adaptor<U>& b) noexcept: Allocator(b) {}
void construct(pointer p) {
::new (static_cast<void*>(p)) value_type;
}
template <typename... Args>
void construct(pointer p, Args&&... args) {
::new (static_cast<void*>(p)) value_type(std::forward<Args>(args)...);
}
template <typename U>
struct rebind {
using other = default_construct_adaptor<typename traits::template rebind_alloc<U>>;
};
};
template <typename Container>
unsigned run_accumulate(std::size_t n) {
Container c(n);
unsigned s = 0;
for (unsigned& x: c) {
x = ++s;
}
s = 0;
for (unsigned x: c) {
s += x;
}
return s;
}
template <typename Container>
unsigned run_accumulate_range_init(std::size_t n) {
auto values = make_span(1, n+1);
Container c(values.begin(), values.end());
unsigned s = 0;
for (unsigned x: c) {
s += x;
}
return s;
}
template <unsigned (*Fn)(std::size_t)>
void bench_container(benchmark::State& state) {
std::size_t n = state.range(0);
while (state.KeepRunning()) {
benchmark::DoNotOptimize(Fn(n));
}
// check!
unsigned s = (n*(n+1))/2;
assert(s==Fn(n));
}
template <typename T>
using dc_vector = std::vector<T, default_construct_adaptor<std::allocator<T>>>;
auto bench_vector = bench_container<run_accumulate<std::vector<unsigned>>>;
auto bench_dc_vector = bench_container<run_accumulate<dc_vector<unsigned>>>;
auto bench_vector_range = bench_container<run_accumulate_range_init<std::vector<unsigned>>>;
auto bench_dc_vector_range = bench_container<run_accumulate_range_init<dc_vector<unsigned>>>;
BENCHMARK(bench_vector)->Range(1<<10, 1<<20);
BENCHMARK(bench_dc_vector)->Range(1<<10, 1<<20);
BENCHMARK(bench_vector_range)->Range(1<<10, 1<<20);
BENCHMARK(bench_dc_vector_range)->Range(1<<10, 1<<20);
BENCHMARK_MAIN();
......@@ -4,6 +4,7 @@
// Keep this test as a prototype for testing, esp. when looking into binning.
#include <random>
#include <unordered_map>
#include <vector>
#include <event_queue.hpp>
......
......@@ -61,6 +61,7 @@ set(TEST_SOURCES
test_nop.cpp
test_optional.cpp
test_mechinfo.cpp
test_padded.cpp
test_partition.cpp
test_path.cpp
test_point.cpp
......
#pragma once
// Base class for scoped-instrumentation of glibc malloc.
//
// For the lifetime of a `with_instrumented_malloc` object,
// global memory allocation hooks will be set so that
// the virtual `on_malloc`, `on_realloc`, `on_memalign`
// and `on_free` calls will be invoked before the corresponding
// `malloc`, `realloc` etc. is executed.
//
// Scopes of `with_instrumented_malloc` may be nested, but:
// * Don't interleave lifetimes of these objects and expect things
// to work!
// * Don't try and create new `with_instrumented_malloc` instances
// from within an `on_malloc` callback (or others).
// * Definitely don't try and use this in a multithreaded context.
//
// Calling code should check the CAN_INSTRUMENT_MALLOC preprocessor
// symbol to see if this functionality is available.
#include <cstddef>
#include <stdexcept>
#if (__GLIBC__==2)
#include <malloc.h>
#define CAN_INSTRUMENT_MALLOC
#endif
namespace testing {
#ifdef CAN_INSTRUMENT_MALLOC
// For run-time, temporary intervention in the malloc-family calls,
// there is still no better alternative than to use the
// deprecated __malloc_hook pointers and friends.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
// Totally not thread safe!
struct with_instrumented_malloc {
with_instrumented_malloc() {
push();
}
~with_instrumented_malloc() {
pop();
}
virtual void on_malloc(std::size_t, const void*) {}
virtual void on_realloc(void*, std::size_t, const void*) {}
virtual void on_free(void*, const void*) {}
virtual void on_memalign(std::size_t, std::size_t, const void*) {}
private:
static with_instrumented_malloc*& instance() {
static with_instrumented_malloc* ptr = nullptr;
return ptr;
}
with_instrumented_malloc* prev_;
decltype(__malloc_hook) saved_malloc_hook_;
decltype(__realloc_hook) saved_realloc_hook_;
decltype(__free_hook) saved_free_hook_;
decltype(__memalign_hook) saved_memalign_hook_;
void push() {
saved_malloc_hook_ = __malloc_hook;
saved_realloc_hook_ = __realloc_hook;
saved_free_hook_ = __free_hook;
saved_memalign_hook_ = __memalign_hook;
prev_ = instance();
instance() = this;
__malloc_hook = malloc_hook;
__realloc_hook = realloc_hook;
__free_hook = free_hook;
__memalign_hook = memalign_hook;
}
void pop() {
instance() = prev_;
__malloc_hook = saved_malloc_hook_;
__realloc_hook = saved_realloc_hook_;
__free_hook = saved_free_hook_;
__memalign_hook = saved_memalign_hook_;
}
struct windback_guard {
with_instrumented_malloc* p;
windback_guard(): p(instance()) { p->pop(); }
~windback_guard() { p->push(); }
};
static void* malloc_hook(std::size_t size, const void* caller) {
windback_guard g;
g.p->on_malloc(size, caller);
return malloc(size);
}
static void* realloc_hook(void* ptr, std::size_t size, const void* caller) {
windback_guard g;
g.p->on_realloc(ptr, size, caller);
return realloc(ptr, size);
}
static void free_hook(void* ptr, const void* caller) {
windback_guard g;
g.p->on_free(ptr, caller);
free(ptr);
}
static void* memalign_hook(std::size_t alignment, std::size_t size, const void* caller) {
windback_guard g;
g.p->on_memalign(alignment, size, caller);
return memalign(alignment, size);
}
};
#pragma GCC diagnostic pop
#else
struct with_instrumented_malloc {
with_instrumented_malloc() {
throw std::runtime_error("malloc instrumentation not supported\n");
}
virtual void on_malloc(std::size_t, const void*) {}
virtual void on_realloc(void*, std::size_t, const void*) {}
virtual void on_free(void*, const void*) {}
virtual void on_memalign(std::size_t, std::size_t, const void*) {}
};
#endif // ifdef CAN_INSTRUMENT_MALLOC
} // namespace testing
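A brief usage sketch (with a hypothetical subclass, not part of this commit; the unit tests below define a similar `count_allocs` class): hooks are installed for the lifetime of the object and restored when it goes out of scope.

```c++
#include <cstddef>
#include <cstdlib>

#include "instrument_malloc.hpp"

// Hypothetical subclass recording the size of the most recent malloc call.
struct record_last_malloc: testing::with_instrumented_malloc {
    std::size_t last = 0;
    void on_malloc(std::size_t size, const void*) override { last = size; }
};

void example() {
    record_last_malloc watch;      // hooks installed here
    void* p = std::malloc(100);    // invokes watch.on_malloc(100, ...)
    std::free(p);
}                                  // hooks restored when `watch` is destroyed
```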
#include <cstddef>
#include <cstdint>
#include <vector>
#if (__GLIBC__==2)
#include <malloc.h>
#define INSTRUMENT_MALLOC
#endif
#include <util/padded_alloc.hpp>
#include "../gtest.h"
#include "common.hpp"
#include "instrument_malloc.hpp"
using arb::util::padded_allocator;
template <typename T>
using pvector = std::vector<T, padded_allocator<T>>;
// (For k a power of 2 only)
static bool is_aligned(void* p, std::size_t k) {
auto addr = reinterpret_cast<std::uintptr_t>(p);
return !(addr&(k-1));
}
TEST(padded_vector, alignment) {
padded_allocator<double> pa(1024);
pvector<double> a(101, pa);
EXPECT_EQ(1024u, a.get_allocator().alignment());
EXPECT_TRUE(is_aligned(a.data(), 1024));
}
TEST(padded_vector, allocator_constraints) {
EXPECT_THROW(padded_allocator<char>(7), std::range_error);
padded_allocator<char> pa(2); // less than sizeof(void*)
std::vector<char, padded_allocator<char>> v(7, pa);
EXPECT_TRUE(is_aligned(v.data(), sizeof(void*)));
}
TEST(padded_vector, allocator_propagation) {
padded_allocator<double> pa(1024);
pvector<double> a(101, pa);
EXPECT_EQ(pa, a.get_allocator());
pvector<double> b(101);
auto pb = b.get_allocator();
// Differing alignment => allocators compare not-equal.
EXPECT_EQ(1u, pb.alignment());
EXPECT_NE(pa, pb);
// Don't propagate on copy- or move-assignment:
b = a;
EXPECT_EQ(pb.alignment(), b.get_allocator().alignment());
EXPECT_NE(pb.alignment(), pa.alignment());
pvector<double> c;
c = std::move(a);
EXPECT_NE(c.get_allocator().alignment(), pa.alignment());
}
#ifdef INSTRUMENT_MALLOC
struct alloc_data {
unsigned n_malloc = 0;
unsigned n_realloc = 0;
unsigned n_memalign = 0;
std::size_t last_malloc = -1;
std::size_t last_realloc = -1;
std::size_t last_memalign = -1;
};
struct count_allocs: testing::with_instrumented_malloc {
alloc_data data;
void on_malloc(std::size_t size, const void*) override {
++data.n_malloc;
data.last_malloc = size;
}
void on_realloc(void*, std::size_t size, const void*) override {
++data.n_realloc;
data.last_realloc = size;
}
void on_memalign(std::size_t, std::size_t size, const void*) override {
++data.n_memalign;
data.last_memalign = size;
}
void reset() {
data = alloc_data();
}
};
TEST(padded_vector, instrumented) {
count_allocs A;
padded_allocator<double> pad256(256), pad32(32);
pvector<double> v1p256(303, pad256);
alloc_data mdata = A.data;
unsigned expected_v1_alloc = 303*sizeof(double);
expected_v1_alloc = expected_v1_alloc%256? 256*(1+expected_v1_alloc/256): expected_v1_alloc;
EXPECT_EQ(1u, mdata.n_memalign);
EXPECT_EQ(0u, mdata.n_malloc);
EXPECT_EQ(0u, mdata.n_realloc);
EXPECT_EQ(expected_v1_alloc, mdata.last_memalign);
// Move assignment: v2 has differing alignment guarantee, so cannot
// take ownership of v1's data. We expect that v2 will need to allocate.
pvector<double> v2p32(10, pad32);
A.reset();
v2p32 = std::move(v1p256);
mdata = A.data;
EXPECT_EQ(1u, mdata.n_memalign);
EXPECT_EQ(0u, mdata.n_malloc);
EXPECT_EQ(0u, mdata.n_realloc);
pvector<double> v3p256(101, pad256), v4p256(700, pad256);
A.reset();
v4p256 = v3p256; // same alignment, destination already large enough => shouldn't need to allocate
mdata = A.data;
EXPECT_EQ(0u, mdata.n_memalign);
EXPECT_EQ(0u, mdata.n_malloc);
EXPECT_EQ(0u, mdata.n_realloc);
A.reset();
pvector<double> v5p32(701, pad32);
mdata = A.data;
unsigned expected_v5_alloc = 701*sizeof(double);
expected_v5_alloc = expected_v5_alloc%32? 32*(1+expected_v5_alloc/32): expected_v5_alloc;
EXPECT_EQ(1u, mdata.n_memalign);
EXPECT_EQ(0u, mdata.n_malloc);
EXPECT_EQ(0u, mdata.n_realloc);
EXPECT_EQ(expected_v5_alloc, mdata.last_memalign);
A.reset();
v5p32 = v3p256; // different alignment, but enough space, so shouldn't reallocate.
mdata = A.data;
EXPECT_EQ(0u, mdata.n_memalign);
EXPECT_EQ(0u, mdata.n_malloc);
EXPECT_EQ(0u, mdata.n_realloc);
}
#endif // ifdef INSTRUMENT_MALLOC