From 2dff9c411d9df09ea4aa59ba3ac30de2f00d6d5f Mon Sep 17 00:00:00 2001
From: Sam Yates <yates@cscs.ch>
Date: Fri, 16 Mar 2018 16:21:05 +0100
Subject: [PATCH] SIMD wrappers for Arbor generated mechanisms. (#450)

This provides a bunch of SIMD intrinsic wrappers as a precursor to the SIMD printers.

The aim is that the SIMD printer can be agnostic regarding the particular vector architecture.

The design is based rather loosely on the proposal P0214R6 for C++ Parallelism TS 2. The transcendental function implementations are adapted from the existing SIMD architecture-specific code, which in turn is based on the Cephes library algorithms.

The custom CSS for the HTML documentation has been tweaked.
---
 CMakeLists.txt              |    3 +
 cmake/CompilerOptions.cmake |    6 +-
 doc/index.rst               |    2 +
 doc/simd_api.rst            |  950 +++++++++++++++++++++++++++++++++
 doc/simd_maths.rst          |  185 +++++++
 doc/static/custom.css       |   21 +-
 src/simd/approx.hpp         |   84 +++
 src/simd/avx.hpp            |  921 ++++++++++++++++++++++++++++++++
 src/simd/avx512.hpp         |  712 +++++++++++++++++++++++++
 src/simd/generic.hpp        |   68 +++
 src/simd/implbase.hpp       |  478 +++++++++++++++++
 src/simd/native.hpp         |   93 ++++
 src/simd/simd.hpp           |  521 ++++++++++++++++++
 src/util/debug.hpp          |   74 +++
 tests/unit/CMakeLists.txt   |    1 +
 tests/unit/common.hpp       |   64 ++-
 tests/unit/test_simd.cpp    | 1001 +++++++++++++++++++++++++++++++++++
 17 files changed, 5175 insertions(+), 9 deletions(-)
 create mode 100644 doc/simd_api.rst
 create mode 100644 doc/simd_maths.rst
 create mode 100644 src/simd/approx.hpp
 create mode 100644 src/simd/avx.hpp
 create mode 100644 src/simd/avx512.hpp
 create mode 100644 src/simd/generic.hpp
 create mode 100644 src/simd/implbase.hpp
 create mode 100644 src/simd/native.hpp
 create mode 100644 src/simd/simd.hpp
 create mode 100644 tests/unit/test_simd.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f8f4ac97..2f92d0a7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,6 +208,9 @@ endif()
 set(ARB_VECTORIZE_TARGET "none" CACHE STRING "CPU target for vectorization {KNL,AVX2,AVX512}")
 set_property(CACHE ARB_VECTORIZE_TARGET PROPERTY STRINGS none KNL AVX2 AVX512)
 
+# Note: this option conflates modcc code generation options and
+# the target architecture for compilation of Arbor. TODO: fix!
+
 if(ARB_VECTORIZE_TARGET STREQUAL "KNL")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_KNL} -DSIMD_KNL")
 elseif(ARB_VECTORIZE_TARGET STREQUAL "AVX2")
diff --git a/cmake/CompilerOptions.cmake b/cmake/CompilerOptions.cmake
index 40a019bc..8e51dddb 100644
--- a/cmake/CompilerOptions.cmake
+++ b/cmake/CompilerOptions.cmake
@@ -16,6 +16,10 @@ if(${CMAKE_CXX_COMPILER_ID} MATCHES "XL")
 endif()
 
 if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+    set(CXXOPT_KNL "-march=knl")
+    set(CXXOPT_AVX2 "-mavx2 -mfma")
+    set(CXXOPT_AVX512 "-mavx512f -mavx512cd")
+
     # Disable 'missing-braces' warning: this will inappropriately
     # flag initializations such as
     #     std::array<int,3> a={1,2,3};
@@ -36,7 +40,7 @@ if(${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
     # Compiler flags for generating KNL-specific AVX512 instructions
     # supported in gcc 4.9.x and later.
     set(CXXOPT_KNL "-march=knl")
-    set(CXXOPT_AVX2 "-mavx2")
+    set(CXXOPT_AVX2 "-mavx2 -mfma")
     set(CXXOPT_AVX512 "-mavx512f -mavx512cd")
 
     # Disable 'maybe-uninitialized' warning: this will be raised
diff --git a/doc/index.rst b/doc/index.rst
index 23852d71..661a6794 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -25,6 +25,8 @@ Arbor is a high-performance library for computational neuroscience simulations.
    :maxdepth: 1
 
    library
+   simd_api
+   simd_maths
    profiler
    sampling_api
 
diff --git a/doc/simd_api.rst b/doc/simd_api.rst
new file mode 100644
index 00000000..be33e7a0
--- /dev/null
+++ b/doc/simd_api.rst
@@ -0,0 +1,950 @@
+SIMD classes for Arbor
+======================
+
+The purpose of the SIMD classes is to abstract and consolidate the use of
+compiler intrinsics for the manipulation of architecture-specific vector
+(SIMD) values.
+
+The implementation is rather loosely based on the data-parallel vector types
+proposal `P0214R6 for the C++ Parallelism TS 2 <http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0214r6.pdf>`_.
+
+Unless otherwise specified, all classes, namespaces and top-level functions
+described below are within the top-level ``arb::simd`` namespace.
+
+.. rubric:: Example usage
+
+The following code performs an element-wise vector product, storing
+only non-zero values in the resultant array.
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        #include <simd/simd.hpp>
+        using namespace arb::simd;
+
+        void product_nonzero(int n, const double* a, const double* b, double* result) {
+            constexpr int N = simd_abi::native_width<double>::value;
+            using simd = simd<double, N>;
+            using mask = simd::simd_mask;
+
+            int i = 0;
+            for (; i+N<=n; i+=N) {
+                auto vp = simd(a+i)*simd(b+i);
+                where(vp!=0, vp).copy_to(result+i);
+            }
+
+            int tail = n-i;
+            auto m = mask::unpack((1<<tail)-1);
+
+            auto vp = simd(a+i, m)*simd(b+i, m);
+            where(m && vp!=0, vp).copy_to(result+i);
+        }
+
+
+Classes
+-------
+
+Three user-facing template classes are provided:
+
+1. ``simd<V, N, I = simd_abi::default_abi>``
+
+   *N*-wide vector type of values of type *V*, using architecture-specific
+   implementation *I*. The implementation parameter is itself a template,
+   acting as a type-map, with ``I<V, N>::type`` being the concrete implementation
+   class (see below) for *N*-wide vectors of type *V* for this architecture.
+
+   The implementation ``simd_abi::generic`` provides a ``std::array``-backed
+   implementation for arbitrary *V* and *N*, while ``simd_abi::native``
+   maps to the native architecture implementation for *V* and *N*, if
+   one is available for the target architecture.
+
+   ``simd_abi::default_abi`` will use ``simd_abi::native`` if available, or
+   else fall back to the generic implementation.
+
+2. ``simd_mask<V, N, I = simd_abi::default_abi>``
+
+   The result of performing a lane-wise comparison/test operation on
+   a ``simd<V, N, I>`` vector value. ``simd_mask`` objects support logical
+   operations and are used as arguments to ``where`` expressions.
+
+   ``simd_mask<V, N, I>`` is a type alias for ``simd<V, N, I>::simd_mask``.
+
+3. ``where_expression<simd<V, N, I>>``
+
+   The result of a ``where`` expression, used for masked assignment.
+
+Implementation typemaps live in the ``simd_abi`` namespace, while concrete
+implementation classes live in ``simd_detail``. A particular specialization
+for an architecture, for example 4-wide double on AVX, then requires:
+
+*  A concrete implementation class, e.g. ``simd_detail::avx_double4``.
+
+*  A specialization of its ABI map, so that ``simd_abi::avx<double, 4>::type``
+   is an alias for ``simd_detail::avx_double4``.
+
+*  A specialization of the native ABI map, so that
+   ``simd_abi::native<double, 4>::type`` is an alias for ``simd_abi::avx<double, 4>::type``.
+
+The maximum natively supported width for a scalar type *V* is recorded in
+``simd_abi::native_width<V>::value``.
+
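+As an illustration only (the names are hypothetical, not part of the library),
+wiring a new architecture ``myarch`` with a 4-wide double implementation
+``simd_detail::myarch_double4`` into these maps would comprise specializations
+along the following lines:
+
+.. container:: api-code
+
+    .. code-block:: cpp
+
+        namespace arb { namespace simd {
+        namespace simd_abi {
+
+            // ABI typemap for the (hypothetical) new architecture.
+            // simd_detail::myarch_double4 is its concrete implementation
+            // class, defined elsewhere.
+            template <typename V, unsigned N> struct myarch;
+
+            template <>
+            struct myarch<double, 4> { using type = simd_detail::myarch_double4; };
+
+            // Nominate it as the native choice for 4-wide double.
+            template <>
+            struct native<double, 4> { using type = myarch<double, 4>::type; };
+
+        } // namespace simd_abi
+        }} // namespace arb::simd
+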
+Class ``simd``
+^^^^^^^^^^^^^^
+
+The class ``simd<V, N, I>`` is an alias for ``simd_detail::simd_impl<I<V, N>::type>``;
+the class ``simd_detail::simd_impl<C>`` provides the public interface and
+arithmetic operators for a concrete implementation class *C*.
+
+In the following:
+
+* *S* stands for the class ``simd<V, N, I>``.
+* *s* is a SIMD value of type *S*.
+* *m* is a mask value of type ``S::simd_mask``.
+* *t*, *u* and *v* are const objects of type *S*.
+* *i* is an index of type ``int``.
+* *j* is a const object of type ``simd<U, N, J>`` where *U* is an integral type.
+* *x* is a value of type *V*.
+* *p* is a pointer to *V*.
+* *c* is a const pointer to *V* or a length *N* array of *V*.
+
+Here and below, the value in lane *i* of a SIMD vector or mask *v* is denoted by
+*v*\ `i`:sub:.
+
+
+.. rubric:: Type aliases and constexpr members
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Name
+      - Type
+      - Description
+
+    * - ``S::scalar_type``
+      - *V*
+      - The type of one lane of the SIMD type.
+
+    * - ``S::simd_mask``
+      - ``simd_mask<V, N, I>``
+      - The ``simd_mask`` specialization resulting from comparisons of *S* SIMD values.
+
+    * - ``S::width``
+      - ``unsigned``
+      - The SIMD width *N*.
+
+.. rubric:: Constructors
+
+.. list-table::
+    :widths: 20 80
+    :header-rows: 1
+
+    * - Expression
+      - Description
+
+    * - ``S(x)``
+      - A SIMD value *v* with *v*\ `i`:sub: equal to *x* for *i* = 0…*N*-1.
+
+    * - ``S(t)``
+      - A copy of the SIMD value *t*.
+
+    * - ``S(c)``
+      - A SIMD value *v* with *v*\ `i`:sub: equal to ``c[i]`` for *i* = 0…*N*-1.
+
+    * - ``S(c, m)``
+      - A SIMD value *v* with *v*\ `i`:sub: equal to ``c[i]`` for *i* where *m*\ `i`:sub: is true.
+
+.. rubric:: Member functions
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``t.copy_to(p)``
+      - ``void``
+      - Set ``p[i]`` to *t*\ `i`:sub: for *i* = 0…*N*-1.
+
+    * - ``t.scatter(p, j)``
+      - ``void``
+      - Set ``p[j[i]]`` to *t*\ `i`:sub: for *i* = 0…*N*-1.
+
+    * - ``s.copy_from(c)``
+      - ``void``
+      - Set *s*\ `i`:sub: to ``c[i]`` for *i* = 0…*N*-1.
+
+    * - ``s.gather(c, j)``
+      - ``void``
+      - Set *s*\ `i`:sub: to ``c[j[i]]`` for *i* = 0…*N*-1.
+
+.. rubric:: Expressions
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``t+u``
+      - ``S``
+      - Lane-wise sum.
+
+    * - ``t-u``
+      - ``S``
+      - Lane-wise difference.
+
+    * - ``t*u``
+      - ``S``
+      - Lane-wise product.
+
+    * - ``t/u``
+      - ``S``
+      - Lane-wise quotient.
+
+    * - ``fma(t, u, v)``
+      - ``S``
+      - Lane-wise FMA *t* * *u* + *v*.
+
+    * - ``s<t``
+      - ``S::simd_mask``
+      - Lane-wise less-than comparison.
+
+    * - ``s<=t``
+      - ``S::simd_mask``
+      - Lane-wise less-than-or-equals comparison.
+
+    * - ``s>t``
+      - ``S::simd_mask``
+      - Lane-wise greater-than comparison.
+
+    * - ``s>=t``
+      - ``S::simd_mask``
+      - Lane-wise greater-than-or-equals comparison.
+
+    * - ``s==t``
+      - ``S::simd_mask``
+      - Lane-wise equality test.
+
+    * - ``s!=t``
+      - ``S::simd_mask``
+      - Lane-wise inequality test.
+
+    * - ``s=t``
+      - ``S&``
+      - Lane-wise assignment.
+
+    * - ``s+=t``
+      - ``S&``
+      - Equivalent to ``s=s+t``.
+
+    * - ``s-=t``
+      - ``S&``
+      - Equivalent to ``s=s-t``.
+
+    * - ``s*=t``
+      - ``S&``
+      - Equivalent to ``s=s*t``.
+
+    * - ``s/=t``
+      - ``S&``
+      - Equivalent to ``s=s/t``.
+
+    * - ``s=x``
+      - ``S&``
+      - Equivalent to ``s=S(x)``.
+
+    * - ``t[i]``
+      - ``V``
+      - Value *t*\ `i`:sub:
+
+    * - ``s[i]=x``
+      - ``S::reference``
+      - Set value *s*\ `i`:sub: to *x*.
+
+The (non-const) index operator ``operator[]`` returns a proxy object of type ``S::reference``,
+which writes the corresponding lane in the SIMD value on assignment, and has an
+implicit conversion to ``scalar_type``.
+
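+A short usage sketch of these operations (assuming the include and
+``using namespace arb::simd;`` from the example at the top of this page;
+the lane values in the comments are purely illustrative):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        using simd4 = simd<double, 4>;
+
+        double data[4] = {1., 2., 3., 4.};
+        simd4 a(data);                 // lane-wise load
+        simd4 b(0.5);                  // broadcast scalar
+
+        simd4 c = a*b + simd4(1.);     // lane-wise arithmetic: {1.5, 2, 2.5, 3}
+        c[0] = 10.;                    // assign lane 0 through the reference proxy
+        double x = c[1];               // read lane 1 (x == 2)
+
+        double out[4];
+        c.copy_to(out);                // out = {10, 2, 2.5, 3}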
+
+Class ``simd_mask``
+^^^^^^^^^^^^^^^^^^^
+
+``simd_mask<V, N, I>`` is an alias for ``simd<V, N, I>::simd_mask``, which in turn
+will be an alias for a class ``simd_detail::simd_mask_impl<D>``, where *D* is
+a concrete implementation class for the SIMD mask representation. ``simd_mask_impl<D>``
+inherits from, and is implemented in terms of, ``simd_detail::simd_impl<D>``,
+but note that the concrete implementation class *D* may or may not be the same
+as the concrete implementation class ``I<V, N>::type`` used by ``simd<V, N, I>``.
+
+Mask values are read and written as ``bool`` values of 0 or 1, which may
+differ from the internal representation in each lane of the SIMD implementation.
+
+In the following:
+
+* *M* stands for the class ``simd_mask<V, N, I>``.
+* *m* and *q* are const objects of type ``simd_mask<V, N, I>``.
+* *u* is an object of type ``simd_mask<V, N, I>``.
+* *b* is a boolean value.
+* *w* is a pointer to ``bool``.
+* *y* is a const pointer to ``bool`` or a length *N* array of ``bool``.
+* *i* is of type ``int``.
+* *k* is of type ``unsigned long long``.
+
+.. rubric:: Constructors
+
+.. list-table::
+    :widths: 20 80
+    :header-rows: 1
+
+    * - Expression
+      - Description
+
+    * - ``M(b)``
+      - A SIMD mask *u* with *u*\ `i`:sub: equal to *b* for *i* = 0…*N*-1.
+
+    * - ``M(m)``
+      - A copy of the SIMD mask *m*.
+
+    * - ``M(y)``
+      - A SIMD mask *u* with *u*\ `i`:sub: equal to ``y[i]`` for *i* = 0…*N*-1.
+
+Note that ``simd_mask`` does not (currently) offer a masked pointer/array constructor.
+
+.. rubric:: Member functions
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``m.copy_to(w)``
+      - ``void``
+      - Write the boolean value *m*\ `i`:sub: to ``w[i]`` for *i* = 0…*N*-1.
+
+    * - ``u.copy_from(y)``
+      - ``void``
+      - Set *u*\ `i`:sub: to the boolean value ``y[i]`` for *i* = 0…*N*-1.
+
+.. rubric:: Expressions
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``!m``
+      - ``M``
+      - Lane-wise negation.
+
+    * - ``m&&q``
+      - ``M``
+      - Lane-wise logical and.
+
+    * - ``m||q``
+      - ``M``
+      - Lane-wise logical or.
+
+    * - ``m==q``
+      - ``M``
+      - Lane-wise equality (equivalent to ``m!=!q``).
+
+    * - ``m!=q``
+      - ``M``
+      - Lane-wise logical xor.
+
+    * - ``m=q``
+      - ``M&``
+      - Lane-wise assignment.
+
+    * - ``m[i]``
+      - ``bool``
+      - Boolean value *m*\ `i`:sub:.
+
+    * - ``m[i]=b``
+      - ``M::reference``
+      - Set *m*\ `i`:sub: to boolean value *b*.
+
+.. rubric:: Static member functions
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``M::unpack(k)``
+      - ``M``
+      - Mask with value *m*\ `i`:sub: equal to the *i*\ th bit of *k*.
+
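+A short sketch of mask construction and use (the include and
+``using namespace arb::simd;`` from the first example are assumed;
+lane values in the comments are illustrative):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        using simd4 = simd<double, 4>;
+        using mask4 = simd4::simd_mask;
+
+        mask4 m = mask4::unpack(0x5);  // lanes 0 and 2 true
+        mask4 q(false);
+        q[3] = true;                   // set lane 3 through the reference proxy
+
+        bool out[4];
+        (m || q).copy_to(out);         // out = {1, 0, 1, 1}
+        bool b2 = m[2];                // true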
+
+Class ``where_expression``
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``where_expression<S>`` represents a masked subset of the lanes
+of a SIMD value of type ``S``, used for conditional assignment,
+masked scatter, and masked gather. It is a type alias for
+``S::where_expression``, and is the result of an expression of the
+form ``where(mask, simdvalue)``.
+
+In the following:
+
+* *W* stands for the class ``where_expression<simd<V, N, I>>``.
+* *s* is a reference to a SIMD value of type ``simd<V, N, I>&``.
+* *t* is a SIMD value of type ``simd<V, N, I>``.
+* *m* is a mask of type ``simd<V, N, I>::simd_mask``.
+* *j* is a const object of type ``simd<U, N, J>`` where *U* is an integral type.
+* *x* is a scalar of type *V*.
+* *p* is a pointer to *V*.
+* *c* is a const pointer to *V* or a length *N* array of *V*.
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``where(m, s)``
+      - ``W``
+      - A proxy for masked-assignment operations.
+
+    * - ``where(m, s)=t``
+      - ``void``
+      - Set *s*\ `i`:sub: to *t*\ `i`:sub: for *i* where *m*\ `i`:sub: is true.
+
+    * - ``where(m, s)=x``
+      - ``void``
+      - Set *s*\ `i`:sub: to *x* for *i* where *m*\ `i`:sub: is true.
+
+    * - ``where(m, s).copy_to(p)``
+      - ``void``
+      - Set ``p[i]`` to *s*\ `i`:sub: for *i* where *m*\ `i`:sub: is true.
+
+    * - ``where(m, s).scatter(p, j)``
+      - ``void``
+      - Set ``p[j[i]]`` to *s*\ `i`:sub: for *i* where *m*\ `i`:sub: is true.
+
+    * - ``where(m, s).copy_from(c)``
+      - ``void``
+      - Set *s*\ `i`:sub: to ``c[i]`` for *i* where *m*\ `i`:sub: is true.
+
+    * - ``where(m, s).gather(c, j)``
+      - ``void``
+      - Set *s*\ `i`:sub: to ``c[j[i]]`` for *i* where *m*\ `i`:sub: is true.
+
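+A sketch of masked gather, assignment and scatter through ``where``
+expressions (data and index values are illustrative; the include and
+``using namespace arb::simd;`` from the first example are assumed):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        using simd4 = simd<double, 4>;
+        using int4  = simd<int, 4>;
+        using mask4 = simd4::simd_mask;
+
+        double table[8] = {0, 10, 20, 30, 40, 50, 60, 70};
+        int offsets[4]  = {1, 3, 5, 7};
+
+        simd4 v(-1.);
+        int4 idx(offsets);
+        mask4 m = mask4::unpack(0x7);         // lanes 0-2 only
+
+        where(m, v).gather(table, idx);       // v = {10, 30, 50, -1}
+        where(v > simd4(20.), v) = simd4(0.); // v = {10, 0, 0, -1}
+
+        double out[8] = {0};
+        where(m, v).scatter(out, idx);        // out[1] = 10, out[3] = 0, out[5] = 0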
+
+Top-level functions
+-------------------
+
+Lane-wise mathematical operations ``abs(x)``, ``min(x, y)`` and ``max(x, y)`` are offered for
+all SIMD value types, while the transcendental functions are only usable for
+SIMD floating point types.
+
+Vectorized implementations of some of the transcendental functions are provided:
+refer to :doc:`simd_maths` for details.
+
+
+In the following:
+
+* *A* is a SIMD class ``simd<K, N, I>`` for some scalar type *K*.
+* *S* is a SIMD class ``simd<V, N, I>`` for a floating point type *V*.
+* *a* and *b* are values of type *A*.
+* *s* and *t* are values of type *S*.
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``abs(a)``
+      - *A*
+      - Lane-wise absolute value of *a*.
+
+    * - ``min(a, b)``
+      - *A*
+      - Lane-wise minimum of *a* and *b*.
+
+    * - ``max(a, b)``
+      - *A*
+      - Lane-wise maximum of *a* and *b*.
+
+    * - ``sin(s)``
+      - *S*
+      - Lane-wise sine of *s*.
+
+    * - ``cos(s)``
+      - *S*
+      - Lane-wise cosine of *s*.
+
+    * - ``log(s)``
+      - *S*
+      - Lane-wise natural logarithm of *s*.
+
+    * - ``exp(s)``
+      - *S*
+      - Lane-wise exponential of *s*.
+
+    * - ``expm1(s)``
+      - *S*
+      - Lane-wise :math:`x \mapsto e^x - 1`.
+
+    * - ``exprelr(s)``
+      - *S*
+      - Lane-wise :math:`x \mapsto x / (e^x - 1)`.
+
+    * - ``pow(s, t)``
+      - *S*
+      - Lane-wise raise *s* to the power of *t*.
+
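+As an illustration (a hypothetical rate expression, not part of the API;
+the include and ``using namespace arb::simd;`` from the first example are
+assumed), a term of the form :math:`0.1(v+40)/(1 - e^{-(v+40)/10})` can be
+evaluated lane-wise with ``exprelr``, avoiding the removable singularity
+at :math:`v = -40`:
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        using simd4 = simd<double, 4>;
+
+        simd4 alpha(const simd4& v) {
+            // 0.1*(v+40)/(1 - exp(-(v+40)/10)) == exprelr(-u) with u = 0.1*(v+40).
+            simd4 u = (v + simd4(40.))*simd4(0.1);
+            return exprelr(simd4(0.) - u);
+        }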
+
+Implementation requirements
+---------------------------
+
+Each specific architecture is represented by a templated class *I*, with
+``I<V, N>::type`` being the concrete implementation for an *N*-wide
+SIMD value with ``scalar_type`` *V*.
+
+A concrete implementation class *C* inherits from ``simd_detail::implbase<C>``,
+which provides (via CRTP) generic implementations of most of the SIMD
+functionality. The base class ``implbase<C>`` in turn relies upon
+``simd_detail::simd_traits<C>`` to look up the SIMD width, and associated types.
+
+All the required SIMD operations are given by static member functions of *C*.
+
+Minimal implementation
+^^^^^^^^^^^^^^^^^^^^^^
+
+In the following, let *C* be the concrete implementation class for an
+*N*-wide vector of ``scalar_type`` *V*, with low-level representation
+``arch_vector``.
+
+The specialization of ``simd_detail::simd_traits<C>`` then exposes these
+types and values, and also provides the concrete implementation class *M*
+for masks associated with *C*:
+
+.. container:: api-code
+
+    .. code-block:: cpp
+
+        template <>
+        struct simd_traits<C> {
+            static constexpr unsigned width = N;
+            using scalar_type = V;
+            using vector_type = arch_vector;
+            using mask_impl = M;
+        };
+
+
+The mask implementation class *M* may or may not be the same as *C*.
+For example, ``simd_detail::avx_double4`` provides both the arithmetic operations and mask
+operations for an AVX 4 × double SIMD vector, while the mask
+implementation for ``simd_detail::avx512_double8`` is ``simd_detail::avx512_mask8``.
+
+The concrete implementation class must provide at minimum implementations
+of ``copy_to`` and ``copy_from`` (see the section below for semantics):
+
+.. container:: api-code
+
+    .. code-block:: cpp
+
+        struct C: implbase<C> {
+            static void copy_to(const arch_vector&, V*);
+            static arch_vector copy_from(const V*);
+        };
+
+If the implementation is also acting as a mask implementation, it must also
+provide ``mask_copy_to``, ``mask_copy_from``, ``mask_element`` and
+``mask_set_element``:
+
+.. container:: api-code
+
+    .. code-block:: cpp
+
+        struct C: implbase<C> {
+            static void copy_to(const arch_vector&, V*);
+            static arch_vector copy_from(const V*);
+
+            static void mask_copy_to(const arch_vector& v, bool* w);
+            static arch_vector mask_copy_from(const bool* y);
+            static bool mask_element(const arch_vector& v, int i);
+            static void mask_set_element(arch_vector& v, int i, bool x);
+        };
+
+The class ``simd_detail::generic<T, N>`` provides an example of a minimal
+implementation, using ``std::array<T, N>`` as its ``arch_vector`` type.
+
+
+Concrete implementation API
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In the following, *C* represents the concrete implementation class for
+a SIMD class of width *N* and value type *V*.
+
+* *u*, *v*, and *w* are values of type ``C::vector_type``.
+* *r* is a reference of type ``C::vector_type&``.
+* *x* is a value of type ``C::scalar_type``.
+* *c* is a const pointer of type ``const C::scalar_type*``.
+* *p* is a pointer of type ``C::scalar_type*``.
+* *j* is a SIMD index representation of type ``J::vector_type`` for
+  an integral concrete implementation class *J*.
+* *b* is a ``bool`` value.
+* *q* is a pointer to ``bool``.
+* *y* is a const pointer to ``bool``.
+* *i* is an unsigned (index) value.
+* *k* is an unsigned long long value.
+* *m* is a mask representation of type ``C::mask_type``.
+
+.. rubric:: Types and constants
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Name
+      - Type
+      - Description
+
+    * - ``C::vector_type``
+      - ``simd_traits<C>::vector_type``
+      - Underlying SIMD representation type.
+
+    * - ``C::scalar_type``
+      - ``simd_traits<C>::scalar_type``
+      - Should be convertible to/from *V*.
+
+    * - ``C::mask_impl``
+      - ``simd_traits<C>::mask_impl``
+      - Concrete implementation class for mask SIMD type.
+
+    * - ``C::mask_type``
+      - ``C::mask_impl::vector_type``
+      - Underlying SIMD representation for masks.
+
+    * - ``C::width``
+      - ``unsigned``
+      - The SIMD width *N*.
+
+.. rubric:: Initialization, load, store
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::broadcast(x)``
+      - ``C::vector_type``
+      - Fill representation with scalar *x*.
+
+    * - ``C::copy_to(v, p)``
+      - ``void``
+      - Store values *v*\ `i`:sub: to *p+i*. *p* may be unaligned.
+
+    * - ``C::copy_to_masked(v, p, m)``
+      - ``void``
+      - Store values *v*\ `i`:sub: to *p+i* wherever *m*\ `i`:sub: is true. *p* may be unaligned.
+
+    * - ``C::copy_from(c)``
+      - ``C::vector_type``
+      - Return a vector with values *v*\ `i`:sub: loaded from *c+i*. *c* may be unaligned.
+
+    * - ``C::copy_from_masked(c, m)``
+      - ``C::vector_type``
+      - Return a vector with values *v*\ `i`:sub: loaded from *c+i* wherever *m*\ `i`:sub: is true. *c* may be unaligned.
+
+    * - ``C::copy_from_masked(w, c, m)``
+      - ``C::vector_type``
+      - Return a vector with values *v*\ `i`:sub: loaded from *c+i* wherever *m*\ `i`:sub: is true, or equal to *w*\ `i`:sub:
+        otherwise. *c* may be unaligned.
+
+.. rubric:: Lane access
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::element(v, i)``
+      - ``C::scalar_type``
+      - Value in lane *i* of *v*.
+
+    * - ``C::set_element(r, i, x)``
+      - ``void``
+      - Set value in lane *i* of *r* to *x*.
+
+.. rubric:: Gather and scatter
+
+The offsets for gather and scatter operations are given
+by a vector type ``J::vector_type`` for some possibly
+different concrete implementation class *J*, and the
+static methods implementing gather and scatter are templated
+on this class.
+
+Implementations can provide optimized versions for specific
+index classes *J*; this process would be simplified with
+more support for casts between SIMD types and their concrete
+implementations, functionality which is not yet provided.
+
+The first argument to these functions is a dummy argument
+of type *J*, used only to disambiguate overloads.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 20 20 60
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::gather(J{}, p, j)``
+      - ``C::vector_type``
+      - Vector *v* with values *v*\ `i`:sub: = ``p[j[i]]``.
+
+    * - ``C::gather(J{}, u, p, j, m)``
+      - ``C::vector_type``
+      - Vector *v* with values *v*\ `i`:sub: = *m*\ `i`:sub: ? ``p[j[i]]`` : *u*\ `i`:sub:.
+
+    * - ``C::scatter(J{}, u, p, j)``
+      - ``void``
+      - Write values *u*\ `i`:sub: to ``p[j[i]]``.
+
+    * - ``C::scatter(J{}, u, p, j, m)``
+      - ``void``
+      - Write values *u*\ `i`:sub: to ``p[j[i]]`` for lanes *i* where *m*\ `i`:sub: is true.
+
+.. rubric:: Arithmetic operations
+
+.. list-table::
+    :header-rows: 1
+    :widths: 20 20 60
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::negate(v)``
+      - ``C::vector_type``
+      - Lane-wise unary minus.
+
+    * - ``C::mul(u, v)``
+      - ``C::vector_type``
+      - Lane-wise multiplication.
+
+    * - ``C::add(u, v)``
+      - ``C::vector_type``
+      - Lane-wise addition.
+
+    * - ``C::sub(u, v)``
+      - ``C::vector_type``
+      - Lane-wise subtraction.
+
+    * - ``C::div(u, v)``
+      - ``C::vector_type``
+      - Lane-wise division.
+
+    * - ``C::fma(u, v, w)``
+      - ``C::vector_type``
+      - Lane-wise fused multiply-add (u*v+w).
+
+.. rubric:: Comparison and blends
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::cmp_eq(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* = *v*.
+
+    * - ``C::cmp_neq(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* ≠ *v*.
+
+    * - ``C::cmp_gt(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* > *v*.
+
+    * - ``C::cmp_geq(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* ≥ *v*.
+
+    * - ``C::cmp_lt(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* < *v*.
+
+    * - ``C::cmp_leq(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* ≤ *v*.
+
+    * - ``C::ifelse(m, u, v)``
+      - ``C::vector_type``
+      - Vector *w* with values *w*\ `i`:sub: = *m*\ `i`:sub: ? *u*\ `i`:sub: : *v*\ `i`:sub:.
+
+.. rubric:: Mathematical function support
+
+With the exception of ``abs``, ``min`` and ``max``, these are only
+required for floating point vector implementations.
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::abs(v)``
+      - ``C::vector_type``
+      - Lane-wise absolute value.
+
+    * - ``C::min(u, v)``
+      - ``C::vector_type``
+      - Lane-wise minimum.
+
+    * - ``C::max(u, v)``
+      - ``C::vector_type``
+      - Lane-wise maximum.
+
+    * - ``C::sin(v)``
+      - ``C::vector_type``
+      - Lane-wise sine.
+
+    * - ``C::cos(v)``
+      - ``C::vector_type``
+      - Lane-wise cosine.
+
+    * - ``C::log(v)``
+      - ``C::vector_type``
+      - Lane-wise natural logarithm.
+
+    * - ``C::exp(v)``
+      - ``C::vector_type``
+      - Lane-wise exponential.
+
+    * - ``C::expm1(v)``
+      - ``C::vector_type``
+      - Lane-wise :math:`x \mapsto e^x -1`.
+
+    * - ``C::exprelr(v)``
+      - ``C::vector_type``
+      - Lane-wise :math:`x \mapsto x/(e^x -1)`.
+
+    * - ``C::pow(u, v)``
+      - ``C::vector_type``
+      - Lane-wise *u* raised to the power of *v*.
+
+.. rubric:: Mask value support
+
+Mask operations are only required if *C* constitutes the implementation of a
+SIMD mask class.
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::mask_broadcast(b)``
+      - ``C::vector_type``
+      - Fill mask representation with bool *b*.
+
+    * - ``C::mask_element(v, i)``
+      - ``bool``
+      - Mask value *v*\ `i`:sub:.
+
+    * - ``C::mask_set_element(u, i, b)``
+      - ``void``
+      - Set mask value *u*\ `i`:sub: to *b*.
+
+    * - ``C::mask_copy_to(v, q)``
+      - ``void``
+      - Write bool values to memory (unaligned).
+
+    * - ``C::mask_copy_from(y)``
+      - ``C::vector_type``
+      - Load bool values from memory (unaligned).
+
+    * - ``C::mask_unpack(k)``
+      - ``C::vector_type``
+      - Return vector *v* with boolean value *v*\ `i`:sub: equal
+        to the *i*\ th bit of *k*.
+
+.. rubric:: Logical operations
+
+Logical operations are only required if *C* constitutes the implementation of a
+SIMD mask class.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 20 20 60
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::logical_not(u)``
+      - ``C::vector_type``
+      - Lane-wise negation.
+
+    * - ``C::logical_and(u, v)``
+      - ``C::vector_type``
+      - Lane-wise logical and.
+
+    * - ``C::logical_or(u, v)``
+      - ``C::vector_type``
+      - Lane-wise logical or.
+
+    * - ``C::select(m, u, v)``
+      - ``C::vector_type``
+      - Lane-wise *m*\ `i`:sub: ? *v*\ `i`:sub: : *u*\ `i`:sub:.
+
+
+Missing functionality
+---------------------
+
+There is no support yet for the following features, although some of these
+will need to be provided in order to improve the efficiency of SIMD versions
+of our generated mechanisms.
+
+* A SIMD cast function, e.g. ``simd_cast<S>(const T&)`` that converts between
+  different SIMD wrappers of the same width. The infrastructure that supports
+  this in concrete implementation classes would also simplify the implementation
+  of more generic ``gather`` and ``scatter`` methods.
+
+* Horizontal reductions across the lanes of a SIMD value or where-expression.
+
+* Vectorizable implementations of trigonometric functions.
+
+* Compound assignment operations for where-expressions. Extending the concrete
+  implementation API to support this would allow, for example, efficient use
+  of AVX512 masked arithmetic instructions.
+
diff --git a/doc/simd_maths.rst b/doc/simd_maths.rst
new file mode 100644
index 00000000..b93b07e6
--- /dev/null
+++ b/doc/simd_maths.rst
@@ -0,0 +1,185 @@
+Implementation of vector transcendental functions
+=================================================
+
+When building with the Intel C++ compiler, transcendental
+functions on SIMD values in ``simd<double, 8, simd_abi::avx512>``
+wrap calls to the Intel short vector math library (SVML).
+
+Outside of this case, the functions *exp*, *log*, *expm1* and
+*exprelr* use explicit approximations as detailed below. The
+algorithms follow those used in the
+`Cephes library <http://www.netlib.org/cephes/>`_, with
+some accommodations.
+
+.. default-role:: math
+
+Exponentials
+------------
+
+`\operatorname{exp}(x)`
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The exponential is computed as
+
+.. math::
+
+    e^x = 2^n · e^g,
+
+with `|g| ≤ 0.5` and `n` an integer. The power of two
+is computed via direct manipulation of the exponent bits of the floating
+point representation, while `e^g` is approximated by a rational polynomial.
+
+`n` and `g` are computed by:
+
+.. math::
+
+    n &= \left\lfloor \frac{x}{\log 2} + 0.5 \right\rfloor
+
+    g &= x - n·\log 2
+
+where the subtraction in the calculation of `g` is performed in two stages,
+to limit cancellation error:
+
+.. math::
+
+    g &\leftarrow \operatorname{fl}(x - n · c_1)
+
+    g &\leftarrow \operatorname{fl}(g - n · c_2)
+
+where `c_1+c_2 = \log 2`, with `c_1` comprising the first 32 bits of the mantissa.
+(In principle `c_1` might contain more bits of the logarithm, but this
+particular decomposition matches that used in the Cephes library.) This
+decomposition gives `|g|\leq \frac{1}{2}\log 2\approx 0.347`.
+
+The rational approximation for `e^g` is of the form
+
+.. math::
+
+    e^g \approx \frac{R(g)}{R(-g)}
+
+where `R(g)` is a polynomial of order 6. The coefficients are again those
+used by Cephes, and probably are derived via a Remez algorithm.
+`R(g)` is decomposed into even and odd terms
+
+.. math::
+
+    R(g) = Q(g^2) + gP(g^2)
+
+so that the ratio can be calculated by:
+
+.. math::
+
+    e^g \approx 1 + \frac{2gP(g^2)}{Q(g^2)-gP(g^2)}.
+
+Randomized testing indicates the approximation is accurate to 2 ulp.
+
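+The computation can be summarised by the following scalar sketch
+(handling of overflow, underflow and NaN omitted; the constants are those
+defined in ``src/simd/approx.hpp``):
+
+.. code-block:: cpp
+
+    #include <cmath>
+    #include <simd/approx.hpp>
+
+    double exp_approx(double x) {
+        using namespace arb::simd_detail;
+
+        double n = std::floor(ln2inv*x + 0.5);
+
+        // g = x - n·log 2, subtracted in two stages to limit cancellation error.
+        double g = x - n*ln2C1;
+        g -= n*ln2C2;
+        double gg = g*g;
+
+        // Odd and even parts of R(g): g·P(g²) and Q(g²).
+        double odd  = g*(P0exp + gg*(P1exp + gg*P2exp));
+        double even = Q0exp + gg*(Q1exp + gg*(Q2exp + gg*Q3exp));
+
+        // e^g ≈ 1 + 2·g·P(g²)/(Q(g²) - g·P(g²)), scaled by 2^n.
+        return std::ldexp(1. + 2.*odd/(even - odd), static_cast<int>(n));
+    }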
+
+`\operatorname{expm1}(x)`
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A similar decomposition of `x = g + n·\log 2` is performed so that
+`|g|≤0.5`, with the exception that `n` is always taken to
+be zero for `|x|≤0.5`, i.e.
+
+.. math::
+
+    n = \begin{cases}
+          0&\text{if $|x|≤0.5$,}\\
+          \left\lfloor \frac{x}{\log 2} + 0.5 \right\rfloor
+          &\text{otherwise.}
+        \end{cases}
+
+
+`\operatorname{expm1}(x)` is then computed as
+
+.. math::
+
+    e^x - 1 = 2^n·(e^g - 1)+(2^n-1),
+
+and the same rational polynomial is used to approximate `e^g-1`,
+
+.. math::
+
+    e^g - 1 \approx \frac{2gP(g^2)}{Q(g^2)-gP(g^2)}.
+
+The scaling step for `n≠0` is in practice calculated as
+
+.. math::
+
+    e^x - 1 = 2·(2^{n-1}·(e^g - 1)+(2^{n-1}-0.5)),
+
+in order to avoid overflow at the upper end of the range.
+
+The order 6 rational polynomial approximation for small `x`
+is insufficiently accurate to maintain 1 ulp accuracy; randomized
+testing indicates a maximum error of up to 3 ulp.
+
+
+`\operatorname{exprelr}(x)`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The function is defined as
+
+.. math::
+
+    \operatorname{exprelr}(x) = x/(e^x-1),
+
+and is the reciprocal of the relative exponential function,
+
+.. math::
+
+    \operatorname{exprel}(x) &= {}_1F_1(1; 2; x)\\
+                             &= \frac{e^x-1}{x}.
+
+This is computed in terms of expm1 by:
+
+.. math::
+
+    \operatorname{exprelr}(x) :=
+      \begin{cases}
+          1&\text{if $\operatorname{fl}(1+x) = 1$,}\\
+          x/\operatorname{expm1}(x)&\text{otherwise.}
+      \end{cases}
+
+With the approximation for `\operatorname{expm1}` used above,
+randomized testing demonstrates a maximum error on the order
+of 4 ulp.
+
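+A scalar rendering of this guard, using ``std::expm1`` in place of the
+vectorized approximation:
+
+.. code-block:: cpp
+
+    #include <cmath>
+
+    double exprelr_approx(double x) {
+        // If 1+x rounds to 1, x/expm1(x) is 1 to working precision.
+        return 1.0 + x == 1.0 ? 1.0 : x/std::expm1(x);
+    }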
+
+Logarithms
+----------
+
+The natural logarithm is computed as
+
+.. math::
+
+    \log x = \log u + n·\log 2
+
+where `n` is an integer and `u` is in the interval
+`[ \frac{1}{2}\sqrt 2, \sqrt 2]`. The logarithm of
+`u` is then approximated by the rational polynomial
+used in the Cephes implementation,
+
+.. math::
+
+    \log u &\approx R(u-1)
+
+    R(z) &= z - \frac{z^2}{2} + z^3·\frac{P(z)}{Q(z)},
+
+where `P` and `Q` are polynomials of degree 5, with
+`Q` monic.
+
+Cancellation error is minimized by computing the sum for
+`\log x` as:
+
+.. math::
+
+    s &\leftarrow \operatorname{fl}(z^3·P(z)/Q(z))\\
+    s &\leftarrow \operatorname{fl}(s + n·c_4)\\
+    s &\leftarrow \operatorname{fl}(s - 0.5·z^2)\\
+    s &\leftarrow \operatorname{fl}(s + z)\\
+    s &\leftarrow \operatorname{fl}(s + n·c_3)
+
+where `z=u-1` and `c_3+c_4=\log 2`, with `c_3` comprising
+the first 9 bits of the mantissa.
+
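+As with the exponential, the computation can be summarised by a scalar
+sketch (normal, positive argument only; the constants are those defined
+in ``src/simd/approx.hpp``):
+
+.. code-block:: cpp
+
+    #include <cmath>
+    #include <simd/approx.hpp>
+
+    double log_approx(double x) {
+        using namespace arb::simd_detail;
+
+        int n;
+        double u = 2.*std::frexp(x, &n);      // u in [1, 2)
+        --n;                                  // now x = u·2^n
+
+        if (u >= sqrt2) { u *= 0.5; ++n; }    // bring u into [√2/2, √2]
+
+        double z = u - 1.;
+        double pz = P0log + z*(P1log + z*(P2log + z*(P3log + z*(P4log + z*P5log))));
+        double qz = Q0log + z*(Q1log + z*(Q2log + z*(Q3log + z*(Q4log + z))));
+
+        // Summation ordered to limit cancellation error.
+        double s = z*z*z*pz/qz;
+        s += n*ln2C4;
+        s -= 0.5*z*z;
+        s += z;
+        s += n*ln2C3;
+        return s;
+    }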
diff --git a/doc/static/custom.css b/doc/static/custom.css
index 005eac31..54b96418 100644
--- a/doc/static/custom.css
+++ b/doc/static/custom.css
@@ -5,7 +5,7 @@
     padding-left: 3pt;
     content: "Example";
     background: #6ab0de;
-    color: #ffffff
+    color: #ffffff;
 }
 
 .api-code>div[class^='highlight']:before {
@@ -14,9 +14,24 @@
     padding-bottom: 2pt;
     padding-left: 3pt;
     content: "API";
-    background: #e0e0e0
+    background: #e0e0e0;
 }
 
 .api-code>div[class^='highlight'] {
-    background: #f4f4f4
+    background: #f4f4f4;
+}
+
+li>p:last-child {
+    padding-bottom: 2ex;
+}
+
+table.docutils td {
+    vertical-align: top !important;
+}
+
+@media screen and (min-width: 40em) {
+    .wy-table-responsive table td,
+    .wy-table-responsive table th {
+        white-space: normal;
+    }
 }
diff --git a/src/simd/approx.hpp b/src/simd/approx.hpp
new file mode 100644
index 00000000..cd5f8395
--- /dev/null
+++ b/src/simd/approx.hpp
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <cfloat>
+
+// Constants/magic numbers for the approximation of
+// exponential, trigonometric and special functions.
+//
+// Polynomial coefficients and evaluation orders
+// match those of the Cephes library,
+// <URL: http://www.netlib.org/cephes>.
+//
+// Refer to the developer documentation for more
+// detail concerning the approximations.
+
+namespace arb {
+namespace simd_detail {
+
+// Exponential:
+//
+// Approximation of exponential by a Padé-like rational
+// polynomial R(x)/R(-x) of order 6.
+//
+// P comprises the odd coefficients, and Q the even.
+
+constexpr double P0exp = 9.99999999999999999910E-1;
+constexpr double P1exp = 3.02994407707441961300E-2;
+constexpr double P2exp = 1.26177193074810590878E-4;
+
+constexpr double Q0exp = 2.00000000000000000009E0;
+constexpr double Q1exp = 2.27265548208155028766E-1;
+constexpr double Q2exp = 2.52448340349684104192E-3;
+constexpr double Q3exp = 3.00198505138664455042E-6;
+
+// Cancellation-corrected ln(2) = ln2C1 + ln2C2:
+
+constexpr double ln2C1 = 6.93145751953125E-1;
+constexpr double ln2C2 = 1.42860682030941723212E-6;
+
+// 1/ln(2):
+
+constexpr double ln2inv = 1.4426950408889634073599;
+
+// Min and max argument values for finite and normal
+// double-precision exponential.
+
+constexpr double exp_minarg = -708.3964185322641;
+constexpr double exp_maxarg = 709.782712893384;
+
+// For expm1, minimum argument that gives a result
+// over -1.
+
+constexpr double expm1_minarg = -37.42994775023705;
+
+// Logarithm:
+//
+// Positive denormal numbers are treated as zero
+// for the purposes of the log function. 
+
+constexpr double log_minarg = DBL_MIN;
+constexpr double sqrt2 = 1.41421356237309504880;
+
+// Cancellation-corrected ln(2) = ln2C3 + ln2C4:
+
+constexpr double ln2C3 = 0.693359375;
+constexpr double ln2C4 = -2.121944400546905827679e-4;
+
+// Polynomial coefficients (Q is also order 5,
+// but monic).
+
+constexpr double P0log = 7.70838733755885391666E0;
+constexpr double P1log = 1.79368678507819816313e1;
+constexpr double P2log = 1.44989225341610930846e1;
+constexpr double P3log = 4.70579119878881725854e0;
+constexpr double P4log = 4.97494994976747001425e-1;
+constexpr double P5log = 1.01875663804580931796e-4;
+
+constexpr double Q0log = 2.31251620126765340583E1;
+constexpr double Q1log = 7.11544750618563894466e1;
+constexpr double Q2log = 8.29875266912776603211e1;
+constexpr double Q3log = 4.52279145837532221105e1;
+constexpr double Q4log = 1.12873587189167450590e1;
+
+} // namespace simd_detail
+} // namespace arb
diff --git a/src/simd/avx.hpp b/src/simd/avx.hpp
new file mode 100644
index 00000000..1527f1b0
--- /dev/null
+++ b/src/simd/avx.hpp
@@ -0,0 +1,921 @@
+#pragma once
+
+// AVX/AVX2 SIMD intrinsics implementation.
+
+#ifdef __AVX__
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <immintrin.h>
+
+#include <simd/approx.hpp>
+#include <simd/implbase.hpp>
+
+namespace arb {
+namespace simd_detail {
+
+struct avx_int4;
+struct avx_double4;
+
+template <>
+struct simd_traits<avx_int4> {
+    static constexpr unsigned width = 4;
+    using scalar_type = std::int32_t;
+    using vector_type = __m128i;
+    using mask_impl = avx_int4;
+};
+
+template <>
+struct simd_traits<avx_double4> {
+    static constexpr unsigned width = 4;
+    using scalar_type = double;
+    using vector_type = __m256d;
+    using mask_impl = avx_double4;
+};
+
+struct avx_int4: implbase<avx_int4> {
+    // Use default implementations for: element, set_element, div.
+
+    using int32 = std::int32_t;
+
+    static __m128i broadcast(int32 v) {
+        return _mm_set1_epi32(v);
+    }
+
+    static void copy_to(const __m128i& v, int32* p) {
+        _mm_storeu_si128((__m128i*)p, v);
+    }
+
+    static void copy_to_masked(const __m128i& v, int32* p, const __m128i& mask) {
+        _mm_maskstore_ps(reinterpret_cast<float*>(p), mask, _mm_castsi128_ps(v));
+    }
+
+    static __m128i copy_from(const int32* p) {
+        return _mm_loadu_si128((const __m128i*)p);
+    }
+
+    static __m128i copy_from_masked(const int32* p, const __m128i& mask) {
+        return _mm_castps_si128(_mm_maskload_ps(reinterpret_cast<const float*>(p), mask));
+    }
+
+    static __m128i copy_from_masked(const __m128i& v, const int32* p, const __m128i& mask) {
+        __m128 d = _mm_maskload_ps(reinterpret_cast<const float*>(p), mask);
+        return ifelse(mask, _mm_castps_si128(d), v);
+    }
+
+    static __m128i negate(const __m128i& a) {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_sub_epi32(zero, a);
+    }
+
+    static __m128i add(const __m128i& a, const __m128i& b) {
+        return _mm_add_epi32(a, b);
+    }
+
+    static __m128i sub(const __m128i& a, const __m128i& b) {
+        return _mm_sub_epi32(a, b);
+    }
+
+    static __m128i mul(const __m128i& a, const __m128i& b) {
+        return _mm_mullo_epi32(a, b);
+    }
+
+    static __m128i fma(const __m128i& a, const __m128i& b, const __m128i& c) {
+        return _mm_add_epi32(_mm_mullo_epi32(a, b), c);
+    }
+
+    static __m128i logical_not(const __m128i& a) {
+        __m128i ones = {};
+        return _mm_xor_si128(a, _mm_cmpeq_epi32(ones, ones));
+    }
+
+    static __m128i logical_and(const __m128i& a, const __m128i& b) {
+        return _mm_and_si128(a, b);
+    }
+
+    static __m128i logical_or(const __m128i& a, const __m128i& b) {
+        return _mm_or_si128(a, b);
+    }
+
+    static __m128i cmp_eq(const __m128i& a, const __m128i& b) {
+        return _mm_cmpeq_epi32(a, b);
+    }
+
+    static __m128i cmp_neq(const __m128i& a, const __m128i& b) {
+        return logical_not(cmp_eq(a, b));
+    }
+
+    static __m128i cmp_gt(const __m128i& a, const __m128i& b) {
+        return _mm_cmpgt_epi32(a, b);
+    }
+
+    static __m128i cmp_geq(const __m128i& a, const __m128i& b) {
+        return logical_not(cmp_gt(b, a));
+    }
+
+    static __m128i cmp_lt(const __m128i& a, const __m128i& b) {
+        return cmp_gt(b, a);
+    }
+
+    static __m128i cmp_leq(const __m128i& a, const __m128i& b) {
+        return logical_not(cmp_gt(a, b));
+    }
+
+    static __m128i ifelse(const __m128i& m, const __m128i& u, const __m128i& v) {
+        return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(u), _mm_castsi128_ps(m)));
+    }
+
+    static __m128i mask_broadcast(bool b) {
+        return _mm_set1_epi32(-(int32)b);
+    }
+
+    static __m128i mask_unpack(unsigned long long k) {
+        // Only care about bottom four bits of k.
+        __m128i b = _mm_set1_epi8((char)k);
+        b = logical_or(b, _mm_setr_epi32(0xfefefefe,0xfdfdfdfd,0xfbfbfbfb,0xf7f7f7f7));
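+        // After the OR, every byte of 32-bit lane i is 0xff exactly when
+        // bit i of k is set; comparing the lane against all-ones then
+        // yields the lane-wide mask.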
+
+        __m128i ones = {};
+        ones = _mm_cmpeq_epi32(ones, ones);
+        return _mm_cmpeq_epi32(b, ones);
+    }
+
+    static bool mask_element(const __m128i& u, int i) {
+        return static_cast<bool>(element(u, i));
+    }
+
+    static void mask_set_element(__m128i& u, int i, bool b) {
+        set_element(u, i, -(int32)b);
+    }
+
+    static void mask_copy_to(const __m128i& m, bool* y) {
+        // Negate (convert 0xffffffff to 0x00000001) and move low bytes to
+        // bottom 4 bytes.
+
+        __m128i s = _mm_setr_epi32(0x0c080400ul,0,0,0);
+        __m128i p = _mm_shuffle_epi8(negate(m), s);
+        std::memcpy(y, &p, 4);
+    }
+
+    static __m128i mask_copy_from(const bool* w) {
+        __m128i r;
+        std::memcpy(&r, w, 4);
+
+        __m128i s = _mm_setr_epi32(0x80808000ul, 0x80808001ul, 0x80808002ul, 0x80808003ul);
+        return negate(_mm_shuffle_epi8(r, s));
+    }
+
+    static __m128i max(const __m128i& a, const __m128i& b) {
+        return _mm_max_epi32(a, b);
+    }
+
+    static __m128i min(const __m128i& a, const __m128i& b) {
+        return _mm_min_epi32(a, b);
+    }
+};
+
+struct avx_double4: implbase<avx_double4> {
+    // Use default implementations for:
+    //     element, set_element, fma.
+
+    using int64 = std::int64_t;
+
+    // CMPPD predicates:
+    static constexpr int cmp_eq_oq =    0;
+    static constexpr int cmp_unord_q =  3;
+    static constexpr int cmp_neq_uq =   4;
+    static constexpr int cmp_true_uq = 15;
+    static constexpr int cmp_lt_oq =   17;
+    static constexpr int cmp_le_oq =   18;
+    static constexpr int cmp_nge_uq =  25;
+    static constexpr int cmp_ge_oq =   29;
+    static constexpr int cmp_gt_oq =   30;
+
+    static __m256d broadcast(double v) {
+        return _mm256_set1_pd(v);
+    }
+
+    static void copy_to(const __m256d& v, double* p) {
+        _mm256_storeu_pd(p, v);
+    }
+
+    static void copy_to_masked(const __m256d& v, double* p, const __m256d& mask) {
+        _mm256_maskstore_pd(p, _mm256_castpd_si256(mask), v);
+    }
+
+    static __m256d copy_from(const double* p) {
+        return _mm256_loadu_pd(p);
+    }
+
+    static __m256d copy_from_masked(const double* p, const __m256d& mask) {
+        return _mm256_maskload_pd(p, _mm256_castpd_si256(mask));
+    }
+
+    static __m256d copy_from_masked(const __m256d& v, const double* p, const __m256d& mask) {
+        __m256d d = _mm256_maskload_pd(p, _mm256_castpd_si256(mask));
+        return ifelse(mask, d, v);
+    }
+
+    static __m256d negate(const __m256d& a) {
+        return _mm256_sub_pd(zero(), a);
+    }
+
+    static __m256d add(const __m256d& a, const __m256d& b) {
+        return _mm256_add_pd(a, b);
+    }
+
+    static __m256d sub(const __m256d& a, const __m256d& b) {
+        return _mm256_sub_pd(a, b);
+    }
+
+    static __m256d mul(const __m256d& a, const __m256d& b) {
+        return _mm256_mul_pd(a, b);
+    }
+
+    static __m256d div(const __m256d& a, const __m256d& b) {
+        return _mm256_div_pd(a, b);
+    }
+
+    static __m256d logical_not(const __m256d& a) {
+        __m256d ones = {};
+        return _mm256_xor_pd(a, _mm256_cmp_pd(ones, ones, 15));
+    }
+
+    static __m256d logical_and(const __m256d& a, const __m256d& b) {
+        return _mm256_and_pd(a, b);
+    }
+
+    static __m256d logical_or(const __m256d& a, const __m256d& b) {
+        return _mm256_or_pd(a, b);
+    }
+
+    static __m256d cmp_eq(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_eq_oq);
+    }
+
+    static __m256d cmp_neq(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_neq_uq);
+    }
+
+    static __m256d cmp_gt(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_gt_oq);
+    }
+
+    static __m256d cmp_geq(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_ge_oq);
+    }
+
+    static __m256d cmp_lt(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_lt_oq);
+    }
+
+    static __m256d cmp_leq(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_le_oq);
+    }
+
+    static __m256d ifelse(const __m256d& m, const __m256d& u, const __m256d& v) {
+        return _mm256_blendv_pd(v, u, m);
+    }
+
+    static __m256d mask_broadcast(bool b) {
+        return _mm256_castsi256_pd(_mm256_set1_epi64x(-(int64)b));
+    }
+
+    static bool mask_element(const __m256d& u, int i) {
+        return static_cast<bool>(element(u, i));
+    }
+
+    static __m256d mask_unpack(unsigned long long k) {
+        // Only care about bottom four bits of k.
+        __m128i b = _mm_set1_epi8((char)k);
+        // (Note: there is no _mm_setr_epi64x (!))
+        __m128i bl = _mm_or_si128(b, _mm_set_epi64x(0xfdfdfdfdfdfdfdfd, 0xfefefefefefefefe));
+        __m128i bu = _mm_or_si128(b, _mm_set_epi64x(0xf7f7f7f7f7f7f7f7, 0xfbfbfbfbfbfbfbfb));
+
+        __m128i ones = {};
+        ones = _mm_cmpeq_epi32(ones, ones);
+        bl = _mm_cmpeq_epi64(bl, ones);
+        bu = _mm_cmpeq_epi64(bu, ones);
+        return _mm256_castsi256_pd(combine_m128i(bu, bl));
+    }
+
+    static void mask_set_element(__m256d& u, int i, bool b) {
+        char data[256];
+        _mm256_storeu_pd((double*)data, u);
+        ((int64*)data)[i] = -(int64)b;
+        u = _mm256_loadu_pd((double*)data);
+    }
+
+    static void mask_copy_to(const __m256d& m, bool* y) {
+        // Convert to 32-bit wide mask values, and delegate to
+        // avx_int4.
+
+        avx_int4::mask_copy_to(lo_epi32(_mm256_castpd_si256(m)), y);
+    }
+
+    static __m256d mask_copy_from(const bool* w) {
+        __m128i zero = _mm_setzero_si128();
+
+        __m128i r;
+        std::memcpy(&r, w, 4);
+
+        // Move bytes:
+        //   rl: byte 0 to byte 0, byte 1 to byte 8, zero elsewhere.
+        //   ru: byte 2 to byte 0, byte 3 to byte 8, zero elsewhere.
+        //
+        // Subtract from zero to translate
+        // 0x0000000000000001 to 0xffffffffffffffff.
+
+        __m128i sl = _mm_setr_epi32(0x80808000ul, 0x80808080ul, 0x80808001ul, 0x80808080ul);
+        __m128i rl = _mm_sub_epi64(zero, _mm_shuffle_epi8(r, sl));
+
+        __m128i su = _mm_setr_epi32(0x80808002ul, 0x80808080ul, 0x80808003ul, 0x80808080ul);
+        __m128i ru = _mm_sub_epi64(zero, _mm_shuffle_epi8(r, su));
+
+        return _mm256_castsi256_pd(combine_m128i(ru, rl));
+    }
+
+    static __m256d max(const __m256d& a, const __m256d& b) {
+        return _mm256_max_pd(a, b);
+    }
+
+    static __m256d min(const __m256d& a, const __m256d& b) {
+        return _mm256_min_pd(a, b);
+    }
+
+    static __m256d abs(const __m256d& x) {
+        __m256i m = _mm256_set1_epi64x(0x7fffffffffffffffll);
+        return _mm256_and_pd(x, _mm256_castsi256_pd(m));
+    }
+
+    // Exponential is calculated as follows:
+    //
+    //     e^x = e^g · 2^n,
+    //
+    // where g in [-0.5, 0.5) and n is an integer. 2^n can be
+    // calculated via bit manipulation or specialized scaling intrinsics,
+    // whereas e^g is approximated using the order-6 rational
+    // approximation:
+    //
+    //     e^g = R(g)/R(-g)
+    //
+    // with R(x) split into even and odd terms:
+    //
+    //     R(x) = Q(x^2) + x·P(x^2)
+    //
+    // so that the ratio can be computed as:
+    //
+    //     e^g = 1 + 2·g·P(g^2) / (Q(g^2)-g·P(g^2)).
+    //
+    // Note that the coefficients for R are close to but not the same as those
+    // from the 6,6 Padé approximant to the exponential. 
+    //
+    // The exponents n and g are calculated by:
+    //
+    //     n = floor(x/ln(2) + 0.5)
+    //     g = x - n·ln(2)
+    //
+    // so that x = g + n·ln(2). We have:
+    //
+    //     |g| = |x - n·ln(2)|
+    //         = |x - x + α·ln(2)|
+    //  
+    // for some fraction |α| ≤ 0.5, and thus |g| ≤ 0.5ln(2) ≈ 0.347.
+    //
+    // The subtraction x - n·ln(2) is performed in two parts, with
+    // ln(2) = C1 + C2, in order to compensate for the possible loss of
+    // precision attributable to catastrophic cancellation. C1 comprises
+    // the first 32 bits of the mantissa, C2 the remainder.
+
+    static  __m256d exp(const __m256d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(exp_minarg));
+        auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q);
+
+        // Compute n and g.
+
+        auto n = _mm256_floor_pd(add(mul(broadcast(ln2inv), x), broadcast(0.5)));
+
+        auto g = sub(x, mul(n, broadcast(ln2C1)));
+        g = sub(g, mul(n, broadcast(ln2C2)));
+
+        auto gg = mul(g, g);
+
+        // Compute the g*P(g^2) and Q(g^2).
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        // Compute R(g)/R(-g) = 1 + 2*g*P(g^2) / (Q(g^2)-g*P(g^2))
+
+        auto expg = add(broadcast(1), mul(broadcast(2),
+            div(odd, sub(even, odd))));
+
+        // Finally, compute product with 2^n.
+        // Note: can only achieve full range using the ldexp implementation,
+        // rather than multiplying by 2^n directly.
+
+        auto result = ldexp_positive(expg, _mm256_cvtpd_epi32(n));
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(0),
+            ifelse(is_nan, broadcast(NAN),
+                   result)));
+    }
+
+    // Use same rational polynomial expansion as for exp(x), without
+    // the unit term.
+    //
+    // For |x|<=0.5, take n to be zero. Otherwise, set n as above,
+    // and scale the answer by:
+    //     expm1(x) = 2^n * expm1(g) + (2^n - 1).
+
+    static  __m256d expm1(const __m256d& x) {
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(expm1_minarg));
+        auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q);
+
+        auto half = broadcast(0.5);
+        auto one = broadcast(1.);
+        auto two = add(one, one);
+
+        auto nzero = cmp_leq(abs(x), half);
+        auto n = _mm256_floor_pd(add(mul(broadcast(ln2inv), x), half));
+        n = ifelse(nzero, zero(), n);
+
+        auto g = sub(x, mul(n, broadcast(ln2C1)));
+        g = sub(g, mul(n, broadcast(ln2C2)));
+
+        auto gg = mul(g, g);
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        auto expgm1 = mul(two, div(odd, sub(even, odd)));
+
+        // For small x (n zero), bypass scaling step to avoid underflow.
+        // Otherwise, compute result 2^n * expgm1 + (2^n-1) by:
+        //     result = 2 * ( 2^(n-1)*expgm1 + (2^(n-1)+0.5) )
+        // to avoid overflow when n=1024.
+
+        auto nm1 = _mm256_cvtpd_epi32(sub(n, one));
+        auto scaled = mul(add(sub(exp2int(nm1), half), ldexp_normal(expgm1, nm1)), two);
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-1),
+            ifelse(is_nan, broadcast(NAN),
+            ifelse(nzero, expgm1,
+                   scaled))));
+    }
+
+    // Natural logarithm:
+    //
+    // Decompose x = 2^g * u such that g is an integer and
+    // u is in the interval [sqrt(2)/2, sqrt(2)].
+    //
+    // Then ln(x) is computed as R(u-1) + g*ln(2) where
+    // R(z) is a rational polynomial approximating ln(z+1)
+    // for small z:
+    //
+    //     R(z) = z - z^2/2 + z^3 * P(z)/Q(z)
+    //
+    // where P and Q are degree 5 polynomials, Q monic.
+    //
+    // In order to avoid cancellation error, ln(2) is represented
+    // as C3 + C4, with the C4 correction term approx. -2.1e-4.
+    // The summation order for R(z)+2^g is:
+    //
+    //     z^3*P(z)/Q(z) + g*C4 - z^2/2 + z + g*C3
+
+    static __m256d log(const __m256d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_geq(x, broadcast(HUGE_VAL));
+        auto is_small = cmp_lt(x, broadcast(log_minarg));
+        auto is_domainerr = _mm256_cmp_pd(x, broadcast(0), cmp_nge_uq);
+
+        __m256d g = _mm256_cvtepi32_pd(logb_normal(x));
+        __m256d u = fraction_normal(x);
+
+        __m256d one = broadcast(1.);
+        __m256d half = broadcast(0.5);
+        auto gtsqrt2 = cmp_geq(u, broadcast(sqrt2));
+        g = ifelse(gtsqrt2, add(g, one), g);
+        u = ifelse(gtsqrt2, mul(u, half), u);
+
+        auto z = sub(u, one);
+        auto pz = horner(z, P0log, P1log, P2log, P3log, P4log, P5log);
+        auto qz = horner1(z, Q0log, Q1log, Q2log, Q3log, Q4log);
+
+        auto z2 = mul(z, z);
+        auto z3 = mul(z2, z);
+
+        auto r = div(mul(z3, pz), qz);
+        r = add(r, mul(g,  broadcast(ln2C4)));
+        r = sub(r, mul(z2, half));
+        r = add(r, z);
+        r = add(r, mul(g,  broadcast(ln2C3)));
+
+        // Return NaN if x is NaN or negative, +inf if x is +inf,
+        // or -inf if zero or (positive) denormal.
+
+        return
+            ifelse(is_domainerr, broadcast(NAN),
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-HUGE_VAL),
+                r)));
+    }
+
+protected:
+    static __m256d zero() {
+        return _mm256_setzero_pd();
+    }
+
+    static __m128i hi_epi32(__m256i x) {
+        __m128i xl = _mm256_castsi256_si128(x);
+        __m128i xh = _mm256_extractf128_si256(x, 1);
+        return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(xl), _mm_castsi128_ps(xh), 0xddu));
+    }
+
+    static __m128i lo_epi32(__m256i x) {
+        __m128i xl = _mm256_castsi256_si128(x);
+        __m128i xh = _mm256_extractf128_si256(x, 1);
+        return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(xl), _mm_castsi128_ps(xh), 0x88u));
+    }
+
+    static __m256i combine_m128i(__m128i hi, __m128i lo) {
+        return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+    }
+
+    // horner(x, a0, ..., an) computes the degree n polynomial A(x) with coefficients
+    // a0, ..., an by a0+x·(a1+x·(a2+...+x·an)...).
+
+    static inline __m256d horner(__m256d x, double a0) {
+        return broadcast(a0);
+    }
+
+    template <typename... T>
+    static __m256d horner(__m256d x, double a0, T... tail) {
+        return add(mul(x, horner(x, tail...)), broadcast(a0));
+    }
+
+    // horner1(x, a0, ..., an) computes the degree n+1 monic polynomial A(x) with coefficients
+    // a0, ..., an, 1 by a0+x·(a1+x·(a2+...+x·(an+x)...).
+
+    static inline __m256d horner1(__m256d x, double a0) {
+        return add(x, broadcast(a0));
+    }
+
+    template <typename... T>
+    static __m256d horner1(__m256d x, double a0, T... tail) {
+        return add(mul(x, horner1(x, tail...)), broadcast(a0));
+    }
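+
+    // For example, the recursions above unroll as
+    //     horner(x, a0, a1, a2) = a0 + x·(a1 + x·a2)
+    //     horner1(x, a0, a1)    = a0 + x·(a1 + x)
+    // so horner1 supplies the implicit leading coefficient 1 of the monic polynomial.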
+
+    // Compute 2.0^n.
+    static __m256d exp2int(__m128i n) {
+        n = _mm_slli_epi32(n, 20);
+        n = _mm_add_epi32(n, _mm_set1_epi32(1023<<20));
+
+        auto nl = _mm_shuffle_epi32(n, 0x50);
+        auto nh = _mm_shuffle_epi32(n, 0xfa);
+        __m256i nhnl = combine_m128i(nh, nl);
+
+        return _mm256_castps_pd(
+            _mm256_blend_ps(_mm256_set1_ps(0),
+            _mm256_castsi256_ps(nhnl),0xaa));
+    }
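+
+    // Illustration: the shifts and blend above place the biased exponent
+    // (n+1023) into bits 52..62 of each lane with a zero mantissa; e.g. for
+    // n = 3 each lane becomes 0x4020000000000000, i.e. the double 8.0.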
+
+    // Compute n and f such that x = 2^n·f, with |f| ∈ [1,2), given x is finite and normal.
+    static __m128i logb_normal(const __m256d& x) {
+        __m128i xw = hi_epi32(_mm256_castpd_si256(x));
+        __m128i emask = _mm_set1_epi32(0x7ff00000);
+        __m128i ebiased = _mm_srli_epi32(_mm_and_si128(xw, emask), 20);
+
+        return _mm_sub_epi32(ebiased, _mm_set1_epi32(1023));
+    }
+
+    static __m256d fraction_normal(const __m256d& x) {
+        // 0x800fffffffffffff (intrinsic takes signed parameter)
+        __m256d emask = _mm256_castsi256_pd(_mm256_set1_epi64x(-0x7ff0000000000001));
+        __m256d bias = _mm256_castsi256_pd(_mm256_set1_epi64x(0x3ff0000000000000));
+        return _mm256_or_pd(bias, _mm256_and_pd(emask, x));
+    }
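+
+    // Illustration: x = 6.0 has bit pattern 0x4018000000000000 (biased
+    // exponent 1025), so logb_normal gives 2 and fraction_normal re-biases
+    // the exponent to give 0x3ff8000000000000 = 1.5, i.e. 6 = 2^2·1.5.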
+
+    // Compute 2^n·x when both x and 2^n·x are normal, finite and strictly positive doubles.
+    static __m256d ldexp_positive(__m256d x, __m128i n) {
+        n = _mm_slli_epi32(n, 20);
+        auto zero = _mm_set1_epi32(0);
+        auto nl = _mm_unpacklo_epi32(zero, n);
+        auto nh = _mm_unpackhi_epi32(zero, n);
+
+        __m128d xl = _mm256_castpd256_pd128(x);
+        __m128d xh = _mm256_extractf128_pd(x, 1);
+
+        __m128i suml = _mm_add_epi64(nl, _mm_castpd_si128(xl));
+        __m128i sumh = _mm_add_epi64(nh, _mm_castpd_si128(xh));
+        __m256i sumhl = combine_m128i(sumh, suml);
+
+        return _mm256_castsi256_pd(sumhl);
+    }
+
+    // Compute 2^n·x when both x and 2^n·x are normal and finite.
+    static __m256d ldexp_normal(__m256d x, __m128i n) {
+        __m256d smask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7fffffffffffffffll));
+        __m256d sbits = _mm256_andnot_pd(smask, x);
+
+        n = _mm_slli_epi32(n, 20);
+        auto zi = _mm_set1_epi32(0);
+        auto nl = _mm_unpacklo_epi32(zi, n);
+        auto nh = _mm_unpackhi_epi32(zi, n);
+
+        __m128d xl = _mm256_castpd256_pd128(x);
+        __m128d xh = _mm256_extractf128_pd(x, 1);
+
+        __m128i suml = _mm_add_epi64(nl, _mm_castpd_si128(xl));
+        __m128i sumh = _mm_add_epi64(nh, _mm_castpd_si128(xh));
+        __m256i sumhl = combine_m128i(sumh, suml);
+
+        auto nzans = _mm256_or_pd(_mm256_and_pd(_mm256_castsi256_pd(sumhl), smask), sbits);
+        return ifelse(cmp_eq(x, zero()), zero(), nzans);
+    }
+};
+
+
+#if defined(__AVX2__) && defined(__FMA__)
+
+// Same implementation as for AVX.
+using avx2_int4 = avx_int4;
+
+struct avx2_double4;
+
+template <>
+struct simd_traits<avx2_double4> {
+    static constexpr unsigned width = 4;
+    using scalar_type = double;
+    using vector_type = __m256d;
+    using mask_impl = avx2_double4;
+};
+
+// Note: we derive from avx_double4 only as an implementation shortcut.
+// Because `avx2_double4` does not derive from `implbase<avx2_double4>`,
+// any fallback methods in `implbase` will use the `avx_double4`
+// functions rather than the `avx2_double4` functions.
+
+struct avx2_double4: avx_double4 {
+    static __m256d fma(const __m256d& a, const __m256d& b, const __m256d& c) {
+        return _mm256_fmadd_pd(a, b, c);
+    }
+
+    static vector_type logical_not(const vector_type& a) {
+        __m256i ones = {};
+        return _mm256_xor_pd(a, _mm256_castsi256_pd(_mm256_cmpeq_epi32(ones, ones)));
+    }
+
+    static __m256d mask_unpack(unsigned long long k) {
+        // Only care about bottom four bits of k.
+        __m256i b = _mm256_set1_epi8((char)k);
+        b = _mm256_or_si256(b, _mm256_setr_epi64x(
+                0xfefefefefefefefe, 0xfdfdfdfdfdfdfdfd,
+                0xfbfbfbfbfbfbfbfb, 0xf7f7f7f7f7f7f7f7));
+
+        __m256i ones = {};
+        ones = _mm256_cmpeq_epi64(ones, ones);
+        return _mm256_castsi256_pd(_mm256_cmpeq_epi64(ones, b));
+    }
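+
+    // Illustration: for k = 0b0101, the byte-broadcast gives 0x05 in every
+    // byte; OR-ing with the lane constants 0xfe…, 0xfd…, 0xfb…, 0xf7… makes
+    // all bytes 0xff exactly in lanes 0 and 2, so the 64-bit compare yields
+    // the mask (true, false, true, false).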
+
+    static void mask_copy_to(const __m256d& m, bool* y) {
+        // Convert to 32-bit wide mask values, and delegate to
+        // avx2_int4.
+
+        avx_int4::mask_copy_to(lo_epi32(_mm256_castpd_si256(m)), y);
+    }
+
+    static __m256d mask_copy_from(const bool* w) {
+        __m256i zero = _mm256_setzero_si256();
+
+        __m128i r;
+        std::memcpy(&r, w, 4);
+        return _mm256_castsi256_pd(_mm256_sub_epi64(zero, _mm256_cvtepi8_epi64(r)));
+    }
+
+    static __m256d gather(avx2_int4, const double* p, const __m128i& index) {
+        return _mm256_i32gather_pd(p, index, 8);
+    }
+
+    static __m256d gather(avx2_int4, __m256d a, const double* p, const __m128i& index, const __m256d& mask) {
+        return  _mm256_mask_i32gather_pd(a, p, index, mask, 8);
+    };
+
+    // The avx2_double4 versions of log, exp, and expm1 use the same algorithms as avx_double4,
+    // but use AVX2-specialized bit manipulation and FMA.
+
+    static  __m256d exp(const __m256d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(exp_minarg));
+        auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q);
+
+        // Compute n and g.
+
+        auto n = _mm256_floor_pd(fma(broadcast(ln2inv), x, broadcast(0.5)));
+
+        auto g = fma(n, broadcast(-ln2C1), x);
+        g = fma(n, broadcast(-ln2C2), g);
+
+        auto gg = mul(g, g);
+
+        // Compute g*P(g^2) and Q(g^2).
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        // Compute R(g)/R(-g) = 1 + 2*g*P(g^2) / (Q(g^2)-g*P(g^2))
+
+        auto expg = fma(broadcast(2), div(odd, sub(even, odd)), broadcast(1));
+
+        // Finally, compute product with 2^n.
+        // Note: can only achieve full range using the ldexp implementation,
+        // rather than multiplying by 2^n directly.
+
+        auto result = ldexp_positive(expg, _mm256_cvtpd_epi32(n));
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(0),
+            ifelse(is_nan, broadcast(NAN),
+                   result)));
+    }
+
+    static  __m256d expm1(const __m256d& x) {
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(expm1_minarg));
+        auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q);
+
+        auto half = broadcast(0.5);
+        auto one = broadcast(1.);
+        auto two = add(one, one);
+
+        auto smallx = cmp_leq(abs(x), half);
+        auto n = _mm256_floor_pd(fma(broadcast(ln2inv), x, half));
+        n = ifelse(smallx, zero(), n);
+
+        auto g = fma(n, broadcast(-ln2C1), x);
+        g = fma(n, broadcast(-ln2C2), g);
+
+        auto gg = mul(g, g);
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        auto expgm1 = mul(two, div(odd, sub(even, odd)));
+
+        auto nm1 = _mm256_cvtpd_epi32(sub(n, one));
+        auto scaled = mul(add(sub(exp2int(nm1), half), ldexp_normal(expgm1, nm1)), two);
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-1),
+            ifelse(is_nan, broadcast(NAN),
+            ifelse(smallx, expgm1,
+                   scaled))));
+    }
+
+    static __m256d log(const __m256d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_geq(x, broadcast(HUGE_VAL));
+        auto is_small = cmp_lt(x, broadcast(log_minarg));
+        auto is_domainerr = _mm256_cmp_pd(x, broadcast(0), cmp_nge_uq);
+
+        __m256d g = _mm256_cvtepi32_pd(logb_normal(x));
+        __m256d u = fraction_normal(x);
+
+        __m256d one = broadcast(1.);
+        __m256d half = broadcast(0.5);
+        auto gtsqrt2 = cmp_geq(u, broadcast(sqrt2));
+        g = ifelse(gtsqrt2, add(g, one), g);
+        u = ifelse(gtsqrt2, mul(u, half), u);
+
+        auto z = sub(u, one);
+        auto pz = horner(z, P0log, P1log, P2log, P3log, P4log, P5log);
+        auto qz = horner1(z, Q0log, Q1log, Q2log, Q3log, Q4log);
+
+        auto z2 = mul(z, z);
+        auto z3 = mul(z2, z);
+
+        auto r = div(mul(z3, pz), qz);
+        r = fma(g,  broadcast(ln2C4), r);
+        r = fms(z2, half, r);
+        r = sub(z, r);
+        r = fma(g,  broadcast(ln2C3), r);
+
+        // Return NaN if x is NaN or negative, +inf if x is +inf,
+        // or -inf if zero or (positive) denormal.
+
+        return
+            ifelse(is_domainerr, broadcast(NAN),
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-HUGE_VAL),
+                r)));
+    }
+
+protected:
+    static __m128i lo_epi32(__m256i a) {
+        a = _mm256_shuffle_epi32(a, 0x08);
+        a = _mm256_permute4x64_epi64(a, 0x08);
+        return _mm256_castsi256_si128(a);
+    }
+
+    static  __m128i hi_epi32(__m256i a) {
+        a = _mm256_shuffle_epi32(a, 0x0d);
+        a = _mm256_permute4x64_epi64(a, 0x08);
+        return _mm256_castsi256_si128(a);
+    }
+
+    static inline __m256d horner(__m256d x, double a0) {
+        return broadcast(a0);
+    }
+
+    template <typename... T>
+    static __m256d horner(__m256d x, double a0, T... tail) {
+        return fma(x, horner(x, tail...), broadcast(a0));
+    }
+
+    static inline __m256d horner1(__m256d x, double a0) {
+        return add(x, broadcast(a0));
+    }
+
+    template <typename... T>
+    static __m256d horner1(__m256d x, double a0, T... tail) {
+        return fma(x, horner1(x, tail...), broadcast(a0));
+    }
+
+    static __m256d fms(const __m256d& a, const __m256d& b, const __m256d& c) {
+        return _mm256_fmsub_pd(a, b, c);
+    }
+
+    // Compute 2.0^n.
+    // Overrides avx_double4::exp2int.
+    static __m256d exp2int(__m128i n) {
+        __m256d x = broadcast(1);
+        __m256i nshift = _mm256_slli_epi64(_mm256_cvtepi32_epi64(n), 52);
+        __m256i sum = _mm256_add_epi64(nshift, _mm256_castpd_si256(x));
+        return _mm256_castsi256_pd(sum);
+    }
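+
+    // Illustration: adding n<<52 to the bit pattern of 1.0
+    // (0x3ff0000000000000) increments the exponent field by n; e.g. for
+    // n = 3 the result is 0x4020000000000000 = 8.0.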
+
+    // Compute 2^n*x when both x and 2^n*x are normal, finite and strictly positive doubles.
+    // Overrides avx_double4::ldexp_positive.
+    static __m256d ldexp_positive(__m256d x, __m128i n) {
+        __m256i nshift = _mm256_slli_epi64(_mm256_cvtepi32_epi64(n), 52);
+        __m256i sum = _mm256_add_epi64(nshift, _mm256_castpd_si256(x));
+        return _mm256_castsi256_pd(sum);
+    }
+
+    // Compute 2^n*x when both x and 2^n*x are normal and finite.
+    // Overrides avx_double4::ldexp_normal.
+    static __m256d ldexp_normal(__m256d x, __m128i n) {
+        __m256d smask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7fffffffffffffffll));
+        __m256d sbits = _mm256_andnot_pd(smask, x);
+        __m256i nshift = _mm256_slli_epi64(_mm256_cvtepi32_epi64(n), 52);
+        __m256i sum = _mm256_add_epi64(nshift, _mm256_castpd_si256(x));
+
+        auto nzans = _mm256_or_pd(_mm256_and_pd(_mm256_castsi256_pd(sum), smask), sbits);
+        return ifelse(cmp_eq(x, zero()), zero(), nzans);
+    }
+
+    // Override avx_double4::logb_normal so as to use avx2 version of hi_epi32.
+    static __m128i logb_normal(const __m256d& x) {
+        __m128i xw = hi_epi32(_mm256_castpd_si256(x));
+        __m128i emask = _mm_set1_epi32(0x7ff00000);
+        __m128i ebiased = _mm_srli_epi32(_mm_and_si128(xw, emask), 20);
+
+        return _mm_sub_epi32(ebiased, _mm_set1_epi32(1023));
+    }
+};
+#endif // defined(__AVX2__) && defined(__FMA__)
+
+} // namespace simd_detail
+
+namespace simd_abi {
+    template <typename T, unsigned N> struct avx;
+
+    template <> struct avx<int, 4> { using type = simd_detail::avx_int4; };
+    template <> struct avx<double, 4> { using type = simd_detail::avx_double4; };
+
+#if defined(__AVX2__) && defined(__FMA__)
+    template <typename T, unsigned N> struct avx2;
+
+    template <> struct avx2<int, 4> { using type = simd_detail::avx2_int4; };
+    template <> struct avx2<double, 4> { using type = simd_detail::avx2_double4; };
+#endif
+} // namespace simd_abi
+
+} // namespace simd
+} // namespace arb
+
+#endif // def __AVX__
diff --git a/src/simd/avx512.hpp b/src/simd/avx512.hpp
new file mode 100644
index 00000000..f956bb00
--- /dev/null
+++ b/src/simd/avx512.hpp
@@ -0,0 +1,712 @@
+#pragma once
+
+// AVX512F SIMD intrinsics implementation.
+
+#ifdef __AVX512F__
+
+#include <cmath>
+#include <cstdint>
+#include <immintrin.h>
+
+#include <simd/approx.hpp>
+#include <simd/implbase.hpp>
+
+namespace arb {
+namespace simd {
+namespace simd_detail {
+
+struct avx512_double8;
+struct avx512_int8;
+struct avx512_mask8;
+
+template <>
+struct simd_traits<avx512_mask8> {
+    static constexpr unsigned width = 8;
+    using scalar_type = bool;
+    using vector_type = __mmask8;
+    using mask_impl = avx512_mask8;
+};
+
+template <>
+struct simd_traits<avx512_double8> {
+    static constexpr unsigned width = 8;
+    using scalar_type = double;
+    using vector_type = __m512d;
+    using mask_impl = avx512_mask8;
+};
+
+template <>
+struct simd_traits<avx512_int8> {
+    static constexpr unsigned width = 8;
+    using scalar_type = std::int32_t;
+    using vector_type = __m512i;
+    using mask_impl = avx512_mask8;
+};
+
+struct avx512_mask8: implbase<avx512_mask8> {
+    static __mmask8 broadcast(bool b) {
+        return _mm512_int2mask(-b);
+    }
+
+    static void copy_to(const __mmask8& k, bool* b) {
+        __m256i a = _mm256_setzero_si256();
+        a = _mm512_castsi512_si256(_mm512_mask_set1_epi32(_mm512_castsi256_si512(a), k, 1));
+
+        __m256i s = _mm256_set1_epi32(0x0c080400);
+        a = _mm256_shuffle_epi8(a, s);
+
+        s = _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0);
+        a = _mm256_permutevar8x32_epi32(a, s);
+
+        std::memcpy(b, &a, 8);
+    }
+
+    static __mmask8 copy_from(const bool* p) {
+        __m256i a = _mm256_setzero_si256();
+        std::memcpy(&a, p, 8);
+        a = _mm256_sub_epi8(_mm256_setzero_si256(), a);
+        return _mm512_int2mask(_mm256_movemask_epi8(a));
+    }
+
+    // Note: fall back to implbase implementations of copy_to_masked and copy_from_masked;
+    // could be improved with the use of AVX512BW instructions on supported platforms.
+
+    static __mmask8 logical_not(const __mmask8& k) {
+        return _mm512_knot(k);
+    }
+
+    static __mmask8 logical_and(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kand(a, b);
+    }
+
+    static __mmask8 logical_or(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kor(a, b);
+    }
+
+    // Arithmetic operations are not necessarily appropriate for a
+    // packed bit mask, but are implemented for completeness/testing,
+    // with Z modulo 2 semantics:
+    //     a + b   is equivalent to   a ^ b
+    //     a * b                      a & b
+    //     a / b                      a
+    //     a - b                      a ^ b
+    //     -a                         a
+    //     max(a, b)                  a | b
+    //     min(a, b)                  a & b
+
+    static __mmask8 negate(const __mmask8& a) {
+        return a;
+    }
+
+    static __mmask8 add(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kxor(a, b);
+    }
+
+    static __mmask8 sub(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kxor(a, b);
+    }
+
+    static __mmask8 mul(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kand(a, b);
+    }
+
+    static __mmask8 div(const __mmask8& a, const __mmask8& b) {
+        return a;
+    }
+
+    static __mmask8 fma(const __mmask8& a, const __mmask8& b, const __mmask8& c) {
+        return add(mul(a, b), c);
+    }
+
+    static __mmask8 max(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kor(a, b);
+    }
+
+    static __mmask8 min(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kand(a, b);
+    }
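+
+    // For example, with a = 1 and b = 1 these give a+b = 0 (kxor) and
+    // a*b = 1 (kand), consistent with arithmetic in Z modulo 2.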
+
+    // Comparison operators are also taken as operating on Z modulo 2,
+    // with 1 > 0:
+    //
+    //     a > b    is equivalent to  a & ~b
+    //     a >= b                     a | ~b,  ~(~a & b)
+    //     a < b                      ~a & b
+    //     a <= b                     ~a | b,  ~(a & ~b)
+    //     a == b                     ~(a ^ b)
+    //     a != b                     a ^ b
+
+    static __mmask8 cmp_eq(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kxnor(a, b);
+    }
+
+    static __mmask8 cmp_neq(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kxor(a, b);
+    }
+
+    static __mmask8 cmp_lt(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kandn(a, b);
+    }
+
+    static __mmask8 cmp_gt(const __mmask8& a, const __mmask8& b) {
+        return cmp_lt(b, a);
+    }
+
+    static __mmask8 cmp_geq(const __mmask8& a, const __mmask8& b) {
+        return logical_not(cmp_lt(a, b));
+    }
+
+    static __mmask8 cmp_leq(const __mmask8& a, const __mmask8& b) {
+        return logical_not(cmp_gt(a, b));
+    }
+
+    static __mmask8 ifelse(const __mmask8& m, const __mmask8& u, const __mmask8& v) {
+        return _mm512_kor(_mm512_kandn(m, u), _mm512_kand(m, v));
+    }
+
+    static bool element(const __mmask8& k, int i) {
+        return _mm512_mask2int(k)&(1<<i);
+    }
+
+    static void set_element(__mmask8& k, int i, bool b) {
+        int n = _mm512_mask2int(k);
+        k = _mm512_int2mask((n&~(1<<i))|(b<<i));
+    }
+
+    static __mmask8 mask_broadcast(bool b) {
+        return broadcast(b);
+    }
+
+    static __mmask8 mask_unpack(unsigned long long p) {
+        return _mm512_int2mask(p);
+    }
+
+    static bool mask_element(const __mmask8& u, int i) {
+        return element(u, i);
+    }
+
+    static void mask_set_element(__mmask8& u, int i, bool b) {
+        set_element(u, i, b);
+    }
+
+    static void mask_copy_to(const __mmask8& m, bool* y) {
+        copy_to(m, y);
+    }
+
+    static __mmask8 mask_copy_from(const bool* y) {
+        return copy_from(y);
+    }
+};
+
+struct avx512_int8: implbase<avx512_int8> {
+    // Use default implementations for:
+    //     element, set_element.
+    //
+    // Consider changing the mask representation to __mmask16 and being
+    // explicit about comparison masks: restricting to __mmask8 seems to
+    // produce a lot of ultimately unnecessary operations.
+
+    using int32 = std::int32_t;
+
+    static __mmask8 lo() {
+        return _mm512_int2mask(0xff);
+    }
+
+    static __m512i broadcast(int32 v) {
+        return _mm512_set1_epi32(v);
+    }
+
+    static void copy_to(const __m512i& v, int32* p) {
+        _mm512_mask_storeu_epi32(p, lo(), v);
+    }
+
+    static void copy_to_masked(const __m512i& v, int32* p, const __mmask8& mask) {
+        _mm512_mask_storeu_epi32(p, mask, v);
+    }
+
+    static __m512i copy_from(const int32* p) {
+        return _mm512_maskz_loadu_epi32(lo(), p);
+    }
+
+    static __m512i copy_from_masked(const int32* p, const __mmask8& mask) {
+        return _mm512_maskz_loadu_epi32(mask, p);
+    }
+
+    static __m512i copy_from_masked(const __m512i& v, const int32* p, const __mmask8& mask) {
+        return _mm512_mask_loadu_epi32(v, mask, p);
+    }
+
+    static __m512i negate(const __m512i& a) {
+        return sub(_mm512_setzero_epi32(), a);
+    }
+
+    static __m512i add(const __m512i& a, const __m512i& b) {
+        return _mm512_add_epi32(a, b);
+    }
+
+    static __m512i sub(const __m512i& a, const __m512i& b) {
+        return _mm512_sub_epi32(a, b);
+    }
+
+    static __m512i mul(const __m512i& a, const __m512i& b) {
+        // 32-bit integers are representable exactly in double, and overflow is
+        // technically undefined behaviour, so the multiplication can be done in doubles.
+        constexpr int32 rtz = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+        auto da = _mm512_cvtepi32_pd(_mm512_castsi512_si256(a));
+        auto db = _mm512_cvtepi32_pd(_mm512_castsi512_si256(b));
+        auto fpmul = _mm512_mul_round_pd(da, db, rtz);
+        return _mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(fpmul, rtz));
+    }
+
+    static __m512i div(const __m512i& a, const __m512i& b) {
+        // 32-bit integers are representable exactly in double, so do an FP division with fixed (round-toward-zero) rounding.
+        constexpr int32 rtz = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+        auto da = _mm512_cvtepi32_pd(_mm512_castsi512_si256(a));
+        auto db = _mm512_cvtepi32_pd(_mm512_castsi512_si256(b));
+        auto fpdiv = _mm512_div_round_pd(da, db, rtz);
+        return _mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(fpdiv, rtz));
+    }
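+
+    // Illustration: with round-toward-zero, div(-7, 2) evaluates
+    // -7.0/2.0 = -3.5 and truncates to -3, matching C++ integer division.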
+
+    static __m512i fma(const __m512i& a, const __m512i& b, const __m512i& c) {
+        return add(mul(a, b), c);
+    }
+
+    static __mmask8 cmp_eq(const __m512i& a, const __m512i& b) {
+        return _mm512_cmpeq_epi32_mask(a, b);
+    }
+
+    static __mmask8 cmp_neq(const __m512i& a, const __m512i& b) {
+        return _mm512_cmpneq_epi32_mask(a, b);
+    }
+
+    static __mmask8 cmp_gt(const __m512i& a, const __m512i& b) {
+        return _mm512_cmpgt_epi32_mask(a, b);
+    }
+
+    static __mmask8 cmp_geq(const __m512i& a, const __m512i& b) {
+        return _mm512_cmpge_epi32_mask(a, b);
+    }
+
+    static __mmask8 cmp_lt(const __m512i& a, const __m512i& b) {
+        return _mm512_cmplt_epi32_mask(a, b);
+    }
+
+    static __mmask8 cmp_leq(const __m512i& a, const __m512i& b) {
+        return _mm512_cmple_epi32_mask(a, b);
+    }
+
+    static __m512i ifelse(const __mmask8& m, const __m512i& u, const __m512i& v) {
+        return _mm512_mask_blend_epi32(m, v, u);
+    }
+
+    static __m512i max(const __m512i& a, const __m512i& b) {
+        return _mm512_max_epi32(a, b);
+    }
+
+    static __m512i min(const __m512i& a, const __m512i& b) {
+        return _mm512_min_epi32(a, b);
+    }
+
+    static __m512i abs(const __m512i& a) {
+        return _mm512_abs_epi32(a);
+    }
+
+    // Gather and scatter of 32-bit values using a generic 8-wide integer index implementation.
+
+    template <typename Impl>
+    using is_int8_simd = std::integral_constant<bool, std::is_same<int, typename Impl::scalar_type>::value && Impl::width==8>;
+
+    template <typename ImplIndex,
+              typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static __m512i gather(ImplIndex, const int32* p, const typename ImplIndex::vector_type& index) {
+        int32 o[16];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m512i*>(o);
+        return _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), lo(), _mm512_loadu_si512(op), p, 4);
+    }
+
+    template <typename ImplIndex,
+              typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static __m512i gather(ImplIndex, __m512i a, const int32* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) {
+        int32 o[16];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m512i*>(o);
+        return _mm512_mask_i32gather_epi32(a, mask, _mm512_loadu_si512(op), p, 4);
+    }
+
+    template <typename ImplIndex,
+              typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static void scatter(ImplIndex, const __m512i& s, int32* p, const typename ImplIndex::vector_type& index) {
+        int32 o[16];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m512i*>(o);
+        _mm512_mask_i32scatter_epi32(p, lo(), _mm512_loadu_si512(op), s, 4);
+    }
+
+    template <typename ImplIndex,
+              typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static void scatter(ImplIndex, const __m512i& s, int32* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) {
+        int32 o[16];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m512i*>(o);
+        _mm512_mask_i32scatter_epi32(p, mask, _mm512_loadu_si512(op), s, 4);
+    }
+
+    // Specialized 8-wide gather and scatter for avx512_int8 implementation.
+
+    static __m512i gather(avx512_int8, const int32* p, const __m512i& index) {
+        return _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), lo(), index, p, 4);
+    }
+
+    static __m512i gather(avx512_int8, __m512i a, const int32* p, const __m512i& index, const __mmask8& mask) {
+        return _mm512_mask_i32gather_epi32(a, mask, index, p, 4);
+    }
+
+    static void scatter(avx512_int8, const __m512i& s, int32* p, const __m512i& index) {
+        _mm512_mask_i32scatter_epi32(p, lo(), index, s, 4);
+    }
+
+    static void scatter(avx512_int8, const __m512i& s, int32* p, const __m512i& index, const __mmask8& mask) {
+        _mm512_mask_i32scatter_epi32(p, mask, index, s, 4);
+    }
+};
+
+struct avx512_double8: implbase<avx512_double8> {
+    // Use default implementations for:
+    //     element, set_element.
+
+    // CMPPD predicates:
+    static constexpr int cmp_eq_oq =    0;
+    static constexpr int cmp_unord_q =  3;
+    static constexpr int cmp_neq_uq =   4;
+    static constexpr int cmp_true_uq = 15;
+    static constexpr int cmp_lt_oq =   17;
+    static constexpr int cmp_le_oq =   18;
+    static constexpr int cmp_nge_uq =  25;
+    static constexpr int cmp_ge_oq =   29;
+    static constexpr int cmp_gt_oq =   30;
+
+    static __m512d broadcast(double v) {
+        return _mm512_set1_pd(v);
+    }
+
+    static void copy_to(const __m512d& v, double* p) {
+        _mm512_storeu_pd(p, v);
+    }
+
+    static void copy_to_masked(const __m512d& v, double* p, const __mmask8& mask) {
+        _mm512_mask_storeu_pd(p, mask, v);
+    }
+
+    static __m512d copy_from(const double* p) {
+        return _mm512_loadu_pd(p);
+    }
+
+    static __m512d copy_from_masked(const double* p, const __mmask8& mask) {
+        return _mm512_maskz_loadu_pd(mask, p);
+    }
+
+    static __m512d copy_from_masked(const __m512d& v, const double* p, const __mmask8& mask) {
+        return _mm512_mask_loadu_pd(v, mask, p);
+    }
+
+    static __m512d negate(const __m512d& a) {
+        return _mm512_sub_pd(_mm512_setzero_pd(), a);
+    }
+
+    static __m512d add(const __m512d& a, const __m512d& b) {
+        return _mm512_add_pd(a, b);
+    }
+
+    static __m512d sub(const __m512d& a, const __m512d& b) {
+        return _mm512_sub_pd(a, b);
+    }
+
+    static __m512d mul(const __m512d& a, const __m512d& b) {
+        return _mm512_mul_pd(a, b);
+    }
+
+    static __m512d div(const __m512d& a, const __m512d& b) {
+        return _mm512_div_pd(a, b);
+    }
+
+    static __m512d fma(const __m512d& a, const __m512d& b, const __m512d& c) {
+        return _mm512_fmadd_pd(a, b, c);
+    }
+
+    static __mmask8 cmp_eq(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_eq_oq);
+    }
+
+    static __mmask8 cmp_neq(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_neq_uq);
+    }
+
+    static __mmask8 cmp_gt(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_gt_oq);
+    }
+
+    static __mmask8 cmp_geq(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_ge_oq);
+    }
+
+    static __mmask8 cmp_lt(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_lt_oq);
+    }
+
+    static __mmask8 cmp_leq(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_le_oq);
+    }
+
+    static __m512d ifelse(const __mmask8& m, const __m512d& u, const __m512d& v) {
+        return _mm512_mask_blend_pd(m, v, u);
+    }
+
+    static __m512d max(const __m512d& a, const __m512d& b) {
+        return _mm512_max_pd(a, b);
+    }
+
+    static __m512d min(const __m512d& a, const __m512d& b) {
+        return _mm512_min_pd(a, b);
+    }
+
+    static __m512d abs(const __m512d& x) {
+        // (NB: _mm512_abs_pd() wrapper buggy on gcc, so use AND directly.)
+        __m512i m = _mm512_set1_epi64(0x7fffffffffffffff);
+        return _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(x), m));
+    }
+
+    // Gather and scatter of doubles using a generic 8-wide integer index implementation.
+
+    template <typename Impl>
+    using is_int8_simd = std::integral_constant<bool, std::is_same<int, typename Impl::scalar_type>::value && Impl::width==8>;
+
+    template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static __m512d gather(ImplIndex, const double* p, const typename ImplIndex::vector_type& index) {
+        int o[8];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m256i*>(o);
+        return _mm512_i32gather_pd(_mm256_loadu_si256(op), p, 8);
+    }
+
+    template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static __m512d gather(ImplIndex, __m512d a, const double* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) {
+        int o[8];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m256i*>(o);
+        return _mm512_mask_i32gather_pd(a, mask, _mm256_loadu_si256(op), p, 8);
+    }
+
+    template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static void scatter(ImplIndex, const __m512d& s, double* p, const typename ImplIndex::vector_type& index) {
+        int o[8];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m256i*>(o);
+        _mm512_i32scatter_pd(p, _mm256_loadu_si256(op), s, 8);
+    }
+
+    template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static void scatter(ImplIndex, const __m512d& s, double* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) {
+        int o[8];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m256i*>(o);
+        _mm512_mask_i32scatter_pd(p, mask, _mm256_loadu_si256(op), s, 8);
+    }
+
+    // Specialized 8-wide gather and scatter for avx512_int8 implementation.
+
+    static __m512d gather(avx512_int8, const double* p, const __m512i& index) {
+        return _mm512_i32gather_pd(_mm512_castsi512_si256(index), p, 8);
+    }
+
+    static __m512d gather(avx512_int8, __m512d a, const double* p, const __m512i& index, const __mmask8& mask) {
+        return _mm512_mask_i32gather_pd(a, mask, _mm512_castsi512_si256(index), p, 8);
+    }
+
+    static void scatter(avx512_int8, const __m512d& s, double* p, const __m512i& index) {
+        _mm512_i32scatter_pd(p, _mm512_castsi512_si256(index), s, 8);
+    }
+
+    static void scatter(avx512_int8, const __m512d& s, double* p, const __m512i& index, const __mmask8& mask) {
+        _mm512_mask_i32scatter_pd(p, mask, _mm512_castsi512_si256(index), s, 8);
+    }
+
+    // Use SVML for exp and log if compiling with icpc, else use ratpoly
+    // approximations.
+
+#if defined(__INTEL_COMPILER)
+    static  __m512d exp(const __m512d& x) {
+        return _mm512_exp_pd(x);
+    }
+
+    static  __m512d expm1(const __m512d& x) {
+        return _mm512_expm1_pd(x);
+    }
+
+    static __m512d log(const __m512d& x) {
+        return _mm512_log_pd(x);
+    }
+
+    static __m512d pow(const __m512d& x, const __m512d& y) {
+        return _mm512_pow_pd(x, y);
+    }
+#else
+
+    // Refer to avx/avx2 code for details of the exponential and log
+    // implementations.
+
+    static  __m512d exp(const __m512d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(exp_minarg));
+
+        // Compute n and g.
+
+        auto n = _mm512_floor_pd(add(mul(broadcast(ln2inv), x), broadcast(0.5)));
+
+        auto g = fma(n, broadcast(-ln2C1), x);
+        g = fma(n, broadcast(-ln2C2), g);
+
+        auto gg = mul(g, g);
+
+        // Compute g*P(g^2) and Q(g^2).
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        // Compute R(g)/R(-g) = 1 + 2*g*P(g^2) / (Q(g^2)-g*P(g^2))
+
+        auto expg = fma(broadcast(2), div(odd, sub(even, odd)), broadcast(1));
+
+        // Scale by 2^n, propagating NaNs.
+
+        auto result = _mm512_scalef_pd(expg, n);
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(0),
+                   result));
+    }
+
+    static  __m512d expm1(const __m512d& x) {
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(expm1_minarg));
+
+        auto half = broadcast(0.5);
+        auto one = broadcast(1.);
+
+        auto nnz = cmp_gt(abs(x), half);
+        auto n = _mm512_maskz_roundscale_round_pd(
+                    nnz,
+                    mul(broadcast(ln2inv), x),
+                    0,
+                    _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
+
+        auto g = fma(n, broadcast(-ln2C1), x);
+        g = fma(n, broadcast(-ln2C2), g);
+
+        auto gg = mul(g, g);
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        // Compute R(g)/R(-g) -1 = 2*g*P(g^2) / (Q(g^2)-g*P(g^2))
+
+        auto expgm1 = mul(broadcast(2), div(odd, sub(even, odd)));
+
+        // For small x (n zero), bypass the scaling step to avoid underflow.
+        // Otherwise, compute the result 2^n * expgm1 + (2^n-1) as
+        //     result = 2 * ( 2^(n-1)*expgm1 + (2^(n-1)-0.5) )
+        // so that the intermediate 2^(n-1) does not overflow when n=1024.
+
+        auto nm1 = sub(n, one);
+
+        auto result =
+            _mm512_scalef_pd(
+                add(sub(_mm512_scalef_pd(one, nm1), half),
+                    _mm512_scalef_pd(expgm1, nm1)),
+                one);
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-1),
+            ifelse(nnz, result, expgm1)));
+    }
+
+    static __m512d log(const __m512d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_geq(x, broadcast(HUGE_VAL));
+        auto is_small = cmp_lt(x, broadcast(log_minarg));
+        is_small = avx512_mask8::logical_and(is_small, cmp_geq(x, broadcast(0)));
+
+        __m512d g = _mm512_getexp_pd(x);
+        __m512d u = _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_nan);
+
+        __m512d one = broadcast(1.);
+        __m512d half = broadcast(0.5);
+        auto gtsqrt2 = cmp_geq(u, broadcast(sqrt2));
+        g = ifelse(gtsqrt2, add(g, one), g);
+        u = ifelse(gtsqrt2, mul(u, half), u);
+
+        auto z = sub(u, one);
+        auto pz = horner(z, P0log, P1log, P2log, P3log, P4log, P5log);
+        auto qz = horner1(z, Q0log, Q1log, Q2log, Q3log, Q4log);
+
+        auto z2 = mul(z, z);
+        auto z3 = mul(z2, z);
+
+        auto r = div(mul(z3, pz), qz);
+        r = fma(g,  broadcast(ln2C4), r);
+        r = fms(z2, half, r);
+        r = sub(z, r);
+        r = fma(g,  broadcast(ln2C3), r);
+
+        // r is already NaN if x is NaN or negative; otherwise
+        // return +inf if x is +inf, or -inf if x is zero or a (positive) denormal.
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-HUGE_VAL),
+                r));
+    }
+#endif
+
+protected:
+    static inline __m512d horner1(__m512d x, double a0) {
+        return add(x, broadcast(a0));
+    }
+
+    static inline __m512d horner(__m512d x, double a0) {
+        return broadcast(a0);
+    }
+
+    template <typename... T>
+    static __m512d horner(__m512d x, double a0, T... tail) {
+        return fma(x, horner(x, tail...), broadcast(a0));
+    }
+
+    template <typename... T>
+    static __m512d horner1(__m512d x, double a0, T... tail) {
+        return fma(x, horner1(x, tail...), broadcast(a0));
+    }
+
+    static __m512d fms(const __m512d& a, const __m512d& b, const __m512d& c) {
+        return _mm512_fmsub_pd(a, b, c);
+    }
+};
+
+} // namespace simd_detail
+
+namespace simd_abi {
+    template <typename T, unsigned N> struct avx512;
+    template <> struct avx512<double, 8> { using type = simd_detail::avx512_double8; };
+    template <> struct avx512<int, 8> { using type = simd_detail::avx512_int8; };
+} // namespace simd_abi
+
+} // namespace simd
+} // namespace arb
+
+#endif // def __AVX512F__
diff --git a/src/simd/generic.hpp b/src/simd/generic.hpp
new file mode 100644
index 00000000..e8ca5524
--- /dev/null
+++ b/src/simd/generic.hpp
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <array>
+#include <cstring>
+#include <cmath>
+
+#include <simd/implbase.hpp>
+
+namespace arb {
+namespace simd {
+namespace simd_detail {
+
+template <typename T, unsigned N>
+struct generic;
+
+template <typename T, unsigned N>
+struct simd_traits<generic<T, N>> {
+    static constexpr unsigned width = N;
+    using scalar_type = T;
+    using vector_type = std::array<T, N>;
+    using mask_impl = generic<bool, N>;
+};
+
+template <typename T, unsigned N>
+struct generic: implbase<generic<T, N>> {
+    using array = std::array<T, N>;
+
+    static array copy_from(const T* p) {
+        array result;
+        std::memcpy(&result, p, sizeof(result));
+        return result;
+    }
+
+    static void copy_to(const array& v, T* p) {
+        std::memcpy(p, &v, sizeof(v));
+    }
+
+    static void mask_copy_to(const array& v, bool* w) {
+        std::copy(v.begin(), v.end(), w);
+    }
+
+    static array mask_copy_from(const bool* y) {
+        array v;
+        std::copy(y, y+N, v.data());
+        return v;
+    }
+
+    static bool mask_element(const array& v, int i) {
+        return static_cast<bool>(v[i]);
+    }
+
+    static void mask_set_element(array& v, int i, bool x) {
+        v[i] = x;
+    }
+};
+
+} // namespace simd_detail
+
+namespace simd_abi {
+
+    template <typename T, unsigned N> struct generic {
+        using type = simd_detail::generic<T, N>;
+    };
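+
+    // For example, simd_abi::generic<double, 4>::type is a 4-wide double
+    // implementation whose vector_type is std::array<double, 4>, with all
+    // operations falling back to the lane-wise loops in implbase.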
+
+} // namespace simd_abi
+
+} // namespace simd
+} // namespace arb
diff --git a/src/simd/implbase.hpp b/src/simd/implbase.hpp
new file mode 100644
index 00000000..f5b968f9
--- /dev/null
+++ b/src/simd/implbase.hpp
@@ -0,0 +1,478 @@
+#pragma once
+
+// Base class (via CRTP) for concrete implementation
+// classes, with default implementations based on
+// copy_to/copy_from.
+//
+// Also provides simd_detail::simd_traits type map.
+//
+// Maths functions are implemented in terms of
+// arithmetic primitives or lane-wise invocation of
+// std::math functions; specialized implementations
+// should be provided by the concrete implementation
+// where it is more efficient:
+//
+// Function | Default implementation by
+// ----------------------------------
+// min      | negate, cmp_gt, ifelse
+// max      | negate, cmp_gt, ifelse
+// abs      | negate, max
+// sin      | lane-wise std::sin
+// cos      | lane-wise std::cos
+// exp      | lane-wise std::exp
+// log      | lane-wise std::log
+// pow      | lane-wise std::pow
+// expm1    | lane-wise std::expm1
+// exprelr  | expm1, div, add, cmp_eq, ifelse
+//
+// 'exprelr' is the function x ↦ x/(exp(x)-1).
+
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <iterator>
+
+// Derived class I must at minimum provide:
+//
+// * specialization of simd_traits.
+//
+// * implementations (static) for copy_to, copy_from:
+//
+//     void I::copy_to(const vector_type&, scalar_type*)
+//     vector_type I::copy_from(const scalar_type*)
+//
+//     void I::mask_copy_to(const vector_type&, scalar_type*)
+//     vector_type I::mask_copy_from(const bool*)
+//
+// * implementations (static) for mask element get/set:
+//
+//     bool I::mask_element(const vector_type& v, int i);
+//     void I::mask_set_element(vector_type& v, int i, bool x);
+
+namespace arb {
+namespace simd {
+namespace simd_detail {
+
+// The simd_traits class provides the mapping between a concrete SIMD
+// implementation I and its associated classes. This must be specialized
+// for each concrete implementation.
+
+template <typename I>
+struct simd_traits {
+    static constexpr unsigned width = 0;
+    using scalar_type = void;
+    using vector_type = void;
+    using mask_impl = void;
+};
+
+template <typename I>
+struct implbase {
+    constexpr static unsigned width = simd_traits<I>::width;
+    using scalar_type = typename simd_traits<I>::scalar_type;
+    using vector_type = typename simd_traits<I>::vector_type;
+
+    using mask_impl = typename simd_traits<I>::mask_impl;
+    using mask_type = typename simd_traits<mask_impl>::vector_type;
+
+    using store = scalar_type[width];
+    using mask_store = bool[width];
+
+    static vector_type broadcast(scalar_type x) {
+        store a;
+        std::fill(std::begin(a), std::end(a), x);
+        return I::copy_from(a);
+    }
+
+    static scalar_type element(const vector_type& v, int i) {
+        store a;
+        I::copy_to(v, a);
+        return a[i];
+    }
+
+    static void set_element(vector_type& v, int i, scalar_type x) {
+        store a;
+        I::copy_to(v, a);
+        a[i] = x;
+        v = I::copy_from(a);
+    }
+
+    static void copy_to_masked(const vector_type& v, scalar_type* p, const mask_type& mask) {
+        store a;
+        I::copy_to(v, a);
+
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+        for (unsigned i = 0; i<width; ++i) {
+            if (m[i]) p[i] = a[i];
+        }
+    }
+
+    static vector_type copy_from_masked(const scalar_type* p, const mask_type& mask) {
+        store a;
+
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+        for (unsigned i = 0; i<width; ++i) {
+            if (m[i]) a[i] = p[i];
+        }
+        return I::copy_from(a);
+    }
+
+    static vector_type copy_from_masked(const vector_type& v, const scalar_type* p, const mask_type& mask) {
+        store a;
+        I::copy_to(v, a);
+
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+        for (unsigned i = 0; i<width; ++i) {
+            if (m[i]) a[i] = p[i];
+        }
+        return I::copy_from(a);
+    }
+
+    static vector_type negate(const vector_type& u) {
+        store a, r;
+        I::copy_to(u, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = -a[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type add(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]+b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type mul(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]*b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type sub(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]-b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type div(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]/b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type fma(const vector_type& u, const vector_type& v, const vector_type& w) {
+        store a, b, c, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+        I::copy_to(w, c);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::fma(a[i], b[i], c[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static mask_type cmp_eq(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]==b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static mask_type cmp_neq(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]!=b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static mask_type cmp_gt(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]>b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static mask_type cmp_geq(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]>=b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static mask_type cmp_lt(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]<b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static mask_type cmp_leq(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]<=b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static vector_type logical_not(const vector_type& u) {
+        store a, r;
+        I::copy_to(u, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = !a[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type logical_and(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i] && b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type logical_or(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i] || b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type ifelse(const mask_type& mask, const vector_type& u, const vector_type& v) {
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = m[i]? a[i]: b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type mask_broadcast(bool v) {
+        mask_store m;
+        std::fill(std::begin(m), std::end(m), v);
+        return I::mask_copy_from(m);
+    }
+
+    static vector_type mask_unpack(unsigned long long k) {
+        mask_store m;
+        for (unsigned i = 0; i<width; ++i) {
+            m[i] = k&(1ull<<i);
+        }
+        return I::mask_copy_from(m);
+    }
+
+    template <typename ImplIndex>
+    static vector_type gather(ImplIndex, const scalar_type* p, const typename ImplIndex::vector_type& index) {
+        typename ImplIndex::scalar_type o[width];
+        ImplIndex::copy_to(index, o);
+
+        store a;
+        for (unsigned i = 0; i<width; ++i) {
+            a[i] = p[o[i]];
+        }
+        return I::copy_from(a);
+    }
+
+    template <typename ImplIndex>
+    static vector_type gather(ImplIndex, const vector_type& s, const scalar_type* p, const typename ImplIndex::vector_type& index, const mask_type& mask) {
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+
+        typename ImplIndex::scalar_type o[width];
+        ImplIndex::copy_to(index, o);
+
+        store a;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            if (m[i]) { a[i] = p[o[i]]; }
+        }
+        return I::copy_from(a);
+    };
+
+    template <typename ImplIndex>
+    static void scatter(ImplIndex, const vector_type& s, scalar_type* p, const typename ImplIndex::vector_type& index) {
+        typename ImplIndex::scalar_type o[width];
+        ImplIndex::copy_to(index, o);
+
+        store a;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            p[o[i]] = a[i];
+        }
+    }
+
+    template <typename ImplIndex>
+    static void scatter(ImplIndex, const vector_type& s, scalar_type* p, const typename ImplIndex::vector_type& index, const mask_type& mask) {
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+
+        typename ImplIndex::scalar_type o[width];
+        ImplIndex::copy_to(index, o);
+
+        store a;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            if (m[i]) { p[o[i]] = a[i]; }
+        }
+    }
+
+    // Maths
+
+    static vector_type abs(const vector_type& u) {
+        store a;
+        I::copy_to(u, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            a[i] = std::abs(a[i]);
+        }
+        return I::copy_from(a);
+    }
+
+    static vector_type min(const vector_type& s, const vector_type& t) {
+        return ifelse(cmp_gt(t, s), s, t);
+    }
+
+    static vector_type max(const vector_type& s, const vector_type& t) {
+        return ifelse(cmp_gt(t, s), t, s);
+    }
+
+    static vector_type sin(const vector_type& s) {
+        store a, r;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::sin(a[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type cos(const vector_type& s) {
+        store a, r;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::cos(a[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type exp(const vector_type& s) {
+        store a, r;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::exp(a[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type expm1(const vector_type& s) {
+        store a, r;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::expm1(a[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type log(const vector_type& s) {
+        store a, r;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::log(a[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type exprelr(const vector_type& s) {
+        vector_type ones = I::broadcast(1);
+        return ifelse(cmp_eq(ones, add(ones, s)), ones, div(s, expm1(s)));
+    }
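+
+    // Note: the cmp_eq guard above returns 1 whenever 1+s rounds to 1
+    // (|s| below about machine epsilon); s/(exp(s)-1) is 1 to working
+    // precision there, and evaluating 0/0 at s = 0 is avoided.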
+
+    static vector_type pow(const vector_type& s, const vector_type &t) {
+        store a, b, r;
+        I::copy_to(s, a);
+        I::copy_to(t, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::pow(a[i], b[i]);
+        }
+        return I::copy_from(r);
+    }
+};
+
+} // namespace simd_detail
+} // namespace simd
+} // namespace arb
diff --git a/src/simd/native.hpp b/src/simd/native.hpp
new file mode 100644
index 00000000..d71bccec
--- /dev/null
+++ b/src/simd/native.hpp
@@ -0,0 +1,93 @@
+#pragma once
+
+// Native SIMD representations based on architecture.
+
+// Predefined macros from the compiler are used to guard
+// architecture-specific code here and in the source code
+// for the concrete implementation classes.
+//
+// For x86 vector extensions, the same preprocessor defines
+// are used across gcc, clang and icpc: __AVX__, __AVX2__,
+// __FMA__, __AVX512F__.
+//
+// Note that the FMA extensions for x86 are strictly speaking
+// independent of AVX2, and instructing gcc (for example)
+// to generate AVX2 code with '-mavx2' will not enable FMA
+// instructions unless '-mfma' is also given. It is generally
+// safer to explicitly request the target using
+// '-march', '-mcpu' or '-x', depending on the compiler and
+// architecture.
+
+namespace arb {
+namespace simd {
+namespace simd_abi {
+
+template <typename Value, unsigned N>
+struct native {
+    using type = void;
+};
+
+} // namespace simd_abi
+} // namespace simd
+} // namespace arb
+
+#define ARB_DEF_NATIVE_SIMD_(T, N, A)\
+namespace arb {\
+namespace simd {\
+namespace simd_abi {\
+template <> struct native<T, N> {\
+    using type = typename A<T, N>::type;\
+};\
+}\
+}\
+}
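+
+// For example, ARB_DEF_NATIVE_SIMD_(double, 4, avx) expands to a
+// specialization equivalent to:
+//
+//     namespace arb { namespace simd { namespace simd_abi {
+//     template <> struct native<double, 4> {
+//         using type = typename avx<double, 4>::type;
+//     };
+//     }}}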
+
+
+#if defined(__AVX2__) && defined(__FMA__)
+
+#include <simd/avx.hpp>
+ARB_DEF_NATIVE_SIMD_(int, 4, avx2)
+ARB_DEF_NATIVE_SIMD_(double, 4, avx2)
+
+#elif defined(__AVX__)
+
+#include <simd/avx.hpp>
+ARB_DEF_NATIVE_SIMD_(int, 4, avx)
+ARB_DEF_NATIVE_SIMD_(double, 4, avx)
+
+#endif
+
+#if defined(__AVX512F__)
+
+#include <simd/avx512.hpp>
+ARB_DEF_NATIVE_SIMD_(int, 8, avx512)
+ARB_DEF_NATIVE_SIMD_(double, 8, avx512)
+
+#endif
+
+
+namespace arb {
+namespace simd {
+namespace simd_abi {
+
+// Define the native width for a value type as the width of the largest
+// native vector implementation of that type. Widths are presumed to be
+// powers of 2 and no larger than 64, the starting point of the search
+// below.
+
+template <typename Value, int k = 64>
+struct native_width;
+
+template <typename Value, int k>
+struct native_width {
+    static constexpr int value =
+        std::is_same<void, typename native<Value, k>::type>::value?
+        native_width<Value, k/2>::value:
+        k;
+};
+
+template <typename Value>
+struct native_width<Value, 1> {
+    static constexpr int value = 1;
+};
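+
+// For example, with only the 4-wide AVX/AVX2 double implementation defined,
+// native_width<double>::value recurses k = 64 -> 32 -> 16 -> 8 -> 4 and
+// stops at 4, the first width for which native<double, k>::type is not void.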
+
+} // namespace simd_abi
+} // namespace simd
+} // namespace arb
diff --git a/src/simd/simd.hpp b/src/simd/simd.hpp
new file mode 100644
index 00000000..a7a1c655
--- /dev/null
+++ b/src/simd/simd.hpp
@@ -0,0 +1,521 @@
+#pragma once
+
+#include <type_traits>
+
+#include <simd/implbase.hpp>
+#include <simd/generic.hpp>
+#include <simd/native.hpp>
+
+namespace arb {
+namespace simd {
+
+namespace simd_detail {
+    template <typename Impl>
+    struct simd_impl;
+
+    template <typename Impl>
+    struct simd_mask_impl;
+}
+
+// Forward declarations for top-level maths functions.
+// (these require access to private simd_impl<Impl>::wrap method).
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> abs(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> sin(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> cos(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> exp(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> log(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> expm1(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> exprelr(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> pow(const simd_detail::simd_impl<Impl>&, const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> min(const simd_detail::simd_impl<Impl>&, const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> max(const simd_detail::simd_impl<Impl>&, const simd_detail::simd_impl<Impl>&);
+
+namespace simd_detail {
+    template <typename Impl>
+    struct simd_impl {
+        // Type aliases:
+        //
+        //     scalar_type           internal value type in one simd lane,
+        //     simd_mask             simd_mask_impl specialization representing comparison results,
+        //
+        //     vector_type           underlying representation,
+        //     mask_type             underlying representation for mask.
+
+        using scalar_type = typename simd_traits<Impl>::scalar_type;
+        using simd_mask   = simd_mask_impl<typename simd_traits<Impl>::mask_impl>;
+
+    protected:
+        using vector_type = typename simd_traits<Impl>::vector_type;
+        using mask_type   = typename simd_traits<typename simd_traits<Impl>::mask_impl>::vector_type;
+
+    public:
+        static constexpr unsigned width = simd_traits<Impl>::width;
+
+        template <typename Other>
+        friend class simd_impl;
+
+        simd_impl() = default;
+
+        // Construct by filling with scalar value.
+        simd_impl(const scalar_type& x) {
+            value_ = Impl::broadcast(x);
+        }
+
+        // Construct from scalar values in memory.
+        explicit simd_impl(const scalar_type* p) {
+            value_ = Impl::copy_from(p);
+        }
+
+        // Construct from const array ref.
+        explicit simd_impl(const scalar_type (&a)[width]) {
+            value_ = Impl::copy_from(a);
+        }
+
+        // Construct from scalar values in memory with mask.
+        explicit simd_impl(const scalar_type* p, const simd_mask& m) {
+            value_ = Impl::copy_from_masked(p, m.value_);
+        }
+
+        // Construct from const array ref with mask.
+        explicit simd_impl(const scalar_type (&a)[width], const simd_mask& m) {
+            value_ = Impl::copy_from_masked(a, m.value_);
+        }
+
+        // Copy constructor.
+        simd_impl(const simd_impl& other) {
+            std::memcpy(&value_, &other.value_, sizeof(vector_type));
+        }
+
+        // Scalar assignment fills vector.
+        simd_impl& operator=(scalar_type x) {
+            value_ = Impl::broadcast(x);
+            return *this;
+        }
+
+        // Copy assignment.
+        simd_impl& operator=(const simd_impl& other) {
+            std::memcpy(&value_, &other.value_, sizeof(vector_type));
+            return *this;
+        }
+
+        // Read/write operations: copy_to, copy_from.
+
+        void copy_to(scalar_type* p) const {
+            Impl::copy_to(value_, p);
+        }
+
+        void copy_from(const scalar_type* p) {
+            value_ = Impl::copy_from(p);
+        }
+
+        // Arithmetic operations: +, -, *, /, fma.
+
+        simd_impl operator-() const {
+            return wrap(Impl::negate(value_));
+        }
+
+        friend simd_impl operator+(const simd_impl& a, simd_impl b) {
+            return simd_impl::wrap(Impl::add(a.value_, b.value_));
+        }
+
+        friend simd_impl operator-(const simd_impl& a, simd_impl b) {
+            return simd_impl::wrap(Impl::sub(a.value_, b.value_));
+        }
+
+        friend simd_impl operator*(const simd_impl& a, simd_impl b) {
+            return simd_impl::wrap(Impl::mul(a.value_, b.value_));
+        }
+
+        friend simd_impl operator/(const simd_impl& a, simd_impl b) {
+            return simd_impl::wrap(Impl::div(a.value_, b.value_));
+        }
+
+        friend simd_impl fma(const simd_impl& a, simd_impl b, simd_impl c) {
+            return simd_impl::wrap(Impl::fma(a.value_, b.value_, c.value_));
+        }
+
+        // Lane-wise relational operations.
+
+        friend simd_mask operator==(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_eq(a.value_, b.value_));
+        }
+
+        friend simd_mask operator!=(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_neq(a.value_, b.value_));
+        }
+
+        friend simd_mask operator<=(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_leq(a.value_, b.value_));
+        }
+
+        friend simd_mask operator<(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_lt(a.value_, b.value_));
+        }
+
+        friend simd_mask operator>=(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_geq(a.value_, b.value_));
+        }
+
+        friend simd_mask operator>(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_gt(a.value_, b.value_));
+        }
+
+        // Compound assignment operations: +=, -=, *=, /=.
+
+        simd_impl& operator+=(const simd_impl& x) {
+            value_ = Impl::add(value_, x.value_);
+            return *this;
+        }
+
+        simd_impl& operator-=(const simd_impl& x) {
+            value_ = Impl::sub(value_, x.value_);
+            return *this;
+        }
+
+        simd_impl& operator*=(const simd_impl& x) {
+            value_ = Impl::mul(value_, x.value_);
+            return *this;
+        }
+
+        simd_impl& operator/=(const simd_impl& x) {
+            value_ = Impl::div(value_, x.value_);
+            return *this;
+        }
+
+        // Gather and scatter.
+
+        template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type>
+        void gather(const scalar_type* p, const simd_impl<IndexImpl>& index) {
+            value_ = Impl::gather(IndexImpl{}, p, index.value_);
+        }
+
+        template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type>
+        void scatter(scalar_type* p, const simd_impl<IndexImpl>& index) {
+            Impl::scatter(IndexImpl{}, value_, p, index.value_);
+        }
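+
+        // E.g. for an index vector `idx` of matching width, v.gather(p, idx)
+        // sets lane i of v to p[idx[i]], and v.scatter(p, idx) writes lane i
+        // of v back to p[idx[i]].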
+
+        // Array subscript operations.
+
+        struct reference {
+            reference() = delete;
+            reference(const reference&) = default;
+            reference& operator=(const reference&) = delete;
+
+            reference(vector_type& value, int i):
+                ptr_(&value), i(i) {}
+
+            reference& operator=(scalar_type v) && {
+                Impl::set_element(*ptr_, i, v);
+                return *this;
+            }
+
+            operator scalar_type() const {
+                return Impl::element(*ptr_, i);
+            }
+
+            vector_type* ptr_;
+            int i;
+        };
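+
+        // Note: reference assignment is rvalue-reference qualified, so lane
+        // writes take the form s[i] = v on the temporary proxy returned by
+        // operator[].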
+
+        reference operator[](int i) {
+            return reference(value_, i);
+        }
+
+        scalar_type operator[](int i) const {
+            return Impl::element(value_, i);
+        }
+
+        // Masked assignment (via where expressions).
+
+        struct where_expression {
+            where_expression(const where_expression&) = default;
+            where_expression& operator=(const where_expression&) = delete;
+
+            where_expression(const simd_mask& m, simd_impl& s):
+                mask_(m), data_(s) {}
+
+            where_expression& operator=(scalar_type v) {
+                data_.value_ = Impl::ifelse(mask_.value_, simd_impl(v).value_, data_.value_);
+                return *this;
+            }
+
+            where_expression& operator=(const simd_impl& v) {
+                data_.value_ = Impl::ifelse(mask_.value_, v.value_, data_.value_);
+                return *this;
+            }
+
+            void copy_to(scalar_type* p) const {
+                Impl::copy_to_masked(data_.value_, p, mask_.value_);
+            }
+
+            void copy_from(const scalar_type* p) {
+                data_.value_ = Impl::copy_from_masked(data_.value_, p, mask_.value_);
+            }
+
+            template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type>
+            void gather(const scalar_type* p, const simd_impl<IndexImpl>& index) {
+                data_.value_ = Impl::gather(IndexImpl{}, data_.value_, p, index.value_, mask_.value_);
+            }
+
+            template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type>
+            void scatter(scalar_type* p, const simd_impl<IndexImpl>& index) {
+                Impl::scatter(IndexImpl{}, data_.value_, p, index.value_, mask_.value_);
+            }
+
+        private:
+            const simd_mask& mask_;
+            simd_impl& data_;
+        };
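+
+        // where_expression objects are obtained via the top-level where()
+        // function; e.g. where(m, x) = y assigns the lanes of y to x only
+        // where the mask m is true.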
+
+        // Maths functions are implemented as top-level functions, but require
+        // access to `wrap`.
+
+        friend simd_impl abs<Impl>(const simd_impl&);
+        friend simd_impl sin<Impl>(const simd_impl&);
+        friend simd_impl cos<Impl>(const simd_impl&);
+        friend simd_impl exp<Impl>(const simd_impl&);
+        friend simd_impl log<Impl>(const simd_impl&);
+        friend simd_impl expm1<Impl>(const simd_impl&);
+        friend simd_impl exprelr<Impl>(const simd_impl&);
+        friend simd_impl min<Impl>(const simd_impl&, const simd_impl&);
+        friend simd_impl max<Impl>(const simd_impl&, const simd_impl&);
+        friend simd_impl pow<Impl>(const simd_impl&, const simd_impl&);
+
+    protected:
+        vector_type value_;
+        simd_impl(const vector_type& x) {
+            value_ = x;
+        }
+
+    private:
+        static simd_impl wrap(const vector_type& v) {
+            simd_impl s;
+            s.value_ = v;
+            return s;
+        }
+
+        static simd_mask mask(const mask_type& v) {
+            simd_mask m;
+            m.value_ = v;
+            return m;
+        }
+    };
+
+    template <typename Impl>
+    struct simd_mask_impl: simd_impl<Impl> {
+        using base = simd_impl<Impl>;
+        using typename base::vector_type;
+        using typename base::scalar_type;
+        using base::width;
+        using base::value_;
+
+        simd_mask_impl() = default;
+
+        // Construct by filling with scalar value.
+        simd_mask_impl(bool b) {
+            value_ = Impl::mask_broadcast(b);
+        }
+
+        // Scalar assignment fills vector.
+        simd_mask_impl& operator=(bool b) {
+            value_ = Impl::mask_broadcast(b);
+            return *this;
+        }
+
+        // Construct from bool values in memory.
+        explicit simd_mask_impl(const bool* y) {
+            value_ = Impl::mask_copy_from(y);
+        }
+
+        // Construct from const array ref.
+        explicit simd_mask_impl(const bool (&a)[width]) {
+            value_ = Impl::mask_copy_from(&a[0]);
+        }
+
+        // Copy assignment.
+        simd_mask_impl& operator=(const simd_mask_impl& other) {
+            std::memcpy(&value_, &other.value_, sizeof(vector_type));
+            return *this;
+        }
+
+        // Read/write bool operations: copy_to, copy_from.
+
+        void copy_to(bool* w) const {
+            Impl::mask_copy_to(value_, w);
+        }
+
+        void copy_from(const bool* y) {
+            value_ = Impl::mask_copy_from(y);
+        }
+
+        // Array subscript operations.
+
+        struct reference {
+            reference() = delete;
+            reference(const reference&) = default;
+            reference& operator=(const reference&) = delete;
+
+            reference(vector_type& value, int i):
+                ptr_(&value), i(i) {}
+
+            reference& operator=(bool v) && {
+                Impl::mask_set_element(*ptr_, i, v);
+                return *this;
+            }
+
+            operator bool() const {
+                return Impl::mask_element(*ptr_, i);
+            }
+
+            vector_type* ptr_;
+            int i;
+        };
+
+        reference operator[](int i) {
+            return reference(value_, i);
+        }
+
+        bool operator[](int i) const {
+            return Impl::mask_element(value_, i);
+        }
+
+        // Logical operations.
+
+        simd_mask_impl operator!() const {
+            return simd_mask_impl::wrap(Impl::logical_not(value_));
+        }
+
+        friend simd_mask_impl operator&&(const simd_mask_impl& a, const simd_mask_impl& b) {
+            return simd_mask_impl::wrap(Impl::logical_and(a.value_, b.value_));
+        }
+
+        friend simd_mask_impl operator||(const simd_mask_impl& a, const simd_mask_impl& b) {
+            return simd_mask_impl::wrap(Impl::logical_or(a.value_, b.value_));
+        }
+
+        // Make mask from corresponding bits of integer.
+
+        static simd_mask_impl unpack(unsigned long long bits) {
+            return simd_mask_impl::wrap(Impl::mask_unpack(bits));
+        }
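+
+        // E.g. for a 4-wide mask, unpack(0b0101) sets lanes 0 and 2 and
+        // clears lanes 1 and 3.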
+
+    private:
+        simd_mask_impl(const vector_type& v): base(v) {}
+
+        template <class> friend class simd_impl;
+
+        static simd_mask_impl wrap(const vector_type& v) {
+            simd_mask_impl m;
+            m.value_ = v;
+            return m;
+        }
+    };
+} // namespace simd_detail
+
+namespace simd_abi {
+    // Note: `simd_abi::native` template class defined in `simd/native.hpp`,
+    // `simd_abi::generic` in `simd/generic.hpp`.
+
+    template <typename Value, unsigned N>
+    struct default_abi {
+        using type = typename std::conditional<
+            std::is_same<void, typename native<Value, N>::type>::value,
+            typename generic<Value, N>::type,
+            typename native<Value, N>::type>::type;
+    };
+}
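+
+// For example, when compiled with AVX2 and FMA support, default_abi<double, 4>
+// selects the AVX2 implementation, while a width with no native implementation,
+// such as default_abi<double, 16>, falls back to the generic implementation.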
+
+template <typename Value, unsigned N, template <class, unsigned> class Abi = simd_abi::default_abi>
+using simd = simd_detail::simd_impl<typename Abi<Value, N>::type>;
+
+template <typename Value, unsigned N>
+using simd_mask = typename simd<Value, N>::simd_mask;
+
+template <typename Simd>
+using where_expression = typename Simd::where_expression;
+
+template <typename Simd>
+where_expression<Simd> where(const typename Simd::simd_mask& m, Simd& v) {
+    return where_expression<Simd>(m, v);
+}
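+
+// Illustrative use of the public interface (a sketch only; `p` is a
+// hypothetical array of at least four doubles):
+//
+//     using dbl4 = simd<double, 4>;
+//
+//     dbl4 x(p);                     // load four doubles from p
+//     dbl4 y = fma(x, x, dbl4(1.));  // lane-wise y = x*x + 1
+//     where(x < 0., y) = 0.;         // zero the lanes where x is negative
+//     y.copy_to(p);                  // store the result back to p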
+
+template <typename>
+struct is_simd: std::false_type {};
+
+template <typename Impl>
+struct is_simd<simd_detail::simd_impl<Impl>>: std::true_type {};
+
+// Top-level maths functions: forward to underlying Impl.
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> abs(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::abs(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> sin(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::sin(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> cos(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::cos(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> exp(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::exp(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> log(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::log(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> expm1(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::expm1(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> exprelr(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::exprelr(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> pow(const simd_detail::simd_impl<Impl>& s, const simd_detail::simd_impl<Impl>& t) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::pow(s.value_, t.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> min(const simd_detail::simd_impl<Impl>& s, const simd_detail::simd_impl<Impl>& t) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::min(s.value_, t.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> max(const simd_detail::simd_impl<Impl>& s, const simd_detail::simd_impl<Impl>& t) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::max(s.value_, t.value_));
+}
+
+} // namespace simd
+} // namespace arb
diff --git a/src/util/debug.hpp b/src/util/debug.hpp
index 214e2269..dfc55392 100644
--- a/src/util/debug.hpp
+++ b/src/util/debug.hpp
@@ -2,9 +2,14 @@
 
 #include <iostream>
 #include <sstream>
+#include <cstddef>
+#include <string>
 #include <mutex>
 #include <utility>
 
+extern "C" {
+#include <endian.h>
+}
+
 #include <threading/threading.hpp>
 #include "unwind.hpp"
 
@@ -84,16 +89,85 @@ namespace impl {
             return out;
         }
     };
+
+    enum class endian {
+        little = __ORDER_LITTLE_ENDIAN__,
+        big = __ORDER_BIG_ENDIAN__,
+        native = __BYTE_ORDER__
+    };
+
+    // Wrapper for emitting values on an ostream as a sequence of hex digits.
+    struct hexdump_inline_wrap {
+        const unsigned char* from;
+        std::size_t size;
+        unsigned width;
+
+        friend std::ostream& operator<<(std::ostream& out, const hexdump_inline_wrap& h) {
+            using std::ptrdiff_t;
+
+            constexpr bool little = endian::native==endian::little;
+            ptrdiff_t width = h.width;
+            const unsigned char* from = h.from;
+            const unsigned char* end = h.from+h.size;
+            std::string buf;
+
+            auto emit = [&buf](unsigned char c) {
+                const char* digit = "0123456789abcdef";
+                buf += digit[(c>>4)&0xf];
+                buf += digit[c&0xf];
+            };
+
+            constexpr unsigned bufsz = 512;
+            unsigned bufmargin = 4*width+1;
+
+            buf.reserve(bufsz);
+            while (end-from>width) {
+                if (buf.size()+bufmargin>=bufsz) {
+                    out << buf;
+                    buf.clear();
+                }
+                for (ptrdiff_t i = 0; i<width; ++i) {
+                    emit(little? from[width-i-1]: from[i]);
+                }
+                from += width;
+                buf += ' ';
+            }
+            ptrdiff_t n = end-from;
+            for (ptrdiff_t i = 0; i<n; ++i) {
+                emit(little? from[n-i-1]: from[i]);
+            }
+
+            out << buf;
+            return out;
+        }
+    };
 }
 
 // Wrap a sequence or container of values so that they can be printed
 // to an `std::ostream` with the elements separated by the supplied 
 // separator.
+
 template <typename Seq, typename Separator>
 impl::sepval<Seq, Separator> sepval(const Seq& seq, Separator sep) {
     return impl::sepval<Seq, Separator>(seq, std::move(sep));
 }
 
+template <typename Seq>
+impl::sepval<Seq, const char*> csv(const Seq& seq) {
+    return sepval(seq, ", ");
+}
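+
+// For example, std::cout << csv(std::vector<int>{1, 2, 3}) prints "1, 2, 3".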
+
+// Dump something in hex (inline representation).
+
+template <typename T>
+impl::hexdump_inline_wrap hexdump(const T& obj, unsigned width = 4) {
+    return impl::hexdump_inline_wrap{reinterpret_cast<const unsigned char*>(&obj), sizeof obj, width};
+}
+
+template <typename T>
+impl::hexdump_inline_wrap hexdump_n(const T* ptr, std::size_t n, unsigned width = 4) {
+    return impl::hexdump_inline_wrap{reinterpret_cast<const unsigned char*>(ptr), n, width};
+}
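+
+// E.g. std::cout << hexdump(x) emits the bytes of x as hex digits, grouped
+// into `width`-byte words with the most significant byte first.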
+
 } // namespace util
 } // namespace arb
 
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index f2efc570..fe018f93 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -70,6 +70,7 @@ set(TEST_SOURCES
     test_segment.cpp
     test_schedule.cpp
     test_rss_cell.cpp
+    test_simd.cpp
     test_span.cpp
     test_spikes.cpp
     test_spike_store.cpp
diff --git a/tests/unit/common.hpp b/tests/unit/common.hpp
index 70eace9a..ad6c6873 100644
--- a/tests/unit/common.hpp
+++ b/tests/unit/common.hpp
@@ -40,7 +40,7 @@ struct null_terminated_t {
 
 constexpr null_terminated_t null_terminated;
 
-// wrap a value type, with copy operations disabled
+// Wrap a value type, with copy operations disabled.
 
 template <typename V>
 struct nocopy {
@@ -81,7 +81,7 @@ int nocopy<V>::move_ctor_count;
 template <typename V>
 int nocopy<V>::move_assign_count;
 
-// wrap a value type, with move operations disabled
+// Wrap a value type, with move operations disabled.
 
 template <typename V>
 struct nomove {
@@ -138,12 +138,13 @@ template <typename FPType, typename Seq1, typename Seq2>
     for (std::size_t j = 0; i1!=e1 && i2!=e2; ++i1, ++i2, ++j) {
         using FP = testing::internal::FloatingPoint<FPType>;
 
-        // cast to FPType to avoid warnings about lowering conversion
-        // if FPType has lower precision than Seq{12}::value_type
         auto v1 = *i1;
         auto v2 = *i2;
 
-        if (!FP{v1}.AlmostEquals(FP{v2})) {
+        // Cast to FPType to avoid warnings about lowering conversion
+        // if FPType has lower precision than Seq{12}::value_type.
+
+        if (!(std::isnan(v1) && std::isnan(v2)) && !FP{v1}.AlmostEquals(FP{v2})) {
             return ::testing::AssertionFailure() << "floating point numbers " << v1 << " and " << v2 << " differ at index " << j;
         }
 
@@ -155,6 +156,59 @@ template <typename FPType, typename Seq1, typename Seq2>
     return ::testing::AssertionSuccess();
 }
 
+template <typename V>
+inline bool generic_isnan(const V& x) { return false; }
+inline bool generic_isnan(float x) { return std::isnan(x); }
+inline bool generic_isnan(double x) { return std::isnan(x); }
+inline bool generic_isnan(long double x) { return std::isnan(x); }
+
+template <typename U, typename V>
+static bool equiv(const U& u, const V& v) {
+    return u==v || (generic_isnan(u) && generic_isnan(v));
+}
+
+template <typename Seq1, typename Seq2>
+::testing::AssertionResult seq_eq(Seq1&& seq1, Seq2&& seq2) {
+    using std::begin;
+    using std::end;
+
+    auto i1 = begin(seq1);
+    auto i2 = begin(seq2);
+
+    auto e1 = end(seq1);
+    auto e2 = end(seq2);
+
+    for (std::size_t j = 0; i1!=e1 && i2!=e2; ++i1, ++i2, ++j) {
+        auto v1 = *i1;
+        auto v2 = *i2;
+
+        if (!equiv(v1, v2)) {
+            return ::testing::AssertionFailure() << "values " << v1 << " and " << v2 << " differ at index " << j;
+        }
+    }
+
+    if (i1!=e1 || i2!=e2) {
+        return ::testing::AssertionFailure() << "sequences differ in length";
+    }
+    return ::testing::AssertionSuccess();
+}
+
+// Assert elements 0..n-1 inclusive of two indexed collections are exactly equal.
+template <typename Arr1, typename Arr2>
+::testing::AssertionResult indexed_eq_n(int n, Arr1&& a1, Arr2&& a2) {
+    for (int i = 0; i<n; ++i) {
+        auto v1 = a1[i];
+        auto v2 = a2[i];
+
+        if (!equiv(v1,v2)) {
+            return ::testing::AssertionFailure() << "values " << v1 << " and " << v2 << " differ at index " << i;
+        }
+    }
+
+    return ::testing::AssertionSuccess();
+}
+
+
 // Assert two floating point values are within a relative tolerance.
 inline ::testing::AssertionResult near_relative(double a, double b, double relerr) {
     double tol = relerr*std::max(std::abs(a), std::abs(b));
diff --git a/tests/unit/test_simd.cpp b/tests/unit/test_simd.cpp
new file mode 100644
index 00000000..f22de961
--- /dev/null
+++ b/tests/unit/test_simd.cpp
@@ -0,0 +1,1001 @@
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <random>
+
+#include <simd/simd.hpp>
+#include <simd/avx.hpp>
+
+#include "common.hpp"
+
+using namespace arb::simd;
+
+namespace {
+    // Use different distributions in `fill_random`, based on the value type in question:
+    //
+    //     * floating point type => uniform_real_distribution, default interval [-1, 1).
+    //     * bool                => uniform_int_distribution, default interval [0, 1].
+    //     * other integral type => uniform_int_distribution, default interval [L, U]
+    //                              such that L^2+L and U^2+U fit within the integer
+    //                              range, so that products and fma results computed
+    //                              in the arithmetic tests cannot overflow.
+
+    template <typename V, typename = typename std::enable_if<std::is_floating_point<V>::value>::type>
+    std::uniform_real_distribution<V> make_udist(V lb = -1., V ub = 1.) {
+        return std::uniform_real_distribution<V>(lb, ub);
+    }
+
+    template <typename V, typename = typename std::enable_if<std::is_integral<V>::value && !std::is_same<V, bool>::value>::type>
+    std::uniform_int_distribution<V> make_udist(
+            V lb = std::numeric_limits<V>::lowest() / (2 << std::numeric_limits<V>::digits/2),
+            V ub = std::numeric_limits<V>::max() >> (1+std::numeric_limits<V>::digits/2))
+    {
+        return std::uniform_int_distribution<V>(lb, ub);
+    }
+
+    template <typename V, typename = typename std::enable_if<std::is_same<V, bool>::value>::type>
+    std::uniform_int_distribution<> make_udist(V lb = 0, V ub = 1) {
+        return std::uniform_int_distribution<>(lb, ub);
+    }
+
+    template <typename Seq, typename Rng>
+    void fill_random(Seq&& seq, Rng& rng) {
+        using V = typename std::decay<decltype(*std::begin(seq))>::type;
+
+        auto u = make_udist<V>();
+        for (auto& x: seq) { x = u(rng); }
+    }
+
+    template <typename Seq, typename Rng, typename B1, typename B2>
+    void fill_random(Seq&& seq, Rng& rng, B1 lb, B2 ub) {
+        using V = typename std::decay<decltype(*std::begin(seq))>::type;
+
+        auto u = make_udist<V>(lb, ub);
+        for (auto& x: seq) { x = u(rng); }
+    }
+
+    template <typename Simd, typename Rng, typename B1, typename B2, typename = typename std::enable_if<is_simd<Simd>::value>::type>
+    void fill_random(Simd& s, Rng& rng, B1 lb, B2 ub) {
+        using V = typename Simd::scalar_type;
+        constexpr unsigned N = Simd::width;
+
+        V v[N];
+        fill_random(v, rng, lb, ub);
+        s.copy_from(v);
+    }
+
+    template <typename Simd, typename Rng, typename = typename std::enable_if<is_simd<Simd>::value>::type>
+    void fill_random(Simd& s, Rng& rng) {
+        using V = typename Simd::scalar_type;
+        constexpr unsigned N = Simd::width;
+
+        V v[N];
+        fill_random(v, rng);
+        s.copy_from(v);
+    }
+
+    template <typename Simd>
+    ::testing::AssertionResult simd_eq(Simd a, Simd b) {
+        constexpr unsigned N = Simd::width;
+        using V = typename Simd::scalar_type;
+
+        V as[N], bs[N];
+        a.copy_to(as);
+        b.copy_to(bs);
+
+        return ::testing::seq_eq(as, bs);
+    }
+
+    constexpr unsigned nrounds = 20u;
+}
+
+template <typename S>
+struct simd_value: public ::testing::Test {};
+
+TYPED_TEST_CASE_P(simd_value);
+
+// Initialization and element access.
+TYPED_TEST_P(simd_value, elements) {
+    using simd = TypeParam;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1001);
+
+    // broadcast:
+    simd a(2);
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(2., a[i]);
+    }
+
+    // scalar assignment:
+    a = 3;
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(3, a[i]);
+    }
+
+    scalar bv[N], cv[N], dv[N];
+
+    fill_random(bv, rng);
+    fill_random(cv, rng);
+    fill_random(dv, rng);
+
+    // array initialization:
+    simd b(bv);
+    EXPECT_TRUE(testing::indexed_eq_n(N, bv, b));
+
+    // array rvalue initialization:
+    simd c(std::move(cv));
+    EXPECT_TRUE(testing::indexed_eq_n(N, cv, c));
+
+    // pointer initialization:
+    simd d(&dv[0]);
+    EXPECT_TRUE(testing::indexed_eq_n(N, dv, d));
+
+    // copy construction:
+    simd e(d);
+    EXPECT_TRUE(testing::indexed_eq_n(N, dv, e));
+
+    // copy assignment:
+    b = d;
+    EXPECT_TRUE(testing::indexed_eq_n(N, dv, b));
+}
+
+TYPED_TEST_P(simd_value, element_lvalue) {
+    using simd = TypeParam;
+    constexpr unsigned N = simd::width;
+
+    simd a(3);
+    ASSERT_GT(N, 1u);
+    a[N-2] = 5;
+
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(i==N-2? 5: 3, a[i]);
+    }
+}
+
+TYPED_TEST_P(simd_value, copy_to_from) {
+    using simd = TypeParam;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1010);
+
+    scalar buf1[N], buf2[N];
+    fill_random(buf1, rng);
+    fill_random(buf2, rng);
+
+    simd s;
+    s.copy_from(buf1);
+    s.copy_to(buf2);
+
+    EXPECT_TRUE(testing::indexed_eq_n(N, buf1, s));
+    EXPECT_TRUE(testing::seq_eq(buf1, buf2));
+}
+
+TYPED_TEST_P(simd_value, copy_to_from_masked) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1031);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar buf1[N], buf2[N], buf3[N];
+        fill_random(buf1, rng);
+        fill_random(buf2, rng);
+        fill_random(buf3, rng);
+
+        bool mbuf1[N], mbuf2[N];
+        fill_random(mbuf1, rng);
+        fill_random(mbuf2, rng);
+        mask m1(mbuf1);
+        mask m2(mbuf2);
+
+        scalar expected[N];
+        for (unsigned i = 0; i<N; ++i) {
+            expected[i] = mbuf1[i]? buf2[i]: buf1[i];
+        }
+
+        simd s(buf1);
+        where(m1, s).copy_from(buf2);
+        EXPECT_TRUE(testing::indexed_eq_n(N, expected, s));
+
+        for (unsigned i = 0; i<N; ++i) {
+            if (!mbuf2[i]) expected[i] = buf3[i];
+        }
+
+        where(m2, s).copy_to(buf3);
+        EXPECT_TRUE(testing::indexed_eq_n(N, expected, buf3));
+    }
+}
+
+TYPED_TEST_P(simd_value, construct_masked) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1031);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar buf[N];
+        fill_random(buf, rng);
+
+        bool mbuf[N];
+        fill_random(mbuf, rng);
+
+        mask m(mbuf);
+        simd s(buf, m);
+
+        for (unsigned i = 0; i<N; ++i) {
+            if (!mbuf[i]) continue;
+            EXPECT_EQ(buf[i], s[i]);
+        }
+    }
+}
+
+TYPED_TEST_P(simd_value, arithmetic) {
+    using simd = TypeParam;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1002);
+    scalar u[N], v[N], w[N], r[N];
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        fill_random(u, rng);
+        fill_random(v, rng);
+        fill_random(w, rng);
+
+        scalar neg_u[N];
+        for (unsigned i = 0; i<N; ++i) neg_u[i] = -u[i];
+
+        scalar u_plus_v[N];
+        for (unsigned i = 0; i<N; ++i) u_plus_v[i] = u[i]+v[i];
+
+        scalar u_minus_v[N];
+        for (unsigned i = 0; i<N; ++i) u_minus_v[i] = u[i]-v[i];
+
+        scalar u_times_v[N];
+        for (unsigned i = 0; i<N; ++i) u_times_v[i] = u[i]*v[i];
+
+        scalar u_divide_v[N];
+        for (unsigned i = 0; i<N; ++i) u_divide_v[i] = u[i]/v[i];
+
+        scalar fma_u_v_w[N];
+        for (unsigned i = 0; i<N; ++i) fma_u_v_w[i] = std::fma(u[i],v[i],w[i]);
+
+        simd us(u), vs(v), ws(w);
+
+        (-us).copy_to(r);
+        EXPECT_TRUE(testing::seq_eq(neg_u, r));
+
+        (us+vs).copy_to(r);
+        EXPECT_TRUE(testing::seq_eq(u_plus_v, r));
+
+        (us-vs).copy_to(r);
+        EXPECT_TRUE(testing::seq_eq(u_minus_v, r));
+
+        (us*vs).copy_to(r);
+        EXPECT_TRUE(testing::seq_eq(u_times_v, r));
+
+        (us/vs).copy_to(r);
+#if defined(__INTEL_COMPILER)
+        // icpc will by default use an approximation for scalar division,
+        // and a different one for vectorized division; the latter, in
+        // particular, is often out by 1 ulp for normal quotients.
+        //
+        // Unfortunately, there is no way to detect this behaviour at
+        // compile time, so fall back to a ULP-based comparison for
+        // floating point quotients.
+
+        if (std::is_floating_point<scalar>::value) {
+            EXPECT_TRUE(testing::seq_almost_eq<scalar>(u_divide_v, r));
+        }
+        else {
+            EXPECT_TRUE(testing::seq_eq(u_divide_v, r));
+        }
+#else
+        EXPECT_TRUE(testing::seq_eq(u_divide_v, r));
+#endif
+
+        (fma(us, vs, ws)).copy_to(r);
+        EXPECT_TRUE(testing::seq_eq(fma_u_v_w, r));
+    }
+}
+
+TYPED_TEST_P(simd_value, compound_assignment) {
+    using simd = TypeParam;
+
+    simd a, b, r;
+
+    std::minstd_rand rng(1003);
+    fill_random(a, rng);
+    fill_random(b, rng);
+
+    EXPECT_TRUE(simd_eq(a+b, (r = a)+=b));
+    EXPECT_TRUE(simd_eq(a-b, (r = a)-=b));
+    EXPECT_TRUE(simd_eq(a*b, (r = a)*=b));
+    EXPECT_TRUE(simd_eq(a/b, (r = a)/=b));
+}
+
+TYPED_TEST_P(simd_value, comparison) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1004);
+    std::uniform_int_distribution<> sgn(-1, 1); // -1, 0 or 1.
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        int cmp[N];
+        bool test[N];
+        simd a, b;
+
+        fill_random(b, rng);
+
+        for (unsigned j = 0; j<N; ++j) {
+            cmp[j] = sgn(rng);
+            a[j] = b[j]+17*cmp[j];
+        }
+
+        mask gt = a>b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]>0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, gt));
+
+        mask geq = a>=b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]>=0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, geq));
+
+        mask lt = a<b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]<0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, lt));
+
+        mask leq = a<=b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]<=0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, leq));
+
+        mask eq = a==b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]==0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, eq));
+
+        mask ne = a!=b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]!=0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, ne));
+    }
+}
+
+TYPED_TEST_P(simd_value, mask_elements) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1005);
+
+    // bool broadcast:
+    mask a(true);
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(true, a[i]);
+    }
+
+    // scalar assignment:
+    mask d;
+    d = false;
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(false, d[i]);
+    }
+    d = true;
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(true, d[i]);
+    }
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        bool bv[N], cv[N], dv[N];
+
+        fill_random(bv, rng);
+        fill_random(cv, rng);
+        fill_random(dv, rng);
+
+        // array initialization:
+        mask b(bv);
+        EXPECT_TRUE(testing::indexed_eq_n(N, bv, b));
+
+        // array rvalue initialization:
+        mask c(std::move(cv));
+        EXPECT_TRUE(testing::indexed_eq_n(N, cv, c));
+
+        // pointer initialization:
+        mask d(&dv[0]);
+        EXPECT_TRUE(testing::indexed_eq_n(N, dv, d));
+
+        // copy construction:
+        mask e(d);
+        EXPECT_TRUE(testing::indexed_eq_n(N, dv, e));
+
+        // copy assignment:
+        b = d;
+        EXPECT_TRUE(testing::indexed_eq_n(N, dv, b));
+    }
+}
+
+TYPED_TEST_P(simd_value, mask_element_lvalue) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1006);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        bool v[N];
+        fill_random(v, rng);
+
+        mask m(v);
+        for (unsigned j = 0; j<N; ++j) {
+            bool b = v[j];
+            m[j] = !b;
+            v[j] = !b;
+
+            EXPECT_EQ(m[j], !b);
+            EXPECT_TRUE(testing::indexed_eq_n(N, v, m));
+
+            m[j] = b;
+            v[j] = b;
+            EXPECT_EQ(m[j], b);
+            EXPECT_TRUE(testing::indexed_eq_n(N, v, m));
+        }
+    }
+}
+
+TYPED_TEST_P(simd_value, mask_copy_to_from) {
+    using simd = TypeParam;
+    using simd_mask = typename simd::simd_mask;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1012);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        bool buf1[N], buf2[N];
+        fill_random(buf1, rng);
+        fill_random(buf2, rng);
+
+        simd_mask m;
+        m.copy_from(buf1);
+        m.copy_to(buf2);
+
+        EXPECT_TRUE(testing::indexed_eq_n(N, buf1, m));
+        EXPECT_TRUE(testing::seq_eq(buf1, buf2));
+    }
+}
+
+TYPED_TEST_P(simd_value, mask_unpack) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1035);
+    std::uniform_int_distribution<unsigned long long> U(0, (1ull<<N)-1);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        unsigned long long packed = U(rng);
+        bool b[N];
+        mask::unpack(packed).copy_to(b);
+
+        for (unsigned j = 0; j<N; ++j) {
+            EXPECT_EQ((bool)(packed&(1ull<<j)), b[j]);
+        }
+    }
+}
+
+TYPED_TEST_P(simd_value, maths) {
+    // min, max, abs tests valid for both fp and int types.
+
+    using simd = TypeParam;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1013);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar a[N], b[N], test[N];
+        fill_random(a, rng);
+        fill_random(b, rng);
+
+        simd as(a), bs(b);
+
+        for (unsigned j = 0; j<N; ++j) { test[j] = std::abs(a[j]); }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, abs(as)));
+
+        for (unsigned j = 0; j<N; ++j) { test[j] = std::min(a[j], b[j]); }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, min(as, bs)));
+
+        for (unsigned j = 0; j<N; ++j) { test[j] = std::max(a[j], b[j]); }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, max(as, bs)));
+    }
+}
+
+REGISTER_TYPED_TEST_CASE_P(simd_value, elements, element_lvalue, copy_to_from, copy_to_from_masked, construct_masked, arithmetic, compound_assignment, comparison, mask_elements, mask_element_lvalue, mask_copy_to_from, mask_unpack, maths);
+
+typedef ::testing::Types<
+
+#ifdef __AVX__
+    simd<int, 4, simd_abi::avx>,
+    simd<double, 4, simd_abi::avx>,
+#endif
+#ifdef __AVX2__
+    simd<int, 4, simd_abi::avx2>,
+    simd<double, 4, simd_abi::avx2>,
+#endif
+#ifdef __AVX512F__
+    simd<int, 8, simd_abi::avx512>,
+    simd<double, 8, simd_abi::avx512>,
+#endif
+
+    simd<int, 4, simd_abi::generic>,
+    simd<double, 4, simd_abi::generic>,
+    simd<float, 16, simd_abi::generic>,
+
+    simd<int, 4, simd_abi::default_abi>,
+    simd<double, 4, simd_abi::default_abi>,
+    simd<int, 8, simd_abi::default_abi>,
+    simd<double, 8, simd_abi::default_abi>
+> simd_test_types;
+
+INSTANTIATE_TYPED_TEST_CASE_P(S, simd_value, simd_test_types);
+
+// FP-only SIMD value tests (maths).
+
+template <typename S>
+struct simd_fp_value: public ::testing::Test {};
+
+TYPED_TEST_CASE_P(simd_fp_value);
+
+TYPED_TEST_P(simd_fp_value, fp_maths) {
+    using simd = TypeParam;
+    using fp = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1014);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        fp epsilon = std::numeric_limits<fp>::epsilon();
+        int min_exponent = std::numeric_limits<fp>::min_exponent;
+        int max_exponent = std::numeric_limits<fp>::max_exponent;
+
+        fp u[N], v[N], r[N];
+
+        // Trigonometric functions (sin, cos):
+        fill_random(u, rng);
+
+        fp sin_u[N];
+        for (unsigned i = 0; i<N; ++i) sin_u[i] = std::sin(u[i]);
+        sin(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(sin_u, r));
+
+        fp cos_u[N];
+        for (unsigned i = 0; i<N; ++i) cos_u[i] = std::cos(u[i]);
+        cos(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(cos_u, r));
+
+        // Logarithms (natural log):
+        fill_random(u, rng, -max_exponent*std::log(2.), max_exponent*std::log(2.));
+        for (auto& x: u) {
+            x = std::exp(x);
+            // SIMD log implementation may treat subnormal as zero
+            if (std::fpclassify(x)==FP_SUBNORMAL) x = 0;
+        }
+
+        fp log_u[N];
+        for (unsigned i = 0; i<N; ++i) log_u[i] = std::log(u[i]);
+        log(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(log_u, r));
+
+        // Exponential functions (exp, expm1, exprelr):
+
+        // Use max_exponent to get coverage over finite domain.
+        fp exp_min_arg = min_exponent*std::log(2.);
+        fp exp_max_arg = max_exponent*std::log(2.);
+        fill_random(u, rng, exp_min_arg, exp_max_arg);
+
+        fp exp_u[N];
+        for (unsigned i = 0; i<N; ++i) exp_u[i] = std::exp(u[i]);
+        exp(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(exp_u, r));
+
+        fp expm1_u[N];
+        for (unsigned i = 0; i<N; ++i) expm1_u[i] = std::expm1(u[i]);
+        expm1(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(expm1_u, r));
+
+        fp exprelr_u[N];
+        for (unsigned i = 0; i<N; ++i) {
+            exprelr_u[i] = u[i]+fp(1)==fp(1)? fp(1): u[i]/(std::exp(u[i])-fp(1));
+        }
+        exprelr(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(exprelr_u, r));
+
+        // Test expm1 and exprelr with small (magnitude < fp epsilon) values.
+        fill_random(u, rng, -epsilon, epsilon);
+        fp expm1_u_small[N];
+        for (unsigned i = 0; i<N; ++i) {
+            expm1_u_small[i] = std::expm1(u[i]);
+            EXPECT_NEAR(u[i], expm1_u_small[i], std::abs(4*u[i]*epsilon)); // just to confirm!
+        }
+        expm1(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(expm1_u_small, r));
+
+        fp exprelr_u_small[N];
+        for (unsigned i = 0; i<N; ++i) exprelr_u_small[i] = 1;
+        exprelr(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(exprelr_u_small, r));
+
+        // Test zero result for highly negative exponents.
+        fill_random(u, rng, 4*exp_min_arg, 2*exp_min_arg);
+        fp exp_u_very_negative[N];
+        for (unsigned i = 0; i<N; ++i) exp_u_very_negative[i] = std::exp(u[i]);
+        exp(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(exp_u_very_negative, r));
+
+        // Power function:
+
+        // Non-negative base, arbitrary exponent.
+        fill_random(u, rng, 0., std::exp(1));
+        fill_random(v, rng, exp_min_arg, exp_max_arg);
+        fp pow_u_pos_v[N];
+        for (unsigned i = 0; i<N; ++i) pow_u_pos_v[i] = std::pow(u[i], v[i]);
+        pow(simd(u), simd(v)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(pow_u_pos_v, r));
+
+        // Arbitrary base, small magnitude integer exponent.
+        fill_random(u, rng);
+        int int_exponent[N];
+        fill_random(int_exponent, rng, -2, 2);
+        for (unsigned i = 0; i<N; ++i) v[i] = int_exponent[i];
+        fp pow_u_v_int[N];
+        for (unsigned i = 0; i<N; ++i) pow_u_v_int[i] = std::pow(u[i], v[i]);
+        pow(simd(u), simd(v)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(pow_u_v_int, r));
+    }
+}
+
+// Check special function behaviour for specific values including
+// qNAN, infinity etc.
+
+TYPED_TEST_P(simd_fp_value, exp_special_values) {
+    using simd = TypeParam;
+    using fp = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    using limits = std::numeric_limits<fp>;
+
+    constexpr fp inf = limits::infinity();
+    constexpr fp eps = limits::epsilon();
+    constexpr fp largest = limits::max();
+    constexpr fp normal_least = limits::min();
+    constexpr fp denorm_least = limits::denorm_min();
+    constexpr fp qnan = limits::quiet_NaN();
+
+    const fp exp_minarg = std::log(normal_least);
+    const fp exp_maxarg = std::log(largest);
+
+    fp values[] = { inf, -inf, eps, -eps,
+                    eps/2, -eps/2, 0., -0.,
+                    1., -1., 2., -2.,
+                    normal_least, denorm_least, -normal_least, -denorm_least,
+                    exp_minarg, exp_maxarg, qnan, -qnan };
+
+    constexpr unsigned n_values = sizeof(values)/sizeof(fp);
+    constexpr unsigned n_packed = (n_values+N-1)/N;
+    fp data[n_packed][N];
+
+    std::fill((fp *)data, (fp *)data+N*n_packed, fp(0));
+    std::copy(std::begin(values), std::end(values), (fp *)data);
+
+    for (unsigned i = 0; i<n_packed; ++i) {
+        fp expected[N], result[N];
+        for (unsigned j = 0; j<N; ++j) {
+            expected[j] = std::exp(data[i][j]);
+        }
+
+        simd s(data[i]);
+        s = exp(s);
+        s.copy_to(result);
+
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(expected, result));
+    }
+}
+
+TYPED_TEST_P(simd_fp_value, expm1_special_values) {
+    using simd = TypeParam;
+    using fp = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    using limits = std::numeric_limits<fp>;
+
+    constexpr fp inf = limits::infinity();
+    constexpr fp eps = limits::epsilon();
+    constexpr fp largest = limits::max();
+    constexpr fp normal_least = limits::min();
+    constexpr fp denorm_least = limits::denorm_min();
+    constexpr fp qnan = limits::quiet_NaN();
+
+    const fp expm1_minarg = std::nextafter(std::log(eps/4), fp(0));
+    const fp expm1_maxarg = std::log(largest);
+
+    fp values[] = { inf, -inf, eps, -eps,
+                    eps/2, -eps/2, 0., -0.,
+                    1., -1., 2., -2.,
+                    normal_least, denorm_least, -normal_least, -denorm_least,
+                    expm1_minarg, expm1_maxarg, qnan, -qnan };
+
+    constexpr unsigned n_values = sizeof(values)/sizeof(fp);
+    constexpr unsigned n_packed = (n_values+N-1)/N;
+    fp data[n_packed][N];
+
+    std::fill((fp *)data, (fp *)data+N*n_packed, fp(0));
+    std::copy(std::begin(values), std::end(values), (fp *)data);
+
+    for (unsigned i = 0; i<n_packed; ++i) {
+        fp expected[N], result[N];
+        for (unsigned j = 0; j<N; ++j) {
+            expected[j] = std::expm1(data[i][j]);
+        }
+
+        simd s(data[i]);
+        s = expm1(s);
+        s.copy_to(result);
+
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(expected, result));
+    }
+}
+
+TYPED_TEST_P(simd_fp_value, log_special_values) {
+    using simd = TypeParam;
+    using fp = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    using limits = std::numeric_limits<fp>;
+
+    // NOTE: simd log implementations may treat subnormal
+    // numbers as zero, so omit the denorm_least tests...
+
+    constexpr fp inf = limits::infinity();
+    constexpr fp eps = limits::epsilon();
+    constexpr fp largest = limits::max();
+    constexpr fp normal_least = limits::min();
+    //constexpr fp denorm_least = limits::denorm_min();
+    constexpr fp qnan = limits::quiet_NaN();
+
+    fp values[] = { inf, -inf, eps, -eps,
+                    eps/2, -eps/2, 0., -0.,
+                    1., -1., 2., -2.,
+                    //normal_least, denorm_least, -normal_least, -denorm_least,
+                    normal_least, -normal_least,
+                    qnan, -qnan, largest };
+
+    constexpr unsigned n_values = sizeof(values)/sizeof(fp);
+    constexpr unsigned n_packed = (n_values+N-1)/N;
+    fp data[n_packed][N];
+
+    std::fill((fp *)data, (fp *)data+N*n_packed, fp(0));
+    std::copy(std::begin(values), std::end(values), (fp *)data);
+
+    for (unsigned i = 0; i<n_packed; ++i) {
+        fp expected[N], result[N];
+        for (unsigned j = 0; j<N; ++j) {
+            expected[j] = std::log(data[i][j]);
+        }
+
+        simd s(data[i]);
+        s = log(s);
+        s.copy_to(result);
+
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(expected, result));
+    }
+}
+
+REGISTER_TYPED_TEST_CASE_P(simd_fp_value, fp_maths, exp_special_values, expm1_special_values, log_special_values);
+
+typedef ::testing::Types<
+
+#ifdef __AVX__
+    simd<double, 4, simd_abi::avx>,
+#endif
+#ifdef __AVX2__
+    simd<double, 4, simd_abi::avx2>,
+#endif
+#ifdef __AVX512F__
+    simd<double, 8, simd_abi::avx512>,
+#endif
+
+    simd<float, 2, simd_abi::generic>,
+    simd<double, 4, simd_abi::generic>,
+    simd<float, 8, simd_abi::generic>,
+
+    simd<double, 4, simd_abi::default_abi>,
+    simd<double, 8, simd_abi::default_abi>
+> simd_fp_test_types;
+
+INSTANTIATE_TYPED_TEST_CASE_P(S, simd_fp_value, simd_fp_test_types);
+
+// Gather/scatter tests.
+
+template <typename A, typename B>
+struct simd_and_index {
+    using simd = A;
+    using simd_index = B;
+};
+
+template <typename SI>
+struct simd_indirect: public ::testing::Test {};
+
+TYPED_TEST_CASE_P(simd_indirect);
+
+TYPED_TEST_P(simd_indirect, gather) {
+    using simd = typename TypeParam::simd;
+    using simd_index = typename TypeParam::simd_index;
+
+    constexpr unsigned N = simd::width;
+    using scalar = typename simd::scalar_type;
+    using index = typename simd_index::scalar_type;
+
+    std::minstd_rand rng(1011);
+
+    constexpr std::size_t buflen = 1000;
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar array[buflen];
+        index indirect[N];
+
+        fill_random(array, rng);
+        fill_random(indirect, rng, 0, (int)(buflen-1));
+
+        simd s;
+        s.gather(array, simd_index(indirect));
+
+        scalar test[N];
+        for (unsigned j = 0; j<N; ++j) {
+            test[j] = array[indirect[j]];
+        }
+
+        EXPECT_TRUE(::testing::indexed_eq_n(N, test, s));
+    }
+}
+
+TYPED_TEST_P(simd_indirect, masked_gather) {
+    using simd = typename TypeParam::simd;
+    using simd_index = typename TypeParam::simd_index;
+    using simd_mask = typename simd::simd_mask;
+
+    constexpr unsigned N = simd::width;
+    using scalar = typename simd::scalar_type;
+    using index = typename simd_index::scalar_type;
+
+    std::minstd_rand rng(1011);
+
+    constexpr std::size_t buflen = 1000;
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar array[buflen], original[N], test[N];
+        index indirect[N];
+        bool mask[N];
+
+        fill_random(array, rng);
+        fill_random(original, rng);
+        fill_random(indirect, rng, 0, (int)(buflen-1));
+        fill_random(mask, rng);
+
+        for (unsigned j = 0; j<N; ++j) {
+            test[j] = mask[j]? array[indirect[j]]: original[j];
+        }
+
+        simd s(original);
+        simd_mask m(mask);
+        where(m, s).gather(array, simd_index(indirect));
+
+        EXPECT_TRUE(::testing::indexed_eq_n(N, test, s));
+    }
+}
+
+TYPED_TEST_P(simd_indirect, scatter) {
+    using simd = typename TypeParam::simd;
+    using simd_index = typename TypeParam::simd_index;
+
+    constexpr unsigned N = simd::width;
+    using scalar = typename simd::scalar_type;
+    using index = typename simd_index::scalar_type;
+
+    std::minstd_rand rng(1011);
+
+    constexpr std::size_t buflen = 1000;
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar array[buflen], test[buflen], values[N];
+        index indirect[N];
+
+        fill_random(array, rng);
+        fill_random(values, rng);
+        fill_random(indirect, rng, 0, (int)(buflen-1));
+
+        for (unsigned j = 0; j<buflen; ++j) {
+            test[j] = array[j];
+        }
+        for (unsigned j = 0; j<N; ++j) {
+            test[indirect[j]] = values[j];
+        }
+
+        simd s(values);
+        s.scatter(array, simd_index(indirect));
+
+        EXPECT_TRUE(::testing::indexed_eq_n(N, test, array));
+    }
+}
+
+TYPED_TEST_P(simd_indirect, masked_scatter) {
+    using simd = typename TypeParam::simd;
+    using simd_index = typename TypeParam::simd_index;
+    using simd_mask = typename simd::simd_mask;
+
+    constexpr unsigned N = simd::width;
+    using scalar = typename simd::scalar_type;
+    using index = typename simd_index::scalar_type;
+
+    std::minstd_rand rng(1011);
+
+    constexpr std::size_t buflen = 1000;
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar array[buflen], test[buflen], values[N];
+        index indirect[N];
+        bool mask[N];
+
+        fill_random(array, rng);
+        fill_random(values, rng);
+        fill_random(indirect, rng, 0, (int)(buflen-1));
+        fill_random(mask, rng);
+
+        for (unsigned j = 0; j<buflen; ++j) {
+            test[j] = array[j];
+        }
+        for (unsigned j = 0; j<N; ++j) {
+            if (mask[j]) { test[indirect[j]] = values[j]; }
+        }
+
+        simd s(values);
+        simd_mask m(mask);
+        where(m, s).scatter(array, simd_index(indirect));
+
+        EXPECT_TRUE(::testing::indexed_eq_n(N, test, array));
+    }
+}
+
+
+REGISTER_TYPED_TEST_CASE_P(simd_indirect, gather, masked_gather, scatter, masked_scatter);
+
+typedef ::testing::Types<
+
+#ifdef __AVX__
+    simd_and_index<simd<double, 4, simd_abi::avx>,
+                   simd<int, 4, simd_abi::avx>>,
+#endif
+
+#ifdef __AVX2__
+    simd_and_index<simd<double, 4, simd_abi::avx2>,
+                   simd<int, 4, simd_abi::avx2>>,
+#endif
+
+#ifdef __AVX512F__
+    simd_and_index<simd<double, 8, simd_abi::avx512>,
+                   simd<int, 8, simd_abi::avx512>>,
+#endif
+
+    simd_and_index<simd<float, 4, simd_abi::generic>,
+                   simd<std::int64_t, 4, simd_abi::generic>>,
+
+    simd_and_index<simd<double, 8, simd_abi::generic>,
+                   simd<unsigned, 8, simd_abi::generic>>,
+
+    simd_and_index<simd<double, 4, simd_abi::default_abi>,
+                   simd<int, 4, simd_abi::default_abi>>,
+
+    simd_and_index<simd<double, 8, simd_abi::default_abi>,
+                   simd<int, 8, simd_abi::default_abi>>
+> simd_indirect_test_types;
+
+INSTANTIATE_TYPED_TEST_CASE_P(S, simd_indirect, simd_indirect_test_types);
-- 
GitLab