diff --git a/CMakeLists.txt b/CMakeLists.txt
index f8f4ac9729ab8ae398073402f323143e9432b379..2f92d0a7fd39bb11622429c361a74cfa391d5ec7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,6 +208,9 @@ endif()
 set(ARB_VECTORIZE_TARGET "none" CACHE STRING "CPU target for vectorization {KNL,AVX2,AVX512}")
 set_property(CACHE ARB_VECTORIZE_TARGET PROPERTY STRINGS none KNL AVX2 AVX512)
 
+# Note: this option conflates modcc code generation options and
+# the target architecture for compilation of Arbor. TODO: fix!
+
 if(ARB_VECTORIZE_TARGET STREQUAL "KNL")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_KNL} -DSIMD_KNL")
 elseif(ARB_VECTORIZE_TARGET STREQUAL "AVX2")
diff --git a/cmake/CompilerOptions.cmake b/cmake/CompilerOptions.cmake
index 40a019bcbcdec70f9ac72d1fd076ea18e558ba72..8e51dddbdb597933a7808f7ae77685e334be2a7a 100644
--- a/cmake/CompilerOptions.cmake
+++ b/cmake/CompilerOptions.cmake
@@ -16,6 +16,10 @@ if(${CMAKE_CXX_COMPILER_ID} MATCHES "XL")
 endif()
 
 if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+    set(CXXOPT_KNL "-march=knl")
+    set(CXXOPT_AVX2 "-mavx2 -mfma")
+    set(CXXOPT_AVX512 "-mavx512f -mavx512cd")
+
     # Disable 'missing-braces' warning: this will inappropriately
     # flag initializations such as
     #     std::array<int,3> a={1,2,3};
@@ -36,7 +40,7 @@ if(${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
     # Compiler flags for generating KNL-specific AVX512 instructions
     # supported in gcc 4.9.x and later.
     set(CXXOPT_KNL "-march=knl")
-    set(CXXOPT_AVX2 "-mavx2")
+    set(CXXOPT_AVX2 "-mavx2 -mfma")
     set(CXXOPT_AVX512 "-mavx512f -mavx512cd")
 
     # Disable 'maybe-uninitialized' warning: this will be raised
diff --git a/doc/index.rst b/doc/index.rst
index 23852d71b732e1f22e1aa846afc3845995ae94f2..661a67946e69f057bb4f0f40ebd7f540e1e4d59c 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -25,6 +25,8 @@ Arbor is a high-performance library for computationa neurscience simulations.
    :maxdepth: 1
 
    library
+   simd_api
+   simd_maths
    profiler
    sampling_api
 
diff --git a/doc/simd_api.rst b/doc/simd_api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..be33e7a040ebaa0693adaad8af0ca926b4207750
--- /dev/null
+++ b/doc/simd_api.rst
@@ -0,0 +1,950 @@
+SIMD classes for Arbor
+======================
+
+The purpose of the SIMD classes is to abstract and consolidate the use of
+compiler intrinsics for the manipulation of architecture-specific vector
+(SIMD) values.
+
+The implementation is rather loosely based on the data-parallel vector types
+proposal `P0214R6 for the C++ Parallelism TS 2 <http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0214r6.pdf>`_.
+
+Unless otherwise specified, all classes, namespaces and top-level functions
+described below are within the top-level ``arb::simd`` namespace.
+
+.. rubric:: Example usage
+
+The following code performs an element-wise vector product, storing
+only non-zero values in the resultant array.
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        #include <simd/simd.hpp>
+        using namespace arb::simd;
+
+        void product_nonzero(int n, const double* a, const double* b, double* result) {
+            constexpr int N = simd_abi::native_width<double>::value;
+            using simd = simd<double, N>;
+            using mask = simd::simd_mask;
+
+            int i = 0;
+            for (; i+N<=n; i+=N) {
+                auto vp = simd(a+i)*simd(b+i);
+                where(vp!=0, vp).copy_to(result+i);
+            }
+
+            int tail = n-i;
+            auto m = mask::unpack((1<<tail)-1);
+
+            auto vp = simd(a+i, m)*simd(b+i, m);
+            where(m && vp!=0, vp).copy_to(result+i);
+        }
+
+
+Classes
+-------
+
+Three user-facing template classes are provided:
+
+1. ``simd<V, N, I = simd_abi::default_abi>``
+
+   *N*-wide vector type of values of type *V*, using architecture-specific
+   implementation *I*. The implementation parameter is itself a template,
+   acting as a type-map, with ``I<V, N>::type`` being the concrete implementation
+   class (see below) for *N*-wide vectors of type *V* for this architecture.
+
+   The implementation ``simd_abi::generic`` provides a ``std::array``-backed
+   implementation for arbitrary *V* and *N*, while ``simd_abi::native``
+   maps to the native architecture implementation for *V* and *N*, if
+   one is available for the target architecture.
+
+   ``simd_abi::default_abi`` will use ``simd_abi::native`` if available, or
+   else fall back to the generic implementation.
+
+2. ``simd_mask<V, N, I = simd_abi::default_abi>``
+
+   The result of performing a lane-wise comparison/test operation on
+   a ``simd<V, N, I>`` vector value. ``simd_mask`` objects support logical
+   operations and are used as arguments to ``where`` expressions.
+
+   ``simd_mask<V, N, I>`` is a type alias for ``simd<V, N, I>::simd_mask``.
+
+3. ``where_expression<simd<V, N, I>>``
+
+   The result of a ``where`` expression, used for masked assignment.
+
+Implementation typemaps live in the ``simd_abi`` namespace, while concrete
+implementation classes live in ``simd_detail``. A particular specialization
+for an architecture, for example 4-wide double on AVX, then requires:
+
+*  A concrete implementation class, e.g. ``simd_detail::avx_double4``.
+
+*  A specialization of its ABI map, so that ``simd_abi::avx<double, 4>::type``
+   is an alias for ``simd_detail::avx_double4``.
+
+*  A specialization of the native ABI map, so that
+   ``simd_abi::native<double, 4>::type`` is an alias for ``simd_abi::avx<double, 4>::type``.
+
+The maximum natively supported width for a scalar type *V* is recorded in
+``simd_abi::native_width<V>::value``.
+
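+The following sketch shows how the default ABI and the native width are
+typically combined; the alias names (``vdouble``, ``gdouble``) are
+illustrative only and not part of the library:
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        #include <simd/simd.hpp>
+        using namespace arb::simd;
+
+        // Pick the widest natively supported double vector; with the default
+        // ABI, the generic implementation is used if no native one exists.
+        constexpr int N = simd_abi::native_width<double>::value;
+        using vdouble = simd<double, N>;                     // simd_abi::default_abi
+        using gdouble = simd<double, N, simd_abi::generic>;  // always std::array-backed
+
+        static_assert(vdouble::width==N, "width reflects the requested N");
+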
+Class ``simd``
+^^^^^^^^^^^^^^
+
+The class ``simd<V, N, I>`` is an alias for ``simd_detail::simd_impl<I<V, N>::type>``;
+the class ``simd_detail::simd_impl<C>`` provides the public interface and
+arithmetic operators for a concrete implementation class *C*.
+
+In the following:
+
+* *S* stands for the class ``simd<V, N, I>``.
+* *s* is a SIMD value of type *S*.
+* *m* is a mask value of type ``S::simd_mask``.
+* *t*, *u* and *v* are const objects of type *S*.
+* *i* is an index of type ``int``.
+* *j* is a const object of type ``simd<U, N, J>`` where *U* is an integral type.
+* *x* is a value of type *V*.
+* *p* is a pointer to *V*.
+* *c* is a const pointer to *V* or a length *N* array of *V*.
+
+Here and below, the value in lane *i* of a SIMD vector or mask *v* is denoted by
+*v*\ `i`:sub:.
+
+
+.. rubric:: Type aliases and constexpr members
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Name
+      - Type
+      - Description
+
+    * - ``S::scalar_type``
+      - *V*
+      - The type of one lane of the SIMD type.
+
+    * - ``S::simd_mask``
+      - ``simd_mask<V, N, I>``
+      - The ``simd_mask`` specialization resulting from comparisons of *S* SIMD values.
+
+    * - ``S::width``
+      - ``unsigned``
+      - The SIMD width *N*.
+
+.. rubric:: Constructors
+
+.. list-table::
+    :widths: 20 80
+    :header-rows: 1
+
+    * - Expression
+      - Description
+
+    * - ``S(x)``
+      - A SIMD value *v* with *v*\ `i`:sub: equal to *x* for *i* = 0…*N*-1.
+
+    * - ``S(t)``
+      - A copy of the SIMD value *t*.
+
+    * - ``S(c)``
+      - A SIMD value *v* with *v*\ `i`:sub: equal to ``c[i]`` for *i* = 0…*N*-1.
+
+    * - ``S(c, m)``
+      - A SIMD value *v* with *v*\ `i`:sub: equal to ``c[i]`` for *i* where *m*\ `i`:sub: is true.
+
+.. rubric:: Member functions
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``t.copy_to(p)``
+      - ``void``
+      - Set ``p[i]`` to *t*\ `i`:sub: for *i* = 0…*N*-1.
+
+    * - ``t.scatter(p, j)``
+      - ``void``
+      - Set ``p[j[i]]`` to *t*\ `i`:sub: for *i* = 0…*N*-1.
+
+    * - ``s.copy_from(c)``
+      - ``void``
+      - Set *s*\ `i`:sub: to ``c[i]`` for *i* = 0…*N*-1.
+
+    * - ``s.gather(c, j)``
+      - ``void``
+      - Set *s*\ `i`:sub: to ``c[j[i]]`` for *i* = 0…*N*-1.
+
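+A short sketch of indirect access through these member functions, assuming a
+4-wide configuration (illustration only):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        double data[8] = {0, 10, 20, 30, 40, 50, 60, 70};
+        int offsets[4] = {1, 3, 5, 7};
+
+        simd<int, 4> idx(offsets);
+        simd<double, 4> s;
+
+        s.gather(data, idx);          // s is now {10, 30, 50, 70}
+        s += simd<double, 4>(1.);
+        s.scatter(data, idx);         // writes 11, 31, 51, 71 back to data
+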
+.. rubric:: Expressions
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``t+u``
+      - ``S``
+      - Lane-wise sum.
+
+    * - ``t-u``
+      - ``S``
+      - Lane-wise difference.
+
+    * - ``t*u``
+      - ``S``
+      - Lane-wise product.
+
+    * - ``t/u``
+      - ``S``
+      - Lane-wise quotient.
+
+    * - ``fma(t, u, v)``
+      - ``S``
+      - Lane-wise FMA *t* * *u* + *v*.
+
+    * - ``s<t``
+      - ``S::simd_mask``
+      - Lane-wise less-than comparison.
+
+    * - ``s<=t``
+      - ``S::simd_mask``
+      - Lane-wise less-than-or-equals comparison.
+
+    * - ``s>t``
+      - ``S::simd_mask``
+      - Lane-wise greater-than comparison.
+
+    * - ``s>=t``
+      - ``S::simd_mask``
+      - Lane-wise greater-than-or-equals comparison.
+
+    * - ``s==t``
+      - ``S::simd_mask``
+      - Lane-wise equality test.
+
+    * - ``s!=t``
+      - ``S::simd_mask``
+      - Lane-wise inequality test.
+
+    * - ``s=t``
+      - ``S&``
+      - Lane-wise assignment.
+
+    * - ``s+=t``
+      - ``S&``
+      - Equivalent to ``s=s+t``.
+
+    * - ``s-=t``
+      - ``S&``
+      - Equivalent to ``s=s-t``.
+
+    * - ``s*=t``
+      - ``S&``
+      - Equivalent to ``s=s*t``.
+
+    * - ``s/=t``
+      - ``S&``
+      - Equivalent to ``s=s/t``.
+
+    * - ``s=x``
+      - ``S&``
+      - Equivalent to ``s=S(x)``.
+
+    * - ``t[i]``
+      - ``V``
+      - Value *t*\ `i`:sub:.
+
+    * - ``s[i]=x``
+      - ``S::reference``
+      - Set value *s*\ `i`:sub: to *x*.
+
+The (non-const) index operator ``operator[]`` returns a proxy object of type ``S::reference``,
+which writes the corresponding lane in the SIMD value on assignment, and has an
+implicit conversion to ``scalar_type``.
+
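+For example (illustration only):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        simd<double, 4> v(0.);
+
+        v[1] = 3.5;          // writes lane 1 through the S::reference proxy
+        double x = v[1];     // reads lane 1 back as scalar_type (here 3.5)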
+
+Class ``simd_mask``
+^^^^^^^^^^^^^^^^^^^
+
+``simd_mask<V, N, I>`` is an alias for ``simd<V, N, I>::simd_mask``, which in turn
+will be an alias for a class ``simd_detail::simd_mask_impl<D>``, where *D* is
+a concrete implementation class for the SIMD mask representation. ``simd_mask_impl<D>``
+inherits from, and is implemented in terms of, ``simd_detail::simd_impl<D>``,
+but note that the concrete implementation class *D* may or may not be the same
+as the concrete implementation class ``I<V, N>::type`` used by ``simd<V, N, I>``.
+
+Mask values are read and written as ``bool`` values of 0 or 1, which may
+differ from the internal representation in each lane of the SIMD implementation.
+
+In the following:
+
+* *M* stands for the class ``simd_mask<V, N, I>``.
+* *m* and *q* are const objects of type ``simd_mask<V, N, I>``.
+* *u* is an object of type ``simd_mask<V, N, I>``.
+* *b* is a boolean value.
+* *w* is a pointer to ``bool``.
+* *y* is a const pointer to ``bool`` or a length *N* array of ``bool``.
+* *i* is of type ``int``.
+* *k* is of type ``unsigned long long``.
+
+.. rubric:: Constructors
+
+.. list-table::
+    :widths: 20 80
+    :header-rows: 1
+
+    * - Expression
+      - Description
+
+    * - ``M(b)``
+      - A SIMD mask *u* with *u*\ `i`:sub: equal to *b* for *i* = 0…*N*-1.
+
+    * - ``M(m)``
+      - A copy of the SIMD mask *m*.
+
+    * - ``M(y)``
+      - A SIMD value *u* with *u*\ `i`:sub: equal to ``y[i]`` for *i* = 0…*N*-1.
+
+Note that ``simd_mask`` does not (currently) offer a masked pointer/array constructor.
+
+.. rubric:: Member functions
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``m.copy_to(w)``
+      - ``void``
+      - Write the boolean value *m*\ `i`:sub: to ``w[i]`` for *i* = 0…*N*-1.
+
+    * - ``u.copy_from(y)``
+      - ``void``
+      - Set *u*\ `i`:sub: to the boolean value ``y[i]`` for *i* = 0…*N*-1.
+
+.. rubric:: Expressions
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``!m``
+      - ``M``
+      - Lane-wise negation.
+
+    * - ``m&&q``
+      - ``M``
+      - Lane-wise logical and.
+
+    * - ``m||q``
+      - ``M``
+      - Lane-wise logical or.
+
+    * - ``m==q``
+      - ``M``
+      - Lane-wise equality (equivalent to ``m!=!q``).
+
+    * - ``m!=q``
+      - ``M``
+      - Lane-wise logical xor.
+
+    * - ``m=q``
+      - ``M&``
+      - Lane-wise assignment.
+
+    * - ``m[i]``
+      - ``bool``
+      - Boolean value *m*\ `i`:sub:.
+
+    * - ``m[i]=b``
+      - ``M::reference``
+      - Set *m*\ `i`:sub: to boolean value *b*.
+
+.. rubric:: Static member functions
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``M::unpack(k)``
+      - ``M``
+      - Mask with value *m*\ `i`:sub: equal to the *i*\ th bit of *k*.
+
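+A short sketch of mask construction and use; the aliases ``dsimd`` and
+``dmask`` are illustrative only:
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        using dsimd = simd<double, 4>;
+        using dmask = dsimd::simd_mask;
+
+        bool flags[4] = {true, false, true, true};
+        dmask m(flags);                  // lanes {1, 0, 1, 1}
+        dmask u = dmask::unpack(0xd);    // bits 0, 2, 3 set: also {1, 0, 1, 1}
+
+        bool out[4];
+        (m && u).copy_to(out);           // writes {1, 0, 1, 1}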
+
+Class ``where_expression``
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``where_expression<S>`` represents a masked subset of the lanes
+of a SIMD value of type ``S``, used for conditional assignment,
+masked scatter, and masked gather. It is a type alias for
+``S::where_expression``, and is the result of an expression of the
+form ``where(mask, simdvalue)``.
+
+In the following:
+
+* *W* stands for the class ``where_expression<simd<V, N, I>>``.
+* *s* is a reference to a SIMD value of type ``simd<V, N, I>&``.
+* *t* is a SIMD value of type ``simd<V, N, I>``.
+* *m* is a mask of type ``simd<V, N, I>::simd_mask``.
+* *j* is a const object of type ``simd<U, N, J>`` where *U* is an integral type.
+* *x* is a scalar of type *V*.
+* *p* is a pointer to *V*.
+* *c* is a const pointer to *V* or a length *N* array of *V*.
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``where(m, s)``
+      - ``W``
+      - A proxy for masked-assignment operations.
+
+    * - ``where(m, s)=t``
+      - ``void``
+      - Set *s*\ `i`:sub: to *t*\ `i`:sub: for *i* where *m*\ `i`:sub: is true.
+
+    * - ``where(m, s)=x``
+      - ``void``
+      - Set *s*\ `i`:sub: to *x* for *i* where *m*\ `i`:sub: is true.
+
+    * - ``where(m, s).copy_to(p)``
+      - ``void``
+      - Set ``p[i]`` to *s*\ `i`:sub: for *i* where *m*\ `i`:sub: is true.
+
+    * - ``where(m, s).scatter(p, j)``
+      - ``void``
+      - Set ``p[j[i]]`` to *s*\ `i`:sub: for *i* where *m*\ `i`:sub: is true.
+
+    * - ``where(m, s).copy_from(c)``
+      - ``void``
+      - Set *s*\ `i`:sub: to ``c[i]`` for *i* where *m*\ `i`:sub: is true.
+
+    * - ``where(m, s).gather(c, j)``
+      - ``void``
+      - Set *s*\ `i`:sub: to ``c[j[i]]`` for *i* where *m*\ `i`:sub: is true.
+
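+A short sketch of masked assignment and masked stores (illustration only):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        simd<double, 4> s(1.);
+        simd<double, 4> t(2.);
+
+        where(s<t, s) = t;               // s is now 2 in every lane
+        where(s!=t, s) = 0.;             // no lane is modified here
+
+        double out[4] = {-1, -1, -1, -1};
+        where(s==t, s).copy_to(out);     // every lane of s is written to out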
+
+Top-level functions
+-------------------
+
+Lane-wise mathematical operations *abs(x)*, *min(x, y)* and *max(x, y)* are offered for
+all SIMD value types, while the transcendental functions are only usable for
+SIMD floating point types.
+
+Vectorized implementations of some of the transcendental functions are provided:
+refer to :doc:`simd_maths` for details.
+
+
+In the following:
+
+* *A* is a SIMD class ``simd<K, N, I>`` for some scalar type *K*.
+* *S* is a SIMD class ``simd<V, N, I>`` for a floating point type *V*.
+* *a* and *b* are values of type *A*.
+* *s* and *t* are values of type *S*.
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``abs(a)``
+      - *A*
+      - Lane-wise absolute value of *a*.
+
+    * - ``min(a, b)``
+      - *A*
+      - Lane-wise minimum of *a* and *b*.
+
+    * - ``max(a, b)``
+      - *A*
+      - Lane-wise maximum of *a* and *b*.
+
+    * - ``sin(s)``
+      - *S*
+      - Lane-wise sine of *s*.
+
+    * - ``cos(s)``
+      - *S*
+      - Lane-wise cosine of *s*.
+
+    * - ``log(s)``
+      - *S*
+      - Lane-wise natural logarithm of *s*.
+
+    * - ``exp(s)``
+      - *S*
+      - Lane-wise exponential of *s*.
+
+    * - ``expm1(s)``
+      - *S*
+      - Lane-wise :math:`x \mapsto e^x - 1`.
+
+    * - ``exprelr(s)``
+      - *S*
+      - Lane-wise :math:`x \mapsto x / (e^x - 1)`.
+
+    * - ``pow(s, t)``
+      - *S*
+      - Lane-wise raise *s* to the power of *t*.
+
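+For example (illustration only):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        using vd = simd<double, 4>;
+
+        vd x(-0.25);
+        vd y = max(abs(x), vd(0.125));   // 0.25 in every lane
+        vd e = exprelr(x);               // lane-wise -0.25/(exp(-0.25)-1)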
+
+Implementation requirements
+---------------------------
+
+Each specific architecture is represented by a templated class *I*, with
+``I<V, N>::type`` being the concrete implementation for an *N*-wide
+SIMD value with ``scalar_type`` *V*.
+
+A concrete implementation class *C* inherits from ``simd_detail::implbase<C>``,
+which provides (via CRTP) generic implementations of most of the SIMD
+functionality. The base class ``implbase<C>`` in turn relies upon
+``simd_detail::simd_traits<C>`` to look up the SIMD width, and associated types.
+
+All the required SIMD operations are given by static member functions of *C*.
+
+Minimal implementation
+^^^^^^^^^^^^^^^^^^^^^^
+
+In the following, let *C* be the concrete implementation class for an
+*N*-wide vector of ``scalar_type`` *V*, with low-level representation
+``arch_vector``.
+
+The specialization of ``simd_detail::simd_traits<C>`` then exposes these
+types and values, and also provides the concrete implementation class *M*
+for masks associated with *C*:
+
+.. container:: api-code
+
+    .. code-block:: cpp
+
+        template <>
+        struct simd_traits<C> {
+            static constexpr unsigned width = N;
+            using scalar_type = V;
+            using vector_type = arch_vector;
+            using mask_impl = M;
+        };
+
+
+The mask implementation class *M* may or may not be the same as *C*.
+For example, ``simd_detail::avx_double4`` provides both the arithmetic operations and mask
+operations for an AVX 4 × double SIMD vector, while the mask
+implementation for ``simd_detail::avx512_double8`` is ``simd_detail::avx512_mask8``.
+
+The concrete implementation class must provide at minimum implementations
+of ``copy_to`` and ``copy_from`` (see the section below for semantics):
+
+.. container:: api-code
+
+    .. code-block:: cpp
+
+        struct C: implbase<C> {
+            static void copy_to(const arch_vector&, V*);
+            static arch_vector copy_from(const V*);
+        };
+
+If the implementation is also acting as a mask implementation, it must also
+provide ``mask_copy_to``, ``mask_copy_from``, ``mask_element`` and
+``mask_set_element``:
+
+.. container:: api-code
+
+    .. code-block:: cpp
+
+        struct C: implbase<C> {
+            static void copy_to(const arch_vector&, V*);
+            static arch_vector copy_from(const V*);
+
+            static void mask_copy_to(const arch_vector& v, bool* w);
+            static arch_vector mask_copy_from(const bool* y);
+            static bool mask_element(const arch_vector& v, int i);
+            static void mask_set_element(arch_vector& v, int i, bool x);
+        };
+
+The class ``simd_detail::generic<T, N>`` provides an example of a minimal
+implementation based on an ``arch_vector`` type of ``std::array<T, N>``.
+
+
+Concrete implementation API
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In the following, *C* represents the concrete implementation class for
+a SIMD class of width *N* and value type *V*.
+
+* *u*, *v*, and *w* are values of type ``C::vector_type``.
+* *r* is a reference of type ``C::vector_type&``.
+* *x* is a value of type ``C::scalar_type``.
+* *c* is a const pointer of type ``const C::scalar_type*``.
+* *p* is a pointer of type ``C::scalar_type*``.
+* *j* is a SIMD index representation of type ``J::vector_type`` for
+  an integral concrete implementation class *J*.
+* *b* is a ``bool`` value.
+* *z* is a pointer to ``bool``.
+* *y* is a const pointer to ``bool``.
+* *i* is an unsigned (index) value.
+* *k* is an unsigned long long value.
+* *m* is a mask representation of type ``C::mask_type``.
+
+.. rubric:: Types and constants
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Name
+      - Type
+      - Description
+
+    * - ``C::vector_type``
+      - ``simd_traits<C>::vector_type``
+      - Underlying SIMD representation type.
+
+    * - ``C::scalar_type``
+      - ``simd_traits<C>::scalar_type``
+      - Should be convertible to/from *V*.
+
+    * - ``C::mask_impl``
+      - ``simd_traits<C>::mask_impl``
+      - Concrete implementation class for mask SIMD type.
+
+    * - ``C::mask_type``
+      - ``C::mask_impl::vector_type``
+      - Underlying SIMD representation for masks.
+
+    * - ``C::width``
+      - ``unsigned``
+      - The SIMD width *N*.
+
+.. rubric:: Initialization, load, store
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::broadcast(x)``
+      - ``C::vector_type``
+      - Fill representation with scalar *x*.
+
+    * - ``C::copy_to(v, p)``
+      - ``void``
+      - Store values *v*\ `i`:sub: to *p+i*. *p* may be unaligned.
+
+    * - ``C::copy_to_masked(v, p, m)``
+      - ``void``
+      - Store values *v*\ `i`:sub: to *p+i* wherever *m*\ `i`:sub: is true. *p* may be unaligned.
+
+    * - ``C::copy_from(c)``
+      - ``C::vector_type``
+      - Return a vector with values *v*\ `i`:sub: loaded from *c+i*. *c* may be unaligned.
+
+    * - ``C::copy_from_masked(c, m)``
+      - ``C::vector_type``
+      - Return a vector with values *v*\ `i`:sub: loaded from *c+i* wherever *m*\ `i`:sub: is true. *c* may be unaligned.
+
+    * - ``C::copy_from_masked(w, c, m)``
+      - ``C::vector_type``
+      - Return a vector with values *v*\ `i`:sub: loaded from *c+i* wherever *m*\ `i`:sub: is true, or equal to *w*\ `i`:sub:
+        otherwise. *c* may be unaligned.
+
+.. rubric:: Lane access
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::element(v, i)``
+      - ``C::scalar_type``
+      - Value in lane *i* of *v*.
+
+    * - ``C::set_element(r, i, x)``
+      - ``void``
+      - Set value in lane *i* of *r* to *x*.
+
+.. rubric:: Gather and scatter
+
+The offsets for gather and scatter operations are given
+by a vector type ``J::vector_type`` for some possibly
+different concrete implementation class *J*, and the
+static methods implementing gather and scatter are templated
+on this class.
+
+Implementations can provide optimized versions for specific
+index classes *J*; this process would be simplified with
+more support for casts between SIMD types and their concrete
+implementations, functionality which is not yet provided.
+
+The first argument to these functions is a dummy argument
+of type *J*, used only to disambiguate overloads.
+
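+As an illustration, an AVX2-based implementation class might provide a
+specialized overload along the following lines (a sketch only, assuming the
+``avx_int4``/``avx_double4`` classes of ``src/simd/avx.hpp``; the dummy
+``avx_int4`` argument selects the overload):
+
+.. container:: api-code
+
+    .. code-block:: cpp
+
+        // Gather four doubles using 32-bit indices held in an avx_int4
+        // representation; the scale of 8 converts indices to byte offsets.
+        static __m256d gather(avx_int4, const double* p, const __m128i& index) {
+            return _mm256_i32gather_pd(p, index, 8);
+        }
+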
+.. list-table::
+    :header-rows: 1
+    :widths: 20 20 60
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::gather(J{}, p, j)``
+      - ``C::vector_type``
+      - Vector *v* with values *v*\ `i`:sub: = ``p[j[i]]``.
+
+    * - ``C::gather(J{}, u, p, j, m)``
+      - ``C::vector_type``
+      - Vector *v* with values *v*\ `i`:sub: = *m*\ `i`:sub: ? ``p[j[i]]`` : *u*\ `i`:sub:.
+
+    * - ``C::scatter(J{}, u, p, j)``
+      - ``void``
+      - Write values *u*\ `i`:sub: to ``p[j[i]]``.
+
+    * - ``C::scatter(J{}, u, p, j, m)``
+      - ``void``
+      - Write values *u*\ `i`:sub: to ``p[j[i]]`` for lanes *i* where *m*\ `i`:sub: is true.
+
+.. rubric:: Arithmetic operations
+
+.. list-table::
+    :header-rows: 1
+    :widths: 20 20 60
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::negate(v)``
+      - ``C::vector_type``
+      - Lane-wise unary minus.
+
+    * - ``C::mul(u, v)``
+      - ``C::vector_type``
+      - Lane-wise multiplication.
+
+    * - ``C::add(u, v)``
+      - ``C::vector_type``
+      - Lane-wise addition.
+
+    * - ``C::sub(u, v)``
+      - ``C::vector_type``
+      - Lane-wise subtraction.
+
+    * - ``C::div(u, v)``
+      - ``C::vector_type``
+      - Lane-wise division.
+
+    * - ``C::fma(u, v, w)``
+      - ``C::vector_type``
+      - Lane-wise fused multiply-add (u*v+w).
+
+.. rubric:: Comparison and blends
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::cmp_eq(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* = *v*.
+
+    * - ``C::cmp_neq(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* ≠ *v*.
+
+    * - ``C::cmp_gt(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* > *v*.
+
+    * - ``C::cmp_geq(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* ≥ *v*.
+
+    * - ``C::cmp_lt(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* < *v*.
+
+    * - ``C::cmp_leq(u, v)``
+      - ``C::mask_type``
+      - Lane-wise *u* ≤ *v*.
+
+    * - ``C::ifelse(m, u, v)``
+      - ``C::vector_type``
+      - Vector *w* with values *w*\ `i`:sub: = *m*\ `i`:sub: ? *u*\ `i`:sub: : *v*\ `i`:sub:.
+
+.. rubric:: Mathematical function support
+
+With the exception of ``abs``, ``min`` and ``max``, these are only
+required for floating point vector implementations.
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::abs(v)``
+      - ``C::vector_type``
+      - Lane-wise absolute value.
+
+    * - ``C::min(u, v)``
+      - ``C::vector_type``
+      - Lane-wise minimum.
+
+    * - ``C::max(u, v)``
+      - ``C::vector_type``
+      - Lane-wise maximum.
+
+    * - ``C::sin(v)``
+      - ``C::vector_type``
+      - Lane-wise sine.
+
+    * - ``C::cos(v)``
+      - ``C::vector_type``
+      - Lane-wise cosine.
+
+    * - ``C::log(v)``
+      - ``C::vector_type``
+      - Lane-wise natural logarithm.
+
+    * - ``C::exp(v)``
+      - ``C::vector_type``
+      - Lane-wise exponential.
+
+    * - ``C::expm1(v)``
+      - ``C::vector_type``
+      - Lane-wise :math:`x \mapsto e^x -1`.
+
+    * - ``C::exprelr(v)``
+      - ``C::vector_type``
+      - Lane-wise :math:`x \mapsto x/(e^x -1)`.
+
+    * - ``C::pow(u, v)``
+      - ``C::vector_type``
+      - Lane-wise *u* raised to the power of *v*.
+
+.. rubric:: Mask value support
+
+Mask operations are only required if *C* constitutes the implementation of a
+SIMD mask class.
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::mask_broadcast(b)``
+      - ``C::vector_type``
+      - Fill mask representation with bool *b*.
+
+    * - ``C::mask_element(v, i)``
+      - ``bool``
+      - Mask value *v*\ `i`:sub:.
+
+    * - ``C::mask_set_element(u, i, b)``
+      - ``void``
+      - Set mask value *u*\ `i`:sub: to *b*.
+
+    * - ``C::mask_copy_to(v, z)``
+      - ``void``
+      - Write bool values to memory (unaligned).
+
+    * - ``C::mask_copy_from(y)``
+      - ``C::vector_type``
+      - Load bool values from memory (unaligned).
+
+    * - ``C::mask_unpack(k)``
+      - ``C::vector_type``
+      - Return vector *v* with boolean value *v*\ `i`:sub: equal
+        to the *i*\ th bit of *k*.
+
+.. rubric:: Logical operations
+
+Logical operations are only required if *C* constitutes the implementation of a
+SIMD mask class.
+
+.. list-table::
+    :header-rows: 1
+    :widths: 20 20 60
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::logical_not(u)``
+      - ``C::vector_type``
+      - Lane-wise negation.
+
+    * - ``C::logical_and(u, v)``
+      - ``C::vector_type``
+      - Lane-wise logical and.
+
+    * - ``C::logical_or(u, v)``
+      - ``C::vector_type``
+      - Lane-wise logical or.
+
+    * - ``C::select(m, v, w)``
+      - ``C::vector_type``
+      - Lane-wise *m* ? *v* : *w*.
+
+
+Missing functionality
+---------------------
+
+There is no support yet for the following features, although some of these
+will need to be provided in order to improve the efficiency of SIMD versions
+of our generated mechanisms.
+
+* A SIMD cast function, e.g. ``simd_cast<S>(const T&)`` that converts between
+  different SIMD wrappers of the same width. The infrastructure that supports
+  this in concrete implementation classes would also simplify the implementation
+  of more generic ``gather`` and ``scatter`` methods.
+
+* Horizontal reductions across the lanes of a SIMD value or where-expression.
+
+* Vectorizable implementations of trigonometric functions.
+
+* Compound assignment operations for where-expressions. Extending the concrete
+  implementation API to support this would allow, for example, efficient use
+  of AVX512 masked arithmetic instructions.
+
diff --git a/doc/simd_maths.rst b/doc/simd_maths.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b93b07e663973100949c6f885bea4dff6863bd78
--- /dev/null
+++ b/doc/simd_maths.rst
@@ -0,0 +1,185 @@
+Implementation of vector transcendental functions
+=================================================
+
+When building with the Intel C++ compiler, transcendental
+functions on SIMD values in ``simd<double, 8, simd_detail::avx512>``
+wrap calls to the Intel scalar vector mathematics library (SVML).
+
+Outside of this case, the functions *exp*, *log*, *expm1* and
+*exprelr* use explicit approximations as detailed below. The
+algorithms follow those used in the
+`Cephes library <http://www.netlib.org/cephes/>`_, with
+some accommodations.
+
+.. default-role:: math
+
+Exponentials
+------------
+
+`\operatorname{exp}(x)`
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The exponential is computed as
+
+.. math::
+
+    e^x = 2^n · e^g,
+
+with `|g| ≤ 0.5` and `n` an integer. The power of two
+is computed via direct manipulation of the exponent bits of the floating
+point representation, while `e^g` is approximated by a rational polynomial.
+
+`n` and `g` are computed by:
+
+.. math::
+
+    n &= \left\lfloor \frac{x}{\log 2} + 0.5 \right\rfloor
+
+    g &= x - n·\log 2
+
+where the subtraction in the calculation of `g` is performed in two stages,
+to limit cancellation error:
+
+.. math::
+
+    g &\leftarrow \operatorname{fl}(x - n · c_1)
+
+    g &\leftarrow \operatorname{fl}(g - n · c_2)
+
+where `c_1+c_2 = \log 2`, with `c_1` comprising the first 32 bits of the mantissa.
+(In principle `c_1` might contain more bits of the logarithm, but this
+particular decomposition matches that used in the Cephes library.) This
+decomposition gives `|g|\leq \frac{1}{2}\log 2\approx 0.347`.
+
+The rational approximation for `e^g` is of the form
+
+.. math::
+
+    e^g \approx \frac{R(g)}{R(-g)}
+
+where `R(g)` is a polynomial of order 6. The coefficients are again those
+used by Cephes, and probably are derived via a Remez algorithm.
+`R(g)` is decomposed into even and odd terms
+
+.. math::
+
+    R(g) = Q(x^2) + xP(x^2)
+
+so that the ratio can be calculated by:
+
+.. math::
+
+    e^g \approx 1 + \frac{2gP(g^2)}{Q(g^2)-gP(g^2)}.
+
+Randomized testing indicates the approximation is accurate to 2 ulp.
+
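+For reference, a scalar sketch of the scheme described above, using the
+coefficients declared in ``src/simd/approx.hpp`` (the function name is
+illustrative; the SIMD implementations in ``src/simd/avx.hpp`` are the
+vectorized equivalent):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        #include <cmath>
+        #include <simd/approx.hpp>
+
+        double exp_scalar_sketch(double x) {
+            using namespace arb::simd_detail;
+
+            double n = std::floor(x*ln2inv + 0.5);      // n = floor(x/ln 2 + 0.5)
+            double g = x - n*ln2C1;                     // two-stage subtraction of n·ln 2
+            g -= n*ln2C2;
+
+            double gg = g*g;
+            double odd  = g*(P0exp + gg*(P1exp + gg*P2exp));           // g·P(g²)
+            double even = Q0exp + gg*(Q1exp + gg*(Q2exp + gg*Q3exp));  // Q(g²)
+
+            double eg = 1 + 2*odd/(even - odd);         // e^g ≈ R(g)/R(-g)
+            return std::ldexp(eg, (int)n);              // scale by 2^n
+        }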
+
+`\operatorname{expm1}(x)`
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A similar decomposition of `x = g + n·\log 2` is performed so that
+`|g|≤0.5`, with the exception that `n` is always taken to
+be zero for `|x|≤0.5`, i.e.
+
+.. math::
+
+    n = \begin{cases}
+          0&\text{if $|x|≤0.5$,}\\
+          \left\lfloor \frac{x}{\log 2} + 0.5 \right\rfloor
+          &\text{otherwise.}
+        \end{cases}
+
+
+`\operatorname{expm1}(x)` is then computed as
+
+.. math::
+
+    e^x - 1 = 2^n·(e^g - 1)+(2^n-1),
+
+and the same rational polynomial is used to approximate `e^g-1`,
+
+.. math::
+
+    e^g - 1 \approx \frac{2gP(g^2)}{Q(g^2)-gP(g^2)}.
+
+The scaling step for `n≠0` is in practice calculated as
+
+.. math::
+
+    e^x - 1 = 2·(2^{n-1}·(e^g - 1)+(2^{n-1}-0.5)),
+
+in order to avoid overflow at the upper end of the range.
+
+The order 6 rational polynomial approximation for small `x`
+is insufficiently accurate to maintain 1 ulp accuracy; randomized
+testing indicates a maximum error of up to 3 ulp.
+
+
+`\operatorname{exprelr}(x)`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The function is defined as
+
+.. math::
+
+    \operatorname{exprelr}(x) = x/(e^x-1),
+
+and is the reciprocal of the relative exponential function,
+
+.. math::
+
+    \operatorname{exprel}(x) &= {}_1F_1(1; 2; x)\\
+                             &= \frac{e^x-1}{x}.
+
+This is computed in terms of expm1 by:
+
+.. math::
+
+    \operatorname{exprelr}(x) :=
+      \begin{cases}
+          1&\text{if $\operatorname{fl}(1+x) = 1$,}\\
+          x/\operatorname{expm1}(x)&\text{otherwise.}
+      \end{cases}
+
+With the approximation for `\operatorname{expm1}` used above,
+randomized testing demonstrates a maximum error on the order
+of 4 ulp.
+
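+A scalar sketch of this computation (the function name is illustrative;
+``std::expm1`` stands in for the vectorized expm1 described above):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        #include <cmath>
+
+        double exprelr_scalar_sketch(double x) {
+            // fl(1+x) == 1: treat x as zero and return the limit value 1.
+            if (1.0 + x == 1.0) return 1.0;
+            return x/std::expm1(x);
+        }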
+
+Logarithms
+----------
+
+The natural logarithm is computed as
+
+.. math::
+
+    \log x = \log u + n·\log 2
+
+where `n` is an integer and `u` is in the interval
+`[ \frac{1}{2}\sqrt 2, \sqrt 2]`. The logarithm of
+`u` is then approximated by the rational polynomial
+used in the Cephes implementation,
+
+.. math::
+
+    \log u &\approx R(u-1)
+
+    R(z) &= z - \frac{z^2}{2} + z^3·\frac{P(z)}{Q(z)},
+
+where `P` and `Q` are polynomials of degree 5, with
+`Q` monic.
+
+Cancellation error is minimized by computing the sum for
+`\log x` as:
+
+.. math::
+
+    s &\leftarrow \operatorname{fl}(z^3·P(z)/Q(z))\\
+    s &\leftarrow \operatorname{fl}(s + n·c_4)\\
+    s &\leftarrow \operatorname{fl}(s - 0.5·z^2)\\
+    s &\leftarrow \operatorname{fl}(s + z)\\
+    s &\leftarrow \operatorname{fl}(s + n·c_3)
+
+where `z=u-1` and `c_3+c_4=\log 2`, with `c_3` comprising
+the first 9 bits of the mantissa.
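+
+A scalar sketch of this summation order, with ``z = u-1`` and ``n`` the
+integer exponent, using the coefficients from ``src/simd/approx.hpp``
+(the function name is illustrative):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        #include <simd/approx.hpp>
+
+        double log_tail_sketch(double z, double n) {
+            using namespace arb::simd_detail;
+
+            // P(z) and Q(z), Q monic, evaluated by Horner's rule.
+            double pz = P0log + z*(P1log + z*(P2log + z*(P3log + z*(P4log + z*P5log))));
+            double qz = Q0log + z*(Q1log + z*(Q2log + z*(Q3log + z*(Q4log + z))));
+
+            double s = z*z*z*pz/qz;
+            s += n*ln2C4;
+            s -= 0.5*z*z;
+            s += z;
+            s += n*ln2C3;
+            return s;
+        }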
+
diff --git a/doc/static/custom.css b/doc/static/custom.css
index 005eac316b8d6274be5c4749129a7b8780a692fe..54b964182c59ae21cf065a5a005579faea4d56ce 100644
--- a/doc/static/custom.css
+++ b/doc/static/custom.css
@@ -5,7 +5,7 @@
     padding-left: 3pt;
     content: "Example";
     background: #6ab0de;
-    color: #ffffff
+    color: #ffffff;
 }
 
 .api-code>div[class^='highlight']:before {
@@ -14,9 +14,24 @@
     padding-bottom: 2pt;
     padding-left: 3pt;
     content: "API";
-    background: #e0e0e0
+    background: #e0e0e0;
 }
 
 .api-code>div[class^='highlight'] {
-    background: #f4f4f4
+    background: #f4f4f4;
+}
+
+li>p:last-child {
+    padding-bottom: 2ex;
+}
+
+table.docutils td {
+    vertical-align: top !important;
+}
+
+@media screen and (min-width: 40em) {
+    .wy-table-responsive table td,
+    .wy-table-responsive table th {
+        white-space: normal;
+    }
 }
diff --git a/src/simd/approx.hpp b/src/simd/approx.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd5f8395414b69289db93fd94985a8d662569aa3
--- /dev/null
+++ b/src/simd/approx.hpp
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <cfloat>
+
+// Constants/magic numbers for the approximation of
+// exponential, trigonometric and special functions.
+//
+// Polynomial coefficients and evaluation orders
+// match those of the Cephes library,
+// <URL: http://www.netlib.org/cephes>.
+//
+// Refer to the developer documentation for more
+// detail concerning the approximations.
+
+namespace arb {
+namespace simd_detail {
+
+// Exponential:
+//
+// Approximation of exponential by a Padé-like rational
+// polynomial R(x)/R(-x) of order 6.
+//
+// P comprises the odd coefficients, and Q the even.
+
+constexpr double P0exp = 9.99999999999999999910E-1;
+constexpr double P1exp = 3.02994407707441961300E-2;
+constexpr double P2exp = 1.26177193074810590878E-4;
+
+constexpr double Q0exp = 2.00000000000000000009E0;
+constexpr double Q1exp = 2.27265548208155028766E-1;
+constexpr double Q2exp = 2.52448340349684104192E-3;
+constexpr double Q3exp = 3.00198505138664455042E-6;
+
+// Cancellation-corrected ln(2) = ln2C1 + ln2C2:
+
+constexpr double ln2C1 = 6.93145751953125E-1;
+constexpr double ln2C2 = 1.42860682030941723212E-6;
+
+// 1/ln(2):
+
+constexpr double ln2inv = 1.4426950408889634073599;
+
+// Min and max argument values for finite and normal
+// double-precision exponential.
+
+constexpr double exp_minarg = -708.3964185322641;
+constexpr double exp_maxarg = 709.782712893384;
+
+// For expm1, minimum argument that gives a result
+// over -1.
+
+constexpr double expm1_minarg = -37.42994775023705;
+
+// Logarithm:
+//
+// Positive denormal numbers are treated as zero
+// for the purposes of the log function. 
+
+constexpr double log_minarg = DBL_MIN;
+constexpr double sqrt2 = 1.41421356237309504880;
+
+// Cancellation-corrected ln(2) = ln2C3 + ln2C4:
+
+constexpr double ln2C3 = 0.693359375;
+constexpr double ln2C4 = -2.121944400546905827679e-4;
+
+// Polynomial coefficients (Q is also order 5,
+// but monic).
+
+constexpr double P0log = 7.70838733755885391666E0;
+constexpr double P1log = 1.79368678507819816313e1;
+constexpr double P2log = 1.44989225341610930846e1;
+constexpr double P3log = 4.70579119878881725854e0;
+constexpr double P4log = 4.97494994976747001425e-1;
+constexpr double P5log = 1.01875663804580931796e-4;
+
+constexpr double Q0log = 2.31251620126765340583E1;
+constexpr double Q1log = 7.11544750618563894466e1;
+constexpr double Q2log = 8.29875266912776603211e1;
+constexpr double Q3log = 4.52279145837532221105e1;
+constexpr double Q4log = 1.12873587189167450590e1;
+
+} // namespace simd_detail
+} // namespace arb
diff --git a/src/simd/avx.hpp b/src/simd/avx.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1527f1b073ee3a4f7b1d147b3b5d64787fa29cc5
--- /dev/null
+++ b/src/simd/avx.hpp
@@ -0,0 +1,921 @@
+#pragma once
+
+// AVX/AVX2 SIMD intrinsics implementation.
+
+#ifdef __AVX__
+
+#include <cmath>
+#include <cstdint>
+#include <immintrin.h>
+
+#include <simd/approx.hpp>
+#include <simd/implbase.hpp>
+
+namespace arb {
+namespace simd_detail {
+
+struct avx_int4;
+struct avx_double4;
+
+template <>
+struct simd_traits<avx_int4> {
+    static constexpr unsigned width = 4;
+    using scalar_type = std::int32_t;
+    using vector_type = __m128i;
+    using mask_impl = avx_int4;
+};
+
+template <>
+struct simd_traits<avx_double4> {
+    static constexpr unsigned width = 4;
+    using scalar_type = double;
+    using vector_type = __m256d;
+    using mask_impl = avx_double4;
+};
+
+struct avx_int4: implbase<avx_int4> {
+    // Use default implementations for: element, set_element, div.
+
+    using int32 = std::int32_t;
+
+    static __m128i broadcast(int32 v) {
+        return _mm_set1_epi32(v);
+    }
+
+    static void copy_to(const __m128i& v, int32* p) {
+        _mm_storeu_si128((__m128i*)p, v);
+    }
+
+    static void copy_to_masked(const __m128i& v, int32* p, const __m128i& mask) {
+        _mm_maskstore_ps(reinterpret_cast<float*>(p), mask, _mm_castsi128_ps(v));
+    }
+
+    static __m128i copy_from(const int32* p) {
+        return _mm_loadu_si128((const __m128i*)p);
+    }
+
+    static __m128i copy_from_masked(const int32* p, const __m128i& mask) {
+        return _mm_castps_si128(_mm_maskload_ps(reinterpret_cast<const float*>(p), mask));
+    }
+
+    static __m128i copy_from_masked(const __m128i& v, const int32* p, const __m128i& mask) {
+        __m128 d = _mm_maskload_ps(reinterpret_cast<const float*>(p), mask);
+        return ifelse(mask, _mm_castps_si128(d), v);
+    }
+
+    static __m128i negate(const __m128i& a) {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_sub_epi32(zero, a);
+    }
+
+    static __m128i add(const __m128i& a, const __m128i& b) {
+        return _mm_add_epi32(a, b);
+    }
+
+    static __m128i sub(const __m128i& a, const __m128i& b) {
+        return _mm_sub_epi32(a, b);
+    }
+
+    static __m128i mul(const __m128i& a, const __m128i& b) {
+        return _mm_mullo_epi32(a, b);
+    }
+
+    static __m128i fma(const __m128i& a, const __m128i& b, const __m128i& c) {
+        return _mm_add_epi32(_mm_mullo_epi32(a, b), c);
+    }
+
+    static __m128i logical_not(const __m128i& a) {
+        __m128i ones = {};
+        return _mm_xor_si128(a, _mm_cmpeq_epi32(ones, ones));
+    }
+
+    static __m128i logical_and(const __m128i& a, const __m128i& b) {
+        return _mm_and_si128(a, b);
+    }
+
+    static __m128i logical_or(const __m128i& a, const __m128i& b) {
+        return _mm_or_si128(a, b);
+    }
+
+    static __m128i cmp_eq(const __m128i& a, const __m128i& b) {
+        return _mm_cmpeq_epi32(a, b);
+    }
+
+    static __m128i cmp_neq(const __m128i& a, const __m128i& b) {
+        return logical_not(cmp_eq(a, b));
+    }
+
+    static __m128i cmp_gt(const __m128i& a, const __m128i& b) {
+        return _mm_cmpgt_epi32(a, b);
+    }
+
+    static __m128i cmp_geq(const __m128i& a, const __m128i& b) {
+        return logical_not(cmp_gt(b, a));
+    }
+
+    static __m128i cmp_lt(const __m128i& a, const __m128i& b) {
+        return cmp_gt(b, a);
+    }
+
+    static __m128i cmp_leq(const __m128i& a, const __m128i& b) {
+        return logical_not(cmp_gt(a, b));
+    }
+
+    static __m128i ifelse(const __m128i& m, const __m128i& u, const __m128i& v) {
+        return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(u), _mm_castsi128_ps(m)));
+    }
+
+    static __m128i mask_broadcast(bool b) {
+        return _mm_set1_epi32(-(int32)b);
+    }
+
+    static __m128i mask_unpack(unsigned long long k) {
+        // Only care about bottom four bits of k.
+        __m128i b = _mm_set1_epi8((char)k);
+        b = logical_or(b, _mm_setr_epi32(0xfefefefe,0xfdfdfdfd,0xfbfbfbfb,0xf7f7f7f7));
+
+        __m128i ones = {};
+        ones = _mm_cmpeq_epi32(ones, ones);
+        return _mm_cmpeq_epi32(b, ones);
+    }
+
+    static bool mask_element(const __m128i& u, int i) {
+        return static_cast<bool>(element(u, i));
+    }
+
+    static void mask_set_element(__m128i& u, int i, bool b) {
+        set_element(u, i, -(int32)b);
+    }
+
+    static void mask_copy_to(const __m128i& m, bool* y) {
+        // Negate (convert 0xffffffff to 0x00000001) and move low bytes to
+        // bottom 4 bytes.
+
+        __m128i s = _mm_setr_epi32(0x0c080400ul,0,0,0);
+        __m128i p = _mm_shuffle_epi8(negate(m), s);
+        std::memcpy(y, &p, 4);
+    }
+
+    static __m128i mask_copy_from(const bool* w) {
+        __m128i r;
+        std::memcpy(&r, w, 4);
+
+        __m128i s = _mm_setr_epi32(0x80808000ul, 0x80808001ul, 0x80808002ul, 0x80808003ul);
+        return negate(_mm_shuffle_epi8(r, s));
+    }
+
+    static __m128i max(const __m128i& a, const __m128i& b) {
+        return _mm_max_epi32(a, b);
+    }
+
+    static __m128i min(const __m128i& a, const __m128i& b) {
+        return _mm_min_epi32(a, b);
+    }
+};
+
+struct avx_double4: implbase<avx_double4> {
+    // Use default implementations for:
+    //     element, set_element, fma.
+
+    using int64 = std::int64_t;
+
+    // CMPPD predicates:
+    static constexpr int cmp_eq_oq =    0;
+    static constexpr int cmp_unord_q =  3;
+    static constexpr int cmp_neq_uq =   4;
+    static constexpr int cmp_true_uq = 15;
+    static constexpr int cmp_lt_oq =   17;
+    static constexpr int cmp_le_oq =   18;
+    static constexpr int cmp_nge_uq =  25;
+    static constexpr int cmp_ge_oq =   29;
+    static constexpr int cmp_gt_oq =   30;
+
+    static __m256d broadcast(double v) {
+        return _mm256_set1_pd(v);
+    }
+
+    static void copy_to(const __m256d& v, double* p) {
+        _mm256_storeu_pd(p, v);
+    }
+
+    static void copy_to_masked(const __m256d& v, double* p, const __m256d& mask) {
+        _mm256_maskstore_pd(p, _mm256_castpd_si256(mask), v);
+    }
+
+    static __m256d copy_from(const double* p) {
+        return _mm256_loadu_pd(p);
+    }
+
+    static __m256d copy_from_masked(const double* p, const __m256d& mask) {
+        return _mm256_maskload_pd(p, _mm256_castpd_si256(mask));
+    }
+
+    static __m256d copy_from_masked(const __m256d& v, const double* p, const __m256d& mask) {
+        __m256d d = _mm256_maskload_pd(p, _mm256_castpd_si256(mask));
+        return ifelse(mask, d, v);
+    }
+
+    static __m256d negate(const __m256d& a) {
+        return _mm256_sub_pd(zero(), a);
+    }
+
+    static __m256d add(const __m256d& a, const __m256d& b) {
+        return _mm256_add_pd(a, b);
+    }
+
+    static __m256d sub(const __m256d& a, const __m256d& b) {
+        return _mm256_sub_pd(a, b);
+    }
+
+    static __m256d mul(const __m256d& a, const __m256d& b) {
+        return _mm256_mul_pd(a, b);
+    }
+
+    static __m256d div(const __m256d& a, const __m256d& b) {
+        return _mm256_div_pd(a, b);
+    }
+
+    static __m256d logical_not(const __m256d& a) {
+        __m256d ones = {};
+        return _mm256_xor_pd(a, _mm256_cmp_pd(ones, ones, 15));
+    }
+
+    static __m256d logical_and(const __m256d& a, const __m256d& b) {
+        return _mm256_and_pd(a, b);
+    }
+
+    static __m256d logical_or(const __m256d& a, const __m256d& b) {
+        return _mm256_or_pd(a, b);
+    }
+
+    static __m256d cmp_eq(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_eq_oq);
+    }
+
+    static __m256d cmp_neq(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_neq_uq);
+    }
+
+    static __m256d cmp_gt(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_gt_oq);
+    }
+
+    static __m256d cmp_geq(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_ge_oq);
+    }
+
+    static __m256d cmp_lt(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_lt_oq);
+    }
+
+    static __m256d cmp_leq(const __m256d& a, const __m256d& b) {
+        return _mm256_cmp_pd(a, b, cmp_le_oq);
+    }
+
+    static __m256d ifelse(const __m256d& m, const __m256d& u, const __m256d& v) {
+        return _mm256_blendv_pd(v, u, m);
+    }
+
+    static __m256d mask_broadcast(bool b) {
+        return _mm256_castsi256_pd(_mm256_set1_epi64x(-(int64)b));
+    }
+
+    static bool mask_element(const __m256d& u, int i) {
+        return static_cast<bool>(element(u, i));
+    }
+
+    static __m256d mask_unpack(unsigned long long k) {
+        // Only care about bottom four bits of k.
+        __m128i b = _mm_set1_epi8((char)k);
+        // (Note: there is no _mm_setr_epi64x (!))
+        __m128i bl = _mm_or_si128(b, _mm_set_epi64x(0xfdfdfdfdfdfdfdfd, 0xfefefefefefefefe));
+        __m128i bu = _mm_or_si128(b, _mm_set_epi64x(0xf7f7f7f7f7f7f7f7, 0xfbfbfbfbfbfbfbfb));
+
+        __m128i ones = {};
+        ones = _mm_cmpeq_epi32(ones, ones);
+        bl = _mm_cmpeq_epi64(bl, ones);
+        bu = _mm_cmpeq_epi64(bu, ones);
+        return _mm256_castsi256_pd(combine_m128i(bu, bl));
+    }
+
+    static void mask_set_element(__m256d& u, int i, bool b) {
+        char data[256];
+        _mm256_storeu_pd((double*)data, u);
+        ((int64*)data)[i] = -(int64)b;
+        u = _mm256_loadu_pd((double*)data);
+    }
+
+    static void mask_copy_to(const __m256d& m, bool* y) {
+        // Convert to 32-bit wide mask values, and delegate to
+        // avx_int4.
+
+        avx_int4::mask_copy_to(lo_epi32(_mm256_castpd_si256(m)), y);
+    }
+
+    static __m256d mask_copy_from(const bool* w) {
+        __m128i zero = _mm_setzero_si128();
+
+        __m128i r;
+        std::memcpy(&r, w, 4);
+
+        // Move bytes:
+        //   rl: byte 0 to byte 0, byte 1 to byte 8, zero elsewhere.
+        //   ru: byte 2 to byte 0, byte 3 to byte 8, zero elsewhere.
+        //
+        // Subtract from zero to translate
+        // 0x0000000000000001 to 0xffffffffffffffff.
+
+        __m128i sl = _mm_setr_epi32(0x80808000ul, 0x80808080ul, 0x80808001ul, 0x80808080ul);
+        __m128i rl = _mm_sub_epi64(zero, _mm_shuffle_epi8(r, sl));
+
+        __m128i su = _mm_setr_epi32(0x80808002ul, 0x80808080ul, 0x80808003ul, 0x80808080ul);
+        __m128i ru = _mm_sub_epi64(zero, _mm_shuffle_epi8(r, su));
+
+        return _mm256_castsi256_pd(combine_m128i(ru, rl));
+    }
+
+    static __m256d max(const __m256d& a, const __m256d& b) {
+        return _mm256_max_pd(a, b);
+    }
+
+    static __m256d min(const __m256d& a, const __m256d& b) {
+        return _mm256_min_pd(a, b);
+    }
+
+    static __m256d abs(const __m256d& x) {
+        __m256i m = _mm256_set1_epi64x(0x7fffffffffffffffll);
+        return _mm256_and_pd(x, _mm256_castsi256_pd(m));
+    }
+
+    // Exponential is calculated as follows:
+    //
+    //     e^x = e^g · 2^n,
+    //
+    // where g in [-0.5, 0.5) and n is an integer. 2^n can be
+    // calculated via bit manipulation or specialized scaling intrinsics,
+    // whereas e^g is approximated using the order-6 rational
+    // approximation:
+    //
+    //     e^g = R(g)/R(-g)
+    //
+    // with R(x) split into even and odd terms:
+    //
+    //     R(x) = Q(x^2) + x·P(x^2)
+    //
+    // so that the ratio can be computed as:
+    //
+    //     e^g = 1 + 2·g·P(g^2) / (Q(g^2)-g·P(g^2)).
+    //
+    // Note that the coefficients for R are close to but not the same as those
+    // from the 6,6 Padé approximant to the exponential. 
+    //
+    // The exponents n and g are calculated by:
+    //
+    //     n = floor(x/ln(2) + 0.5)
+    //     g = x - n·ln(2)
+    //
+    // so that x = g + n·ln(2). We have:
+    //
+    //     |g| = |x - n·ln(2)|
+    //         = |x - x + α·ln(2)|
+    //  
+    // for some fraction |α| ≤ 0.5, and thus |g| ≤ 0.5ln(2) ≈ 0.347.
+    //
+    // The subtraction x - n·ln(2) is performed in two parts, with
+    // ln(2) = C1 + C2, in order to compensate for the possible loss of
+    // precision attributable to cancellation. C1 comprises the first
+    // 32 bits of the mantissa, C2 the remainder.
+
+    static  __m256d exp(const __m256d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(exp_minarg));
+        auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q);
+
+        // Compute n and g.
+
+        auto n = _mm256_floor_pd(add(mul(broadcast(ln2inv), x), broadcast(0.5)));
+
+        auto g = sub(x, mul(n, broadcast(ln2C1)));
+        g = sub(g, mul(n, broadcast(ln2C2)));
+
+        auto gg = mul(g, g);
+
+        // Compute the g*P(g^2) and Q(g^2).
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        // Compute R(g)/R(-g) = 1 + 2*g*P(g^2) / (Q(g^2)-g*P(g^2))
+
+        auto expg = add(broadcast(1), mul(broadcast(2),
+            div(odd, sub(even, odd))));
+
+        // Finally, compute product with 2^n.
+        // Note: can only achieve full range using the ldexp implementation,
+        // rather than multiplying by 2^n directly.
+
+        auto result = ldexp_positive(expg, _mm256_cvtpd_epi32(n));
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(0),
+            ifelse(is_nan, broadcast(NAN),
+                   result)));
+    }
+
+    // Use same rational polynomial expansion as for exp(x), without
+    // the unit term.
+    //
+    // For |x|<=0.5, take n to be zero. Otherwise, set n as above,
+    // and scale the answer by:
+    //     expm1(x) = 2^n * expm1(g) + (2^n - 1).
+
+    static  __m256d expm1(const __m256d& x) {
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(expm1_minarg));
+        auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q);
+
+        auto half = broadcast(0.5);
+        auto one = broadcast(1.);
+        auto two = add(one, one);
+
+        auto nzero = cmp_leq(abs(x), half);
+        auto n = _mm256_floor_pd(add(mul(broadcast(ln2inv), x), half));
+        n = ifelse(nzero, zero(), n);
+
+        auto g = sub(x, mul(n, broadcast(ln2C1)));
+        g = sub(g, mul(n, broadcast(ln2C2)));
+
+        auto gg = mul(g, g);
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        auto expgm1 = mul(two, div(odd, sub(even, odd)));
+
+        // For small x (n zero), bypass scaling step to avoid underflow.
+        // Otherwise, compute result 2^n * expgm1 + (2^n-1) by:
+        //     result = 2 * ( 2^(n-1)*expgm1 + (2^(n-1)+0.5) )
+        // to avoid overflow when n=1024.
+
+        auto nm1 = _mm256_cvtpd_epi32(sub(n, one));
+        auto scaled = mul(add(sub(exp2int(nm1), half), ldexp_normal(expgm1, nm1)), two);
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-1),
+            ifelse(is_nan, broadcast(NAN),
+            ifelse(nzero, expgm1,
+                   scaled))));
+    }
+
+    // Natural logarithm:
+    //
+    // Decompose x = 2^g * u such that g is an integer and
+    // u is in the interval [sqrt(2)/2, sqrt(2)].
+    //
+    // Then ln(x) is computed as R(u-1) + g*ln(2) where
+    // R(z) is a rational polynomial approximating ln(z+1)
+    // for small z:
+    //
+    //     R(z) = z - z^2/2 + z^3 * P(z)/Q(z)
+    //
+    // where P and Q are degree 5 polynomials, Q monic.
+    //
+    // In order to avoid cancellation error, ln(2) is represented
+    // as C3 + C4, with the C4 correction term approx. -2.1e-4.
+    // The summation order for R(z)+2^g is:
+    //
+    //     z^3*P(z)/Q(z) + g*C4 - z^2/2 + z + g*C3
+
+    static __m256d log(const __m256d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_geq(x, broadcast(HUGE_VAL));
+        auto is_small = cmp_lt(x, broadcast(log_minarg));
+        auto is_domainerr = _mm256_cmp_pd(x, broadcast(0), cmp_nge_uq);
+
+        __m256d g = _mm256_cvtepi32_pd(logb_normal(x));
+        __m256d u = fraction_normal(x);
+
+        __m256d one = broadcast(1.);
+        __m256d half = broadcast(0.5);
+        auto gtsqrt2 = cmp_geq(u, broadcast(sqrt2));
+        g = ifelse(gtsqrt2, add(g, one), g);
+        u = ifelse(gtsqrt2, mul(u, half), u);
+
+        auto z = sub(u, one);
+        auto pz = horner(z, P0log, P1log, P2log, P3log, P4log, P5log);
+        auto qz = horner1(z, Q0log, Q1log, Q2log, Q3log, Q4log);
+
+        auto z2 = mul(z, z);
+        auto z3 = mul(z2, z);
+
+        auto r = div(mul(z3, pz), qz);
+        r = add(r, mul(g,  broadcast(ln2C4)));
+        r = sub(r, mul(z2, half));
+        r = add(r, z);
+        r = add(r, mul(g,  broadcast(ln2C3)));
+
+        // Return NaN if x is NaN or negative, +inf if x is +inf,
+        // or -inf if zero or (positive) denormal.
+
+        return
+            ifelse(is_domainerr, broadcast(NAN),
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-HUGE_VAL),
+                r)));
+    }
+
+protected:
+    static __m256d zero() {
+        return _mm256_setzero_pd();
+    }
+
+    static __m128i hi_epi32(__m256i x) {
+        __m128i xl = _mm256_castsi256_si128(x);
+        __m128i xh = _mm256_extractf128_si256(x, 1);
+        return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(xl), _mm_castsi128_ps(xh), 0xddu));
+    }
+
+    static __m128i lo_epi32(__m256i x) {
+        __m128i xl = _mm256_castsi256_si128(x);
+        __m128i xh = _mm256_extractf128_si256(x, 1);
+        return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(xl), _mm_castsi128_ps(xh), 0x88u));
+    }
+
+    static __m256i combine_m128i(__m128i hi, __m128i lo) {
+        return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+    }
+
+    // horner(x, a0, ..., an) computes the degree n polynomial A(x) with coefficients
+    // a0, ..., an by a0+x·(a1+x·(a2+...+x·an)...).
+
+    static inline __m256d horner(__m256d x, double a0) {
+        return broadcast(a0);
+    }
+
+    template <typename... T>
+    static __m256d horner(__m256d x, double a0, T... tail) {
+        return add(mul(x, horner(x, tail...)), broadcast(a0));
+    }
+
+    // horner1(x, a0, ..., an) computes the degree n+1 monic polynomial A(x) with coefficients
+    // a0, ..., an, 1 by a0+x·(a1+x·(a2+...+x·(an+x)...).
+
+    static inline __m256d horner1(__m256d x, double a0) {
+        return add(x, broadcast(a0));
+    }
+
+    template <typename... T>
+    static __m256d horner1(__m256d x, double a0, T... tail) {
+        return add(mul(x, horner1(x, tail...)), broadcast(a0));
+    }
+
+    // Compute 2.0^n.
+    static __m256d exp2int(__m128i n) {
+        n = _mm_slli_epi32(n, 20);
+        n = _mm_add_epi32(n, _mm_set1_epi32(1023<<20));
+
+        auto nl = _mm_shuffle_epi32(n, 0x50);
+        auto nh = _mm_shuffle_epi32(n, 0xfa);
+        __m256i nhnl = combine_m128i(nh, nl);
+
+        return _mm256_castps_pd(
+            _mm256_blend_ps(_mm256_set1_ps(0),
+            _mm256_castsi256_ps(nhnl),0xaa));
+    }
+
+    // Compute n and f such that x = 2^n·f, with |f| ∈ [1,2), given x is finite and normal.
+    static __m128i logb_normal(const __m256d& x) {
+        __m128i xw = hi_epi32(_mm256_castpd_si256(x));
+        __m128i emask = _mm_set1_epi32(0x7ff00000);
+        __m128i ebiased = _mm_srli_epi32(_mm_and_si128(xw, emask), 20);
+
+        return _mm_sub_epi32(ebiased, _mm_set1_epi32(1023));
+    }
+
+    static __m256d fraction_normal(const __m256d& x) {
+        // 0x800fffffffffffff (intrinsic takes signed parameter)
+        __m256d emask = _mm256_castsi256_pd(_mm256_set1_epi64x(-0x7ff0000000000001));
+        __m256d bias = _mm256_castsi256_pd(_mm256_set1_epi64x(0x3ff0000000000000));
+        return _mm256_or_pd(bias, _mm256_and_pd(emask, x));
+    }
+
+    // Compute 2^n·x when both x and 2^n·x are normal, finite and strictly positive doubles.
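+    // The shifted exponent n·2^20 is added directly into the high 32-bit word
+    // of each double, i.e. onto its biased exponent field; the preconditions
+    // guarantee no carry into the sign bit and no exponent overflow.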
+    static __m256d ldexp_positive(__m256d x, __m128i n) {
+        n = _mm_slli_epi32(n, 20);
+        auto zero = _mm_set1_epi32(0);
+        auto nl = _mm_unpacklo_epi32(zero, n);
+        auto nh = _mm_unpackhi_epi32(zero, n);
+
+        __m128d xl = _mm256_castpd256_pd128(x);
+        __m128d xh = _mm256_extractf128_pd(x, 1);
+
+        __m128i suml = _mm_add_epi64(nl, _mm_castpd_si128(xl));
+        __m128i sumh = _mm_add_epi64(nh, _mm_castpd_si128(xh));
+        __m256i sumhl = combine_m128i(sumh, suml);
+
+        return _mm256_castsi256_pd(sumhl);
+    }
+
+    // Compute 2^n·x when both x and 2^n·x are normal and finite.
+    static __m256d ldexp_normal(__m256d x, __m128i n) {
+        __m256d smask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7fffffffffffffffll));
+        __m256d sbits = _mm256_andnot_pd(smask, x);
+
+        n = _mm_slli_epi32(n, 20);
+        auto zi = _mm_set1_epi32(0);
+        auto nl = _mm_unpacklo_epi32(zi, n);
+        auto nh = _mm_unpackhi_epi32(zi, n);
+
+        __m128d xl = _mm256_castpd256_pd128(x);
+        __m128d xh = _mm256_extractf128_pd(x, 1);
+
+        __m128i suml = _mm_add_epi64(nl, _mm_castpd_si128(xl));
+        __m128i sumh = _mm_add_epi64(nh, _mm_castpd_si128(xh));
+        __m256i sumhl = combine_m128i(sumh, suml);
+
+        auto nzans = _mm256_or_pd(_mm256_and_pd(_mm256_castsi256_pd(sumhl), smask), sbits);
+        return ifelse(cmp_eq(x, zero()), zero(), nzans);
+    }
+};
+
+
+#if defined(__AVX2__) && defined(__FMA__)
+
+// Same implementation as for AVX.
+using avx2_int4 = avx_int4;
+
+struct avx2_double4;
+
+template <>
+struct simd_traits<avx2_double4> {
+    static constexpr unsigned width = 4;
+    using scalar_type = double;
+    using vector_type = __m256d;
+    using mask_impl = avx2_double4;
+};
+
+// Note: we derive from avx_double4 only as an implementation shortcut.
+// Because `avx2_double4` does not derive from `implbase<avx2_double4>`,
+// any fallback methods in `implbase` will use the `avx_double4`
+// functions rather than the `avx2_double4` functions.
+
+struct avx2_double4: avx_double4 {
+    static __m256d fma(const __m256d& a, const __m256d& b, const __m256d& c) {
+        return _mm256_fmadd_pd(a, b, c);
+    }
+
+    static vector_type logical_not(const vector_type& a) {
+        __m256i ones = {};
+        return _mm256_xor_pd(a, _mm256_castsi256_pd(_mm256_cmpeq_epi32(ones, ones)));
+    }
+
+    static __m256d mask_unpack(unsigned long long k) {
+        // Only care about bottom four bits of k.
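+        // Broadcast the low byte of k to every byte, then OR with a pattern
+        // that leaves only bit i clear in each byte of 64-bit lane i: lane i
+        // becomes all-ones exactly when bit i of k is set, which the 64-bit
+        // compare against all-ones turns into a full-width lane mask.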
+        __m256i b = _mm256_set1_epi8((char)k);
+        b = _mm256_or_si256(b, _mm256_setr_epi64x(
+                0xfefefefefefefefe, 0xfdfdfdfdfdfdfdfd,
+                0xfbfbfbfbfbfbfbfb, 0xf7f7f7f7f7f7f7f7));
+
+        __m256i ones = {};
+        ones = _mm256_cmpeq_epi64(ones, ones);
+        return _mm256_castsi256_pd(_mm256_cmpeq_epi64(ones, b));
+    }
+
+    static void mask_copy_to(const __m256d& m, bool* y) {
+        // Convert to 32-bit wide mask values, and delegate to
+        // avx_int4 (aliased above as avx2_int4).
+
+        avx_int4::mask_copy_to(lo_epi32(_mm256_castpd_si256(m)), y);
+    }
+
+    static __m256d mask_copy_from(const bool* w) {
+        __m256i zero = _mm256_setzero_si256();
+
+        __m128i r;
+        std::memcpy(&r, w, 4);
+        return _mm256_castsi256_pd(_mm256_sub_epi64(zero, _mm256_cvtepi8_epi64(r)));
+    }
+
+    static __m256d gather(avx2_int4, const double* p, const __m128i& index) {
+        return _mm256_i32gather_pd(p, index, 8);
+    }
+
+    static __m256d gather(avx2_int4, __m256d a, const double* p, const __m128i& index, const __m256d& mask) {
+        return  _mm256_mask_i32gather_pd(a, p, index, mask, 8);
+    };
+
+    // avx2_double4 versions of log, exp, and expm1 use the same algorithms as for avx_double4,
+    // but use AVX2-specialized bit manipulation and FMA.
+
+    static  __m256d exp(const __m256d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(exp_minarg));
+        auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q);
+
+        // Compute n and g.
+
+        auto n = _mm256_floor_pd(fma(broadcast(ln2inv), x, broadcast(0.5)));
+
+        auto g = fma(n, broadcast(-ln2C1), x);
+        g = fma(n, broadcast(-ln2C2), g);
+
+        auto gg = mul(g, g);
+
+        // Compute the g*P(g^2) and Q(g^2).
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        // Compute R(g)/R(-g) = 1 + 2*g*P(g^2) / (Q(g^2)-g*P(g^2))
+
+        auto expg = fma(broadcast(2), div(odd, sub(even, odd)), broadcast(1));
+
+        // Finally, compute product with 2^n.
+        // Note: can only achieve full range using the ldexp implementation,
+        // rather than multiplying by 2^n directly.
+
+        auto result = ldexp_positive(expg, _mm256_cvtpd_epi32(n));
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(0),
+            ifelse(is_nan, broadcast(NAN),
+                   result)));
+    }
+
+    static  __m256d expm1(const __m256d& x) {
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(expm1_minarg));
+        auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q);
+
+        auto half = broadcast(0.5);
+        auto one = broadcast(1.);
+        auto two = add(one, one);
+
+        auto smallx = cmp_leq(abs(x), half);
+        auto n = _mm256_floor_pd(fma(broadcast(ln2inv), x, half));
+        n = ifelse(smallx, zero(), n);
+
+        auto g = fma(n, broadcast(-ln2C1), x);
+        g = fma(n, broadcast(-ln2C2), g);
+
+        auto gg = mul(g, g);
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        auto expgm1 = mul(two, div(odd, sub(even, odd)));
+
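+        // Compute e^x-1 = 2^n·expgm1 + (2^n-1) as
+        //     2·( (2^(n-1)-0.5) + 2^(n-1)·expgm1 )
+        // to avoid overflow in the intermediate scaling when n = 1024.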
+        auto nm1 = _mm256_cvtpd_epi32(sub(n, one));
+        auto scaled = mul(add(sub(exp2int(nm1), half), ldexp_normal(expgm1, nm1)), two);
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-1),
+            ifelse(is_nan, broadcast(NAN),
+            ifelse(smallx, expgm1,
+                   scaled))));
+    }
+
+    static __m256d log(const __m256d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_geq(x, broadcast(HUGE_VAL));
+        auto is_small = cmp_lt(x, broadcast(log_minarg));
+        auto is_domainerr = _mm256_cmp_pd(x, broadcast(0), cmp_nge_uq);
+
+        __m256d g = _mm256_cvtepi32_pd(logb_normal(x));
+        __m256d u = fraction_normal(x);
+
+        __m256d one = broadcast(1.);
+        __m256d half = broadcast(0.5);
+        auto gtsqrt2 = cmp_geq(u, broadcast(sqrt2));
+        g = ifelse(gtsqrt2, add(g, one), g);
+        u = ifelse(gtsqrt2, mul(u, half), u);
+
+        auto z = sub(u, one);
+        auto pz = horner(z, P0log, P1log, P2log, P3log, P4log, P5log);
+        auto qz = horner1(z, Q0log, Q1log, Q2log, Q3log, Q4log);
+
+        auto z2 = mul(z, z);
+        auto z3 = mul(z2, z);
+
+        auto r = div(mul(z3, pz), qz);
+        r = fma(g,  broadcast(ln2C4), r);
+        r = fms(z2, half, r);
+        r = sub(z, r);
+        r = fma(g,  broadcast(ln2C3), r);
+
+        // Return NaN if x is NaN or negative, +inf if x is +inf,
+        // or -inf if zero or (positive) denormal.
+
+        return
+            ifelse(is_domainerr, broadcast(NAN),
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-HUGE_VAL),
+                r)));
+    }
+
+protected:
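+    // AVX2 variants of lo_epi32/hi_epi32: pack the low (resp. high) 32-bit
+    // half of each 64-bit lane into a __m128i via an in-lane shuffle followed
+    // by a cross-lane 64-bit permute.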
+    static __m128i lo_epi32(__m256i a) {
+        a = _mm256_shuffle_epi32(a, 0x08);
+        a = _mm256_permute4x64_epi64(a, 0x08);
+        return _mm256_castsi256_si128(a);
+    }
+
+    static  __m128i hi_epi32(__m256i a) {
+        a = _mm256_shuffle_epi32(a, 0x0d);
+        a = _mm256_permute4x64_epi64(a, 0x08);
+        return _mm256_castsi256_si128(a);
+    }
+
+    static inline __m256d horner(__m256d x, double a0) {
+        return broadcast(a0);
+    }
+
+    template <typename... T>
+    static __m256d horner(__m256d x, double a0, T... tail) {
+        return fma(x, horner(x, tail...), broadcast(a0));
+    }
+
+    static inline __m256d horner1(__m256d x, double a0) {
+        return add(x, broadcast(a0));
+    }
+
+    template <typename... T>
+    static __m256d horner1(__m256d x, double a0, T... tail) {
+        return fma(x, horner1(x, tail...), broadcast(a0));
+    }
+
+    static __m256d fms(const __m256d& a, const __m256d& b, const __m256d& c) {
+        return _mm256_fmsub_pd(a, b, c);
+    }
+
+    // Compute 2.0^n.
+    // Overrides avx_double4::exp2int.
+    static __m256d exp2int(__m128i n) {
+        __m256d x = broadcast(1);
+        __m256i nshift = _mm256_slli_epi64(_mm256_cvtepi32_epi64(n), 52);
+        __m256i sum = _mm256_add_epi64(nshift, _mm256_castpd_si256(x));
+        return _mm256_castsi256_pd(sum);
+    }
+
+    // Compute 2^n*x when both x and 2^n*x are normal, finite and strictly positive doubles.
+    // Overrides avx_double4::ldexp_positive.
+    static __m256d ldexp_positive(__m256d x, __m128i n) {
+        __m256i nshift = _mm256_slli_epi64(_mm256_cvtepi32_epi64(n), 52);
+        __m256i sum = _mm256_add_epi64(nshift, _mm256_castpd_si256(x));
+        return _mm256_castsi256_pd(sum);
+    }
+
+    // Compute 2^n*x when both x and 2^n*x are normal and finite.
+    // Overrides avx_double4::ldexp_normal.
+    static __m256d ldexp_normal(__m256d x, __m128i n) {
+        __m256d smask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7fffffffffffffffll));
+        __m256d sbits = _mm256_andnot_pd(smask, x);
+        __m256i nshift = _mm256_slli_epi64(_mm256_cvtepi32_epi64(n), 52);
+        __m256i sum = _mm256_add_epi64(nshift, _mm256_castpd_si256(x));
+
+        auto nzans = _mm256_or_pd(_mm256_and_pd(_mm256_castsi256_pd(sum), smask), sbits);
+        return ifelse(cmp_eq(x, zero()), zero(), nzans);
+    }
+
+    // Override avx_double4::logb_normal so as to use avx2 version of hi_epi32.
+    static __m128i logb_normal(const __m256d& x) {
+        __m128i xw = hi_epi32(_mm256_castpd_si256(x));
+        __m128i emask = _mm_set1_epi32(0x7ff00000);
+        __m128i ebiased = _mm_srli_epi32(_mm_and_si128(xw, emask), 20);
+
+        return _mm_sub_epi32(ebiased, _mm_set1_epi32(1023));
+    }
+};
+#endif // defined(__AVX2__) && defined(__FMA__)
+
+} // namespace simd_detail
+
+namespace simd_abi {
+    template <typename T, unsigned N> struct avx;
+
+    template <> struct avx<int, 4> { using type = simd_detail::avx_int4; };
+    template <> struct avx<double, 4> { using type = simd_detail::avx_double4; };
+
+#if defined(__AVX2__) && defined(__FMA__)
+    template <typename T, unsigned N> struct avx2;
+
+    template <> struct avx2<int, 4> { using type = simd_detail::avx2_int4; };
+    template <> struct avx2<double, 4> { using type = simd_detail::avx2_double4; };
+#endif
+} // namespace simd_abi
+
+} // namespace simd
+} // namespace arb
+
+#endif // def __AVX__
diff --git a/src/simd/avx512.hpp b/src/simd/avx512.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f956bb00e9e96dfae90a259eb78cf0d487cf3664
--- /dev/null
+++ b/src/simd/avx512.hpp
@@ -0,0 +1,712 @@
+#pragma once
+
+// AVX512F SIMD intrinsics implementation.
+
+#ifdef __AVX512F__
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+#include <immintrin.h>
+
+#include <simd/approx.hpp>
+#include <simd/implbase.hpp>
+
+namespace arb {
+namespace simd {
+namespace simd_detail {
+
+struct avx512_double8;
+struct avx512_int8;
+struct avx512_mask8;
+
+template <>
+struct simd_traits<avx512_mask8> {
+    static constexpr unsigned width = 8;
+    using scalar_type = bool;
+    using vector_type = __mmask8;
+    using mask_impl = avx512_mask8;
+};
+
+template <>
+struct simd_traits<avx512_double8> {
+    static constexpr unsigned width = 8;
+    using scalar_type = double;
+    using vector_type = __m512d;
+    using mask_impl = avx512_mask8;
+};
+
+template <>
+struct simd_traits<avx512_int8> {
+    static constexpr unsigned width = 8;
+    using scalar_type = std::int32_t;
+    using vector_type = __m512i;
+    using mask_impl = avx512_mask8;
+};
+
+struct avx512_mask8: implbase<avx512_mask8> {
+    static __mmask8 broadcast(bool b) {
+        return _mm512_int2mask(-b);
+    }
+
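+    // Expand the mask to one 32-bit 0/1 value per lane, pack the low byte of
+    // each value into the first eight bytes, and store those bytes as bools.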
+    static void copy_to(const __mmask8& k, bool* b) {
+        __m256i a = _mm256_setzero_si256();
+        a = _mm512_castsi512_si256(_mm512_mask_set1_epi32(_mm512_castsi256_si512(a), k, 1));
+
+        __m256i s = _mm256_set1_epi32(0x0c080400);
+        a = _mm256_shuffle_epi8(a, s);
+
+        s = _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0);
+        a = _mm256_permutevar8x32_epi32(a, s);
+
+        std::memcpy(b, &a, 8);
+    }
+
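+    // Load the eight bool bytes, map non-zero bytes to 0xff by negation, and
+    // collect their sign bits into the low eight bits of the mask.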
+    static __mmask8 copy_from(const bool* p) {
+        __m256i a = _mm256_setzero_si256();
+        std::memcpy(&a, p, 8);
+        a = _mm256_sub_epi8(_mm256_setzero_si256(), a);
+        return _mm512_int2mask(_mm256_movemask_epi8(a));
+    }
+
+    // Note: fall back to implbase implementations of copy_to_masked and copy_from_masked;
+    // could be improved with the use of AVX512BW instructions on supported platforms.
+
+    static __mmask8 logical_not(const __mmask8& k) {
+        return _mm512_knot(k);
+    }
+
+    static __mmask8 logical_and(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kand(a, b);
+    }
+
+    static __mmask8 logical_or(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kor(a, b);
+    }
+
+    // Arithmetic operations not necessarily appropriate for
+    // packed bit mask, but implemented for completeness/testing,
+    // with Z modulo 2 semantics:
+    //     a + b   is equivalent to   a ^ b
+    //     a * b                      a & b
+    //     a / b                      a
+    //     a - b                      a ^ b
+    //     -a                         a
+    //     max(a, b)                  a | b
+    //     min(a, b)                  a & b
+
+    static __mmask8 negate(const __mmask8& a) {
+        return a;
+    }
+
+    static __mmask8 add(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kxor(a, b);
+    }
+
+    static __mmask8 sub(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kxor(a, b);
+    }
+
+    static __mmask8 mul(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kand(a, b);
+    }
+
+    static __mmask8 div(const __mmask8& a, const __mmask8& b) {
+        return a;
+    }
+
+    static __mmask8 fma(const __mmask8& a, const __mmask8& b, const __mmask8& c) {
+        return add(mul(a, b), c);
+    }
+
+    static __mmask8 max(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kor(a, b);
+    }
+
+    static __mmask8 min(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kand(a, b);
+    }
+
+    // Comparison operators are also taken as operating on Z modulo 2,
+    // with 1 > 0:
+    //
+    //     a > b    is equivalent to  a & ~b
+    //     a >= b                     a | ~b,  ~(~a & b)
+    //     a < b                      ~a & b
+    //     a <= b                     ~a | b,  ~(a & ~b)
+    //     a == b                     ~(a ^ b)
+    //     a != b                     a ^ b
+
+    static __mmask8 cmp_eq(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kxnor(a, b);
+    }
+
+    static __mmask8 cmp_neq(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kxor(a, b);
+    }
+
+    static __mmask8 cmp_lt(const __mmask8& a, const __mmask8& b) {
+        return _mm512_kandn(a, b);
+    }
+
+    static __mmask8 cmp_gt(const __mmask8& a, const __mmask8& b) {
+        return cmp_lt(b, a);
+    }
+
+    static __mmask8 cmp_geq(const __mmask8& a, const __mmask8& b) {
+        return logical_not(cmp_lt(a, b));
+    }
+
+    static __mmask8 cmp_leq(const __mmask8& a, const __mmask8& b) {
+        return logical_not(cmp_gt(a, b));
+    }
+
+    static __mmask8 ifelse(const __mmask8& m, const __mmask8& u, const __mmask8& v) {
+        return _mm512_kor(_mm512_kandn(m, u), _mm512_kand(m, v));
+    }
+
+    static bool element(const __mmask8& k, int i) {
+        return _mm512_mask2int(k)&(1<<i);
+    }
+
+    static void set_element(__mmask8& k, int i, bool b) {
+        int n = _mm512_mask2int(k);
+        k = _mm512_int2mask((n&~(1<<i))|(b<<i));
+    }
+
+    static __mmask8 mask_broadcast(bool b) {
+        return broadcast(b);
+    }
+
+    static __mmask8 mask_unpack(unsigned long long p) {
+        return _mm512_int2mask(p);
+    }
+
+    static bool mask_element(const __mmask8& u, int i) {
+        return element(u, i);
+    }
+
+    static void mask_set_element(__mmask8& u, int i, bool b) {
+        set_element(u, i, b);
+    }
+
+    static void mask_copy_to(const __mmask8& m, bool* y) {
+        copy_to(m, y);
+    }
+
+    static __mmask8 mask_copy_from(const bool* y) {
+        return copy_from(y);
+    }
+};
+
+struct avx512_int8: implbase<avx512_int8> {
+    // Use default implementations for:
+    //     element, set_element.
+    //
+    // Consider changing mask representation to __mmask16
+    // and be explicit about comparison masks: restrictions
+    // to __mmask8 seem to produce a lot of ultimately unnecessary
+    // operations. 
+
+    using int32 = std::int32_t;
+
+    static __mmask8 lo() {
+        return _mm512_int2mask(0xff);
+    }
+
+    static __m512i broadcast(int32 v) {
+        return _mm512_set1_epi32(v);
+    }
+
+    static void copy_to(const __m512i& v, int32* p) {
+        _mm512_mask_storeu_epi32(p, lo(), v);
+    }
+
+    static void copy_to_masked(const __m512i& v, int32* p, const __mmask8& mask) {
+        _mm512_mask_storeu_epi32(p, mask, v);
+    }
+
+    static __m512i copy_from(const int32* p) {
+        return _mm512_maskz_loadu_epi32(lo(), p);
+    }
+
+    static __m512i copy_from_masked(const int32* p, const __mmask8& mask) {
+        return _mm512_maskz_loadu_epi32(mask, p);
+    }
+
+    static __m512i copy_from_masked(const __m512i& v, const int32* p, const __mmask8& mask) {
+        return _mm512_mask_loadu_epi32(v, mask, p);
+    }
+
+    static __m512i negate(const __m512i& a) {
+        return sub(_mm512_setzero_epi32(), a);
+    }
+
+    static __m512i add(const __m512i& a, const __m512i& b) {
+        return _mm512_add_epi32(a, b);
+    }
+
+    static __m512i sub(const __m512i& a, const __m512i& b) {
+        return _mm512_sub_epi32(a, b);
+    }
+
+    static __m512i mul(const __m512i& a, const __m512i& b) {
+        // Can represent 32-bit exactly in double, and technically overflow is
+        // undefined behaviour, so we can do this in doubles.
+        constexpr int32 rtz = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+        auto da = _mm512_cvtepi32_pd(_mm512_castsi512_si256(a));
+        auto db = _mm512_cvtepi32_pd(_mm512_castsi512_si256(b));
+        auto fpmul = _mm512_mul_round_pd(da, db, rtz);
+        return _mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(fpmul, rtz));
+    }
+
+    static __m512i div(const __m512i& a, const __m512i& b) {
+        // Can represent 32-bit exactly in double, so do a fp division with fixed rounding.
+        constexpr int32 rtz = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+        auto da = _mm512_cvtepi32_pd(_mm512_castsi512_si256(a));
+        auto db = _mm512_cvtepi32_pd(_mm512_castsi512_si256(b));
+        auto fpdiv = _mm512_div_round_pd(da, db, rtz);
+        return _mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(fpdiv, rtz));
+    }
+
+    static __m512i fma(const __m512i& a, const __m512i& b, const __m512i& c) {
+        return add(mul(a, b), c);
+    }
+
+    static __mmask8 cmp_eq(const __m512i& a, const __m512i& b) {
+        return _mm512_cmpeq_epi32_mask(a, b);
+    }
+
+    static __mmask8 cmp_neq(const __m512i& a, const __m512i& b) {
+        return _mm512_cmpneq_epi32_mask(a, b);
+    }
+
+    static __mmask8 cmp_gt(const __m512i& a, const __m512i& b) {
+        return _mm512_cmpgt_epi32_mask(a, b);
+    }
+
+    static __mmask8 cmp_geq(const __m512i& a, const __m512i& b) {
+        return _mm512_cmpge_epi32_mask(a, b);
+    }
+
+    static __mmask8 cmp_lt(const __m512i& a, const __m512i& b) {
+        return _mm512_cmplt_epi32_mask(a, b);
+    }
+
+    static __mmask8 cmp_leq(const __m512i& a, const __m512i& b) {
+        return _mm512_cmple_epi32_mask(a, b);
+    }
+
+    static __m512i ifelse(const __mmask8& m, const __m512i& u, const __m512i& v) {
+        return _mm512_mask_blend_epi32(m, v, u);
+    }
+
+    static __m512i max(const __m512i& a, const __m512i& b) {
+        return _mm512_max_epi32(a, b);
+    }
+
+    static __m512i min(const __m512i& a, const __m512i& b) {
+        return _mm512_min_epi32(a, b);
+    }
+
+    static __m512i abs(const __m512i& a) {
+        return _mm512_abs_epi32(a);
+    }
+
+    // Generic 8-wide int solutions for gather and scatter.
+
+    template <typename Impl>
+    using is_int8_simd = std::integral_constant<bool, std::is_same<int, typename Impl::scalar_type>::value && Impl::width==8>;
+
+    template <typename ImplIndex,
+              typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static __m512i gather(ImplIndex, const int32* p, const typename ImplIndex::vector_type& index) {
+        int32 o[16];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m512i*>(o);
+        return _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), lo(), _mm512_loadu_si512(op), p, 4);
+    }
+
+    template <typename ImplIndex,
+              typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static __m512i gather(ImplIndex, __m512i a, const int32* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) {
+        int32 o[16];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m512i*>(o);
+        return _mm512_mask_i32gather_epi32(a, mask, _mm512_loadu_si512(op), p, 4);
+    }
+
+    template <typename ImplIndex,
+              typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static void scatter(ImplIndex, const __m512i& s, int32* p, const typename ImplIndex::vector_type& index) {
+        int32 o[16];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m512i*>(o);
+        _mm512_mask_i32scatter_epi32(p, lo(), _mm512_loadu_si512(op), s, 4);
+    }
+
+    template <typename ImplIndex,
+              typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static void scatter(ImplIndex, const __m512i& s, int32* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) {
+        int32 o[16];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m512i*>(o);
+        _mm512_mask_i32scatter_epi32(p, mask, _mm512_loadu_si512(op), s, 4);
+    }
+
+    // Specialized 8-wide gather and scatter for avx512_int8 implementation.
+
+    static __m512i gather(avx512_int8, const int32* p, const __m512i& index) {
+        return _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), lo(), index, p, 4);
+    }
+
+    static __m512i gather(avx512_int8, __m512i a, const int32* p, const __m512i& index, const __mmask8& mask) {
+        return _mm512_mask_i32gather_epi32(a, mask, index, p, 4);
+    }
+
+    static void scatter(avx512_int8, const __m512i& s, int32* p, const __m512i& index) {
+        _mm512_mask_i32scatter_epi32(p, lo(), index, s, 4);
+    }
+
+    static void scatter(avx512_int8, const __m512i& s, int32* p, const __m512i& index, const __mmask8& mask) {
+        _mm512_mask_i32scatter_epi32(p, mask, index, s, 4);
+    }
+};
+
+struct avx512_double8: implbase<avx512_double8> {
+    // Use default implementations for:
+    //     element, set_element.
+
+    // CMPPD predicates:
+    static constexpr int cmp_eq_oq =    0;
+    static constexpr int cmp_unord_q =  3;
+    static constexpr int cmp_neq_uq =   4;
+    static constexpr int cmp_true_uq = 15;
+    static constexpr int cmp_lt_oq =   17;
+    static constexpr int cmp_le_oq =   18;
+    static constexpr int cmp_nge_uq =  25;
+    static constexpr int cmp_ge_oq =   29;
+    static constexpr int cmp_gt_oq =   30;
+
+    static __m512d broadcast(double v) {
+        return _mm512_set1_pd(v);
+    }
+
+    static void copy_to(const __m512d& v, double* p) {
+        _mm512_storeu_pd(p, v);
+    }
+
+    static void copy_to_masked(const __m512d& v, double* p, const __mmask8& mask) {
+        _mm512_mask_storeu_pd(p, mask, v);
+    }
+
+    static __m512d copy_from(const double* p) {
+        return _mm512_loadu_pd(p);
+    }
+
+    static __m512d copy_from_masked(const double* p, const __mmask8& mask) {
+        return _mm512_maskz_loadu_pd(mask, p);
+    }
+
+    static __m512d copy_from_masked(const __m512d& v, const double* p, const __mmask8& mask) {
+        return _mm512_mask_loadu_pd(v, mask, p);
+    }
+
+    static __m512d negate(const __m512d& a) {
+        return _mm512_sub_pd(_mm512_setzero_pd(), a);
+    }
+
+    static __m512d add(const __m512d& a, const __m512d& b) {
+        return _mm512_add_pd(a, b);
+    }
+
+    static __m512d sub(const __m512d& a, const __m512d& b) {
+        return _mm512_sub_pd(a, b);
+    }
+
+    static __m512d mul(const __m512d& a, const __m512d& b) {
+        return _mm512_mul_pd(a, b);
+    }
+
+    static __m512d div(const __m512d& a, const __m512d& b) {
+        return _mm512_div_pd(a, b);
+    }
+
+    static __m512d fma(const __m512d& a, const __m512d& b, const __m512d& c) {
+        return _mm512_fmadd_pd(a, b, c);
+    }
+
+    static __mmask8 cmp_eq(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_eq_oq);
+    }
+
+    static __mmask8 cmp_neq(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_neq_uq);
+    }
+
+    static __mmask8 cmp_gt(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_gt_oq);
+    }
+
+    static __mmask8 cmp_geq(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_ge_oq);
+    }
+
+    static __mmask8 cmp_lt(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_lt_oq);
+    }
+
+    static __mmask8 cmp_leq(const __m512d& a, const __m512d& b) {
+        return _mm512_cmp_pd_mask(a, b, cmp_le_oq);
+    }
+
+    static __m512d ifelse(const __mmask8& m, const __m512d& u, const __m512d& v) {
+        return _mm512_mask_blend_pd(m, v, u);
+    }
+
+    static __m512d max(const __m512d& a, const __m512d& b) {
+        return _mm512_max_pd(a, b);
+    }
+
+    static __m512d min(const __m512d& a, const __m512d& b) {
+        return _mm512_min_pd(a, b);
+    }
+
+    static __m512d abs(const __m512d& x) {
+        // (NB: _mm512_abs_pd() wrapper buggy on gcc, so use AND directly.)
+        __m512i m = _mm512_set1_epi64(0x7fffffffffffffff);
+        return _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(x), m));
+    }
+
+    // Generic 8-wide int solutions for gather and scatter.
+
+    template <typename Impl>
+    using is_int8_simd = std::integral_constant<bool, std::is_same<int, typename Impl::scalar_type>::value && Impl::width==8>;
+
+    template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static __m512d gather(ImplIndex, const double* p, const typename ImplIndex::vector_type& index) {
+        int o[8];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m256i*>(o);
+        return _mm512_i32gather_pd(_mm256_loadu_si256(op), p, 8);
+    }
+
+    template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static __m512d gather(ImplIndex, __m512d a, const double* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) {
+        int o[8];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m256i*>(o);
+        return _mm512_mask_i32gather_pd(a, mask, _mm256_loadu_si256(op), p, 8);
+    }
+
+    template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static void scatter(ImplIndex, const __m512d& s, double* p, const typename ImplIndex::vector_type& index) {
+        int o[8];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m256i*>(o);
+        _mm512_i32scatter_pd(p, _mm256_loadu_si256(op), s, 8);
+    }
+
+    template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type>
+    static void scatter(ImplIndex, const __m512d& s, double* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) {
+        int o[8];
+        ImplIndex::copy_to(index, o);
+        auto op = reinterpret_cast<const __m256i*>(o);
+        _mm512_mask_i32scatter_pd(p, mask, _mm256_loadu_si256(op), s, 8);
+    }
+
+    // Specialized 8-wide gather and scatter for avx512_int8 implementation.
+
+    static __m512d gather(avx512_int8, const double* p, const __m512i& index) {
+        return _mm512_i32gather_pd(_mm512_castsi512_si256(index), p, 8);
+    }
+
+    static __m512d gather(avx512_int8, __m512d a, const double* p, const __m512i& index, const __mmask8& mask) {
+        return _mm512_mask_i32gather_pd(a, mask, _mm512_castsi512_si256(index), p, 8);
+    }
+
+    static void scatter(avx512_int8, const __m512d& s, double* p, const __m512i& index) {
+        _mm512_i32scatter_pd(p, _mm512_castsi512_si256(index), s, 8);
+    }
+
+    static void scatter(avx512_int8, const __m512d& s, double* p, const __m512i& index, const __mmask8& mask) {
+        _mm512_mask_i32scatter_pd(p, mask, _mm512_castsi512_si256(index), s, 8);
+    }
+
+    // Use SVML for exp and log if compiling with icpc, else use ratpoly
+    // approximations.
+
+#if defined(__INTEL_COMPILER)
+    static  __m512d exp(const __m512d& x) {
+        return _mm512_exp_pd(x);
+    }
+
+    static  __m512d expm1(const __m512d& x) {
+        return _mm512_expm1_pd(x);
+    }
+
+    static __m512d log(const __m512d& x) {
+        return _mm512_log_pd(x);
+    }
+
+    static __m512d pow(const __m512d& x, const __m512d& y) {
+        return _mm512_pow_pd(x, y);
+    }
+#else
+
+    // Refer to avx/avx2 code for details of the exponential and log
+    // implementations.
+
+    static  __m512d exp(const __m512d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(exp_minarg));
+
+        // Compute n and g.
+
+        auto n = _mm512_floor_pd(add(mul(broadcast(ln2inv), x), broadcast(0.5)));
+
+        auto g = fma(n, broadcast(-ln2C1), x);
+        g = fma(n, broadcast(-ln2C2), g);
+
+        auto gg = mul(g, g);
+
+        // Compute the g*P(g^2) and Q(g^2).
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        // Compute R(g)/R(-g) = 1 + 2*g*P(g^2) / (Q(g^2)-g*P(g^2))
+
+        auto expg = fma(broadcast(2), div(odd, sub(even, odd)), broadcast(1));
+
+        // Scale by 2^n, propagating NaNs.
+
+        auto result = _mm512_scalef_pd(expg, n);
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(0),
+                   result));
+    }
+
+    static  __m512d expm1(const __m512d& x) {
+        auto is_large = cmp_gt(x, broadcast(exp_maxarg));
+        auto is_small = cmp_lt(x, broadcast(expm1_minarg));
+
+        auto half = broadcast(0.5);
+        auto one = broadcast(1.);
+
+        auto nnz = cmp_gt(abs(x), half);
+        auto n = _mm512_maskz_roundscale_round_pd(
+                    nnz,
+                    mul(broadcast(ln2inv), x),
+                    0,
+                    _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
+
+        auto g = fma(n, broadcast(-ln2C1), x);
+        g = fma(n, broadcast(-ln2C2), g);
+
+        auto gg = mul(g, g);
+
+        auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
+        auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
+
+        // Compute R(g)/R(-g) -1 = 2*g*P(g^2) / (Q(g^2)-g*P(g^2))
+
+        auto expgm1 = mul(broadcast(2), div(odd, sub(even, odd)));
+
+        // For small x (n zero), bypass scaling step to avoid underflow.
+        // Otherwise, compute result 2^n * expgm1 + (2^n-1) by:
+        //     result = 2 * ( 2^(n-1)*expgm1 + (2^(n-1)-0.5) )
+        // to avoid overflow when n=1024.
+
+        auto nm1 = sub(n, one);
+
+        auto result =
+            _mm512_scalef_pd(
+                add(sub(_mm512_scalef_pd(one, nm1), half),
+                    _mm512_scalef_pd(expgm1, nm1)),
+                one);
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-1),
+            ifelse(nnz, result, expgm1)));
+    }
+
+    static __m512d log(const __m512d& x) {
+        // Masks for exceptional cases.
+
+        auto is_large = cmp_geq(x, broadcast(HUGE_VAL));
+        auto is_small = cmp_lt(x, broadcast(log_minarg));
+        is_small = avx512_mask8::logical_and(is_small, cmp_geq(x, broadcast(0)));
+
+        __m512d g = _mm512_getexp_pd(x);
+        __m512d u = _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_nan);
+
+        __m512d one = broadcast(1.);
+        __m512d half = broadcast(0.5);
+        auto gtsqrt2 = cmp_geq(u, broadcast(sqrt2));
+        g = ifelse(gtsqrt2, add(g, one), g);
+        u = ifelse(gtsqrt2, mul(u, half), u);
+
+        auto z = sub(u, one);
+        auto pz = horner(z, P0log, P1log, P2log, P3log, P4log, P5log);
+        auto qz = horner1(z, Q0log, Q1log, Q2log, Q3log, Q4log);
+
+        auto z2 = mul(z, z);
+        auto z3 = mul(z2, z);
+
+        auto r = div(mul(z3, pz), qz);
+        r = fma(g,  broadcast(ln2C4), r);
+        r = fms(z2, half, r);
+        r = sub(z, r);
+        r = fma(g,  broadcast(ln2C3), r);
+
+        // r is already NaN if x is NaN or negative; otherwise
+        // return +inf if x is +inf, or -inf if zero or (positive) denormal.
+
+        return
+            ifelse(is_large, broadcast(HUGE_VAL),
+            ifelse(is_small, broadcast(-HUGE_VAL),
+                r));
+    }
+#endif
+
+protected:
+    static inline __m512d horner1(__m512d x, double a0) {
+        return add(x, broadcast(a0));
+    }
+
+    static inline __m512d horner(__m512d x, double a0) {
+        return broadcast(a0);
+    }
+
+    template <typename... T>
+    static __m512d horner(__m512d x, double a0, T... tail) {
+        return fma(x, horner(x, tail...), broadcast(a0));
+    }
+
+    template <typename... T>
+    static __m512d horner1(__m512d x, double a0, T... tail) {
+        return fma(x, horner1(x, tail...), broadcast(a0));
+    }
+
+    static __m512d fms(const __m512d& a, const __m512d& b, const __m512d& c) {
+        return _mm512_fmsub_pd(a, b, c);
+    }
+};
+
+} // namespace simd_detail
+
+namespace simd_abi {
+    template <typename T, unsigned N> struct avx512;
+    template <> struct avx512<double, 8> { using type = simd_detail::avx512_double8; };
+    template <> struct avx512<int, 8> { using type = simd_detail::avx512_int8; };
+} // namespace simd_abi
+
+} // namespace simd
+} // namespace arb
+
+#endif // def __AVX512F__
diff --git a/src/simd/generic.hpp b/src/simd/generic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8ca5524b8ce5511d815a4483facbcf4c4d53cda
--- /dev/null
+++ b/src/simd/generic.hpp
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstring>
+
+#include <simd/implbase.hpp>
+
+namespace arb {
+namespace simd {
+namespace simd_detail {
+
+template <typename T, unsigned N>
+struct generic;
+
+template <typename T, unsigned N>
+struct simd_traits<generic<T, N>> {
+    static constexpr unsigned width = N;
+    using scalar_type = T;
+    using vector_type = std::array<T, N>;
+    using mask_impl = generic<bool, N>;
+};
+
+template <typename T, unsigned N>
+struct generic: implbase<generic<T, N>> {
+    using array = std::array<T, N>;
+
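+    // Only the copy and mask primitives required by implbase are provided
+    // here; all arithmetic, comparison and maths operations fall back to the
+    // lane-wise defaults in implbase<generic<T, N>>.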
+    static array copy_from(const T* p) {
+        array result;
+        std::memcpy(&result, p, sizeof(result));
+        return result;
+    }
+
+    static void copy_to(const array& v, T* p) {
+        std::memcpy(p, &v, sizeof(v));
+    }
+
+    static void mask_copy_to(const array& v, bool* w) {
+        std::copy(v.begin(), v.end(), w);
+    }
+
+    static array mask_copy_from(const bool* y) {
+        array v;
+        std::copy(y, y+N, v.data());
+        return v;
+    }
+
+    static bool mask_element(const array& v, int i) {
+        return static_cast<bool>(v[i]);
+    }
+
+    static void mask_set_element(array& v, int i, bool x) {
+        v[i] = x;
+    }
+};
+
+} // namespace simd_detail
+
+namespace simd_abi {
+
+    template <typename T, unsigned N> struct generic {
+        using type = simd_detail::generic<T, N>;
+    };
+
+} // namespace simd_abi
+
+} // namespace simd
+} // namespace arb
diff --git a/src/simd/implbase.hpp b/src/simd/implbase.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f5b968f95fe834c67a2a8b87e1f47fb9cf041bc4
--- /dev/null
+++ b/src/simd/implbase.hpp
@@ -0,0 +1,478 @@
+#pragma once
+
+// Base class (via CRTP) for concrete implementation
+// classes, with default implementations based on
+// copy_to/copy_from.
+//
+// Also provides simd_detail::simd_traits type map.
+//
+// Maths functions are implemented in terms of
+// arithmetic primitives or lane-wise invocation of
+// std::math functions; specialized implementations
+// should be provided by the concrete implementation
+// where it is more efficient:
+//
+// Function | Default implementation by
+// ----------------------------------
+// min      | negate, cmp_gt, ifelse
+// max      | negate, cmp_gt, ifelse
+// abs      | negate, max
+// sin      | lane-wise std::sin
+// cos      | lane-wise std::cos
+// exp      | lane-wise std::exp
+// log      | lane-wise std::log
+// pow      | lane-wise std::pow
+// expm1    | lane-wise std::expm1
+// exprelr  | expm1, div, add, cmp_eq, ifelse
+//
+// 'exprelr' is the function x ↦ x/(exp(x)-1).
+
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <iterator>
+
+// Derived class I must at minimum provide:
+//
+// * specialization of simd_traits.
+//
+// * implementations (static) for copy_to, copy_from:
+//
+//     void I::copy_to(const vector_type&, scalar_type*)
+//     vector_type I::copy_from(const scalar_type*)
+//
+//     void I::mask_copy_to(const vector_type&, bool*)
+//     vector_type I::mask_copy_from(const bool*)
+//
+// * implementations (static) for mask element get/set:
+//
+//     bool I::mask_element(const vector_type& v, int i);
+//     void I::mask_set_element(vector_type& v, int i, bool x);
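+//
+// Illustrative sketch only (generic.hpp contains the real minimal example):
+// a hypothetical 4-wide double class, dbl4, that stores its lanes in a
+// std::array and supplies just the copy primitives (its mask operations are
+// delegated to the generic<bool, 4> mask implementation), inheriting every
+// arithmetic, comparison and maths operation from implbase:
+//
+//     struct dbl4;
+//     template <> struct simd_traits<dbl4> {
+//         static constexpr unsigned width = 4;
+//         using scalar_type = double;
+//         using vector_type = std::array<double, 4>;
+//         using mask_impl = generic<bool, 4>;
+//     };
+//     struct dbl4: implbase<dbl4> {
+//         static vector_type copy_from(const double* p) {
+//             vector_type v; std::memcpy(&v, p, sizeof v); return v;
+//         }
+//         static void copy_to(const vector_type& v, double* p) {
+//             std::memcpy(p, &v, sizeof v);
+//         }
+//     };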
+
+namespace arb {
+namespace simd {
+namespace simd_detail {
+
+// The simd_traits class provides the mapping between a concrete SIMD
+// implementation I and its associated classes. This must be specialized
+// for each concrete implementation.
+
+template <typename I>
+struct simd_traits {
+    static constexpr unsigned width = 0;
+    using scalar_type = void;
+    using vector_type = void;
+    using mask_impl = void;
+};
+
+template <typename I>
+struct implbase {
+    constexpr static unsigned width = simd_traits<I>::width;
+    using scalar_type = typename simd_traits<I>::scalar_type;
+    using vector_type = typename simd_traits<I>::vector_type;
+
+    using mask_impl = typename simd_traits<I>::mask_impl;
+    using mask_type = typename simd_traits<mask_impl>::vector_type;
+
+    using store = scalar_type[width];
+    using mask_store = bool[width];
+
+    static vector_type broadcast(scalar_type x) {
+        store a;
+        std::fill(std::begin(a), std::end(a), x);
+        return I::copy_from(a);
+    }
+
+    static scalar_type element(const vector_type& v, int i) {
+        store a;
+        I::copy_to(v, a);
+        return a[i];
+    }
+
+    static void set_element(vector_type& v, int i, scalar_type x) {
+        store a;
+        I::copy_to(v, a);
+        a[i] = x;
+        v = I::copy_from(a);
+    }
+
+    static void copy_to_masked(const vector_type& v, scalar_type* p, const mask_type& mask) {
+        store a;
+        I::copy_to(v, a);
+
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+        for (unsigned i = 0; i<width; ++i) {
+            if (m[i]) p[i] = a[i];
+        }
+    }
+
+    static vector_type copy_from_masked(const scalar_type* p, const mask_type& mask) {
+        store a;
+
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+        for (unsigned i = 0; i<width; ++i) {
+            if (m[i]) a[i] = p[i];
+        }
+        return I::copy_from(a);
+    }
+
+    static vector_type copy_from_masked(const vector_type& v, const scalar_type* p, const mask_type& mask) {
+        store a;
+        I::copy_to(v, a);
+
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+        for (unsigned i = 0; i<width; ++i) {
+            if (m[i]) a[i] = p[i];
+        }
+        return I::copy_from(a);
+    }
+
+    static vector_type negate(const vector_type& u) {
+        store a, r;
+        I::copy_to(u, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = -a[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type add(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]+b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type mul(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]*b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type sub(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]-b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type div(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]/b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type fma(const vector_type& u, const vector_type& v, const vector_type& w) {
+        store a, b, c, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+        I::copy_to(w, c);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::fma(a[i], b[i], c[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static mask_type cmp_eq(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]==b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static mask_type cmp_neq(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]!=b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static mask_type cmp_gt(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]>b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static mask_type cmp_geq(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]>=b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static mask_type cmp_lt(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]<b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static mask_type cmp_leq(const vector_type& u, const vector_type& v) {
+        store a, b;
+        mask_store r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i]<=b[i];
+        }
+        return mask_impl::mask_copy_from(r);
+    }
+
+    static vector_type logical_not(const vector_type& u) {
+        store a, r;
+        I::copy_to(u, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = !a[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type logical_and(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i] && b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type logical_or(const vector_type& u, const vector_type& v) {
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = a[i] || b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type ifelse(const mask_type& mask, const vector_type& u, const vector_type& v) {
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+
+        store a, b, r;
+        I::copy_to(u, a);
+        I::copy_to(v, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = m[i]? a[i]: b[i];
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type mask_broadcast(bool v) {
+        mask_store m;
+        std::fill(std::begin(m), std::end(m), v);
+        return I::mask_copy_from(m);
+    }
+
+    static vector_type mask_unpack(unsigned long long k) {
+        mask_store m;
+        for (unsigned i = 0; i<width; ++i) {
+            m[i] = k&(1ull<<i);
+        }
+        return I::mask_copy_from(m);
+    }
+
+    template <typename ImplIndex>
+    static vector_type gather(ImplIndex, const scalar_type* p, const typename ImplIndex::vector_type& index) {
+        typename ImplIndex::scalar_type o[width];
+        ImplIndex::copy_to(index, o);
+
+        store a;
+        for (unsigned i = 0; i<width; ++i) {
+            a[i] = p[o[i]];
+        }
+        return I::copy_from(a);
+    }
+
+    template <typename ImplIndex>
+    static vector_type gather(ImplIndex, const vector_type& s, const scalar_type* p, const typename ImplIndex::vector_type& index, const mask_type& mask) {
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+
+        typename ImplIndex::scalar_type o[width];
+        ImplIndex::copy_to(index, o);
+
+        store a;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            if (m[i]) { a[i] = p[o[i]]; }
+        }
+        return I::copy_from(a);
+    };
+
+    template <typename ImplIndex>
+    static void scatter(ImplIndex, const vector_type& s, scalar_type* p, const typename ImplIndex::vector_type& index) {
+        typename ImplIndex::scalar_type o[width];
+        ImplIndex::copy_to(index, o);
+
+        store a;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            p[o[i]] = a[i];
+        }
+    }
+
+    template <typename ImplIndex>
+    static void scatter(ImplIndex, const vector_type& s, scalar_type* p, const typename ImplIndex::vector_type& index, const mask_type& mask) {
+        mask_store m;
+        mask_impl::mask_copy_to(mask, m);
+
+        typename ImplIndex::scalar_type o[width];
+        ImplIndex::copy_to(index, o);
+
+        store a;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            if (m[i]) { p[o[i]] = a[i]; }
+        }
+    }
+
+    // Maths
+
+    static vector_type abs(const vector_type& u) {
+        store a;
+        I::copy_to(u, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            a[i] = std::abs(a[i]);
+        }
+        return I::copy_from(a);
+    }
+
+    static vector_type min(const vector_type& s, const vector_type& t) {
+        return ifelse(cmp_gt(t, s), s, t);
+    }
+
+    static vector_type max(const vector_type& s, const vector_type& t) {
+        return ifelse(cmp_gt(t, s), t, s);
+    }
+
+    static vector_type sin(const vector_type& s) {
+        store a, r;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::sin(a[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type cos(const vector_type& s) {
+        store a, r;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::cos(a[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type exp(const vector_type& s) {
+        store a, r;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::exp(a[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type expm1(const vector_type& s) {
+        store a, r;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::expm1(a[i]);
+        }
+        return I::copy_from(r);
+    }
+
+    static vector_type log(const vector_type& s) {
+        store a, r;
+        I::copy_to(s, a);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::log(a[i]);
+        }
+        return I::copy_from(r);
+    }
+
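+    // exprelr(s) = s/(exp(s)-1); when s is so small that 1+s == 1 in the
+    // working precision, the limit value 1 is returned to avoid a 0/0 division.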
+    static vector_type exprelr(const vector_type& s) {
+        vector_type ones = I::broadcast(1);
+        return ifelse(cmp_eq(ones, add(ones, s)), ones, div(s, expm1(s)));
+    }
+
+    static vector_type pow(const vector_type& s, const vector_type &t) {
+        store a, b, r;
+        I::copy_to(s, a);
+        I::copy_to(t, b);
+
+        for (unsigned i = 0; i<width; ++i) {
+            r[i] = std::pow(a[i], b[i]);
+        }
+        return I::copy_from(r);
+    }
+};
+
+} // namespace simd_detail
+} // namespace simd
+} // namespace arb
diff --git a/src/simd/native.hpp b/src/simd/native.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d71bccec0085940ae279490eccbd4d14c20a32c2
--- /dev/null
+++ b/src/simd/native.hpp
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <type_traits>
+
+// Native SIMD representations based on architecture.
+
+// Predefined macros from the compiler are used to guard
+// architecture-specific code here and in the source code
+// for the concrete implementation classes.
+//
+// For x86 vector extensions, the same preprocessor defines
+// are used across gcc, clang and icpc: __AVX__, __AVX2__,
+// __FMA__, __AVX512F__.
+//
+// Note that the FMA extensions for x86 are strictly speaking
+// independent of AVX2, and instructing gcc (for example)
+// to generate AVX2 code with '-mavx2' will not enable FMA
+// instructions unless '-mfma' is also given. It is generally
+// safer to explicitly request the target using
+// '-march', '-mcpu' or '-x', depending on the compiler and
+// architecture.
+
+namespace arb {
+namespace simd {
+namespace simd_abi {
+
+template <typename Value, unsigned N>
+struct native {
+    using type = void;
+};
+
+} // namespace simd_abi
+} // namespace simd
+} // namespace arb
+
+#define ARB_DEF_NATIVE_SIMD_(T, N, A)\
+namespace arb {\
+namespace simd {\
+namespace simd_abi {\
+template <> struct native<T, N> {\
+    using type = typename A<T, N>::type;\
+};\
+}\
+}\
+}
+
+
+#if defined(__AVX2__) && defined(__FMA__)
+
+#include <simd/avx.hpp>
+ARB_DEF_NATIVE_SIMD_(int, 4, avx2)
+ARB_DEF_NATIVE_SIMD_(double, 4, avx2)
+
+#elif defined(__AVX__)
+
+#include <simd/avx.hpp>
+ARB_DEF_NATIVE_SIMD_(int, 4, avx)
+ARB_DEF_NATIVE_SIMD_(double, 4, avx)
+
+#endif
+
+#if defined(__AVX512F__)
+
+#include <simd/avx512.hpp>
+ARB_DEF_NATIVE_SIMD_(int, 8, avx512)
+ARB_DEF_NATIVE_SIMD_(double, 8, avx512)
+
+#endif
+
+
+namespace arb {
+namespace simd {
+namespace simd_abi {
+
+// Define native widths based on largest native vector implementation
+// of corresponding type. Presume power of 2, no larger than largest
+// possible over implemented architectures.
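+//
+// For example, with AVX or AVX2 enabled native_width<double>::value is 4,
+// with AVX512F it is 8, and it is 1 when no native implementation is defined.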
+
+template <typename Value, int k = 64>
+struct native_width;
+
+template <typename Value, int k>
+struct native_width {
+    static constexpr int value =
+        std::is_same<void, typename native<Value, k>::type>::value?
+        native_width<Value, k/2>::value:
+        k;
+};
+
+template <typename Value>
+struct native_width<Value, 1> {
+    static constexpr int value = 1;
+};
+
+} // namespace simd_abi
+} // namespace simd
+} // namespace arb
diff --git a/src/simd/simd.hpp b/src/simd/simd.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7a1c65505c3c181e74d84fc0ea850b42a965070
--- /dev/null
+++ b/src/simd/simd.hpp
@@ -0,0 +1,521 @@
+#pragma once
+
+#include <type_traits>
+
+#include <simd/implbase.hpp>
+#include <simd/generic.hpp>
+#include <simd/native.hpp>
+
+namespace arb {
+namespace simd {
+
+namespace simd_detail {
+    template <typename Impl>
+    struct simd_impl;
+
+    template <typename Impl>
+    struct simd_mask_impl;
+}
+
+// Forward declarations for top-level maths functions.
+// (these require access to private simd_impl<Impl>::wrap method).
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> abs(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> sin(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> cos(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> exp(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> log(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> expm1(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> exprelr(const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> pow(const simd_detail::simd_impl<Impl>&, const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> min(const simd_detail::simd_impl<Impl>&, const simd_detail::simd_impl<Impl>&);
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> max(const simd_detail::simd_impl<Impl>&, const simd_detail::simd_impl<Impl>&);
+
+namespace simd_detail {
+    template <typename Impl>
+    struct simd_impl {
+        // Type aliases:
+        //
+        //     scalar_type           internal value type in one simd lane,
+        //     simd_mask             simd_mask_impl specialization representing comparison results,
+        //
+        //     vector_type           underlying representation,
+        //     mask_type             underlying representation for mask.
+
+        using scalar_type = typename simd_traits<Impl>::scalar_type;
+        using simd_mask   = simd_mask_impl<typename simd_traits<Impl>::mask_impl>;
+
+    protected:
+        using vector_type = typename simd_traits<Impl>::vector_type;
+        using mask_type   = typename simd_traits<typename simd_traits<Impl>::mask_impl>::vector_type;
+
+    public:
+        static constexpr unsigned width = simd_traits<Impl>::width;
+
+        template <typename Other>
+        friend class simd_impl;
+
+        simd_impl() = default;
+
+        // Construct by filling with scalar value.
+        simd_impl(const scalar_type& x) {
+            value_ = Impl::broadcast(x);
+        }
+
+        // Construct from scalar values in memory.
+        explicit simd_impl(const scalar_type* p) {
+            value_ = Impl::copy_from(p);
+        }
+
+        // Construct from const array ref.
+        explicit simd_impl(const scalar_type (&a)[width]) {
+            value_ = Impl::copy_from(a);
+        }
+
+        // Construct from scalar values in memory with mask.
+        explicit simd_impl(const scalar_type* p, const simd_mask& m) {
+            value_ = Impl::copy_from_masked(p, m.value_);
+        }
+
+        // Construct from const array ref with mask.
+        explicit simd_impl(const scalar_type (&a)[width], const simd_mask& m) {
+            value_ = Impl::copy_from_masked(a, m.value_);
+        }
+
+        // Copy constructor.
+        simd_impl(const simd_impl& other) {
+            std::memcpy(&value_, &other.value_, sizeof(vector_type));
+        }
+
+        // Scalar assignment fills the vector.
+        simd_impl& operator=(scalar_type x) {
+            value_ = Impl::broadcast(x);
+            return *this;
+        }
+
+        // Copy assignment.
+        simd_impl& operator=(const simd_impl& other) {
+            std::memcpy(&value_, &other.value_, sizeof(vector_type));
+            return *this;
+        }
+
+        // Read/write operations: copy_to, copy_from.
+
+        void copy_to(scalar_type* p) const {
+            Impl::copy_to(value_, p);
+        }
+
+        void copy_from(const scalar_type* p) {
+            value_ = Impl::copy_from(p);
+        }
+
+        // Arithmetic operations: +, -, *, /, fma.
+
+        simd_impl operator-() const {
+            return wrap(Impl::negate(value_));
+        }
+
+        friend simd_impl operator+(const simd_impl& a, simd_impl b) {
+            return simd_impl::wrap(Impl::add(a.value_, b.value_));
+        }
+
+        friend simd_impl operator-(const simd_impl& a, simd_impl b) {
+            return simd_impl::wrap(Impl::sub(a.value_, b.value_));
+        }
+
+        friend simd_impl operator*(const simd_impl& a, simd_impl b) {
+            return simd_impl::wrap(Impl::mul(a.value_, b.value_));
+        }
+
+        friend simd_impl operator/(const simd_impl& a, simd_impl b) {
+            return simd_impl::wrap(Impl::div(a.value_, b.value_));
+        }
+
+        friend simd_impl fma(const simd_impl& a, simd_impl b, simd_impl c) {
+            return simd_impl::wrap(Impl::fma(a.value_, b.value_, c.value_));
+        }
+
+        // Lane-wise relational operations.
+
+        friend simd_mask operator==(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_eq(a.value_, b.value_));
+        }
+
+        friend simd_mask operator!=(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_neq(a.value_, b.value_));
+        }
+
+        friend simd_mask operator<=(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_leq(a.value_, b.value_));
+        }
+
+        friend simd_mask operator<(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_lt(a.value_, b.value_));
+        }
+
+        friend simd_mask operator>=(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_geq(a.value_, b.value_));
+        }
+
+        friend simd_mask operator>(const simd_impl& a, const simd_impl& b) {
+            return simd_impl::mask(Impl::cmp_gt(a.value_, b.value_));
+        }
+
+        // Compound assignment operations: +=, -=, *=, /=.
+
+        simd_impl& operator+=(const simd_impl& x) {
+            value_ = Impl::add(value_, x.value_);
+            return *this;
+        }
+
+        simd_impl& operator-=(const simd_impl& x) {
+            value_ = Impl::sub(value_, x.value_);
+            return *this;
+        }
+
+        simd_impl& operator*=(const simd_impl& x) {
+            value_ = Impl::mul(value_, x.value_);
+            return *this;
+        }
+
+        simd_impl& operator/=(const simd_impl& x) {
+            value_ = Impl::div(value_, x.value_);
+            return *this;
+        }
+
+        // Gather and scatter.
+
+        template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type>
+        void gather(const scalar_type* p, const simd_impl<IndexImpl>& index) {
+            value_ = Impl::gather(IndexImpl{}, p, index.value_);
+        }
+
+        template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type>
+        void scatter(scalar_type* p, const simd_impl<IndexImpl>& index) {
+            Impl::scatter(IndexImpl{}, value_, p, index.value_);
+        }
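+
+        // Illustrative sketch (not part of the patch): with a value simd `v`
+        // and an index simd `idx` of the same width,
+        //
+        //     v.gather(p, idx);    // lane i of v becomes p[idx[i]]
+        //     v.scatter(p, idx);   // p[idx[i]] is overwritten with lane i of v
+        //
+        // The enable_if guard above rejects index types of a different width.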
+
+        // Array subscript operations.
+
+        struct reference {
+            reference() = delete;
+            reference(const reference&) = default;
+            reference& operator=(const reference&) = delete;
+
+            reference(vector_type& value, int i):
+                ptr_(&value), i(i) {}
+
+            reference& operator=(scalar_type v) && {
+                Impl::set_element(*ptr_, i, v);
+                return *this;
+            }
+
+            operator scalar_type() const {
+                return Impl::element(*ptr_, i);
+            }
+
+            vector_type* ptr_;
+            int i;
+        };
+
+        reference operator[](int i) {
+            return reference(value_, i);
+        }
+
+        scalar_type operator[](int i) const {
+            return Impl::element(value_, i);
+        }
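+
+        // Example (sketch): on a non-const simd, `v[1] = 2.5` writes lane 1
+        // through the `reference` proxy (Impl::set_element), while reading
+        // `x = v[1]` goes through the proxy's conversion operator, or directly
+        // via Impl::element on a const simd.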
+
+        // Masked assignment (via where expressions).
+
+        struct where_expression {
+            where_expression(const where_expression&) = default;
+            where_expression& operator=(const where_expression&) = delete;
+
+            where_expression(const simd_mask& m, simd_impl& s):
+                mask_(m), data_(s) {}
+
+            where_expression& operator=(scalar_type v) {
+                data_.value_ = Impl::ifelse(mask_.value_, simd_impl(v).value_, data_.value_);
+                return *this;
+            }
+
+            where_expression& operator=(const simd_impl& v) {
+                data_.value_ = Impl::ifelse(mask_.value_, v.value_, data_.value_);
+                return *this;
+            }
+
+            void copy_to(scalar_type* p) const {
+                Impl::copy_to_masked(data_.value_, p, mask_.value_);
+            }
+
+            void copy_from(const scalar_type* p) {
+                data_.value_ = Impl::copy_from_masked(data_.value_, p, mask_.value_);
+            }
+
+            template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type>
+            void gather(const scalar_type* p, const simd_impl<IndexImpl>& index) {
+                data_.value_ = Impl::gather(IndexImpl{}, data_.value_, p, index.value_, mask_.value_);
+            }
+
+            template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type>
+            void scatter(scalar_type* p, const simd_impl<IndexImpl>& index) {
+                Impl::scatter(IndexImpl{}, data_.value_, p, index.value_, mask_.value_);
+            }
+
+        private:
+            const simd_mask& mask_;
+            simd_impl& data_;
+        };
+
+        // Maths functions are implemented as top-level functions, but require
+        // access to `wrap`.
+
+        friend simd_impl abs<Impl>(const simd_impl&);
+        friend simd_impl sin<Impl>(const simd_impl&);
+        friend simd_impl cos<Impl>(const simd_impl&);
+        friend simd_impl exp<Impl>(const simd_impl&);
+        friend simd_impl log<Impl>(const simd_impl&);
+        friend simd_impl expm1<Impl>(const simd_impl&);
+        friend simd_impl exprelr<Impl>(const simd_impl&);
+        friend simd_impl min<Impl>(const simd_impl&, const simd_impl&);
+        friend simd_impl max<Impl>(const simd_impl&, const simd_impl&);
+        friend simd_impl pow<Impl>(const simd_impl&, const simd_impl&);
+
+    protected:
+        vector_type value_;
+        simd_impl(const vector_type& x) {
+            value_ = x;
+        }
+
+    private:
+        static simd_impl wrap(const vector_type& v) {
+            simd_impl s;
+            s.value_ = v;
+            return s;
+        }
+
+        static simd_mask mask(const mask_type& v) {
+            simd_mask m;
+            m.value_ = v;
+            return m;
+        }
+    };
+
+    template <typename Impl>
+    struct simd_mask_impl: simd_impl<Impl> {
+        using base = simd_impl<Impl>;
+        using typename base::vector_type;
+        using typename base::scalar_type;
+        using base::width;
+        using base::value_;
+
+        simd_mask_impl() = default;
+
+        // Construct by filling with scalar value.
+        simd_mask_impl(bool b) {
+            value_ = Impl::mask_broadcast(b);
+        }
+
+        // Scalar assignment fills the vector.
+        simd_mask_impl& operator=(bool b) {
+            value_ = Impl::mask_broadcast(b);
+            return *this;
+        }
+
+        // Construct from bool values in memory.
+        explicit simd_mask_impl(const bool* y) {
+            value_ = Impl::mask_copy_from(y);
+        }
+
+        // Construct from const array ref.
+        explicit simd_mask_impl(const bool (&a)[width]) {
+            value_ = Impl::mask_copy_from(&a[0]);
+        }
+
+        // Copy assignment.
+        simd_mask_impl& operator=(const simd_mask_impl& other) {
+            std::memcpy(&value_, &other.value_, sizeof(vector_type));
+            return *this;
+        }
+
+        // Read/write bool operations: copy_to, copy_from.
+
+        void copy_to(bool* w) const {
+            Impl::mask_copy_to(value_, w);
+        }
+
+        void copy_from(const bool* y) {
+            value_ = Impl::mask_copy_from(y);
+        }
+
+        // Array subscript operations.
+
+        struct reference {
+            reference() = delete;
+            reference(const reference&) = default;
+            reference& operator=(const reference&) = delete;
+
+            reference(vector_type& value, int i):
+                ptr_(&value), i(i) {}
+
+            reference& operator=(bool v) && {
+                Impl::mask_set_element(*ptr_, i, v);
+                return *this;
+            }
+
+            operator bool() const {
+                return Impl::mask_element(*ptr_, i);
+            }
+
+            vector_type* ptr_;
+            int i;
+        };
+
+        reference operator[](int i) {
+            return reference(value_, i);
+        }
+
+        bool operator[](int i) const {
+            return Impl::mask_element(value_, i);
+        }
+
+        // Logical operations.
+
+        simd_mask_impl operator!() const {
+            return simd_mask_impl::wrap(Impl::logical_not(value_));
+        }
+
+        friend simd_mask_impl operator&&(const simd_mask_impl& a, const simd_mask_impl& b) {
+            return simd_mask_impl::wrap(Impl::logical_and(a.value_, b.value_));
+        }
+
+        friend simd_mask_impl operator||(const simd_mask_impl& a, const simd_mask_impl& b) {
+            return simd_mask_impl::wrap(Impl::logical_or(a.value_, b.value_));
+        }
+
+        // Make mask from corresponding bits of integer.
+
+        static simd_mask_impl unpack(unsigned long long bits) {
+            return simd_mask_impl::wrap(Impl::mask_unpack(bits));
+        }
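+
+        // For example (sketch), with a 4-wide mask: unpack(0b0101) yields
+        // lanes {true, false, true, false}; bit j of the argument sets lane j.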
+
+    private:
+        simd_mask_impl(const vector_type& v): base(v) {}
+
+        template <class> friend class simd_impl;
+
+        static simd_mask_impl wrap(const vector_type& v) {
+            simd_mask_impl m;
+            m.value_ = v;
+            return m;
+        }
+    };
+} // namespace simd_detail
+
+namespace simd_abi {
+    // Note: the `simd_abi::native` class template is defined in `simd/native.hpp`,
+    // and `simd_abi::generic` in `simd/generic.hpp`.
+
+    template <typename Value, unsigned N>
+    struct default_abi {
+        using type = typename std::conditional<
+            std::is_same<void, typename native<Value, N>::type>::value,
+            typename generic<Value, N>::type,
+            typename native<Value, N>::type>::type;
+    };
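+
+    // For example (sketch): if the native backend provides a 4-wide double
+    // type (e.g. an AVX2 build), `default_abi<double, 4>::type` is that native
+    // implementation; otherwise `native<double, 4>::type` is void and the
+    // alias falls back to `generic<double, 4>::type`.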
+}
+
+template <typename Value, unsigned N, template <class, unsigned> class Abi = simd_abi::default_abi>
+using simd = simd_detail::simd_impl<typename Abi<Value, N>::type>;
+
+template <typename Value, unsigned N>
+using simd_mask = typename simd<Value, N>::simd_mask;
+
+template <typename Simd>
+using where_expression = typename Simd::where_expression;
+
+template <typename Simd>
+where_expression<Simd> where(const typename Simd::simd_mask& m, Simd& v) {
+    return where_expression<Simd>(m, v);
+}
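+
+// Example usage (sketch), assuming `a` and `out` point to at least 4 doubles:
+//
+//     simd<double, 4> x(a);            // load 4 lanes from a
+//     where(x<0., x) = 0.;             // clamp negative lanes to zero
+//     where(x>0., x).copy_to(out);     // store only the strictly positive lanes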
+
+template <typename>
+struct is_simd: std::false_type {};
+
+template <typename Impl>
+struct is_simd<simd_detail::simd_impl<Impl>>: std::true_type {};
+
+// Top-level maths functions: forward to underlying Impl.
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> abs(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::abs(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> sin(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::sin(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> cos(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::cos(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> exp(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::exp(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> log(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::log(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> expm1(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::expm1(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> exprelr(const simd_detail::simd_impl<Impl>& s) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::exprelr(s.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> pow(const simd_detail::simd_impl<Impl>& s, const simd_detail::simd_impl<Impl>& t) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::pow(s.value_, t.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> min(const simd_detail::simd_impl<Impl>& s, const simd_detail::simd_impl<Impl>& t) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::min(s.value_, t.value_));
+}
+
+template <typename Impl>
+simd_detail::simd_impl<Impl> max(const simd_detail::simd_impl<Impl>& s, const simd_detail::simd_impl<Impl>& t) {
+    return simd_detail::simd_impl<Impl>::wrap(Impl::max(s.value_, t.value_));
+}
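+
+// Minimal sketch, assuming `v` and `tau` are floating-point simd values of
+// some common type S: the maths wrappers compose with the operators above.
+//
+//     S decay = exp(-v/tau);                      // lane-wise exponential
+//     S clamped = min(max(decay, S(0.)), S(1.));  // lane-wise clamp to [0, 1]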
+
+} // namespace simd
+} // namespace arb
diff --git a/src/util/debug.hpp b/src/util/debug.hpp
index 214e2269f0602c2cd437e78190b03b629f9c7f5f..dfc553924461f9d68c5aeaa951b667c26975909b 100644
--- a/src/util/debug.hpp
+++ b/src/util/debug.hpp
@@ -2,9 +2,14 @@
 
 #include <iostream>
 #include <sstream>
+#include <string>
 #include <mutex>
 #include <utility>
 
+extern "C" {
+#include <endian.h>
+}
+
 #include <threading/threading.hpp>
 #include "unwind.hpp"
 
@@ -84,16 +89,85 @@ namespace impl {
             return out;
         }
     };
+
+    enum class endian {
+        little = __ORDER_LITTLE_ENDIAN__,
+        big = __ORDER_BIG_ENDIAN__,
+        native = __BYTE_ORDER__
+    };
+
+    // Wrapper for emitting values on an ostream as a sequence of hex digits.
+    struct hexdump_inline_wrap {
+        const unsigned char* from;
+        std::size_t size;
+        unsigned width;
+
+        friend std::ostream& operator<<(std::ostream& out, const hexdump_inline_wrap& h) {
+            using std::ptrdiff_t;
+
+            constexpr bool little = endian::native==endian::little;
+            ptrdiff_t width = h.width;
+            const unsigned char* from = h.from;
+            const unsigned char* end = h.from+h.size;
+            std::string buf;
+
+            auto emit = [&buf](unsigned char c) {
+                const char* digit = "0123456789abcdef";
+                buf += digit[(c>>4)&0xf];
+                buf += digit[c&0xf];
+            };
+
+            constexpr unsigned bufsz = 512;
+            unsigned bufmargin = 4*width+1;
+
+            buf.reserve(bufsz);
+            while (end-from>width) {
+                if (buf.size()+bufmargin>=bufsz) {
+                    out << buf;
+                    buf.clear();
+                }
+                for (ptrdiff_t i = 0; i<width; ++i) {
+                    emit(little? from[width-i-1]: from[i]);
+                }
+                from += width;
+                buf += ' ';
+            }
+            // Emit any trailing partial group, reversing only the bytes present.
+            for (ptrdiff_t i = 0, tail = end-from; i<tail; ++i) {
+                emit(little? from[tail-i-1]: from[i]);
+            }
+
+            out << buf;
+            return out;
+        }
+    };
 }
 
 // Wrap a sequence or container of values so that they can be printed
 // to an `std::ostream` with the elements separated by the supplied 
 // separator.
+
 template <typename Seq, typename Separator>
 impl::sepval<Seq, Separator> sepval(const Seq& seq, Separator sep) {
     return impl::sepval<Seq, Separator>(seq, std::move(sep));
 }
 
+template <typename Seq>
+impl::sepval<Seq, const char*> csv(const Seq& seq) {
+    return sepval(seq, ", ");
+}
+
+// Dump something in hex (inline representation).
+
+template <typename T>
+impl::hexdump_inline_wrap hexdump(const T& obj, unsigned width = 4) {
+    return impl::hexdump_inline_wrap{reinterpret_cast<const unsigned char*>(&obj), sizeof obj, width};
+}
+
+template <typename T>
+impl::hexdump_inline_wrap hexdump_n(const T* ptr, std::size_t n, unsigned width = 4) {
+    return impl::hexdump_inline_wrap{reinterpret_cast<const unsigned char*>(ptr), n, width};
+}
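+
+// Example (sketch), assuming `buf` points to at least 16 bytes:
+//
+//     double x = 1.0;
+//     std::cout << util::hexdump(x) << "\n";     // bytes of x, 4 per group
+//     std::cout << util::hexdump_n(buf, 16, 8);  // 16 bytes at buf, 8 per group
+//
+// On little-endian hosts, each group is emitted most-significant byte first.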
+
 } // namespace util
 } // namespace arb
 
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index f2efc57058477160ec83cce1d79a9aa1c54c8a94..fe018f935b9692d5bd941765ae657dc5ab34e77c 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -70,6 +70,7 @@ set(TEST_SOURCES
     test_segment.cpp
     test_schedule.cpp
     test_rss_cell.cpp
+    test_simd.cpp
     test_span.cpp
     test_spikes.cpp
     test_spike_store.cpp
diff --git a/tests/unit/common.hpp b/tests/unit/common.hpp
index 70eace9a79c0500ceddf4658b0eae5f115c7ee39..ad6c68739d4711831b2c47ba14ee8832265a8275 100644
--- a/tests/unit/common.hpp
+++ b/tests/unit/common.hpp
@@ -40,7 +40,7 @@ struct null_terminated_t {
 
 constexpr null_terminated_t null_terminated;
 
-// wrap a value type, with copy operations disabled
+// Wrap a value type, with copy operations disabled.
 
 template <typename V>
 struct nocopy {
@@ -81,7 +81,7 @@ int nocopy<V>::move_ctor_count;
 template <typename V>
 int nocopy<V>::move_assign_count;
 
-// wrap a value type, with move operations disabled
+// Wrap a value type, with move operations disabled.
 
 template <typename V>
 struct nomove {
@@ -138,12 +138,13 @@ template <typename FPType, typename Seq1, typename Seq2>
     for (std::size_t j = 0; i1!=e1 && i2!=e2; ++i1, ++i2, ++j) {
         using FP = testing::internal::FloatingPoint<FPType>;
 
-        // cast to FPType to avoid warnings about lowering conversion
-        // if FPType has lower precision than Seq{12}::value_type
         auto v1 = *i1;
         auto v2 = *i2;
 
-        if (!FP{v1}.AlmostEquals(FP{v2})) {
+        // Cast to FPType to avoid warnings about lowering conversion
+        // if FPType has lower precision than Seq{12}::value_type.
+
+        if (!(std::isnan(v1) && std::isnan(v2)) && !FP{v1}.AlmostEquals(FP{v2})) {
             return ::testing::AssertionFailure() << "floating point numbers " << v1 << " and " << v2 << " differ at index " << j;
         }
 
@@ -155,6 +156,59 @@ template <typename FPType, typename Seq1, typename Seq2>
     return ::testing::AssertionSuccess();
 }
 
+template <typename V>
+inline bool generic_isnan(const V& x) { return false; }
+inline bool generic_isnan(float x) { return std::isnan(x); }
+inline bool generic_isnan(double x) { return std::isnan(x); }
+inline bool generic_isnan(long double x) { return std::isnan(x); }
+
+template <typename U, typename V>
+static bool equiv(const U& u, const V& v) {
+    return u==v || (generic_isnan(u) && generic_isnan(v));
+}
+
+template <typename Seq1, typename Seq2>
+::testing::AssertionResult seq_eq(Seq1&& seq1, Seq2&& seq2) {
+    using std::begin;
+    using std::end;
+
+    auto i1 = begin(seq1);
+    auto i2 = begin(seq2);
+
+    auto e1 = end(seq1);
+    auto e2 = end(seq2);
+
+    for (std::size_t j = 0; i1!=e1 && i2!=e2; ++i1, ++i2, ++j) {
+        auto v1 = *i1;
+        auto v2 = *i2;
+
+        if (!equiv(v1, v2)) {
+            return ::testing::AssertionFailure() << "values " << v1 << " and " << v2 << " differ at index " << j;
+        }
+    }
+
+    if (i1!=e1 || i2!=e2) {
+        return ::testing::AssertionFailure() << "sequences differ in length";
+    }
+    return ::testing::AssertionSuccess();
+}
+
+// Assert elements 0..n-1 inclusive of two indexed collections are exactly equal.
+template <typename Arr1, typename Arr2>
+::testing::AssertionResult indexed_eq_n(int n, Arr1&& a1, Arr2&& a2) {
+    for (int i = 0; i<n; ++i) {
+        auto v1 = a1[i];
+        auto v2 = a2[i];
+
+        if (!equiv(v1,v2)) {
+            return ::testing::AssertionFailure() << "values " << v1 << " and " << v2 << " differ at index " << i;
+        }
+    }
+
+    return ::testing::AssertionSuccess();
+}
+
+
 // Assert two floating point values are within a relative tolerance.
 inline ::testing::AssertionResult near_relative(double a, double b, double relerr) {
     double tol = relerr*std::max(std::abs(a), std::abs(b));
diff --git a/tests/unit/test_simd.cpp b/tests/unit/test_simd.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f22de961aafac7ac946e7529d0dbcf37e793b327
--- /dev/null
+++ b/tests/unit/test_simd.cpp
@@ -0,0 +1,1001 @@
+#include <cmath>
+#include <random>
+
+#include <simd/simd.hpp>
+#include <simd/avx.hpp>
+
+#include "common.hpp"
+
+using namespace arb::simd;
+
+namespace {
+    // Use different distributions in `fill_random`, based on the value type in question:
+    //
+    //     * floating point type => uniform_real_distribution, default interval [-1, 1).
+    //     * bool                => uniform_int_distribution, default interval [0, 1].
+    //     * other integral type => uniform_int_distribution, default interval [L, U]
+    //                              such that L^2+L and U^2+U fit within the integer range.
+
+    template <typename V, typename = typename std::enable_if<std::is_floating_point<V>::value>::type>
+    std::uniform_real_distribution<V> make_udist(V lb = -1., V ub = 1.) {
+        return std::uniform_real_distribution<V>(lb, ub);
+    }
+
+    template <typename V, typename = typename std::enable_if<std::is_integral<V>::value && !std::is_same<V, bool>::value>::type>
+    std::uniform_int_distribution<V> make_udist(
+            V lb = std::numeric_limits<V>::lowest() / (2 << std::numeric_limits<V>::digits/2),
+            V ub = std::numeric_limits<V>::max() >> (1+std::numeric_limits<V>::digits/2))
+    {
+        return std::uniform_int_distribution<V>(lb, ub);
+    }
+
+    template <typename V, typename = typename std::enable_if<std::is_same<V, bool>::value>::type>
+    std::uniform_int_distribution<> make_udist(V lb = 0, V ub = 1) {
+        return std::uniform_int_distribution<>(0, 1);
+    }
+
+    template <typename Seq, typename Rng>
+    void fill_random(Seq&& seq, Rng& rng) {
+        using V = typename std::decay<decltype(*std::begin(seq))>::type;
+
+        auto u = make_udist<V>();
+        for (auto& x: seq) { x = u(rng); }
+    }
+
+    template <typename Seq, typename Rng, typename B1, typename B2>
+    void fill_random(Seq&& seq, Rng& rng, B1 lb, B2 ub) {
+        using V = typename std::decay<decltype(*std::begin(seq))>::type;
+
+        auto u = make_udist<V>(lb, ub);
+        for (auto& x: seq) { x = u(rng); }
+    }
+
+    template <typename Simd, typename Rng, typename B1, typename B2, typename = typename std::enable_if<is_simd<Simd>::value>::type>
+    void fill_random(Simd& s, Rng& rng, B1 lb, B2 ub) {
+        using V = typename Simd::scalar_type;
+        constexpr unsigned N = Simd::width;
+
+        V v[N];
+        fill_random(v, rng, lb, ub);
+        s.copy_from(v);
+    }
+
+    template <typename Simd, typename Rng, typename = typename std::enable_if<is_simd<Simd>::value>::type>
+    void fill_random(Simd& s, Rng& rng) {
+        using V = typename Simd::scalar_type;
+        constexpr unsigned N = Simd::width;
+
+        V v[N];
+        fill_random(v, rng);
+        s.copy_from(v);
+    }
+
+    template <typename Simd>
+    ::testing::AssertionResult simd_eq(Simd a, Simd b) {
+        constexpr unsigned N = Simd::width;
+        using V = typename Simd::scalar_type;
+
+        V as[N], bs[N];
+        a.copy_to(as);
+        b.copy_to(bs);
+
+        return ::testing::seq_eq(as, bs);
+    }
+
+    constexpr unsigned nrounds = 20u;
+}
+
+template <typename S>
+struct simd_value: public ::testing::Test {};
+
+TYPED_TEST_CASE_P(simd_value);
+
+// Initialization and element access.
+TYPED_TEST_P(simd_value, elements) {
+    using simd = TypeParam;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1001);
+
+    // broadcast:
+    simd a(2);
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(2., a[i]);
+    }
+
+    // scalar assignment:
+    a = 3;
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(3, a[i]);
+    }
+
+    scalar bv[N], cv[N], dv[N];
+
+    fill_random(bv, rng);
+    fill_random(cv, rng);
+    fill_random(dv, rng);
+
+    // array initialization:
+    simd b(bv);
+    EXPECT_TRUE(testing::indexed_eq_n(N, bv, b));
+
+    // array rvalue initialization:
+    simd c(std::move(cv));
+    EXPECT_TRUE(testing::indexed_eq_n(N, cv, c));
+
+    // pointer initialization:
+    simd d(&dv[0]);
+    EXPECT_TRUE(testing::indexed_eq_n(N, dv, d));
+
+    // copy construction:
+    simd e(d);
+    EXPECT_TRUE(testing::indexed_eq_n(N, dv, e));
+
+    // copy assignment:
+    b = d;
+    EXPECT_TRUE(testing::indexed_eq_n(N, dv, b));
+}
+
+TYPED_TEST_P(simd_value, element_lvalue) {
+    using simd = TypeParam;
+    constexpr unsigned N = simd::width;
+
+    simd a(3);
+    ASSERT_GT(N, 1u);
+    a[N-2] = 5;
+
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(i==N-2? 5: 3, a[i]);
+    }
+}
+
+TYPED_TEST_P(simd_value, copy_to_from) {
+    using simd = TypeParam;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1010);
+
+    scalar buf1[N], buf2[N];
+    fill_random(buf1, rng);
+    fill_random(buf2, rng);
+
+    simd s;
+    s.copy_from(buf1);
+    s.copy_to(buf2);
+
+    EXPECT_TRUE(testing::indexed_eq_n(N, buf1, s));
+    EXPECT_TRUE(testing::seq_eq(buf1, buf2));
+}
+
+TYPED_TEST_P(simd_value, copy_to_from_masked) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1031);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar buf1[N], buf2[N], buf3[N];
+        fill_random(buf1, rng);
+        fill_random(buf2, rng);
+        fill_random(buf3, rng);
+
+        bool mbuf1[N], mbuf2[N];
+        fill_random(mbuf1, rng);
+        fill_random(mbuf2, rng);
+        mask m1(mbuf1);
+        mask m2(mbuf2);
+
+        scalar expected[N];
+        for (unsigned i = 0; i<N; ++i) {
+            expected[i] = mbuf1[i]? buf2[i]: buf1[i];
+        }
+
+        simd s(buf1);
+        where(m1, s).copy_from(buf2);
+        EXPECT_TRUE(testing::indexed_eq_n(N, expected, s));
+
+        for (unsigned i = 0; i<N; ++i) {
+            if (!mbuf2[i]) expected[i] = buf3[i];
+        }
+
+        where(m2, s).copy_to(buf3);
+        EXPECT_TRUE(testing::indexed_eq_n(N, expected, buf3));
+    }
+}
+
+TYPED_TEST_P(simd_value, construct_masked) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1031);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar buf[N];
+        fill_random(buf, rng);
+
+        bool mbuf[N];
+        fill_random(mbuf, rng);
+
+        mask m(mbuf);
+        simd s(buf, m);
+
+        for (unsigned i = 0; i<N; ++i) {
+            if (!mbuf[i]) continue;
+            EXPECT_EQ(buf[i], s[i]);
+        }
+    }
+}
+
+TYPED_TEST_P(simd_value, arithmetic) {
+    using simd = TypeParam;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1002);
+    scalar u[N], v[N], w[N], r[N];
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        fill_random(u, rng);
+        fill_random(v, rng);
+        fill_random(w, rng);
+
+        scalar neg_u[N];
+        for (unsigned i = 0; i<N; ++i) neg_u[i] = -u[i];
+
+        scalar u_plus_v[N];
+        for (unsigned i = 0; i<N; ++i) u_plus_v[i] = u[i]+v[i];
+
+        scalar u_minus_v[N];
+        for (unsigned i = 0; i<N; ++i) u_minus_v[i] = u[i]-v[i];
+
+        scalar u_times_v[N];
+        for (unsigned i = 0; i<N; ++i) u_times_v[i] = u[i]*v[i];
+
+        scalar u_divide_v[N];
+        for (unsigned i = 0; i<N; ++i) u_divide_v[i] = u[i]/v[i];
+
+        scalar fma_u_v_w[N];
+        for (unsigned i = 0; i<N; ++i) fma_u_v_w[i] = std::fma(u[i],v[i],w[i]);
+
+        simd us(u), vs(v), ws(w);
+
+        (-us).copy_to(r);
+        EXPECT_TRUE(testing::seq_eq(neg_u, r));
+
+        (us+vs).copy_to(r);
+        EXPECT_TRUE(testing::seq_eq(u_plus_v, r));
+
+        (us-vs).copy_to(r);
+        EXPECT_TRUE(testing::seq_eq(u_minus_v, r));
+
+        (us*vs).copy_to(r);
+        EXPECT_TRUE(testing::seq_eq(u_times_v, r));
+
+        (us/vs).copy_to(r);
+#if defined(__INTEL_COMPILER)
+        // icpc will by default use an approximation for scalar division,
+        // and a different one for vectorized division; the latter, in
+        // particular, is often off by 1 ulp for normal quotients.
+        //
+        // Unfortunately, we can't detect this difference in behaviour at
+        // compile time, so only require near-equality for floating point
+        // quotients under icpc.
+
+        if (std::is_floating_point<scalar>::value) {
+            EXPECT_TRUE(testing::seq_almost_eq<scalar>(u_divide_v, r));
+        }
+        else {
+            EXPECT_TRUE(testing::seq_eq(u_divide_v, r));
+        }
+#else
+        EXPECT_TRUE(testing::seq_eq(u_divide_v, r));
+#endif
+
+        (fma(us, vs, ws)).copy_to(r);
+        EXPECT_TRUE(testing::seq_eq(fma_u_v_w, r));
+    }
+}
+
+TYPED_TEST_P(simd_value, compound_assignment) {
+    using simd = TypeParam;
+
+    simd a, b, r;
+
+    std::minstd_rand rng(1003);
+    fill_random(a, rng);
+    fill_random(b, rng);
+
+    EXPECT_TRUE(simd_eq(a+b, (r = a)+=b));
+    EXPECT_TRUE(simd_eq(a-b, (r = a)-=b));
+    EXPECT_TRUE(simd_eq(a*b, (r = a)*=b));
+    EXPECT_TRUE(simd_eq(a/b, (r = a)/=b));
+}
+
+TYPED_TEST_P(simd_value, comparison) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1004);
+    std::uniform_int_distribution<> sgn(-1, 1); // -1, 0 or 1.
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        int cmp[N];
+        bool test[N];
+        simd a, b;
+
+        fill_random(b, rng);
+
+        for (unsigned j = 0; j<N; ++j) {
+            cmp[j] = sgn(rng);
+            a[j] = b[j]+17*cmp[j];
+        }
+
+        mask gt = a>b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]>0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, gt));
+
+        mask geq = a>=b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]>=0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, geq));
+
+        mask lt = a<b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]<0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, lt));
+
+        mask leq = a<=b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]<=0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, leq));
+
+        mask eq = a==b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]==0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, eq));
+
+        mask ne = a!=b;
+        for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]!=0; }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, ne));
+    }
+}
+
+TYPED_TEST_P(simd_value, mask_elements) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1005);
+
+    // bool broadcast:
+    mask a(true);
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(true, a[i]);
+    }
+
+    // scalar assignment:
+    mask d;
+    d = false;
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(false, d[i]);
+    }
+    d = true;
+    for (unsigned i = 0; i<N; ++i) {
+        EXPECT_EQ(true, d[i]);
+    }
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        bool bv[N], cv[N], dv[N];
+
+        fill_random(bv, rng);
+        fill_random(cv, rng);
+        fill_random(dv, rng);
+
+        // array initialization:
+        mask b(bv);
+        EXPECT_TRUE(testing::indexed_eq_n(N, bv, b));
+
+        // array rvalue initialization:
+        mask c(std::move(cv));
+        EXPECT_TRUE(testing::indexed_eq_n(N, cv, c));
+
+        // pointer initialization:
+        mask d(&dv[0]);
+        EXPECT_TRUE(testing::indexed_eq_n(N, dv, d));
+
+        // copy construction:
+        mask e(d);
+        EXPECT_TRUE(testing::indexed_eq_n(N, dv, e));
+
+        // copy assignment:
+        b = d;
+        EXPECT_TRUE(testing::indexed_eq_n(N, dv, b));
+    }
+}
+
+TYPED_TEST_P(simd_value, mask_element_lvalue) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1006);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        bool v[N];
+        fill_random(v, rng);
+
+        mask m(v);
+        for (unsigned j = 0; j<N; ++j) {
+            bool b = v[j];
+            m[j] = !b;
+            v[j] = !b;
+
+            EXPECT_EQ(m[j], !b);
+            EXPECT_TRUE(testing::indexed_eq_n(N, v, m));
+
+            m[j] = b;
+            v[j] = b;
+            EXPECT_EQ(m[j], b);
+            EXPECT_TRUE(testing::indexed_eq_n(N, v, m));
+        }
+    }
+}
+
+TYPED_TEST_P(simd_value, mask_copy_to_from) {
+    using simd = TypeParam;
+    using simd_mask = typename simd::simd_mask;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1012);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        bool buf1[N], buf2[N];
+        fill_random(buf1, rng);
+        fill_random(buf2, rng);
+
+        simd_mask m;
+        m.copy_from(buf1);
+        m.copy_to(buf2);
+
+        EXPECT_TRUE(testing::indexed_eq_n(N, buf1, m));
+        EXPECT_TRUE(testing::seq_eq(buf1, buf2));
+    }
+}
+
+TYPED_TEST_P(simd_value, mask_unpack) {
+    using simd = TypeParam;
+    using mask = typename simd::simd_mask;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1035);
+    std::uniform_int_distribution<unsigned long long> U(0, (1ull<<N)-1);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        unsigned long long packed = U(rng);
+        bool b[N];
+        mask::unpack(packed).copy_to(b);
+
+        for (unsigned j = 0; j<N; ++j) {
+            EXPECT_EQ((bool)(packed&(1ull<<j)), b[j]);
+        }
+    }
+}
+
+TYPED_TEST_P(simd_value, maths) {
+    // min, max, abs tests valid for both fp and int types.
+
+    using simd = TypeParam;
+    using scalar = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1013);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar a[N], b[N], test[N];
+        fill_random(a, rng);
+        fill_random(b, rng);
+
+        simd as(a), bs(b);
+
+        for (unsigned j = 0; j<N; ++j) { test[j] = std::abs(a[j]); }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, abs(as)));
+
+        for (unsigned j = 0; j<N; ++j) { test[j] = std::min(a[j], b[j]); }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, min(as, bs)));
+
+        for (unsigned j = 0; j<N; ++j) { test[j] = std::max(a[j], b[j]); }
+        EXPECT_TRUE(testing::indexed_eq_n(N, test, max(as, bs)));
+    }
+}
+
+REGISTER_TYPED_TEST_CASE_P(simd_value, elements, element_lvalue, copy_to_from, copy_to_from_masked, construct_masked, arithmetic, compound_assignment, comparison, mask_elements, mask_element_lvalue, mask_copy_to_from, mask_unpack, maths);
+
+typedef ::testing::Types<
+
+#ifdef __AVX__
+    simd<int, 4, simd_abi::avx>,
+    simd<double, 4, simd_abi::avx>,
+#endif
+#ifdef __AVX2__
+    simd<int, 4, simd_abi::avx2>,
+    simd<double, 4, simd_abi::avx2>,
+#endif
+#ifdef __AVX512F__
+    simd<int, 8, simd_abi::avx512>,
+    simd<double, 8, simd_abi::avx512>,
+#endif
+
+    simd<int, 4, simd_abi::generic>,
+    simd<double, 4, simd_abi::generic>,
+    simd<float, 16, simd_abi::generic>,
+
+    simd<int, 4, simd_abi::default_abi>,
+    simd<double, 4, simd_abi::default_abi>,
+    simd<int, 8, simd_abi::default_abi>,
+    simd<double, 8, simd_abi::default_abi>
+> simd_test_types;
+
+INSTANTIATE_TYPED_TEST_CASE_P(S, simd_value, simd_test_types);
+
+// FP-only SIMD value tests (maths).
+
+template <typename S>
+struct simd_fp_value: public ::testing::Test {};
+
+TYPED_TEST_CASE_P(simd_fp_value);
+
+TYPED_TEST_P(simd_fp_value, fp_maths) {
+    using simd = TypeParam;
+    using fp = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    std::minstd_rand rng(1014);
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        fp epsilon = std::numeric_limits<fp>::epsilon();
+        int min_exponent = std::numeric_limits<fp>::min_exponent;
+        int max_exponent = std::numeric_limits<fp>::max_exponent;
+
+        fp u[N], v[N], r[N];
+
+        // Trigonometric functions (sin, cos):
+        fill_random(u, rng);
+
+        fp sin_u[N];
+        for (unsigned i = 0; i<N; ++i) sin_u[i] = std::sin(u[i]);
+        sin(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(sin_u, r));
+
+        fp cos_u[N];
+        for (unsigned i = 0; i<N; ++i) cos_u[i] = std::cos(u[i]);
+        cos(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(cos_u, r));
+
+        // Logarithms (natural log):
+        fill_random(u, rng, -max_exponent*std::log(2.), max_exponent*std::log(2.));
+        for (auto& x: u) {
+            x = std::exp(x);
+            // SIMD log implementations may treat subnormal values as zero.
+            if (std::fpclassify(x)==FP_SUBNORMAL) x = 0;
+        }
+
+        fp log_u[N];
+        for (unsigned i = 0; i<N; ++i) log_u[i] = std::log(u[i]);
+        log(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(log_u, r));
+
+        // Exponential functions (exp, expm1, exprelr):
+
+        // Use max_exponent to get coverage over finite domain.
+        fp exp_min_arg = min_exponent*std::log(2.);
+        fp exp_max_arg = max_exponent*std::log(2.);
+        fill_random(u, rng, exp_min_arg, exp_max_arg);
+
+        fp exp_u[N];
+        for (unsigned i = 0; i<N; ++i) exp_u[i] = std::exp(u[i]);
+        exp(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(exp_u, r));
+
+        fp expm1_u[N];
+        for (unsigned i = 0; i<N; ++i) expm1_u[i] = std::expm1(u[i]);
+        expm1(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(expm1_u, r));
+
+        fp exprelr_u[N];
+        for (unsigned i = 0; i<N; ++i) {
+            exprelr_u[i] = u[i]+fp(1)==fp(1)? fp(1): u[i]/(std::exp(u[i])-fp(1));
+        }
+        exprelr(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(exprelr_u, r));
+
+        // Test expm1 and exprelr with small (magnitude < fp epsilon) values.
+        fill_random(u, rng, -epsilon, epsilon);
+        fp expm1_u_small[N];
+        for (unsigned i = 0; i<N; ++i) {
+            expm1_u_small[i] = std::expm1(u[i]);
+            EXPECT_NEAR(u[i], expm1_u_small[i], std::abs(4*u[i]*epsilon)); // just to confirm!
+        }
+        expm1(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(expm1_u_small, r));
+
+        fp exprelr_u_small[N];
+        for (unsigned i = 0; i<N; ++i) exprelr_u_small[i] = 1;
+        exprelr(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(exprelr_u_small, r));
+
+        // Test zero result for highly negative exponents.
+        fill_random(u, rng, 4*exp_min_arg, 2*exp_min_arg);
+        fp exp_u_very_negative[N];
+        for (unsigned i = 0; i<N; ++i) exp_u_very_negative[i] = std::exp(u[i]);
+        exp(simd(u)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(exp_u_very_negative, r));
+
+        // Power function:
+
+        // Non-negative base, arbitrary exponent.
+        fill_random(u, rng, 0., std::exp(1));
+        fill_random(v, rng, exp_min_arg, exp_max_arg);
+        fp pow_u_pos_v[N];
+        for (unsigned i = 0; i<N; ++i) pow_u_pos_v[i] = std::pow(u[i], v[i]);
+        pow(simd(u), simd(v)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(pow_u_pos_v, r));
+
+        // Arbitrary base, small magnitude integer exponent.
+        fill_random(u, rng);
+        int int_exponent[N];
+        fill_random(int_exponent, rng, -2, 2);
+        for (unsigned i = 0; i<N; ++i) v[i] = int_exponent[i];
+        fp pow_u_v_int[N];
+        for (unsigned i = 0; i<N; ++i) pow_u_v_int[i] = std::pow(u[i], v[i]);
+        pow(simd(u), simd(v)).copy_to(r);
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(pow_u_v_int, r));
+    }
+}
+
+// Check special function behaviour for specific values, including
+// qNaN, infinity, etc.
+
+TYPED_TEST_P(simd_fp_value, exp_special_values) {
+    using simd = TypeParam;
+    using fp = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    using limits = std::numeric_limits<fp>;
+
+    constexpr fp inf = limits::infinity();
+    constexpr fp eps = limits::epsilon();
+    constexpr fp largest = limits::max();
+    constexpr fp normal_least = limits::min();
+    constexpr fp denorm_least = limits::denorm_min();
+    constexpr fp qnan = limits::quiet_NaN();
+
+    const fp exp_minarg = std::log(normal_least);
+    const fp exp_maxarg = std::log(largest);
+
+    fp values[] = { inf, -inf, eps, -eps,
+                    eps/2, -eps/2, 0., -0.,
+                    1., -1., 2., -2.,
+                    normal_least, denorm_least, -normal_least, -denorm_least,
+                    exp_minarg, exp_maxarg, qnan, -qnan };
+
+    constexpr unsigned n_values = sizeof(values)/sizeof(fp);
+    constexpr unsigned n_packed = (n_values+N-1)/N;
+    fp data[n_packed][N];
+
+    std::fill((fp *)data, (fp *)data+N*n_packed, fp(0));
+    std::copy(std::begin(values), std::end(values), (fp *)data);
+
+    for (unsigned i = 0; i<n_packed; ++i) {
+        fp expected[N], result[N];
+        for (unsigned j = 0; j<N; ++j) {
+            expected[j] = std::exp(data[i][j]);
+        }
+
+        simd s(data[i]);
+        s = exp(s);
+        s.copy_to(result);
+
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(expected, result));
+    }
+}
+
+TYPED_TEST_P(simd_fp_value, expm1_special_values) {
+    using simd = TypeParam;
+    using fp = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    using limits = std::numeric_limits<fp>;
+
+    constexpr fp inf = limits::infinity();
+    constexpr fp eps = limits::epsilon();
+    constexpr fp largest = limits::max();
+    constexpr fp normal_least = limits::min();
+    constexpr fp denorm_least = limits::denorm_min();
+    constexpr fp qnan = limits::quiet_NaN();
+
+    const fp expm1_minarg = std::nextafter(std::log(eps/4), fp(0));
+    const fp expm1_maxarg = std::log(largest);
+
+    fp values[] = { inf, -inf, eps, -eps,
+                    eps/2, -eps/2, 0., -0.,
+                    1., -1., 2., -2.,
+                    normal_least, denorm_least, -normal_least, -denorm_least,
+                    expm1_minarg, expm1_maxarg, qnan, -qnan };
+
+    constexpr unsigned n_values = sizeof(values)/sizeof(fp);
+    constexpr unsigned n_packed = (n_values+N-1)/N;
+    fp data[n_packed][N];
+
+    std::fill((fp *)data, (fp *)data+N*n_packed, fp(0));
+    std::copy(std::begin(values), std::end(values), (fp *)data);
+
+    for (unsigned i = 0; i<n_packed; ++i) {
+        fp expected[N], result[N];
+        for (unsigned j = 0; j<N; ++j) {
+            expected[j] = std::expm1(data[i][j]);
+        }
+
+        simd s(data[i]);
+        s = expm1(s);
+        s.copy_to(result);
+
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(expected, result));
+    }
+}
+
+TYPED_TEST_P(simd_fp_value, log_special_values) {
+    using simd = TypeParam;
+    using fp = typename simd::scalar_type;
+    constexpr unsigned N = simd::width;
+
+    using limits = std::numeric_limits<fp>;
+
+    // NOTE: simd log implementations may treat subnormal
+    // numbers as zero, so omit the denorm_least tests...
+
+    constexpr fp inf = limits::infinity();
+    constexpr fp eps = limits::epsilon();
+    constexpr fp largest = limits::max();
+    constexpr fp normal_least = limits::min();
+    //constexpr fp denorm_least = limits::denorm_min();
+    constexpr fp qnan = limits::quiet_NaN();
+
+    fp values[] = { inf, -inf, eps, -eps,
+                    eps/2, -eps/2, 0., -0.,
+                    1., -1., 2., -2.,
+                    //normal_least, denorm_least, -normal_least, -denorm_least,
+                    normal_least, -normal_least,
+                    qnan, -qnan, largest };
+
+    constexpr unsigned n_values = sizeof(values)/sizeof(fp);
+    constexpr unsigned n_packed = (n_values+N-1)/N;
+    fp data[n_packed][N];
+
+    std::fill((fp *)data, (fp *)data+N*n_packed, fp(0));
+    std::copy(std::begin(values), std::end(values), (fp *)data);
+
+    for (unsigned i = 0; i<n_packed; ++i) {
+        fp expected[N], result[N];
+        for (unsigned j = 0; j<N; ++j) {
+            expected[j] = std::log(data[i][j]);
+        }
+
+        simd s(data[i]);
+        s = log(s);
+        s.copy_to(result);
+
+        EXPECT_TRUE(testing::seq_almost_eq<fp>(expected, result));
+    }
+}
+
+REGISTER_TYPED_TEST_CASE_P(simd_fp_value, fp_maths, exp_special_values, expm1_special_values, log_special_values);
+
+typedef ::testing::Types<
+
+#ifdef __AVX__
+    simd<double, 4, simd_abi::avx>,
+#endif
+#ifdef __AVX2__
+    simd<double, 4, simd_abi::avx2>,
+#endif
+#ifdef __AVX512F__
+    simd<double, 8, simd_abi::avx512>,
+#endif
+
+    simd<float, 2, simd_abi::generic>,
+    simd<double, 4, simd_abi::generic>,
+    simd<float, 8, simd_abi::generic>,
+
+    simd<double, 4, simd_abi::default_abi>,
+    simd<double, 8, simd_abi::default_abi>
+> simd_fp_test_types;
+
+INSTANTIATE_TYPED_TEST_CASE_P(S, simd_fp_value, simd_fp_test_types);
+
+// Gather/scatter tests.
+
+template <typename A, typename B>
+struct simd_and_index {
+    using simd = A;
+    using simd_index = B;
+};
+
+template <typename SI>
+struct simd_indirect: public ::testing::Test {};
+
+TYPED_TEST_CASE_P(simd_indirect);
+
+TYPED_TEST_P(simd_indirect, gather) {
+    using simd = typename TypeParam::simd;
+    using simd_index = typename TypeParam::simd_index;
+
+    constexpr unsigned N = simd::width;
+    using scalar = typename simd::scalar_type;
+    using index = typename simd_index::scalar_type;
+
+    std::minstd_rand rng(1011);
+
+    constexpr std::size_t buflen = 1000;
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar array[buflen];
+        index indirect[N];
+
+        fill_random(array, rng);
+        fill_random(indirect, rng, 0, (int)(buflen-1));
+
+        simd s;
+        s.gather(array, simd_index(indirect));
+
+        scalar test[N];
+        for (unsigned j = 0; j<N; ++j) {
+            test[j] = array[indirect[j]];
+        }
+
+        EXPECT_TRUE(::testing::indexed_eq_n(N, test, s));
+    }
+}
+
+TYPED_TEST_P(simd_indirect, masked_gather) {
+    using simd = typename TypeParam::simd;
+    using simd_index = typename TypeParam::simd_index;
+    using simd_mask = typename simd::simd_mask;
+
+    constexpr unsigned N = simd::width;
+    using scalar = typename simd::scalar_type;
+    using index = typename simd_index::scalar_type;
+
+    std::minstd_rand rng(1011);
+
+    constexpr std::size_t buflen = 1000;
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar array[buflen], original[N], test[N];
+        index indirect[N];
+        bool mask[N];
+
+        fill_random(array, rng);
+        fill_random(original, rng);
+        fill_random(indirect, rng, 0, (int)(buflen-1));
+        fill_random(mask, rng);
+
+        for (unsigned j = 0; j<N; ++j) {
+            test[j] = mask[j]? array[indirect[j]]: original[j];
+        }
+
+        simd s(original);
+        simd_mask m(mask);
+        where(m, s).gather(array, simd_index(indirect));
+
+        EXPECT_TRUE(::testing::indexed_eq_n(N, test, s));
+    }
+}
+
+TYPED_TEST_P(simd_indirect, scatter) {
+    using simd = typename TypeParam::simd;
+    using simd_index = typename TypeParam::simd_index;
+
+    constexpr unsigned N = simd::width;
+    using scalar = typename simd::scalar_type;
+    using index = typename simd_index::scalar_type;
+
+    std::minstd_rand rng(1011);
+
+    constexpr std::size_t buflen = 1000;
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar array[buflen], test[buflen], values[N];
+        index indirect[N];
+
+        fill_random(array, rng);
+        fill_random(values, rng);
+        fill_random(indirect, rng, 0, (int)(buflen-1));
+
+        for (unsigned j = 0; j<buflen; ++j) {
+            test[j] = array[j];
+        }
+        for (unsigned j = 0; j<N; ++j) {
+            test[indirect[j]] = values[j];
+        }
+
+        simd s(values);
+        s.scatter(array, simd_index(indirect));
+
+        EXPECT_TRUE(::testing::indexed_eq_n(N, test, array));
+    }
+}
+
+TYPED_TEST_P(simd_indirect, masked_scatter) {
+    using simd = typename TypeParam::simd;
+    using simd_index = typename TypeParam::simd_index;
+    using simd_mask = typename simd::simd_mask;
+
+    constexpr unsigned N = simd::width;
+    using scalar = typename simd::scalar_type;
+    using index = typename simd_index::scalar_type;
+
+    std::minstd_rand rng(1011);
+
+    constexpr std::size_t buflen = 1000;
+
+    for (unsigned i = 0; i<nrounds; ++i) {
+        scalar array[buflen], test[buflen], values[N];
+        index indirect[N];
+        bool mask[N];
+
+        fill_random(array, rng);
+        fill_random(values, rng);
+        fill_random(indirect, rng, 0, (int)(buflen-1));
+        fill_random(mask, rng);
+
+        for (unsigned j = 0; j<buflen; ++j) {
+            test[j] = array[j];
+        }
+        for (unsigned j = 0; j<N; ++j) {
+            if (mask[j]) { test[indirect[j]] = values[j]; }
+        }
+
+        simd s(values);
+        simd_mask m(mask);
+        where(m, s).scatter(array, simd_index(indirect));
+
+        EXPECT_TRUE(::testing::indexed_eq_n(N, test, array));
+    }
+}
+
+REGISTER_TYPED_TEST_CASE_P(simd_indirect, gather, masked_gather, scatter, masked_scatter);
+
+typedef ::testing::Types<
+
+#ifdef __AVX__
+    simd_and_index<simd<double, 4, simd_abi::avx>,
+                   simd<int, 4, simd_abi::avx>>,
+#endif
+
+#ifdef __AVX2__
+    simd_and_index<simd<double, 4, simd_abi::avx2>,
+                   simd<int, 4, simd_abi::avx2>>,
+#endif
+
+#ifdef __AVX512F__
+    simd_and_index<simd<double, 8, simd_abi::avx512>,
+                   simd<int, 8, simd_abi::avx512>>,
+#endif
+
+    simd_and_index<simd<float, 4, simd_abi::generic>,
+                   simd<std::int64_t, 4, simd_abi::generic>>,
+
+    simd_and_index<simd<double, 8, simd_abi::generic>,
+                   simd<unsigned, 8, simd_abi::generic>>,
+
+    simd_and_index<simd<double, 4, simd_abi::default_abi>,
+                   simd<int, 4, simd_abi::default_abi>>,
+
+    simd_and_index<simd<double, 8, simd_abi::default_abi>,
+                   simd<int, 8, simd_abi::default_abi>>
+> simd_indirect_test_types;
+
+INSTANTIATE_TYPED_TEST_CASE_P(S, simd_indirect, simd_indirect_test_types);