diff --git a/CMakeLists.txt b/CMakeLists.txt
index f8f4ac9729ab8ae398073402f323143e9432b379..2f92d0a7fd39bb11622429c361a74cfa391d5ec7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,6 +208,9 @@ endif()
 set(ARB_VECTORIZE_TARGET "none" CACHE STRING "CPU target for vectorization {KNL,AVX2,AVX512}")
 set_property(CACHE ARB_VECTORIZE_TARGET PROPERTY STRINGS none KNL AVX2 AVX512)
 
+# Note: this option conflates modcc code generation options and
+# the target architecture for compilation of Arbor. TODO: fix!
+
 if(ARB_VECTORIZE_TARGET STREQUAL "KNL")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_KNL} -DSIMD_KNL")
 elseif(ARB_VECTORIZE_TARGET STREQUAL "AVX2")
diff --git a/cmake/CompilerOptions.cmake b/cmake/CompilerOptions.cmake
index 40a019bcbcdec70f9ac72d1fd076ea18e558ba72..8e51dddbdb597933a7808f7ae77685e334be2a7a 100644
--- a/cmake/CompilerOptions.cmake
+++ b/cmake/CompilerOptions.cmake
@@ -16,6 +16,10 @@ if(${CMAKE_CXX_COMPILER_ID} MATCHES "XL")
 endif()
 
 if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+    set(CXXOPT_KNL "-march=knl")
+    set(CXXOPT_AVX2 "-mavx2 -mfma")
+    set(CXXOPT_AVX512 "-mavx512f -mavx512cd")
+
     # Disable 'missing-braces' warning: this will inappropriately
     # flag initializations such as
     #     std::array<int,3> a={1,2,3};
@@ -36,7 +40,7 @@ if(${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
     # Compiler flags for generating KNL-specific AVX512 instructions
     # supported in gcc 4.9.x and later.
     set(CXXOPT_KNL "-march=knl")
-    set(CXXOPT_AVX2 "-mavx2")
+    set(CXXOPT_AVX2 "-mavx2 -mfma")
     set(CXXOPT_AVX512 "-mavx512f -mavx512cd")
 
     # Disable 'maybe-uninitialized' warning: this will be raised
diff --git a/doc/index.rst b/doc/index.rst
index 23852d71b732e1f22e1aa846afc3845995ae94f2..661a67946e69f057bb4f0f40ebd7f540e1e4d59c 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -25,6 +25,8 @@ Arbor is a high-performance library for computationa neurscience simulations.
    :maxdepth: 1
 
    library
+   simd_api
+   simd_maths
    profiler
    sampling_api
diff --git a/doc/simd_api.rst b/doc/simd_api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..be33e7a040ebaa0693adaad8af0ca926b4207750
--- /dev/null
+++ b/doc/simd_api.rst
@@ -0,0 +1,950 @@
+SIMD classes for Arbor
+======================
+
+The purpose of the SIMD classes is to abstract and consolidate the use of
+compiler intrinsics for the manipulation of architecture-specific vector
+(SIMD) values.
+
+The implementation is rather loosely based on the data-parallel vector types
+proposal `P0214R6 for the C++ Parallelism TS 2 <http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0214r6.pdf>`_.
+
+Unless otherwise specified, all classes, namespaces and top-level functions
+described below are within the top-level ``arb::simd`` namespace.
+
+.. rubric:: Example usage
+
+The following code performs an element-wise vector product, storing
+only non-zero values in the resultant array.
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        #include <simd/simd.hpp>
+        using namespace arb::simd;
+
+        void product_nonzero(int n, const double* a, const double* b, double* result) {
+            constexpr int N = simd_abi::native_width<double>::value;
+            using simd = simd<double, N>;
+            using mask = simd::simd_mask;
+
+            int i = 0;
+            for (; i+N<=n; i+=N) {
+                auto vp = simd(a+i)*simd(b+i);
+                where(vp!=0, vp).copy_to(result+i);
+            }
+
+            int tail = n-i;
+            auto m = mask::unpack((1<<tail)-1);
+
+            auto vp = simd(a+i, m)*simd(b+i, m);
+            where(m && vp!=0, vp).copy_to(result+i);
+        }
+
+
+Classes
+-------
+
+Three user-facing template classes are provided:
+
+1. ``simd<V, N, I = simd_abi::default_abi>``
+
+   *N*-wide vector type of values of type *V*, using architecture-specific
+   implementation *I*. The implementation parameter is itself a template,
+   acting as a type-map, with ``I<V, N>::type`` being the concrete implementation
+   class (see below) for *N*-wide vectors of type *V* for this architecture.
+
+   The implementation ``simd_abi::generic`` provides a ``std::array``-backed
+   implementation for arbitrary *V* and *N*, while ``simd_abi::native``
+   maps to the native architecture implementation for *V* and *N*, if
+   one is available for the target architecture.
+
+   ``simd_abi::default_abi`` will use ``simd_abi::native`` if available, or
+   else fall back to the generic implementation.
+
+2. ``simd_mask<V, N, I = simd_abi::default_abi>``
+
+   The result of performing a lane-wise comparison/test operation on
+   a ``simd<V, N, I>`` vector value. ``simd_mask`` objects support logical
+   operations and are used as arguments to ``where`` expressions.
+
+   ``simd_mask<V, N, I>`` is a type alias for ``simd<V, N, I>::simd_mask``.
+
+3. ``where_expression<simd<V, N, I>>``
+
+   The result of a ``where`` expression, used for masked assignment.
+
+Implementation typemaps live in the ``simd_abi`` namespace, while concrete
+implementation classes live in ``simd_detail``. A particular specialization
+for an architecture, for example 4-wide double on AVX, then requires:
+
+* A concrete implementation class, e.g. ``simd_detail::avx_double4``.
+
+* A specialization of its ABI map, so that ``simd_abi::avx<double, 4>::type``
+  is an alias for ``simd_detail::avx_double4``.
+
+* A specialization of the native ABI map, so that
+  ``simd_abi::native<double, 4>::type`` is an alias for ``simd_abi::avx<double, 4>::type``.
+
+The maximum natively supported width for a scalar type *V* is recorded in
+``simd_abi::native_width<V>::value``.
+
+Class ``simd``
+^^^^^^^^^^^^^^
+
+The class ``simd<V, N, I>`` is an alias for ``simd_detail::simd_impl<I<V, N>::type>``;
+the class ``simd_detail::simd_impl<C>`` provides the public interface and
+arithmetic operators for a concrete implementation class *C*.
+
+In the following:
+
+* *S* stands for the class ``simd<V, N, I>``.
+* *s* is a SIMD value of type *S*.
+* *m* is a mask value of type ``S::simd_mask``.
+* *t*, *u* and *v* are const objects of type *S*.
+* *i* is an index of type ``int``.
+* *j* is a const object of type ``simd<U, N, J>`` where *U* is an integral type.
+* *x* is a value of type *V*.
+* *p* is a pointer to *V*.
+* *c* is a const pointer to *V* or a length *N* array of *V*.
+
+Here and below, the value in lane *i* of a SIMD vector or mask *v* is denoted by
+*v*\ `i`:sub:.
+
+.. rubric:: Type aliases and constexpr members
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Name
+      - Type
+      - Description
+
+    * - ``S::scalar_type``
+      - *V*
+      - The type of one lane of the SIMD type.
+
+    * - ``S::simd_mask``
+      - ``simd_mask<V, N, I>``
+      - The ``simd_mask`` specialization resulting from comparisons of *S* SIMD values.
+
+    * - ``S::width``
+      - ``unsigned``
+      - The SIMD width *N*.
+
+.. rubric:: Constructors
+
+.. list-table::
+    :widths: 20 80
+    :header-rows: 1
+
+    * - Expression
+      - Description
+
+    * - ``S(x)``
+      - A SIMD value *v* with *v*\ `i`:sub: equal to *x* for *i* = 0…*N*-1.
+
+    * - ``S(t)``
+      - A copy of the SIMD value *t*.
+
+    * - ``S(c)``
+      - A SIMD value *v* with *v*\ `i`:sub: equal to ``c[i]`` for *i* = 0…*N*-1.
+ + * - ``S(c, m)`` + - A SIMD value *v* with *v*\ `i`:sub: equal to ``c[i]`` for *i* where *m*\ `i`:sub: is true. + +.. rubric:: Member functions + +.. list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - Expression + - Type + - Description + + * - ``t.copy_to(p)`` + - ``void`` + - Set ``p[i]`` to *t*\ `i`:sub: for *i* = 0…*N*-1. + + * - ``t.scatter(p, j)`` + - ``void`` + - Set ``p[j[i]]`` to *t*\ `i`:sub: for *i* = 0…*N*-1. + + * - ``s.copy_from(c)`` + - ``void`` + - Set *s*\ `i`:sub: to ``c[i]`` for *i* = 0…*N*-1. + + * - ``s.gather(c, j)`` + - ``void`` + - Set *s*\ `i`:sub: to ``c[j[i]]`` for *i* = 0…*N*-1. + +.. rubric:: Expressions + +.. list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - Expression + - Type + - Description + + * - ``t+u`` + - ``S`` + - Lane-wise sum. + + * - ``t-u`` + - ``S`` + - Lane-wise difference. + + * - ``t*u`` + - ``S`` + - Lane-wise product. + + * - ``t/u`` + - ``S`` + - Lane-wise quotient. + + * - ``fma(t, u, v)`` + - ``S`` + - Lane-wise FMA *t* * *u* + *v*. + + * - ``s<t`` + - ``S::simd_mask`` + - Lane-wise less-than comparison. + + * - ``s<=t`` + - ``S::simd_mask`` + - Lane-wise less-than-or-equals comparison. + + * - ``s>t`` + - ``S::simd_mask`` + - Lane-wise greater-than comparison. + + * - ``s>=t`` + - ``S::simd_mask`` + - Lane-wise greater-than-or-equals comparison. + + * - ``s==t`` + - ``S::simd_mask`` + - Lane-wise equality test. + + * - ``s!=t`` + - ``S::simd_mask`` + - Lane-wise inequality test. + + * - ``s=t`` + - ``S&`` + - Lane-wise assignment. + + * - ``s+=t`` + - ``S&`` + - Equivalent to ``s=s+t``. + + * - ``s-=t`` + - ``S&`` + - Equivalent to ``s=s-t``. + + * - ``s*=t`` + - ``S&`` + - Equivalent to ``s=s*t``. + + * - ``s/=t`` + - ``S&`` + - Equivalent to ``s=s/t``. + + * - ``s=x`` + - ``S&`` + - Equivalent to ``s=S(x)``. + + * - ``t[i]`` + - ``V`` + - Value *t*\ `i`:sub: + + * - ``s[i]=x`` + - ``S::reference`` + - Set value *s*\ `i`:sub: to *x*. + +The (non-const) index operator ``operator[]`` returns a proxy object of type ``S::reference``, +which writes the corresponding lane in the SIMD value on assignment, and has an +implicit conversion to ``scalar_type``. + + +Class ``simd_mask`` +^^^^^^^^^^^^^^^^^^^ + +``simd_mask<V, N, I>`` is an alias for ``simd<V, N, I>::simd_mask``, which in turn +will be an alias for a class ``simd_detail::simd_mask_impl<D>``, where *D* is +a concrete implementation class for the SIMD mask representation. ``simd_mask_impl<D>`` +inherits from, and is implemented in terms of, ``simd_detail::simd_impl<D>``, +but note that the concrete implementation class *D* may or may not be the same +as the concrete implementation class ``I<V, N>::type`` used by ``simd<V, N, I>``. + +Mask values are read and written as ``bool`` values of 0 or 1, which may +differ from the internal representation in each lane of the SIMD implementation. + +In the following: + +* *M* stands for the class ``simd_mask<V, N, I>``. +* *m* and *q* are const objects of type ``simd_mask<V, N, I>``. +* *u* is an object of type ``simd_mask<V, N, I>``. +* *b* is a boolean value. +* *w* is a pointer to ``bool``. +* *y* is a const pointer to ``bool`` or a length *N* array of ``bool``. +* *i* is of type ``int``. +* *k* is of type ``unsigned long long``. + +.. rubric:: Constructors + +.. list-table:: + :widths: 20 80 + :header-rows: 1 + + * - Expression + - Description + + * - ``M(b)`` + - A SIMD mask *u* with *u*\ `i`:sub: equal to *b* for *i* = 0…*N*-1. + + * - ``M(m)`` + - A copy of the SIMD mask *m*. 
+ + * - ``M(y)`` + - A SIMD value *u* with *u*\ `i`:sub: equal to ``y[i]`` for *i* = 0…*N*-1. + +Note that ``simd_mask`` does not (currently) offer a masked pointer/array constructor. + +.. rubric:: Member functions + +.. list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - Expression + - Type + - Description + + * - ``m.copy_to(w)`` + - ``void`` + - Write the boolean value *m*\ `i`:sub: to ``w[i]`` for *i* = 0…*N*-1. + + * - ``u.copy_from(y)`` + - ``void`` + - Set *u*\ `i`:sub: to the boolean value ``y[i]`` for *i* = 0…*N*-1. + +.. rubric:: Expressions + +.. list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - Expression + - Type + - Description + + * - ``!m`` + - ``M`` + - Lane-wise negation. + + * - ``m&&q`` + - ``M`` + - Lane-wise logical and. + + * - ``m||q`` + - ``M`` + - Lane-wise logical or. + + * - ``m==q`` + - ``M`` + - Lane-wise equality (equivalent to ``m!=!q``). + + * - ``m!=q`` + - ``M`` + - Lane-wise logical xor. + + * - ``m=q`` + - ``M&`` + - Lane-wise assignment. + + * - ``m[i]`` + - ``bool`` + - Boolean value *m*\ `i`:sub:. + + * - ``m[i]=b`` + - ``M::reference`` + - Set *m*\ `i`:sub: to boolean value *b*. + +.. rubric:: Static member functions + +.. list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - Expression + - Type + - Description + + * - ``M::unpack(k)`` + - ``M`` + - Mask with value *m*\ `i`:sub: equal to the *i*\ th bit of *k*. + + +Class ``where_expression`` +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``where_expression<S>`` represents a masked subset of the lanes +of a SIMD value of type ``S``, used for conditional assignment, +masked scatter, and masked gather. It is a type alias for +``S::where_expression``, and is the result of an expression of the +form ``where(mask, simdvalue)``. + +In the following: + +* *W* stands for the class ``where_expression<simd<V, N, I>>``. +* *s* is a reference to a SIMD value of type ``simd<V, N, I>&``. +* *t* is a SIMD value of type ``simd<V, N, I>``. +* *m* is a mask of type ``simd<V, N, I>::simd_mask``. +* *j* is a const object of type ``simd<U, N, J>`` where *U* is an integral type. +* *x* is a scalar of type *V*. +* *p* is a pointer to *V*. +* *c* is a const pointer to *V* or a length *N* array of *V*. + +.. list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - Expression + - Type + - Description + + * - ``where(m, s)`` + - ``W`` + - A proxy for masked-assignment operations. + + * - ``where(m, s)=t`` + - ``void`` + - Set *s*\ `i`:sub: to *t*\ `i`:sub: for *i* where *m*\ `i`:sub: is true. + + * - ``where(m, s)=x`` + - ``void`` + - Set *s*\ `i`:sub: to *x* for *i* where *m*\ `i`:sub: is true. + + * - ``where(m, s).copy_to(p)`` + - ``void`` + - Set ``p[i]`` to *s*\ `i`:sub: for *i* where *m*\ `i`:sub: is true. + + * - ``where(m, s).scatter(p, j)`` + - ``void`` + - Set ``p[j[i]]`` to *s*\ `i`:sub: for *i* where *m*\ `i`:sub: is true. + + * - ``where(m, s).copy_from(c)`` + - ``void`` + - Set *s*\ `i`:sub: to ``c[i]`` for *i* where *m*\ `i`:sub: is true. + + * - ``where(m, s).gather(c, j)`` + - ``void`` + - Set *s*\ `i`:sub: to ``c[j[i]]`` for *i* where *m*\ `i`:sub: is true. + + +Top-level functions +------------------- + +Lane-wise mathematical operations *abs(x)*, *min(x, y)* and *max(x, y)* are offered for +all SIMD value types, while the transcendental functions are only usable for +SIMD floating point types. + +Vectorized implementations of some of the transcendental functions are provided: +refer to :doc:`simd_maths` for details. 
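+
+For example, the following uses only these top-level operations together with
+the ``simd`` constructors and ``copy_to`` described above (a sketch, with the
+SIMD width fixed at four lanes for brevity):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        #include <simd/simd.hpp>
+        using namespace arb::simd;
+
+        // Clamp values to [0, 1] and exponentiate them, four lanes at a time
+        // (n is assumed here to be a multiple of four).
+        void exp_clamped(int n, const double* x, double* out) {
+            using simd4 = simd<double, 4>;
+
+            for (int i = 0; i<n; i+=4) {
+                simd4 v(x+i);
+                v = min(max(v, simd4(0.)), simd4(1.));
+                exp(v).copy_to(out+i);
+            }
+        }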
+ + +In the following: + +* *A* is a SIMD class ``simd<K, N, I>`` for some scalar type *K*. +* *S* is a SIMD class ``simd<V, N, I>`` for a floating point type *V*. +* *a* and *b* are values of type *A*. +* *s* and *t* are values of type *S*. + +.. list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - Expression + - Type + - Description + + * - ``abs(a)`` + - *A* + - Lane-wise absolute value of *a*. + + * - ``min(a, b)`` + - *A* + - Lane-wise minimum of *a* and *b*. + + * - ``max(a, b)`` + - *A* + - Lane-wise maximum of *a* and *b*. + + * - ``sin(s)`` + - *S* + - Lane-wise sine of *s*. + + * - ``cos(s)`` + - *S* + - Lane-wise cosine of *s*. + + * - ``log(s)`` + - *S* + - Lane-wise natural logarithm of *s*. + + * - ``exp(s)`` + - *S* + - Lane-wise exponential of *s*. + + * - ``expm1(s)`` + - *S* + - Lane-wise :math:`x \mapsto e^x - 1`. + + * - ``exprelr(s)`` + - *S* + - Lane-wise :math:`x \mapsto x / (e^x - 1)`. + + * - ``pow(s, t)`` + - *S* + - Lane-wise raise *s* to the power of *t*. + + +Implementation requirements +--------------------------- + +Each specific architecture is represented by a templated class *I*, with +``I<V, N>::type`` being the concrete implementation for an *N*-wide +SIMD value with ``scalar_type`` *V*. + +A concrete implementation class *C* inherits from ``simd_detail::implbase<C>``, +which provides (via CRTP) generic implementations of most of the SIMD +functionality. The base class ``implbase<C>`` in turn relies upon +``simd_detail::simd_traits<C>`` to look up the SIMD width, and associated types. + +All the required SIMD operations are given by static member functions of *C*. + +Minimal implementation +^^^^^^^^^^^^^^^^^^^^^^ + +In the following, let *C* be the concrete implementation class for a +*N*-wide vector of scalar_type *V*, with low-level representation +``archvec``. + +The specialization of ``simd_detail::simd_traits<C>`` then exposes these +types and values, and also provides the concrete implementation class *M* +for masks associated with *C*: + +.. container:: api-code + + .. code-block:: cpp + + template <> + struct simd_traits<C> { + static constexpr unsigned width = N; + using scalar_type = V; + using vector_type = archvec; + using mask_impl = M; + }; + + +The mask implementation class *M* may or may not be the same as *C*. +For example, ``simd_detail::avx_double4`` provides both the arithmetic operations and mask +operations for an AVX 4 × double SIMD vector, while the mask +implementation for ``simd_detail::avx512_double8`` is ``simd_detail::avx512_mask8``. + +The concrete implementation class must provide at minimum implementations +of ``copy_to`` and ``copy_from`` (see the section below for semantics): + +.. container:: api-code + + .. code-block:: cpp + + struct C: implbase<C> { + static void copy_to(const arch_vector&, V*); + static arch_vector copy_from(const V*); + }; + +If the implementation is also acting as a mask implementation, it must also +provide ``mask_copy_to``, ``mask_copy_from``, ``mask_element`` and +``mask_set_element``: + +.. container:: api-code + + .. 
code-block:: cpp
+
+        struct C: implbase<C> {
+            static void copy_to(const arch_vector&, V*);
+            static arch_vector copy_from(const V*);
+
+            static void mask_copy_to(const arch_vector& v, bool* w);
+            static arch_vector mask_copy_from(const bool* y);
+            static bool mask_element(const arch_vector& v, int i);
+            static void mask_set_element(arch_vector& v, int i, bool x);
+        };
+
+The class ``simd_detail::generic<T, N>`` provides an example of a minimal
+implementation based on an ``arch_vector`` type of ``std::array<T, N>``.
+
+
+Concrete implementation API
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In the following, *C* represents the concrete implementation class for
+a SIMD class of width *N* and value type *V*.
+
+* *u*, *v*, and *w* are values of type ``C::vector_type``.
+* *r* is a reference of type ``C::vector_type&``.
+* *x* is a value of type ``C::scalar_type``.
+* *c* is a const pointer of type ``const C::scalar_type*``.
+* *p* is a pointer of type ``C::scalar_type*``.
+* *j* is a SIMD index representation of type ``J::vector_type`` for
+  an integral concrete implementation class *J*.
+* *b* is a ``bool`` value.
+* *w* denotes, in the mask operations below, a pointer to ``bool``.
+* *y* is a const pointer to ``bool``.
+* *i* is an unsigned (index) value.
+* *k* is an unsigned long long value.
+* *m* is a mask representation of type ``C::mask_type``.
+
+.. rubric:: Types and constants
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Name
+      - Type
+      - Description
+
+    * - ``C::vector_type``
+      - ``simd_traits<C>::vector_type``
+      - Underlying SIMD representation type.
+
+    * - ``C::scalar_type``
+      - ``simd_traits<C>::scalar_type``
+      - Should be convertible to/from *V*.
+
+    * - ``C::mask_impl``
+      - ``simd_traits<C>::mask_impl``
+      - Concrete implementation class for mask SIMD type.
+
+    * - ``C::mask_type``
+      - ``C::mask_impl::vector_type``
+      - Underlying SIMD representation for masks.
+
+    * - ``C::width``
+      - ``unsigned``
+      - The SIMD width *N*.
+
+.. rubric:: Initialization, load, store
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::broadcast(x)``
+      - ``C::vector_type``
+      - Fill representation with scalar *x*.
+
+    * - ``C::copy_to(v, p)``
+      - ``void``
+      - Store values *v*\ `i`:sub: to *p+i*. *p* may be unaligned.
+
+    * - ``C::copy_to_masked(v, p, m)``
+      - ``void``
+      - Store values *v*\ `i`:sub: to *p+i* wherever *m*\ `i`:sub: is true. *p* may be unaligned.
+
+    * - ``C::copy_from(c)``
+      - ``C::vector_type``
+      - Return a vector with values *v*\ `i`:sub: loaded from *c+i*. *c* may be unaligned.
+
+    * - ``C::copy_from_masked(c, m)``
+      - ``C::vector_type``
+      - Return a vector with values *v*\ `i`:sub: loaded from *c+i* wherever *m*\ `i`:sub: is true. *c* may be unaligned.
+
+    * - ``C::copy_from_masked(w, c, m)``
+      - ``C::vector_type``
+      - Return a vector with values *v*\ `i`:sub: loaded from *c+i* wherever *m*\ `i`:sub: is true, or equal to *w*\ `i`:sub:
+        otherwise. *c* may be unaligned.
+
+.. rubric:: Lane access
+
+.. list-table::
+    :widths: 20 20 60
+    :header-rows: 1
+
+    * - Expression
+      - Type
+      - Description
+
+    * - ``C::element(v, i)``
+      - ``C::scalar_type``
+      - Value in lane *i* of *v*.
+
+    * - ``C::set_element(r, i, x)``
+      - ``void``
+      - Set value in lane *i* of *r* to *x*.
+
+.. rubric:: Gather and scatter
+
+The offsets for gather and scatter operations are given
+by a vector type ``J::vector_type`` for some possibly
+different concrete implementation class *J*, and the
+static methods implementing gather and scatter are templated
+on this class.
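+
+A concrete class supporting these operations might declare them along the
+following lines (a sketch only; the exact expressions and types expected are
+given in the table below):
+
+.. container:: api-code
+
+    .. code-block:: cpp
+
+        struct C: implbase<C> {
+            // ...
+            template <typename J>
+            static vector_type gather(J, const scalar_type* p, const typename J::vector_type& index);
+
+            template <typename J>
+            static void scatter(J, const vector_type& u, scalar_type* p, const typename J::vector_type& index);
+        };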
+ +Implementations can provide optimized versions for specific +index classes *J*; this process would be simplified with +more support for casts between SIMD types and their concrete +implementations, functionality which is not yet provided. + +The first argument to these functions is a dummy argument +of type *J*, used only to disambiguate overloads. + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Expression + - Type + - Description + + * - ``C::gather(J{}, p, j)`` + - ``C::vector_type`` + - Vector *v* with values *v*\ `i`:sub: = ``p[j[i]]``. + + * - ``C::gather(J{}, u, p, j, m)`` + - ``C::vector_type`` + - Vector *v* with values *v*\ `i`:sub: = *m*\ `i`:sub: ? ``p[j[i]]`` : *u*\ `i`:sub:. + + * - ``C::scatter(J{}, u, p, j)`` + - ``void`` + - Write values *u*\ `i`:sub: to ``p[j[i]]``. + + * - ``C::scatter(J{}, u, p, j, m)`` + - ``void`` + - Write values *u*\ `i`:sub: to ``p[j[i]]`` for lanes *i* where *m*\ `i`:sub: is true. + +.. rubric:: Arithmetic operations + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Expression + - Type + - Description + + * - ``C::negate(v)`` + - ``C::vector_type`` + - Lane-wise unary minus. + + * - ``C::mul(u, v)`` + - ``C::vector_type`` + - Lane-wise multiplication. + + * - ``C::add(u, v)`` + - ``C::vector_type`` + - Lane-wise addition. + + * - ``C::sub(u, v)`` + - ``C::vector_type`` + - Lane-wise subtraction. + + * - ``C::div(u, v)`` + - ``C::vector_type`` + - Lane-wise division. + + * - ``C::fma(u, v, w)`` + - ``C::vector_type`` + - Lane-wise fused multiply-add (u*v+w). + +.. rubric:: Comparison and blends + +.. list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - Expression + - Type + - Description + + * - ``C::cmp_eq(u, v)`` + - ``C::mask_type`` + - Lane-wise *u* = *v*. + + * - ``C::cmp_neq(u, v)`` + - ``C::mask_type`` + - Lane-wise *u* ≠*v*. + + * - ``C::cmp_gt(u, v)`` + - ``C::mask_type`` + - Lane-wise *u* > *v*. + + * - ``C::cmp_geq(u, v)`` + - ``C::mask_type`` + - Lane-wise *u* ≥ *v*. + + * - ``C::cmp_lt(u, v)`` + - ``C::mask_type`` + - Lane-wise *u* < *v*. + + * - ``C::cmp_leq(u, v)`` + - ``C::mask_type`` + - Lane-wise *u* ≤ *v*. + + * - ``C::ifelse(m, u, v)`` + - ``C::vector_type`` + - Vector *w* with values *w*\ `i`:sub: = *m*\ `i`:sub: ? *u*\ `i`:sub: : *v*\ `i`:sub:. + +.. rubric:: Mathematical function support. + +With the exception of ``abs``, ``min`` and ``max``, these are only +required for floating point vector implementations. + +.. list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - Expression + - Type + - Description + + * - ``C::abs(v)`` + - ``C::vector_type`` + - Lane-wise absolute value. + + * - ``C::min(u, v)`` + - ``C::vector_type`` + - Lane-wise minimum. + + * - ``C::max(u, v)`` + - ``C::vector_type`` + - Lane-wise maximum. + + * - ``C::sin(v)`` + - ``C::vector_type`` + - Lane-wise sine. + + * - ``C::cos(v)`` + - ``C::vector_type`` + - Lane-wise cosine. + + * - ``C::log(v)`` + - ``C::vector_type`` + - Lane-wise natural logarithm. + + * - ``C::exp(v)`` + - ``C::vector_type`` + - Lane-wise exponential. + + * - ``C::expm1(v)`` + - ``C::vector_type`` + - Lane-wise :math:`x \mapsto e^x -1`. + + * - ``C::exprelr(v)`` + - ``C::vector_type`` + - Lane-wise :math:`x \mapsto x/(e^x -1)`. + + * - ``C::pow(u, v)`` + - ``C::vector_type`` + - Lane-wise *u* raised to the power of *v*. + +.. rubric:: Mask value support + +Mask operations are only required if *C* constitutes the implementation of a +SIMD mask class. + +.. 
list-table:: + :widths: 20 20 60 + :header-rows: 1 + + * - Expression + - Type + - Description + + * - ``C::mask_broadcast(b)`` + - ``C::vector_type`` + - Fill mask representation with bool *b*. + + * - ``C::mask_element(v, i)`` + - ``bool`` + - Mask value *v*\ `i`:sub:. + + * - ``C::mask_set_element(u, i, b)`` + - ``void`` + - Set mask value *u*\ `i`:sub: to *b*. + + * - ``C::mask_copy_to(v, w)`` + - ``void`` + - Write bool values to memory (unaligned). + + * - ``C::mask_copy_from(y)`` + - ``C::vector_type`` + - Load bool values from memory (unaligned). + + * - ``C::mask_unpack(k)`` + - ``C::vector_type`` + - Return vector *v* with boolean value *v*\ `i`:sub: equal + to the *i*\ th bit of *k*. + +.. rubric:: Logical operations + +Logical operations are only required if *C* constitutes the implementation of a +SIMD mask class. + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Expression + - Type + - Description + + * - ``C::logical_not(u)`` + - ``C::vector_type`` + - Lane-wise negation. + + * - ``C::logical_and(u, v)`` + - ``C::vector_type`` + - Lane-wise logical and. + + * - ``C::logical_or(u, v)`` + - ``C::vector_type`` + - Lane-wise logical or. + + * - ``C::select(m, v, w)`` + - ``C::vector_type`` + - Lane-wise *m*? *v*: *u*. + + +Missing functionality +--------------------- + +There is no support yet for the following features, although some of these +will need to be provided in order to improve the efficiency of SIMD versions +of our generated mechanisms. + +* A SIMD cast function, e.g. ``simd_cast<S>(const T&)`` that converts between + different SIMD wrappers of the same width. The infrastructure that supports + this in concrete implementation classes would also simplify the implementation + of more generic ``gather`` and ``scatter`` methods. + +* Horizontal reductions across the lanes of a SIMD value or where-expression. + +* Vectorizable implementations of trigonometric functions. + +* Compound assignment operations for where-expressions. Extending the concrete + implementation API to support this would allow, for example, efficient use + of AVX512 masked arithmetic instructions. + diff --git a/doc/simd_maths.rst b/doc/simd_maths.rst new file mode 100644 index 0000000000000000000000000000000000000000..b93b07e663973100949c6f885bea4dff6863bd78 --- /dev/null +++ b/doc/simd_maths.rst @@ -0,0 +1,185 @@ +Implementation of vector transcendental functions +================================================= + +When building with the Intel C++ compiler, transcendental +functions on SIMD values in ``simd<double, 8, simd_detail::avx512>`` +wrap calls to the Intel scalar vector mathematics library (SVML). + +Outside of this case, the functions *exp*, *log*, *expm1* and +*exprelr* use explicit approximations as detailed below. The +algortihms follow those used in the +`Cephes library <http://www.netlib.org/cephes/>`_, with +some accommodations. + +.. default-role:: math + +Exponentials +------------ + +`\operatorname{exp}(x)` +^^^^^^^^^^^^^^^^^^^^^^^ + +The exponential is computed as + +.. math:: + + e^x = 2^n · e^g, + +with `|g| ≤ 0.5` and `n` an integer. The power of two +is computed via direct manipulation of the exponent bits of the floating +point representation, while `e^g` is approximated by a rational polynomial. + +`n` and `g` are computed by: + +.. math:: + + n &= \left\lfloor \frac{x}{\log 2} + 0.5 \right\rfloor + + g &= x - n·\log 2 + +where the subtraction in the calculation of `g` is performed in two stages, +to limit cancellation error: + +.. 
math:: + + g &\leftarrow \operatorname{fl}(x - n · c_1) + + g &\leftarrow \operatorname{fl}(g - n · c_2) + +where `c_1+c_2 = \log 2`,`c_1` comprising the first 32 bits of the mantissa. +(In principle `c_1` might contain more bits of the logarithm, but this +particular decomposition matches that used in the Cephes library.) This +decomposition gives `|g|\leq \frac{1}{2}\log 2\approx 0.347`. + +The rational approximation for `e^g` is of the form + +.. math:: + + e^g \approx \frac{R(g)}{R(-g)} + +where `R(g)` is a polynomial of order 6. The coefficients are again those +used by Cephes, and probably are derived via a Remez algorithm. +`R(g)` is decomposed into even and odd terms + +.. math:: + + R(g) = Q(x^2) + xP(x^2) + +so that the ratio can be calculated by: + +.. math:: + + e^g \approx 1 + \frac{2gP(g^2)}{Q(g^2)-gP(g^2)}. + +Randomized testing indicates the approximation is accurate to 2 ulp. + + +`\operatorname{expm1}(x)` +^^^^^^^^^^^^^^^^^^^^^^^^^ + +A similar decomposition of `x = g + n·\log 2` is performed so that +`g≤0.5`, with the exception that `n` is always taken to +be zero for `|x|≤0.5`, i.e. + +.. math:: + + n = \begin{cases} + 0&\text{if $|x|≤0.5$,}\\ + \left\lfloor \frac{x}{\log 2} + 0.5 \right\rfloor + &\text{otherwise.} + \end{cases} + + +`\operatorname{expm1}(x)` is then computed as + +.. math:: + + e^x - 1 = 2^n·(e^g - 1)+(2^n-1). + +and the same rational polynomial is used to approximate `e^g-1`, + +.. math:: + + e^g - 1 \approx \frac{2gP(g^2)}{Q(g^2)-gP(g^2)}. + +The scaling by step for `n≠0` is in practice calculated as + +.. math:: + + e^x - 1 = 2·(2^{n-1}·(e^g - 1)+(2^{n-1}-0.5)). + +in order to avoid overflow at the upper end of the range. + +The order 6 rational polynomial approximation for small `x` +is insufficiently accurate to maintain 1 ulp accuracy; randomized +testing indicates a maximum error of up to 3 ulp. + + +`\operatorname{exprelr}(x)` +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The function is defined as + +.. math:: + + \operatorname{exprelr}(x) = x/(e^x-1), + +and is the reciprocal of the relative exponential function, + +.. math:: + + \operatorname{exprel}(x) &= {}_1F_1(1; 2; x)\\ + &= \frac{e^x-1}{x}. + +This is computed in terms of expm1 by: + +.. math:: + + \operatorname{exprelr}(x) := + \begin{cases} + 1&\text{if $\operatorname{fl}(1+x) = 1$,}\\ + x/\operatorname{expm1}(x)&\text{otherwise.} + \end{cases} + +With the approximation for `\operatorname{expm1}` used above, +randomized testing demonstrates a maximum error on the order +of 4 ulp. + + +Logarithms +---------- + +The natural logarithm is computed as + +.. math:: + + \log x = \log u + n·log 2 + +where `n` is an integer and `u` is in the interval +`[ \frac{1}{2}\sqrt 2, \sqrt 2]`. The logarithm of +`u` is then approximated by the rational polynomial +used in the Cephes implementation, + +.. math:: + + \log u &\approx R(u-1) + + R(z) &= z - \frac{z^2}{2} + z^3·\frac{P(z)}{Q(z)}, + +where `P` and `Q` are polynomials of degree 5, with +`Q` monic. + +Cancellation error is minimized by computing the sum for +`\log x` as: + +.. math:: + + s &\leftarrow \operatorname{fl}(z^3·P(z)/Q(z))\\ + s &\leftarrow \operatorname{fl}(s + n·c_4)\\ + s &\leftarrow \operatorname{fl}(s - 0.5·z^2)\\ + s &\leftarrow \operatorname{fl}(s + z)\\ + s &\leftarrow \operatorname{fl}(s + n·c_3) + +where `z=u-1` and `c_3+c_4=\log 2`, `c_3` comprising +the first 9 bits of the mantissa. 
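+
+For reference, the scalar equivalent of this evaluation order is sketched
+below; the polynomial coefficients and the ``sqrt2``, ``ln2C3`` and ``ln2C4``
+constants are those defined in ``src/simd/approx.hpp``, and domain and
+overflow checks are omitted.
+
+.. code-block:: cpp
+
+    #include <cmath>
+    #include <simd/approx.hpp>
+
+    using namespace arb::simd_detail;
+
+    // Scalar sketch of the SIMD log implementations, valid for
+    // finite, normal, positive x only.
+    double log_sketch(double x) {
+        // Decompose x = 2^n * u with u in [1, 2).
+        int n;
+        double u = 2.*std::frexp(x, &n);
+        --n;
+
+        // Shift u into [sqrt(2)/2, sqrt(2)).
+        if (u >= sqrt2) {
+            u *= 0.5;
+            ++n;
+        }
+
+        double z = u-1.;
+        double pz = P0log + z*(P1log + z*(P2log + z*(P3log + z*(P4log + z*P5log))));
+        double qz = Q0log + z*(Q1log + z*(Q2log + z*(Q3log + z*(Q4log + z))));
+
+        // Sum in the order given above, to limit cancellation error.
+        double s = z*z*z*pz/qz;
+        s += n*ln2C4;
+        s -= 0.5*z*z;
+        s += z;
+        s += n*ln2C3;
+        return s;
+    }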
+ diff --git a/doc/static/custom.css b/doc/static/custom.css index 005eac316b8d6274be5c4749129a7b8780a692fe..54b964182c59ae21cf065a5a005579faea4d56ce 100644 --- a/doc/static/custom.css +++ b/doc/static/custom.css @@ -5,7 +5,7 @@ padding-left: 3pt; content: "Example"; background: #6ab0de; - color: #ffffff + color: #ffffff; } .api-code>div[class^='highlight']:before { @@ -14,9 +14,24 @@ padding-bottom: 2pt; padding-left: 3pt; content: "API"; - background: #e0e0e0 + background: #e0e0e0; } .api-code>div[class^='highlight'] { - background: #f4f4f4 + background: #f4f4f4; +} + +li>p:last-child { + padding-bottom: 2ex; +} + +table.docutils td { + vertical-align: top !important; +} + +@media screen and (min-width: 40em) { + .wy-table-responsive table td, + .wy-table-responsive table th { + white-space: normal; + } } diff --git a/src/simd/approx.hpp b/src/simd/approx.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cd5f8395414b69289db93fd94985a8d662569aa3 --- /dev/null +++ b/src/simd/approx.hpp @@ -0,0 +1,84 @@ +#pragma once + +#include <cfloat> + +// Constants/magic numbers for the approximation of +// exponential, trigonometric and special functions. +// +// Polynomial coefficients and evaluation orders +// match those of the Cephes library, +// <URL: http://www.netlib.org/cephes>. +// +// Refer to the developer documentation for more +// detail concerning the approximations. + +namespace arb { +namespace simd_detail { + +// Exponential: +// +// Approximation of exponential by a Padé-like rational +// polynomial R(x)/R(-x) of order 6. +// +// P comprises the odd coefficients, and Q the even. + +constexpr double P0exp = 9.99999999999999999910E-1; +constexpr double P1exp = 3.02994407707441961300E-2; +constexpr double P2exp = 1.26177193074810590878E-4; + +constexpr double Q0exp = 2.00000000000000000009E0; +constexpr double Q1exp = 2.27265548208155028766E-1; +constexpr double Q2exp = 2.52448340349684104192E-3; +constexpr double Q3exp = 3.00198505138664455042E-6; + +// Cancellation-corrected ln(2) = ln2C1 + ln2C2: + +constexpr double ln2C1 = 6.93145751953125E-1; +constexpr double ln2C2 = 1.42860682030941723212E-6; + +// 1/ln(2): + +constexpr double ln2inv = 1.4426950408889634073599; + +// Min and max argument values for finite and normal +// double-precision exponential. + +constexpr double exp_minarg = -708.3964185322641; +constexpr double exp_maxarg = 709.782712893384; + +// For expm1, minimum argument that gives a result +// over -1. + +constexpr double expm1_minarg = -37.42994775023705; + +// Logarithm: +// +// Positive denormal numbers are treated as zero +// for the purposes of the log function. + +constexpr double log_minarg = DBL_MIN; +constexpr double sqrt2 = 1.41421356237309504880; + +// Cancellation-corrected ln(2) = ln2C3 + ln2C4: + +constexpr double ln2C3 = 0.693359375; +constexpr double ln2C4 = -2.121944400546905827679e-4; + +// Polynomial coefficients (Q is also order 5, +// but monic). 
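+// They are used in the rational approximation
+//     log(1+z) ~ z - z^2/2 + z^3*P(z)/Q(z),  z = u - 1,
+// described in doc/simd_maths.rst.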
+
+constexpr double P0log = 7.70838733755885391666E0;
+constexpr double P1log = 1.79368678507819816313e1;
+constexpr double P2log = 1.44989225341610930846e1;
+constexpr double P3log = 4.70579119878881725854e0;
+constexpr double P4log = 4.97494994976747001425e-1;
+constexpr double P5log = 1.01875663804580931796e-4;
+
+constexpr double Q0log = 2.31251620126765340583E1;
+constexpr double Q1log = 7.11544750618563894466e1;
+constexpr double Q2log = 8.29875266912776603211e1;
+constexpr double Q3log = 4.52279145837532221105e1;
+constexpr double Q4log = 1.12873587189167450590e1;
+
+} // namespace simd_detail
+} // namespace arb
diff --git a/src/simd/avx.hpp b/src/simd/avx.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1527f1b073ee3a4f7b1d147b3b5d64787fa29cc5
--- /dev/null
+++ b/src/simd/avx.hpp
@@ -0,0 +1,921 @@
+#pragma once
+
+// AVX/AVX2 SIMD intrinsics implementation.
+
+#ifdef __AVX__
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <immintrin.h>
+
+#include <simd/approx.hpp>
+#include <simd/implbase.hpp>
+
+namespace arb {
+namespace simd {
+namespace simd_detail {
+
+struct avx_int4;
+struct avx_double4;
+
+template <>
+struct simd_traits<avx_int4> {
+    static constexpr unsigned width = 4;
+    using scalar_type = std::int32_t;
+    using vector_type = __m128i;
+    using mask_impl = avx_int4;
+};
+
+template <>
+struct simd_traits<avx_double4> {
+    static constexpr unsigned width = 4;
+    using scalar_type = double;
+    using vector_type = __m256d;
+    using mask_impl = avx_double4;
+};
+
+struct avx_int4: implbase<avx_int4> {
+    // Use default implementations for: element, set_element, div.
+
+    using int32 = std::int32_t;
+
+    static __m128i broadcast(int32 v) {
+        return _mm_set1_epi32(v);
+    }
+
+    static void copy_to(const __m128i& v, int32* p) {
+        _mm_storeu_si128((__m128i*)p, v);
+    }
+
+    static void copy_to_masked(const __m128i& v, int32* p, const __m128i& mask) {
+        _mm_maskstore_ps(reinterpret_cast<float*>(p), mask, _mm_castsi128_ps(v));
+    }
+
+    static __m128i copy_from(const int32* p) {
+        return _mm_loadu_si128((const __m128i*)p);
+    }
+
+    static __m128i copy_from_masked(const int32* p, const __m128i& mask) {
+        return _mm_castps_si128(_mm_maskload_ps(reinterpret_cast<const float*>(p), mask));
+    }
+
+    static __m128i copy_from_masked(const __m128i& v, const int32* p, const __m128i& mask) {
+        __m128 d = _mm_maskload_ps(reinterpret_cast<const float*>(p), mask);
+        return ifelse(mask, _mm_castps_si128(d), v);
+    }
+
+    static __m128i negate(const __m128i& a) {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_sub_epi32(zero, a);
+    }
+
+    static __m128i add(const __m128i& a, const __m128i& b) {
+        return _mm_add_epi32(a, b);
+    }
+
+    static __m128i sub(const __m128i& a, const __m128i& b) {
+        return _mm_sub_epi32(a, b);
+    }
+
+    static __m128i mul(const __m128i& a, const __m128i& b) {
+        return _mm_mullo_epi32(a, b);
+    }
+
+    static __m128i fma(const __m128i& a, const __m128i& b, const __m128i& c) {
+        return _mm_add_epi32(_mm_mullo_epi32(a, b), c);
+    }
+
+    static __m128i logical_not(const __m128i& a) {
+        __m128i ones = {};
+        return _mm_xor_si128(a, _mm_cmpeq_epi32(ones, ones));
+    }
+
+    static __m128i logical_and(const __m128i& a, const __m128i& b) {
+        return _mm_and_si128(a, b);
+    }
+
+    static __m128i logical_or(const __m128i& a, const __m128i& b) {
+        return _mm_or_si128(a, b);
+    }
+
+    static __m128i cmp_eq(const __m128i& a, const __m128i& b) {
+        return _mm_cmpeq_epi32(a, b);
+    }
+
+    static __m128i cmp_neq(const __m128i& a, const __m128i& b) {
+        return logical_not(cmp_eq(a, b));
+    }
+
+    
static __m128i cmp_gt(const __m128i& a, const __m128i& b) { + return _mm_cmpgt_epi32(a, b); + } + + static __m128i cmp_geq(const __m128i& a, const __m128i& b) { + return logical_not(cmp_gt(b, a)); + } + + static __m128i cmp_lt(const __m128i& a, const __m128i& b) { + return cmp_gt(b, a); + } + + static __m128i cmp_leq(const __m128i& a, const __m128i& b) { + return logical_not(cmp_gt(a, b)); + } + + static __m128i ifelse(const __m128i& m, const __m128i& u, const __m128i& v) { + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(u), _mm_castsi128_ps(m))); + } + + static __m128i mask_broadcast(bool b) { + return _mm_set1_epi32(-(int32)b); + } + + static __m128i mask_unpack(unsigned long long k) { + // Only care about bottom four bits of k. + __m128i b = _mm_set1_epi8((char)k); + b = logical_or(b, _mm_setr_epi32(0xfefefefe,0xfdfdfdfd,0xfbfbfbfb,0xf7f7f7f7)); + + __m128i ones = {}; + ones = _mm_cmpeq_epi32(ones, ones); + return _mm_cmpeq_epi32(b, ones); + } + + static bool mask_element(const __m128i& u, int i) { + return static_cast<bool>(element(u, i)); + } + + static void mask_set_element(__m128i& u, int i, bool b) { + set_element(u, i, -(int32)b); + } + + static void mask_copy_to(const __m128i& m, bool* y) { + // Negate (convert 0xffffffff to 0x00000001) and move low bytes to + // bottom 4 bytes. + + __m128i s = _mm_setr_epi32(0x0c080400ul,0,0,0); + __m128i p = _mm_shuffle_epi8(negate(m), s); + std::memcpy(y, &p, 4); + } + + static __m128i mask_copy_from(const bool* w) { + __m128i r; + std::memcpy(&r, w, 4); + + __m128i s = _mm_setr_epi32(0x80808000ul, 0x80808001ul, 0x80808002ul, 0x80808003ul); + return negate(_mm_shuffle_epi8(r, s)); + } + + static __m128i max(const __m128i& a, const __m128i& b) { + return _mm_max_epi32(a, b); + } + + static __m128i min(const __m128i& a, const __m128i& b) { + return _mm_min_epi32(a, b); + } +}; + +struct avx_double4: implbase<avx_double4> { + // Use default implementations for: + // element, set_element, fma. 
+ + using int64 = std::int64_t; + + // CMPPD predicates: + static constexpr int cmp_eq_oq = 0; + static constexpr int cmp_unord_q = 3; + static constexpr int cmp_neq_uq = 4; + static constexpr int cmp_true_uq = 15; + static constexpr int cmp_lt_oq = 17; + static constexpr int cmp_le_oq = 18; + static constexpr int cmp_nge_uq = 25; + static constexpr int cmp_ge_oq = 29; + static constexpr int cmp_gt_oq = 30; + + static __m256d broadcast(double v) { + return _mm256_set1_pd(v); + } + + static void copy_to(const __m256d& v, double* p) { + _mm256_storeu_pd(p, v); + } + + static void copy_to_masked(const __m256d& v, double* p, const __m256d& mask) { + _mm256_maskstore_pd(p, _mm256_castpd_si256(mask), v); + } + + static __m256d copy_from(const double* p) { + return _mm256_loadu_pd(p); + } + + static __m256d copy_from_masked(const double* p, const __m256d& mask) { + return _mm256_maskload_pd(p, _mm256_castpd_si256(mask)); + } + + static __m256d copy_from_masked(const __m256d& v, const double* p, const __m256d& mask) { + __m256d d = _mm256_maskload_pd(p, _mm256_castpd_si256(mask)); + return ifelse(mask, d, v); + } + + static __m256d negate(const __m256d& a) { + return _mm256_sub_pd(zero(), a); + } + + static __m256d add(const __m256d& a, const __m256d& b) { + return _mm256_add_pd(a, b); + } + + static __m256d sub(const __m256d& a, const __m256d& b) { + return _mm256_sub_pd(a, b); + } + + static __m256d mul(const __m256d& a, const __m256d& b) { + return _mm256_mul_pd(a, b); + } + + static __m256d div(const __m256d& a, const __m256d& b) { + return _mm256_div_pd(a, b); + } + + static __m256d logical_not(const __m256d& a) { + __m256d ones = {}; + return _mm256_xor_pd(a, _mm256_cmp_pd(ones, ones, 15)); + } + + static __m256d logical_and(const __m256d& a, const __m256d& b) { + return _mm256_and_pd(a, b); + } + + static __m256d logical_or(const __m256d& a, const __m256d& b) { + return _mm256_or_pd(a, b); + } + + static __m256d cmp_eq(const __m256d& a, const __m256d& b) { + return _mm256_cmp_pd(a, b, cmp_eq_oq); + } + + static __m256d cmp_neq(const __m256d& a, const __m256d& b) { + return _mm256_cmp_pd(a, b, cmp_neq_uq); + } + + static __m256d cmp_gt(const __m256d& a, const __m256d& b) { + return _mm256_cmp_pd(a, b, cmp_gt_oq); + } + + static __m256d cmp_geq(const __m256d& a, const __m256d& b) { + return _mm256_cmp_pd(a, b, cmp_ge_oq); + } + + static __m256d cmp_lt(const __m256d& a, const __m256d& b) { + return _mm256_cmp_pd(a, b, cmp_lt_oq); + } + + static __m256d cmp_leq(const __m256d& a, const __m256d& b) { + return _mm256_cmp_pd(a, b, cmp_le_oq); + } + + static __m256d ifelse(const __m256d& m, const __m256d& u, const __m256d& v) { + return _mm256_blendv_pd(v, u, m); + } + + static __m256d mask_broadcast(bool b) { + return _mm256_castsi256_pd(_mm256_set1_epi64x(-(int64)b)); + } + + static bool mask_element(const __m256d& u, int i) { + return static_cast<bool>(element(u, i)); + } + + static __m256d mask_unpack(unsigned long long k) { + // Only care about bottom four bits of k. 
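+        // Technique: broadcast the low byte of k to every byte, then OR each
+        // 64-bit lane with the complement of that lane's bit. A lane becomes
+        // all-ones exactly when its bit in k is set, and the cmpeq against
+        // all-ones below then widens this to a full lane mask.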
+ __m128i b = _mm_set1_epi8((char)k); + // (Note: there is no _mm_setr_epi64x (!)) + __m128i bl = _mm_or_si128(b, _mm_set_epi64x(0xfdfdfdfdfdfdfdfd, 0xfefefefefefefefe)); + __m128i bu = _mm_or_si128(b, _mm_set_epi64x(0xf7f7f7f7f7f7f7f7, 0xfbfbfbfbfbfbfbfb)); + + __m128i ones = {}; + ones = _mm_cmpeq_epi32(ones, ones); + bl = _mm_cmpeq_epi64(bl, ones); + bu = _mm_cmpeq_epi64(bu, ones); + return _mm256_castsi256_pd(combine_m128i(bu, bl)); + } + + static void mask_set_element(__m256d& u, int i, bool b) { + char data[256]; + _mm256_storeu_pd((double*)data, u); + ((int64*)data)[i] = -(int64)b; + u = _mm256_loadu_pd((double*)data); + } + + static void mask_copy_to(const __m256d& m, bool* y) { + // Convert to 32-bit wide mask values, and delegate to + // avx2_int4. + + avx_int4::mask_copy_to(lo_epi32(_mm256_castpd_si256(m)), y); + } + + static __m256d mask_copy_from(const bool* w) { + __m128i zero = _mm_setzero_si128(); + + __m128i r; + std::memcpy(&r, w, 4); + + // Move bytes: + // rl: byte 0 to byte 0, byte 1 to byte 8, zero elsewhere. + // ru: byte 2 to byte 0, byte 3 to byte 8, zero elsewhere. + // + // Subtract from zero to translate + // 0x0000000000000001 to 0xffffffffffffffff. + + __m128i sl = _mm_setr_epi32(0x80808000ul, 0x80808080ul, 0x80808001ul, 0x80808080ul); + __m128i rl = _mm_sub_epi64(zero, _mm_shuffle_epi8(r, sl)); + + __m128i su = _mm_setr_epi32(0x80808002ul, 0x80808080ul, 0x80808003ul, 0x80808080ul); + __m128i ru = _mm_sub_epi64(zero, _mm_shuffle_epi8(r, su)); + + return _mm256_castsi256_pd(combine_m128i(ru, rl)); + } + + static __m256d max(const __m256d& a, const __m256d& b) { + return _mm256_max_pd(a, b); + } + + static __m256d min(const __m256d& a, const __m256d& b) { + return _mm256_min_pd(a, b); + } + + static __m256d abs(const __m256d& x) { + __m256i m = _mm256_set1_epi64x(0x7fffffffffffffffll); + return _mm256_and_pd(x, _mm256_castsi256_pd(m)); + } + + // Exponential is calculated as follows: + // + // e^x = e^g · 2^n, + // + // where g in [-0.5, 0.5) and n is an integer. 2^n can be + // calculated via bit manipulation or specialized scaling intrinsics, + // whereas e^g is approximated using the order-6 rational + // approximation: + // + // e^g = R(g)/R(-g) + // + // with R(x) split into even and odd terms: + // + // R(x) = Q(x^2) + x·P(x^2) + // + // so that the ratio can be computed as: + // + // e^g = 1 + 2·g·P(g^2) / (Q(g^2)-g·P(g^2)). + // + // Note that the coefficients for R are close to but not the same as those + // from the 6,6 Padé approximant to the exponential. + // + // The exponents n and g are calculated by: + // + // n = floor(x/ln(2) + 0.5) + // g = x - n·ln(2) + // + // so that x = g + n·ln(2). We have: + // + // |g| = |x - n·ln(2)| + // = |x - x + α·ln(2)| + // + // for some fraction |α| ≤ 0.5, and thus |g| ≤ 0.5ln(2) ≈ 0.347. + // + // Tne subtraction x - n·ln(2) is performed in two parts, with + // ln(2) = C1 + C2, in order to compensate for the possible loss of precision + // attributable to catastrophic rounding. C1 comprises the first + // 32-bits of mantissa, C2 the remainder. + + static __m256d exp(const __m256d& x) { + // Masks for exceptional cases. + + auto is_large = cmp_gt(x, broadcast(exp_maxarg)); + auto is_small = cmp_lt(x, broadcast(exp_minarg)); + auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q); + + // Compute n and g. 
+ + auto n = _mm256_floor_pd(add(mul(broadcast(ln2inv), x), broadcast(0.5))); + + auto g = sub(x, mul(n, broadcast(ln2C1))); + g = sub(g, mul(n, broadcast(ln2C2))); + + auto gg = mul(g, g); + + // Compute the g*P(g^2) and Q(g^2). + + auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp)); + auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp); + + // Compute R(g)/R(-g) = 1 + 2*g*P(g^2) / (Q(g^2)-g*P(g^2)) + + auto expg = add(broadcast(1), mul(broadcast(2), + div(odd, sub(even, odd)))); + + // Finally, compute product with 2^n. + // Note: can only achieve full range using the ldexp implementation, + // rather than multiplying by 2^n directly. + + auto result = ldexp_positive(expg, _mm256_cvtpd_epi32(n)); + + return + ifelse(is_large, broadcast(HUGE_VAL), + ifelse(is_small, broadcast(0), + ifelse(is_nan, broadcast(NAN), + result))); + } + + // Use same rational polynomial expansion as for exp(x), without + // the unit term. + // + // For |x|<=0.5, take n to be zero. Otherwise, set n as above, + // and scale the answer by: + // expm1(x) = 2^n * expm1(g) + (2^n - 1). + + static __m256d expm1(const __m256d& x) { + auto is_large = cmp_gt(x, broadcast(exp_maxarg)); + auto is_small = cmp_lt(x, broadcast(expm1_minarg)); + auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q); + + auto half = broadcast(0.5); + auto one = broadcast(1.); + auto two = add(one, one); + + auto nzero = cmp_leq(abs(x), half); + auto n = _mm256_floor_pd(add(mul(broadcast(ln2inv), x), half)); + n = ifelse(nzero, zero(), n); + + auto g = sub(x, mul(n, broadcast(ln2C1))); + g = sub(g, mul(n, broadcast(ln2C2))); + + auto gg = mul(g, g); + + auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp)); + auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp); + + auto expgm1 = mul(two, div(odd, sub(even, odd))); + + // For small x (n zero), bypass scaling step to avoid underflow. + // Otherwise, compute result 2^n * expgm1 + (2^n-1) by: + // result = 2 * ( 2^(n-1)*expgm1 + (2^(n-1)+0.5) ) + // to avoid overflow when n=1024. + + auto nm1 = _mm256_cvtpd_epi32(sub(n, one)); + auto scaled = mul(add(sub(exp2int(nm1), half), ldexp_normal(expgm1, nm1)), two); + + return + ifelse(is_large, broadcast(HUGE_VAL), + ifelse(is_small, broadcast(-1), + ifelse(is_nan, broadcast(NAN), + ifelse(nzero, expgm1, + scaled)))); + } + + // Natural logarithm: + // + // Decompose x = 2^g * u such that g is an integer and + // u is in the interval [sqrt(2)/2, sqrt(2)]. + // + // Then ln(x) is computed as R(u-1) + g*ln(2) where + // R(z) is a rational polynomial approximating ln(z+1) + // for small z: + // + // R(z) = z - z^2/2 + z^3 * P(z)/Q(z) + // + // where P and Q are degree 5 polynomials, Q monic. + // + // In order to avoid cancellation error, ln(2) is represented + // as C3 + C4, with the C4 correction term approx. -2.1e-4. + // The summation order for R(z)+2^g is: + // + // z^3*P(z)/Q(z) + g*C4 - z^2/2 + z + g*C3 + + static __m256d log(const __m256d& x) { + // Masks for exceptional cases. 
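+        // The unordered "not greater-or-equal zero" comparison flags both
+        // negative arguments and NaNs; it takes precedence over is_small in
+        // the final blend below.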
+ + auto is_large = cmp_geq(x, broadcast(HUGE_VAL)); + auto is_small = cmp_lt(x, broadcast(log_minarg)); + auto is_domainerr = _mm256_cmp_pd(x, broadcast(0), cmp_nge_uq); + + __m256d g = _mm256_cvtepi32_pd(logb_normal(x)); + __m256d u = fraction_normal(x); + + __m256d one = broadcast(1.); + __m256d half = broadcast(0.5); + auto gtsqrt2 = cmp_geq(u, broadcast(sqrt2)); + g = ifelse(gtsqrt2, add(g, one), g); + u = ifelse(gtsqrt2, mul(u, half), u); + + auto z = sub(u, one); + auto pz = horner(z, P0log, P1log, P2log, P3log, P4log, P5log); + auto qz = horner1(z, Q0log, Q1log, Q2log, Q3log, Q4log); + + auto z2 = mul(z, z); + auto z3 = mul(z2, z); + + auto r = div(mul(z3, pz), qz); + r = add(r, mul(g, broadcast(ln2C4))); + r = sub(r, mul(z2, half)); + r = add(r, z); + r = add(r, mul(g, broadcast(ln2C3))); + + // Return NaN if x is NaN or negarive, +inf if x is +inf, + // or -inf if zero or (positive) denormal. + + return + ifelse(is_domainerr, broadcast(NAN), + ifelse(is_large, broadcast(HUGE_VAL), + ifelse(is_small, broadcast(-HUGE_VAL), + r))); + } + +protected: + static __m256d zero() { + return _mm256_setzero_pd(); + } + + static __m128i hi_epi32(__m256i x) { + __m128i xl = _mm256_castsi256_si128(x); + __m128i xh = _mm256_extractf128_si256(x, 1); + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(xl), _mm_castsi128_ps(xh), 0xddu)); + } + + static __m128i lo_epi32(__m256i x) { + __m128i xl = _mm256_castsi256_si128(x); + __m128i xh = _mm256_extractf128_si256(x, 1); + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(xl), _mm_castsi128_ps(xh), 0x88u)); + } + + static __m256i combine_m128i(__m128i hi, __m128i lo) { + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); + } + + // horner(x, a0, ..., an) computes the degree n polynomial A(x) with coefficients + // a0, ..., an by a0+x·(a1+x·(a2+...+x·an)...). + + static inline __m256d horner(__m256d x, double a0) { + return broadcast(a0); + } + + template <typename... T> + static __m256d horner(__m256d x, double a0, T... tail) { + return add(mul(x, horner(x, tail...)), broadcast(a0)); + } + + // horner1(x, a0, ..., an) computes the degree n+1 monic polynomial A(x) with coefficients + // a0, ..., an, 1 by by a0+x·(a1+x·(a2+...+x·(an+x)...). + + static inline __m256d horner1(__m256d x, double a0) { + return add(x, broadcast(a0)); + } + + template <typename... T> + static __m256d horner1(__m256d x, double a0, T... tail) { + return add(mul(x, horner1(x, tail...)), broadcast(a0)); + } + + // Compute 2.0^n. + static __m256d exp2int(__m128i n) { + n = _mm_slli_epi32(n, 20); + n = _mm_add_epi32(n, _mm_set1_epi32(1023<<20)); + + auto nl = _mm_shuffle_epi32(n, 0x50); + auto nh = _mm_shuffle_epi32(n, 0xfa); + __m256i nhnl = combine_m128i(nh, nl); + + return _mm256_castps_pd( + _mm256_blend_ps(_mm256_set1_ps(0), + _mm256_castsi256_ps(nhnl),0xaa)); + } + + // Compute n and f such that x = 2^n·f, with |f| ∈ [1,2), given x is finite and normal. 
+ static __m128i logb_normal(const __m256d& x) { + __m128i xw = hi_epi32(_mm256_castpd_si256(x)); + __m128i emask = _mm_set1_epi32(0x7ff00000); + __m128i ebiased = _mm_srli_epi32(_mm_and_si128(xw, emask), 20); + + return _mm_sub_epi32(ebiased, _mm_set1_epi32(1023)); + } + + static __m256d fraction_normal(const __m256d& x) { + // 0x800fffffffffffff (intrinsic takes signed parameter) + __m256d emask = _mm256_castsi256_pd(_mm256_set1_epi64x(-0x7ff0000000000001)); + __m256d bias = _mm256_castsi256_pd(_mm256_set1_epi64x(0x3ff0000000000000)); + return _mm256_or_pd(bias, _mm256_and_pd(emask, x)); + } + + // Compute 2^n·x when both x and 2^n·x are normal, finite and strictly positive doubles. + static __m256d ldexp_positive(__m256d x, __m128i n) { + n = _mm_slli_epi32(n, 20); + auto zero = _mm_set1_epi32(0); + auto nl = _mm_unpacklo_epi32(zero, n); + auto nh = _mm_unpackhi_epi32(zero, n); + + __m128d xl = _mm256_castpd256_pd128(x); + __m128d xh = _mm256_extractf128_pd(x, 1); + + __m128i suml = _mm_add_epi64(nl, _mm_castpd_si128(xl)); + __m128i sumh = _mm_add_epi64(nh, _mm_castpd_si128(xh)); + __m256i sumhl = combine_m128i(sumh, suml); + + return _mm256_castsi256_pd(sumhl); + } + + // Compute 2^n·x when both x and 2^n·x are normal and finite. + static __m256d ldexp_normal(__m256d x, __m128i n) { + __m256d smask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7fffffffffffffffll)); + __m256d sbits = _mm256_andnot_pd(smask, x); + + n = _mm_slli_epi32(n, 20); + auto zi = _mm_set1_epi32(0); + auto nl = _mm_unpacklo_epi32(zi, n); + auto nh = _mm_unpackhi_epi32(zi, n); + + __m128d xl = _mm256_castpd256_pd128(x); + __m128d xh = _mm256_extractf128_pd(x, 1); + + __m128i suml = _mm_add_epi64(nl, _mm_castpd_si128(xl)); + __m128i sumh = _mm_add_epi64(nh, _mm_castpd_si128(xh)); + __m256i sumhl = combine_m128i(sumh, suml); + + auto nzans = _mm256_or_pd(_mm256_and_pd(_mm256_castsi256_pd(sumhl), smask), sbits); + return ifelse(cmp_eq(x, zero()), zero(), nzans); + } +}; + + +#if defined(__AVX2__) && defined(__FMA__) + +// Same implementation as for AVX. +using avx2_int4 = avx_int4; + +struct avx2_double4; + +template <> +struct simd_traits<avx2_double4> { + static constexpr unsigned width = 4; + using scalar_type = double; + using vector_type = __m256d; + using mask_impl = avx2_double4; +}; + +// Note: we derive from avx_double4 only as an implementation shortcut. +// Because `avx2_double4` does not derive from `implbase<avx2_double4>`, +// any fallback methods in `implbase` will use the `avx_double4` +// functions rather than the `avx2_double4` functions. + +struct avx2_double4: avx_double4 { + static __m256d fma(const __m256d& a, const __m256d& b, const __m256d& c) { + return _mm256_fmadd_pd(a, b, c); + } + + static vector_type logical_not(const vector_type& a) { + __m256i ones = {}; + return _mm256_xor_pd(a, _mm256_castsi256_pd(_mm256_cmpeq_epi32(ones, ones))); + } + + static __m256d mask_unpack(unsigned long long k) { + // Only care about bottom four bits of k. + __m256i b = _mm256_set1_epi8((char)k); + b = _mm256_or_si256(b, _mm256_setr_epi64x( + 0xfefefefefefefefe, 0xfdfdfdfdfdfdfdfd, + 0xfbfbfbfbfbfbfbfb, 0xf7f7f7f7f7f7f7f7)); + + __m256i ones = {}; + ones = _mm256_cmpeq_epi64(ones, ones); + return _mm256_castsi256_pd(_mm256_cmpeq_epi64(ones, b)); + } + + static void mask_copy_to(const __m256d& m, bool* y) { + // Convert to 32-bit wide mask values, and delegate to + // avx2_int4. 
+ + avx_int4::mask_copy_to(lo_epi32(_mm256_castpd_si256(m)), y); + } + + static __m256d mask_copy_from(const bool* w) { + __m256i zero = _mm256_setzero_si256(); + + __m128i r; + std::memcpy(&r, w, 4); + return _mm256_castsi256_pd(_mm256_sub_epi64(zero, _mm256_cvtepi8_epi64(r))); + } + + static __m256d gather(avx2_int4, const double* p, const __m128i& index) { + return _mm256_i32gather_pd(p, index, 8); + } + + static __m256d gather(avx2_int4, __m256d a, const double* p, const __m128i& index, const __m256d& mask) { + return _mm256_mask_i32gather_pd(a, p, index, mask, 8); + }; + + // avx4_double4 versions of log, exp, and expm1 use the same algorithms as for avx_double4, + // but use AVX2-specialized bit manipulation and FMA. + + static __m256d exp(const __m256d& x) { + // Masks for exceptional cases. + + auto is_large = cmp_gt(x, broadcast(exp_maxarg)); + auto is_small = cmp_lt(x, broadcast(exp_minarg)); + auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q); + + // Compute n and g. + + auto n = _mm256_floor_pd(fma(broadcast(ln2inv), x, broadcast(0.5))); + + auto g = fma(n, broadcast(-ln2C1), x); + g = fma(n, broadcast(-ln2C2), g); + + auto gg = mul(g, g); + + // Compute the g*P(g^2) and Q(g^2). + + auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp)); + auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp); + + // Compute R(g)/R(-g) = 1 + 2*g*P(g^2) / (Q(g^2)-g*P(g^2)) + + auto expg = fma(broadcast(2), div(odd, sub(even, odd)), broadcast(1)); + + // Finally, compute product with 2^n. + // Note: can only achieve full range using the ldexp implementation, + // rather than multiplying by 2^n directly. + + auto result = ldexp_positive(expg, _mm256_cvtpd_epi32(n)); + + return + ifelse(is_large, broadcast(HUGE_VAL), + ifelse(is_small, broadcast(0), + ifelse(is_nan, broadcast(NAN), + result))); + } + + static __m256d expm1(const __m256d& x) { + auto is_large = cmp_gt(x, broadcast(exp_maxarg)); + auto is_small = cmp_lt(x, broadcast(expm1_minarg)); + auto is_nan = _mm256_cmp_pd(x, x, cmp_unord_q); + + auto half = broadcast(0.5); + auto one = broadcast(1.); + auto two = add(one, one); + + auto smallx = cmp_leq(abs(x), half); + auto n = _mm256_floor_pd(fma(broadcast(ln2inv), x, half)); + n = ifelse(smallx, zero(), n); + + auto g = fma(n, broadcast(-ln2C1), x); + g = fma(n, broadcast(-ln2C2), g); + + auto gg = mul(g, g); + + auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp)); + auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp); + + auto expgm1 = mul(two, div(odd, sub(even, odd))); + + auto nm1 = _mm256_cvtpd_epi32(sub(n, one)); + auto scaled = mul(add(sub(exp2int(nm1), half), ldexp_normal(expgm1, nm1)), two); + + return + ifelse(is_large, broadcast(HUGE_VAL), + ifelse(is_small, broadcast(-1), + ifelse(is_nan, broadcast(NAN), + ifelse(smallx, expgm1, + scaled)))); + } + + static __m256d log(const __m256d& x) { + // Masks for exceptional cases. 
+ + auto is_large = cmp_geq(x, broadcast(HUGE_VAL)); + auto is_small = cmp_lt(x, broadcast(log_minarg)); + auto is_domainerr = _mm256_cmp_pd(x, broadcast(0), cmp_nge_uq); + + __m256d g = _mm256_cvtepi32_pd(logb_normal(x)); + __m256d u = fraction_normal(x); + + __m256d one = broadcast(1.); + __m256d half = broadcast(0.5); + auto gtsqrt2 = cmp_geq(u, broadcast(sqrt2)); + g = ifelse(gtsqrt2, add(g, one), g); + u = ifelse(gtsqrt2, mul(u, half), u); + + auto z = sub(u, one); + auto pz = horner(z, P0log, P1log, P2log, P3log, P4log, P5log); + auto qz = horner1(z, Q0log, Q1log, Q2log, Q3log, Q4log); + + auto z2 = mul(z, z); + auto z3 = mul(z2, z); + + auto r = div(mul(z3, pz), qz); + r = fma(g, broadcast(ln2C4), r); + r = fms(z2, half, r); + r = sub(z, r); + r = fma(g, broadcast(ln2C3), r); + + // Return NaN if x is NaN or negative, +inf if x is +inf, + // or -inf if zero or (positive) denormal. + + return + ifelse(is_domainerr, broadcast(NAN), + ifelse(is_large, broadcast(HUGE_VAL), + ifelse(is_small, broadcast(-HUGE_VAL), + r))); + } + +protected: + static __m128i lo_epi32(__m256i a) { + a = _mm256_shuffle_epi32(a, 0x08); + a = _mm256_permute4x64_epi64(a, 0x08); + return _mm256_castsi256_si128(a); + } + + static __m128i hi_epi32(__m256i a) { + a = _mm256_shuffle_epi32(a, 0x0d); + a = _mm256_permute4x64_epi64(a, 0x08); + return _mm256_castsi256_si128(a); + } + + static inline __m256d horner(__m256d x, double a0) { + return broadcast(a0); + } + + template <typename... T> + static __m256d horner(__m256d x, double a0, T... tail) { + return fma(x, horner(x, tail...), broadcast(a0)); + } + + static inline __m256d horner1(__m256d x, double a0) { + return add(x, broadcast(a0)); + } + + template <typename... T> + static __m256d horner1(__m256d x, double a0, T... tail) { + return fma(x, horner1(x, tail...), broadcast(a0)); + } + + static __m256d fms(const __m256d& a, const __m256d& b, const __m256d& c) { + return _mm256_fmsub_pd(a, b, c); + } + + // Compute 2.0^n. + // Overrides avx_double4::exp2int. + static __m256d exp2int(__m128i n) { + __m256d x = broadcast(1); + __m256i nshift = _mm256_slli_epi64(_mm256_cvtepi32_epi64(n), 52); + __m256i sum = _mm256_add_epi64(nshift, _mm256_castpd_si256(x)); + return _mm256_castsi256_pd(sum); + } + + // Compute 2^n*x when both x and 2^n*x are normal, finite and strictly positive doubles. + // Overrides avx_double4::ldexp_positive. + static __m256d ldexp_positive(__m256d x, __m128i n) { + __m256i nshift = _mm256_slli_epi64(_mm256_cvtepi32_epi64(n), 52); + __m256i sum = _mm256_add_epi64(nshift, _mm256_castpd_si256(x)); + return _mm256_castsi256_pd(sum); + } + + // Compute 2^n*x when both x and 2^n*x are normal and finite. + // Overrides avx_double4::ldexp_normal. + static __m256d ldexp_normal(__m256d x, __m128i n) { + __m256d smask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7fffffffffffffffll)); + __m256d sbits = _mm256_andnot_pd(smask, x); + __m256i nshift = _mm256_slli_epi64(_mm256_cvtepi32_epi64(n), 52); + __m256i sum = _mm256_add_epi64(nshift, _mm256_castpd_si256(x)); + + auto nzans = _mm256_or_pd(_mm256_and_pd(_mm256_castsi256_pd(sum), smask), sbits); + return ifelse(cmp_eq(x, zero()), zero(), nzans); + } + + // Override avx_double4::logb_normal so as to use avx2 version of hi_epi32. 
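+    // logb_normal extracts the biased exponent field from the high 32 bits of
+    // each double and subtracts the bias (1023), giving floor(log2|x|) for
+    // normal, finite, non-zero x.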
+ static __m128i logb_normal(const __m256d& x) { + __m128i xw = hi_epi32(_mm256_castpd_si256(x)); + __m128i emask = _mm_set1_epi32(0x7ff00000); + __m128i ebiased = _mm_srli_epi32(_mm_and_si128(xw, emask), 20); + + return _mm_sub_epi32(ebiased, _mm_set1_epi32(1023)); + } +}; +#endif // defined(__AVX2__) && defined(__FMA__) + +} // namespace simd_detail + +namespace simd_abi { + template <typename T, unsigned N> struct avx; + + template <> struct avx<int, 4> { using type = simd_detail::avx_int4; }; + template <> struct avx<double, 4> { using type = simd_detail::avx_double4; }; + +#if defined(__AVX2__) && defined(__FMA__) + template <typename T, unsigned N> struct avx2; + + template <> struct avx2<int, 4> { using type = simd_detail::avx2_int4; }; + template <> struct avx2<double, 4> { using type = simd_detail::avx2_double4; }; +#endif +} // namespace simd_abi + +} // namespace simd +} // namespace arb + +#endif // def __AVX__ diff --git a/src/simd/avx512.hpp b/src/simd/avx512.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f956bb00e9e96dfae90a259eb78cf0d487cf3664 --- /dev/null +++ b/src/simd/avx512.hpp @@ -0,0 +1,712 @@ +#pragma once + +// AVX512F SIMD intrinsics implementation. + +#ifdef __AVX512F__ + +#include <cmath> +#include <cstdint> +#include <immintrin.h> + +#include <simd/approx.hpp> +#include <simd/implbase.hpp> + +namespace arb { +namespace simd { +namespace simd_detail { + +struct avx512_double8; +struct avx512_int8; +struct avx512_mask8; + +template <> +struct simd_traits<avx512_mask8> { + static constexpr unsigned width = 8; + using scalar_type = bool; + using vector_type = __mmask8; + using mask_impl = avx512_mask8; +}; + +template <> +struct simd_traits<avx512_double8> { + static constexpr unsigned width = 8; + using scalar_type = double; + using vector_type = __m512d; + using mask_impl = avx512_mask8; +}; + +template <> +struct simd_traits<avx512_int8> { + static constexpr unsigned width = 8; + using scalar_type = std::int32_t; + using vector_type = __m512i; + using mask_impl = avx512_mask8; +}; + +struct avx512_mask8: implbase<avx512_mask8> { + static __mmask8 broadcast(bool b) { + return _mm512_int2mask(-b); + } + + static void copy_to(const __mmask8& k, bool* b) { + __m256i a = _mm256_setzero_si256(); + a = _mm512_castsi512_si256(_mm512_mask_set1_epi32(_mm512_castsi256_si512(a), k, 1)); + + __m256i s = _mm256_set1_epi32(0x0c080400); + a = _mm256_shuffle_epi8(a, s); + + s = _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0); + a = _mm256_permutevar8x32_epi32(a, s); + + std::memcpy(b, &a, 8); + } + + static __mmask8 copy_from(const bool* p) { + __m256i a = _mm256_setzero_si256(); + std::memcpy(&a, p, 8); + a = _mm256_sub_epi8(_mm256_setzero_si256(), a); + return _mm512_int2mask(_mm256_movemask_epi8(a)); + } + + // Note: fall back to implbase implementations of copy_to_masked and copy_from_masked; + // could be improved with the use of AVX512BW instructions on supported platforms. 
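+    // Bit i of the __mmask8 value corresponds to lane i; for example,
+    // mask_unpack(0b00000101) gives a mask that is true in lanes 0 and 2 only.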
+ + static __mmask8 logical_not(const __mmask8& k) { + return _mm512_knot(k); + } + + static __mmask8 logical_and(const __mmask8& a, const __mmask8& b) { + return _mm512_kand(a, b); + } + + static __mmask8 logical_or(const __mmask8& a, const __mmask8& b) { + return _mm512_kor(a, b); + } + + // Arithmetic operations not necessarily appropriate for + // packed bit mask, but implemented for completeness/testing, + // with Z modulo 2 semantics: + // a + b is equivalent to a ^ b + // a * b a & b + // a / b a + // a - b a ^ b + // -a a + // max(a, b) a | b + // min(a, b) a & b + + static __mmask8 negate(const __mmask8& a) { + return a; + } + + static __mmask8 add(const __mmask8& a, const __mmask8& b) { + return _mm512_kxor(a, b); + } + + static __mmask8 sub(const __mmask8& a, const __mmask8& b) { + return _mm512_kxor(a, b); + } + + static __mmask8 mul(const __mmask8& a, const __mmask8& b) { + return _mm512_kand(a, b); + } + + static __mmask8 div(const __mmask8& a, const __mmask8& b) { + return a; + } + + static __mmask8 fma(const __mmask8& a, const __mmask8& b, const __mmask8& c) { + return add(mul(a, b), c); + } + + static __mmask8 max(const __mmask8& a, const __mmask8& b) { + return _mm512_kor(a, b); + } + + static __mmask8 min(const __mmask8& a, const __mmask8& b) { + return _mm512_kand(a, b); + } + + // Comparison operators are also taken as operating on Z modulo 2, + // with 1 > 0: + // + // a > b is equivalent to a & ~b + // a >= b a | ~b, ~(~a & b) + // a < b ~a & b + // a <= b ~a | b, ~(a & ~b) + // a == b ~(a ^ b) + // a != b a ^ b + + static __mmask8 cmp_eq(const __mmask8& a, const __mmask8& b) { + return _mm512_kxnor(a, b); + } + + static __mmask8 cmp_neq(const __mmask8& a, const __mmask8& b) { + return _mm512_kxor(a, b); + } + + static __mmask8 cmp_lt(const __mmask8& a, const __mmask8& b) { + return _mm512_kandn(a, b); + } + + static __mmask8 cmp_gt(const __mmask8& a, const __mmask8& b) { + return cmp_lt(b, a); + } + + static __mmask8 cmp_geq(const __mmask8& a, const __mmask8& b) { + return logical_not(cmp_lt(a, b)); + } + + static __mmask8 cmp_leq(const __mmask8& a, const __mmask8& b) { + return logical_not(cmp_gt(a, b)); + } + + static __mmask8 ifelse(const __mmask8& m, const __mmask8& u, const __mmask8& v) { + return _mm512_kor(_mm512_kandn(m, u), _mm512_kand(m, v)); + } + + static bool element(const __mmask8& k, int i) { + return _mm512_mask2int(k)&(1<<i); + } + + static void set_element(__mmask8& k, int i, bool b) { + int n = _mm512_mask2int(k); + k = _mm512_int2mask((n&~(1<<i))|(b<<i)); + } + + static __mmask8 mask_broadcast(bool b) { + return broadcast(b); + } + + static __mmask8 mask_unpack(unsigned long long p) { + return _mm512_int2mask(p); + } + + static bool mask_element(const __mmask8& u, int i) { + return element(u, i); + } + + static void mask_set_element(__mmask8& u, int i, bool b) { + set_element(u, i, b); + } + + static void mask_copy_to(const __mmask8& m, bool* y) { + copy_to(m, y); + } + + static __mmask8 mask_copy_from(const bool* y) { + return copy_from(y); + } +}; + +struct avx512_int8: implbase<avx512_int8> { + // Use default implementations for: + // element, set_element. + // + // Consider changing mask representation to __mmask16 + // and be explicit about comparison masks: restrictions + // to __mmask8 seem to produce a lot of ultimately unnecessary + // operations. 
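+    // Only the low eight 32-bit lanes of the 512-bit register are used; loads
+    // and stores are masked with lo() (0xff) so the upper lanes are never
+    // touched.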
+ + using int32 = std::int32_t; + + static __mmask8 lo() { + return _mm512_int2mask(0xff); + } + + static __m512i broadcast(int32 v) { + return _mm512_set1_epi32(v); + } + + static void copy_to(const __m512i& v, int32* p) { + _mm512_mask_storeu_epi32(p, lo(), v); + } + + static void copy_to_masked(const __m512i& v, int32* p, const __mmask8& mask) { + _mm512_mask_storeu_epi32(p, mask, v); + } + + static __m512i copy_from(const int32* p) { + return _mm512_maskz_loadu_epi32(lo(), p); + } + + static __m512i copy_from_masked(const int32* p, const __mmask8& mask) { + return _mm512_maskz_loadu_epi32(mask, p); + } + + static __m512i copy_from_masked(const __m512i& v, const int32* p, const __mmask8& mask) { + return _mm512_mask_loadu_epi32(v, mask, p); + } + + static __m512i negate(const __m512i& a) { + return sub(_mm512_setzero_epi32(), a); + } + + static __m512i add(const __m512i& a, const __m512i& b) { + return _mm512_add_epi32(a, b); + } + + static __m512i sub(const __m512i& a, const __m512i& b) { + return _mm512_sub_epi32(a, b); + } + + static __m512i mul(const __m512i& a, const __m512i& b) { + // Can represent 32-bit exactly in double, and technically overflow is + // undefined behaviour, so we can do this in doubles. + constexpr int32 rtz = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC; + auto da = _mm512_cvtepi32_pd(_mm512_castsi512_si256(a)); + auto db = _mm512_cvtepi32_pd(_mm512_castsi512_si256(b)); + auto fpmul = _mm512_mul_round_pd(da, db, rtz); + return _mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(fpmul, rtz)); + } + + static __m512i div(const __m512i& a, const __m512i& b) { + // Can represent 32-bit exactly in double, so do a fp division with fixed rounding. + constexpr int32 rtz = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC; + auto da = _mm512_cvtepi32_pd(_mm512_castsi512_si256(a)); + auto db = _mm512_cvtepi32_pd(_mm512_castsi512_si256(b)); + auto fpdiv = _mm512_div_round_pd(da, db, rtz); + return _mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(fpdiv, rtz)); + } + + static __m512i fma(const __m512i& a, const __m512i& b, const __m512i& c) { + return add(mul(a, b), c); + } + + static __mmask8 cmp_eq(const __m512i& a, const __m512i& b) { + return _mm512_cmpeq_epi32_mask(a, b); + } + + static __mmask8 cmp_neq(const __m512i& a, const __m512i& b) { + return _mm512_cmpneq_epi32_mask(a, b); + } + + static __mmask8 cmp_gt(const __m512i& a, const __m512i& b) { + return _mm512_cmpgt_epi32_mask(a, b); + } + + static __mmask8 cmp_geq(const __m512i& a, const __m512i& b) { + return _mm512_cmpge_epi32_mask(a, b); + } + + static __mmask8 cmp_lt(const __m512i& a, const __m512i& b) { + return _mm512_cmplt_epi32_mask(a, b); + } + + static __mmask8 cmp_leq(const __m512i& a, const __m512i& b) { + return _mm512_cmple_epi32_mask(a, b); + } + + static __m512i ifelse(const __mmask8& m, const __m512i& u, const __m512i& v) { + return _mm512_mask_blend_epi32(m, v, u); + } + + static __m512i max(const __m512i& a, const __m512i& b) { + return _mm512_max_epi32(a, b); + } + + static __m512i min(const __m512i& a, const __m512i& b) { + return _mm512_min_epi32(a, b); + } + + static __m512i abs(const __m512i& a) { + return _mm512_abs_epi32(a); + } + + // Generic 8-wide int solutions for gather and scatter. 
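+    // The index vector of an arbitrary 8-wide integer SIMD implementation is
+    // spilled to a scalar buffer and reloaded as a __m512i for the AVX512
+    // gather/scatter intrinsics; the avx512_int8 overloads further below avoid
+    // this round trip.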
+ + template <typename Impl> + using is_int8_simd = std::integral_constant<bool, std::is_same<int, typename Impl::scalar_type>::value && Impl::width==8>; + + template <typename ImplIndex, + typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type> + static __m512i gather(ImplIndex, const int32* p, const typename ImplIndex::vector_type& index) { + int32 o[16]; + ImplIndex::copy_to(index, o); + auto op = reinterpret_cast<const __m512i*>(o); + return _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), lo(), _mm512_loadu_si512(op), p, 4); + } + + template <typename ImplIndex, + typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type> + static __m512i gather(ImplIndex, __m512i a, const int32* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) { + int32 o[16]; + ImplIndex::copy_to(index, o); + auto op = reinterpret_cast<const __m512i*>(o); + return _mm512_mask_i32gather_epi32(a, mask, _mm512_loadu_si512(op), p, 4); + } + + template <typename ImplIndex, + typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type> + static void scatter(ImplIndex, const __m512i& s, int32* p, const typename ImplIndex::vector_type& index) { + int32 o[16]; + ImplIndex::copy_to(index, o); + auto op = reinterpret_cast<const __m512i*>(o); + _mm512_mask_i32scatter_epi32(p, lo(), _mm512_loadu_si512(op), s, 4); + } + + template <typename ImplIndex, + typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type> + static void scatter(ImplIndex, const __m512i& s, int32* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) { + int32 o[16]; + ImplIndex::copy_to(index, o); + auto op = reinterpret_cast<const __m512i*>(o); + _mm512_mask_i32scatter_epi32(p, mask, _mm512_loadu_si512(op), s, 4); + } + + // Specialized 8-wide gather and scatter for avx512_int8 implementation. + + static __m512i gather(avx512_int8, const int32* p, const __m512i& index) { + return _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), lo(), index, p, 4); + } + + static __m512i gather(avx512_int8, __m512i a, const int32* p, const __m512i& index, const __mmask8& mask) { + return _mm512_mask_i32gather_epi32(a, mask, index, p, 4); + } + + static void scatter(avx512_int8, const __m512i& s, int32* p, const __m512i& index) { + _mm512_mask_i32scatter_epi32(p, lo(), index, s, 4); + } + + static void scatter(avx512_int8, const __m512i& s, int32* p, const __m512i& index, const __mmask8& mask) { + _mm512_mask_i32scatter_epi32(p, mask, index, s, 4); + } +}; + +struct avx512_double8: implbase<avx512_double8> { + // Use default implementations for: + // element, set_element. 
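+    // Unlike avx512_int8, all eight lanes of the 512-bit register hold data,
+    // so unmasked loads and stores can be used directly.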
+ + // CMPPD predicates: + static constexpr int cmp_eq_oq = 0; + static constexpr int cmp_unord_q = 3; + static constexpr int cmp_neq_uq = 4; + static constexpr int cmp_true_uq = 15; + static constexpr int cmp_lt_oq = 17; + static constexpr int cmp_le_oq = 18; + static constexpr int cmp_nge_uq = 25; + static constexpr int cmp_ge_oq = 29; + static constexpr int cmp_gt_oq = 30; + + static __m512d broadcast(double v) { + return _mm512_set1_pd(v); + } + + static void copy_to(const __m512d& v, double* p) { + _mm512_storeu_pd(p, v); + } + + static void copy_to_masked(const __m512d& v, double* p, const __mmask8& mask) { + _mm512_mask_storeu_pd(p, mask, v); + } + + static __m512d copy_from(const double* p) { + return _mm512_loadu_pd(p); + } + + static __m512d copy_from_masked(const double* p, const __mmask8& mask) { + return _mm512_maskz_loadu_pd(mask, p); + } + + static __m512d copy_from_masked(const __m512d& v, const double* p, const __mmask8& mask) { + return _mm512_mask_loadu_pd(v, mask, p); + } + + static __m512d negate(const __m512d& a) { + return _mm512_sub_pd(_mm512_setzero_pd(), a); + } + + static __m512d add(const __m512d& a, const __m512d& b) { + return _mm512_add_pd(a, b); + } + + static __m512d sub(const __m512d& a, const __m512d& b) { + return _mm512_sub_pd(a, b); + } + + static __m512d mul(const __m512d& a, const __m512d& b) { + return _mm512_mul_pd(a, b); + } + + static __m512d div(const __m512d& a, const __m512d& b) { + return _mm512_div_pd(a, b); + } + + static __m512d fma(const __m512d& a, const __m512d& b, const __m512d& c) { + return _mm512_fmadd_pd(a, b, c); + } + + static __mmask8 cmp_eq(const __m512d& a, const __m512d& b) { + return _mm512_cmp_pd_mask(a, b, cmp_eq_oq); + } + + static __mmask8 cmp_neq(const __m512d& a, const __m512d& b) { + return _mm512_cmp_pd_mask(a, b, cmp_neq_uq); + } + + static __mmask8 cmp_gt(const __m512d& a, const __m512d& b) { + return _mm512_cmp_pd_mask(a, b, cmp_gt_oq); + } + + static __mmask8 cmp_geq(const __m512d& a, const __m512d& b) { + return _mm512_cmp_pd_mask(a, b, cmp_ge_oq); + } + + static __mmask8 cmp_lt(const __m512d& a, const __m512d& b) { + return _mm512_cmp_pd_mask(a, b, cmp_lt_oq); + } + + static __mmask8 cmp_leq(const __m512d& a, const __m512d& b) { + return _mm512_cmp_pd_mask(a, b, cmp_le_oq); + } + + static __m512d ifelse(const __mmask8& m, const __m512d& u, const __m512d& v) { + return _mm512_mask_blend_pd(m, v, u); + } + + static __m512d max(const __m512d& a, const __m512d& b) { + return _mm512_max_pd(a, b); + } + + static __m512d min(const __m512d& a, const __m512d& b) { + return _mm512_min_pd(a, b); + } + + static __m512d abs(const __m512d& x) { + // (NB: _mm512_abs_pd() wrapper buggy on gcc, so use AND directly.) + __m512i m = _mm512_set1_epi64(0x7fffffffffffffff); + return _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(x), m)); + } + + // Generic 8-wide int solutions for gather and scatter. 
+ + template <typename Impl> + using is_int8_simd = std::integral_constant<bool, std::is_same<int, typename Impl::scalar_type>::value && Impl::width==8>; + + template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type> + static __m512d gather(ImplIndex, const double* p, const typename ImplIndex::vector_type& index) { + int o[8]; + ImplIndex::copy_to(index, o); + auto op = reinterpret_cast<const __m256i*>(o); + return _mm512_i32gather_pd(_mm256_loadu_si256(op), p, 8); + } + + template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type> + static __m512d gather(ImplIndex, __m512d a, const double* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) { + int o[8]; + ImplIndex::copy_to(index, o); + auto op = reinterpret_cast<const __m256i*>(o); + return _mm512_mask_i32gather_pd(a, mask, _mm256_loadu_si256(op), p, 8); + } + + template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type> + static void scatter(ImplIndex, const __m512d& s, double* p, const typename ImplIndex::vector_type& index) { + int o[8]; + ImplIndex::copy_to(index, o); + auto op = reinterpret_cast<const __m256i*>(o); + _mm512_i32scatter_pd(p, _mm256_loadu_si256(op), s, 8); + } + + template <typename ImplIndex, typename = typename std::enable_if<is_int8_simd<ImplIndex>::value>::type> + static void scatter(ImplIndex, const __m512d& s, double* p, const typename ImplIndex::vector_type& index, const __mmask8& mask) { + int o[8]; + ImplIndex::copy_to(index, o); + auto op = reinterpret_cast<const __m256i*>(o); + _mm512_mask_i32scatter_pd(p, mask, _mm256_loadu_si256(op), s, 8); + } + + // Specialized 8-wide gather and scatter for avx512_int8 implementation. + + static __m512d gather(avx512_int8, const double* p, const __m512i& index) { + return _mm512_i32gather_pd(_mm512_castsi512_si256(index), p, 8); + } + + static __m512d gather(avx512_int8, __m512d a, const double* p, const __m512i& index, const __mmask8& mask) { + return _mm512_mask_i32gather_pd(a, mask, _mm512_castsi512_si256(index), p, 8); + } + + static void scatter(avx512_int8, const __m512d& s, double* p, const __m512i& index) { + _mm512_i32scatter_pd(p, _mm512_castsi512_si256(index), s, 8); + } + + static void scatter(avx512_int8, const __m512d& s, double* p, const __m512i& index, const __mmask8& mask) { + _mm512_mask_i32scatter_pd(p, mask, _mm512_castsi512_si256(index), s, 8); + } + + // Use SVML for exp and log if compiling with icpc, else use ratpoly + // approximations. + +#if defined(__INTEL_COMPILER) + static __m512d exp(const __m512d& x) { + return _mm512_exp_pd(x); + } + + static __m512d expm1(const __m512d& x) { + return _mm512_expm1_pd(x); + } + + static __m512d log(const __m512d& x) { + return _mm512_log_pd(x); + } + + static __m512d pow(const __m512d& x, const __m512d& y) { + return _mm512_pow_pd(x, y); + } +#else + + // Refer to avx/avx2 code for details of the exponential and log + // implementations. + + static __m512d exp(const __m512d& x) { + // Masks for exceptional cases. + + auto is_large = cmp_gt(x, broadcast(exp_maxarg)); + auto is_small = cmp_lt(x, broadcast(exp_minarg)); + + // Compute n and g. + + auto n = _mm512_floor_pd(add(mul(broadcast(ln2inv), x), broadcast(0.5))); + + auto g = fma(n, broadcast(-ln2C1), x); + g = fma(n, broadcast(-ln2C2), g); + + auto gg = mul(g, g); + + // Compute the g*P(g^2) and Q(g^2). 
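+        // These are the odd and even parts of R(g) = Q(g^2) + g*P(g^2);
+        // exp(g) is then approximated by R(g)/R(-g) = 1 + 2*odd/(even-odd).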
+ + auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp)); + auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp); + + // Compute R(g)/R(-g) = 1 + 2*g*P(g^2) / (Q(g^2)-g*P(g^2)) + + auto expg = fma(broadcast(2), div(odd, sub(even, odd)), broadcast(1)); + + // Scale by 2^n, propogating NANs. + + auto result = _mm512_scalef_pd(expg, n); + + return + ifelse(is_large, broadcast(HUGE_VAL), + ifelse(is_small, broadcast(0), + result)); + } + + static __m512d expm1(const __m512d& x) { + auto is_large = cmp_gt(x, broadcast(exp_maxarg)); + auto is_small = cmp_lt(x, broadcast(expm1_minarg)); + + auto half = broadcast(0.5); + auto one = broadcast(1.); + + auto nnz = cmp_gt(abs(x), half); + auto n = _mm512_maskz_roundscale_round_pd( + nnz, + mul(broadcast(ln2inv), x), + 0, + _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); + + auto g = fma(n, broadcast(-ln2C1), x); + g = fma(n, broadcast(-ln2C2), g); + + auto gg = mul(g, g); + + auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp)); + auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp); + + // Compute R(g)/R(-g) -1 = 2*g*P(g^2) / (Q(g^2)-g*P(g^2)) + + auto expgm1 = mul(broadcast(2), div(odd, sub(even, odd))); + + // For small x (n zero), bypass scaling step to avoid underflow. + // Otherwise, compute result 2^n * expgm1 + (2^n-1) by: + // result = 2 * ( 2^(n-1)*expgm1 + (2^(n-1)+0.5) ) + // to avoid overflow when n=1024. + + auto nm1 = sub(n, one); + + auto result = + _mm512_scalef_pd( + add(sub(_mm512_scalef_pd(one, nm1), half), + _mm512_scalef_pd(expgm1, nm1)), + one); + + return + ifelse(is_large, broadcast(HUGE_VAL), + ifelse(is_small, broadcast(-1), + ifelse(nnz, result, expgm1))); + } + + static __m512d log(const __m512d& x) { + // Masks for exceptional cases. + + auto is_large = cmp_geq(x, broadcast(HUGE_VAL)); + auto is_small = cmp_lt(x, broadcast(log_minarg)); + is_small = avx512_mask8::logical_and(is_small, cmp_geq(x, broadcast(0))); + + __m512d g = _mm512_getexp_pd(x); + __m512d u = _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_nan); + + __m512d one = broadcast(1.); + __m512d half = broadcast(0.5); + auto gtsqrt2 = cmp_geq(u, broadcast(sqrt2)); + g = ifelse(gtsqrt2, add(g, one), g); + u = ifelse(gtsqrt2, mul(u, half), u); + + auto z = sub(u, one); + auto pz = horner(z, P0log, P1log, P2log, P3log, P4log, P5log); + auto qz = horner1(z, Q0log, Q1log, Q2log, Q3log, Q4log); + + auto z2 = mul(z, z); + auto z3 = mul(z2, z); + + auto r = div(mul(z3, pz), qz); + r = fma(g, broadcast(ln2C4), r); + r = fms(z2, half, r); + r = sub(z, r); + r = fma(g, broadcast(ln2C3), r); + + // r is alrady NaN if x is NaN or negative, otherwise + // return +inf if x is +inf, or -inf if zero or (positive) denormal. + + return + ifelse(is_large, broadcast(HUGE_VAL), + ifelse(is_small, broadcast(-HUGE_VAL), + r)); + } +#endif + +protected: + static inline __m512d horner1(__m512d x, double a0) { + return add(x, broadcast(a0)); + } + + static inline __m512d horner(__m512d x, double a0) { + return broadcast(a0); + } + + template <typename... T> + static __m512d horner(__m512d x, double a0, T... tail) { + return fma(x, horner(x, tail...), broadcast(a0)); + } + + template <typename... T> + static __m512d horner1(__m512d x, double a0, T... 
tail) { + return fma(x, horner1(x, tail...), broadcast(a0)); + } + + static __m512d fms(const __m512d& a, const __m512d& b, const __m512d& c) { + return _mm512_fmsub_pd(a, b, c); + } +}; + +} // namespace simd_detail + +namespace simd_abi { + template <typename T, unsigned N> struct avx512; + template <> struct avx512<double, 8> { using type = simd_detail::avx512_double8; }; + template <> struct avx512<int, 8> { using type = simd_detail::avx512_int8; }; +} // namespace simd_abi + +} // namespace simd +} // namespace arb + +#endif // def __AVX512F__ diff --git a/src/simd/generic.hpp b/src/simd/generic.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e8ca5524b8ce5511d815a4483facbcf4c4d53cda --- /dev/null +++ b/src/simd/generic.hpp @@ -0,0 +1,68 @@ +#pragma once + +#include <array> +#include <cstring> +#include <cmath> + +#include <simd/implbase.hpp> + +namespace arb { +namespace simd { +namespace simd_detail { + +template <typename T, unsigned N> +struct generic; + +template <typename T, unsigned N> +struct simd_traits<generic<T, N>> { + static constexpr unsigned width = N; + using scalar_type = T; + using vector_type = std::array<T, N>; + using mask_impl = generic<bool, N>; +}; + +template <typename T, unsigned N> +struct generic: implbase<generic<T, N>> { + using array = std::array<T, N>; + + static array copy_from(const T* p) { + array result; + std::memcpy(&result, p, sizeof(result)); + return result; + } + + static void copy_to(const array& v, T* p) { + std::memcpy(p, &v, sizeof(v)); + } + + static void mask_copy_to(const array& v, bool* w) { + std::copy(v.begin(), v.end(), w); + } + + static array mask_copy_from(const bool* y) { + array v; + std::copy(y, y+N, v.data()); + return v; + } + + static bool mask_element(const array& v, int i) { + return static_cast<bool>(v[i]); + } + + static void mask_set_element(array& v, int i, bool x) { + v[i] = x; + } +}; + +} // namespace simd_detail + +namespace simd_abi { + + template <typename T, unsigned N> struct generic { + using type = simd_detail::generic<T, N>; + }; + +} // namespace simd_abi + +} // namespace simd +} // namespace arb diff --git a/src/simd/implbase.hpp b/src/simd/implbase.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f5b968f95fe834c67a2a8b87e1f47fb9cf041bc4 --- /dev/null +++ b/src/simd/implbase.hpp @@ -0,0 +1,478 @@ +#pragma once + +// Base class (via CRTP) for concrete implementation +// classes, with default implementations based on +// copy_to/copy_from. +// +// Also provides simd_detail::simd_traits type map. +// +// Maths functions are implemented in terms of +// arithmetic primitives or lane-wise invocation of +// std::math functions; specialized implementations +// should be provided by the concrete implementation +// where it is more efficient: +// +// Function | Default implemention by +// ---------------------------------- +// min | negate, cmp_gt, ifelse +// max | negate, cmp_gt, ifelse +// abs | negate, max +// sin | lane-wise std::sin +// cos | lane-wise std::cos +// exp | lane-wise std::exp +// log | lane-wise std::log +// pow | lane-wise std::pow +// expm1 | lane-wise std::expm1 +// exprelr | expm1, div, add, cmp_eq, ifelse +// +// 'exprelr' is the function x ↦ x/(exp(x)-1). + +#include <cstring> +#include <cmath> +#include <algorithm> +#include <iterator> + +// Derived class I must at minimum provide: +// +// * specialization of simd_traits. 
+// +// * implementations (static) for copy_to, copy_from: +// +// void I::copy_to(const vector_type&, scalar_type*) +// vector_type I::copy_from(const scalar_type*) +// +// void I::mask_copy_to(const vector_type&, scalar_type*) +// vector_type I::mask_copy_from(const bool*) +// +// * implementations (static) for mask element get/set: +// +// bool I::mask_element(const vector_type& v, int i); +// void I::mask_set_element(vector_type& v, int i, bool x); + +namespace arb { +namespace simd { +namespace simd_detail { + +// The simd_traits class provides the mapping between a concrete SIMD +// implementation I and its associated classes. This must be specialized +// for each concrete implementation. + +template <typename I> +struct simd_traits { + static constexpr unsigned width = 0; + using scalar_type = void; + using vector_type = void; + using mask_impl = void; +}; + +template <typename I> +struct implbase { + constexpr static unsigned width = simd_traits<I>::width; + using scalar_type = typename simd_traits<I>::scalar_type; + using vector_type = typename simd_traits<I>::vector_type; + + using mask_impl = typename simd_traits<I>::mask_impl; + using mask_type = typename simd_traits<mask_impl>::vector_type; + + using store = scalar_type[width]; + using mask_store = bool[width]; + + static vector_type broadcast(scalar_type x) { + store a; + std::fill(std::begin(a), std::end(a), x); + return I::copy_from(a); + } + + static scalar_type element(const vector_type& v, int i) { + store a; + I::copy_to(v, a); + return a[i]; + } + + static void set_element(vector_type& v, int i, scalar_type x) { + store a; + I::copy_to(v, a); + a[i] = x; + v = I::copy_from(a); + } + + static void copy_to_masked(const vector_type& v, scalar_type* p, const mask_type& mask) { + store a; + I::copy_to(v, a); + + mask_store m; + mask_impl::mask_copy_to(mask, m); + for (unsigned i = 0; i<width; ++i) { + if (m[i]) p[i] = a[i]; + } + } + + static vector_type copy_from_masked(const scalar_type* p, const mask_type& mask) { + store a; + + mask_store m; + mask_impl::mask_copy_to(mask, m); + for (unsigned i = 0; i<width; ++i) { + if (m[i]) a[i] = p[i]; + } + return I::copy_from(a); + } + + static vector_type copy_from_masked(const vector_type& v, const scalar_type* p, const mask_type& mask) { + store a; + I::copy_to(v, a); + + mask_store m; + mask_impl::mask_copy_to(mask, m); + for (unsigned i = 0; i<width; ++i) { + if (m[i]) a[i] = p[i]; + } + return I::copy_from(a); + } + + static vector_type negate(const vector_type& u) { + store a, r; + I::copy_to(u, a); + + for (unsigned i = 0; i<width; ++i) { + r[i] = -a[i]; + } + return I::copy_from(r); + } + + static vector_type add(const vector_type& u, const vector_type& v) { + store a, b, r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i]+b[i]; + } + return I::copy_from(r); + } + + static vector_type mul(const vector_type& u, const vector_type& v) { + store a, b, r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i]*b[i]; + } + return I::copy_from(r); + } + + static vector_type sub(const vector_type& u, const vector_type& v) { + store a, b, r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i]-b[i]; + } + return I::copy_from(r); + } + + static vector_type div(const vector_type& u, const vector_type& v) { + store a, b, r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i]/b[i]; + } + return I::copy_from(r); + } + 
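+    // The arithmetic fallbacks above and below follow the same pattern: spill
+    // the operands to scalar_type arrays with copy_to, apply the operation
+    // lane by lane, and reload with copy_from. Concrete back-ends are expected
+    // to override these with single intrinsics where available.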
+ static vector_type fma(const vector_type& u, const vector_type& v, const vector_type& w) { + store a, b, c, r; + I::copy_to(u, a); + I::copy_to(v, b); + I::copy_to(w, c); + + for (unsigned i = 0; i<width; ++i) { + r[i] = std::fma(a[i], b[i], c[i]); + } + return I::copy_from(r); + } + + static mask_type cmp_eq(const vector_type& u, const vector_type& v) { + store a, b; + mask_store r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i]==b[i]; + } + return mask_impl::mask_copy_from(r); + } + + static mask_type cmp_neq(const vector_type& u, const vector_type& v) { + store a, b; + mask_store r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i]!=b[i]; + } + return mask_impl::mask_copy_from(r); + } + + static mask_type cmp_gt(const vector_type& u, const vector_type& v) { + store a, b; + mask_store r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i]>b[i]; + } + return mask_impl::mask_copy_from(r); + } + + static mask_type cmp_geq(const vector_type& u, const vector_type& v) { + store a, b; + mask_store r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i]>=b[i]; + } + return mask_impl::mask_copy_from(r); + } + + static mask_type cmp_lt(const vector_type& u, const vector_type& v) { + store a, b; + mask_store r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i]<b[i]; + } + return mask_impl::mask_copy_from(r); + } + + static mask_type cmp_leq(const vector_type& u, const vector_type& v) { + store a, b; + mask_store r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i]<=b[i]; + } + return mask_impl::mask_copy_from(r); + } + + static vector_type logical_not(const vector_type& u) { + store a, r; + I::copy_to(u, a); + + for (unsigned i = 0; i<width; ++i) { + r[i] = !a[i]; + } + return I::copy_from(r); + } + + static vector_type logical_and(const vector_type& u, const vector_type& v) { + store a, b, r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i] && b[i]; + } + return I::copy_from(r); + } + + static vector_type logical_or(const vector_type& u, const vector_type& v) { + store a, b, r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = a[i] || b[i]; + } + return I::copy_from(r); + } + + static vector_type ifelse(const mask_type& mask, const vector_type& u, const vector_type& v) { + mask_store m; + mask_impl::mask_copy_to(mask, m); + + store a, b, r; + I::copy_to(u, a); + I::copy_to(v, b); + + for (unsigned i = 0; i<width; ++i) { + r[i] = m[i]? 
a[i]: b[i]; + } + return I::copy_from(r); + } + + static vector_type mask_broadcast(bool v) { + mask_store m; + std::fill(std::begin(m), std::end(m), v); + return I::mask_copy_from(m); + } + + static vector_type mask_unpack(unsigned long long k) { + mask_store m; + for (unsigned i = 0; i<width; ++i) { + m[i] = k&(1ull<<i); + } + return I::mask_copy_from(m); + } + + template <typename ImplIndex> + static vector_type gather(ImplIndex, const scalar_type* p, const typename ImplIndex::vector_type& index) { + typename ImplIndex::scalar_type o[width]; + ImplIndex::copy_to(index, o); + + store a; + for (unsigned i = 0; i<width; ++i) { + a[i] = p[o[i]]; + } + return I::copy_from(a); + } + + template <typename ImplIndex> + static vector_type gather(ImplIndex, const vector_type& s, const scalar_type* p, const typename ImplIndex::vector_type& index, const mask_type& mask) { + mask_store m; + mask_impl::mask_copy_to(mask, m); + + typename ImplIndex::scalar_type o[width]; + ImplIndex::copy_to(index, o); + + store a; + I::copy_to(s, a); + + for (unsigned i = 0; i<width; ++i) { + if (m[i]) { a[i] = p[o[i]]; } + } + return I::copy_from(a); + }; + + template <typename ImplIndex> + static void scatter(ImplIndex, const vector_type& s, scalar_type* p, const typename ImplIndex::vector_type& index) { + typename ImplIndex::scalar_type o[width]; + ImplIndex::copy_to(index, o); + + store a; + I::copy_to(s, a); + + for (unsigned i = 0; i<width; ++i) { + p[o[i]] = a[i]; + } + } + + template <typename ImplIndex> + static void scatter(ImplIndex, const vector_type& s, scalar_type* p, const typename ImplIndex::vector_type& index, const mask_type& mask) { + mask_store m; + mask_impl::mask_copy_to(mask, m); + + typename ImplIndex::scalar_type o[width]; + ImplIndex::copy_to(index, o); + + store a; + I::copy_to(s, a); + + for (unsigned i = 0; i<width; ++i) { + if (m[i]) { p[o[i]] = a[i]; } + } + } + + // Maths + + static vector_type abs(const vector_type& u) { + store a; + I::copy_to(u, a); + + for (unsigned i = 0; i<width; ++i) { + a[i] = std::abs(a[i]); + } + return I::copy_from(a); + } + + static vector_type min(const vector_type& s, const vector_type& t) { + return ifelse(cmp_gt(t, s), s, t); + } + + static vector_type max(const vector_type& s, const vector_type& t) { + return ifelse(cmp_gt(t, s), t, s); + } + + static vector_type sin(const vector_type& s) { + store a, r; + I::copy_to(s, a); + + for (unsigned i = 0; i<width; ++i) { + r[i] = std::sin(a[i]); + } + return I::copy_from(r); + } + + static vector_type cos(const vector_type& s) { + store a, r; + I::copy_to(s, a); + + for (unsigned i = 0; i<width; ++i) { + r[i] = std::cos(a[i]); + } + return I::copy_from(r); + } + + static vector_type exp(const vector_type& s) { + store a, r; + I::copy_to(s, a); + + for (unsigned i = 0; i<width; ++i) { + r[i] = std::exp(a[i]); + } + return I::copy_from(r); + } + + static vector_type expm1(const vector_type& s) { + store a, r; + I::copy_to(s, a); + + for (unsigned i = 0; i<width; ++i) { + r[i] = std::expm1(a[i]); + } + return I::copy_from(r); + } + + static vector_type log(const vector_type& s) { + store a, r; + I::copy_to(s, a); + + for (unsigned i = 0; i<width; ++i) { + r[i] = std::log(a[i]); + } + return I::copy_from(r); + } + + static vector_type exprelr(const vector_type& s) { + vector_type ones = I::broadcast(1); + return ifelse(cmp_eq(ones, add(ones, s)), ones, div(s, expm1(s))); + } + + static vector_type pow(const vector_type& s, const vector_type &t) { + store a, b, r; + I::copy_to(s, a); + I::copy_to(t, b); + + for 
(unsigned i = 0; i<width; ++i) { + r[i] = std::pow(a[i], b[i]); + } + return I::copy_from(r); + } +}; + +} // namespace simd_detail +} // namespace simd +} // namespace arb diff --git a/src/simd/native.hpp b/src/simd/native.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d71bccec0085940ae279490eccbd4d14c20a32c2 --- /dev/null +++ b/src/simd/native.hpp @@ -0,0 +1,93 @@ +#pragma once + +// Native SIMD representations based on architecture. + +// Predefined macros from the compiler are used to guard +// architecture-specific code here and in the source code +// for the concrete implementation classes. +// +// For x86 vector extensions, the same preprocessor defines +// are used across gcc, clang and icpc: __AVX__, __AVX2__, +// __FMA__, __AVX512F__. +// +// Note that the FMA extensions for x86 are strictly speaking +// independent of AVX2, and instructing gcc (for example) +// to generate AVX2 code with '-mavx2' will not enable FMA +// instructions unless '-mfma' is also given. It is generally +// safer to explicitly request the target using +// '-march', '-mcpu' or '-x', depending on the compiler and +// architecure. + +namespace arb { +namespace simd { +namespace simd_abi { + +template <typename Value, unsigned N> +struct native { + using type = void; +}; + +} // namespace simd_abi +} // namespace simd +} // namespace arb + +#define ARB_DEF_NATIVE_SIMD_(T, N, A)\ +namespace arb {\ +namespace simd_abi {\ +template <> struct native<T, N> {\ + using type = typename A<T, N>::type;\ +};\ +}\ +} + + +#if defined(__AVX2__) && defined(__FMA__) + +#include <simd/avx.hpp> +ARB_DEF_NATIVE_SIMD_(int, 4, avx2) +ARB_DEF_NATIVE_SIMD_(double, 4, avx2) + +#elif defined(__AVX__) + +#include <simd/avx.hpp> +ARB_DEF_NATIVE_SIMD_(int, 4, avx) +ARB_DEF_NATIVE_SIMD_(double, 4, avx) + +#endif + +#if defined(__AVX512F__) + +#include <simd/avx512.hpp> +ARB_DEF_NATIVE_SIMD_(int, 8, avx512) +ARB_DEF_NATIVE_SIMD_(double, 8, avx512) + +#endif + + +namespace arb { +namespace simd { +namespace simd_abi { + +// Define native widths based on largest native vector implementation +// of corresponding type. Presume power of 2, no larger than largest +// possible over implemented architectures. + +template <typename Value, int k = 64> +struct native_width; + +template <typename Value, int k> +struct native_width { + static constexpr int value = + std::is_same<void, typename native<Value, k>::type>::value? + native_width<Value, k/2>::value: + k; +}; + +template <typename Value> +struct native_width<Value, 1> { + static constexpr int value = 1; +}; + +} // namespace simd_abi +} // namespace simd +} // namespace arb diff --git a/src/simd/simd.hpp b/src/simd/simd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a7a1c65505c3c181e74d84fc0ea850b42a965070 --- /dev/null +++ b/src/simd/simd.hpp @@ -0,0 +1,521 @@ +#pragma once + +#include <type_traits> + +#include <simd/implbase.hpp> +#include <simd/generic.hpp> +#include <simd/native.hpp> + +namespace arb { +namespace simd { + +namespace simd_detail { + template <typename Impl> + struct simd_impl; + + template <typename Impl> + struct simd_mask_impl; +} + +// Forward declarations for top-level maths functions. +// (these require access to private simd_impl<Impl>::wrap method). 
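+// They are forward declared here so that simd_impl can name them as friends;
+// the definitions at the end of this header delegate to the corresponding
+// Impl static functions and wrap the result.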
+ +template <typename Impl> +simd_detail::simd_impl<Impl> abs(const simd_detail::simd_impl<Impl>&); + +template <typename Impl> +simd_detail::simd_impl<Impl> sin(const simd_detail::simd_impl<Impl>&); + +template <typename Impl> +simd_detail::simd_impl<Impl> cos(const simd_detail::simd_impl<Impl>&); + +template <typename Impl> +simd_detail::simd_impl<Impl> exp(const simd_detail::simd_impl<Impl>&); + +template <typename Impl> +simd_detail::simd_impl<Impl> log(const simd_detail::simd_impl<Impl>&); + +template <typename Impl> +simd_detail::simd_impl<Impl> expm1(const simd_detail::simd_impl<Impl>&); + +template <typename Impl> +simd_detail::simd_impl<Impl> exprelr(const simd_detail::simd_impl<Impl>&); + +template <typename Impl> +simd_detail::simd_impl<Impl> pow(const simd_detail::simd_impl<Impl>&, const simd_detail::simd_impl<Impl>&); + +template <typename Impl> +simd_detail::simd_impl<Impl> min(const simd_detail::simd_impl<Impl>&, const simd_detail::simd_impl<Impl>&); + +template <typename Impl> +simd_detail::simd_impl<Impl> max(const simd_detail::simd_impl<Impl>&, const simd_detail::simd_impl<Impl>&); + +namespace simd_detail { + template <typename Impl> + struct simd_impl { + // Type aliases: + // + // scalar_type internal value type in one simd lane, + // simd_mask simd_mask_impl specialization represeting comparison results, + // + // vector_type underlying representation, + // mask_type underlying representation for mask. + + using scalar_type = typename simd_traits<Impl>::scalar_type; + using simd_mask = simd_mask_impl<typename simd_traits<Impl>::mask_impl>; + + protected: + using vector_type = typename simd_traits<Impl>::vector_type; + using mask_type = typename simd_traits<typename simd_traits<Impl>::mask_impl>::vector_type; + + public: + static constexpr unsigned width = simd_traits<Impl>::width; + + template <typename Other> + friend class simd_impl; + + simd_impl() = default; + + // Construct by filling with scalar value. + simd_impl(const scalar_type& x) { + value_ = Impl::broadcast(x); + } + + // Construct from scalar values in memory. + explicit simd_impl(const scalar_type* p) { + value_ = Impl::copy_from(p); + } + + // Construct from const array ref. + explicit simd_impl(const scalar_type (&a)[width]) { + value_ = Impl::copy_from(a); + } + + // Construct from scalar values in memory with mask. + explicit simd_impl(const scalar_type* p, const simd_mask& m) { + value_ = Impl::copy_from_masked(p, m.value_); + } + + // Construct from const array ref with mask. + explicit simd_impl(const scalar_type (&a)[width], const simd_mask& m) { + value_ = Impl::copy_from_masked(a, m.value_); + } + + // Copy constructor. + simd_impl(const simd_impl& other) { + std::memcpy(&value_, &other.value_, sizeof(vector_type)); + } + + // Scalar asssignment fills vector. + simd_impl& operator=(scalar_type x) { + value_ = Impl::broadcast(x); + return *this; + } + + // Copy assignment. + simd_impl& operator=(const simd_impl& other) { + std::memcpy(&value_, &other.value_, sizeof(vector_type)); + return *this; + } + + // Read/write operations: copy_to, copy_from. + + void copy_to(scalar_type* p) const { + Impl::copy_to(value_, p); + } + + void copy_from(const scalar_type* p) { + value_ = Impl::copy_from(p); + } + + // Arithmetic operations: +, -, *, /, fma. 
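+        // All operations are lane-wise and return a new simd value;
+        // fma(a, b, c) computes a*b + c in each lane.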
+ + simd_impl operator-() const { + return wrap(Impl::negate(value_)); + } + + friend simd_impl operator+(const simd_impl& a, simd_impl b) { + return simd_impl::wrap(Impl::add(a.value_, b.value_)); + } + + friend simd_impl operator-(const simd_impl& a, simd_impl b) { + return simd_impl::wrap(Impl::sub(a.value_, b.value_)); + } + + friend simd_impl operator*(const simd_impl& a, simd_impl b) { + return simd_impl::wrap(Impl::mul(a.value_, b.value_)); + } + + friend simd_impl operator/(const simd_impl& a, simd_impl b) { + return simd_impl::wrap(Impl::div(a.value_, b.value_)); + } + + friend simd_impl fma(const simd_impl& a, simd_impl b, simd_impl c) { + return simd_impl::wrap(Impl::fma(a.value_, b.value_, c.value_)); + } + + // Lane-wise relational operations. + + friend simd_mask operator==(const simd_impl& a, const simd_impl& b) { + return simd_impl::mask(Impl::cmp_eq(a.value_, b.value_)); + } + + friend simd_mask operator!=(const simd_impl& a, const simd_impl& b) { + return simd_impl::mask(Impl::cmp_neq(a.value_, b.value_)); + } + + friend simd_mask operator<=(const simd_impl& a, const simd_impl& b) { + return simd_impl::mask(Impl::cmp_leq(a.value_, b.value_)); + } + + friend simd_mask operator<(const simd_impl& a, const simd_impl& b) { + return simd_impl::mask(Impl::cmp_lt(a.value_, b.value_)); + } + + friend simd_mask operator>=(const simd_impl& a, const simd_impl& b) { + return simd_impl::mask(Impl::cmp_geq(a.value_, b.value_)); + } + + friend simd_mask operator>(const simd_impl& a, const simd_impl& b) { + return simd_impl::mask(Impl::cmp_gt(a.value_, b.value_)); + } + + // Compound assignment operations: +=, -=, *=, /=. + + simd_impl& operator+=(const simd_impl& x) { + value_ = Impl::add(value_, x.value_); + return *this; + } + + simd_impl& operator-=(const simd_impl& x) { + value_ = Impl::sub(value_, x.value_); + return *this; + } + + simd_impl& operator*=(const simd_impl& x) { + value_ = Impl::mul(value_, x.value_); + return *this; + } + + simd_impl& operator/=(const simd_impl& x) { + value_ = Impl::div(value_, x.value_); + return *this; + } + + // Gather and scatter. + + template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type> + void gather(const scalar_type* p, const simd_impl<IndexImpl>& index) { + value_ = Impl::gather(IndexImpl{}, p, index.value_); + } + + template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type> + void scatter(scalar_type* p, const simd_impl<IndexImpl>& index) { + Impl::scatter(IndexImpl{}, value_, p, index.value_); + } + + // Array subscript operations. + + struct reference { + reference() = delete; + reference(const reference&) = default; + reference& operator=(const reference&) = delete; + + reference(vector_type& value, int i): + ptr_(&value), i(i) {} + + reference& operator=(scalar_type v) && { + Impl::set_element(*ptr_, i, v); + return *this; + } + + operator scalar_type() const { + return Impl::element(*ptr_, i); + } + + vector_type* ptr_; + int i; + }; + + reference operator[](int i) { + return reference(value_, i); + } + + scalar_type operator[](int i) const { + return Impl::element(value_, i); + } + + // Masked assignment (via where expressions). 
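+        // where_expression keeps references to the mask and the simd value,
+        // and routes assignment, copy_to/copy_from and gather/scatter through
+        // the masked Impl primitives. Instances are created by the top-level
+        // where() function; for example, where(m, v) = 0. assigns zero to the
+        // lanes of v where m is true.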
+ + struct where_expression { + where_expression(const where_expression&) = default; + where_expression& operator=(const where_expression&) = delete; + + where_expression(const simd_mask& m, simd_impl& s): + mask_(m), data_(s) {} + + where_expression& operator=(scalar_type v) { + data_.value_ = Impl::ifelse(mask_.value_, simd_impl(v).value_, data_.value_); + return *this; + } + + where_expression& operator=(const simd_impl& v) { + data_.value_ = Impl::ifelse(mask_.value_, v.value_, data_.value_); + return *this; + } + + void copy_to(scalar_type* p) const { + Impl::copy_to_masked(data_.value_, p, mask_.value_); + } + + void copy_from(const scalar_type* p) { + data_.value_ = Impl::copy_from_masked(data_.value_, p, mask_.value_); + } + + template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type> + void gather(const scalar_type* p, const simd_impl<IndexImpl>& index) { + data_.value_ = Impl::gather(IndexImpl{}, data_.value_, p, index.value_, mask_.value_); + } + + template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type> + void scatter(scalar_type* p, const simd_impl<IndexImpl>& index) { + Impl::scatter(IndexImpl{}, data_.value_, p, index.value_, mask_.value_); + } + + private: + const simd_mask& mask_; + simd_impl& data_; + }; + + // Maths functions are implemented as top-level functions, but require + // access to `wrap`. + + friend simd_impl abs<Impl>(const simd_impl&); + friend simd_impl sin<Impl>(const simd_impl&); + friend simd_impl cos<Impl>(const simd_impl&); + friend simd_impl exp<Impl>(const simd_impl&); + friend simd_impl log<Impl>(const simd_impl&); + friend simd_impl expm1<Impl>(const simd_impl&); + friend simd_impl exprelr<Impl>(const simd_impl&); + friend simd_impl min<Impl>(const simd_impl&, const simd_impl&); + friend simd_impl max<Impl>(const simd_impl&, const simd_impl&); + friend simd_impl pow<Impl>(const simd_impl&, const simd_impl&); + + protected: + vector_type value_; + simd_impl(const vector_type& x) { + value_ = x; + } + + private: + static simd_impl wrap(const vector_type& v) { + simd_impl s; + s.value_ = v; + return s; + } + + static simd_mask mask(const mask_type& v) { + simd_mask m; + m.value_ = v; + return m; + } + }; + + template <typename Impl> + struct simd_mask_impl: simd_impl<Impl> { + using base = simd_impl<Impl>; + using typename base::vector_type; + using typename base::scalar_type; + using base::width; + using base::value_; + + simd_mask_impl() = default; + + // Construct by filling with scalar value. + simd_mask_impl(bool b) { + value_ = Impl::mask_broadcast(b); + } + + // Scalar asssignment fills vector. + simd_mask_impl& operator=(bool b) { + value_ = Impl::mask_broadcast(b); + return *this; + } + + // Construct from bool values in memory. + explicit simd_mask_impl(const bool* y) { + value_ = Impl::mask_copy_from(y); + } + + // Construct from const array ref. + explicit simd_mask_impl(const bool (&a)[width]) { + value_ = Impl::mask_copy_from(&a[0]); + } + + // Copy assignment. + simd_mask_impl& operator=(const simd_mask_impl& other) { + std::memcpy(&value_, &other.value_, sizeof(vector_type)); + return *this; + } + + // Read/write bool operations: copy_to, copy_from. + + void copy_to(bool* w) const { + Impl::mask_copy_to(value_, w); + } + + void copy_from(const bool* y) { + value_ = Impl::mask_copy_from(y); + } + + // Array subscript operations. 
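+        // As for simd_impl::reference, but converting to and from bool via the
+        // mask_element/mask_set_element primitives.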
+ + struct reference { + reference() = delete; + reference(const reference&) = default; + reference& operator=(const reference&) = delete; + + reference(vector_type& value, int i): + ptr_(&value), i(i) {} + + reference& operator=(bool v) && { + Impl::mask_set_element(*ptr_, i, v); + return *this; + } + + operator bool() const { + return Impl::mask_element(*ptr_, i); + } + + vector_type* ptr_; + int i; + }; + + reference operator[](int i) { + return reference(value_, i); + } + + bool operator[](int i) const { + return Impl::element(value_, i); + } + + // Logical operations. + + simd_mask_impl operator!() const { + return simd_mask_impl::wrap(Impl::logical_not(value_)); + } + + friend simd_mask_impl operator&&(const simd_mask_impl& a, const simd_mask_impl& b) { + return simd_mask_impl::wrap(Impl::logical_and(a.value_, b.value_)); + } + + friend simd_mask_impl operator||(const simd_mask_impl& a, const simd_mask_impl& b) { + return simd_mask_impl::wrap(Impl::logical_or(a.value_, b.value_)); + } + + // Make mask from corresponding bits of integer. + + static simd_mask_impl unpack(unsigned long long bits) { + return simd_mask_impl::wrap(Impl::mask_unpack(bits)); + } + + private: + simd_mask_impl(const vector_type& v): base(v) {} + + template <class> friend class simd_impl; + + static simd_mask_impl wrap(const vector_type& v) { + simd_mask_impl m; + m.value_ = v; + return m; + } + }; +} // namespace simd_detail + +namespace simd_abi { + // Note: `simd_abi::native` template class defined in `simd/native.hpp`, + // `simd_abi::generic` in `simd/generic.hpp`. + + template <typename Value, unsigned N> + struct default_abi { + using type = typename std::conditional< + std::is_same<void, typename native<Value, N>::type>::value, + typename generic<Value, N>::type, + typename native<Value, N>::type>::type; + }; +} + +template <typename Value, unsigned N, template <class, unsigned> class Abi = simd_abi::default_abi> +using simd = simd_detail::simd_impl<typename Abi<Value, N>::type>; + +template <typename Value, unsigned N> +using simd_mask = typename simd<Value, N>::simd_mask; + +template <typename Simd> +using where_expression = typename Simd::where_expression; + +template <typename Simd> +where_expression<Simd> where(const typename Simd::simd_mask& m, Simd& v) { + return where_expression<Simd>(m, v); +} + +template <typename> +struct is_simd: std::false_type {}; + +template <typename Impl> +struct is_simd<simd_detail::simd_impl<Impl>>: std::true_type {}; + +// Top-level maths functions: forward to underlying Impl. 
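+//
+// A usage sketch (the array names 'data' and 'out' are illustrative only):
+//
+//     simd<double, 4> v(data);   // load four doubles from 'data'
+//     exp(v).copy_to(out);       // lane-wise exponential written to 'out'
+//
+// Each function dispatches to the Impl-specific implementation where one is
+// provided, and otherwise to the lane-wise std:: fallback in implbase.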
+ +template <typename Impl> +simd_detail::simd_impl<Impl> abs(const simd_detail::simd_impl<Impl>& s) { + return simd_detail::simd_impl<Impl>::wrap(Impl::abs(s.value_)); +} + +template <typename Impl> +simd_detail::simd_impl<Impl> sin(const simd_detail::simd_impl<Impl>& s) { + return simd_detail::simd_impl<Impl>::wrap(Impl::sin(s.value_)); +} + +template <typename Impl> +simd_detail::simd_impl<Impl> cos(const simd_detail::simd_impl<Impl>& s) { + return simd_detail::simd_impl<Impl>::wrap(Impl::cos(s.value_)); +} + +template <typename Impl> +simd_detail::simd_impl<Impl> exp(const simd_detail::simd_impl<Impl>& s) { + return simd_detail::simd_impl<Impl>::wrap(Impl::exp(s.value_)); +} + +template <typename Impl> +simd_detail::simd_impl<Impl> log(const simd_detail::simd_impl<Impl>& s) { + return simd_detail::simd_impl<Impl>::wrap(Impl::log(s.value_)); +} + +template <typename Impl> +simd_detail::simd_impl<Impl> expm1(const simd_detail::simd_impl<Impl>& s) { + return simd_detail::simd_impl<Impl>::wrap(Impl::expm1(s.value_)); +} + +template <typename Impl> +simd_detail::simd_impl<Impl> exprelr(const simd_detail::simd_impl<Impl>& s) { + return simd_detail::simd_impl<Impl>::wrap(Impl::exprelr(s.value_)); +} + +template <typename Impl> +simd_detail::simd_impl<Impl> pow(const simd_detail::simd_impl<Impl>& s, const simd_detail::simd_impl<Impl>& t) { + return simd_detail::simd_impl<Impl>::wrap(Impl::pow(s.value_, t.value_)); +} + +template <typename Impl> +simd_detail::simd_impl<Impl> min(const simd_detail::simd_impl<Impl>& s, const simd_detail::simd_impl<Impl>& t) { + return simd_detail::simd_impl<Impl>::wrap(Impl::min(s.value_, t.value_)); +} + +template <typename Impl> +simd_detail::simd_impl<Impl> max(const simd_detail::simd_impl<Impl>& s, const simd_detail::simd_impl<Impl>& t) { + return simd_detail::simd_impl<Impl>::wrap(Impl::max(s.value_, t.value_)); +} + +} // namespace simd +} // namespace arb diff --git a/src/util/debug.hpp b/src/util/debug.hpp index 214e2269f0602c2cd437e78190b03b629f9c7f5f..dfc553924461f9d68c5aeaa951b667c26975909b 100644 --- a/src/util/debug.hpp +++ b/src/util/debug.hpp @@ -2,9 +2,14 @@ #include <iostream> #include <sstream> +#include <string> #include <mutex> #include <utility> +extern "C" { +#include <endian.h> +} + #include <threading/threading.hpp> #include "unwind.hpp" @@ -84,16 +89,85 @@ namespace impl { return out; } }; + + enum class endian { + little = __ORDER_LITTLE_ENDIAN__, + big = __ORDER_BIG_ENDIAN__, + native = __BYTE_ORDER__ + }; + + // Wrapper for emitting values on an ostream as a sequence of hex digits. + struct hexdump_inline_wrap { + const unsigned char* from; + std::size_t size; + unsigned width; + + friend std::ostream& operator<<(std::ostream& out, const hexdump_inline_wrap& h) { + using std::ptrdiff_t; + + constexpr bool little = endian::native==endian::little; + ptrdiff_t width = h.width; + const unsigned char* from = h.from; + const unsigned char* end = h.from+h.size; + std::string buf; + + auto emit = [&buf](unsigned char c) { + const char* digit = "0123456789abcdef"; + buf += digit[(c>>4)&0xf]; + buf += digit[c&0xf]; + }; + + constexpr unsigned bufsz = 512; + unsigned bufmargin = 4*width+1; + + buf.reserve(bufsz); + while (end-from>width) { + if (buf.size()+bufmargin>=bufsz) { + out << buf; + buf.clear(); + } + for (ptrdiff_t i = 0; i<width; ++i) { + emit(little? from[width-i-1]: from[i]); + } + from += width; + buf += ' '; + } + for (ptrdiff_t i = 0; i<end-from; ++i) { + emit(little? 
from[width-i-1]: from[i]); + } + + out << buf; + return out; + } + }; } // Wrap a sequence or container of values so that they can be printed // to an `std::ostream` with the elements separated by the supplied // separator. + template <typename Seq, typename Separator> impl::sepval<Seq, Separator> sepval(const Seq& seq, Separator sep) { return impl::sepval<Seq, Separator>(seq, std::move(sep)); } +template <typename Seq> +impl::sepval<Seq, const char*> csv(const Seq& seq) { + return sepval(seq, ", "); +} + +// Dump something in hex (inline representation). + +template <typename T> +impl::hexdump_inline_wrap hexdump(const T& obj, unsigned width = 4) { + return impl::hexdump_inline_wrap{reinterpret_cast<const unsigned char*>(&obj), sizeof obj, width}; +} + +template <typename T> +impl::hexdump_inline_wrap hexdump_n(const T* ptr, std::size_t n, unsigned width = 4) { + return impl::hexdump_inline_wrap{reinterpret_cast<const unsigned char*>(ptr), n, width}; +} + } // namespace util } // namespace arb diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index f2efc57058477160ec83cce1d79a9aa1c54c8a94..fe018f935b9692d5bd941765ae657dc5ab34e77c 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -70,6 +70,7 @@ set(TEST_SOURCES test_segment.cpp test_schedule.cpp test_rss_cell.cpp + test_simd.cpp test_span.cpp test_spikes.cpp test_spike_store.cpp diff --git a/tests/unit/common.hpp b/tests/unit/common.hpp index 70eace9a79c0500ceddf4658b0eae5f115c7ee39..ad6c68739d4711831b2c47ba14ee8832265a8275 100644 --- a/tests/unit/common.hpp +++ b/tests/unit/common.hpp @@ -40,7 +40,7 @@ struct null_terminated_t { constexpr null_terminated_t null_terminated; -// wrap a value type, with copy operations disabled +// Wrap a value type, with copy operations disabled. template <typename V> struct nocopy { @@ -81,7 +81,7 @@ int nocopy<V>::move_ctor_count; template <typename V> int nocopy<V>::move_assign_count; -// wrap a value type, with move operations disabled +// Wrap a value type, with move operations disabled. template <typename V> struct nomove { @@ -138,12 +138,13 @@ template <typename FPType, typename Seq1, typename Seq2> for (std::size_t j = 0; i1!=e1 && i2!=e2; ++i1, ++i2, ++j) { using FP = testing::internal::FloatingPoint<FPType>; - // cast to FPType to avoid warnings about lowering conversion - // if FPType has lower precision than Seq{12}::value_type auto v1 = *i1; auto v2 = *i2; - if (!FP{v1}.AlmostEquals(FP{v2})) { + // Cast to FPType to avoid warnings about lowering conversion + // if FPType has lower precision than Seq{12}::value_type. 
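+        //
+        // Two NaN values in the same lane are treated as equal below, so that
+        // special-value tests (e.g. exp of a quiet NaN) do not fail the
+        // comparison spuriously.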
+ + if (!(std::isnan(v1) && std::isnan(v2)) && !FP{v1}.AlmostEquals(FP{v2})) { return ::testing::AssertionFailure() << "floating point numbers " << v1 << " and " << v2 << " differ at index " << j; } @@ -155,6 +156,59 @@ template <typename FPType, typename Seq1, typename Seq2> return ::testing::AssertionSuccess(); } +template <typename V> +inline bool generic_isnan(const V& x) { return false; } +inline bool generic_isnan(float x) { return std::isnan(x); } +inline bool generic_isnan(double x) { return std::isnan(x); } +inline bool generic_isnan(long double x) { return std::isnan(x); } + +template <typename U, typename V> +static bool equiv(const U& u, const V& v) { + return u==v || (generic_isnan(u) && generic_isnan(v)); +} + +template <typename Seq1, typename Seq2> +::testing::AssertionResult seq_eq(Seq1&& seq1, Seq2&& seq2) { + using std::begin; + using std::end; + + auto i1 = begin(seq1); + auto i2 = begin(seq2); + + auto e1 = end(seq1); + auto e2 = end(seq2); + + for (std::size_t j = 0; i1!=e1 && i2!=e2; ++i1, ++i2, ++j) { + auto v1 = *i1; + auto v2 = *i2; + + if (!equiv(v1, v2)) { + return ::testing::AssertionFailure() << "values " << v1 << " and " << v2 << " differ at index " << j; + } + } + + if (i1!=e1 || i2!=e2) { + return ::testing::AssertionFailure() << "sequences differ in length"; + } + return ::testing::AssertionSuccess(); +} + +// Assert elements 0..n-1 inclusive of two indexed collections are exactly equal. +template <typename Arr1, typename Arr2> +::testing::AssertionResult indexed_eq_n(int n, Arr1&& a1, Arr2&& a2) { + for (int i = 0; i<n; ++i) { + auto v1 = a1[i]; + auto v2 = a2[i]; + + if (!equiv(v1,v2)) { + return ::testing::AssertionFailure() << "values " << v1 << " and " << v2 << " differ at index " << i; + } + } + + return ::testing::AssertionSuccess(); +} + + // Assert two floating point values are within a relative tolerance. inline ::testing::AssertionResult near_relative(double a, double b, double relerr) { double tol = relerr*std::max(std::abs(a), std::abs(b)); diff --git a/tests/unit/test_simd.cpp b/tests/unit/test_simd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f22de961aafac7ac946e7529d0dbcf37e793b327 --- /dev/null +++ b/tests/unit/test_simd.cpp @@ -0,0 +1,1001 @@ +#include <cmath> +#include <random> + +#include <simd/simd.hpp> +#include <simd/avx.hpp> + +#include "common.hpp" + +using namespace arb::simd; + +namespace { + // Use different distributions in `fill_random`, based on the value type in question: + // + // * floating point type => uniform_real_distribution, default interval [-1, 1). + // * bool => uniform_int_distribution, default interval [0, 1). + // * other integral type => uniform_int_distribution, default interval [L, U] + // such that L^2+L and U^2+U fit within the integer range. + + template <typename V, typename = typename std::enable_if<std::is_floating_point<V>::value>::type> + std::uniform_real_distribution<V> make_udist(V lb = -1., V ub = 1.) 
{ + return std::uniform_real_distribution<V>(lb, ub); + } + + template <typename V, typename = typename std::enable_if<std::is_integral<V>::value && !std::is_same<V, bool>::value>::type> + std::uniform_int_distribution<V> make_udist( + V lb = std::numeric_limits<V>::lowest() / (2 << std::numeric_limits<V>::digits/2), + V ub = std::numeric_limits<V>::max() >> (1+std::numeric_limits<V>::digits/2)) + { + return std::uniform_int_distribution<V>(lb, ub); + } + + template <typename V, typename = typename std::enable_if<std::is_same<V, bool>::value>::type> + std::uniform_int_distribution<> make_udist(V lb = 0, V ub = 1) { + return std::uniform_int_distribution<>(0, 1); + } + + template <typename Seq, typename Rng> + void fill_random(Seq&& seq, Rng& rng) { + using V = typename std::decay<decltype(*std::begin(seq))>::type; + + auto u = make_udist<V>(); + for (auto& x: seq) { x = u(rng); } + } + + template <typename Seq, typename Rng, typename B1, typename B2> + void fill_random(Seq&& seq, Rng& rng, B1 lb, B2 ub) { + using V = typename std::decay<decltype(*std::begin(seq))>::type; + + auto u = make_udist<V>(lb, ub); + for (auto& x: seq) { x = u(rng); } + } + + template <typename Simd, typename Rng, typename B1, typename B2, typename = typename std::enable_if<is_simd<Simd>::value>::type> + void fill_random(Simd& s, Rng& rng, B1 lb, B2 ub) { + using V = typename Simd::scalar_type; + constexpr unsigned N = Simd::width; + + V v[N]; + fill_random(v, rng, lb, ub); + s.copy_from(v); + } + + template <typename Simd, typename Rng, typename = typename std::enable_if<is_simd<Simd>::value>::type> + void fill_random(Simd& s, Rng& rng) { + using V = typename Simd::scalar_type; + constexpr unsigned N = Simd::width; + + V v[N]; + fill_random(v, rng); + s.copy_from(v); + } + + template <typename Simd> + ::testing::AssertionResult simd_eq(Simd a, Simd b) { + constexpr unsigned N = Simd::width; + using V = typename Simd::scalar_type; + + V as[N], bs[N]; + a.copy_to(as); + b.copy_to(bs); + + return ::testing::seq_eq(as, bs); + } + + constexpr unsigned nrounds = 20u; +} + +template <typename S> +struct simd_value: public ::testing::Test {}; + +TYPED_TEST_CASE_P(simd_value); + +// Initialization and element access. +TYPED_TEST_P(simd_value, elements) { + using simd = TypeParam; + using scalar = typename simd::scalar_type; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1001); + + // broadcast: + simd a(2); + for (unsigned i = 0; i<N; ++i) { + EXPECT_EQ(2., a[i]); + } + + // scalar assignment: + a = 3; + for (unsigned i = 0; i<N; ++i) { + EXPECT_EQ(3, a[i]); + } + + scalar bv[N], cv[N], dv[N]; + + fill_random(bv, rng); + fill_random(cv, rng); + fill_random(dv, rng); + + // array initialization: + simd b(bv); + EXPECT_TRUE(testing::indexed_eq_n(N, bv, b)); + + // array rvalue initialization: + simd c(std::move(cv)); + EXPECT_TRUE(testing::indexed_eq_n(N, cv, c)); + + // pointer initialization: + simd d(&dv[0]); + EXPECT_TRUE(testing::indexed_eq_n(N, dv, d)); + + // copy construction: + simd e(d); + EXPECT_TRUE(testing::indexed_eq_n(N, dv, e)); + + // copy assignment: + b = d; + EXPECT_TRUE(testing::indexed_eq_n(N, dv, b)); +} + +TYPED_TEST_P(simd_value, element_lvalue) { + using simd = TypeParam; + constexpr unsigned N = simd::width; + + simd a(3); + ASSERT_GT(N, 1u); + a[N-2] = 5; + + for (unsigned i = 0; i<N; ++i) { + EXPECT_EQ(i==N-2? 
5: 3, a[i]); + } +} + +TYPED_TEST_P(simd_value, copy_to_from) { + using simd = TypeParam; + using scalar = typename simd::scalar_type; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1010); + + scalar buf1[N], buf2[N]; + fill_random(buf1, rng); + fill_random(buf2, rng); + + simd s; + s.copy_from(buf1); + s.copy_to(buf2); + + EXPECT_TRUE(testing::indexed_eq_n(N, buf1, s)); + EXPECT_TRUE(testing::seq_eq(buf1, buf2)); +} + +TYPED_TEST_P(simd_value, copy_to_from_masked) { + using simd = TypeParam; + using mask = typename simd::simd_mask; + using scalar = typename simd::scalar_type; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1031); + + for (unsigned i = 0; i<nrounds; ++i) { + scalar buf1[N], buf2[N], buf3[N]; + fill_random(buf1, rng); + fill_random(buf2, rng); + fill_random(buf3, rng); + + bool mbuf1[N], mbuf2[N]; + fill_random(mbuf1, rng); + fill_random(mbuf2, rng); + mask m1(mbuf1); + mask m2(mbuf2); + + scalar expected[N]; + for (unsigned i = 0; i<N; ++i) { + expected[i] = mbuf1[i]? buf2[i]: buf1[i]; + } + + simd s(buf1); + where(m1, s).copy_from(buf2); + EXPECT_TRUE(testing::indexed_eq_n(N, expected, s)); + + for (unsigned i = 0; i<N; ++i) { + if (!mbuf2[i]) expected[i] = buf3[i]; + } + + where(m2, s).copy_to(buf3); + EXPECT_TRUE(testing::indexed_eq_n(N, expected, buf3)); + } +} + +TYPED_TEST_P(simd_value, construct_masked) { + using simd = TypeParam; + using mask = typename simd::simd_mask; + using scalar = typename simd::scalar_type; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1031); + + for (unsigned i = 0; i<nrounds; ++i) { + scalar buf[N]; + fill_random(buf, rng); + + bool mbuf[N]; + fill_random(mbuf, rng); + + mask m(mbuf); + simd s(buf, m); + + for (unsigned i = 0; i<N; ++i) { + if (!mbuf[i]) continue; + EXPECT_EQ(buf[i], s[i]); + } + } +} + +TYPED_TEST_P(simd_value, arithmetic) { + using simd = TypeParam; + using scalar = typename simd::scalar_type; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1002); + scalar u[N], v[N], w[N], r[N]; + + for (unsigned i = 0; i<nrounds; ++i) { + fill_random(u, rng); + fill_random(v, rng); + fill_random(w, rng); + + scalar neg_u[N]; + for (unsigned i = 0; i<N; ++i) neg_u[i] = -u[i]; + + scalar u_plus_v[N]; + for (unsigned i = 0; i<N; ++i) u_plus_v[i] = u[i]+v[i]; + + scalar u_minus_v[N]; + for (unsigned i = 0; i<N; ++i) u_minus_v[i] = u[i]-v[i]; + + scalar u_times_v[N]; + for (unsigned i = 0; i<N; ++i) u_times_v[i] = u[i]*v[i]; + + scalar u_divide_v[N]; + for (unsigned i = 0; i<N; ++i) u_divide_v[i] = u[i]/v[i]; + + scalar fma_u_v_w[N]; + for (unsigned i = 0; i<N; ++i) fma_u_v_w[i] = std::fma(u[i],v[i],w[i]); + + simd us(u), vs(v), ws(w); + + (-us).copy_to(r); + EXPECT_TRUE(testing::seq_eq(neg_u, r)); + + (us+vs).copy_to(r); + EXPECT_TRUE(testing::seq_eq(u_plus_v, r)); + + (us-vs).copy_to(r); + EXPECT_TRUE(testing::seq_eq(u_minus_v, r)); + + (us*vs).copy_to(r); + EXPECT_TRUE(testing::seq_eq(u_times_v, r)); + + (us/vs).copy_to(r); +#if defined(__INTEL_COMPILER) + // icpc will by default use an approximation for scalar division, + // and a different one for vectorized scalar division; the latter, + // in particular, is often out by 1 ulp for normal quotients. + // + // Unfortunately, we can't check at compile time the floating + // point dodginess quotient. 
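+        //
+        // So, for icpc only: compare floating point quotients to within a
+        // ULP tolerance (seq_almost_eq), and keep the exact comparison for
+        // integral scalar types.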
+ + if (std::is_floating_point<scalar>::value) { + EXPECT_TRUE(testing::seq_almost_eq<scalar>(u_divide_v, r)); + } + else { + EXPECT_TRUE(testing::seq_eq(u_divide_v, r)); + } +#else + EXPECT_TRUE(testing::seq_eq(u_divide_v, r)); +#endif + + (fma(us, vs, ws)).copy_to(r); + EXPECT_TRUE(testing::seq_eq(fma_u_v_w, r)); + } +} + +TYPED_TEST_P(simd_value, compound_assignment) { + using simd = TypeParam; + + simd a, b, r; + + std::minstd_rand rng(1003); + fill_random(a, rng); + fill_random(b, rng); + + EXPECT_TRUE(simd_eq(a+b, (r = a)+=b)); + EXPECT_TRUE(simd_eq(a-b, (r = a)-=b)); + EXPECT_TRUE(simd_eq(a*b, (r = a)*=b)); + EXPECT_TRUE(simd_eq(a/b, (r = a)/=b)); +} + +TYPED_TEST_P(simd_value, comparison) { + using simd = TypeParam; + using mask = typename simd::simd_mask; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1004); + std::uniform_int_distribution<> sgn(-1, 1); // -1, 0 or 1. + + for (unsigned i = 0; i<nrounds; ++i) { + int cmp[N]; + bool test[N]; + simd a, b; + + fill_random(b, rng); + + for (unsigned j = 0; j<N; ++j) { + cmp[j] = sgn(rng); + a[j] = b[j]+17*cmp[j]; + } + + mask gt = a>b; + for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]>0; } + EXPECT_TRUE(testing::indexed_eq_n(N, test, gt)); + + mask geq = a>=b; + for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]>=0; } + EXPECT_TRUE(testing::indexed_eq_n(N, test, geq)); + + mask lt = a<b; + for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]<0; } + EXPECT_TRUE(testing::indexed_eq_n(N, test, lt)); + + mask leq = a<=b; + for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]<=0; } + EXPECT_TRUE(testing::indexed_eq_n(N, test, leq)); + + mask eq = a==b; + for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]==0; } + EXPECT_TRUE(testing::indexed_eq_n(N, test, eq)); + + mask ne = a!=b; + for (unsigned j = 0; j<N; ++j) { test[j] = cmp[j]!=0; } + EXPECT_TRUE(testing::indexed_eq_n(N, test, ne)); + } +} + +TYPED_TEST_P(simd_value, mask_elements) { + using simd = TypeParam; + using mask = typename simd::simd_mask; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1005); + + // bool broadcast: + mask a(true); + for (unsigned i = 0; i<N; ++i) { + EXPECT_EQ(true, a[i]); + } + + // scalar assignment: + mask d; + d = false; + for (unsigned i = 0; i<N; ++i) { + EXPECT_EQ(false, d[i]); + } + d = true; + for (unsigned i = 0; i<N; ++i) { + EXPECT_EQ(true, d[i]); + } + + for (unsigned i = 0; i<nrounds; ++i) { + bool bv[N], cv[N], dv[N]; + + fill_random(bv, rng); + fill_random(cv, rng); + fill_random(dv, rng); + + // array initialization: + mask b(bv); + EXPECT_TRUE(testing::indexed_eq_n(N, bv, b)); + + // array rvalue initialization: + mask c(std::move(cv)); + EXPECT_TRUE(testing::indexed_eq_n(N, cv, c)); + + // pointer initialization: + mask d(&dv[0]); + EXPECT_TRUE(testing::indexed_eq_n(N, dv, d)); + + // copy construction: + mask e(d); + EXPECT_TRUE(testing::indexed_eq_n(N, dv, e)); + + // copy assignment: + b = d; + EXPECT_TRUE(testing::indexed_eq_n(N, dv, b)); + } +} + +TYPED_TEST_P(simd_value, mask_element_lvalue) { + using simd = TypeParam; + using mask = typename simd::simd_mask; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1006); + + for (unsigned i = 0; i<nrounds; ++i) { + bool v[N]; + fill_random(v, rng); + + mask m(v); + for (unsigned j = 0; j<N; ++j) { + bool b = v[j]; + m[j] = !b; + v[j] = !b; + + EXPECT_EQ(m[j], !b); + EXPECT_TRUE(testing::indexed_eq_n(N, v, m)); + + m[j] = b; + v[j] = b; + EXPECT_EQ(m[j], b); + EXPECT_TRUE(testing::indexed_eq_n(N, v, m)); + } + } +} + +TYPED_TEST_P(simd_value, 
mask_copy_to_from) { + using simd = TypeParam; + using simd_mask = typename simd::simd_mask; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1012); + + for (unsigned i = 0; i<nrounds; ++i) { + bool buf1[N], buf2[N]; + fill_random(buf1, rng); + fill_random(buf2, rng); + + simd_mask m; + m.copy_from(buf1); + m.copy_to(buf2); + + EXPECT_TRUE(testing::indexed_eq_n(N, buf1, m)); + EXPECT_TRUE(testing::seq_eq(buf1, buf2)); + } +} + +TYPED_TEST_P(simd_value, mask_unpack) { + using simd = TypeParam; + using mask = typename simd::simd_mask; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1035); + std::uniform_int_distribution<unsigned long long> U(0, (1ull<<N)-1); + + for (unsigned i = 0; i<nrounds; ++i) { + unsigned long long packed = U(rng); + bool b[N]; + mask::unpack(packed).copy_to(b); + + for (unsigned j = 0; j<N; ++j) { + EXPECT_EQ((bool)(packed&(1ull<<j)), b[j]); + } + } +} + +TYPED_TEST_P(simd_value, maths) { + // min, max, abs tests valid for both fp and int types. + + using simd = TypeParam; + using scalar = typename simd::scalar_type; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1013); + + for (unsigned i = 0; i<nrounds; ++i) { + scalar a[N], b[N], test[N]; + fill_random(a, rng); + fill_random(b, rng); + + simd as(a), bs(b); + + for (unsigned j = 0; j<N; ++j) { test[j] = std::abs(a[j]); } + EXPECT_TRUE(testing::indexed_eq_n(N, test, abs(as))); + + for (unsigned j = 0; j<N; ++j) { test[j] = std::min(a[j], b[j]); } + EXPECT_TRUE(testing::indexed_eq_n(N, test, min(as, bs))); + + for (unsigned j = 0; j<N; ++j) { test[j] = std::max(a[j], b[j]); } + EXPECT_TRUE(testing::indexed_eq_n(N, test, max(as, bs))); + } +} + +REGISTER_TYPED_TEST_CASE_P(simd_value, elements, element_lvalue, copy_to_from, copy_to_from_masked, construct_masked, arithmetic, compound_assignment, comparison, mask_elements, mask_element_lvalue, mask_copy_to_from, mask_unpack, maths); + +typedef ::testing::Types< + +#ifdef __AVX__ + simd<int, 4, simd_abi::avx>, + simd<double, 4, simd_abi::avx>, +#endif +#ifdef __AVX2__ + simd<int, 4, simd_abi::avx2>, + simd<double, 4, simd_abi::avx2>, +#endif +#ifdef __AVX512F__ + simd<int, 8, simd_abi::avx512>, + simd<double, 8, simd_abi::avx512>, +#endif + + simd<int, 4, simd_abi::generic>, + simd<double, 4, simd_abi::generic>, + simd<float, 16, simd_abi::generic>, + + simd<int, 4, simd_abi::default_abi>, + simd<double, 4, simd_abi::default_abi>, + simd<int, 8, simd_abi::default_abi>, + simd<double, 8, simd_abi::default_abi> +> simd_test_types; + +INSTANTIATE_TYPED_TEST_CASE_P(S, simd_value, simd_test_types); + +// FP-only SIMD value tests (maths). 
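+//
+// These follow the same TYPED_TEST_CASE_P/REGISTER_TYPED_TEST_CASE_P/
+// INSTANTIATE_TYPED_TEST_CASE_P pattern as simd_value above, but are
+// restricted to floating point scalar types so that exp, log, pow and
+// friends are well-defined.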
+ +template <typename S> +struct simd_fp_value: public ::testing::Test {}; + +TYPED_TEST_CASE_P(simd_fp_value); + +TYPED_TEST_P(simd_fp_value, fp_maths) { + using simd = TypeParam; + using fp = typename simd::scalar_type; + constexpr unsigned N = simd::width; + + std::minstd_rand rng(1014); + + for (unsigned i = 0; i<nrounds; ++i) { + fp epsilon = std::numeric_limits<fp>::epsilon(); + int min_exponent = std::numeric_limits<fp>::min_exponent; + int max_exponent = std::numeric_limits<fp>::max_exponent; + + fp u[N], v[N], r[N]; + + // Trigonometric functions (sin, cos): + fill_random(u, rng); + + fp sin_u[N]; + for (unsigned i = 0; i<N; ++i) sin_u[i] = std::sin(u[i]); + sin(simd(u)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(sin_u, r)); + + fp cos_u[N]; + for (unsigned i = 0; i<N; ++i) cos_u[i] = std::cos(u[i]); + cos(simd(u)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(cos_u, r)); + + // Logarithms (natural log): + fill_random(u, rng, -max_exponent*std::log(2.), max_exponent*std::log(2.)); + for (auto& x: u) { + x = std::exp(x); + // SIMD log implementation may treat subnormal as zero + if (std::fpclassify(x)==FP_SUBNORMAL) x = 0; + } + + fp log_u[N]; + for (unsigned i = 0; i<N; ++i) log_u[i] = std::log(u[i]); + log(simd(u)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(log_u, r)); + + // Exponential functions (exp, expm1, exprelr): + + // Use max_exponent to get coverage over finite domain. + fp exp_min_arg = min_exponent*std::log(2.); + fp exp_max_arg = max_exponent*std::log(2.); + fill_random(u, rng, exp_min_arg, exp_max_arg); + + fp exp_u[N]; + for (unsigned i = 0; i<N; ++i) exp_u[i] = std::exp(u[i]); + exp(simd(u)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(exp_u, r)); + + fp expm1_u[N]; + for (unsigned i = 0; i<N; ++i) expm1_u[i] = std::expm1(u[i]); + expm1(simd(u)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(expm1_u, r)); + + fp exprelr_u[N]; + for (unsigned i = 0; i<N; ++i) { + exprelr_u[i] = u[i]+fp(1)==fp(1)? fp(1): u[i]/(std::exp(u[i])-fp(1)); + } + exprelr(simd(u)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(exprelr_u, r)); + + // Test expm1 and exprelr with small (magnitude < fp epsilon) values. + fill_random(u, rng, -epsilon, epsilon); + fp expm1_u_small[N]; + for (unsigned i = 0; i<N; ++i) { + expm1_u_small[i] = std::expm1(u[i]); + EXPECT_NEAR(u[i], expm1_u_small[i], std::abs(4*u[i]*epsilon)); // just to confirm! + } + expm1(simd(u)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(expm1_u_small, r)); + + fp exprelr_u_small[N]; + for (unsigned i = 0; i<N; ++i) exprelr_u_small[i] = 1; + exprelr(simd(u)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(exprelr_u_small, r)); + + // Test zero result for highly negative exponents. + fill_random(u, rng, 4*exp_min_arg, 2*exp_min_arg); + fp exp_u_very_negative[N]; + for (unsigned i = 0; i<N; ++i) exp_u_very_negative[i] = std::exp(u[i]); + exp(simd(u)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(exp_u_very_negative, r)); + + // Power function: + + // Non-negative base, arbitrary exponent. + fill_random(u, rng, 0., std::exp(1)); + fill_random(v, rng, exp_min_arg, exp_max_arg); + fp pow_u_pos_v[N]; + for (unsigned i = 0; i<N; ++i) pow_u_pos_v[i] = std::pow(u[i], v[i]); + pow(simd(u), simd(v)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(pow_u_pos_v, r)); + + // Arbitrary base, small magnitude integer exponent. 
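+        // (Restricting to integral exponents keeps pow well-defined for the
+        // negative bases generated here; a negative base with a non-integral
+        // exponent would give NaN.)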
+ fill_random(u, rng); + int int_exponent[N]; + fill_random(int_exponent, rng, -2, 2); + for (unsigned i = 0; i<N; ++i) v[i] = int_exponent[i]; + fp pow_u_v_int[N]; + for (unsigned i = 0; i<N; ++i) pow_u_v_int[i] = std::pow(u[i], v[i]); + pow(simd(u), simd(v)).copy_to(r); + EXPECT_TRUE(testing::seq_almost_eq<fp>(pow_u_v_int, r)); + } +} + +// Check special function behaviour for specific values including +// qNAN, infinity etc. + +TYPED_TEST_P(simd_fp_value, exp_special_values) { + using simd = TypeParam; + using fp = typename simd::scalar_type; + constexpr unsigned N = simd::width; + + using limits = std::numeric_limits<fp>; + + constexpr fp inf = limits::infinity(); + constexpr fp eps = limits::epsilon(); + constexpr fp largest = limits::max(); + constexpr fp normal_least = limits::min(); + constexpr fp denorm_least = limits::denorm_min(); + constexpr fp qnan = limits::quiet_NaN(); + + const fp exp_minarg = std::log(normal_least); + const fp exp_maxarg = std::log(largest); + + fp values[] = { inf, -inf, eps, -eps, + eps/2, -eps/2, 0., -0., + 1., -1., 2., -2., + normal_least, denorm_least, -normal_least, -denorm_least, + exp_minarg, exp_maxarg, qnan, -qnan }; + + constexpr unsigned n_values = sizeof(values)/sizeof(fp); + constexpr unsigned n_packed = (n_values+N-1)/N; + fp data[n_packed][N]; + + std::fill((fp *)data, (fp *)data+N*n_packed, fp(0)); + std::copy(std::begin(values), std::end(values), (fp *)data); + + for (unsigned i = 0; i<n_packed; ++i) { + fp expected[N], result[N]; + for (unsigned j = 0; j<N; ++j) { + expected[j] = std::exp(data[i][j]); + } + + simd s(data[i]); + s = exp(s); + s.copy_to(result); + + EXPECT_TRUE(testing::seq_almost_eq<fp>(expected, result)); + } +} + +TYPED_TEST_P(simd_fp_value, expm1_special_values) { + using simd = TypeParam; + using fp = typename simd::scalar_type; + constexpr unsigned N = simd::width; + + using limits = std::numeric_limits<fp>; + + constexpr fp inf = limits::infinity(); + constexpr fp eps = limits::epsilon(); + constexpr fp largest = limits::max(); + constexpr fp normal_least = limits::min(); + constexpr fp denorm_least = limits::denorm_min(); + constexpr fp qnan = limits::quiet_NaN(); + + const fp expm1_minarg = std::nextafter(std::log(eps/4), fp(0)); + const fp expm1_maxarg = std::log(largest); + + fp values[] = { inf, -inf, eps, -eps, + eps/2, -eps/2, 0., -0., + 1., -1., 2., -2., + normal_least, denorm_least, -normal_least, -denorm_least, + expm1_minarg, expm1_maxarg, qnan, -qnan }; + + constexpr unsigned n_values = sizeof(values)/sizeof(fp); + constexpr unsigned n_packed = (n_values+N-1)/N; + fp data[n_packed][N]; + + std::fill((fp *)data, (fp *)data+N*n_packed, fp(0)); + std::copy(std::begin(values), std::end(values), (fp *)data); + + for (unsigned i = 0; i<n_packed; ++i) { + fp expected[N], result[N]; + for (unsigned j = 0; j<N; ++j) { + expected[j] = std::expm1(data[i][j]); + } + + simd s(data[i]); + s = expm1(s); + s.copy_to(result); + + EXPECT_TRUE(testing::seq_almost_eq<fp>(expected, result)); + } +} + +TYPED_TEST_P(simd_fp_value, log_special_values) { + using simd = TypeParam; + using fp = typename simd::scalar_type; + constexpr unsigned N = simd::width; + + using limits = std::numeric_limits<fp>; + + // NOTE: simd log implementations may treat subnormal + // numbers as zero, so omit the denorm_least tests... 
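+    // (If subnormals are flushed to zero, log(denorm_least) evaluates as
+    // log(0) = -inf rather than the large, finite negative value returned by
+    // std::log, so those lanes would fail the comparison below.)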
+ + constexpr fp inf = limits::infinity(); + constexpr fp eps = limits::epsilon(); + constexpr fp largest = limits::max(); + constexpr fp normal_least = limits::min(); + //constexpr fp denorm_least = limits::denorm_min(); + constexpr fp qnan = limits::quiet_NaN(); + + fp values[] = { inf, -inf, eps, -eps, + eps/2, -eps/2, 0., -0., + 1., -1., 2., -2., + //normal_least, denorm_least, -normal_least, -denorm_least, + normal_least, -normal_least, + qnan, -qnan, largest }; + + constexpr unsigned n_values = sizeof(values)/sizeof(fp); + constexpr unsigned n_packed = (n_values+N-1)/N; + fp data[n_packed][N]; + + std::fill((fp *)data, (fp *)data+N*n_packed, fp(0)); + std::copy(std::begin(values), std::end(values), (fp *)data); + + for (unsigned i = 0; i<n_packed; ++i) { + fp expected[N], result[N]; + for (unsigned j = 0; j<N; ++j) { + expected[j] = std::log(data[i][j]); + } + + simd s(data[i]); + s = log(s); + s.copy_to(result); + + EXPECT_TRUE(testing::seq_almost_eq<fp>(expected, result)); + } +} + +REGISTER_TYPED_TEST_CASE_P(simd_fp_value, fp_maths, exp_special_values, expm1_special_values, log_special_values); + +typedef ::testing::Types< + +#ifdef __AVX__ + simd<double, 4, simd_abi::avx>, +#endif +#ifdef __AVX2__ + simd<double, 4, simd_abi::avx2>, +#endif +#ifdef __AVX512F__ + simd<double, 8, simd_abi::avx512>, +#endif + + simd<float, 2, simd_abi::generic>, + simd<double, 4, simd_abi::generic>, + simd<float, 8, simd_abi::generic>, + + simd<double, 4, simd_abi::default_abi>, + simd<double, 8, simd_abi::default_abi> +> simd_fp_test_types; + +INSTANTIATE_TYPED_TEST_CASE_P(S, simd_fp_value, simd_fp_test_types); + +// Gather/scatter tests. + +template <typename A, typename B> +struct simd_and_index { + using simd = A; + using simd_index = B; +}; + +template <typename SI> +struct simd_indirect: public ::testing::Test {}; + +TYPED_TEST_CASE_P(simd_indirect); + +TYPED_TEST_P(simd_indirect, gather) { + using simd = typename TypeParam::simd; + using simd_index = typename TypeParam::simd_index; + + constexpr unsigned N = simd::width; + using scalar = typename simd::scalar_type; + using index = typename simd_index::scalar_type; + + std::minstd_rand rng(1011); + + constexpr std::size_t buflen = 1000; + + for (unsigned i = 0; i<nrounds; ++i) { + scalar array[buflen]; + index indirect[N]; + + fill_random(array, rng); + fill_random(indirect, rng, 0, (int)(buflen-1)); + + simd s; + s.gather(array, simd_index(indirect)); + + scalar test[N]; + for (unsigned j = 0; j<N; ++j) { + test[j] = array[indirect[j]]; + } + + EXPECT_TRUE(::testing::indexed_eq_n(N, test, s)); + } +} + +TYPED_TEST_P(simd_indirect, masked_gather) { + using simd = typename TypeParam::simd; + using simd_index = typename TypeParam::simd_index; + using simd_mask = typename simd::simd_mask; + + constexpr unsigned N = simd::width; + using scalar = typename simd::scalar_type; + using index = typename simd_index::scalar_type; + + std::minstd_rand rng(1011); + + constexpr std::size_t buflen = 1000; + + for (unsigned i = 0; i<nrounds; ++i) { + scalar array[buflen], original[N], test[N]; + index indirect[N]; + bool mask[N]; + + fill_random(array, rng); + fill_random(original, rng); + fill_random(indirect, rng, 0, (int)(buflen-1)); + fill_random(mask, rng); + + for (unsigned j = 0; j<N; ++j) { + test[j] = mask[j]? 
array[indirect[j]]: original[j]; + } + + simd s(original); + simd_mask m(mask); + where(m, s).gather(array, simd_index(indirect)); + + EXPECT_TRUE(::testing::indexed_eq_n(N, test, s)); + } +} + +TYPED_TEST_P(simd_indirect, scatter) { + using simd = typename TypeParam::simd; + using simd_index = typename TypeParam::simd_index; + + constexpr unsigned N = simd::width; + using scalar = typename simd::scalar_type; + using index = typename simd_index::scalar_type; + + std::minstd_rand rng(1011); + + constexpr std::size_t buflen = 1000; + + for (unsigned i = 0; i<nrounds; ++i) { + scalar array[buflen], test[buflen], values[N]; + index indirect[N]; + + fill_random(array, rng); + fill_random(values, rng); + fill_random(indirect, rng, 0, (int)(buflen-1)); + + for (unsigned j = 0; j<buflen; ++j) { + test[j] = array[j]; + } + for (unsigned j = 0; j<N; ++j) { + test[indirect[j]] = values[j]; + } + + simd s(values); + s.scatter(array, simd_index(indirect)); + + EXPECT_TRUE(::testing::indexed_eq_n(N, test, array)); + } +} + +TYPED_TEST_P(simd_indirect, masked_scatter) { + using simd = typename TypeParam::simd; + using simd_index = typename TypeParam::simd_index; + using simd_mask = typename simd::simd_mask; + + constexpr unsigned N = simd::width; + using scalar = typename simd::scalar_type; + using index = typename simd_index::scalar_type; + + std::minstd_rand rng(1011); + + constexpr std::size_t buflen = 1000; + + for (unsigned i = 0; i<nrounds; ++i) { + scalar array[buflen], test[buflen], values[N]; + index indirect[N]; + bool mask[N]; + + fill_random(array, rng); + fill_random(values, rng); + fill_random(indirect, rng, 0, (int)(buflen-1)); + fill_random(mask, rng); + + for (unsigned j = 0; j<buflen; ++j) { + test[j] = array[j]; + } + for (unsigned j = 0; j<N; ++j) { + if (mask[j]) { test[indirect[j]] = values[j]; } + } + + simd s(values); + simd_mask m(mask); + where(m, s).scatter(array, simd_index(indirect)); + + EXPECT_TRUE(::testing::indexed_eq_n(N, test, array)); + } +} + + +REGISTER_TYPED_TEST_CASE_P(simd_indirect, gather, masked_gather, scatter, masked_scatter); + +typedef ::testing::Types< + +#ifdef __AVX__ + simd_and_index<simd<double, 4, simd_abi::avx>, + simd<int, 4, simd_abi::avx>>, +#endif + +#ifdef __AVX2__ + simd_and_index<simd<double, 4, simd_abi::avx2>, + simd<int, 4, simd_abi::avx2>>, +#endif + +#ifdef __AVX512F__ + simd_and_index<simd<double, 8, simd_abi::avx512>, + simd<int, 8, simd_abi::avx512>>, +#endif + + simd_and_index<simd<float, 4, simd_abi::generic>, + simd<std::int64_t, 4, simd_abi::generic>>, + + simd_and_index<simd<double, 8, simd_abi::generic>, + simd<unsigned, 8, simd_abi::generic>>, + + simd_and_index<simd<double, 4, simd_abi::default_abi>, + simd<int, 4, simd_abi::default_abi>>, + + simd_and_index<simd<double, 8, simd_abi::default_abi>, + simd<int, 8, simd_abi::default_abi>> +> simd_indirect_test_types; + +INSTANTIATE_TYPED_TEST_CASE_P(S, simd_indirect, simd_indirect_test_types);