diff --git a/ATTRIBUTIONS.md b/ATTRIBUTIONS.md
index 80bbea20f8d7b38cc663efc68ab5ce3fcb21baa0..48142a3c8bd1d01bb02c1b6b52ee154a81cfe2a7 100644
--- a/ATTRIBUTIONS.md
+++ b/ATTRIBUTIONS.md
@@ -29,3 +29,27 @@ project.
 BSD License.
 
 http://llvm.org/
+
+## Transcendental intrinsics
+
+The numerical algorithms for the transcendental intrinsics are based on the scalar Cephes library.
+
+http://www.netlib.org/cephes/
+https://github.com/jeremybarnes/cephes (GitHub mirror)
+
+Custom license (as it appears in the original `readme` on the project's page):
+
+>Some software in this archive may be from the book _Methods and
+>Programs for Mathematical Functions_ (Prentice-Hall or Simon & Schuster
+>International, 1989) or from the Cephes Mathematical Library, a
+>commercial product. In either event, it is copyrighted by the author.
+>What you see here may be used freely but it comes with no support or
+>guarantee.
+>
+> The two known misprints in the book are repaired here in the
+>source listings for the gamma function and the incomplete beta
+>integral.
+>
+>
+> Stephen L. Moshier
+> moshier@na-net.ornl.gov
diff --git a/modcc/backends/avx2.hpp b/modcc/backends/avx2.hpp
index 1fbcf848082de751be0f023204ea9cb825366a0c..7aebf61d7f99a335a2582b5bc7ec8518e4ba2029 100644
--- a/modcc/backends/avx2.hpp
+++ b/modcc/backends/avx2.hpp
@@ -5,6 +5,7 @@
 #pragma once
 
 #include "backends/base.hpp"
+#include "util/compat.hpp"
 
 namespace nest {
@@ -23,8 +24,13 @@ struct simd_intrinsics<targetKind::avx2> {
     }
 
     static std::string emit_headers() {
-        return "#include <immintrin.h>";
-    };
+        std::string ret = "#include <immintrin.h>";
+        if (!compat::using_intel_compiler()) {
+            ret += "\n#include <backends/multicore/intrin.hpp>";
+        }
+
+        return ret;
+    }
 
     static std::string emit_simd_width() {
         return "256";
@@ -69,10 +75,20 @@ struct simd_intrinsics<targetKind::avx2> {
         tb << "_mm256_sub_pd(_mm256_set1_pd(0), ";
         break;
     case tok::exp:
-        tb << "_mm256_exp_pd(";
+        if (compat::using_intel_compiler()) {
+            tb << "_mm256_exp_pd(";
+        }
+        else {
+            tb << "nest::mc::multicore::nmc_mm256_exp_pd(";
+        }
         break;
     case tok::log:
-        tb << "_mm256_log_pd(";
+        if (compat::using_intel_compiler()) {
+            tb << "_mm256_log_pd(";
+        }
+        else {
+            tb << "nest::mc::multicore::nmc_mm256_log_pd(";
+        }
         break;
    default:
        throw std::invalid_argument("Unknown unary operator");
@@ -84,7 +100,13 @@ struct simd_intrinsics<targetKind::avx2> {
 
    template<typename B, typename E>
    static void emit_pow(TextBuffer& tb, const B& base, const E& exp) {
-        tb << "_mm256_pow_pd(";
+        if (compat::using_intel_compiler()) {
+            tb << "_mm256_pow_pd(";
+        }
+        else {
+            tb << "nest::mc::multicore::nmc_mm256_pow_pd(";
+        }
+
        emit_operands(tb, arg_emitter(base), arg_emitter(exp));
        tb << ")";
    }
diff --git a/modcc/simd_printer.hpp b/modcc/simd_printer.hpp
index 849ac2da18a08320147877baecb0d8010f2c8725..ef1a0e3ff19c68756b3a4ee1111e3aa753d66387 100644
--- a/modcc/simd_printer.hpp
+++ b/modcc/simd_printer.hpp
@@ -9,6 +9,12 @@
 #include "options.hpp"
 #include "textbuffer.hpp"
 
+#ifdef __GNUC__
+#   define ANNOT_UNUSED "__attribute__((unused))"
+#else
+#   define ANNOT_UNUSED ""
+#endif
+
 
 using namespace nest::mc;
 
@@ -111,13 +117,13 @@ void SimdPrinter<Arch>::visit(APIMethod *e) {
     text_.increase_indentation();
 
     // First emit the raw pointer of node_index_ and vec_ci_
-    text_.add_line("constexpr size_t simd_width = " +
+    text_.add_line("constexpr int simd_width = " +
                    simd_backend::emit_simd_width() +
                    " / (CHAR_BIT*sizeof(value_type));");
-    text_.add_line("const size_type* " + emit_rawptr_name("node_index_") +
-                   " = node_index_.data();");
node_index_.data();"); - text_.add_line("const size_type* " + emit_rawptr_name("vec_ci_") + - " = vec_ci_.data();"); + text_.add_line("const size_type " ANNOT_UNUSED " *" + + emit_rawptr_name("node_index_") +" = node_index_.data();"); + text_.add_line("const size_type " ANNOT_UNUSED " *" + + emit_rawptr_name("vec_ci_") + " = vec_ci_.data();"); text_.add_line(); // create local indexed views @@ -197,7 +203,7 @@ void SimdPrinter<Arch>::emit_indexed_view_simd(LocalVariable* var, if (var->is_read()) text_ << "const "; - text_ << "value_type* "; + text_ << "value_type " ANNOT_UNUSED " *"; decls.insert(raw_index_name); text_ << raw_index_name << " = " << emit_member_name(index_name) << ".data()"; @@ -219,7 +225,7 @@ void SimdPrinter<Arch>::emit_indexed_view_simd(LocalVariable* var, if (var->is_read()) text_ << "const "; - text_ << "value_type* "; + text_ << "value_type " ANNOT_UNUSED " *"; decls.insert(ion_var_names.second); text_ << ion_var_names.second << " = " << iname << "." << name << ".data()"; @@ -265,7 +271,7 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e, if (declared_ion_vars.find(vindex_name) == declared_ion_vars.cend()) { declared_ion_vars.insert(vindex_name); text_.add_gutter(); - text_ << simd_backend::emit_index_type() << " " + text_ << simd_backend::emit_index_type() << " " ANNOT_UNUSED " " << vindex_name << " = "; // FIXME: cast should better go inside `emit_load_index()` simd_backend::emit_load_index( @@ -280,9 +286,9 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e, if (declared_ion_vars.find(vci_name) == declared_ion_vars.cend()) { declared_ion_vars.insert(vci_name); text_.add_gutter(); - text_ << simd_backend::emit_index_type() << " " + text_ << simd_backend::emit_index_type() << " " ANNOT_UNUSED " " << vci_name << " = "; - simd_backend::emit_gather_index(text_, "(int *)"+ci_ptr_name, vindex_name, "sizeof(size_type)"); + simd_backend::emit_gather_index(text_, "(int *)" + ci_ptr_name, vindex_name, "sizeof(size_type)"); text_.end_line(";"); } } @@ -408,6 +414,9 @@ void SimdPrinter<Arch>::visit(CellIndexedVariable *e) { std::string vindex_name, value_name; vindex_name = emit_vtmp_name("vec_ci_"); +#ifdef __GNUC__ + vindex_name += " __attribute__((unused))"; +#endif value_name = emit_rawptr_name(e->index_name()); simd_backend::emit_gather(text_, vindex_name, value_name, "sizeof(value_type)"); diff --git a/src/backends/multicore/intrin.hpp b/src/backends/multicore/intrin.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7f6de29b26b820d823b14678e561416b17d0f225 --- /dev/null +++ b/src/backends/multicore/intrin.hpp @@ -0,0 +1,408 @@ +// +// Custom transcendental intrinsics +// +// Implementation inspired by the Cephes library: +// - http://www.netlib.org/cephes/ + +#pragma once + +#include <iostream> +#include <immintrin.h> + +namespace nest { +namespace mc { +namespace multicore { + +namespace detail { + +constexpr double exp_limit = 708; + +// P/Q polynomial coefficients for the exponential function +constexpr double P0exp = 9.99999999999999999910E-1; +constexpr double P1exp = 3.02994407707441961300E-2; +constexpr double P2exp = 1.26177193074810590878E-4; + +constexpr double Q0exp = 2.00000000000000000009E0; +constexpr double Q1exp = 2.27265548208155028766E-1; +constexpr double Q2exp = 2.52448340349684104192E-3; +constexpr double Q3exp = 3.00198505138664455042E-6; + +// P/Q polynomial coefficients for the log function +constexpr double P0log = 7.70838733755885391666E0; +constexpr double P1log = 1.79368678507819816313e1; +constexpr double P2log = 
1.44989225341610930846e1; +constexpr double P3log = 4.70579119878881725854e0; +constexpr double P4log = 4.97494994976747001425e-1; +constexpr double P5log = 1.01875663804580931796e-4; + +constexpr double Q0log = 2.31251620126765340583E1; +constexpr double Q1log = 7.11544750618563894466e1; +constexpr double Q2log = 8.29875266912776603211e1; +constexpr double Q3log = 4.52279145837532221105e1; +constexpr double Q4log = 1.12873587189167450590e1; +constexpr double ln2inv = 1.4426950408889634073599; // 1/ln(2) + +// C1 + C2 = ln(2) +constexpr double C1 = 6.93145751953125E-1; +constexpr double C2 = 1.42860682030941723212E-6; + +// C4 - C3 = ln(2) +constexpr double C3 = 2.121944400546905827679e-4; +constexpr double C4 = 0.693359375; + +constexpr uint64_t dmant_mask = ((1UL<<52) - 1) | (1UL << 63); // mantissa + sign +constexpr uint64_t dexp_mask = ((1UL<<11) - 1) << 52; +constexpr int exp_bias = 1023; +constexpr double dsqrth = 0.70710678118654752440; + +// Useful constants in vector registers +const __m256d nmc_m256d_zero = _mm256_set1_pd(0.0); +const __m256d nmc_m256d_one = _mm256_set1_pd(1.0); +const __m256d nmc_m256d_two = _mm256_set1_pd(2.0); +const __m256d nmc_m256d_nan = _mm256_set1_pd(std::numeric_limits<double>::quiet_NaN()); +const __m256d nmc_m256d_inf = _mm256_set1_pd(std::numeric_limits<double>::infinity()); +const __m256d nmc_m256d_ninf = _mm256_set1_pd(-std::numeric_limits<double>::infinity()); +} + +static void nmc_mm256_print_pd(__m256d x, const char *name) __attribute__ ((unused)); +static void nmc_mm256_print_epi32(__m128i x, const char *name) __attribute__ ((unused)); +static void nmc_mm256_print_epi64x(__m256i x, const char *name) __attribute__ ((unused)); +static __m256d nmc_mm256_exp_pd(__m256d x) __attribute__ ((unused)); +static __m256d nmc_mm256_subnormal_pd(__m256d x) __attribute__ ((unused)); +static __m256d nmc_mm256_frexp_pd(__m256d x, __m128i *e) __attribute__ ((unused)); +static __m256d nmc_mm256_log_pd(__m256d x) __attribute__ ((unused)); +static __m256d nmc_mm256_pow_pd(__m256d x, __m256d y) __attribute__ ((unused)); + +void nmc_mm256_print_pd(__m256d x, const char *name) { + double *val = (double *) &x; + std::cout << name << " = { "; + for (size_t i = 0; i < 4; ++i) { + std::cout << val[i] << " "; + } + + std::cout << "}\n"; +} + +void nmc_mm256_print_epi32(__m128i x, const char *name) { + int *val = (int *) &x; + std::cout << name << " = { "; + for (size_t i = 0; i < 4; ++i) { + std::cout << val[i] << " "; + } + + std::cout << "}\n"; +} + +void nmc_mm256_print_epi64x(__m256i x, const char *name) { + uint64_t *val = (uint64_t *) &x; + std::cout << name << " = { "; + for (size_t i = 0; i < 4; ++i) { + std::cout << val[i] << " "; + } + + std::cout << "}\n"; +} + +// +// Calculates exponential using AVX2 instructions +// +// Exponential is calculated as follows: +// e^x = e^g * 2^n, +// +// where g in [-0.5, 0.5) and n is an integer. 
2^n can be computed cheaply with a bit shift, whereas e^g is approximated
+// using the following Padé form:
+//
+//    e^g = 1 + 2*g*P(g^2) / (Q(g^2) - g*P(g^2))
+//
+// The exponents n and g are calculated using the following formulas:
+//
+//    n = floor(x/ln(2) + 0.5)
+//    g = x - n*ln(2)
+//
+// They can be derived as follows:
+//
+//    e^x = 2^(x/ln(2))
+//        = 2^-0.5 * 2^(x/ln(2) + 0.5)
+//        = 2^(r'-0.5) * 2^floor(x/ln(2) + 0.5)   (1)
+//
+// Setting n = floor(x/ln(2) + 0.5) and
+//
+//    r' = x/ln(2) + 0.5 - n, with r' in [0, 1),
+//
+// and substituting r' in (1) gives us
+//
+//    e^x = 2^(x/ln(2) - n) * 2^n, where x/ln(2) - n = r'-0.5 is in [-0.5, 0.5)
+//        = e^(x-n*ln(2)) * 2^n
+//        = e^g * 2^n, where g = x - n*ln(2)   (2)
+//
+// NOTE: The calculation of ln(2) in (2) is split into two constants to
+// compensate for rounding errors:
+//
+//    ln(2) = C1 + C2, where
+//
+//    C1 = floor(2^k*ln(2))/2^k
+//    C2 = ln(2) - C1
+//
+// We use k=32, since this is what the Cephes library has done historically.
+// Theoretically, we could use k=52 to match the IEEE-754 double precision,
+// but the standard library does not appear to do that, so using it would
+// introduce differences from std::exp() for large exponents.
+//
+__m256d nmc_mm256_exp_pd(__m256d x) {
+    __m256d x_orig = x;
+
+    __m256d px = _mm256_floor_pd(
+        _mm256_add_pd(
+            _mm256_mul_pd(_mm256_set1_pd(detail::ln2inv), x),
+            _mm256_set1_pd(0.5)
+        )
+    );
+
+    __m128i n = _mm256_cvtpd_epi32(px);
+
+    x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C1)));
+    x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C2)));
+
+    __m256d xx = _mm256_mul_pd(x, x);
+
+    // Compute the P and Q polynomials.
+
+    // Polynomials are computed in factorized (Horner) form in order to reduce
+    // the total number of operations:
+    //
+    // P(x) = P0 + P1*x + P2*x^2 = P0 + x*(P1 + x*P2)
+    // Q(x) = Q0 + Q1*x + Q2*x^2 + Q3*x^3 = Q0 + x*(Q1 + x*(Q2 + x*Q3))
+
+    // Compute x*P(x**2)
+    px = _mm256_set1_pd(detail::P2exp);
+    px = _mm256_mul_pd(px, xx);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1exp));
+    px = _mm256_mul_pd(px, xx);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0exp));
+    px = _mm256_mul_pd(px, x);
+
+    // Compute Q(x**2)
+    __m256d qx = _mm256_set1_pd(detail::Q3exp);
+    qx = _mm256_mul_pd(qx, xx);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2exp));
+    qx = _mm256_mul_pd(qx, xx);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1exp));
+    qx = _mm256_mul_pd(qx, xx);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0exp));
+
+    // Compute 1 + 2*x*P(x**2) / (Q(x**2) - x*P(x**2))
+    x = _mm256_div_pd(px, _mm256_sub_pd(qx, px));
+    x = _mm256_add_pd(detail::nmc_m256d_one,
+                      _mm256_mul_pd(detail::nmc_m256d_two, x));
+
+    // Finally, compute x *= 2**n by writing the biased exponent directly
+    __m256i n64 = _mm256_cvtepi32_epi64(n);
+    n64 = _mm256_add_epi64(n64, _mm256_set1_epi64x(detail::exp_bias));
+    n64 = _mm256_sll_epi64(n64, _mm_set_epi64x(0, 52));
+    x = _mm256_mul_pd(x, _mm256_castsi256_pd(n64));
+
+    // Treat exceptional cases
+    __m256d is_large = _mm256_cmp_pd(
+        x_orig, _mm256_set1_pd(detail::exp_limit), 30 /* _CMP_GT_OQ */
+    );
+    __m256d is_small = _mm256_cmp_pd(
+        x_orig, _mm256_set1_pd(-detail::exp_limit), 17 /* _CMP_LT_OQ */
+    );
+    __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ );
+
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_inf, is_large);
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_zero, is_small);
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_nan, is_nan);
+    return x;
+}
+
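+// A minimal usage sketch (editor's illustration, not part of the original
+// patch); the `vals`/`out` names are assumptions for the example only:
+//
+//     double vals[4] = {-1.0, 0.0, 0.5, 2.0};
+//     __m256d vx = _mm256_loadu_pd(vals);
+//     __m256d vy = nmc_mm256_exp_pd(vx);
+//     double out[4];
+//     _mm256_storeu_pd(out, vy);
+//     // out[i] now approximates std::exp(vals[i]) to within a few ulp.
+//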
+__m256d nmc_mm256_subnormal_pd(__m256d x) {
+    __m256i x_raw = _mm256_castpd_si256(x);
+    __m256i exp_mask = _mm256_set1_epi64x(detail::dexp_mask);
+    __m256d x_exp = _mm256_castsi256_pd(_mm256_and_si256(x_raw, exp_mask));
+
+    // Subnormals have a zero exponent field
+    return _mm256_cmp_pd(x_exp, detail::nmc_m256d_zero, 0 /* _CMP_EQ_OQ */);
+}
+
+__m256d nmc_mm256_frexp_pd(__m256d x, __m128i *e) {
+    __m256i exp_mask  = _mm256_set1_epi64x(detail::dexp_mask);
+    __m256i mant_mask = _mm256_set1_epi64x(detail::dmant_mask);
+
+    __m256d x_orig = x;
+
+    // We will work on the raw bits of x
+    __m256i x_raw = _mm256_castpd_si256(x);
+    __m256i x_exp = _mm256_and_si256(x_raw, exp_mask);
+    x_exp = _mm256_srli_epi64(x_exp, 52);
+
+    // We need bias-1, since frexp() returns fractions in (-1, -0.5], [0.5, 1)
+    x_exp = _mm256_sub_epi64(x_exp, _mm256_set1_epi64x(detail::exp_bias-1));
+
+    // IEEE-754 doubles are in 1.<mantissa> form, but frexp() must return a
+    // fraction in (-1, -0.5], [0.5, 1). We convert x in place by forcing its
+    // exponent field to 2^-1, i.e. the biased value 1022.
+    __m256i x_ret = _mm256_and_si256(x_raw, mant_mask);
+
+    __m256i exp_bits = _mm256_slli_epi64(_mm256_set1_epi64x(detail::exp_bias-1), 52);
+    x_ret = _mm256_or_si256(x_ret, exp_bits);
+    x = _mm256_castsi256_pd(x_ret);
+
+    // Treat special cases
+    __m256d is_zero = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_zero, 0 /* _CMP_EQ_OQ */
+    );
+    __m256d is_inf = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_inf, 0 /* _CMP_EQ_OQ */
+    );
+    __m256d is_ninf = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_ninf, 0 /* _CMP_EQ_OQ */
+    );
+    __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ );
+
+    // Denormalized numbers have a zero exponent field; here we expect to see
+    // -1022, since we have already rebased the exponent by bias-1
+    __m256i is_denorm = _mm256_cmpeq_epi64(x_exp, _mm256_set1_epi64x(-1022));
+
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_zero, is_zero);
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_inf, is_inf);
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_ninf, is_ninf);
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_nan, is_nan);
+
+    // FIXME: We treat denormalized numbers as zero here
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_zero,
+                         _mm256_castsi256_pd(is_denorm));
+    x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0), is_denorm);
+
+    x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0),
+                               _mm256_castpd_si256(is_zero));
+
+    // "Compress" the four 64-bit exponents into the lower 128 bits, so that
+    // they can be returned safely through *e as four 32-bit integers
+    x_exp = _mm256_permutevar8x32_epi32(
+        x_exp, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)
+    );
+    *e = _mm256_castsi256_si128(x_exp);
+    return x;
+}
+
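+// A lane-wise sketch of the intended behaviour (editor's illustration, not
+// part of the original patch); `v`, `exps` and `frac` are hypothetical names:
+//
+//     __m256d v = _mm256_set1_pd(8.0);
+//     __m128i exps;
+//     __m256d frac = nmc_mm256_frexp_pd(v, &exps);
+//     // Each double lane of frac is 0.5 and each 32-bit lane of exps is 4,
+//     // mirroring std::frexp(8.0, &e), which yields 0.5 with e == 4.
+//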
+//
+// Calculates natural logarithm using AVX2 instructions
+//
+//    ln(x) = ln(x'*2^g) = ln(x') + g*ln(2),
+//
+// where x' is the fraction returned by frexp (x' in [0.5, 1)) and g is an
+// integer. After reducing the argument to z = x' - 1 (or z = 2*x' - 1, see
+// below), ln(1+z) is approximated using the following Padé form:
+//
+//    ln(1+z) = z - 0.5*z^2 + z^3*P(z)/Q(z)
+//
+__m256d nmc_mm256_log_pd(__m256d x) {
+    __m256d x_orig = x;
+    __m128i x_exp;
+
+    // x := x', x_exp := g
+    x = nmc_mm256_frexp_pd(x, &x_exp);
+
+    // Convert x_exp to packed double
+    __m256d dx_exp = _mm256_cvtepi32_pd(x_exp);
+
+    // Mask for the lanes with x < sqrt(2)/2
+    __m256d lt_sqrth = _mm256_cmp_pd(
+        x, _mm256_set1_pd(detail::dsqrth), 17 /* _CMP_LT_OQ */);
+
+    // Adjust the argument and the exponent
+    //      | 2*x - 1; e := e - 1, if x < sqrt(2)/2
+    // x := |
+    //      | x - 1,   otherwise
+
+    // Precompute both branches
+    // 2*x - 1
+    __m256d x2m1 = _mm256_sub_pd(_mm256_add_pd(x, x), detail::nmc_m256d_one);
+
+    // x - 1
+    __m256d xm1 = _mm256_sub_pd(x, detail::nmc_m256d_one);
+
+    // dx_exp - 1
+    __m256d dx_exp_m1 = _mm256_sub_pd(dx_exp, detail::nmc_m256d_one);
+
+    x = _mm256_blendv_pd(xm1, x2m1, lt_sqrth);
+    dx_exp = _mm256_blendv_pd(dx_exp, dx_exp_m1, lt_sqrth);
+
+    // Compute P(x)
+    __m256d px = _mm256_set1_pd(detail::P5log);
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P4log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P3log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P2log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0log));
+
+    // xx := x^2
+    // px := P(x)*x^3
+    __m256d xx = _mm256_mul_pd(x, x);
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_mul_pd(px, xx);
+
+    // Compute Q(x) (monic in x)
+    __m256d qx = x;
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q4log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q3log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0log));
+
+    // x^3*P(x)/Q(x)
+    __m256d ret = _mm256_div_pd(px, qx);
+
+    // Subtract g*C3, the low-order half of the g*ln(2) correction;
+    // C4 - C3 = ln(2), and the g*C4 half is added back below
+    ret = _mm256_sub_pd(
+        ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C3))
+    );
+
+    // -0.5*x^2 + x^3*P(x)/Q(x) - g*C3
+    ret = _mm256_sub_pd(ret, _mm256_mul_pd(_mm256_set1_pd(0.5), xx));
+
+    // x - 0.5*x^2 + x^3*P(x)/Q(x) - g*C3
+    ret = _mm256_add_pd(ret, x);
+
+    // Add g*C4; the net correction is g*(C4 - C3) = g*ln(2)
+    ret = _mm256_add_pd(ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C4)));
+
+    // Treat exceptional cases
+    __m256d is_inf = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_inf, 0 /* _CMP_EQ_OQ */);
+    __m256d is_zero = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_zero, 0 /* _CMP_EQ_OQ */);
+    __m256d is_neg = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_zero, 17 /* _CMP_LT_OQ */);
+    __m256d is_denorm = nmc_mm256_subnormal_pd(x_orig);
+
+    ret = _mm256_blendv_pd(ret, detail::nmc_m256d_inf, is_inf);
+    ret = _mm256_blendv_pd(ret, detail::nmc_m256d_ninf, is_zero);
+
+    // We treat denormalized arguments as zeros, so they also map to -inf
+    ret = _mm256_blendv_pd(ret, detail::nmc_m256d_ninf, is_denorm);
+    ret = _mm256_blendv_pd(ret, detail::nmc_m256d_nan, is_neg);
+    return ret;
+}
+
+// Equivalent to exp(y*log(x)); well defined for x > 0 only, since log of a
+// negative argument yields NaN
+__m256d nmc_mm256_pow_pd(__m256d x, __m256d y) {
+    return nmc_mm256_exp_pd(_mm256_mul_pd(y, nmc_mm256_log_pd(x)));
+}
+
+} // end namespace multicore
+} // end namespace mc
+} // end namespace nest
diff --git a/src/util/compat.hpp b/src/util/compat.hpp
index 4f1c2d34aa24a3b3d277f5a6115123fb7ca9ada7..85d54ad254e5286bc49bb06bfbebda66a4b8ef10 100644
--- a/src/util/compat.hpp
+++ b/src/util/compat.hpp
@@ -7,6 +7,27 @@
 
 namespace compat {
 
+template<int major=0, int minor=0, int patchlevel=0>
+constexpr bool using_intel_compiler() {
+#if defined(__INTEL_COMPILER)
+    return __INTEL_COMPILER >= major*100 + minor &&
+           __INTEL_COMPILER_UPDATE >= patchlevel;
+#else
+    return false;
+#endif
+}
+
+template<int major=0, int minor=0, int patchlevel=0>
+constexpr bool using_gnu_compiler() {
+#if defined(__GNUC__)
+    constexpr int available = __GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__;
+    constexpr int required  = major*10000 + minor*100 + patchlevel;
+    return available >= required;
+#else
+    return false;
+#endif
+}
+
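+// Usage sketch (editor's note, illustrative only; names are assumptions):
+// both checks are usable in constant expressions, e.g.
+//
+//     // True when compiled with GCC >= 5.1.0, false on other compilers:
+//     constexpr bool new_gcc = compat::using_gnu_compiler<5,1>();
+//
+// With no template arguments the checks reduce to "is this compiler ICC/GCC
+// at all?", which is how modcc's AVX2 backend above selects between the SVML
+// intrinsics and the custom nmc_mm256_* replacements.
+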
 // std::end() broken with (at least) xlC 13.1.4.
 template <typename T>
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 3016564404cf1a8d8087a90e5997826fb6e4c204..316e690913811e19763788bb583c9783a027c0a2 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -47,6 +47,7 @@ set(TEST_SOURCES
     test_event_binner.cpp
     test_filter.cpp
     test_fvm_multi.cpp
+    test_intrin.cpp
     test_mc_cell_group.cpp
     test_lexcmp.cpp
     test_mask_stream.cpp
diff --git a/tests/unit/test_intrin.cpp b/tests/unit/test_intrin.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8e96b1e4c32b160b2dcaf8f55582a0d3df554ca1
--- /dev/null
+++ b/tests/unit/test_intrin.cpp
@@ -0,0 +1,118 @@
+#include <cmath>
+#include <limits>
+#include <backends/multicore/intrin.hpp>
+#include <immintrin.h>
+
+#include "../gtest.h"
+
+using namespace nest::mc::multicore;
+
+constexpr double dqnan = std::numeric_limits<double>::quiet_NaN();
+constexpr double dmax  = std::numeric_limits<double>::max();
+constexpr double dmin  = std::numeric_limits<double>::min();
+constexpr double dmin_denorm = std::numeric_limits<double>::denorm_min();
+constexpr double dinf  = std::numeric_limits<double>::infinity();
+
+constexpr double values[] = {
+    -300,  -3,    -2,   -1,
+    1,     2,     3,    600,
+    dqnan, -dinf, dinf, -0.0,
+    dmin,  dmax,  dmin_denorm, +0.0,
+};
+
+TEST(intrin, exp256) {
+    constexpr size_t simd_len = 4;
+
+    __m256d vvalues[] = {
+        _mm256_set_pd(values[3], values[2], values[1], values[0]),
+        _mm256_set_pd(values[7], values[6], values[5], values[4]),
+        _mm256_set_pd(values[11], values[10], values[9], values[8]),
+        _mm256_set_pd(values[15], values[14], values[13], values[12])
+    };
+
+    for (size_t i = 0; i < 16/simd_len; ++i) {
+        __m256d vv = nmc_mm256_exp_pd(vvalues[i]);
+        double *intrin = (double *) &vv;
+        for (size_t j = 0; j < simd_len; ++j) {
+            double v = values[i*simd_len + j];
+            if (std::isnan(v)) {
+                EXPECT_TRUE(std::isnan(intrin[j]));
+            }
+            else {
+                EXPECT_DOUBLE_EQ(std::exp(v), intrin[j]);
+            }
+        }
+    }
+}
+
+TEST(intrin, frexp256) {
+    constexpr size_t simd_len = 4;
+
+    __m256d vvalues[] = {
+        _mm256_set_pd(values[3], values[2], values[1], values[0]),
+        _mm256_set_pd(values[7], values[6], values[5], values[4]),
+        _mm256_set_pd(values[11], values[10], values[9], values[8]),
+        _mm256_set_pd(values[15], values[14], values[13], values[12])
+    };
+
+    for (size_t i = 0; i < 16/simd_len; ++i) {
+        __m128i vexp;
+        __m256d vbase = nmc_mm256_frexp_pd(vvalues[i], &vexp);
+        double *vbase_ = (double *) &vbase;
+        int *vexp_ = (int *) &vexp;
+        for (size_t j = 0; j < simd_len; ++j) {
+            double v = values[i*simd_len + j];
+            if (std::fpclassify(v) == FP_SUBNORMAL) {
+                // FIXME: our implementation treats subnormals as zeros
+                v = 0;
+            }
+
+            int exp;
+            double base = std::frexp(v, &exp);
+            if (std::isnan(v)) {
+                EXPECT_TRUE(std::isnan(vbase_[j]));
+            }
+            else if (std::isinf(v)) {
+                // The returned exponent is implementation-defined for
+                // infinities, so check only the fraction
+                EXPECT_DOUBLE_EQ(base, vbase_[j]);
+            }
+            else {
+                EXPECT_DOUBLE_EQ(base, vbase_[j]);
+                EXPECT_EQ(exp, vexp_[j]);
+            }
+        }
+    }
+}
+
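+// Editor's note: a sketch of a helper that could replace the vvalues[]
+// boilerplate repeated in each test above (illustrative only, not part of
+// the original patch; `load_lane_block` is a hypothetical name):
+//
+//     static __m256d load_lane_block(const double* v, size_t i) {
+//         // _mm256_loadu_pd reads four consecutive doubles, matching the
+//         // _mm256_set_pd(values[4*i+3], ..., values[4*i]) pattern above.
+//         return _mm256_loadu_pd(v + 4*i);
+//     }
+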
+TEST(intrin, log256) {
+    constexpr size_t simd_len = 4;
+
+    __m256d vvalues[] = {
+        _mm256_set_pd(values[3], values[2], values[1], values[0]),
+        _mm256_set_pd(values[7], values[6], values[5], values[4]),
+        _mm256_set_pd(values[11], values[10], values[9], values[8]),
+        _mm256_set_pd(values[15], values[14], values[13], values[12])
+    };
+
+    for (size_t i = 0; i < 16/simd_len; ++i) {
+        __m256d vv = nmc_mm256_log_pd(vvalues[i]);
+        double *intrin = (double *) &vv;
+        for (size_t j = 0; j < simd_len; ++j) {
+            double v = values[i*simd_len + j];
+            if (std::fpclassify(v) == FP_SUBNORMAL) {
+                // FIXME: our implementation treats subnormals as zeros
+                v = 0;
+            }
+            if (v < 0 || std::isnan(v)) {
+                EXPECT_TRUE(std::isnan(intrin[j]));
+            }
+            else {
+                EXPECT_DOUBLE_EQ(std::log(v), intrin[j]);
+            }
+        }
+    }
+}
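+
+// Editor's addition (illustrative sketch, not part of the original patch):
+// nmc_mm256_pow_pd is the one intrinsic the tests above leave unexercised.
+// A test in the same style could look like this, restricted to positive
+// bases, where pow(x, y) = exp(y*log(x)) is well defined. EXPECT_NEAR is
+// used instead of EXPECT_DOUBLE_EQ because the two-stage exp(y*log(x))
+// formulation accumulates the rounding error of both stages.
+
+TEST(intrin, pow256) {
+    constexpr size_t simd_len = 4;
+
+    const double bases[simd_len] = {0.5, 1, 2, 3};
+    const double exps[simd_len]  = {-2, -0.5, 0.5, 2};
+
+    __m256d vb = _mm256_loadu_pd(bases);
+    __m256d ve = _mm256_loadu_pd(exps);
+    __m256d vv = nmc_mm256_pow_pd(vb, ve);
+    double *intrin = (double *) &vv;
+    for (size_t j = 0; j < simd_len; ++j) {
+        double expected = std::pow(bases[j], exps[j]);
+        // Allow a small relative error rather than requiring ulp equality
+        EXPECT_NEAR(expected, intrin[j], 1e-12*std::abs(expected));
+    }
+}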