diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e75a7ede389d829a80f97d578a565de05cabecb..6a8802f9f720933ee4c5a7f7febefeae17d1c19d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -205,11 +205,11 @@ set(ARB_VECTORIZE_TARGET "none" CACHE STRING "CPU target for vectorization {KNL, set_property(CACHE ARB_VECTORIZE_TARGET PROPERTY STRINGS none KNL AVX2 AVX512) if(ARB_VECTORIZE_TARGET STREQUAL "KNL") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_KNL}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_KNL} -DSIMD_KNL") elseif(ARB_VECTORIZE_TARGET STREQUAL "AVX2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX2}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX2} -DSIMD_AVX2") elseif(ARB_VECTORIZE_TARGET STREQUAL "AVX512") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX512}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX512} -DSIMD_AVX512") endif() #---------------------------------------------------------- diff --git a/cmake/CompilerOptions.cmake b/cmake/CompilerOptions.cmake index 1db887eec3f02d68cf0e3e593a4e66ed8ced96b5..40a019bcbcdec70f9ac72d1fd076ea18e558ba72 100644 --- a/cmake/CompilerOptions.cmake +++ b/cmake/CompilerOptions.cmake @@ -36,7 +36,6 @@ if(${CMAKE_CXX_COMPILER_ID} MATCHES "GNU") # Compiler flags for generating KNL-specific AVX512 instructions # supported in gcc 4.9.x and later. set(CXXOPT_KNL "-march=knl") - set(CXXOPT_AVX "-mavx") set(CXXOPT_AVX2 "-mavx2") set(CXXOPT_AVX512 "-mavx512f -mavx512cd") @@ -49,7 +48,6 @@ endif() if(${CMAKE_CXX_COMPILER_ID} MATCHES "Intel") # Compiler flags for generating KNL-specific AVX512 instructions. set(CXXOPT_KNL "-xMIC-AVX512") - set(CXXOPT_AVX "-xAVX") set(CXXOPT_AVX2 "-xCORE-AVX2") set(CXXOPT_AVX512 "-xCORE-AVX512") diff --git a/mechanisms/CMakeLists.txt b/mechanisms/CMakeLists.txt index ac434b0d861d7624a86e9ed6a607422222a7d45f..a4a62da7d38391130cac5bb6dd7e1048218d0663 100644 --- a/mechanisms/CMakeLists.txt +++ b/mechanisms/CMakeLists.txt @@ -1,7 +1,7 @@ include(BuildModules.cmake) # the list of built-in mechanisms to be provided by default -set(mechanisms pas hh expsyn exp2syn test_kin1 test_kinlva test_ca) +set(mechanisms pas hh expsyn exp2syn test_kin1 test_kinlva test_ca nax kdrmt kamt) set(mod_srcdir "${CMAKE_CURRENT_SOURCE_DIR}/mod") diff --git a/mechanisms/mod/hh.mod b/mechanisms/mod/hh.mod index f5acc9fc7c66d283d170f5b11d1ba2da21e039fa..00554aecfba7a35679c90b60c255ae2ee14e5b6f 100644 --- a/mechanisms/mod/hh.mod +++ b/mechanisms/mod/hh.mod @@ -55,7 +55,7 @@ INITIAL { DERIVATIVE states { rates(v) - m' = (minf-m)/mtau + m' = (minf-m)/mtau h' = (hinf-h)/htau n' = (ninf-n)/ntau } @@ -88,13 +88,8 @@ PROCEDURE rates(v) ninf = alpha/sum } -: We don't trap for zero in the denominator like Neuron, because function -: inlining in modparser won't support it. There is a good argument that -: vtrap should be provided as a built in of the language, because -: - it is a common pattern in many mechanisms. -: - it can be implemented efficiently on different back ends if the -: compiler has enough information. FUNCTION vtrap(x,y) { - vtrap = x/(exp(x/y) - 1) + : use built in exprelr(z) = z/(exp(z)-1), which handles the z=0 case correctly + vtrap = y*exprelr(x/y) } diff --git a/mechanisms/mod/kamt.mod b/mechanisms/mod/kamt.mod new file mode 100644 index 0000000000000000000000000000000000000000..d897163b24634d650b969ad28a3665038de7fec3 --- /dev/null +++ b/mechanisms/mod/kamt.mod @@ -0,0 +1,98 @@ +TITLE K-A +: K-A current for Mitral Cells from Wang et al (1996) +: M.Migliore Jan. 
2002 + +: ek must be explicitly set in hoc +: ik has units (mA/cm2) + +NEURON { + THREADSAFE + SUFFIX kamt + USEION k READ ek WRITE ik + RANGE gbar, q10 + GLOBAL minf, mtau, hinf, htau +} + +PARAMETER { + gbar = 0.002 (mho/cm2) + + celsius + a0m=0.04 + vhalfm=-45 + zetam=0.1 + gmm=0.75 + + a0h=0.018 + vhalfh=-70 + zetah=0.2 + gmh=0.99 + + sha=9.9 + shi=5.7 + q10=3 +} + + +UNITS { + (mA) = (milliamp) + (mV) = (millivolt) + (pS) = (picosiemens) + (um) = (micron) +} + +ASSIGNED { + v (mV) + minf + mtau (ms) + hinf + htau (ms) +} + +STATE { + m + h +} + +BREAKPOINT { + SOLVE states METHOD cnexp + ik = gbar*m*h*(v - ek) +} + +INITIAL { + trates(v) + m=minf + h=hinf +} + +DERIVATIVE states { + trates(v) + m' = (minf-m)/mtau + h' = (hinf-h)/htau +} + +PROCEDURE trates(v) { + LOCAL qt + qt=q10^((celsius-24)/10) + + minf = 1/(1 + exp(-(v-sha-7.6)/14)) + mtau = betm(v)/(qt*a0m*(1+alpm(v))) + + hinf = 1/(1 + exp((v-shi+47.4)/6)) + htau = beth(v)/(qt*a0h*(1+alph(v))) +} + +FUNCTION alpm(v) { + alpm = exp(zetam*(v-vhalfm)) +} + +FUNCTION betm(v) { + betm = exp(zetam*gmm*(v-vhalfm)) +} + +FUNCTION alph(v) { + alph = exp(zetah*(v-vhalfh)) +} + +FUNCTION beth(v) { + beth = exp(zetah*gmh*(v-vhalfh)) +} diff --git a/mechanisms/mod/kdrmt.mod b/mechanisms/mod/kdrmt.mod new file mode 100644 index 0000000000000000000000000000000000000000..c61ea5e92ff04265f3bc4631189fdfb30fd8b857 --- /dev/null +++ b/mechanisms/mod/kdrmt.mod @@ -0,0 +1,72 @@ +TITLE K-DR +: K-DR current for Mitral Cells from Wang et al (1996) +: M.Migliore Jan. 2002 + +: ek (mV) must be explicitly def. in hoc +: ik (mA/cm2) + +NEURON { + THREADSAFE + SUFFIX kdrmt + USEION k READ ek WRITE ik + RANGE gbar, q10, vhalfm + GLOBAL minf, mtau +} + +PARAMETER { + gbar = 0.002 (mho/cm2) + + celsius + a0m=0.0035 + vhalfm=-50 + zetam=0.055 + gmm=0.5 + q10=3 + alpm=0 + betm=0 +} + + +UNITS { + (mA) = (milliamp) + (mV) = (millivolt) + (pS) = (picosiemens) + (um) = (micron) +} + +ASSIGNED { + v (mV) + minf + mtau (ms) +} + +STATE { + m +} + +BREAKPOINT { + SOLVE states METHOD cnexp + ik = gbar*m*(v - ek) +} + +INITIAL { + trates(v) + m=minf +} + +DERIVATIVE states { + trates(v) + m' = (minf-m)/mtau +} + +PROCEDURE trates(v) { + LOCAL qt + LOCAL alpm, betm + LOCAL tmp + qt=q10^((celsius-24)/10) + minf = 1/(1 + exp(-(v-21)/10)) + tmp = zetam*(v-vhalfm) + alpm = exp(tmp) + betm = exp(gmm*tmp) + mtau = betm/(qt*a0m*(1+alpm)) +} diff --git a/mechanisms/mod/nax.mod b/mechanisms/mod/nax.mod new file mode 100644 index 0000000000000000000000000000000000000000..2feddda26aca3a6c3f01a06855cec3e11df448c2 --- /dev/null +++ b/mechanisms/mod/nax.mod @@ -0,0 +1,94 @@ +TITLE nax +: Na current for axon. No slow inact. +: M.Migliore Jul. 
1997 +: added sh to account for higher threshold M.Migliore, Apr.2002 + +NEURON { + SUFFIX nax + USEION na READ ena WRITE ina + RANGE gbar, sh +} + +PARAMETER { + sh = 5 (mV) + gbar = 0.010 (mho/cm2) + + tha = -30 (mV) : v 1/2 for act + qa = 7.2 (mV) : act slope (4.5) + Ra = 0.4 (/ms) : open (v) + Rb = 0.124 (/ms) : close (v) + + thi1 = -45 (mV) : v 1/2 for inact + thi2 = -45 (mV) : v 1/2 for inact + qd = 1.5 (mV) : inact tau slope + qg = 1.5 (mV) + mmin=0.02 + hmin=0.5 + q10=2 + Rg = 0.01 (/ms) : inact recov (v) + Rd = .03 (/ms) : inact (v) + + thinf = -50 (mV) : inact inf slope + qinf = 4 (mV) : inact inf slope + + celsius +} + + +UNITS { + (mA) = (milliamp) + (mV) = (millivolt) + (pS) = (picosiemens) + (um) = (micron) +} + +ASSIGNED { + v (mV) + thegna (mho/cm2) + minf + hinf + mtau (ms) + htau (ms) +} + +STATE { + m + h +} + +BREAKPOINT { + SOLVE states METHOD cnexp + thegna = gbar*m*m*m*h + ina = thegna * (v - ena) +} + +INITIAL { + trates(v,sh) + m=minf + h=hinf +} + +DERIVATIVE states { + trates(v,sh) + m' = (minf-m)/mtau + h' = (hinf-h)/htau +} + +PROCEDURE trates(vm,sh2) { + LOCAL a, b, qt + qt=q10^((celsius-24)/10) + a = trap0(vm,tha+sh2,Ra,qa) + b = trap0(-vm,-tha-sh2,Rb,qa) + mtau = max(1/(a+b)/qt, mmin) + minf = a/(a+b) + + a = trap0(vm,thi1+sh2,Rd,qd) + b = trap0(-vm,-thi2-sh2,Rg,qg) + htau = max(1/(a+b)/qt, hmin) + hinf = 1/(1+exp((vm-thinf-sh2)/qinf)) +} + +FUNCTION trap0(v,th,a,q) { + : trap0 = a * (v - th) / (1 - exp(-(v - th)/q)) + trap0 = -a*q*exprelr((v-th)/q) +} diff --git a/modcc/backends/avx2.hpp b/modcc/backends/avx2.hpp index 803890a33378e7b9e174c092ad790b1f44d14e2f..d72f48eac01dd5e89aba417dbf1b9f8d9735ac16 100644 --- a/modcc/backends/avx2.hpp +++ b/modcc/backends/avx2.hpp @@ -21,12 +21,18 @@ struct simd_intrinsics<simdKind::avx2> { } static std::string emit_headers() { + /* std::string ret = "#include <immintrin.h>"; + ret += "\n#include <backends/multicore/intrin.hpp>"; if (!compat::using_intel_compiler()) { ret += "\n#include <backends/multicore/intrin.hpp>"; } - return ret; + */ + + return + "#include <immintrin.h>\n" + "#include <backends/multicore/intrin.hpp>"; } static std::string emit_simd_width() { @@ -57,8 +63,14 @@ struct simd_intrinsics<simdKind::avx2> { case tok::divide: tb << "_mm256_div_pd("; break; + case tok::max: + tb << "_mm256_max_pd("; + break; + case tok::min: + tb << "arb::multicore::arb_mm256_min_pd("; + break; default: - throw std::invalid_argument("Unknown binary operator"); + throw std::invalid_argument("Unable to generate avx2 for binary operator " + token_map[op]); } emit_operands(tb, arg_emitter(arg1), arg_emitter(arg2)); @@ -87,8 +99,14 @@ struct simd_intrinsics<simdKind::avx2> { tb << "arb::multicore::arb_mm256_log_pd("; } break; + case tok::abs: + tb << "arb::multicore::arb_mm256_abs_pd("; + break; + case tok::exprelr: + tb << "arb::multicore::arb_mm256_exprelr_pd("; + break; default: - throw std::invalid_argument("Unknown unary operator"); + throw std::invalid_argument("Unable to generate avx2 for unary operator " + token_map[op]); } emit_operands(tb, arg_emitter(arg)); diff --git a/modcc/backends/avx512.hpp b/modcc/backends/avx512.hpp index 16b05ae1dc53345b5dd177720753f5ea231d7f07..97e3ecbce14a376f07adc40444f70e9cf87f0872 100644 --- a/modcc/backends/avx512.hpp +++ b/modcc/backends/avx512.hpp @@ -21,7 +21,9 @@ struct simd_intrinsics<simdKind::avx512> { } static std::string emit_headers() { - return "#include <immintrin.h>"; + return + "#include <immintrin.h>\n" + "#include <backends/multicore/intrin.hpp>"; }; static std::string 
emit_simd_width() { @@ -29,7 +31,7 @@ struct simd_intrinsics<simdKind::avx512> { } static std::string emit_value_type() { - return "__m512d"; + return "vecd_avx512"; } static std::string emit_index_type() { @@ -41,19 +43,25 @@ struct simd_intrinsics<simdKind::avx512> { const T1& arg1, const T2& arg2) { switch (op) { case tok::plus: - tb << "_mm512_add_pd("; + tb << "add("; break; case tok::minus: - tb << "_mm512_sub_pd("; + tb << "sub("; break; case tok::times: - tb << "_mm512_mul_pd("; + tb << "mul("; break; case tok::divide: - tb << "_mm512_div_pd("; + tb << "div("; + break; + case tok::max: + tb << "max("; + break; + case tok::min: + tb << "min("; break; default: - throw std::invalid_argument("Unknown binary operator"); + throw std::invalid_argument("Unable to generate avx512 for binary operator " + token_map[op]); } emit_operands(tb, arg_emitter(arg1), arg_emitter(arg2)); @@ -64,7 +72,7 @@ struct simd_intrinsics<simdKind::avx512> { static void emit_unary_op(TextBuffer& tb, tok op, const T& arg) { switch (op) { case tok::minus: - tb << "_mm512_sub_pd(_mm512_set1_pd(0), "; + tb << "sub(set(0), "; break; case tok::exp: tb << "_mm512_exp_pd("; @@ -72,8 +80,14 @@ struct simd_intrinsics<simdKind::avx512> { case tok::log: tb << "_mm512_log_pd("; break; + case tok::abs: + tb << "abs("; + break; + case tok::exprelr: + tb << "exprelr("; + break; default: - throw std::invalid_argument("Unknown unary operator"); + throw std::invalid_argument("Unable to generate avx512 for unary operator " + token_map[op]); } emit_operands(tb, arg_emitter(arg)); @@ -139,7 +153,7 @@ struct simd_intrinsics<simdKind::avx512> { template<typename T> static void emit_set_value(TextBuffer& tb, const T& arg) { - tb << "_mm512_set1_pd("; + tb << "set("; emit_operands(tb, arg_emitter(arg)); tb << ")"; } diff --git a/modcc/cexpr_emit.cpp b/modcc/cexpr_emit.cpp index 1b1e8f49d0f5c5fcccc7756506e673f5c16098dd..cb5ee70dd12c7b147e63ffba004e27743bb89242 100644 --- a/modcc/cexpr_emit.cpp +++ b/modcc/cexpr_emit.cpp @@ -26,11 +26,13 @@ void CExprEmitter::visit(UnaryExpression* e) { // Place a space in front of minus sign to avoid invalid // expressions of the form: (v[i]--67) static std::unordered_map<tok, const char*> unaryop_tbl = { - {tok::minus, " -"}, - {tok::exp, "exp"}, - {tok::cos, "cos"}, - {tok::sin, "sin"}, - {tok::log, "log"} + {tok::minus, " -"}, + {tok::exp, "exp"}, + {tok::cos, "cos"}, + {tok::sin, "sin"}, + {tok::log, "log"}, + {tok::abs, "fabs"}, + {tok::exprelr, "exprelr"} }; if (!unaryop_tbl.count(e->op())) { @@ -62,7 +64,7 @@ void CExprEmitter::visit(PowBinaryExpression* e) { emit_as_call("std::pow", e->lhs(), e->rhs()); } -void CExprEmitter::visit(BinaryExpression *e) { +void CExprEmitter::visit(BinaryExpression* e) { static std::unordered_map<tok, const char*> binop_tbl = { {tok::minus, "-"}, {tok::plus, "+"}, @@ -72,7 +74,9 @@ void CExprEmitter::visit(BinaryExpression *e) { {tok::lte, "<="}, {tok::gt, ">"}, {tok::gte, ">="}, - {tok::equality, "=="} + {tok::equality, "=="}, + {tok::min, "min"}, + {tok::max, "max"}, }; if (!binop_tbl.count(e->op())) { @@ -80,33 +84,39 @@ void CExprEmitter::visit(BinaryExpression *e) { "CExprEmitter: unsupported binary operator "+token_string(e->op()), e->location()); } + auto rhs = e->rhs(); + auto lhs = e->lhs(); const char* op_spelling = binop_tbl.at(e->op()); - associativityKind assoc = Lexer::operator_associativity(e->op()); - int op_prec = Lexer::binop_precedence(e->op()); - auto need_paren = [op_prec](Expression* subexpr, bool assoc_side) -> bool { - if (auto b = 
subexpr->is_binary()) { - int sub_prec = Lexer::binop_precedence(b->op()); - return sub_prec<op_prec || (!assoc_side && sub_prec==op_prec); - } - return false; - }; + if (e->is_infix()) { + associativityKind assoc = Lexer::operator_associativity(e->op()); + int op_prec = Lexer::binop_precedence(e->op()); - auto lhs = e->lhs(); - if (need_paren(lhs, assoc==associativityKind::left)) { - emit_as_call("", lhs); - } - else { - lhs->accept(this); - } + auto need_paren = [op_prec](Expression* subexpr, bool assoc_side) -> bool { + if (auto b = subexpr->is_binary()) { + int sub_prec = Lexer::binop_precedence(b->op()); + return sub_prec<op_prec || (!assoc_side && sub_prec==op_prec); + } + return false; + }; - out_ << op_spelling; + if (need_paren(lhs, assoc==associativityKind::left)) { + emit_as_call("", lhs); + } + else { + lhs->accept(this); + } - auto rhs = e->rhs(); - if (need_paren(rhs, assoc==associativityKind::right)) { - emit_as_call("", rhs); + out_ << op_spelling; + + if (need_paren(rhs, assoc==associativityKind::right)) { + emit_as_call("", rhs); + } + else { + rhs->accept(this); + } } else { - rhs->accept(this); + emit_as_call(op_spelling, lhs, rhs); } } diff --git a/modcc/cprinter.cpp b/modcc/cprinter.cpp index bb855053f791acbe171c6423ebb1917fe12433e7..72999c30c9cb3e52e8e8c96c78c9d3899ae27df4 100644 --- a/modcc/cprinter.cpp +++ b/modcc/cprinter.cpp @@ -39,6 +39,10 @@ std::string CPrinter::emit_source() { text_.add_line("namespace arb { namespace multicore {"); text_.add_line(); + text_.add_line("using math::exprelr;"); + text_.add_line("using math::min;"); + text_.add_line("using math::max;"); + text_.add_line(); text_.add_line("template<class Backend>"); text_.add_line("class " + class_name + " : public mechanism<Backend> {"); text_.add_line("public:"); @@ -462,6 +466,7 @@ void CPrinter::emit_headers() { text_.add_line("#include <cmath>"); text_.add_line("#include <limits>"); text_.add_line(); + text_.add_line("#include <math.hpp>"); text_.add_line("#include <mechanism.hpp>"); text_.add_line("#include <algorithms.hpp>"); text_.add_line("#include <backends/event.hpp>"); diff --git a/modcc/expression.cpp b/modcc/expression.cpp index dbd6ae357c363f339425f0937cc8c11dd1da0ee5..8a9da2a17ff199e3b7f87b07533fe0b6bdf222bb 100644 --- a/modcc/expression.cpp +++ b/modcc/expression.cpp @@ -397,6 +397,7 @@ void CallExpression::semantic(scope_ptr scp) { if(!s) { error(pprintf("there is no function or procedure named '%' ", yellow(spelling_))); + return; } if(s->kind()==symbolKind::local_variable || s->kind()==symbolKind::variable) { error(pprintf("the symbol '%' refers to a variable, but it is being" @@ -698,7 +699,7 @@ void AssignmentExpression::semantic(scope_ptr scp) { if(!lhs_->has_error() && !lhs_->is_lvalue()) { error("the left hand side of an assignment must be an lvalue"); } - if(rhs_->is_procedure_call()) { + if(!rhs_->has_error() && rhs_->is_procedure_call()) { error("procedure calls can't be made in an expression"); } } @@ -933,6 +934,12 @@ void ExpUnaryExpression::accept(Visitor *v) { void LogUnaryExpression::accept(Visitor *v) { v->visit(this); } +void AbsUnaryExpression::accept(Visitor *v) { + v->visit(this); +} +void ExprelrUnaryExpression::accept(Visitor *v) { + v->visit(this); +} void CosUnaryExpression::accept(Visitor *v) { v->visit(this); } @@ -969,6 +976,12 @@ void MulBinaryExpression::accept(Visitor *v) { void DivBinaryExpression::accept(Visitor *v) { v->visit(this); } +void MinBinaryExpression::accept(Visitor *v) { + v->visit(this); +} +void MaxBinaryExpression::accept(Visitor 
*v) { + v->visit(this); +} void PowBinaryExpression::accept(Visitor *v) { v->visit(this); } @@ -995,6 +1008,10 @@ expression_ptr unary_expression( Location loc, return make_expression<SinUnaryExpression>(loc, std::move(e)); case tok::log : return make_expression<LogUnaryExpression>(loc, std::move(e)); + case tok::abs : + return make_expression<AbsUnaryExpression>(loc, std::move(e)); + case tok::exprelr : + return make_expression<ExprelrUnaryExpression>(loc, std::move(e)); default : std::cerr << yellow(token_string(op)) << " is not a valid unary operator" @@ -1039,6 +1056,14 @@ expression_ptr binary_expression(Location loc, return make_expression<DivBinaryExpression>( loc, std::move(lhs), std::move(rhs) ); + case tok::min : + return make_expression<MinBinaryExpression>( + loc, std::move(lhs), std::move(rhs) + ); + case tok::max : + return make_expression<MaxBinaryExpression>( + loc, std::move(lhs), std::move(rhs) + ); case tok::pow : return make_expression<PowBinaryExpression>( loc, std::move(lhs), std::move(rhs) diff --git a/modcc/expression.hpp b/modcc/expression.hpp index 704f361859ab2094d23b3f0e62bd4819f3aba758..db25ca0d2825edda167e12c1274d129948b5a157 100644 --- a/modcc/expression.hpp +++ b/modcc/expression.hpp @@ -16,48 +16,39 @@ class Visitor; class Expression; -class IdentifierExpression; +class CallExpression; class BlockExpression; -class InitialBlock; class IfExpression; -class VariableExpression; -class AbstractIndexedVariable; -class IndexedVariable; -class CellIndexedVariable; -class NumberExpression; -class IntegerExpression; class LocalDeclaration; class ArgumentExpression; +class FunctionExpression; class DerivativeExpression; class PrototypeExpression; -class CallExpression; class ProcedureExpression; -class NetReceiveExpression; -class APIMethod; -class FunctionExpression; -class UnaryExpression; -class NegUnaryExpression; -class ExpUnaryExpression; -class LogUnaryExpression; -class CosUnaryExpression; -class SinUnaryExpression; +class IdentifierExpression; +class NumberExpression; +class IntegerExpression; class BinaryExpression; +class UnaryExpression; class AssignmentExpression; +class ConserveExpression; class ReactionExpression; class StoichExpression; class StoichTermExpression; -class ConserveExpression; -class AddBinaryExpression; -class SubBinaryExpression; -class MulBinaryExpression; -class DivBinaryExpression; -class PowBinaryExpression; class ConditionalExpression; +class InitialBlock; class SolveExpression; -class ConductanceExpression; class Symbol; +class ConductanceExpression; +class PDiffExpression; +class VariableExpression; +class ProcedureExpression; +class NetReceiveExpression; +class APIMethod; +class AbstractIndexedVariable; +class IndexedVariable; +class CellIndexedVariable; class LocalVariable; -class PDiffExpression; // not parsed; possible result of symbolic differentiation using expression_ptr = std::unique_ptr<Expression>; using symbol_ptr = std::unique_ptr<Symbol>; @@ -1243,6 +1234,27 @@ public: void accept(Visitor *v) override; }; +// absolute value unary expression, i.e. abs(x) +class AbsUnaryExpression : public UnaryExpression { +public: + AbsUnaryExpression(Location loc, expression_ptr e) + : UnaryExpression(loc, tok::abs, std::move(e)) + {} + + void accept(Visitor *v) override; +}; + +// exprel reciprocal unary expression, +// i.e. 
x/(exp(x)-1)=x/expm1(x) with exprelr(0)=1 +class ExprelrUnaryExpression : public UnaryExpression { +public: + ExprelrUnaryExpression(Location loc, expression_ptr e) + : UnaryExpression(loc, tok::exprelr, std::move(e)) + {} + + void accept(Visitor *v) override; +}; + // cosine unary expression, i.e. cos(x) class CosUnaryExpression : public UnaryExpression { public: @@ -1285,6 +1297,7 @@ public: {} tok op() const {return op_;} + virtual bool is_infix() const {return true;} Expression* lhs() {return lhs_.get();} Expression* rhs() {return rhs_.get();} const Expression* lhs() const {return lhs_.get();} @@ -1361,6 +1374,30 @@ public: void accept(Visitor *v) override; }; +class MinBinaryExpression : public BinaryExpression { +public: + MinBinaryExpression(Location loc, expression_ptr&& lhs, expression_ptr&& rhs) + : BinaryExpression(loc, tok::min, std::move(lhs), std::move(rhs)) + {} + + // min is a prefix binop + bool is_infix() const override {return false;} + + void accept(Visitor *v) override; +}; + +class MaxBinaryExpression : public BinaryExpression { +public: + MaxBinaryExpression(Location loc, expression_ptr&& lhs, expression_ptr&& rhs) + : BinaryExpression(loc, tok::max, std::move(lhs), std::move(rhs)) + {} + + // max is a prefix binop + bool is_infix() const override {return false;} + + void accept(Visitor *v) override; +}; + class PowBinaryExpression : public BinaryExpression { public: PowBinaryExpression(Location loc, expression_ptr&& lhs, expression_ptr&& rhs) diff --git a/modcc/lexer.cpp b/modcc/lexer.cpp index f0bc0e19d127eba7ccccb0f674342c457178a70b..48a4e426f72cf1e167d4538fab7fa731216ff628 100644 --- a/modcc/lexer.cpp +++ b/modcc/lexer.cpp @@ -354,6 +354,7 @@ void Lexer::binop_prec_init() { return; // I have taken the operator precedence from C++ + // Note that only infix operators require precidence. binop_prec_[tok::eq] = 2; binop_prec_[tok::equality] = 4; binop_prec_[tok::ne] = 4; diff --git a/modcc/parser.cpp b/modcc/parser.cpp index cb45d2547d7a3cbdb36c6916f119f7202f979c76..2c52fc4c2600cb4deaa958c48f48393a4ba0388b 100644 --- a/modcc/parser.cpp +++ b/modcc/parser.cpp @@ -1,4 +1,3 @@ -#include <iostream> #include <list> #include <cstring> @@ -1159,8 +1158,12 @@ expression_ptr Parser::parse_expression(int prec) { auto op = token_; auto p_op = binop_precedence(op.type); + // Note: all tokens that are not infix binary operators have + // precidence of -1, so expressions like function calls will short + // circuit this loop here. 
if(p_op<=prec) return lhs; - get_token(); + + get_token(); // consume the infix binary operator lhs = parse_binop(std::move(lhs), op); if(!lhs) return nullptr; @@ -1192,10 +1195,12 @@ expression_ptr Parser::parse_unaryop() { e = parse_unaryop(); // handle recursive unary if(!e) return nullptr; return unary_expression(token_.location, op.type, std::move(e)); - case tok::exp : - case tok::sin : - case tok::cos : - case tok::log : + case tok::exp : + case tok::sin : + case tok::cos : + case tok::log : + case tok::abs : + case tok::exprelr: get_token(); // consume operator (exp, sin, cos or log) if(token_.type!=tok::lparen) { error( "missing parenthesis after call to " @@ -1217,6 +1222,7 @@ expression_ptr Parser::parse_unaryop() { /// :: identifier /// :: call /// :: parenthesis expression (parsed recursively) +/// :: prefix binary operators expression_ptr Parser::parse_primary() { switch(token_.type) { case tok::real: @@ -1230,6 +1236,35 @@ expression_ptr Parser::parse_primary() { return parse_identifier(); case tok::lparen: return parse_parenthesis_expression(); + case tok::min : + case tok::max : + { + auto op = token_; + // handle infix binary operators, e.g. min(l,r) and max(l,r) + get_token(); // consume operator keyword token + if (token_.type!=tok::lparen) { + error("expected opening parenthesis '('"); + return nullptr; + } + get_token(); // consume ( + auto lhs = parse_expression(); + if (!lhs) return nullptr; + + if (token_.type!=tok::comma) { + error("expected comma ','"); + return nullptr; + } + get_token(); // consume , + + auto rhs = parse_expression(); + if (!rhs) return nullptr; + if (token_.type!=tok::rparen) { + error("expected closing parenthesis ')'"); + return nullptr; + } + get_token(); // consume ) + return binary_expression(op.location, op.type, std::move(lhs), std::move(rhs)); + } default: // fall through to return nullptr at end of function error( pprintf( "unexpected token '%' in expression", yellow(token_.spelling) )); diff --git a/modcc/token.cpp b/modcc/token.cpp index 25d08da9d6060ed3767042b94de0388927f02395..1ea8d27abf2a4f20305661e995ee57ac1e961c29 100644 --- a/modcc/token.cpp +++ b/modcc/token.cpp @@ -53,10 +53,14 @@ static Keyword keywords[] = { {"else", tok::else_stmt}, {"cnexp", tok::cnexp}, {"sparse", tok::sparse}, + {"min", tok::min}, + {"max", tok::max}, {"exp", tok::exp}, {"sin", tok::sin}, {"cos", tok::cos}, {"log", tok::log}, + {"abs", tok::abs}, + {"exprelr", tok::exprelr}, {"CONDUCTANCE", tok::conductance}, {nullptr, tok::reserved}, }; @@ -116,8 +120,12 @@ static TokenString token_strings[] = { {"if", tok::if_stmt}, {"else", tok::else_stmt}, {"eof", tok::eof}, + {"min", tok::min}, + {"max", tok::max}, {"exp", tok::exp}, {"log", tok::log}, + {"abs", tok::abs}, + {"exprelr", tok::exprelr}, {"cos", tok::cos}, {"sin", tok::sin}, {"cnexp", tok::cnexp}, diff --git a/modcc/token.hpp b/modcc/token.hpp index fcafd2e1d47930859fc31f498488fbf3118409cc..3fd5bb0b33e561d930357126b88af18d1560cfa6 100644 --- a/modcc/token.hpp +++ b/modcc/token.hpp @@ -12,6 +12,9 @@ enum class tok { ///////////////////////////// // symbols ///////////////////////////// + + // infix binary ops + // = + - * / ^ eq, plus, minus, times, divide, pow, // comparison @@ -62,8 +65,12 @@ enum class tok { threadsafe, global, point_process, + // prefix binary operators + min, max, + // unary operators - exp, sin, cos, log, + exp, sin, cos, log, abs, + exprelr, // equivalent to x/(exp(x)-1) with exprelr(0)=1 // logical keywords if_stmt, else_stmt, // add _stmt to avoid clash with c++ keywords diff 
--git a/src/backends/gpu/fvm.cpp b/src/backends/gpu/fvm.cpp index 550962e06fba440e573a60e4bcdee1d92365056b..e70b30cfe6c566429349e98267f149138a3c5cbf 100644 --- a/src/backends/gpu/fvm.cpp +++ b/src/backends/gpu/fvm.cpp @@ -7,6 +7,9 @@ #include <mechanisms/gpu/test_kin1_gpu.hpp> #include <mechanisms/gpu/test_kinlva_gpu.hpp> #include <mechanisms/gpu/test_ca_gpu.hpp> +#include <mechanisms/gpu/nax_gpu.hpp> +#include <mechanisms/gpu/kamt_gpu.hpp> +#include <mechanisms/gpu/kdrmt_gpu.hpp> namespace arb { namespace gpu { @@ -19,7 +22,10 @@ backend::mech_map_ = { { "exp2syn", maker<mechanism_exp2syn> }, { "test_kin1", maker<mechanism_test_kin1> }, { "test_kinlva", maker<mechanism_test_kinlva> }, - { "test_ca", maker<mechanism_test_ca> } + { "test_ca", maker<mechanism_test_ca> }, + { "nax", maker<mechanism_nax> }, + { "kamt", maker<mechanism_kamt> }, + { "kdrmt", maker<mechanism_kdrmt> }, }; } // namespace gpu diff --git a/src/backends/gpu/intrinsics.hpp b/src/backends/gpu/intrinsics.hpp index 1ee9e53d85cd0d034e3c86f3b652d8eb5420681f..6f6cf46be1288d4e03208296769a4a4a9d22dec9 100644 --- a/src/backends/gpu/intrinsics.hpp +++ b/src/backends/gpu/intrinsics.hpp @@ -39,3 +39,24 @@ inline float cuda_atomic_sub(float* address, float val) { return atomicAdd(address, -val); } +__device__ +inline double exprelr(double x) { + if (1.0+x == 1.0) { + return 1.0; + } + return x/expm1(x); +} + +// Return minimum of the two values +template <typename T> +__device__ +inline T min(T lhs, T rhs) { + return lhs<rhs? lhs: rhs; +} + +// Return maximum of the two values +template <typename T> +__device__ +inline T max(T lhs, T rhs) { + return lhs<rhs? rhs: lhs; +} diff --git a/src/backends/multicore/fvm.cpp b/src/backends/multicore/fvm.cpp index 2c34fb0039979bd29c51c553c92753fb703b3cdb..ea471917cca568f85ef807d0011e79090fc2137f 100644 --- a/src/backends/multicore/fvm.cpp +++ b/src/backends/multicore/fvm.cpp @@ -7,6 +7,9 @@ #include <mechanisms/multicore/test_kin1_cpu.hpp> #include <mechanisms/multicore/test_kinlva_cpu.hpp> #include <mechanisms/multicore/test_ca_cpu.hpp> +#include <mechanisms/multicore/nax_cpu.hpp> +#include <mechanisms/multicore/kamt_cpu.hpp> +#include <mechanisms/multicore/kdrmt_cpu.hpp> namespace arb { namespace multicore { @@ -19,7 +22,10 @@ backend::mech_map_ = { { std::string("exp2syn"), maker<mechanism_exp2syn> }, { std::string("test_kin1"), maker<mechanism_test_kin1> }, { std::string("test_kinlva"), maker<mechanism_test_kinlva> }, - { std::string("test_ca"), maker<mechanism_test_ca> } + { std::string("test_ca"), maker<mechanism_test_ca> }, + { std::string("nax"), maker<mechanism_nax> }, + { std::string("kamt"), maker<mechanism_kamt> }, + { std::string("kdrmt"), maker<mechanism_kdrmt> }, }; } // namespace multicore diff --git a/src/backends/multicore/intrin.hpp b/src/backends/multicore/intrin.hpp index 23d7b05cec1f5632cf634540dc76559a3198594a..66fc2f5dd26e0410714c68d6c719dc3a9a9e9b7b 100644 --- a/src/backends/multicore/intrin.hpp +++ b/src/backends/multicore/intrin.hpp @@ -7,6 +7,8 @@ #pragma once #include <iostream> +#include <limits> + #include <immintrin.h> namespace arb { @@ -53,354 +55,13 @@ constexpr uint64_t dmant_mask = ((1UL<<52) - 1) | (1UL << 63); // mantissa + sig constexpr uint64_t dexp_mask = ((1UL<<11) - 1) << 52; constexpr int exp_bias = 1023; constexpr double dsqrth = 0.70710678118654752440; - -// Useful constants in vector registers -const __m256d arb_m256d_zero = _mm256_set1_pd(0.0); -const __m256d arb_m256d_one = _mm256_set1_pd(1.0); -const __m256d arb_m256d_two = 
_mm256_set1_pd(2.0); -const __m256d arb_m256d_nan = _mm256_set1_pd(std::numeric_limits<double>::quiet_NaN()); -const __m256d arb_m256d_inf = _mm256_set1_pd(std::numeric_limits<double>::infinity()); -const __m256d arb_m256d_ninf = _mm256_set1_pd(-std::numeric_limits<double>::infinity()); -} - -static void arb_mm256_print_pd(__m256d x, const char *name) __attribute__ ((unused)); -static void arb_mm256_print_epi32(__m128i x, const char *name) __attribute__ ((unused)); -static void arb_mm256_print_epi64x(__m256i x, const char *name) __attribute__ ((unused)); -static __m256d arb_mm256_exp_pd(__m256d x) __attribute__ ((unused)); -static __m256d arb_mm256_subnormal_pd(__m256d x) __attribute__ ((unused)); -static __m256d arb_mm256_frexp_pd(__m256d x, __m128i *e) __attribute__ ((unused)); -static __m256d arb_mm256_log_pd(__m256d x) __attribute__ ((unused)); -static __m256d arb_mm256_pow_pd(__m256d x, __m256d y) __attribute__ ((unused)); - -void arb_mm256_print_pd(__m256d x, const char *name) { - double *val = (double *) &x; - std::cout << name << " = { "; - for (size_t i = 0; i < 4; ++i) { - std::cout << val[i] << " "; - } - - std::cout << "}\n"; -} - -void arb_mm256_print_epi32(__m128i x, const char *name) { - int *val = (int *) &x; - std::cout << name << " = { "; - for (size_t i = 0; i < 4; ++i) { - std::cout << val[i] << " "; - } - - std::cout << "}\n"; -} - -void arb_mm256_print_epi64x(__m256i x, const char *name) { - uint64_t *val = (uint64_t *) &x; - std::cout << name << " = { "; - for (size_t i = 0; i < 4; ++i) { - std::cout << val[i] << " "; - } - - std::cout << "}\n"; -} - -// -// Calculates exponential using AVX2 instructions -// -// Exponential is calculated as follows: -// e^x = e^g * 2^n, -// -// where g in [-0.5, 0.5) and n is an integer. Obviously 2^n can be calculated -// fast with bit shift, whereas e^g is approximated using the following Pade' -// form: -// -// e^g = 1 + 2*g*P(g^2) / (Q(g^2)-P(g^2)) -// -// The exponents n and g are calculated using the following formulas: -// -// n = floor(x/ln(2) + 0.5) -// g = x - n*ln(2) -// -// They can be derived as follows: -// -// e^x = 2^(x/ln(2)) -// = 2^-0.5 * 2^(x/ln(2) + 0.5) -// = 2^r'-0.5 * 2^floor(x/ln(2) + 0.5) (1) -// -// Setting n = floor(x/ln(2) + 0.5), -// -// r' = x/ln(2) - n, and r' in [0, 1) -// -// Substituting r' in (1) gives us -// -// e^x = 2^(x/ln(2) - n) * 2^n, where x/ln(2) - n is now in [-0.5, 0.5) -// = e^(x-n*ln(2)) * 2^n -// = e^g * 2^n, where g = x - n*ln(2) (2) -// -// NOTE: The calculation of ln(2) in (2) is split in two operations to -// compensate for rounding errors: -// -// ln(2) = C1 + C2, where -// -// C1 = floor(2^k*ln(2))/2^k -// C2 = ln(2) - C1 -// -// We use k=32, since this is what the Cephes library does historically. -// Theoretically, we could use k=52 to match the IEEE-754 double accuracy, but -// the standard library seems not to do that, so we are getting differences -// compared to std::exp() for large exponents. -// -__m256d arb_mm256_exp_pd(__m256d x) { - __m256d x_orig = x; - - __m256d px = _mm256_floor_pd( - _mm256_add_pd( - _mm256_mul_pd(_mm256_set1_pd(detail::ln2inv), x), - _mm256_set1_pd(0.5) - ) - ); - - __m128i n = _mm256_cvtpd_epi32(px); - - x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C1))); - x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C2))); - - __m256d xx = _mm256_mul_pd(x, x); - - // Compute the P and Q polynomials. 
- - // Polynomials are computed in factorized form in order to reduce the total - // numbers of operations: - // - // P(x) = P0 + P1*x + P2*x^2 = P0 + x*(P1 + x*P2) - // Q(x) = Q0 + Q1*x + Q2*x^2 + Q3*x^3 = Q0 + x*(Q1 + x*(Q2 + x*Q3)) - - // Compute x*P(x**2) - px = _mm256_set1_pd(detail::P2exp); - px = _mm256_mul_pd(px, xx); - px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1exp)); - px = _mm256_mul_pd(px, xx); - px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0exp)); - px = _mm256_mul_pd(px, x); - - - // Compute Q(x**2) - __m256d qx = _mm256_set1_pd(detail::Q3exp); - qx = _mm256_mul_pd(qx, xx); - qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2exp)); - qx = _mm256_mul_pd(qx, xx); - qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1exp)); - qx = _mm256_mul_pd(qx, xx); - qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0exp)); - - // Compute 1 + 2*P(x**2) / (Q(x**2)-P(x**2)) - x = _mm256_div_pd(px, _mm256_sub_pd(qx, px)); - x = _mm256_add_pd(detail::arb_m256d_one, - _mm256_mul_pd(detail::arb_m256d_two, x)); - - // Finally, compute x *= 2**n - __m256i n64 = _mm256_cvtepi32_epi64(n); - n64 = _mm256_add_epi64(n64, _mm256_set1_epi64x(1023)); - n64 = _mm256_sll_epi64(n64, _mm_set_epi64x(0, 52)); - x = _mm256_mul_pd(x, _mm256_castsi256_pd(n64)); - - // Treat exceptional cases - __m256d is_large = _mm256_cmp_pd( - x_orig, _mm256_set1_pd(detail::exp_limit), 30 /* _CMP_GT_OQ */ - ); - __m256d is_small = _mm256_cmp_pd( - x_orig, _mm256_set1_pd(-detail::exp_limit), 17 /* _CMP_LT_OQ */ - ); - __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ ); - - x = _mm256_blendv_pd(x, detail::arb_m256d_inf, is_large); - x = _mm256_blendv_pd(x, detail::arb_m256d_zero, is_small); - x = _mm256_blendv_pd(x, detail::arb_m256d_nan, is_nan); - return x; - -} - -__m256d arb_mm256_subnormal_pd(__m256d x) { - __m256i x_raw = _mm256_castpd_si256(x); - __m256i exp_mask = _mm256_set1_epi64x(detail::dexp_mask); - __m256d x_exp = _mm256_castsi256_pd(_mm256_and_si256(x_raw, exp_mask)); - - // Subnormals have a zero exponent - return _mm256_cmp_pd(x_exp, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */); -} - -__m256d arb_mm256_frexp_pd(__m256d x, __m128i *e) { - __m256i exp_mask = _mm256_set1_epi64x(detail::dexp_mask); - __m256i mant_mask = _mm256_set1_epi64x(detail::dmant_mask); - - __m256d x_orig = x; - - // we will work on the raw bits of x - __m256i x_raw = _mm256_castpd_si256(x); - __m256i x_exp = _mm256_and_si256(x_raw, exp_mask); - x_exp = _mm256_srli_epi64(x_exp, 52); - - // We need bias-1 since frexp returns base values in (-1, -0.5], [0.5, 1) - x_exp = _mm256_sub_epi64(x_exp, _mm256_set1_epi64x(detail::exp_bias-1)); - - // IEEE-754 floats are in 1.<mantissa> form, but frexp needs to return a - // float in (-1, -0.5], [0.5, 1). We convert x_ret in place by adding it - // an 2^-1 exponent, i.e., 1022 in IEEE-754 format - __m256i x_ret = _mm256_and_si256(x_raw, mant_mask); - - __m256i exp_bits = _mm256_slli_epi64(_mm256_set1_epi64x(detail::exp_bias-1), 52); - x_ret = _mm256_or_si256(x_ret, exp_bits); - x = _mm256_castsi256_pd(x_ret); - - // Treat special cases - __m256d is_zero = _mm256_cmp_pd( - x_orig, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */ - ); - __m256d is_inf = _mm256_cmp_pd( - x_orig, detail::arb_m256d_inf, 0 /* _CMP_EQ_OQ */ - ); - __m256d is_ninf = _mm256_cmp_pd( - x_orig, detail::arb_m256d_ninf, 0 /* _CMP_EQ_OQ */ - ); - __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ ); - - // Denormalized numbers have a zero exponent. 
Here we expect -1022 since we - // have already prepared it as a power of 2 - __m256i is_denorm = _mm256_cmpeq_epi64(x_exp, _mm256_set1_epi64x(-1022)); - - x = _mm256_blendv_pd(x, detail::arb_m256d_zero, is_zero); - x = _mm256_blendv_pd(x, detail::arb_m256d_inf, is_inf); - x = _mm256_blendv_pd(x, detail::arb_m256d_ninf, is_ninf); - x = _mm256_blendv_pd(x, detail::arb_m256d_nan, is_nan); - - // FIXME: We treat denormalized numbers as zero here - x = _mm256_blendv_pd(x, detail::arb_m256d_zero, - _mm256_castsi256_pd(is_denorm)); - x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0), is_denorm); - - x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0), - _mm256_castpd_si256(is_zero)); - - - // We need to "compress" x_exp into the first 128 bits before casting it - // safely to __m128i and return to *e - x_exp = _mm256_permutevar8x32_epi32( - x_exp, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0) - ); - *e = _mm256_castsi256_si128(x_exp); - return x; } -// -// Calculates natural logarithm using AVX2 instructions -// -// ln(x) = ln(x'*2^g), x' in [0,1), g in N -// = ln(x') + g*ln(2) -// -// The logarithm in [0,1) is computed using the following Pade' form: -// -// ln(1+x) = x - 0.5*x^2 + x^3*P(x)/Q(x) -// -__m256d arb_mm256_log_pd(__m256d x) { - __m256d x_orig = x; - __m128i x_exp; - - // x := x', x_exp := g - x = arb_mm256_frexp_pd(x, &x_exp); - - // convert x_exp to packed double - __m256d dx_exp = _mm256_cvtepi32_pd(x_exp); - - // blending - __m256d lt_sqrth = _mm256_cmp_pd( - x, _mm256_set1_pd(detail::dsqrth), 17 /* _CMP_LT_OQ */); - - // Adjust the argument and the exponent - // | 2*x - 1; e := e -1 , if x < sqrt(2)/2 - // x := | - // | x - 1, otherwise - - // Precompute both branches - // 2*x - 1 - __m256d x2m1 = _mm256_sub_pd(_mm256_add_pd(x, x), detail::arb_m256d_one); - - // x - 1 - __m256d xm1 = _mm256_sub_pd(x, detail::arb_m256d_one); - - // dx_exp - 1 - __m256d dx_exp_m1 = _mm256_sub_pd(dx_exp, detail::arb_m256d_one); - - x = _mm256_blendv_pd(xm1, x2m1, lt_sqrth); - dx_exp = _mm256_blendv_pd(dx_exp, dx_exp_m1, lt_sqrth); - - // compute P(x) - __m256d px = _mm256_set1_pd(detail::P5log); - px = _mm256_mul_pd(px, x); - px = _mm256_add_pd(px, _mm256_set1_pd(detail::P4log)); - px = _mm256_mul_pd(px, x); - px = _mm256_add_pd(px, _mm256_set1_pd(detail::P3log)); - px = _mm256_mul_pd(px, x); - px = _mm256_add_pd(px, _mm256_set1_pd(detail::P2log)); - px = _mm256_mul_pd(px, x); - px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1log)); - px = _mm256_mul_pd(px, x); - px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0log)); - - // xx := x^2 - // px := P(x)*x^3 - __m256d xx = _mm256_mul_pd(x, x); - px = _mm256_mul_pd(px, x); - px = _mm256_mul_pd(px, xx); +#include "intrin_avx2.hpp" - // compute Q(x) - __m256d qx = x; - qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q4log)); - qx = _mm256_mul_pd(qx, x); - qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q3log)); - qx = _mm256_mul_pd(qx, x); - qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2log)); - qx = _mm256_mul_pd(qx, x); - qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1log)); - qx = _mm256_mul_pd(qx, x); - qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0log)); - - // x^3*P(x)/Q(x) - __m256d ret = _mm256_div_pd(px, qx); - - // x^3*P(x)/Q(x) - g*ln(2) - ret = _mm256_sub_pd( - ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C3)) - ); - - // -.5*x^ + x^3*P(x)/Q(x) - g*ln(2) - ret = _mm256_sub_pd(ret, _mm256_mul_pd(_mm256_set1_pd(0.5), xx)); - - // x -.5*x^ + x^3*P(x)/Q(x) - g*ln(2) - ret = _mm256_add_pd(ret, x); - - // rounding error correction for 
ln(2) - ret = _mm256_add_pd(ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C4))); - - // Treat exceptional cases - __m256d is_inf = _mm256_cmp_pd( - x_orig, detail::arb_m256d_inf, 0 /* _CMP_EQ_OQ */); - __m256d is_zero = _mm256_cmp_pd( - x_orig, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */); - __m256d is_neg = _mm256_cmp_pd( - x_orig, detail::arb_m256d_zero, 17 /* _CMP_LT_OQ */); - __m256d is_denorm = arb_mm256_subnormal_pd(x_orig); - - ret = _mm256_blendv_pd(ret, detail::arb_m256d_inf, is_inf); - ret = _mm256_blendv_pd(ret, detail::arb_m256d_ninf, is_zero); - - // We treat denormalized cases as zeros - ret = _mm256_blendv_pd(ret, detail::arb_m256d_ninf, is_denorm); - ret = _mm256_blendv_pd(ret, detail::arb_m256d_nan, is_neg); - return ret; -} - -// Equivalent to exp(y*log(x)) -__m256d arb_mm256_pow_pd(__m256d x, __m256d y) { - return arb_mm256_exp_pd(_mm256_mul_pd(y, arb_mm256_log_pd(x))); -} +#if defined(SIMD_KNL) || defined(SIMD_AVX512) +#include "intrin_avx512.hpp" +#endif } // end namespace multicore } // end namespace arb diff --git a/src/backends/multicore/intrin_avx2.hpp b/src/backends/multicore/intrin_avx2.hpp new file mode 100644 index 0000000000000000000000000000000000000000..be8aa8b64aec6a12efa02010eb7547242301a3ea --- /dev/null +++ b/src/backends/multicore/intrin_avx2.hpp @@ -0,0 +1,396 @@ +#pragma once + +namespace detail { + // Useful constants in vector registers + const __m256d arb_m256d_zero = _mm256_set1_pd(0.0); + const __m256d arb_m256d_one = _mm256_set1_pd(1.0); + const __m256d arb_m256d_two = _mm256_set1_pd(2.0); + const __m256d arb_m256d_nan = _mm256_set1_pd(std::numeric_limits<double>::quiet_NaN()); + const __m256d arb_m256d_inf = _mm256_set1_pd(std::numeric_limits<double>::infinity()); + const __m256d arb_m256d_ninf = _mm256_set1_pd(-std::numeric_limits<double>::infinity()); +} + +inline void arb_mm256_print_pd(__m256d x, const char *name) __attribute__ ((unused)); +inline void arb_mm256_print_epi32(__m128i x, const char *name) __attribute__ ((unused)); +inline void arb_mm256_print_epi64x(__m256i x, const char *name) __attribute__ ((unused)); +inline __m256d arb_mm256_exp_pd(__m256d x) __attribute__ ((unused)); +inline __m256d arb_mm256_subnormal_pd(__m256d x) __attribute__ ((unused)); +inline __m256d arb_mm256_frexp_pd(__m256d x, __m128i *e) __attribute__ ((unused)); +inline __m256d arb_mm256_log_pd(__m256d x) __attribute__ ((unused)); +inline __m256d arb_mm256_abs_pd(__m256d x) __attribute__ ((unused)); +inline __m256d arb_mm256_pow_pd(__m256d x, __m256d y) __attribute__ ((unused)); +inline __m256d arb_mm256_abs_pd(__m256d x); +inline __m256d arb_mm256_min_pd(__m256d x, __m256d y); +inline __m256d arb_mm256_exprelr_pd(__m256d x); + +void arb_mm256_print_pd(__m256d x, const char *name) { + double *val = (double *) &x; + std::cout << name << " = { "; + for (size_t i = 0; i < 4; ++i) { + std::cout << val[i] << " "; + } + + std::cout << "}\n"; +} + +void arb_mm256_print_epi32(__m128i x, const char *name) { + int *val = (int *) &x; + std::cout << name << " = { "; + for (size_t i = 0; i < 4; ++i) { + std::cout << val[i] << " "; + } + + std::cout << "}\n"; +} + +void arb_mm256_print_epi64x(__m256i x, const char *name) { + uint64_t *val = (uint64_t *) &x; + std::cout << name << " = { "; + for (size_t i = 0; i < 4; ++i) { + std::cout << val[i] << " "; + } + + std::cout << "}\n"; +} + +// +// Calculates absolute value using AVX2 instructions +// +// Calculated as follows: +// abs(x) = max(x, 0-x) +// +// Other approaches that use a bitwise mask might be more efficient, 
but using +// max gives a simple one liner. +inline +__m256d arb_mm256_abs_pd(__m256d x) { + return _mm256_max_pd(x, _mm256_sub_pd(_mm256_set1_pd(0.), x)); +} + +// +// Calculates minimum of two values using AVX2 instructions +// +// Caluclated as follows: +// min(x,y) = x>y? y: x +inline +__m256d arb_mm256_min_pd(__m256d x, __m256d y) { + // substitute values in x with values from y where x>y + return _mm256_blendv_pd(x, y, _mm256_cmp_pd(x, y, 30)); // 30 -> _CMP_GT_OQ +} + +// +// Calculates exprelr value using AVX2 instructions +// +// Calculated as follows: +// exprelr(x) = x / (exp(x)-1) = x / expm1(x) +// +// TODO: currently calculates exp(x)-1 for the denominator, which will not be +// accurate for x≈0. A vectorized implementation of expm1(x) would fix this. +// An example of such an implementation is in Cephes. +inline +__m256d arb_mm256_exprelr_pd(__m256d x) { + const auto ones = _mm256_set1_pd(1); + return _mm256_blendv_pd( + _mm256_div_pd(x, _mm256_sub_pd(arb_mm256_exp_pd(x), ones)), // x / (exp(x)-1) + ones, // 1 + _mm256_cmp_pd(ones, _mm256_add_pd(x, ones), 0)); // 1+x == 1 +} + +// +// Calculates exponential using AVX2 instructions +// +// Exponential is calculated as follows: +// e^x = e^g * 2^n, +// +// where g in [-0.5, 0.5) and n is an integer. Obviously 2^n can be calculated +// fast with bit shift, whereas e^g is approximated using the following Pade' +// form: +// +// e^g = 1 + 2*g*P(g^2) / (Q(g^2)-P(g^2)) +// +// The exponents n and g are calculated using the following formulas: +// +// n = floor(x/ln(2) + 0.5) +// g = x - n*ln(2) +// +// They can be derived as follows: +// +// e^x = 2^(x/ln(2)) +// = 2^-0.5 * 2^(x/ln(2) + 0.5) +// = 2^r'-0.5 * 2^floor(x/ln(2) + 0.5) (1) +// +// Setting n = floor(x/ln(2) + 0.5), +// +// r' = x/ln(2) - n, and r' in [0, 1) +// +// Substituting r' in (1) gives us +// +// e^x = 2^(x/ln(2) - n) * 2^n, where x/ln(2) - n is now in [-0.5, 0.5) +// = e^(x-n*ln(2)) * 2^n +// = e^g * 2^n, where g = x - n*ln(2) (2) +// +// NOTE: The calculation of ln(2) in (2) is split in two operations to +// compensate for rounding errors: +// +// ln(2) = C1 + C2, where +// +// C1 = floor(2^k*ln(2))/2^k +// C2 = ln(2) - C1 +// +// We use k=32, since this is what the Cephes library does historically. +// Theoretically, we could use k=52 to match the IEEE-754 double accuracy, but +// the standard library seems not to do that, so we are getting differences +// compared to std::exp() for large exponents. +// +__m256d arb_mm256_exp_pd(__m256d x) { + __m256d x_orig = x; + + __m256d px = _mm256_floor_pd( + _mm256_add_pd( + _mm256_mul_pd(_mm256_set1_pd(detail::ln2inv), x), + _mm256_set1_pd(0.5) + ) + ); + + __m128i n = _mm256_cvtpd_epi32(px); + + x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C1))); + x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C2))); + + __m256d xx = _mm256_mul_pd(x, x); + + // Compute the P and Q polynomials. 
+ + // Polynomials are computed in factorized form in order to reduce the total + // numbers of operations: + // + // P(x) = P0 + P1*x + P2*x^2 = P0 + x*(P1 + x*P2) + // Q(x) = Q0 + Q1*x + Q2*x^2 + Q3*x^3 = Q0 + x*(Q1 + x*(Q2 + x*Q3)) + + // Compute x*P(x**2) + px = _mm256_set1_pd(detail::P2exp); + px = _mm256_mul_pd(px, xx); + px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1exp)); + px = _mm256_mul_pd(px, xx); + px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0exp)); + px = _mm256_mul_pd(px, x); + + + // Compute Q(x**2) + __m256d qx = _mm256_set1_pd(detail::Q3exp); + qx = _mm256_mul_pd(qx, xx); + qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2exp)); + qx = _mm256_mul_pd(qx, xx); + qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1exp)); + qx = _mm256_mul_pd(qx, xx); + qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0exp)); + + // Compute 1 + 2*P(x**2) / (Q(x**2)-P(x**2)) + x = _mm256_div_pd(px, _mm256_sub_pd(qx, px)); + x = _mm256_add_pd(detail::arb_m256d_one, + _mm256_mul_pd(detail::arb_m256d_two, x)); + + // Finally, compute x *= 2**n + __m256i n64 = _mm256_cvtepi32_epi64(n); + n64 = _mm256_add_epi64(n64, _mm256_set1_epi64x(1023)); + n64 = _mm256_sll_epi64(n64, _mm_set_epi64x(0, 52)); + x = _mm256_mul_pd(x, _mm256_castsi256_pd(n64)); + + // Treat exceptional cases + __m256d is_large = _mm256_cmp_pd( + x_orig, _mm256_set1_pd(detail::exp_limit), 30 /* _CMP_GT_OQ */ + ); + __m256d is_small = _mm256_cmp_pd( + x_orig, _mm256_set1_pd(-detail::exp_limit), 17 /* _CMP_LT_OQ */ + ); + __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ ); + + x = _mm256_blendv_pd(x, detail::arb_m256d_inf, is_large); + x = _mm256_blendv_pd(x, detail::arb_m256d_zero, is_small); + x = _mm256_blendv_pd(x, detail::arb_m256d_nan, is_nan); + return x; + +} + +__m256d arb_mm256_subnormal_pd(__m256d x) { + __m256i x_raw = _mm256_castpd_si256(x); + __m256i exp_mask = _mm256_set1_epi64x(detail::dexp_mask); + __m256d x_exp = _mm256_castsi256_pd(_mm256_and_si256(x_raw, exp_mask)); + + // Subnormals have a zero exponent + return _mm256_cmp_pd(x_exp, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */); +} + +__m256d arb_mm256_frexp_pd(__m256d x, __m128i *e) { + __m256i exp_mask = _mm256_set1_epi64x(detail::dexp_mask); + __m256i mant_mask = _mm256_set1_epi64x(detail::dmant_mask); + + __m256d x_orig = x; + + // we will work on the raw bits of x + __m256i x_raw = _mm256_castpd_si256(x); + __m256i x_exp = _mm256_and_si256(x_raw, exp_mask); + x_exp = _mm256_srli_epi64(x_exp, 52); + + // We need bias-1 since frexp returns base values in (-1, -0.5], [0.5, 1) + x_exp = _mm256_sub_epi64(x_exp, _mm256_set1_epi64x(detail::exp_bias-1)); + + // IEEE-754 floats are in 1.<mantissa> form, but frexp needs to return a + // float in (-1, -0.5], [0.5, 1). We convert x_ret in place by adding it + // an 2^-1 exponent, i.e., 1022 in IEEE-754 format + __m256i x_ret = _mm256_and_si256(x_raw, mant_mask); + + __m256i exp_bits = _mm256_slli_epi64(_mm256_set1_epi64x(detail::exp_bias-1), 52); + x_ret = _mm256_or_si256(x_ret, exp_bits); + x = _mm256_castsi256_pd(x_ret); + + // Treat special cases + __m256d is_zero = _mm256_cmp_pd( + x_orig, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */ + ); + __m256d is_inf = _mm256_cmp_pd( + x_orig, detail::arb_m256d_inf, 0 /* _CMP_EQ_OQ */ + ); + __m256d is_ninf = _mm256_cmp_pd( + x_orig, detail::arb_m256d_ninf, 0 /* _CMP_EQ_OQ */ + ); + __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ ); + + // Denormalized numbers have a zero exponent. 
Here we expect -1022 since we + // have already prepared it as a power of 2 + __m256i is_denorm = _mm256_cmpeq_epi64(x_exp, _mm256_set1_epi64x(-1022)); + + x = _mm256_blendv_pd(x, detail::arb_m256d_zero, is_zero); + x = _mm256_blendv_pd(x, detail::arb_m256d_inf, is_inf); + x = _mm256_blendv_pd(x, detail::arb_m256d_ninf, is_ninf); + x = _mm256_blendv_pd(x, detail::arb_m256d_nan, is_nan); + + // FIXME: We treat denormalized numbers as zero here + x = _mm256_blendv_pd(x, detail::arb_m256d_zero, + _mm256_castsi256_pd(is_denorm)); + x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0), is_denorm); + + x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0), + _mm256_castpd_si256(is_zero)); + + + // We need to "compress" x_exp into the first 128 bits before casting it + // safely to __m128i and return to *e + x_exp = _mm256_permutevar8x32_epi32( + x_exp, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0) + ); + *e = _mm256_castsi256_si128(x_exp); + return x; +} + +// +// Calculates natural logarithm using AVX2 instructions +// +// ln(x) = ln(x'*2^g), x' in [0,1), g in N +// = ln(x') + g*ln(2) +// +// The logarithm in [0,1) is computed using the following Pade' form: +// +// ln(1+x) = x - 0.5*x^2 + x^3*P(x)/Q(x) +// +__m256d arb_mm256_log_pd(__m256d x) { + __m256d x_orig = x; + __m128i x_exp; + + // x := x', x_exp := g + x = arb_mm256_frexp_pd(x, &x_exp); + + // convert x_exp to packed double + __m256d dx_exp = _mm256_cvtepi32_pd(x_exp); + + // blending + __m256d lt_sqrth = _mm256_cmp_pd( + x, _mm256_set1_pd(detail::dsqrth), 17 /* _CMP_LT_OQ */); + + // Adjust the argument and the exponent + // | 2*x - 1; e := e -1 , if x < sqrt(2)/2 + // x := | + // | x - 1, otherwise + + // Precompute both branches + // 2*x - 1 + __m256d x2m1 = _mm256_sub_pd(_mm256_add_pd(x, x), detail::arb_m256d_one); + + // x - 1 + __m256d xm1 = _mm256_sub_pd(x, detail::arb_m256d_one); + + // dx_exp - 1 + __m256d dx_exp_m1 = _mm256_sub_pd(dx_exp, detail::arb_m256d_one); + + x = _mm256_blendv_pd(xm1, x2m1, lt_sqrth); + dx_exp = _mm256_blendv_pd(dx_exp, dx_exp_m1, lt_sqrth); + + // compute P(x) + __m256d px = _mm256_set1_pd(detail::P5log); + px = _mm256_mul_pd(px, x); + px = _mm256_add_pd(px, _mm256_set1_pd(detail::P4log)); + px = _mm256_mul_pd(px, x); + px = _mm256_add_pd(px, _mm256_set1_pd(detail::P3log)); + px = _mm256_mul_pd(px, x); + px = _mm256_add_pd(px, _mm256_set1_pd(detail::P2log)); + px = _mm256_mul_pd(px, x); + px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1log)); + px = _mm256_mul_pd(px, x); + px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0log)); + + // xx := x^2 + // px := P(x)*x^3 + __m256d xx = _mm256_mul_pd(x, x); + px = _mm256_mul_pd(px, x); + px = _mm256_mul_pd(px, xx); + + // compute Q(x) + __m256d qx = x; + qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q4log)); + qx = _mm256_mul_pd(qx, x); + qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q3log)); + qx = _mm256_mul_pd(qx, x); + qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2log)); + qx = _mm256_mul_pd(qx, x); + qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1log)); + qx = _mm256_mul_pd(qx, x); + qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0log)); + + // x^3*P(x)/Q(x) + __m256d ret = _mm256_div_pd(px, qx); + + // x^3*P(x)/Q(x) - g*ln(2) + ret = _mm256_sub_pd( + ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C3)) + ); + + // -.5*x^ + x^3*P(x)/Q(x) - g*ln(2) + ret = _mm256_sub_pd(ret, _mm256_mul_pd(_mm256_set1_pd(0.5), xx)); + + // x -.5*x^ + x^3*P(x)/Q(x) - g*ln(2) + ret = _mm256_add_pd(ret, x); + + // rounding error correction for ln(2) + ret = 
_mm256_add_pd(ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C4))); + + // Treat exceptional cases + __m256d is_inf = _mm256_cmp_pd( + x_orig, detail::arb_m256d_inf, 0 /* _CMP_EQ_OQ */); + __m256d is_zero = _mm256_cmp_pd( + x_orig, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */); + __m256d is_neg = _mm256_cmp_pd( + x_orig, detail::arb_m256d_zero, 17 /* _CMP_LT_OQ */); + __m256d is_denorm = arb_mm256_subnormal_pd(x_orig); + + ret = _mm256_blendv_pd(ret, detail::arb_m256d_inf, is_inf); + ret = _mm256_blendv_pd(ret, detail::arb_m256d_ninf, is_zero); + + // We treat denormalized cases as zeros + ret = _mm256_blendv_pd(ret, detail::arb_m256d_ninf, is_denorm); + ret = _mm256_blendv_pd(ret, detail::arb_m256d_nan, is_neg); + return ret; +} + +// Equivalent to exp(y*log(x)) +__m256d arb_mm256_pow_pd(__m256d x, __m256d y) { + return arb_mm256_exp_pd(_mm256_mul_pd(y, arb_mm256_log_pd(x))); +} diff --git a/src/backends/multicore/intrin_avx512.hpp b/src/backends/multicore/intrin_avx512.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1ef6354dfd55f594f6f760b932c01b1f6445964a --- /dev/null +++ b/src/backends/multicore/intrin_avx512.hpp @@ -0,0 +1,80 @@ +#pragma once + +// vector types for avx512 + +// double precision avx512 register +using vecd_avx512 = __m512d; +// 8 way mask for avx512 register (for use with double precision) +using mask8_avx512 = __mmask8; + +inline vecd_avx512 set(double x) { + return _mm512_set1_pd(x); +} + +namespace detail { + // Useful constants in vector registers + const vecd_avx512 vecd_avx512_zero = set(0.0); + const vecd_avx512 vecd_avx512_one = set(1.0); + const vecd_avx512 vecd_avx512_two = set(2.0); + const vecd_avx512 vecd_avx512_nan = set(std::numeric_limits<double>::quiet_NaN()); + const vecd_avx512 vecd_avx512_inf = set(std::numeric_limits<double>::infinity()); + const vecd_avx512 vecd_avx512_ninf = set(-std::numeric_limits<double>::infinity()); +} + +// +// Operations on vector registers. +// +// shorter, less verbose wrappers around intrinsics +// + +inline vecd_avx512 blend(mask8_avx512 m, vecd_avx512 x, vecd_avx512 y) { + return _mm512_mask_blend_pd(m, x, y); +} + +inline vecd_avx512 add(vecd_avx512 x, vecd_avx512 y) { + return _mm512_add_pd(x, y); +} + +inline vecd_avx512 sub(vecd_avx512 x, vecd_avx512 y) { + return _mm512_sub_pd(x, y); +} + +inline vecd_avx512 mul(vecd_avx512 x, vecd_avx512 y) { + return _mm512_mul_pd(x, y); +} + +inline vecd_avx512 div(vecd_avx512 x, vecd_avx512 y) { + return _mm512_div_pd(x, y); +} + +inline vecd_avx512 max(vecd_avx512 x, vecd_avx512 y) { + return _mm512_max_pd(x, y); +} + +inline vecd_avx512 expm1(vecd_avx512 x) { + // Assume that we are using the Intel compiler, and use the vectorized expm1 + // defined in the Intel SVML library. 
+ return _mm512_expm1_pd(x); +} + +inline mask8_avx512 less(vecd_avx512 x, vecd_avx512 y) { + return _mm512_cmp_pd_mask(x, y, 0); +} + +inline mask8_avx512 greater(vecd_avx512 x, vecd_avx512 y) { + return _mm512_cmp_pd_mask(x, y, 30); +} + +inline vecd_avx512 abs(vecd_avx512 x) { + return max(x, sub(set(0.), x)); +} + +inline vecd_avx512 min(vecd_avx512 x, vecd_avx512 y) { + // substitute values in x with values from y where x>y + return blend(greater(x, y), x, y); +} + +inline vecd_avx512 exprelr(vecd_avx512 x) { + const auto ones = set(1); + return blend(less(ones, add(x, ones)), div(x, expm1(x)), ones); +} diff --git a/src/math.hpp b/src/math.hpp index 50e2ebbda77dd848d53b320bf261791707e8cf4a..120b7fef02526526a4766712537ba8a5742670c6 100644 --- a/src/math.hpp +++ b/src/math.hpp @@ -80,6 +80,26 @@ int signum(T x) { return (x>T(0)) - (x<T(0)); } +// Return minimum of the two values +template <typename T> +T min(const T& lhs, const T& rhs) { + return lhs<rhs? lhs: rhs; +} + +// Return maximum of the two values +template <typename T> +T max(const T& lhs, const T& rhs) { + return lhs<rhs? rhs: lhs; +} + +// Value of x/(exp(x)-1) with care taken to handle x=0 case +template <typename T> +inline +T exprelr(T x) { + // If abs(x) is less than epsilon return 1, else calculate the result directly. + return (T(1)==T(1)+x)? T(1): x/std::expm1(x); +} + // Quaternion implementation. // Represents w + x.i + y.j + z.k. diff --git a/tests/modcc/test_parser.cpp b/tests/modcc/test_parser.cpp index f91ce45339adfbee47c0a1604c44b59b5731141f..7b263e7975a7ddf359375bf041b7fff9789a1b9d 100644 --- a/tests/modcc/test_parser.cpp +++ b/tests/modcc/test_parser.cpp @@ -205,14 +205,14 @@ TEST(Parser, parse_if) { } EXPECT_TRUE(check_parse(s, &Parser::parse_if, - " if(a<b) { \n" + " if(abs(a-b)) { \n" " a = 2+b \n" " } else if(b>a){\n" " a = 2+b \n" " } " )); if (s) { - EXPECT_NE(s->condition()->is_binary(), nullptr); + EXPECT_NE(s->condition()->is_unary(), nullptr); EXPECT_NE(s->true_branch()->is_block(), nullptr); ASSERT_NE(s->false_branch(), nullptr); ASSERT_NE(s->false_branch()->is_if(), nullptr); @@ -296,8 +296,11 @@ TEST(Parser, parse_line_expression) { "x=(y + 2 * z ^ 3) ", "foo(x+3, y, bar(21.4))", "y=exp(x+3) + log(exp(x/y))", + "x=abs(y+z)", "a=x^y^z", - "a=x/y/z" + "a=x/y/z", + "a=min(x,y)", + "a=max(min(x,z),y)", }; for (auto& text: good_expr) { @@ -499,6 +502,8 @@ long double eval(Expression *e) { case tok::times : return lhs*rhs; case tok::divide: return lhs/rhs; case tok::pow : return std::pow(lhs,rhs); + case tok::min : return std::min(lhs,rhs); + case tok::max : return std::max(lhs,rhs); default:; } } @@ -525,6 +530,10 @@ TEST(Parser, parse_binop) { {"2*3", 2.*3.}, {"2/3", 2./3.}, {"2^3", pow(2., 3.)}, + {"min(2,3)", 2.}, + {"min(3,2)", 2.}, + {"max(2,3)", 3.}, + {"max(3,2)", 3.}, // more complicated {"2+3*2", 2.+(3*2)}, @@ -532,6 +541,10 @@ TEST(Parser, parse_binop) { {"2+3*(-2)", 2.+(3*-2)}, {"2+3*(-+2)", 2.+(3*-+2)}, {"2/3*4", (2./3.)*4.}, + {"min(2+3, 4/2)", 4./2}, + {"max(2+3, 4/2)", 2.+3.}, + {"max(2+3, min(12, 24))", 12.}, + {"max(min(12, 24), 2+3)", 12.}, {"2 * 7 - 3 * 11 + 4 * 13", 2.*7.-3.*11.+4.*13.}, // right associative @@ -551,7 +564,7 @@ TEST(Parser, parse_binop) { std::unique_ptr<Expression> e; EXPECT_TRUE(check_parse(e, &Parser::parse_expression, test_case.first)); - // A loose tolerance of 1d-10 is required here because the eval() + // A loose tolerance of 1e-10 is required here because the eval() // function uses long double for intermediate results (like constant // folding in 
modparser). For expressions with transcendental // operations this can see relatively large divergence between the diff --git a/tests/modcc/test_printers.cpp b/tests/modcc/test_printers.cpp index 855d9d69c5d6788c2b220bb41acbc1eee0e17d9b..1160cbbea047e2c56c16cd07e58266efde29c664 100644 --- a/tests/modcc/test_printers.cpp +++ b/tests/modcc/test_printers.cpp @@ -42,7 +42,10 @@ TEST(scalar_printer, statement) { {"z=(a*b)/c", "z=a*b/c"}, {"z=a-(b+c)", "z=a-(b+c)"}, {"z=(a>0)<(b>0)", "z=a>0<(b>0)"}, - {"z=a- -2", "z=a- -2"} + {"z=a- -2", "z=a- -2"}, + {"z=abs(x-z)", "z=fabs(x-z)"}, + {"z=min(x,y)", "z=min(x,y)"}, + {"z=min(max(a,b),y)","z=min(max(a,b),y)"}, }; // create a scope that contains the symbols used in the tests diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 11ad7927747e4489907c11347f73e1d9de5e929a..d913b07b58700a540f2db93d95f72300caa2917d 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -18,7 +18,7 @@ build_modules( # Unit test sources set(TEST_CUDA_SOURCES - test_atomics.cu + test_intrin.cu test_mc_cell_group.cu test_gpu_stack.cu test_matrix.cu diff --git a/tests/unit/test_atomics.cu b/tests/unit/test_atomics.cu deleted file mode 100644 index b1bac8d78ff7324c1d265aad9ff85de1921cf03c..0000000000000000000000000000000000000000 --- a/tests/unit/test_atomics.cu +++ /dev/null @@ -1,53 +0,0 @@ -#include "../gtest.h" - -#include <backends/gpu/intrinsics.hpp> -#include <backends/gpu/managed_ptr.hpp> - -namespace kernels { - template <typename T> - __global__ - void test_atomic_add(T* x) { - cuda_atomic_add(x, threadIdx.x+1); - } - - template <typename T> - __global__ - void test_atomic_sub(T* x) { - cuda_atomic_sub(x, threadIdx.x+1); - } -} - -// test atomic addition wrapper for single and double precision -TEST(gpu_intrinsics, cuda_atomic_add) { - int expected = (128*129)/2; - - auto f = arb::gpu::make_managed_ptr<float>(0.f); - kernels::test_atomic_add<<<1, 128>>>(f.get()); - cudaDeviceSynchronize(); - - EXPECT_EQ(float(expected), *f); - - auto d = arb::gpu::make_managed_ptr<double>(0.); - kernels::test_atomic_add<<<1, 128>>>(d.get()); - cudaDeviceSynchronize(); - - EXPECT_EQ(double(expected), *d); -} - -// test atomic subtraction wrapper for single and double precision -TEST(gpu_intrinsics, cuda_atomic_sub) { - int expected = -(128*129)/2; - - auto f = arb::gpu::make_managed_ptr<float>(0.f); - kernels::test_atomic_sub<<<1, 128>>>(f.get()); - cudaDeviceSynchronize(); - - EXPECT_EQ(float(expected), *f); - - auto d = arb::gpu::make_managed_ptr<double>(0.); - kernels::test_atomic_sub<<<1, 128>>>(d.get()); - cudaDeviceSynchronize(); - - EXPECT_EQ(double(expected), *d); -} - diff --git a/tests/unit/test_intrin.cpp b/tests/unit/test_intrin.cpp index 5cb3310c648c84fc75321292cf8c35f2c5d83289..e0b3d3855912fac0fe96eec128044354ddda5a1e 100644 --- a/tests/unit/test_intrin.cpp +++ b/tests/unit/test_intrin.cpp @@ -3,15 +3,21 @@ #include <backends/multicore/intrin.hpp> #include <immintrin.h> +#include <util/span.hpp> + #include "../gtest.h" using namespace arb::multicore; +using arb::util::make_span; +using arb::util::size; + constexpr double dqnan = std::numeric_limits<double>::quiet_NaN(); constexpr double dmax = std::numeric_limits<double>::max(); constexpr double dmin = std::numeric_limits<double>::min(); constexpr double dmin_denorm = std::numeric_limits<double>::denorm_min(); constexpr double dinf = std::numeric_limits<double>::infinity(); +constexpr double deps = std::numeric_limits<double>::epsilon(); constexpr double values[] = { -300, -3, -2, -1, 
@@ -46,6 +52,76 @@ TEST(intrin, exp256) { } } +TEST(intrin, abs256) { + constexpr size_t simd_len = 4; + + __m256d vvalues[] = { + _mm256_set_pd(-42., -0., 0., 42.), + _mm256_set_pd(-dmin, -dmax, -dmax, dqnan), + }; + + + for (auto i: {0u, 1u}) { + auto x = arb_mm256_abs_pd(vvalues[i]); + double* in = (double*) &(vvalues[i]); + double* out = (double*) &x; + for (size_t j = 0; j < simd_len; ++j) { + double v = in[j]; + if (std::isnan(v)) { + EXPECT_TRUE(std::isnan(out[j])); + } + else { + EXPECT_DOUBLE_EQ(std::fabs(v), out[j]); + } + } + } +} + +TEST(intrin, min256) { + constexpr size_t simd_len = 4; + + __m256d lhs = _mm256_set_pd(-2, 2, -dinf, dinf); + __m256d rhs = _mm256_set_pd(2, -2, 42, 1); + + auto mi = arb_mm256_min_pd(lhs, rhs); + + double* lp = (double*) &lhs; + double* rp = (double*) &rhs; + double* mip = (double*) &mi; + for (size_t j = 0; j < simd_len; ++j) { + EXPECT_DOUBLE_EQ(std::min(lp[j], rp[j]), mip[j]); + } +} + +TEST(intrin, exprelr256) { + constexpr size_t simd_len = 4; + + // TODO: the third set of test values is commented out because it currently + // fails, because our implementation uses x/(exp(x)-1), when we should use + // x/expm(1) to handle rounding errors when x is close to 0. + // This test can be added once we have an implementation of expm1. + __m256d vvalues[] = { + _mm256_set_pd(-1., -0., 0., 1.), + _mm256_set_pd(-dmax, -dmin, dmin, dmax), + //_mm256_set_pd(-deps, deps, 10*deps, 100*deps), + }; + + for (auto i: make_span(0, size(vvalues))) { + auto x = arb_mm256_exprelr_pd(vvalues[i]); + double* in = (double*) &(vvalues[i]); + double* out = (double*) &x; + for (size_t j = 0; j < simd_len; ++j) { + double v = in[j]; + if (std::fabs(v)<deps) { + EXPECT_DOUBLE_EQ(1.0, out[j]); + } + else { + EXPECT_DOUBLE_EQ(v/expm1(v), out[j]); + } + } + } +} + TEST(intrin, frexp256) { constexpr size_t simd_len = 4; diff --git a/tests/unit/test_intrin.cu b/tests/unit/test_intrin.cu new file mode 100644 index 0000000000000000000000000000000000000000..5787b0bdd2054982df0a2ae019d1b16c71aa04f6 --- /dev/null +++ b/tests/unit/test_intrin.cu @@ -0,0 +1,157 @@ +#include "../gtest.h" + +#include <limits> + +#include <backends/gpu/intrinsics.hpp> +#include <backends/gpu/managed_ptr.hpp> +#include <memory/memory.hpp> +#include <util/rangeutil.hpp> +#include <util/span.hpp> + +namespace kernels { + template <typename T> + __global__ + void test_atomic_add(T* x) { + cuda_atomic_add(x, threadIdx.x+1); + } + + template <typename T> + __global__ + void test_atomic_sub(T* x) { + cuda_atomic_sub(x, threadIdx.x+1); + } + + __global__ + void test_min(double* x, double* y, double* result) { + const auto i = threadIdx.x; + result[i] = min(x[i], y[i]); + } + + __global__ + void test_max(double* x, double* y, double* result) { + const auto i = threadIdx.x; + result[i] = max(x[i], y[i]); + } + + __global__ + void test_exprelr(double* x, double* result) { + const auto i = threadIdx.x; + result[i] = exprelr(x[i]); + } + +} + +// test atomic addition wrapper for single and double precision +TEST(gpu_intrinsics, cuda_atomic_add) { + int expected = (128*129)/2; + + auto f = arb::gpu::make_managed_ptr<float>(0.f); + kernels::test_atomic_add<<<1, 128>>>(f.get()); + cudaDeviceSynchronize(); + + EXPECT_EQ(float(expected), *f); + + auto d = arb::gpu::make_managed_ptr<double>(0.); + kernels::test_atomic_add<<<1, 128>>>(d.get()); + cudaDeviceSynchronize(); + + EXPECT_EQ(double(expected), *d); +} + +// test atomic subtraction wrapper for single and double precision +TEST(gpu_intrinsics, cuda_atomic_sub) { + int 
expected = -(128*129)/2; + + auto f = arb::gpu::make_managed_ptr<float>(0.f); + kernels::test_atomic_sub<<<1, 128>>>(f.get()); + cudaDeviceSynchronize(); + + EXPECT_EQ(float(expected), *f); + + auto d = arb::gpu::make_managed_ptr<double>(0.); + kernels::test_atomic_sub<<<1, 128>>>(d.get()); + cudaDeviceSynchronize(); + + EXPECT_EQ(double(expected), *d); +} + +TEST(gpu_intrinsics, minmax) { + const double inf = std::numeric_limits<double>::infinity(); + struct X { + double lhs; + double rhs; + double expected_min; + double expected_max; + }; + + std::vector<X> inputs = { + { 0, 1, 0, 1}, + { -1, 1, -1, 1}, + { 42, 42, 42, 42}, + {inf, -inf, -inf, inf}, + { 0, inf, 0, inf}, + { 0, -inf, -inf, 0}, + }; + + const auto n = arb::util::size(inputs); + + arb::memory::device_vector<double> lhs(n); + arb::memory::device_vector<double> rhs(n); + arb::memory::device_vector<double> result(n); + + using arb::util::make_span; + + for (auto i: make_span(0, n)) { + lhs[i] = inputs[i].lhs; + rhs[i] = inputs[i].rhs; + } + + // test min + kernels::test_min<<<1, n>>>(lhs.data(), rhs.data(), result.data()); + cudaDeviceSynchronize(); + for (auto i: make_span(0, n)) { + EXPECT_EQ(double(result[i]), inputs[i].expected_min); + } + + kernels::test_min<<<1, n>>>(rhs.data(), lhs.data(), result.data()); + cudaDeviceSynchronize(); + for (auto i: make_span(0, n)) { + EXPECT_EQ(double(result[i]), inputs[i].expected_min); + } + + // test max + kernels::test_max<<<1, n>>>(lhs.data(), rhs.data(), result.data()); + cudaDeviceSynchronize(); + for (auto i: make_span(0, n)) { + EXPECT_EQ(double(result[i]), inputs[i].expected_max); + } + + kernels::test_max<<<1, n>>>(rhs.data(), lhs.data(), result.data()); + cudaDeviceSynchronize(); + for (auto i: make_span(0, n)) { + EXPECT_EQ(double(result[i]), inputs[i].expected_max); + } +} + +TEST(gpu_intrinsics, exprelr) { + constexpr double dmin = std::numeric_limits<double>::min(); + constexpr double dmax = std::numeric_limits<double>::max(); + constexpr double deps = std::numeric_limits<double>::epsilon(); + double inputs[] = {-1., -0., 0., 1., -dmax, -dmin, dmin, dmax, -deps, deps, 10*deps, 100*deps, 1000*deps}; + + auto n = arb::util::size(inputs); + arb::memory::device_vector<double> x(arb::memory::host_view<double>(inputs, n)); + arb::memory::device_vector<double> result(n); + + kernels::test_exprelr<<<1,n>>>(x.data(), result.data()); + cudaDeviceSynchronize(); + + auto index = arb::util::make_span(0, n); + for (auto i: index) { + auto x = inputs[i]; + double expected = std::fabs(x)<deps? 1.0: x/std::expm1(x); + double error = std::fabs(expected-double(result[i])); + double relerr = expected==0.? 
error: error/std::fabs(expected); + EXPECT_TRUE(relerr<deps); + } +} diff --git a/tests/unit/test_math.cpp b/tests/unit/test_math.cpp index 7ea1aeac366e5a96d3edab1a1e143eb1f472ef32..37e38594b277cce75b4ac333ce46a29c498ac9aa 100644 --- a/tests/unit/test_math.cpp +++ b/tests/unit/test_math.cpp @@ -285,3 +285,44 @@ TEST(quaternion, rotate) { EXPECT_NEAR(q.x/2.+q.y*sqrt3o2, r.y, eps); EXPECT_NEAR(q.z, r.z, eps); } + +TEST(math, exprelr) { + constexpr double dmin = std::numeric_limits<double>::min(); + constexpr double dmax = std::numeric_limits<double>::max(); + constexpr double deps = std::numeric_limits<double>::epsilon(); + double inputs[] = {-1., -0., 0., 1., -dmax, -dmin, dmin, dmax, -deps, deps, 10*deps, 100*deps, 1000*deps}; + + for (auto x: inputs) { + if (std::fabs(x)<deps) EXPECT_EQ(1.0, exprelr(x)); + else EXPECT_EQ(x/std::expm1(x), exprelr(x)); + } +} + +TEST(math, minmax) { + constexpr double inf = std::numeric_limits<double>::infinity(); + + struct X { + double lhs; + double rhs; + double expected_min; + double expected_max; + }; + + std::vector<X> inputs = { + { 0, 1, 0, 1}, + { -1, 1, -1, 1}, + { 42, 42, 42, 42}, + {inf, -inf, -inf, inf}, + { 0, inf, 0, inf}, + { 0, -inf, -inf, 0}, + }; + + for (auto x: inputs) { + // Call min and max with arguments in both possible orders. + EXPECT_EQ(min(x.lhs, x.rhs), x.expected_min); + EXPECT_EQ(min(x.rhs, x.lhs), x.expected_min); + EXPECT_EQ(max(x.lhs, x.rhs), x.expected_max); + EXPECT_EQ(max(x.rhs, x.lhs), x.expected_max); + } +} +
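
For reference, a small standalone sketch of the rounding behaviour that motivates both the guarded exprelr template added to src/math.hpp and the commented-out test values in tests/unit/test_intrin.cpp. It is illustrative only and not part of the patch; the file name and the function names naive_exprelr and guarded_exprelr are invented for the sketch. For x near zero, exp(x) rounds to 1 or to 1 + x, so x/(exp(x) - 1) degenerates to inf or to exactly 1, while the expm1-based form stays within rounding error of the true value, which is approximately 1 - x/2 for small x.

// exprelr_demo.cpp -- illustrative sketch, not part of the patch.
// Build with e.g.: g++ -std=c++11 exprelr_demo.cpp
#include <cmath>
#include <cstdio>
#include <limits>

// Direct transcription of x/(exp(x) - 1): inaccurate near x = 0.
double naive_exprelr(double x)   { return x/(std::exp(x) - 1.0); }

// Same guard as the exprelr template added to src/math.hpp.
double guarded_exprelr(double x) { return 1.0 == 1.0 + x? 1.0: x/std::expm1(x); }

int main() {
    const double eps  = std::numeric_limits<double>::epsilon();
    const double dmin = std::numeric_limits<double>::min();
    for (double x: {dmin, eps, 100*eps}) {
        // naive: exp(x) rounds to 1 or 1+x, so the quotient is inf or exactly 1;
        // guarded: stays within rounding error of the true value, ~ 1 - x/2.
        std::printf("x=%-12.3e naive=%-22.17g guarded=%.17g\n",
                    x, naive_exprelr(x), guarded_exprelr(x));
    }
}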
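
The AVX512 exprelr wrapper in src/backends/multicore/intrin_avx512.hpp implements the same guard, though the route is easy to misread: the comparison helper is named less but is invoked with predicate 0, which Intel documents as _CMP_EQ_OQ (equality), and _mm512_mask_blend_pd(k, a, b) selects b in the lanes where k is set. The scalar, lane-by-lane model below shows what each lane computes; exprelr_lane is not a name from the patch, and the sketch assumes only standard <cmath>.

// Scalar model of one lane of the AVX512 exprelr wrapper (illustrative only).
#include <cmath>
#include <cstdio>

double exprelr_lane(double x) {
    bool k = 1.0 == 1.0 + x;       // less(ones, add(x, ones)) -- predicate 0 is _CMP_EQ_OQ, i.e. equality
    double q = x/std::expm1(x);    // div(x, expm1(x)); SIMD evaluates both blend inputs unconditionally
    return k? 1.0: q;              // blend(k, q, ones) takes ones in lanes where k is set
}

int main() {
    for (double x: {-1.0, 0.0, 1e-20, 1.0}) {
        std::printf("exprelr(%g) = %.17g\n", x, exprelr_lane(x));
    }
}

Each lane therefore reduces to the same (1 == 1 + x) test used by the scalar exprelr in src/math.hpp, returning 1 exactly when adding x to 1 makes no representable difference.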