diff --git a/mechanisms/CMakeLists.txt b/mechanisms/CMakeLists.txt index 8521261ebd872734f566d5ab6f77861974e33bc7..ace0936e5ac3aaa88ab9f73d0474275f269a7f5c 100644 --- a/mechanisms/CMakeLists.txt +++ b/mechanisms/CMakeLists.txt @@ -15,8 +15,7 @@ elseif(NMC_VECTORIZE_TARGET STREQUAL "AVX") set(modcc_opt "-O") set(modcc_target "cpu") elseif(NMC_VECTORIZE_TARGET STREQUAL "AVX2") - set(modcc_opt "-O") - set(modcc_target "cpu") + set(modcc_target "avx2") else() set(modcc_target "cpu") endif() diff --git a/modcc/backends/avx2.hpp b/modcc/backends/avx2.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7aceb44387bb1b80c8f29e9ad542f13b89456bad --- /dev/null +++ b/modcc/backends/avx2.hpp @@ -0,0 +1,174 @@ +// +// AVX2 backend +// + +#pragma once + +#include "backends/base.hpp" + + +namespace nest { +namespace mc { +namespace modcc { + +// Specialize for the different architectures +template<> +struct simd_intrinsics<targetKind::avx2> { + static bool has_scatter() { + return false; + } + + static bool has_gather() { + return true; + } + + static std::string emit_headers() { + return "#include <immintrin.h>"; + }; + + static std::string emit_simd_width() { + return "256"; + } + + static std::string emit_value_type() { + return "__m256d"; + } + + static std::string emit_index_type() { + return "__m128i"; + } + + template<typename T1, typename T2> + static void emit_binary_op(TextBuffer& tb, tok op, + const T1& arg1, const T2& arg2) { + switch (op) { + case tok::plus: + tb << "_mm256_add_pd("; + break; + case tok::minus: + tb << "_mm256_sub_pd("; + break; + case tok::times: + tb << "_mm256_mul_pd("; + break; + case tok::divide: + tb << "_mm256_div_pd("; + break; + default: + throw std::invalid_argument("Unknown binary operator"); + } + + emit_operands(tb, arg_emitter(arg1), arg_emitter(arg2)); + tb << ")"; + } + + template<typename T> + static void emit_unary_op(TextBuffer& tb, tok op, const T& arg) { + switch (op) { + case tok::minus: + tb << "_mm256_sub_pd(_mm256_set1_pd(0), "; + break; + case tok::exp: + tb << "_mm256_exp_pd("; + break; + case tok::log: + tb << "_mm256_log_pd("; + break; + default: + throw std::invalid_argument("Unknown unary operator"); + } + + emit_operands(tb, arg_emitter(arg)); + tb << ")"; + } + + template<typename B, typename E> + static void emit_pow(TextBuffer& tb, const B& base, const E& exp) { + tb << "_mm256_pow_pd("; + emit_operands(tb, arg_emitter(base), arg_emitter(exp)); + tb << ")"; + } + + template<typename A, typename V> + static void emit_store_unaligned(TextBuffer& tb, const A& addr, + const V& value) { + tb << "_mm256_storeu_pd("; + emit_operands(tb, arg_emitter(addr), arg_emitter(value)); + tb << ")"; + } + + template<typename A> + static void emit_load_unaligned(TextBuffer& tb, const A& addr) { + tb << "_mm256_loadu_pd("; + emit_operands(tb, arg_emitter(addr)); + tb << ")"; + } + + template<typename A> + static void emit_load_index(TextBuffer& tb, const A& addr) { + tb << "_mm_lddqu_si128("; + emit_operands(tb, arg_emitter(addr)); + tb << ")"; + } + + template<typename A, typename I, typename V, typename S> + static void emit_scatter(TextBuffer& tb, const A& addr, + const I& index, const V& value, const S& scale) { + // no support of scatter in AVX2, so revert to simple scalar updates + std::string scalar_index_ptr = varprefix + std::to_string(varcnt++); + std::string scalar_value_ptr = varprefix + std::to_string(varcnt++); + + tb.end_line("{"); + tb.increase_indentation(); + + // FIXME: should probably read "index_type*" + tb.add_gutter(); + tb << "int* " << scalar_index_ptr + << " = (int*) &" << index; + tb.end_line(";"); + + tb.add_gutter(); + tb << "value_type* " << scalar_value_ptr + << " = (value_type*) &" << value; + tb.end_line(";"); + + tb.add_line("for (int k_ = 0; k_ < simd_width; ++k_) {"); + tb.increase_indentation(); + tb.add_gutter(); + tb << addr << "[" << scalar_index_ptr << "[k_]] = " + << scalar_value_ptr << "[k_]"; + tb.end_line(";"); + + tb.decrease_indentation(); + tb.add_line("}"); + + tb.decrease_indentation(); + tb.add_gutter(); + tb << "}"; + } + + template<typename A, typename I, typename S> + static void emit_gather(TextBuffer& tb, const A& addr, + const I& index, const S& scale) { + tb << "_mm256_i32gather_pd("; + emit_operands(tb, arg_emitter(addr), arg_emitter(index), + arg_emitter(scale)); + tb << ")"; + } + + template<typename T> + static void emit_set_value(TextBuffer& tb, const T& arg) { + tb << "_mm256_set1_pd("; + emit_operands(tb, arg_emitter(arg)); + tb << ")"; + } + +private: + static int varcnt; + const static std::string varprefix; +}; + +int simd_intrinsics<targetKind::avx2>::varcnt = 0; +const std::string simd_intrinsics<targetKind::avx2>::varprefix = "_r"; + +}}} // closing namespaces diff --git a/modcc/backends/avx512.hpp b/modcc/backends/avx512.hpp index cce0bc88998bb6abd50f8cce927bf0eb1874f448..87b7ef5da6e47c837178c2921b8f626da486da13 100644 --- a/modcc/backends/avx512.hpp +++ b/modcc/backends/avx512.hpp @@ -15,7 +15,11 @@ namespace modcc { template<> struct simd_intrinsics<targetKind::avx512> { - static bool has_gather_scatter() { + static bool has_scatter() { + return true; + } + + static bool has_gather() { return true; } diff --git a/modcc/backends/base.hpp b/modcc/backends/base.hpp index 487c0722b775610cc64db29ee5f08f6b62b17888..8323cb484a41dc145f857e4a187fc3708fd7e443 100644 --- a/modcc/backends/base.hpp +++ b/modcc/backends/base.hpp @@ -79,7 +79,8 @@ struct simd_intrinsics { template<typename T> static void emit_set_value(TextBuffer& tb, const T& arg); - static bool has_gather_scatter(); + static bool has_gather(); + static bool has_scatter(); }; }}} // closing namespaces diff --git a/modcc/backends/simd.hpp b/modcc/backends/simd.hpp index 1f7d3485c9c2254d1b1a53591a72febe9f521dc2..63c034d701249d652d8026b7a293f8ffcab76315 100644 --- a/modcc/backends/simd.hpp +++ b/modcc/backends/simd.hpp @@ -1,3 +1,4 @@ #pragma once +#include "backends/avx2.hpp" #include "backends/avx512.hpp" diff --git a/modcc/modcc.cpp b/modcc/modcc.cpp index 2954c31dbb061e33ce1444e189804f0820191987..8a038d412054445d11dc677a7aabfc22eb4856f5 100644 --- a/modcc/modcc.cpp +++ b/modcc/modcc.cpp @@ -68,8 +68,12 @@ int main(int argc, char **argv) { else if(targstr == "avx512") { Options::instance().target = targetKind::avx512; } + else if(targstr == "avx2") { + Options::instance().target = targetKind::avx2; + } else { - std::cerr << red("error") << " target must be one in {cpu, gpu}\n"; + std::cerr << red("error") + << " target must be one in {cpu, gpu, avx2, avx512}\n"; return 1; } } @@ -153,6 +157,10 @@ int main(int argc, char **argv) { text = SimdPrinter<targetKind::avx512>( m, Options::instance().optimize).emit_source(); break; + case targetKind::avx2: + text = SimdPrinter<targetKind::avx2>( + m, Options::instance().optimize).emit_source(); + break; default : std::cerr << red("error") << ": unknown printer" << std::endl; exit(1); diff --git a/modcc/options.hpp b/modcc/options.hpp index 02c2f6fd4bf817d473291eff90483fc0ca53cf65..7de816921af7bb505b185bb59f941c7a7853788a 100644 --- a/modcc/options.hpp +++ b/modcc/options.hpp @@ -7,6 +7,7 @@ enum class targetKind { cpu, gpu, // Vectorisation targets + avx2, avx512 }; diff --git a/modcc/simd_printer.hpp b/modcc/simd_printer.hpp index 668c89bc4e82bff15b458c73b7a2e1f1634670cd..ed77cc6b6fba9f2c68e0ed7f20d489152b0368cc 100644 --- a/modcc/simd_printer.hpp +++ b/modcc/simd_printer.hpp @@ -259,6 +259,7 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e, text_.add_gutter(); text_ << simd_backend::emit_index_type() << " " << vindex_name << " = "; + // FIXME: cast should better go inside `emit_load_index()` simd_backend::emit_load_index( text_, cast_type + "&" + index_ptr_name + "[off_]"); text_.end_line(";"); @@ -289,7 +290,7 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e, auto var = symbol.second->is_local_variable(); if (is_output(var) && !is_point_process() && - simd_backend::has_gather_scatter()) { + simd_backend::has_scatter()) { // We can safely use scatter, but we need to fetch the variable // first text_.add_line();