From 64171e43573a608cbbdaf88e41ed9bcdc4a966e8 Mon Sep 17 00:00:00 2001 From: noraabiakar <nora.abiakar@gmail.com> Date: Mon, 4 Jun 2018 16:44:57 +0200 Subject: [PATCH] Simd partition by constraint (#494) Changes have been made to the simd implementation of mechansim functions: - The node_index array (array of indices that specifies for each mechanism the CVs where it is present), is now partitioned into 4 arrays according to the constraint on each simd_vector in node_index: 1. contiguous array: contains the indices of all simd_vectors in node_index where the elements in simd_vector are contiguous 2. constant array: contains the indices of all simd_vectors in node_index where the elements in simd_vector are identical 3. independent array: contains the indices of all simd_vectors in node_index where the elements in simd_vector are independent (no repetitions) but not contiguous 4. none array: contains the indices of all simd_vectors in node_index where the none of the above constraints apply When mechanism functions are executed, they loop over each of the 4 arrays separately. This allows for optimizations in every category. - The modcc compiler was modified to generate code for the previous changes, including the optimizations per constraint: 1. contiguous array: we use vector load/store and vector arithmetic. 2. constant array: we load only one element and broadcast it into a simd_vector; we use vector arithmetic; we reduce the result; we store one element. 3. indepndent array: we use vector scatter/gather and vector arithmetic. 4. none array: we cannot operate on the simd_vector in parallel, we loop over the elements to read, perform arithmetic and write back - Added a mechanism benchmark for pas, hh and expsyn - Moved/modified some functions in simd.hpp to ensure that the correct implementation of a function is being called. --- modcc/printer/cprinter.cpp | 197 ++++++++-- modcc/printer/cprinter.hpp | 11 + src/backends/multicore/mechanism.cpp | 7 + src/backends/multicore/mechanism.hpp | 2 + .../multicore/partition_by_constraint.hpp | 118 ++++++ src/fvm_lowered_cell_impl.hpp | 5 + src/simd/implbase.hpp | 39 -- src/simd/simd.hpp | 87 ++++- tests/ubench/CMakeLists.txt | 2 + tests/ubench/mech_vec.cpp | 363 ++++++++++++++++++ tests/unit/CMakeLists.txt | 1 + tests/unit/test_partition_by_constraint.cpp | 153 ++++++++ tests/unit/test_simd.cpp | 2 + 13 files changed, 903 insertions(+), 84 deletions(-) create mode 100644 src/backends/multicore/partition_by_constraint.hpp create mode 100644 tests/ubench/mech_vec.cpp create mode 100644 tests/unit/test_partition_by_constraint.cpp diff --git a/modcc/printer/cprinter.cpp b/modcc/printer/cprinter.cpp index a75b9a47..c856f4b6 100644 --- a/modcc/printer/cprinter.cpp +++ b/modcc/printer/cprinter.cpp @@ -21,6 +21,20 @@ void emit_simd_procedure_proto(std::ostream&, ProcedureExpression*, const std::s void emit_api_body(std::ostream&, APIMethod*); void emit_simd_api_body(std::ostream&, APIMethod*, moduleKind); +void emit_index_initialize(std::ostream& out, const std::unordered_set<std::string>& indices, + simd_expr_constraint constraint); + +void emit_body_for_loop(std::ostream& out, BlockExpression* body, const std::vector<LocalVariable*>& indexed_vars, + const std::unordered_set<std::string>& indices, const simd_expr_constraint& read_constraint, + const simd_expr_constraint& write_constraint); + +void emit_for_loop_per_constraint(std::ostream& out, BlockExpression* body, + const std::vector<LocalVariable*>& indexed_vars, + const std::unordered_set<std::string>& indices, + const simd_expr_constraint& read_constraint, + const simd_expr_constraint& write_constraint, + std::string underlying_constraint_name); + struct cprint { Expression* expr_; explicit cprint(Expression* expr): expr_(expr) {} @@ -33,10 +47,21 @@ struct cprint { struct simdprint { Expression* expr_; - explicit simdprint(Expression* expr): expr_(expr) {} + bool is_indirect_index_; + simd_expr_constraint constraint_; + + explicit simdprint(Expression* expr): expr_(expr), is_indirect_index_(false), + constraint_(simd_expr_constraint::other) {} + explicit simdprint(Expression* expr, bool is_indexed, simd_expr_constraint constraint): + expr_(expr), is_indirect_index_(is_indexed), constraint_(constraint) {} + + void set_indirect_index() { + is_indirect_index_ = true; + } friend std::ostream& operator<<(std::ostream& out, const simdprint& w) { SimdPrinter printer(out); + printer.set_var_indexed_to(w.is_indirect_index_); return w.expr_->accept(&printer), out; } }; @@ -107,6 +132,7 @@ std::string emit_cpp_source(const Module& module_, const std::string& ns, simd_s if (with_simd) { out << "namespace S = ::arb::simd;\n" + "using S::index_constraint;\n" "static constexpr unsigned simd_width_ = "; if (!simd.width) { @@ -416,11 +442,6 @@ static std::string index_i_name(const std::string& index_var) { return index_var+"i_"; } -static std::string index_constraint_name(const std::string& index_var) { - return index_var+"constraint_"; -} - - void SimdPrinter::visit(IdentifierExpression *e) { e->symbol()->accept(this); } @@ -431,7 +452,10 @@ void SimdPrinter::visit(LocalVariable* sym) { void SimdPrinter::visit(VariableExpression *sym) { if (sym->is_range()) { - out_ << "simd_value(" << sym->name() << "+i_)"; + if(is_indirect_index_) + out_ << "simd_value(" << sym->name() << "+index_)"; + else + out_ << "simd_value(" << sym->name() << "+i_)"; } else { out_ << sym->name(); @@ -448,7 +472,10 @@ void SimdPrinter::visit(AssignmentExpression* e) { if (lhs->is_variable() && lhs->is_variable()->is_range()) { out_ << "simd_value("; e->rhs()->accept(this); - out_ << ").copy_to(" << lhs->name() << "+i_)"; + if(is_indirect_index_) + out_ << ").copy_to(" << lhs->name() << "+index_)"; + else + out_ << ").copy_to(" << lhs->name() << "+i_)"; } else { out_ << lhs->name() << " = "; @@ -460,7 +487,7 @@ void SimdPrinter::visit(IndexedVariable *sym) { indexed_variable_info v = decode_indexed_variable(sym); out_ << "S::indirect(" << v.data_var << ", " << index_i_name(v.index_var) - << ", " << index_constraint_name(v.index_var) << ")"; + << ", constraint_category_)"; } void SimdPrinter::visit(CallExpression* e) { @@ -505,24 +532,104 @@ void emit_simd_procedure_proto(std::ostream& out, ProcedureExpression* e, const out << ")"; } -void emit_simd_state_read(std::ostream& out, LocalVariable* local) { +void emit_simd_state_read(std::ostream& out, LocalVariable* local, simd_expr_constraint constraint) { out << "simd_value " << local->name(); if (local->is_read()) { - out << "(" << simdprint(local->external_variable()) << ");\n"; + indexed_variable_info v = decode_indexed_variable(local->external_variable()); + if(constraint == simd_expr_constraint::contiguous) { + out << "(" << v.data_var + << " + " << v.index_var + << "[index_]);\n"; + } + else if(constraint == simd_expr_constraint::constant){ + out << "(" << v.data_var + << "[" << v.index_var + << "element0]);\n"; + } + else + out << "(" << simdprint(local->external_variable()) << ");\n"; } else { out << " = 0;\n"; } } -void emit_simd_state_update(std::ostream& out, Symbol* from, IndexedVariable* external) { +void emit_simd_state_update(std::ostream& out, Symbol* from, IndexedVariable* external, simd_expr_constraint constraint) { if (!external->is_write()) return; const char* op = external->op()==tok::plus? " += ": " -= "; - out << simdprint(external) << op << from->name() << ";\n"; + indexed_variable_info v = decode_indexed_variable(external); + + if(constraint == simd_expr_constraint::contiguous) { + out << "simd_value t_"<< external->name() <<"(" << v.data_var << " + " << v.index_var << "[index_]);\n"; + out << "t_" << external->name() << op << from->name() << ";\n"; + out << "t_" << external->name() << ".copy_to(" << v.data_var << " + " << v.index_var << "[index_]);\n"; + + } + else { + out << simdprint(external) << op << from->name() << ";\n"; + } +} + +void emit_index_initialize(std::ostream& out, const std::unordered_set<std::string>& indices, + simd_expr_constraint constraint) { + switch(constraint) { + case simd_expr_constraint::contiguous : + break; + case simd_expr_constraint::constant : { + for (auto& index: indices) { + out << "simd_index::scalar_type " << index << "element0 = " << index << "[index_];\n"; + out << index_i_name(index) << " = " << index << "element0;\n"; + } + } + break; + case simd_expr_constraint::other : { + for (auto& index: indices) { + out << index_i_name(index) << ".copy_from(" << index << ".data() + index_);\n"; + } + } + break; + } +} + +void emit_body_for_loop(std::ostream& out, BlockExpression* body, const std::vector<LocalVariable*>& indexed_vars, + const std::unordered_set<std::string>& indices, const simd_expr_constraint& read_constraint, + const simd_expr_constraint& write_constraint) { + emit_index_initialize(out, indices, read_constraint); + + for (auto& sym: indexed_vars) { + emit_simd_state_read(out, sym, read_constraint); + } + + simdprint printer(body); + printer.set_indirect_index(); + + out << printer; + + for (auto& sym: indexed_vars) { + emit_simd_state_update(out, sym, sym->external_variable(), write_constraint); + } } +void emit_for_loop_per_constraint(std::ostream& out, BlockExpression* body, + const std::vector<LocalVariable*>& indexed_vars, + const std::unordered_set<std::string>& indices, + const simd_expr_constraint& read_constraint, + const simd_expr_constraint& write_constraint, + std::string underlying_constraint_name) { + + out << "constraint_category_ = index_constraint::"<< underlying_constraint_name << ";\n"; + out << "for (unsigned i_ = 0; i_ < index_constraints_." << underlying_constraint_name + << ".size(); i_++) {\n" + << indent; + + out << "index_type index_ = index_constraints_." << underlying_constraint_name << "[i_];\n"; + + emit_body_for_loop(out, body, indexed_vars, indices, read_constraint, write_constraint); + + out << popindent << "}\n"; +} void emit_simd_api_body(std::ostream& out, APIMethod* method, moduleKind module_kind) { auto body = method->body(); @@ -533,43 +640,51 @@ void emit_simd_api_body(std::ostream& out, APIMethod* method, moduleKind module_ indices.insert(decode_indexed_variable(sym->external_variable()).index_var); } - // Note: expect to make index constraints non-constant for point mechanisms as - // an optimization in the near future. + if (!body->statements().empty()) { + if (!indices.empty()) { + for (auto& index: indices) { + out << "simd_index " << index_i_name(index) << ";\n"; + } - // Another note: (TODO) can't actually use index_constraint::independent - // for density mechanisms because of collisions in the padded part of - // the indices. Work-arounds exist, but not yet implemented. + out << "index_constraint constraint_category_;\n\n"; + //Generate for loop for all contiguous simd_vectors + simd_expr_constraint constraint = simd_expr_constraint::contiguous; + std::string underlying_constraint = "contiguous"; - std::string common_constraint = module_kind==moduleKind::density? - //"S::index_constraint::independent": - "S::index_constraint::none": - "S::index_constraint::none"; + emit_for_loop_per_constraint(out, body, indexed_vars, indices, constraint, + constraint, underlying_constraint); - for (auto& index: indices) { - out << "simd_index " << index_i_name(index) << ";\n"; - out << "constexpr S::index_constraint " << index_constraint_name(index) - << " = " << common_constraint << ";\n"; - } + //Generate for loop for all independent simd_vectors + constraint = simd_expr_constraint::other; + underlying_constraint = "independent"; - if (!body->statements().empty()) { - out << - "int n_ = width_;\n" - "for (int i_ = 0; i_ < n_; i_ += simd_width_) {\n" << indent; + emit_for_loop_per_constraint(out, body, indexed_vars, indices, constraint, + constraint, underlying_constraint); - for (auto& index: indices) { - out << index_i_name(index) << ".copy_from(" << index << ".data()+i_);\n"; - } + //Generate for loop for all simd_vectors that have no optimizing constraints + constraint = simd_expr_constraint::other; + underlying_constraint = "none"; - for (auto& sym: indexed_vars) { - emit_simd_state_read(out, sym); - } + emit_for_loop_per_constraint(out, body, indexed_vars, indices, constraint, + constraint, underlying_constraint); - out << simdprint(body); + //Generate for loop for all constant simd_vectors + simd_expr_constraint read_constraint = simd_expr_constraint::constant; + simd_expr_constraint write_constraint = simd_expr_constraint::other; + underlying_constraint = "constant"; + + emit_for_loop_per_constraint(out, body, indexed_vars, indices, read_constraint, + write_constraint, underlying_constraint); - for (auto& sym: indexed_vars) { - emit_simd_state_update(out, sym, sym->external_variable()); } - out << popindent << "}\n"; + else { + out << "unsigned n_ = width_;\n\n"; + out << + "for (unsigned i_ = 0; i_ < n_; i_ += simd_width_) {\n" << indent; + out << simdprint(body); + out << popindent << "}\n"; + } } } + diff --git a/modcc/printer/cprinter.hpp b/modcc/printer/cprinter.hpp index 77b2e37c..e7e96ab8 100644 --- a/modcc/printer/cprinter.hpp +++ b/modcc/printer/cprinter.hpp @@ -38,6 +38,13 @@ protected: std::ostream& out_; }; + +enum class simd_expr_constraint{ + constant, + contiguous, + other +}; + class SimdPrinter: public Visitor { public: SimdPrinter(std::ostream& out): out_(out) {} @@ -45,6 +52,9 @@ public: void visit(Expression* e) override { throw compiler_exception("SimdPrinter cannot translate expression "+e->to_string()); } + void set_var_indexed_to(bool is_indirect_index) { + is_indirect_index_ = is_indirect_index; + } void visit(BlockExpression*) override; void visit(CallExpression*) override; @@ -60,4 +70,5 @@ public: private: std::ostream& out_; + bool is_indirect_index_; }; diff --git a/src/backends/multicore/mechanism.cpp b/src/backends/multicore/mechanism.cpp index 73f95529..b0fc076f 100644 --- a/src/backends/multicore/mechanism.cpp +++ b/src/backends/multicore/mechanism.cpp @@ -19,6 +19,7 @@ #include <backends/multicore/mechanism.hpp> #include <backends/multicore/multicore_common.hpp> #include <backends/multicore/fvm.hpp> +#include <backends/multicore/partition_by_constraint.hpp> namespace arb { namespace multicore { @@ -26,6 +27,9 @@ namespace multicore { using util::make_range; using util::value_by_key; +constexpr unsigned simd_width = S::simd_abi::native_width<fvm_value_type>::value; + + // Copy elements from source sequence into destination sequence, // and fill the remaining elements of the destination sequence // with the given fill value. @@ -127,6 +131,7 @@ void mechanism::instantiate(fvm_size_type id, backend::shared_state& shared, con node_index_ = iarray(width_padded_, pad); copy_extend(pos_data.cv, node_index_, pos_data.cv.back()); copy_extend(pos_data.weight, make_range(data_.data(), data_.data()+width_padded_), 0); + index_constraints_ = make_constraint_partition(node_index_, width_, simd_width); for (auto i: ion_index_table()) { util::optional<ion_state&> oion = value_by_key(shared.ion_data, i.first); @@ -140,6 +145,8 @@ void mechanism::instantiate(fvm_size_type id, backend::shared_state& shared, con auto& ion_index = *i.second; ion_index = iarray(width_padded_, pad); copy_extend(indices, ion_index, util::back(indices)); + + EXPECTS(compatible_index_constraints(node_index_, ion_index, simd_width)); } } diff --git a/src/backends/multicore/mechanism.hpp b/src/backends/multicore/mechanism.hpp index 066e287f..c03bbb75 100644 --- a/src/backends/multicore/mechanism.hpp +++ b/src/backends/multicore/mechanism.hpp @@ -12,6 +12,7 @@ #include <mechanism.hpp> #include <backends/multicore/multicore_common.hpp> +#include <backends/multicore/partition_by_constraint.hpp> #include <backends/multicore/fvm.hpp> namespace arb { @@ -81,6 +82,7 @@ protected: // Per-mechanism index and weight data, excepting ion indices. iarray node_index_; + constraint_partition index_constraints_; const value_type* weight_; // Points within data_ after instantiation. // Bulk storage for state and parameter variables. diff --git a/src/backends/multicore/partition_by_constraint.hpp b/src/backends/multicore/partition_by_constraint.hpp new file mode 100644 index 00000000..b4cf0c06 --- /dev/null +++ b/src/backends/multicore/partition_by_constraint.hpp @@ -0,0 +1,118 @@ +#pragma once + +#include<vector> +#include<simd/simd.hpp> + +namespace arb { +namespace multicore { + +namespace S = ::arb::simd; +using S::index_constraint; + +struct constraint_partition { + using iarray = arb::multicore::iarray; + + iarray contiguous; + iarray constant; + iarray independent; + iarray none; +}; + +template <typename It> +bool is_contiguous_n(It first, unsigned width) { + while (--width) { + It next = first; + if ((*first) +1 != *++next) { + return false; + } + first = next; + } + return true; +} + +template <typename It> +bool is_constant_n(It first, unsigned width) { + while (--width) { + It next = first; + if (*first != *++next) { + return false; + } + first = next; + } + return true; +} + +template <typename It> +bool is_independent_n(It first, unsigned width) { + while (--width) { + It next = first; + if (*first == *++next) { + return false; + } + first = next; + } + return true; +} + +template <typename It> +index_constraint idx_constraint(It it, unsigned simd_width) { + if (is_contiguous_n(it, simd_width)) { + return index_constraint::contiguous; + } + else if (is_constant_n(it, simd_width)) { + return index_constraint::constant; + } + else if (is_independent_n(it, simd_width)) { + return index_constraint::independent; + } + else { + return index_constraint::none; + } +} + +template <typename T> +constraint_partition make_constraint_partition(const T& node_index, unsigned width, unsigned simd_width) { + + constraint_partition part; + for (unsigned i = 0; i < width; i+= simd_width) { + auto ptr = &node_index[i]; + if (is_contiguous_n(ptr, simd_width)) { + part.contiguous.push_back(i); + } + else if (is_constant_n(ptr, simd_width)) { + part.constant.push_back(i); + } + else if (is_independent_n(ptr, simd_width)) { + part.independent.push_back(i); + } + else { + part.none.push_back(i); + } + } + return part; +} + +bool constexpr is_constraint_stronger(index_constraint a, index_constraint b) { + return a==b || + a==index_constraint::none || + (a==index_constraint::independent && b==index_constraint::contiguous); +} + +template <typename T> +bool compatible_index_constraints(T& node_index, T& ion_index, unsigned simd_width){ + for (unsigned i = 0; i < node_index.size(); i+= simd_width) { + auto nc = idx_constraint(&node_index[i], simd_width); + auto ic = idx_constraint(&ion_index[i], simd_width); + + if (!is_constraint_stronger(nc, ic)) { + return false; + } + } + return true; +} + +} // namespace util +} // namespace arb + + + diff --git a/src/fvm_lowered_cell_impl.hpp b/src/fvm_lowered_cell_impl.hpp index 22b8550c..765dfb2a 100644 --- a/src/fvm_lowered_cell_impl.hpp +++ b/src/fvm_lowered_cell_impl.hpp @@ -56,6 +56,11 @@ public: value_type time() const override { return tmin_; } + //Exposed for testing purposes + std::vector<mechanism_ptr>& mechanisms() { + return mechanisms_; + } + private: // Host or GPU-side back-end dependent storage. using array = typename backend::array; diff --git a/src/simd/implbase.hpp b/src/simd/implbase.hpp index 9253add1..3722ae58 100644 --- a/src/simd/implbase.hpp +++ b/src/simd/implbase.hpp @@ -58,7 +58,6 @@ namespace simd { enum class index_constraint { none = 0, // For indices k[0], k[1],...: - independent, // k[i]==k[j] => i=j. contiguous, // k[i]==k[0]+i constant // k[i]==k[j] ∀ i, j @@ -435,44 +434,6 @@ struct implbase { } } - template <typename ImplIndex> - static void compound_indexed_add(tag<ImplIndex> tag, const vector_type& s, scalar_type* p, const typename ImplIndex::vector_type& index, index_constraint constraint) { - switch (constraint) { - case index_constraint::none: - { - typename ImplIndex::scalar_type o[width]; - ImplIndex::copy_to(index, o); - - store a; - I::copy_to(s, a); - - for (unsigned i = 0; i<width; ++i) { - p[o[i]] += a[i]; - } - } - break; - case index_constraint::independent: - { - vector_type v = I::add(I::gather(tag, p, index), s); - I::scatter(tag, v, p, index); - } - break; - case index_constraint::contiguous: - { - p += ImplIndex::element0(index); - vector_type v = I::add(I::copy_from(p), s); - I::copy_to(v, p); - } - break; - case index_constraint::constant: - { - p += ImplIndex::element0(index); - *p += I::reduce_add(s); - } - break; - } - } - static scalar_type reduce_add(const vector_type& s) { store a; I::copy_to(s, a); diff --git a/src/simd/simd.hpp b/src/simd/simd.hpp index 17922040..02d1ad62 100644 --- a/src/simd/simd.hpp +++ b/src/simd/simd.hpp @@ -7,6 +7,7 @@ #include <simd/implbase.hpp> #include <simd/generic.hpp> #include <simd/native.hpp> +#include <common_types.hpp> namespace arb { namespace simd { @@ -43,13 +44,13 @@ namespace simd_detail { template <typename Other> indirect_expression& operator+=(const simd_impl<Other>& s) { - Other::compound_indexed_add(tag<Impl>{}, s.value_, p, index, constraint); + simd_impl<Other>::compound_indexed_add(tag<Impl>{}, s.value_, p, index, constraint); return *this; } template <typename Other> indirect_expression& operator-=(const simd_impl<Other>& s) { - Other::compound_indexed_add(tag<Impl>{}, (-s).value_, p, index, constraint); + simd_impl<Other>::compound_indexed_add(tag<Impl>{}, (-s).value_, p, index, constraint); return *this; } }; @@ -158,14 +159,92 @@ namespace simd_detail { template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type> void copy_from(indirect_expression<IndexImpl, scalar_type> pi) { - value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index); + switch (pi.constraint) { + case index_constraint::none: + value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index); + break; + case index_constraint::independent: + value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index); + break; + case index_constraint::contiguous: + { + scalar_type* p = IndexImpl::element0(pi.index) + pi.p; + value_ = Impl::copy_from(p); + } + break; + case index_constraint::constant: + { + scalar_type* p = IndexImpl::element0(pi.index) + pi.p; + scalar_type l = (*p); + value_ = Impl::broadcast(l); + } + break; + } } template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type> void copy_from(indirect_expression<IndexImpl, const scalar_type> pi) { - value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index); + switch (pi.constraint) { + case index_constraint::none: + value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index); + break; + case index_constraint::independent: + value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index); + break; + case index_constraint::contiguous: + { + const scalar_type* p = IndexImpl::element0(pi.index) + pi.p; + value_ = Impl::copy_from(p); + } + break; + case index_constraint::constant: + { + const scalar_type *p = IndexImpl::element0(pi.index) + pi.p; + scalar_type l = (*p); + value_ = Impl::broadcast(l); + } + break; + } + } + template <typename ImplIndex> + static void compound_indexed_add(tag<ImplIndex> tag, const vector_type& s, scalar_type* p, const typename ImplIndex::vector_type& index, index_constraint constraint) { + switch (constraint) { + case index_constraint::none: + { + typename ImplIndex::scalar_type o[width]; + ImplIndex::copy_to(index, o); + + scalar_type a[width]; + Impl::copy_to(s, a); + + for (unsigned i = 0; i<width; ++i) { + p[o[i]] += a[i]; + } + } + break; + case index_constraint::independent: + { + vector_type v = Impl::add(Impl::gather(tag, p, index), s); + Impl::scatter(tag, v, p, index); + } + break; + case index_constraint::contiguous: + { + p += ImplIndex::element0(index); + vector_type v = Impl::add(Impl::copy_from(p), s); + Impl::copy_to(v, p); + } + break; + case index_constraint::constant: + p += ImplIndex::element0(index); + *p += Impl::reduce_add(s); + break; + } + } + + // Arithmetic operations: +, -, *, /, fma. simd_impl operator-() const { diff --git a/tests/ubench/CMakeLists.txt b/tests/ubench/CMakeLists.txt index 297fb491..c93d4132 100644 --- a/tests/ubench/CMakeLists.txt +++ b/tests/ubench/CMakeLists.txt @@ -7,6 +7,7 @@ set(bench_sources default_construct.cpp event_setup.cpp event_binning.cpp + mech_vec.cpp ) set(bench_sources_cuda @@ -43,6 +44,7 @@ foreach(bench_src ${bench_sources}) add_dependencies("${bench_exe}" gbench) target_include_directories("${bench_exe}" PRIVATE "${gbench_install_dir}/include") target_link_libraries("${bench_exe}" LINK_PUBLIC "${gbench_install_dir}/lib/libbenchmark.a") + target_link_libraries("${bench_exe}" LINK_PUBLIC ${ARB_LIBRARIES}) list(APPEND bench_exe_list ${bench_exe}) endforeach() diff --git a/tests/ubench/mech_vec.cpp b/tests/ubench/mech_vec.cpp new file mode 100644 index 00000000..58c1643b --- /dev/null +++ b/tests/ubench/mech_vec.cpp @@ -0,0 +1,363 @@ +// Test performance of vectorization for mechanism implementations. +// +// Start with pas (passive dendrite) mechanism + +#include <backends/multicore/fvm.hpp> +#include <benchmark/benchmark.h> +#include <fvm_lowered_cell_impl.hpp> +#include <fstream> + +using namespace arb; + +using backend = arb::multicore::backend; +using fvm_cell = arb::fvm_lowered_cell_impl<backend>; + +mechanism_ptr& find_mechanism(const std::string& name, fvm_cell& cell) { + auto &mechs = cell.mechanisms(); + auto it = std::find_if(mechs.begin(), + mechs.end(), + [&](mechanism_ptr& m){return m->internal_name()==name;}); + if (it==mechs.end()) { + std::cerr << "couldn't find mechanism with name " << name << "\n"; + exit(1); + } + return *it; +} + +class recipe_expsyn_1_branch: public recipe { + unsigned num_comp_; + unsigned num_synapse_; +public: + recipe_expsyn_1_branch(unsigned num_comp, unsigned num_synapse): + num_comp_(num_comp), num_synapse_(num_synapse) {} + + cell_size_type num_cells() const override { + return 1; + } + + virtual util::unique_any get_cell_description(cell_gid_type gid) const override { + cell c; + + auto soma = c.add_soma(12.6157/2.0); + soma->add_mechanism("pas"); + + c.add_cable(0, section_kind::dendrite, 1.0/2, 1.0/2, 200.0); + + for (auto& seg: c.segments()) { + if (seg->is_dendrite()) { + seg->set_compartments(num_comp_-1); + } + } + + auto distribution = std::uniform_real_distribution<float>(0.f, 1.0f); + + for(unsigned i = 0; i < num_synapse_; i++) { + auto gen = std::mt19937(i); + c.add_synapse({1, distribution(gen)}, "expsyn"); + } + + return std::move(c); + } + + virtual cell_kind get_cell_kind(cell_gid_type) const override { + return cell_kind::cable1d_neuron; + } + +}; + +class recipe_pas_1_branch: public recipe { + unsigned num_comp_; +public: + recipe_pas_1_branch(unsigned num_comp): num_comp_(num_comp) {} + + cell_size_type num_cells() const override { + return 1; + } + + virtual util::unique_any get_cell_description(cell_gid_type gid) const override { + cell c; + + auto soma = c.add_soma(12.6157/2.0); + soma->add_mechanism("pas"); + + c.add_cable(0, section_kind::dendrite, 1.0/2, 1.0/2, 200.0); + + for (auto& seg: c.segments()) { + if (seg->is_dendrite()) { + seg->add_mechanism("pas"); + seg->set_compartments(num_comp_-1); + } + } + return std::move(c); + } + + virtual cell_kind get_cell_kind(cell_gid_type) const override { + return cell_kind::cable1d_neuron; + } + +}; + +class recipe_pas_3_branches: public recipe { + unsigned num_comp_; +public: + recipe_pas_3_branches(unsigned num_comp): num_comp_(num_comp) {} + + cell_size_type num_cells() const override { + return 1; + } + + virtual util::unique_any get_cell_description(cell_gid_type gid) const override { + cell c; + + auto soma = c.add_soma(12.6157/2.0); + soma->add_mechanism("pas"); + + c.add_cable(0, section_kind::dendrite, 1.0/2, 1.0/2, 200.0); + c.add_cable(1, section_kind::dendrite, 1.0/2, 1.0/2, 200.0); + c.add_cable(1, section_kind::dendrite, 1.0/2, 1.0/2, 200.0); + + for (auto& seg: c.segments()) { + if (seg->is_dendrite()) { + seg->add_mechanism("pas"); + seg->set_compartments(num_comp_-1); + } + } + return std::move(c); + } + + virtual cell_kind get_cell_kind(cell_gid_type) const override { + return cell_kind::cable1d_neuron; + } + +}; + +class recipe_hh_1_branch: public recipe { + unsigned num_comp_; +public: + recipe_hh_1_branch(unsigned num_comp): num_comp_(num_comp) {} + + cell_size_type num_cells() const override { + return 1; + } + + virtual util::unique_any get_cell_description(cell_gid_type gid) const override { + cell c; + + auto soma = c.add_soma(12.6157/2.0); + soma->add_mechanism("hh"); + + c.add_cable(0, section_kind::dendrite, 1.0/2, 1.0/2, 200.0); + + for (auto& seg: c.segments()) { + if (seg->is_dendrite()) { + seg->add_mechanism("hh"); + seg->set_compartments(num_comp_-1); + } + } + return std::move(c); + } + + virtual cell_kind get_cell_kind(cell_gid_type) const override { + return cell_kind::cable1d_neuron; + } + +}; + +class recipe_hh_3_branches: public recipe { + unsigned num_comp_; +public: + recipe_hh_3_branches(unsigned num_comp): num_comp_(num_comp) {} + + cell_size_type num_cells() const override { + return 1; + } + + virtual util::unique_any get_cell_description(cell_gid_type gid) const override { + cell c; + + auto soma = c.add_soma(12.6157/2.0); + soma->add_mechanism("pas"); + + c.add_cable(0, section_kind::dendrite, 1.0/2, 1.0/2, 200.0); + c.add_cable(1, section_kind::dendrite, 1.0/2, 1.0/2, 200.0); + c.add_cable(1, section_kind::dendrite, 1.0/2, 1.0/2, 200.0); + + for (auto& seg: c.segments()) { + if (seg->is_dendrite()) { + seg->add_mechanism("hh"); + seg->set_compartments(num_comp_-1); + } + } + return std::move(c); + } + + virtual cell_kind get_cell_kind(cell_gid_type) const override { + return cell_kind::cable1d_neuron; + } +}; + +void expsyn_1_branch_current(benchmark::State& state) { + const unsigned ncomp = state.range(0); + const unsigned nsynapse = state.range(1); + recipe_expsyn_1_branch rec_expsyn_1_branch(ncomp, nsynapse); + + std::vector<cell_gid_type> gids = {0}; + std::vector<target_handle> target_handles; + probe_association_map<probe_handle> probe_handles; + + fvm_cell cell; + cell.initialize(gids, rec_expsyn_1_branch, target_handles, probe_handles); + + auto& m = find_mechanism("expsyn", cell); + + while (state.KeepRunning()) { + m->nrn_current(); + } +} + +void expsyn_1_branch_state(benchmark::State& state) { + const unsigned ncomp = state.range(0); + const unsigned nsynapse = state.range(1); + recipe_expsyn_1_branch rec_expsyn_1_branch(ncomp, nsynapse); + + std::vector<cell_gid_type> gids = {0}; + std::vector<target_handle> target_handles; + probe_association_map<probe_handle> probe_handles; + + fvm_cell cell; + cell.initialize(gids, rec_expsyn_1_branch, target_handles, probe_handles); + + auto& m = find_mechanism("expsyn", cell); + + while (state.KeepRunning()) { + m->nrn_state(); + } +} + +void pas_1_branch_current(benchmark::State& state) { + const unsigned ncomp = state.range(0); + recipe_pas_1_branch rec_pas_1_branch(ncomp); + + std::vector<cell_gid_type> gids = {0}; + std::vector<target_handle> target_handles; + probe_association_map<probe_handle> probe_handles; + + fvm_cell cell; + cell.initialize(gids, rec_pas_1_branch, target_handles, probe_handles); + + auto& m = find_mechanism("pas", cell); + + while (state.KeepRunning()) { + m->nrn_current(); + } +} + +void pas_3_branches_current(benchmark::State& state) { + const unsigned ncomp = state.range(0); + recipe_pas_3_branches rec_pas_3_branches(ncomp); + + std::vector<cell_gid_type> gids = {0}; + std::vector<target_handle> target_handles; + probe_association_map<probe_handle> probe_handles; + + fvm_cell cell; + cell.initialize(gids, rec_pas_3_branches, target_handles, probe_handles); + + auto& m = find_mechanism("pas", cell); + + while (state.KeepRunning()) { + m->nrn_current(); + } +} + +void hh_1_branch_state(benchmark::State& state) { + const unsigned ncomp = state.range(0); + recipe_hh_1_branch rec_hh_1_branch(ncomp); + + std::vector<cell_gid_type> gids = {0}; + std::vector<target_handle> target_handles; + probe_association_map<probe_handle> probe_handles; + + fvm_cell cell; + cell.initialize(gids, rec_hh_1_branch, target_handles, probe_handles); + + auto& m = find_mechanism("hh", cell); + + while (state.KeepRunning()) { + m->nrn_state(); + } +} + +void hh_1_branch_current(benchmark::State& state) { + const unsigned ncomp = state.range(0); + recipe_hh_1_branch rec_hh_1_branch(ncomp); + + std::vector<cell_gid_type> gids = {0}; + std::vector<target_handle> target_handles; + probe_association_map<probe_handle> probe_handles; + + fvm_cell cell; + cell.initialize(gids, rec_hh_1_branch, target_handles, probe_handles); + + auto& m = find_mechanism("hh", cell); + + while (state.KeepRunning()) { + m->nrn_current(); + } +} + +void hh_3_branches_state(benchmark::State& state) { + const unsigned ncomp = state.range(0); + recipe_hh_3_branches rec_hh_3_branches(ncomp); + + std::vector<cell_gid_type> gids = {0}; + std::vector<target_handle> target_handles; + probe_association_map<probe_handle> probe_handles; + + fvm_cell cell; + cell.initialize(gids, rec_hh_3_branches, target_handles, probe_handles); + + auto& m = find_mechanism("hh", cell); + + while (state.KeepRunning()) { + m->nrn_state(); + } +} + +void hh_3_branches_current(benchmark::State& state) { + const unsigned ncomp = state.range(0); + recipe_hh_3_branches rec_hh_3_branches(ncomp); + + std::vector<cell_gid_type> gids = {0}; + std::vector<target_handle> target_handles; + probe_association_map<probe_handle> probe_handles; + + fvm_cell cell; + cell.initialize(gids, rec_hh_3_branches, target_handles, probe_handles); + + auto& m = find_mechanism("hh", cell); + + while (state.KeepRunning()) { + m->nrn_current(); + } +} + +void run_custom_arguments(benchmark::internal::Benchmark* b) { + for (auto ncomps: {10, 100, 1000, 10000, 100000, 1000000, 10000000}) { + b->Args({ncomps}); + } +} +void run_exp_custom_arguments(benchmark::internal::Benchmark* b) { + for (auto ncomps: {10, 100, 1000, 10000, 100000, 1000000}) { + b->Args({ncomps, ncomps*10}); + } +} +BENCHMARK(expsyn_1_branch_current)->Apply(run_exp_custom_arguments); +BENCHMARK(expsyn_1_branch_state)->Apply(run_exp_custom_arguments); +BENCHMARK(pas_1_branch_current)->Apply(run_custom_arguments); +BENCHMARK(hh_1_branch_current)->Apply(run_custom_arguments); +BENCHMARK(hh_1_branch_state)->Apply(run_custom_arguments); +BENCHMARK(pas_3_branches_current)->Apply(run_custom_arguments); +BENCHMARK(hh_3_branches_current)->Apply(run_custom_arguments); +BENCHMARK(hh_3_branches_state)->Apply(run_custom_arguments); +BENCHMARK_MAIN(); diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 942b2283..e00458e7 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -68,6 +68,7 @@ set(test_sources test_mechinfo.cpp test_padded.cpp test_partition.cpp + test_partition_by_constraint.cpp test_path.cpp test_point.cpp test_probe.cpp diff --git a/tests/unit/test_partition_by_constraint.cpp b/tests/unit/test_partition_by_constraint.cpp new file mode 100644 index 00000000..7b8400af --- /dev/null +++ b/tests/unit/test_partition_by_constraint.cpp @@ -0,0 +1,153 @@ +#include "../gtest.h" + +#include <array> +#include <forward_list> +#include <string> +#include <vector> + +#include <simd/simd.hpp> +#include <common_types.hpp> +#include <backends/multicore/multicore_common.hpp> +#include <backends/multicore/partition_by_constraint.hpp> + +using namespace arb; +using iarray = multicore::iarray; +static constexpr unsigned simd_width_ = arb::simd::simd_abi::native_width<fvm_value_type>::value; + +const int input_size_ = 1024; + +TEST(partition_by_constraint, partition_contiguous) { + iarray input_index(input_size_); + iarray expected; + multicore::constraint_partition output; + + + for (unsigned i = 0; i < input_size_; i++) { + input_index[i] = i; + if(i % simd_width_ == 0) + expected.push_back(i); + } + + output = multicore::make_constraint_partition(input_index, input_size_, simd_width_); + + EXPECT_EQ(0, output.independent.size()); + EXPECT_EQ(0, output.none.size()); + EXPECT_EQ(0, output.constant.size()); + EXPECT_EQ(expected, output.contiguous); +} + +TEST(partition_by_constraint, partition_constant) { + iarray input_index(input_size_); + iarray expected; + multicore::constraint_partition output; + + const int c = 5; + + for (unsigned i = 0; i < input_size_; i++) { + input_index[i] = c; + if(i % simd_width_ == 0) + expected.push_back(i); + } + + output = multicore::make_constraint_partition(input_index, input_size_, simd_width_); + + EXPECT_EQ(0, output.independent.size()); + EXPECT_EQ(0, output.none.size()); + if(simd_width_ != 1) { + EXPECT_EQ(0, output.contiguous.size()); + EXPECT_EQ(expected, output.constant); + } + else { + EXPECT_EQ(0, output.constant.size()); + EXPECT_EQ(expected, output.contiguous); + } +} + +TEST(partition_by_constraint, partition_independent) { + iarray input_index(input_size_); + iarray expected; + multicore::constraint_partition output; + + for (unsigned i = 0; i < input_size_; i++) { + input_index[i] = i * 2; + if(i % simd_width_ == 0) + expected.push_back(i); + } + + output = multicore::make_constraint_partition(input_index, input_size_, simd_width_); + + EXPECT_EQ(0, output.constant.size()); + EXPECT_EQ(0, output.none.size()); + if(simd_width_ != 1) { + EXPECT_EQ(0, output.contiguous.size()); + EXPECT_EQ(expected, output.independent); + } + else { + EXPECT_EQ(0, output.independent.size()); + EXPECT_EQ(expected, output.contiguous); + } +} + +TEST(partition_by_constraint, partition_none) { + iarray input_index(input_size_); + iarray expected; + multicore::constraint_partition output; + + for (unsigned i = 0; i < input_size_; i++) { + input_index[i] = i / ((simd_width_ + 1)/ 2); + if(i % simd_width_ == 0) + expected.push_back(i); + } + + output = multicore::make_constraint_partition(input_index, input_size_, simd_width_); + + EXPECT_EQ(0, output.independent.size()); + EXPECT_EQ(0, output.constant.size()); + if(simd_width_ != 1) { + EXPECT_EQ(0, output.contiguous.size()); + EXPECT_EQ(expected, output.none); + } + else { + EXPECT_EQ(0, output.none.size()); + EXPECT_EQ(expected, output.contiguous); + } +} + +TEST(partition_by_constraint, partition_random) { + iarray input_index(input_size_); + iarray expected_contiguous, expected_constant, + expected_independent, expected_none, expected_simd_1; + multicore::constraint_partition output; + + + const int c = 5; + for (unsigned i = 0; i < input_size_; i++) { + input_index[i] = i<input_size_/4 ? i/((simd_width_ + 1)/ 2): + i<input_size_/2 ? i*2: + i<input_size_*3/4 ? c: + i; + if (i < input_size_ / 4 && i % simd_width_ == 0) + expected_none.push_back(i); + else if (i < input_size_ / 2 && i % simd_width_ == 0) + expected_independent.push_back(i); + else if (i < input_size_* 3/ 4 && i % simd_width_ == 0) + expected_constant.push_back(i); + else if (i % simd_width_ == 0) + expected_contiguous.push_back(i); + expected_simd_1.push_back(i); + + } + + output = multicore::make_constraint_partition(input_index, input_size_, simd_width_); + + if (simd_width_ != 1) { + EXPECT_EQ(expected_contiguous, output.contiguous); + EXPECT_EQ(expected_constant, output.constant); + EXPECT_EQ(expected_independent, output.independent); + EXPECT_EQ(expected_none, output.none); + } + else { + EXPECT_EQ(expected_simd_1, output.contiguous); + } + +} diff --git a/tests/unit/test_simd.cpp b/tests/unit/test_simd.cpp index 1cec58f7..15589290 100644 --- a/tests/unit/test_simd.cpp +++ b/tests/unit/test_simd.cpp @@ -8,9 +8,11 @@ #include <simd/simd.hpp> #include <simd/avx.hpp> +#include <common_types.hpp> #include "common.hpp" using namespace arb::simd; +using index_constraint = arb::simd::index_constraint; namespace { // Use different distributions in `fill_random`, based on the value type in question: -- GitLab