From 64171e43573a608cbbdaf88e41ed9bcdc4a966e8 Mon Sep 17 00:00:00 2001
From: noraabiakar <nora.abiakar@gmail.com>
Date: Mon, 4 Jun 2018 16:44:57 +0200
Subject: [PATCH] Simd partition by constraint (#494)

Changes have been made to the simd implementation of mechansim functions:

- The node_index array (array of indices that specifies for each mechanism the CVs where it is present), is now partitioned into 4 arrays according to the constraint on each simd_vector in node_index:
    1. contiguous array: contains the indices of all simd_vectors in node_index where the elements in simd_vector are contiguous
    2. constant array: contains the indices of all simd_vectors in node_index where the elements in simd_vector are identical
    3. independent array: contains the indices of all simd_vectors in node_index where the elements in simd_vector are independent (no repetitions) but not contiguous
    4. none array: contains the indices of all simd_vectors in node_index where the none of the above constraints apply

    When mechanism functions are executed, they loop over each of the 4 arrays separately. This allows for optimizations in every category.

- The modcc compiler was modified to generate code for the previous changes, including the optimizations per constraint:
    1. contiguous array: we use vector load/store and vector arithmetic.
    2. constant array: we load only one element and broadcast it into a simd_vector; we use vector arithmetic; we reduce the result; we store one element.
    3. indepndent array: we use vector scatter/gather and vector arithmetic.
    4. none array: we cannot operate on the simd_vector in parallel, we loop over the elements to read, perform arithmetic and write back

- Added a mechanism benchmark for pas, hh and expsyn

- Moved/modified some functions in simd.hpp to ensure that the correct implementation of a function is being called.
---
 modcc/printer/cprinter.cpp                    | 197 ++++++++--
 modcc/printer/cprinter.hpp                    |  11 +
 src/backends/multicore/mechanism.cpp          |   7 +
 src/backends/multicore/mechanism.hpp          |   2 +
 .../multicore/partition_by_constraint.hpp     | 118 ++++++
 src/fvm_lowered_cell_impl.hpp                 |   5 +
 src/simd/implbase.hpp                         |  39 --
 src/simd/simd.hpp                             |  87 ++++-
 tests/ubench/CMakeLists.txt                   |   2 +
 tests/ubench/mech_vec.cpp                     | 363 ++++++++++++++++++
 tests/unit/CMakeLists.txt                     |   1 +
 tests/unit/test_partition_by_constraint.cpp   | 153 ++++++++
 tests/unit/test_simd.cpp                      |   2 +
 13 files changed, 903 insertions(+), 84 deletions(-)
 create mode 100644 src/backends/multicore/partition_by_constraint.hpp
 create mode 100644 tests/ubench/mech_vec.cpp
 create mode 100644 tests/unit/test_partition_by_constraint.cpp

diff --git a/modcc/printer/cprinter.cpp b/modcc/printer/cprinter.cpp
index a75b9a47..c856f4b6 100644
--- a/modcc/printer/cprinter.cpp
+++ b/modcc/printer/cprinter.cpp
@@ -21,6 +21,20 @@ void emit_simd_procedure_proto(std::ostream&, ProcedureExpression*, const std::s
 void emit_api_body(std::ostream&, APIMethod*);
 void emit_simd_api_body(std::ostream&, APIMethod*, moduleKind);
 
+void emit_index_initialize(std::ostream& out, const std::unordered_set<std::string>& indices,
+                           simd_expr_constraint constraint);
+
+void emit_body_for_loop(std::ostream& out, BlockExpression* body, const std::vector<LocalVariable*>& indexed_vars,
+                   const std::unordered_set<std::string>& indices, const simd_expr_constraint& read_constraint,
+                   const simd_expr_constraint& write_constraint);
+
+void emit_for_loop_per_constraint(std::ostream& out, BlockExpression* body,
+                                  const std::vector<LocalVariable*>& indexed_vars,
+                                  const std::unordered_set<std::string>& indices,
+                                  const simd_expr_constraint& read_constraint,
+                                  const simd_expr_constraint& write_constraint,
+                                  std::string underlying_constraint_name);
+
 struct cprint {
     Expression* expr_;
     explicit cprint(Expression* expr): expr_(expr) {}
@@ -33,10 +47,21 @@ struct cprint {
 
 struct simdprint {
     Expression* expr_;
-    explicit simdprint(Expression* expr): expr_(expr) {}
+    bool is_indirect_index_;
+    simd_expr_constraint constraint_;
+
+    explicit simdprint(Expression* expr): expr_(expr), is_indirect_index_(false),
+                                          constraint_(simd_expr_constraint::other) {}
+    explicit simdprint(Expression* expr, bool is_indexed, simd_expr_constraint constraint):
+            expr_(expr), is_indirect_index_(is_indexed), constraint_(constraint) {}
+
+    void set_indirect_index() {
+        is_indirect_index_ = true;
+    }
 
     friend std::ostream& operator<<(std::ostream& out, const simdprint& w) {
         SimdPrinter printer(out);
+        printer.set_var_indexed_to(w.is_indirect_index_);
         return w.expr_->accept(&printer), out;
     }
 };
@@ -107,6 +132,7 @@ std::string emit_cpp_source(const Module& module_, const std::string& ns, simd_s
     if (with_simd) {
         out <<
             "namespace S = ::arb::simd;\n"
+            "using S::index_constraint;\n"
             "static constexpr unsigned simd_width_ = ";
 
         if (!simd.width) {
@@ -416,11 +442,6 @@ static std::string index_i_name(const std::string& index_var) {
     return index_var+"i_";
 }
 
-static std::string index_constraint_name(const std::string& index_var) {
-    return index_var+"constraint_";
-}
-
-
 void SimdPrinter::visit(IdentifierExpression *e) {
     e->symbol()->accept(this);
 }
@@ -431,7 +452,10 @@ void SimdPrinter::visit(LocalVariable* sym) {
 
 void SimdPrinter::visit(VariableExpression *sym) {
     if (sym->is_range()) {
-        out_ << "simd_value(" << sym->name() << "+i_)";
+        if(is_indirect_index_)
+            out_ << "simd_value(" << sym->name() << "+index_)";
+        else
+            out_ << "simd_value(" << sym->name() << "+i_)";
     }
     else {
         out_ << sym->name();
@@ -448,7 +472,10 @@ void SimdPrinter::visit(AssignmentExpression* e) {
     if (lhs->is_variable() && lhs->is_variable()->is_range()) {
         out_ << "simd_value(";
         e->rhs()->accept(this);
-        out_ << ").copy_to(" << lhs->name() << "+i_)";
+        if(is_indirect_index_)
+            out_ << ").copy_to(" << lhs->name() << "+index_)";
+        else
+            out_ << ").copy_to(" << lhs->name() << "+i_)";
     }
     else {
         out_ << lhs->name() << " = ";
@@ -460,7 +487,7 @@ void SimdPrinter::visit(IndexedVariable *sym) {
     indexed_variable_info v = decode_indexed_variable(sym);
     out_ << "S::indirect(" << v.data_var
          << ", " << index_i_name(v.index_var)
-         << ", " << index_constraint_name(v.index_var) << ")";
+         << ", constraint_category_)";
 }
 
 void SimdPrinter::visit(CallExpression* e) {
@@ -505,24 +532,104 @@ void emit_simd_procedure_proto(std::ostream& out, ProcedureExpression* e, const
     out << ")";
 }
 
-void emit_simd_state_read(std::ostream& out, LocalVariable* local) {
+void emit_simd_state_read(std::ostream& out, LocalVariable* local, simd_expr_constraint constraint) {
     out << "simd_value " << local->name();
 
     if (local->is_read()) {
-        out << "(" << simdprint(local->external_variable()) << ");\n";
+        indexed_variable_info v = decode_indexed_variable(local->external_variable());
+        if(constraint == simd_expr_constraint::contiguous) {
+            out << "(" <<  v.data_var
+                 << " + " << v.index_var
+                 << "[index_]);\n";
+        }
+        else if(constraint == simd_expr_constraint::constant){
+            out << "(" << v.data_var
+                 << "[" << v.index_var
+                 << "element0]);\n";
+        }
+        else
+            out << "(" <<  simdprint(local->external_variable()) << ");\n";
     }
     else {
         out << " = 0;\n";
     }
 }
 
-void emit_simd_state_update(std::ostream& out, Symbol* from, IndexedVariable* external) {
+void emit_simd_state_update(std::ostream& out, Symbol* from, IndexedVariable* external, simd_expr_constraint constraint) {
     if (!external->is_write()) return;
 
     const char* op = external->op()==tok::plus? " += ": " -= ";
-    out << simdprint(external) << op << from->name() << ";\n";
+    indexed_variable_info v = decode_indexed_variable(external);
+
+    if(constraint == simd_expr_constraint::contiguous) {
+        out << "simd_value t_"<< external->name() <<"(" << v.data_var << " + " << v.index_var << "[index_]);\n";
+        out << "t_" << external->name() << op << from->name() << ";\n";
+        out << "t_" << external->name() << ".copy_to(" << v.data_var << " + " << v.index_var << "[index_]);\n";
+
+    }
+    else {
+        out << simdprint(external) << op << from->name() << ";\n";
+    }
+}
+
+void emit_index_initialize(std::ostream& out, const std::unordered_set<std::string>& indices,
+                           simd_expr_constraint constraint) {
+    switch(constraint) {
+        case simd_expr_constraint::contiguous :
+            break;
+        case simd_expr_constraint::constant : {
+            for (auto& index: indices) {
+                out << "simd_index::scalar_type " << index << "element0 = " << index << "[index_];\n";
+                out << index_i_name(index) << " = " << index << "element0;\n";
+            }
+        }
+            break;
+        case simd_expr_constraint::other : {
+            for (auto& index: indices) {
+                out << index_i_name(index) << ".copy_from(" << index << ".data() + index_);\n";
+            }
+        }
+            break;
+    }
+}
+
+void emit_body_for_loop(std::ostream& out, BlockExpression* body, const std::vector<LocalVariable*>& indexed_vars,
+                        const std::unordered_set<std::string>& indices, const simd_expr_constraint& read_constraint,
+                        const simd_expr_constraint& write_constraint) {
+    emit_index_initialize(out, indices, read_constraint);
+
+    for (auto& sym: indexed_vars) {
+        emit_simd_state_read(out, sym, read_constraint);
+    }
+
+    simdprint printer(body);
+    printer.set_indirect_index();
+
+    out << printer;
+
+    for (auto& sym: indexed_vars) {
+        emit_simd_state_update(out, sym, sym->external_variable(), write_constraint);
+    }
 }
 
+void emit_for_loop_per_constraint(std::ostream& out, BlockExpression* body,
+                                  const std::vector<LocalVariable*>& indexed_vars,
+                                  const std::unordered_set<std::string>& indices,
+                                  const simd_expr_constraint& read_constraint,
+                                  const simd_expr_constraint& write_constraint,
+                                  std::string underlying_constraint_name) {
+
+    out << "constraint_category_ = index_constraint::"<< underlying_constraint_name << ";\n";
+    out << "for (unsigned i_ = 0; i_ < index_constraints_." << underlying_constraint_name
+        << ".size(); i_++) {\n"
+        << indent;
+
+    out << "index_type index_ = index_constraints_." << underlying_constraint_name << "[i_];\n";
+
+    emit_body_for_loop(out, body, indexed_vars, indices, read_constraint, write_constraint);
+
+    out << popindent << "}\n";
+}
 
 void emit_simd_api_body(std::ostream& out, APIMethod* method, moduleKind module_kind) {
     auto body = method->body();
@@ -533,43 +640,51 @@ void emit_simd_api_body(std::ostream& out, APIMethod* method, moduleKind module_
         indices.insert(decode_indexed_variable(sym->external_variable()).index_var);
     }
 
-    // Note: expect to make index constraints non-constant for point mechanisms as
-    // an optimization in the near future.
+    if (!body->statements().empty()) {
+        if (!indices.empty()) {
+            for (auto& index: indices) {
+                out << "simd_index " << index_i_name(index) << ";\n";
+            }
 
-    // Another note: (TODO) can't actually use index_constraint::independent
-    // for density mechanisms because of collisions in the padded part of
-    // the indices. Work-arounds exist, but not yet implemented.
+            out << "index_constraint constraint_category_;\n\n";
 
+            //Generate for loop for all contiguous simd_vectors
+            simd_expr_constraint constraint = simd_expr_constraint::contiguous;
+            std::string underlying_constraint = "contiguous";
 
-    std::string common_constraint = module_kind==moduleKind::density?
-        //"S::index_constraint::independent":
-        "S::index_constraint::none":
-        "S::index_constraint::none";
+            emit_for_loop_per_constraint(out, body, indexed_vars, indices, constraint,
+                                         constraint, underlying_constraint);
 
-    for (auto& index: indices) {
-        out << "simd_index " << index_i_name(index) << ";\n";
-        out << "constexpr S::index_constraint " << index_constraint_name(index)
-            << " = " << common_constraint << ";\n";
-    }
+            //Generate for loop for all independent simd_vectors
+            constraint = simd_expr_constraint::other;
+            underlying_constraint = "independent";
 
-    if (!body->statements().empty()) {
-        out <<
-            "int n_ = width_;\n"
-            "for (int i_ = 0; i_ < n_; i_ += simd_width_) {\n" << indent;
+            emit_for_loop_per_constraint(out, body, indexed_vars, indices, constraint,
+                                         constraint, underlying_constraint);
 
-        for (auto& index: indices) {
-            out << index_i_name(index) << ".copy_from(" << index << ".data()+i_);\n";
-        }
+            //Generate for loop for all simd_vectors that have no optimizing constraints
+            constraint = simd_expr_constraint::other;
+            underlying_constraint = "none";
 
-        for (auto& sym: indexed_vars) {
-            emit_simd_state_read(out, sym);
-        }
+            emit_for_loop_per_constraint(out, body, indexed_vars, indices, constraint,
+                                         constraint, underlying_constraint);
 
-        out << simdprint(body);
+            //Generate for loop for all constant simd_vectors
+            simd_expr_constraint read_constraint = simd_expr_constraint::constant;
+            simd_expr_constraint write_constraint = simd_expr_constraint::other;
+            underlying_constraint = "constant";
+
+            emit_for_loop_per_constraint(out, body, indexed_vars, indices, read_constraint,
+                                         write_constraint, underlying_constraint);
 
-        for (auto& sym: indexed_vars) {
-            emit_simd_state_update(out, sym, sym->external_variable());
         }
-        out << popindent << "}\n";
+        else {
+            out << "unsigned n_ = width_;\n\n";
+            out <<
+                "for (unsigned i_ = 0; i_ < n_; i_ += simd_width_) {\n" << indent;
+            out << simdprint(body);
+            out << popindent << "}\n";
+        }
     }
 }
+
diff --git a/modcc/printer/cprinter.hpp b/modcc/printer/cprinter.hpp
index 77b2e37c..e7e96ab8 100644
--- a/modcc/printer/cprinter.hpp
+++ b/modcc/printer/cprinter.hpp
@@ -38,6 +38,13 @@ protected:
     std::ostream& out_;
 };
 
+
+enum class simd_expr_constraint{
+    constant,
+    contiguous,
+    other
+};
+
 class SimdPrinter: public Visitor {
 public:
     SimdPrinter(std::ostream& out): out_(out) {}
@@ -45,6 +52,9 @@ public:
     void visit(Expression* e) override {
         throw compiler_exception("SimdPrinter cannot translate expression "+e->to_string());
     }
+    void set_var_indexed_to(bool is_indirect_index) {
+        is_indirect_index_ = is_indirect_index;
+    }
 
     void visit(BlockExpression*) override;
     void visit(CallExpression*) override;
@@ -60,4 +70,5 @@ public:
 
 private:
     std::ostream& out_;
+    bool is_indirect_index_;
 };
diff --git a/src/backends/multicore/mechanism.cpp b/src/backends/multicore/mechanism.cpp
index 73f95529..b0fc076f 100644
--- a/src/backends/multicore/mechanism.cpp
+++ b/src/backends/multicore/mechanism.cpp
@@ -19,6 +19,7 @@
 #include <backends/multicore/mechanism.hpp>
 #include <backends/multicore/multicore_common.hpp>
 #include <backends/multicore/fvm.hpp>
+#include <backends/multicore/partition_by_constraint.hpp>
 
 namespace arb {
 namespace multicore {
@@ -26,6 +27,9 @@ namespace multicore {
 using util::make_range;
 using util::value_by_key;
 
+constexpr unsigned simd_width = S::simd_abi::native_width<fvm_value_type>::value;
+
+
 // Copy elements from source sequence into destination sequence,
 // and fill the remaining elements of the destination sequence
 // with the given fill value.
@@ -127,6 +131,7 @@ void mechanism::instantiate(fvm_size_type id, backend::shared_state& shared, con
     node_index_ = iarray(width_padded_, pad);
     copy_extend(pos_data.cv, node_index_, pos_data.cv.back());
     copy_extend(pos_data.weight, make_range(data_.data(), data_.data()+width_padded_), 0);
+    index_constraints_ = make_constraint_partition(node_index_, width_, simd_width);
 
     for (auto i: ion_index_table()) {
         util::optional<ion_state&> oion = value_by_key(shared.ion_data, i.first);
@@ -140,6 +145,8 @@ void mechanism::instantiate(fvm_size_type id, backend::shared_state& shared, con
         auto& ion_index = *i.second;
         ion_index = iarray(width_padded_, pad);
         copy_extend(indices, ion_index, util::back(indices));
+
+        EXPECTS(compatible_index_constraints(node_index_, ion_index, simd_width));
     }
 
 }
diff --git a/src/backends/multicore/mechanism.hpp b/src/backends/multicore/mechanism.hpp
index 066e287f..c03bbb75 100644
--- a/src/backends/multicore/mechanism.hpp
+++ b/src/backends/multicore/mechanism.hpp
@@ -12,6 +12,7 @@
 #include <mechanism.hpp>
 
 #include <backends/multicore/multicore_common.hpp>
+#include <backends/multicore/partition_by_constraint.hpp>
 #include <backends/multicore/fvm.hpp>
 
 namespace arb {
@@ -81,6 +82,7 @@ protected:
     // Per-mechanism index and weight data, excepting ion indices.
 
     iarray node_index_;
+    constraint_partition index_constraints_;
     const value_type* weight_;    // Points within data_ after instantiation.
 
     // Bulk storage for state and parameter variables.
diff --git a/src/backends/multicore/partition_by_constraint.hpp b/src/backends/multicore/partition_by_constraint.hpp
new file mode 100644
index 00000000..b4cf0c06
--- /dev/null
+++ b/src/backends/multicore/partition_by_constraint.hpp
@@ -0,0 +1,118 @@
+#pragma once
+
+#include<vector>
+#include<simd/simd.hpp>
+
+namespace arb {
+namespace multicore {
+
+namespace S = ::arb::simd;
+using S::index_constraint;
+
+struct constraint_partition {
+    using iarray = arb::multicore::iarray;
+
+    iarray contiguous;
+    iarray constant;
+    iarray independent;
+    iarray none;
+};
+
+template <typename It>
+bool is_contiguous_n(It first, unsigned width) {
+    while (--width) {
+        It next = first;
+        if ((*first) +1 != *++next) {
+            return false;
+        }
+        first = next;
+    }
+    return true;
+}
+
+template <typename It>
+bool is_constant_n(It first, unsigned width) {
+    while (--width) {
+        It next = first;
+        if (*first != *++next) {
+            return false;
+        }
+        first = next;
+    }
+    return true;
+}
+
+template <typename It>
+bool is_independent_n(It first, unsigned width) {
+    while (--width) {
+        It next = first;
+        if (*first == *++next) {
+            return false;
+        }
+        first = next;
+    }
+    return true;
+}
+
+template <typename It>
+index_constraint idx_constraint(It it, unsigned simd_width) {
+    if (is_contiguous_n(it, simd_width)) {
+        return index_constraint::contiguous;
+    }
+    else if (is_constant_n(it, simd_width)) {
+        return index_constraint::constant;
+    }
+    else if (is_independent_n(it, simd_width)) {
+        return index_constraint::independent;
+    }
+    else {
+        return index_constraint::none;
+    }
+}
+
+template <typename T>
+constraint_partition make_constraint_partition(const T& node_index, unsigned width, unsigned simd_width) {
+
+    constraint_partition part;
+    for (unsigned i = 0; i < width; i+= simd_width) {
+        auto ptr = &node_index[i];
+        if (is_contiguous_n(ptr, simd_width)) {
+            part.contiguous.push_back(i);
+        }
+        else if (is_constant_n(ptr, simd_width)) {
+            part.constant.push_back(i);
+        }
+        else if (is_independent_n(ptr, simd_width)) {
+            part.independent.push_back(i);
+        }
+        else {
+            part.none.push_back(i);
+        }
+    }
+    return part;
+}
+
+bool constexpr is_constraint_stronger(index_constraint a, index_constraint b) {
+    return a==b ||
+           a==index_constraint::none ||
+           (a==index_constraint::independent && b==index_constraint::contiguous);
+}
+
+template <typename T>
+bool compatible_index_constraints(T& node_index, T& ion_index, unsigned simd_width){
+    for (unsigned i = 0; i < node_index.size(); i+= simd_width) {
+        auto nc = idx_constraint(&node_index[i], simd_width);
+        auto ic = idx_constraint(&ion_index[i], simd_width);
+
+        if (!is_constraint_stronger(nc, ic)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+} // namespace util
+} // namespace arb
+
+
+
diff --git a/src/fvm_lowered_cell_impl.hpp b/src/fvm_lowered_cell_impl.hpp
index 22b8550c..765dfb2a 100644
--- a/src/fvm_lowered_cell_impl.hpp
+++ b/src/fvm_lowered_cell_impl.hpp
@@ -56,6 +56,11 @@ public:
 
     value_type time() const override { return tmin_; }
 
+    //Exposed for testing purposes
+    std::vector<mechanism_ptr>& mechanisms() {
+        return mechanisms_;
+    }
+
 private:
     // Host or GPU-side back-end dependent storage.
     using array = typename backend::array;
diff --git a/src/simd/implbase.hpp b/src/simd/implbase.hpp
index 9253add1..3722ae58 100644
--- a/src/simd/implbase.hpp
+++ b/src/simd/implbase.hpp
@@ -58,7 +58,6 @@ namespace simd {
 enum class index_constraint {
     none = 0,
     // For indices k[0], k[1],...:
-
     independent, // k[i]==k[j] => i=j.
     contiguous,  // k[i]==k[0]+i
     constant     // k[i]==k[j] ∀ i, j
@@ -435,44 +434,6 @@ struct implbase {
         }
     }
 
-    template <typename ImplIndex>
-    static void compound_indexed_add(tag<ImplIndex> tag, const vector_type& s, scalar_type* p, const typename ImplIndex::vector_type& index, index_constraint constraint) {
-        switch (constraint) {
-        case index_constraint::none:
-            {
-                typename ImplIndex::scalar_type o[width];
-                ImplIndex::copy_to(index, o);
-
-                store a;
-                I::copy_to(s, a);
-
-                for (unsigned i = 0; i<width; ++i) {
-                    p[o[i]] += a[i];
-                }
-            }
-            break;
-        case index_constraint::independent:
-            {
-                vector_type v = I::add(I::gather(tag, p, index), s);
-                I::scatter(tag, v, p, index);
-            }
-            break;
-        case index_constraint::contiguous:
-            {
-                p += ImplIndex::element0(index);
-                vector_type v = I::add(I::copy_from(p), s);
-                I::copy_to(v, p);
-            }
-            break;
-        case index_constraint::constant:
-            {
-                p += ImplIndex::element0(index);
-                *p += I::reduce_add(s);
-            }
-            break;
-        }
-    }
-
     static scalar_type reduce_add(const vector_type& s) {
         store a;
         I::copy_to(s, a);
diff --git a/src/simd/simd.hpp b/src/simd/simd.hpp
index 17922040..02d1ad62 100644
--- a/src/simd/simd.hpp
+++ b/src/simd/simd.hpp
@@ -7,6 +7,7 @@
 #include <simd/implbase.hpp>
 #include <simd/generic.hpp>
 #include <simd/native.hpp>
+#include <common_types.hpp>
 
 namespace arb {
 namespace simd {
@@ -43,13 +44,13 @@ namespace simd_detail {
 
         template <typename Other>
         indirect_expression& operator+=(const simd_impl<Other>& s) {
-            Other::compound_indexed_add(tag<Impl>{}, s.value_, p, index, constraint);
+            simd_impl<Other>::compound_indexed_add(tag<Impl>{}, s.value_, p, index, constraint);
             return *this;
         }
 
         template <typename Other>
         indirect_expression& operator-=(const simd_impl<Other>& s) {
-            Other::compound_indexed_add(tag<Impl>{}, (-s).value_, p, index, constraint);
+            simd_impl<Other>::compound_indexed_add(tag<Impl>{}, (-s).value_, p, index, constraint);
             return *this;
         }
     };
@@ -158,14 +159,92 @@ namespace simd_detail {
 
         template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type>
         void copy_from(indirect_expression<IndexImpl, scalar_type> pi) {
-            value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index);
+            switch (pi.constraint) {
+            case index_constraint::none:
+                value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index);
+                break;
+            case index_constraint::independent:
+                value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index);
+                break;
+            case index_constraint::contiguous:
+                {
+                    scalar_type* p = IndexImpl::element0(pi.index) + pi.p;
+                    value_ = Impl::copy_from(p);
+                }
+                break;
+            case index_constraint::constant:
+                {
+                    scalar_type* p = IndexImpl::element0(pi.index) + pi.p;
+                    scalar_type l = (*p);
+                    value_ = Impl::broadcast(l);
+                }
+                break;
+            }
         }
 
         template <typename IndexImpl, typename = typename std::enable_if<width==simd_traits<IndexImpl>::width>::type>
         void copy_from(indirect_expression<IndexImpl, const scalar_type> pi) {
-            value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index);
+            switch (pi.constraint) {
+            case index_constraint::none:
+                value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index);
+                break;
+            case index_constraint::independent:
+                value_ = Impl::gather(tag<IndexImpl>{}, pi.p, pi.index);
+                break;
+            case index_constraint::contiguous:
+                {
+                    const scalar_type* p = IndexImpl::element0(pi.index) + pi.p;
+                    value_ = Impl::copy_from(p);
+                }
+                break;
+            case index_constraint::constant:
+                {
+                    const scalar_type *p = IndexImpl::element0(pi.index) + pi.p;
+                    scalar_type l = (*p);
+                    value_ = Impl::broadcast(l);
+                }
+                break;
+            }
+
         }
 
+        template <typename ImplIndex>
+        static void compound_indexed_add(tag<ImplIndex> tag, const vector_type& s, scalar_type* p, const typename ImplIndex::vector_type& index, index_constraint constraint) {
+            switch (constraint) { 
+            case index_constraint::none:
+                {
+                    typename ImplIndex::scalar_type o[width];
+                    ImplIndex::copy_to(index, o);
+
+                    scalar_type a[width];
+                    Impl::copy_to(s, a);
+
+                    for (unsigned i = 0; i<width; ++i) {
+                            p[o[i]] += a[i];
+                    }
+                }
+                break;
+            case index_constraint::independent:
+                {
+                    vector_type v = Impl::add(Impl::gather(tag, p, index), s);
+                    Impl::scatter(tag, v, p, index);
+                }
+                break;
+            case index_constraint::contiguous:
+                {
+                    p += ImplIndex::element0(index);
+                    vector_type v = Impl::add(Impl::copy_from(p), s);
+                    Impl::copy_to(v, p);
+                }
+                break;
+            case index_constraint::constant:
+                p += ImplIndex::element0(index);
+                *p += Impl::reduce_add(s);
+                break;
+            }
+        }
+
+
         // Arithmetic operations: +, -, *, /, fma.
 
         simd_impl operator-() const {
diff --git a/tests/ubench/CMakeLists.txt b/tests/ubench/CMakeLists.txt
index 297fb491..c93d4132 100644
--- a/tests/ubench/CMakeLists.txt
+++ b/tests/ubench/CMakeLists.txt
@@ -7,6 +7,7 @@ set(bench_sources
     default_construct.cpp
     event_setup.cpp
     event_binning.cpp
+    mech_vec.cpp
 )
 
 set(bench_sources_cuda
@@ -43,6 +44,7 @@ foreach(bench_src ${bench_sources})
     add_dependencies("${bench_exe}" gbench)
     target_include_directories("${bench_exe}" PRIVATE "${gbench_install_dir}/include")
     target_link_libraries("${bench_exe}" LINK_PUBLIC "${gbench_install_dir}/lib/libbenchmark.a")
+    target_link_libraries("${bench_exe}" LINK_PUBLIC ${ARB_LIBRARIES})
 
     list(APPEND bench_exe_list ${bench_exe})
 endforeach()
diff --git a/tests/ubench/mech_vec.cpp b/tests/ubench/mech_vec.cpp
new file mode 100644
index 00000000..58c1643b
--- /dev/null
+++ b/tests/ubench/mech_vec.cpp
@@ -0,0 +1,363 @@
+// Test performance of vectorization for mechanism implementations.
+//
+// Start with pas (passive dendrite) mechanism
+
+#include <backends/multicore/fvm.hpp>
+#include <benchmark/benchmark.h>
+#include <fvm_lowered_cell_impl.hpp>
+#include <fstream>
+
+using namespace arb;
+
+using backend = arb::multicore::backend;
+using fvm_cell = arb::fvm_lowered_cell_impl<backend>;
+
+mechanism_ptr& find_mechanism(const std::string& name, fvm_cell& cell) {
+    auto &mechs = cell.mechanisms();
+    auto it = std::find_if(mechs.begin(),
+                           mechs.end(),
+                           [&](mechanism_ptr& m){return m->internal_name()==name;});
+    if (it==mechs.end()) {
+        std::cerr << "couldn't find mechanism with name " << name << "\n";
+        exit(1);
+    }
+    return *it;
+}
+
+class recipe_expsyn_1_branch: public recipe {
+    unsigned num_comp_;
+    unsigned num_synapse_;
+public:
+    recipe_expsyn_1_branch(unsigned num_comp, unsigned num_synapse):
+            num_comp_(num_comp), num_synapse_(num_synapse) {}
+
+    cell_size_type num_cells() const override {
+        return 1;
+    }
+
+    virtual util::unique_any get_cell_description(cell_gid_type gid) const override {
+        cell c;
+
+        auto soma = c.add_soma(12.6157/2.0);
+        soma->add_mechanism("pas");
+
+        c.add_cable(0, section_kind::dendrite, 1.0/2, 1.0/2, 200.0);
+
+        for (auto& seg: c.segments()) {
+            if (seg->is_dendrite()) {
+                seg->set_compartments(num_comp_-1);
+            }
+        }
+
+        auto distribution = std::uniform_real_distribution<float>(0.f, 1.0f);
+
+        for(unsigned i = 0; i < num_synapse_; i++) {
+            auto gen = std::mt19937(i);
+            c.add_synapse({1, distribution(gen)}, "expsyn");
+        }
+
+        return std::move(c);
+    }
+
+    virtual cell_kind get_cell_kind(cell_gid_type) const override {
+        return cell_kind::cable1d_neuron;
+    }
+
+};
+
+class recipe_pas_1_branch: public recipe {
+    unsigned num_comp_;
+public:
+    recipe_pas_1_branch(unsigned num_comp): num_comp_(num_comp) {}
+
+    cell_size_type num_cells() const override {
+        return 1;
+    }
+
+    virtual util::unique_any get_cell_description(cell_gid_type gid) const override {
+        cell c;
+
+        auto soma = c.add_soma(12.6157/2.0);
+        soma->add_mechanism("pas");
+
+        c.add_cable(0, section_kind::dendrite, 1.0/2, 1.0/2, 200.0);
+
+        for (auto& seg: c.segments()) {
+            if (seg->is_dendrite()) {
+                seg->add_mechanism("pas");
+                seg->set_compartments(num_comp_-1);
+            }
+        }
+        return std::move(c);
+    }
+
+    virtual cell_kind get_cell_kind(cell_gid_type) const override {
+        return cell_kind::cable1d_neuron;
+    }
+
+};
+
+class recipe_pas_3_branches: public recipe {
+    unsigned num_comp_;
+public:
+    recipe_pas_3_branches(unsigned num_comp): num_comp_(num_comp) {}
+
+    cell_size_type num_cells() const override {
+        return 1;
+    }
+
+    virtual util::unique_any get_cell_description(cell_gid_type gid) const override {
+        cell c;
+
+        auto soma = c.add_soma(12.6157/2.0);
+        soma->add_mechanism("pas");
+
+        c.add_cable(0, section_kind::dendrite, 1.0/2, 1.0/2, 200.0);
+        c.add_cable(1, section_kind::dendrite, 1.0/2, 1.0/2, 200.0);
+        c.add_cable(1, section_kind::dendrite, 1.0/2, 1.0/2, 200.0);
+
+        for (auto& seg: c.segments()) {
+            if (seg->is_dendrite()) {
+                seg->add_mechanism("pas");
+                seg->set_compartments(num_comp_-1);
+            }
+        }
+        return std::move(c);
+    }
+
+    virtual cell_kind get_cell_kind(cell_gid_type) const override {
+        return cell_kind::cable1d_neuron;
+    }
+
+};
+
+class recipe_hh_1_branch: public recipe {
+    unsigned num_comp_;
+public:
+    recipe_hh_1_branch(unsigned num_comp): num_comp_(num_comp) {}
+
+    cell_size_type num_cells() const override {
+        return 1;
+    }
+
+    virtual util::unique_any get_cell_description(cell_gid_type gid) const override {
+        cell c;
+
+        auto soma = c.add_soma(12.6157/2.0);
+        soma->add_mechanism("hh");
+
+        c.add_cable(0, section_kind::dendrite, 1.0/2, 1.0/2, 200.0);
+
+        for (auto& seg: c.segments()) {
+            if (seg->is_dendrite()) {
+                seg->add_mechanism("hh");
+                seg->set_compartments(num_comp_-1);
+            }
+        }
+        return std::move(c);
+    }
+
+    virtual cell_kind get_cell_kind(cell_gid_type) const override {
+        return cell_kind::cable1d_neuron;
+    }
+
+};
+
+class recipe_hh_3_branches: public recipe {
+    unsigned num_comp_;
+public:
+    recipe_hh_3_branches(unsigned num_comp): num_comp_(num_comp) {}
+
+    cell_size_type num_cells() const override {
+        return 1;
+    }
+
+    virtual util::unique_any get_cell_description(cell_gid_type gid) const override {
+        cell c;
+
+        auto soma = c.add_soma(12.6157/2.0);
+        soma->add_mechanism("pas");
+
+        c.add_cable(0, section_kind::dendrite, 1.0/2, 1.0/2, 200.0);
+        c.add_cable(1, section_kind::dendrite, 1.0/2, 1.0/2, 200.0);
+        c.add_cable(1, section_kind::dendrite, 1.0/2, 1.0/2, 200.0);
+
+        for (auto& seg: c.segments()) {
+            if (seg->is_dendrite()) {
+                seg->add_mechanism("hh");
+                seg->set_compartments(num_comp_-1);
+            }
+        }
+        return std::move(c);
+    }
+
+    virtual cell_kind get_cell_kind(cell_gid_type) const override {
+        return cell_kind::cable1d_neuron;
+    }
+};
+
+void expsyn_1_branch_current(benchmark::State& state) {
+    const unsigned ncomp = state.range(0);
+    const unsigned nsynapse = state.range(1);
+    recipe_expsyn_1_branch rec_expsyn_1_branch(ncomp, nsynapse);
+
+    std::vector<cell_gid_type> gids = {0};
+    std::vector<target_handle> target_handles;
+    probe_association_map<probe_handle> probe_handles;
+
+    fvm_cell cell;
+    cell.initialize(gids, rec_expsyn_1_branch, target_handles, probe_handles);
+
+    auto& m = find_mechanism("expsyn", cell);
+
+    while (state.KeepRunning()) {
+        m->nrn_current();
+    }
+}
+
+void expsyn_1_branch_state(benchmark::State& state) {
+    const unsigned ncomp = state.range(0);
+    const unsigned nsynapse = state.range(1);
+    recipe_expsyn_1_branch rec_expsyn_1_branch(ncomp, nsynapse);
+
+    std::vector<cell_gid_type> gids = {0};
+    std::vector<target_handle> target_handles;
+    probe_association_map<probe_handle> probe_handles;
+
+    fvm_cell cell;
+    cell.initialize(gids, rec_expsyn_1_branch, target_handles, probe_handles);
+
+    auto& m = find_mechanism("expsyn", cell);
+
+    while (state.KeepRunning()) {
+        m->nrn_state();
+    }
+}
+
+void pas_1_branch_current(benchmark::State& state) {
+    const unsigned ncomp = state.range(0);
+    recipe_pas_1_branch rec_pas_1_branch(ncomp);
+
+    std::vector<cell_gid_type> gids = {0};
+    std::vector<target_handle> target_handles;
+    probe_association_map<probe_handle> probe_handles;
+
+    fvm_cell cell;
+    cell.initialize(gids, rec_pas_1_branch, target_handles, probe_handles);
+
+    auto& m = find_mechanism("pas", cell);
+
+    while (state.KeepRunning()) {
+        m->nrn_current();
+    }
+}
+
+void pas_3_branches_current(benchmark::State& state) {
+    const unsigned ncomp = state.range(0);
+    recipe_pas_3_branches rec_pas_3_branches(ncomp);
+
+    std::vector<cell_gid_type> gids = {0};
+    std::vector<target_handle> target_handles;
+    probe_association_map<probe_handle> probe_handles;
+
+    fvm_cell cell;
+    cell.initialize(gids, rec_pas_3_branches, target_handles, probe_handles);
+
+    auto& m = find_mechanism("pas", cell);
+
+    while (state.KeepRunning()) {
+        m->nrn_current();
+    }
+}
+
+void hh_1_branch_state(benchmark::State& state) {
+    const unsigned ncomp = state.range(0);
+    recipe_hh_1_branch rec_hh_1_branch(ncomp);
+
+    std::vector<cell_gid_type> gids = {0};
+    std::vector<target_handle> target_handles;
+    probe_association_map<probe_handle> probe_handles;
+
+    fvm_cell cell;
+    cell.initialize(gids, rec_hh_1_branch, target_handles, probe_handles);
+
+    auto& m = find_mechanism("hh", cell);
+
+    while (state.KeepRunning()) {
+        m->nrn_state();
+    }
+}
+
+void hh_1_branch_current(benchmark::State& state) {
+    const unsigned ncomp = state.range(0);
+    recipe_hh_1_branch rec_hh_1_branch(ncomp);
+
+    std::vector<cell_gid_type> gids = {0};
+    std::vector<target_handle> target_handles;
+    probe_association_map<probe_handle> probe_handles;
+
+    fvm_cell cell;
+    cell.initialize(gids, rec_hh_1_branch, target_handles, probe_handles);
+
+    auto& m = find_mechanism("hh", cell);
+
+    while (state.KeepRunning()) {
+        m->nrn_current();
+    }
+}
+
+void hh_3_branches_state(benchmark::State& state) {
+    const unsigned ncomp = state.range(0);
+    recipe_hh_3_branches rec_hh_3_branches(ncomp);
+
+    std::vector<cell_gid_type> gids = {0};
+    std::vector<target_handle> target_handles;
+    probe_association_map<probe_handle> probe_handles;
+
+    fvm_cell cell;
+    cell.initialize(gids, rec_hh_3_branches, target_handles, probe_handles);
+
+    auto& m = find_mechanism("hh", cell);
+
+    while (state.KeepRunning()) {
+        m->nrn_state();
+    }
+}
+
+void hh_3_branches_current(benchmark::State& state) {
+    const unsigned ncomp = state.range(0);
+    recipe_hh_3_branches rec_hh_3_branches(ncomp);
+
+    std::vector<cell_gid_type> gids = {0};
+    std::vector<target_handle> target_handles;
+    probe_association_map<probe_handle> probe_handles;
+
+    fvm_cell cell;
+    cell.initialize(gids, rec_hh_3_branches, target_handles, probe_handles);
+
+    auto& m = find_mechanism("hh", cell);
+
+    while (state.KeepRunning()) {
+        m->nrn_current();
+    }
+}
+
+void run_custom_arguments(benchmark::internal::Benchmark* b) {
+    for (auto ncomps: {10, 100, 1000, 10000, 100000, 1000000, 10000000}) {
+        b->Args({ncomps});
+    }
+}
+void run_exp_custom_arguments(benchmark::internal::Benchmark* b) {
+    for (auto ncomps: {10, 100, 1000, 10000, 100000, 1000000}) {
+        b->Args({ncomps, ncomps*10});
+    }
+}
+BENCHMARK(expsyn_1_branch_current)->Apply(run_exp_custom_arguments);
+BENCHMARK(expsyn_1_branch_state)->Apply(run_exp_custom_arguments);
+BENCHMARK(pas_1_branch_current)->Apply(run_custom_arguments);
+BENCHMARK(hh_1_branch_current)->Apply(run_custom_arguments);
+BENCHMARK(hh_1_branch_state)->Apply(run_custom_arguments);
+BENCHMARK(pas_3_branches_current)->Apply(run_custom_arguments);
+BENCHMARK(hh_3_branches_current)->Apply(run_custom_arguments);
+BENCHMARK(hh_3_branches_state)->Apply(run_custom_arguments);
+BENCHMARK_MAIN();
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 942b2283..e00458e7 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -68,6 +68,7 @@ set(test_sources
     test_mechinfo.cpp
     test_padded.cpp
     test_partition.cpp
+    test_partition_by_constraint.cpp
     test_path.cpp
     test_point.cpp
     test_probe.cpp
diff --git a/tests/unit/test_partition_by_constraint.cpp b/tests/unit/test_partition_by_constraint.cpp
new file mode 100644
index 00000000..7b8400af
--- /dev/null
+++ b/tests/unit/test_partition_by_constraint.cpp
@@ -0,0 +1,153 @@
+#include "../gtest.h"
+
+#include <array>
+#include <forward_list>
+#include <string>
+#include <vector>
+
+#include <simd/simd.hpp>
+#include <common_types.hpp>
+#include <backends/multicore/multicore_common.hpp>
+#include <backends/multicore/partition_by_constraint.hpp>
+
+using namespace arb;
+using iarray = multicore::iarray;
+static constexpr unsigned simd_width_ = arb::simd::simd_abi::native_width<fvm_value_type>::value;
+
+const int input_size_ = 1024;
+
+TEST(partition_by_constraint, partition_contiguous) {
+    iarray input_index(input_size_);
+    iarray expected;
+    multicore::constraint_partition output;
+
+
+    for (unsigned i = 0; i < input_size_; i++) {
+        input_index[i] = i;
+        if(i % simd_width_ == 0)
+            expected.push_back(i);
+    }
+
+    output = multicore::make_constraint_partition(input_index, input_size_, simd_width_);
+
+    EXPECT_EQ(0, output.independent.size());
+    EXPECT_EQ(0, output.none.size());
+    EXPECT_EQ(0, output.constant.size());
+    EXPECT_EQ(expected, output.contiguous);
+}
+
+TEST(partition_by_constraint, partition_constant) {
+    iarray input_index(input_size_);
+    iarray expected;
+    multicore::constraint_partition output;
+
+    const int c = 5;
+
+    for (unsigned i = 0; i < input_size_; i++) {
+        input_index[i] = c;
+        if(i % simd_width_ == 0)
+            expected.push_back(i);
+    }
+
+    output = multicore::make_constraint_partition(input_index, input_size_, simd_width_);
+
+    EXPECT_EQ(0, output.independent.size());
+    EXPECT_EQ(0, output.none.size());
+    if(simd_width_ != 1) {
+        EXPECT_EQ(0, output.contiguous.size());
+        EXPECT_EQ(expected, output.constant);
+    }
+    else {
+        EXPECT_EQ(0, output.constant.size());
+        EXPECT_EQ(expected, output.contiguous);
+    }
+}
+
+TEST(partition_by_constraint, partition_independent) {
+    iarray input_index(input_size_);
+    iarray expected;
+    multicore::constraint_partition output;
+
+    for (unsigned i = 0; i < input_size_; i++) {
+        input_index[i] = i * 2;
+        if(i % simd_width_ == 0)
+            expected.push_back(i);
+    }
+
+    output = multicore::make_constraint_partition(input_index, input_size_, simd_width_);
+
+    EXPECT_EQ(0, output.constant.size());
+    EXPECT_EQ(0, output.none.size());
+    if(simd_width_ != 1) {
+        EXPECT_EQ(0, output.contiguous.size());
+        EXPECT_EQ(expected, output.independent);
+    }
+    else {
+        EXPECT_EQ(0, output.independent.size());
+        EXPECT_EQ(expected, output.contiguous);
+    }
+}
+
+TEST(partition_by_constraint, partition_none) {
+    iarray input_index(input_size_);
+    iarray expected;
+    multicore::constraint_partition output;
+
+    for (unsigned i = 0; i < input_size_; i++) {
+        input_index[i] = i / ((simd_width_ + 1)/ 2);
+        if(i % simd_width_ == 0)
+            expected.push_back(i);
+    }
+
+    output = multicore::make_constraint_partition(input_index, input_size_, simd_width_);
+
+    EXPECT_EQ(0, output.independent.size());
+    EXPECT_EQ(0, output.constant.size());
+    if(simd_width_ != 1) {
+        EXPECT_EQ(0, output.contiguous.size());
+        EXPECT_EQ(expected, output.none);
+    }
+    else {
+        EXPECT_EQ(0, output.none.size());
+        EXPECT_EQ(expected, output.contiguous);
+    }
+}
+
+TEST(partition_by_constraint, partition_random) {
+    iarray input_index(input_size_);
+    iarray expected_contiguous, expected_constant,
+            expected_independent, expected_none, expected_simd_1;
+    multicore::constraint_partition output;
+
+
+    const int c = 5;
+    for (unsigned i = 0; i < input_size_; i++) {
+        input_index[i] = i<input_size_/4   ? i/((simd_width_ + 1)/ 2):
+                         i<input_size_/2   ? i*2:
+                         i<input_size_*3/4 ? c:
+                         i;
+        if (i < input_size_ / 4 && i % simd_width_ == 0)
+            expected_none.push_back(i);
+        else if (i < input_size_ / 2 && i % simd_width_ == 0)
+            expected_independent.push_back(i);
+        else if (i < input_size_* 3/ 4 && i % simd_width_ == 0)
+            expected_constant.push_back(i);
+        else if (i % simd_width_ == 0)
+            expected_contiguous.push_back(i);
+        expected_simd_1.push_back(i);
+
+    }
+
+    output = multicore::make_constraint_partition(input_index, input_size_, simd_width_);
+
+    if (simd_width_ != 1) {
+        EXPECT_EQ(expected_contiguous, output.contiguous);
+        EXPECT_EQ(expected_constant, output.constant);
+        EXPECT_EQ(expected_independent, output.independent);
+        EXPECT_EQ(expected_none, output.none);
+    }
+    else {
+        EXPECT_EQ(expected_simd_1, output.contiguous);
+    }
+
+}
diff --git a/tests/unit/test_simd.cpp b/tests/unit/test_simd.cpp
index 1cec58f7..15589290 100644
--- a/tests/unit/test_simd.cpp
+++ b/tests/unit/test_simd.cpp
@@ -8,9 +8,11 @@
 #include <simd/simd.hpp>
 #include <simd/avx.hpp>
 
+#include <common_types.hpp>
 #include "common.hpp"
 
 using namespace arb::simd;
+using index_constraint = arb::simd::index_constraint;
 
 namespace {
     // Use different distributions in `fill_random`, based on the value type in question:
-- 
GitLab