diff --git a/miniapp/io.cpp b/miniapp/io.cpp
index 330f1a20b231ff22f1c2a07b8d6828184aeef41c..d3acd53097efaf5e16d8f7f60f400cc60aaaea48 100644
--- a/miniapp/io.cpp
+++ b/miniapp/io.cpp
@@ -158,6 +158,9 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         TCLAP::ValueArg<uint32_t> group_size_arg(
             "g", "group-size", "number of cells per cell group",
             false, defopts.compartments_per_segment, "integer", cmd);
+        TCLAP::ValueArg<double> sample_dt_arg(
+            "", "sample-dt", "set sampling interval to <time> ms",
+            false, defopts.sample_dt, "time", cmd);
         TCLAP::ValueArg<double> probe_ratio_arg(
             "p", "probe-ratio", "proportion between 0 and 1 of cells to probe",
             false, defopts.probe_ratio, "proportion", cmd);
@@ -165,10 +168,13 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
             "X", "probe-soma-only", "only probe cell somas, not dendrites", cmd, false);
         TCLAP::ValueArg<std::string> trace_prefix_arg(
             "P", "prefix", "write traces to files with prefix <prefix>",
-            false, defopts.trace_prefix, "stringr", cmd);
+            false, defopts.trace_prefix, "string", cmd);
         TCLAP::ValueArg<util::optional<unsigned>> trace_max_gid_arg(
             "T", "trace-max-gid", "only trace probes on cells up to and including <gid>",
             false, defopts.trace_max_gid, "gid", cmd);
+        TCLAP::ValueArg<std::string> trace_format_arg(
+            "F", "trace-format", "select trace data format: csv or json",
+            false, defopts.trace_format, "string", cmd);
         TCLAP::ValueArg<util::optional<std::string>> morphologies_arg(
             "M", "morphologies", "load morphologies from SWC files matching <glob>",
             false, defopts.morphologies, "glob", cmd);
@@ -214,10 +220,12 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
                     update_option(options.all_to_all, fopts, "all_to_all");
                     update_option(options.ring, fopts, "ring");
                     update_option(options.group_size, fopts, "group_size");
+                    update_option(options.sample_dt, fopts, "sample_dt");
                     update_option(options.probe_ratio, fopts, "probe_ratio");
                     update_option(options.probe_soma_only, fopts, "probe_soma_only");
                     update_option(options.trace_prefix, fopts, "trace_prefix");
                     update_option(options.trace_max_gid, fopts, "trace_max_gid");
+                    update_option(options.trace_format, fopts, "trace_format");
                     update_option(options.morphologies, fopts, "morphologies");
                     update_option(options.morph_rr, fopts, "morph_rr");
                     update_option(options.report_compartments, fopts, "report_compartments");
@@ -258,10 +266,12 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         update_option(options.all_to_all, all_to_all_arg);
         update_option(options.ring, ring_arg);
         update_option(options.group_size, group_size_arg);
+        update_option(options.sample_dt, sample_dt_arg);
         update_option(options.probe_ratio, probe_ratio_arg);
         update_option(options.probe_soma_only, probe_soma_only_arg);
         update_option(options.trace_prefix, trace_prefix_arg);
         update_option(options.trace_max_gid, trace_max_gid_arg);
+        update_option(options.trace_format, trace_format_arg);
         update_option(options.morphologies, morphologies_arg);
         update_option(options.morph_rr, morph_rr_arg);
         update_option(options.report_compartments, report_compartments_arg);
@@ -269,6 +279,10 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         update_option(options.profile_only_zero, profile_only_zero_arg);
         update_option(options.dry_run_ranks, dry_run_ranks_arg);
 
+        if (options.trace_format!="csv" && options.trace_format!="json") {
+            throw usage_error("trace format must be one of: csv, json");
+        }
+
         if (options.all_to_all && options.ring) {
             throw usage_error("can specify at most one of --ring and --all-to-all");
         }
@@ -301,6 +315,7 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
                 fopts["all_to_all"] = options.all_to_all;
                 fopts["ring"] = options.ring;
                 fopts["group_size"] = options.group_size;
+                fopts["sample_dt"] = options.sample_dt;
                 fopts["probe_ratio"] = options.probe_ratio;
                 fopts["probe_soma_only"] = options.probe_soma_only;
                 fopts["trace_prefix"] = options.trace_prefix;
@@ -310,6 +325,7 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
                 else {
                     fopts["trace_max_gid"] = nullptr;
                 }
+                fopts["trace_format"] = options.trace_format;
                 if (options.morphologies) {
                     fopts["morphologies"] = options.morphologies.get();
                 }
@@ -352,6 +368,7 @@ std::ostream& operator<<(std::ostream& o, const cl_options& options) {
     o << "  all to all network   : " << (options.all_to_all ? "yes" : "no") << "\n";
     o << "  ring network         : " << (options.ring ? "yes" : "no") << "\n";
     o << "  group size           : " << options.group_size << "\n";
+    o << "  sample dt            : " << options.sample_dt << "\n";
     o << "  probe ratio          : " << options.probe_ratio << "\n";
     o << "  probe soma only      : " << (options.probe_soma_only ? "yes" : "no") << "\n";
     o << "  trace prefix         : " << options.trace_prefix << "\n";
@@ -360,6 +377,7 @@ std::ostream& operator<<(std::ostream& o, const cl_options& options) {
        o << *options.trace_max_gid;
     }
     o << "\n";
+    o << "  trace format         : " << options.trace_format << "\n";
     o << "  morphologies         : ";
     if (options.morphologies) {
        o << *options.morphologies;
diff --git a/miniapp/io.hpp b/miniapp/io.hpp
index 8e6b7b1f54715d78536820ec70f569614481c4c7..bc16a6b1099b7a911bf32bb456e9017bead41470 100644
--- a/miniapp/io.hpp
+++ b/miniapp/io.hpp
@@ -36,10 +36,12 @@ struct cl_options {
     double bin_dt = 0.0025;   // 0 => no binning.
 
     // Probe/sampling specification.
+    double sample_dt = 0.1;
     bool probe_soma_only = false;
     double probe_ratio = 0;  // Proportion of cells to probe.
     std::string trace_prefix = "trace_";
     util::optional<unsigned> trace_max_gid; // Only make traces up to this gid.
+    std::string trace_format = "json"; // Supported values: 'json' and 'csv'.
 
     // Parameters for spike output.
     bool spike_file_output = false;
diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp
index 9b6cbc0f4d297822fa3a22f93a5ae6df8388b304..c74db58a6d234553d0ef182d5c0bb6608f759cc9 100644
--- a/miniapp/miniapp.cpp
+++ b/miniapp/miniapp.cpp
@@ -37,6 +37,7 @@ std::unique_ptr<sample_trace_type> make_trace(probe_record probe);
 using communicator_type = communication::communicator<communication::global_policy>;
 
 void write_trace_json(const sample_trace_type& trace, const std::string& prefix = "trace_");
+void write_trace_csv(const sample_trace_type& trace, const std::string& prefix = "trace_");
 void report_compartment_stats(const recipe&);
 
 int main(int argc, char** argv) {
@@ -111,7 +112,7 @@ int main(int argc, char** argv) {
 
         // Attach samplers to all probes
         std::vector<std::unique_ptr<sample_trace_type>> traces;
-        const time_type sample_dt = 0.1;
+        const time_type sample_dt = options.sample_dt;
         for (auto probe: m.probes()) {
             if (options.trace_max_gid && probe.id.gid>*options.trace_max_gid) {
                 continue;
@@ -153,8 +154,9 @@ int main(int argc, char** argv) {
         std::cout << "there were " << m.num_spikes() << " spikes\n";
 
         // save traces
+        auto write_trace = options.trace_format=="json"? write_trace_json: write_trace_csv;
         for (const auto& trace: traces) {
-            write_trace_json(*trace.get(), options.trace_prefix);
+            write_trace(*trace.get(), options.trace_prefix);
         }
 
         util::save_to_file(meters, "meters.json");
@@ -227,6 +229,20 @@ std::unique_ptr<sample_trace_type> make_trace(probe_record probe) {
     return util::make_unique<sample_trace_type>(probe.id, name, units);
 }
 
+void write_trace_csv(const sample_trace_type& trace, const std::string& prefix) {
+    auto path = prefix + std::to_string(trace.probe_id.gid) +
+                "." + std::to_string(trace.probe_id.index) + "_" + trace.name + ".csv";
+
+    std::ofstream file(path);
+    file << "# cell: " << trace.probe_id.gid << "\n";
+    file << "# probe: " << trace.probe_id.index << "\n";
+    file << "time_ms, " << trace.name << "_" << trace.units << "\n";
+
+    for (const auto& sample: trace.samples) {
+        file << util::strprintf("% 20.15f, % 20.15f\n", sample.time, sample.value);
+    }
+}
+
 void write_trace_json(const sample_trace_type& trace, const std::string& prefix) {
     auto path = prefix + std::to_string(trace.probe_id.gid) +
                 "." + std::to_string(trace.probe_id.index) + "_" + trace.name + ".json";
diff --git a/modcc/backends/avx2.hpp b/modcc/backends/avx2.hpp
index 7aceb44387bb1b80c8f29e9ad542f13b89456bad..1fbcf848082de751be0f023204ea9cb825366a0c 100644
--- a/modcc/backends/avx2.hpp
+++ b/modcc/backends/avx2.hpp
@@ -156,6 +156,16 @@ struct simd_intrinsics<targetKind::avx2> {
         tb << ")";
     }
 
+    // In avx2 require 4-wide gather of i32 indices.
+    template<typename A, typename I, typename S>
+    static void emit_gather_index(TextBuffer& tb, const A& addr,
+                                  const I& index, const S& scale) {
+        tb << "_mm_i32gather_epi32(";
+        emit_operands(tb, arg_emitter(addr), arg_emitter(index),
+                      arg_emitter(scale));
+        tb << ")";
+    }
+
     template<typename T>
     static void emit_set_value(TextBuffer& tb, const T& arg) {
         tb << "_mm256_set1_pd(";
diff --git a/modcc/backends/avx512.hpp b/modcc/backends/avx512.hpp
index 87b7ef5da6e47c837178c2921b8f626da486da13..f1c7fce179afd21b4284af6026c58326867b89bf 100644
--- a/modcc/backends/avx512.hpp
+++ b/modcc/backends/avx512.hpp
@@ -130,6 +130,16 @@ struct simd_intrinsics<targetKind::avx512> {
         tb << ")";
     }
 
+    // In avx512 require 8-wide gather of i32 indices.
+    template<typename A, typename I, typename S>
+    static void emit_gather_index(TextBuffer& tb, const A& addr,
+                                  const I& index, const S& scale) {
+        tb << "_mm256_i32gather_epi32(";
+        emit_operands(tb, arg_emitter(addr), arg_emitter(index),
+                      arg_emitter(scale));
+        tb << ")";
+    }
+
     template<typename T>
     static void emit_set_value(TextBuffer& tb, const T& arg) {
         tb << "_mm512_set1_pd(";
diff --git a/modcc/backends/base.hpp b/modcc/backends/base.hpp
index 8323cb484a41dc145f857e4a187fc3708fd7e443..c5d0fdf45eb826fdf13fc1869915d289dfa6af89 100644
--- a/modcc/backends/base.hpp
+++ b/modcc/backends/base.hpp
@@ -76,6 +76,11 @@ struct simd_intrinsics {
     static void emit_gather(TextBuffer& tb, const A& addr,
                             const I& index, const S& scale);
 
+    // int32 value version of `emit_gather` to look up cell indices
+    template<typename A, typename I, typename S>
+    static void emit_gather_index(TextBuffer& tb, const A& addr,
+                                  const I& index, const S& scale);
+
     template<typename T>
     static void emit_set_value(TextBuffer& tb, const T& arg);
 
diff --git a/modcc/cprinter.cpp b/modcc/cprinter.cpp
index b871d60cd6ffc19cb406be98dcf663e43068bbea..f553a7a2cc6394159e651a834446e860ae2af68d 100644
--- a/modcc/cprinter.cpp
+++ b/modcc/cprinter.cpp
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <string>
 
 #include "cprinter.hpp"
 #include "lexer.hpp"
@@ -56,8 +57,10 @@ std::string CPrinter::emit_source() {
     text_.add_line("using iarray  = typename base::iarray;");
     text_.add_line("using view   = typename base::view;");
     text_.add_line("using iview  = typename base::iview;");
+    text_.add_line("using const_view = typename base::const_view;");
     text_.add_line("using const_iview = typename base::const_iview;");
     text_.add_line("using ion_type = typename base::ion_type;");
+    text_.add_line("using multi_event_stream = typename base::multi_event_stream;");
     text_.add_line();
 
     //////////////////////////////////////////////
@@ -85,8 +88,8 @@ std::string CPrinter::emit_source() {
     //////////////////////////////////////////////
     int num_vars = array_variables.size();
     text_.add_line();
-    text_.add_line(class_name + "(view vec_v, view vec_i, array&& weights, iarray&& node_index)");
-    text_.add_line(":   base(vec_v, vec_i, std::move(node_index))");
+    text_.add_line(class_name + "(size_type mech_id, const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, array&& weights, iarray&& node_index)");
+    text_.add_line(":   base(mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(node_index))");
     text_.add_line("{");
     text_.increase_indentation();
     text_.add_gutter() << "size_type num_fields = " << num_vars << ";";
@@ -176,11 +179,7 @@ std::string CPrinter::emit_source() {
     text_.add_line("}");
     text_.add_line();
 
-    text_.add_line("void set_params(value_type t_, value_type dt_) override {");
-    text_.increase_indentation();
-    text_.add_line("t = t_;");
-    text_.add_line("dt = dt_;");
-    text_.decrease_indentation();
+    text_.add_line("void set_params() override {");
     text_.add_line("}");
     text_.add_line();
 
@@ -319,17 +318,36 @@ std::string CPrinter::emit_source() {
     auto proctest = [] (procedureKind k) {
         return is_in(k, {procedureKind::normal, procedureKind::api, procedureKind::net_receive});
     };
+    bool override_deliver_events = false;
     for(auto const& var: module_->symbols()) {
         auto isproc = var.second->kind()==symbolKind::procedure;
-        if(isproc )
-        {
+        if(isproc) {
             auto proc = var.second->is_procedure();
             if(proctest(proc->kind())) {
                 proc->accept(this);
             }
+            override_deliver_events |= proc->kind()==procedureKind::net_receive;
         }
     }
 
+    if(override_deliver_events) {
+        text_.add_line("void deliver_events(multi_event_stream& events) override {");
+        text_.increase_indentation();
+        text_.add_line("auto ncell = events.n_streams();");
+        text_.add_line("for (size_type c = 0; c<ncell; ++c) {");
+        text_.increase_indentation();
+        text_.add_line("for (auto ev: events.marked_events(c)) {");
+        text_.increase_indentation();
+        text_.add_line("if (ev.handle.mech_id==mech_id_) net_receive(ev.handle.index, ev.weight);");
+        text_.decrease_indentation();
+        text_.add_line("}");
+        text_.decrease_indentation();
+        text_.add_line("}");
+        text_.decrease_indentation();
+        text_.add_line("}");
+        text_.add_line();
+    }
+
     //////////////////////////////////////////////
     //////////////////////////////////////////////
 
@@ -357,6 +375,11 @@ std::string CPrinter::emit_source() {
     }
 
     text_.add_line();
+    text_.add_line("using base::mech_id_;");
+    text_.add_line("using base::vec_ci_;");
+    text_.add_line("using base::vec_t_;");
+    text_.add_line("using base::vec_t_to_;");
+    text_.add_line("using base::vec_dt_;");
     text_.add_line("using base::vec_v_;");
     text_.add_line("using base::vec_i_;");
     text_.add_line("using base::node_index_;");
@@ -380,6 +403,7 @@ void CPrinter::emit_headers() {
     text_.add_line();
     text_.add_line("#include <mechanism.hpp>");
     text_.add_line("#include <algorithms.hpp>");
+    text_.add_line("#include <backends/multicore/multi_event_stream.hpp>");
     text_.add_line("#include <util/pprintf.hpp>");
     text_.add_line();
 }
@@ -429,6 +453,10 @@ void CPrinter::visit(IndexedVariable *e) {
     text_ << e->index_name() << "[i_]";
 }
 
+void CPrinter::visit(CellIndexedVariable *e) {
+    text_ << e->index_name() << "[i_]";
+}
+
 void CPrinter::visit(UnaryExpression *e) {
     auto b = (e->expression()->is_binary()!=nullptr);
     switch(e->op()) {
@@ -582,20 +610,25 @@ void CPrinter::visit(APIMethod *e) {
         // create local indexed views
         for(auto &symbol : e->scope()->locals()) {
             auto var = symbol.second->is_local_variable();
-            if(var->is_indexed()) {
-                auto const& name = var->name();
-                auto const& index_name = var->external_variable()->index_name();
-                text_.add_gutter();
-                text_ << "auto " + index_name + " = util::indirect_view";
-                auto channel = var->external_variable()->ion_channel();
-                if(channel==ionKind::none) {
-                    text_ << "(" + index_name + "_, node_index_);\n";
-                }
-                else {
-                    auto iname = ion_store(channel);
-                    text_ << "(" << iname << "." << name << ", "
-                          << ion_store(channel) << ".index);\n";
-                }
+            if (!var->is_indexed()) continue;
+
+            auto external = var->external_variable();
+            auto const& name = var->name();
+            auto const& index_name = external->index_name();
+
+            text_.add_gutter();
+            text_ << "auto " + index_name + " = ";
+
+            if(external->is_cell_indexed_variable()) {
+                text_ << "util::indirect_view(util::indirect_view(" + index_name + "_, vec_ci_), node_index_);\n";
+            }
+            else if(external->is_ion()) {
+                auto channel = external->ion_channel();
+                auto iname = ion_store(channel);
+                text_ << "util::indirect_view(" << iname << "." << name << ", " << ion_store(channel) << ".index);\n";
+            }
+            else {
+                text_ << "util::indirect_view(" + index_name + "_, node_index_);\n";
             }
         }
 
diff --git a/modcc/cprinter.hpp b/modcc/cprinter.hpp
index 2b58a9fc85d7b891eb2453e84b86520430f25fec..0a254bc569fbc7d7209c0767d45612b96cf6fea7 100644
--- a/modcc/cprinter.hpp
+++ b/modcc/cprinter.hpp
@@ -21,6 +21,7 @@ public:
     virtual void visit(Symbol *e)               override;
     virtual void visit(LocalVariable *e)        override;
     virtual void visit(IndexedVariable *e)      override;
+    virtual void visit(CellIndexedVariable *e)  override;
     virtual void visit(IdentifierExpression *e) override;
     virtual void visit(CallExpression *e)       override;
     virtual void visit(ProcedureExpression *e)  override;
diff --git a/modcc/cudaprinter.cpp b/modcc/cudaprinter.cpp
index 5672dd48fb152b23b67bd2fdbe7c65756bec8d89..f4e3db811a20aa0a4e58c50148e4b3f8f147f565 100644
--- a/modcc/cudaprinter.cpp
+++ b/modcc/cudaprinter.cpp
@@ -42,6 +42,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     text_.add_line("#include <mechanism.hpp>");
     text_.add_line("#include <algorithms.hpp>");
     text_.add_line("#include <backends/gpu/intrinsics.hpp>");
+    text_.add_line("#include <backends/gpu/multi_event_stream.hpp>");
     text_.add_line("#include <util/pprintf.hpp>");
     text_.add_line();
 
@@ -82,6 +83,16 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
         param_pack.push_back(tname + ".index.data()");
     }
 
+    text_.add_line("// cv index to cell mapping and cell time states");
+    text_.add_line("const I* ci;");
+    text_.add_line("const T* vec_t;");
+    text_.add_line("const T* vec_t_to;");
+    text_.add_line("const T* vec_dt;");
+    param_pack.push_back("vec_ci_.data()");
+    param_pack.push_back("vec_t_.data()");
+    param_pack.push_back("vec_t_to_.data()");
+    param_pack.push_back("vec_dt_.data()");
+
     text_.add_line("// voltage and current state within the cell");
     text_.add_line("T* vec_v;");
     text_.add_line("T* vec_i;");
@@ -103,6 +114,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     text_.add_line("namespace kernels {");
     {
         increase_indentation();
+        text_.add_line("using nest::mc::gpu::multi_event_stream;");
 
         // forward declarations of procedures
         for(auto const &var : m.symbols()) {
@@ -149,6 +161,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     text_.add_line("using typename base::const_iview;");
     text_.add_line("using typename base::const_view;");
     text_.add_line("using typename base::ion_type;");
+    text_.add_line("using multi_event_stream = typename base::multi_event_stream;");
     text_.add_line("using param_pack_type = " + module_name + "_ParamPack<value_type, size_type>;");
 
     //////////////////////////////////////////////
@@ -178,8 +191,8 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
 
     int num_vars = array_variables.size();
     text_.add_line();
-    text_.add_line(class_name + "(view vec_v, view vec_i, array&& weights, iarray&& node_index):");
-    text_.add_line("   base(vec_v, vec_i, std::move(node_index))");
+    text_.add_line(class_name + "(size_type mech_id, const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, array&& weights, iarray&& node_index):");
+    text_.add_line("   base(mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(node_index))");
     text_.add_line("{");
     text_.increase_indentation();
     text_.add_gutter() << "size_type num_fields = " << num_vars << ";";
@@ -252,13 +265,8 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     text_.add_line("}");
     text_.add_line();
 
-    // print the member funtion that
-    //   *  sets time step parameters
-    //   *  packs up the parameters for use on the GPU
-    text_.add_line("void set_params(value_type t_, value_type dt_) override {");
-    text_.increase_indentation();
-    text_.add_line("t = t_;");
-    text_.add_line("dt = dt_;");
+    // print the member function that packs up the parameters for use on the GPU
+    text_.add_line("void set_params() override {");
     text_.add_line("param_pack_ =");
     text_.increase_indentation();
     text_.add_line("param_pack_type {");
@@ -432,15 +440,23 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
         else if( var.second->kind()==symbolKind::procedure &&
                  var.second->is_procedure()->kind()==procedureKind::net_receive)
         {
-            // Simplest net_receive implementation forwards a single update
-            // to a GPU kernel.
-            auto proc = var.second->is_procedure();
-            auto name = proc->name();
-            text_.add_line("void " + name + "(int i_, value_type weight) {");
+            // Override `deliver_events`.
+            text_.add_line("void deliver_events(multi_event_stream& events) override {");
             text_.increase_indentation();
-            text_.add_line(
-                "kernels::" + name + "<value_type, size_type>"
-                + "<<<1, 1>>>(param_pack_, i_, weight);");
+            text_.add_line("auto ncell = events.n_streams();");
+            text_.add_line("constexpr int blockwidth = 128;");
+            text_.add_line("int nblock = 1+(ncell-1)/blockwidth;");
+            text_.add_line("kernels::deliver_events<value_type, size_type>"
+                           "<<<nblock, blockwidth>>>(param_pack_, mech_id_, events.delivery_data());");
+            text_.decrease_indentation();
+            text_.add_line("}");
+            text_.add_line();
+
+            // Provide testing interface to `net_receive`.
+            text_.add_line("void net_receive(int i_, value_type weight) override {");
+            text_.increase_indentation();
+            text_.add_line("kernels::net_receive_global<value_type, size_type>"
+                           "<<<1, 1>>>(param_pack_, i_, weight);");
             text_.decrease_indentation();
             text_.add_line("}");
             text_.add_line();
@@ -468,6 +484,11 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
         }
     }
 
+    text_.add_line("using base::mech_id_;");
+    text_.add_line("using base::vec_ci_;");
+    text_.add_line("using base::vec_t_;");
+    text_.add_line("using base::vec_t_to_;");
+    text_.add_line("using base::vec_dt_;");
     text_.add_line("using base::vec_v_;");
     text_.add_line("using base::vec_i_;");
     text_.add_line("using base::node_index_;");
@@ -531,6 +552,9 @@ std::string CUDAPrinter::index_string(Symbol *s) {
                     s->location());
         }
     }
+    else if(s->is_cell_indexed_variable()) {
+        return "cid_";
+    }
     return "";
 }
 
@@ -538,6 +562,11 @@ void CUDAPrinter::visit(IndexedVariable *e) {
     text_ << "params_." << e->index_name() << "[" << index_string(e) << "]";
 }
 
+void CUDAPrinter::visit(CellIndexedVariable *e) {
+    text_ << "params_." << e->index_name() << "[" << index_string(e) << "]";
+}
+
+
 void CUDAPrinter::visit(LocalVariable *e) {
     std::string const& name = e->name();
     text_ << name;
@@ -680,10 +709,11 @@ void CUDAPrinter::visit(ProcedureExpression *e) {
     }
     else {
         // net_receive() kernel is a special case, not covered by APIMethod visit.
+
+        // Core `net_receive` kernel is called device-side from `kernel::deliver_events`.
         text_.add_gutter() << "template <typename T, typename I>\n";
-        text_.add_line(       "__global__");
-        text_.add_gutter() << "void " << e->name()
-                           << "(" << module_->name() << "_ParamPack<T,I> params_, "
+        text_.add_line(       "__device__");
+        text_.add_gutter() << "void net_receive(const " << module_->name() << "_ParamPack<T,I>& params_, "
                            << "I i_, T weight) {";
         text_.add_line();
         increase_indentation();
@@ -692,15 +722,60 @@ void CUDAPrinter::visit(ProcedureExpression *e) {
         text_.add_line("using iarray = I;");
         text_.add_line();
 
-        text_.add_line("if (threadIdx.x || blockIdx.x) return;");
         text_.add_line("auto tid_ = i_;");
         text_.add_line("auto gid_ __attribute__((unused)) = params_.ni[tid_];");
+        text_.add_line("auto cid_ __attribute__((unused)) = params_.ci[gid_];");
 
         print_APIMethod_body(e);
 
         decrease_indentation();
         text_.add_line("}");
         text_.add_line();
+
+        // Global one-thread wrapper for `net_receive` kernel is used to implement the
+        // `mechanism::net_receive` method. This is not called in the normal course
+        // of event delivery.
+        text_.add_gutter() << "template <typename T, typename I>\n";
+        text_.add_line(       "__global__");
+        text_.add_gutter() << "void net_receive_global("
+                           << module_->name() << "_ParamPack<T,I> params_, "
+                           << "I i_, T weight) {";
+        text_.add_line();
+        increase_indentation();
+
+        text_.add_line("if (threadIdx.x || blockIdx.x) return;");
+        text_.add_line("net_receive<T, I>(params_, i_, weight);");
+
+        decrease_indentation();
+        text_.add_line("}");
+        text_.add_line();
+
+        text_.add_gutter() << "template <typename T, typename I>\n";
+        text_.add_line(       "__global__");
+        text_.add_gutter() << "void deliver_events("
+                           << module_->name() << "_ParamPack<T,I> params_, "
+                           << "I mech_id, multi_event_stream::span_state state) {";
+        text_.add_line();
+        increase_indentation();
+
+        text_.add_line("auto tid_ = threadIdx.x + blockDim.x*blockIdx.x;");
+        text_.add_line("auto const ncell_ = state.n;");
+        text_.add_line();
+        text_.add_line("if(tid_<ncell_) {");
+        increase_indentation();
+
+        text_.add_line("for (auto j = state.span_begin[tid_]; j<state.mark[tid_]; ++j) {");
+        increase_indentation();
+        text_.add_line("if (state.ev_mech_id[j]==mech_id) net_receive<T, I>(params_, state.ev_index[j], state.ev_weight[j]);");
+        decrease_indentation();
+        text_.add_line("}");
+
+        decrease_indentation();
+        text_.add_line("}");
+
+        decrease_indentation();
+        text_.add_line("}");
+        text_.add_line();
     }
 }
 
@@ -731,6 +806,7 @@ void CUDAPrinter::visit(APIMethod *e) {
     increase_indentation();
 
     text_.add_line("auto gid_ __attribute__((unused)) = params_.ni[tid_];");
+    text_.add_line("auto cid_ __attribute__((unused)) = params_.ci[gid_];");
 
     print_APIMethod_body(e);
 
diff --git a/modcc/cudaprinter.hpp b/modcc/cudaprinter.hpp
index 0f4acbbbe1c8b0ed53b978aeea946a6dd222a3c7..7c5165225790750323fa8f94ab1c1e088659a086 100644
--- a/modcc/cudaprinter.hpp
+++ b/modcc/cudaprinter.hpp
@@ -22,6 +22,7 @@ public:
     void visit(Symbol *e)               override;
     void visit(LocalVariable *e)        override;
     void visit(IndexedVariable *e)      override;
+    void visit(CellIndexedVariable *e)  override;
 
     void visit(IdentifierExpression *e) override;
     void visit(CallExpression *e)       override;
diff --git a/modcc/expression.cpp b/modcc/expression.cpp
index 7b05d65fe8c974a9bc54c5e01e81e9b6cc666a49..2fcf79e32d5337f1a6b6e913dbe90d4610934dc6 100644
--- a/modcc/expression.cpp
+++ b/modcc/expression.cpp
@@ -96,7 +96,7 @@ void IdentifierExpression::semantic(scope_ptr scp) {
     // indexed variable is used in this procedure. In which case, we create
     // a local variable which refers to the indexed variable, which will be
     // found for any subsequent variable lookup inside the procedure
-    if(auto sym = s->is_indexed_variable()) {
+    if(auto sym = s->is_abstract_indexed_variable()) {
         auto var = new LocalVariable(location_, spelling_);
         var->external_variable(sym);
         s = scope_->add_local_symbol(spelling_, scope_type::symbol_ptr{var});
@@ -262,6 +262,15 @@ std::string IndexedVariable::to_string() const {
         + ", ion" + (ion_channel()==ionKind::none ? red(ch) : green(ch)) + ") ";
 }
 
+/*******************************************************************************
+  CellIndexedVariable
+*******************************************************************************/
+
+std::string CellIndexedVariable::to_string() const {
+    auto ch = ::to_string(ion_channel());
+    return blue("cellindexed") + " " + yellow(name()) + "->" + yellow(index_name());
+}
+
 /*******************************************************************************
   ReactionExpression
 *******************************************************************************/
@@ -869,6 +878,9 @@ void VariableExpression::accept(Visitor *v) {
 void IndexedVariable::accept(Visitor *v) {
     v->visit(this);
 }
+void CellIndexedVariable::accept(Visitor *v) {
+    v->visit(this);
+}
 void NumberExpression::accept(Visitor *v) {
     v->visit(this);
 }
diff --git a/modcc/expression.hpp b/modcc/expression.hpp
index 0da1dac24f9926cfef31d6c78aa597b9a0ef7a74..84eb7d09f86e786bc2646f4c96993cb2a9a2564b 100644
--- a/modcc/expression.hpp
+++ b/modcc/expression.hpp
@@ -21,7 +21,9 @@ class BlockExpression;
 class InitialBlock;
 class IfExpression;
 class VariableExpression;
+class AbstractIndexedVariable;
 class IndexedVariable;
+class CellIndexedVariable;
 class NumberExpression;
 class IntegerExpression;
 class LocalDeclaration;
@@ -233,7 +235,9 @@ public :
     virtual ProcedureExpression*  is_procedure()         {return nullptr;}
     virtual NetReceiveExpression* is_net_receive()       {return nullptr;}
     virtual APIMethod*            is_api_method()        {return nullptr;}
+    virtual AbstractIndexedVariable* is_abstract_indexed_variable()  {return nullptr;}
     virtual IndexedVariable*      is_indexed_variable()  {return nullptr;}
+    virtual CellIndexedVariable*  is_cell_indexed_variable()  {return nullptr;}
     virtual LocalVariable*        is_local_variable()    {return nullptr;}
 
 private :
@@ -509,8 +513,27 @@ protected:
     double         value_       = std::numeric_limits<double>::quiet_NaN();
 };
 
+// abstract base class for the two sorts of indexed externals
+class AbstractIndexedVariable: public Symbol {
+public:
+    AbstractIndexedVariable(Location loc, std::string name, symbolKind kind)
+    :   Symbol(std::move(loc), std::move(name), std::move(kind))
+    {}
+
+    virtual accessKind access() const = 0;
+    virtual ionKind ion_channel() const = 0;
+    virtual std::string const& index_name() const = 0;
+    virtual tok op() const = 0;
+
+    virtual bool is_ion()   const = 0;
+    virtual bool is_read()  const = 0;
+    virtual bool is_write() const = 0;
+
+    AbstractIndexedVariable* is_abstract_indexed_variable() override {return this;}
+};
+
 // an indexed variable
-class IndexedVariable : public Symbol {
+class IndexedVariable : public AbstractIndexedVariable {
 public:
     IndexedVariable(Location loc,
                     std::string lookup_name,
@@ -518,7 +541,7 @@ public:
                     accessKind acc,
                     tok o=tok::eq,
                     ionKind channel=ionKind::none)
-    :   Symbol(loc, std::move(lookup_name), symbolKind::indexed_variable),
+    :   AbstractIndexedVariable(loc, std::move(lookup_name), symbolKind::indexed_variable),
         access_(acc),
         ion_channel_(channel),
         index_name_(index_name),
@@ -529,7 +552,7 @@ public:
         if(access()==accessKind::readwrite) {
             msg = pprintf("attempt to generate an index % with readwrite access",
                           yellow(lookup_name));
-            goto compiler_error;
+           goto compiler_error;
         }
         // read only variables must be assigned via equality
         if(is_read() && op()!=tok::eq) {
@@ -546,30 +569,20 @@ public:
 
         return;
 
-compiler_error:
+    compiler_error:
         throw(compiler_exception(msg, location_));
     }
 
     std::string to_string() const override;
 
-    accessKind access() const {
-        return access_;
-    }
-
-    ionKind ion_channel() const {
-        return ion_channel_;
-    }
-    tok op() const {
-        return op_;
-    }
-
-    std::string const& index_name() const {
-        return index_name_;
-    }
+    accessKind access() const override { return access_; }
+    ionKind ion_channel() const override { return ion_channel_; }
+    std::string const& index_name() const override { return index_name_; }
+    tok op() const override { return op_; }
 
-    bool is_ion()   const {return ion_channel_ != ionKind::none;}
-    bool is_read()  const {return access_ == accessKind::read;  }
-    bool is_write() const {return access_ == accessKind::write; }
+    bool is_ion()   const override { return ion_channel_ != ionKind::none; }
+    bool is_read()  const override { return access_ == accessKind::read;   }
+    bool is_write() const override { return access_ == accessKind::write;  }
 
     void accept(Visitor *v) override;
     IndexedVariable* is_indexed_variable() override {return this;}
@@ -582,6 +595,36 @@ protected:
     tok op_;
 };
 
+class CellIndexedVariable : public AbstractIndexedVariable {
+public:
+    CellIndexedVariable(Location loc,
+                    std::string lookup_name,
+                    std::string index_name)
+    :   AbstractIndexedVariable(loc, std::move(lookup_name), symbolKind::indexed_variable),
+        index_name_(std::move(index_name))
+    {}
+
+    std::string to_string() const override;
+
+    accessKind access() const override { return accessKind::read; }
+    ionKind ion_channel() const override { return ionKind::none; }
+    std::string const& index_name() const override { return index_name_; }
+    tok op() const override { return tok::plus; }
+
+    bool is_ion()   const override { return false; }
+    bool is_read()  const override { return true; }
+    bool is_write() const override { return false; }
+
+    void accept(Visitor *v) override;
+    CellIndexedVariable* is_cell_indexed_variable() override {return this;}
+
+    ~CellIndexedVariable() {}
+
+protected:
+    std::string index_name_;
+};
+
+
 class LocalVariable : public Symbol {
 public :
     LocalVariable(Location loc,
@@ -626,11 +669,11 @@ public :
         return kind_==localVariableKind::argument;
     }
 
-    IndexedVariable* external_variable() {
+    AbstractIndexedVariable* external_variable() {
         return external_;
     }
 
-    void external_variable(IndexedVariable *i) {
+    void external_variable(AbstractIndexedVariable *i) {
         external_ = i;
     }
 
@@ -638,7 +681,7 @@ public :
     void accept(Visitor *v) override;
 
 private :
-    IndexedVariable *external_=nullptr;
+    AbstractIndexedVariable *external_=nullptr;
     localVariableKind kind_;
 };
 
diff --git a/modcc/module.cpp b/modcc/module.cpp
index 31a7b1261c6bbba6c6b888e0bda3e5701aa64bd1..41358492789bbbdaab34a248468369745a851b77 100644
--- a/modcc/module.cpp
+++ b/modcc/module.cpp
@@ -295,17 +295,13 @@ bool Module::semantic() {
     // The second is nrn_current, which is handled after this block
     auto state_api  = make_empty_api_method("nrn_state", "breakpoint");
     auto api_state  = state_api.first;
-    auto breakpoint = state_api.second;
+    auto breakpoint = state_api.second; // implies we are building the `nrn_state()` method.
 
     api_state->semantic(symbols_);
     scope_ptr nrn_state_scope = api_state->scope();
 
     if(breakpoint) {
-        //..........................................................
-        // nrn_state : The temporal integration of state variables
-        //..........................................................
-
-        // grab SOLVE statements, put them in `nrn_state` after translation.
+        // Grab SOLVE statements, put them in `nrn_state` after translation.
         bool found_solve = false;
         bool found_non_solve = false;
         std::set<std::string> solved_ids;
@@ -360,6 +356,7 @@ bool Module::semantic() {
                 }
 
                 // May have now redundant local variables; remove these first.
+                solve_block->semantic(nrn_state_scope);
                 solve_block = remove_unused_locals(solve_block->is_block());
 
                 // Copy body into nrn_state.
@@ -381,10 +378,14 @@ bool Module::semantic() {
                     breakpoint->location());
         }
         else {
-            // redo semantic pass in order to elimate any removed local symbols.
+            // Redo semantic pass in order to eliminate any removed local symbols.
             api_state->semantic(symbols_);
         }
 
+        // Run remove locals pass again on the whole body in case `dt` was never used.
+        api_state->body(remove_unused_locals(api_state->body()));
+        api_state->semantic(symbols_);
+
         //..........................................................
         // nrn_current : update contributions to currents
         //..........................................................
@@ -430,8 +431,6 @@ void Module::add_variables_to_symbols() {
         symbols_[name] = symbol_ptr{t};
     };
 
-    create_variable("t",  rangeKind::scalar, accessKind::read);
-    create_variable("dt", rangeKind::scalar, accessKind::read);
     // density mechanisms use a vector of weights from current densities to
     // units of nA
     if (kind()==moduleKind::density) {
@@ -456,6 +455,23 @@ void Module::add_variables_to_symbols() {
                             accessKind::write, ionKind::none, Location());
     create_indexed_variable("v", "vec_v", tok::eq,
                             accessKind::read,  ionKind::none, Location());
+    create_indexed_variable("dt", "vec_dt", tok::eq,
+                            accessKind::read,  ionKind::none, Location());
+
+    // add cell-indexed variables to the table
+    auto create_cell_indexed_variable = [this]
+        (std::string const& name, std::string const& indexed_name, Location loc = Location())
+    {
+        if(symbols_.count(name)) {
+            throw compiler_exception(
+                "trying to insert a symbol that already exists",
+                loc);
+        }
+        symbols_[name] = make_symbol<CellIndexedVariable>(loc, name, indexed_name);
+    };
+
+    create_cell_indexed_variable("t", "vec_t");
+    create_cell_indexed_variable("t_to", "vec_t_to");
 
     // add state variables
     for(auto const &var : state_block()) {
diff --git a/modcc/simd_printer.hpp b/modcc/simd_printer.hpp
index ed77cc6b6fba9f2c68e0ed7f20d489152b0368cc..849ac2da18a08320147877baecb0d8010f2c8725 100644
--- a/modcc/simd_printer.hpp
+++ b/modcc/simd_printer.hpp
@@ -41,6 +41,7 @@ public:
         text_ << name;
     }
 
+    void visit(CellIndexedVariable *e) override;
     void visit(IndexedVariable *e) override;
     void visit(APIMethod *e) override;
     void visit(BlockExpression *e) override;
@@ -109,12 +110,14 @@ void SimdPrinter<Arch>::visit(APIMethod *e) {
     if (e->is_api_method()->body()->statements().size()) {
         text_.increase_indentation();
 
-        // First emit the raw pointer of node_index_
+        // First emit the raw pointer of node_index_ and vec_ci_
         text_.add_line("constexpr size_t simd_width = " +
                        simd_backend::emit_simd_width() +
                        " / (CHAR_BIT*sizeof(value_type));");
         text_.add_line("const size_type* " + emit_rawptr_name("node_index_") +
                        " = node_index_.data();");
+        text_.add_line("const size_type* " + emit_rawptr_name("vec_ci_") +
+                       " = vec_ci_.data();");
         text_.add_line();
 
         // create local indexed views
@@ -152,7 +155,8 @@ template<targetKind Arch>
 void SimdPrinter<Arch>::emit_indexed_view(LocalVariable* var,
                                           std::set<std::string>& decls) {
     auto const& name = var->name();
-    auto const& index_name = var->external_variable()->index_name();
+    auto external = var->external_variable();
+    auto const& index_name = external->index_name();
     text_.add_gutter();
 
     if (decls.find(index_name) == decls.cend()) {
@@ -160,24 +164,29 @@ void SimdPrinter<Arch>::emit_indexed_view(LocalVariable* var,
         decls.insert(index_name);
     }
 
-    text_ << index_name;
-    text_ << " = util::indirect_view";
-    auto channel = var->external_variable()->ion_channel();
-    if (channel == ionKind::none) {
-        text_ << "(" + emit_member_name(index_name) + ", node_index_);\n";
+    text_ << index_name << " = ";
+
+    if (external->is_cell_indexed_variable()) {
+        text_ << "util::indirect_view(util::indirect_view("
+              << emit_member_name(index_name) << ", vec_ci_), node_index_);\n";
     }
-    else {
+    else if (external->is_ion()) {
+        auto channel = external->ion_channel();
         auto iname = ion_store(channel);
-        text_ << "(" << iname << "." << name << ", "
+        text_ << "util::indirect_view(" << iname << "." << name << ", "
               << ion_store(channel) << ".index);\n";
     }
+    else {
+        text_ << " util::indirect_view(" + emit_member_name(index_name) + ", node_index_);\n";
+    }
 }
 
 template<targetKind Arch>
 void SimdPrinter<Arch>::emit_indexed_view_simd(LocalVariable* var,
                                                std::set<std::string>& decls) {
     auto const& name = var->name();
-    auto const& index_name = var->external_variable()->index_name();
+    auto external = var->external_variable();
+    auto const& index_name = external->index_name();
 
     // We need to work with with raw pointers in the vectorized version
     auto channel = var->external_variable()->ion_channel();
@@ -236,11 +245,11 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e,
     for (auto& symbol : e->scope()->locals()) {
         auto var = symbol.second->is_local_variable();
         if (var->is_indexed()) {
-            auto channel = var->external_variable()->ion_channel();
+            auto external = var->external_variable();
+            auto channel = external->ion_channel();
             std::string cast_type =
                 "(const " + simd_backend::emit_index_type() + " *) ";
 
-
             std::string vindex_name, index_ptr_name;
             if (channel == ionKind::none) {
                 vindex_name = emit_vtmp_name("node_index_");
@@ -253,7 +262,6 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e,
 
             }
 
-
             if (declared_ion_vars.find(vindex_name) == declared_ion_vars.cend()) {
                 declared_ion_vars.insert(vindex_name);
                 text_.add_gutter();
@@ -264,6 +272,20 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e,
                     text_, cast_type + "&" + index_ptr_name + "[off_]");
                 text_.end_line(";");
             }
+
+            if (external->is_cell_indexed_variable()) {
+                std::string vci_name = emit_vtmp_name("vec_ci_");
+                std::string ci_ptr_name = emit_rawptr_name("vec_ci_");
+
+                if (declared_ion_vars.find(vci_name) == declared_ion_vars.cend()) {
+                    declared_ion_vars.insert(vci_name);
+                    text_.add_gutter();
+                    text_ << simd_backend::emit_index_type() << " "
+                          << vci_name << " = ";
+                    simd_backend::emit_gather_index(text_, "(int *)"+ci_ptr_name, vindex_name, "sizeof(size_type)");
+                    text_.end_line(";");
+                }
+            }
         }
     }
 
@@ -381,6 +403,16 @@ void SimdPrinter<Arch>::visit(IndexedVariable *e) {
     simd_backend::emit_gather(text_, value_name, vindex_name, "sizeof(value_type)");
 }
 
+template<targetKind Arch>
+void SimdPrinter<Arch>::visit(CellIndexedVariable *e) {
+    std::string vindex_name, value_name;
+
+    vindex_name = emit_vtmp_name("vec_ci_");
+    value_name = emit_rawptr_name(e->index_name());
+
+    simd_backend::emit_gather(text_, vindex_name, value_name, "sizeof(value_type)");
+}
+
 template<targetKind Arch>
 void SimdPrinter<Arch>::visit(BlockExpression *e) {
     if (!e->is_nested()) {
diff --git a/modcc/visitor.hpp b/modcc/visitor.hpp
index 921189bc70cda3674bf94fabe6441a3a43f174b4..3ccbb938ae08563ff93a1a63a87b7bd77d9bb401 100644
--- a/modcc/visitor.hpp
+++ b/modcc/visitor.hpp
@@ -30,6 +30,7 @@ public:
     virtual void visit(StoichExpression *e)     { visit((Expression*) e); }
     virtual void visit(VariableExpression *e)   { visit((Expression*) e); }
     virtual void visit(IndexedVariable *e)      { visit((Expression*) e); }
+    virtual void visit(CellIndexedVariable *e)  { visit((Expression*) e); }
     virtual void visit(FunctionExpression *e)   { visit((Expression*) e); }
     virtual void visit(IfExpression *e)         { visit((Expression*) e); }
     virtual void visit(SolveExpression *e)      { visit((Expression*) e); }
diff --git a/scripts/cc-filter b/scripts/cc-filter
index e406b1091fb30bbcc6dd8955ae6fc01da7df5fcc..fca5c49c7369b14aa7031edfaca86e93154e01b2 100755
--- a/scripts/cc-filter
+++ b/scripts/cc-filter
@@ -226,7 +226,7 @@ rule cxx:unsigned-int      s/\bunsigned\s+int\b/unsigned/g
 group cxx:tidy cxx:rm-template-space cxx:unsigned-int
 
 rule cxx:strip-qualified   s/%cxx:qualified%//g
-rule cxx:strip-args        s/(%cxx:identifier%%cxx:template-args%?)%cxx:paren-args%/$1(...)/g
+rule cxx:strip-args        s/(%cxx:identifier%%cxx:template-args%?)%cxx:paren-args%(?!:)/$1(...)/g
 group cxx:strip-all cxx:strip-qualified cxx:strip-args
 _end_
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index badd3733a2a0cf3d90b2b9bb6c19fb553fcae686..47b79f5fd7ac9e2a036c40eeb10b41bb3bc3be2c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -22,6 +22,7 @@ set(BASE_SOURCES
 )
 set(CUDA_SOURCES
     backends/gpu/fvm.cu
+    backends/gpu/multi_event_stream.cu
     memory/fill.cu
 )
 
diff --git a/src/backends/event.hpp b/src/backends/event.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0eae623fbd0b90abf493dfc44e6c53b3d2bc5047
--- /dev/null
+++ b/src/backends/event.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <common_types.hpp>
+
+// Structures for the representation of event delivery targets and
+// staged events.
+
+namespace nest {
+namespace mc {
+
+struct target_handle {
+    cell_local_size_type mech_id;    // mechanism type identifier (per cell group).
+    cell_local_size_type index;      // instance of the mechanism
+    cell_size_type cell_index;       // which cell (acts as index into e.g. vec_t)
+
+    target_handle() {}
+    target_handle(cell_local_size_type mech_id, cell_local_size_type index, cell_size_type cell_index):
+        mech_id(mech_id), index(index), cell_index(cell_index) {}
+};
+
+struct deliverable_event {
+    time_type time;
+    target_handle handle;
+    float weight;
+
+    deliverable_event() {}
+    deliverable_event(time_type time, target_handle handle, float weight):
+        time(time), handle(handle), weight(weight) {}
+};
+
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/event_delivery.md b/src/backends/event_delivery.md
new file mode 100644
index 0000000000000000000000000000000000000000..796ccbae65b187d1120b20ef4aa2e544f3cd7399
--- /dev/null
+++ b/src/backends/event_delivery.md
@@ -0,0 +1,123 @@
+# Back-end event delivery
+
+## Data structures
+
+The lowered cell implementation gathers postsynaptic spike events from its governing
+`cell_group` class, and then passes them on to concrete back-end device-specific
+classes for on-device delivery.
+
+`backends/event.hpp` contains the concrete classes used to represent event
+destinations and event information.
+
+The back-end event management structure is supplied by the corresponding `backend`
+class as `backend::multi_event_stream`. It presents a limited public interface to
+the lowered cell, and is passed by reference as a parameter to the mechanism
+`deliver_events` method.
+
+### Target handles
+
+Target handles are used by the lowered cell implementation to identify a particular mechanism
+instance that can receive events — ultimately via `net_receive` — and corresponding simulated
+cell. The cell information is given as an index into the cell group collection of cells,
+and is used to group events by integration domain (we have one domain per cell in each cell
+group).
+
+Target handles are represented by the `target_handle` struct, opaque to `mc_cell_group`,
+but created in the `fvm_multicell` for each point mechanism (synapse) in the cell group.
+
+### Deliverable events
+
+Events for delivery within the next integration period are staged in the lowered cell
+as a vector of `deliverable_event` objects. These comprise an event delivery time,
+a `target_handle` describing their destination, and a weight.
+
+### Back-end event streams
+
+`backend::multi_event_stream` represents a set (one per cell/integration domain)
+of event streams. There is one `multi_event_stream` per lowered cell.
+
+From the perspective of the lowered cell, it must support the methods below.
+In the following, `time` is a `view` or `const_view` of an array with one
+element per stream.
+
+*  `void init(const std::vector<deliverable_event>& staged_events)`
+
+   Take a copy of the staged events (which must be ordered by increasing event time)
+   and initialize the streams by gathering the events by cell.
+
+*  `bool empty() const`
+
+   Return true if and only if there are no un-retired events left in any stream.
+
+*  `void clear()`
+
+   Retire all events, leaving the `multi_event_stream` in an empty state.
+
+*  `void mark_until_after(const_view time)`
+
+   For all streams, mark events for delivery in the _i_ th stream with event time ≤ _time[i]_.
+
+*  `void event_times_if_before(view time) const`
+
+   For each stream, set _time[i]_ to the time of the next event time in the _i_ th stream
+   if such an event exists and has time less than _time[i]_.
+
+*  `void drop_marked_events()`
+
+   Retire all marked events.
+
+
+## Event delivery and integration timeline
+
+Event delivery is performed as part of the integration loop within the lowered
+cell implementation. The interface is provided by the `multi_event_stream`
+described above, together with the mechanism method that handles the delivery proper,
+`mechanism::deliver_events` and a `backend` static method that computes the
+integration step end time before considering any pending events.
+
+For `fvm_multicell` one integration step comprises:
+
+1.  Events for each cell that are due at that cell's corresponding time are
+    gathered with `events_.mark_until_after(time_)` where `time_` is a
+    `const_view` of the cell times and `events_` is a reference to the
+    `backend::multi_event_stream` object.
+
+2.  Each mechanism is requested to deliver to itself any marked events that
+    are associated with that mechanism, via the virtual
+    `mechanism::deliver_events(backend::multi_event_stream&)` method.
+
+    This action must precede the computation of mechanism current contributions
+    with `mechanism::nrn_current()`.
+
+3.  Marked events are discarded with `events_.drop_marked_events()`.
+
+4.  The upper bound on the integration step stop time `time_to_` is
+    computed via `backend::update_time_to(time_to_, time_, dt_max_, tfinal_)`,
+    as the minimum of the per-cell time `time_` plus `dt_max_` and
+    the final integration stop time `tfinal_`.
+
+5.  The integration step stop time `time_to_` is reduced to match any
+    pending events on each cell with `events_.event_times_if_before(time_to_)`.
+
+6.  The solver matrix is assembled and solved to compute the voltages, using the
+    newly computed currents and integration step times.
+
+7.  The mechanism states are updated with `mechanism::nrn_state()`.
+
+8.  The cell times `time_` are set to the integration step stop times `time_to_`.
+
+9.  Spike detection for the last integration step is performed via the
+    `threshold_watcher_` object.
+
+## Consequences for the integrator
+
+Towards the end of the integration period, an integration step may have a zero _dt_
+for one or more cells within the group, and this needs to be handled correctly:
+
+*   Generated mechanism `nrn_state()` methods should be numerically correct with
+    zero _dt_; a possibility is to guard the integration step with a _dt_ check.
+
+*   Matrix assemble and solve must check for zero _dt_. In the FVM `multicore`
+    matrix implementation, zero _dt_ sets the diagonal element to zero and the
+    rhs to the voltage; the solve procedure then ignores cells with a zero
+    diagonal element.
diff --git a/src/backends/fvm.hpp b/src/backends/fvm.hpp
index adb9838b227a392ba5b1dc6556c937bea303104a..a12ce30ea3f83af44e98d13a1d3ac8e21eb38af6 100644
--- a/src/backends/fvm.hpp
+++ b/src/backends/fvm.hpp
@@ -13,7 +13,13 @@ struct null_backend: public multicore::backend {
     }
 
     static mechanism make_mechanism(
-        const std::string&, view, view, const std::vector<value_type>&, const std::vector<size_type>&)
+        const std::string&,
+        size_type,
+        const_iview,
+        const_view, const_view, const_view,
+        view, view,
+        const std::vector<value_type>&,
+        const std::vector<size_type>&)
     {
         throw std::runtime_error("attempt to use an unsupported back end");
     }
diff --git a/src/backends/gpu/fvm.hpp b/src/backends/gpu/fvm.hpp
index 1d932a07b68b39e2ea8a07098358141b61a491d2..c7a4dbb6b782c2c1754537735b212ead9d6d296e 100644
--- a/src/backends/gpu/fvm.hpp
+++ b/src/backends/gpu/fvm.hpp
@@ -3,12 +3,16 @@
 #include <map>
 #include <string>
 
+#include <backends/event.hpp>
 #include <common_types.hpp>
 #include <mechanism.hpp>
 #include <memory/memory.hpp>
+#include <util/rangeutil.hpp>
 
+#include "kernels/time_ops.hpp"
 #include "matrix_state_interleaved.hpp"
 #include "matrix_state_flat.hpp"
+#include "multi_event_stream.hpp"
 #include "stimulus.hpp"
 #include "threshold_watcher.hpp"
 
@@ -47,6 +51,11 @@ struct backend {
 
     // matrix back end implementation
     using matrix_state = matrix_state_interleaved<value_type, size_type>;
+    using multi_event_stream = nest::mc::gpu::multi_event_stream;
+
+    // re-expose common backend event types
+    using deliverable_event = nest::mc::deliverable_event;
+    using target_handle = nest::mc::target_handle;
 
     // mechanism infrastructure
     using ion = mechanisms::ion<backend>;
@@ -57,6 +66,9 @@ struct backend {
 
     static mechanism make_mechanism(
         const std::string& name,
+        size_type mech_id,
+        const_iview vec_ci,
+        const_view vec_t, const_view vec_t_to, const_view vec_dt,
         view vec_v, view vec_i,
         const std::vector<value_type>& weights,
         const std::vector<size_type>& node_indices)
@@ -66,7 +78,7 @@ struct backend {
         }
 
         return mech_map_.find(name)->
-            second(vec_v, vec_i, memory::make_const_view(weights), memory::make_const_view(node_indices));
+            second(mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, memory::make_const_view(weights), memory::make_const_view(node_indices));
     }
 
     static bool has_mechanism(const std::string& name) {
@@ -76,15 +88,49 @@ struct backend {
     using threshold_watcher =
         nest::mc::gpu::threshold_watcher<value_type, size_type>;
 
-private:
+    // perform min/max reductions on 'array' type
+    template <typename V>
+    static std::pair<V, V> minmax_value(const memory::device_vector<V>& v) {
+        // TODO: consider/test replacement with CUDA kernel (or generic reduction kernel).
+        auto v_copy = memory::on_host(v);
+        return util::minmax_value(v_copy);
+    }
+
+    // perform element-wise comparison on 'array' type against `t_test`.
+    template <typename V>
+    static bool any_time_before(const memory::device_vector<V>& t, V t_test) {
+        // Note: benchmarking (on a P100) indicates that using the gpu::any_time_before
+        // function is slower than the copy, unless we're running over ten thousands of
+        // cells per cell group.
+        //
+        // Commenting out for now, but consider a size-dependent test or adaptive choice.
+
+        // return gpu::any_time_before(t.size(), t.data(), t_test);
+
+        auto v_copy = memory::on_host(t);
+        return util::minmax_value(v_copy).first<t_test;
+    }
 
-    using maker_type = mechanism (*)(view, view, array&&, iarray&&);
+    static void update_time_to(array& time_to, const_view time, value_type dt, value_type tmax) {
+        nest::mc::gpu::update_time_to<value_type, size_type>(time_to.size(), time_to.data(), time.data(), dt, tmax);
+    }
+
+    // set the per-cell and per-compartment dt_ from time_to_ - time_.
+    static void set_dt(array& dt_cell, array& dt_comp, const_view time_to, const_view time, const_iview cv_to_cell) {
+        size_type ncell = util::size(dt_cell);
+        size_type ncomp = util::size(dt_comp);
+
+        nest::mc::gpu::set_dt<value_type, size_type>(ncell, ncomp, dt_cell.data(), dt_comp.data(), time_to.data(), time.data(), cv_to_cell.data());
+    }
+
+private:
+    using maker_type = mechanism (*)(size_type, const_iview, const_view, const_view, const_view, view, view, array&&, iarray&&);
     static std::map<std::string, maker_type> mech_map_;
 
     template <template <typename> class Mech>
-    static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
+    static mechanism maker(size_type mech_id, const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
         return mechanisms::make_mechanism<Mech<backend>>
-            (vec_v, vec_i, std::move(weights), std::move(node_indices));
+            (mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(weights), std::move(node_indices));
     }
 };
 
diff --git a/src/backends/gpu/kernels/assemble_matrix.hpp b/src/backends/gpu/kernels/assemble_matrix.hpp
index 69d19733dcd9e36eaf5c5fbc24f806fd7fed1f50..1a4d6a4520fd4e926a27d29b8f09ea13f9552803 100644
--- a/src/backends/gpu/kernels/assemble_matrix.hpp
+++ b/src/backends/gpu/kernels/assemble_matrix.hpp
@@ -15,20 +15,40 @@ namespace gpu {
 template <typename T, typename I>
 __global__
 void assemble_matrix_flat(
-        T* d, T* rhs, const T* invariant_d,
-        const T* voltage, const T* current, const T* cv_capacitance,
-        T dt, unsigned n)
+        T* d,
+        T* rhs,
+        const T* invariant_d,
+        const T* voltage,
+        const T* current,
+        const T* cv_capacitance,
+        const I* cv_to_cell,
+        const T* dt_cell,
+        unsigned n)
 {
     const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
 
-    // The 1e-3 is a constant of proportionality required to ensure that the
-    // conductance (gi) values have units μS (micro-Siemens).
-    // See the model documentation in docs/model for more information.
-    T factor = 1e-3/dt;
     if (tid<n) {
-        auto gi = factor * cv_capacitance[tid];
-        d[tid] = gi + invariant_d[tid];
-        rhs[tid] = gi*voltage[tid] - current[tid];
+        auto cid = cv_to_cell[tid];
+        auto dt = dt_cell[cid];
+
+        // Note: dt==0 case is expected only at the end of a mindelay/2
+        // integration period, and consequently divergence is unlikely
+        // to be a performance problem.
+
+        if (dt>0) {
+            // The 1e-3 is a constant of proportionality required to ensure that the
+            // conductance (gi) values have units μS (micro-Siemens).
+            // See the model documentation in docs/model for more information.
+            T factor = 1e-3/dt;
+
+            auto gi = factor * cv_capacitance[tid];
+            d[tid] = gi + invariant_d[tid];
+            rhs[tid] = gi*voltage[tid] - current[tid];
+        }
+        else {
+            d[tid] = 0;
+            rhs[tid] = voltage[tid];
+        }
     }
 }
 
@@ -49,7 +69,9 @@ void assemble_matrix_interleaved(
         const T* cv_capacitance,
         const I* sizes,
         const I* starts,
-        T dt, unsigned padded_size, unsigned num_mtx)
+        const I* matrix_to_cell,
+        const T* dt_cell,
+        unsigned padded_size, unsigned num_mtx)
 {
     static_assert(BlockWidth*LoadWidth==Threads,
         "number of threads must equal number of values to process per block");
@@ -76,10 +98,21 @@ void assemble_matrix_interleaved(
 
     const unsigned max_size = sizes[0];
 
-    // The 1e-3 is a constant of proportionality required to ensure that the
-    // conductance (gi) values have units μS (micro-Siemens).
-    // See the model documentation in docs/model for more information.
-    T factor = 1e-3/dt;
+    T factor = 0;
+    T dt = 0;
+    const unsigned permuted_cid = blk_id*BlockWidth + blk_lane;
+
+    if (permuted_cid<num_mtx) {
+        auto cid = matrix_to_cell[permuted_cid];
+        dt = dt_cell[cid];
+
+        // The 1e-3 is a constant of proportionality required to ensure that the
+        // conductance (gi) values have units μS (micro-Siemens).
+        // See the model documentation in docs/model for more information.
+
+        factor = dt>0? 1e-3/dt: 0;
+    }
+
     for (unsigned j=0u; j<max_size; j+=LoadWidth) {
         if (do_load && load_pos<end) {
             buffer_v[lid] = voltage[load_pos];
@@ -90,8 +123,15 @@ void assemble_matrix_interleaved(
 
         if (j+blk_row<padded_size) {
             const auto gi = factor * cv_capacitance[store_pos];
-            d[store_pos]   = gi + invariant_d[store_pos];
-            rhs[store_pos] = gi*buffer_v[blk_pos] - buffer_i[blk_pos];
+
+            if (dt>0) {
+                d[store_pos]   = (gi + invariant_d[store_pos]);
+                rhs[store_pos] = (gi*buffer_v[blk_pos] - buffer_i[blk_pos]);
+            }
+            else {
+                d[store_pos]   = 0;
+                rhs[store_pos] = buffer_v[blk_pos];
+            }
         }
 
         __syncthreads();
diff --git a/src/backends/gpu/kernels/detail.hpp b/src/backends/gpu/kernels/detail.hpp
index 8f3993faee6d47af106d4f05a1e33783b1befc18..e33e933bc36d47b98cc3716eca3349b08f0f8b81 100644
--- a/src/backends/gpu/kernels/detail.hpp
+++ b/src/backends/gpu/kernels/detail.hpp
@@ -75,6 +75,13 @@ constexpr bool is_npos(T v) {
     return v == npos<T>();
 }
 
+/// CUDA lerp by u on [a,b]: (1-u)*a + u*b.
+template <typename T>
+__host__ __device__
+inline T lerp(T a, T b, T u) {
+    return std::fma(u, b, std::fma(-u, a, a));
+}
+
 } // namespace impl
 
 } // namespace gpu
diff --git a/src/backends/gpu/kernels/solve_matrix.hpp b/src/backends/gpu/kernels/solve_matrix.hpp
index 2cfa5f105f54f3a0e0e9c357f8dc3ad6ea7ee4b5..48e4a8b9addd31e9d9adefc87eded77c3a03031c 100644
--- a/src/backends/gpu/kernels/solve_matrix.hpp
+++ b/src/backends/gpu/kernels/solve_matrix.hpp
@@ -11,17 +11,21 @@ namespace gpu {
 template <typename T, typename I>
 __global__
 void solve_matrix_flat(
-    T* rhs, T* d, const T* u, const I* p, const I* cell_index, int num_mtx)
+    T* rhs, T* d, const T* u, const I* p, const I* cell_cv_divs, int num_mtx)
 {
     auto tid = threadIdx.x + blockDim.x*blockIdx.x;
 
     if (tid<num_mtx) {
         // get range of this thread's cell matrix
-        const auto first = cell_index[tid];
-        const auto last  = cell_index[tid+1];
+        const auto first = cell_cv_divs[tid];
+        const auto last  = cell_cv_divs[tid+1];
+
+        // Zero diagonal term implies dt==0; just leave rhs (for whole matrix)
+        // alone in that case.
+        if (d[last-1]==0) return;
 
         // backward sweep
-        for(auto i=last-1; i>first; --i) {
+        for (auto i=last-1; i>first; --i) {
             auto factor = u[i] / d[i];
             d[p[i]]   -= factor * u[i];
             rhs[p[i]] -= factor * rhs[i];
@@ -55,6 +59,10 @@ void solve_matrix_interleaved(
         const auto last     = first + BlockWidth*(sizes[tid]-1);
         const auto last_max = first + BlockWidth*(sizes[block_start]-1);
 
+        // Zero diagonal term implies dt==0; just leave rhs (for whole matrix)
+        // alone in that case.
+        if (d[last]==0) return;
+
         // backward sweep
         for(auto i=last_max; i>first; i-=BlockWidth) {
             if (i<=last) {
diff --git a/src/backends/gpu/kernels/test_thresholds.hpp b/src/backends/gpu/kernels/test_thresholds.hpp
index 946030704607b87bad5531c8530f671fe52a48d8..4c23d13d67f6a0df63bf326e75c645c763f0f807 100644
--- a/src/backends/gpu/kernels/test_thresholds.hpp
+++ b/src/backends/gpu/kernels/test_thresholds.hpp
@@ -17,10 +17,11 @@ namespace gpu {
 template <typename T, typename I, typename Stack>
 __global__
 void test_thresholds(
-    float t, float t_prev, int size,
+    const I* cv_to_cell, const T* t_after, const T* t_before,
+    int size,
     Stack& stack,
     I* is_crossed, T* prev_values,
-    const I* index, const T* values, const T* thresholds)
+    const I* cv_index, const T* values, const T* thresholds)
 {
     int i = threadIdx.x + blockIdx.x*blockDim.x;
 
@@ -29,8 +30,10 @@ void test_thresholds(
 
     if (i<size) {
         // Test for threshold crossing
+        const auto cv     = cv_index[i];
+        const auto cell   = cv_to_cell[cv];
         const auto v_prev = prev_values[i];
-        const auto v      = values[index[i]];
+        const auto v      = values[cv];
         const auto thresh = thresholds[i];
 
         if (!is_crossed[i]) {
@@ -38,7 +41,7 @@ void test_thresholds(
                 // The threshold has been passed, so estimate the time using
                 // linear interpolation
                 auto pos = (thresh - v_prev)/(v - v_prev);
-                crossing_time = t_prev + pos*(t - t_prev);
+                crossing_time = impl::lerp(t_before[cell], t_after[cell], pos);
 
                 is_crossed[i] = 1;
                 crossed = true;
diff --git a/src/backends/gpu/kernels/time_ops.hpp b/src/backends/gpu/kernels/time_ops.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..942710346d137c2611ed66d0efbd2c8840451367
--- /dev/null
+++ b/src/backends/gpu/kernels/time_ops.hpp
@@ -0,0 +1,113 @@
+#pragma once
+
+#include <type_traits>
+
+#include <cuda.h>
+
+#include <memory/wrappers.hpp>
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+namespace kernels {
+    template <typename T, typename I>
+    __global__ void update_time_to(I n, T* time_to, const T* time, T dt, T tmax) {
+        int i = threadIdx.x+blockIdx.x*blockDim.x;
+        if (i<n) {
+            auto t = time[i]+dt;
+            time_to[i] = t<tmax? t: tmax;
+        }
+    }
+
+    // array-array comparison
+    template <typename T, typename I, typename Pred>
+    __global__ void array_reduce_any(I n, const T* x, const T* y, Pred p, int* rptr) {
+        int i = threadIdx.x+blockIdx.x*blockDim.x;
+        int cmp = i<n? p(x[i], y[i]): 0;
+        if (__syncthreads_or(cmp) && !threadIdx.x) {
+            *rptr=1;
+        }
+    }
+
+    // array-scalar comparison
+    template <typename T, typename I, typename Pred>
+    __global__ void array_reduce_any(I n, const T* x, T y, Pred p, int* rptr) {
+        int i = threadIdx.x+blockIdx.x*blockDim.x;
+        int cmp = i<n? p(x[i], y): 0;
+        if (__syncthreads_or(cmp) && !threadIdx.x) {
+            *rptr=1;
+        }
+    }
+
+    template <typename T>
+    struct less {
+        __device__ __host__
+        bool operator()(const T& a, const T& b) const { return a<b; }
+    };
+
+    // vector minus: x = y - z
+    template <typename T, typename I>
+    __global__ void vec_minus(I n, T* x, const T* y, const T* z) {
+        int i = threadIdx.x+blockIdx.x*blockDim.x;
+        if (i<n) {
+            x[i] = y[i]-z[i];
+        }
+    }
+
+    // vector gather: x[i] = y[index[i]]
+    template <typename T, typename I>
+    __global__ void gather(I n, T* x, const T* y, const I* index) {
+        int i = threadIdx.x+blockIdx.x*blockDim.x;
+        if (i<n) {
+            x[i] = y[index[i]];
+        }
+    }
+}
+
+template <typename T, typename I>
+void update_time_to(I n, T* time_to, const T* time, T dt, T tmax) {
+    if (!n) {
+        return;
+    }
+
+    constexpr int blockwidth = 128;
+    int nblock = 1+(n-1)/blockwidth;
+    kernels::update_time_to<<<nblock, blockwidth>>>(n, time_to, time, dt, tmax);
+}
+
+template <typename T, typename U, typename I>
+bool any_time_before(I n, T* t0, U t1) {
+    static_assert(std::is_convertible<T*, U>::value || std::is_convertible<T, U>::value,
+        "third-argument must be a compatible scalar or pointer type");
+
+    static thread_local auto r = memory::device_vector<int>(1);
+    if (!n) {
+        return false;
+    }
+
+    constexpr int blockwidth = 128;
+    int nblock = 1+(n-1)/blockwidth;
+
+    r[0] = 0;
+    kernels::array_reduce_any<<<nblock, blockwidth>>>(n, t0, t1, kernels::less<T>(), r.data());
+    return r[0];
+}
+
+template <typename T, typename I>
+void set_dt(I ncell, I ncomp, T* dt_cell, T* dt_comp, const T* time_to, const T* time, const I* cv_to_cell) {
+    if (!ncell || !ncomp) {
+        return;
+    }
+
+    constexpr int blockwidth = 128;
+    int nblock = 1+(ncell-1)/blockwidth;
+    kernels::vec_minus<<<nblock, blockwidth>>>(ncell, dt_cell, time_to, time);
+
+    nblock = 1+(ncomp-1)/blockwidth;
+    kernels::gather<<<nblock, blockwidth>>>(ncomp, dt_comp, dt_cell, cv_to_cell);
+}
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/matrix_state_flat.hpp b/src/backends/gpu/matrix_state_flat.hpp
index d214079f7f5841a43c6f6c52cf9992e3e0a635e3..dc7f9a40fa095399a2ff357dfc30660bb245389f 100644
--- a/src/backends/gpu/matrix_state_flat.hpp
+++ b/src/backends/gpu/matrix_state_flat.hpp
@@ -1,7 +1,9 @@
 #pragma once
 
 #include <memory/memory.hpp>
+#include <memory/wrappers.hpp>
 #include <util/span.hpp>
+#include <util/partition.hpp>
 #include <util/rangeutil.hpp>
 
 #include "kernels/solve_matrix.hpp"
@@ -24,7 +26,8 @@ struct matrix_state_flat {
     using const_view = typename array::const_view_type;
 
     iarray parent_index;
-    iarray cell_index;
+    iarray cell_cv_divs;
+    iarray cv_to_cell;
 
     array d;     // [μS]
     array u;     // [μS]
@@ -36,17 +39,15 @@ struct matrix_state_flat {
     // the invariant part of the matrix diagonal
     array invariant_d;         // [μS]
 
-    // interface for exposing the solution to the outside world
-    view solution;
-
     matrix_state_flat() = default;
 
     matrix_state_flat(const std::vector<size_type>& p,
-                 const std::vector<size_type>& cell_idx,
+                 const std::vector<size_type>& cell_cv_divs,
                  const std::vector<value_type>& cv_cap,
                  const std::vector<value_type>& face_cond):
         parent_index(memory::make_const_view(p)),
-        cell_index(memory::make_const_view(cell_idx)),
+        cell_cv_divs(memory::make_const_view(cell_cv_divs)),
+        cv_to_cell(p.size()),
         d(p.size()),
         u(p.size()),
         rhs(p.size()),
@@ -54,12 +55,13 @@ struct matrix_state_flat {
     {
         EXPECTS(cv_cap.size() == size());
         EXPECTS(face_cond.size() == size());
-        EXPECTS(cell_idx.back() == size());
-        EXPECTS(cell_idx.size() > 2u);
+        EXPECTS(cell_cv_divs.back() == size());
+        EXPECTS(cell_cv_divs.size() > 1u);
 
         using memory::make_const_view;
 
         auto n = d.size();
+        std::vector<size_type> cv_to_cell_tmp(n, 0);
         std::vector<value_type> invariant_d_tmp(n, 0);
         std::vector<value_type> u_tmp(n, 0);
 
@@ -70,18 +72,29 @@ struct matrix_state_flat {
             invariant_d_tmp[i] += gij;
             invariant_d_tmp[p[i]] += gij;
         }
+
+        size_type ci = 0;
+        for (auto cv_span: util::partition_view(cell_cv_divs)) {
+            util::fill(util::subrange_view(cv_to_cell_tmp, cv_span), ci);
+            ++ci;
+        }
+
+        cv_to_cell = make_const_view(cv_to_cell_tmp);
         invariant_d = make_const_view(invariant_d_tmp);
         u = make_const_view(u_tmp);
+    }
 
-        solution = rhs;
+    // interface for exposing the solution to the outside world
+    const_view solution() const {
+        return memory::make_view(rhs);
     }
 
     // Assemble the matrix
-    // Afterwards the diagonal and RHS will have been set given dt, voltage and current
-    //   dt      [ms]
+    // Afterwards the diagonal and RHS will have been set given dt, voltage and current.
+    //   dt_cell [ms] (per cell)
     //   voltage [mV]
     //   current [nA]
-    void assemble(value_type dt, const_view voltage, const_view current) {
+    void assemble(const_view dt_cell, const_view voltage, const_view current) {
         // determine the grid dimensions for the kernel
         auto const n = voltage.size();
         auto const block_dim = 128;
@@ -89,7 +102,7 @@ struct matrix_state_flat {
 
         assemble_matrix_flat<value_type, size_type><<<grid_dim, block_dim>>> (
             d.data(), rhs.data(), invariant_d.data(), voltage.data(),
-            current.data(), cv_capacitance.data(), dt, size());
+            current.data(), cv_capacitance.data(), cv_to_cell.data(), dt_cell.data(), size());
     }
 
     void solve() {
@@ -100,7 +113,7 @@ struct matrix_state_flat {
         // perform solve on gpu
         solve_matrix_flat<value_type, size_type><<<grid_dim, block_dim>>> (
             rhs.data(), d.data(), u.data(), parent_index.data(),
-            cell_index.data(), num_matrices());
+            cell_cv_divs.data(), num_matrices());
     }
 
     std::size_t size() const {
@@ -109,7 +122,7 @@ struct matrix_state_flat {
 
 private:
     unsigned num_matrices() const {
-        return cell_index.size()-1;
+        return cell_cv_divs.size()-1;
     }
 };
 
diff --git a/src/backends/gpu/matrix_state_interleaved.hpp b/src/backends/gpu/matrix_state_interleaved.hpp
index 2f1888395dbaf7624edef3813ad1a9984170e697..a6a127e879a789e9506035180f1195be7555edfe 100644
--- a/src/backends/gpu/matrix_state_interleaved.hpp
+++ b/src/backends/gpu/matrix_state_interleaved.hpp
@@ -1,7 +1,9 @@
 #pragma once
 
 #include <memory/memory.hpp>
+#include <util/debug.hpp>
 #include <util/span.hpp>
+#include <util/partition.hpp>
 #include <util/rangeutil.hpp>
 
 #include "kernels/solve_matrix.hpp"
@@ -55,6 +57,8 @@ struct matrix_state_interleaved {
     iarray matrix_sizes;
     // start values corresponding to matrix i in the external storage
     iarray matrix_index;
+    // map from matrix index (after permutation) to original cell index
+    iarray matrix_to_cell_index;
 
     // Storage for the matrix and parent index in interleaved format.
     // Includes the cv_capacitance, which is required for matrix assembly.
@@ -77,7 +81,7 @@ struct matrix_state_interleaved {
     //  Storage for solution in uninterleaved format.
     //  Used to hold the storage for passing to caller, and must be updated
     //  after each call to the ::solve() method.
-    array solution;
+    array solution_;
 
     // default constructor
     matrix_state_interleaved() = default;
@@ -89,18 +93,19 @@ struct matrix_state_interleaved {
     //  cv_cap      // [pF]
     //  face_cond   // [μS]
     matrix_state_interleaved(const std::vector<size_type>& p,
-                 const std::vector<size_type>& cell_idx,
+                 const std::vector<size_type>& cell_cv_divs,
                  const std::vector<value_type>& cv_cap,
                  const std::vector<value_type>& face_cond)
     {
         EXPECTS(cv_cap.size()    == p.size());
         EXPECTS(face_cond.size() == p.size());
-        EXPECTS(cell_idx.back()  == p.size());
+        EXPECTS(cell_cv_divs.back()  == p.size());
 
         // Just because you never know.
-        EXPECTS(cell_idx.size() <= UINT_MAX);
+        EXPECTS(cell_cv_divs.size() <= UINT_MAX);
 
         using util::make_span;
+        using util::indirect_view;
 
         // Convenience for commonly used type in this routine.
         using svec = std::vector<size_type>;
@@ -110,11 +115,11 @@ struct matrix_state_interleaved {
         //
 
         // Find the size of each matrix.
-        const auto num_mtx = cell_idx.size()-1;
         svec sizes;
-        for (auto it=cell_idx.begin()+1; it!=cell_idx.end(); ++it) {
-            sizes.push_back(*it - *(it-1));
+        for (auto cv_span: util::partition_view(cell_cv_divs)) {
+            sizes.push_back(cv_span.second-cv_span.first);
         }
+        const auto num_mtx = sizes.size();
 
         // Find permutations and sort indexes/sizes.
         svec perm(num_mtx);
@@ -123,15 +128,10 @@ struct matrix_state_interleaved {
         util::stable_sort_by(perm, [&sizes](size_type i){ return sizes[i]; });
         std::reverse(perm.begin(), perm.end());
 
-        // TODO: refactor to be less verbose with permutation_view
-        svec sizes_p;
-        for (auto i: make_span(0, num_mtx)) {
-            sizes_p.push_back(sizes[perm[i]]);
-        }
-        svec cell_index_p;
-        for (auto i: make_span(0, num_mtx)) {
-            cell_index_p.push_back(cell_idx[perm[i]]);
-        }
+        svec sizes_p = util::assign_from(indirect_view(sizes, perm));
+
+        auto cell_to_cv = util::subrange_view(cell_cv_divs, 0, num_mtx);
+        svec cell_to_cv_p = util::assign_from(indirect_view(cell_to_cv, perm));
 
         //
         // Calculate dimensions required to store matrices.
@@ -154,7 +154,7 @@ struct matrix_state_interleaved {
             auto lane  = mtx%block_dim();
 
             auto len = sizes_p[mtx];
-            auto src = cell_index_p[mtx];
+            auto src = cell_to_cv_p[mtx];
             auto dst = block*(block_dim()*padded_size) + lane;
             for (auto i: make_span(0, len)) {
                 // the p indexes are always relative to the start of the p vector.
@@ -189,25 +189,30 @@ struct matrix_state_interleaved {
+        // memory, for use as an rvalue in an assignment to a device vector.
         auto interleave = [&] (std::vector<T>const& x) {
             return memory::on_gpu(
-                flat_to_interleaved(x, sizes_p, cell_index_p, block_dim(), num_mtx, padded_size));
+                flat_to_interleaved(x, sizes_p, cell_to_cv_p, block_dim(), num_mtx, padded_size));
         };
         u           = interleave(u_tmp);
         invariant_d = interleave(invariant_d_tmp);
         cv_capacitance = interleave(cv_cap);
 
         matrix_sizes = memory::make_const_view(sizes_p);
-        matrix_index = memory::make_const_view(cell_index_p);
+        matrix_index = memory::make_const_view(cell_to_cv_p);
+        matrix_to_cell_index = memory::make_const_view(perm);
 
         // Allocate space for storing the un-interleaved solution.
-        solution = array(p.size());
+        solution_ = array(p.size());
+    }
+
+    const_view solution() const {
+        return solution_;
     }
 
     // Assemble the matrix
-    // Afterwards the diagonal and RHS will have been set given dt, voltage and current
-    //   dt      [ms]
+    // Afterwards the diagonal and RHS will have been set given dt, voltage and current.
+    //   dt_cell [ms] (per cell)
     //   voltage [mV]
     //   current [nA]
-    void assemble(value_type dt, const_view voltage, const_view current) {
+    void assemble(const_view dt_cell, const_view voltage, const_view current) {
         constexpr auto bd = impl::block_dim();
         constexpr auto lw = impl::load_width();
         constexpr auto block_dim = bd*lw;
@@ -220,7 +225,8 @@ struct matrix_state_interleaved {
             (d.data(), rhs.data(), invariant_d.data(),
              voltage.data(), current.data(), cv_capacitance.data(),
              matrix_sizes.data(), matrix_index.data(),
-             dt, padded_matrix_size(), num_matrices());
+             matrix_to_cell_index.data(),
+             dt_cell.data(), padded_matrix_size(), num_matrices());
 
     }
 
@@ -234,7 +240,7 @@ struct matrix_state_interleaved {
 
         // Copy the solution from interleaved to front end storage.
         interleaved_to_flat<value_type, size_type, impl::block_dim(), impl::load_width()>
-            (rhs.data(), solution.data(), matrix_sizes.data(), matrix_index.data(),
+            (rhs.data(), solution_.data(), matrix_sizes.data(), matrix_index.data(),
              padded_matrix_size(), num_matrices());
     }
 
diff --git a/src/backends/gpu/multi_event_stream.cu b/src/backends/gpu/multi_event_stream.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cc19e215c14d191a0e0ac04b579dcad86933025e
--- /dev/null
+++ b/src/backends/gpu/multi_event_stream.cu
@@ -0,0 +1,157 @@
+#include <common_types.hpp>
+#include <backends/event.hpp>
+#include <backends/gpu/multi_event_stream.hpp>
+#include <memory/array.hpp>
+#include <memory/copy.hpp>
+#include <util/rangeutil.hpp>
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+namespace kernels {
+    template <typename T, typename I>
+    __global__ void mark_until_after(
+        I n,
+        I* mark,
+        const I* span_end,
+        const T* ev_time,
+        const T* t_until)
+    {
+        I i = threadIdx.x+blockIdx.x*blockDim.x;
+        if (i<n) {
+            auto t = t_until[i];
+            auto end = span_end[i];
+            auto &m = mark[i];
+
+            while (m!=end && !(ev_time[m]>t)) {
+                ++m;
+            }
+        }
+    }
+
+    template <typename T, typename I>
+    __global__ void drop_marked_events(
+        I n,
+        I* n_nonempty,
+        I* span_begin,
+        const I* span_end,
+        const I* mark)
+    {
+        I i = threadIdx.x+blockIdx.x*blockDim.x;
+        if (i<n) {
+            bool emptied = (span_begin[i]<span_end[i] && mark[i]==span_end[i]);
+            span_begin[i] = mark[i];
+            if (emptied) {
+                atomicAdd(n_nonempty, (I)-1);
+            }
+        }
+    }
+
+    template <typename T, typename I>
+    __global__ void event_time_if_before(
+        I n,
+        const I* span_begin,
+        const I* span_end,
+        const T* ev_time,
+        T* t_until)
+    {
+        I i = threadIdx.x+blockIdx.x*blockDim.x;
+        if (i<n) {
+            if (span_begin[i]<span_end[i]) {
+                auto ev_t = ev_time[span_begin[i]];
+                if (t_until[i]>ev_t) {
+                    t_until[i] = ev_t;
+                }
+            }
+        }
+    }
+} // namespace kernels
+
+void multi_event_stream::clear() {
+    memory::fill(span_begin_, 0u);
+    memory::fill(span_end_, 0u);
+    memory::fill(mark_, 0u);
+    n_nonempty_stream_[0] = 0;
+}
+
+void multi_event_stream::init(std::vector<deliverable_event> staged) {
+    if (staged.size()>std::numeric_limits<size_type>::max()) {
+        throw std::range_error("too many events");
+    }
+
+    // Build vectors in host memory here and transfer to the device at end.
+    std::size_t n_ev = staged.size();
+
+    std::vector<size_type> divisions(n_stream_+1, 0);
+    std::vector<size_type> tmp_ev_indices(n_ev);
+    std::vector<value_type> tmp_ev_values(n_ev);
+
+    ev_time_ = array(n_ev);
+    ev_weight_ = array(n_ev);
+    ev_mech_id_ = iarray(n_ev);
+    ev_index_ = iarray(n_ev);
+
+    util::stable_sort_by(staged, [](const deliverable_event& e) { return e.handle.cell_index; });
+
+    // Split out event fields and copy to device.
+    util::assign_by(tmp_ev_values, staged, [](const deliverable_event& e) { return e.weight; });
+    memory::copy(tmp_ev_values, ev_weight_);
+
+    util::assign_by(tmp_ev_values, staged, [](const deliverable_event& e) { return e.time; });
+    memory::copy(tmp_ev_values, ev_time_);
+
+    util::assign_by(tmp_ev_indices, staged, [](const deliverable_event& e) { return e.handle.mech_id; });
+    memory::copy(tmp_ev_indices, ev_mech_id_);
+
+    util::assign_by(tmp_ev_indices, staged, [](const deliverable_event& e) { return e.handle.index; });
+    memory::copy(tmp_ev_indices, ev_index_);
+
+    // Determine divisions by `cell_index` in ev list and copy to device.
+    size_type ev_i = 0;
+    size_type n_nonempty = 0;
+    for (size_type s = 1; s<=n_stream_; ++s) {
+        while (ev_i<n_ev && staged[ev_i].handle.cell_index<s) ++ev_i;
+        divisions[s] = ev_i;
+        n_nonempty += (divisions[s]!=divisions[s-1]);
+    }
+
+    memory::copy(memory::make_view(divisions)(0,n_stream_), span_begin_);
+    memory::copy(memory::make_view(divisions)(1,n_stream_+1), span_end_);
+    memory::copy(span_begin_, mark_);
+    n_nonempty_stream_[0] = n_nonempty;
+}
+
+
+// Designate for processing events `ev` at head of each event stream `i`
+// until `event_time(ev)` > `t_until[i]`.
+void multi_event_stream::mark_until_after(const_view t_until) {
+    EXPECTS(n_streams()==util::size(t_until));
+
+    constexpr int blockwidth = 128;
+    int nblock = 1+(n_stream_-1)/blockwidth;
+    kernels::mark_until_after<value_type, size_type><<<nblock, blockwidth>>>(
+        n_stream_, mark_.data(), span_end_.data(), ev_time_.data(), t_until.data());
+}
+
+// Remove marked events from front of each event stream.
+void multi_event_stream::drop_marked_events() {
+    constexpr int blockwidth = 128;
+    int nblock = 1+(n_stream_-1)/blockwidth;
+    kernels::drop_marked_events<value_type, size_type><<<nblock, blockwidth>>>(
+        n_stream_, n_nonempty_stream_.data(), span_begin_.data(), span_end_.data(), mark_.data());
+}
+
+// If the head of `i`th event stream exists and has time less than `t_until[i]`, set
+// `t_until[i]` to the event time.
+void multi_event_stream::event_time_if_before(view t_until) {
+    constexpr int blockwidth = 128;
+    int nblock = 1+(n_stream_-1)/blockwidth;
+    kernels::event_time_if_before<value_type, size_type><<<nblock, blockwidth>>>(
+        n_stream_, span_begin_.data(), span_end_.data(), ev_time_.data(), t_until.data());
+}
+
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/multi_event_stream.hpp b/src/backends/gpu/multi_event_stream.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9fcce25f17d408da3bbd7a5e1c4e1ee4ac94a52c
--- /dev/null
+++ b/src/backends/gpu/multi_event_stream.hpp
@@ -0,0 +1,83 @@
+#pragma once
+
+// Indexed collection of pop-only event queues --- GPU back-end implementation.
+
+#include <common_types.hpp>
+#include <backends/event.hpp>
+#include <memory/array.hpp>
+#include <memory/copy.hpp>
+
+namespace nest {
+namespace mc {
+namespace gpu {
+
+class multi_event_stream {
+public:
+    using size_type = cell_size_type;
+    using value_type = double;
+
+    using array = memory::device_vector<value_type>;
+    using iarray = memory::device_vector<size_type>;
+
+    using const_view = array::const_view_type;
+    using view = array::view_type;
+
+    multi_event_stream() {}
+
+    explicit multi_event_stream(size_type n_stream):
+        n_stream_(n_stream),
+        span_begin_(n_stream),
+        span_end_(n_stream),
+        mark_(n_stream),
+        n_nonempty_stream_(1)
+    {}
+
+    size_type n_streams() const { return n_stream_; }
+
+    bool empty() const { return n_nonempty_stream_[0]==0; }
+
+    void clear();
+
+    // Initialize event streams from a vector of `deliverable_event`.
+    void init(std::vector<deliverable_event> staged);
+
+    // Designate for processing events `ev` at head of each event stream `i`
+    // until `event_time(ev)` > `t_until[i]`.
+    void mark_until_after(const_view t_until);
+
+    // Remove marked events from front of each event stream.
+    void drop_marked_events();
+
+    // If the head of `i`th event stream exists and has time less than `t_until[i]`, set
+    // `t_until[i]` to the event time.
+    void event_time_if_before(view t_until);
+
+    // Interface for access by mechanism kernels:
+    struct span_state {
+        size_type n;
+        const size_type* ev_mech_id;
+        const size_type* ev_index;
+        const value_type* ev_weight;
+        const size_type* span_begin;
+        const size_type* mark;
+    };
+
+    span_state delivery_data() const {
+        return {n_stream_, ev_mech_id_.data(), ev_index_.data(), ev_weight_.data(), span_begin_.data(), mark_.data()};
+    }
+
+private:
+    size_type n_stream_;
+    array ev_time_;
+    array ev_weight_;
+    iarray ev_mech_id_;
+    iarray ev_index_;
+    iarray span_begin_;
+    iarray span_end_;
+    iarray mark_;
+    iarray n_nonempty_stream_;
+};
+
+} // namespace gpu
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/gpu/stimulus.hpp b/src/backends/gpu/stimulus.hpp
index 6cc41703dba92be14fd0a87cad59ccf20da0691f..28d36f8565e4ccc1fa2d697e49a03661729ece8e 100644
--- a/src/backends/gpu/stimulus.hpp
+++ b/src/backends/gpu/stimulus.hpp
@@ -19,7 +19,7 @@ namespace kernels {
     __global__
     void stim_current(
         const T* delay, const T* duration, const T* amplitude,
-        const I* node_index, int n, T t, T* current)
+        const I* node_index, int n, const I* cell_index, const T* time, T* current)
     {
         using value_type = T;
         using iarray = I;
@@ -27,7 +27,8 @@ namespace kernels {
         auto i = threadIdx.x + blockDim.x*blockIdx.x;
 
         if (i<n) {
-            if (t>=delay[i] && t<(delay[i]+duration[i])) {
+            auto t = time[cell_index[i]];
+            if (t>=delay[i] && t<delay[i]+duration[i]) {
                 // use subtraction because the electrode currents are specified
                 // in terms of current into the compartment
                 cuda_atomic_add(current+node_index[i], -amplitude[i]);
@@ -47,11 +48,14 @@ public:
     using iarray  = typename base::iarray;
     using view   = typename base::view;
     using iview  = typename base::iview;
+    using const_view = typename base::const_view;
     using const_iview = typename base::const_iview;
     using ion_type = typename base::ion_type;
 
-    stimulus(view vec_v, view vec_i, iarray&& node_index):
-        base(vec_v, vec_i, std::move(node_index))
+    static constexpr size_type no_mech_id = (size_type)-1;
+
+    stimulus(const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, iarray&& node_index):
+        base(no_mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(node_index))
     {}
 
     using base::size;
@@ -60,10 +64,7 @@ public:
         return 0;
     }
 
-    void set_params(value_type t_, value_type dt_) override {
-        t = t_;
-        dt = dt_;
-    }
+    void set_params() override {}
 
     std::string name() const override {
         return "stimulus";
@@ -114,19 +115,18 @@ public:
 
         kernels::stim_current<value_type, size_type><<<dim_grid, dim_block>>>(
             delay.data(), duration.data(), amplitude.data(),
-            node_index_.data(), n, t,
+            node_index_.data(), n, vec_ci_.data(), vec_t_.data(),
             vec_i_.data()
         );
 
     }
 
-    value_type dt = 0;
-    value_type t = 0;
-
     array amplitude;
     array duration;
     array delay;
 
+    using base::vec_ci_;
+    using base::vec_t_;
     using base::vec_v_;
     using base::vec_i_;
     using base::node_index_;
diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp
index e4624c077239eff3a5a74cda42c60939b1ea8ae8..1d28bb329d01e587ee3bdaae5dc2f5efbffeedf7 100644
--- a/src/backends/gpu/threshold_watcher.hpp
+++ b/src/backends/gpu/threshold_watcher.hpp
@@ -23,6 +23,7 @@ public:
     using array = memory::device_vector<T>;
     using iarray = memory::device_vector<I>;
     using const_view = typename array::const_view_type;
+    using const_iview = typename iarray::const_view_type;
 
     /// stores a single crossing event
     struct threshold_crossing {
@@ -41,18 +42,24 @@ public:
     threshold_watcher() = default;
 
     threshold_watcher(
+            const_iview vec_ci,
+            const_view vec_t_before,
+            const_view vec_t_after,
             const_view values,
             const std::vector<size_type>& index,
             const std::vector<value_type>& thresh,
             value_type t=0):
+        cv_to_cell_(vec_ci),
+        t_before_(vec_t_before),
+        t_after_(vec_t_after),
         values_(values),
-        index_(memory::make_const_view(index)),
+        cv_index_(memory::make_const_view(index)),
         thresholds_(memory::make_const_view(thresh)),
         prev_values_(values),
         is_crossed_(size()),
         stack_(memory::make_managed_ptr<stack_type>(10*size()))
     {
-        reset(t);
+        reset();
     }
 
     /// Remove all stored crossings that were detected in previous calls
@@ -64,26 +71,23 @@ public:
     /// Reset state machine for each detector.
     /// Assume that the values in values_ have been set correctly before
     /// calling, because the values are used to determine the initial state
-    void reset(value_type t=0) {
+    void reset() {
         clear_crossings();
 
         // Make host-side copies of the information needed to calculate
         // the initial crossed state
         auto values = memory::on_host(values_);
         auto thresholds = memory::on_host(thresholds_);
-        auto index = memory::on_host(index_);
+        auto cv_index = memory::on_host(cv_index_);
 
         // calculate the initial crossed state in host memory
-        auto crossed = std::vector<size_type>(size());
+        std::vector<size_type> crossed(size());
         for (auto i: util::make_span(0u, size())) {
-            crossed[i] = values[index[i]] < thresholds[i] ? 0 : 1;
+            crossed[i] = values[cv_index[i]] < thresholds[i] ? 0 : 1;
         }
 
         // copy the initial crossed state to device memory
-        is_crossed_ = memory::on_gpu(crossed);
-
-        // reset time of last test
-        t_prev_ = t;
+        memory::copy(crossed, is_crossed_);
     }
 
     bool is_crossed(size_type i) const {
@@ -94,36 +98,28 @@ public:
         return std::vector<threshold_crossing>(stack_->begin(), stack_->end());
     }
 
-    /// The time at which the last test was performed
-    value_type last_test_time() const {
-        return t_prev_;
-    }
-
     /// Tests each target for changed threshold state.
     /// Crossing events are recorded for each threshold that has been
     /// crossed since current time t, and the last time the test was
     /// performed.
-    void test(value_type t) {
-        EXPECTS(t_prev_<t);
-
+    void test() {
         constexpr int block_dim = 128;
         const int grid_dim = (size()+block_dim-1)/block_dim;
         test_thresholds<<<grid_dim, block_dim>>>(
-            t, t_prev_, size(),
+            cv_to_cell_.data(), t_after_.data(), t_before_.data(),
+            size(),
             *stack_,
             is_crossed_.data(), prev_values_.data(),
-            index_.data(), values_.data(), thresholds_.data());
+            cv_index_.data(), values_.data(), thresholds_.data());
 
         // Check that the number of spikes has not exceeded
         // the capacity of the stack.
         EXPECTS(stack_->size() <= stack_->capacity());
-
-        t_prev_ = t;
     }
 
     /// the number of threashold values that are being monitored
     std::size_t size() const {
-        return index_.size();
+        return cv_index_.size();
     }
 
     /// Data type used to store the crossings.
@@ -131,13 +127,14 @@ public:
     using crossing_list =  std::vector<threshold_crossing>;
 
 private:
-
+    const_iview cv_to_cell_;    // index to cell mapping: on gpu
+    const_view t_before_;       // times per cell corresponding to prev_values_: on gpu
+    const_view t_after_;        // times per cell corresponding to values_: on gpu
     const_view values_;         // values to watch: on gpu
-    iarray index_;              // indexes of values to watch: on gpu
+    iarray cv_index_;           // compartment indexes of values to watch: on gpu
 
     array thresholds_;          // threshold for each watch: on gpu
-    value_type t_prev_;         // time of previous sample: on host
-    array prev_values_;         // values at previous sample time: on host
+    array prev_values_;         // values at previous sample time: on gpu
     iarray is_crossed_;         // bool flag for state of each watch: on gpu
 
     memory::managed_ptr<stack_type> stack_;
diff --git a/src/backends/multicore/fvm.hpp b/src/backends/multicore/fvm.hpp
index 08e919ded66de399494e149291edd2e4f3cfe28a..fa3b1b14e46b5492a7a3735b5a97b6fee119d462 100644
--- a/src/backends/multicore/fvm.hpp
+++ b/src/backends/multicore/fvm.hpp
@@ -3,13 +3,18 @@
 #include <map>
 #include <string>
 
+#include <backends/event.hpp>
 #include <common_types.hpp>
+#include <event_queue.hpp>
 #include <mechanism.hpp>
 #include <memory/memory.hpp>
 #include <memory/wrappers.hpp>
+#include <util/meta.hpp>
+#include <util/rangeutil.hpp>
 #include <util/span.hpp>
 
 #include "matrix_state.hpp"
+#include "multi_event_stream.hpp"
 #include "stimulus.hpp"
 #include "threshold_watcher.hpp"
 
@@ -42,9 +47,12 @@ struct backend {
     using host_view   = view;
     using host_iview  = iview;
 
-    /// matrix state
-    using matrix_state =
-        nest::mc::multicore::matrix_state<value_type, size_type>;
+    using matrix_state = nest::mc::multicore::matrix_state<value_type, size_type>;
+    using multi_event_stream = nest::mc::multicore::multi_event_stream;
+
+    // re-expose common backend event types
+    using deliverable_event = nest::mc::deliverable_event;
+    using target_handle = nest::mc::target_handle;
 
     //
     // mechanism infrastructure
@@ -57,6 +65,9 @@ struct backend {
 
     static mechanism make_mechanism(
         const std::string& name,
+        size_type mech_id,
+        const_iview vec_ci,
+        const_view vec_t, const_view vec_t_to, const_view vec_dt,
         view vec_v, view vec_i,
         const std::vector<value_type>& weights,
         const std::vector<size_type>& node_indices)
@@ -65,7 +76,7 @@ struct backend {
             throw std::out_of_range("no mechanism in database : " + name);
         }
 
-        return mech_map_.find(name)->second(vec_v, vec_i, array(weights), iarray(node_indices));
+        return mech_map_.find(name)->second(mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, array(weights), iarray(node_indices));
     }
 
     static bool has_mechanism(const std::string& name) {
@@ -82,15 +93,47 @@ struct backend {
         nest::mc::multicore::threshold_watcher<value_type, size_type>;
 
 
-private:
+    // perform min/max reductions on 'array' type
+    template <typename V>
+    static std::pair<V, V> minmax_value(const memory::host_vector<V>& v) {
+        return util::minmax_value(v);
+    }
+
+    // perform element-wise comparison on 'array' type against `t_test`.
+    template <typename V>
+    static bool any_time_before(const memory::host_vector<V>& t, V t_test) {
+        return minmax_value(t).first<t_test;
+    }
+
+    static void update_time_to(array& time_to, const_view time, value_type dt, value_type tmax) {
+        size_type ncell = util::size(time);
+        for (size_type i = 0; i<ncell; ++i) {
+            time_to[i] = std::min(time[i]+dt, tmax);
+        }
+    }
 
-    using maker_type = mechanism (*)(view, view, array&&, iarray&&);
+    // set the per-cell and per-compartment dt_ from time_to_ - time_.
+    static void set_dt(array& dt_cell, array& dt_comp, const_view time_to, const_view time, const_iview cv_to_cell) {
+        size_type ncell = util::size(dt_cell);
+        size_type ncomp = util::size(dt_comp);
+
+        for (size_type j = 0; j<ncell; ++j) {
+            dt_cell[j] = time_to[j]-time[j];
+        }
+
+        for (size_type i = 0; i<ncomp; ++i) {
+            dt_comp[i] = dt_cell[cv_to_cell[i]];
+        }
+    }
+
+private:
+    using maker_type = mechanism (*)(value_type, const_iview, const_view, const_view, const_view, view, view, array&&, iarray&&);
     static std::map<std::string, maker_type> mech_map_;
 
     template <template <typename> class Mech>
-    static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
+    static mechanism maker(value_type mech_id, const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
         return mechanisms::make_mechanism<Mech<backend>>
-            (vec_v, vec_i, std::move(weights), std::move(node_indices));
+            (mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(weights), std::move(node_indices));
     }
 };
 
diff --git a/src/backends/multicore/matrix_state.hpp b/src/backends/multicore/matrix_state.hpp
index 0414a9d02ce7f6d30d9535b3c5ca7ab0553c9c12..15507ca0bc0a8c7f8abf6b16922677daba4b165c 100644
--- a/src/backends/multicore/matrix_state.hpp
+++ b/src/backends/multicore/matrix_state.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <memory/memory.hpp>
+#include <util/partition.hpp>
 #include <util/span.hpp>
 
 namespace nest {
@@ -17,7 +18,7 @@ public:
     using const_view = typename array::const_view_type;
     using iarray = memory::host_vector<size_type>;
     iarray parent_index;
-    iarray cell_index;
+    iarray cell_cv_divs;
 
     array d;     // [μS]
     array u;     // [μS]
@@ -29,23 +30,21 @@ public:
     // the invariant part of the matrix diagonal
     array invariant_d;         // [μS]
 
-    const_view solution;
-
     matrix_state() = default;
 
     matrix_state(const std::vector<size_type>& p,
-                 const std::vector<size_type>& cell_idx,
+                 const std::vector<size_type>& cell_cv_divs,
                  const std::vector<value_type>& cap,
                  const std::vector<value_type>& cond):
         parent_index(memory::make_const_view(p)),
-        cell_index(memory::make_const_view(cell_idx)),
+        cell_cv_divs(memory::make_const_view(cell_cv_divs)),
         d(size(), 0), u(size(), 0), rhs(size()),
         cv_capacitance(memory::make_const_view(cap)),
         face_conductance(memory::make_const_view(cond))
     {
         EXPECTS(cap.size() == size());
         EXPECTS(cond.size() == size());
-        EXPECTS(cell_idx.back() == size());
+        EXPECTS(cell_cv_divs.back() == size());
 
         auto n = size();
         invariant_d = array(n, 0);
@@ -56,49 +55,66 @@ public:
             invariant_d[i] += gij;
             invariant_d[p[i]] += gij;
         }
+    }
 
+    const_view solution() const {
         // In this back end the solution is a simple view of the rhs, which
         // contains the solution after the matrix_solve is performed.
-        solution = rhs;
+        return const_view(rhs);
     }
 
+
     // Assemble the matrix
-    // Afterwards the diagonal and RHS will have been set given dt, voltage and current
-    //   dt      [ms]
+    // Afterwards the diagonal and RHS will have been set given dt, voltage and current.
+    //   dt_cell [ms] (per cell)
     //   voltage [mV]
     //   current [nA]
-    void assemble(value_type dt, const_view voltage, const_view current) {
-        auto n = size();
-        value_type factor = 1e-3/dt;
-        for (auto i: util::make_span(0u, n)) {
-            auto gi = factor*cv_capacitance[i];
+    void assemble(const_view dt_cell, const_view voltage, const_view current) {
+        auto cell_cv_part = util::partition_view(cell_cv_divs);
+        const size_type ncells = cell_cv_part.size();
+
+        // loop over submatrices
+        for (auto m: util::make_span(0, ncells)) {
+            auto dt = dt_cell[m];
 
-            d[i] = gi + invariant_d[i];
+            if (dt>0) {
+                value_type factor = 1e-3/dt;
+                for (auto i: util::make_span(cell_cv_part[m])) {
+                    auto gi = factor*cv_capacitance[i];
 
-            rhs[i] = gi*voltage[i] - current[i];
+                    d[i] = gi + invariant_d[i];
+                    rhs[i] = gi*voltage[i] - current[i];
+                }
+            }
+            else {
+                for (auto i: util::make_span(cell_cv_part[m])) {
+                    d[i] = 0;
+                    rhs[i] = voltage[i];
+                }
+            }
         }
     }
 
     void solve() {
-        const size_type ncells = cell_index.size()-1;
-
         // loop over submatrices
-        for (auto m: util::make_span(0, ncells)) {
-            auto first = cell_index[m];
-            auto last = cell_index[m+1];
-
-            // backward sweep
-            for(auto i=last-1; i>first; --i) {
-                auto factor = u[i] / d[i];
-                d[parent_index[i]]   -= factor * u[i];
-                rhs[parent_index[i]] -= factor * rhs[i];
-            }
-            rhs[first] /= d[first];
-
-            // forward sweep
-            for(auto i=first+1; i<last; ++i) {
-                rhs[i] -= u[i] * rhs[parent_index[i]];
-                rhs[i] /= d[i];
+        for (auto cv_span: util::partition_view(cell_cv_divs)) {
+            auto first = cv_span.first;
+            auto last = cv_span.second; // one past the end
+
+            if (d[first]!=0) {
+                // backward sweep
+                for(auto i=last-1; i>first; --i) {
+                    auto factor = u[i] / d[i];
+                    d[parent_index[i]]   -= factor * u[i];
+                    rhs[parent_index[i]] -= factor * rhs[i];
+                }
+                rhs[first] /= d[first];
+
+                // forward sweep
+                for(auto i=first+1; i<last; ++i) {
+                    rhs[i] -= u[i] * rhs[parent_index[i]];
+                    rhs[i] /= d[i];
+                }
             }
         }
     }
diff --git a/src/backends/multicore/multi_event_stream.hpp b/src/backends/multicore/multi_event_stream.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9b3ce70511af37be5cd39ab0dcc92d52a1af1faf
--- /dev/null
+++ b/src/backends/multicore/multi_event_stream.hpp
@@ -0,0 +1,143 @@
+#pragma once
+
+// Indexed collection of pop-only event queues --- multicore back-end implementation.
+
+#include <limits>
+#include <ostream>
+#include <utility>
+
+#include <common_types.hpp>
+#include <backends/event.hpp>
+#include <util/debug.hpp>
+#include <util/range.hpp>
+#include <util/rangeutil.hpp>
+#include <util/strprintf.hpp>
+
+namespace nest {
+namespace mc {
+namespace multicore {
+
+class multi_event_stream {
+public:
+    using size_type = cell_size_type;
+    using value_type = double;
+
+    multi_event_stream() {}
+
+    explicit multi_event_stream(size_type n_stream):
+       span_(n_stream), mark_(n_stream) {}
+
+    size_type n_streams() const { return span_.size(); }
+
+    bool empty() const { return remaining_==0; }
+
+    void clear() {
+        ev_.clear();
+        remaining_ = 0;
+
+        util::fill(span_, span_type(0u, 0u));
+        util::fill(mark_, 0u);
+    }
+
+    // Initialize event streams from a vector of `deliverable_event`.
+    void init(std::vector<deliverable_event> staged) {
+        if (staged.size()>std::numeric_limits<size_type>::max()) {
+            throw std::range_error("too many events");
+        }
+
+        ev_ = std::move(staged);
+        util::stable_sort_by(ev_, [](const deliverable_event& e) { return e.handle.cell_index; });
+
+        util::fill(span_, span_type(0u, 0u));
+        util::fill(mark_, 0u);
+
+        size_type si = 0;
+        for (size_type ev_i = 0; ev_i<ev_.size(); ++ev_i) {
+            size_type i = ev_[ev_i].handle.cell_index;
+            EXPECTS(i<n_streams());
+
+            if (si<i) {
+                // Moved on to a new cell: make empty spans for any skipped cells.
+                for (size_type j = si+1; j<i; ++j) {
+                    span_[j].second = span_[si].second;
+                }
+                si = i;
+            }
+            // Include event in this cell's span.
+            span_[si].second = ev_i+1;
+        }
+
+        while (++si<n_streams()) {
+            span_[si].second = span_[si-1].second;
+        }
+
+        for (size_type i = 1u; i<n_streams(); ++i) {
+            mark_[i] = span_[i].first = span_[i-1].second;
+        }
+
+        remaining_ = ev_.size();
+    }
+
+    // Designate for processing events `ev` at head of each event stream `i`
+    // until `event_time(ev)` > `t_until[i]`.
+    template <typename TimeSeq>
+    void mark_until_after(const TimeSeq& t_until) {
+        EXPECTS(n_streams()==util::size(t_until));
+
+        // note: operation on each `i` is independent.
+        for (size_type i = 0; i<n_streams(); ++i) {
+            auto& span = span_[i];
+            auto& mark = mark_[i];
+            auto t = t_until[i]; 
+
+            mark = span.first;
+            while (mark!=span.second && !(ev_[mark].time>t)) {
+                ++mark;
+            }
+        }
+    }
+
+    // Remove marked events from front of each event stream.
+    void drop_marked_events() {
+        // note: operation on each `i` is independent.
+        for (size_type i = 0; i<n_streams(); ++i) {
+            remaining_ -= (mark_[i]-span_[i].first);
+            span_[i].first = mark_[i];
+        }
+    }
+
+    // Return range of marked events on stream `i`.
+    util::range<deliverable_event*> marked_events(size_type i) {
+        return {&ev_[span_[i].first], &ev_[mark_[i]]};
+    }
+
+    // If the head of `i`th event stream exists and has time less than `t_until[i]`, set
+    // `t_until[i]` to the event time.
+    template <typename TimeSeq>
+    void event_time_if_before(TimeSeq& t_until) {
+        // note: operation on each `i` is independent.
+        for (size_type i = 0; i<n_streams(); ++i) {
+            const auto& span = span_[i];
+            if (span.second==span.first) {
+               continue;
+            }
+
+            auto ev_t = ev_[span.first].time;
+            if (t_until[i]>ev_t) {
+                t_until[i] = ev_t;
+            }
+        }
+    }
+
+private:
+    using span_type = std::pair<size_type, size_type>;
+
+    std::vector<deliverable_event> ev_;
+    std::vector<span_type> span_;
+    std::vector<size_type> mark_;
+    size_type remaining_ = 0;
+};
+
+} // namespace multicore
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/multicore/stimulus.hpp b/src/backends/multicore/stimulus.hpp
index 4ca4f0eff0dd603878458e19704e71f2cc7ce5a0..659cd7aec3fb9bb55a8c34d6ad08e6b177646b27 100644
--- a/src/backends/multicore/stimulus.hpp
+++ b/src/backends/multicore/stimulus.hpp
@@ -24,11 +24,14 @@ public:
     using iarray  = typename base::iarray;
     using view   = typename base::view;
     using iview  = typename base::iview;
+    using const_view = typename base::const_view;
     using const_iview = typename base::const_iview;
     using ion_type = typename base::ion_type;
 
-    stimulus(view vec_v, view vec_i, iarray&& node_index):
-        base(vec_v, vec_i, std::move(node_index))
+    static constexpr size_type no_mech_id = (size_type)-1;
+
+    stimulus(const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, iarray&& node_index):
+        base(no_mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(node_index))
     {}
 
     using base::size;
@@ -37,9 +40,7 @@ public:
         return 0;
     }
 
-    void set_params(value_type t_, value_type dt_) override {
-        t = t_;
-        dt = dt_;
+    void set_params() override {
     }
 
     std::string name() const override {
@@ -80,10 +81,12 @@ public:
         if (amplitude.size() != size()) {
             throw std::domain_error("stimulus called with mismatched parameter size\n");
         }
+        auto vec_t = util::indirect_view(util::indirect_view(vec_t_, vec_ci_), node_index_);
         auto vec_i = util::indirect_view(vec_i_, node_index_);
-        int n = size();
-        for(int i=0; i<n; ++i) {
-            if (t>=delay[i] && t<(delay[i]+duration[i])) {
+        size_type n = size();
+        for (size_type i=0; i<n; ++i) {
+            auto t = vec_t[i];
+            if (t>=delay[i] && t<delay[i]+duration[i]) {
                 // use subtraction because the electrod currents are specified
                 // in terms of current into the compartment
                 vec_i[i] -= amplitude[i];
@@ -91,13 +94,12 @@ public:
         }
     }
 
-    value_type dt = 0;
-    value_type t = 0;
-
     std::vector<value_type> amplitude;
     std::vector<value_type> duration;
     std::vector<value_type> delay;
 
+    using base::vec_ci_;
+    using base::vec_t_;
     using base::vec_v_;
     using base::vec_i_;
     using base::node_index_;
diff --git a/src/backends/multicore/threshold_watcher.hpp b/src/backends/multicore/threshold_watcher.hpp
index c7cfb3ab5db0abca8662e6685edbaabb25183088..8a09cdca4f72f59ea534261c207ddcbed161f02c 100644
--- a/src/backends/multicore/threshold_watcher.hpp
+++ b/src/backends/multicore/threshold_watcher.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <math.hpp>
 #include <memory/memory.hpp>
 
 namespace nest {
@@ -15,6 +16,7 @@ public:
     using array = memory::host_vector<value_type>;
     using const_view = typename array::const_view_type;
     using iarray = memory::host_vector<size_type>;
+    using const_iview = typename iarray::const_view_type;
 
     /// stores a single crossing event
     struct threshold_crossing {
@@ -30,17 +32,22 @@ public:
     threshold_watcher() = default;
 
     threshold_watcher(
+            const_iview vec_ci,
+            const_view vec_t_before,
+            const_view vec_t_after,
             const_view vals,
             const std::vector<size_type>& indxs,
-            const std::vector<value_type>& thresh,
-            value_type t=0):
+            const std::vector<value_type>& thresh):
+        cv_to_cell_(vec_ci),
+        t_before_(vec_t_before),
+        t_after_(vec_t_after),
         values_(vals),
-        index_(memory::make_const_view(indxs)),
+        cv_index_(memory::make_const_view(indxs)),
         thresholds_(memory::make_const_view(thresh)),
         v_prev_(vals)
     {
         is_crossed_ = iarray(size());
-        reset(t);
+        reset();
     }
 
     /// Remove all stored crossings that were detected in previous calls
@@ -52,37 +59,34 @@ public:
     /// Reset state machine for each detector.
     /// Assume that the values in values_ have been set correctly before
     /// calling, because the values are used to determine the initial state
-    void reset(value_type t=0) {
+    void reset() {
         clear_crossings();
         for (auto i=0u; i<size(); ++i) {
-            is_crossed_[i] = values_[index_[i]]>=thresholds_[i];
+            is_crossed_[i] = values_[cv_index_[i]]>=thresholds_[i];
         }
-        t_prev_ = t;
     }
 
     const std::vector<threshold_crossing>& crossings() const {
         return crossings_;
     }
 
-    /// The time at which the last test was performed
-    value_type last_test_time() const {
-        return t_prev_;
-    }
-
     /// Tests each target for changed threshold state
     /// Crossing events are recorded for each threshold that
     /// is crossed since the last call to test
-    void test(value_type t) {
+    void test() {
         for (auto i=0u; i<size(); ++i) {
+            auto cv     = cv_index_[i];
+            auto cell   = cv_to_cell_[cv];
             auto v_prev = v_prev_[i];
-            auto v      = values_[index_[i]];
+            auto v      = values_[cv];
             auto thresh = thresholds_[i];
+
             if (!is_crossed_[i]) {
                 if (v>=thresh) {
-                    // the threshold has been passed, so estimate the time using
-                    // linear interpolation
+                    // The threshold has been passed, so estimate the time using
+                    // linear interpolation.
                     auto pos = (thresh - v_prev)/(v - v_prev);
-                    auto crossing_time = t_prev_ + pos*(t - t_prev_);
+                    auto crossing_time = math::lerp(t_before_[cell], t_after_[cell], pos);
                     crossings_.push_back({i, crossing_time});
 
                     is_crossed_[i] = true;
@@ -96,7 +100,6 @@ public:
 
             v_prev_[i] = v;
         }
-        t_prev_ = t;
     }
 
     bool is_crossed(size_type i) const {
@@ -105,7 +108,7 @@ public:
 
     /// the number of threashold values that are being monitored
     std::size_t size() const {
-        return index_.size();
+        return cv_index_.size();
     }
 
     /// Data type used to store the crossings.
@@ -113,11 +116,13 @@ public:
     using crossing_list =  std::vector<threshold_crossing>;
 
 private:
+    const_iview cv_to_cell_;
+    const_view t_before_;
+    const_view t_after_;
     const_view values_;
-    iarray index_;
+    iarray cv_index_;
 
     array thresholds_;
-    value_type t_prev_;
     array v_prev_;
     crossing_list crossings_;
     iarray is_crossed_;
diff --git a/src/event_queue.hpp b/src/event_queue.hpp
index c796da31d7f17b9c3144bd8024dadacbc86d81a8..ea97e48fa0537b44580b547d513fcdc3c54d2d48 100644
--- a/src/event_queue.hpp
+++ b/src/event_queue.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <cstdint>
+#include <limits>
 #include <ostream>
 #include <queue>
 #include <type_traits>
@@ -9,6 +10,8 @@
 #include "common_types.hpp"
 #include "util/meta.hpp"
 #include "util/optional.hpp"
+#include "util/range.hpp"
+#include "util/strprintf.hpp"
 
 namespace nest {
 namespace mc {
diff --git a/src/fvm_multicell.hpp b/src/fvm_multicell.hpp
index b952d80765dc8f4c6dc6474c77a2557444f1509b..2a7bee9efef4b1cfc89c87e35073881c6d321ef0 100644
--- a/src/fvm_multicell.hpp
+++ b/src/fvm_multicell.hpp
@@ -58,9 +58,11 @@ public:
     /// the container used for indexes
     using iarray = typename backend::iarray;
 
-    using target_handle = std::pair<size_type, size_type>;
     using probe_handle = std::pair<const array fvm_multicell::*, size_type>;
 
+    using target_handle = typename backend::target_handle;
+    using deliverable_event = typename backend::deliverable_event;
+
     fvm_multicell() = default;
 
     void resting_potential(value_type potential_mV) {
@@ -75,63 +77,74 @@ public:
 
     void reset();
 
+    // fvm_multicell::deliver_event is used only for testing
     void deliver_event(target_handle h, value_type weight) {
-        mechanisms_[h.first]->net_receive(h.second, weight);
+        mechanisms_[h.mech_id]->net_receive(h.index, weight);
     }
 
     value_type probe(probe_handle h) const {
         return (this->*h.first)[h.second];
     }
 
-    /// Initialize state prior to a sequence of integration steps.
+    // Initialize state prior to a sequence of integration steps.
     void setup_integration(value_type tfinal, value_type dt_max) {
-        EXPECTS(tfinal>t_);
         EXPECTS(dt_max>0);
 
         tfinal_ = tfinal;
         dt_max_ = dt_max;
-        integration_running_ = true;
-
-        // TODO: Placeholder; construct backend event delivery
-        // data structure from events added.
-        EXPECTS(events_.empty());
-        for (auto& ev: staged_events_) {
-            EXPECTS(ev.time<tfinal);
-            events_.push(std::move(ev));
-        }
 
+        compute_min_remaining();
+
+        EXPECTS(!has_pending_events());
+
+        events_->init(std::move(staged_events_));
         staged_events_.clear();
     }
 
-    /// Advance one integration step, up to `dt_max_` in each cell.
+    // Advance one integration step.
     void step_integration();
 
-    /// Query integration completion state.
+    // Query integration completion state.
     bool integration_complete() const {
         return !integration_running_;
     }
 
-    /// Query per-cell time state. (Placeholders)
-    value_type time(size_type cell_index) const {
-        return t_;
+    // Query per-cell time state.
+    // Placeholder: external time queries will no longer be required when
+    // complete integration loop is in lowered cell.
+    value_type time(size_type cell_idx) const {
+        refresh_time_cache();
+        return cached_time_[cell_idx];
+    }
+
+    value_type min_time() const {
+        return backend::minmax_value(time_).first;
     }
 
     value_type max_time() const {
-        return t_;
+        return backend::minmax_value(time_).second;
     }
 
     bool state_synchronized() const {
-        return true;
+        auto mm = backend::minmax_value(time_);
+        return mm.first==mm.second;
+    }
+
+    /// Set times for all cells (public for testing purposes only).
+    void set_time_global(value_type t) {
+        memory::fill(time_, t);
+        invalidate_time_cache();
+    }
+
+    void set_time_to_global(value_type t) {
+        memory::fill(time_to_, t);
+        invalidate_time_cache();
     }
 
     /// Add an event for processing in next integration stage.
     void add_event(value_type ev_time, target_handle h, value_type weight) {
         EXPECTS(!integration_running_);
-
-        // TODO: Placeholder; either add to backend event list structure
-        // incrementally here, or store by cell gid offset for construction
-        // all at once in `start_integration()`.
-        staged_events_.push_back({ev_time, h, weight});
+        staged_events_.push_back(deliverable_event(ev_time, h, weight));
     }
 
     /// Following types and methods are public only for testing:
@@ -238,10 +251,10 @@ public:
     }
 
 private:
-    threshold_watcher threshold_watcher_;
+    /// number of distinct cells (integration domains)
+    size_type ncell_;
 
-    /// current time [ms]
-    value_type t_ = 0;
+    threshold_watcher threshold_watcher_;
 
     /// resting potential (initial voltage condition)
     value_type resting_potential_ = -65;
@@ -256,19 +269,30 @@ private:
     /// once integration to `tfinal_` is complete.
     bool integration_running_ = false;
 
-    struct deliverable_event {
-        double time;
-        target_handle handle;
-        double weight;
-    };
+    /// minimum number of integration steps left in integration period.
+    unsigned min_remaining_steps_ = 0;
+
+    void compute_min_remaining() {
+        auto tmin = min_time();
+        min_remaining_steps_ = tmin>=tfinal_? 0: 1 + (unsigned)((tfinal_-tmin)/dt_max_);
+        integration_running_ = min_remaining_steps_>0;
+    }
+
+    void decrement_min_remaining() {
+        if (min_remaining_steps_>0) --min_remaining_steps_;
+        integration_running_ = min_remaining_steps_>0;
+    }
 
     /// events staged for upcoming integration stage
     std::vector<deliverable_event> staged_events_;
 
     /// event queue for integration period
-    /// TODO: Placeholder; this will change when event lists are moved to
-    /// a backend structure.
-    event_queue<deliverable_event> events_;
+    using multi_event_stream = typename backend::multi_event_stream;
+    std::unique_ptr<multi_event_stream> events_;
+
+    bool has_pending_events() const {
+        return events_ && !events_->empty();
+    }
 
     /// the linear system for implicit time stepping of cell state
     matrix_type matrix_;
@@ -276,6 +300,37 @@ private:
     /// cv_areas_[i] is the surface area of CV i [µm^2]
     array cv_areas_;
 
+    /// the map from compartment index to cell index
+    iarray cv_to_cell_;
+
+    /// the per-cell simulation time
+    array time_;
+
+    /// the per-cell integration period end point
+    array time_to_;
+
+    // the per-compartment dt
+    // (set to dt_cell_[j] for each compartment in cell j).
+    array dt_comp_;
+
+    // the per-cell dt
+    // (set to time_to_[j]-time_[j] for each cell j).
+    array dt_cell_;
+
+    // Maintain cached copy of time vector for querying by
+    // cell_group. This will no longer be necessary when full
+    // integration loop is in lowered cell.
+    mutable std::vector<value_type> cached_time_;
+    mutable bool cached_time_valid_ = false;
+
+    void invalidate_time_cache() { cached_time_valid_ = false; }
+    void refresh_time_cache() const {
+        if (!cached_time_valid_) {
+            memory::copy(time_, memory::make_view(cached_time_));
+        }
+        cached_time_valid_ = true;
+    }
+
     /// the transmembrane current over the surface of each CV [nA]
     ///     I = area*i_m - I_e
     array current_;
@@ -488,7 +543,7 @@ void fvm_multicell<Backend>::initialize(
     auto targets_size = size(target_handles);
     auto probes_size = size(probe_handles);
 
-    auto ncell = size(cells);
+    ncell_ = size(cells);
     auto cell_num_compartments =
         transform_view(cells, [](const cell& c) { return c.num_compartments(); });
 
@@ -499,9 +554,23 @@ void fvm_multicell<Backend>::initialize(
     // initialize storage from total compartment count
     current_ = array(ncomp, 0);
     voltage_ = array(ncomp, resting_potential_);
+    cv_to_cell_ = iarray(ncomp, 0);
+    time_ = array(ncell_, 0);
+    time_to_ = array(ncell_, 0);
+    cached_time_.resize(ncell_);
+    cached_time_valid_ = false;
+    dt_cell_ = array(ncell_, 0);
+    dt_comp_ = array(ncomp, 0);
+
+    // initialize cv_to_cell_ values from compartment partition
+    std::vector<size_type> cv_to_cell_tmp(ncomp);
+    for (size_type i = 0; i<ncell_; ++i) {
+        util::fill(util::subrange_view(cv_to_cell_tmp, cell_comp_part[i]), i);
+    }
+    memory::copy(cv_to_cell_tmp, cv_to_cell_);
 
-    // look up table: (density) mechanism name -> list of CV range objects
-    std::map<std::string, std::vector<segment_cv_range>> mech_map;
+    // look up table: mechanism name -> list of cv_range objects
+    std::map<std::string, std::vector<segment_cv_range>> mech_to_cv_range;
 
     // look up table: point mechanism (synapse) name -> CV indices and target numbers.
     struct syn_cv_and_target {
@@ -513,6 +582,9 @@ void fvm_multicell<Backend>::initialize(
     // initialize vector used for matrix creation.
     std::vector<size_type> group_parent_index(ncomp);
 
+    // set up per-cell event stores.
+    events_ = util::make_unique<multi_event_stream>(ncell_);
+
     // create each cell:
     auto probe_hi = probe_handles.begin();
 
@@ -524,7 +596,7 @@ void fvm_multicell<Backend>::initialize(
     /// cv_capacitance_[i] is the capacitance of CV membrane
     std::vector<value_type> cv_capacitance(ncomp);   // [µm^2*F*m^-2 = pF]
     /// membrane area of each cv
-    std::vector<value_type> tmp_cv_areas(ncomp);         // [µm^2]
+    std::vector<value_type> tmp_cv_areas(ncomp);     // [µm^2]
 
     // used to build the information required to construct spike detectors
     std::vector<size_type> spike_detector_index;
@@ -533,13 +605,13 @@ void fvm_multicell<Backend>::initialize(
     // Iterate over the input cells and build the indexes etc that descrbe the
     // fused cell group. On completion:
     //  - group_paranet_index contains the full parent index for the fused cells.
-    //  - mech_map and syn_mech_map provide a map from mechanism names to an
+    //  - mech_to_cv_range and syn_mech_map provide a map from mechanism names to an
     //    iterable container of compartment ranges, which are used later to
     //    generate the node index for each mechanism kind.
     //  - the tmp_* vectors contain compartment-specific information for each
     //    compartment in the fused cell group (areas, capacitance, etc).
     //  - each probe, stimulus and detector is attached to its compartment.
-    for (auto i: make_span(0, ncell)) {
+    for (auto i: make_span(0, ncell_)) {
         const auto& c = cells[i];
         auto comp_ival = cell_comp_part[i];
 
@@ -567,7 +639,7 @@ void fvm_multicell<Backend>::initialize(
 
             for (const auto& mech: seg->mechanisms()) {
                 if (mech.name()!="membrane") {
-                    mech_map[mech.name()].push_back(cv_range);
+                    mech_to_cv_range[mech.name()].push_back(cv_range);
                 }
             }
         }
@@ -610,6 +682,7 @@ void fvm_multicell<Backend>::initialize(
         //       optimizations that rely on this assumption.
         if (stim_index.size()) {
             auto stim = new stimulus(
+                cv_to_cell_, time_, time_to_, dt_comp_,
                 voltage_, current_, memory::make_const_view(stim_index));
             stim->set_parameters(stim_amplitudes, stim_durations, stim_delays);
             mechanisms_.push_back(mechanism(stim));
@@ -643,7 +716,7 @@ void fvm_multicell<Backend>::initialize(
 
     // set a back-end supplied watcher on the voltage vector
     threshold_watcher_ =
-        threshold_watcher(voltage_, spike_detector_index, thresholds, 0);
+        threshold_watcher(cv_to_cell_, time_, time_to_, voltage_, spike_detector_index, thresholds);
 
     // confirm user-supplied container probes were appropriately sized.
     EXPECTS(probes_size==probes_count);
@@ -659,15 +732,17 @@ void fvm_multicell<Backend>::initialize(
     // compartments with that mechanism, then build the mechanism instance.
     std::vector<size_type> mech_cv_index(ncomp);
     std::vector<value_type> mech_cv_weight(ncomp);
-    std::map<std::string, std::vector<size_type>> mech_index_map;
-    for (auto const& mech: mech_map) {
+    std::map<std::string, std::vector<size_type>> mech_to_cv_index;
+    for (auto const& entry: mech_to_cv_range) {
+        const auto& mech_name = entry.first;
+        const auto& seg_cv_ranges = entry.second;
+
         // Clear the pre-allocated storage for mechanism indexes and weights.
         // Reuse the same vectors each time to have only one malloc and free
         // outside of the loop for each
         mech_cv_index.clear();
         mech_cv_weight.clear();
 
-        const auto& seg_cv_ranges = mech.second;
         for (auto& rng: seg_cv_ranges) {
             if (rng.has_parent()) {
                 // locate the parent cv in the partially constructed list of cv indexes
@@ -697,39 +772,39 @@ void fvm_multicell<Backend>::initialize(
             w *= 1e-2;
         }
 
+        size_type mech_id = mechanisms_.size();
         mechanisms_.push_back(
-            backend::make_mechanism(mech.first, voltage_, current_, mech_cv_weight, mech_cv_index));
+            backend::make_mechanism(mech_name, mech_id, cv_to_cell_, time_, time_to_, dt_comp_, voltage_, current_, mech_cv_weight, mech_cv_index));
 
         // Save the indices for ion set up below.
-        mech_index_map[mech.first] = mech_cv_index;
+        mech_to_cv_index[mech_name] = mech_cv_index;
     }
 
     // Create point (synapse) mechanisms.
     for (auto& syni: syn_mech_map) {
-        size_type mech_index = mechanisms_.size();
+        size_type mech_id = mechanisms_.size();
 
         const auto& mech_name = syni.first;
         auto& cv_assoc = syni.second;
 
-
         // Sort CV indices but keep track of their corresponding targets.
         auto cv_index = [](syn_cv_and_target x) { return x.cv; };
-        util::sort_by(cv_assoc, cv_index);
+        util::stable_sort_by(cv_assoc, cv_index);
         std::vector<cell_lid_type> cv_indices = assign_from(transform_view(cv_assoc, cv_index));
 
         // Create the mechanism.
         // An empty weight vector is supplied, because there are no weights applied to point
         // processes, because their currents are calculated with the target units of [nA]
         mechanisms_.push_back(
-            backend::make_mechanism(mech_name, voltage_, current_, {}, cv_indices));
+            backend::make_mechanism(mech_name, mech_id, cv_to_cell_, time_, time_to_, dt_comp_, voltage_, current_, {}, cv_indices));
 
         // Save the indices for ion set up below.
-        mech_index_map[mech_name] = cv_indices;
+        mech_to_cv_index[mech_name] = cv_indices;
 
         // Make the target handles.
         cell_lid_type instance = 0;
         for (auto entry: cv_assoc) {
-            target_handles[entry.target] = {mech_index, instance++};
+            target_handles[entry.target] = target_handle(mech_id, instance++, cv_to_cell_tmp[entry.cv]);
         }
     }
 
@@ -743,7 +818,7 @@ void fvm_multicell<Backend>::initialize(
         std::set<size_type> index_set;
         for (auto const& mech : mechanisms_) {
             if(mech->uses_ion(ion)) {
-                auto const& ni = mech_index_map[mech->name()];
+                auto const& ni = mech_to_cv_index[mech->name()];
                 index_set.insert(ni.begin(), ni.end());
             }
         }
@@ -757,7 +832,7 @@ void fvm_multicell<Backend>::initialize(
         // join the ion reference in each mechanism into the cell-wide ion state
         for (auto& mech : mechanisms_) {
             if (mech->uses_ion(ion)) {
-                auto const& ni = mech_index_map[mech->name()];
+                auto const& ni = mech_to_cv_index[mech->name()];
                 mech->set_ion(ion, ions_[ion],
                     util::make_copy<std::vector<size_type>> (algorithms::index_into(ni, indexes)));
             }
@@ -783,69 +858,70 @@ void fvm_multicell<Backend>::initialize(
     memory::fill(ion_ca().internal_concentration(), 5e-5);          // mM
     memory::fill(ion_ca().external_concentration(), 2.0);           // mM
 
-    // initialise mechanism and voltage state
+    // initialize mechanism and voltage state
     reset();
 }
 
 template <typename Backend>
 void fvm_multicell<Backend>::reset() {
     memory::fill(voltage_, resting_potential_);
-    t_ = 0.;
+
+    set_time_global(0);
+    set_time_to_global(0);
+
     for (auto& m : mechanisms_) {
         // TODO : the parameters have to be set before the nrn_init
         // for now use a dummy value of dt.
-        m->set_params(t_, 0.025);
+        m->set_params();
         m->nrn_init();
     }
 
     // Reset state of the threshold watcher.
     // NOTE: this has to come after the voltage_ values have been reinitialized,
     // because these values are used by the watchers to set their initial state.
-    threshold_watcher_.reset(t_);
+    threshold_watcher_.reset();
 
     // Reset integration state.
     tfinal_ = 0;
     dt_max_ = 0;
     integration_running_ = false;
     staged_events_.clear();
-    events_.clear();
+    events_->clear();
 }
 
 
 template <typename Backend>
 void fvm_multicell<Backend>::step_integration() {
-    // Integrate cell states from `t_` by `dt_max_` if permissible,
-    // or otherwise until the next event time or `t_final`.
     EXPECTS(integration_running_);
 
-    while (auto ev = events_.pop_if_not_after(t_)) {
-        deliver_event(ev->handle, ev->weight);
-    }
-
-    value_type t_to = std::min(t_+dt_max_, tfinal_);
-
-    if (auto t = events_.time_if_before(t_to)) {
-        t_to = *t;
-    }
-
-    value_type dt = t_to-t_;
-    EXPECTS(dt>0);
-
     PE("current");
     memory::fill(current_, 0.);
 
-    // update currents from ion channels
-    for(auto& m : mechanisms_) {
+    // mark pending events for delivery
+    events_->mark_until_after(time_);
+
+    // deliver pending events and update current contributions from mechanisms
+    for(auto& m: mechanisms_) {
         PE(m->name().c_str());
-        m->set_params(t_, dt);
+        m->deliver_events(*events_);
         m->nrn_current();
         PL();
     }
+
+    // remove delivered events from queue and set time_to_
+    events_->drop_marked_events();
+
+    backend::update_time_to(time_to_, time_, dt_max_, tfinal_);
+    invalidate_time_cache();
+    events_->event_time_if_before(time_to_);
     PL();
 
+    // set per-cell and per-compartment dt (constant within a cell)
+    backend::set_dt(dt_cell_, dt_comp_, time_to_, time_, cv_to_cell_);
+
     // solve the linear system
     PE("matrix", "setup");
-    matrix_.assemble(dt, voltage_, current_);
+    matrix_.assemble(dt_cell_, voltage_, current_);
 
     PL(); PE("solve");
     matrix_.solve();
@@ -855,22 +931,28 @@ void fvm_multicell<Backend>::step_integration() {
 
     // integrate state of gating variables etc.
     PE("state");
-    for(auto& m : mechanisms_) {
+    for(auto& m: mechanisms_) {
         PE(m->name().c_str());
         m->nrn_state();
         PL();
     }
     PL();
 
-    t_ = t_to;
+    memory::copy(time_to_, time_);
+    invalidate_time_cache();
 
     // update spike detector thresholds
-    threshold_watcher_.test(t_);
+    threshold_watcher_.test();
 
     // are we there yet?
-    integration_running_ = t_<tfinal_;
+    if (!min_remaining_steps_) {
+        compute_min_remaining();
+    }
+    else {
+        decrement_min_remaining();
+    }
 
-    EXPECTS(integration_running_ || events_.empty());
+    EXPECTS(integration_running_ || !has_pending_events());
 }
 
 } // namespace fvm
diff --git a/src/matrix.hpp b/src/matrix.hpp
index 8f010763cce427011f191b36c821fe7c0167e9df..0afadd79f897d1ce2bf9a0a8b9ed5377e225d629 100644
--- a/src/matrix.hpp
+++ b/src/matrix.hpp
@@ -36,10 +36,10 @@ public:
 
     matrix() = default;
 
-    matrix( const std::vector<size_type>& pi,
-            const std::vector<size_type>& ci,
-            const std::vector<value_type>& cv_capacitance,
-            const std::vector<value_type>& face_conductance):
+    matrix(const std::vector<size_type>& pi,
+           const std::vector<size_type>& ci,
+           const std::vector<value_type>& cv_capacitance,
+           const std::vector<value_type>& face_conductance):
         parent_index_(memory::make_const_view(pi)),
         cell_index_(memory::make_const_view(ci)),
         state_(pi, ci, cv_capacitance, face_conductance)
@@ -69,17 +69,16 @@ public:
     }
 
     /// Assemble the matrix for given dt
-    void assemble(double dt, const_view voltage, const_view current) {
-        state_.assemble(dt, voltage, current);
+    void assemble(const_view dt_cell, const_view voltage, const_view current) {
+        state_.assemble(dt_cell, voltage, current);
     }
 
     /// Get a view of the solution
     const_view solution() const {
-        return state_.solution;
+        return state_.solution();
     }
 
-    private:
-
+private:
     /// the parent indice that describe matrix structure
     iarray parent_index_;
 
diff --git a/src/mc_cell_group.hpp b/src/mc_cell_group.hpp
index 2ed838e8f04e9921449183b554f57e150c8d5a4f..d1e2dae8a8e9f160d901daa780267bbb96306747 100644
--- a/src/mc_cell_group.hpp
+++ b/src/mc_cell_group.hpp
@@ -117,36 +117,40 @@ public:
 
         lowered_.setup_integration(tfinal, dt);
 
+        util::optional<time_type> first_sample_time = sample_events_.time_if_before(tfinal);
         std::vector<sample_event> requeue_sample_events;
         while (!lowered_.integration_complete()) {
             // Take any pending samples.
             // TODO: Placeholder: this will be replaced by a backend polling implementation.
 
-            PE("sampling");
-            time_type cell_max_time = lowered_.max_time();
+            if (first_sample_time) {
+                PE("sampling");
+                time_type cell_max_time = lowered_.max_time();
 
-            requeue_sample_events.clear();
-            while (auto m = sample_events_.pop_if_before(cell_max_time)) {
-                auto& s = samplers_[m->sampler_index];
-                EXPECTS((bool)s.sampler);
+                requeue_sample_events.clear();
+                while (auto m = sample_events_.pop_if_before(cell_max_time)) {
+                    auto& s = samplers_[m->sampler_index];
+                    EXPECTS((bool)s.sampler);
 
-                time_type cell_time = lowered_.time(s.cell_gid-gid_base_);
-                if (cell_time<m->time) {
-                    // This cell hasn't reached this sample time yet.
-                    requeue_sample_events.push_back(*m);
-                }
-                else {
-                    auto next = s.sampler(cell_time, lowered_.probe(s.handle));
-                    if (next) {
-                        m->time = std::max(*next, cell_time);
+                    time_type cell_time = lowered_.time(s.cell_gid-gid_base_);
+                    if (cell_time<m->time) {
+                        // This cell hasn't reached this sample time yet.
                         requeue_sample_events.push_back(*m);
                     }
+                    else {
+                        auto next = s.sampler(cell_time, lowered_.probe(s.handle));
+                        if (next) {
+                            m->time = std::max(*next, cell_time);
+                            requeue_sample_events.push_back(*m);
+                        }
+                    }
                 }
+                for (auto& ev: requeue_sample_events) {
+                    sample_events_.push(std::move(ev));
+                }
+                first_sample_time = sample_events_.time_if_before(tfinal);
+                PL();
             }
-            for (auto& ev: requeue_sample_events) {
-                sample_events_.push(std::move(ev));
-            }
-            PL();
 
             // Ask lowered_ cell to integrate 'one step', delivering any
             // events accordingly.
diff --git a/src/mechanism.hpp b/src/mechanism.hpp
index 8cde22d2f1103006de589904c23ff1bc9f2c0b65..b0bc86e6b7f81d973f452dffa5f33a6781466746 100644
--- a/src/mechanism.hpp
+++ b/src/mechanism.hpp
@@ -3,11 +3,11 @@
 #include <algorithm>
 #include <memory>
 #include <string>
-#include <util/meta.hpp>
 
 #include <ion.hpp>
 #include <parameter_list.hpp>
 #include <util/indirect.hpp>
+#include <util/meta.hpp>
 #include <util/make_unique.hpp>
 
 namespace nest {
@@ -39,7 +39,14 @@ public:
 
     using ion_type = ion<backend>;
 
-    mechanism(view vec_v, view vec_i, iarray&& node_index):
+    using multi_event_stream = typename backend::multi_event_stream;
+
+    mechanism(size_type mech_id, const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, iarray&& node_index):
+        mech_id_(mech_id),
+        vec_ci_(vec_ci),
+        vec_t_(vec_t),
+        vec_t_to_(vec_t_to),
+        vec_dt_(vec_dt),
         vec_v_(vec_v),
         vec_i_(vec_i),
         node_index_(std::move(node_index))
@@ -53,22 +60,39 @@ public:
         return node_index_;
     }
 
-    virtual void set_params(value_type t_, value_type dt_) = 0;
+    virtual void set_params() = 0;
     virtual std::string name() const = 0;
     virtual std::size_t memory() const = 0;
     virtual void nrn_init()     = 0;
     virtual void nrn_state()    = 0;
     virtual void nrn_current()  = 0;
-    virtual void net_receive(int, value_type) {};
+    virtual void deliver_events(multi_event_stream& events) {};
     virtual bool uses_ion(ionKind) const = 0;
     virtual void set_ion(ionKind k, ion_type& i, const std::vector<size_type>& index) = 0;
-
     virtual mechanismKind kind() const = 0;
 
+    // net_receive() is used internally by deliver_events(), but
+    // is exposed primarily for unit testing.
+    virtual void net_receive(int, value_type) {};
+
     virtual ~mechanism() = default;
 
+    // Mechanism identifier: index into list of mechanisms on cell group.
+    size_type mech_id_;
+
+    // Maps compartment index to cell index.
+    const_iview vec_ci_;
+    // Maps cell index to integration start time.
+    const_view vec_t_;
+    // Maps cell index to integration stop time.
+    const_view vec_t_to_;
+    // Maps compartment index to (stop time) - (start time).
+    const_view vec_dt_;
+    // Maps compartment index to voltage.
     view vec_v_;
+    // Maps compartment index to current.
     view vec_i_;
+    // Maps mechanism instance index to compartment index.
     iarray node_index_;
 };
 
@@ -77,14 +101,17 @@ using mechanism_ptr = std::unique_ptr<mechanism<Backend>>;
 
 template <typename M>
 auto make_mechanism(
-    typename M::view  vec_v,
-    typename M::view  vec_i,
-    typename M::array&&  weights,
-    typename M::iarray&& node_indices)
--> decltype(util::make_unique<M>(vec_v, vec_i, std::move(weights), std::move(node_indices)))
-{
-    return util::make_unique<M>(vec_v, vec_i, std::move(weights), std::move(node_indices));
-}
+    typename M::size_type mech_id,
+    typename M::const_iview vec_ci,
+    typename M::const_view vec_t,
+    typename M::const_view vec_t_to,
+    typename M::const_view vec_dt,
+    typename M::view vec_v,
+    typename M::view vec_i,
+    typename M::array&& weights,
+    typename M::iarray&& node_indices
+)
+DEDUCED_RETURN_TYPE(util::make_unique<M>(mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(weights), std::move(node_indices)))
 
 } // namespace mechanisms
 } // namespace mc
diff --git a/src/util/compat.hpp b/src/util/compat.hpp
index 6c12be4e3e06e2ed92042bb410652f40fb4d644d..4f1c2d34aa24a3b3d277f5a6115123fb7ca9ada7 100644
--- a/src/util/compat.hpp
+++ b/src/util/compat.hpp
@@ -44,5 +44,4 @@ inline void compiler_barrier_if_icc_leq(unsigned ver) {
 template <typename X>
 inline constexpr bool isinf(X x) { return std::isinf(x); }
 
-
 } // namespace compat
diff --git a/src/util/debug.hpp b/src/util/debug.hpp
index 3c3eaa853d70f4f93e0ab71eb3e81efd8424d9c9..08c6c0dc9e4ad366af0034316772cb3de2787cc2 100644
--- a/src/util/debug.hpp
+++ b/src/util/debug.hpp
@@ -3,6 +3,7 @@
 #include <iostream>
 #include <sstream>
 #include <mutex>
+#include <utility>
 
 #include <threading/threading.hpp>
 #include "unwind.hpp"
@@ -50,6 +51,7 @@ template <typename... Args>
 void debug_emit_trace(const char* file, int line, const char* varlist, const Args&... args) {
     if (nest::mc::threading::multithreaded()) {
         std::stringstream buffer;
+        buffer.precision(17);
 
         debug_emit_trace_leader(buffer, file, line, varlist);
         debug_emit(buffer, args...);
@@ -65,6 +67,34 @@ void debug_emit_trace(const char* file, int line, const char* varlist, const Arg
     }
 }
 
+namespace impl {
+    template <typename Seq, typename Separator>
+    struct sepval {
+        const Seq& seq;
+        Separator sep;
+
+        sepval(const Seq& seq, Separator sep): seq(seq), sep(std::move(sep)) {}
+
+        friend std::ostream& operator<<(std::ostream& out, const sepval& sv) {
+            bool emitsep = false;
+            for (const auto& v: sv.seq) {
+                if (emitsep) out << sv.sep;
+                emitsep = true;
+                out << v;
+            }
+            return out;
+        }
+    };
+}
+
+// Wrap a sequence or container of values so that they can be printed
+// to an `std::ostream` with the elements separated by the supplied
+// separator.
+template <typename Seq, typename Separator>
+impl::sepval<Seq, Separator> sepval(const Seq& seq, Separator sep) {
+    return impl::sepval<Seq, Separator>(seq, std::move(sep));
+}
+
 } // namespace util
 } // namespace mc
 } // namespace nest
diff --git a/src/util/meta.hpp b/src/util/meta.hpp
index b1ce5f7540407844e60832b03714f93fd86b0b1b..35b1b3e3363c14b6fcb9f0162c0845b748667ea3 100644
--- a/src/util/meta.hpp
+++ b/src/util/meta.hpp
@@ -43,9 +43,38 @@ constexpr auto cend(const T& c) -> decltype(compat::end(c)) {
     return compat::end(c);
 }
 
-template <typename T>
-constexpr bool empty(const T& c) {
-    return c.empty();
+// Use sequence `empty() const` method if it exists, otherwise
+// compare begin and end.
+
+namespace impl {
+    template <typename C>
+    struct has_const_empty_method {
+        template <typename T>
+        static decltype(std::declval<const T>().empty(), std::true_type{}) test(int);
+        template <typename T>
+        static std::false_type test(...);
+
+        using type = decltype(test<C>(0));
+    };
+
+    // For correct ADL on begin and end:
+    using std::begin;
+    using std::end;
+
+    template <typename Seq>
+    constexpr bool empty(const Seq& seq, std::false_type) {
+        return begin(seq)==end(seq);
+    }
+
+    template <typename Seq>
+    constexpr bool empty(const Seq& seq, std::true_type) {
+        return seq.empty();
+    }
+}
+
+template <typename Seq>
+constexpr bool empty(const Seq& seq) {
+    return impl::empty(seq, typename impl::has_const_empty_method<Seq>::type{});
 }
 
 template <typename T, std::size_t N>
diff --git a/src/util/path.hpp b/src/util/path.hpp
index e3114518cf597611f101597b7f4146b5b12c98ed..fa206a27c3b84ca48d67355ee8355f418ed7323c 100644
--- a/src/util/path.hpp
+++ b/src/util/path.hpp
@@ -26,6 +26,10 @@
 #include <util/meta.hpp>
 #include <util/rangeutil.hpp>
 
+#ifndef _POSIX_C_SOURCE
+#error "no non-POSIX implementations for filesystem classes or methods"
+#endif
+
 namespace nest {
 namespace mc {
 namespace util {
diff --git a/src/util/rangeutil.hpp b/src/util/rangeutil.hpp
index 7dee14b911f9e5bb81d8778d12ec141a0e258ffe..8d03be492d52ade3b90c5007e1bd83919cc2636c 100644
--- a/src/util/rangeutil.hpp
+++ b/src/util/rangeutil.hpp
@@ -41,26 +41,43 @@ range_view(Seq& seq) {
 
 template <
     typename Seq,
-    typename Iter = typename sequence_traits<Seq>::iterator,
-    typename Size = typename sequence_traits<Seq>::size_type
+    typename Offset1,
+    typename Offset2,
+    typename Iter = typename sequence_traits<Seq>::iterator
 >
 enable_if_t<is_forward_iterator<Iter>::value, range<Iter>>
-subrange_view(Seq& seq, Size bi, Size ei) {
-    Iter b = std::next(std::begin(seq), bi);
-    Iter e = std::next(b, ei-bi);
+subrange_view(Seq& seq, Offset1 bi, Offset2 ei) {
+    Iter b = std::begin(seq);
+    std::advance(b, bi);
+
+    Iter e = b;
+    std::advance(e, ei-bi);
     return make_range(b, e);
 }
 
 template <
     typename Seq,
-    typename Iter = typename sequence_traits<Seq>::iterator,
-    typename Size = typename sequence_traits<Seq>::size_type
+    typename Offset1,
+    typename Offset2,
+    typename Iter = typename sequence_traits<Seq>::iterator
 >
 enable_if_t<is_forward_iterator<Iter>::value, range<Iter>>
-subrange_view(Seq& seq, std::pair<Size, Size> index) {
-    Iter b = std::next(std::begin(seq), index.first);
-    Iter e = std::next(b, index.second-index.first);
-    return make_range(b, e);
+subrange_view(Seq& seq, std::pair<Offset1, Offset2> index) {
+    return subrange_view(seq, index.first, index.second);
+}
+
+// Fill container or range.
+
+template <typename Seq, typename V>
+void fill(Seq& seq, const V& value) {
+    auto canon = canonical_view(seq);
+    std::fill(canon.begin(), canon.end(), value);
+}
+
+template <typename Range, typename V>
+void fill(const Range& seq, const V& value) {
+    auto canon = canonical_view(seq);
+    std::fill(canon.begin(), canon.end(), value);
 }
 
 // Append sequence to a container
@@ -238,7 +255,7 @@ max_element_by(const Seq& seq, const Proj& proj) {
         });
 }
 
-// Maximum value
+// Maximum value.
 //
 // Value semantics instead of iterator semantics means it will operate
 // with input iterators.  Will return default-constructed value if sequence
@@ -259,7 +276,7 @@ Value max_value(const Seq& seq, Compare cmp = Compare{}) {
 
     auto i = std::begin(seq);
     auto e = std::end(seq);
-    auto m = *i;
+    Value m = *i;
     while (++i!=e) {
         Value x = *i;
         if (cmp(m, x)) {
@@ -269,6 +286,34 @@ Value max_value(const Seq& seq, Compare cmp = Compare{}) {
     return m;
 }
 
+// Minimum and maximum value.
+
+template <
+    typename Seq,
+    typename Value = typename sequence_traits<Seq>::value_type,
+    typename Compare = std::less<Value>
+>
+std::pair<Value, Value> minmax_value(const Seq& seq, Compare cmp = Compare{}) {
+    if (util::empty(seq)) {
+        return {Value{}, Value{}};
+    }
+
+    auto i = std::begin(seq);
+    auto e = std::end(seq);
+    Value lower = *i;
+    Value upper = *i;
+    while (++i!=e) {
+        Value x = *i;
+        if (cmp(upper, x)) {
+            upper = std::move(x);
+        }
+        else if (cmp(x, lower)) {
+            lower = std::move(x);
+        }
+    }
+    return {lower, upper};
+}
+
 template <typename C, typename Seq>
 C make_copy(Seq const& seq) {
     return C{std::begin(seq), std::end(seq)};
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 06899d2d9498959dbf44116ec2d75700c86d39bb..789969bd47d4dbb968d5c0a771fcfd7587f23b6a 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -21,6 +21,7 @@ set(TEST_CUDA_SOURCES
     test_mc_cell_group.cu
     test_gpu_stack.cu
     test_matrix.cu
+    test_multi_event_stream.cu
     test_spikes.cu
     test_vector.cu
 
@@ -50,6 +51,7 @@ set(TEST_SOURCES
     test_math.cpp
     test_matrix.cpp
     test_mechanisms.cpp
+    test_multi_event_stream.cpp
     test_nop.cpp
     test_optional.cpp
     test_parameters.cpp
@@ -89,6 +91,8 @@ endif()
 target_include_directories(test.exe PRIVATE "${mech_proto_dir}/..")
 
 if(NMC_WITH_CUDA)
+    # Omit -DDATADIR for cuda target because of CMake 3.7/3.8 FindCUDA quoting bug.
+
     set(TARGETS ${TARGETS} test_cuda.exe)
     cuda_add_executable(test_cuda.exe ${TEST_CUDA_SOURCES} ${HEADERS})
 endif()
@@ -108,3 +112,9 @@ foreach(target ${TARGETS})
        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
     )
 endforeach()
+
+# Temporary hack (pending PR #266)
+if(NMC_WITH_CUDA)
+    target_link_libraries(test_cuda.exe LINK_PUBLIC nestmc)
+endif()
+
diff --git a/tests/unit/test_backend.cpp b/tests/unit/test_backend.cpp
index 08679d0d0b6e750e1611cbf0a9cbfc87ca68975f..109d75962f6772a90503ea8589ff617d926c11e0 100644
--- a/tests/unit/test_backend.cpp
+++ b/tests/unit/test_backend.cpp
@@ -17,7 +17,7 @@ TEST(backends, gpu_is_null) {
 
         EXPECT_FALSE(backend::has_mechanism("hh"));
         EXPECT_THROW(
-            backend::make_mechanism("hh", backend::view(), backend::view(), {}, {}),
+            backend::make_mechanism("hh", 0, backend::const_iview(), backend::const_view(), backend::const_view(), backend::const_view(), backend::view(), backend::view(), {}, {}),
             std::runtime_error);
     }
 }
diff --git a/tests/unit/test_fvm_multi.cpp b/tests/unit/test_fvm_multi.cpp
index 7f491f9fb7d7b748e3743b081a5351bf4dbe364b..6a9863191e2f71d4bc1733caabb15fc4311cda55 100644
--- a/tests/unit/test_fvm_multi.cpp
+++ b/tests/unit/test_fvm_multi.cpp
@@ -209,14 +209,17 @@ TEST(fvm_multi, stimulus)
 
     // test 1: Test that no current is injected at t=0
     memory::fill(I, 0.);
-    stims->set_params(0, 0.1);
+    fvcell.set_time_global(0.);
+    fvcell.set_time_to_global(0.1);
+    stims->set_params();
     stims->nrn_current();
     for (auto i: I) {
         EXPECT_EQ(i, 0.);
     }
 
     // test 2: Test that current is injected at soma at t=1
-    stims->set_params(1, 0.1);
+    fvcell.set_time_global(1.);
+    fvcell.set_time_to_global(1.1);
     stims->nrn_current();
     EXPECT_EQ(I[soma_idx], -0.1);
 
@@ -224,13 +227,16 @@ TEST(fvm_multi, stimulus)
     //         Note that we test for injection of -0.2, because the
     //         current contributions are accumulative, and the current
     //         values have not been cleared since the last update.
-    stims->set_params(1.5, 0.1);
+    fvcell.set_time_global(1.5);
+    fvcell.set_time_to_global(1.6);
+    stims->set_params();
     stims->nrn_current();
     EXPECT_EQ(I[soma_idx], -0.2);
 
     // test 4: test at t=10ms, when the the soma stim is not active, and
     //         dendrite stimulus is injecting a current of 0.3 nA
-    stims->set_params(10, 0.1);
+    fvcell.set_time_global(10.);
+    fvcell.set_time_to_global(10.1);
     stims->nrn_current();
     EXPECT_EQ(I[soma_idx], -0.2);
     EXPECT_EQ(I[dend_idx], -0.3);
@@ -385,10 +391,11 @@ void run_target_handle_test(std::vector<handle_info> all_handles) {
     for (unsigned ci = 0; ci<=1; ++ci) {
         for (auto h: handles[ci]) {
             // targets are represented by a pair of mechanism index and instance index
-            const auto& mech = fvcell.mechanisms()[targets[i].first];
+            const auto& mech = fvcell.mechanisms()[targets[i].mech_id];
             const auto& cvidx = mech->node_index();
             EXPECT_EQ(h.mech, mech->name());
-            EXPECT_EQ(h.cv, cvidx[targets[i].second]);
+            EXPECT_EQ(h.cv, cvidx[targets[i].index]);
+            EXPECT_EQ(h.cell, targets[i].cell_index);
             ++i;
         }
     }
diff --git a/tests/unit/test_matrix.cpp b/tests/unit/test_matrix.cpp
index f74a6760047119678ef8192d1bca26c1ab15350c..7cc1e959acbc17dba8620d467122958219c4d457 100644
--- a/tests/unit/test_matrix.cpp
+++ b/tests/unit/test_matrix.cpp
@@ -8,6 +8,8 @@
 #include <backends/multicore/fvm.hpp>
 #include <util/span.hpp>
 
+#include "common.hpp"
+
 using namespace nest::mc;
 
 using matrix_type = matrix<nest::mc::multicore::backend>;
@@ -77,3 +79,96 @@ TEST(matrix, solve_host)
         }
     }
 }
+
+TEST(matrix, zero_diagonal)
+{
+    // Combined matrix may have zero-blocks, corresponding to a zero dt.
+    // Zero-blocks are indicated by zero value in the diagonal (the off-diagonal
+    // elements should be ignored).
+    // These submatrices should leave the rhs as-is when solved.
+
+    // Three matrices, sizes 3, 3 and 2, with no branching.
+    std::vector<size_type> p = {0, 0, 1, 3, 3, 5, 5};
+    std::vector<size_type> c = {0, 3, 5, 7};
+    matrix_type m(p, c, vvec(7), vvec(7));
+
+    EXPECT_EQ(7u, m.size());
+    EXPECT_EQ(3u, m.num_cells());
+
+    auto& A = m.state_;
+    A.d =   vvec({2,  3,  2, 0,  0,  4,  5});
+    A.u =   vvec({0, -1, -1, 0, -1,  0, -2});
+    A.rhs = vvec({3,  5,  7, 7,  8, 16, 32});
+
+    // Expected solution:
+    std::vector<value_type> expected = {4, 5, 6, 7, 8, 9, 10};
+
+    m.solve();
+    auto x = m.solution();
+
+    EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
+}
+
+TEST(matrix, zero_diagonal_assembled)
+{
+    // Use assemble method to construct same zero-diagonal
+    // test case from CV data.
+
+    using util::assign;
+    using memory::make_view;
+
+    // Combined matrix may have zero-blocks, corresponding to a zero dt.
+    // Zero-blocks are indicated by zero value in the diagonal (the off-diagonal
+    // elements should be ignored).
+    // These submatrices should leave the rhs as-is when solved.
+
+    // Three matrices, sizes 3, 3 and 2, with no branching.
+    std::vector<size_type> p = {0, 0, 1, 3, 3, 5, 5};
+    std::vector<size_type> c = {0, 3, 5, 7};
+
+    // Face conductances.
+    vvec g = {0, 1, 1, 0, 1, 0, 2};
+
+    // dt of 1e-3.
+    vvec dt(3, 1.0e-3);
+
+    // Capacitances.
+    vvec Cm = {1, 1, 1, 1, 1, 2, 3};
+
+    // Initial voltage of zero; currents alone determine rhs.
+    vvec v(7, 0.0);
+    vvec i = {-3, -5, -7, -6, -9, -16, -32};
+
+    // Expected matrix and rhs:
+    // u = [ 0 -1 -1  0 -1  0 -2]
+    // d = [ 2  3  2  2  2  4  5]
+    // b = [ 3  5  7  2  4 16 32]
+    //
+    // Expected solution:
+    // x = [ 4  5  6  7  8  9 10]
+
+    matrix_type m(p, c, Cm, g);
+    m.assemble(make_view(dt), make_view(v), make_view(i));
+    m.solve();
+
+    vvec x;
+    assign(x, on_host(m.solution()));
+    std::vector<value_type> expected = {4, 5, 6, 7, 8, 9, 10};
+
+    EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
+
+    // Set dt of 2nd (middle) submatrix to zero. Solution
+    // should then return voltage values for that submatrix.
+
+    dt[1] = 0;
+    v[3] = 20;
+    v[4] = 30;
+    m.assemble(make_view(dt), make_view(v), make_view(i));
+    m.solve();
+
+    assign(x, m.solution());
+    expected = {4, 5, 6, 20, 30, 9, 10};
+
+    EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
+}
+
diff --git a/tests/unit/test_matrix.cu b/tests/unit/test_matrix.cu
index dac64cdf78e45b1bf0f53aaaf435e596f776c788..3e7a730d82f1cc1e8683c2a42f860d482cbc9f27 100644
--- a/tests/unit/test_matrix.cu
+++ b/tests/unit/test_matrix.cu
@@ -12,6 +12,8 @@
 #include <memory/memory.hpp>
 #include <util/span.hpp>
 
+#include <cuda.h>
+
 using namespace nest::mc;
 
 using gpu::impl::npos;
@@ -265,7 +267,6 @@ TEST(matrix, assemble)
 
     // Build the capacitance and conductance vectors and
     // populate with nonzero random values.
-
     auto gen  = std::mt19937();
     auto dist = std::uniform_real_distribution<T>(1, 2);
 
@@ -275,20 +276,26 @@ TEST(matrix, assemble)
     std::vector<T> g(group_size);
     std::generate(g.begin(), g.end(), [&](){return dist(gen);});
 
-    // Make the referenace matrix and the gpu matrix
+    // Make the reference matrix and the gpu matrix
     auto m_mc  = mc_state( p, cell_index, Cm, g); // on host
     auto m_gpu = gpu_state(p, cell_index, Cm, g); // on gpu
 
+    // Set the integration times for the cells to be between 0.1 and 0.2 ms.
+    std::vector<T> dt(num_mtx);
+
+    auto dt_dist = std::uniform_real_distribution<T>(0.1, 0.2);
+    std::generate(dt.begin(), dt.end(), [&](){return dt_dist(gen);});
+
     // Voltage and current values
-    m_mc.assemble( 0.2, host_array(group_size, -64), host_array(group_size, 10));
+    m_mc.assemble(host_array(dt), host_array(group_size, -64), host_array(group_size, 10));
     m_mc.solve();
-    m_gpu.assemble(0.2, gpu_array(group_size, -64),  gpu_array(group_size, 10));
+    m_gpu.assemble(on_gpu(dt), gpu_array(group_size, -64), gpu_array(group_size, 10));
     m_gpu.solve();
 
     // Compare the GPU and CPU results.
     // Cast result to float, because we are happy to ignore small differencs
-    std::vector<float> result_h = util::assign_from(m_mc.solution);
-    std::vector<float> result_g = util::assign_from(on_host(m_gpu.solution));
+    std::vector<float> result_h = util::assign_from(m_mc.solution());
+    std::vector<float> result_g = util::assign_from(on_host(m_gpu.solution()));
     EXPECT_TRUE(seq_almost_eq<float>(result_h, result_g));
 }
 
@@ -339,22 +346,21 @@ TEST(matrix, backends)
     const int num_mtx = 200;
 
     std::vector<I> p;
-    std::vector<I> cell_index;
+    std::vector<I> cell_cv_divs;
     for (auto m=0; m<num_mtx; ++m) {
         auto &p_ref = p_base[m%2];
         auto first = p.size();
         for (auto i: p_ref) {
             p.push_back(i + first);
         }
-        cell_index.push_back(first);
+        cell_cv_divs.push_back(first);
     }
-    cell_index.push_back(p.size());
+    cell_cv_divs.push_back(p.size());
 
-    auto group_size = cell_index.back();
+    auto group_size = cell_cv_divs.back();
 
     // Build the capacitance and conductance vectors and
     // populate with nonzero random values
-
     auto gen  = std::mt19937();
     gen.seed(100);
     auto dist = std::uniform_real_distribution<T>(1, 200);
@@ -369,13 +375,23 @@ TEST(matrix, backends)
     std::generate(v.begin(), v.end(), [&](){return dist(gen);});
     std::generate(i.begin(), i.end(), [&](){return dist(gen);});
 
-    // Make the referenace matrix and the gpu matrix
-    auto flat = state_flat(p, cell_index, Cm, g); // flat
-    auto intl = state_intl(p, cell_index, Cm, g); // interleaved
+    // Make the reference matrix and the gpu matrix
+    auto flat = state_flat(p, cell_cv_divs, Cm, g); // flat
+    auto intl = state_intl(p, cell_cv_divs, Cm, g); // interleaved
 
-    // voltage and current values
-    flat.assemble(0.02, on_gpu(v), on_gpu(i));
-    intl.assemble(0.02, on_gpu(v), on_gpu(i));
+    // Set the integration times for the cells to be between 0.01 and 0.02 ms.
+    std::vector<T> dt(num_mtx, 0);
+
+    auto dt_dist = std::uniform_real_distribution<T>(0.01, 0.02);
+    std::generate(dt.begin(), dt.end(), [&](){return dt_dist(gen);});
+
+    // Voltage and current values.
+    auto gpu_dt = on_gpu(dt);
+    auto gpu_v = on_gpu(v);
+    auto gpu_i = on_gpu(i);
+
+    flat.assemble(gpu_dt, gpu_v, gpu_i);
+    intl.assemble(gpu_dt, gpu_v, gpu_i);
 
     flat.solve();
     intl.solve();
@@ -383,7 +399,80 @@ TEST(matrix, backends)
     // Compare the results.
     // We expect exact equality for the two gpu matrix implementations because both
     // perform the same operations in the same order on the same inputs.
-    std::vector<double> x_flat = assign_from(on_host(flat.solution));
-    std::vector<double> x_intl = assign_from(on_host(intl.solution));
+    std::vector<double> x_flat = assign_from(on_host(flat.solution()));
+    std::vector<double> x_intl = assign_from(on_host(intl.solution()));
     EXPECT_EQ(x_flat, x_intl);
 }
+
+// Test for special zero diagonal behaviour. (see `test_matrix.cpp`.)
+TEST(matrix, zero_diagonal)
+{
+    using util::assign;
+
+    using value_type = gpu::backend::value_type;
+    using size_type = gpu::backend::size_type;
+    using matrix_type = gpu::backend::matrix_state;
+    using vvec = std::vector<value_type>;
+
+    // Combined matrix may have zero-blocks, corresponding to a zero dt.
+    // Zero-blocks are indicated by zero value in the diagonal (the off-diagonal
+    // elements should be ignored).
+    // These submatrices should leave the rhs as-is when solved.
+
+    // Three matrices, sizes 3, 3 and 2, with no branching.
+    std::vector<size_type> p = {0, 0, 1, 3, 3, 5, 5};
+    std::vector<size_type> c = {0, 3, 5, 7};
+
+    // Face conductances.
+    std::vector<value_type> g = {0, 1, 1, 0, 1, 0, 2};
+
+    // dt of 1e-3.
+    std::vector<value_type> dt(3, 1.0e-3);
+
+    // Capacitances.
+    std::vector<value_type> Cm = {1, 1, 1, 1, 1, 2, 3};
+
+    // Initial voltage of zero; currents alone determine rhs.
+    std::vector<value_type> v(7, 0.0);
+    std::vector<value_type> i = {-3, -5, -7, -6, -9, -16, -32};
+
+    // Expected matrix and rhs:
+    // u = [ 0 -1 -1  0 -1  0 -2]
+    // d = [ 2  3  2  2  2  4  5]
+    // b = [ 3  5  7  2  4 16 32]
+    //
+    // Expected solution:
+    // x = [ 4  5  6  7  8  9 10]
+
+    matrix_type m(p, c, Cm, g);
+    auto gpu_dt = on_gpu(dt);
+    auto gpu_v  = on_gpu(v);
+    auto gpu_i  = on_gpu(i);
+    m.assemble(gpu_dt, gpu_v, gpu_i);
+    m.solve();
+
+    vvec x;
+    assign(x, on_host(m.solution()));
+    std::vector<value_type> expected = {4, 5, 6, 7, 8, 9, 10};
+
+    EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
+
+    // Set dt of 2nd (middle) submatrix to zero. Solution
+    // should then return voltage values for that submatrix.
+
+    dt[1] = 0;
+    gpu_dt = on_gpu(dt);
+
+    v[3] = 20;
+    v[4] = 30;
+    gpu_v  = on_gpu(v);
+
+    m.assemble(gpu_dt, gpu_v, gpu_i);
+    m.solve();
+
+    assign(x, on_host(m.solution()));
+    expected = {4, 5, 6, 20, 30, 9, 10};
+
+    EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x));
+}
+
diff --git a/tests/unit/test_mechanisms.cpp b/tests/unit/test_mechanisms.cpp
index f91d3bf592a518d8c64c5dfbdb6d893ae0386e0e..0f53af29f86132c75a3b5706a2582eb8fd3a3c99 100644
--- a/tests/unit/test_mechanisms.cpp
+++ b/tests/unit/test_mechanisms.cpp
@@ -34,22 +34,32 @@ TEST(mechanisms, helpers) {
     EXPECT_TRUE(multicore::backend::has_mechanism("pas"));
 
     std::vector<size_type> parent_index = {0,0,1,2,3,4,0,6,7,8};
-    auto node_indices = std::vector<size_type>{0,6,7,8,9};
-    auto weights = std::vector<value_type>(node_indices.size(), 1.0);
-    auto n = node_indices.size();
+    auto node_index = std::vector<size_type>{0,6,7,8,9};
+    auto weights = std::vector<value_type>(node_index.size(), 1.0);
+    auto n = node_index.size();
+
+    // one cell
+    size_type ncell = 1;
+    std::vector<size_type> cell_index(n, 0u);
 
     multicore::backend::array vec_i(n, 0.);
     multicore::backend::array vec_v(n, 0.);
+    multicore::backend::array vec_t(ncell, 0.);
+    multicore::backend::array vec_t_to(ncell, 0.);
+    multicore::backend::array vec_dt(n, 0.);
 
-    auto mech = multicore::backend::make_mechanism(
-            "hh", memory::make_view(vec_v), memory::make_view(vec_i), weights, node_indices);
+    auto mech = multicore::backend::make_mechanism("hh", 0,
+            memory::make_view(cell_index), vec_t, vec_t_to, vec_dt,
+            vec_v, vec_i, weights, node_index);
 
     EXPECT_EQ(mech->name(), "hh");
     EXPECT_EQ(mech->size(), 5u);
 
     // check that an out_of_range exception is thrown if an invalid mechanism is requested
     ASSERT_THROW(
-        multicore::backend::make_mechanism("dachshund", vec_v, vec_i, weights, node_indices),
+        multicore::backend::make_mechanism("dachshund", 0,
+            memory::make_view(cell_index), vec_t, vec_t_to, vec_dt,
+            vec_v, vec_i, weights, node_index),
         std::out_of_range
     );
 }
@@ -61,7 +71,7 @@ void mech_update(T* mech, unsigned num_iters) {
     using namespace nest::mc;
     std::map<mechanisms::ionKind, mechanisms::ion<typename T::backend>> ions;
 
-    mech->set_params(2., 0.1);
+    mech->set_params();
     mech->nrn_init();
     for (auto ion_kind : mechanisms::ion_kinds()) {
         auto ion_indexes = util::make_copy<std::vector<typename T::size_type>>(
@@ -124,13 +134,21 @@ TYPED_TEST_P(mechanisms, update) {
     EXPECT_TRUE((std::is_same<typename proto_mechanism_type::array,
                               typename mechanism_type::array>::value));
 
-    auto num_syn = 32;
+    int num_cell = 1;
+    int num_syn = 32;
+    int num_comp = num_syn;
+
+    typename mechanism_type::iarray node_index(num_syn);
+    typename mechanism_type::array  voltage(num_comp, -65.0);
+    typename mechanism_type::array  current(num_comp,   1.0);
 
-    typename mechanism_type::iarray indexes(num_syn);
-    typename mechanism_type::array  voltage(num_syn, -65.0);
-    typename mechanism_type::array  current(num_syn,   1.0);
     typename mechanism_type::array  weights(num_syn,   1.0);
 
+    typename mechanism_type::iarray cell_index(num_comp, 0);
+    typename mechanism_type::array  time(num_cell, 2.);
+    typename mechanism_type::array  time_to(num_cell, 2.1);
+    typename mechanism_type::array  dt(num_comp, 2.1-2.);
+
     array_init(voltage, nest::mc::util::cyclic_view({ -65.0, -61.0, -63.0 }));
     array_init(current, nest::mc::util::cyclic_view({   1.0,   0.9,   1.1 }));
     array_init(weights, nest::mc::util::cyclic_view({ 1.0 }));
@@ -146,29 +164,30 @@ TYPED_TEST_P(mechanisms, update) {
 
     auto freq_begin = nest::mc::util::cyclic_view(index_freq).cbegin();
     auto freq = freq_begin;
-    auto index = indexes.begin();
-    while (index != indexes.end()) {
-        for (auto i = 0; i < *freq && index != indexes.end(); ++i) {
+    auto index = node_index.begin();
+    while (index != node_index.end()) {
+        for (auto i = 0; i < *freq && index != node_index.end(); ++i) {
             *index++ = freq - freq_begin;
         }
         ++freq;
     }
 
-
     // Copy indexes, voltage and current to use for the prototype mechanism
-    typename mechanism_type::iarray indexes_copy(indexes);
+    typename mechanism_type::iarray node_index_copy(node_index);
     typename mechanism_type::array  voltage_copy(voltage);
     typename mechanism_type::array  current_copy(current);
     typename mechanism_type::array  weights_copy(weights);
 
     // Create mechanisms
     auto mech = nest::mc::mechanisms::make_mechanism<mechanism_type>(
-        voltage, current, std::move(weights), std::move(indexes)
+        0, cell_index, time, time_to, dt,
+        voltage, current, std::move(weights), std::move(node_index)
     );
 
     auto mech_proto = nest::mc::mechanisms::make_mechanism<proto_mechanism_type>(
+        0, cell_index, time, time_to, dt,
         voltage_copy, current_copy,
-        std::move(weights_copy), std::move(indexes_copy)
+        std::move(weights_copy), std::move(node_index_copy)
     );
 
     mech_update(dynamic_cast<mechanism_type*>(mech.get()), 10);
diff --git a/tests/unit/test_multi_event_stream.cpp b/tests/unit/test_multi_event_stream.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..754b9f7071fb9d7a6a28ecce9e101627689626ed
--- /dev/null
+++ b/tests/unit/test_multi_event_stream.cpp
@@ -0,0 +1,168 @@
+#include <vector>
+#include "../gtest.h"
+
+#include <backends/event.hpp>
+#include <backends/multicore/multi_event_stream.hpp>
+
+using namespace nest::mc;
+
+namespace common_events {
+    // set up four targets across three streams and two mech ids.
+
+    constexpr cell_local_size_type mech_1 = 10u;
+    constexpr cell_local_size_type mech_2 = 13u;
+    constexpr cell_size_type cell_1 = 20u;
+    constexpr cell_size_type cell_2 = 77u;
+    constexpr cell_size_type cell_3 = 33u;
+    constexpr cell_size_type n_cell = 100u;
+
+    target_handle handle[4] = {
+        target_handle(mech_1, 0u, cell_1),
+        target_handle(mech_2, 1u, cell_2),
+        target_handle(mech_1, 4u, cell_2),
+        target_handle(mech_2, 2u, cell_3)
+    };
+
+    // cell_1 (handle 0) has one event at t=3
+    // cell_2 (handle 1 and 2) has two events at t=2 and t=5
+    // cell_3 (handle 3) has one event at t=3
+
+    std::vector<deliverable_event> events = {
+        deliverable_event(2.f, handle[1], 2.f),
+        deliverable_event(3.f, handle[0], 1.f),
+        deliverable_event(3.f, handle[3], 4.f),
+        deliverable_event(5.f, handle[2], 3.f)
+    };
+}
+
+TEST(multi_event_stream, init) {
+    using multi_event_stream = multicore::multi_event_stream;
+    using namespace common_events;
+
+    multi_event_stream m(n_cell);
+    EXPECT_EQ(n_cell, m.n_streams());
+
+    m.init(events);
+    EXPECT_FALSE(m.empty());
+
+    m.clear();
+    EXPECT_TRUE(m.empty());
+}
+
+TEST(multi_event_stream, mark) {
+    using multi_event_stream = multicore::multi_event_stream;
+    using namespace common_events;
+
+    multi_event_stream m(n_cell);
+    ASSERT_EQ(n_cell, m.n_streams());
+
+    m.init(events);
+
+    for (cell_size_type i = 0; i<n_cell; ++i) {
+        EXPECT_TRUE(m.marked_events(i).empty());
+    }
+
+    std::vector<time_type> t_until(n_cell);
+    t_until[cell_1] = 2.f;
+    t_until[cell_2] = 2.5f;
+    t_until[cell_3] = 4.f;
+    m.mark_until_after(t_until);
+
+    // Only two events should be marked: 
+    //     events[0] (with handle 1) at t=2.f on cell_2
+    //     events[2] (with handle 3) at t=3.f on cell_3
+
+    for (cell_size_type i = 0; i<n_cell; ++i) {
+        auto evs = m.marked_events(i);
+        auto n_marked = evs.size();
+        switch (i) {
+        case cell_2:
+            EXPECT_EQ(1u, n_marked);
+            EXPECT_EQ(handle[1].mech_id, evs.front().handle.mech_id);
+            EXPECT_EQ(handle[1].index, evs.front().handle.index);
+            break;
+        case cell_3:
+            EXPECT_EQ(1u, n_marked);
+            EXPECT_EQ(handle[3].mech_id, evs.front().handle.mech_id);
+            EXPECT_EQ(handle[3].index, evs.front().handle.index);
+            break;
+        default:
+            EXPECT_EQ(0u, n_marked);
+            break;
+        }
+    }
+
+    // Drop these events and mark all events up to t=5.f.
+    //     cell_1 should have one marked event (events[1], handle 0)
+    //     cell_2 should have one marked event (events[3], handle 2)
+
+    m.drop_marked_events();
+    t_until.assign(n_cell, 5.f);
+    m.mark_until_after(t_until);
+
+    for (cell_size_type i = 0; i<n_cell; ++i) {
+        auto evs = m.marked_events(i);
+        auto n_marked = evs.size();
+        switch (i) {
+        case cell_1:
+            EXPECT_EQ(1u, n_marked);
+            EXPECT_EQ(handle[0].mech_id, evs.front().handle.mech_id);
+            EXPECT_EQ(handle[0].index, evs.front().handle.index);
+            break;
+        case cell_2:
+            EXPECT_EQ(1u, n_marked);
+            EXPECT_EQ(handle[2].mech_id, evs.front().handle.mech_id);
+            EXPECT_EQ(handle[2].index, evs.front().handle.index);
+            break;
+        default:
+            EXPECT_EQ(0u, n_marked);
+            break;
+        }
+    }
+
+    // No more events after these.
+    EXPECT_FALSE(m.empty());
+    m.drop_marked_events();
+    EXPECT_TRUE(m.empty());
+}
+
+TEST(multi_event_stream, time_if_before) {
+    using multi_event_stream = multicore::multi_event_stream;
+    using namespace common_events;
+
+    multi_event_stream m(n_cell);
+    ASSERT_EQ(n_cell, m.n_streams());
+
+    m.init(events);
+
+    // Test times less than all event times (first event at t=2).
+    std::vector<double> before(n_cell);
+    std::vector<double> after;
+
+    for (unsigned i = 0; i<n_cell; ++i) {
+	before[i] = 0.1+i/(double)n_cell;
+    }
+
+    std::vector<double> t(before);
+    m.event_time_if_before(t);
+
+    EXPECT_EQ(before, t);
+
+    // With times between 2 and 3, expect the event at time t=2
+    // on cell_2 to restrict corresponding element of t.
+
+    for (unsigned i = 0; i<n_cell; ++i) {
+	before[i] = 2.1+0.5*i/(double)n_cell;
+    }
+    t = before;
+    m.event_time_if_before(t);
+
+    for (unsigned i = 0; i<n_cell; ++i) {
+	if (i==cell_2) {
+	    EXPECT_EQ(2., t[i]);
+	}
+	else {
+	    EXPECT_EQ(before[i], t[i]);
+	}
+    }
+}
diff --git a/tests/unit/test_multi_event_stream.cu b/tests/unit/test_multi_event_stream.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a6d5d3a5a5ec440b9f16a6fcd5efeff6f939eaa7
--- /dev/null
+++ b/tests/unit/test_multi_event_stream.cu
@@ -0,0 +1,242 @@
+#include <cstdio>
+#include <random>
+#include <vector>
+
+#include <cuda.h>
+#include "../gtest.h"
+
+#include <backends/event.hpp>
+#include <backends/gpu/multi_event_stream.hpp>
+#include <backends/gpu/kernels/time_ops.hpp>
+#include <memory/wrappers.hpp>
+#include <util/rangeutil.hpp>
+
+using namespace nest::mc;
+
+namespace common_events {
+    // set up four targets across three streams and two mech ids.
+
+    constexpr cell_local_size_type mech_1 = 10u;
+    constexpr cell_local_size_type mech_2 = 13u;
+    constexpr cell_size_type cell_1 = 20u;
+    constexpr cell_size_type cell_2 = 77u;
+    constexpr cell_size_type cell_3 = 33u;
+    constexpr cell_size_type n_cell = 100u;
+
+    target_handle handle[4] = {
+        target_handle(mech_1, 0u, cell_1),
+        target_handle(mech_2, 1u, cell_2),
+        target_handle(mech_1, 4u, cell_2),
+        target_handle(mech_2, 2u, cell_3)
+    };
+
+    // cell_1 (handle 0) has one event at t=3
+    // cell_2 (handle 1 and 2) has two events at t=2 and t=5
+    // cell_3 (handle 3) has one event at t=3
+
+    std::vector<deliverable_event> events = {
+        deliverable_event(2.f, handle[1], 2.f),
+        deliverable_event(3.f, handle[0], 1.f),
+        deliverable_event(3.f, handle[3], 4.f),
+        deliverable_event(5.f, handle[2], 3.f)
+    };
+}
+
+TEST(multi_event_stream, init) {
+    using multi_event_stream = gpu::multi_event_stream;
+    using namespace common_events;
+
+    multi_event_stream m(n_cell);
+    EXPECT_EQ(n_cell, m.n_streams());
+
+    m.init(events);
+    EXPECT_FALSE(m.empty());
+
+    m.clear();
+    EXPECT_TRUE(m.empty());
+}
+
+struct ev_info {
+    unsigned mech_id;
+    unsigned index;
+    double weight;
+};
+
+__global__
+void copy_marked_events_kernel(
+    unsigned ci,
+    gpu::multi_event_stream::span_state state,
+    ev_info* store,
+    unsigned& count,
+    unsigned max_ev)
+{
+    // use only one thread here
+    if (threadIdx.x || blockIdx.x) return;
+
+    unsigned k = 0;
+    for (auto j = state.span_begin[ci]; j<state.mark[ci]; ++j) {
+        if (k>=max_ev) break;
+        store[k++] = {state.ev_mech_id[j], state.ev_index[j], state.ev_weight[j]};
+    }
+    count = k;
+}
+
+std::vector<ev_info> copy_marked_events(int ci, gpu::multi_event_stream& m) {
+    unsigned max_ev = 1000;
+    memory::device_vector<ev_info> store(max_ev);
+    memory::device_vector<unsigned> counter(1);
+
+    copy_marked_events_kernel<<<1,1>>>(ci, m.delivery_data(), store.data(), *counter.data(), max_ev);
+    unsigned n_ev = counter[0];
+    std::vector<ev_info> ev(n_ev);
+    memory::copy(store(0, n_ev), ev);
+    return ev;
+}
+
+TEST(multi_event_stream, mark) {
+    using multi_event_stream = gpu::multi_event_stream;
+    using namespace common_events;
+
+    multi_event_stream m(n_cell);
+    ASSERT_EQ(n_cell, m.n_streams());
+
+    m.init(events);
+
+    for (cell_size_type i = 0; i<n_cell; ++i) {
+        EXPECT_TRUE(copy_marked_events(i, m).empty());
+    }
+
+    memory::device_vector<double> t_until(n_cell);
+    t_until[cell_1] = 2.;
+    t_until[cell_2] = 2.5;
+    t_until[cell_3] = 4.;
+
+    m.mark_until_after(t_until);
+
+    // Only two events should be marked: 
+    //     events[0] (with handle 1) at t=2 on cell_2
+    //     events[2] (with handle 3) at t=3 on cell_3
+
+    for (cell_size_type i = 0; i<n_cell; ++i) {
+        auto evs = copy_marked_events(i, m);
+        auto n_marked = evs.size();
+        switch (i) {
+        case cell_2:
+            ASSERT_EQ(1u, n_marked);
+            EXPECT_EQ(handle[1].mech_id, evs.front().mech_id);
+            EXPECT_EQ(handle[1].index, evs.front().index);
+            break;
+        case cell_3:
+            ASSERT_EQ(1u, n_marked);
+            EXPECT_EQ(handle[3].mech_id, evs.front().mech_id);
+            EXPECT_EQ(handle[3].index, evs.front().index);
+            break;
+        default:
+            EXPECT_EQ(0u, n_marked);
+            break;
+        }
+    }
+
+    // Drop these events and mark all events up to t=5.
+    //     cell_1 should have one marked event (events[1], handle 0)
+    //     cell_2 should have one marked event (events[3], handle 2)
+
+    m.drop_marked_events();
+    memory::fill(t_until, 5.);
+    m.mark_until_after(memory::make_view(t_until));
+
+    for (cell_size_type i = 0; i<n_cell; ++i) {
+        auto evs = copy_marked_events(i, m);
+        auto n_marked = evs.size();
+        switch (i) {
+        case cell_1:
+            ASSERT_EQ(1u, n_marked);
+            EXPECT_EQ(handle[0].mech_id, evs.front().mech_id);
+            EXPECT_EQ(handle[0].index, evs.front().index);
+            break;
+        case cell_2:
+            ASSERT_EQ(1u, n_marked);
+            EXPECT_EQ(handle[2].mech_id, evs.front().mech_id);
+            EXPECT_EQ(handle[2].index, evs.front().index);
+            break;
+        default:
+            EXPECT_EQ(0u, n_marked);
+            break;
+        }
+    }
+
+    // No more events after these.
+    EXPECT_FALSE(m.empty());
+    m.drop_marked_events();
+    EXPECT_TRUE(m.empty());
+}
+
+TEST(multi_event_stream, time_if_before) {
+    using multi_event_stream = gpu::multi_event_stream;
+    using namespace common_events;
+
+    multi_event_stream m(n_cell);
+    ASSERT_EQ(n_cell, m.n_streams());
+
+    m.init(events);
+
+    // Test times less than all event times (first event at t=2).
+    std::vector<double> before(n_cell);
+    std::vector<double> after;
+
+    for (unsigned i = 0; i<n_cell; ++i) {
+	before[i] = 0.1+i/(double)n_cell;
+    }
+
+    memory::device_vector<double> t = memory::on_gpu(before);
+    m.event_time_if_before(t);
+    util::assign(after, memory::on_host(t));
+
+    EXPECT_EQ(before, after);
+
+    // With times between 2 and 3, expect the event at time t=2
+    // on cell_2 to restrict corresponding element of t.
+
+    for (unsigned i = 0; i<n_cell; ++i) {
+	before[i] = 2.1+0.5*i/(double)n_cell;
+    }
+    t = memory::make_view(before);
+    m.event_time_if_before(t);
+    util::assign(after, memory::on_host(t));
+
+    for (unsigned i = 0; i<n_cell; ++i) {
+	if (i==cell_2) {
+	    EXPECT_EQ(2., after[i]);
+	}
+	else {
+	    EXPECT_EQ(before[i], after[i]);
+	}
+    }
+}
+
+TEST(multi_event_stream, any_time_before) {
+    constexpr std::size_t n = 10000;
+    std::minstd_rand R;
+    std::uniform_real_distribution<float> g(0, 10);
+
+    std::vector<double> t(n);
+    std::generate(t.begin(), t.end(), [&]{ return g(R); });
+
+    memory::device_vector<double> t0 = memory::on_gpu(t);
+
+    double tmin = *std::min_element(t.begin(), t.end());
+    EXPECT_TRUE(gpu::any_time_before(n, t0.data(), tmin+0.01));
+    EXPECT_FALSE(gpu::any_time_before(n, t0.data(), tmin));
+
+    memory::device_vector<double> t1 = memory::on_gpu(t);
+    EXPECT_FALSE(gpu::any_time_before(n, t0.data(), t1.data()));
+
+    t[2*n/3] += 20;
+    t1 = memory::on_gpu(t);
+    EXPECT_TRUE(gpu::any_time_before(n, t0.data(), t1.data()));
+
+    t[2*n/3] -= 30;
+    t1 = memory::on_gpu(t);
+    EXPECT_FALSE(gpu::any_time_before(n, t0.data(), t1.data()));
+}
+
diff --git a/tests/unit/test_range.cpp b/tests/unit/test_range.cpp
index fa2458faab208aa9677f295b267fd720be5c8331..e058604f7b31d1341328f42ee06efba97eaf550e 100644
--- a/tests/unit/test_range.cpp
+++ b/tests/unit/test_range.cpp
@@ -190,6 +190,31 @@ TEST(range, strictify) {
     EXPECT_EQ(cstr+11, ptr_range.right);
 }
 
+TEST(range, subrange) {
+    int values[] = {10, 11, 12, 13, 14, 15, 16};
+
+    // `subrange_view` should handle offsets of different integral types sanely.
+    auto sub1 = util::subrange_view(values, 1, 6u);
+    EXPECT_EQ(11, sub1[0]);
+    EXPECT_EQ(15, sub1.back());
+
+    // Should be able to take subranges of subranges, and modify underlying
+    // sequence.
+    auto sub2 = util::subrange_view(sub1, 3ull, (short)4);
+    EXPECT_EQ(1u, sub2.size());
+
+    sub2[0] = 23;
+    EXPECT_EQ(23, values[4]);
+
+    // Taking a subrange view of a const range over non-const iterators
+    // should still allow modification of underlying sequence.
+    const util::range<int*> const_view(values, values+4);
+    auto sub3 = util::subrange_view(const_view, std::make_pair(1, 3u));
+    sub3[1] = 42;
+    EXPECT_EQ(42, values[2]);
+}
+
+
 TEST(range, max_element_by) {
     const char *cstr = "helloworld";
     auto cstr_range = util::make_range(cstr, null_terminated);
@@ -223,6 +248,25 @@ TEST(range, max_value) {
     EXPECT_EQ('x', i);
 }
 
+TEST(range, minmax_value) {
+    auto cstr_empty_range = util::make_range((const char*)"", null_terminated);
+    auto p1 = util::minmax_value(cstr_empty_range);
+    EXPECT_EQ('\0', p1.first);
+    EXPECT_EQ('\0', p1.second);
+
+    const char *cstr = "hello world";
+    auto cstr_range = util::make_range(cstr, null_terminated);
+    auto p2 = util::minmax_value(cstr_range);
+    EXPECT_EQ(' ', p2.first);
+    EXPECT_EQ('w', p2.second);
+
+    auto p3 = util::minmax_value(
+        util::transform_view(cstr_range, [](char c) { return -(int)c; }));
+
+    EXPECT_EQ('w', -p3.first);
+    EXPECT_EQ(' ', -p3.second);
+}
+
 
 template <typename V>
 class counter_range: public ::testing::Test {};
@@ -325,6 +369,16 @@ TEST(range, assign) {
     EXPECT_EQ("00110", text);
 }
 
+TEST(range, fill) {
+    std::vector<char> aaaa(4);
+    util::fill(aaaa, 'a');
+    EXPECT_EQ("aaaa", std::string(aaaa.begin(), aaaa.end()));
+
+    char cstr[] = "howdy";
+    util::fill(util::make_range((char *)cstr, null_terminated), 'q');
+    EXPECT_EQ("qqqqq", std::string(cstr));
+}
+
 TEST(range, assign_from) {
     int in[] = {0,1,2};
 
diff --git a/tests/unit/test_spikes.cpp b/tests/unit/test_spikes.cpp
index 35ef18e7d7b2574b79097f06d2a87aff7c429451..5e759025138a493ebdc449b3ce7ac9c69c87e248 100644
--- a/tests/unit/test_spikes.cpp
+++ b/tests/unit/test_spikes.cpp
@@ -2,14 +2,27 @@
 
 #include <spike.hpp>
 #include <backends/multicore/fvm.hpp>
+#include <memory/memory.hpp>
+#include <util/rangeutil.hpp>
 
 using namespace nest::mc;
 
+// This source is included in `test_spikes.cu`, which defines
+// USE_BACKEND to override the default `multicore::backend`
+// used for CPU tests.
+
+#ifndef USE_BACKEND
+using backend = multicore::backend;
+#else
+using backend = USE_BACKEND;
+#endif
+
 TEST(spikes, threshold_watcher) {
     using backend = multicore::backend;
     using size_type = backend::size_type;
     using value_type = backend::value_type;
     using array = backend::array;
+    using iarray = backend::iarray;
     using list = backend::threshold_watcher::crossing_list;
 
     // the test creates a watch on 3 values in the array values (which has 10
@@ -24,11 +37,21 @@ TEST(spikes, threshold_watcher) {
     array values(n, 0);
     values[5] = 3.;
 
+    // the values are tied to two 'cells' with independent times:
+    // compartments [0, 5] -> cell 0
+    // compartments [6, 9] -> cell 1
+    iarray cell_index(n, 0);
+    for (unsigned i = 6; i<n; ++i) {
+        cell_index[i] = 1;
+    }
+    array time_before(2, 0.);
+    array time_after(2, 0.);
+
     // list for storing expected crossings for validation at the end
     list expected;
 
     // create the watch
-    backend::threshold_watcher watch(values, index, thresh, 0.f);
+    backend::threshold_watcher watch(cell_index, time_before, time_after, values, index, thresh);
 
     // initially the first and third watch should not be spiking
     //           the second is spiking
@@ -38,7 +61,8 @@ TEST(spikes, threshold_watcher) {
 
     // test again at t=1, with unchanged values
     //  - nothing should change
-    watch.test(1.);
+    util::fill(time_after, 1.);
+    watch.test();
     EXPECT_FALSE(watch.is_crossed(0));
     EXPECT_TRUE(watch.is_crossed(1));
     EXPECT_FALSE(watch.is_crossed(2));
@@ -47,30 +70,37 @@ TEST(spikes, threshold_watcher) {
     // test at t=2, with all values set to zero
     //  - 2nd watch should now stop spiking
     memory::fill(values, 0.);
-    watch.test(2.);
+    memory::copy(time_after, time_before);
+    util::fill(time_after, 2.);
+    watch.test();
     EXPECT_FALSE(watch.is_crossed(0));
     EXPECT_FALSE(watch.is_crossed(1));
     EXPECT_FALSE(watch.is_crossed(2));
     EXPECT_EQ(watch.crossings().size(), 0u);
 
-    // test at t=3, with all values set to 4.
+    // test at t=(2.5, 3), with all values set to 4.
     //  - all watches should now be spiking
     memory::fill(values, 4.);
-    watch.test(3.);
+    memory::copy(time_after, time_before);
+    time_after[0] = 2.5;
+    time_after[1] = 3.0;
+    watch.test();
     EXPECT_TRUE(watch.is_crossed(0));
     EXPECT_TRUE(watch.is_crossed(1));
     EXPECT_TRUE(watch.is_crossed(2));
     EXPECT_EQ(watch.crossings().size(), 3u);
 
     // record the expected spikes
-    expected.push_back({0u, 2.25f});
-    expected.push_back({1u, 2.50f});
-    expected.push_back({2u, 2.75f});
+    expected.push_back({0u, 2.125f}); // 2. + (2.5-2)*(1./4.)
+    expected.push_back({1u, 2.250f}); // 2. + (2.5-2)*(2./4.)
+    expected.push_back({2u, 2.750f}); // 2. + (3.0-2)*(3./4.)
 
     // test at t=4, with all values set to 0.
     //  - all watches should stop spiking
     memory::fill(values, 0.);
-    watch.test(4.);
+    memory::copy(time_after, time_before);
+    util::fill(time_after, 4.);
+    watch.test();
     EXPECT_FALSE(watch.is_crossed(0));
     EXPECT_FALSE(watch.is_crossed(1));
     EXPECT_FALSE(watch.is_crossed(2));
@@ -79,7 +109,9 @@ TEST(spikes, threshold_watcher) {
     // test at t=5, with value on 3rd watch set to 6
     //  - watch 3 should be spiking
     values[index[2]] = 6.;
-    watch.test(5.);
+    memory::copy(time_after, time_before);
+    util::fill(time_after, 5.);
+    watch.test();
     EXPECT_FALSE(watch.is_crossed(0));
     EXPECT_FALSE(watch.is_crossed(1));
     EXPECT_TRUE(watch.is_crossed(2));
@@ -109,11 +141,10 @@ TEST(spikes, threshold_watcher) {
     //
     // test that resetting works
     //
-    EXPECT_EQ(watch.last_test_time(), 5);
     memory::fill(values, 0);
     values[index[0]] = 10.; // first watch should be intialized to spiking state
-    watch.reset(0);
-    EXPECT_EQ(watch.last_test_time(), 0);
+    util::fill(time_before, 0.);
+    watch.reset();
     EXPECT_EQ(watch.crossings().size(), 0u);
     EXPECT_TRUE(watch.is_crossed(0));
     EXPECT_FALSE(watch.is_crossed(1));
diff --git a/tests/unit/test_spikes.cu b/tests/unit/test_spikes.cu
index 12bb89ee975b90020337b91b3acf4a4e5d482780..52d245a20b27f6e5d0e08d9998b0d6a24d4c4aec 100644
--- a/tests/unit/test_spikes.cu
+++ b/tests/unit/test_spikes.cu
@@ -1,122 +1,6 @@
-#include "../gtest.h"
+// Override backend class in `test_spikes.cpp`
 
-#include <spike.hpp>
 #include <backends/gpu/fvm.hpp>
+#define USE_BACKEND gpu::backend
 
-using namespace nest::mc;
-
-TEST(spikes, threshold_watcher) {
-    using backend = gpu::backend;
-    using size_type = backend::size_type;
-    using value_type = backend::value_type;
-    using array = backend::array;
-    using list = backend::threshold_watcher::crossing_list;
-
-    // the test creates a watch on 3 values in the array values (which has 10
-    // elements in total).
-    const auto n = 10;
-
-    const std::vector<size_type> index{0, 5, 7};
-    const std::vector<value_type> thresh{1., 2., 3.};
-
-    // all values are initially 0, except for values[5] which we set
-    // to exceed the threshold of 2. for the second watch
-    array values(n, 0);
-    values[5] = 3.;
-
-    // list for storing expected crossings for validation at the end
-    list expected;
-
-    // create the watch
-    backend::threshold_watcher watch(values, index, thresh, 0.f);
-
-    // initially the first and third watch should not be spiking
-    //           the second is spiking
-    EXPECT_FALSE(watch.is_crossed(0));
-    EXPECT_TRUE(watch.is_crossed(1));
-    EXPECT_FALSE(watch.is_crossed(2));
-
-    // test again at t=1, with unchanged values
-    //  - nothing should change
-    watch.test(1.);
-    EXPECT_FALSE(watch.is_crossed(0));
-    EXPECT_TRUE(watch.is_crossed(1));
-    EXPECT_FALSE(watch.is_crossed(2));
-    EXPECT_EQ(watch.crossings().size(), 0u);
-
-    // test at t=2, with all values set to zero
-    //  - 2nd watch should now stop spiking
-    memory::fill(values, 0.);
-    watch.test(2.);
-    EXPECT_FALSE(watch.is_crossed(0));
-    EXPECT_FALSE(watch.is_crossed(1));
-    EXPECT_FALSE(watch.is_crossed(2));
-    EXPECT_EQ(watch.crossings().size(), 0u);
-
-    // test at t=3, with all values set to 4.
-    //  - all watches should now be spiking
-    memory::fill(values, 4.);
-    watch.test(3.);
-    EXPECT_TRUE(watch.is_crossed(0));
-    EXPECT_TRUE(watch.is_crossed(1));
-    EXPECT_TRUE(watch.is_crossed(2));
-    EXPECT_EQ(watch.crossings().size(), 3u);
-
-    // record the expected spikes
-    expected.push_back({0u, 2.25f});
-    expected.push_back({1u, 2.50f});
-    expected.push_back({2u, 2.75f});
-
-    // test at t=4, with all values set to 0.
-    //  - all watches should stop spiking
-    memory::fill(values, 0.);
-    watch.test(4.);
-    EXPECT_FALSE(watch.is_crossed(0));
-    EXPECT_FALSE(watch.is_crossed(1));
-    EXPECT_FALSE(watch.is_crossed(2));
-    EXPECT_EQ(watch.crossings().size(), 3u);
-
-    // test at t=5, with value on 3rd watch set to 6
-    //  - watch 3 should be spiking
-    values[index[2]] = 6.;
-    watch.test(5.);
-    EXPECT_FALSE(watch.is_crossed(0));
-    EXPECT_FALSE(watch.is_crossed(1));
-    EXPECT_TRUE(watch.is_crossed(2));
-    EXPECT_EQ(watch.crossings().size(), 4u);
-    expected.push_back({2u, 4.5f});
-
-    //
-    // test that all generated spikes matched the expected values
-    //
-    if (expected.size()!=watch.crossings().size()) {
-        FAIL() << "count of recorded crosssings did not match expected count";
-    }
-    auto const& spikes = watch.crossings();
-    for (auto i=0u; i<expected.size(); ++i) {
-        EXPECT_EQ(expected[i], spikes[i]);
-    }
-
-    //
-    // test that clearing works
-    //
-    watch.clear_crossings();
-    EXPECT_EQ(watch.crossings().size(), 0u);
-    EXPECT_FALSE(watch.is_crossed(0));
-    EXPECT_FALSE(watch.is_crossed(1));
-    EXPECT_TRUE(watch.is_crossed(2));
-
-    //
-    // test that resetting works
-    //
-    EXPECT_EQ(watch.last_test_time(), 5);
-    memory::fill(values, 0);
-    values[index[0]] = 10.; // first watch should be intialized to spiking state
-    watch.reset(0);
-    EXPECT_EQ(watch.last_test_time(), 0);
-    EXPECT_EQ(watch.crossings().size(), 0u);
-    EXPECT_TRUE(watch.is_crossed(0));
-    EXPECT_FALSE(watch.is_crossed(1));
-    EXPECT_FALSE(watch.is_crossed(2));
-}
-
+#include "./test_spikes.cpp"
diff --git a/tests/unit/test_synapses.cpp b/tests/unit/test_synapses.cpp
index 3e9b8ef4aa5b903fbf494ab5e79d4a12581d5927..6a45685d501ba1e8405c75f88f1146f335b1f060 100644
--- a/tests/unit/test_synapses.cpp
+++ b/tests/unit/test_synapses.cpp
@@ -41,7 +41,6 @@ TEST(synapses, add_to_cell)
     EXPECT_EQ(syns[2].mechanism.name(), "expsyn");
 }
 
-// compares results with those generated by nrn/ball_and_stick.py
 TEST(synapses, expsyn_basic_state)
 {
     using namespace nest::mc;
@@ -49,14 +48,21 @@ TEST(synapses, expsyn_basic_state)
     using value_type = multicore::backend::value_type;
 
     using synapse_type = mechanisms::expsyn::mechanism_expsyn<multicore::backend>;
-    auto num_syn = 4;
+    int num_syn = 4;
+    int num_comp = 4;
+    int num_cell = 1;
+
+    synapse_type::iarray cell_index(num_comp, 0);
+    synapse_type::array time(num_cell, 0);
+    synapse_type::array time_to(num_cell, 0.1);
+    synapse_type::array dt(num_comp, 0.1);
 
-    std::vector<size_type> indexes(num_syn);
-    std::vector<value_type> weights(indexes.size(), 1.0);
-    synapse_type::array voltage(num_syn, -65.0);
-    synapse_type::array current(num_syn,   1.0);
-    auto mech = mechanisms::make_mechanism<synapse_type>(voltage, current, weights, indexes);
+    std::vector<size_type> node_index(num_syn, 0);
+    std::vector<value_type> weights(num_syn, 1.0);
+    synapse_type::array voltage(num_comp, -65.0);
+    synapse_type::array current(num_comp,   1.0);
 
+    auto mech = mechanisms::make_mechanism<synapse_type>(0, cell_index, time, time_to, dt, voltage, current, weights, node_index);
     auto ptr = dynamic_cast<synapse_type*>(mech.get());
 
     auto n = ptr->size();
@@ -99,19 +105,25 @@ TEST(synapses, expsyn_basic_state)
 TEST(synapses, exp2syn_basic_state)
 {
     using namespace nest::mc;
+    using size_type = multicore::backend::size_type;
+    using value_type = multicore::backend::value_type;
 
     using synapse_type = mechanisms::exp2syn::mechanism_exp2syn<multicore::backend>;
-    auto num_syn = 4;
+    int num_syn = 4;
+    int num_comp = 4;
+    int num_cell = 1;
 
-    using size_type = multicore::backend::size_type;
-    using value_type = multicore::backend::value_type;
+    synapse_type::iarray cell_index(num_comp, 0);
+    synapse_type::array time(num_cell, 0);
+    synapse_type::array time_to(num_cell, 0.1);
+    synapse_type::array dt(num_comp, 0.1);
 
-    std::vector<size_type> indexes(num_syn);
-    std::vector<value_type> weights(indexes.size(), 1.0);
-    synapse_type::array voltage(num_syn, -65.0);
-    synapse_type::array current(num_syn,   1.0);
-    auto mech = mechanisms::make_mechanism<synapse_type>(voltage, current, weights, indexes);
+    std::vector<size_type> node_index(num_syn, 0);
+    std::vector<value_type> weights(num_syn, 1.0);
+    synapse_type::array voltage(num_comp, -65.0);
+    synapse_type::array current(num_comp,   1.0);
 
+    auto mech = mechanisms::make_mechanism<synapse_type>(0, cell_index, time, time_to, dt, voltage, current, weights, node_index);
     auto ptr = dynamic_cast<synapse_type*>(mech.get());
 
     auto n = ptr->size();