diff --git a/miniapp/io.cpp b/miniapp/io.cpp
index 97962f2931d19d673be5c0a35aa35c370bfd165f..887848b549b7bad46574b92ac9fcde74faae5d5c 100644
--- a/miniapp/io.cpp
+++ b/miniapp/io.cpp
@@ -8,7 +8,7 @@ namespace io {
 // for now this is just a placeholder
 options read_options(std::string fname) {
     // 10 cells, 1 synapses per cell, 10 compartments per segment
-    return {10, 1, 100};
+    return {200, 1, 100};
 }
 
 std::ostream& operator<<(std::ostream& o, const options& opt) {
diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp
index 15aae8e349227d90f6cabcf0ae1aedbf070d9ccf..18326d8db4df56bece5f422efe05c9590e82a684 100644
--- a/miniapp/miniapp.cpp
+++ b/miniapp/miniapp.cpp
@@ -11,7 +11,6 @@
 #include <profiling/profiler.hpp>
 #include <communication/communicator.hpp>
 #include <communication/serial_global_policy.hpp>
-#include <communication/mpi_global_policy.hpp>
 
 using namespace nest;
 
@@ -21,6 +20,7 @@ using id_type = uint32_t;
 using numeric_cell = mc::fvm::fvm_cell<real_type, index_type>;
 using cell_group   = mc::cell_group<numeric_cell>;
 #ifdef WITH_MPI
+#include <communication/mpi_global_policy.hpp>
 using communicator_type =
     mc::communication::communicator<mc::communication::mpi_global_policy>;
 #else
@@ -32,17 +32,6 @@ struct model {
     communicator_type communicator;
     std::vector<cell_group> cell_groups;
 
-    double time_init;
-    double time_network;
-    double time_solve;
-    double time_comms;
-    void print_times() const {
-        std::cout << "initialization took " << time_init << " s\n";
-        std::cout << "network        took " << time_network << " s\n";
-        std::cout << "solve          took " << time_solve << " s\n";
-        std::cout << "comms          took " << time_comms << " s\n";
-    }
-
     int num_groups() const {
         return cell_groups.size();
     }
@@ -50,30 +39,32 @@ struct model {
     void run(double tfinal, double dt) {
         auto t = 0.;
         auto delta = communicator.min_delay();
-        time_solve = 0.;
-        time_comms = 0.;
         while(t<tfinal) {
-            auto start_solve = mc::util::timer_type::tic();
             mc::threading::parallel_for::apply(
                 0, num_groups(),
                 [&](int i) {
+                        mc::util::profiler_enter("stepping","events");
                     cell_groups[i].enqueue_events(communicator.queue(i));
+                        mc::util::profiler_leave();
                     cell_groups[i].advance(t+delta, dt);
+                        mc::util::profiler_enter("events");
                     communicator.add_spikes(cell_groups[i].spikes());
                     cell_groups[i].clear_spikes();
+                        mc::util::profiler_leave(2);
                 }
             );
-            time_solve += mc::util::timer_type::toc(start_solve);
 
-            auto start_comms = mc::util::timer_type::tic();
+                mc::util::profiler_enter("stepping", "exchange");
             communicator.exchange();
-            time_comms += mc::util::timer_type::toc(start_comms);
+                mc::util::profiler_leave(2);
 
             t += delta;
         }
     }
 
     void init_communicator() {
+            mc::util::profiler_enter("setup", "communicator");
+
         // calculate the source and synapse distribution serially
         std::vector<id_type> target_counts(num_groups());
         std::vector<id_type> source_counts(num_groups());
@@ -87,9 +78,12 @@ struct model {
 
         //  create connections
         communicator = communicator_type(num_groups(), target_counts);
+
+            mc::util::profiler_leave(2);
     }
 
     void update_gids() {
+            mc::util::profiler_enter("setup", "globalize");
         auto com_policy = communicator.communication_policy();
         auto global_source_map = com_policy.make_map(source_map.back());
         auto domain_idx = communicator.domain_id();
@@ -97,6 +91,7 @@ struct model {
             cell_groups[i].set_source_gids(source_map[i]+global_source_map[domain_idx]);
             cell_groups[i].set_target_gids(target_map[i]+communicator.target_gid_from_group_lid(0));
         }
+            mc::util::profiler_leave(2);
     }
 
     // TODO : only stored here because init_communicator() and update_gids() are split
@@ -162,7 +157,7 @@ int main(int argc, char** argv) {
     //
     //  time stepping
     //
-    auto tfinal = 20.;
+    auto tfinal = 10.;
     auto dt = 0.01;
 
     auto id = m.communicator.domain_id();
@@ -174,7 +169,7 @@ int main(int argc, char** argv) {
     m.run(tfinal, dt);
 
     if (!id) {
-        //mc::util::data::profilers_.local().performance_tree().print(std::cout, 0.001);
+        mc::util::profiler_output(0.00001);
         std::cout << "there were " << m.communicator.num_spikes() << " spikes\n";
     }
 
@@ -206,7 +201,6 @@ void ring_model(nest::mc::io::options& opt, model& m) {
     auto basic_cell = make_cell(opt.compartments_per_segment, 1);
 
     // make a vector for storing all of the cells
-    auto start_init = mc::util::timer_type::tic();
     m.cell_groups = std::vector<cell_group>(opt.cells);
 
     // initialize the cells in parallel
@@ -214,15 +208,17 @@ void ring_model(nest::mc::io::options& opt, model& m) {
         0, opt.cells,
         [&](int i) {
             // initialize cell
+                mc::util::profiler_enter("setup");
+                    mc::util::profiler_enter("make cell");
             m.cell_groups[i] = make_lowered_cell(i, basic_cell);
+                    mc::util::profiler_leave();
+                mc::util::profiler_leave();
         }
     );
-    m.time_init = mc::util::timer_type::toc(start_init);
 
     //
     //  network creation
     //
-    auto start_network = mc::util::timer_type::tic();
     m.init_communicator();
 
     for (auto i=0u; i<(id_type)opt.cells; ++i) {
@@ -232,18 +228,13 @@ void ring_model(nest::mc::io::options& opt, model& m) {
         });
     }
 
-    m.communicator.construct();
-
     m.update_gids();
-
-    m.time_network = mc::util::timer_type::toc(start_network);
 }
 
 void all_to_all_model(nest::mc::io::options& opt, model& m) {
     //
     //  make cells
     //
-    auto timer = mc::util::timer_type();
 
     // make a basic cell
     auto basic_cell = make_cell(opt.compartments_per_segment, opt.cells-1);
@@ -262,16 +253,18 @@ void all_to_all_model(nest::mc::io::options& opt, model& m) {
     mc::threading::parallel_for::apply(
         0, ncell_local,
         [&](int i) {
+                mc::util::profiler_enter("setup", "cells");
             m.cell_groups[i] = make_lowered_cell(i, basic_cell);
+                mc::util::profiler_leave(2);
         }
     );
 
     //
     //  network creation
     //
-    auto start_network = timer.tic();
     m.init_communicator();
 
+        mc::util::profiler_enter("setup", "connections");
     // lid is local cell/group id
     for (auto lid=0u; lid<ncell_local; ++lid) {
         auto target = m.communicator.target_gid_from_group_lid(lid);
@@ -288,10 +281,9 @@ void all_to_all_model(nest::mc::io::options& opt, model& m) {
     }
 
     m.communicator.construct();
+        mc::util::profiler_leave(2);
 
     m.update_gids();
-
-    m.time_network = timer.toc(start_network);
 }
 
 ///////////////////////////////////////
@@ -333,8 +325,8 @@ mc::cell make_cell(int compartments_per_segment, int num_synapses) {
     // add dendrite of length 200 um and diameter 1 um with passive channel
     std::vector<mc::cable_segment*> dendrites;
     dendrites.push_back(cell.add_cable(0, mc::segmentKind::dendrite, 0.5, 0.5, 200));
-    dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100));
-    dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100));
+    //dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100));
+    //dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100));
 
     for (auto d : dendrites) {
         d->add_mechanism(mc::pas_parameters());
diff --git a/src/cell_group.hpp b/src/cell_group.hpp
index 85a0832bf24a5720951b35db392133e2d877a312..4964ad10a2328c96593f9e6b07b136baa7ca7fcd 100644
--- a/src/cell_group.hpp
+++ b/src/cell_group.hpp
@@ -7,6 +7,8 @@
 #include <communication/spike.hpp>
 #include <communication/spike_source.hpp>
 
+#include <profiling/profiler.hpp>
+
 namespace nest {
 namespace mc {
 
@@ -76,6 +78,7 @@ class cell_group {
             // integrate cell state
             cell_.advance(tnext - cell_.time());
 
+                nest::mc::util::profiler_enter("events");
             // check for new spikes
             for (auto& s : spike_sources_) {
                 if (auto spike = s.source.test(cell_, cell_.time())) {
@@ -93,6 +96,7 @@ class cell_group {
                     cell_.apply_event(e.get());
                 }
             }
+                nest::mc::util::profiler_leave();
         }
 
     }
diff --git a/src/fvm_cell.hpp b/src/fvm_cell.hpp
index 4bd215716b2b77471e8c488925f3c54f6105e165..c89dcdf0c3e920b4aa370f71a48767f4e20d1040 100644
--- a/src/fvm_cell.hpp
+++ b/src/fvm_cell.hpp
@@ -17,10 +17,12 @@
 #include <segment.hpp>
 #include <stimulus.hpp>
 #include <util.hpp>
+#include <profiling/profiler.hpp>
 
 #include <vector/include/Vector.hpp>
 #include <mechanisms/expsyn.hpp>
 
+
 namespace nest {
 namespace mc {
 namespace fvm {
@@ -113,9 +115,6 @@ class fvm_cell {
     /// make a time step
     void advance(value_type dt);
 
-    /// advance solution to target time tfinal with maximum step size dt
-    void advance_to(value_type tfinal, value_type dt);
-
     /// pass an event to the appropriate synapse and call net_receive
     void apply_event(local_event e) {
         mechanisms_[synapse_index_]->net_receive(e.target, e.weight);
@@ -479,12 +478,15 @@ void fvm_cell<T, I>::advance(T dt)
 {
     using memory::all;
 
+        mc::util::profiler_enter("current");
     current_(all) = 0.;
 
     // update currents from ion channels
     for(auto& m : mechanisms_) {
+            mc::util::profiler_enter(m->name().c_str());
         m->set_params(t_, dt);
         m->nrn_current();
+            mc::util::profiler_leave();
     }
 
     // add current contributions from stimulii
@@ -495,42 +497,29 @@ void fvm_cell<T, I>::advance(T dt)
         // the factor of 100 scales the injected current to 10^2.nA
         current_[loc] -= 100.*ie/cv_areas_[loc];
     }
+        mc::util::profiler_leave();
 
+        mc::util::profiler_enter("matrix", "setup");
     // solve the linear system
     setup_matrix(dt);
+        mc::util::profiler_leave(); mc::util::profiler_enter("solve");
     matrix_.solve();
+        mc::util::profiler_leave();
     voltage_(all) = matrix_.rhs();
+        mc::util::profiler_leave();
 
+        mc::util::profiler_enter("state");
     // integrate state of gating variables etc.
     for(auto& m : mechanisms_) {
+            mc::util::profiler_enter(m->name().c_str());
         m->nrn_state();
+            mc::util::profiler_leave();
     }
+        mc::util::profiler_leave();
 
     t_ += dt;
 }
 
-/*
-template <typename T, typename I>
-void fvm_cell<T, I>::advance_to(T tfinal, T dt)
-{
-    if(t_>=tfinal) {
-        return;
-    }
-
-    do {
-        auto tstep = std::min(tfinal, t_+dt);
-        auto next = events_.pop_if_before(tstep);
-        auto tnext = next? next->time: tstep;
-
-        advance(tnext-t_);
-        t_ = tnext;
-        if (next) { // handle event
-            mechanisms_[synapse_index_]->net_receive(next->target, next->weight);
-        }
-    } while(t_<tfinal);
-}
-*/
-
 } // namespace fvm
 } // namespace mc
 } // namespace nest
diff --git a/src/profiling/profiler.hpp b/src/profiling/profiler.hpp
index 7b82f00323baf7a3797d06ce76e54fb357738c6b..8af4e334535f50e9b985a2ef2dd37da33c67106c 100644
--- a/src/profiling/profiler.hpp
+++ b/src/profiling/profiler.hpp
@@ -18,11 +18,11 @@ namespace nest {
 namespace mc {
 namespace util {
 
-static inline std::string green(std::string s)  { return s; }
-static inline std::string yellow(std::string s) { return s; }
-static inline std::string white(std::string s)  { return s; }
-static inline std::string red(std::string s)    { return s; }
-static inline std::string cyan(std::string s)   { return s; }
+inline std::string green(std::string s)  { return s; }
+inline std::string yellow(std::string s) { return s; }
+inline std::string white(std::string s)  { return s; }
+inline std::string red(std::string s)    { return s; }
+inline std::string cyan(std::string s)   { return s; }
 
 namespace impl {
 
@@ -30,7 +30,7 @@ namespace impl {
     ///     - for easy comparison of strings over MPI
     ///     - for fast searching of regions named with strings
     static inline
-    size_t hash(char* s) {
+    size_t hash(const char* s) {
         size_t h = 5381;
 
         while (*s) {
@@ -143,7 +143,7 @@ namespace impl {
                         n.print_sub(stream, indent + 2, threshold, total);
                     }
                 }
-                if (other >= threshold && children.size()) {
+                if (other>=std::max(threshold, 0.01) && children.size()) {
                     label = indent_str + "  other";
                     percentage = 100.*other/total;
                     snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
@@ -188,14 +188,10 @@ public:
     using profiler_node = impl::profiler_node;
 
     explicit region_type(std::string n) :
-        //name_(std::move(n))
-        name_(n)
+        name_(std::move(n))
     {
-        std::cout << "creating region " << name_ << "\n";
         start_time_ = timer_type::tic();
-        std::cout << "  started timer " << name_ << "\n";
         hash_ = impl::hash(n);
-        std::cout << "  hashed " << name_ << " -> " << hash_ << "\n";
     }
 
     explicit region_type(const char* n) :
@@ -304,21 +300,26 @@ public:
 
     void enter(const char* name) {
         if (!is_activated()) return;
-        auto start = timer_type::tic();
         current_region_ = current_region_->subregion(name);
         current_region_->start_time();
-        self_time_ += timer_type::toc(start);
     }
 
     void leave() {
         if (!is_activated()) return;
-        auto start = timer_type::tic();
         if (current_region_->parent()==nullptr) {
             throw std::out_of_range("attempt to leave root memory tracing region");
         }
         current_region_->end_time();
         current_region_ = current_region_->parent();
-        self_time_ += timer_type::toc(start);
+    }
+
+    // step up multiple n levels in one call
+    void leave(int n) {
+        EXPECTS(n>=1);
+
+        while(n--) {
+            leave();
+        }
     }
 
     region_type& regions() {
@@ -329,10 +330,6 @@ public:
         return current_region_;
     }
 
-    double self_time() const {
-        return self_time_;
-    }
-
     bool is_in_root() const {
         return &root_region_ == current_region_;
     }
@@ -348,19 +345,28 @@ public:
                   );
         }
         activate();
+        start_time_ = timer_type::tic();
         root_region_.start_time();
     }
 
     void stop() {
         if (!is_in_root()) {
             throw std::out_of_range(
-                    "attempt to profiler that is not in the root region"
+                    "profiler must be in root region when stopped"
                   );
         }
         root_region_.end_time();
+        stop_time_ = timer_type::tic();
+
         deactivate();
     }
 
+    timer_type::time_point start_time() const { return start_time_; }
+    timer_type::time_point stop_time()  const { return stop_time_; }
+    double wall_time() const {
+        return timer_type::difference(start_time_, stop_time_);
+    }
+
     region_type::profiler_node performance_tree() {
         if (is_activated()) {
             stop();
@@ -372,18 +378,13 @@ private:
     void activate()   { activated_ = true;  }
     void deactivate() { activated_ = false; }
 
+    timer_type::time_point start_time_;
+    timer_type::time_point stop_time_;
     bool activated_ = false;
     region_type root_region_;
     region_type* current_region_ = &root_region_;
-    double self_time_ = 0.;
 };
 
-namespace data {
-    using profiler_wrapper = nest::mc::threading::enumerable_thread_specific<profiler>;
-    profiler_wrapper profilers_(profiler("root"));
-}
-
-/*
 #ifdef WITH_PROFILING
 namespace data {
     using profiler_wrapper = nest::mc::threading::enumerable_thread_specific<profiler>;
@@ -408,9 +409,19 @@ inline void profiler_stop() {
 inline void profiler_enter(const char* n) {
     get_profiler().enter(n);
 }
+
+template <class...Args>
+void profiler_enter(const char* n, Args... args) {
+    get_profiler().enter(n);
+    profiler_enter(args...);
+}
+
 inline void profiler_leave() {
     get_profiler().leave();
 }
+inline void profiler_leave(int nlevels) {
+    get_profiler().leave(nlevels);
+}
 
 // iterate over all profilers and ensure that they have the same start stop times
 inline void stop_profilers() {
@@ -420,14 +431,28 @@ inline void stop_profilers() {
     }
 }
 
+inline void profiler_output(double threshold) {
+    stop_profilers();
+    auto p = impl::profiler_node(0, "results");
+    for(auto& thread_profiler : data::profilers_) {
+        std::cout << "fusing profiler : " << thread_profiler.wall_time() << " s\n";
+        p.fuse(thread_profiler.performance_tree());
+    }
+    p.print(std::cout, threshold);
+}
+
 #else
-*/
 inline void profiler_start() {}
 inline void profiler_stop() {}
-inline void profiler_enter(const char* n) {}
+inline void profiler_enter(const char*) {}
+template <class...Args>
+void profiler_enter(const char*, Args... args) {}
+inline void profiler_enter(const char*, const char*, const char*) {}
 inline void profiler_leave() {}
+inline void profiler_leave(int) {}
 inline void stop_profilers() {}
-//#endif
+inline void profiler_output(double threshold) {}
+#endif
 
 } // namespace util
 } // namespace mc
diff --git a/src/threading/serial.hpp b/src/threading/serial.hpp
index 0af901fbc16be94dabbbc3c159a5aae01159556f..3a2bc0ad13aa922dd28829f7c6114d1a14ccdefa 100644
--- a/src/threading/serial.hpp
+++ b/src/threading/serial.hpp
@@ -31,29 +31,16 @@ class enumerable_thread_specific {
         data{std::move(init)}
     {}
 
-    T& local() {
-        return data[0];
-    }
-    const T& local() const {
-        return data[0];
-    }
+    T& local() { return data[0]; }
+    const T& local() const { return data[0]; }
 
-    auto begin() -> decltype(data.begin())
-    {
-        return data.begin();
-    }
-    auto end() -> decltype(data.end())
-    {
-        return data.end();
-    }
-    auto cbegin() -> decltype(data.cbegin())
-    {
-        return data.cbegin();
-    }
-    auto cend() -> decltype(data.cend())
-    {
-        return data.cend();
-    }
+    auto size() -> decltype(data.size()) const { return data.size(); }
+
+    auto begin() -> decltype(data.begin()) { return data.begin(); }
+    auto end()   -> decltype(data.end())   { return data.end(); }
+
+    auto cbegin() -> decltype(data.cbegin()) const { return data.cbegin(); }
+    auto cend()   -> decltype(data.cend())   const { return data.cend(); }
 };
 
 
@@ -77,17 +64,17 @@ std::string description() {
 struct timer {
     using time_point = std::chrono::time_point<std::chrono::system_clock>;
 
-    static
-    inline time_point tic()
-    {
+    static inline time_point tic() {
         return std::chrono::system_clock::now();
     }
 
-    static
-    inline double toc(time_point t)
-    {
+    static inline double toc(time_point t) {
         return std::chrono::duration<double>(tic() - t).count();
     }
+
+    static inline double difference(time_point b, time_point e) {
+        return std::chrono::duration<double>(e-b).count();
+    }
 };
 
 
diff --git a/src/threading/tbb.hpp b/src/threading/tbb.hpp
index 32097586169f794c90baafdbd948f8b60df025d6..a36f2b876b788a3d7c2e52477e652fc4d66c49c9 100644
--- a/src/threading/tbb.hpp
+++ b/src/threading/tbb.hpp
@@ -32,17 +32,17 @@ std::string description() {
 struct timer {
     using time_point = tbb::tick_count;
 
-    static
-    inline time_point tic()
-    {
+    static inline time_point tic() {
         return tbb::tick_count::now();
     }
 
-    static
-    inline double toc(time_point t)
-    {
+    static inline double toc(time_point t) {
         return (tic() - t).seconds();
     }
+
+    static inline double difference(time_point b, time_point e) {
+        return (e-b).seconds();
+    }
 };
 
 } // threading