diff --git a/miniapp/io.cpp b/miniapp/io.cpp
index 887848b549b7bad46574b92ac9fcde74faae5d5c..e6fba80211261f081969d8d7abd0e8fd33452e44 100644
--- a/miniapp/io.cpp
+++ b/miniapp/io.cpp
@@ -8,7 +8,7 @@ namespace io {
 // for now this is just a placeholder
 options read_options(std::string fname) {
-    // 10 cells, 1 synapses per cell, 10 compartments per segment
+    // 1000 cells, 1 synapse per cell, 100 compartments per segment
-    return {200, 1, 100};
+    return {1000, 1, 100};
 }
 
 std::ostream& operator<<(std::ostream& o, const options& opt) {
diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp
index 18326d8db4df56bece5f422efe05c9590e82a684..98c0e21c25579a2048fb8c599935012622eb77ef 100644
--- a/miniapp/miniapp.cpp
+++ b/miniapp/miniapp.cpp
@@ -157,7 +157,7 @@ int main(int argc, char** argv) {
     //
     //  time stepping
     //
-    auto tfinal = 10.;
+    auto tfinal = 200.;
     auto dt = 0.01;
 
     auto id = m.communicator.domain_id();
@@ -168,8 +168,9 @@ int main(int argc, char** argv) {
 
     m.run(tfinal, dt);
 
+    mc::util::profiler_output(0.001);
+
     if (!id) {
-        mc::util::profiler_output(0.00001);
         std::cout << "there were " << m.communicator.num_spikes() << " spikes\n";
     }
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5c3ae6c78627d93c78202a6cab72bab1c5d4c621..e7bced8530908d637ecec5112fdb5e581e652e9a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,8 +5,10 @@ set(BASE_SOURCES
     cell.cpp
     mechanism_interface.cpp
     parameter_list.cpp
+    profiling/profiler.cpp
     swcio.cpp
 )
+
 if(${WITH_MPI})
     set(BASE_SOURCES ${BASE_SOURCES} communication/mpi.cpp)
 endif()
diff --git a/src/profiling/profiler.cpp b/src/profiling/profiler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e9765da76c0e732aa1a27a9ceb532b9fa7544d0c
--- /dev/null
+++ b/src/profiling/profiler.cpp
@@ -0,0 +1,388 @@
+#include "profiler.hpp"
+
+#ifdef WITH_MPI
+#include <communication/mpi.hpp>
+#endif
+
+namespace nest {
+namespace mc {
+namespace util {
+
+/////////////////////////////////////////////////////////
+// profiler_node
+/////////////////////////////////////////////////////////
+void profiler_node::print(int indent) {
+    // write "<indent><name>....<value>" for this node, then for its children
+    std::string s = std::string(indent, ' ') + name;
+    // pad with dots up to column 60; 60-s.size() is unsigned and would wrap
+    // around for longer labels, so those get no padding at all
+    std::cout << s << std::string(s.size()<60 ? 60-s.size() : 0, '.') << value << "\n";
+    for (auto& n : children) {
+        n.print(indent+2);
+    }
+}
+
+void profiler_node::print(std::ostream& stream, double threshold) {
+    // print_sub takes the threshold as a time, so scale the proportion
+    const double total = value;
+    print_sub(stream, 0, threshold*total, total);
+}
+
+/// Write the table row for this node to stream, then recurse into the children
+/// whose value reaches threshold; smaller children and "other" are summed into
+/// a single "other" row. threshold and total are times, not proportions.
+void profiler_node::print_sub(
+    std::ostream& stream, int indent, double threshold, double total)
+{
+    if (value < threshold) {
+        // report on the caller's stream, not unconditionally on std::cout
+        stream << green("not printing ") << name << std::endl;
+        return;
+    }
+
+    // largest child value, or -1 when there are no children
+    auto max_contribution = -1.;
+    for (const auto& child : children) {
+        max_contribution = max_contribution > child.value ? max_contribution : child.value;
+    }
+
+    // format and print the table row: label, time and percentage of total
+    char buffer[512];
+    auto const indent_str = std::string(indent, ' ');
+    auto label = indent_str + name;
+    float percentage = 100.*value/total;
+    snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
+                    label.c_str(), float(value), float(percentage));
+    // a zero threshold descends whenever there are children; otherwise only
+    // when the largest child reaches the threshold
+    bool print_children =
+        threshold==0. ? children.size()>0
+                      : max_contribution >= threshold;
+
+    stream << (print_children ? white(buffer) : buffer) << "\n";
+    if (!print_children) return;
+
+    // children that are too small, and the explicit "other" node, are pooled
+    auto other = 0.;
+    for (auto &n : children) {
+        if (n.value<threshold || n.name=="other") {
+            other += n.value;
+        }
+        else {
+            n.print_sub(stream, indent + 2, threshold, total);
+        }
+    }
+
+    // the pooled row is only shown when it reaches both threshold and 0.001
+    if (other>=std::max(threshold, 0.001) && children.size()) {
+        label = indent_str + "  other";
+        percentage = 100.*other/total;
+        snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
+                        label.c_str(), float(other), percentage);
+        stream << buffer << std::endl;
+    }
+}
+
+void profiler_node::fuse(const profiler_node& other) {
+    // nodes compare equal by name, so each child of other is merged into the
+    // child of the same name when there is one, and copied in otherwise
+    for (const auto& incoming : other.children) {
+        auto match = std::find(children.begin(), children.end(), incoming);
+        if (match==children.end()) {
+            children.push_back(incoming);
+            continue;
+        }
+        match->fuse(incoming);
+    }
+    value += other.value;
+}
+
+double profiler_node::time_in_other() const {
+    // value of the first child named "other", or zero if there is none
+    for (const auto& child : children) {
+        if (child.name == "other") {
+            return child.value;
+        }
+    }
+    return 0.;
+}
+
+void profiler_node::scale(double factor) {
+    for (auto& child : children) {  // children first, then this node
+        child.scale(factor);
+    }
+    value *= factor;
+}
+
+profiler_node::json profiler_node::as_json() const {
+    json result;  // "regions" is only added for nodes that have children
+    result["name"] = name;
+    result["time"] = value;
+    for (auto child=children.begin(); child!=children.end(); ++child) {
+        result["regions"].push_back(child->as_json());
+    }
+    return result;
+}
+
+profiler_node operator+ (const profiler_node& lhs, const profiler_node& rhs) {
+    assert(lhs.name == rhs.name);  // only nodes of the same name are summed
+    profiler_node sum(lhs);
+    sum.fuse(rhs);
+    return sum;
+}
+
+bool operator== (const profiler_node& lhs, const profiler_node& rhs) {
+    return lhs.name.compare(rhs.name) == 0;  // nodes are identified by name alone
+}
+
+/////////////////////////////////////////////////////////
+// region_type
+/////////////////////////////////////////////////////////
+region_type* region_type::subregion(const char* n) {
+    // look the region up by the hash of its name; create it on first use
+    const size_t key = impl::hash(n);
+    auto found = subregions_.find(key);
+    if (found == subregions_.end()) {
+        found = subregions_.emplace(key, util::make_unique<region_type>(n, this)).first;
+    }
+    return found->second.get();
+}
+
+double region_type::subregion_contributions() const {
+    // add up the total time of each direct subregion
+    double sum = 0.;
+    for (const auto& entry : subregions_) {
+        sum += entry.second->total();
+    }
+
+    return sum;
+}
+
+/// Convert this region and its subregions into a tree of profiler_node.
+/// The children of a node are sorted by decreasing time; a node that has
+/// children also gets a final "other" child holding the unaccounted time.
+profiler_node region_type::populate_performance_tree() const {
+    profiler_node node(total(), name());
+
+    // one child per subregion, built recursively
+    for (const auto& entry : subregions_) {
+        node.children.push_back(entry.second->populate_performance_tree());
+    }
+
+    // a region without subregions is a leaf: nothing to sort, no "other"
+    if (node.children.empty()) {
+        return node;
+    }
+
+    // descending by time; stable_sort keeps children of equal time in order
+    auto by_time = [](const profiler_node& lhs, const profiler_node& rhs) {
+        return lhs.value>rhs.value;
+    };
+    std::stable_sort(node.children.begin(), node.children.end(), by_time);
+
+    // time in this region that no subregion accounts for, i.e. code that
+    // was not explicitly profiled
+    double accounted = 0.;
+    for (const auto& child : node.children) {
+        accounted += child.value;
+    }
+    node.children.emplace_back(total() - accounted, std::string("other"));
+
+    return node;
+}
+
+/////////////////////////////////////////////////////////
+// profiler
+/////////////////////////////////////////////////////////
+void profiler::enter(const char* name) {
+    if (!is_activated()) return;  // an idle profiler records nothing
+    current_region_ = current_region_->subregion(name);  // step down, creating the region if needed
+    current_region_->start_time();  // start the timer of the region just entered
+}
+
+void profiler::leave() {
+    if (!is_activated()) return;
+    // the root region has no parent and cannot be left
+    region_type* const up = current_region_->parent();
+    if (!up) throw std::out_of_range("attempt to leave root memory tracing region");
+    current_region_->end_time();  // stop this region's timer before stepping up
+    current_region_ = up;
+}
+
+void profiler::leave(int n) {
+    EXPECTS(n>=1);
+    // step up one level at a time, n times in total
+    for (; n!=0; --n) {
+        leave();
+    }
+}
+
+void profiler::start() {
+    // a profiler can only be started from the idle state
+    if (is_activated()) {
+        throw std::out_of_range("attempt to start an already running profiler");
+    }
+    activate();
+    // time stamp for the profiler itself, then start the root region's timer
+    start_time_ = timer_type::tic();
+    root_region_.start_time();
+}
+
+void profiler::stop() {
+    // stopping an idle profiler must not add the root time a second time
+    if (!is_activated()) return;
+    if (!is_in_root()) {
+        throw std::out_of_range("profiler must be in root region when stopped");
+    }
+    // close the root region, record the stop time stamp, then go idle
+    root_region_.end_time();
+    stop_time_ = timer_type::tic();
+    deactivate();
+}
+
+profiler_node profiler::performance_tree() {
+    // a profiler that is still running is stopped before the tree is built
+    if (is_activated()) stop();
+    const region_type& root = root_region_;
+    return root.populate_performance_tree();
+}
+
+
+#ifdef WITH_PROFILING
+namespace data {
+    profiler_wrapper profilers_(profiler("root"));  // definition of the object declared extern in profiler.hpp
+}
+
+profiler& get_profiler() {
+    // fetch the local profiler and start it if it is not already running
+    profiler& mine = data::profilers_.local();
+    if (mine.is_activated()) return mine;
+    mine.start();
+    return mine;
+}
+
+// this will throw an exception if the profiler has already been started
+void profiler_start() {
+    data::profilers_.local().start();
+}
+void profiler_stop() {
+    get_profiler().stop();  // note: get_profiler() first starts the local profiler if it is idle
+}
+void profiler_enter(const char* n) {
+    get_profiler().enter(n);  // forwards to the local profiler, starting it if needed
+}
+
+void profiler_leave() {
+    get_profiler().leave();  // throws std::out_of_range when already in the root region
+}
+void profiler_leave(int nlevels) {
+    get_profiler().leave(nlevels);  // nlevels is expected to be at least 1
+}
+
+// stop every profiler held in data::profilers_
+void stop_profilers() {
+    for (auto it = data::profilers_.begin(); it != data::profilers_.end(); ++it) {
+        it->stop();
+    }
+}
+
+/// Stop all thread profilers, fuse their trees into one per-thread average,
+/// print the summary table (on rank 0 under MPI) and write the profile as JSON.
+/// threshold: proportion of the total time a region needs to get its own row.
+void profiler_output(double threshold) {
+    stop_profilers();
+    // without any profiler begin() cannot be dereferenced and 1./nthreads is inf
+    auto nthreads = data::profilers_.size();
+    if (nthreads == 0) return;
+
+    // The wall time runs from the earliest start to the latest stop over all
+    // profilers, because individual profilers may start or stop at different times.
+    auto start_time = data::profilers_.begin()->start_time();
+    auto stop_time = data::profilers_.begin()->stop_time();
+    for(auto& p : data::profilers_) {
+        start_time = std::min(start_time, p.start_time());
+        stop_time  = std::max(stop_time,  p.stop_time());
+    }
+    auto wall_time = timer_type::difference(start_time, stop_time);
+    auto thread_wall = wall_time * nthreads; // wall time summed over all threads
+
+    // gather the profilers into one accumulated profile over all threads
+    auto thread_measured = 0.; // accumulator for the time measured in each thread
+    auto p = profiler_node(0, "total");
+    for(auto& thread_profiler : data::profilers_) {
+        // build the tree once and use it for both the sum and the fuse
+        auto tree = thread_profiler.performance_tree();
+        thread_measured += tree.value - tree.time_in_other();
+        p.fuse(tree);
+    }
+    auto efficiency = 100. * thread_measured / thread_wall;
+
+    p.scale(1./nthreads);
+
+#ifdef WITH_MPI
+    bool print = nest::mc::mpi::rank()==0;
+#else
+    bool print = true;
+#endif
+    if(print) {
+        std::cout << " ---------------------------------------------------- \n";
+        std::cout << "|                      profiler                      |\n";
+        std::cout << " ---------------------------------------------------- \n";
+        char line[128];
+        std::snprintf(
+            line, sizeof(line), "%-18s%10.3f s\n",
+            "wall time", float(wall_time));
+        std::cout << line;
+        #ifdef WITH_MPI
+        std::snprintf(
+            line, sizeof(line), "%-18s%10d\n",
+            "MPI ranks", int(nest::mc::mpi::size()));
+        std::cout << line;
+        #endif
+        std::snprintf(
+            line, sizeof(line), "%-18s%10d\n",
+            "threads", int(nthreads));
+        std::cout << line;
+        std::snprintf(
+            line, sizeof(line), "%-18s%10.2f %%\n",
+            "thread efficiency", float(efficiency));
+        std::cout << line << "\n";
+        p.print(std::cout, threshold);
+
+        std::cout << "\n\n";
+    }
+
+    nlohmann::json as_json = p.as_json();
+    as_json["wall time"] = wall_time;
+    as_json["threads"] = nthreads;
+    as_json["efficiency"] = efficiency;
+#ifdef WITH_MPI
+    as_json["communicators"] = nest::mc::mpi::size();
+    as_json["rank"] = nest::mc::mpi::rank();
+#else
+    as_json["communicators"] = 1;
+    as_json["rank"] = 0;
+#endif
+
+#ifdef WITH_MPI
+    std::ofstream fid("profile_" + std::to_string(mpi::rank()));
+#else
+    std::ofstream fid("profile");
+#endif
+    fid << std::setw(1) << as_json;
+}
+
+#else
+void profiler_start() {}  // this group is compiled when WITH_PROFILING is not defined: every call is a no-op
+void profiler_stop() {}
+void profiler_enter(const char*) {}
+void profiler_leave() {}
+void profiler_leave(int) {}
+void stop_profilers() {}
+void profiler_output(double threshold) {}  // threshold is ignored here
+#endif
+
+} // namespace util
+} // namespace mc
+} // namespace nest
+
diff --git a/src/profiling/profiler.hpp b/src/profiling/profiler.hpp
index 8af4e334535f50e9b985a2ef2dd37da33c67106c..c6f5244e763b66406d71cabeffc3f1c925433601 100644
--- a/src/profiling/profiler.hpp
+++ b/src/profiling/profiler.hpp
@@ -12,7 +12,10 @@
 #include <cassert>
 #include <cstdlib>
 
+#include <json/src/json.hpp>
+
 #include <threading/threading.hpp>
+#include <util.hpp>
 
 namespace nest {
 namespace mc {
@@ -24,15 +27,13 @@ inline std::string white(std::string s)  { return s; }
 inline std::string red(std::string s)    { return s; }
 inline std::string cyan(std::string s)   { return s; }
 
-namespace impl {
+using timer_type = nest::mc::threading::timer;
 
+namespace impl {
     /// simple hashing function for strings
-    ///     - for easy comparison of strings over MPI
-    ///     - for fast searching of regions named with strings
     static inline
     size_t hash(const char* s) {
         size_t h = 5381;
-
         while (*s) {
             h = ((h << 5) + h) + int(*s);
             ++s;
@@ -45,131 +46,39 @@ namespace impl {
     size_t hash(const std::string& s) {
         return hash(s.c_str());
     }
+} // namespace impl
 
-    struct profiler_node {
-        double value;
-        std::string name;
-        std::vector<profiler_node> children;
-
-        profiler_node() :
-            value(0.), name("")
-        {}
-
-        profiler_node(double v, const std::string& n) :
-            value(v), name(n)
-        {}
-
-        void print(int indent=0) {
-            std::string s = std::string(indent, ' ') + name;
-            std::cout << s
-                      << std::string(60-s.size(), '.')
-                      << value
-                      << "\n";
-            for (auto& n : children) {
-                n.print(indent+2);
-            }
-        }
-
-        friend profiler_node operator+ (
-            const profiler_node& lhs,
-            const profiler_node& rhs)
-        {
-            assert(lhs.name == rhs.name);
-            auto node = lhs;
-            node.fuse(rhs);
-            return node;
-        }
-
-        friend bool operator== (
-            const profiler_node& lhs,
-            const profiler_node& rhs)
-        {
-            return lhs.name == rhs.name;
-        }
+/// The tree data structure that is generated by post-processing of
+/// a profiler.
+struct profiler_node {
+    double value;
+    std::string name;
+    std::vector<profiler_node> children;
+    using json = nlohmann::json;
 
-        void print(std::ostream& stream, double threshold) {
-            // convert threshold from proportion to time
-            threshold *= value;
-            print_sub(stream, 0, threshold, value);
-        }
+    profiler_node() :
+        value(0.), name("")
+    {}
 
-        void print_sub(
-            std::ostream& stream,
-            int indent,
-            double threshold,
-            double total)
-        {
-            char buffer[512];
-
-            if (value < threshold) {
-                std::cout << green("not printing ") << name << std::endl;
-                return;
-            }
-
-            auto max_contribution =
-                std::accumulate(
-                        children.begin(), children.end(), -1.,
-                        [] (double lhs, const profiler_node& rhs) {
-                            return lhs > rhs.value ? lhs : rhs.value;
-                        }
-                );
-
-            // print the table row
-            auto const indent_str = std::string(indent, ' ');
-            auto label = indent_str + name;
-            float percentage = 100.*value/total;
-            snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
-                            label.c_str(),
-                            float(value),
-                            float(percentage));
-            bool print_children =
-                threshold==0. ? children.size()>0
-                              : max_contribution >= threshold;
-
-            if (print_children) {
-                stream << white(buffer) << std::endl;
-            }
-            else {
-                stream << buffer << std::endl;
-            }
-
-            if (print_children) {
-                auto other = 0.;
-                for (auto &n : children) {
-                    if (n.value<threshold || n.name=="other") {
-                        other += n.value;
-                    }
-                    else {
-                        n.print_sub(stream, indent + 2, threshold, total);
-                    }
-                }
-                if (other>=std::max(threshold, 0.01) && children.size()) {
-                    label = indent_str + "  other";
-                    percentage = 100.*other/total;
-                    snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
-                                    label.c_str(), float(other), percentage);
-                    stream << buffer << std::endl;
-                }
-            }
-        }
+    profiler_node(double v, const std::string& n) :
+        value(v), name(n)
+    {}
 
-        void fuse(const profiler_node& other) {
-            for (auto& n : other.children) {
-                auto it = std::find(children.begin(), children.end(), n);
-                if (it!=children.end()) {
-                    (*it).fuse(n);
-                }
-                else {
-                    children.push_back(n);
-                }
-            }
-
-            value += other.value;
-        }
-    };
-} // namespace impl
+    void print(int indent=0);
+    void print(std::ostream& stream, double threshold);
+    void print_sub(std::ostream& stream, int indent, double threshold, double total);
+    void fuse(const profiler_node& other);
+    /// return the wall time spent in the "other" region
+    double time_in_other() const;
+    /// scale the value in each node by factor
+    /// performed to all children recursively
+    void scale(double factor);
+
+    json as_json() const;
+};
 
-using timer_type = nest::mc::threading::timer;
+profiler_node operator+ (const profiler_node& lhs, const profiler_node& rhs);
+bool operator== (const profiler_node& lhs, const profiler_node& rhs);
 
 // a region in the profiler, has
 // - name
@@ -185,14 +94,11 @@ class region_type {
 
 public:
 
-    using profiler_node = impl::profiler_node;
-
     explicit region_type(std::string n) :
-        name_(std::move(n))
-    {
-        start_time_ = timer_type::tic();
-        hash_ = impl::hash(n);
-    }
+        name_(std::move(n)),
+        start_time_(timer_type::tic())
+    // hash the stored name: n has been moved from and may be empty by now
+    { hash_ = impl::hash(name_); }
 
     explicit region_type(const char* n) :
         region_type(std::string(n))
@@ -204,85 +110,24 @@ public:
         parent_ = p;
     }
 
-    const std::string& name() const {
-        return name_;
-    }
-
-    void name(std::string n) {
-        name_ = std::move(n);
-    }
+    const std::string& name() const { return name_; }
+    void name(std::string n) { name_ = std::move(n); }
 
-    region_type* parent() {
-        return parent_;
-    }
+    region_type* parent() { return parent_; }
 
     void start_time() { start_time_ = timer_type::tic(); }
     void end_time  () { total_time_ += timer_type::toc(start_time_); }
+    double total() const { return total_time_; }
 
-    bool has_subregions() const {
-        return subregions_.size() > 0;
-    }
+    bool has_subregions() const { return subregions_.size() > 0; }
 
-    size_t hash() const {
-        return hash_;
-    }
+    size_t hash() const { return hash_; }
 
-    region_type* subregion(const char* n) {
-        size_t hsh = impl::hash(n);
-        auto s = subregions_.find(hsh);
-        if (s == subregions_.end()) {
-            subregions_[hsh] = util::make_unique<region_type>(n, this);
-            return subregions_[hsh].get();
-        }
-        return s->second.get();
-    }
-
-    double subregion_contributions() const {
-        return
-            std::accumulate(
-                subregions_.begin(), subregions_.end(), 0.,
-                [](double l, decltype(*(subregions_.begin())) r) {
-                    return l+r.second->total();
-                }
-            );
-    }
-
-    double total() const {
-        return total_time_;
-    }
+    region_type* subregion(const char* n);
 
-    profiler_node populate_performance_tree() const {
-        profiler_node tree(total(), name());
-
-        for (auto &it : subregions_) {
-            tree.children.push_back(it.second->populate_performance_tree());
-        }
-
-        // sort the contributions in descending order
-        std::stable_sort(
-            tree.children.begin(), tree.children.end(),
-            [](const profiler_node& lhs, const profiler_node& rhs) {
-                return lhs.value>rhs.value;
-            }
-        );
-
-        if (tree.children.size()) {
-            // find the contribution of parts of the code that were not explicitly profiled
-            auto contributions =
-                std::accumulate(
-                    tree.children.begin(), tree.children.end(), 0.,
-                    [](double v, profiler_node& n) {
-                        return v+n.value;
-                    }
-                );
-            auto other = total() - contributions;
-
-            // add the "other" category
-            tree.children.emplace_back(other, std::string("other"));
-        }
+    double subregion_contributions() const;
 
-        return tree;
-    }
+    profiler_node populate_performance_tree() const;
 };
 
 class profiler {
@@ -298,81 +143,46 @@ public:
         profiler(other.root_region_.name())
     {}
 
-    void enter(const char* name) {
-        if (!is_activated()) return;
-        current_region_ = current_region_->subregion(name);
-        current_region_->start_time();
-    }
+    /// step down into level with name
+    void enter(const char* name);
 
-    void leave() {
-        if (!is_activated()) return;
-        if (current_region_->parent()==nullptr) {
-            throw std::out_of_range("attempt to leave root memory tracing region");
-        }
-        current_region_->end_time();
-        current_region_ = current_region_->parent();
-    }
+    /// step up one level
+    void leave();
 
-    // step up multiple n levels in one call
-    void leave(int n) {
-        EXPECTS(n>=1);
+    /// step up multiple n levels in one call
+    void leave(int n);
 
-        while(n--) {
-            leave();
-        }
-    }
+    /// return a reference to the root region
+    region_type& regions() { return root_region_; }
 
-    region_type& regions() {
-        return root_region_;
-    }
+    /// return a pointer to the current region
+    region_type* current_region() { return current_region_; }
 
-    region_type* current_region() {
-        return current_region_;
-    }
+    /// return if in the root region (i.e. the highest level)
+    bool is_in_root() const { return &root_region_ == current_region_; }
 
-    bool is_in_root() const {
-        return &root_region_ == current_region_;
-    }
+    /// return if the profiler has been activated
+    bool is_activated() const { return activated_; }
 
-    bool is_activated() const {
-        return activated_;
-    }
+    /// start (activate) the profiler
+    void start();
 
-    void start() {
-        if (is_activated()) {
-            throw std::out_of_range(
-                    "attempt to start an already running profiler"
-                  );
-        }
-        activate();
-        start_time_ = timer_type::tic();
-        root_region_.start_time();
-    }
-
-    void stop() {
-        if (!is_in_root()) {
-            throw std::out_of_range(
-                    "profiler must be in root region when stopped"
-                  );
-        }
-        root_region_.end_time();
-        stop_time_ = timer_type::tic();
-
-        deactivate();
-    }
+    /// stop (deactivate) the profiler
+    void stop();
 
+    /// the time stamp at which the profiler was started (activated)
     timer_type::time_point start_time() const { return start_time_; }
+
+    /// the time stamp at which the profiler was stopped (deactivated)
     timer_type::time_point stop_time()  const { return stop_time_; }
+
+    /// the time in seconds between activation and deactivation of the profiler
     double wall_time() const {
         return timer_type::difference(start_time_, stop_time_);
     }
 
-    region_type::profiler_node performance_tree() {
-        if (is_activated()) {
-            stop();
-        }
-        return root_region_.populate_performance_tree();
-    }
+    /// stop the profiler then generate the performance tree ready for output
+    profiler_node performance_tree();
 
 private:
     void activate()   { activated_ = true;  }
@@ -388,71 +198,40 @@ private:
 #ifdef WITH_PROFILING
 namespace data {
     using profiler_wrapper = nest::mc::threading::enumerable_thread_specific<profiler>;
-    profiler_wrapper profilers_(profiler("root"));
+    extern profiler_wrapper profilers_;
 }
+#endif
 
-inline profiler& get_profiler() {
-    auto& p = data::profilers_.local();
-    if (!p.is_activated()) {
-        p.start();
-    }
-    return p;
-}
+/// get a reference to the thread private profiler
+/// lazily creates and starts the profiler if that has not already been done
+profiler& get_profiler();
 
-// this will throw an exception if the profler has already been started
-inline void profiler_start() {
-    data::profilers_.local().start();
-}
-inline void profiler_stop() {
-    get_profiler().stop();
-}
-inline void profiler_enter(const char* n) {
-    get_profiler().enter(n);
-}
+/// start thread private profiler
+void profiler_start();
+
+/// stop thread private profiler
+void profiler_stop();
+
+/// enter a profiling region with name n
+void profiler_enter(const char* n);
 
+/// enter nested profiler regions in a single call
 template <class...Args>
 void profiler_enter(const char* n, Args... args) {
-    get_profiler().enter(n);
+    profiler_enter(n); // not get_profiler(): it has no definition without WITH_PROFILING
     profiler_enter(args...);
 }
 
-inline void profiler_leave() {
-    get_profiler().leave();
-}
-inline void profiler_leave(int nlevels) {
-    get_profiler().leave(nlevels);
-}
+/// move up one level in the profiler
+void profiler_leave();
+/// move up multiple profiler levels in one call
+void profiler_leave(int nlevels);
 
-// iterate over all profilers and ensure that they have the same start stop times
-inline void stop_profilers() {
-    std::cout << "::profiler : stopping " << data::profilers_.size() << " profilers\n";
-    for (auto& p : data::profilers_) {
-        p.stop();
-    }
-}
+/// iterate over all thread private profilers and stop them
+void stop_profilers();
 
-inline void profiler_output(double threshold) {
-    stop_profilers();
-    auto p = impl::profiler_node(0, "results");
-    for(auto& thread_profiler : data::profilers_) {
-        std::cout << "fusing profiler : " << thread_profiler.wall_time() << " s\n";
-        p.fuse(thread_profiler.performance_tree());
-    }
-    p.print(std::cout, threshold);
-}
-
-#else
-inline void profiler_start() {}
-inline void profiler_stop() {}
-inline void profiler_enter(const char*) {}
-template <class...Args>
-void profiler_enter(const char*, Args... args) {}
-inline void profiler_enter(const char*, const char*, const char*) {}
-inline void profiler_leave() {}
-inline void profiler_leave(int) {}
-inline void stop_profilers() {}
-inline void profiler_output(double threshold) {}
-#endif
+/// print the collated profile to std::cout and write it to a JSON file
+void profiler_output(double threshold);
 
 } // namespace util
 } // namespace mc
diff --git a/src/threading/serial.hpp b/src/threading/serial.hpp
index 3a2bc0ad13aa922dd28829f7c6114d1a14ccdefa..eef783949049c77f3c06bfa059b02c06d96b3670 100644
--- a/src/threading/serial.hpp
+++ b/src/threading/serial.hpp
@@ -56,8 +56,7 @@ struct parallel_for {
     }
 };
 
-static inline
-std::string description() {
+inline std::string description() {  // fixed label for this threading back end
     return "serial";
 }
 
diff --git a/src/threading/tbb.hpp b/src/threading/tbb.hpp
index a36f2b876b788a3d7c2e52477e652fc4d66c49c9..ed82e26ec87617f7453f4548efe3eec3f858507f 100644
--- a/src/threading/tbb.hpp
+++ b/src/threading/tbb.hpp
@@ -24,8 +24,7 @@ struct parallel_for {
     }
 };
 
-static
-std::string description() {
+inline std::string description() {  // fixed label for this threading back end
     return "TBB";
 }
 
@@ -49,3 +48,11 @@ struct timer {
 } // mc
 } // nest
 
+namespace tbb {
+    /// ordering of tbb::tick_count time stamps
+    /// returns true iff time stamp l occurred before time stamp r
+    inline bool operator< (tbb::tick_count l, tbb::tick_count r) {
+        return (r-l).seconds() > 0.;
+    }
+}
+
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b859950e4d44d1a1026cca8d4e422b9be5450a83..230d385f27b065d21bb0e297d799ba2299d395c4 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -53,6 +53,11 @@ add_executable(validate.exe ${VALIDATION_SOURCES} ${HEADERS})
 target_link_libraries(test.exe LINK_PUBLIC cellalgo gtest)
 target_link_libraries(validate.exe LINK_PUBLIC cellalgo gtest)
 
+if(WITH_TBB)
+    target_link_libraries(test.exe LINK_PUBLIC ${TBB_LIBRARIES})
+    target_link_libraries(validate.exe LINK_PUBLIC ${TBB_LIBRARIES})
+endif()
+
 set_target_properties(test.exe
    PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
@@ -62,3 +67,4 @@ set_target_properties(validate.exe
    PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
 )
+