diff --git a/CMakeLists.txt b/CMakeLists.txt
index d4cd87e44c0b4306f4bed59f10685b0bf723a0a6..f51b8f5324c6b1fef8caac5c83e3212d85801540 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -127,20 +127,23 @@ endif()
 #----------------------------------------------------------
 # MPI support
 #----------------------------------------------------------
-option(NMC_WITH_MPI "use MPI for distrubuted parallelism" OFF)
-
+option(NMC_WITH_MPI "use MPI for distributed parallelism" OFF)
 if(NMC_WITH_MPI)
-    find_package(MPI REQUIRED)
+    # BGQ specific flags: the MPI compiler wrapper is already used as CXX
+    # on BGQ, so mark MPI as found instead of calling find_package.
+    if(${NMC_SYSTEM_TYPE} MATCHES "BGQ")
+        add_definitions(-DMPICH2_CONST=const)
+        set(MPI_FOUND TRUE)
+    endif()
+
+    if(NOT MPI_FOUND)
+        find_package(MPI REQUIRED)
+    endif()
     include_directories(SYSTEM ${MPI_C_INCLUDE_PATH})
     add_definitions(-DNMC_HAVE_MPI)
     # unfortunate workaround for C++ detection in system mpi.h
     add_definitions(-DMPICH_SKIP_MPICXX=1 -DOMPI_SKIP_MPICXX=1)
     set_property(DIRECTORY APPEND_STRING PROPERTY COMPILE_OPTIONS "${MPI_C_COMPILE_FLAGS}")
-
-    # BGQ specific flags
-   if(${NMC_SYSTEM_TYPE} MATCHES "BGQ" )
-     add_definitions(-DMPICH2_CONST=const)
-   endif()
 endif()
 
 #----------------------------------------------------------
diff --git a/miniapp/io.cpp b/miniapp/io.cpp
index d2aaef48f36afeb943ef2fe6bb91b8e4ecb97242..ad07161c7da0e32a58c1cf7fc6a1c9195b707ef1 100644
--- a/miniapp/io.cpp
+++ b/miniapp/io.cpp
@@ -134,7 +134,10 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         true,       // Overwrite outputfile if exists
         "./",       // output path
         "spikes",   // file name
-        "gdf"       // file extension
+        "gdf",      // file extension
+
+        // when true, only rank 0 writes profile output
+        false
     };
 
     cl_options options;
@@ -191,6 +194,9 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         TCLAP::SwitchArg spike_output_arg(
             "f","spike_file_output","save spikes to file", cmd, false);
 
+        TCLAP::SwitchArg profile_only_zero_arg(
+            "z","profile-only-zero","only output profile information for rank 0", cmd, false);
+
         cmd.reorder_arguments();
         cmd.parse(argc, argv);
 
@@ -230,6 +236,8 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
                         update_option(options.file_extension, fopts, "file_extension");
                     }
 
+                    update_option(options.profile_only_zero, fopts, "profile_only_zero");
+
                 }
                 catch (std::exception& e) {
                     throw model_description_error(
@@ -255,6 +263,7 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         update_option(options.trace_prefix, trace_prefix_arg);
         update_option(options.trace_max_gid, trace_max_gid_arg);
         update_option(options.spike_file_output, spike_output_arg);
+        update_option(options.profile_only_zero, profile_only_zero_arg);
 
         if (options.all_to_all && options.ring) {
             throw usage_error("can specify at most one of --ring and --all-to-all");
diff --git a/miniapp/io.hpp b/miniapp/io.hpp
index ac769d436b6a36550afe2e64b33e0b35a69f3f80..3100de17441d1fcc01dc5eb87e42918892d8a9ff 100644
--- a/miniapp/io.hpp
+++ b/miniapp/io.hpp
@@ -35,6 +35,9 @@ struct cl_options {
     std::string output_path;
     std::string file_name;
     std::string file_extension;
+
+    // when true, only rank 0 writes profile output
+    bool profile_only_zero;
 };
 
 class usage_error: public std::runtime_error {
diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp
index 6094631b26ec449de143d8070c63171977c107bc..c52543004afc20f2e0143ad053623f4352d1e616 100644
--- a/miniapp/miniapp.cpp
+++ b/miniapp/miniapp.cpp
@@ -141,7 +141,7 @@ int main(int argc, char** argv) {
 
         // output profile and diagnostic feedback
         auto const num_steps = options.tfinal / options.dt;
-        util::profiler_output(0.001, m.num_cells()*num_steps);
+        util::profiler_output(0.001, m.num_cells()*num_steps, options.profile_only_zero);
         std::cout << "there were " << m.num_spikes() << " spikes\n";
 
         // save traces
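The new option is reachable two ways: the -z / --profile-only-zero switch added above, and a "profile_only_zero" entry in the miniapp's input file, read by the update_option(options.profile_only_zero, fopts, "profile_only_zero") call. A minimal sketch of such an input file follows, assuming the fopts file is JSON (as the key lookups suggest); the file name opts.json and the values are illustrative only.

```python
# Sketch: write a miniapp input file enabling rank-0-only profile output.
# Only the key names are taken from io.cpp; file name/values are examples.
import json

opts = {
    "file_extension": "gdf",     # also read from the same fopts block in io.cpp
    "profile_only_zero": True,   # only rank 0 writes its profile file
}

with open("opts.json", "w") as f:
    json.dump(opts, f, indent=2)
```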
diff --git a/scripts/profstats b/scripts/profstats
deleted file mode 100755
index 88f68c72e6253fa4c0d240d051d7ce1be1960604..0000000000000000000000000000000000000000
--- a/scripts/profstats
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/env python2
-#coding: utf-8
-
-import json
-import argparse
-import re
-import numpy as np
-from itertools import chain
-
-def parse_clargs():
-    P = argparse.ArgumentParser(description='Aggregate and analyse MPI profile output.')
-    P.add_argument('inputs', metavar='FILE', nargs='+',
-                   help='MPI profile output in JSON format')
-    P.add_argument('-r', '--raw', action='store_true',
-                   help='emit raw times in csv table')
-
-    return P.parse_args()
-
-def parse_profile_json(source):
-    j = json.load(source)
-    rank = j['rank']
-    if rank is None:
-        raise ValueError('missing rank information in profile')
-
-    tx = dict()
-
-    def collect_times(j, prefix):
-        t = j['time']
-        n = j['name']
-
-        if t is None or n is None:
-            return
-
-        prefix = prefix + n
-        tx[prefix] = t
-
-        try:
-            children = j['regions']
-            # special case for top level
-            if prefix == 'total':
-                prefix = ''
-            else:
-                prefix = prefix + '/'
-
-            for j in children:
-                collect_times(j, prefix)
-        except KeyError:
-            pass
-
-    collect_times(j['regions'], '')
-    return rank, tx
-
-def csv_escape(x):
-    s = re.sub('"','""',str(x))
-    if re.search('["\t\n,]',s):
-        s = '"'+s+'"'
-    return s
-
-def emit_csv(cols, rows):
-    print(",".join([csv_escape(c) for c in cols]))
-    for r in rows:
-        print(",".join([csv_escape(r[c]) if c in r else '' for c in cols]))
-
-args = parse_clargs()
-
-rank_times = dict()
-for filename in args.inputs:
-    with open(filename) as f:
-        rank, times = parse_profile_json(f)
-        rank_times[rank] = times
-
-if args.raw:
-    rows = [rank_times[rank] for rank in sorted(rank_times.keys())]
-    cols = sorted({col for tbl in rows for col in tbl.keys()})
-    emit_csv(cols, rows)
-else:
-    rank_entry = [rank_times[rank] for rank in sorted(rank_times.keys())]
-    bins = sorted({col for tbl in rank_entry for col in tbl.keys()})
-
-    rows = []
-    for b in bins:
-        qs = np.percentile([entry[b] for entry in rank_times.values() if b in entry],
-            [0., 0.25, 0.5, 0.75, 1.])
-        rows.append({
-            'region': b,
-            'min': qs[0],
-            'q25': qs[1],
-            'median': qs[2],
-            'q75': qs[3],
-            'max': qs[4]
-        })
-
-    emit_csv(['region','min','q25','median','q75','max'], rows)
diff --git a/scripts/profstats b/scripts/profstats
new file mode 120000
index 0000000000000000000000000000000000000000..8170d8312648ad82df61e58c3d3de18f02e0f3fb
--- /dev/null
+++ b/scripts/profstats
@@ -0,0 +1 @@
+profstats.py
\ No newline at end of file
diff --git a/scripts/profstats.py b/scripts/profstats.py
new file mode 100755
index 0000000000000000000000000000000000000000..86611e33a3b00e873b2698946e98ce4e19461789
--- /dev/null
+++ b/scripts/profstats.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+#coding: utf-8
+
+import json
+import argparse
+import re
+import numpy as np
+
+def parse_clargs():
+    P = argparse.ArgumentParser(description='Aggregate and analyse MPI profile output.')
+    P.add_argument('inputs', metavar='FILE', nargs='+',
+                   help='MPI profile output in JSON format')
+    P.add_argument('-r', '--raw', action='store_true',
+                   help='emit raw times in csv table')
+
+    return P.parse_args()
+
+def parse_profile_json(source):
+    j = json.load(source)
+    rank = j['rank']
+    if rank is None:
+        raise ValueError('missing rank information in profile')
+
+    tx = dict()
+
+    def collect_times(j, prefix):
+        t = j['time']
+        n = j['name']
+
+        if t is None or n is None:
+            return
+
+        prefix = prefix + n
+        tx[prefix] = t
+
+        try:
+            children = j['regions']
+            # special case for top level
+            if prefix == 'total':
+                prefix = ''
+            else:
+                prefix = prefix + '/'
+
+            for j in children:
+                collect_times(j, prefix)
+        except KeyError:
+            pass
+
+    collect_times(j['regions'], '')
+    return rank, tx
+
+def csv_escape(x):
+    s = re.sub('"','""',str(x))
+    if re.search('["\t\n,]',s):
+        s = '"'+s+'"'
+    return s
+
+def emit_csv(cols, rows, stdout):
+    stdout.write(",".join([csv_escape(c) for c in cols]))
+    stdout.write("\n")
+    for r in rows:
+        stdout.write(",".join([csv_escape(r[c]) if c in r else '' for c in cols]))
+        stdout.write("\n")
+
+def main(raw, inputs, stdout):
+    rank_times = dict()
+    for filename in inputs:
+        with open(filename) as f:
+            rank, times = parse_profile_json(f)
+            rank_times[rank] = times
+
+    if raw:
+        rows = [rank_times[rank] for rank in sorted(rank_times.keys())]
+        cols = sorted({col for tbl in rows for col in tbl.keys()})
+        emit_csv(cols, rows, stdout)
+    else:
+        rank_entry = [rank_times[rank] for rank in sorted(rank_times.keys())]
+        bins = sorted({col for tbl in rank_entry for col in tbl.keys()})
+
+        rows = []
+        for b in bins:
+            qs = np.percentile([entry[b] for entry in rank_times.values() if b in entry],
+                [0., 0.25, 0.5, 0.75, 1.])
+            rows.append({
+                'region': b,
+                'min': qs[0],
+                'q25': qs[1],
+                'median': qs[2],
+                'q75': qs[3],
+                'max': qs[4]
+            })
+
+        emit_csv(['region','min','q25','median','q75','max'], rows, stdout)
+
+if __name__ == "__main__":
+    import sys
+    args = parse_clargs()
+    main(args.raw, args.inputs, sys.stdout)
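Compared with the python2 script it replaces, the aggregation logic now lives in main(raw, inputs, stdout) rather than at module scope, so it can be driven programmatically. A sketch of that below, assuming scripts/ is on PYTHONPATH so profstats is importable; the synthetic profile mirrors the shape parse_profile_json() walks: a top-level "rank" plus a "regions" tree of {"name", "time", "regions"} nodes rooted at "total".

```python
# Sketch: drive profstats.main() with a synthetic profile and an
# injected output stream -- the stdout parameter is what makes this
# testable without capturing the process's real stdout.
import io
import json
import tempfile

from profstats import main  # assumes scripts/ is on PYTHONPATH

profile = {
    "rank": 0,
    "regions": {
        "name": "total", "time": 10.0,
        "regions": [
            {"name": "stepping",      "time": 8.0},
            {"name": "communication", "time": 2.0},
        ],
    },
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(profile, f)

out = io.StringIO()
main(raw=True, inputs=[f.name], stdout=out)
print(out.getvalue())  # csv header row, then one row of times for rank 0
```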
diff --git a/src/profiling/profiler.cpp b/src/profiling/profiler.cpp
index 896e7bcf6c11383bf21e2c2416149f1f1970527f..a6f08e96f579c142f579d550b450fe56e55d02b7 100644
--- a/src/profiling/profiler.cpp
+++ b/src/profiling/profiler.cpp
@@ -349,7 +349,7 @@ void profilers_restart() {
     }
 }
 
-void profiler_output(double threshold, std::size_t num_local_work_items) {
+void profiler_output(double threshold, std::size_t num_local_work_items, bool profile_only_zero) {
     profilers_stop();
 
     // Find the earliest start time and latest stop time over all profilers
@@ -385,6 +385,7 @@ void profiler_output(double threshold, std::size_t num_local_work_items) {
     auto ncomms = communication::global_policy::size();
     auto comm_rank = communication::global_policy::id();
     bool print = comm_rank==0 ? true : false;
+    bool output_this_rank = (comm_rank == 0) || !profile_only_zero;
 
     // calculate the throughput in terms of work items per second
     auto local_throughput = num_local_work_items / wall_time;
@@ -433,9 +434,11 @@ void profiler_output(double threshold, std::size_t num_local_work_items) {
     as_json["rank"] = comm_rank;
     as_json["regions"] = p.as_json();
 
-    auto fname = std::string("profile_" + std::to_string(comm_rank));
-    std::ofstream fid(fname);
-    fid << std::setw(1) << as_json;
+    if (output_this_rank) {
+        auto fname = std::string("profile_" + std::to_string(comm_rank));
+        std::ofstream fid(fname);
+        fid << std::setw(1) << as_json;
+    }
 }
 
 #else
@@ -445,7 +448,7 @@ void profiler_enter(const char*) {}
 void profiler_leave() {}
 void profiler_leave(int) {}
 void profilers_stop() {}
-void profiler_output(double threshold, std::size_t num_local_work_items) {}
+void profiler_output(double, std::size_t, bool) {}
 void profilers_restart() {};
 #endif
 
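profiler_output() writes one profile_<rank> file per emitting rank (every rank by default, only profile_0 when profile_only_zero is set), so the aggregation workflow works in both cases. A sketch of feeding whatever files are present to the new script; the glob pattern follows the fname construction above, and importing profstats again assumes scripts/ is on PYTHONPATH.

```python
# Sketch: summarise the per-rank "profile_<rank>" files with profstats.
import glob
import io

from profstats import main  # assumes scripts/ is on PYTHONPATH

files = sorted(glob.glob("profile_*"))
out = io.StringIO()
main(raw=False, inputs=files, stdout=out)  # per-region quartile summary
print(out.getvalue())
```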
diff --git a/src/profiling/profiler.hpp b/src/profiling/profiler.hpp
index b000de671c96b80d5268d797af090b61a55b0f9d..0747fbdcf556b77503628c2d6caa3d238c040c17 100644
--- a/src/profiling/profiler.hpp
+++ b/src/profiling/profiler.hpp
@@ -245,7 +245,8 @@ void profilers_stop();
 void profilers_restart();
 
 /// print the collated profiler to std::cout
-void profiler_output(double threshold, std::size_t num_local_work_items);
+/// (when profile_only_zero is set, only rank 0 writes its profile_<rank> JSON file)
+void profiler_output(double threshold, std::size_t num_local_work_items, bool profile_only_zero);
 
 } // namespace util
 } // namespace mc