diff --git a/CMakeLists.txt b/CMakeLists.txt index f91e332eecdd77b327846951ed9271d740e926df..bc45593eb2451be9818e2b75671d91341647e4e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,8 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(WITH_TBB OFF CACHE BOOL "use TBB for on-node threading" ) if(WITH_TBB) find_package(TBB REQUIRED) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_TBB ${TBB_DEFINITIONS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_DEFINITIONS}") + add_definitions(-DWITH_TBB) endif() # MPI support @@ -37,6 +38,12 @@ if(WITH_MPI) set_property(DIRECTORY APPEND_STRING PROPERTY COMPILE_OPTIONS "${MPI_C_COMPILE_FLAGS}") endif() +# Profiler support +set(WITH_PROFILING OFF CACHE BOOL "use built in profiling of miniapp" ) +if(WITH_PROFILING) + add_definitions(-DWITH_PROFILING) +endif() + # Cray systems set(SYSTEM_CRAY OFF CACHE BOOL "add flags for compilation on Cray systems") if(SYSTEM_CRAY) diff --git a/miniapp/io.cpp b/miniapp/io.cpp index 5b7a1cb338999c67a6885287fdccb02195322bf6..887848b549b7bad46574b92ac9fcde74faae5d5c 100644 --- a/miniapp/io.cpp +++ b/miniapp/io.cpp @@ -8,7 +8,7 @@ namespace io { // for now this is just a placeholder options read_options(std::string fname) { // 10 cells, 1 synapses per cell, 10 compartments per segment - return {20, 1, 100}; + return {200, 1, 100}; } std::ostream& operator<<(std::ostream& o, const options& opt) { diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp index 549386abd307db798127666913558ecf905a212d..4a619cf9a087dc7fa151ee9613adc6d5ff6357c4 100644 --- a/miniapp/miniapp.cpp +++ b/miniapp/miniapp.cpp @@ -11,8 +11,7 @@ #include "threading/threading.hpp" #include "profiling/profiler.hpp" #include "communication/communicator.hpp" -#include "communication/serial_global_policy.hpp" -#include "communication/mpi_global_policy.hpp" +#include "communication/global_policy.hpp" #include "util/optional.hpp" using namespace nest; @@ -22,13 +21,10 @@ using index_type = int; using id_type = uint32_t; using numeric_cell = mc::fvm::fvm_cell<real_type, index_type>; using cell_group = mc::cell_group<numeric_cell>; -#ifdef WITH_MPI -using communicator_type = - mc::communication::communicator<mc::communication::mpi_global_policy>; -#else + +using global_policy = nest::mc::communication::global_policy; using communicator_type = - mc::communication::communicator<mc::communication::serial_global_policy>; -#endif + mc::communication::communicator<global_policy>; using nest::mc::util::optional; @@ -36,17 +32,6 @@ struct model { communicator_type communicator; std::vector<cell_group> cell_groups; - double time_init; - double time_network; - double time_solve; - double time_comms; - void print_times() const { - std::cout << "initialization took " << time_init << " s\n"; - std::cout << "network took " << time_network << " s\n"; - std::cout << "solve took " << time_solve << " s\n"; - std::cout << "comms took " << time_comms << " s\n"; - } - int num_groups() const { return cell_groups.size(); } @@ -54,30 +39,32 @@ struct model { void run(double tfinal, double dt) { auto t = 0.; auto delta = communicator.min_delay(); - time_solve = 0.; - time_comms = 0.; while(t<tfinal) { - auto start_solve = mc::util::timer_type::tic(); mc::threading::parallel_for::apply( 0, num_groups(), [&](int i) { + mc::util::profiler_enter("stepping","events"); cell_groups[i].enqueue_events(communicator.queue(i)); + mc::util::profiler_leave(); cell_groups[i].advance(t+delta, dt); + mc::util::profiler_enter("events"); communicator.add_spikes(cell_groups[i].spikes()); cell_groups[i].clear_spikes(); + mc::util::profiler_leave(2); } ); - time_solve += mc::util::timer_type::toc(start_solve); - auto start_comms = mc::util::timer_type::tic(); + mc::util::profiler_enter("stepping", "exchange"); communicator.exchange(); - time_comms += mc::util::timer_type::toc(start_comms); + mc::util::profiler_leave(2); t += delta; } } void init_communicator() { + mc::util::profiler_enter("setup", "communicator"); + // calculate the source and synapse distribution serially std::vector<id_type> target_counts(num_groups()); std::vector<id_type> source_counts(num_groups()); @@ -91,9 +78,12 @@ struct model { // create connections communicator = communicator_type(num_groups(), target_counts); + + mc::util::profiler_leave(2); } void update_gids() { + mc::util::profiler_enter("setup", "globalize"); auto com_policy = communicator.communication_policy(); auto global_source_map = com_policy.make_map(source_map.back()); auto domain_idx = communicator.domain_id(); @@ -101,6 +91,7 @@ struct model { cell_groups[i].set_source_gids(source_map[i]+global_source_map[domain_idx]); cell_groups[i].set_target_gids(target_map[i]+communicator.target_gid_from_group_lid(0)); } + mc::util::profiler_leave(2); } // TODO : only stored here because init_communicator() and update_gids() are split @@ -184,7 +175,7 @@ namespace synapses { mc::cell make_cell(int compartments_per_segment, int num_synapses); /// do basic setup (initialize global state, print banner, etc) -void setup(int argc, char** argv); +void setup(); /// helper function for initializing cells cell_group make_lowered_cell(int cell_index, const mc::cell& c); @@ -193,34 +184,35 @@ cell_group make_lowered_cell(int cell_index, const mc::cell& c); void ring_model(nest::mc::io::options& opt, model& m); void all_to_all_model(nest::mc::io::options& opt, model& m); + /////////////////////////////////////// // main /////////////////////////////////////// int main(int argc, char** argv) { + nest::mc::communication::global_policy_guard global_guard(argc, argv); - setup(argc, argv); + setup(); // read parameters mc::io::options opt; try { opt = mc::io::read_options(""); - if (mc::mpi::rank()==0) { + if (!global_policy::id()) { std::cout << opt << "\n"; } } - catch (std::exception e) { + catch (std::exception& e) { std::cerr << e.what() << std::endl; exit(1); } model m; - //ring_model(opt, m); all_to_all_model(opt, m); - ///////////////////////////////////////////////////// + // // time stepping - ///////////////////////////////////////////////////// - auto tfinal = 50.; + // + auto tfinal = 20.; auto dt = 0.01; auto id = m.communicator.domain_id(); @@ -230,14 +222,16 @@ int main(int argc, char** argv) { } m.run(tfinal, dt); + + mc::util::profiler_output(0.001); + if (!id) { - m.print_times(); std::cout << "there were " << m.communicator.num_spikes() << " spikes\n"; } m.dump_traces(); #ifdef SPLAT - if (!mc::mpi::rank()) { + if (!global_policy::id()) { //for (auto i=0u; i<m.cell_groups.size(); ++i) { m.cell_groups[0].splat("cell0.txt"); m.cell_groups[1].splat("cell1.txt"); @@ -245,10 +239,6 @@ int main(int argc, char** argv) { //} } #endif - -#ifdef WITH_MPI - mc::mpi::finalize(); -#endif } /////////////////////////////////////// @@ -264,7 +254,6 @@ void ring_model(nest::mc::io::options& opt, model& m) { auto basic_cell = make_cell(opt.compartments_per_segment, 1); // make a vector for storing all of the cells - auto start_init = mc::util::timer_type::tic(); m.cell_groups = std::vector<cell_group>(opt.cells); // initialize the cells in parallel @@ -272,15 +261,17 @@ void ring_model(nest::mc::io::options& opt, model& m) { 0, opt.cells, [&](int i) { // initialize cell + mc::util::profiler_enter("setup"); + mc::util::profiler_enter("make cell"); m.cell_groups[i] = make_lowered_cell(i, basic_cell); + mc::util::profiler_leave(); + mc::util::profiler_leave(); } ); - m.time_init = mc::util::timer_type::toc(start_init); // // network creation // - auto start_network = mc::util::timer_type::tic(); m.init_communicator(); for (auto i=0u; i<(id_type)opt.cells; ++i) { @@ -290,24 +281,18 @@ void ring_model(nest::mc::io::options& opt, model& m) { }); } - m.communicator.construct(); - m.update_gids(); - - m.time_network = mc::util::timer_type::toc(start_network); } void all_to_all_model(nest::mc::io::options& opt, model& m) { // // make cells // - auto timer = mc::util::timer_type(); // make a basic cell auto basic_cell = make_cell(opt.compartments_per_segment, opt.cells-1); // make a vector for storing all of the cells - auto start_init = timer.tic(); id_type ncell_global = opt.cells; id_type ncell_local = ncell_global / m.communicator.num_domains(); int remainder = ncell_global - (ncell_local*m.communicator.num_domains()); @@ -321,15 +306,15 @@ void all_to_all_model(nest::mc::io::options& opt, model& m) { mc::threading::parallel_for::apply( 0, ncell_local, [&](int i) { + mc::util::profiler_enter("setup", "cells"); m.cell_groups[i] = make_lowered_cell(i, basic_cell); + mc::util::profiler_leave(2); } ); - m.time_init = timer.toc(start_init); // // network creation // - auto start_network = timer.tic(); m.init_communicator(); // monitor soma and dendrite on a few cells @@ -348,6 +333,7 @@ void all_to_all_model(nest::mc::io::options& opt, model& m) { m.cell_groups[lid].add_sampler(m.make_simple_sampler(probe_dend, "vdend", gid, sample_dt)); } + mc::util::profiler_enter("setup", "connections"); // lid is local cell/group id for (auto lid=0u; lid<ncell_local; ++lid) { auto target = m.communicator.target_gid_from_group_lid(lid); @@ -364,35 +350,24 @@ void all_to_all_model(nest::mc::io::options& opt, model& m) { } m.communicator.construct(); + mc::util::profiler_leave(2); m.update_gids(); - - m.time_network = timer.toc(start_network); } /////////////////////////////////////// // function definitions /////////////////////////////////////// -void setup(int argc, char** argv) { -#ifdef WITH_MPI - mc::mpi::init(&argc, &argv); - +void setup() { // print banner - if (mc::mpi::rank()==0) { + if (!global_policy::id()) { std::cout << "====================\n"; std::cout << " starting miniapp\n"; std::cout << " - " << mc::threading::description() << " threading support\n"; - std::cout << " - MPI support\n"; + std::cout << " - communication policy: " << global_policy::name() << "\n"; std::cout << "====================\n"; } -#else - // print banner - std::cout << "====================\n"; - std::cout << " starting miniapp\n"; - std::cout << " - " << mc::threading::description() << " threading support\n"; - std::cout << "====================\n"; -#endif // setup global state for the mechanisms mc::mechanisms::setup_mechanism_helpers(); @@ -409,8 +384,8 @@ mc::cell make_cell(int compartments_per_segment, int num_synapses) { // add dendrite of length 200 um and diameter 1 um with passive channel std::vector<mc::cable_segment*> dendrites; dendrites.push_back(cell.add_cable(0, mc::segmentKind::dendrite, 0.5, 0.5, 200)); - dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100)); - dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100)); + //dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100)); + //dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100)); for (auto d : dendrites) { d->add_mechanism(mc::pas_parameters()); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5c3ae6c78627d93c78202a6cab72bab1c5d4c621..e7bced8530908d637ecec5112fdb5e581e652e9a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,8 +5,10 @@ set(BASE_SOURCES cell.cpp mechanism_interface.cpp parameter_list.cpp + profiling/profiler.cpp swcio.cpp ) + if(${WITH_MPI}) set(BASE_SOURCES ${BASE_SOURCES} communication/mpi.cpp) endif() diff --git a/src/cell_group.hpp b/src/cell_group.hpp index 7b6f5bedd1b80705631a5a82cb965dc227e9768b..922b7b197091af6cf4ba7ecc345030bcff3e380e 100644 --- a/src/cell_group.hpp +++ b/src/cell_group.hpp @@ -8,6 +8,8 @@ #include <communication/spike.hpp> #include <communication/spike_source.hpp> +#include <profiling/profiler.hpp> + namespace nest { namespace mc { @@ -114,6 +116,7 @@ public: // integrate cell state cell_.advance(tnext - cell_.time()); + nest::mc::util::profiler_enter("events"); // check for new spikes for (auto& s : spike_sources_) { if (auto spike = s.source.test(cell_, cell_.time())) { @@ -131,6 +134,7 @@ public: cell_.apply_event(e.get()); } } + nest::mc::util::profiler_leave(); } } diff --git a/src/communication/global_policy.hpp b/src/communication/global_policy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b771e8ed24d5d022b6a23cc7d77171ca477a7dbf --- /dev/null +++ b/src/communication/global_policy.hpp @@ -0,0 +1,42 @@ +#pragma once + +#ifdef WITH_MPI + #include "communication/mpi_global_policy.hpp" +#else + #include "communication/serial_global_policy.hpp" +#endif + +namespace nest { +namespace mc { +namespace communication { + +#ifdef WITH_MPI +using global_policy = nest::mc::communication::mpi_global_policy; +#else +using global_policy = nest::mc::communication::serial_global_policy; +#endif + +template <typename Policy> +struct policy_guard { + using policy_type = Policy; + + policy_guard(int argc, char**& argv) { + policy_type::setup(argc, argv); + } + + policy_guard() = delete; + policy_guard(policy_guard&&) = delete; + policy_guard(const policy_guard&) = delete; + policy_guard& operator=(policy_guard&&) = delete; + policy_guard& operator=(const policy_guard&) = delete; + + ~policy_guard() { + Policy::teardown(); + } +}; + +using global_policy_guard = policy_guard<global_policy>; + +} // namespace communication +} // namespace mc +} // namespace nest diff --git a/src/communication/mpi_global_policy.hpp b/src/communication/mpi_global_policy.hpp index 79135437e73d21f0a49ac91c6408a33e4b6d61ef..7409585447ae888eec2aa47ddc1fa0f137df7d6e 100644 --- a/src/communication/mpi_global_policy.hpp +++ b/src/communication/mpi_global_policy.hpp @@ -1,5 +1,9 @@ #pragma once +#ifndef WITH_MPI +#error "mpi_global_policy.hpp should only be compiled in a WITH_MPI build" +#endif + #include <type_traits> #include <vector> @@ -9,7 +13,6 @@ #include <communication/mpi.hpp> #include <algorithms.hpp> - namespace nest { namespace mc { namespace communication { @@ -17,22 +20,22 @@ namespace communication { struct mpi_global_policy { using id_type = uint32_t; - std::vector<spike<id_type>> const - gather_spikes(const std::vector<spike<id_type>>& local_spikes) { + std::vector<spike<id_type>> + static gather_spikes(const std::vector<spike<id_type>>& local_spikes) { return mpi::gather_all(local_spikes); } - int id() const { return mpi::rank(); } + static int id() { return mpi::rank(); } - int size() const { return mpi::size(); } + static int size() { return mpi::size(); } template <typename T> - T min(T value) const { + static T min(T value) { return nest::mc::mpi::reduce(value, MPI_MIN); } template <typename T> - T max(T value) const { + static T max(T value) { return nest::mc::mpi::reduce(value, MPI_MAX); } @@ -40,11 +43,24 @@ struct mpi_global_policy { typename T, typename = typename std::enable_if<std::is_integral<T>::value> > - std::vector<T> make_map(T local) { + static std::vector<T> make_map(T local) { return algorithms::make_index(mpi::gather_all(local)); } + + static void setup(int& argc, char**& argv) { + nest::mc::mpi::init(&argc, &argv); + } + + static void teardown() { + nest::mc::mpi::finalize(); + } + + static const char* name() { return "MPI"; } + +private: }; } // namespace communication } // namespace mc } // namespace nest + diff --git a/src/communication/serial_global_policy.hpp b/src/communication/serial_global_policy.hpp index eaa6c4b11168e4e11f0c56df79631548ccb02444..a1c7fee9671cb79768e255a710e0082386302e8a 100644 --- a/src/communication/serial_global_policy.hpp +++ b/src/communication/serial_global_policy.hpp @@ -15,7 +15,7 @@ struct serial_global_policy { using id_type = uint32_t; std::vector<spike<id_type>> const - gather_spikes(const std::vector<spike<id_type>>& local_spikes) { + static gather_spikes(const std::vector<spike<id_type>>& local_spikes) { return local_spikes; } @@ -28,12 +28,12 @@ struct serial_global_policy { } template <typename T> - T min(T value) const { + static T min(T value) { return value; } template <typename T> - T max(T value) const { + static T max(T value) { return value; } @@ -41,9 +41,13 @@ struct serial_global_policy { typename T, typename = typename std::enable_if<std::is_integral<T>::value> > - std::vector<T> make_map(T local) { + static std::vector<T> make_map(T local) { return {T(0), local}; } + + static void setup(int& argc, char**& argv) {} + static void teardown() {} + static const char* name() { return "serial"; } }; } // namespace communication diff --git a/src/fvm_cell.hpp b/src/fvm_cell.hpp index 678ee5e1fec5ad020e9a1d10fd5af0edb00ec887..947b7a7ba233340274f5113c12d075d7851a1ea1 100644 --- a/src/fvm_cell.hpp +++ b/src/fvm_cell.hpp @@ -17,10 +17,12 @@ #include <segment.hpp> #include <stimulus.hpp> #include <util.hpp> +#include <profiling/profiler.hpp> #include <vector/include/Vector.hpp> #include <mechanisms/expsyn.hpp> + namespace nest { namespace mc { namespace fvm { @@ -113,9 +115,6 @@ public: /// make a time step void advance(value_type dt); - /// advance solution to target time tfinal with maximum step size dt - void advance_to(value_type tfinal, value_type dt); - /// pass an event to the appropriate synapse and call net_receive void apply_event(postsynaptic_spike_event e) { mechanisms_[synapse_index_]->net_receive(e.target, e.weight); @@ -188,11 +187,6 @@ private: std::vector<std::pair<uint32_t, i_clamp>> stimulii_; std::vector<std::pair<const vector_type fvm_cell::*, uint32_t>> probes_; - - /* - /// spike event queue - event_queue<postsynaptic_spike_event> events_; - */ }; //////////////////////////////////////////////////////////////////////////////// @@ -505,12 +499,15 @@ void fvm_cell<T, I>::advance(T dt) { using memory::all; + mc::util::profiler_enter("current"); current_(all) = 0.; // update currents from ion channels for(auto& m : mechanisms_) { + mc::util::profiler_enter(m->name().c_str()); m->set_params(t_, dt); m->nrn_current(); + mc::util::profiler_leave(); } // add current contributions from stimulii @@ -521,42 +518,29 @@ void fvm_cell<T, I>::advance(T dt) // the factor of 100 scales the injected current to 10^2.nA current_[loc] -= 100.*ie/cv_areas_[loc]; } + mc::util::profiler_leave(); + mc::util::profiler_enter("matrix", "setup"); // solve the linear system setup_matrix(dt); + mc::util::profiler_leave(); mc::util::profiler_enter("solve"); matrix_.solve(); + mc::util::profiler_leave(); voltage_(all) = matrix_.rhs(); + mc::util::profiler_leave(); + mc::util::profiler_enter("state"); // integrate state of gating variables etc. for(auto& m : mechanisms_) { + mc::util::profiler_enter(m->name().c_str()); m->nrn_state(); + mc::util::profiler_leave(); } + mc::util::profiler_leave(); t_ += dt; } -/* -template <typename T, typename I> -void fvm_cell<T, I>::advance_to(T tfinal, T dt) -{ - if(t_>=tfinal) { - return; - } - - do { - auto tstep = std::min(tfinal, t_+dt); - auto next = events_.pop_if_before(tstep); - auto tnext = next? next->time: tstep; - - advance(tnext-t_); - t_ = tnext; - if (next) { // handle event - mechanisms_[synapse_index_]->net_receive(next->target, next->weight); - } - } while(t_<tfinal); -} -*/ - } // namespace fvm } // namespace mc } // namespace nest diff --git a/src/profiling/profiler.cpp b/src/profiling/profiler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c07351e296ba8873ed7064ca4b605d2e9bb0cb8d --- /dev/null +++ b/src/profiling/profiler.cpp @@ -0,0 +1,375 @@ +#include "profiler.hpp" + +#include <communication/global_policy.hpp> + +namespace nest { +namespace mc { +namespace util { + +///////////////////////////////////////////////////////// +// profiler_node +///////////////////////////////////////////////////////// +void profiler_node::print(int indent) { + std::string s = std::string(indent, ' ') + name; + std::cout << s + << std::string(60-s.size(), '.') + << value + << "\n"; + for (auto& n : children) { + n.print(indent+2); + } +} + +void profiler_node::print(std::ostream& stream, double threshold) { + // convert threshold from proportion to time + threshold *= value; + print_sub(stream, 0, threshold, value); +} + +void profiler_node::print_sub( + std::ostream& stream, + int indent, + double threshold, + double total) +{ + char buffer[512]; + + if (value < threshold) { + std::cout << green("not printing ") << name << std::endl; + return; + } + + auto max_contribution = + std::accumulate( + children.begin(), children.end(), -1., + [] (double lhs, const profiler_node& rhs) { + return lhs > rhs.value ? lhs : rhs.value; + } + ); + + // print the table row + auto const indent_str = std::string(indent, ' '); + auto label = indent_str + name; + float percentage = 100.*value/total; + snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f", + label.c_str(), + float(value), + float(percentage)); + bool print_children = + threshold==0. ? children.size()>0 + : max_contribution >= threshold; + + stream << (print_children ? white(buffer) : buffer) << "\n"; + + if (print_children) { + auto other = 0.; + for (auto &n : children) { + if (n.value<threshold || n.name=="other") { + other += n.value; + } + else { + n.print_sub(stream, indent + 2, threshold, total); + } + } + if (other>=std::max(threshold, 0.001) && children.size()) { + label = indent_str + " other"; + percentage = 100.*other/total; + snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f", + label.c_str(), float(other), percentage); + stream << buffer << std::endl; + } + } +} + +void profiler_node::fuse(const profiler_node& other) { + for (auto& n : other.children) { + auto it = std::find(children.begin(), children.end(), n); + if (it!=children.end()) { + (*it).fuse(n); + } + else { + children.push_back(n); + } + } + + value += other.value; +} + +double profiler_node::time_in_other() const { + auto o = std::find_if( + children.begin(), children.end(), + [](const profiler_node& n) { + return n.name == std::string("other"); + } + ); + return o==children.end() ? 0. : o->value; +} + +void profiler_node::scale(double factor) { + value *= factor; + for (auto& n : children) { + n.scale(factor); + } +} + +profiler_node::json profiler_node::as_json() const { + json node; + node["name"] = name; + node["time"] = value; + for (const auto& n : children) { + node["regions"].push_back(n.as_json()); + } + return node; +} + +profiler_node operator+ (const profiler_node& lhs, const profiler_node& rhs) { + assert(lhs.name == rhs.name); + auto node = lhs; + node.fuse(rhs); + return node; +} + +bool operator== (const profiler_node& lhs, const profiler_node& rhs) { + return lhs.name == rhs.name; +} + +///////////////////////////////////////////////////////// +// region_type +///////////////////////////////////////////////////////// +region_type* region_type::subregion(const char* n) { + size_t hsh = impl::hash(n); + auto s = subregions_.find(hsh); + if (s == subregions_.end()) { + subregions_[hsh] = util::make_unique<region_type>(n, this); + return subregions_[hsh].get(); + } + return s->second.get(); +} + +double region_type::subregion_contributions() const { + return + std::accumulate( + subregions_.begin(), subregions_.end(), 0., + [](double l, decltype(*(subregions_.begin())) r) { + return l+r.second->total(); + } + ); +} + +profiler_node region_type::populate_performance_tree() const { + profiler_node tree(total(), name()); + + for (auto &it : subregions_) { + tree.children.push_back(it.second->populate_performance_tree()); + } + + // sort the contributions in descending order + std::stable_sort( + tree.children.begin(), tree.children.end(), + [](const profiler_node& lhs, const profiler_node& rhs) { + return lhs.value>rhs.value; + } + ); + + if (tree.children.size()) { + // find the contribution of parts of the code that were not explicitly profiled + auto contributions = + std::accumulate( + tree.children.begin(), tree.children.end(), 0., + [](double v, profiler_node& n) { + return v+n.value; + } + ); + auto other = total() - contributions; + + // add the "other" category + tree.children.emplace_back(other, std::string("other")); + } + + return tree; +} + +///////////////////////////////////////////////////////// +// region_type +///////////////////////////////////////////////////////// +void profiler::enter(const char* name) { + if (!is_activated()) return; + current_region_ = current_region_->subregion(name); + current_region_->start_time(); +} + +void profiler::leave() { + if (!is_activated()) return; + if (current_region_->parent()==nullptr) { + throw std::out_of_range("attempt to leave root memory tracing region"); + } + current_region_->end_time(); + current_region_ = current_region_->parent(); +} + +void profiler::leave(int n) { + EXPECTS(n>=1); + + while(n--) { + leave(); + } +} + +void profiler::start() { + if (is_activated()) { + throw std::out_of_range( + "attempt to start an already running profiler" + ); + } + activate(); + start_time_ = timer_type::tic(); + root_region_.start_time(); +} + +void profiler::stop() { + if (!is_in_root()) { + throw std::out_of_range( + "profiler must be in root region when stopped" + ); + } + root_region_.end_time(); + stop_time_ = timer_type::tic(); + + deactivate(); +} + +profiler_node profiler::performance_tree() { + if (is_activated()) { + stop(); + } + return root_region_.populate_performance_tree(); +} + + +#ifdef WITH_PROFILING +namespace data { + profiler_wrapper profilers_(profiler("root")); +} + +profiler& get_profiler() { + auto& p = data::profilers_.local(); + if (!p.is_activated()) { + p.start(); + } + return p; +} + +// this will throw an exception if the profler has already been started +void profiler_start() { + data::profilers_.local().start(); +} +void profiler_stop() { + get_profiler().stop(); +} +void profiler_enter(const char* n) { + get_profiler().enter(n); +} + +void profiler_leave() { + get_profiler().leave(); +} +void profiler_leave(int nlevels) { + get_profiler().leave(nlevels); +} + +// iterate over all profilers and ensure that they have the same start stop times +void stop_profilers() { + for (auto& p : data::profilers_) { + p.stop(); + } +} + +void profiler_output(double threshold) { + stop_profilers(); + + // Find the earliest start time and latest stop time over all profilers + // This can be used to calculate the wall time for this communicator. + // The min-max values are used because, for example, the individual + // profilers might start at different times. In this case, the time stamp + // when the first profiler started is taken as the start time of the whole + // measurement period. Likewise for the last profiler to stop. + auto start_time = data::profilers_.begin()->start_time(); + auto stop_time = data::profilers_.begin()->stop_time(); + for(auto& p : data::profilers_) { + start_time = std::min(start_time, p.start_time()); + stop_time = std::max(stop_time, p.stop_time()); + } + // calculate the wall time + auto wall_time = timer_type::difference(start_time, stop_time); + // calculate the accumulated wall time over all threads + auto nthreads = data::profilers_.size(); + auto thread_wall = wall_time * nthreads; + + // gather the profilers into one accumulated profile over all threads + auto thread_measured = 0.; // accumulator for the time measured in each thread + auto p = profiler_node(0, "total"); + for(auto& thread_profiler : data::profilers_) { + auto tree = thread_profiler.performance_tree(); + thread_measured += tree.value - tree.time_in_other(); + p.fuse(thread_profiler.performance_tree()); + } + auto efficiency = 100. * thread_measured / thread_wall; + + p.scale(1./nthreads); + + auto ncomms = communication::global_policy::size(); + auto comm_rank = communication::global_policy::id(); + bool print = comm_rank==0 ? true : false; + if(print) { + std::cout << " ---------------------------------------------------- \n"; + std::cout << "| profiler |\n"; + std::cout << " ---------------------------------------------------- \n"; + char line[128]; + std::snprintf( + line, sizeof(line), "%-18s%10.3f s\n", + "wall time", float(wall_time)); + std::cout << line; + std::snprintf( + line, sizeof(line), "%-18s%10d\n", + "communicators", int(ncomms)); + std::cout << line; + std::snprintf( + line, sizeof(line), "%-18s%10d\n", + "threads", int(nthreads)); + std::cout << line; + std::snprintf( + line, sizeof(line), "%-18s%10.2f %%\n", + "thread efficiency", float(efficiency)); + std::cout << line << "\n"; + p.print(std::cout, threshold); + + std::cout << "\n\n"; + } + + nlohmann::json as_json; + as_json["wall time"] = wall_time; + as_json["threads"] = nthreads; + as_json["efficiency"] = efficiency; + as_json["communicators"] = ncomms; + as_json["rank"] = comm_rank; + as_json["regions"] = p.as_json(); + + auto fname = std::string("profile_" + std::to_string(comm_rank)); + std::ofstream fid(fname); + fid << std::setw(1) << as_json; +} + +#else +void profiler_start() {} +void profiler_stop() {} +void profiler_enter(const char*) {} +void profiler_leave() {} +void profiler_leave(int) {} +void stop_profilers() {} +void profiler_output(double threshold) {} +#endif + +} // namespace util +} // namespace mc +} // namespace nest + diff --git a/src/profiling/profiler.hpp b/src/profiling/profiler.hpp index 11fbfb4fe34c9894f87c5c6fb6ec4103f4d31c60..7230697baab21f914fa3b92f4ffbed4382b062b6 100644 --- a/src/profiling/profiler.hpp +++ b/src/profiling/profiler.hpp @@ -12,168 +12,73 @@ #include <cassert> #include <cstdlib> +#include <json/src/json.hpp> + #include <threading/threading.hpp> +#include <util.hpp> namespace nest { namespace mc { namespace util { -static inline std::string green(std::string s) { return s; } -static inline std::string yellow(std::string s) { return s; } -static inline std::string white(std::string s) { return s; } -static inline std::string red(std::string s) { return s; } -static inline std::string cyan(std::string s) { return s; } - -namespace impl { +inline std::string green(std::string s) { return s; } +inline std::string yellow(std::string s) { return s; } +inline std::string white(std::string s) { return s; } +inline std::string red(std::string s) { return s; } +inline std::string cyan(std::string s) { return s; } - static inline - size_t hash(std::string const& s) - { - size_t h = 5381; - for(auto c: s) { - h = ((h << 5) + h) + int(c); - } - return h; - } +using timer_type = nest::mc::threading::timer; +namespace impl { + /// simple hashing function for strings static inline - size_t hash(char* s) - { + size_t hash(const char* s) { size_t h = 5381; - - while(*s) { + while (*s) { h = ((h << 5) + h) + int(*s); ++s; } return h; } - struct profiler_node { - double value; - std::string name; - std::vector<profiler_node> children; - - profiler_node() - : value(0.), name("") - {} - - profiler_node(double v, std::string const& n) - : value(v), name(n) - {} - - void print(int indent=0) - { - std::string s = std::string(indent, ' ') + name; - std::cout << s - << std::string(60-s.size(), '.') - << value - << "\n"; - for(auto &n: children) { - n.print(indent+2); - } - } - - friend profiler_node operator +(profiler_node const& lhs, profiler_node const& rhs) - { - assert(lhs.name == rhs.name); - auto node = lhs; - node.fuse(rhs); - return node; - } - - friend bool operator ==(profiler_node const& lhs, profiler_node const& rhs) - { - return lhs.name == rhs.name; - } - - void print(std::ostream& stream, double threshold) - { - // convert threshold from proportion to time - threshold *= value; - print_sub(stream, 0, threshold, value); - } - - void print_sub(std::ostream& stream, - int indent, - double threshold, - double total) - { - char buffer[512]; - - if(value < threshold) { - std::cout << green("not printing ") << name << std::endl; - return; - } - - auto max_contribution = - std::accumulate( - children.begin(), children.end(), -1., - [] (double lhs, profiler_node const& rhs) { - return lhs > rhs.value ? lhs : rhs.value; - } - ); - - // print the table row - auto const indent_str = std::string(indent, ' '); - auto label = indent_str + name; - float percentage = 100.*value/total; - snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f", - label.c_str(), - float(value), - float(percentage)); - bool print_children = - threshold==0. ? children.size()>0 - : max_contribution >= threshold; - - if(print_children) { - stream << white(buffer) << std::endl; - } - else { - stream << buffer << std::endl; - } - - if(print_children) { - auto other = 0.; - for(auto &n : children) { - if(n.value<threshold || n.name=="other") { - other += n.value; - } - else { - n.print_sub(stream, indent + 2, threshold, total); - } - } - if(other >= threshold && children.size()) { - label = indent_str + " other"; - percentage = 100.*other/total; - snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f", - label.c_str(), float(other), percentage); - stream << buffer << std::endl; - } - } - } + /// std::string overload for hash + static inline + size_t hash(const std::string& s) { + return hash(s.c_str()); + } +} // namespace impl - void fuse(profiler_node const& other) - { - for(auto const& n : other.children) { - // linear search isn't ideal... - auto const it = std::find(children.begin(), children.end(), n); - if(it!=children.end()) { - (*it).fuse(n); - } - else { - children.push_back(n); - } - } - - value += other.value; - } +/// The tree data structure that is generated by post-processing of +/// a profiler. +struct profiler_node { + double value; + std::string name; + std::vector<profiler_node> children; + using json = nlohmann::json; - }; + profiler_node() : + value(0.), name("") + {} + profiler_node(double v, const std::string& n) : + value(v), name(n) + {} -} // namespace impl + void print(int indent=0); + void print(std::ostream& stream, double threshold); + void print_sub(std::ostream& stream, int indent, double threshold, double total); + void fuse(const profiler_node& other); + /// return wall time spend in "other" region + double time_in_other() const; + /// scale the value in each node by factor + /// performed to all children recursively + void scale(double factor); + + json as_json() const; +}; -using timer_type = nest::mc::threading::timer; +profiler_node operator+ (const profiler_node& lhs, const profiler_node& rhs); +bool operator== (const profiler_node& lhs, const profiler_node& rhs); // a region in the profiler, has // - name @@ -183,215 +88,153 @@ class region_type { region_type *parent_ = nullptr; std::string name_; size_t hash_; - std::unordered_map< - size_t, - std::unique_ptr<region_type> - > subregions_; + std::unordered_map<size_t, std::unique_ptr<region_type>> subregions_; timer_type::time_point start_time_; double total_time_ = 0; public: - using profiler_node = impl::profiler_node; - - explicit region_type(std::string const& n) - : name_(n) - { - start_time_ = timer_type::tic(); - hash_ = impl::hash(n); - } - - - explicit region_type(const char* n) - : region_type(std::string(n)) + explicit region_type(std::string n) : + name_(std::move(n)), + hash_(impl::hash(n)), + start_time_(timer_type::tic()) {} - std::string const& name() const { - return name_; - } - - void name(std::string const& n) { - name_ = n; - } - - region_type* parent() { - return parent_; - } - - void start_time() { start_time_ = timer_type::tic(); } - void end_time () { total_time_ += timer_type::toc(start_time_); } + explicit region_type(const char* n) : + region_type(std::string(n)) + {} - region_type(std::string const& n, region_type* p) - : region_type(n) + region_type(std::string n, region_type* p) : + region_type(std::move(n)) { parent_ = p; } - bool has_subregions() const { - return subregions_.size() > 0; - } + const std::string& name() const { return name_; } + void name(std::string n) { name_ = std::move(n); } - size_t hash () const { - return hash_; - } + region_type* parent() { return parent_; } - region_type* subregion(const char* n) - { - size_t hsh = impl::hash(n); - auto s = subregions_.find(hsh); - if(s == subregions_.end()) { - subregions_[hsh] = util::make_unique<region_type>(n, this); - return subregions_[hsh].get(); - } - return s->second.get(); - } - - double subregion_contributions() const - { - return - std::accumulate( - subregions_.begin(), subregions_.end(), 0., - [](double l, decltype(*(subregions_.begin())) r) { - return l+r.second->total(); - } - ); - } + void start_time() { start_time_ = timer_type::tic(); } + void end_time () { total_time_ += timer_type::toc(start_time_); } + double total() const { return total_time_; } - double total() const - { - return total_time_; - } + bool has_subregions() const { return subregions_.size() > 0; } - profiler_node populate_performance_tree() const { - profiler_node tree(total(), name()); + size_t hash() const { return hash_; } - for(auto &it : subregions_) { - tree.children.push_back(it.second->populate_performance_tree()); - } + region_type* subregion(const char* n); - // sort the contributions in descending order - std::stable_sort( - tree.children.begin(), tree.children.end(), - [](profiler_node const& lhs, profiler_node const& rhs) { - return lhs.value>rhs.value; - } - ); - - if(tree.children.size()) { - // find the contribution of parts of the code that were not explicitly profiled - auto contributions = - std::accumulate( - tree.children.begin(), tree.children.end(), 0., - [](double v, profiler_node& n) { - return v+n.value; - } - ); - auto other = total() - contributions; - - // add the "other" category - tree.children.emplace_back(other, std::string("other")); - } + double subregion_contributions() const; - return tree; - } + profiler_node populate_performance_tree() const; }; -class Profiler { +class profiler { public: - Profiler(std::string const& name) - : root_region_(name) - { } + profiler(std::string name) : + root_region_(std::move(name)) + {} // the copy constructor doesn't do a "deep copy" - // it simply creates a new Profiler with the same name + // it simply creates a new profiler with the same name // This is needed for tbb to create a list of thread local profilers - Profiler(Profiler const& other) - : Profiler(other.root_region_.name()) + profiler(const profiler& other) : + profiler(other.root_region_.name()) {} - void enter(const char* name) - { - if(!is_activated()) return; - auto start = timer_type::tic(); - current_region_ = current_region_->subregion(name); - current_region_->start_time(); - self_time_ += timer_type::toc(start); - } + /// step down into level with name + void enter(const char* name); - void leave() - { - if(!is_activated()) return; - auto start = timer_type::tic(); - if(current_region_->parent()==nullptr) { - std::cout << "error" << std::endl; - throw std::out_of_range("attempt to leave root memory tracing region"); - } - current_region_->end_time(); - current_region_ = current_region_->parent(); - self_time_ += timer_type::toc(start); - } + /// step up one level + void leave(); - region_type& regions() - { - return root_region_; - } + /// step up multiple n levels in one call + void leave(int n); - region_type* current_region() - { - return current_region_; - } + /// return a reference to the root region + region_type& regions() { return root_region_; } - double self_time() const - { - return self_time_; - } + /// return a pointer to the current region + region_type* current_region() { return current_region_; } - bool is_in_root() const - { - return &root_region_ == current_region_; - } + /// return if in the root region (i.e. the highest level) + bool is_in_root() const { return &root_region_ == current_region_; } - bool is_activated() const { - return activated_; - } + /// return if the profiler has been activated + bool is_activated() const { return activated_; } - void start() { - if(is_activated()) { - throw std::out_of_range( - "attempt to start an already running profiler" - ); - } - activate(); - root_region_.start_time(); - } + /// start (activate) the profiler + void start(); - void stop() { - if(!is_in_root()) { - throw std::out_of_range( - "attempt to profiler that is not in the root region" - ); - } - root_region_.end_time(); - disactivate(); - } + /// stop (deactivate) the profiler + void stop(); - region_type::profiler_node performance_tree() { - if(is_activated()) { - stop(); - } - return root_region_.populate_performance_tree(); + /// the time stamp at which the profiler was started (avtivated) + timer_type::time_point start_time() const { return start_time_; } + + /// the time stamp at which the profiler was stopped (deavtivated) + timer_type::time_point stop_time() const { return stop_time_; } + + /// the time in seconds between activation and deactivation of the profiler + double wall_time() const { + return timer_type::difference(start_time_, stop_time_); } + /// stop the profiler then generate the performance tree ready for output + profiler_node performance_tree(); + private: - void activate() { activated_ = true; } - void disactivate() { activated_ = false; } + void activate() { activated_ = true; } + void deactivate() { activated_ = false; } + timer_type::time_point start_time_; + timer_type::time_point stop_time_; bool activated_ = false; region_type root_region_; region_type* current_region_ = &root_region_; - double self_time_ = 0.; }; +#ifdef WITH_PROFILING +namespace data { + using profiler_wrapper = nest::mc::threading::enumerable_thread_specific<profiler>; + extern profiler_wrapper profilers_; +} +#endif + +/// get a reference to the thread private profiler +/// will lazily create and start the profiler it it has not already been done so +profiler& get_profiler(); + +/// start thread private profiler +void profiler_start(); + +/// stop thread private profiler +void profiler_stop(); + +/// enter a profiling region with name n +void profiler_enter(const char* n); + +/// enter nested profiler regions in a single call +template <class...Args> +void profiler_enter(const char* n, Args... args) { +#ifdef WITH_PROFILING + get_profiler().enter(n); + profiler_enter(args...); +#endif +} + +/// move up one level in the profiler +void profiler_leave(); +/// move up multiple profiler levels in one call +void profiler_leave(int nlevels); + +/// iterate and stop them +void stop_profilers(); + +/// print the collated profiler to std::cout +void profiler_output(double threshold); + } // namespace util } // namespace mc } // namespace nest diff --git a/src/threading/serial.hpp b/src/threading/serial.hpp index ebdede8426d2fa8a0ba0b7b327595b83af0970ee..eef783949049c77f3c06bfa059b02c06d96b3670 100644 --- a/src/threading/serial.hpp +++ b/src/threading/serial.hpp @@ -21,29 +21,26 @@ class enumerable_thread_specific { public : - T& local() { - return data[0]; - } - const T& local() const { - return data[0]; - } + enumerable_thread_specific() = default; - auto begin() -> decltype(data.begin()) - { - return data.begin(); - } - auto end() -> decltype(data.end()) - { - return data.end(); - } - auto cbegin() -> decltype(data.cbegin()) - { - return data.cbegin(); - } - auto cend() -> decltype(data.cend()) - { - return data.cend(); - } + enumerable_thread_specific(const T& init) : + data{init} + {} + + enumerable_thread_specific(T&& init) : + data{std::move(init)} + {} + + T& local() { return data[0]; } + const T& local() const { return data[0]; } + + auto size() -> decltype(data.size()) const { return data.size(); } + + auto begin() -> decltype(data.begin()) { return data.begin(); } + auto end() -> decltype(data.end()) { return data.end(); } + + auto cbegin() -> decltype(data.cbegin()) const { return data.cbegin(); } + auto cend() -> decltype(data.cend()) const { return data.cend(); } }; @@ -59,25 +56,24 @@ struct parallel_for { } }; -static inline -std::string description() { +inline std::string description() { return "serial"; } struct timer { using time_point = std::chrono::time_point<std::chrono::system_clock>; - static - inline time_point tic() - { + static inline time_point tic() { return std::chrono::system_clock::now(); } - static - inline double toc(time_point t) - { + static inline double toc(time_point t) { return std::chrono::duration<double>(tic() - t).count(); } + + static inline double difference(time_point b, time_point e) { + return std::chrono::duration<double>(e-b).count(); + } }; diff --git a/src/threading/tbb.hpp b/src/threading/tbb.hpp index 32097586169f794c90baafdbd948f8b60df025d6..ed82e26ec87617f7453f4548efe3eec3f858507f 100644 --- a/src/threading/tbb.hpp +++ b/src/threading/tbb.hpp @@ -24,28 +24,35 @@ struct parallel_for { } }; -static -std::string description() { +inline std::string description() { return "TBB"; } struct timer { using time_point = tbb::tick_count; - static - inline time_point tic() - { + static inline time_point tic() { return tbb::tick_count::now(); } - static - inline double toc(time_point t) - { + static inline double toc(time_point t) { return (tic() - t).seconds(); } + + static inline double difference(time_point b, time_point e) { + return (e-b).seconds(); + } }; } // threading } // mc } // nest +namespace tbb { + /// comparison operator for tbb::tick_count type + /// returns true iff time stamp l occurred before timestamp r + inline bool operator< (tbb::tick_count l, tbb::tick_count r) { + return (l-r).seconds() < 0.; + } +} + diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b859950e4d44d1a1026cca8d4e422b9be5450a83..230d385f27b065d21bb0e297d799ba2299d395c4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -53,6 +53,11 @@ add_executable(validate.exe ${VALIDATION_SOURCES} ${HEADERS}) target_link_libraries(test.exe LINK_PUBLIC cellalgo gtest) target_link_libraries(validate.exe LINK_PUBLIC cellalgo gtest) +if(WITH_TBB) + target_link_libraries(test.exe LINK_PUBLIC ${TBB_LIBRARIES}) + target_link_libraries(validate.exe LINK_PUBLIC ${TBB_LIBRARIES}) +endif() + set_target_properties(test.exe PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests" @@ -62,3 +67,4 @@ set_target_properties(validate.exe PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests" ) +