Skip to content
Snippets Groups Projects
Commit 54f47392 authored by Ben Cumming's avatar Ben Cumming Committed by w-klijn
Browse files

Metering support with time meter (#217)

  * An abstract `meter` class that defines interface for taking a reading, and returning the meter results as a json object.
  * A `time_meter` implementation of the `meter` that measures wall time.
  * To generate metering reports with global information, the global communication policy interfaces were extended to support `gather` and `barrier` operations. These are trivial for serial and dry run policies, and wrap the appropriate MPI calls for the MPI policy.
  * a `meter_manager` type that stores a list of meters was created
    * will also have memory and power meters soon.
  * a meter manager was added to the miniapp and now records startup, model initialization, time stepping and final file io times.
parent 5f9d4020
No related branches found
No related tags found
No related merge requests found
......@@ -16,6 +16,7 @@
#include <io/exporter_spike_file.hpp>
#include <model.hpp>
#include <profiling/profiler.hpp>
#include <profiling/meter_manager.hpp>
#include <threading/threading.hpp>
#include <util/debug.hpp>
#include <util/ioutil.hpp>
......@@ -50,6 +51,9 @@ int main(int argc, char** argv) {
nest::mc::communication::global_policy_guard global_guard(argc, argv);
try {
nest::mc::util::meter_manager meters;
meters.checkpoint("start");
std::cout << util::mask_stream(global_policy::id()==0);
// read parameters
io::cl_options options = io::read_options(argc, argv, global_policy::id()==0);
......@@ -72,6 +76,8 @@ int main(int argc, char** argv) {
banner();
meters.checkpoint("global setup");
// determine what to attach probes to
probe_distribution pdist;
pdist.proportion = options.probe_ratio;
......@@ -110,10 +116,8 @@ int main(int argc, char** argv) {
m.set_binning_policy(binning_policy, options.bin_dt);
// Inject some artificial spikes, 1 per 20 neurons.
std::vector<cell_gid_type> local_sources;
cell_gid_type first_spike_cell = 20*((cell_range.first+19)/20);
for (auto c=first_spike_cell; c<cell_range.second; c+=20) {
local_sources.push_back(c);
m.add_artificial_spike({c, 0});
}
......@@ -129,19 +133,7 @@ int main(int argc, char** argv) {
m.attach_sampler(probe.id, make_trace_sampler(traces.back().get(), sample_dt));
}
#ifdef WITH_PROFILING
// dummy run of the model for one step to ensure that profiling is consistent
m.run(options.dt, options.dt);
// reset and add the source spikes once again
m.reset();
for (auto source : local_sources) {
m.add_artificial_spike({source, 0});
}
#endif
// Initialize the spike exporting interface after the profiler dummy
// steps, to avoid having the initial seed spikes that are artificially
// injected at t=0 from being recorded and output twice.
// Initialize the spike exporting interface
std::unique_ptr<file_export_type> file_exporter;
if (options.spike_file_output) {
if (options.single_file_per_rank) {
......@@ -160,9 +152,13 @@ int main(int argc, char** argv) {
}
}
meters.checkpoint("model initialization");
// run model
m.run(options.tfinal, options.dt);
meters.checkpoint("time stepping");
// output profile and diagnostic feedback
auto const num_steps = options.tfinal / options.dt;
util::profiler_output(0.001, m.num_cells()*num_steps, options.profile_only_zero);
......@@ -172,6 +168,10 @@ int main(int argc, char** argv) {
for (const auto& trace: traces) {
write_trace_json(*trace.get(), options.trace_prefix);
}
meters.checkpoint("output");
util::save_to_file(meters, "meters.json");
}
catch (io::usage_error& e) {
// only print usage/startup errors on master
......
......@@ -3,7 +3,10 @@ set(BASE_SOURCES
cell.cpp
morphology.cpp
parameter_list.cpp
profiling/meter.cpp
profiling/meter_manager.cpp
profiling/profiler.cpp
profiling/time_meter.cpp
swcio.cpp
threading/affinity.cpp
util/debug.cpp
......
......@@ -74,6 +74,13 @@ struct dryrun_global_policy {
return size()*value;
}
// Dry-run stand-in for a gather: the result holds size() copies of the
// caller's own value. The second (root) argument is ignored.
template <typename T>
static std::vector<T> gather(T value, int) {
    std::vector<T> replicated(size(), value);
    return replicated;
}
// Barrier for the dry-run policy: intentionally a no-op.
static void barrier() {}
static void setup(int& argc, char**& argv) {}
static void teardown() {}
......
......@@ -51,6 +51,15 @@ struct mpi_global_policy {
return nest::mc::mpi::reduce(value, MPI_SUM);
}
// Gather one value per domain by forwarding the value and the root rank to
// mpi::gather.
// NOTE(review): presumably the returned vector is only fully populated on
// the root rank - confirm against the mpi::gather wrapper.
template <typename T>
static std::vector<T> gather(T value, int root) {
return mpi::gather(value, root);
}
// Global synchronisation point for the MPI policy; forwards to mpi::barrier.
static void barrier() {
mpi::barrier();
}
static void setup(int& argc, char**& argv) {
nest::mc::mpi::init(&argc, &argv);
}
......
......@@ -52,6 +52,13 @@ struct serial_global_policy {
return value;
}
// Serial gather: the result is a vector whose only entry is the caller's
// value. The second (root) argument is ignored.
template <typename T>
static std::vector<T> gather(T value, int) {
    return std::vector<T>(1, value);
}
// Barrier for the serial policy: intentionally a no-op.
static void barrier() {}
static void setup(int& argc, char**& argv) {}
static void teardown() {}
......
#include "meter.hpp"
namespace nest {
namespace mc {
namespace util {
// Convert a measurement into a json object with three fields:
//   "name":         the name of the measurement
//   "units":        the units of the recorded values
//   "measurements": an array with one entry per checkpoint, each entry being
//                   the array of values stored for that checkpoint
nlohmann::json to_json(const measurement& mnt) {
    // Start from an explicit empty array: a default constructed json value
    // is null, so a measurement without any checkpoints would otherwise be
    // written as null instead of [].
    nlohmann::json measurements = nlohmann::json::array();
    for (const auto& m: mnt.measurements) {
        measurements.push_back(m);
    }

    return {
        {"name", mnt.name},
        {"units", mnt.units},
        {"measurements", measurements}
    };
}
} // namespace util
} // namespace mc
} // namespace nest
#pragma once
#include <string>
#include <json/json.hpp>
namespace nest {
namespace mc {
namespace util {
// A measurement from a meter has the following:
// * name
// * e.g. walltime or allocated-memory
// * units
// * use SI units
// * e.g. s or MiB
// * measurements
// * a vector with one entry for each checkpoint
// * each entry is a std::vector<double> of measurements gathered across
// domains at one checkpoint.
//
// A named series of values produced by a meter, in the form that is
// serialized by to_json.
struct measurement {
// What was measured, e.g. walltime or allocated-memory.
std::string name;
// Units of the recorded values, e.g. s or MiB.
std::string units;
// One inner vector per checkpoint; each inner vector holds the values
// gathered across domains at that checkpoint.
std::vector<std::vector<double>> measurements;
};
// Converts a measurement to a json type for serialization to file.
// See src/profiling/meters.md for more information about the json formatting.
nlohmann::json to_json(const measurement& m);
// A meter can be used to take a measurement of resource consumption, for
// example wall time, memory or energy consumption.
// Each specialization must:
// 1) Record the resource consumption on calling meter::take_reading.
// * How and which information is recorded is implementation dependent.
// 2) Return a std::vector containing the measurements that are derived
// from the information recorded on calls to meter::take_reading.
// * The return value is a vector of measurements, because a meter
// may derive multiple measurements from the recorded checkpoint
// information.
// Abstract interface for a meter: an object that records readings of some
// resource at checkpoints and later reports them as a list of measurements.
// Concrete meters override name, take_reading and measurements.
class meter {
public:
meter() = default;
// Provide a human readable name for the meter
virtual std::string name() = 0;
// Take a reading/measurement of the resource.
// Called once per checkpoint; what is recorded is up to the implementation.
virtual void take_reading() = 0;
// Return a summary of the recordings.
// May perform expensive operations to process and analyse the readings.
// Full output is expected only on the root domain, i.e. when
// global_policy::id()==0
virtual std::vector<measurement> measurements() = 0;
// Virtual so that meters can be destroyed through a pointer to meter.
virtual ~meter() = default;
};
} // namespace util
} // namespace mc
} // namespace nest
#include "meter_manager.hpp"
namespace nest {
namespace mc {
namespace util {
// Construct a manager holding the default set of meters. For now this is a
// single time_meter; memory and energy meters are still to be added.
meter_manager::meter_manager() {
    // add time-measurement meter
    // The new meter is owned by a unique_ptr before it is handed to the
    // vector, so it cannot leak if growing the vector throws.
    meters.push_back(std::unique_ptr<meter>(new time_meter()));

    // add memory consumption meter
    // TODO

    // add energy consumption meter
    // TODO
}
// Record a checkpoint: store its name and take a reading on every meter.
//
// name: label for the checkpoint; it is appended to checkpoint_names, so
//       the names are kept in the order in which checkpoints were taken.
void meter_manager::checkpoint(std::string name) {
    // Must be evaluated before the name is appended below.
    const bool first_checkpoint = checkpoint_names.empty();
    checkpoint_names.push_back(std::move(name));

    // Enforce a global synchronization point the first time that the meters
    // are used, to ensure that times measured across all domains are
    // synchronised.
    // The test is on the number of checkpoints taken so far, not on the
    // number of meters: the constructor always registers a meter, so a test
    // for an empty meter list would never trigger the barrier.
    if (first_checkpoint) {
        communication::global_policy::barrier();
    }

    for (auto& m: meters) {
        m->take_reading();
    }
}
// Build the json report for all meters held by a meter_manager.
//
// The measurements of every meter are collected on every domain, but only
// the domain with global_policy::id()==0 returns the populated report; all
// other domains return a default json value.
nlohmann::json to_json(const meter_manager& manager) {
    using gcom = communication::global_policy;

    // Collect the measurements first, on every domain: this step is not
    // restricted to the root domain.
    nlohmann::json meter_out;
    for (const auto& m: manager.meters) {
        const auto results = m->measurements();
        for (const auto& result: results) {
            meter_out.push_back(to_json(result));
        }
    }

    // Only the "root" process returns meter information
    if (gcom::id()!=0) {
        return {};
    }

    // TODO mapping of domains to nodes, which will be required to
    //      calculate the total memory and energy consumption of a
    //      distributed simulation.
    return {
        {"checkpoints", manager.checkpoint_names},
        {"num_domains", gcom::size()},
        {"global_model", std::to_string(gcom::kind())},
        {"meters", meter_out},
    };
}
// Write the json report of a meter_manager to the file called name.
//
// The report is generated on every domain, but only the domain with
// global_policy::id()==0 opens and writes the file. The stream is set to
// throw on badbit/failbit before it is opened, so failure to open or write
// the file is reported as an exception.
void save_to_file(const meter_manager& manager, const std::string& name) {
    const auto report = to_json(manager);

    if (communication::global_policy::id()!=0) {
        return;
    }

    std::ofstream fid;
    fid.exceptions(std::ios_base::badbit | std::ios_base::failbit);
    fid.open(name);
    fid << std::setw(1) << report << "\n";
}
} // namespace util
} // namespace mc
} // namespace nest
#pragma once
#include <memory>
#include <vector>
#include <util/make_unique.hpp>
#include <communication/global_policy.hpp>
#include <json/json.hpp>
#include "meter.hpp"
#include "time_meter.hpp"
namespace nest {
namespace mc {
namespace util {
// Owns a collection of meters and the names of the checkpoints at which
// readings were taken on them.
struct meter_manager {
// The meters that take a reading at every checkpoint.
std::vector<std::unique_ptr<meter>> meters;
// Checkpoint names, in the order in which checkpoint() was called.
std::vector<std::string> checkpoint_names;
// Registers the default meters.
meter_manager();
// Stores name and takes a reading on every meter.
void checkpoint(std::string name);
};
nlohmann::json to_json(const meter_manager&);
void save_to_file(const meter_manager& manager, const std::string& name);
} // namespace util
} // namespace mc
} // namespace nest
A json record for a meter measurement is a json object.
Each Object corresponds to a derived measurement:
* `name`: a string describing the measurement
* `units`: a string with SI units for measurements
* `measurements`: a json Array of measurements, with one
entry per checkpoint (corresponding to a call to
meter::take_reading)
* each measurement is itself a numeric array, with one
recording for each domain in the global communicator
For example, the output of a meter for measuring wall time where 5 readings
were taken on 4 MPI ranks could be represented as follows:
```json
{
"name": "walltime",
"units": "s",
"measurements": [
[ 0, 0, 0, 0 ],
[ 0.001265837, 0.001344004, 0.001299362, 0.001195762 ],
[ 0.014114013, 0.015045662, 0.015071675, 0.014209514 ],
[ 1.491986631, 1.491121134, 1.490957219, 1.492064233 ],
[ 0.00565307, 0.004375347, 0.002228206, 0.002483978 ]
]
}
```
......@@ -15,14 +15,10 @@ namespace mc {
namespace util {
// Here we provide functionality that the profiler can use to control the CUDA
// profiler nvprof. The cudaStartProfiler and cudaStopProfiler API calls are
// provided to let a program control which parts of the program are to be
// profiled.
// Here are some wrappers that the NestMC profiler restrict nvprof to recording
// only the time intervals that the user requests when they start and stop the
// profiler.
// It is a simple wrapper around the API calls with a mutex to ensure correct
// behaviour when multiple threads attempt to start or stop the profiler.
// profiler nvprof. The start_nvprof and stop_nvprof calls are provided to let
// a program control which parts of the program are to be profiled. It is a
// simple wrapper around the API calls with a mutex to ensure correct behaviour
// when multiple threads attempt to start or stop the profiler.
#ifdef NMC_HAVE_GPU
namespace gpu {
bool is_running_nvprof = false;
......@@ -51,9 +47,9 @@ namespace gpu {
}
#endif
/////////////////////////////////////////////////////////
// profiler_node
/////////////////////////////////////////////////////////
//
// profiler_node implementation
//
void profiler_node::print(int indent) {
std::string s = std::string(indent, ' ') + name;
std::cout << s
......@@ -178,9 +174,9 @@ bool operator== (const profiler_node& lhs, const profiler_node& rhs) {
return lhs.name == rhs.name;
}
/////////////////////////////////////////////////////////
// region_type
/////////////////////////////////////////////////////////
//
// region_type implementation
//
region_type* region_type::subregion(const char* n) {
size_t hsh = impl::hash(n);
auto s = subregions_.find(hsh);
......@@ -234,9 +230,9 @@ profiler_node region_type::populate_performance_tree() const {
return tree;
}
/////////////////////////////////////////////////////////
// region_type
/////////////////////////////////////////////////////////
//
// profiler implementation
//
void profiler::enter(const char* name) {
if (!is_activated()) return;
current_region_ = current_region_->subregion(name);
......@@ -435,7 +431,7 @@ void profiler_output(double threshold, std::size_t num_local_work_items, bool pr
as_json["regions"] = p.as_json();
if (output_this_rank) {
auto fname = std::string("profile_" + std::to_string(comm_rank));
auto fname = std::string("profile_" + std::to_string(comm_rank) + ".json");
std::ofstream fid(fname);
fid << std::setw(1) << as_json;
}
......
#include <string>
#include <vector>
#ifdef NMC_HAVE_GPU
#include <cuda_runtime.h>
#endif
#include "time_meter.hpp"
#include <communication/global_policy.hpp>
namespace nest {
namespace mc {
namespace util {
// Human readable name of this meter.
std::string time_meter::name() {
    return std::string("time");
}
// Record one time stamp for the current checkpoint.
// The order of the steps matters: pending GPU work is waited for (when built
// with NMC_HAVE_GPU), then the time stamp is appended to readings_, and only
// then is the global barrier entered.
void time_meter::take_reading() {
// Wait for execution on this global domain to finish before recording the
// time stamp. For now this means waiting for all work to finish executing
// on the GPU (if GPU support is enabled)
#ifdef NMC_HAVE_GPU
cudaDeviceSynchronize();
#endif
// Record the time stamp
readings_.push_back(timer_type::tic());
// Enforce a global barrier after taking the time stamp
communication::global_policy::barrier();
}
// Convert the recorded time stamps into a single "walltime" measurement.
//
// The measurement has one entry per reading: the first entry is 0 and every
// later entry is the time elapsed since the previous reading. Each entry is
// gathered across all domains onto domain 0.
//
// Returns a vector containing the one measurement. If no readings have been
// taken the measurement has no entries.
// Throws std::out_of_range if the number of readings differs between domains.
std::vector<measurement> time_meter::measurements() {
    using gcom = communication::global_policy;

    // Calculate the elapsed time on the local domain for each interval,
    // and store them in the times vector.
    std::vector<double> times;
    times.reserve(readings_.size());
    // The leading 0 belongs to the first reading, so it is only added when
    // a reading exists. This keeps times.size()==readings_.size(), which
    // lets the check below distinguish a domain with no readings from a
    // domain with exactly one.
    if (!readings_.empty()) {
        times.push_back(0);
    }
    for (auto i=1u; i<readings_.size(); ++i) {
        times.push_back(timer_type::difference(readings_[i-1], readings_[i]));
    }

    // Assert that the same number of readings were taken on every domain.
    const auto num_readings = times.size();
    if (gcom::min(num_readings)!=gcom::max(num_readings)) {
        throw std::out_of_range(
            "the number of checkpoints in the \"time\" meter do not match across domains");
    }

    // Gather the timers from across all of the domains onto the root domain.
    // Note: results are only valid on the root domain on completion.
    measurement results;
    results.name = "walltime";
    results.units = "s";
    results.measurements.reserve(times.size());
    for (auto t: times) {
        results.measurements.push_back(gcom::gather(t, 0));
    }

    return {results};
}
} // namespace util
} // namespace mc
} // namespace nest
#pragma once
#include <string>
#include <vector>
#include <json/json.hpp>
#include "meter.hpp"
#include "profiler.hpp"
namespace nest {
namespace mc {
namespace util {
class time_meter : public meter {
std::vector<timer_type::time_point> readings_;
public:
std::string name() override;
void take_reading() override;
virtual std::vector<measurement> measurements() override;
};
} // namespace util
} // namespace mc
} // namespace nest
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment