Skip to content
Snippets Groups Projects
Commit 99a0b1c8 authored by Ben Cumming's avatar Ben Cumming Committed by Sam Yates
Browse files

Add power meter and refactor meter interfaces.

Fixes #190.

The final piece in the metering features.

* Add a `power_meter` which currently records the energy used on each node of Cray XC{30,40,50} systems, all of which have a built-in `pm_counters` interface to power measurement.
* Add information about which node each MPI rank runs on to the metering output in `meters.json`, which is needed to analyse energy recordings, which are per node, not per MPI rank.
* Refactor collation of measurements: now the responsibility of the meter manager.
* Add support for `gather` with `std::string` to the global communication policy, which required a back end MPI implementation and corresponding unit test.
* Add `src/util/config.hpp`, which populates the `nest::mc::config` namespace with `constexpr bool` flags describing system or environment capabilities.
parent a0640a11
No related branches found
No related tags found
No related merge requests found
Showing
with 415 additions and 224 deletions
......@@ -122,6 +122,7 @@ set_property(CACHE NMC_SYSTEM_TYPE PROPERTY STRINGS Generic Cray BGQ )
# Cray specific flags
# The variable is named directly (not as ${NMC_SYSTEM_TYPE}) so that if() does
# the single dereference itself: an expanded value would be looked up a second
# time as a variable name, and an empty value would be a syntax error.
if(NMC_SYSTEM_TYPE MATCHES "Cray")
    # Link dynamically on Cray systems.
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -dynamic")
    # Tells the sources that Cray-only facilities are available.
    add_definitions(-DNMC_HAVE_CRAY)
endif()
#----------------------------------------------------------
......
......@@ -51,7 +51,7 @@ int main(int argc, char** argv) {
try {
nest::mc::util::meter_manager meters;
meters.checkpoint("start");
meters.start();
std::cout << util::mask_stream(global_policy::id()==0);
// read parameters
......@@ -168,8 +168,6 @@ int main(int argc, char** argv) {
write_trace_json(*trace.get(), options.trace_prefix);
}
meters.checkpoint("output");
util::save_to_file(meters, "meters.json");
}
catch (io::usage_error& e) {
......
......@@ -5,15 +5,16 @@ set(BASE_SOURCES
morphology.cpp
parameter_list.cpp
profiling/memory_meter.cpp
profiling/meter.cpp
profiling/meter_manager.cpp
profiling/power_meter.cpp
profiling/profiler.cpp
profiling/time_meter.cpp
swcio.cpp
threading/affinity.cpp
util/debug.cpp
util/hostname.cpp
util/memory.cpp
util/path.cpp
util/power.cpp
util/unwind.cpp
backends/multicore/fvm.cpp
)
......
......@@ -65,10 +65,6 @@ namespace mpi {
// T must be trivially copyable
template<typename T>
std::vector<T> gather(T value, int root) {
static_assert(
true,//std::is_trivially_copyable<T>::value,
"gather can only be performed on trivally copyable types");
using traits = mpi_traits<T>;
auto buffer_size = (rank()==root) ? size() : 0;
std::vector<T> buffer(buffer_size);
......@@ -87,9 +83,6 @@ namespace mpi {
// T must be trivially copyable
template <typename T>
std::vector<T> gather_all(T value) {
static_assert(
true,//std::is_trivially_copyable<T>::value,
"gather_all can only be performed on trivally copyable types");
using traits = mpi_traits<T>;
std::vector<T> buffer(size());
......@@ -103,11 +96,33 @@ namespace mpi {
return buffer;
}
// Specialize gather for std::string.
inline std::vector<std::string> gather(std::string str, int root) {
using traits = mpi_traits<char>;
auto counts = gather_all(int(str.size()));
auto displs = algorithms::make_index(counts);
std::vector<char> buffer(displs.back());
PE("MPI", "Gather");
MPI_Gatherv(str.data(), counts[rank()], traits::mpi_type(), // send
buffer.data(), counts.data(), displs.data(), traits::mpi_type(), // receive
root, MPI_COMM_WORLD);
PL(2);
// Unpack the raw string data into a vector of strings.
std::vector<std::string> result;
result.reserve(size());
for (auto i=0; i<size(); ++i) {
result.push_back(std::string(buffer.data()+displs[i], counts[i]));
}
return result;
}
template <typename T>
std::vector<T> gather_all(const std::vector<T>& values) {
static_assert(
true,//std::is_trivially_copyable<T>::value,
"gather_all can only be performed on trivally copyable types");
using traits = mpi_traits<T>;
auto counts = gather_all(int(values.size()));
......
......@@ -54,7 +54,7 @@ struct serial_global_policy {
// Serial stand-in for a gather: with a single domain the result is a vector
// holding only the caller's own value. The root argument is ignored.
template <typename T>
static std::vector<T> gather(T value, int) {
    // Move the value straight into the vector. A braced return would build an
    // initializer_list first, whose elements can only be copied out of it.
    std::vector<T> gathered;
    gathered.push_back(std::move(value));
    return gathered;
}
static void barrier() {}
......
#include <string>
#include <vector>
#include <util/config.hpp>
#include "memory_meter.hpp"
#include <communication/global_policy.hpp>
namespace nest {
namespace mc {
namespace util {
namespace {
measurement collate(const std::vector<memory_size_type>& readings, std::string name) {
using gcom = communication::global_policy;
//
// memory_meter
//
// Calculate the local change in allocated memory for each interval.
std::vector<memory_size_type> allocated;
allocated.push_back(0);
for (auto i=1u; i<readings.size(); ++i) {
allocated.push_back(readings[i] - readings[i-1]);
}
class memory_meter: public meter {
protected:
std::vector<memory_size_type> readings_;
// Assert that the same number of readings were taken on every domain.
const auto num_readings = allocated.size();
if (gcom::min(num_readings)!=gcom::max(num_readings)) {
throw std::out_of_range(
"the number of checkpoints in the \"memory\" meter do not match across domains");
}
public:
std::string name() override {
return "memory-allocated";
}
std::string units() override {
return "B";
}
void take_reading() override {
readings_.push_back(allocated_memory());
}
// Gather allocations from across all of the domains onto the root domain.
// Note: results are only valid on the root domain on completion.
measurement results;
results.name = std::move(name);
results.units = "kB";
for (auto m: allocated) {
results.measurements.push_back(gcom::gather(std::round(m/1e3), 0));
std::vector<double> measurements() override {
std::vector<double> diffs;
for (auto i=1ul; i<readings_.size(); ++i) {
diffs.push_back(readings_[i]-readings_[i-1]);
}
return results;
return diffs;
}
} // anonymous namespace
};
std::string memory_meter::name() {
return "memory";
// Factory for the host memory meter.
// Returns a null meter_ptr when config::has_memory_measurement is false, so
// callers are expected to test the result before using it.
meter_ptr make_memory_meter() {
    if (config::has_memory_measurement) {
        return meter_ptr(new memory_meter());
    }
    return nullptr;
}
void memory_meter::take_reading() {
readings_.push_back(allocated_memory());
#ifdef NMC_HAVE_GPU
readings_gpu_.push_back(gpu_allocated_memory());
#endif
}
//
// gpu_memory_meter
//
// The gpu memory meter specializes the reading and name methods of the basic
// memory_meter.
class gpu_memory_meter: public memory_meter {
public:
    // Append the value reported by gpu_allocated_memory() to the readings
    // inherited from memory_meter, in place of the host memory reading.
    void take_reading() override {
        const auto allocated = gpu_allocated_memory();
        readings_.push_back(allocated);
    }

    // Name under which the GPU measurements are reported.
    std::string name() override {
        return std::string("gpu-memory-allocated");
    }
};
std::vector<measurement> memory_meter::measurements() {
std::vector<measurement> results;
results.push_back(collate(readings_, "memory-allocated"));
if (readings_gpu_.size()) {
results.push_back(collate(readings_gpu_, "memory-allocated-gpu"));
// Factory for the GPU memory meter.
// Returns a null meter_ptr when config::has_cuda is false, so callers are
// expected to test the result before using it.
meter_ptr make_gpu_memory_meter() {
    if (not config::has_cuda) {
        return nullptr;
    }
    // The only value returned on the supported path is the new meter; the
    // stray `return results;` that preceded it belonged to another function.
    return meter_ptr(new gpu_memory_meter());
}
} // namespace util
......
......@@ -11,17 +11,8 @@ namespace nest {
namespace mc {
namespace util {
// Meter that records allocated memory at each reading.
class memory_meter : public meter {
// One entry per call to take_reading().
std::vector<memory_size_type> readings_;
// only used if running on the GPU
std::vector<memory_size_type> readings_gpu_;
public:
// Name of the meter.
std::string name() override;
// Record the memory allocated at the time of the call.
void take_reading() override;
// Measurements derived from the recorded readings.
virtual std::vector<measurement> measurements() override;
};
meter_ptr make_memory_meter();
meter_ptr make_gpu_memory_meter();
} // namespace util
} // namespace mc
......
#include "meter.hpp"
namespace nest {
namespace mc {
namespace util {
// Serialize one measurement as a json object with the fields
// "name", "units" and "measurements".
nlohmann::json to_json(const measurement& mnt) {
    // One json array of per-domain values is appended for each checkpoint.
    nlohmann::json per_checkpoint;
    const auto& rows = mnt.measurements;
    for (auto row = rows.begin(); row != rows.end(); ++row) {
        per_checkpoint.push_back(*row);
    }

    nlohmann::json record = {
        {"name", mnt.name},
        {"units", mnt.units},
        {"measurements", per_checkpoint}
    };
    return record;
}
} // namespace util
} // namespace mc
} // namespace nest
#pragma once
#include <memory>
#include <string>
#include <json/json.hpp>
#include <vector>
namespace nest {
namespace mc {
namespace util {
// A measurement from a meter has the following:
// * name
//     * e.g. walltime or allocated-memory
// * units
//     * use SI units
//     * e.g. s or MiB
// * measurements
//     * a vector with one entry for each checkpoint
//     * each entry is a std::vector<double> of measurements gathered across
//       domains at one checkpoint.
//
struct measurement {
// Label of the quantity that was measured.
std::string name;
// Units of the values stored in measurements.
std::string units;
// Indexed as measurements[checkpoint][domain].
std::vector<std::vector<double>> measurements;
};
// Converts a measurement to a json type for serialization to file.
// See src/profiling/meters.md for more information about the json formatting.
nlohmann::json to_json(const measurement& m);
// A meter can be used to take a measurement of resource consumption, for
// example wall time, memory or energy consumption.
// Each specialization must:
// 1) Record the resource consumption on calling meter::take_reading.
// * How and which information is recorded is implementation dependent.
// 2) Return a std::vector containing the measurements that are derived
// from the information recorded on calls to meter::take_reading.
// * The return value is a vector of measurements, because a meter
// may derive multiple measurements from the recorded checkpoint
// information.
// 2) Provide the name of the resource being measured via name()
// e.g. : energy
// 3) Provide the units of the resource being measured via units()
// e.g. : J
// 4) Return the resources consumed between each pair of readings as a
// std::vector<double> from measurements(). So, for n readings, there will
// be n-1 differences.
class meter {
public:
meter() = default;
......@@ -48,15 +30,16 @@ public:
// Take a reading/measurement of the resource
virtual void take_reading() = 0;
// Return a summary of the recordings.
// May perform expensive operations to process and analyse the readings.
// Full output is expected only on the root domain, i.e. when
// global_policy::id()==0
virtual std::vector<measurement> measurements() = 0;
// The units of the values returned in from the measurements method.
virtual std::string units() = 0;
virtual std::vector<double> measurements() = 0;
virtual ~meter() = default;
};
using meter_ptr = std::unique_ptr<meter>;
} // namespace util
} // namespace mc
} // namespace nest
#include <communication/global_policy.hpp>
#include <util/hostname.hpp>
#include <json/json.hpp>
#include "meter_manager.hpp"
#include "memory_meter.hpp"
#include "power_meter.hpp"
namespace nest {
namespace mc {
namespace util {
// Build a measurement from the readings taken on this domain.
//   n        : name of the measurement
//   u        : units of the readings
//   readings : one value per checkpoint, local to this domain
// Every reading is gathered onto domain 0, giving one vector of per-domain
// values for each checkpoint.
// Throws std::out_of_range if the domains disagree on the number of readings.
measurement::measurement(
    std::string n, std::string u, const std::vector<double>& readings):
    name(std::move(n)), units(std::move(u))
{
    using policy = communication::global_policy;

    // All domains must have taken the same number of readings, otherwise the
    // gathers below would not line up.
    const auto local_count = readings.size();
    const auto fewest = policy::min(local_count);
    const auto most = policy::max(local_count);
    if (fewest!=most) {
        throw std::out_of_range(
            "the number of checkpoints in the \""+name+"\" meter do not match across domains");
    }

    // One gather per checkpoint, collected on the root domain.
    for (auto it = readings.begin(); it != readings.end(); ++it) {
        measurements.push_back(policy::gather(*it, 0));
    }
}
// Register the meters that are available on this system. Each make_*_meter
// factory returns a null pointer when its resource cannot be measured, in
// which case that meter is simply not registered.
meter_manager::meter_manager() {
// add time-measurement meter
// NOTE(review): wall time is also recorded by the manager itself in times_;
// confirm that a separate time_meter is still wanted here.
meters_.emplace_back(new time_meter());
// Host memory meter, if supported.
if (auto m = make_memory_meter()) {
meters_.push_back(std::move(m));
}
// GPU memory meter, if supported.
if (auto m = make_gpu_memory_meter()) {
meters_.push_back(std::move(m));
}
// Energy meter, if supported.
if (auto m = make_power_meter()) {
meters_.push_back(std::move(m));
}
};
void meter_manager::start() {
EXPECTS(!started_);
started_ = true;
// add memory consumption meter
if (has_memory_metering) {
meters_.emplace_back(new memory_meter());
// take readings for the start point
for (auto& m: meters_) {
m->take_reading();
}
// add energy consumption meter
// TODO
// Enforce a global barrier after taking the time stamp
communication::global_policy::barrier();
start_time_ = timer_type::tic();
};
// Close the current measurement interval and label it with `name`.
// Records the wall time elapsed since start_time_ was last set, takes a
// reading on every registered meter, then synchronizes all domains and
// restarts the interval timer.
void meter_manager::checkpoint(std::string name) {
// Enforce a global synchronization point the first time that the meters
// are used, to ensure that times measured across all domains are
// synchronised.
// NOTE(review): this barrier runs before the end time of the first interval
// is taken, so waiting time is included in that interval -- confirm intended.
if (checkpoint_names_.size()==0) {
communication::global_policy::barrier();
}
// start() must have been called, so that start_time_ is valid.
EXPECTS(started_);
// Record the time taken on this domain since the last checkpoint
auto end_time = timer_type::tic();
times_.push_back(timer_type::difference(start_time_, end_time));
// Update meters
checkpoint_names_.push_back(std::move(name));
for (auto& m: meters_) {
m->take_reading();
}
// Synchronize all domains before setting start time for the next interval
communication::global_policy::barrier();
start_time_ = timer_type::tic();
}
const std::vector<std::unique_ptr<meter>>& meter_manager::meters() const {
......@@ -39,15 +84,39 @@ const std::vector<std::string>& meter_manager::checkpoint_names() const {
return checkpoint_names_;
}
// Wall times recorded on this domain, one entry per call to checkpoint().
// The reference is to the manager's own storage.
const std::vector<double>& meter_manager::times() const {
return times_;
}
// Serialize one measurement as a json object with the fields
// "name", "units" and "measurements".
nlohmann::json to_json(const measurement& mnt) {
    // One json array of per-domain values is appended for each checkpoint.
    nlohmann::json per_checkpoint;
    const auto& rows = mnt.measurements;
    for (auto row = rows.begin(); row != rows.end(); ++row) {
        per_checkpoint.push_back(*row);
    }

    nlohmann::json record = {
        {"name", mnt.name},
        {"units", mnt.units},
        {"measurements", per_checkpoint}
    };
    return record;
}
nlohmann::json to_json(const meter_manager& manager) {
using gcom = communication::global_policy;
// Gather the meter outputs into a json Array
nlohmann::json meter_out;
for (auto& m: manager.meters()) {
for (auto& measure: m->measurements()) {
meter_out.push_back(to_json(measure));
}
meter_out.push_back(
to_json(measurement(m->name(), m->units(), m->measurements()))
);
}
// Add the times to the meter outputs
meter_out.push_back(to_json(measurement("time", "s", manager.times())));
// Gather a vector with the names of the node that each rank is running on.
auto host = hostname();
auto hosts = gcom::gather(host? *host: "unknown", 0);
// Only the "root" process returns meter information
if (gcom::id()==0) {
......@@ -56,9 +125,7 @@ nlohmann::json to_json(const meter_manager& manager) {
{"num_domains", gcom::size()},
{"global_model", std::to_string(gcom::kind())},
{"meters", meter_out},
// TODO mapping of domains to nodes, which will be required to
// calculate the total memory and energy consumption of a
// distributed simulation.
{"hosts", hosts},
};
}
......
......@@ -3,28 +3,54 @@
#include <memory>
#include <vector>
#include <util/make_unique.hpp>
#include <communication/global_policy.hpp>
#include <json/json.hpp>
#include "meter.hpp"
#include "memory_meter.hpp"
#include "time_meter.hpp"
#include "profiler.hpp"
namespace nest {
namespace mc {
namespace util {
// A measurement has the following:
// * name
//     * e.g. walltime or allocated-memory
// * units
//     * use SI units
//     * e.g. s or MiB
// * measurements
//     * a vector with one entry for each checkpoint
//     * each entry is a std::vector<double> of measurements gathered across
//       domains at one checkpoint.
struct measurement {
// Label of the quantity that was measured.
std::string name;
// Units of the values stored in measurements.
std::string units;
// Indexed as measurements[checkpoint][domain].
std::vector<std::vector<double>> measurements;
// Construct from (name, units, readings taken on the local domain).
// The readings are gathered across domains onto the root domain; throws
// std::out_of_range if the number of readings differs between domains.
measurement(std::string, std::string, const std::vector<double>&);
};
// Converts a measurement to a json type for serialization to file.
// See src/profiling/meters.md for more information about the json formatting.
nlohmann::json to_json(const measurement& m);
// Owns a set of meters and drives them together: start() opens the first
// measurement interval and each checkpoint() closes the current interval and
// opens the next one.
class meter_manager {
private:
// Set by start(); checkpoint() asserts that it is true.
bool started_ = false;
// Time stamp marking the beginning of the current interval.
timer_type::time_point start_time_;
// Wall time of each completed interval on this domain, one per checkpoint.
std::vector<double> times_;
// The meters that take a reading at start() and at every checkpoint.
std::vector<std::unique_ptr<meter>> meters_;
// Names passed to checkpoint(), in call order.
std::vector<std::string> checkpoint_names_;
public:
// Registers the meters that are available on this system.
meter_manager();
// Take the initial readings and start timing the first interval.
void start();
// Close the current interval, labelling it with name.
void checkpoint(std::string name);
// Read-only access to the state recorded so far.
const std::vector<std::unique_ptr<meter>>& meters() const;
const std::vector<std::string>& checkpoint_names() const;
const std::vector<double>& times() const;
};
nlohmann::json to_json(const meter_manager&);
......
......@@ -3,13 +3,14 @@ A json record for a meter measurement is a json object.
Each Object corresponds to a derived measurement:
* `name`: a string describing the measurement
* `units`: a string with SI units for measurements
* `measurements`: a json Array of measurements, with one
entry per checkpoint (corresponding to a call to
meter::take_reading)
* each measurement is itself a numeric array, with one
recording for each domain in the global communicator
* `measurements`: a json Array of measurements, with one entry for
each checkpoint. The first entry is the measure of resources consumed
between the call to `meter_manager::start()` and the first checkpoint, the
second entry is the measure between the first and second checkpoints, and so on.
* each measurement is itself a numeric array, with one recording for each
domain in the global communicator
For example, the output of a meter for measuring wall time where 5 readings
For example, the output of a meter for measuring wall time where 4 checkpoints
were taken on 4 MPI ranks could be represented as follows:
```json
......@@ -17,7 +18,6 @@ were taken on 4 MPI ranks could be represented as follows:
"name": "walltime",
"units": "s",
"measurements": [
[ 0, 0, 0, 0, ],
[ 0.001265837, 0.001344004, 0.001299362, 0.001195762, ],
[ 0.014114013, 0.015045662, 0.015071675, 0.014209514, ],
[ 1.491986631, 1.491121134, 1.490957219, 1.492064233, ],
......
#include <string>
#include <vector>
#include <util/config.hpp>
#include "power_meter.hpp"
namespace nest {
namespace mc {
namespace util {
class power_meter: public meter {
std::vector<energy_size_type> readings_;
public:
std::string name() override {
return "energy";
}
std::string units() override {
return "J";
}
std::vector<double> measurements() override {
std::vector<double> diffs;
for (auto i=1ul; i<readings_.size(); ++i) {
diffs.push_back(readings_[i]-readings_[i-1]);
}
return diffs;
}
void take_reading() override {
readings_.push_back(energy());
}
};
// Factory for the energy meter.
// Returns a null meter_ptr when config::has_power_measurement is false, so
// callers are expected to test the result before using it.
meter_ptr make_power_meter() {
    if (config::has_power_measurement) {
        return meter_ptr(new power_meter());
    }
    return nullptr;
}
} // namespace util
} // namespace mc
} // namespace nest
......@@ -3,23 +3,16 @@
#include <string>
#include <vector>
#include <util/power.hpp>
#include "meter.hpp"
#include "profiler.hpp"
namespace nest {
namespace mc {
namespace util {
// Meter that records a time stamp at each reading.
class time_meter : public meter {
// One time stamp per call to take_reading().
std::vector<timer_type::time_point> readings_;
public:
// Name of the meter.
std::string name() override;
// Record a time stamp for the current point in execution.
void take_reading() override;
// Measurements derived from the recorded time stamps.
virtual std::vector<measurement> measurements() override;
};
meter_ptr make_power_meter();
} // namespace util
} // namespace mc
} // namespace nest
#include <string>
#include <vector>
#ifdef NMC_HAVE_GPU
#include <cuda_runtime.h>
#endif
#include "time_meter.hpp"
#include <communication/global_policy.hpp>
namespace nest {
namespace mc {
namespace util {
// Name of this meter. Note that the measurement produced by measurements()
// is labelled "walltime", not "time".
std::string time_meter::name() {
return "time";
}
// Record a time stamp on this domain, then synchronize with all other
// domains. The order matters: the stamp is taken before the barrier, so it
// does not include time spent waiting for other domains.
void time_meter::take_reading() {
// Wait for execution on this global domain to finish before recording the
// time stamp. For now this means waiting for all work to finish executing
// on the GPU (if GPU support is enabled)
#ifdef NMC_HAVE_GPU
cudaDeviceSynchronize();
#endif
// Record the time stamp
readings_.push_back(timer_type::tic());
// Enforce a global barrier after taking the time stamp
communication::global_policy::barrier();
}
std::vector<measurement> time_meter::measurements() {
using gcom = communication::global_policy;
// Calculate the elapsed time on the local domain for each interval,
// and store them in the times vector.
std::vector<double> times;
times.push_back(0);
for (auto i=1u; i<readings_.size(); ++i) {
double t = timer_type::difference(readings_[i-1], readings_[i]);
times.push_back(t);
}
// Assert that the same number of readings were taken on every domain.
const auto num_readings = times.size();
if (gcom::min(num_readings)!=gcom::max(num_readings)) {
throw std::out_of_range(
"the number of checkpoints in the \"time\" meter do not match across domains");
}
// Gather the timers from accross all of the domains onto the root domain.
// Note: results are only valid on the root domain on completion.
measurement results;
results.name = "walltime";
results.units = "s";
for (auto t: times) {
results.measurements.push_back(gcom::gather(t, 0));
}
return {results};
}
} // namespace util
} // namespace mc
} // namespace nest
#pragma once
namespace nest {
namespace mc {
namespace config {
// Compile-time flags describing what the build platform supports. Each flag
// is selected by a preprocessor symbol, named with the flag below.
//
// has_memory_measurement
// Support for measuring total allocated memory.
// Set when __linux__ is defined.
// * true: calls to util::allocated_memory() will return valid results
// * false: calls to util::allocated_memory() will return -1
//
// has_power_measurement
// Support for measuring energy consumption.
// Currently only on Cray XC30/40/50 systems.
// Set when NMC_HAVE_CRAY is defined.
// * true: calls to util::energy() will return valid results
// * false: calls to util::energy() will return -1
//
// has_cuda
// Has been compiled with CUDA back end support
// Set when NMC_HAVE_CUDA is defined.
// NOTE(review): other GPU-specific code is guarded by NMC_HAVE_GPU; confirm
// that NMC_HAVE_CUDA is the symbol the build actually defines.
#ifdef __linux__
constexpr bool has_memory_measurement = true;
#else
constexpr bool has_memory_measurement = false;
#endif
#ifdef NMC_HAVE_CRAY
constexpr bool has_power_measurement = true;
#else
constexpr bool has_power_measurement = false;
#endif
#ifdef NMC_HAVE_CUDA
constexpr bool has_cuda = true;
#else
constexpr bool has_cuda = false;
#endif
} // namespace config
} // namespace mc
} // namespace nest
#include <string>
#include <util/optional.hpp>
#include "hostname.hpp"
#ifdef __linux__
extern "C" {
#include <unistd.h>
}
#endif
namespace nest {
namespace mc {
namespace util {
#ifdef __linux__
// Return the name of the host that this process is running on, or
// util::nothing if gethostname reports an error.
util::optional<std::string> hostname() {
    // Hostnames can be up to 256 characters in length, however on many systems
    // it is limited to 64.
    char name[256];
    if (gethostname(name, sizeof(name)) != 0) {
        return util::nothing;
    }
    // POSIX leaves it unspecified whether a name that had to be truncated is
    // null terminated, so terminate explicitly before treating the buffer as
    // a C string.
    name[sizeof(name)-1] = '\0';
    return std::string(name);
}
#else
// No implementation for this platform: the host name is never available.
util::optional<std::string> hostname() {
    return util::nothing;
}
#endif
} // namespace util
} // namespace mc
} // namespace nest
#pragma once
#include <string>
#include <util/optional.hpp>
namespace nest {
namespace mc {
namespace util {
// Get the name of the host on which this process is running.
util::optional<std::string> hostname();
} // namespace util
} // namespace mc
} // namespace nest
......@@ -6,12 +6,6 @@ namespace nest {
namespace mc {
namespace util {
#ifdef __linux__
constexpr bool has_memory_metering = true;
#else
constexpr bool has_memory_metering = false;
#endif
// Use a signed type to store memory sizes because it can be used to store
// the difference between two readings, which may be negative.
// A 64 bit type is large enough to store any amount of memory that will
......
#include <fstream>
#include "power.hpp"
namespace nest {
namespace mc {
namespace util {
#ifdef NMC_HAVE_CRAY
// Return the value read from /sys/cray/pm_counters/energy, or -1 if the
// counter could not be read.
energy_size_type energy() {
    std::ifstream fid("/sys/cray/pm_counters/energy");

    energy_size_type result = -1;
    // The extraction fails both when the file could not be opened and when
    // its contents do not parse as a number. A failed parse writes 0 to
    // result, so return the sentinel explicitly rather than relying on the
    // initial value.
    if (!(fid >> result)) {
        return -1;
    }
    return result;
}
#else
// Energy counters are not available on this platform.
energy_size_type energy() {
    return -1;
}
#endif
} // namespace util
} // namespace mc
} // namespace nest
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment