diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt index 37917fb7b6638641b0f2c31c23e177744f0e4c1e..eeb0b9624a99897d4eafb29ed0204dc88b6323c2 100644 --- a/arbor/CMakeLists.txt +++ b/arbor/CMakeLists.txt @@ -10,6 +10,7 @@ set(arbor_sources builtin_mechanisms.cpp cell_group_factory.cpp common_types_io.cpp + execution_context.cpp gpu_context.cpp local_alloc.cpp event_binner.cpp diff --git a/arbor/backends/gpu/threshold_watcher.hpp b/arbor/backends/gpu/threshold_watcher.hpp index 68942b71652b50d4c191bb2bb8370ce25e74ca75..e0668a9fa1c86953ddef971e513dee4244d39ccb 100644 --- a/arbor/backends/gpu/threshold_watcher.hpp +++ b/arbor/backends/gpu/threshold_watcher.hpp @@ -4,6 +4,7 @@ #include <arbor/common_types.hpp> #include <arbor/fvm_types.hpp> +#include "execution_context.hpp" #include "memory/memory.hpp" #include "util/span.hpp" diff --git a/arbor/backends/multicore/fvm.hpp b/arbor/backends/multicore/fvm.hpp index 99dcd2b675de419f9d64d155578339756029d9fa..1cbae3eff1b2ed167e446b00ebfc6c7af7afc8ba 100644 --- a/arbor/backends/multicore/fvm.hpp +++ b/arbor/backends/multicore/fvm.hpp @@ -2,7 +2,6 @@ #include <string> #include <vector> -#include <arbor/execution_context.hpp> #include "backends/event.hpp" #include "backends/multicore/matrix_state.hpp" @@ -10,6 +9,7 @@ #include "backends/multicore/multicore_common.hpp" #include "backends/multicore/shared_state.hpp" #include "backends/multicore/threshold_watcher.hpp" +#include "execution_context.hpp" #include "util/padded_alloc.hpp" #include "util/range.hpp" #include "util/rangeutil.hpp" diff --git a/arbor/backends/multicore/threshold_watcher.hpp b/arbor/backends/multicore/threshold_watcher.hpp index 13e5525712be6114e9f471a774df0a5023a8037a..68b1d61d16dfde5af36a4265b35bd4b8713edabc 100644 --- a/arbor/backends/multicore/threshold_watcher.hpp +++ b/arbor/backends/multicore/threshold_watcher.hpp @@ -1,11 +1,11 @@ #pragma once #include <arbor/assert.hpp> -#include <arbor/execution_context.hpp> #include <arbor/fvm_types.hpp> #include <arbor/math.hpp> #include "backends/threshold_crossing.hpp" +#include "execution_context.hpp" #include "multicore_common.hpp" namespace arb { diff --git a/arbor/cell_group_factory.cpp b/arbor/cell_group_factory.cpp index 0fc0a08b8b7f1d8bfc81f6f1bb2f37d425884db7..e3dc6a749d9a26dbfaf8db855e81e0ccdbb81c9d 100644 --- a/arbor/cell_group_factory.cpp +++ b/arbor/cell_group_factory.cpp @@ -6,6 +6,7 @@ #include "benchmark_cell_group.hpp" #include "cell_group.hpp" #include "cell_group_factory.hpp" +#include "execution_context.hpp" #include "fvm_lowered_cell.hpp" #include "lif_cell_group.hpp" #include "mc_cell_group.hpp" @@ -18,7 +19,9 @@ cell_group_ptr make_cell_group(Args&&... 
args) { return cell_group_ptr(new Impl(std::forward<Args>(args)...)); } -cell_group_factory cell_kind_implementation(cell_kind ck, backend_kind bk, const execution_context& ctx) { +cell_group_factory cell_kind_implementation( + cell_kind ck, backend_kind bk, const execution_context& ctx) +{ using gid_vector = std::vector<cell_gid_type>; switch (ck) { diff --git a/arbor/cell_group_factory.hpp b/arbor/cell_group_factory.hpp index 0cd72de54aaa46b91b1fd394eb2652d1b9056eec..903526532457bf0dbcf256cf1c18c003f1b754ed 100644 --- a/arbor/cell_group_factory.hpp +++ b/arbor/cell_group_factory.hpp @@ -11,17 +11,21 @@ #include <arbor/common_types.hpp> #include <arbor/recipe.hpp> -#include <arbor/execution_context.hpp> #include "cell_group.hpp" +#include "execution_context.hpp" namespace arb { -using cell_group_factory = std::function<cell_group_ptr (const std::vector<cell_gid_type>&, const recipe&)>; +using cell_group_factory = std::function< + cell_group_ptr(const std::vector<cell_gid_type>&, const recipe&)>; -cell_group_factory cell_kind_implementation(cell_kind, backend_kind, const execution_context&); +cell_group_factory cell_kind_implementation( + cell_kind, backend_kind, const execution_context&); -inline bool cell_kind_supported(cell_kind c, backend_kind b, const execution_context& ctx) { +inline bool cell_kind_supported( + cell_kind c, backend_kind b, const execution_context& ctx) +{ return static_cast<bool>(cell_kind_implementation(c, b, ctx)); } diff --git a/arbor/communication/communicator.hpp b/arbor/communication/communicator.hpp index 4a0777bf73b8712401fa5852a520b1184bb9eb35..996e76da02f946b54abe5906a5f1e4e4247b53b0 100644 --- a/arbor/communication/communicator.hpp +++ b/arbor/communication/communicator.hpp @@ -9,15 +9,16 @@ #include <arbor/assert.hpp> #include <arbor/common_types.hpp> -#include <arbor/communication/gathered_vector.hpp> -#include <arbor/distributed_context.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/recipe.hpp> #include <arbor/spike.hpp> #include "algorithms.hpp" +#include "communication/gathered_vector.hpp" #include "connection.hpp" +#include "distributed_context.hpp" #include "event_queue.hpp" +#include "execution_context.hpp" #include "profile/profiler_macro.hpp" #include "threading/threading.hpp" #include "util/double_buffer.hpp" @@ -44,7 +45,7 @@ public: explicit communicator(const recipe& rec, const domain_decomposition& dom_dec, - execution_context ctx) + execution_context& ctx) { distributed_ = ctx.distributed; thread_pool_ = ctx.thread_pool; diff --git a/include/arbor/communication/gathered_vector.hpp b/arbor/communication/gathered_vector.hpp similarity index 100% rename from include/arbor/communication/gathered_vector.hpp rename to arbor/communication/gathered_vector.hpp diff --git a/arbor/communication/mpi.hpp b/arbor/communication/mpi.hpp index 3200dd9e2ee1c0620e4e7fe8d920e8e7dc8e748a..8d7ec7826cda5029c350d96ba3f03e22ec1dc227 100644 --- a/arbor/communication/mpi.hpp +++ b/arbor/communication/mpi.hpp @@ -8,10 +8,10 @@ #include <mpi.h> #include <arbor/assert.hpp> -#include <arbor/communication/gathered_vector.hpp> #include <arbor/communication/mpi_error.hpp> #include "algorithms.hpp" +#include "communication/gathered_vector.hpp" #include "profile/profiler_macro.hpp" diff --git a/arbor/communication/mpi_context.cpp b/arbor/communication/mpi_context.cpp index b1134b9a7334eb6f1b763ee898dcc50b6e2bac42..be80dca50966bddde9683d4b05d0b8c4cd1d54c7 100644 --- a/arbor/communication/mpi_context.cpp +++ b/arbor/communication/mpi_context.cpp @@ -10,10 +10,10 
@@ #include <mpi.h> -#include <arbor/distributed_context.hpp> #include <arbor/spike.hpp> #include "communication/mpi.hpp" +#include "distributed_context.hpp" namespace arb { @@ -62,12 +62,8 @@ struct mpi_context_impl { } }; -std::shared_ptr<distributed_context> mpi_context() { - return std::make_shared<distributed_context>(mpi_context_impl(MPI_COMM_WORLD)); -} - template <> -std::shared_ptr<distributed_context> mpi_context(MPI_Comm comm) { +std::shared_ptr<distributed_context> make_mpi_context(MPI_Comm comm) { return std::make_shared<distributed_context>(mpi_context_impl(comm)); } diff --git a/include/arbor/distributed_context.hpp b/arbor/distributed_context.hpp similarity index 94% rename from include/arbor/distributed_context.hpp rename to arbor/distributed_context.hpp index 4c95292ab127851e4a4420efa3de355d4ba66e9c..d4153a15755954d80692889cae48f28a25cf5b07 100644 --- a/include/arbor/distributed_context.hpp +++ b/arbor/distributed_context.hpp @@ -4,9 +4,10 @@ #include <string> #include <arbor/spike.hpp> -#include <arbor/communication/gathered_vector.hpp> #include <arbor/util/pp_util.hpp> +#include "communication/gathered_vector.hpp" + namespace arb { #define ARB_PUBLIC_COLLECTIVES_(T) \ @@ -163,12 +164,16 @@ inline distributed_context::distributed_context(): distributed_context(local_context()) {} -// MPI context creation functions only provided if built with MPI support. +using distributed_context_handle = std::shared_ptr<distributed_context>; -std::shared_ptr<distributed_context> mpi_context(); +inline +distributed_context_handle make_local_context() { + return std::make_shared<distributed_context>(); +} +// MPI context creation functions only provided if built with MPI support. template <typename MPICommType> -std::shared_ptr<distributed_context> mpi_context(MPICommType); +distributed_context_handle make_mpi_context(MPICommType); } // namespace arb diff --git a/arbor/execution_context.cpp b/arbor/execution_context.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5436332961b8157541992d30ca9e4787d24ff036 --- /dev/null +++ b/arbor/execution_context.cpp @@ -0,0 +1,73 @@ +#include <iostream> +#include <memory> + +#include <arbor/context.hpp> +#include <arbor/version.hpp> + +#include "gpu_context.hpp" +#include "distributed_context.hpp" +#include "execution_context.hpp" +#include "threading/threading.hpp" + +#ifdef ARB_MPI_ENABLED +#include <mpi.h> +#endif + +namespace arb { + +execution_context::execution_context(): + execution_context(proc_allocation()) +{} + +execution_context::execution_context(const proc_allocation& resources): + distributed(make_local_context()), + thread_pool(std::make_shared<threading::task_system>(resources.num_threads)), + gpu(resources.has_gpu()? std::make_shared<gpu_context>(resources.gpu_id) + : std::make_shared<gpu_context>()) +{} + +context make_context() { + return context(new execution_context(), [](execution_context* p){delete p;}); +} + +context make_context(const proc_allocation& p) { + return context(new execution_context(p), [](execution_context* p){delete p;}); +} + +#ifdef ARB_MPI_ENABLED +template <> +execution_context::execution_context<MPI_Comm>(const proc_allocation& resources, MPI_Comm comm): + distributed(make_mpi_context(comm)), + thread_pool(std::make_shared<threading::task_system>(resources.num_threads)), + gpu(resources.has_gpu()? 
std::make_shared<gpu_context>(resources.gpu_id) + : std::make_shared<gpu_context>()) +{} + +template <> +context make_context<MPI_Comm>(const proc_allocation& p, MPI_Comm comm) { + return context(new execution_context(p, comm), [](execution_context* p){delete p;}); +} +#endif + +bool has_gpu(const context& ctx) { + return ctx->gpu->has_gpu(); +} + +unsigned num_threads(const context& ctx) { + return ctx->thread_pool->get_num_threads(); +} + +unsigned num_ranks(const context& ctx) { + return ctx->distributed->size(); +} + +unsigned rank(const context& ctx) { + return ctx->distributed->id(); +} + +bool has_mpi(const context& ctx) { + return ctx->distributed->name() == "MPI"; +} + +} // namespace arb + diff --git a/arbor/execution_context.hpp b/arbor/execution_context.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ee1e8415e69cf0da710416bd352515fd5d9f3eb7 --- /dev/null +++ b/arbor/execution_context.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include <memory> + +#include <arbor/context.hpp> + +#include "distributed_context.hpp" +#include "threading/threading.hpp" +#include "gpu_context.hpp" + +namespace arb { + +// execution_context is a simple container for the state relating to +// execution resources. +// Specifically, it has handles for the distributed context, gpu +// context and thread pool. +// +// Note: the public API uses an opaque handle arb::context for +// execution_context, to hide implementation details of the +// container and its constituent contexts from the public API. + +struct execution_context { + distributed_context_handle distributed; + task_system_handle thread_pool; + gpu_context_handle gpu; + + execution_context(); + execution_context(const proc_allocation& resources); + + // Use a template for constructing with a specific distributed context. + // Specialised implementations are implemented in execution_context.cpp. 
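// --- Editor's illustrative sketch; not part of the patch. --------------------
// A minimal example of how the public wrappers defined in execution_context.cpp
// above are intended to be used. It assumes only what this diff introduces:
// arb::context, arb::proc_allocation and arb::make_context from <arbor/context.hpp>.
#include <iostream>
#include <arbor/context.hpp>

int main() {
    arb::proc_allocation resources;          // detected threads, first GPU if present
    auto ctx = arb::make_context(resources); // opaque handle to an execution_context
    std::cout << "threads: " << arb::num_threads(ctx) << "\n"
              << "gpu:     " << (arb::has_gpu(ctx)? "yes": "no") << "\n"
              << "ranks:   " << arb::num_ranks(ctx) << "\n";
}
// -----------------------------------------------------------------------------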
+ template <typename Comm> + execution_context(const proc_allocation& resources, Comm comm); +}; + +} // namespace arb diff --git a/arbor/fvm_lowered_cell.hpp b/arbor/fvm_lowered_cell.hpp index bb4184fcbce6b51cbf29a70f02cc6bb0cc02fc51..c710074cc18b4a096fa463993acc77b03118b901 100644 --- a/arbor/fvm_lowered_cell.hpp +++ b/arbor/fvm_lowered_cell.hpp @@ -4,12 +4,12 @@ #include <vector> #include <arbor/common_types.hpp> -#include <arbor/execution_context.hpp> #include <arbor/fvm_types.hpp> #include <arbor/recipe.hpp> #include "backends/event.hpp" #include "backends/threshold_crossing.hpp" +#include "execution_context.hpp" #include "sampler_map.hpp" #include "util/range.hpp" diff --git a/arbor/fvm_lowered_cell_impl.hpp b/arbor/fvm_lowered_cell_impl.hpp index fcc9670d2cc0597de39ebd9f04e4238cfb16973d..f2a454852c33d169d2013f09795251a5f56aff17 100644 --- a/arbor/fvm_lowered_cell_impl.hpp +++ b/arbor/fvm_lowered_cell_impl.hpp @@ -20,6 +20,7 @@ #include <arbor/recipe.hpp> #include "builtin_mechanisms.hpp" +#include "execution_context.hpp" #include "fvm_layout.hpp" #include "fvm_lowered_cell.hpp" #include "matrix.hpp" diff --git a/arbor/gpu_context.cpp b/arbor/gpu_context.cpp index 1fcbba074bb430c8117b096d131210dcd6cef939..34b078eab9fa066e70250e8ade4266a32cf42b2e 100644 --- a/arbor/gpu_context.cpp +++ b/arbor/gpu_context.cpp @@ -1,14 +1,29 @@ #include <memory> +#include <arbor/arbexcept.hpp> + +#include "gpu_context.hpp" + #ifdef ARB_HAVE_GPU #include <cuda.h> #include <cuda_runtime.h> #endif -#include "gpu_context.hpp" - namespace arb { +enum gpu_flags { + has_concurrent_managed_access = 1, + has_atomic_double = 2 +}; + +gpu_context_handle make_gpu_context(int id) { + return std::make_shared<gpu_context>(id); +} + +bool gpu_context_has_gpu(const gpu_context& ctx) { + return ctx.has_gpu(); +} + bool gpu_context::has_concurrent_managed_access() const { return attributes_ & gpu_flags::has_concurrent_managed_access; } @@ -17,23 +32,43 @@ bool gpu_context::has_atomic_double() const { return attributes_ & gpu_flags::has_atomic_double; } +bool gpu_context::has_gpu() const { + return id_ != -1; +} + #ifndef ARB_HAVE_GPU -gpu_context::gpu_context(): has_gpu_(false), attributes_(0) {} void gpu_context::synchronize_for_managed_access() const {} +gpu_context::gpu_context(int) { + throw arbor_exception("Arbor must be compiled with CUDA support to select a GPU."); +} #else -gpu_context::gpu_context(): has_gpu_(true), attributes_(0) { +gpu_context::gpu_context(int gpu_id) { cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, 0); + auto status = cudaGetDeviceProperties(&prop, gpu_id); + if (status==cudaErrorInvalidDevice) { + throw arbor_exception("Invalid GPU id " + std::to_string(gpu_id)); + } + + // Set the device + status = cudaSetDevice(gpu_id); + if (status!=cudaSuccess) { + throw arbor_exception("Unable to select GPU id " + std::to_string(gpu_id)); + } + + id_ = gpu_id; + + // Record the device attributes + attributes_ = 0; if (prop.concurrentManagedAccess) { attributes_ |= gpu_flags::has_concurrent_managed_access; } if (prop.major*100 + prop.minor >= 600) { attributes_ |= gpu_flags::has_atomic_double; } -}; +} void gpu_context::synchronize_for_managed_access() const { if(!has_concurrent_managed_access()) { @@ -43,8 +78,4 @@ void gpu_context::synchronize_for_managed_access() const { #endif -std::shared_ptr<gpu_context> make_gpu_context() { - return std::make_shared<gpu_context>(); -} - -} +} // namespace arb diff --git a/arbor/gpu_context.hpp b/arbor/gpu_context.hpp index 
edcb9c933ccddbafd68e3a1412c80dc54e7ea512..2b1e4f496a2d47c3a848e2bd08f1c58a5f02ff0e 100644 --- a/arbor/gpu_context.hpp +++ b/arbor/gpu_context.hpp @@ -1,21 +1,25 @@ #pragma once -namespace arb { +#include <cstdlib> +#include <memory> -enum gpu_flags { - has_concurrent_managed_access = 1, - has_atomic_double = 2 -}; +namespace arb { -struct gpu_context { - bool has_gpu_; - size_t attributes_; +class gpu_context { + int id_ = -1; + std::size_t attributes_ = 0; - gpu_context(); +public: + gpu_context() = default; + gpu_context(int id); bool has_concurrent_managed_access() const; bool has_atomic_double() const; void synchronize_for_managed_access() const; + bool has_gpu() const; }; -} +using gpu_context_handle = std::shared_ptr<gpu_context>; +gpu_context_handle make_gpu_context(int id); + +} // namespace arb diff --git a/arbor/local_alloc.cpp b/arbor/local_alloc.cpp index dfe00d603d480b45227221455b56acc88bed6359..3320e22e5b53586d8e90adf56d95580821397e3e 100644 --- a/arbor/local_alloc.cpp +++ b/arbor/local_alloc.cpp @@ -1,17 +1,16 @@ -#include <arbor/domain_decomposition.hpp> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include "hardware/node_info.hpp" +#include "threading/thread_info.hpp" #include "threading/threading.hpp" namespace arb { -proc_allocation local_allocation(const execution_context& ctx) { - proc_allocation info; - info.num_threads = ctx.thread_pool->get_num_threads(); - info.num_gpus = arb::hw::node_gpus(); +local_resources get_local_resources() { + auto avail_threads = threading::num_threads_init(); + auto avail_gpus = arb::hw::node_gpus(); - return info; + return local_resources(avail_threads, avail_gpus); } } // namespace arb diff --git a/arbor/partition_load_balance.cpp b/arbor/partition_load_balance.cpp index 0e4ffca339abe8ac1f51d47cdb97ae77d9dba8d6..fe1d79b27975dc0b97bbb1da51be94bf60bea094 100644 --- a/arbor/partition_load_balance.cpp +++ b/arbor/partition_load_balance.cpp @@ -1,9 +1,11 @@ #include <arbor/domain_decomposition.hpp> #include <arbor/load_balance.hpp> #include <arbor/recipe.hpp> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include "cell_group_factory.hpp" +#include "execution_context.hpp" +#include "gpu_context.hpp" #include "util/maputil.hpp" #include "util/partition.hpp" #include "util/span.hpp" @@ -12,10 +14,11 @@ namespace arb { domain_decomposition partition_load_balance( const recipe& rec, - proc_allocation nd, - const execution_context& ctx, + const context& ctx, partition_hint_map hint_map) { + const bool gpu_avail = ctx->gpu->has_gpu(); + struct partition_gid_domain { partition_gid_domain(std::vector<cell_gid_type> divs): gid_divisions(std::move(divs)) @@ -31,8 +34,8 @@ domain_decomposition partition_load_balance( using util::make_span; - unsigned num_domains = ctx.distributed->size(); - unsigned domain_id = ctx.distributed->id(); + unsigned num_domains = ctx->distributed->size(); + unsigned domain_id = ctx->distributed->id(); auto num_global_cells = rec.num_cells(); auto dom_size = [&](unsigned dom) -> cell_gid_type { @@ -65,8 +68,8 @@ domain_decomposition partition_load_balance( // of cell group updates according to rules such as the back end on // which the cell group is running. 
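// --- Editor's illustrative sketch; not part of the patch. --------------------
// partition_load_balance() now takes the context handle instead of a
// proc_allocation; GPU availability is read from ctx->gpu as above. A sketch
// assuming a user-supplied recipe; the cell_kind enumerator is only an example.
#include <arbor/context.hpp>
#include <arbor/load_balance.hpp>
#include <arbor/recipe.hpp>

arb::domain_decomposition decompose(const arb::recipe& rec, const arb::context& ctx) {
    arb::partition_hint_map hints;
    hints[arb::cell_kind::cable1d_neuron].prefer_gpu = true;     // honoured only if ctx has a GPU
    hints[arb::cell_kind::cable1d_neuron].gpu_group_size = 1000;
    return arb::partition_load_balance(rec, ctx, hints);
}
// -----------------------------------------------------------------------------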
- auto has_gpu_backend = [ctx](cell_kind c) { - return cell_kind_supported(c, backend_kind::gpu, ctx); + auto has_gpu_backend = [&ctx](cell_kind c) { + return cell_kind_supported(c, backend_kind::gpu, *ctx); }; std::vector<cell_kind> kinds; @@ -85,7 +88,7 @@ domain_decomposition partition_load_balance( backend_kind backend = backend_kind::multicore; std::size_t group_size = hint.cpu_group_size; - if (hint.prefer_gpu && nd.num_gpus>0 && has_gpu_backend(k)) { + if (hint.prefer_gpu && gpu_avail && has_gpu_backend(k)) { backend = backend_kind::gpu; group_size = hint.gpu_group_size; } diff --git a/arbor/profile/meter_manager.cpp b/arbor/profile/meter_manager.cpp index e08b1c1a85d2202a80443413d3667584952bb551..ba5fd8dc6c0b8a4b5706817f5e16abdcc67401c3 100644 --- a/arbor/profile/meter_manager.cpp +++ b/arbor/profile/meter_manager.cpp @@ -1,12 +1,13 @@ #include <arbor/profile/timer.hpp> #include <arbor/profile/meter_manager.hpp> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include "memory_meter.hpp" #include "power_meter.hpp" #include "algorithms.hpp" +#include "execution_context.hpp" #include "util/hostname.hpp" #include "util/strprintf.hpp" #include "util/rangeutil.hpp" @@ -19,23 +20,25 @@ using util::strprintf; measurement::measurement(std::string n, std::string u, const std::vector<double>& readings, - const distributed_context_handle& ctx): + const context& ctx): name(std::move(n)), units(std::move(u)) { + auto dist = ctx->distributed; + // Assert that the same number of readings were taken on every domain. const auto num_readings = readings.size(); - if (ctx->min(num_readings)!=ctx->max(num_readings)) { + if (dist->min(num_readings)!=dist->max(num_readings)) { throw std::out_of_range( "the number of checkpoints in the \""+name+"\" meter do not match across domains"); } // Gather across all of the domains onto the root domain. for (auto r: readings) { - measurements.push_back(ctx->gather(r, 0)); + measurements.push_back(dist->gather(r, 0)); } } -meter_manager::meter_manager(distributed_context_handle ctx): glob_ctx_(ctx) { +meter_manager::meter_manager() { if (auto m = make_memory_meter()) { meters_.push_back(std::move(m)); } @@ -47,7 +50,7 @@ meter_manager::meter_manager(distributed_context_handle ctx): glob_ctx_(ctx) { } }; -void meter_manager::start() { +void meter_manager::start(const context& ctx) { arb_assert(!started_); started_ = true; @@ -58,13 +61,13 @@ void meter_manager::start() { } // Enforce a global barrier after taking the time stamp - glob_ctx_->barrier(); + ctx->distributed->barrier(); start_time_ = timer_type::tic(); }; -void meter_manager::checkpoint(std::string name) { +void meter_manager::checkpoint(std::string name, const context& ctx) { arb_assert(started_); // Record the time taken on this domain since the last checkpoint @@ -77,7 +80,7 @@ void meter_manager::checkpoint(std::string name) { } // Synchronize all domains before setting start time for the next interval - glob_ctx_->barrier(); + ctx->distributed->barrier(); start_time_ = timer<>::tic(); } @@ -93,17 +96,11 @@ const std::vector<double>& meter_manager::times() const { return times_; } -distributed_context_handle meter_manager::context() const { - return glob_ctx_; -} - // Build a report of meters, for use at the end of a simulation // for output to file or analysis. 
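// --- Editor's illustrative sketch; not part of the patch. --------------------
// The meter_manager no longer owns a distributed context; the context handle is
// now passed explicitly to start(), checkpoint() and make_meter_report(), as the
// updated examples later in this diff do. Sketch assuming a valid arb::context.
#include <iostream>
#include <arbor/context.hpp>
#include <arbor/profile/meter_manager.hpp>

void profile_run(const arb::context& ctx) {
    arb::profile::meter_manager meters;
    meters.start(ctx);
    // ... build and run the model ...
    meters.checkpoint("model-run", ctx);
    std::cout << arb::profile::make_meter_report(meters, ctx) << "\n";
}
// -----------------------------------------------------------------------------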
-meter_report make_meter_report(const meter_manager& manager) { +meter_report make_meter_report(const meter_manager& manager, const context& ctx) { meter_report report; - auto ctx = manager.context(); - // Add the times to the meter outputs report.meters.push_back(measurement("time", "s", manager.times(), ctx)); @@ -115,7 +112,7 @@ meter_report make_meter_report(const meter_manager& manager) { // Gather a vector with the names of the node that each rank is running on. auto host = util::hostname(); - auto hosts = ctx->gather(host? *host: "unknown", 0); + auto hosts = ctx->distributed->gather(host? *host: "unknown", 0); report.hosts = hosts; // Count the number of unique hosts. @@ -124,7 +121,7 @@ meter_report make_meter_report(const meter_manager& manager) { auto num_hosts = std::distance(hosts.begin(), std::unique(hosts.begin(), hosts.end())); report.checkpoints = manager.checkpoint_names(); - report.num_domains = ctx->size(); + report.num_domains = ctx->distributed->size(); report.num_hosts = num_hosts; return report; diff --git a/arbor/profile/profiler.cpp b/arbor/profile/profiler.cpp index 9923438f8a7167d6a43bdf66e1a26d13a05cafc7..4ecd1efc5cc04cd7302ce29554df04ac7646edaf 100644 --- a/arbor/profile/profiler.cpp +++ b/arbor/profile/profiler.cpp @@ -2,8 +2,10 @@ #include <mutex> #include <ostream> +#include <arbor/context.hpp> #include <arbor/profile/profiler.hpp> +#include "execution_context.hpp" #include "threading/threading.hpp" #include "util/span.hpp" #include "util/rangeutil.hpp" @@ -341,8 +343,8 @@ void profiler_enter(region_id_type region_id) { profiler::get_global_profiler().enter(region_id); } -void profiler_initialize(task_system_handle& ts) { - profiler::get_global_profiler().initialize(ts); +void profiler_initialize(context& ctx) { + profiler::get_global_profiler().initialize(ctx->thread_pool); } // Print profiler statistics to an ostream diff --git a/arbor/simulation.cpp b/arbor/simulation.cpp index 87d88c28b594a99feac251d5b81c122486a44743..2e87f4a45cf4da9f19f52acfd0a158c2d6a4681b 100644 --- a/arbor/simulation.cpp +++ b/arbor/simulation.cpp @@ -2,6 +2,7 @@ #include <set> #include <vector> +#include <arbor/context.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/generic_event.hpp> #include <arbor/recipe.hpp> @@ -11,6 +12,7 @@ #include "cell_group.hpp" #include "cell_group_factory.hpp" #include "communication/communicator.hpp" +#include "execution_context.hpp" #include "merge_events.hpp" #include "thread_private_spike_store.hpp" #include "threading/threading.hpp" @@ -419,9 +421,9 @@ void simulation_state::inject_events(const pse_vector& events) { simulation::simulation( const recipe& rec, const domain_decomposition& decomp, - execution_context ctx) + const context& ctx) { - impl_.reset(new simulation_state(rec, decomp, ctx)); + impl_.reset(new simulation_state(rec, decomp, *ctx)); } void simulation::reset() { diff --git a/arbor/thread_private_spike_store.cpp b/arbor/thread_private_spike_store.cpp index e6a1f92eaa07682d3673d68aa54edacc058abee4..f7805350c486e7d0bbb619f430c9029e2a3494e5 100644 --- a/arbor/thread_private_spike_store.cpp +++ b/arbor/thread_private_spike_store.cpp @@ -3,6 +3,8 @@ #include <arbor/common_types.hpp> #include <arbor/spike.hpp> +#include "threading/enumerable_thread_specific.hpp" +#include "threading/threading.hpp" #include "thread_private_spike_store.hpp" namespace arb { @@ -13,11 +15,13 @@ struct local_spike_store_type { local_spike_store_type(const task_system_handle& ts): buffers_(ts) {}; }; 
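// --- Editor's illustrative sketch; not part of the patch. --------------------
// enumerable_thread_specific (moved below into threading/enumerable_thread_specific.hpp)
// keeps one T per thread of a task_system; local() returns the calling thread's slot.
// Internal API, so this is only a sketch; it assumes the calling thread is one of
// the threads registered with the task_system.
#include "threading/enumerable_thread_specific.hpp"
#include "threading/threading.hpp"

int count_with_ets(const arb::task_system_handle& ts) {
    arb::threading::enumerable_thread_specific<int> counts(0, ts);
    counts.local() += 1;                 // touches only this thread's entry
    int total = 0;
    for (int c: counts) total += c;      // reduce over all per-thread entries
    return total;
}
// -----------------------------------------------------------------------------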
-thread_private_spike_store::thread_private_spike_store(thread_private_spike_store&& t): impl_(std::move(t.impl_)) {}; +thread_private_spike_store::thread_private_spike_store(thread_private_spike_store&& t): + impl_(std::move(t.impl_)) +{} thread_private_spike_store::thread_private_spike_store(const task_system_handle& ts): - impl_(new local_spike_store_type(ts)) { -} + impl_(new local_spike_store_type(ts)) +{} thread_private_spike_store::~thread_private_spike_store() {} diff --git a/arbor/thread_private_spike_store.hpp b/arbor/thread_private_spike_store.hpp index eec507090ad6332ad430e78e6182d00ab463b8e2..1293de5b26521311d741785190dee0a8e2e66fcb 100644 --- a/arbor/thread_private_spike_store.hpp +++ b/arbor/thread_private_spike_store.hpp @@ -5,7 +5,6 @@ #include <arbor/common_types.hpp> #include <arbor/spike.hpp> -#include <arbor/execution_context.hpp> #include "threading/threading.hpp" diff --git a/arbor/threading/enumerable_thread_specific.hpp b/arbor/threading/enumerable_thread_specific.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1fbbfea407370c1a4cf81d3906a6bc6a012bbc04 --- /dev/null +++ b/arbor/threading/enumerable_thread_specific.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include <vector> + +#include "threading.hpp" + +namespace arb { +namespace threading { + +template <typename T> +class enumerable_thread_specific { + std::unordered_map<std::thread::id, std::size_t> thread_ids_; + + using storage_class = std::vector<T>; + storage_class data; + +public: + using iterator = typename storage_class::iterator; + using const_iterator = typename storage_class::const_iterator; + + enumerable_thread_specific(const task_system_handle& ts): + thread_ids_{ts->get_thread_ids()}, + data{std::vector<T>(ts->get_num_threads())} + {} + + enumerable_thread_specific(const T& init, const task_system_handle& ts): + thread_ids_{ts->get_thread_ids()}, + data{std::vector<T>(ts->get_num_threads(), init)} + {} + + T& local() { + return data[thread_ids_.at(std::this_thread::get_id())]; + } + const T& local() const { + return data[thread_ids_.at(std::this_thread::get_id())]; + } + + auto size() const { return data.size(); } + + iterator begin() { return data.begin(); } + iterator end() { return data.end(); } + + const_iterator begin() const { return data.begin(); } + const_iterator end() const { return data.end(); } + + const_iterator cbegin() const { return data.cbegin(); } + const_iterator cend() const { return data.cend(); } +}; + +} // namespace threading +} // namespace arb + diff --git a/arbor/threading/thread_info.cpp b/arbor/threading/thread_info.cpp index 025de42097b31b66ea8352051491cb4d42c546ea..dce15dbb6eb819ef13cb9547d4f28d455c7c1169 100644 --- a/arbor/threading/thread_info.cpp +++ b/arbor/threading/thread_info.cpp @@ -45,6 +45,7 @@ util::optional<size_t> get_env_num_threads() { return util::nullopt; } + errno = 0; auto nthreads = std::strtoul(str, nullptr, 10); // check that the environment variable string describes a non-negative integer diff --git a/arbor/threading/threading.cpp b/arbor/threading/threading.cpp index 7c28cdc8cf9d69c5af78f7fc28cb3c0ba27bfbb9..ec116fad338132006f40cddec28cad884530084b 100644 --- a/arbor/threading/threading.cpp +++ b/arbor/threading/threading.cpp @@ -2,7 +2,6 @@ #include "threading.hpp" #include "thread_info.hpp" -#include <arbor/execution_context.hpp> using namespace arb::threading::impl; using namespace arb::threading; @@ -114,18 +113,10 @@ void task_system::async(task tsk) { q_[i % count_].push(std::move(tsk)); } -int 
task_system::get_num_threads() { +int task_system::get_num_threads() const { return threads_.size() + 1; } -std::unordered_map<std::thread::id, std::size_t> task_system::get_thread_ids() { +std::unordered_map<std::thread::id, std::size_t> task_system::get_thread_ids() const { return thread_ids_; }; - -task_system_handle arb::make_thread_pool() { - return arb::make_thread_pool(num_threads_init()); -} - -task_system_handle arb::make_thread_pool(int nthreads) { - return task_system_handle(new task_system(nthreads)); -} diff --git a/arbor/threading/threading.hpp b/arbor/threading/threading.hpp index ea084dcce9371417ab8df5379b38695b86fee0cd..5c9d8ca0f203d87a97d888cc66e73d1e22960eaa 100644 --- a/arbor/threading/threading.hpp +++ b/arbor/threading/threading.hpp @@ -14,8 +14,6 @@ #include <unordered_map> #include <utility> -#include <arbor/execution_context.hpp> - namespace arb { namespace threading { @@ -91,62 +89,12 @@ public: void try_run_task(); // Includes master thread. - int get_num_threads(); + int get_num_threads() const; // Returns the thread_id map - std::unordered_map<std::thread::id, std::size_t> get_thread_ids(); + std::unordered_map<std::thread::id, std::size_t> get_thread_ids() const; }; -/////////////////////////////////////////////////////////////////////// -// types -/////////////////////////////////////////////////////////////////////// - -template <typename T> -class enumerable_thread_specific { - std::unordered_map<std::thread::id, std::size_t> thread_ids_; - - using storage_class = std::vector<T>; - storage_class data; - -public: - using iterator = typename storage_class::iterator; - using const_iterator = typename storage_class::const_iterator; - - enumerable_thread_specific(const task_system_handle& ts): - thread_ids_{ts.get()->get_thread_ids()}, - data{std::vector<T>(ts.get()->get_num_threads())} - {} - - enumerable_thread_specific(const T& init, const task_system_handle& ts): - thread_ids_{ts.get()->get_thread_ids()}, - data{std::vector<T>(ts.get()->get_num_threads(), init)} - {} - - T& local() { - return data[thread_ids_.at(std::this_thread::get_id())]; - } - const T& local() const { - return data[thread_ids_.at(std::this_thread::get_id())]; - } - - auto size() const { return data.size(); } - - iterator begin() { return data.begin(); } - iterator end() { return data.end(); } - - const_iterator begin() const { return data.begin(); } - const_iterator end() const { return data.end(); } - - const_iterator cbegin() const { return data.cbegin(); } - const_iterator cend() const { return data.cend(); } -}; - -inline std::string description() { - return "CThread Pool"; -} - -constexpr bool multithreaded() { return true; } - class task_group { private: std::atomic<std::size_t> in_flight_{0}; @@ -235,4 +183,7 @@ struct parallel_for { } }; } // namespace threading + +using task_system_handle = std::shared_ptr<threading::task_system>; + } // namespace arb diff --git a/arbor/util/double_buffer.hpp b/arbor/util/double_buffer.hpp index 67afecdee7da1134c8d770106baaf9af904b5f82..845a5ce9457722e4688403007923d444002e75c3 100644 --- a/arbor/util/double_buffer.hpp +++ b/arbor/util/double_buffer.hpp @@ -1,10 +1,9 @@ #pragma once -#include <array> #include <atomic> +#include <vector> #include <arbor/assert.hpp> -#include <arbor/execution_context.hpp> namespace arb { namespace util { diff --git a/doc/cpp_distributed_context.rst b/doc/cpp_distributed_context.rst index 3eeeaee08964cdeb105763b25901bff6073648f0..b5878a1eafc420dff6221242046f2cfaaa57ac50 100644 --- a/doc/cpp_distributed_context.rst 
+++ b/doc/cpp_distributed_context.rst @@ -18,52 +18,55 @@ This means that if Arbor is compiled with support for MPI enabled, then at run t user can choose between using a non-distributed (local) context, or an distributed MPI context. -A global context is created by a user before building and running a simulation. -The context is then used to perform domain decomposition and initialize the simulation +An execution context is created by a user before building and running a simulation. +This context is then used to perform domain decomposition and initialize the simulation (see :ref:`cppsimulation` for more about the simulation building workflow). In the example below, a context that uses MPI is used to run a distributed simulation: +The public API does not directly expose :cpp:class:`arb::distributed_context` or any of its +implementations. +By default :cpp:class:`arb::context` uses only local "on-node" resources. To use an MPI +communicator for distributed communication, it can be initialised with the communicator: + .. container:: example-code .. code-block:: cpp - arb::hw::node_info node; + arb::proc_allocation resources; my_recipe recipe; - // Get an MPI communication context - arb::distributed_context context = arb::mpi_context(); + // Create a context that uses the local resources enumerated in resources, + // and that uses the standard MPI communicator MPI_COMM_WORLD for + // distributed communication. + arb::context context = arb::make_context(resources, MPI_COMM_WORLD); - // Partition model over the distributed system - arb::domain_decomposition decomp = arb::partition_load_balance(recipe, node, &context); + // Partition model over the distributed system. + arb::domain_decomposition decomp = arb::partition_load_balance(recipe, context); - // Instatitate the simulation over the distributed system - arb::simulation sim(recipe, decomp, &context); + // Instantiate the simulation over the distributed system. + arb::simulation sim(recipe, decomp, context); - // Run the simulation for 100ms over the distributed system + // Run the simulation for 100ms over the distributed system. sim.run(100, 0.01); -By default :cpp:class:`arb::distributed_context` uses an :cpp:class:`arb::local_context`, which -runs on the local computer or node, that is, it is not distributed. +In the back end :cpp:class:`arb::distributed_context` defines the interface for distributed contexts, +for which two implementations are provided: :cpp:class:`arb::local_context` and :cpp:class:`arb::mpi_context`. +Distributed contexts are wrapped in shared pointers: + +.. cpp:type:: distributed_context_handle = std::shared_ptr<distributed_context> -To run on a distributed system, use :cpp:class:`arb::mpi_context`, which uses -MPI for distributed communication. -By default the context will use the default MPI communicator ``MPI_COMM_WORLD``, -though it can be initialised with a user-supplied communicator. +A distributed context can then be generated using helper functions :cpp:func:`arb::make_local_context` and +:cpp:func:`arb::make_mpi_context`: .. container:: example-code .. code-block:: cpp - arb::distributed_context context; - - // This is equivelent to default constructed context above - arb::distributed_context context = arb::local_context(); + // Create a context that uses only local resources (is non-distributed).
+ auto dist_ctx = arb::make_local_context(); - // Create an MPI context that uses MPI_COMM_WORLD - arb::distributed_context context = arb::mpi_context(); - - // create an MPI context with a user-supplied MPI_Comm - arb::distributed_context context = arb::mpi_context(communicator); + // Create an MPI context that uses the standard MPI_COMM_WORLD communicator. + auto dist_ctx = arb::make_mpi_context(MPI_COMM_WORLD); Class Documentation @@ -170,6 +173,10 @@ Class Documentation Default constructor. +.. cpp:function:: distributed_context_handle make_local_context() + + Convenience function that returns a handle to a local context. + .. cpp:class:: mpi_context Implements the :cpp:class:`arb::distributed_context` interface for @@ -177,7 +184,12 @@ Class Documentation **Constructor:** - .. cpp:function:: mpi_context(MPI_Comm comm=MPI_COMM_WORLD) + .. cpp:function:: mpi_context(MPI_Comm comm) Create a context that will uses the MPI communicator :cpp:var:`comm`. - By default uses the global communicator ``MPI_COMM_WORLD``. + +.. cpp:function:: distributed_context_handle make_mpi_context(MPI_Comm comm) + + Convenience function that returns a handle to a :cpp:class:`arb::mpi_context` + that uses the MPI communicator comm. + diff --git a/doc/cpp_domdec.rst b/doc/cpp_domdec.rst index f4cd406436e37ff8567937e8b8aea6c04624f0f3..4ae78020fa2cc3c712cb896d2e78a3578f90aa0a 100644 --- a/doc/cpp_domdec.rst +++ b/doc/cpp_domdec.rst @@ -30,45 +30,141 @@ Load balancer Hardware -------- -.. cpp:namespace:: arb::hw +.. cpp:namespace:: arb + +.. cpp:class:: local_resources + + Enumerates the computational resources available locally, specifically the + number of hardware threads and the number of GPUs. + + The function :cpp:func:`arb::get_local_resources` can be used to automatically + detect the available resources, which are returned in a :cpp:class:`local_resources` instance: + + .. container:: example-code + + .. code-block:: cpp + + auto resources = arb::get_local_resources(); + std::cout << "This node supports " << resources.num_threads << " threads," + << " and " << resources.num_gpus << " gpus."; + + .. cpp:function:: local_resources(unsigned threads, unsigned gpus) + + Constructor. -.. cpp:class:: node_info + .. cpp:member:: const unsigned num_threads - Information about the computational resources available to a simulation, typically a + The number of threads available. + + .. cpp:member:: const unsigned num_gpus + + The number of GPUs available. + +.. cpp:function:: local_resources get_local_resources() + + Returns an instance of :cpp:class:`local_resources` with the following: + + * ``num_threads`` is determined from the ``ARB_NUM_THREADS`` environment variable if + set, otherwise Arbor attempts to detect the number of available hardware cores. + If Arbor can't determine the available threads it defaults to 1 thread. + * ``num_gpus`` is the number of GPUs detected using the CUDA + `cudaGetDeviceCount API call <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html>`_. + +.. cpp:class:: proc_allocation + + Enumerates the computational resources to be used for a simulation, typically a subset of the resources available on a physical hardware node. - When used for distributed simulations, where a model will be distributed over more than - one node, a :cpp:class:`hw::node_info` represents the resources available to the local - MPI rank. .. container:: example-code .. code-block:: cpp - // Make node that uses one thread for each available hardware thread, - // and one GPU if any GPUs are available.
- hw::node_info node; - node.num_cpu_cores = threading::num_threads(); - node.num_gpus = hw::num_gpus()>0? 1: 0; + // Default construction uses all detected cores/threads, and the first GPU, if available. + arb::proc_allocation resources; - .. cpp:function:: node_info() = default + // Remove any GPU from the resource description. + resources.gpu_id = -1; - Default constructor (sets 1 CPU core and 0 GPUs). - .. cpp:function:: node_info(unsigned cores, unsigned gpus) + .. cpp:function:: proc_allocation() = default - Constructor that sets the number of :cpp:var:`cores` and :cpp:var:`gpus` available. + Sets the number of threads to the number detected by :cpp:func:`get_local_resources`, and + chooses either the first available GPU, or no GPU if none are available. - .. cpp:member:: unsigned num_cpu_cores = 1 + .. cpp:function:: proc_allocation(unsigned threads, int gpu_id) - The number of CPU cores available. + Constructor that sets the number of threads to :cpp:var:`threads` and selects the GPU with id :cpp:var:`gpu_id`. - By default it is assumed that there is one core available. + .. cpp:member:: unsigned num_threads - .. cpp:member:: unsigned num_gpus = 0 + The number of CPU threads available. - The number of GPUs available. + .. cpp:member:: int gpu_id + + The identifier of the GPU to use. + The gpu id corresponds to the ``int device`` parameter used by CUDA API calls + to identify gpu devices. + Set to -1 to indicate that no GPU device is to be used. + See ``cudaSetDevice`` and ``cudaDeviceGetAttribute`` provided by the + `CUDA API <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html>`_. + + .. cpp:function:: bool has_gpu() const + + Indicates whether a GPU is selected (i.e. whether :cpp:member:`gpu_id` is not ``-1``). + +Execution Context +----------------- + +The :cpp:class:`proc_allocation` class enumerates the local hardware resources +to be used for a simulation. +A :cpp:class:`arb::context` combines these local resources with the distributed context used +for inter-node communication, behind an opaque handle that is passed to Arbor's interfaces. + +.. cpp:namespace:: arb + +.. cpp:class:: context + + A handle for the interfaces to the hardware resources used in a simulation. + A :cpp:class:`context` contains the local thread pool, and optionally the GPU state + and MPI communicator, if available. Users of the library do not directly use the functionality + provided by :cpp:class:`context`; instead they configure contexts, which are passed to + Arbor methods and types. + +.. cpp:function:: context make_context() + + Local context that uses all detected threads and a GPU if any are available. + +.. cpp:function:: context make_context(proc_allocation alloc) + + Local context that uses the local resources described by :cpp:var:`alloc`. + +.. cpp:function:: context make_context(proc_allocation alloc, MPI_Comm comm) + + A context that uses the local resources described by :cpp:var:`alloc`, and + uses the MPI communicator :cpp:var:`comm` for distributed calculation. + + +Here are some examples of how to create a :cpp:class:`arb::context`: + + .. container:: example-code + + .. code-block:: cpp + + #include <arbor/context.hpp> + + // Construct a non-distributed context that uses all detected available resources. + auto context = arb::make_context(); + + // Construct a context that: + // * does not use a GPU, regardless of whether one is available; + // * uses 8 threads in its thread pool. + arb::proc_allocation resources(8, -1); + auto context = arb::make_context(resources); - By default it is assumed that there are no GPUs.
+ // Construct a context that: + // * uses all available local hardware resources; + // * uses the standard MPI communicator MPI_COMM_WORLD for distributed computation. + arb::proc_allocation resources; // defaults to all detected local resources + auto context = arb::make_context(resources, MPI_COMM_WORLD); Load Balancers -------------- @@ -91,12 +187,11 @@ describes the cell groups on the local MPI rank. .. cpp:namespace:: arb -.. cpp:function:: domain_decomposition partition_load_balance(const recipe& rec, hw::node_info nd, const distributed_context* ctx) +.. cpp:function:: domain_decomposition partition_load_balance(const recipe& rec, const arb::context& ctx) Construct a :cpp:class:`domain_decomposition` that distributes the cells - in the model described by :cpp:any:`rec` over the set of distributed - compute nodes that communicate using :cpp:any:`ctx`, with hardware resources - on the calling node described by :cpp:any:`nd`. + in the model described by :cpp:any:`rec` over the distributed and local hardware + resources described by :cpp:any:`ctx`. The algorithm counts the number of each cell type in the global model, then partitions the cells of each type equally over the available nodes. diff --git a/doc/cpp_simulation.rst b/doc/cpp_simulation.rst index a1b5d5f8f442b6c7e47791922c588d8fd0a5c0d0..525314f798335d2289475858afe254352f660805 100644 --- a/doc/cpp_simulation.rst +++ b/doc/cpp_simulation.rst @@ -10,37 +10,33 @@ To build a simulation the following are needed: * An :cpp:class:`arb::recipe` that describes the cells and connections in the model. - * An :cpp:class:`arb::hw::node_info` that describes the CPU and GPU hardware - resources on which the model will be run. - * An :cpp:class:`arb::distributed_context` that describes the distributed system - on which the model will run. + * An :cpp:class:`arb::context` used to execute the simulation. The workflow to build a simulation is to first generate a :cpp:class:`arb::domain_decomposition` that describes the distribution of the model -over the local and distributed hardware resources (see :ref:`cppdomdec` and :ref:`cppdistcontext`), +over the local and distributed hardware resources (see :ref:`cppdomdec`), then build the simulation. .. container:: example-code .. code-block:: cpp - // Get a communication context - arb::distributed_context context; + #include <arbor/context.hpp> + #include <arbor/domain_decomposition.hpp> + #include <arbor/simulation.hpp> - // Make description of the hardware that the simulation will run on. - arb::hw::node_info node; - node.num_cpu_cores = arb::threading::num_threads(); - node.num_gpus = arb::hw::num_gpus()>0? 1: 0; // use 1 GPU if any available + // Construct an execution context. + arb::context context = arb::make_context(); // Make a recipe of user defined type my_recipe. my_recipe recipe; // Get a description of the partition the model over the cores // (and gpu if available) on node. - arb::domain_decomposition decomp = arb::partition_load_balance(recipe, node, &context); + arb::domain_decomposition decomp = arb::partition_load_balance(recipe, context); // Instatitate the simulation.
- arb::simulation sim(recipe, decomp, &context); + arb::simulation sim(recipe, decomp, context); Class Documentation @@ -59,8 +55,7 @@ Class Documentation * an :cpp:class:`arb::recipe` that describes the model; * an :cpp:class:`arb::domain_decomposition` that describes how the cells in the model are assigned to hardware resources; - * an :cpp:class:`arb::distributed_context` which performs communication - on distributed memory syustems. + * an :cpp:class:`arb::context` which is used to execute the simulation. * **Experimental inputs** that can change between model runs, such as external spike trains. @@ -81,7 +76,7 @@ Class Documentation **Constructor:** - .. cpp:function:: simulation(const recipe& rec, const domain_decomposition& decomp, const distributed_context* ctx) + .. cpp:function:: simulation(const recipe& rec, const domain_decomposition& decomp, const context& ctx) **Experimental inputs:** diff --git a/doc/index.rst b/doc/index.rst index f6de1c0fe9ce2e0292a61f3c07d5a168ba419ddf..0b4364d29e9be1044192ce4f068d837468698f1b 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -55,7 +55,6 @@ Some key features include: cpp_recipe cpp_domdec cpp_simulation - cpp_distributed_context .. toctree:: :caption: Developers: @@ -64,4 +63,5 @@ Some key features include: simd_api profiler sampling_api + cpp_distributed_context diff --git a/example/bench/bench.cpp b/example/bench/bench.cpp index a8c32991ecbf2ebde93e1e5586decef4218c915a..988b07b04c506f817609fe24bafd50959b1ca03b 100644 --- a/example/bench/bench.cpp +++ b/example/bench/bench.cpp @@ -9,8 +9,7 @@ #include <nlohmann/json.hpp> #include <arbor/profile/meter_manager.hpp> -#include <arbor/common_types.hpp> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/load_balance.hpp> #include <arbor/profile/profiler.hpp> @@ -31,16 +30,23 @@ namespace profile = arb::profile; int main(int argc, char** argv) { + bool is_root = true; + try { - arb::execution_context context; #ifdef ARB_MPI_ENABLED aux::with_mpi guard(argc, argv, false); - context.distributed = arb::mpi_context(MPI_COMM_WORLD); + auto context = arb::make_context(arb::proc_allocation(), MPI_COMM_WORLD); + { + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + is_root = rank==0; + } +#else + auto context = arb::make_context(); #endif #ifdef ARB_PROFILE_ENABLED - profile::profiler_initialize(context.thread_pool); + profile::profiler_initialize(context); #endif - const bool is_root = context.distributed->id()==0; std::cout << aux::mask_stream(is_root); @@ -48,31 +54,30 @@ int main(int argc, char** argv) { std::cout << params << "\n"; - profile::meter_manager meters(context.distributed); - meters.start(); + profile::meter_manager meters; + meters.start(context); // Create an instance of our recipe. bench_recipe recipe(params); - meters.checkpoint("recipe-build"); + meters.checkpoint("recipe-build", context); // Make the domain decomposition for the model - auto local = arb::local_allocation(context); - auto decomp = arb::partition_load_balance(recipe, local, context); - meters.checkpoint("domain-decomp"); + auto decomp = arb::partition_load_balance(recipe, context); + meters.checkpoint("domain-decomp", context); // Construct the model. arb::simulation sim(recipe, decomp, context); - meters.checkpoint("model-build"); + meters.checkpoint("model-build", context); // Run the simulation for 100 ms, with time steps of 0.01 ms. 
sim.run(params.duration, 0.01); - meters.checkpoint("model-run"); + meters.checkpoint("model-run", context); // write meters - auto report = profile::make_meter_report(meters); + auto report = profile::make_meter_report(meters, context); std::cout << report << "\n"; - if (is_root==0) { + if (is_root) { std::ofstream fid; fid.exceptions(std::ios_base::badbit | std::ios_base::failbit); fid.open("meters.json"); diff --git a/example/brunel/brunel_miniapp.cpp b/example/brunel/brunel_miniapp.cpp index 9a377a810446e9863904dc9ed36b1015f16893bf..0cf109ed488a25a5d64a703f511913ae25046766 100644 --- a/example/brunel/brunel_miniapp.cpp +++ b/example/brunel/brunel_miniapp.cpp @@ -6,6 +6,7 @@ #include <set> #include <vector> +#include <arbor/context.hpp> #include <arbor/common_types.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/event_generator.hpp> @@ -24,13 +25,14 @@ #include <aux/strsub.hpp> #ifdef ARB_MPI_ENABLED #include <aux/with_mpi.hpp> +#include <mpi.h> #endif #include "io.hpp" using namespace arb; -void banner(proc_allocation, const execution_context&); +void banner(const context& ctx); // Samples m unique values in interval [start, end) - gid. // We exclude gid because we don't want self-loops. @@ -183,25 +185,31 @@ private: }; int main(int argc, char** argv) { - execution_context context; + bool root = true; + int rank = 0; try { #ifdef ARB_MPI_ENABLED aux::with_mpi guard(argc, argv, false); - context.distributed = mpi_context(MPI_COMM_WORLD); -#endif -#ifdef ARB_PROFILE_ENABLED - profile::profiler_initialize(context.thread_pool); + auto context = arb::make_context(arb::proc_allocation(), MPI_COMM_WORLD); + { + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + root = rank==0; + } +#else + auto context = arb::make_context(); #endif - arb::profile::meter_manager meters(context.distributed); - meters.start(); - std::cout << aux::mask_stream(context.distributed->id()==0); + + std::cout << aux::mask_stream(root); + banner(context); + + arb::profile::meter_manager meters; + meters.start(context); + // read parameters - io::cl_options options = io::read_options(argc, argv, context.distributed->id()==0); - proc_allocation nd = local_allocation(context); - banner(nd, context); + io::cl_options options = io::read_options(argc, argv, root); - meters.checkpoint("setup"); + meters.checkpoint("setup", context); // The size of excitatory population. 
cell_size_type nexc = options.nexc; @@ -236,7 +244,7 @@ int main(int argc, char** argv) { partition_hint_map hints; hints[cell_kind::lif_neuron].cpu_group_size = group_size; - auto decomp = partition_load_balance(recipe, nd, context, hints); + auto decomp = partition_load_balance(recipe, context, hints); simulation sim(recipe, decomp, context); @@ -245,7 +253,6 @@ int main(int argc, char** argv) { if (options.spike_file_output) { using std::ios_base; - auto rank = context.distributed->id(); aux::path p = options.output_path; p /= aux::strsub("%_%.%", options.file_name, rank, options.file_extension); @@ -259,20 +266,20 @@ int main(int argc, char** argv) { } } - meters.checkpoint("model-init"); + meters.checkpoint("model-init", context); // run simulation sim.run(options.tfinal, options.dt); - meters.checkpoint("model-simulate"); + meters.checkpoint("model-simulate", context); // output profile and diagnostic feedback std::cout << profile::profiler_summary() << "\n"; std::cout << "\nThere were " << sim.num_spikes() << " spikes\n"; - auto report = profile::make_meter_report(meters); + auto report = profile::make_meter_report(meters, context); std::cout << report; - if (context.distributed->id()==0) { + if (root) { std::ofstream fid; fid.exceptions(std::ios_base::badbit | std::ios_base::failbit); fid.open("meters.json"); @@ -281,7 +288,7 @@ int main(int argc, char** argv) { } catch (io::usage_error& e) { // only print usage/startup errors on master - std::cerr << aux::mask_stream(context.distributed->id()==0); + std::cerr << aux::mask_stream(root); std::cerr << e.what() << "\n"; return 1; } @@ -292,13 +299,12 @@ int main(int argc, char** argv) { return 0; } -void banner(proc_allocation nd, const execution_context& ctx) { +void banner(const context& ctx) { std::cout << "==========================================\n"; - std::cout << " Arbor miniapp\n"; - std::cout << " - distributed : " << ctx.distributed->size() - << " (" << ctx.distributed->name() << ")\n"; - std::cout << " - threads : " << nd.num_threads << "\n"; - std::cout << " - gpus : " << nd.num_gpus << "\n"; + std::cout << " Brunel model miniapp\n"; + std::cout << " - distributed : " << arb::num_ranks(ctx) + << (arb::has_mpi(ctx)? " (mpi)": " (serial)") << "\n"; + std::cout << " - threads : " << arb::num_threads(ctx) << "\n"; + std::cout << " - gpus : " << (arb::has_gpu(ctx)? "yes": "no") << "\n"; std::cout << "==========================================\n"; } - diff --git a/example/generators/event_gen.cpp b/example/generators/event_gen.cpp index 6cd934d0f42e74682b5582b7ef76a95fe069e2dc..7ea031c489a00e3e922502eab4a71e41a2d40f53 100644 --- a/example/generators/event_gen.cpp +++ b/example/generators/event_gen.cpp @@ -12,8 +12,8 @@ #include <nlohmann/json.hpp> +#include <arbor/context.hpp> #include <arbor/common_types.hpp> -#include <arbor/distributed_context.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/event_generator.hpp> #include <arbor/load_balance.hpp> @@ -127,14 +127,13 @@ int main() { // A distributed_context is required for distributed computation (e.g. MPI). // For this simple one-cell example, non-distributed context is suitable, // which is what we get with a default-constructed distributed_context. - arb::execution_context context; + auto context = arb::make_context(); // Create an instance of our recipe. 
generator_recipe recipe; // Make the domain decomposition for the model - auto node = arb::local_allocation(context); - auto decomp = arb::partition_load_balance(recipe, node, context); + auto decomp = arb::partition_load_balance(recipe, context); // Construct the model. arb::simulation sim(recipe, decomp, context); diff --git a/example/miniapp/miniapp.cpp b/example/miniapp/miniapp.cpp index a84aedc18df4beab264bc55c78b0baf71a4c53cf..97530ed0cabff51a3d318ba6550581bf589becea 100644 --- a/example/miniapp/miniapp.cpp +++ b/example/miniapp/miniapp.cpp @@ -5,8 +5,7 @@ #include <memory> #include <vector> -#include <arbor/common_types.hpp> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include <arbor/load_balance.hpp> #include <arbor/mc_cell.hpp> #include <arbor/profile/meter_manager.hpp> @@ -25,6 +24,7 @@ #include <aux/strsub.hpp> #ifdef ARB_MPI_ENABLED #include <aux/with_mpi.hpp> +#include <mpi.h> #endif #include "io.hpp" @@ -35,40 +35,45 @@ using namespace arb; using util::any_cast; -void banner(proc_allocation, const execution_context&); +void banner(const context&); std::unique_ptr<recipe> make_recipe(const io::cl_options&, const probe_distribution&); sample_trace make_trace(const probe_info& probe); std::fstream& open_or_throw(std::fstream& file, const aux::path& p, bool exclusive = false); void report_compartment_stats(const recipe&); int main(int argc, char** argv) { - // default serial context - execution_context context; + bool root = true; + int rank = 0; try { #ifdef ARB_MPI_ENABLED aux::with_mpi guard(argc, argv, false); - context.distributed = mpi_context(MPI_COMM_WORLD); + auto context = arb::make_context(arb::proc_allocation(), MPI_COMM_WORLD); + { + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + root = rank==0; + } +#else + auto context = arb::make_context(); #endif #ifdef ARB_PROFILE_ENABLED - profile::profiler_initialize(context.thread_pool); + profile::profiler_initialize(context); #endif - profile::meter_manager meters(context.distributed); - meters.start(); + profile::meter_manager meters; + meters.start(context); + + std::cout << aux::mask_stream(root); - std::cout << aux::mask_stream(context.distributed->id()==0); // read parameters - io::cl_options options = io::read_options(argc, argv, context.distributed->id()==0); + io::cl_options options = io::read_options(argc, argv, root); // TODO: add dry run mode // Use a node description that uses the number of threads used by the // threading back end, and 1 gpu if available. - proc_allocation nd = local_allocation(context); - nd.num_gpus = nd.num_gpus>=1? 
1: 0; - banner(nd, context); + banner(context); - meters.checkpoint("setup"); + meters.checkpoint("setup", context); // determine what to attach probes to probe_distribution pdist; @@ -80,7 +85,7 @@ int main(int argc, char** argv) { report_compartment_stats(*recipe); } - auto decomp = partition_load_balance(*recipe, nd, context); + auto decomp = partition_load_balance(*recipe, context); simulation sim(*recipe, decomp, context); // Set up samplers for probes on local cable cells, as requested @@ -118,7 +123,6 @@ int main(int argc, char** argv) { if (options.spike_file_output) { using std::ios_base; - auto rank = context.distributed->id(); aux::path p = options.output_path; p /= aux::strsub("%_%.%", options.file_name, rank, options.file_extension); @@ -132,12 +136,12 @@ int main(int argc, char** argv) { } } - meters.checkpoint("model-init"); + meters.checkpoint("model-init", context); // run model sim.run(options.tfinal, options.dt); - meters.checkpoint("model-simulate"); + meters.checkpoint("model-simulate", context); // output profile and diagnostic feedback auto profile = profile::profiler_summary(); @@ -150,9 +154,9 @@ int main(int argc, char** argv) { write_trace(trace, options.trace_prefix); } - auto report = profile::make_meter_report(meters); + auto report = profile::make_meter_report(meters, context); std::cout << report; - if (context.distributed->id()==0) { + if (root) { std::ofstream fid; fid.exceptions(std::ios_base::badbit | std::ios_base::failbit); fid.open("meters.json"); @@ -161,7 +165,7 @@ int main(int argc, char** argv) { } catch (io::usage_error& e) { // only print usage/startup errors on master - std::cerr << aux::mask_stream(context.distributed->id()==0); + std::cerr << aux::mask_stream(root); std::cerr << e.what() << "\n"; return 1; } @@ -172,13 +176,13 @@ int main(int argc, char** argv) { return 0; } -void banner(proc_allocation nd, const execution_context& ctx) { +void banner(const context& ctx) { std::cout << "==========================================\n"; std::cout << " Arbor miniapp\n"; - std::cout << " - distributed : " << ctx.distributed->size() - << " (" << ctx.distributed->name() << ")\n"; - std::cout << " - threads : " << nd.num_threads << "\n"; - std::cout << " - gpus : " << nd.num_gpus << "\n"; + std::cout << " - distributed : " << arb::num_ranks(ctx) + << (arb::has_mpi(ctx)? " (mpi)": " (serial)") << "\n"; + std::cout << " - threads : " << arb::num_threads(ctx) << "\n"; + std::cout << " - gpus : " << (arb::has_gpu(ctx)? 
"yes": "no") << "\n"; std::cout << "==========================================\n"; } diff --git a/example/ring/ring.cpp b/example/ring/ring.cpp index 11caa577bdd361491f8804d3982c03fbc6e85ff4..30cd1c54f7efcea1bf608ee3fb40d670c6c8bb0e 100644 --- a/example/ring/ring.cpp +++ b/example/ring/ring.cpp @@ -11,19 +11,26 @@ #include <arbor/assert_macro.hpp> #include <arbor/common_types.hpp> -#include <arbor/distributed_context.hpp> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include <arbor/load_balance.hpp> #include <arbor/mc_cell.hpp> #include <arbor/profile/meter_manager.hpp> +#include <arbor/profile/profiler.hpp> #include <arbor/simple_sampler.hpp> #include <arbor/simulation.hpp> #include <arbor/recipe.hpp> +#include <arbor/version.hpp> +#include <aux/ioutil.hpp> #include <aux/json_meter.hpp> #include "parameters.hpp" +#ifdef ARB_MPI_ENABLED +#include <mpi.h> +#include <aux/with_mpi.hpp> +#endif + using arb::cell_gid_type; using arb::cell_lid_type; using arb::cell_size_type; @@ -106,25 +113,37 @@ private: }; struct cell_stats { - using size_type = std::uint64_t; + using size_type = unsigned; size_type ncells = 0; size_type nsegs = 0; size_type ncomp = 0; - cell_stats(arb::distributed_context_handle ctx, arb::recipe& r) { - size_type nranks = ctx->size(); - size_type rank = ctx->id(); + cell_stats(arb::recipe& r) { +#ifdef ARB_MPI_ENABLED + int nranks, rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); ncells = r.num_cells(); size_type cells_per_rank = ncells/nranks; size_type b = rank*cells_per_rank; size_type e = (rank==nranks-1)? ncells: (rank+1)*cells_per_rank; + size_type nsegs_tmp = 0; + size_type ncomp_tmp = 0; for (size_type i=b; i<e; ++i) { + auto c = arb::util::any_cast<arb::mc_cell>(r.get_cell_description(i)); + nsegs_tmp += c.num_segments(); + ncomp_tmp += c.num_compartments(); + } + MPI_Allreduce(&nsegs_tmp, &nsegs, 1, MPI_UNSIGNED, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&ncomp_tmp, &ncomp, 1, MPI_UNSIGNED, MPI_SUM, MPI_COMM_WORLD); +#else + ncells = r.num_cells(); + for (size_type i=0; i<ncells; ++i) { auto c = arb::util::any_cast<arb::mc_cell>(r.get_cell_description(i)); nsegs += c.num_segments(); ncomp += c.num_compartments(); } - nsegs = ctx->sum(nsegs); - ncomp = ctx->sum(ncomp); +#endif } friend std::ostream& operator<<(std::ostream& o, const cell_stats& s) { @@ -137,36 +156,44 @@ struct cell_stats { int main(int argc, char** argv) { - // default serial context - arb::execution_context context; - try { + bool root = true; + #ifdef ARB_MPI_ENABLED aux::with_mpi guard(argc, argv, false); - context.distributed = mpi_context(MPI_COMM_WORLD); + auto context = arb::make_context(arb::proc_allocation(), MPI_COMM_WORLD); + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + root = rank==0; + } +#else + auto context = arb::make_context(); #endif + #ifdef ARB_PROFILE_ENABLED - profile::profiler_initialize(context.thread_pool); + arb::profile::profiler_initialize(context); #endif - const bool root = context.distributed->id()==0; + std::cout << aux::mask_stream(root); + + // Print a banner with information about hardware configuration + std::cout << "gpu: " << (has_gpu(context)? "yes": "no") << "\n"; + std::cout << "threads: " << num_threads(context) << "\n"; + std::cout << "mpi: " << (has_mpi(context)? 
"yes": "no") << "\n"; + std::cout << "ranks: " << num_ranks(context) << "\n" << std::endl; auto params = read_options(argc, argv); - arb::profile::meter_manager meters(context.distributed); - meters.start(); + arb::profile::meter_manager meters; + meters.start(context); // Create an instance of our recipe. ring_recipe recipe(params.num_cells, params.cell, params.min_delay); - cell_stats stats(context.distributed, recipe); + cell_stats stats(recipe); std::cout << stats << "\n"; - // Use a node description that uses the number of threads used by the - // threading back end, and 1 gpu if available. - arb::proc_allocation nd = arb::local_allocation(context); - nd.num_gpus = nd.num_gpus>=1? 1: 0; - - auto decomp = arb::partition_load_balance(recipe, nd, context); + auto decomp = arb::partition_load_balance(recipe, context); // Construct the model. arb::simulation sim(recipe, decomp, context); @@ -191,19 +218,20 @@ int main(int argc, char** argv) { }); } - meters.checkpoint("model-init"); + meters.checkpoint("model-init", context); + std::cout << "running simulation" << std::endl; // Run the simulation for 100 ms, with time steps of 0.025 ms. sim.run(params.duration, 0.025); - meters.checkpoint("model-run"); + meters.checkpoint("model-run", context); auto ns = sim.num_spikes(); - std::cout << "\n" << ns << " spikes generated at rate of " - << params.duration/ns << " ms between spikes\n"; // Write spikes to file if (root) { + std::cout << "\n" << ns << " spikes generated at rate of " + << params.duration/ns << " ms between spikes\n"; std::ofstream fid("spikes.gdf"); if (!fid.good()) { std::cerr << "Warning: unable to open file spikes.gdf for spike output\n"; @@ -220,9 +248,9 @@ int main(int argc, char** argv) { } // Write the samples to a json file. - write_trace_json(voltage); + if (root) write_trace_json(voltage); - auto report = arb::profile::make_meter_report(meters); + auto report = arb::profile::make_meter_report(meters, context); std::cout << report; } catch (std::exception& e) { diff --git a/include/arbor/context.hpp b/include/arbor/context.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d781896c8ccc657aa1b88a3501e84264acee7fbc --- /dev/null +++ b/include/arbor/context.hpp @@ -0,0 +1,91 @@ +#pragma once + +#include <memory> + +namespace arb { + +/// Summary of all available local computation resource. +struct local_resources { + const unsigned num_threads; + const unsigned num_gpus; + + local_resources(unsigned threads, unsigned gpus): + num_threads(threads), + num_gpus(gpus) + {} +}; + +/// Determine available local domain resources. +local_resources get_local_resources(); + +/// A subset of local computation resources to use in a computation. +struct proc_allocation { + unsigned num_threads; + + // The gpu id corresponds to the `int device` parameter used by CUDA API calls + // to identify gpu devices. + // Set to -1 to indicate that no GPU device is to be used. + // see CUDA documenation for cudaSetDevice and cudaDeviceGetAttribute + int gpu_id; + + // By default a proc_allocation will take all available threads and the + // GPU with id 0, if available. + proc_allocation() { + auto avail = get_local_resources(); + + // By default take all available threads. + num_threads = avail.num_threads; + + // Take the first GPU, if available. + gpu_id = avail.num_gpus>0? 
0: -1; + } + + proc_allocation(unsigned threads, int gpu): + num_threads(threads), + gpu_id(gpu) + {} + + bool has_gpu() const { + return gpu_id>=0; + } +}; + +// arb::execution_context is a container defined in the implementation for state +// related to execution resources, specifically thread pools, gpus and MPI +// communicators. + +// Forward declare execution_context. +struct execution_context; + +// arb::context is an opaque handle for this container presented in the +// public API. +// It doesn't make sense to copy contexts, so we use a std::unique_ptr to +// implement the handle with lifetime management. +// +// Because execution_context is an incomplete type, a destructor prototype must +// be provided. +using context = std::unique_ptr<execution_context, void (*)(execution_context*)>; + + +// Helpers for creating contexts. These are implemented in the back end. + +// Non-distributed context that uses all detected threads and one GPU if available. +context make_context(); + +// Non-distributed context that uses resources described by resources +context make_context(const proc_allocation& resources); + +// Distributed context that uses MPI communicator comm, and local resources +// described by resources. +template <typename Comm> +context make_context(const proc_allocation& resources, Comm comm); + +// Queries for properties of execution resources in a context. + +bool has_gpu(const context&); +unsigned num_threads(const context&); +bool has_mpi(const context&); +unsigned num_ranks(const context&); +unsigned rank(const context&); + +} diff --git a/include/arbor/domain_decomposition.hpp b/include/arbor/domain_decomposition.hpp index 2aa14eafbb751401699e3cf7daf6493782017e0b..619526c5e6b55fbec6f2d1aca04c66b15780d148 100644 --- a/include/arbor/domain_decomposition.hpp +++ b/include/arbor/domain_decomposition.hpp @@ -6,19 +6,10 @@ #include <arbor/assert.hpp> #include <arbor/common_types.hpp> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> namespace arb { -/// Local resource info for domain partitioning. -struct proc_allocation { - unsigned num_threads = 1; - unsigned num_gpus = 0; -}; - -/// Determine available local domain resources. -proc_allocation local_allocation(const execution_context& ctx); - /// Metadata for a local cell group. struct group_description { /// The kind of cell in the group. All cells in a cell_group have the same type. 
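For reference, a minimal sketch of how the public API declared in the new include/arbor/context.hpp is meant to be used; it relies only on the declarations above, and the CPU-only choice (gpu_id = -1) is just for illustration. The MPI overload make_context(resources, MPI_COMM_WORLD) used by the examples in this patch follows the same pattern.

    #include <iostream>
    #include <arbor/context.hpp>

    int main() {
        // Default proc_allocation: all detected threads, plus the first GPU if one is present.
        arb::proc_allocation resources;
        resources.gpu_id = -1;                   // run without a GPU for this sketch

        auto ctx = arb::make_context(resources); // arb::make_context() would use the defaults

        std::cout << "threads: " << arb::num_threads(ctx) << "\n"
                  << "gpu:     " << (arb::has_gpu(ctx)? "yes": "no") << "\n"
                  << "mpi:     " << (arb::has_mpi(ctx)? "yes": "no") << "\n"
                  << "ranks:   " << arb::num_ranks(ctx) << "\n";
    }
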
diff --git a/include/arbor/execution_context.hpp b/include/arbor/execution_context.hpp deleted file mode 100644 index 3a457a2a92c579b5a6a89e3802ddd4c84710bb44..0000000000000000000000000000000000000000 --- a/include/arbor/execution_context.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include <memory> -#include <string> - -#include <arbor/distributed_context.hpp> -#include <arbor/util/pp_util.hpp> - - -namespace arb { -namespace threading { - class task_system; -} -struct gpu_context; - -using task_system_handle = std::shared_ptr<threading::task_system>; -using distributed_context_handle = std::shared_ptr<distributed_context>; -using gpu_context_handle = std::shared_ptr<gpu_context>; - -task_system_handle make_thread_pool(); -task_system_handle make_thread_pool(int nthreads); - -gpu_context_handle make_gpu_context(); - -struct execution_context { - distributed_context_handle distributed; - task_system_handle thread_pool; - gpu_context_handle gpu; - - execution_context(): distributed(std::make_shared<distributed_context>()), - thread_pool(arb::make_thread_pool()), - gpu(arb::make_gpu_context()) {}; -}; - -} diff --git a/include/arbor/load_balance.hpp b/include/arbor/load_balance.hpp index 004445cceb35c78c6bc47487640dc6b0e46fd9ab..8849f45c5a2e075eb16702d45b2fe92c0faf0f9b 100644 --- a/include/arbor/load_balance.hpp +++ b/include/arbor/load_balance.hpp @@ -1,6 +1,6 @@ #pragma once -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/recipe.hpp> @@ -18,8 +18,7 @@ using partition_hint_map = std::unordered_map<cell_kind, partition_hint>; domain_decomposition partition_load_balance( const recipe& rec, - proc_allocation nd, - const execution_context& ctx, + const context& ctx, partition_hint_map hint_map = {}); } // namespace arb diff --git a/include/arbor/profile/meter_manager.hpp b/include/arbor/profile/meter_manager.hpp index 454d8dffdf6cd2d48f09176de2a93bdd3a026c5d..c264660e7f306cc94c5c342796f1ddbe3b1687d3 100644 --- a/include/arbor/profile/meter_manager.hpp +++ b/include/arbor/profile/meter_manager.hpp @@ -4,7 +4,7 @@ #include <string> #include <vector> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include <arbor/profile/meter.hpp> #include <arbor/profile/timer.hpp> @@ -25,7 +25,7 @@ struct measurement { std::string name; std::string units; std::vector<std::vector<double>> measurements; - measurement(std::string, std::string, const std::vector<double>&, const distributed_context_handle&); + measurement(std::string, std::string, const std::vector<double>&, const context&); }; class meter_manager { @@ -38,17 +38,16 @@ private: std::vector<std::unique_ptr<meter>> meters_; std::vector<std::string> checkpoint_names_; - distributed_context_handle glob_ctx_; - public: - meter_manager(distributed_context_handle ctx); - void start(); - void checkpoint(std::string name); - distributed_context_handle context() const; + meter_manager(); + void start(const context& ctx); + void checkpoint(std::string name, const context& ctx); const std::vector<std::unique_ptr<meter>>& meters() const; const std::vector<std::string>& checkpoint_names() const; const std::vector<double>& times() const; + + const context& ctx() const; }; // Simple type for gathering distributed meter information @@ -60,7 +59,7 @@ struct meter_report { std::vector<std::string> hosts; }; -meter_report make_meter_report(const meter_manager& manager); +meter_report make_meter_report(const meter_manager& manager, const context& ctx); 
std::ostream& operator<<(std::ostream& o, const meter_report& report); } // namespace profile diff --git a/include/arbor/profile/profiler.hpp b/include/arbor/profile/profiler.hpp index fa8da040f7ae3ae4171afec667f510910b4b9970..9dbd2c61e910311f5b4fc4eb242d303957077109 100644 --- a/include/arbor/profile/profiler.hpp +++ b/include/arbor/profile/profiler.hpp @@ -5,10 +5,11 @@ #include <unordered_map> #include <vector> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include <arbor/profile/timer.hpp> namespace arb { + namespace profile { // type used for region identifiers @@ -33,7 +34,7 @@ struct profile { }; void profiler_clear(); -void profiler_initialize(task_system_handle& ts); +void profiler_initialize(context& ctx); void profiler_enter(std::size_t region_id); void profiler_leave(); diff --git a/include/arbor/simulation.hpp b/include/arbor/simulation.hpp index 083409a655ee22e3d74f6f9b65da9761b7974883..bff327d1d4c2cbc2cb439eec843cb79bb1eda26b 100644 --- a/include/arbor/simulation.hpp +++ b/include/arbor/simulation.hpp @@ -6,11 +6,12 @@ #include <vector> #include <arbor/common_types.hpp> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/recipe.hpp> #include <arbor/sampling.hpp> #include <arbor/schedule.hpp> +#include <arbor/spike.hpp> #include <arbor/util/handle_set.hpp> namespace arb { @@ -22,7 +23,7 @@ class simulation_state; class simulation { public: - simulation(const recipe& rec, const domain_decomposition& decomp, execution_context ctx); + simulation(const recipe& rec, const domain_decomposition& decomp, const context& ctx); void reset(); diff --git a/test/unit-distributed/distributed_listener.cpp b/test/unit-distributed/distributed_listener.cpp index 1110dfeba0dcdd30889c368968e8e45cb861b65e..075d00528867077ebccaca70cc72a7de2ec0e9b2 100644 --- a/test/unit-distributed/distributed_listener.cpp +++ b/test/unit-distributed/distributed_listener.cpp @@ -25,10 +25,11 @@ distributed_listener::printer& operator<<(distributed_listener::printer& p, cons return p; } -distributed_listener::distributed_listener(std::string f_base, arb::distributed_context_handle ctx): +distributed_listener::distributed_listener(std::string f_base, const arb::context& ctx): context_(ctx), - rank_(context_->id()), - size_(context_->size()), + rank_(arb::rank(ctx)), + size_(arb::num_ranks(ctx)), + mpi_(arb::has_mpi(ctx)), emit_(std::move(f_base), rank_) {} @@ -93,7 +94,13 @@ void distributed_listener::OnTestEnd(const TestInfo& test_info) { ++test_case_tests_; // count the number of ranks that had errors - int global_errors = context_->sum(test_failures_>0 ? 1 : 0); + int global_errors = test_failures_? 1: 0; +#ifdef ARB_MPI_ENABLED + if (mpi_) { + int local_error = test_failures_? 
1: 0; + MPI_Allreduce(&local_error, &global_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + } +#endif if (global_errors>0) { ++test_case_failures_; emit_ << " GLOBAL_FAIL on " << global_errors << "ranks\n"; diff --git a/test/unit-distributed/distributed_listener.hpp b/test/unit-distributed/distributed_listener.hpp index b94acea908d19cd86387079dd99984122d581f07..48c9a48748a8bbd78982f08f1804170d55459f4b 100644 --- a/test/unit-distributed/distributed_listener.hpp +++ b/test/unit-distributed/distributed_listener.hpp @@ -4,7 +4,7 @@ #include <string> #include <utility> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include "../gtest.h" @@ -29,7 +29,7 @@ class distributed_listener: public testing::EmptyTestEventListener { using TestPartResult = testing::TestPartResult; public: - distributed_listener(std::string f_base, arb::distributed_context_handle ctx); + distributed_listener(std::string f_base, const arb::context &ctx); /// Messages that are printed at the start and end of the test program. /// i.e. once only. @@ -57,16 +57,19 @@ private: std::ofstream fid_; bool cout_; + printer() = default; printer(std::string base_name, int rank); }; template <typename T> friend printer& operator<<(printer&, const T&); - arb::distributed_context_handle context_; + const arb::context& context_; int rank_; int size_; + bool mpi_; printer emit_; + int test_case_failures_; int test_case_tests_; int test_failures_; diff --git a/test/unit-distributed/test.cpp b/test/unit-distributed/test.cpp index 085a8990b2ee29e3e76df199a98ce7bd5816be89..4f116af81ee07a65b725447f79843f52ce82e03d 100644 --- a/test/unit-distributed/test.cpp +++ b/test/unit-distributed/test.cpp @@ -5,7 +5,7 @@ #include "../gtest.h" -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include <aux/ioutil.hpp> #include <aux/tinyopt.hpp> @@ -13,11 +13,16 @@ #include <aux/with_mpi.hpp> #endif +#include "distributed_context.hpp" +#include "execution_context.hpp" + #include "distributed_listener.hpp" +#include "test.hpp" + using namespace arb; -execution_context g_context; +context g_context = make_context(); const char* usage_str = "[OPTION]...\n" @@ -26,11 +31,14 @@ const char* usage_str = " -h, --help Display usage information and exit\n"; int main(int argc, char **argv) { + proc_allocation alloc; + alloc.gpu_id = -1; + #ifdef TEST_MPI aux::with_mpi guard(argc, argv, false); - g_context.distributed = mpi_context(MPI_COMM_WORLD); + g_context = arb::make_context(alloc, MPI_COMM_WORLD); #elif defined(TEST_LOCAL) - g_context.distributed = std::make_shared<distributed_context>(local_context()); + g_context = arb::make_context(alloc); #else #error "define TEST_MPI or TEST_LOCAL for distributed test" #endif @@ -42,7 +50,7 @@ int main(int argc, char **argv) { auto& listeners = testing::UnitTest::GetInstance()->listeners(); // replace original printer with our custom printer delete listeners.Release(listeners.default_result_printer()); - listeners.Append(new distributed_listener("run_"+g_context.distributed->name(), g_context.distributed)); + listeners.Append(new distributed_listener("run_"+g_context->distributed->name(), g_context)); int return_value = 0; try { @@ -84,5 +92,5 @@ int main(int argc, char **argv) { // perform global collective, to ensure that all ranks return // the same exit code - return g_context.distributed->max(return_value); + return g_context->distributed->max(return_value); } diff --git a/test/unit-distributed/test.hpp b/test/unit-distributed/test.hpp index
630bd188258e8fd5a57346be3e2182faa1203e8a..3cb3cb38eff57e85b0e1f1d5753936f8c3bef413 100644 --- a/test/unit-distributed/test.hpp +++ b/test/unit-distributed/test.hpp @@ -1,7 +1,7 @@ #pragma once -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> // Global context is a global variable, set in the main() funtion of the main // test driver test.cpp. -extern arb::execution_context g_context; +extern arb::context g_context; diff --git a/test/unit-distributed/test_communicator.cpp b/test/unit-distributed/test_communicator.cpp index 14a5bc0b36b7b63e2426340dd6cb46e844ddb738..e40a13acee3b8906b9440a60663832973a7c9429 100644 --- a/test/unit-distributed/test_communicator.cpp +++ b/test/unit-distributed/test_communicator.cpp @@ -10,25 +10,23 @@ #include <threading/threading.hpp> #include "communication/communicator.hpp" +#include "execution_context.hpp" #include "util/filter.hpp" #include "util/rangeutil.hpp" #include "util/span.hpp" using namespace arb; -static bool is_dry_run() { - //return global_policy::kind() == global_policy_kind::dryrun; - return false; -} - TEST(communicator, policy_basics) { - const auto num_domains = g_context.distributed->size(); - const auto rank = g_context.distributed->id(); - EXPECT_EQ(g_context.distributed->min(rank), 0); - if (!is_dry_run()) { - EXPECT_EQ(g_context.distributed->max(rank), num_domains-1); - } + const int num_domains = g_context->distributed->size(); + const int rank = g_context->distributed->id();; + + EXPECT_EQ((int)arb::num_ranks(g_context), num_domains); + EXPECT_EQ((int)arb::rank(g_context), rank); + + EXPECT_EQ(g_context->distributed->min(rank), 0); + EXPECT_EQ(g_context->distributed->max(rank), num_domains-1); } // Wrappers for creating and testing spikes used @@ -51,19 +49,11 @@ int get_value(const arb::spike& s) { // Test low level spike_gather function when each domain produces the same // number of spikes in the pattern used by dry run mode. TEST(communicator, gather_spikes_equal) { - const auto num_domains = g_context.distributed->size(); - const auto rank = g_context.distributed->id(); + const auto num_domains = g_context->distributed->size(); + const auto rank = g_context->distributed->id(); const auto n_local_spikes = 10; - /* - const auto n_local_cells = n_local_spikes; - // Important: set up meta-data in dry run back end. - if (is_dry_run()) { - g_context.set_sizes(g_context.size(), n_local_cells); - } - */ - // Create local spikes for communication. 
std::vector<spike> local_spikes; for (auto i=0; i<n_local_spikes; ++i) { @@ -71,7 +61,7 @@ TEST(communicator, gather_spikes_equal) { } // Perform exchange - const auto global_spikes = g_context.distributed->gather_spikes(local_spikes); + const auto global_spikes = g_context->distributed->gather_spikes(local_spikes); // Test that partition information is correct const auto& part = global_spikes.partition(); @@ -91,29 +81,19 @@ TEST(communicator, gather_spikes_equal) { // is a list of num_domains*n_local_spikes spikes that have // contiguous source gid const auto& spikes = global_spikes.values(); - EXPECT_EQ(n_local_spikes*g_context.distributed->size(), int(spikes.size())); + EXPECT_EQ(n_local_spikes*g_context->distributed->size(), int(spikes.size())); for (auto i=0u; i<spikes.size(); ++i) { const auto s = spikes[i]; EXPECT_EQ(i, unsigned(s.source.gid)); - if (is_dry_run()) { - EXPECT_EQ(0, get_value(s)); - } - else { - EXPECT_EQ(int(i)/n_local_spikes, get_value(s)); - } + EXPECT_EQ(int(i)/n_local_spikes, get_value(s)); } } // Test low level spike_gather function when the number of spikes per domain // are not equal. TEST(communicator, gather_spikes_variant) { - // This test does not apply if in dry run mode. - // Because dry run mode requires that each domain have the same - // number of spikes. - if (is_dry_run()) return; - - const auto num_domains = g_context.distributed->size(); - const auto rank = g_context.distributed->id(); + const auto num_domains = g_context->distributed->size(); + const auto rank = g_context->distributed->id(); // Parameter used to scale the number of spikes generated on successive // ranks. @@ -137,7 +117,7 @@ TEST(communicator, gather_spikes_variant) { } // Perform exchange - const auto global_spikes = g_context.distributed->gather_spikes(local_spikes); + const auto global_spikes = g_context->distributed->gather_spikes(local_spikes); // Test that partition information is correct const auto& part =global_spikes.partition(); @@ -167,7 +147,7 @@ namespace { public: ring_recipe(cell_size_type s): size_(s), - ranks_(g_context.distributed->size()) + ranks_(g_context->distributed->size()) {} cell_size_type num_cells() const override { @@ -231,7 +211,7 @@ namespace { public: all2all_recipe(cell_size_type s): size_(s), - ranks_(g_context.distributed->size()) + ranks_(g_context->distributed->size()) {} cell_size_type num_cells() const override { @@ -314,10 +294,10 @@ test_ring(const domain_decomposition& D, communicator& C, F&& f) { // gather the global set of spikes auto global_spikes = C.exchange(local_spikes); - if (global_spikes.size()!=g_context.distributed->sum(local_spikes.size())) { + if (global_spikes.size()!=g_context->distributed->sum(local_spikes.size())) { return ::testing::AssertionFailure() << "the number of gathered spikes " << global_spikes.size() << " doesn't match the expected " - << g_context.distributed->sum(local_spikes.size()); + << g_context->distributed->sum(local_spikes.size()); } // generate the events @@ -363,7 +343,7 @@ TEST(communicator, ring) using util::make_span; // construct a homogeneous network of 10*n_domain identical cells in a ring - unsigned N = g_context.distributed->size(); + unsigned N = g_context->distributed->size(); unsigned n_local = 10u; unsigned n_global = n_local*N; @@ -371,8 +351,8 @@ TEST(communicator, ring) auto R = ring_recipe(n_global); // use a node decomposition that reflects the resources available // on the node that the test is running on, including gpus. 
- const auto D = partition_load_balance(R, local_allocation(g_context), g_context); - auto C = communicator(R, D, g_context); + const auto D = partition_load_balance(R, g_context); + auto C = communicator(R, D, *g_context); // every cell fires EXPECT_TRUE(test_ring(D, C, [](cell_gid_type g){return true;})); @@ -405,10 +385,10 @@ test_all2all(const domain_decomposition& D, communicator& C, F&& f) { // gather the global set of spikes auto global_spikes = C.exchange(local_spikes); - if (global_spikes.size()!=g_context.distributed->sum(local_spikes.size())) { + if (global_spikes.size()!=g_context->distributed->sum(local_spikes.size())) { return ::testing::AssertionFailure() << "the number of gathered spikes " << global_spikes.size() << " doesn't match the expected " - << g_context.distributed->sum(local_spikes.size()); + << g_context->distributed->sum(local_spikes.size()); } // generate the events @@ -458,7 +438,7 @@ TEST(communicator, all2all) using util::make_span; // construct a homogeneous network of 10*n_domain identical cells in a ring - unsigned N = g_context.distributed->size(); + unsigned N = g_context->distributed->size(); unsigned n_local = 10u; unsigned n_global = n_local*N; @@ -466,8 +446,8 @@ TEST(communicator, all2all) auto R = all2all_recipe(n_global); // use a node decomposition that reflects the resources available // on the node that the test is running on, including gpus. - const auto D = partition_load_balance(R, local_allocation(g_context), g_context); - auto C = communicator(R, D, g_context); + const auto D = partition_load_balance(R, g_context); + auto C = communicator(R, D, *g_context); // every cell fires EXPECT_TRUE(test_all2all(D, C, [](cell_gid_type g){return true;})); diff --git a/test/unit-distributed/test_domain_decomposition.cpp b/test/unit-distributed/test_domain_decomposition.cpp index 2d513c1741040cd0c1c54070bcb0d6e998fc685e..43f3127f7772a680af3285333dfad03e10e198b7 100644 --- a/test/unit-distributed/test_domain_decomposition.cpp +++ b/test/unit-distributed/test_domain_decomposition.cpp @@ -7,14 +7,20 @@ #include <string> #include <vector> +#include <arbor/context.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/load_balance.hpp> +#include <arbor/version.hpp> #include "util/span.hpp" #include "../simple_recipes.hpp" #include "test.hpp" +#ifdef TEST_MPI +#include <mpi.h> +#endif + using namespace arb; namespace { @@ -63,120 +69,150 @@ namespace { }; } -TEST(domain_decomposition, homogeneous_population) { - const auto N = g_context.distributed->size(); - const auto I = g_context.distributed->id(); - - { // Test on a node with 1 cpu core and no gpus. - // We assume that all cells will be put into cell groups of size 1. - // This assumption will not hold in the future, requiring and update to - // the test. - proc_allocation nd{1, 0}; - - // 10 cells per domain - unsigned n_local = 10; - unsigned n_global = n_local*N; - const auto D = partition_load_balance(homo_recipe(n_global, dummy_cell{}), nd, g_context); - - EXPECT_EQ(D.num_global_cells, n_global); - EXPECT_EQ(D.num_local_cells, n_local); - EXPECT_EQ(D.groups.size(), n_local); +TEST(domain_decomposition, homogeneous_population_mc) { + // Test on a node with 1 cpu core and no gpus. + // We assume that all cells will be put into cell groups of size 1. + // This assumption will not hold in the future, requiring and update to + // the test. 
+ proc_allocation resources{1, -1}; +#ifdef ARB_TEST_MPI + auto ctx = make_context(resources, MPI_COMM_WORLD); +#else + auto ctx = make_context(resources); +#endif + + const unsigned N = arb::num_ranks(ctx); + const unsigned I = arb::rank(ctx); + + // 10 cells per domain + unsigned n_local = 10; + unsigned n_global = n_local*N; + const auto D = partition_load_balance(homo_recipe(n_global, dummy_cell{}), ctx); + + EXPECT_EQ(D.num_global_cells, n_global); + EXPECT_EQ(D.num_local_cells, n_local); + EXPECT_EQ(D.groups.size(), n_local); + + auto b = I*n_local; + auto e = (I+1)*n_local; + auto gids = util::make_span(b, e); + for (auto gid: gids) { + EXPECT_EQ(I, (unsigned)D.gid_domain(gid)); + } - auto b = I*n_local; - auto e = (I+1)*n_local; - auto gids = util::make_span(b, e); - for (auto gid: gids) { - EXPECT_EQ(I, D.gid_domain(gid)); - } + // Each cell group contains 1 cell of kind cable1d_neuron + // Each group should also be tagged for cpu execution + for (auto i: gids) { + auto local_group = i-b; + auto& grp = D.groups[local_group]; + EXPECT_EQ(grp.gids.size(), 1u); + EXPECT_EQ(grp.gids.front(), unsigned(i)); + EXPECT_EQ(grp.backend, backend_kind::multicore); + EXPECT_EQ(grp.kind, cell_kind::cable1d_neuron); + } +} - // Each cell group contains 1 cell of kind cable1d_neuron - // Each group should also be tagged for cpu execution - for (auto i: gids) { - auto local_group = i-b; - auto& grp = D.groups[local_group]; - EXPECT_EQ(grp.gids.size(), 1u); - EXPECT_EQ(grp.gids.front(), unsigned(i)); - EXPECT_EQ(grp.backend, backend_kind::multicore); - EXPECT_EQ(grp.kind, cell_kind::cable1d_neuron); - } +#ifdef ARB_GPU_ENABLED +TEST(domain_decomposition, homogeneous_population_gpu) { + // TODO: skip this test + // * when the ability to skip tests at runtime is added to Google Test. + // * when a GPU is not available + // https://github.com/google/googletest/pull/1544 + + // Test on a node with 1 gpu and 1 cpu core. + // Assumes that all cells will be placed on gpu in a single group. + + proc_allocation resources; + resources.num_threads = 1; +#ifdef ARB_TEST_MPI + auto ctx = make_context(resources, MPI_COMM_WORLD); +#else + auto ctx = make_context(resources); +#endif + + const unsigned N = arb::num_ranks(ctx); + const unsigned I = arb::rank(ctx); + + if (!resources.has_gpu()) return; // Skip if no gpu available. + + // 10 cells per domain + unsigned n_local = 10; + unsigned n_global = n_local*N; + const auto D = partition_load_balance(homo_recipe(n_global, dummy_cell{}), ctx); + + EXPECT_EQ(D.num_global_cells, n_global); + EXPECT_EQ(D.num_local_cells, n_local); + EXPECT_EQ(D.groups.size(), 1u); + + auto b = I*n_local; + auto e = (I+1)*n_local; + auto gids = util::make_span(b, e); + for (auto gid: gids) { + EXPECT_EQ(I, (unsigned)D.gid_domain(gid)); } - { // Test on a node with 1 gpu and 1 cpu core. - // Assumes that all cells will be placed on gpu in a single group. 
- proc_allocation nd{1, 1}; - - // 10 cells per domain - unsigned n_local = 10; - unsigned n_global = n_local*N; - const auto D = partition_load_balance(homo_recipe(n_global, dummy_cell{}), nd, g_context); - - EXPECT_EQ(D.num_global_cells, n_global); - EXPECT_EQ(D.num_local_cells, n_local); - EXPECT_EQ(D.groups.size(), 1u); - - auto b = I*n_local; - auto e = (I+1)*n_local; - auto gids = util::make_span(b, e); - for (auto gid: gids) { - EXPECT_EQ(I, D.gid_domain(gid)); - } - // Each cell group contains 1 cell of kind cable1d_neuron - // Each group should also be tagged for cpu execution - auto grp = D.groups[0u]; + // Each cell group contains 1 cell of kind cable1d_neuron + // Each group should also be tagged for cpu execution + auto grp = D.groups[0u]; - EXPECT_EQ(grp.gids.size(), n_local); - EXPECT_EQ(grp.gids.front(), b); - EXPECT_EQ(grp.gids.back(), e-1); - EXPECT_EQ(grp.backend, backend_kind::gpu); - EXPECT_EQ(grp.kind, cell_kind::cable1d_neuron); - } + EXPECT_EQ(grp.gids.size(), n_local); + EXPECT_EQ(grp.gids.front(), b); + EXPECT_EQ(grp.gids.back(), e-1); + EXPECT_EQ(grp.backend, backend_kind::gpu); + EXPECT_EQ(grp.kind, cell_kind::cable1d_neuron); } +#endif TEST(domain_decomposition, heterogeneous_population) { - const auto N = g_context.distributed->size(); - const auto I = g_context.distributed->id(); - - { // Test on a node with 1 cpu core and no gpus. - // We assume that all cells will be put into cell groups of size 1. - // This assumption will not hold in the future, requiring and update to - // the test. - proc_allocation nd{1, 0}; - - // 10 cells per domain - const unsigned n_local = 10; - const unsigned n_global = n_local*N; - const unsigned n_local_grps = n_local; // 1 cell per group - auto R = hetero_recipe(n_global); - const auto D = partition_load_balance(R, nd, g_context); - - EXPECT_EQ(D.num_global_cells, n_global); - EXPECT_EQ(D.num_local_cells, n_local); - EXPECT_EQ(D.groups.size(), n_local); - - auto b = I*n_local; - auto e = (I+1)*n_local; - auto gids = util::make_span(b, e); - for (auto gid: gids) { - EXPECT_EQ(I, D.gid_domain(gid)); - } + // Test on a node with 1 cpu core and no gpus. + // We assume that all cells will be put into cell groups of size 1. + // This assumption will not hold in the future, requiring and update to + // the test. 
+ proc_allocation resources{1, -1}; +#ifdef ARB_TEST_MPI + auto ctx = make_context(resources, MPI_COMM_WORLD); +#else + auto ctx = make_context(resources); +#endif + + const auto N = arb::num_ranks(ctx); + const auto I = arb::rank(ctx); + + + // 10 cells per domain + const unsigned n_local = 10; + const unsigned n_global = n_local*N; + const unsigned n_local_grps = n_local; // 1 cell per group + auto R = hetero_recipe(n_global); + const auto D = partition_load_balance(R, ctx); + + EXPECT_EQ(D.num_global_cells, n_global); + EXPECT_EQ(D.num_local_cells, n_local); + EXPECT_EQ(D.groups.size(), n_local); + + auto b = I*n_local; + auto e = (I+1)*n_local; + auto gids = util::make_span(b, e); + for (auto gid: gids) { + EXPECT_EQ(I, (unsigned)D.gid_domain(gid)); + } - // Each cell group contains 1 cell of kind cable1d_neuron - // Each group should also be tagged for cpu execution - auto grps = util::make_span(0, n_local_grps); - std::map<cell_kind, std::set<cell_gid_type>> kind_lists; - for (auto i: grps) { - auto& grp = D.groups[i]; - EXPECT_EQ(grp.gids.size(), 1u); - kind_lists[grp.kind].insert(grp.gids.front()); - EXPECT_EQ(grp.backend, backend_kind::multicore); - } + // Each cell group contains 1 cell of kind cable1d_neuron + // Each group should also be tagged for cpu execution + auto grps = util::make_span(0, n_local_grps); + std::map<cell_kind, std::set<cell_gid_type>> kind_lists; + for (auto i: grps) { + auto& grp = D.groups[i]; + EXPECT_EQ(grp.gids.size(), 1u); + kind_lists[grp.kind].insert(grp.gids.front()); + EXPECT_EQ(grp.backend, backend_kind::multicore); + } - for (auto k: {cell_kind::cable1d_neuron, cell_kind::spike_source}) { - const auto& gids = kind_lists[k]; - EXPECT_EQ(gids.size(), n_local/2); - for (auto gid: gids) { - EXPECT_EQ(k, R.get_cell_kind(gid)); - } + for (auto k: {cell_kind::cable1d_neuron, cell_kind::spike_source}) { + const auto& gids = kind_lists[k]; + EXPECT_EQ(gids.size(), n_local/2); + for (auto gid: gids) { + EXPECT_EQ(k, R.get_cell_kind(gid)); } } } diff --git a/test/unit/test_backend.cpp b/test/unit/test_backend.cpp index 48f5836c2e90cb1b979f636eda7bc6d3086aade6..b43b7554284b2031cc768fa910608737a1f9db5e 100644 --- a/test/unit/test_backend.cpp +++ b/test/unit/test_backend.cpp @@ -1,6 +1,7 @@ #include <arbor/common_types.hpp> #include <arbor/version.hpp> +#include "execution_context.hpp" #include "fvm_lowered_cell.hpp" #include "../gtest.h" diff --git a/test/unit/test_domain_decomposition.cpp b/test/unit/test_domain_decomposition.cpp index ac8b4a2d6252d4f6591674a1c0975a05251d3028..a35cef75753a0d221b65dd389ed8761e2095a749 100644 --- a/test/unit/test_domain_decomposition.cpp +++ b/test/unit/test_domain_decomposition.cpp @@ -2,7 +2,7 @@ #include <stdexcept> -#include <arbor/execution_context.hpp> +#include <arbor/context.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/load_balance.hpp> @@ -13,6 +13,17 @@ using namespace arb; using arb::util::make_span; +// TODO +// The tests here will only test domain decomposition with GPUs when compiled +// with CUDA support and run on a system with a GPU. +// Ideally the tests should test domain decompositions under all conditions, however +// to do that we have to refactor the partition_load_balance algorithm. +// The partition_load_balance performs the decomposition to distribute +// over resources described by the user-supplied arb::context, which is a +// provides an interface to resources available at runtime. 
+// The best way to test under all conditions, is probably to refactor the +// partition_load_balance into components that can be tested in isolation. + namespace { // Dummy recipes types for testing. @@ -48,42 +59,16 @@ namespace { // test assumes one domain TEST(domain_decomposition, homogenous_population) { - execution_context context; - - { // Test on a node with 1 cpu core and no gpus. - // We assume that all cells will be put into cell groups of size 1. - // This assumption will not hold in the future, requiring and update to - // the test. - proc_allocation nd{1, 0}; - - unsigned num_cells = 10; - const auto D = partition_load_balance(homo_recipe(num_cells, dummy_cell{}), nd, context); - - EXPECT_EQ(D.num_global_cells, num_cells); - EXPECT_EQ(D.num_local_cells, num_cells); - EXPECT_EQ(D.groups.size(), num_cells); + proc_allocation resources; + resources.num_threads = 1; - auto gids = make_span(num_cells); - for (auto gid: gids) { - EXPECT_EQ(0, D.gid_domain(gid)); - } - - // Each cell group contains 1 cell of kind cable1d_neuron - // Each group should also be tagged for cpu execution - for (auto i: gids) { - auto& grp = D.groups[i]; - EXPECT_EQ(grp.gids.size(), 1u); - EXPECT_EQ(grp.gids.front(), unsigned(i)); - EXPECT_EQ(grp.backend, backend_kind::multicore); - EXPECT_EQ(grp.kind, cell_kind::cable1d_neuron); - } - } - { // Test on a node with 1 gpu and 1 cpu core. + if (resources.has_gpu()) { + // Test on a node with 1 gpu and 1 cpu core. // Assumes that all cells will be placed on gpu in a single group. - proc_allocation nd{1, 1}; + auto ctx = make_context(resources); unsigned num_cells = 10; - const auto D = partition_load_balance(homo_recipe(num_cells, dummy_cell{}), nd, context); + const auto D = partition_load_balance(homo_recipe(num_cells, dummy_cell{}), ctx); EXPECT_EQ(D.num_global_cells, num_cells); EXPECT_EQ(D.num_local_cells, num_cells); @@ -104,21 +89,17 @@ TEST(domain_decomposition, homogenous_population) EXPECT_EQ(grp.backend, backend_kind::gpu); EXPECT_EQ(grp.kind, cell_kind::cable1d_neuron); } -} + { + resources.gpu_id = -1; // disable GPU if available + auto ctx = make_context(resources); -TEST(domain_decomposition, heterogenous_population) -{ - execution_context context; - - { // Test on a node with 1 cpu core and no gpus. + // Test on a node with 1 cpu core and no gpus. // We assume that all cells will be put into cell groups of size 1. // This assumption will not hold in the future, requiring and update to // the test. 
- proc_allocation nd{1, 0}; unsigned num_cells = 10; - auto R = hetero_recipe(num_cells); - const auto D = partition_load_balance(R, nd, context); + const auto D = partition_load_balance(homo_recipe(num_cells, dummy_cell{}), ctx); EXPECT_EQ(D.num_global_cells, num_cells); EXPECT_EQ(D.num_local_cells, num_cells); @@ -131,32 +112,30 @@ TEST(domain_decomposition, heterogenous_population) // Each cell group contains 1 cell of kind cable1d_neuron // Each group should also be tagged for cpu execution - auto grps = make_span(num_cells); - std::map<cell_kind, std::set<cell_gid_type>> kind_lists; - for (auto i: grps) { + for (auto i: gids) { auto& grp = D.groups[i]; EXPECT_EQ(grp.gids.size(), 1u); - auto k = grp.kind; - kind_lists[k].insert(grp.gids.front()); + EXPECT_EQ(grp.gids.front(), unsigned(i)); EXPECT_EQ(grp.backend, backend_kind::multicore); - } - - for (auto k: {cell_kind::cable1d_neuron, cell_kind::spike_source}) { - const auto& gids = kind_lists[k]; - EXPECT_EQ(gids.size(), num_cells/2); - for (auto gid: gids) { - EXPECT_EQ(k, R.get_cell_kind(gid)); - } + EXPECT_EQ(grp.kind, cell_kind::cable1d_neuron); } } - { // Test on a node with 1 gpu and 1 cpu core. +} + +TEST(domain_decomposition, heterogenous_population) +{ + proc_allocation resources; + resources.num_threads = 1; + + if (resources.has_gpu()) { + // Test on a node with 1 gpu and 1 cpu core. // Assumes that calble cells are on gpu in a single group, and // rff cells are on cpu in cell groups of size 1 - proc_allocation nd{1, 1}; + auto ctx = make_context(resources); unsigned num_cells = 10; auto R = hetero_recipe(num_cells); - const auto D = partition_load_balance(R, nd, context); + const auto D = partition_load_balance(R, ctx); EXPECT_EQ(D.num_global_cells, num_cells); EXPECT_EQ(D.num_local_cells, num_cells); @@ -187,13 +166,58 @@ TEST(domain_decomposition, heterogenous_population) } EXPECT_EQ(num_cells, ncells); } + { + // Test on a node with 1 cpu core and no gpus. + // We assume that all cells will be put into cell groups of size 1. + // This assumption will not hold in the future, requiring and update to + // the test. + + resources.gpu_id = -1; // disable GPU if available + auto ctx = make_context(resources); + + unsigned num_cells = 10; + auto R = hetero_recipe(num_cells); + const auto D = partition_load_balance(R, ctx); + + EXPECT_EQ(D.num_global_cells, num_cells); + EXPECT_EQ(D.num_local_cells, num_cells); + EXPECT_EQ(D.groups.size(), num_cells); + + auto gids = make_span(num_cells); + for (auto gid: gids) { + EXPECT_EQ(0, D.gid_domain(gid)); + } + + // Each cell group contains 1 cell of kind cable1d_neuron + // Each group should also be tagged for cpu execution + auto grps = make_span(num_cells); + std::map<cell_kind, std::set<cell_gid_type>> kind_lists; + for (auto i: grps) { + auto& grp = D.groups[i]; + EXPECT_EQ(grp.gids.size(), 1u); + auto k = grp.kind; + kind_lists[k].insert(grp.gids.front()); + EXPECT_EQ(grp.backend, backend_kind::multicore); + } + + for (auto k: {cell_kind::cable1d_neuron, cell_kind::spike_source}) { + const auto& gids = kind_lists[k]; + EXPECT_EQ(gids.size(), num_cells/2); + for (auto gid: gids) { + EXPECT_EQ(k, R.get_cell_kind(gid)); + } + } + } } TEST(domain_decomposition, hints) { // Check that we can provide group size hint and gpu/cpu preference // by cell kind. + // The hints perfer the multicore backend, so the decomposition is expected + // to never have cell groups on the GPU, regardless of whether a GPU is + // available or not. 
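Under the new interface the per-node resources travel inside the context, so partition_load_balance takes the context (plus optional hints) instead of a separate proc_allocation, and the same context is handed to the simulation. An illustrative fragment, assuming a recipe rec, along the lines of the test below:

    auto ctx = arb::make_context();            // describes the available threads and GPU

    arb::partition_hint_map hints;
    hints[arb::cell_kind::cable1d_neuron].cpu_group_size = 3;

    auto decomp = arb::partition_load_balance(rec, ctx, hints);
    arb::simulation sim(rec, decomp, ctx);
    sim.run(100, 0.025);                       // t_final and dt in ms
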
- execution_context context; + auto ctx = make_context(); partition_hint_map hints; hints[cell_kind::cable1d_neuron].cpu_group_size = 3; @@ -202,8 +226,7 @@ TEST(domain_decomposition, hints) { domain_decomposition D = partition_load_balance( hetero_recipe(20), - proc_allocation{16, 1}, // 16 threads, 1 gpu. - context, + ctx, hints); std::vector<std::vector<cell_gid_type>> expected_c1d_groups = diff --git a/test/unit/test_fvm_lowered.cpp b/test/unit/test_fvm_lowered.cpp index 9001ef0c3be38d5bb1f0a7e58e0cead6721d7f73..80e73927bb5483d3a551a784488eaf0c546eab72 100644 --- a/test/unit/test_fvm_lowered.cpp +++ b/test/unit/test_fvm_lowered.cpp @@ -4,7 +4,6 @@ #include "../gtest.h" #include <arbor/common_types.hpp> -#include <arbor/distributed_context.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/fvm_types.hpp> #include <arbor/load_balance.hpp> @@ -19,6 +18,7 @@ #include "algorithms.hpp" #include "backends/multicore/fvm.hpp" #include "backends/multicore/mechanism.hpp" +#include "execution_context.hpp" #include "fvm_lowered_cell.hpp" #include "fvm_lowered_cell_impl.hpp" #include "sampler_map.hpp" @@ -256,8 +256,6 @@ TEST(fvm_lowered, derived_mechs) { // // 3. Cell with both test_kin1 and custom_kin1. - execution_context context; - std::vector<mc_cell> cells(3); for (int i = 0; i<3; ++i) { mc_cell& c = cells[i]; @@ -298,6 +296,7 @@ TEST(fvm_lowered, derived_mechs) { std::vector<target_handle> targets; probe_association_map<probe_handle> probe_map; + execution_context context; fvm_cell fvcell(context); fvcell.initialize({0, 1, 2}, rec, targets, probe_map); @@ -318,7 +317,6 @@ TEST(fvm_lowered, derived_mechs) { util::sort(tau_values); EXPECT_EQ(fvec({10., 20.}), tau_values); } - { // Test dynamics: // 1. Current at same point on cell 0 at time 10 ms should equal that @@ -336,9 +334,9 @@ TEST(fvm_lowered, derived_mechs) { float times[] = {10.f, 20.f}; - execution_context context; - auto decomp = partition_load_balance(rec, proc_allocation{1, 0}, context); - simulation sim(rec, decomp, context); + auto ctx = make_context(); + auto decomp = partition_load_balance(rec, ctx); + simulation sim(rec, decomp, ctx); sim.add_sampler(all_probes, explicit_schedule(times), sampler); sim.run(30.0, 1.f/1024); @@ -354,6 +352,7 @@ TEST(fvm_lowered, derived_mechs) { } } + // Test area-weighted linear combination of ion species concentrations TEST(fvm_lowered, weighted_write_ion) { diff --git a/test/unit/test_gpu_stack.cu b/test/unit/test_gpu_stack.cu index 9e0235dd2c4a047c1b869575a29beddbfabace4a..e75f8c3b08f174a34f7f06976a32d5d43690d9ac 100644 --- a/test/unit/test_gpu_stack.cu +++ b/test/unit/test_gpu_stack.cu @@ -1,17 +1,19 @@ #include "../gtest.h" -#include <backends/gpu/stack.hpp> -#include <backends/gpu/stack_cu.hpp> -#include <backends/gpu/managed_ptr.hpp> -#include <arbor/execution_context.hpp> +#include "backends/gpu/stack.hpp" +#include "backends/gpu/stack_cu.hpp" +#include "backends/gpu/managed_ptr.hpp" +#include "gpu_context.hpp" using namespace arb; TEST(stack, construction) { using T = int; - execution_context context; - gpu::stack<T> s(10, context.gpu); + auto context = make_gpu_context(0); + if (!context->has_gpu()) return; + + gpu::stack<T> s(10, context); EXPECT_EQ(0u, s.size()); EXPECT_EQ(10u, s.capacity()); @@ -53,11 +55,12 @@ TEST(stack, push_back) { using T = int; using stack = gpu::stack<T>; - execution_context context; + auto context = make_gpu_context(0); + if (!context->has_gpu()) return; const unsigned n = 10; EXPECT_TRUE(n%2 == 0); // require n is even for tests to work - auto 
s = stack(n, context.gpu); + auto s = stack(n, context); auto& sstorage = s.storage(); kernels::push_back<<<1, n>>>(sstorage, kernels::all_ftor()); @@ -88,10 +91,11 @@ TEST(stack, overflow) { using T = int; using stack = gpu::stack<T>; - execution_context context; + auto context = make_gpu_context(0); + if (!context->has_gpu()) return; const unsigned n = 10; - auto s = stack(n, context.gpu); + auto s = stack(n, context); auto& sstorage = s.storage(); EXPECT_FALSE(s.overflow()); @@ -107,9 +111,10 @@ TEST(stack, empty) { using T = int; using stack = gpu::stack<T>; - execution_context context; + auto context = make_gpu_context(0); + if (!context->has_gpu()) return; - stack s(0u, context.gpu); + stack s(0u, context); EXPECT_EQ(s.size(), 0u); EXPECT_EQ(s.capacity(), 0u); diff --git a/test/unit/test_lif_cell_group.cpp b/test/unit/test_lif_cell_group.cpp index 51283529dbdcc50bb814e8740077e8872793b57d..7f0e8d839fbb3f93d47ad3155c1949e8635e4684 100644 --- a/test/unit/test_lif_cell_group.cpp +++ b/test/unit/test_lif_cell_group.cpp @@ -154,10 +154,9 @@ TEST(lif_cell_group, spikes) { // make two lif cells path_recipe recipe(2, 1000, 0.1); - execution_context context; - proc_allocation nd = local_allocation(context); + auto context = make_context(); - auto decomp = partition_load_balance(recipe, nd, context); + auto decomp = partition_load_balance(recipe, context); simulation sim(recipe, decomp, context); std::vector<spike_event> events; @@ -193,10 +192,9 @@ TEST(lif_cell_group, ring) // Total simulation time. time_type simulation_time = 100; - execution_context context; - proc_allocation nd = local_allocation(context); + auto context = make_context(); auto recipe = ring_recipe(num_lif_cells, weight, delay); - auto decomp = partition_load_balance(recipe, nd, context); + auto decomp = partition_load_balance(recipe, context); // Creates a simulation with a ring recipe of lif neurons simulation sim(recipe, decomp, context); diff --git a/test/unit/test_local_context.cpp b/test/unit/test_local_context.cpp index 7ac207d8950ca98526a6a21b15daa97bc693124b..0f42e206f22eceefb672039097c32a5e278aa50e 100644 --- a/test/unit/test_local_context.cpp +++ b/test/unit/test_local_context.cpp @@ -1,8 +1,8 @@ #include <vector> #include "../gtest.h" +#include "distributed_context.hpp" -#include <arbor/distributed_context.hpp> #include <arbor/spike.hpp> // Test that there are no errors constructing a distributed_context from a local_context diff --git a/test/unit/test_mc_cell_group_gpu.cpp b/test/unit/test_mc_cell_group_gpu.cpp index ad15d437467cfb915e2f41346adfdecb2157e180..7628dce147c61966aa77bab1cdbb24cbc62f6d65 100644 --- a/test/unit/test_mc_cell_group_gpu.cpp +++ b/test/unit/test_mc_cell_group_gpu.cpp @@ -3,6 +3,7 @@ #include <arbor/common_types.hpp> #include "epoch.hpp" +#include "execution_context.hpp" #include "fvm_lowered_cell.hpp" #include "mc_cell_group.hpp" diff --git a/test/unit/test_simd.cpp b/test/unit/test_simd.cpp index 8b37db6503e5e4b9dc71ee3384a7a27d69b3f9c2..40c05587b99f08fe0b3507c935933fd41406cfda 100644 --- a/test/unit/test_simd.cpp +++ b/test/unit/test_simd.cpp @@ -706,6 +706,11 @@ TYPED_TEST_P(simd_fp_value, fp_maths) { pow(simd(u), simd(v)).copy_to(r); EXPECT_TRUE(testing::seq_almost_eq<fp>(pow_u_v_int, r)); } + + // The tests can cause floating point exceptions, which may set errno to nonzero + // value. + // Reset errno so that subsequent tests are not affected. 
+ errno = 0; } // Check special function behaviour for specific values including diff --git a/test/unit/test_spike_store.cpp b/test/unit/test_spike_store.cpp index 11bd112344223793692ff93081cb4175dfe12dd6..61c01340d88308ff35e7e4454f5d17ef2e18e080 100644 --- a/test/unit/test_spike_store.cpp +++ b/test/unit/test_spike_store.cpp @@ -1,8 +1,8 @@ #include "../gtest.h" #include <arbor/spike.hpp> -#include <arbor/execution_context.hpp> +#include "execution_context.hpp" #include "thread_private_spike_store.hpp" using arb::spike; diff --git a/test/unit/test_thread.cpp b/test/unit/test_thread.cpp index ea3c0b4eb06a4120f7ec3e6524cb1d2cc04dcf14..e1f0b2ed1117244ba919b349976191acd8fc908b 100644 --- a/test/unit/test_thread.cpp +++ b/test/unit/test_thread.cpp @@ -1,6 +1,5 @@ #include "../gtest.h" #include "common.hpp" -#include <arbor/execution_context.hpp> #include <iostream> #include <ostream> @@ -8,6 +7,7 @@ #include <arbor/version.hpp> #include "threading/threading.hpp" +#include "threading/enumerable_thread_specific.hpp" using namespace arb::threading::impl; using namespace arb::threading; diff --git a/test/validation/validate_ball_and_stick.cpp b/test/validation/validate_ball_and_stick.cpp index 8d3691d2b9a25a9fee40daade18fbb8b092ccfc4..2a047354bf3efa80577a11c40e52955f4a087c8d 100644 --- a/test/validation/validate_ball_and_stick.cpp +++ b/test/validation/validate_ball_and_stick.cpp @@ -3,7 +3,9 @@ #include <nlohmann/json.hpp> #include <arbor/common_types.hpp> +#include <arbor/context.hpp> #include <arbor/domain_decomposition.hpp> +#include <arbor/context.hpp> #include <arbor/load_balance.hpp> #include <arbor/mc_cell.hpp> #include <arbor/recipe.hpp> @@ -32,7 +34,7 @@ template <typename ProbePointSeq> void run_ncomp_convergence_test( const char* model_name, const aux::path& ref_data_path, - backend_kind backend, + context& context, const mc_cell& c, ProbePointSeq& probe_points, float t_end=100.f) @@ -49,7 +51,7 @@ void run_ncomp_convergence_test( {"dt", dt}, {"sim", "arbor"}, {"units", "mV"}, - {"backend_kind", to_string(backend)} + {"backend_kind", (has_gpu(context)? 
"gpu": "multicore")} }; auto exclude = stimulus_ends(c); @@ -64,10 +66,6 @@ void run_ncomp_convergence_test( convergence_test_runner<int> runner("ncomp", plabels, meta); runner.load_reference_data(ref_data_path); - execution_context context; - proc_allocation nd; - nd.num_gpus = (backend==backend_kind::gpu); - for (int ncomp = 10; ncomp<max_ncomp; ncomp*=2) { for (auto& seg: c.segments()) { if (!seg->is_soma()) { @@ -79,7 +77,7 @@ void run_ncomp_convergence_test( rec.add_probe(0, 0, cell_probe_address{p.where, cell_probe_address::membrane_voltage}); } - auto decomp = partition_load_balance(rec, nd, context); + auto decomp = partition_load_balance(rec, context); simulation sim(rec, decomp, context); runner.run(sim, ncomp, sample_dt, t_end, dt, exclude); @@ -88,7 +86,7 @@ void run_ncomp_convergence_test( runner.assert_all_convergence(); } -void validate_ball_and_stick(arb::backend_kind backend) { +void validate_ball_and_stick(context& ctx) { using namespace arb; mc_cell c = make_cell_ball_and_stick(); @@ -101,12 +99,12 @@ void validate_ball_and_stick(arb::backend_kind backend) { run_ncomp_convergence_test( "ball_and_stick", "neuron_ball_and_stick.json", - backend, + ctx, c, points); } -void validate_ball_and_taper(arb::backend_kind backend) { +void validate_ball_and_taper(context& ctx) { using namespace arb; mc_cell c = make_cell_ball_and_taper(); @@ -119,12 +117,12 @@ void validate_ball_and_taper(arb::backend_kind backend) { run_ncomp_convergence_test( "ball_and_taper", "neuron_ball_and_taper.json", - backend, + ctx, c, points); } -void validate_ball_and_3stick(arb::backend_kind backend) { +void validate_ball_and_3stick(context& ctx) { using namespace arb; mc_cell c = make_cell_ball_and_3stick(); @@ -141,12 +139,12 @@ void validate_ball_and_3stick(arb::backend_kind backend) { run_ncomp_convergence_test( "ball_and_3stick", "neuron_ball_and_3stick.json", - backend, + ctx, c, points); } -void validate_rallpack1(arb::backend_kind backend) { +void validate_rallpack1(context& ctx) { using namespace arb; mc_cell c = make_cell_simple_cable(); @@ -159,13 +157,13 @@ void validate_rallpack1(arb::backend_kind backend) { run_ncomp_convergence_test( "rallpack1", "numeric_rallpack1.json", - backend, + ctx, c, points, 250.f); } -void validate_ball_and_squiggle(arb::backend_kind backend) { +void validate_ball_and_squiggle(context& ctx) { using namespace arb; mc_cell c = make_cell_ball_and_squiggle(); @@ -189,47 +187,72 @@ void validate_ball_and_squiggle(arb::backend_kind backend) { run_ncomp_convergence_test( "ball_and_squiggle_integrator", "neuron_ball_and_squiggle.json", - backend, + ctx, c, points); } TEST(ball_and_stick, neuron_ref) { - execution_context ctx; - validate_ball_and_stick(backend_kind::multicore); - if (local_allocation(ctx).num_gpus) { - validate_ball_and_stick(backend_kind::gpu); + proc_allocation resources; + { + auto ctx = make_context(resources); + validate_ball_and_stick(ctx); + } + if (resources.has_gpu()) { + resources.gpu_id = -1; + auto ctx = make_context(resources); + validate_ball_and_stick(ctx); } } TEST(ball_and_taper, neuron_ref) { - execution_context ctx; - validate_ball_and_taper(backend_kind::multicore); - if (local_allocation(ctx).num_gpus) { - validate_ball_and_taper(backend_kind::gpu); + proc_allocation resources; + { + auto ctx = make_context(resources); + validate_ball_and_taper(ctx); + } + if (resources.has_gpu()) { + resources.gpu_id = -1; + auto ctx = make_context(resources); + validate_ball_and_taper(ctx); } } TEST(ball_and_3stick, neuron_ref) { - 
diff --git a/test/validation/validate_kinetic.cpp b/test/validation/validate_kinetic.cpp
index b8bfb31094be5448be40f2bda5220661a967d907..9bf35611a6159e8d1ac9c8fdc53ffe2dcb622816 100644
--- a/test/validation/validate_kinetic.cpp
+++ b/test/validation/validate_kinetic.cpp
@@ -4,6 +4,7 @@

 #include <nlohmann/json.hpp>

+#include <arbor/context.hpp>
 #include <arbor/common_types.hpp>
 #include <arbor/domain_decomposition.hpp>
 #include <arbor/load_balance.hpp>
@@ -21,7 +22,7 @@
 #include "validation_data.hpp"

 void run_kinetic_dt(
-    arb::backend_kind backend,
+    const arb::context& context,
     arb::mc_cell& c,
     arb::cell_probe_address probe,
     float t_end,
@@ -38,16 +39,12 @@ void run_kinetic_dt(
     probe_label plabels[1] = {{"soma.mid", {0u, 0u}}};

     meta["sim"] = "arbor";
-    meta["backend_kind"] = to_string(backend);
+    meta["backend_kind"] = arb::has_gpu(context)? "gpu": "multicore";

     convergence_test_runner<float> runner("dt", plabels, meta);
     runner.load_reference_data(ref_file);

-    execution_context context;
-    proc_allocation nd;
-    nd.num_gpus = (backend==backend_kind::gpu);
-
-    auto decomp = partition_load_balance(rec, nd, context);
+    auto decomp = partition_load_balance(rec, context);
     simulation sim(rec, decomp, context);

     auto exclude = stimulus_ends(c);
@@ -70,7 +67,7 @@ end:
     runner.assert_all_convergence();
 }

-void validate_kinetic_kin1(arb::backend_kind backend) {
+void validate_kinetic_kin1(const arb::context& ctx) {
     using namespace arb;

     // 20 µm diameter soma with single mechanism, current probe
@@ -85,10 +82,10 @@ void validate_kinetic_kin1(arb::backend_kind backend) {
         {"units", "nA"}
     };

-    run_kinetic_dt(backend, c, probe, 100.f, meta, "numeric_kin1.json");
+    run_kinetic_dt(ctx, c, probe, 100.f, meta, "numeric_kin1.json");
 }

-void validate_kinetic_kinlva(arb::backend_kind backend) {
+void validate_kinetic_kinlva(const arb::context& ctx) {
     using namespace arb;

     // 20 µm diameter soma with single mechanism, current probe
@@ -104,24 +101,34 @@ void validate_kinetic_kinlva(arb::backend_kind backend) {
         {"units", "mV"}
     };

-    run_kinetic_dt(backend, c, probe, 300.f, meta, "numeric_kinlva.json");
+    run_kinetic_dt(ctx, c, probe, 300.f, meta, "numeric_kinlva.json");
 }

 using namespace arb;

 TEST(kinetic, kin1_numeric_ref) {
-    execution_context ctx;
-    validate_kinetic_kin1(backend_kind::multicore);
-    if (local_allocation(ctx).num_gpus) {
-        validate_kinetic_kin1(arb::backend_kind::gpu);
+    proc_allocation resources;
+    {
+        auto ctx = make_context(resources);
+        validate_kinetic_kin1(ctx);
+    }
+    if (resources.has_gpu()) {
+        resources.gpu_id = -1;
+        auto ctx = make_context(resources);
+        validate_kinetic_kin1(ctx);
     }
 }

 TEST(kinetic, kinlva_numeric_ref) {
-    execution_context ctx;
-    validate_kinetic_kinlva(backend_kind::multicore);
-    if (local_allocation(ctx).num_gpus) {
-        validate_kinetic_kinlva(arb::backend_kind::gpu);
+    proc_allocation resources;
+    {
+        auto ctx = make_context(resources);
+        validate_kinetic_kinlva(ctx);
+    }
+    if (resources.has_gpu()) {
+        resources.gpu_id = -1;
+        auto ctx = make_context(resources);
+        validate_kinetic_kinlva(ctx);
     }
 }
"gpu": "multicore"; convergence_test_runner<float> runner("dt", plabels, meta); runner.load_reference_data(ref_file); - execution_context context; - proc_allocation nd; - nd.num_gpus = (backend==backend_kind::gpu); - - auto decomp = partition_load_balance(rec, nd, context); + auto decomp = partition_load_balance(rec, context); simulation sim(rec, decomp, context); auto exclude = stimulus_ends(c); @@ -70,7 +67,7 @@ end: runner.assert_all_convergence(); } -void validate_kinetic_kin1(arb::backend_kind backend) { +void validate_kinetic_kin1(const arb::context& ctx) { using namespace arb; // 20 µm diameter soma with single mechanism, current probe @@ -85,10 +82,10 @@ void validate_kinetic_kin1(arb::backend_kind backend) { {"units", "nA"} }; - run_kinetic_dt(backend, c, probe, 100.f, meta, "numeric_kin1.json"); + run_kinetic_dt(ctx, c, probe, 100.f, meta, "numeric_kin1.json"); } -void validate_kinetic_kinlva(arb::backend_kind backend) { +void validate_kinetic_kinlva(const arb::context& ctx) { using namespace arb; // 20 µm diameter soma with single mechanism, current probe @@ -104,24 +101,34 @@ void validate_kinetic_kinlva(arb::backend_kind backend) { {"units", "mV"} }; - run_kinetic_dt(backend, c, probe, 300.f, meta, "numeric_kinlva.json"); + run_kinetic_dt(ctx, c, probe, 300.f, meta, "numeric_kinlva.json"); } using namespace arb; TEST(kinetic, kin1_numeric_ref) { - execution_context ctx; - validate_kinetic_kin1(backend_kind::multicore); - if (local_allocation(ctx).num_gpus) { - validate_kinetic_kin1(arb::backend_kind::gpu); + proc_allocation resources; + { + auto ctx = make_context(resources); + validate_kinetic_kin1(ctx); + } + if (resources.has_gpu()) { + resources.gpu_id = -1; + auto ctx = make_context(resources); + validate_kinetic_kin1(ctx); } } TEST(kinetic, kinlva_numeric_ref) { - execution_context ctx; - validate_kinetic_kinlva(backend_kind::multicore); - if (local_allocation(ctx).num_gpus) { - validate_kinetic_kinlva(arb::backend_kind::gpu); + proc_allocation resources; + { + auto ctx = make_context(resources); + validate_kinetic_kinlva(ctx); + } + if (resources.has_gpu()) { + resources.gpu_id = -1; + auto ctx = make_context(resources); + validate_kinetic_kinlva(ctx); } } diff --git a/test/validation/validate_soma.cpp b/test/validation/validate_soma.cpp index f61e3ab1c75e317ef6b18c0299cd020b4c9708e1..5e5fdd6e91db9079c9753944c10f410e5110991b 100644 --- a/test/validation/validate_soma.cpp +++ b/test/validation/validate_soma.cpp @@ -1,6 +1,7 @@ #include <nlohmann/json.hpp> #include <arbor/common_types.hpp> +#include <arbor/context.hpp> #include <arbor/domain_decomposition.hpp> #include <arbor/load_balance.hpp> #include <arbor/mc_cell.hpp> @@ -20,7 +21,7 @@ using namespace arb; -void validate_soma(backend_kind backend) { +void validate_soma(const context& context) { float sample_dt = g_trace_io.sample_dt(); mc_cell c = make_cell_soma_only(); @@ -29,11 +30,7 @@ void validate_soma(backend_kind backend) { rec.add_probe(0, 0, cell_probe_address{{0, 0.5}, cell_probe_address::membrane_voltage}); probe_label plabels[1] = {{"soma.mid", {0u, 0u}}}; - execution_context context; - proc_allocation nd; - nd.num_gpus = (backend==backend_kind::gpu); - - auto decomp = partition_load_balance(rec, nd, context); + auto decomp = partition_load_balance(rec, context); simulation sim(rec, decomp, context); nlohmann::json meta = { @@ -41,7 +38,7 @@ void validate_soma(backend_kind backend) { {"model", "soma"}, {"sim", "arbor"}, {"units", "mV"}, - {"backend_kind", to_string(backend)} + {"backend_kind", 
diff --git a/test/validation/validate_synapses.cpp b/test/validation/validate_synapses.cpp
index cd19a08dd3fd1ec4db6ecb851eb0cd46723b617b..92b43b9784740dd004bfd8916ca67568aa1a47bb 100644
--- a/test/validation/validate_synapses.cpp
+++ b/test/validation/validate_synapses.cpp
@@ -1,5 +1,6 @@
 #include <nlohmann/json.hpp>

+#include <arbor/context.hpp>
 #include <arbor/domain_decomposition.hpp>
 #include <arbor/load_balance.hpp>
 #include <arbor/mc_cell.hpp>
@@ -24,7 +25,7 @@ using namespace arb;
 void run_synapse_test(
     const char* syn_type,
     const aux::path& ref_data_path,
-    backend_kind backend,
+    const context& context,
     float t_end=70.f,
     float dt=0.001)
 {
@@ -34,7 +35,7 @@ void run_synapse_test(
         {"model", syn_type},
         {"sim", "arbor"},
         {"units", "mV"},
-        {"backend_kind", to_string(backend)}
+        {"backend_kind", arb::has_gpu(context)? "gpu": "multicore"}
     };

     mc_cell c = make_cell_ball_and_stick(false); // no stimuli
@@ -61,10 +62,6 @@ void run_synapse_test(
     convergence_test_runner<int> runner("ncomp", plabels, meta);
     runner.load_reference_data(ref_data_path);

-    execution_context context;
-    proc_allocation nd;
-    nd.num_gpus = (backend==backend_kind::gpu);
-
     for (int ncomp = 10; ncomp<max_ncomp; ncomp*=2) {
         c.cable(1)->set_compartments(ncomp);

@@ -76,7 +73,7 @@ void run_synapse_test(
         // dend.end
         rec.add_probe(0, 0, cell_probe_address{{1, 1.0}, cell_probe_address::membrane_voltage});

-        auto decomp = partition_load_balance(rec, nd, context);
+        auto decomp = partition_load_balance(rec, context);
         simulation sim(rec, decomp, context);
         sim.inject_events(synthetic_events);

@@ -88,21 +85,31 @@
 }

 TEST(simple_synapse, expsyn_neuron_ref) {
-    execution_context ctx;
-    SCOPED_TRACE("expsyn-multicore");
-    run_synapse_test("expsyn", "neuron_simple_exp_synapse.json", backend_kind::multicore);
-    if (local_allocation(ctx).num_gpus) {
+    proc_allocation resources;
+    {
+        auto ctx = make_context(resources);
+        SCOPED_TRACE("expsyn-multicore");
+        run_synapse_test("expsyn", "neuron_simple_exp_synapse.json", ctx);
+    }
+    if (resources.has_gpu()) {
+        resources.gpu_id = -1;
+        auto ctx = make_context(resources);
         SCOPED_TRACE("expsyn-gpu");
-        run_synapse_test("expsyn", "neuron_simple_exp_synapse.json", backend_kind::gpu);
+        run_synapse_test("expsyn", "neuron_simple_exp_synapse.json", ctx);
     }
 }

 TEST(simple_synapse, exp2syn_neuron_ref) {
-    execution_context ctx;
-    SCOPED_TRACE("exp2syn-multicore");
-    run_synapse_test("exp2syn", "neuron_simple_exp2_synapse.json", backend_kind::multicore);
-    if (local_allocation(ctx).num_gpus) {
+    proc_allocation resources;
+    {
+        auto ctx = make_context(resources);
+        SCOPED_TRACE("exp2syn-multicore");
+        run_synapse_test("exp2syn", "neuron_simple_exp2_synapse.json", ctx);
+    }
+    if (resources.has_gpu()) {
+        resources.gpu_id = -1;
+        auto ctx = make_context(resources);
         SCOPED_TRACE("exp2syn-gpu");
-        run_synapse_test("exp2syn", "neuron_simple_exp2_synapse.json", backend_kind::gpu);
+        run_synapse_test("exp2syn", "neuron_simple_exp2_synapse.json", ctx);
     }
 }
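On the helper side, the pattern after this patch is the one visible in run_synapse_test and validate_soma above: derive the backend label from the context with has_gpu, and pass the context straight to partition_load_balance and the simulation constructor. A minimal sketch follows, assuming the headers below and treating the recipe argument, t_end and dt as illustrative stand-ins.

// Minimal sketch mirroring the helper-side usage above; not part of the patch.
#include <arbor/context.hpp>
#include <arbor/domain_decomposition.hpp>
#include <arbor/load_balance.hpp>
#include <arbor/recipe.hpp>
#include <arbor/simulation.hpp>

void run_case(const arb::context& ctx, const arb::recipe& rec) {
    // Backend label derived from the context, as the updated meta fields do.
    const char* backend_label = arb::has_gpu(ctx)? "gpu": "multicore";
    (void)backend_label;

    // Both calls take the context directly; the separate proc_allocation and
    // execution_context objects built by the old code are no longer needed.
    auto decomp = arb::partition_load_balance(rec, ctx);
    arb::simulation sim(rec, decomp, ctx);      // argument order as used in this patch

    sim.run(100, 0.025);                        // t_end and dt are illustrative values
}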