diff --git a/modcc/cudaprinter.cpp b/modcc/cudaprinter.cpp index 2885a1c5be2df231df570f0f731e985c66c371ae..5672dd48fb152b23b67bd2fdbe7c65756bec8d89 100644 --- a/modcc/cudaprinter.cpp +++ b/modcc/cudaprinter.cpp @@ -41,7 +41,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o) text_.add_line(); text_.add_line("#include <mechanism.hpp>"); text_.add_line("#include <algorithms.hpp>"); - text_.add_line("#include <backends/gpu_intrinsics.hpp>"); + text_.add_line("#include <backends/gpu/intrinsics.hpp>"); text_.add_line("#include <util/pprintf.hpp>"); text_.add_line(); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7e6e6f160ef3f1fcc552fc54c2643636327727f1..d11ca436ed2aeb51c50d8fa11bf4e891c3e421df 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,10 +9,10 @@ set(BASE_SOURCES util/debug.cpp util/path.cpp util/unwind.cpp - backends/fvm_multicore.cpp + backends/multicore/fvm.cpp ) set(CUDA_SOURCES - backends/fvm_gpu.cu + backends/gpu/fvm.cu memory/fill.cu ) diff --git a/src/backends/fvm.hpp b/src/backends/fvm.hpp index 1cdc16d62593770dee3f0aed71e93a6e6a51dd53..a8124da0815f85dbb674ed5e81c31d5cbf2328b9 100644 --- a/src/backends/fvm.hpp +++ b/src/backends/fvm.hpp @@ -1,7 +1,7 @@ #pragma once -#include "fvm_multicore.hpp" +#include <backends/multicore/fvm.hpp> #ifdef NMC_HAVE_CUDA - #include "fvm_gpu.hpp" + #include <backends/gpu/fvm.hpp> #endif diff --git a/src/backends/fvm_gpu.hpp b/src/backends/fvm_gpu.hpp deleted file mode 100644 index 5683efbf528e721d33253cbbd5433ad68e958af4..0000000000000000000000000000000000000000 --- a/src/backends/fvm_gpu.hpp +++ /dev/null @@ -1,458 +0,0 @@ -#pragma once - -#include <map> -#include <string> - -#include <common_types.hpp> -#include <mechanism.hpp> -#include <memory/memory.hpp> -#include <memory/managed_ptr.hpp> -#include <util/span.hpp> - -#include "stimulus_gpu.hpp" -#include "gpu_stack.hpp" - -namespace nest { -namespace mc { -namespace gpu { - -/// Parameter pack for passing matrix fields and dimensions to the -/// Hines matrix solver implemented on the GPU backend. -template <typename T, typename I> -struct matrix_solve_param_pack { - T* d; - const T* u; - T* rhs; - const I* p; - const I* cell_index; - I n; - I ncells; -}; - -/// Parameter pack for passing matrix and fvm fields and dimensions to the -/// FVM matrix generator implemented on the GPU -template <typename T, typename I> -struct matrix_update_param_pack { - T* d; - const T* u; - T* rhs; - const T* invariant_d; - const T* cv_capacitance; - const T* face_conductance; - const T* voltage; - const T* current; - I n; -}; - -// forward declarations of the matrix solver implementation -// see the bottom of the file for implementation - -template <typename T, typename I> -__global__ void matrix_solve(matrix_solve_param_pack<T, I> params); - -template <typename T, typename I> -__global__ void assemble_matrix(matrix_update_param_pack<T, I> params, T dt); - -/// kernel used to test for threshold crossing test code. 
-/// params: -/// t : current time (ms) -/// t_prev : time of last test (ms) -/// size : number of values to test -/// is_crossed : crossing state at time t_prev (true or false) -/// prev_values : values at sample points (see index) sampled at t_prev -/// index : index with locations in values to test for crossing -/// values : values at t_prev -/// thresholds : threshold values to watch for crossings -template <typename T, typename I, typename Stack> -__global__ -void test_thresholds( - float t, float t_prev, int size, - Stack& stack, - I* is_crossed, T* prev_values, - const I* index, const T* values, const T* thresholds); - -struct backend { - /// define the real and index types - using value_type = double; - using size_type = nest::mc::cell_lid_type; - - /// define storage types - using array = memory::device_vector<value_type>; - using iarray = memory::device_vector<size_type>; - - using view = typename array::view_type; - using const_view = typename array::const_view_type; - - using iview = typename iarray::view_type; - using const_iview = typename iarray::const_view_type; - - using host_array = typename memory::host_vector<value_type>; - using host_iarray = typename memory::host_vector<size_type>; - - using host_view = typename host_array::view_type; - using host_iview = typename host_iarray::const_view_type; - - static std::string name() { - return "gpu"; - } - - // - // matrix infrastructure - // - - /// matrix state - struct matrix_state { - const_iview p; - const_iview cell_index; - - array d; // [μS] - array u; // [μS] - array rhs; // [nA] - - array cv_capacitance; // [pF] - array face_conductance; // [μS] - - // the invariant part of the matrix diagonal - array invariant_d; // [μS] - - std::size_t size() const { return p.size(); } - - matrix_state() = default; - - matrix_state(const_iview p, const_iview cell_index): - p(p), cell_index(cell_index), - d(size()), u(size()), rhs(size()) - {} - - matrix_state(const_iview p, const_iview cell_index, array cap, array cond): - p(p), cell_index(cell_index), - d(size()), u(size()), rhs(size()), - cv_capacitance(std::move(cap)), - face_conductance(std::move(cond)) - { - auto n = d.size(); - host_array invariant_d_tmp(n, 0); - host_array u_tmp(n, 0); - - // make a copy of the conductance on the host - host_array face_conductance_tmp = face_conductance; - auto p_tmp = memory::on_host(p); - for(auto i: util::make_span(1u, n)) { - auto gij = face_conductance_tmp[i]; - - u_tmp[i] = -gij; - invariant_d_tmp[i] += gij; - invariant_d_tmp[p_tmp[i]] += gij; - } - invariant_d = invariant_d_tmp; - memory::copy(u_tmp, u); - } - - // Assemble the matrix - // Afterwards the diagonal and RHS will have been set given dt, voltage and current - // dt [ms] - // voltage [mV] - // current [nA] - void assemble(value_type dt, const_view voltage, const_view current) { - EXPECTS(has_fvm_state()); - - // determine the grid dimensions for the kernel - auto const n = voltage.size(); - auto const block_dim = 128; - auto const grid_dim = (n+block_dim-1)/block_dim; - - auto params = matrix_update_param_pack<value_type, size_type> { - d.data(), u.data(), rhs.data(), - invariant_d.data(), cv_capacitance.data(), face_conductance.data(), - voltage.data(), current.data(), size_type(n)}; - - assemble_matrix<value_type, size_type><<<grid_dim, block_dim>>> - (params, dt); - - } - - void solve() { - using solve_param_pack = matrix_solve_param_pack<value_type, size_type>; - - // pack the parameters into a single struct for kernel launch - auto params = solve_param_pack{ - d.data(), 
u.data(), rhs.data(), - p.data(), cell_index.data(), - size_type(d.size()), size_type(cell_index.size()-1) - }; - - // determine the grid dimensions for the kernel - auto const n = params.ncells; - auto const block_dim = 96; - auto const grid_dim = (n+block_dim-1)/block_dim; - - // perform solve on gpu - matrix_solve<value_type, size_type><<<grid_dim, block_dim>>>(params); - } - - // Test if the matrix has the full state required to assemble the - // matrix in the fvm scheme. - bool has_fvm_state() const { - return cv_capacitance.size()>0; - } - }; - - // - // mechanism infrastructure - // - using ion = mechanisms::ion<backend>; - - using mechanism = mechanisms::mechanism_ptr<backend>; - - using stimulus = mechanisms::gpu::stimulus<backend>; - - static mechanism make_mechanism( - const std::string& name, - view vec_v, view vec_i, - const std::vector<value_type>& weights, - const std::vector<size_type>& node_indices) - { - if (!has_mechanism(name)) { - throw std::out_of_range("no mechanism in database : " + name); - } - - return mech_map_.find(name)-> - second(vec_v, vec_i, memory::make_const_view(weights), memory::make_const_view(node_indices)); - } - - static bool has_mechanism(const std::string& name) { return mech_map_.count(name)>0; } - - /// threshold crossing logic - /// used as part of spike detection back end - class threshold_watcher { - public: - /// stores a single crossing event - struct threshold_crossing { - size_type index; // index of variable - value_type time; // time of crossing - __host__ __device__ - friend bool operator== - (const threshold_crossing& lhs, const threshold_crossing& rhs) - { - return lhs.index==rhs.index && lhs.time==rhs.time; - } - }; - - using stack_type = gpu_stack<threshold_crossing>; - - threshold_watcher() = default; - - threshold_watcher( - const_view values, - const std::vector<size_type>& index, - const std::vector<value_type>& thresh, - value_type t=0): - values_(values), - index_(memory::make_const_view(index)), - thresholds_(memory::make_const_view(thresh)), - prev_values_(values), - is_crossed_(size()), - stack_(memory::make_managed_ptr<stack_type>(10*size())) - { - reset(t); - } - - /// Remove all stored crossings that were detected in previous calls - /// to test() - void clear_crossings() { - stack_->clear(); - } - - /// Reset state machine for each detector. - /// Assume that the values in values_ have been set correctly before - /// calling, because the values are used to determine the initial state - void reset(value_type t=0) { - clear_crossings(); - - // Make host-side copies of the information needed to calculate - // the initial crossed state - auto values = memory::on_host(values_); - auto thresholds = memory::on_host(thresholds_); - auto index = memory::on_host(index_); - - // calculate the initial crossed state in host memory - auto crossed = std::vector<size_type>(size()); - for (auto i: util::make_span(0u, size())) { - crossed[i] = values[index[i]] < thresholds[i] ? 0 : 1; - } - - // copy the initial crossed state to device memory - is_crossed_ = memory::on_gpu(crossed); - - // reset time of last test - t_prev_ = t; - } - - bool is_crossed(size_type i) const { - return is_crossed_[i]; - } - - const std::vector<threshold_crossing> crossings() const { - return std::vector<threshold_crossing>(stack_->begin(), stack_->end()); - } - - /// The time at which the last test was performed - value_type last_test_time() const { - return t_prev_; - } - - /// Tests each target for changed threshold state. 
- /// Crossing events are recorded for each threshold that has been - /// crossed since current time t, and the last time the test was - /// performed. - void test(value_type t) { - EXPECTS(t_prev_<t); - - constexpr int block_dim = 128; - const int grid_dim = (size()+block_dim-1)/block_dim; - test_thresholds<<<grid_dim, block_dim>>>( - t, t_prev_, size(), - *stack_, - is_crossed_.data(), prev_values_.data(), - index_.data(), values_.data(), thresholds_.data()); - - // Check that the number of spikes has not exceeded - // the capacity of the stack. - EXPECTS(stack_->size() <= stack_->capacity()); - - t_prev_ = t; - } - - /// the number of threashold values that are being monitored - std::size_t size() const { - return index_.size(); - } - - /// Data type used to store the crossings. - /// Provided to make type-generic calling code. - using crossing_list = std::vector<threshold_crossing>; - - private: - - const_view values_; // values to watch: on gpu - iarray index_; // indexes of values to watch: on gpu - - array thresholds_; // threshold for each watch: on gpu - value_type t_prev_; // time of previous sample: on host - array prev_values_; // values at previous sample time: on host - iarray is_crossed_; // bool flag for state of each watch: on gpu - - memory::managed_ptr<stack_type> stack_; - }; - -private: - - using maker_type = mechanism (*)(view, view, array&&, iarray&&); - static std::map<std::string, maker_type> mech_map_; - - template <template <typename> class Mech> - static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) { - return mechanisms::make_mechanism<Mech<backend>> - (vec_v, vec_i, std::move(weights), std::move(node_indices)); - } -}; - -/// GPU implementation of Hines Matrix solver. -/// Naive implementation with one CUDA thread per matrix. -template <typename T, typename I> -__global__ -void matrix_solve(matrix_solve_param_pack<T, I> params) { - auto tid = threadIdx.x + blockDim.x*blockIdx.x; - auto d = params.d; - auto u = params.u; - auto rhs = params.rhs; - auto p = params.p; - - if(tid < params.ncells) { - // get range of this thread's cell matrix - auto first = params.cell_index[tid]; - auto last = params.cell_index[tid+1]; - - // backward sweep - for(auto i=last-1; i>first; --i) { - auto factor = u[i] / d[i]; - d[p[i]] -= factor * u[i]; - rhs[p[i]] -= factor * rhs[i]; - } - - __syncthreads(); - rhs[first] /= d[first]; - - // forward sweep - for(auto i=first+1; i<last; ++i) { - rhs[i] -= u[i] * rhs[p[i]]; - rhs[i] /= d[i]; - } - } -} - -/// GPU implementatin of Hines matrix assembly -/// For a given time step size dt -/// - use the precomputed alpha and alpha_d values to construct the diagonal -/// and off diagonal of the symmetric Hines matrix. 
-/// - compute the RHS of the linear system to solve -template <typename T, typename I> -__global__ -void assemble_matrix(matrix_update_param_pack<T, I> params, T dt) { - auto tid = threadIdx.x + blockDim.x*blockIdx.x; - - T factor = 1e-3/dt; - if(tid < params.n) { - auto gi = factor * params.cv_capacitance[tid]; - - params.d[tid] = gi + params.invariant_d[tid]; - - params.rhs[tid] = gi*params.voltage[tid] - params.current[tid]; - } -} - -template <typename T, typename I, typename Stack> -__global__ -void test_thresholds( - float t, float t_prev, int size, - Stack& stack, - I* is_crossed, T* prev_values, - const I* index, const T* values, const T* thresholds) -{ - int i = threadIdx.x + blockIdx.x*blockDim.x; - - bool crossed = false; - float crossing_time; - - if (i<size) { - // Test for threshold crossing - const auto v_prev = prev_values[i]; - const auto v = values[index[i]]; - const auto thresh = thresholds[i]; - - if (!is_crossed[i]) { - if (v>=thresh) { - // The threshold has been passed, so estimate the time using - // linear interpolation - auto pos = (thresh - v_prev)/(v - v_prev); - crossing_time = t_prev + pos*(t - t_prev); - - is_crossed[i] = 1; - crossed = true; - } - } - else if (v<thresh) { - is_crossed[i]=0; - } - - prev_values[i] = v; - } - - if (crossed) { - stack.push_back({I(i), crossing_time}); - } -} - -} // namespace multicore -} // namespace mc -} // namespace nest diff --git a/src/backends/fvm_multicore.hpp b/src/backends/fvm_multicore.hpp deleted file mode 100644 index 8bddb9566490690856d54aa436409d8224d4bc4c..0000000000000000000000000000000000000000 --- a/src/backends/fvm_multicore.hpp +++ /dev/null @@ -1,286 +0,0 @@ - -#include <map> -#include <string> - -#include <common_types.hpp> -#include <mechanism.hpp> -#include <memory/memory.hpp> -#include <memory/wrappers.hpp> -#include <util/span.hpp> - -#include "stimulus_multicore.hpp" - -namespace nest { -namespace mc { -namespace multicore { - -struct backend { - /// define the real and index types - using value_type = double; - using size_type = nest::mc::cell_lid_type; - - /// define storage types - using array = memory::host_vector<value_type>; - using iarray = memory::host_vector<size_type>; - - using view = typename array::view_type; - using const_view = typename array::const_view_type; - - using iview = typename iarray::view_type; - using const_iview = typename iarray::const_view_type; - - using host_array = array; - using host_iarray = iarray; - - using host_view = view; - using host_iview = iview; - - /// matrix state - struct matrix_state { - const_iview p; - const_iview cell_index; - - array d; // [μS] - array u; // [μS] - array rhs; // [nA] - - array cv_capacitance; // [pF] - array face_conductance; // [μS] - - // the invariant part of the matrix diagonal - array invariant_d; // [μS] - - std::size_t size() const { return p.size(); } - - matrix_state() = default; - - matrix_state(const_iview p, const_iview cell_index): - p(p), cell_index(cell_index), - d(size()), u(size()), rhs(size()) - {} - - matrix_state(const_iview p, const_iview cell_index, array cap, array cond): - p(p), cell_index(cell_index), - d(size()), u(size()), rhs(size()), - cv_capacitance(std::move(cap)), - face_conductance(std::move(cond)) - { - auto n = d.size(); - invariant_d = array(n, 0); - for (auto i: util::make_span(1u, n)) { - auto gij = face_conductance[i]; - - u[i] = -gij; - invariant_d[i] += gij; - invariant_d[p[i]] += gij; - } - } - - // Assemble the matrix - // Afterwards the diagonal and RHS will have been set given dt, 
voltage and current - // dt [ms] - // voltage [mV] - // current [nA] - void assemble(value_type dt, const_view voltage, const_view current) { - EXPECTS(has_fvm_state()); - - auto n = d.size(); - value_type factor = 1e-3/dt; - for (auto i: util::make_span(0u, n)) { - auto gi = factor*cv_capacitance[i]; - - d[i] = gi + invariant_d[i]; - - rhs[i] = gi*voltage[i] - current[i]; - } - } - - void solve() { - const size_type ncells = cell_index.size()-1; - - // loop over submatrices - for (auto m: util::make_span(0, ncells)) { - auto first = cell_index[m]; - auto last = cell_index[m+1]; - - // backward sweep - for(auto i=last-1; i>first; --i) { - auto factor = u[i] / d[i]; - d[p[i]] -= factor * u[i]; - rhs[p[i]] -= factor * rhs[i]; - } - rhs[first] /= d[first]; - - // forward sweep - for(auto i=first+1; i<last; ++i) { - rhs[i] -= u[i] * rhs[p[i]]; - rhs[i] /= d[i]; - } - } - } - - // Test if the matrix has the full state required to assemble the - // matrix in the fvm scheme. - bool has_fvm_state() const { - return cv_capacitance.size()>0; - } - }; - - // - // mechanism infrastructure - // - using ion = mechanisms::ion<backend>; - - using mechanism = mechanisms::mechanism_ptr<backend>; - - using stimulus = mechanisms::multicore::stimulus<backend>; - - static mechanism make_mechanism( - const std::string& name, - view vec_v, view vec_i, - const std::vector<value_type>& weights, - const std::vector<size_type>& node_indices) - { - if (!has_mechanism(name)) { - throw std::out_of_range("no mechanism in database : " + name); - } - - return mech_map_.find(name)->second(vec_v, vec_i, array(weights), iarray(node_indices)); - } - - static bool has_mechanism(const std::string& name) { - return mech_map_.count(name)>0; - } - - static std::string name() { - return "cpu"; - } - - /// threshold crossing logic - /// used as part of spike detection back end - class threshold_watcher { - public: - /// stores a single crossing event - struct threshold_crossing { - size_type index; // index of variable - value_type time; // time of crossing - friend bool operator== ( - const threshold_crossing& lhs, const threshold_crossing& rhs) - { - return lhs.index==rhs.index && lhs.time==rhs.time; - } - }; - - threshold_watcher() = default; - - threshold_watcher( - const_view vals, - const std::vector<size_type>& indxs, - const std::vector<value_type>& thresh, - value_type t=0): - values_(vals), - index_(memory::make_const_view(indxs)), - thresholds_(memory::make_const_view(thresh)), - v_prev_(vals) - { - is_crossed_ = iarray(size()); - reset(t); - } - - /// Remove all stored crossings that were detected in previous calls - /// to the test() member function. - void clear_crossings() { - crossings_.clear(); - } - - /// Reset state machine for each detector. 
- /// Assume that the values in values_ have been set correctly before - /// calling, because the values are used to determine the initial state - void reset(value_type t=0) { - clear_crossings(); - for (auto i=0u; i<size(); ++i) { - is_crossed_[i] = values_[index_[i]]>=thresholds_[i]; - } - t_prev_ = t; - } - - const std::vector<threshold_crossing>& crossings() const { - return crossings_; - } - - /// The time at which the last test was performed - value_type last_test_time() const { - return t_prev_; - } - - /// Tests each target for changed threshold state - /// Crossing events are recorded for each threshold that - /// is crossed since the last call to test - void test(value_type t) { - for (auto i=0u; i<size(); ++i) { - auto v_prev = v_prev_[i]; - auto v = values_[index_[i]]; - auto thresh = thresholds_[i]; - if (!is_crossed_[i]) { - if (v>=thresh) { - // the threshold has been passed, so estimate the time using - // linear interpolation - auto pos = (thresh - v_prev)/(v - v_prev); - auto crossing_time = t_prev_ + pos*(t - t_prev_); - crossings_.push_back({i, crossing_time}); - - is_crossed_[i] = true; - } - } - else { - if (v<thresh) { - is_crossed_[i] = false; - } - } - - v_prev_[i] = v; - } - t_prev_ = t; - } - - bool is_crossed(size_type i) const { - return is_crossed_[i]; - } - - /// the number of threashold values that are being monitored - std::size_t size() const { - return index_.size(); - } - - /// Data type used to store the crossings. - /// Provided to make type-generic calling code. - using crossing_list = std::vector<threshold_crossing>; - - private: - const_view values_; - iarray index_; - - array thresholds_; - value_type t_prev_; - array v_prev_; - crossing_list crossings_; - iarray is_crossed_; - }; - - -private: - - using maker_type = mechanism (*)(view, view, array&&, iarray&&); - static std::map<std::string, maker_type> mech_map_; - - template <template <typename> class Mech> - static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) { - return mechanisms::make_mechanism<Mech<backend>> - (vec_v, vec_i, std::move(weights), std::move(node_indices)); - } -}; - -} // namespace multicore -} // namespace mc -} // namespace nest - diff --git a/src/backends/fvm_gpu.cu b/src/backends/gpu/fvm.cu similarity index 68% rename from src/backends/fvm_gpu.cu rename to src/backends/gpu/fvm.cu index d83705c64c10787ad628d5c765c3181775cc282a..d5c6debf0faf4e84852265e138a1eb8f290a33d3 100644 --- a/src/backends/fvm_gpu.cu +++ b/src/backends/gpu/fvm.cu @@ -1,9 +1,11 @@ -#include "fvm_gpu.hpp" +#include "fvm.hpp" #include <mechanisms/gpu/hh.hpp> #include <mechanisms/gpu/pas.hpp> #include <mechanisms/gpu/expsyn.hpp> #include <mechanisms/gpu/exp2syn.hpp> +#include <mechanisms/gpu/test_kin1.hpp> +#include <mechanisms/gpu/test_kinlva.hpp> namespace nest { namespace mc { @@ -14,7 +16,9 @@ backend::mech_map_ = { { "pas", maker<mechanisms::gpu::pas::mechanism_pas> }, { "hh", maker<mechanisms::gpu::hh::mechanism_hh> }, { "expsyn", maker<mechanisms::gpu::expsyn::mechanism_expsyn> }, - { "exp2syn", maker<mechanisms::gpu::exp2syn::mechanism_exp2syn> } + { "exp2syn", maker<mechanisms::gpu::exp2syn::mechanism_exp2syn> }, + { "test_kin1", maker<mechanisms::gpu::test_kin1::mechanism_test_kin1> }, + { "test_kinlva", maker<mechanisms::gpu::test_kinlva::mechanism_test_kinlva> } }; } // namespace multicore diff --git a/src/backends/gpu/fvm.hpp b/src/backends/gpu/fvm.hpp new file mode 100644 index 0000000000000000000000000000000000000000..80a8de85f8a1ba59e821fbe0cfd3e46637ea6386 
--- /dev/null +++ b/src/backends/gpu/fvm.hpp @@ -0,0 +1,89 @@ +#pragma once + +#include <map> +#include <string> + +#include <common_types.hpp> +#include <mechanism.hpp> +#include <memory/memory.hpp> + +#include "matrix_state_interleaved.hpp" +#include "matrix_state_flat.hpp" +#include "stimulus.hpp" +#include "threshold_watcher.hpp" + +namespace nest { +namespace mc { +namespace gpu { + +struct backend { + /// define the real and index types + using value_type = double; + using size_type = nest::mc::cell_lid_type; + + /// define storage types + using array = memory::device_vector<value_type>; + using iarray = memory::device_vector<size_type>; + + using view = typename array::view_type; + using const_view = typename array::const_view_type; + + using iview = typename iarray::view_type; + using const_iview = typename iarray::const_view_type; + + using host_array = typename memory::host_vector<value_type>; + using host_iarray = typename memory::host_vector<size_type>; + + using host_view = typename host_array::view_type; + using host_iview = typename host_iarray::const_view_type; + + static std::string name() { + return "gpu"; + } + + // matrix back end implementation + using matrix_state = matrix_state_interleaved<value_type, size_type>; + + // mechanism infrastructure + using ion = mechanisms::ion<backend>; + + using mechanism = mechanisms::mechanism_ptr<backend>; + + using stimulus = mechanisms::gpu::stimulus<backend>; + + static mechanism make_mechanism( + const std::string& name, + view vec_v, view vec_i, + const std::vector<value_type>& weights, + const std::vector<size_type>& node_indices) + { + if (!has_mechanism(name)) { + throw std::out_of_range("no mechanism in database : " + name); + } + + return mech_map_.find(name)-> + second(vec_v, vec_i, memory::make_const_view(weights), memory::make_const_view(node_indices)); + } + + static bool has_mechanism(const std::string& name) { + return mech_map_.count(name)>0; + } + + using threshold_watcher = + nest::mc::gpu::threshold_watcher<value_type, size_type>; + +private: + + using maker_type = mechanism (*)(view, view, array&&, iarray&&); + static std::map<std::string, maker_type> mech_map_; + + template <template <typename> class Mech> + static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) { + return mechanisms::make_mechanism<Mech<backend>> + (vec_v, vec_i, std::move(weights), std::move(node_indices)); + } +}; + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu_intrinsics.hpp b/src/backends/gpu/intrinsics.hpp similarity index 100% rename from src/backends/gpu_intrinsics.hpp rename to src/backends/gpu/intrinsics.hpp diff --git a/src/backends/gpu/kernels/assemble_matrix.hpp b/src/backends/gpu/kernels/assemble_matrix.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8524846859e3e657a6de13cfa5d71beccac35946 --- /dev/null +++ b/src/backends/gpu/kernels/assemble_matrix.hpp @@ -0,0 +1,104 @@ +#pragma once + +#include "detail.hpp" + +namespace nest { +namespace mc { +namespace gpu { + +/// GPU implementation of Hines matrix assembly +/// Flat layout +/// For a given time step size dt +/// - use the precomputed alpha and alpha_d values to construct the diagonal +/// and off diagonal of the symmetric Hines matrix. 
+/// - compute the RHS of the linear system to solve +template <typename T, typename I> +__global__ +void assemble_matrix_flat( + T* d, T* rhs, const T* invariant_d, + const T* voltage, const T* current, const T* cv_capacitance, + T dt, unsigned n) +{ + const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x; + + // The 1e-3 is a constant of proportionality required to ensure that the + // conductance (gi) values have units μS (micro-Siemens). + // See the model documentation in docs/model for more information. + T factor = 1e-3/dt; + if (tid<n) { + auto gi = factor * cv_capacitance[tid]; + d[tid] = gi + invariant_d[tid]; + rhs[tid] = gi*voltage[tid] - current[tid]; + } +} + +/// GPU implementation of Hines matrix assembly +/// Interleaved layout +/// For a given time step size dt +/// - use the precomputed alpha and alpha_d values to construct the diagonal +/// and off diagonal of the symmetric Hines matrix. +/// - compute the RHS of the linear system to solve +template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned Threads> +__global__ +void assemble_matrix_interleaved( + T* d, + T* rhs, + const T* invariant_d, + const T* voltage, + const T* current, + const T* cv_capacitance, + const I* sizes, + const I* starts, + T dt, unsigned padded_size, unsigned num_mtx) +{ + static_assert(BlockWidth*LoadWidth==Threads, + "number of threads must equal number of values to process per block"); + __shared__ T buffer_v[Threads]; + __shared__ T buffer_i[Threads]; + + const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x; + const unsigned lid = threadIdx.x; + + const unsigned mtx_id = tid/LoadWidth; + const unsigned mtx_lane = tid - mtx_id*LoadWidth; + + const unsigned blk_id = tid/(BlockWidth*LoadWidth); + const unsigned blk_row = lid/BlockWidth; + const unsigned blk_lane = lid - blk_row*BlockWidth; + + const unsigned blk_pos = LoadWidth*blk_lane + blk_row; + + const bool do_load = mtx_id<num_mtx; + + unsigned load_pos = do_load? starts[mtx_id] + mtx_lane : 0; + const unsigned end = do_load? starts[mtx_id] + sizes[mtx_id]: 0; + unsigned store_pos = blk_id*BlockWidth*padded_size + (blk_row*BlockWidth + blk_lane); + + const unsigned max_size = sizes[0]; + + // The 1e-3 is a constant of proportionality required to ensure that the + // conductance (gi) values have units μS (micro-Siemens). + // See the model documentation in docs/model for more information. 
+ T factor = 1e-3/dt; + for (unsigned j=0u; j<max_size; j+=LoadWidth) { + if (do_load && load_pos<end) { + buffer_v[lid] = voltage[load_pos]; + buffer_i[lid] = current[load_pos]; + } + + __syncthreads(); + + if (j+blk_row<padded_size) { + const auto gi = factor * cv_capacitance[store_pos]; + d[store_pos] = gi + invariant_d[store_pos]; + rhs[store_pos] = gi*buffer_v[blk_pos] - buffer_i[blk_pos]; + } + + store_pos += LoadWidth*BlockWidth; + load_pos += LoadWidth; + } +} + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu/kernels/detail.hpp b/src/backends/gpu/kernels/detail.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8f3993faee6d47af106d4f05a1e33783b1befc18 --- /dev/null +++ b/src/backends/gpu/kernels/detail.hpp @@ -0,0 +1,82 @@ +#pragma once + +#include <cstdint> +#include <cfloat> +#include <climits> + +#include <iostream> +#include <limits> +#include <string> +#include <vector> + +namespace nest { +namespace mc { +namespace gpu { + +namespace impl { +// Number of matrices per block in block-interleaved storage +__host__ __device__ +constexpr inline unsigned block_dim() { + return 32u; +} + +// The number of threads per matrix in the interleave and reverse-interleave +// operations. +__host__ __device__ +constexpr inline unsigned load_width() { + return 32u; +} + +// The alignment of matrices inside the block-interleaved storage. +__host__ __device__ +constexpr inline unsigned matrix_padding() { + return load_width(); +} + +// Number of threads per warp. +// This has always been 32, however it may change in future NVIDIA GPUs. +__host__ __device__ +constexpr inline unsigned threads_per_warp() { + return 32u; +} + +// The minimum number of bins required to store n values when the bins have +// dimension block_size. +__host__ __device__ +constexpr inline unsigned block_count(unsigned n, unsigned block_size) { + return (n+block_size-1)/block_size; +} + +// The smallest size of a buffer required to store n items such that the +// size of the buffer is a multiple of block_dim. +constexpr inline unsigned padded_size(unsigned n, unsigned block_dim) { + return n%block_dim ? n+block_dim-(n%block_dim): n; +} + +// Placeholders used to mark padded locations in data structures that use +// padding. Using such markers makes it easier to test that padding is +// performed correctly. 
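+ +// Worked example (editorial illustration; the values below are assumptions, +// not part of the original header): with the defaults above, a group of 40 +// matrices occupies block_count(40, block_dim()) == 2 blocks, and a matrix of +// size 40 is padded to padded_size(40, matrix_padding()) == 64 entries per +// vector. Padded entries are filled with the npos<T>() markers defined below. 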
+template <typename T> __host__ __device__ constexpr T npos(); +template <> __host__ __device__ constexpr char npos<char>() { return CHAR_MAX; } +template <> __host__ __device__ constexpr unsigned char npos<unsigned char>() { return UCHAR_MAX; } +template <> __host__ __device__ constexpr short npos<short>() { return SHRT_MAX; } +template <> __host__ __device__ constexpr int npos<int>() { return INT_MAX; } +template <> __host__ __device__ constexpr long npos<long>() { return LONG_MAX; } +template <> __host__ __device__ constexpr float npos<float>() { return FLT_MAX; } +template <> __host__ __device__ constexpr double npos<double>() { return DBL_MAX; } +template <> __host__ __device__ constexpr unsigned short npos<unsigned short>() { return USHRT_MAX; } +template <> __host__ __device__ constexpr unsigned int npos<unsigned int>() { return UINT_MAX; } +template <> __host__ __device__ constexpr unsigned long npos<unsigned long>() { return ULONG_MAX; } +template <> __host__ __device__ constexpr long long npos<long long>() { return LLONG_MAX; } + +// test if value v is npos +template <typename T> +__host__ __device__ +constexpr bool is_npos(T v) { + return v == npos<T>(); +} + +} // namespace impl + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu/kernels/interleave.hpp b/src/backends/gpu/kernels/interleave.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3762368af4c2575f3e7e25b11467837cf3264872 --- /dev/null +++ b/src/backends/gpu/kernels/interleave.hpp @@ -0,0 +1,134 @@ +#pragma once + +#include "detail.hpp" + +namespace nest { +namespace mc { +namespace gpu { + +/////////////////////////////////////////////////////////////////////////////// +// For more information about the interleaved and flat storage formats for +// Hines matrices, see src/backends/matrix_storage.md +/////////////////////////////////////////////////////////////////////////////// + + +// Data in a vector is to be interleaved into blocks of width BlockWidth. +// The kernel assigns LoadWidth threads to each lane in the block. +// +// Note that all indexes can reasonably be represented by an unsigned 32-bit +// integer, so we use unsigned explicitly. +template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned Threads> +__global__ +void flat_to_interleaved( + const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec) +{ + static_assert(BlockWidth*LoadWidth==Threads, ""); + + __shared__ T buffer[Threads]; + + const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x; + const unsigned lid = threadIdx.x; + + const unsigned mtx_id = tid/LoadWidth; + const unsigned mtx_lane = tid - mtx_id*LoadWidth; + + const unsigned blk_id = tid/(BlockWidth*LoadWidth); + const unsigned blk_row = lid/BlockWidth; + const unsigned blk_lane = lid - blk_row*BlockWidth; + + const unsigned blk_pos = LoadWidth*blk_lane + blk_row; + + const bool do_load = mtx_id<num_vec; + + // only threads that participate in loading access starts and sizes arrays + unsigned load_pos = do_load? starts[mtx_id] + mtx_lane : 0u; + const unsigned end = do_load? 
starts[mtx_id] + sizes[mtx_id]: 0u; + unsigned store_pos = blk_id*BlockWidth*padded_size + (blk_row*BlockWidth + blk_lane); + + for (unsigned i=0u; i<padded_size; i+=LoadWidth) { + auto loaded = impl::npos<T>(); + if (do_load && load_pos<end) { + loaded = in[load_pos]; + } + buffer[lid] = loaded; + __syncthreads(); + if (i+blk_row<padded_size) { + out[store_pos] = buffer[blk_pos]; + } + load_pos += LoadWidth; + store_pos += LoadWidth*BlockWidth; + } +} + +// Note that all indexes can reasonably be represented by an unsigned 32-bit +// integer, so we use unsigned explicitly. +template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned THREADS> +__global__ +void interleaved_to_flat( + const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec) +{ + static_assert(BlockWidth*LoadWidth==THREADS, ""); + + __shared__ T buffer[THREADS]; + + const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x; + const unsigned lid = threadIdx.x; + + const unsigned mtx_id = tid/LoadWidth; + const unsigned mtx_lane = tid - mtx_id*LoadWidth; + + const unsigned blk_id = tid/(BlockWidth*LoadWidth); + const unsigned blk_row = lid/BlockWidth; + const unsigned blk_lane = lid - blk_row*BlockWidth; + + const unsigned blk_pos = LoadWidth*blk_lane + blk_row; + + const bool do_store = mtx_id<num_vec; + + // only threads that participate in storing access starts and sizes arrays + unsigned store_pos = do_store? starts[mtx_id] + mtx_lane : 0u; + const unsigned end = do_store? starts[mtx_id] + sizes[mtx_id]: 0u; + unsigned load_pos = blk_id*BlockWidth*padded_size + (blk_row*BlockWidth + blk_lane); + + for (unsigned i=0u; i<padded_size; i+=LoadWidth) { + auto loaded = impl::npos<T>(); + if (i+blk_row<padded_size) { + loaded = in[load_pos]; + } + buffer[blk_pos] = loaded; + __syncthreads(); + if (do_store && store_pos<end) { + out[store_pos] = buffer[lid]; + } + load_pos += LoadWidth*BlockWidth; + store_pos += LoadWidth; + } +} + +// host side wrapper for the flat to interleaved operation +template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth> +void flat_to_interleaved( + const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec) +{ + constexpr unsigned Threads = BlockWidth*LoadWidth; + const unsigned blocks = impl::block_count(num_vec, BlockWidth); + + flat_to_interleaved<T, I, BlockWidth, LoadWidth, Threads> + <<<blocks, Threads>>> (in, out, sizes, starts, padded_size, num_vec); +} + +// host side wrapper for the interleave to flat operation +template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth> +void interleaved_to_flat( + const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec) +{ + constexpr unsigned Threads = BlockWidth*LoadWidth; + const unsigned blocks = impl::block_count(num_vec, BlockWidth); + + interleaved_to_flat<T, I, BlockWidth, LoadWidth, Threads> + <<<blocks, Threads>>> (in, out, sizes, starts, padded_size, num_vec); +} + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu/kernels/solve_matrix.hpp b/src/backends/gpu/kernels/solve_matrix.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2cfa5f105f54f3a0e0e9c357f8dc3ad6ea7ee4b5 --- /dev/null +++ b/src/backends/gpu/kernels/solve_matrix.hpp @@ -0,0 +1,78 @@ +#pragma once + +#include "detail.hpp" + +namespace nest { +namespace mc { +namespace gpu { + +/// GPU implementation of Hines Matrix solver. 
+/// Flat format +template <typename T, typename I> +__global__ +void solve_matrix_flat( + T* rhs, T* d, const T* u, const I* p, const I* cell_index, int num_mtx) +{ + auto tid = threadIdx.x + blockDim.x*blockIdx.x; + + if (tid<num_mtx) { + // get range of this thread's cell matrix + const auto first = cell_index[tid]; + const auto last = cell_index[tid+1]; + + // backward sweep + for(auto i=last-1; i>first; --i) { + auto factor = u[i] / d[i]; + d[p[i]] -= factor * u[i]; + rhs[p[i]] -= factor * rhs[i]; + } + rhs[first] /= d[first]; + + // forward sweep + for(auto i=first+1; i<last; ++i) { + rhs[i] -= u[i] * rhs[p[i]]; + rhs[i] /= d[i]; + } + } +} + +/// GPU implementation of Hines Matrix solver. +/// Block-interleaved format +template <typename T, typename I, int BlockWidth> +__global__ +void solve_matrix_interleaved( + T* rhs, T* d, const T* u, const I* p, const I* sizes, int padded_size, int num_mtx) +{ + auto tid = threadIdx.x + blockDim.x*blockIdx.x; + + if (tid<num_mtx) { + const auto block = tid/BlockWidth; + const auto block_start = block*BlockWidth; + const auto block_lane = tid - block_start; + + // get range of this thread's cell matrix + const auto first = block_start*padded_size + block_lane; + const auto last = first + BlockWidth*(sizes[tid]-1); + const auto last_max = first + BlockWidth*(sizes[block_start]-1); + + // backward sweep + for(auto i=last_max; i>first; i-=BlockWidth) { + if (i<=last) { + auto factor = u[i] / d[i]; + d[p[i]] -= factor * u[i]; + rhs[p[i]] -= factor * rhs[i]; + } + } + rhs[first] /= d[first]; + + // forward sweep + for(auto i=first+BlockWidth; i<=last; i+=BlockWidth) { + rhs[i] -= u[i] * rhs[p[i]]; + rhs[i] /= d[i]; + } + } +} + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu/kernels/test_thresholds.hpp b/src/backends/gpu/kernels/test_thresholds.hpp new file mode 100644 index 0000000000000000000000000000000000000000..946030704607b87bad5531c8530f671fe52a48d8 --- /dev/null +++ b/src/backends/gpu/kernels/test_thresholds.hpp @@ -0,0 +1,61 @@ +#pragma once + +namespace nest { +namespace mc { +namespace gpu { + +/// Kernel used to test for threshold crossings. 
+/// params: +/// t : current time (ms) +/// t_prev : time of last test (ms) +/// size : number of values to test +/// is_crossed : crossing state at time t_prev (true or false) +/// prev_values : values at sample points (see index) sampled at t_prev +/// index : index with locations in values to test for crossing +/// values : values at the current time t +/// thresholds : threshold values to watch for crossings +template <typename T, typename I, typename Stack> +__global__ +void test_thresholds( + float t, float t_prev, int size, + Stack& stack, + I* is_crossed, T* prev_values, + const I* index, const T* values, const T* thresholds) +{ + int i = threadIdx.x + blockIdx.x*blockDim.x; + + bool crossed = false; + float crossing_time; + + if (i<size) { + // Test for threshold crossing + const auto v_prev = prev_values[i]; + const auto v = values[index[i]]; + const auto thresh = thresholds[i]; + + if (!is_crossed[i]) { + if (v>=thresh) { + // The threshold has been passed, so estimate the time using + // linear interpolation + auto pos = (thresh - v_prev)/(v - v_prev); + crossing_time = t_prev + pos*(t - t_prev); + + is_crossed[i] = 1; + crossed = true; + } + } + else if (v<thresh) { + is_crossed[i]=0; + } + + prev_values[i] = v; + } + + if (crossed) { + stack.push_back({I(i), crossing_time}); + } +} + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu/matrix_state_flat.hpp b/src/backends/gpu/matrix_state_flat.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d214079f7f5841a43c6f6c52cf9992e3e0a635e3 --- /dev/null +++ b/src/backends/gpu/matrix_state_flat.hpp @@ -0,0 +1,118 @@ +#pragma once + +#include <memory/memory.hpp> +#include <util/span.hpp> +#include <util/rangeutil.hpp> + +#include "kernels/solve_matrix.hpp" +#include "kernels/assemble_matrix.hpp" + +namespace nest { +namespace mc { +namespace gpu { + +/// matrix state +template <typename T, typename I> +struct matrix_state_flat { + using value_type = T; + using size_type = I; + + using array = memory::device_vector<value_type>; + using iarray = memory::device_vector<size_type>; + + using view = typename array::view_type; + using const_view = typename array::const_view_type; + + iarray parent_index; + iarray cell_index; + + array d; // [μS] + array u; // [μS] + array rhs; // [nA] + + array cv_capacitance; // [pF] + array face_conductance; // [μS] + + // the invariant part of the matrix diagonal + array invariant_d; // [μS] + + // interface for exposing the solution to the outside world + view solution; + + matrix_state_flat() = default; + + matrix_state_flat(const std::vector<size_type>& p, + const std::vector<size_type>& cell_idx, + const std::vector<value_type>& cv_cap, + const std::vector<value_type>& face_cond): + parent_index(memory::make_const_view(p)), + cell_index(memory::make_const_view(cell_idx)), + d(p.size()), + u(p.size()), + rhs(p.size()), + cv_capacitance(memory::make_const_view(cv_cap)) + { + EXPECTS(cv_cap.size() == size()); + EXPECTS(face_cond.size() == size()); + EXPECTS(cell_idx.back() == size()); + EXPECTS(cell_idx.size() > 2u); + + using memory::make_const_view; + + auto n = d.size(); + std::vector<value_type> invariant_d_tmp(n, 0); + std::vector<value_type> u_tmp(n, 0); + + for (auto i: util::make_span(1u, n)) { + auto gij = face_cond[i]; + + u_tmp[i] = -gij; + invariant_d_tmp[i] += gij; + invariant_d_tmp[p[i]] += gij; + } + invariant_d = make_const_view(invariant_d_tmp); + u = make_const_view(u_tmp); + + solution = rhs; + } + + // Assemble the matrix + // 
Afterwards the diagonal and RHS will have been set given dt, voltage and current + // dt [ms] + // voltage [mV] + // current [nA] + void assemble(value_type dt, const_view voltage, const_view current) { + // determine the grid dimensions for the kernel + auto const n = voltage.size(); + auto const block_dim = 128; + auto const grid_dim = impl::block_count(n, block_dim); + + assemble_matrix_flat<value_type, size_type><<<grid_dim, block_dim>>> ( + d.data(), rhs.data(), invariant_d.data(), voltage.data(), + current.data(), cv_capacitance.data(), dt, size()); + } + + void solve() { + // determine the grid dimensions for the kernel + auto const block_dim = 128; + auto const grid_dim = impl::block_count(num_matrices(), block_dim); + + // perform solve on gpu + solve_matrix_flat<value_type, size_type><<<grid_dim, block_dim>>> ( + rhs.data(), d.data(), u.data(), parent_index.data(), + cell_index.data(), num_matrices()); + } + + std::size_t size() const { + return parent_index.size(); + } + +private: + unsigned num_matrices() const { + return cell_index.size()-1; + } +}; + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu/matrix_state_interleaved.hpp b/src/backends/gpu/matrix_state_interleaved.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2f1888395dbaf7624edef3813ad1a9984170e697 --- /dev/null +++ b/src/backends/gpu/matrix_state_interleaved.hpp @@ -0,0 +1,256 @@ +#pragma once + +#include <memory/memory.hpp> +#include <util/span.hpp> +#include <util/rangeutil.hpp> + +#include "kernels/solve_matrix.hpp" +#include "kernels/assemble_matrix.hpp" +#include "kernels/interleave.hpp" + +namespace nest { +namespace mc { +namespace gpu { + +// A helper that performs the interleave operation on host memory. +template <typename T, typename I> +std::vector<T> flat_to_interleaved( + const std::vector<T>& in, + const std::vector<I>& sizes, + const std::vector<I>& starts, + unsigned block_width, unsigned num_vec, unsigned padded_length) +{ + auto num_blocks = impl::block_count(num_vec, block_width); + std::vector<T> out(num_blocks*block_width*padded_length, impl::npos<T>()); + for (auto mtx: util::make_span(0u, num_vec)) { + auto block = mtx/block_width; + auto lane = mtx%block_width; + + auto len = sizes[mtx]; + auto src = starts[mtx]; + auto dst = block*(block_width*padded_length) + lane; + for (auto i: util::make_span(0, len)) { + out[dst] = in[src+i]; + dst += block_width; + } + } + return out; +}; + +/// matrix state +template <typename T, typename I> +struct matrix_state_interleaved { + using value_type = T; + using size_type = I; + + using array = memory::device_vector<value_type>; + using iarray = memory::device_vector<size_type>; + + using const_view = typename array::const_view_type; + + // Permutation and index information required for forward and backward + // interleave-permutation of vectors. + + // size of each matrix (after permutation in ascending size) + iarray matrix_sizes; + // start values corresponding to matrix i in the external storage + iarray matrix_index; + + // Storage for the matrix and parent index in interleaved format. + // Includes the cv_capacitance, which is required for matrix assembly. 
+ + iarray parent_index; + array d; // [μS] + array u; // [μS] + array rhs; // [nA] + + // required for matrix assembly + array cv_capacitance; // [pF] + + // the invariant part of the matrix diagonal + array invariant_d; // [μS] + + // the length of a vector required to store values for one + // matrix with padding + unsigned padded_size; + + // Storage for solution in uninterleaved format. + // Used to hold the storage for passing to caller, and must be updated + // after each call to the ::solve() method. + array solution; + + // default constructor + matrix_state_interleaved() = default; + + // Construct matrix state for a set of matrices defined by parent_index p + // The matrix solver stores the matrix in an "interleaved" structure for + // optimal solution, which requires a significant amount of precomputing + // of indexes and data structures in the constructor. + // cv_cap // [pF] + // face_cond // [μS] + matrix_state_interleaved(const std::vector<size_type>& p, + const std::vector<size_type>& cell_idx, + const std::vector<value_type>& cv_cap, + const std::vector<value_type>& face_cond) + { + EXPECTS(cv_cap.size() == p.size()); + EXPECTS(face_cond.size() == p.size()); + EXPECTS(cell_idx.back() == p.size()); + + // Just because you never know. + EXPECTS(cell_idx.size() <= UINT_MAX); + + using util::make_span; + + // Convenience for commonly used type in this routine. + using svec = std::vector<size_type>; + + // + // Sort matrices in descending order of size. + // + + // Find the size of each matrix. + const auto num_mtx = cell_idx.size()-1; + svec sizes; + for (auto it=cell_idx.begin()+1; it!=cell_idx.end(); ++it) { + sizes.push_back(*it - *(it-1)); + } + + // Find permutations and sort indexes/sizes. + svec perm(num_mtx); + std::iota(perm.begin(), perm.end(), 0); + // calculate the permutation of matrices to put them in ascending order of size + util::stable_sort_by(perm, [&sizes](size_type i){ return sizes[i]; }); + std::reverse(perm.begin(), perm.end()); + + // TODO: refactor to be less verbose with permutation_view + svec sizes_p; + for (auto i: make_span(0, num_mtx)) { + sizes_p.push_back(sizes[perm[i]]); + } + svec cell_index_p; + for (auto i: make_span(0, num_mtx)) { + cell_index_p.push_back(cell_idx[perm[i]]); + } + + // + // Calculate dimensions required to store matrices. + // + using impl::block_dim; + using impl::matrix_padding; + + // To start, take simplest approach of assuming all matrices stored + // in blocks of the same dimension: padded_size + padded_size = impl::padded_size(sizes_p[0], matrix_padding()); + const auto num_blocks = impl::block_count(num_mtx, block_dim()); + + const auto total_storage = num_blocks*block_dim()*padded_size; + + // calculate the interleaved and permuted p vector + constexpr auto npos = std::numeric_limits<size_type>::max(); + std::vector<size_type> p_tmp(total_storage, npos); + for (auto mtx: make_span(0, num_mtx)) { + auto block = mtx/block_dim(); + auto lane = mtx%block_dim(); + + auto len = sizes_p[mtx]; + auto src = cell_index_p[mtx]; + auto dst = block*(block_dim()*padded_size) + lane; + for (auto i: make_span(0, len)) { + // the p indexes are always relative to the start of the p vector. + // the addition and subtraction of dst and src respectively is to convert from + // the original offset to the new padded and permuted offset. 
+ p_tmp[dst+block_dim()*i] = dst + block_dim()*(p[src+i]-src); + } + } + + d = array(total_storage); + u = array(total_storage); + rhs = array(total_storage); + parent_index = memory::make_const_view(p_tmp); + + // + // Calculate the invariant part of the matrix diagonal and the + // upper diagonal on the host, then copy to the device. + // + + std::vector<value_type> invariant_d_tmp(p.size(), 0); + std::vector<value_type> u_tmp(p.size(), 0); + auto face_conductance_tmp = memory::on_host(face_cond); + for (auto i: util::make_span(1u, p.size())) { + auto gij = face_conductance_tmp[i]; + + u_tmp[i] = -gij; + invariant_d_tmp[i] += gij; + invariant_d_tmp[p[i]] += gij; + } + + // Helper that converts to interleaved format on the host, then copies to device + // memory, for use as an rvalue in an assignment to a device vector. + auto interleave = [&] (const std::vector<T>& x) { + return memory::on_gpu( + flat_to_interleaved(x, sizes_p, cell_index_p, block_dim(), num_mtx, padded_size)); + }; + u = interleave(u_tmp); + invariant_d = interleave(invariant_d_tmp); + cv_capacitance = interleave(cv_cap); + + matrix_sizes = memory::make_const_view(sizes_p); + matrix_index = memory::make_const_view(cell_index_p); + + // Allocate space for storing the un-interleaved solution. + solution = array(p.size()); + } + + // Assemble the matrix + // Afterwards the diagonal and RHS will have been set given dt, voltage and current + // dt [ms] + // voltage [mV] + // current [nA] + void assemble(value_type dt, const_view voltage, const_view current) { + constexpr auto bd = impl::block_dim(); + constexpr auto lw = impl::load_width(); + constexpr auto block_dim = bd*lw; + + // The number of threads is threads_per_matrix*num_mtx + const auto num_blocks = impl::block_count(num_matrices()*lw, block_dim); + + assemble_matrix_interleaved<value_type, size_type, bd, lw, block_dim> + <<<num_blocks, block_dim>>> + (d.data(), rhs.data(), invariant_d.data(), + voltage.data(), current.data(), cv_capacitance.data(), + matrix_sizes.data(), matrix_index.data(), + dt, padded_matrix_size(), num_matrices()); + + } + + void solve() { + // Perform the Hines solve. + auto const grid_dim = impl::block_count(num_matrices(), impl::block_dim()); + solve_matrix_interleaved<value_type, size_type, impl::block_dim()> + <<<grid_dim, impl::block_dim()>>> + (rhs.data(), d.data(), u.data(), parent_index.data(), matrix_sizes.data(), + padded_matrix_size(), num_matrices()); + + // Copy the solution from interleaved to front end storage. + interleaved_to_flat<value_type, size_type, impl::block_dim(), impl::load_width()> + (rhs.data(), solution.data(), matrix_sizes.data(), matrix_index.data(), + padded_matrix_size(), num_matrices()); + } + +private: + + // The number of matrices stored in the matrix state. + unsigned num_matrices() const { + return matrix_sizes.size(); + } + + // The full padded matrix size + unsigned padded_matrix_size() const { + return padded_size; + } +}; + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/gpu_stack.hpp b/src/backends/gpu/stack.hpp similarity index 97% rename from src/backends/gpu_stack.hpp rename to src/backends/gpu/stack.hpp index 7767b9e535d22aa05d23b7c7bd18690028f95ca0..2cad1d10ba55b4189c7a438e3bed4401d88da2c4 100644 --- a/src/backends/gpu_stack.hpp +++ b/src/backends/gpu/stack.hpp @@ -20,7 +20,7 @@ namespace gpu { // It is designed to be initialized empty with a given capacity on the host, // updated by device kernels, and periodically read and reset from the host side. 
template <typename T> -class gpu_stack { +class stack { using value_type = T; using allocator = memory::managed_allocator<value_type>; @@ -37,13 +37,13 @@ class gpu_stack { public: - gpu_stack(unsigned capacity): + stack(unsigned capacity): capacity_(capacity), size_(0u) { data_ = allocator().allocate(capacity_); } - ~gpu_stack() { + ~stack() { allocator().deallocate(data_, capacity_); } diff --git a/src/backends/stimulus_gpu.hpp b/src/backends/gpu/stimulus.hpp similarity index 87% rename from src/backends/stimulus_gpu.hpp rename to src/backends/gpu/stimulus.hpp index 3d2ac907fc89e9d2bd0ac188c980418c870b2706..6cc41703dba92be14fd0a87cad59ccf20da0691f 100644 --- a/src/backends/stimulus_gpu.hpp +++ b/src/backends/gpu/stimulus.hpp @@ -7,24 +7,14 @@ #include <algorithms.hpp> #include <util/pprintf.hpp> +#include "intrinsics.hpp" + namespace nest{ namespace mc{ namespace mechanisms { namespace gpu { namespace kernels { - __device__ - inline double atomicAdd(double* address, double val) { - using I = unsigned long long int; - I* address_as_ull = (I*)address; - I old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); - } - template <typename T, typename I> __global__ void stim_current( @@ -40,7 +30,7 @@ namespace kernels { if (t>=delay[i] && t<(delay[i]+duration[i])) { // use subtraction because the electrode currents are specified // in terms of current into the compartment - atomicAdd(current+node_index[i], -amplitude[i]); + cuda_atomic_add(current+node_index[i], -amplitude[i]); } } } diff --git a/src/backends/gpu/threshold_watcher.hpp b/src/backends/gpu/threshold_watcher.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e4624c077239eff3a5a74cda42c60939b1ea8ae8 --- /dev/null +++ b/src/backends/gpu/threshold_watcher.hpp @@ -0,0 +1,148 @@ +#pragma once + +#include <common_types.hpp> +#include <memory/memory.hpp> +#include <memory/managed_ptr.hpp> +#include <util/span.hpp> + +#include "stack.hpp" +#include "kernels/test_thresholds.hpp" + +namespace nest { +namespace mc { +namespace gpu { + +/// threshold crossing logic +/// used as part of spike detection back end +template <typename T, typename I> +class threshold_watcher { +public: + using value_type = T; + using size_type = I; + + using array = memory::device_vector<T>; + using iarray = memory::device_vector<I>; + using const_view = typename array::const_view_type; + + /// stores a single crossing event + struct threshold_crossing { + size_type index; // index of variable + value_type time; // time of crossing + __host__ __device__ + friend bool operator== + (const threshold_crossing& lhs, const threshold_crossing& rhs) + { + return lhs.index==rhs.index && lhs.time==rhs.time; + } + }; + + using stack_type = stack<threshold_crossing>; + + threshold_watcher() = default; + + threshold_watcher( + const_view values, + const std::vector<size_type>& index, + const std::vector<value_type>& thresh, + value_type t=0): + values_(values), + index_(memory::make_const_view(index)), + thresholds_(memory::make_const_view(thresh)), + prev_values_(values), + is_crossed_(size()), + stack_(memory::make_managed_ptr<stack_type>(10*size())) + { + reset(t); + } + + /// Remove all stored crossings that were detected in previous calls + /// to test() + void clear_crossings() { + stack_->clear(); + } + + /// Reset state machine for each detector. 
+ /// Assume that the values in values_ have been set correctly before + /// calling, because the values are used to determine the initial state + void reset(value_type t=0) { + clear_crossings(); + + // Make host-side copies of the information needed to calculate + // the initial crossed state + auto values = memory::on_host(values_); + auto thresholds = memory::on_host(thresholds_); + auto index = memory::on_host(index_); + + // calculate the initial crossed state in host memory + auto crossed = std::vector<size_type>(size()); + for (auto i: util::make_span(0u, size())) { + crossed[i] = values[index[i]] < thresholds[i] ? 0 : 1; + } + + // copy the initial crossed state to device memory + is_crossed_ = memory::on_gpu(crossed); + + // reset time of last test + t_prev_ = t; + } + + bool is_crossed(size_type i) const { + return is_crossed_[i]; + } + + const std::vector<threshold_crossing> crossings() const { + return std::vector<threshold_crossing>(stack_->begin(), stack_->end()); + } + + /// The time at which the last test was performed + value_type last_test_time() const { + return t_prev_; + } + + /// Tests each target for changed threshold state. + /// Crossing events are recorded for each threshold that has been + /// crossed between the last time the test was performed and the + /// current time t. + void test(value_type t) { + EXPECTS(t_prev_<t); + + constexpr int block_dim = 128; + const int grid_dim = (size()+block_dim-1)/block_dim; + test_thresholds<<<grid_dim, block_dim>>>( + t, t_prev_, size(), + *stack_, + is_crossed_.data(), prev_values_.data(), + index_.data(), values_.data(), thresholds_.data()); + + // Check that the number of spikes has not exceeded + // the capacity of the stack. + EXPECTS(stack_->size() <= stack_->capacity()); + + t_prev_ = t; + } + + /// the number of threshold values that are being monitored + std::size_t size() const { + return index_.size(); + } + + /// Data type used to store the crossings. + /// Provided to make type-generic calling code. + using crossing_list = std::vector<threshold_crossing>; + +private: + + const_view values_; // values to watch: on gpu + iarray index_; // indexes of values to watch: on gpu + + array thresholds_; // threshold for each watch: on gpu + value_type t_prev_; // time of previous sample: on host + array prev_values_; // values at previous sample time: on host + iarray is_crossed_; // bool flag for state of each watch: on gpu + + memory::managed_ptr<stack_type> stack_; +}; + +} // namespace gpu +} // namespace mc +} // namespace nest diff --git a/src/backends/matrix_storage.md b/src/backends/matrix_storage.md new file mode 100644 index 0000000000000000000000000000000000000000..d6ed40c2cf9121dc480f1a8604906e2e96b18166 --- /dev/null +++ b/src/backends/matrix_storage.md @@ -0,0 +1,108 @@ +# Flat and Interleaved Matrix Storage + +This document describes the layout of the different storage schemes for the matrices used in the GPU back end. + +An NxN Hines matrix can be stored compactly with 3 vectors of length N: + * `d`: the diagonal of the matrix + * `u`: the upper/lower part of the matrix (referred to somewhat casually as the super diagonal) + * `p`: the parent index +Additionally, we often store Nx1 vectors that have one value per compartment, e.g. voltage, solution or rhs vectors. + +In NestMC each multicompartment cell has a matrix structure associated with it, derived directly from the connections between its constituent compartments. NestMC collects these cells into groups called `cell_groups`. 
+The matrices for all the cells in a group are packed together.
+
+The matrix packing applies the same packing operation to each vector associated with a matrix, i.e. to the `u`, `d` and `p` vectors, and likewise to the associated solution and voltage vectors.
+
+In this discussion we use a simple example group of matrices to illustrate the storage methods, because an example is clearer than a formal description:
+  * 7 vectors labeled `{a, b, c, d, e, f, g}`
+  * the vectors have respective lengths `{8, 7, 6, 6, 5, 5, 3}`
+  * the `i`th value in vector `a` is labelled `ai`
+
+## Flat storage
+
+Take a vector `vals` containing the values:
+
+```
+vals = [a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 | c0 c1 c2 c3 c4 c5 | d0 d1 d2 d3 d4 d5 | e0 e1 e2 e3 e4 | f0 f1 f2 f3 f4 | g0 g1 g2 ]
+```
+
+To fully describe the set of matrices we need an index vector of length `#matrices+1`:
+
+```
+indx = [0, 8, 15, 21, 27, 32, 37, 40]
+```
+
+To look up the value of the `i`th entry in vector `m`, we use the following formula to calculate its index:
+
+```
+lookup_flt(i,m) = indx[m] + i
+```
+
+## Interleaved storage
+
+To store the matrices with block width 4 and padded matrix size 8, two arrays are also required:
+
+```
+vals =
+[ a0 b0 c0 d0 | a1 b1 c1 d1 | a2 b2 c2 d2 | a3 b3 c3 d3 | a4 b4 c4 d4 | a5 b5 c5 d5 | a6 b6 *  * | a7 *  *  * |
+  e0 f0 g0 *  | e1 f1 g1 *  | e2 f2 g2 *  | e3 f3 *  *  | e4 f4 *  *  | *  *  *  *  | *  *  *  * | *  *  *  * ]
+sizes = [8, 7, 6, 6, 5, 5, 3]
+```
+
+where `*` indicates padding, i.e. a location in `vals` that does not hold a value belonging to one of the packed vectors.
+
+To look up the value of the `i`th entry in vector `m`, we use the following formula to calculate the index into `vals`:
+
+```
+lookup_int(i,m) = floor(m/BW)*BW*N + m-floor(m/BW)*BW + i*BW
+```
+
+The `block` and `lane` (i.e. the block-local index) of a matrix `m` can be computed as
+
+```
+block = floor(m/BW)
+lane  = m - block*BW
+```
+
+so that the index calculation can be expressed more succinctly and clearly:
+
+```
+lookup_int(i,m) = block*BW*N + lane + i*BW
+```
+
+## On parent indexes
+
+Parent index vectors are packed in the same format as the other vectors; however, the index values must also be transformed, because parent indexes are relative to the start of their matrix:
+
+```
+p_flt(i,m) = indx[m] + p_lcl(i,m)
+p_int(i,m) = lookup_int(0,m) + BW*p_lcl(i,m)
+```
+
+For example, the following two cells
+
+```
+cell 1, 6 nodes:
+
+0--1--2--3
+    \
+     4--5
+
+cell 2, 8 nodes:
+
+0--1--2--3
+    \
+     4--5--6
+         \
+          7
+```
+
+have the following packed structures
+
+```
+p_lcl = [0 0 1 2 1 4 | 0 0 1 2 1 4 5 4]
+p_flt = [0 0 1 2 1 4 | 6 6 7 8 7 10 11 10]
+p_int = [0 1 * * | 0 1 * * | 4 5 * * | 8 9 * * | 4 5 * * | 16 17 * * | * 21 * * | * 17 * * ]
+```
+
+where the interleaved storage uses block width 4 and padded matrix size 8, as in the earlier example.
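+
+## Lookup formulas in code
+
+For concreteness, the two lookup rules can be written out directly. The
+following sketch is illustrative only and not part of the library: the helper
+names `lookup_flat` and `lookup_interleaved` are hypothetical, and 0-based
+`std::size_t` indexes are assumed.
+
+```
+#include <cstddef>
+#include <vector>
+
+// Flat storage: indx[m] is where packed vector m begins, so entry i of
+// vector m is at offset indx[m] + i.
+std::size_t lookup_flat(std::size_t i, std::size_t m,
+                        const std::vector<std::size_t>& indx) {
+    return indx[m] + i;
+}
+
+// Interleaved storage with block width BW and padded matrix size N:
+// matrix m lives in block m/BW at block-local lane m - block*BW, and
+// consecutive entries of a vector are strided by BW.
+std::size_t lookup_interleaved(std::size_t i, std::size_t m,
+                               std::size_t BW, std::size_t N) {
+    const std::size_t block = m/BW;
+    const std::size_t lane  = m - block*BW;
+    return block*BW*N + lane + i*BW;
+}
+
+// Packed parent indexes follow by substituting the local parent index:
+//   p_flt(i,m) = indx[m] + p_lcl(i,m)
+//   p_int(i,m) = lookup_interleaved(0, m, BW, N) + BW*p_lcl(i,m)
+```
+
+Checking against the example above: `lookup_flat(2, 2, indx)` gives `15+2 = 17`,
+the position of `c2` in the flat `vals`, and `lookup_interleaved(1, 4, 4, 8)`
+gives `1*4*8 + 0 + 1*4 = 36`, the position of `e1` in the interleaved `vals`.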
+ diff --git a/src/backends/fvm_multicore.cpp b/src/backends/multicore/fvm.cpp similarity index 96% rename from src/backends/fvm_multicore.cpp rename to src/backends/multicore/fvm.cpp index 8d20f2ffe7161832bbdd66fd77dd83255766f8b4..06f45a40d19a76b4fdcf910676896cf960303715 100644 --- a/src/backends/fvm_multicore.cpp +++ b/src/backends/multicore/fvm.cpp @@ -1,4 +1,4 @@ -#include "fvm_multicore.hpp" +#include "fvm.hpp" #include <mechanisms/multicore/hh.hpp> #include <mechanisms/multicore/pas.hpp> diff --git a/src/backends/multicore/fvm.hpp b/src/backends/multicore/fvm.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d04928cf00a64978a1ae076957a8c7e4695b9099 --- /dev/null +++ b/src/backends/multicore/fvm.hpp @@ -0,0 +1,95 @@ +#pragma once + +#include <map> +#include <string> + +#include <common_types.hpp> +#include <mechanism.hpp> +#include <memory/memory.hpp> +#include <memory/wrappers.hpp> +#include <util/span.hpp> + +#include "matrix_state.hpp" +#include "stimulus.hpp" +#include "threshold_watcher.hpp" + +namespace nest { +namespace mc { +namespace multicore { + +struct backend { + /// define the real and index types + using value_type = double; + using size_type = nest::mc::cell_lid_type; + + /// define storage types + using array = memory::host_vector<value_type>; + using iarray = memory::host_vector<size_type>; + + using view = typename array::view_type; + using const_view = typename array::const_view_type; + + using iview = typename iarray::view_type; + using const_iview = typename iarray::const_view_type; + + using host_array = array; + using host_iarray = iarray; + + using host_view = view; + using host_iview = iview; + + /// matrix state + using matrix_state = + nest::mc::multicore::matrix_state<value_type, size_type>; + + // + // mechanism infrastructure + // + using ion = mechanisms::ion<backend>; + + using mechanism = mechanisms::mechanism_ptr<backend>; + + using stimulus = mechanisms::multicore::stimulus<backend>; + + static mechanism make_mechanism( + const std::string& name, + view vec_v, view vec_i, + const std::vector<value_type>& weights, + const std::vector<size_type>& node_indices) + { + if (!has_mechanism(name)) { + throw std::out_of_range("no mechanism in database : " + name); + } + + return mech_map_.find(name)->second(vec_v, vec_i, array(weights), iarray(node_indices)); + } + + static bool has_mechanism(const std::string& name) { + return mech_map_.count(name)>0; + } + + static std::string name() { + return "cpu"; + } + + /// threshold crossing logic + /// used as part of spike detection back end + using threshold_watcher = + nest::mc::multicore::threshold_watcher<value_type, size_type>; + + +private: + + using maker_type = mechanism (*)(view, view, array&&, iarray&&); + static std::map<std::string, maker_type> mech_map_; + + template <template <typename> class Mech> + static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) { + return mechanisms::make_mechanism<Mech<backend>> + (vec_v, vec_i, std::move(weights), std::move(node_indices)); + } +}; + +} // namespace multicore +} // namespace mc +} // namespace nest diff --git a/src/backends/multicore/matrix_state.hpp b/src/backends/multicore/matrix_state.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0414a9d02ce7f6d30d9535b3c5ca7ab0553c9c12 --- /dev/null +++ b/src/backends/multicore/matrix_state.hpp @@ -0,0 +1,115 @@ +#pragma once + +#include <memory/memory.hpp> +#include <util/span.hpp> + +namespace nest { +namespace mc { +namespace multicore 
{ + +template <typename T, typename I> +struct matrix_state { +public: + using value_type = T; + using size_type = I; + + using array = memory::host_vector<value_type>; + using const_view = typename array::const_view_type; + using iarray = memory::host_vector<size_type>; + iarray parent_index; + iarray cell_index; + + array d; // [μS] + array u; // [μS] + array rhs; // [nA] + + array cv_capacitance; // [pF] + array face_conductance; // [μS] + + // the invariant part of the matrix diagonal + array invariant_d; // [μS] + + const_view solution; + + matrix_state() = default; + + matrix_state(const std::vector<size_type>& p, + const std::vector<size_type>& cell_idx, + const std::vector<value_type>& cap, + const std::vector<value_type>& cond): + parent_index(memory::make_const_view(p)), + cell_index(memory::make_const_view(cell_idx)), + d(size(), 0), u(size(), 0), rhs(size()), + cv_capacitance(memory::make_const_view(cap)), + face_conductance(memory::make_const_view(cond)) + { + EXPECTS(cap.size() == size()); + EXPECTS(cond.size() == size()); + EXPECTS(cell_idx.back() == size()); + + auto n = size(); + invariant_d = array(n, 0); + for (auto i: util::make_span(1u, n)) { + auto gij = face_conductance[i]; + + u[i] = -gij; + invariant_d[i] += gij; + invariant_d[p[i]] += gij; + } + + // In this back end the solution is a simple view of the rhs, which + // contains the solution after the matrix_solve is performed. + solution = rhs; + } + + // Assemble the matrix + // Afterwards the diagonal and RHS will have been set given dt, voltage and current + // dt [ms] + // voltage [mV] + // current [nA] + void assemble(value_type dt, const_view voltage, const_view current) { + auto n = size(); + value_type factor = 1e-3/dt; + for (auto i: util::make_span(0u, n)) { + auto gi = factor*cv_capacitance[i]; + + d[i] = gi + invariant_d[i]; + + rhs[i] = gi*voltage[i] - current[i]; + } + } + + void solve() { + const size_type ncells = cell_index.size()-1; + + // loop over submatrices + for (auto m: util::make_span(0, ncells)) { + auto first = cell_index[m]; + auto last = cell_index[m+1]; + + // backward sweep + for(auto i=last-1; i>first; --i) { + auto factor = u[i] / d[i]; + d[parent_index[i]] -= factor * u[i]; + rhs[parent_index[i]] -= factor * rhs[i]; + } + rhs[first] /= d[first]; + + // forward sweep + for(auto i=first+1; i<last; ++i) { + rhs[i] -= u[i] * rhs[parent_index[i]]; + rhs[i] /= d[i]; + } + } + } + +private: + + std::size_t size() const { + return parent_index.size(); + } +}; + +} // namespace multicore +} // namespace mc +} // namespace nest diff --git a/src/backends/stimulus_multicore.hpp b/src/backends/multicore/stimulus.hpp similarity index 100% rename from src/backends/stimulus_multicore.hpp rename to src/backends/multicore/stimulus.hpp diff --git a/src/backends/multicore/threshold_watcher.hpp b/src/backends/multicore/threshold_watcher.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c7cfb3ab5db0abca8662e6685edbaabb25183088 --- /dev/null +++ b/src/backends/multicore/threshold_watcher.hpp @@ -0,0 +1,128 @@ +#pragma once + +#include <memory/memory.hpp> + +namespace nest { +namespace mc { +namespace multicore { + +template <typename T, typename I> +class threshold_watcher { +public: + using value_type = T; + using size_type = I; + + using array = memory::host_vector<value_type>; + using const_view = typename array::const_view_type; + using iarray = memory::host_vector<size_type>; + + /// stores a single crossing event + struct threshold_crossing { + size_type index; // index of 
variable
+        value_type time;    // time of crossing
+        friend bool operator== (
+            const threshold_crossing& lhs, const threshold_crossing& rhs)
+        {
+            return lhs.index==rhs.index && lhs.time==rhs.time;
+        }
+    };
+
+    threshold_watcher() = default;
+
+    threshold_watcher(
+            const_view vals,
+            const std::vector<size_type>& indxs,
+            const std::vector<value_type>& thresh,
+            value_type t=0):
+        values_(vals),
+        index_(memory::make_const_view(indxs)),
+        thresholds_(memory::make_const_view(thresh)),
+        v_prev_(vals)
+    {
+        is_crossed_ = iarray(size());
+        reset(t);
+    }
+
+    /// Remove all stored crossings that were detected in previous calls
+    /// to the test() member function.
+    void clear_crossings() {
+        crossings_.clear();
+    }
+
+    /// Reset state machine for each detector.
+    /// Assume that the values in values_ have been set correctly before
+    /// calling, because the values are used to determine the initial state
+    void reset(value_type t=0) {
+        clear_crossings();
+        for (auto i=0u; i<size(); ++i) {
+            is_crossed_[i] = values_[index_[i]]>=thresholds_[i];
+        }
+        t_prev_ = t;
+    }
+
+    const std::vector<threshold_crossing>& crossings() const {
+        return crossings_;
+    }
+
+    /// The time at which the last test was performed
+    value_type last_test_time() const {
+        return t_prev_;
+    }
+
+    /// Tests each target for changed threshold state.
+    /// Crossing events are recorded for each threshold that has been
+    /// crossed since the last call to test().
+    void test(value_type t) {
+        for (auto i=0u; i<size(); ++i) {
+            auto v_prev = v_prev_[i];
+            auto v = values_[index_[i]];
+            auto thresh = thresholds_[i];
+            if (!is_crossed_[i]) {
+                if (v>=thresh) {
+                    // the threshold has been passed, so estimate the time using
+                    // linear interpolation
+                    auto pos = (thresh - v_prev)/(v - v_prev);
+                    auto crossing_time = t_prev_ + pos*(t - t_prev_);
+                    crossings_.push_back({i, crossing_time});
+
+                    is_crossed_[i] = true;
+                }
+            }
+            else {
+                if (v<thresh) {
+                    is_crossed_[i] = false;
+                }
+            }
+
+            v_prev_[i] = v;
+        }
+        t_prev_ = t;
+    }
+
+    bool is_crossed(size_type i) const {
+        return is_crossed_[i];
+    }
+
+    /// the number of threshold values that are being monitored
+    std::size_t size() const {
+        return index_.size();
+    }
+
+    /// Data type used to store the crossings.
+    /// Provided to make type-generic calling code.
+    using crossing_list = std::vector<threshold_crossing>;
+
+private:
+    const_view values_;
+    iarray index_;
+
+    array thresholds_;
+    value_type t_prev_;
+    array v_prev_;
+    crossing_list crossings_;
+    iarray is_crossed_;
+};
+
+} // namespace multicore
+} // namespace mc
+} // namespace nest
diff --git a/src/matrix.hpp b/src/matrix.hpp
index f37e54a834316552f3fdc12f0a939b50e2d901c6..8f010763cce427011f191b36c821fe7c0167e9df 100644
--- a/src/matrix.hpp
+++ b/src/matrix.hpp
@@ -11,8 +11,9 @@ namespace nest {
 namespace mc {
 
 /// Hines matrix
-/// the TargetPolicy defines the backend specific data types and solver
-template<class Backend>
+/// Make the back end state implementation optional to allow for
+/// testing different implementations in the same code.
+template<class Backend, class State=typename Backend::matrix_state> class matrix { public: using backend = Backend; @@ -25,37 +26,23 @@ public: using array = typename backend::array; using iarray = typename backend::iarray; - using view = typename backend::view; - using iview = typename backend::iview; using const_view = typename backend::const_view; using const_iview = typename backend::const_iview; using host_array = typename backend::host_array; // back end specific storage for matrix state - using state = typename backend::matrix_state; + using state = State; matrix() = default; - /// construct matrix for one or more cells, described by a parent index and - /// a cell index. - matrix(const std::vector<size_type>& pi, const std::vector<size_type>& ci): - parent_index_(memory::make_const_view(pi)), - cell_index_(memory::make_const_view(ci)), - state_(parent_index_, cell_index_) - { - EXPECTS(cell_index_[num_cells()] == parent_index_.size()); - } - matrix( const std::vector<size_type>& pi, const std::vector<size_type>& ci, const std::vector<value_type>& cv_capacitance, const std::vector<value_type>& face_conductance): parent_index_(memory::make_const_view(pi)), cell_index_(memory::make_const_view(ci)), - state_( parent_index_, cell_index_, - memory::make_const_view(cv_capacitance), - memory::make_const_view(face_conductance)) + state_(pi, ci, cv_capacitance, face_conductance) { EXPECTS(cell_index_[num_cells()] == parent_index_.size()); } @@ -88,7 +75,7 @@ public: /// Get a view of the solution const_view solution() const { - return state_.rhs; + return state_.solution; } private: diff --git a/tests/unit/common.hpp b/tests/unit/common.hpp index dc7b5a357fdbc8e0a1aa25ec993a36e3d9c6427f..af9833f5840b64fb8e21e72d2831640fc580c621 100644 --- a/tests/unit/common.hpp +++ b/tests/unit/common.hpp @@ -126,6 +126,8 @@ template <typename FPType, typename Seq1, typename Seq2> for (std::size_t j = 0; i1!=e1 && i2!=e2; ++i1, ++i2, ++j) { using FP = testing::internal::FloatingPoint<FPType>; + // cast to FPType to avoid warnings about lowering conversion + // if FPType has lower precision than Seq{12}::value_type auto v1 = *i1; auto v2 = *i2; diff --git a/tests/unit/test_atomics.cu b/tests/unit/test_atomics.cu index 694300e6239ed6c95ff0a90e72740e4e7014c60f..fb9877c9a7fd6de6258c31bf7c40fcc062357ab3 100644 --- a/tests/unit/test_atomics.cu +++ b/tests/unit/test_atomics.cu @@ -1,6 +1,6 @@ #include "../gtest.h" -#include <backends/gpu_intrinsics.hpp> +#include <backends/gpu/intrinsics.hpp> #include <memory/managed_ptr.hpp> namespace kernels { diff --git a/tests/unit/test_gpu_stack.cu b/tests/unit/test_gpu_stack.cu index 38087a9570ad975f8a3028966a56a0b47842ac61..2756078dca09d4bfebf14adb992bf05cf61a4d28 100644 --- a/tests/unit/test_gpu_stack.cu +++ b/tests/unit/test_gpu_stack.cu @@ -1,14 +1,14 @@ #include "../gtest.h" -#include <backends/gpu_stack.hpp> +#include <backends/gpu/stack.hpp> #include <memory/managed_ptr.hpp> using namespace nest::mc; -TEST(gpu_stack, construction) { +TEST(stack, construction) { using T = int; - gpu::gpu_stack<T> s(10); + gpu::stack<T> s(10); EXPECT_EQ(0u, s.size()); EXPECT_EQ(10u, s.capacity()); @@ -18,7 +18,7 @@ TEST(gpu_stack, construction) { namespace kernels { template <typename F> __global__ - void push_back(gpu::gpu_stack<int>& s, F f) { + void push_back(gpu::stack<int>& s, F f) { if (f(threadIdx.x)) { s.push_back(threadIdx.x); } @@ -46,9 +46,9 @@ namespace kernels { }; } -TEST(gpu_stack, push_back) { +TEST(stack, push_back) { using T = int; - using stack = gpu::gpu_stack<T>; + 
using stack = gpu::stack<T>;
 
     const unsigned n = 10;
     EXPECT_TRUE(n%2 == 0); // require n is even for tests to work
diff --git a/tests/unit/test_matrix.cpp b/tests/unit/test_matrix.cpp
index 7bd9911140324db1b7d909434dc66e47cafe0d90..f74a6760047119678ef8192d1bca26c1ab15350c 100644
--- a/tests/unit/test_matrix.cpp
+++ b/tests/unit/test_matrix.cpp
@@ -5,31 +5,29 @@
 #include <math.hpp>
 #include <matrix.hpp>
-#include <backends/fvm_multicore.hpp>
+#include <backends/multicore/fvm.hpp>
 #include <util/span.hpp>
 
 using namespace nest::mc;
 
 using matrix_type = matrix<nest::mc::multicore::backend>;
-using size_type = matrix_type::size_type;
+using size_type  = matrix_type::size_type;
+using value_type = matrix_type::value_type;
+
+using vvec = std::vector<value_type>;
 
 TEST(matrix, construct_from_parent_only)
 {
-    using util::make_span;
-
-    // pass parent index as a std::vector cast to host data
-    {
-        std::vector<size_type> p = {0,0,1};
-        matrix_type m(p, {0, 3});
-        EXPECT_EQ(m.num_cells(), 1u);
-        EXPECT_EQ(m.size(), 3u);
-        EXPECT_EQ(p.size(), 3u);
-
-        auto mp = m.p();
-        EXPECT_EQ(mp[0], 0u);
-        EXPECT_EQ(mp[1], 0u);
-        EXPECT_EQ(mp[2], 1u);
-    }
+    std::vector<size_type> p = {0,0,1};
+    matrix_type m(p, {0, 3}, vvec(3), vvec(3));
+    EXPECT_EQ(m.num_cells(), 1u);
+    EXPECT_EQ(m.size(), 3u);
+    EXPECT_EQ(p.size(), 3u);
+
+    auto mp = m.p();
+    EXPECT_EQ(mp[0], 0u);
+    EXPECT_EQ(mp[1], 0u);
+    EXPECT_EQ(mp[2], 1u);
 }
 
 TEST(matrix, solve_host)
@@ -39,7 +37,7 @@ TEST(matrix, solve_host)
 
     // trivial case : 1x1 matrix
     {
-        matrix_type m({0}, {0,1});
+        matrix_type m({0}, {0,1}, vvec(1), vvec(1));
         auto& state = m.state_;
         fill(state.d, 2);
         fill(state.u, -1);
@@ -55,7 +53,7 @@ TEST(matrix, solve_host)
     for(auto n : make_span(2u,1001u)) {
         auto p = std::vector<size_type>(n);
         std::iota(p.begin()+1, p.end(), 0);
-        matrix_type m(p, {0, n});
+        matrix_type m(p, {0, n}, vvec(n), vvec(n));
 
         EXPECT_EQ(m.size(), n);
         EXPECT_EQ(m.num_cells(), 1u);
diff --git a/tests/unit/test_matrix.cu b/tests/unit/test_matrix.cu
index 3f13a86f636748585191d85e9bef2d6e04e2e27e..dbf39f034fc42fd6243c5fa098648d6eb41ad54f 100644
--- a/tests/unit/test_matrix.cu
+++ b/tests/unit/test_matrix.cu
@@ -2,64 +2,387 @@
 #include <vector>
 
 #include "../gtest.h"
+#include "common.hpp"
 
 #include <math.hpp>
 #include <matrix.hpp>
-#include <backends/fvm_gpu.hpp>
+#include <backends/gpu/fvm.hpp>
+#include <backends/multicore/fvm.hpp>
 #include <memory/memory.hpp>
 #include <util/span.hpp>
 
-using matrix_type = nest::mc::matrix<nest::mc::gpu::backend>;
-using index_type = matrix_type::size_type;
+using namespace nest::mc;
 
-TEST(matrix, solve_gpu)
+using gpu::impl::npos;
+using util::make_span;
+using util::assign_from;
+using memory::on_gpu;
+using memory::on_host;
+
+using testing::seq_almost_eq;
+
+using std::begin;
+using std::end;
+
+// Test the flat_to_interleaved and interleaved_to_flat operations for the
+// set of matrices defined by sizes and starts.
+// The interleave operation is applied to the values vector, and the result
+// is checked against a reference generated with a host side implementation.
+// The interleaved result is then converted back to flat storage and
+// compared to the original input.
+//
+// This is implemented in a separate function to facilitate testing over a
+// broad range of BlockWidth and LoadWidth compile time parameters.
+template <typename T, typename I, int BlockWidth, int LoadWidth>
+::testing::AssertionResult test_interleave(
+        std::vector<I> sizes,
+        std::vector<I> starts,
+        std::vector<T> values,
+        int padded_size)
+{
+    auto num_mtx = sizes.size();
+
+    auto in = on_gpu(memory::make_const_view(values));
+    auto sizes_d = on_gpu(memory::make_const_view(sizes));
+    auto starts_d = on_gpu(memory::make_const_view(starts));
+
+    int packed_size = padded_size * BlockWidth * gpu::impl::block_count(num_mtx, BlockWidth);
+
+    // forward will hold the result of the interleave operation on the GPU
+    auto forward = memory::device_vector<T>(packed_size, npos<T>());
+
+    // find the expected interleaved values using the host side reference implementation
+    std::vector<T> expected = gpu::flat_to_interleaved(values, sizes, starts, BlockWidth, num_mtx, padded_size);
+
+    // find the interleaved values on gpu
+    gpu::flat_to_interleaved<T, I, BlockWidth, LoadWidth>(in.data(), forward.data(), sizes_d.data(), starts_d.data(), padded_size, num_mtx);
+
+    std::vector<T> result_f = assign_from(on_host(forward));
+    const auto forward_success = (result_f==expected);
+    if (!forward_success) {
+        return ::testing::AssertionFailure() << "flat to interleaved failed: BlockWidth "
+            << BlockWidth << ", LoadWidth " << LoadWidth << "\n";
+    }
+
+    // backward will hold the result of reverse interleave on the GPU
+    auto backward = memory::device_vector<T>(values.size(), npos<T>());
+    gpu::interleaved_to_flat<T, I, BlockWidth, LoadWidth>(forward.data(), backward.data(), sizes_d.data(), starts_d.data(), padded_size, num_mtx);
+
+    std::vector<T> result_b = assign_from(on_host(backward));
+
+    // we expect that the result of the reverse permutation is the original input vector
+    const auto backward_success = (result_b==values);
+    if (!backward_success) {
+        return ::testing::AssertionFailure() << "interleaved to flat failed: BlockWidth "
+            << BlockWidth << ", LoadWidth " << LoadWidth << "\n";
+    }
+
+    return ::testing::AssertionSuccess();
+}
+
+// test conversion to and from interleaved back end storage format
+TEST(matrix, interleave)
 {
-    using namespace nest::mc;
+    using I = int;
+    using T = int;
+    using ivec = std::vector<I>;
+    using tvec = std::vector<T>;
+
+    // simple case with 4 matrices of length 2
+    {
+        const int padded_size = 2;
+        const int num_mtx = 4;
+        ivec sizes(num_mtx, padded_size);
+
+        // find the start position of each matrix in the flat storage
+        // we are assuming that the matrices are unpermuted
+        ivec starts(num_mtx, 0);
+        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
+
+        tvec values(padded_size*num_mtx);
+        std::iota(values.begin(), values.end(), 0);
-    using nest::mc::util::make_span;
+        EXPECT_TRUE((test_interleave<T, I, 1, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 2, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 3, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 4, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 5, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 6, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 7, 1>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 8, 1>(sizes, starts, values, padded_size)));
-    // trivial case : 1x1 matrix
+        EXPECT_TRUE((test_interleave<T, I, 1, 2>(sizes, starts, values,
padded_size))); + EXPECT_TRUE((test_interleave<T, I, 2, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 3, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 4, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 5, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 6, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 7, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 8, 2>(sizes, starts, values, padded_size))); + + EXPECT_TRUE((test_interleave<T, I, 1, 3>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 2, 3>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 3, 3>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 4, 3>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 5, 3>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 6, 3>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 7, 3>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 8, 3>(sizes, starts, values, padded_size))); + } + + // another small example with matrices of differing lengths { - matrix_type m({0}, {0,1}); + const int padded_size = 8; + const int num_mtx = 8; + ivec sizes = {6, 5, 4, 4, 3, 2, 2, 1}; + + // find the start position of each matrix in the flat storage + // we are assuming that the matrices are unpermuted + ivec starts(num_mtx, 0); + std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1); - auto& state = m.state_; - memory::fill(state.d, 2); - memory::fill(state.u, -1); - memory::fill(state.rhs,1); + tvec values(algorithms::sum(sizes)); + std::iota(values.begin(), values.end(), 0); - m.solve(); + EXPECT_TRUE((test_interleave<T, I, 1, 1>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 2, 1>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 3, 1>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 4, 1>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 5, 1>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 6, 1>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 7, 1>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 8, 1>(sizes, starts, values, padded_size))); - auto rhs = memory::on_host(m.solution()); + EXPECT_TRUE((test_interleave<T, I, 1, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 2, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 3, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 4, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 5, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 6, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 7, 2>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 8, 2>(sizes, starts, values, padded_size))); - EXPECT_EQ(rhs[0], 0.5); + EXPECT_TRUE((test_interleave<T, I, 1, 3>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 2, 3>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 3, 3>(sizes, starts, values, padded_size))); + EXPECT_TRUE((test_interleave<T, I, 4, 3>(sizes, starts, 
values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 5, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 6, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 7, 3>(sizes, starts, values, padded_size)));
+        EXPECT_TRUE((test_interleave<T, I, 8, 3>(sizes, starts, values, padded_size)));
+    }
+
-    // matrices in the range of 2x2 to 100x100
+    // more interesting case...
     {
-        using namespace nest::mc;
-        for(auto n : make_span(2u,101u)) {
-            auto p = std::vector<index_type>(n);
-            std::iota(p.begin()+1, p.end(), 0);
-            matrix_type m{p, {0, n}};
-
-            EXPECT_EQ(m.size(), n);
-            EXPECT_EQ(m.num_cells(), 1u);
-
-            auto& state = m.state_;
-            memory::fill(state.d, 2);
-            memory::fill(state.u, -1);
-            memory::fill(state.rhs,1);
-
-            m.solve();
-
-            auto x = memory::on_host(m.solution());
-            auto err = math::square(std::fabs(2.*x[0] - x[1] - 1.));
-            for(auto i : make_span(1,n-1)) {
-                err += math::square(std::fabs(2.*x[i] - x[i-1] - x[i+1] - 1.));
-            }
-            err += math::square(std::fabs(2.*x[n-1] - x[n-2] - 1.));
-
-            EXPECT_NEAR(0., std::sqrt(err), 1e-8);
+        const int padded_size = 256;
+        const int num_mtx = 1000;
+        ivec sizes(num_mtx);
+        for (auto i: make_span(  0, 100)) sizes[i] = 250;
+        for (auto i: make_span(100, 103)) sizes[i] = 213;
+        for (auto i: make_span(103, 150)) sizes[i] = 200;
+        for (auto i: make_span(150, 500)) sizes[i] = 178;
+        for (auto i: make_span(500, 1000)) sizes[i] = 6;
+
+        // we are assuming that the matrices are unpermuted
+        ivec starts(num_mtx, 0);
+        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
+
+        tvec values(algorithms::sum(sizes));
+        std::iota(values.begin(), values.end(), 0);
+
+        // test in "full" 1024 thread configuration with 32 threads per matrix
+        EXPECT_TRUE((test_interleave<T, I, 32, 32>(sizes, starts, values, padded_size)));
+    }
+}
+
+// Test that matrix assembly works.
+// The test proceeds by assembling the same matrix on the host and
+// device back ends, then performing the solve, and comparing the solutions.
+//
+// limitations of test
+// * only two distinct matrix sizes and structures are tested
+TEST(matrix, assemble)
+{
+    using gpu_state = gpu::backend::matrix_state;
+    using mc_state  = multicore::backend::matrix_state;
+
+    using T = typename gpu::backend::value_type;
+    using I = typename gpu::backend::size_type;
+
+    using gpu_array  = typename gpu::backend::array;
+    using host_array = typename multicore::backend::array;
+
+    // There are two matrix structures:
+    //
+    // p_1: 3 branches, 6 compartments
+    //
+    //          3
+    //         /.
+    // 0 - 1 - 2
+    //          \.
+    //           4
+    //            \.
+    //             5
+    //
+    // p_2: 5 branches, 8 compartments
+    //
+    //            4
+    //           /.
+    //          3
+    //         / \.
+    // 0 - 1 - 2   5
+    //          \.
+    //           6
+    //            \.
+    //             7
+
+    // The parent indexes that define the two matrix structures
+    std::vector<std::vector<I>>
+        p_base = { {0,0,1,2,2,4}, {0,0,1,2,3,3,2,6} };
+
+    // Make a set of matrices based on repeating this pattern.
+    // We assign the patterns round-robin, so that the input
+    // matrices will have alternating sizes of 6 and 8, which will
+    // test the solver with variable matrix size, and exercise
+    // solvers that reorder matrices according to size.
+    const int num_mtx = 8;
+
+    std::vector<I> p;
+    std::vector<I> cell_index;
+    for (auto m=0; m<num_mtx; ++m) {
+        auto &p_ref = p_base[m%2];
+        auto first = p.size();
+        for (auto i: p_ref) {
+            p.push_back(i + first);
+        }
+        cell_index.push_back(first);
+    }
+    cell_index.push_back(p.size());
+
+    auto group_size = cell_index.back();
+
+    // Build the capacitance and conductance vectors and
+    // populate with nonzero random values.
+
+    auto gen = std::mt19937();
+    auto dist = std::uniform_real_distribution<T>(1, 2);
+
+    std::vector<T> Cm(group_size);
+    std::generate(Cm.begin(), Cm.end(), [&](){return dist(gen);});
+
+    std::vector<T> g(group_size);
+    std::generate(g.begin(), g.end(), [&](){return dist(gen);});
+
+    // Make the reference matrix and the gpu matrix
+    auto m_mc  = mc_state( p, cell_index, Cm, g); // on host
+    auto m_gpu = gpu_state(p, cell_index, Cm, g); // on gpu
+
+    // Assemble with the same voltage and current values, then solve
+    m_mc.assemble( 0.2, host_array(group_size, -64), host_array(group_size, 10));
+    m_mc.solve();
+    m_gpu.assemble(0.2, gpu_array(group_size, -64), gpu_array(group_size, 10));
+    m_gpu.solve();
+
+    // Compare the GPU and CPU results.
+    // Cast result to float, because we are happy to ignore small differences
+    std::vector<float> result_h = util::assign_from(m_mc.solution);
+    std::vector<float> result_g = util::assign_from(on_host(m_gpu.solution));
+    EXPECT_TRUE(seq_almost_eq<float>(result_h, result_g));
+}
+
+// test that the flat and interleaved storage back ends produce identical results
+TEST(matrix, backends)
+{
+    using T = typename gpu::backend::value_type;
+    using I = typename gpu::backend::size_type;
+
+    using state_flat = gpu::matrix_state_flat<T, I>;
+    using state_intl = gpu::matrix_state_interleaved<T, I>;
+
+    using gpu_array = typename gpu::backend::array;
+
+    // There are two matrix structures:
+    //
+    // p_1: 3 branches, 6 compartments
+    //
+    //          3
+    //         /.
+    // 0 - 1 - 2
+    //          \.
+    //           4
+    //            \.
+    //             5
+    //
+    // p_2: 5 branches, 8 compartments
+    //
+    //            4
+    //           /.
+    //          3
+    //         / \.
+    // 0 - 1 - 2   5
+    //          \.
+    //           6
+    //            \.
+    //             7
+
+    // The parent indexes that define the two matrix structures
+    std::vector<std::vector<I>>
+        p_base = { {0,0,1,2,2,4}, {0,0,1,2,3,3,2,6} };
+
+    // Make a set of matrices based on repeating this pattern.
+    // We assign the patterns round-robin, so that the input
+    // matrices will have alternating sizes of 6 and 8, which will
+    // test the solver with variable matrix size, and exercise
+    // solvers that reorder matrices according to size.
+    const int num_mtx = 200;
+
+    std::vector<I> p;
+    std::vector<I> cell_index;
+    for (auto m=0; m<num_mtx; ++m) {
+        auto &p_ref = p_base[m%2];
+        auto first = p.size();
+        for (auto i: p_ref) {
+            p.push_back(i + first);
+        }
+        cell_index.push_back(first);
+    }
+    cell_index.push_back(p.size());
+
+    auto group_size = cell_index.back();
+
+    // Build the capacitance and conductance vectors and
+    // populate with nonzero random values.
+
+    auto gen = std::mt19937();
+    gen.seed(100);
+    auto dist = std::uniform_real_distribution<T>(1, 200);
+
+    std::vector<T> Cm(group_size);
+    std::vector<T> g(group_size);
+    std::vector<T> v(group_size);
+    std::vector<T> i(group_size);
+
+    std::generate(Cm.begin(), Cm.end(), [&](){return dist(gen);});
+    std::generate(g.begin(), g.end(), [&](){return dist(gen);});
+    std::generate(v.begin(), v.end(), [&](){return dist(gen);});
+    std::generate(i.begin(), i.end(), [&](){return dist(gen);});
+
+    // Make the flat and the interleaved matrix states
+    auto flat = state_flat(p, cell_index, Cm, g); // flat
+    auto intl = state_intl(p, cell_index, Cm, g); // interleaved
+
+    // assemble both with the same voltage and current values
+    flat.assemble(0.02, on_gpu(v), on_gpu(i));
+    intl.assemble(0.02, on_gpu(v), on_gpu(i));
+
+    flat.solve();
+    intl.solve();
+
+    // Compare the results.
+    // We expect exact equality for the two gpu matrix implementations because both
+    // perform the same operations in the same order on the same inputs.
+    std::vector<double> x_flat = assign_from(on_host(flat.solution));
+    std::vector<double> x_intl = assign_from(on_host(intl.solution));
+    EXPECT_EQ(x_flat, x_intl);
+}
diff --git a/tests/unit/test_mechanisms.cpp b/tests/unit/test_mechanisms.cpp
index 7a4f57da152afcea0649254cb1456491efece61f..f91d3bf592a518d8c64c5dfbdb6d893ae0386e0e 100644
--- a/tests/unit/test_mechanisms.cpp
+++ b/tests/unit/test_mechanisms.cpp
@@ -17,7 +17,7 @@
 #include "mechanisms/multicore/test_kinlva.hpp"
 #include <initializer_list>
 
-#include <backends/fvm_multicore.hpp>
+#include <backends/multicore/fvm.hpp>
 #include <ion.hpp>
 #include <matrix.hpp>
 #include <memory/wrappers.hpp>
diff --git a/tests/unit/test_spikes.cpp b/tests/unit/test_spikes.cpp
index 2941698cc5b47cc2739e19f3eaca7f911001f0f1..35ef18e7d7b2574b79097f06d2a87aff7c429451 100644
--- a/tests/unit/test_spikes.cpp
+++ b/tests/unit/test_spikes.cpp
@@ -1,7 +1,7 @@
 #include "../gtest.h"
 
 #include <spike.hpp>
-#include <backends/fvm_multicore.hpp>
+#include <backends/multicore/fvm.hpp>
 
 using namespace nest::mc;
 
diff --git a/tests/unit/test_spikes.cu b/tests/unit/test_spikes.cu
index 01daf1562b7ffde7dfd4700f041ac63dacdae2d7..12bb89ee975b90020337b91b3acf4a4e5d482780 100644
--- a/tests/unit/test_spikes.cu
+++ b/tests/unit/test_spikes.cu
@@ -1,7 +1,7 @@
 #include "../gtest.h"
 
 #include <spike.hpp>
-#include <backends/fvm_gpu.hpp>
+#include <backends/gpu/fvm.hpp>
 
 using namespace nest::mc;
 
diff --git a/tests/unit/test_synapses.cpp b/tests/unit/test_synapses.cpp
index cd899a5b0fd4890cce0b8c14860340facada0055..3e9b8ef4aa5b903fbf494ab5e79d4a12581d5927 100644
--- a/tests/unit/test_synapses.cpp
+++ b/tests/unit/test_synapses.cpp
@@ -2,7 +2,7 @@
 #include "../test_util.hpp"
 
 #include <cell.hpp>
-#include <backends/fvm_multicore.hpp>
+#include <backends/multicore/fvm.hpp>
 
 #include <mechanisms/multicore/expsyn.hpp>
 #include <mechanisms/multicore/exp2syn.hpp>
diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt
index a42fc4ddd65cb0ad6fce7fe8376bac4caa25d48e..cea12373b6d9850ab82a2a92af510b5f0122aef1 100644
--- a/tests/validation/CMakeLists.txt
+++ 
b/tests/validation/CMakeLists.txt @@ -18,6 +18,7 @@ set(VALIDATION_CUDA_SOURCES # unit tests validate_soma.cu validate_ball_and_stick.cu + validate_kinetic.cu validate_synapses.cu # support code diff --git a/tests/validation/validate_kinetic.cu b/tests/validation/validate_kinetic.cu new file mode 100644 index 0000000000000000000000000000000000000000..4c32cd9345582fa0d141fb6e82b099ddf8f7b0ce --- /dev/null +++ b/tests/validation/validate_kinetic.cu @@ -0,0 +1,13 @@ +#include "validate_kinetic.hpp" + +#include "../gtest.h" + +using lowered_cell = nest::mc::fvm::fvm_multicell<nest::mc::gpu::backend>; + +TEST(kinetic, kin1_numeric_ref) { + validate_kinetic_kin1<lowered_cell>(); +} + +TEST(kinetic, kinlva_numeric_ref) { + validate_kinetic_kinlva<lowered_cell>(); +}