From 67b70a80f71a0dc4bb8b2f555b6da7b31aba9803 Mon Sep 17 00:00:00 2001 From: Sam Yates <yates@cscs.ch> Date: Tue, 13 Nov 2018 11:20:13 +0100 Subject: [PATCH] Revert "Squashed merge for fine matrix solver (#640)" This reverts commit be2a8a9faa58832ea4482b8e012d5169c805a5f1. --- CMakeLists.txt | 4 - arbor/CMakeLists.txt | 4 - arbor/algorithms.hpp | 31 +- arbor/backends/gpu/forest.cpp | 181 --------- arbor/backends/gpu/forest.hpp | 140 ------- arbor/backends/gpu/fvm.hpp | 12 +- arbor/backends/gpu/matrix_fine.cpp | 71 ---- arbor/backends/gpu/matrix_fine.cu | 314 -------------- arbor/backends/gpu/matrix_fine.hpp | 77 ---- arbor/backends/gpu/matrix_solve.cu | 1 - arbor/backends/gpu/matrix_state_fine.hpp | 494 ----------------------- arbor/tree.cpp | 385 ------------------ arbor/tree.hpp | 153 ++++--- test/unit/test_gpu_stack.cu | 3 - test/unit/test_matrix.cu | 90 ++++- test/unit/test_matrix_cpuvsgpu.cpp | 9 +- test/unit/test_tree.cpp | 83 ---- 17 files changed, 179 insertions(+), 1873 deletions(-) delete mode 100644 arbor/backends/gpu/forest.cpp delete mode 100644 arbor/backends/gpu/forest.hpp delete mode 100644 arbor/backends/gpu/matrix_fine.cpp delete mode 100644 arbor/backends/gpu/matrix_fine.cu delete mode 100644 arbor/backends/gpu/matrix_fine.hpp delete mode 100644 arbor/backends/gpu/matrix_state_fine.hpp delete mode 100644 arbor/tree.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b9802b68..b8c83fae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,6 @@ set(ARB_VALIDATION_DATA_DIR "${PROJECT_SOURCE_DIR}/validation/data" CACHE PATH #---------------------------------------------------------- option(ARB_WITH_GPU "build with GPU support" OFF) -option(ARB_WITH_GPU_FINE_MATRIX "use optimized fine matrix solver" ON) option(ARB_WITH_MPI "build with MPI support" OFF) @@ -198,9 +197,6 @@ if(ARB_WITH_GPU) "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=integer_sign_change>" "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=unsigned_compare_with_zero>") target_compile_definitions(arbor-private-deps INTERFACE ARB_HAVE_GPU) - if(ARB_WITH_GPU_FINE_MATRIX) - target_compile_definitions(arbor-private-deps INTERFACE ARB_HAVE_GPU_FINE_MATRIX) - endif() target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>) target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>) diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt index 87ec5150..dd885eac 100644 --- a/arbor/CMakeLists.txt +++ b/arbor/CMakeLists.txt @@ -45,7 +45,6 @@ set(arbor_sources threading/threading.cpp threading/thread_info.cpp thread_private_spike_store.cpp - tree.cpp util/hostname.cpp util/unwind.cpp version.cpp @@ -60,13 +59,10 @@ if(ARB_WITH_CUDA) backends/gpu/threshold_watcher.cu backends/gpu/matrix_assemble.cu backends/gpu/matrix_interleave.cu - backends/gpu/matrix_fine.cu - backends/gpu/matrix_fine.cpp backends/gpu/matrix_solve.cu backends/gpu/multi_event_stream.cpp backends/gpu/multi_event_stream.cu backends/gpu/shared_state.cu - backends/gpu/forest.cpp backends/gpu/stimulus.cu backends/gpu/threshold_watcher.cu memory/fill.cu diff --git a/arbor/algorithms.hpp b/arbor/algorithms.hpp index a2ec968a..4e295834 100644 --- a/arbor/algorithms.hpp +++ b/arbor/algorithms.hpp @@ -41,8 +41,6 @@ mean(C const& c) return sum(c)/util::size(c); } -// returns the prefix sum of c in the form `[0, c[0], c[0]+c[1], ..., sum(c)]`. -// This means that the returned vector has one more element than c. 
template <typename C> C make_index(C const& c) { @@ -96,9 +94,6 @@ bool is_strictly_monotonic_decreasing(C const& c) ); } -// check if c[0] == 0 and c[i] < 0 holds for i != 0 -// this means that children of a node always have larger indices than their -// parent template < typename C, typename = typename std::enable_if<std::is_integral<typename C::value_type>::value> @@ -135,23 +130,17 @@ bool all_negative(const C& c) { return util::all_of(c, [](auto v) { return v<decltype(v){}; }); } -// returns a vector containing the number of children for each node. template<typename C> std::vector<typename C::value_type> child_count(const C& parent_index) { - using value_type = typename C::value_type; static_assert( - std::is_integral<value_type>::value, + std::is_integral<typename C::value_type>::value, "integral type required" ); - std::vector<value_type> count(parent_index.size(), 0); - for (auto i = 0u; i < parent_index.size(); ++i) { - auto p = parent_index[i]; - // -1 means no parent - if (p != value_type(i) && p != value_type(-1)) { - ++count[p]; - } + std::vector<typename C::value_type> count(parent_index.size(), 0); + for (auto i = 1u; i < parent_index.size(); ++i) { + ++count[parent_index[i]]; } return count; @@ -201,7 +190,7 @@ std::vector<typename C::value_type> branches(const C& parent_index) for (std::size_t i = 1; i < parent_index.size(); ++i) { auto p = parent_index[i]; if (num_child[p] > 1 || parent_index[p] == p) { - // `parent_index[p] == p` ~> parent_index[i] is the soma + // parent_index[p] == p -> parent_index[i] is the soma branch_index.push_back(i); } } @@ -210,9 +199,7 @@ std::vector<typename C::value_type> branches(const C& parent_index) return branch_index; } -// creates a vector that contains the branch index for each compartment. -// e.g. 
{0, 1, 5, 9, 10} -> {0, 1, 1, 1, 1, 2, 2, 2, 2, 3} -// indices 0 1 5 9 + template<typename C> std::vector<typename C::value_type> expand_branches(const C& branch_index) { @@ -294,13 +281,11 @@ std::vector<typename C::value_type> tree_reduce( arb_assert(has_contiguous_compartments(parent_index)); arb_assert(is_strictly_monotonic_increasing(branch_index)); - // expand the branch index to lookup the banch id for each compartment + // expand the branch index auto expanded_branch = expand_branches(branch_index); std::vector<typename C::value_type> new_parent_index; - // push the first element manually as the parent of the root might be -1 - new_parent_index.push_back(expanded_branch[0]); - for (std::size_t i = 1; i < branch_index.size()-1; ++i) { + for (std::size_t i = 0; i < branch_index.size()-1; ++i) { auto p = parent_index[branch_index[i]]; new_parent_index.push_back(expanded_branch[p]); } diff --git a/arbor/backends/gpu/forest.cpp b/arbor/backends/gpu/forest.cpp deleted file mode 100644 index 6650b92c..00000000 --- a/arbor/backends/gpu/forest.cpp +++ /dev/null @@ -1,181 +0,0 @@ -#include "backends/gpu/forest.hpp" -#include "util/span.hpp" - -namespace arb { -namespace gpu { - -forest::forest(const std::vector<size_type>& p, const std::vector<size_type>& cell_cv_divs) : - perm_balancing(p.size()) -{ - using util::make_span; - - auto num_cells = cell_cv_divs.size() - 1; - - for (auto c: make_span(0u, num_cells)) { - // build the parent index for cell c - auto cell_start = cell_cv_divs[c]; - std::vector<unsigned> cell_p = - util::assign_from( - util::transform_view( - util::subrange_view(p, cell_cv_divs[c], cell_cv_divs[c+1]), - [cell_start](unsigned i) {return i-cell_start;})); - - auto fine_tree = tree(cell_p); - - // select a root node and merge branches with discontinuous compartment - // indices - auto perm = fine_tree.select_new_root(0); - for (auto i: make_span(perm.size())) { - perm_balancing[cell_start + i] = cell_start + perm[i]; - } - - // find the index of the first node for each branch - auto branch_starts = algorithms::branches(fine_tree.parents()); - - // compute branch length and apply permutation - std::vector<unsigned> branch_lengths(branch_starts.size() - 1); - for (auto i: make_span(branch_lengths.size())) { - branch_lengths[i] = branch_starts[i+1] - branch_starts[i]; - } - - // find the parent index of branches - // we need to convert to cell_lid_type, required to construct a tree. - std::vector<cell_lid_type> branch_p = - util::assign_from( - algorithms::tree_reduce(fine_tree.parents(), branch_starts)); - // build tree structure that describes the branch topology - auto cell_tree = tree(branch_p); - - trees.push_back(cell_tree); - fine_trees.push_back(fine_tree); - tree_branch_starts.push_back(branch_starts); - tree_branch_lengths.push_back(branch_lengths); - } -} - -void forest::optimize() { - using util::make_span; - - // cut the tree - unsigned count = 1; // number of nodes found on the previous level - for (auto level = 0; count > 0; level++) { - count = 0; - - // decide where to cut it ... 
- unsigned max_length = 0; - for (auto t_ix: make_span(trees.size())) { // TODO make this local on an intermediate packing - for (level_iterator it (&trees[t_ix], level); it.valid(); it.next()) { - auto length = tree_branch_lengths[t_ix][it.peek()]; - max_length += length; - count++; - } - } - if (count == 0) { - // there exists no tree with branches on this level - continue; - }; - max_length = max_length / count; - // avoid ininite loops - if (max_length <= 1) max_length = 1; - // we don't want too small segments - if (max_length <= 10) max_length = 10; - - for (auto t_ix: make_span(trees.size())) { - // ... cut all trees on this level - for (level_iterator it (&trees[t_ix], level); it.valid(); it.next()) { - - auto length = tree_branch_lengths[t_ix][it.peek()]; - if (length > max_length) { - // now cut the tree - - // we are allowed to mess with the tree because of the - // implementation of level_iterator o.O - - auto insert_at_bs = tree_branch_starts[t_ix].begin() + it.peek(); - auto insert_at_ls = tree_branch_lengths[t_ix].begin() + it.peek(); - - trees[t_ix].split_node(it.peek()); - - // now the tree got a new node. - // we now have to insert a corresponding new 'branch - // start' to the list - - // make sure that `tree_branch_starts` for A and N point to - // the correct slices - auto old_start = tree_branch_starts[t_ix][it.peek()]; - // first insert, then index peek, as we already - // incremented the iterator - tree_branch_starts[t_ix].insert(insert_at_bs, old_start); - tree_branch_lengths[t_ix].insert(insert_at_ls, max_length); - tree_branch_starts[t_ix][it.peek() + 1] = old_start + max_length; - tree_branch_lengths[t_ix][it.peek() + 1] = length - max_length; - // we don't have to shift any indices as we did not - // create any new branch segments, but just split - // one over two nodes - } - } - } - } -} - - -// debugging functions: - - -// Exports the tree's parent structure into a dot file. -// the children and parent trees are equivalent. Both methods are provided -// for debugging purposes. -template<typename F> -void export_parents(const tree& t, std::string file, F label) { - using util::make_span; - std::ofstream ofile; - ofile.open(file); - ofile << "strict digraph Parents {" << std::endl; - for (auto i: make_span(t.parents().size())) { - ofile << i << "[label=\"" << label(i) << "\"]" << std::endl; - } - for (auto i: make_span(t.parents().size())) { - auto p = t.parent(i); - if (p != tree::no_parent) { - ofile << i << " -> " << t.parent(i) << std::endl; - } - } - ofile << "}" << std::endl; - ofile.close(); -} - -void export_parents(const tree& t, std::string file) { - // the labels in the the graph are the branch indices - export_parents(t, file, [](auto i){return i;}); -} - -// Exports the tree's children structure into a dot file. -// the children and parent trees are equivalent. Both methods are provided -// for debugging purposes. 
-template<typename F> -void export_children(const tree& t, std::string file, F label) { - using util::make_span; - std::ofstream ofile; - ofile.open(file); - ofile << "strict digraph Children {" << std::endl; - for (auto i: make_span(t.num_segments())) { - ofile << i << "[label=\"" << label(i) << "\"]" << std::endl; - } - for (auto i: make_span(t.num_segments())) { - ofile << i << " -> {"; - for (auto c: t.children(i)) { - ofile << " " << c; - } - ofile << "}" << std::endl; - } - ofile << "}" << std::endl; - ofile.close(); -} - -void export_children(const tree& t, std::string file) { - // the labels in the the graph are the branch indices - export_children(t, file, [](auto i){return i;}); -} - -} // namespace gpu -} // namespace arb diff --git a/arbor/backends/gpu/forest.hpp b/arbor/backends/gpu/forest.hpp deleted file mode 100644 index 43900858..00000000 --- a/arbor/backends/gpu/forest.hpp +++ /dev/null @@ -1,140 +0,0 @@ -#pragma once - -#include <vector> - -#include "tree.hpp" - -namespace arb { -namespace gpu { - -using size_type = int; - -struct forest { - forest(const std::vector<size_type>& p, const std::vector<size_type>& cell_cv_divs); - - void optimize(); - - unsigned num_trees() { - return fine_trees.size(); - } - - const tree& branch_tree(unsigned tree_index) { - return trees[tree_index]; - } - - const tree& compartment_tree(unsigned tree_index) { - return fine_trees[tree_index]; - } - - // Returns the offset into the compartments of a tree where each branch - // starts. It holds `0 <= offset < tree.num_segments()` - const std::vector<unsigned>& branch_offsets(unsigned tree_index) { - return tree_branch_starts[tree_index]; - } - - // Returns vector of the length of each branch in a tree. - const std::vector<unsigned>& branch_lengths(unsigned tree_index) { - return tree_branch_lengths[tree_index]; - } - - // Return the permutation that was applied to the compartments in the - // format: `new[i] = old[perm[i]]` - const std::vector<size_type>& permutation() { - return perm_balancing; - } - - // trees of compartments - std::vector<tree> fine_trees; - std::vector<std::vector<unsigned>> tree_branch_starts; - std::vector<std::vector<unsigned>> tree_branch_lengths; - // trees of branches - std::vector<tree> trees; - - // the permutation matrix used by the balancing algorithm - // format: `solver_format[i] = external_format[perm[i]]` - std::vector<size_type> perm_balancing; -}; - - -struct level_iterator { - level_iterator(tree* t, unsigned level) { - tree_ = t; - only_on_level = level; - // due to the ordering of the nodes we know that 0 is the root - current_node = 0; - current_level = 0; - next_children = 0; - if (level != 0) { - next(); - }; - } - - void advance_depth_first() { - auto children = tree_->children(current_node); - if (next_children < children.size() && current_level <= only_on_level) { - // go to next children - current_level += 1; - current_node = children[next_children]; - next_children = 0; - } else { - // go to parent - auto parent_node = tree_->parents()[current_node]; - constexpr unsigned npos = unsigned(-1); - if (parent_node != npos) { - auto siblings = tree_->children(parent_node); - // get the index in the child list of the parent - unsigned index = 0; - while (siblings[index] != current_node) { // TODO repalce by array lockup: sibling_nr - index += 1; - } - - current_level -= 1; - current_node = parent_node; - next_children = index + 1; - } else { - // we are done with the iteration - current_level = -1; - current_node = -1; - next_children = -1; - } - - } - 
} - - unsigned next() { - constexpr unsigned npos = unsigned(-1); - if (!valid()) { - // we are done - return npos; - } else { - advance_depth_first(); - // next_children != 0 means, that we have seen the node before - while (valid() && (current_level != only_on_level || next_children != 0)) { - advance_depth_first(); - } - return current_node; - } - } - - bool valid() { - constexpr unsigned npos = unsigned(-1); - return this->peek() != npos; - } - - unsigned peek() { - return current_node; - } - -private: - tree* tree_; - - unsigned current_node; - unsigned current_level; - unsigned next_children; - - unsigned only_on_level; -}; - - -} // namespace gpu -} // namespace arb diff --git a/arbor/backends/gpu/fvm.hpp b/arbor/backends/gpu/fvm.hpp index cc82bed0..9d02eea3 100644 --- a/arbor/backends/gpu/fvm.hpp +++ b/arbor/backends/gpu/fvm.hpp @@ -14,15 +14,9 @@ #include "backends/gpu/gpu_store_types.hpp" #include "backends/gpu/shared_state.hpp" +#include "matrix_state_interleaved.hpp" #include "threshold_watcher.hpp" - -#ifdef ARB_HAVE_GPU_FINE_MATRIX - #include "matrix_state_fine.hpp" -#else - #include "matrix_state_interleaved.hpp" -#endif - namespace arb { namespace gpu { @@ -45,11 +39,7 @@ struct backend { return memory::on_host(v); } -#ifdef ARB_HAVE_GPU_FINE_MATRIX - using matrix_state = arb::gpu::matrix_state_fine<value_type, index_type>; -#else using matrix_state = arb::gpu::matrix_state_interleaved<value_type, index_type>; -#endif using threshold_watcher = arb::gpu::threshold_watcher; using deliverable_event_stream = arb::gpu::deliverable_event_stream; diff --git a/arbor/backends/gpu/matrix_fine.cpp b/arbor/backends/gpu/matrix_fine.cpp deleted file mode 100644 index b1d7a227..00000000 --- a/arbor/backends/gpu/matrix_fine.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include <ostream> - -#include <cuda_runtime.h> - -#include "memory/cuda_wrappers.hpp" -#include "util/span.hpp" - -#include "matrix_fine.hpp" - -namespace arb { -namespace gpu { - -level::level(unsigned branches): - num_branches(branches) -{ - using memory::cuda_malloc_managed; - - using arb::memory::cuda_malloc_managed; - if (num_branches!=0) { - lengths = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - parents = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - cudaDeviceSynchronize(); - } -} - -level::level(level&& other) { - std::swap(other.lengths, this->lengths); - std::swap(other.parents, this->parents); - std::swap(other.num_branches, this->num_branches); - std::swap(other.max_length, this->max_length); - std::swap(other.data_index, this->data_index); -} - -level::level(const level& other) { - using memory::cuda_malloc_managed; - - num_branches = other.num_branches; - max_length = other.max_length; - data_index = other.data_index; - if (num_branches!=0) { - lengths = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - parents = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - cudaDeviceSynchronize(); - std::copy(other.lengths, other.lengths+num_branches, lengths); - std::copy(other.parents, other.parents+num_branches, parents); - } -} - -level::~level() { - if (num_branches!=0) { - cudaDeviceSynchronize(); // to ensure that managed memory has been freed - if (lengths) arb::memory::cuda_free(lengths); - if (parents) arb::memory::cuda_free(parents); - } -} - -std::ostream& operator<<(std::ostream& o, const level& l) { - cudaDeviceSynchronize(); - o << "branches:" << l.num_branches - << " max_len:" << l.max_length - 
<< " data_idx:" << l.data_index - << " lengths:["; - for (auto i: util::make_span(l.num_branches)) o << l.lengths[i] << " "; - o << "] parents:["; - for (auto i: util::make_span(l.num_branches)) o << l.parents[i] << " "; - o << "]"; - return o; -} - -} // namespace gpu -} // namespace arb diff --git a/arbor/backends/gpu/matrix_fine.cu b/arbor/backends/gpu/matrix_fine.cu deleted file mode 100644 index 764b39d5..00000000 --- a/arbor/backends/gpu/matrix_fine.cu +++ /dev/null @@ -1,314 +0,0 @@ -#include <arbor/fvm_types.hpp> - -#include "cuda_atomic.hpp" -#include "cuda_common.hpp" -#include "matrix_common.hpp" -#include "matrix_fine.hpp" - -namespace arb { -namespace gpu { - -namespace kernels { - -// -// gather and scatter kernels -// - -// to[i] = from[p[i]] -template <typename T, typename I> -__global__ -void gather(const T* from, T* to, const I* p, unsigned n) { - unsigned i = threadIdx.x + blockDim.x*blockIdx.x; - - if (i<n) { - to[i] = from[p[i]]; - } -} - -// to[p[i]] = from[i] -template <typename T, typename I> -__global__ -void scatter(const T* from, T* to, const I* p, unsigned n) { - unsigned i = threadIdx.x + blockDim.x*blockIdx.x; - - if (i<n) { - to[p[i]] = from[i]; - } -} - -/// GPU implementatin of Hines matrix assembly. -/// Fine layout. -/// For a given time step size dt: -/// - use the precomputed alpha and alpha_d values to construct the diagonal -/// and off diagonal of the symmetric Hines matrix. -/// - compute the RHS of the linear system to solve. -template <typename T, typename I> -__global__ -void assemble_matrix_fine( - T* d, - T* rhs, - const T* invariant_d, - const T* voltage, - const T* current, - const T* cv_capacitance, - const T* area, - const I* cv_to_cell, - const T* dt_cell, - const I* perm, - unsigned n) -{ - const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x; - - if (tid<n) { - auto cid = cv_to_cell[tid]; - auto dt = dt_cell[cid]; - - if (dt>0) { - // The 1e-3 is a constant of proportionality required to ensure that the - // conductance (gi) values have units μS (micro-Siemens). - // See the model documentation in docs/model for more information. - T factor = 1e-3/dt; - - const auto gi = factor * cv_capacitance[tid]; - const auto pid = perm[tid]; - d[pid] = gi + invariant_d[tid]; - rhs[pid] = gi*voltage[tid] - T(1e-3)*area[tid]*current[tid]; - } - else { - const auto pid = perm[tid]; - d[pid] = 0; - rhs[pid] = voltage[tid]; - } - } -} - -/// GPU implementation of Hines Matrix solver. -/// Fine-grained tree based solver. -/// Each block solves a set of matricesb iterating over the levels of matrix -/// and perfoming a backward and forward substitution. On each level one thread -/// gets assigned to one branch on this level of a matrix and solves and -/// performs the substitution. Afterwards all threads continue on the next -/// level. -/// To avoid idle threads, one should try that on each level, there is a similar -/// number of branches. 
-template <typename T> -__global__ -void solve_matrix_fine( - T* rhs, - T* d, - const T* u, - const level* levels, - const unsigned* levels_start, - unsigned* num_matrix, // number of packed matrices = number of cells - unsigned* padded_size) -{ - const auto tid = threadIdx.x; - const auto bid = blockIdx.x; - - const auto first_level = levels_start[bid]; - const auto num_levels = levels_start[bid + 1] - first_level; - const auto block_levels = &levels[first_level]; - - // backward substitution - - for (unsigned l=0; l<num_levels-1; ++l) { - const auto& lvl = block_levels[l]; - - const unsigned width = lvl.num_branches; - - // Perform backward substitution for each branch on this level. - // One thread per branch. - if (tid < width) { - const unsigned len = lvl.lengths[tid]; - unsigned pos = lvl.data_index + tid; - - // Zero diagonal term implies dt==0; just leave rhs (for whole matrix) - // alone in that case. - - // Each cell has a different `dt`, because we choose time step size - // according to when the next event is arriving at a cell. So, some - // cells require more time steps than others, but we have to solve - // all the matrices at the same time. When a cell finishes, we put a - // `0` on the diagonal to mark that it should not be solved for. - if (d[pos]!=0) { - - // each branch perform substitution - T factor = u[pos] / d[pos]; - for (unsigned i=0; i<len-1; ++i) { - const unsigned next_pos = pos + width; - d[next_pos] -= factor * u[pos]; - rhs[next_pos] -= factor * rhs[pos]; - - factor = u[next_pos] / d[next_pos]; - pos = next_pos; - } - - // Update d and rhs at the parent node of this branch. - // A parent may have more than one contributing to it, so we use - // atomic updates to avoid races conditions. - const unsigned parent_index = block_levels[l+1].data_index; - const unsigned p = parent_index + lvl.parents[tid]; - //d[p] -= factor * u[pos]; - cuda_atomic_add(d +p, -factor*u[pos]); - //rhs[p] -= factor * rhs[pos]; - cuda_atomic_add(rhs+p, -factor*rhs[pos]); - } - } - __syncthreads(); - } - - { - // the levels are sorted such that the root is the last level - const auto& lvl = block_levels[num_levels-1]; - const unsigned width = num_matrix[bid]; - - if (tid < width) { - const unsigned len = lvl.lengths[tid]; - unsigned pos = lvl.data_index + tid; - - if (d[pos]!=0) { - - // backward - for (unsigned i=0; i<len-1; ++i) { - T factor = u[pos] / d[pos]; - const unsigned next_pos = pos + width; - d[next_pos] -= factor * u[pos]; - rhs[next_pos] -= factor * rhs[pos]; - - pos = next_pos; - } - - auto rhsp = rhs[pos] / d[pos]; - rhs[pos] = rhsp; - pos -= width; - - // forward - for (unsigned i=0; i<len-1; ++i) { - rhsp = rhs[pos] - u[pos]*rhsp; - rhsp /= d[pos]; - rhs[pos] = rhsp; - pos -= width; - } - } - } - } - - // forward substitution - - // take great care with loop limits decrementing unsigned counter l - for (unsigned l=num_levels-1; l>0; --l) { - const auto& lvl = block_levels[l-1]; - - const unsigned width = lvl.num_branches; - const unsigned parent_index = block_levels[l].data_index; - - __syncthreads(); - - // Perform forward-substitution for each branch on this level. - // One thread per branch. - if (tid < width) { - // Load the rhs value for the parent node of this branch. - const unsigned p = parent_index + lvl.parents[tid]; - T rhsp = rhs[p]; - - // Find the index of the first node in this branch. 
- const unsigned len = lvl.lengths[tid]; - unsigned pos = lvl.data_index + (len-1)*width + tid; - - if (d[pos]!=0) { - // each branch perform substitution - for (unsigned i=0; i<len; ++i) { - rhsp = rhs[pos] - u[pos]*rhsp; - rhsp /= d[pos]; - rhs[pos] = rhsp; - pos -= width; - } - } - } - } -} - -} // namespace kernels - -void gather( - const fvm_value_type* from, - fvm_value_type* to, - const fvm_index_type* p, - unsigned n) -{ - constexpr unsigned blockdim = 128; - const unsigned griddim = impl::block_count(n, blockdim); - - kernels::gather<<<griddim, blockdim>>>(from, to, p, n); -} - -void scatter( - const fvm_value_type* from, - fvm_value_type* to, - const fvm_index_type* p, - unsigned n) -{ - constexpr unsigned blockdim = 128; - const unsigned griddim = impl::block_count(n, blockdim); - - kernels::scatter<<<griddim, blockdim>>>(from, to, p, n); -} - - -void assemble_matrix_fine( - fvm_value_type* d, - fvm_value_type* rhs, - const fvm_value_type* invariant_d, - const fvm_value_type* voltage, - const fvm_value_type* current, - const fvm_value_type* cv_capacitance, - const fvm_value_type* area, - const fvm_index_type* cv_to_cell, - const fvm_value_type* dt_cell, - const fvm_index_type* perm, - unsigned n) -{ - const unsigned block_dim = 128; - const unsigned num_blocks = impl::block_count(n, block_dim); - - kernels::assemble_matrix_fine<<<num_blocks, block_dim>>>( - d, rhs, invariant_d, voltage, current, cv_capacitance, area, - cv_to_cell, dt_cell, - perm, n); -} - -// Example: -// -// block 0 block 1 block 2 -// .~~~~~~~~~~~~~~~~~~. .~~~~~~~~~~~~~~~~~~~~~~~~. .~~~~~~~~~~~ ~ ~ -// -// L0 \ / L5 \ / -// \/ \/ -// L1 \ / \ / L3 \ / \ | / \ / L6 \ / . . . -// \ / \ / \ / \|/ \ / \ / -// L2 | | L4 | | | L7 | -// | | | | | | -// -// levels = [L0, L1, L2, L3, L4, L5, L6, L7, ... ] -// levels_start = [0, 3, 5, 8, ...] -// num_levels = [3, 2, 3, ...] -// num_cells = [2, 3, ...] -// num_blocks = level_start.size() - 1 = num_levels.size() = num_cells.size() -void solve_matrix_fine( - fvm_value_type* rhs, - fvm_value_type* d, // diagonal values - const fvm_value_type* u, // upper diagonal (and lower diagonal as the matrix is SPD) - const level* levels, // pointer to an array containing level meta-data for all blocks - const unsigned* levels_start, // start index into levels for each cuda block - unsigned* num_cells, // he number of cells packed into this single matrix - unsigned* padded_size, // length of rhs, d, u, including padding - unsigned num_blocks, // nuber of blocks - unsigned blocksize) // size of each block -{ - kernels::solve_matrix_fine<<<num_blocks, blocksize>>>( - rhs, d, u, levels, levels_start, - num_cells, padded_size); -} - -} // namespace gpu -} // namespace arb diff --git a/arbor/backends/gpu/matrix_fine.hpp b/arbor/backends/gpu/matrix_fine.hpp deleted file mode 100644 index 547b80e2..00000000 --- a/arbor/backends/gpu/matrix_fine.hpp +++ /dev/null @@ -1,77 +0,0 @@ -#include <arbor/fvm_types.hpp> - -#include <ostream> - -namespace arb { -namespace gpu { - -struct level { - level() = default; - - level(unsigned branches); - level(level&& other); - level(const level& other); - - ~level(); - - unsigned num_branches = 0; // Number of branches - unsigned max_length = 0; // Length of the longest branch - unsigned data_index = 0; // Index into data values of the first branch - - // The lengths and parents vectors are raw pointers to managed memory, - // so there is need for tricksy deep copy of this type to GPU. - - // An array holding the length of each branch in the level. 
- // length: num_branches. - unsigned* lengths = nullptr; - - // An array with the index of the parent branch for each branch on this level. - // length: num_branches. - // When performing backward/forward substitution we need to update/read - // data values for the parent node for each branch. - // This can be done easily if we know where the parent branch is located - // on the next level. - unsigned* parents = nullptr; -}; - -std::ostream& operator<<(std::ostream& o, const level& l); - -// C wrappers around kernels -void gather( - const fvm_value_type* from, - fvm_value_type* to, - const fvm_index_type* p, - unsigned n); - -void scatter( - const fvm_value_type* from, - fvm_value_type* to, - const fvm_index_type* p, - unsigned n); - -void assemble_matrix_fine( - fvm_value_type* d, - fvm_value_type* rhs, - const fvm_value_type* invariant_d, - const fvm_value_type* voltage, - const fvm_value_type* current, - const fvm_value_type* cv_capacitance, - const fvm_value_type* area, - const fvm_index_type* cv_to_cell, - const fvm_value_type* dt_cell, - const fvm_index_type* perm, - unsigned n); - -void solve_matrix_fine( - fvm_value_type* rhs, - fvm_value_type* d, // diagonal values - const fvm_value_type* u, // upper diagonal (and lower diagonal as the matrix is SPD) - const level* levels, // pointer to an array containing level meta-data for all blocks - const unsigned* levels_end, // end index (exclusive) into levels for each cuda block - unsigned* num_cells, // he number of cells packed into this single matrix - unsigned* padded_size, // length of rhs, d, u, including padding - unsigned num_blocks, // nuber of blocks - unsigned blocksize); // size of each block - -} // namespace gpu -} // namespace arb diff --git a/arbor/backends/gpu/matrix_solve.cu b/arbor/backends/gpu/matrix_solve.cu index 7fa719c8..1a9ab8c6 100644 --- a/arbor/backends/gpu/matrix_solve.cu +++ b/arbor/backends/gpu/matrix_solve.cu @@ -10,7 +10,6 @@ namespace kernels { /// GPU implementation of Hines Matrix solver. /// Flat format -/// p: parent index for each variable. Needed for backward and forward sweep template <typename T, typename I> __global__ void solve_matrix_flat( diff --git a/arbor/backends/gpu/matrix_state_fine.hpp b/arbor/backends/gpu/matrix_state_fine.hpp deleted file mode 100644 index c457fe24..00000000 --- a/arbor/backends/gpu/matrix_state_fine.hpp +++ /dev/null @@ -1,494 +0,0 @@ -#pragma once - -#include <cstring> - -#include <vector> -#include <type_traits> - -#include <arbor/common_types.hpp> - -#include "algorithms.hpp" -#include "memory/memory.hpp" -#include "util/partition.hpp" -#include "util/rangeutil.hpp" -#include "util/span.hpp" -#include "tree.hpp" - -#include "matrix_fine.hpp" -#include "forest.hpp" - -namespace arb { -namespace gpu { - -// Helper type for branch meta data in setup phase of fine grained -// matrix storage+solver. -// -// leaf -// . -// . -// . -// - * -// | -// l * -// e | -// n * -// g | -// t * -// h | -// - start_idx -// | -// parent_idx -// | -// . -// . -// . 
-// root -struct branch { - unsigned id; // branch id - unsigned parent_id; // parent branch id - unsigned parent_idx; // - unsigned start_idx; // the index of the first node in the input parent index - unsigned length; // the number of nodes in the branch -}; - -// order branches by: -// - descending length -// - ascending id -inline -bool operator<(const branch& lhs, const branch& rhs) { - if (lhs.length!=rhs.length) { - return lhs.length>rhs.length; - } else { - return lhs.id<rhs.id; - } -} - -inline -std::ostream& operator<<(std::ostream& o, branch b) { - return o << "[" << b.id - << ", len " << b.length - << ", pid " << b.parent_idx - << ", sta " << b.start_idx - << "]"; -} - -template <typename T, typename I> -struct matrix_state_fine { -public: - using value_type = T; - using size_type = I; - - using array = memory::device_vector<value_type>; - using view = typename array::view_type; - using const_view = typename array::const_view_type; - using iarray = memory::device_vector<size_type>; - - template <typename ValueType> - using managed_vector = std::vector<ValueType, memory::managed_allocator<ValueType>>; - - iarray cv_to_cell; - - array d; // [μS] - array u; // [μS] - array rhs; // [nA] - - // required for matrix assembly - - array cv_area; // [μm^2] - - array cv_capacitance; // [pF] - - // the invariant part of the matrix diagonal - array invariant_d; // [μS] - - // for storing the solution in unpacked format - array solution_; - - // the maximum nuber of branches in each level per block - unsigned max_branches_per_level; - - // number of rows in matrix - unsigned matrix_size; - - // number of cells - unsigned num_cells; - managed_vector<unsigned> num_cells_in_block; - - // end of the data of each level - // use data_size.back() to get the total data size - // data_size >= size - managed_vector<unsigned> data_size; // TODO rename - - // the meta data for each level for each block layed out linearly in memory - managed_vector<level> levels; - // the start of the levels of each block - // block b owns { leves[level_start[b]], ..., leves[level_start[b+1] - 1] } - // there is an additional entry at the end of the vector to make the above - // compuation save - managed_vector<unsigned> levels_start; - - // permutation from front end storage to packed storage - // `solver_format[perm[i]] = external_format[i]` - iarray perm; - - - matrix_state_fine() = default; - - // constructor for fine-grained matrix. - matrix_state_fine(const std::vector<size_type>& p, - const std::vector<size_type>& cell_cv_divs, - const std::vector<value_type>& cap, - const std::vector<value_type>& face_conductance, - const std::vector<value_type>& area) - { - using util::make_span; - constexpr unsigned npos = unsigned(-1); - - max_branches_per_level = 128; - - // for now we have single cell per cell group - arb_assert(cell_cv_divs.size()==2); - - num_cells = cell_cv_divs.size()-1; - - forest trees(p, cell_cv_divs); - trees.optimize(); - - // Now distribute the cells into cuda blocks. - // While the total number of branches on each level of theses cells in a - // block are less than `max_branches_per_level` we add more cells. If - // one block is full, we start a new cuda block. 
- - unsigned current_block = 0; - std::vector<unsigned> block_num_branches_per_depth; - std::vector<unsigned> block_ix(num_cells); - num_cells_in_block.resize(1, 0); - - // branch_map = branch_maps[block] is a branch map for each cuda block - // branch_map[depth] is list of branches is this level - // each branch branch_map[depth][i] has - // {id, parent_id, start_idx, parent_idx, length} - std::vector<std::vector<std::vector<branch>>> branch_maps; - branch_maps.resize(1); - - unsigned num_branches = 0u; - for (auto c: make_span(0u, num_cells)) { - auto cell_start = cell_cv_divs[c]; - auto cell_tree = trees.branch_tree(c); - auto fine_tree = trees.compartment_tree(c); - auto branch_starts = trees.branch_offsets(c); - auto branch_lengths = trees.branch_lengths(c); - - auto depths = depth_from_root(cell_tree); - - // calculate the number of levels in this cell - auto cell_num_levels = util::max_value(depths)+1u; - - auto num_cell_branches = cell_tree.num_segments(); - - // count number of branches per level - std::vector<unsigned> cell_num_branches_per_depth(cell_num_levels, 0u); - for (auto i: make_span(num_cell_branches)) { - cell_num_branches_per_depth[depths[i]] += 1; - } - // resize the block levels if neccessary - if (cell_num_levels > block_num_branches_per_depth.size()) { - block_num_branches_per_depth.resize(cell_num_levels, 0); - } - - - // check if we can fit the current cell into the last cuda block - bool fits_current_block = true; - for (auto i: make_span(cell_num_levels)) { - unsigned new_branches_per_depth = - block_num_branches_per_depth[i] - + cell_num_branches_per_depth[i]; - if (new_branches_per_depth > max_branches_per_level) { - fits_current_block = false; - } - } - if (fits_current_block) { - // put the cell into current block - block_ix[c] = current_block; - num_cells_in_block[block_ix[c]] += 1; - // and increment counter - for (auto i: make_span(cell_num_levels)) { - block_num_branches_per_depth[i] += cell_num_branches_per_depth[i]; - } - } else { - // otherwise start a new block - block_ix[c] = current_block + 1; - num_cells_in_block.push_back(1); - branch_maps.resize(branch_maps.size()+1); - current_block += 1; - // and reset counter - block_num_branches_per_depth = cell_num_branches_per_depth; - for (auto num: block_num_branches_per_depth) { - if (num > max_branches_per_level) { - throw std::runtime_error( - "Could not fit " + std::to_string(num) - + " branches in a block of size " - + std::to_string(max_branches_per_level)); - } - } - } - - - // the branch map for the block in which we put the cell - // maps levels to a list of branches in that level - auto& branch_map = branch_maps[block_ix[c]]; - - // build branch_map: - // branch_map[i] is a list of branch meta-data for branches with depth i - if (cell_num_levels > branch_map.size()) { - branch_map.resize(cell_num_levels); - } - for (auto i: make_span(num_cell_branches)) { - branch b; - auto depth = depths[i]; - // give the branch a unique id number - b.id = i + num_branches; - // take care to mark branches with no parents with npos - b.parent_id = cell_tree.parent(i)==cell_tree.no_parent ? 
- npos : cell_tree.parent(i) + num_branches; - b.start_idx = branch_starts[i] + cell_start; - b.length = branch_lengths[i]; - b.parent_idx = p[b.start_idx] + cell_start; - branch_map[depth].push_back(b); - } - // total number of branches of all cells - num_branches += num_cell_branches; - } - - for (auto& branch_map: branch_maps) { - // reverse the levels - std::reverse(branch_map.begin(), branch_map.end()); - - // Sort all branches on each level in descending order of length. - // Later, branches will be partitioned over thread blocks, and we will - // take advantage of the fact that the first branch in a partition is - // the longest, to determine how to pack all the branches in a block. - for (auto& branches: branch_map) { - util::sort(branches); - } - } - - // The branches generated above have been assigned contiguous ids. - // Now generate a vector of branch_loc, one for each branch, that - // allow for quick lookup by id of the level and index within a level - // of each branch. - // This information is only used in the generation of the levels below. - - // Helper for recording location of a branch once packed. - struct branch_loc { - unsigned block; // the cuda block containing the cell to which the branch blongs to - unsigned level; // the level containing the branch - unsigned index; // the index of the branch on that level - }; - - // branch_locs will hold the location information for each branch. - std::vector<branch_loc> branch_locs(num_branches); - for (unsigned b: make_span(branch_maps.size())) { - const auto& branch_map = branch_maps[b]; - for (unsigned l: make_span(branch_map.size())) { - const auto& branches = branch_map[l]; - - // Record the location information - for (auto i=0u; i<branches.size(); ++i) { - const auto& branch = branches[i]; - branch_locs[branch.id] = {b, l, i}; - } - } - } - - unsigned total_num_levels = std::accumulate( - branch_maps.begin(), branch_maps.end(), 0, - [](unsigned value, decltype(branch_maps[0])& l) { - return value + l.size();}); - - // construct description for the set of branches on each level for each - // block. This is later used to sort the branches in each block in each - // level into conineous chunks which are easier to read for the cuda - // kernel. - levels.reserve(total_num_levels); - levels_start.reserve(branch_maps.size() + 1); - levels_start.push_back(0); - data_size.reserve(branch_maps.size()); - // offset into the packed data format, used to apply permutation on data - auto pos = 0u; - for (const auto& branch_map: branch_maps) { - for (const auto& lvl_branches: branch_map) { - - level lvl(lvl_branches.size()); - - // The length of the first branch is the upper bound on branch - // length as they are sorted in descending order of length. - lvl.max_length = lvl_branches.front().length; - lvl.data_index = pos; - - unsigned bi = 0u; - for (const auto& b: lvl_branches) { - // Set the length of the branch. - lvl.lengths[bi] = b.length; - - // Set the parent indexes. During the forward and backward - // substitution phases each branch accesses the last node in - // its parent branch. - auto index = b.parent_id==npos? 
npos: branch_locs[b.parent_id].index; - lvl.parents[bi] = index; - ++bi; - } - - pos += lvl.max_length*lvl.num_branches; - - levels.push_back(std::move(lvl)); - } - auto prev_end = levels_start.back(); - levels_start.push_back(prev_end + branch_map.size()); - data_size.push_back(pos); - } - - // set matrix state - matrix_size = p.size(); - - // form the permutation index used to reorder vectors to/from the - // ordering used by the fine grained matrix storage. - std::vector<size_type> perm_tmp(matrix_size); - for (auto block: make_span(branch_maps.size())) { - const auto& branch_map = branch_maps[block]; - const auto first_level = levels_start[block]; - - for (auto i: make_span(levels_start[block + 1] - first_level)) { - const auto& l = levels[first_level + i]; - for (auto j: make_span(l.num_branches)) { - const auto& b = branch_map[i][j]; - auto to = l.data_index + j + l.num_branches*(l.lengths[j]-1); - auto from = b.start_idx; - for (auto k: make_span(b.length)) { - perm_tmp[from + k] = to - k*l.num_branches; - } - } - } - } - - auto perm_balancing = trees.permutation(); - - // apppy permutation form balancing - std::vector<size_type> perm_tmp2(matrix_size); - for (auto i: make_span(matrix_size)) { - // This is CORRECT! verified by using the ring benchmark with root=0 (where the permutation is actually not id) - perm_tmp2[perm_balancing[i]] = perm_tmp[i]; - } - // copy permutation to device memory - perm = memory::make_const_view(perm_tmp2); - - - // Summary of fields and their storage format: - // - // face_conductance : not needed, don't store - // d, u, rhs : packed - // cv_capacitance : flat - // invariant_d : flat - // solution_ : flat - // cv_to_cell : flat - // area : flat - - // the invariant part of d is stored in in flat form - std::vector<value_type> invariant_d_tmp(matrix_size, 0); - managed_vector<value_type> u_tmp(matrix_size, 0); - for (auto i: make_span(1u, matrix_size)) { - auto gij = face_conductance[i]; - - u_tmp[i] = -gij; - invariant_d_tmp[i] += gij; - invariant_d_tmp[p[i]] += gij; - } - - // the matrix components u, d and rhs are stored in packed form - auto nan = std::numeric_limits<double>::quiet_NaN(); - d = array(data_size.back(), nan); - u = array(data_size.back(), nan); - rhs = array(data_size.back(), nan); - - // transform u_tmp values into packed u vector. 
- flat_to_packed(u_tmp, u); - - // the invariant part of d, cv_area and the solution are in flat form - solution_ = array(matrix_size, 0); - cv_area = memory::make_const_view(area); - - // the cv_capacitance can be copied directly because it is - // to be stored in flat format - cv_capacitance = memory::make_const_view(cap); - invariant_d = memory::make_const_view(invariant_d_tmp); - - // calculte the cv -> cell mappings - std::vector<size_type> cv_to_cell_tmp(matrix_size); - size_type ci = 0; - for (auto cv_span: util::partition_view(cell_cv_divs)) { - util::fill(util::subrange_view(cv_to_cell_tmp, cv_span), ci); - ++ci; - } - cv_to_cell = memory::make_const_view(cv_to_cell_tmp); - } - - // Assemble the matrix - // Afterwards the diagonal and RHS will have been set given dt, voltage and current - // dt_cell [ms] (per cell) - // voltage [mV] - // current [nA] - void assemble(const_view dt_cell, const_view voltage, const_view current) { - assemble_matrix_fine( - d.data(), - rhs.data(), - invariant_d.data(), - voltage.data(), - current.data(), - cv_capacitance.data(), - cv_area.data(), - cv_to_cell.data(), - dt_cell.data(), - perm.data(), - size()); - } - - void solve() { - solve_matrix_fine( - rhs.data(), d.data(), u.data(), - levels.data(), levels_start.data(), - num_cells_in_block.data(), - data_size.data(), - num_cells_in_block.size(), max_branches_per_level); - - // unpermute the solution - packed_to_flat(rhs, solution_); - } - - const_view solution() const { - return solution_; - } - - template <typename VFrom, typename VTo> - void flat_to_packed(const VFrom& from, VTo& to ) { - arb_assert(from.size()==matrix_size); - arb_assert(to.size()==data_size.back()); - - scatter(from.data(), to.data(), perm.data(), perm.size()); - } - - template <typename VFrom, typename VTo> - void packed_to_flat(const VFrom& from, VTo& to ) { - arb_assert(from.size()==data_size.back()); - arb_assert(to.size()==matrix_size); - - gather(from.data(), to.data(), perm.data(), perm.size()); - } - -private: - std::size_t size() const { - return matrix_size; - } -}; - -} // namespace gpu -} // namespace arb diff --git a/arbor/tree.cpp b/arbor/tree.cpp deleted file mode 100644 index a4bc28d6..00000000 --- a/arbor/tree.cpp +++ /dev/null @@ -1,385 +0,0 @@ -#include <algorithm> -#include <cassert> -#include <numeric> -#include <queue> -#include <vector> - -#include <arbor/common_types.hpp> - -#include "algorithms.hpp" -#include "memory/memory.hpp" -#include "tree.hpp" -#include "util/span.hpp" - -namespace arb { - -tree::tree(std::vector<tree::int_type> parent_index) { - // validate the input - if(!algorithms::is_minimal_degree(parent_index)) { - throw std::domain_error( - "parent index used to build a tree did not satisfy minimal degree ordering" - ); - } - - // an empty parent_index implies a single-compartment/segment cell - arb_assert(parent_index.size()!=0u); - - init(parent_index.size()); - memory::copy(parent_index, parents_); - parents_[0] = no_parent; - - // compute offsets into children_ array - memory::copy(algorithms::make_index(algorithms::child_count(parents_)), child_index_); - - std::vector<int_type> pos(parents_.size(), 0); - for (auto i = 1u; i < parents_.size(); ++i) { - auto p = parents_[i]; - children_[child_index_[p] + pos[p]] = i; - ++pos[p]; - } -} - -tree::size_type tree::num_children() const { - return static_cast<size_type>(children_.size()); -} - -tree::size_type tree::num_children(size_t b) const { - return child_index_[b+1] - child_index_[b]; -} - -tree::size_type tree::num_segments() const 
{ - // the number of segments/nodes is the size of the child index minus 1 - // ... except for the case of an empty tree - auto sz = static_cast<size_type>(child_index_.size()); - return sz ? sz - 1 : 0; -} - -const tree::iarray& tree::child_index() { - return child_index_; -} - -const tree::iarray& tree::children() const { - return children_; -} - -const tree::iarray& tree::parents() const { - return parents_; -} - -const tree::int_type& tree::parent(size_t b) const { - return parents_[b]; -} -tree::int_type& tree::parent(size_t b) { - return parents_[b]; -} - -tree::int_type tree::split_node(int_type ix) { - using util::make_span; - - auto insert_at_p = parents_.begin() + ix; - auto insert_at_ci = child_index_.begin() + ix; - auto insert_at_c = children_.begin() + child_index_[ix]; - auto new_node_ix = ix; - - // we first adjust the parent sructure - - // first create a new node N below the parent - auto parent = parents_[ix]; - parents_.insert(insert_at_p, parent); - // and attach the remining subtree below it - parents_[ix+1] = new_node_ix; - // shift all parents, as the indices changed when we - // inserted a new node - for (auto i: make_span(ix + 2, parents().size())) { - if (parents_[i] >= new_node_ix) { - parents_[i]++; - } - } - - // now we adjust the children structure - - // insert a child node for the new node N, pointing to - // the old node A - child_index_.insert(insert_at_ci, child_index_[ix]); - // we will set this value later as it will be overridden - children_.insert(insert_at_c, ~0u); - // shift indices for all larger indices, as we inserted - // a new element in the list - for (auto i: make_span(ix + 1, child_index_.size())) { - child_index_[i]++; - } - for (auto i: make_span(0, children_.size())) { - if(children_[i] > new_node_ix) { - children_[i]++; - } - } - // set the children of the new node to the old subtree - children_[child_index_[ix]] = ix + 1; - - return ix+1; -} - -tree::iarray tree::select_new_root(int_type root) { - using util::make_span; - - const auto num_nodes = parents().size(); - - if(root >= num_nodes && root != no_parent) { - throw std::domain_error( - "root is out of bounds: root="+std::to_string(root)+", nodes=" - +std::to_string(num_nodes) - ); - } - - // walk up to the old root and turn `parent->child` into `parent<-child` - auto prev = no_parent; - auto current = root; - while (current != no_parent) { - auto parent = parents_[current]; - parents_[current] = prev; - prev = current; - current = parent; - } - - // sort the list to get min degree ordering and keep index such that we - // can sort also the `branch_starts` array. - - // comput the depth for each node - iarray depth (num_nodes, 0); - // the depth when we don't count nodes that only have one child - iarray reduced_depth (num_nodes, 0); - // the index of the last node that passed this node on its way to the root - // we need this to keep nodes that are part of the same reduced tree close - // together in the final sorting. - // - // Instead of the left order we want the right order. - // .-----------------------. - // | 0 0 | - // | / \ / \ | - // | 1 2 1 3 | - // | | | | | | - // | 3 4 2 4 | - // | / \ / \ | - // | 5 6 5 6 | - // '-----------------------' - // - // we achieve this by first sorting by reduced_depth, branch_ix and depth - // in this order. The resulting ordering satisfies minimal degree ordering. - // - // Using the tree above we would get the following results: - // - // `depth` `reduced_depth` `branch_ix` - // .-----------. .-----------. .-----------. 
- // | 0 | | 0 | | 6 | - // | / \ | | / \ | | / \ | - // | 1 1 | | 1 1 | | 2 6 | - // | | | | | | | | | | | | - // | 2 2 | | 1 1 | | 2 6 | - // | / \ | | / \ | | / \ | - // | 3 3 | | 2 2 | | 5 6 | - // '-----------' '-----------' '-----------' - iarray branch_ix (num_nodes, 0); - // we cannot use the existing `children_` array as we only updated the - // parent structure yet - auto new_num_children = algorithms::child_count(parents_); - for (auto n: make_span(num_nodes)) { - branch_ix[n] = n; - auto prev = n; - auto curr = parents_[n]; - - // find the way to the root - while (curr != no_parent) { - depth[n]++; - if (new_num_children[curr] > 1) { - reduced_depth[n]++; - } - branch_ix[curr] = branch_ix[prev]; - curr = parents_[curr]; - } - } - - // maps new indices to old indices - iarray indices (num_nodes); - // fill array with indices - for (auto i: make_span(num_nodes)) { - indices[i] = i; - } - // perform sort by depth index to get the permutation - std::sort(indices.begin(), indices.end(), [&](auto i, auto j){ - if (reduced_depth[i] != reduced_depth[j]) { - return reduced_depth[i] < reduced_depth[j]; - } - if (branch_ix[i] != branch_ix[j]) { - return branch_ix[i] < branch_ix[j]; - } - return depth[i] < depth[j]; - }); - // maps old indices to new indices - iarray indices_inv (num_nodes); - // fill array with indices - for (auto i: make_span(num_nodes)) { - indices_inv[i] = i; - } - // perform sort - std::sort(indices_inv.begin(), indices_inv.end(), [&](auto i, auto j){ - return indices[i] < indices[j]; - }); - - // translate the parent vetor to new indices - for (auto i: make_span(num_nodes)) { - if (parents_[i] != no_parent) { - parents_[i] = indices_inv[parents_[i]]; - } - } - - iarray new_parents (num_nodes); - for (auto i: make_span(num_nodes)) { - new_parents[i] = parents_[indices[i]]; - } - // parent now starts with the root, then it's children, then their - // children, etc... - - // recompute the children array - memory::copy(new_parents, parents_); - memory::copy(algorithms::make_index(algorithms::child_count(parents_)), child_index_); - - std::vector<int_type> pos(parents_.size(), 0); - for (auto i = 1u; i < parents_.size(); ++i) { - auto p = parents_[i]; - children_[child_index_[p] + pos[p]] = i; - ++pos[p]; - } - - return indices; -} - -tree::iarray tree::minimize_depth() { - const auto num_nodes = parents().size(); - tree::iarray seen(num_nodes, 0); - - // find the furhtest node from the root - std::queue<tree::int_type> queue; - queue.push(0); // start at the root node - seen[0] = 1; - auto front = queue.front(); - // breath first traversal - while (!queue.empty()) { - front = queue.front(); - queue.pop(); - // we only have to check children as we started at the root node - auto cs = children(front); - for (auto c: cs) { - if (seen[c] == 0) { - seen[c] = 1; - queue.push(c); - } - } - } - - auto u = front; - - // find the furhtest node from this node - std::fill(seen.begin(), seen.end(), 0); - queue.push(u); - seen[u] = 1; - front = queue.front(); - // breath first traversal - while (!queue.empty()) { - front = queue.front(); - queue.pop(); - auto cs = children(front); - for (auto c: cs) { - if (seen[c] == 0) { - seen[c] = 1; - queue.push(c); - } - } - // also check the partent node! 
- auto c = parent(front); - if (c != tree::no_parent && seen[c] == 0) { - seen[c] = 1; - queue.push(c); - } - } - - auto v = front; - - // now find the middle between u and v - - // each path to the root - tree::iarray path_to_root_u (1, u); - tree::iarray path_to_root_v (1, v); - - auto curr = parent(u); - while (curr != tree::no_parent) { - path_to_root_u.push_back(curr); - curr = parent(curr); - } - curr = parent(v); - while (curr != tree::no_parent) { - path_to_root_v.push_back(curr); - curr = parent(curr); - } - - // reduce the path - auto last_together = 0; - while (path_to_root_u.back() == path_to_root_v.back()) { - last_together = path_to_root_u.back(); - path_to_root_u.pop_back(); - path_to_root_v.pop_back(); - } - path_to_root_u.push_back(last_together); - - auto path_length = path_to_root_u.size() + path_to_root_v.size() - 1; - - // walk up half of the path length to find the middle node - tree::int_type root; - if (path_to_root_u.size() > path_to_root_v.size()) { - root = path_to_root_u[path_length / 2]; - } else { - root = path_to_root_v[path_length / 2]; - } - - return select_new_root(root); -} - -/// memory used to store tree (in bytes) -std::size_t tree::memory() const { - return sizeof(int_type)*(children_.size()+child_index_.size()+parents_.size()) - + sizeof(tree); -} - -void tree::init(tree::size_type nnode) { - if (nnode) { - auto nchild = nnode - 1; - children_.resize(nchild); - child_index_.resize(nnode+1); - parents_.resize(nnode); - } - else { - children_.resize(0); - child_index_.resize(0); - parents_.resize(0); - } -} - -// recursive helper for the depth_from_root() below -void depth_from_root(const tree& t, tree::iarray& depth, tree::int_type segment) { - auto d = depth[t.parent(segment)] + 1; - depth[segment] = d; - for(auto c : t.children(segment)) { - depth_from_root(t, depth, c); - } -} - -tree::iarray depth_from_root(const tree& t) { - tree::iarray depth(t.num_segments()); - depth[0] = 0; - for (auto c: t.children(0)) { - depth_from_root(t, depth, c); - } - - return depth; -} - -} // namespace arb diff --git a/arbor/tree.hpp b/arbor/tree.hpp index da55f2a9..5adb6f6d 100644 --- a/arbor/tree.hpp +++ b/arbor/tree.hpp @@ -2,7 +2,6 @@ #include <algorithm> #include <cassert> -#include <fstream> #include <numeric> #include <vector> @@ -24,21 +23,81 @@ public: tree() = default; + tree& operator=(tree&& other) { + std::swap(child_index_, other.child_index_); + std::swap(children_, other.children_); + std::swap(parents_, other.parents_); + return *this; + } + + tree& operator=(tree const& other) { + children_ = other.children_; + child_index_ = other.child_index_; + parents_ = other.child_index_; + return *this; + } + + // copy constructors take advantage of the assignment operators + // defined above + tree(tree const& other) { + *this = other; + } + + tree(tree&& other) { + *this = std::move(other); + } + /// Create the tree from a parent index that lists the parent segment /// of each segment in a cell tree. 
- tree(std::vector<int_type> parent_index); + tree(std::vector<int_type> parent_index) { + // validate the input + if(!algorithms::is_minimal_degree(parent_index)) { + throw std::domain_error( + "parent index used to build a tree did not satisfy minimal degree ordering" + ); + } + + // an empty parent_index implies a single-compartment/segment cell + arb_assert(parent_index.size()!=0u); - size_type num_children() const; + init(parent_index.size()); + memory::copy(parent_index, parents_); + parents_[0] = no_parent; - size_type num_children(size_t b) const; + memory::copy(algorithms::make_index(algorithms::child_count(parents_)), child_index_); - size_type num_segments() const; + std::vector<int_type> pos(parents_.size(), 0); + for (auto i = 1u; i < parents_.size(); ++i) { + auto p = parents_[i]; + children_[child_index_[p] + pos[p]] = i; + ++pos[p]; + } + } + + size_type num_children() const { + return static_cast<size_type>(children_.size()); + } + + size_type num_children(size_t b) const { + return child_index_[b+1] - child_index_[b]; + } + + size_type num_segments() const { + // the number of segments/nodes is the size of the child index minus 1 + // ... except for the case of an empty tree + auto sz = static_cast<size_type>(child_index_.size()); + return sz ? sz - 1 : 0; + } /// return the child index - const iarray& child_index(); + const iarray& child_index() { + return child_index_; + } /// return the list of all children - const iarray& children() const; + const iarray& children() const { + return children_; + } /// return the list of all children of branch i auto children(size_type i) const { @@ -48,62 +107,38 @@ public: } /// return the list of parents - const iarray& parents() const; + const iarray& parents() const { + return parents_; + } /// return the parent of branch b - const int_type& parent(size_t b) const; - int_type& parent(size_t b); - - // splits the node in two parts. Returns `ix + 1` which is the new index of - // the old node. - // .-------------------. - // | P P | - // | / / | - // | A ~~> N | - // | / \ | | - // | B C A | - // | / \ | - // | B C | - // '-------------------' - int_type split_node(int_type ix); - - // Changes the root node of a tree - // .------------------------. - // | A | - // | / \ R | - // | R B / \ | - // | / ~~> A C | - // | C / / \ | - // | / \ B D E | - // | D E | - // '------------------------' - // Returns the permutation applied to the nodes, - // i.e. `new_node_data[i] = old_node_data[perm[i]]` - // - // This function has the additional effect, that branches with only one - // child branch get merged. That means that `select_new_root(0)` can also - // lead to an permutation of the indices of the compartments: - // .------------------------------. - // | 0 0 | - // | / \ / \ | - // | 1 3 1 3 | - // | / \ ~~> / \ | - // | 2 4 2 4 | - // | / \ \ / \ \ | - // | 5 6 7 6 7 5 | - // '------------------------------' - iarray select_new_root(int_type root); - - // Selects a new node such that the depth of the graph is minimal. - // Returns the permutation applied to the nodes, - // i.e. 
`new_node_data[i] = old_node_data[perm[i]]` - iarray minimize_depth(); + int_type parent(size_t b) const { + return parents_[b]; + } + int_type& parent(size_t b) { + return parents_[b]; + } /// memory used to store tree (in bytes) - std::size_t memory() const; + std::size_t memory() const { + return sizeof(int_type)*(children_.size()+child_index_.size()+parents_.size()) + + sizeof(tree); + } private: - void init(size_type nnode); + void init(size_type nnode) { + if (nnode) { + auto nchild = nnode - 1; + children_.resize(nchild); + child_index_.resize(nnode+1); + parents_.resize(nnode); + } + else { + children_.resize(0); + child_index_.resize(0); + parents_.resize(0); + } + } // state iarray children_; @@ -111,10 +146,6 @@ private: iarray parents_; }; -// Calculates the depth of each branch from the root of a cell segment tree. -// The root has depth 0, it's children have depth 1, and so on. -tree::iarray depth_from_root(const tree& t); - template <typename C> std::vector<tree::int_type> make_parent_index(tree const& t, C const& counts) { diff --git a/test/unit/test_gpu_stack.cu b/test/unit/test_gpu_stack.cu index 10af85c4..e75f8c3b 100644 --- a/test/unit/test_gpu_stack.cu +++ b/test/unit/test_gpu_stack.cu @@ -66,7 +66,6 @@ TEST(stack, push_back) { kernels::push_back<<<1, n>>>(sstorage, kernels::all_ftor()); cudaDeviceSynchronize(); EXPECT_EQ(n, s.size()); - std::sort(sstorage.data, sstorage.data+s.size()); for (auto i=0; i<int(s.size()); ++i) { EXPECT_EQ(i, s[i]); } @@ -75,7 +74,6 @@ TEST(stack, push_back) { kernels::push_back<<<1, n>>>(sstorage, kernels::even_ftor()); cudaDeviceSynchronize(); EXPECT_EQ(n/2, s.size()); - std::sort(sstorage.data, sstorage.data+s.size()); for (auto i=0; i<int(s.size())/2; ++i) { EXPECT_EQ(2*i, s[i]); } @@ -84,7 +82,6 @@ TEST(stack, push_back) { kernels::push_back<<<1, n>>>(sstorage, kernels::odd_ftor()); cudaDeviceSynchronize(); EXPECT_EQ(n/2, s.size()); - std::sort(sstorage.data, sstorage.data+s.size()); for (auto i=0; i<int(s.size())/2; ++i) { EXPECT_EQ(2*i+1, s[i]); } diff --git a/test/unit/test_matrix.cu b/test/unit/test_matrix.cu index b3424ad2..b01fff92 100644 --- a/test/unit/test_matrix.cu +++ b/test/unit/test_matrix.cu @@ -15,7 +15,6 @@ #include "backends/gpu/matrix_state_flat.hpp" #include "backends/gpu/matrix_state_interleaved.hpp" #include "backends/gpu/matrix_interleave.hpp" -#include "backends/gpu/matrix_state_fine.hpp" #include "../gtest.h" #include "common.hpp" @@ -215,7 +214,6 @@ TEST(matrix, backends) using state_flat = gpu::matrix_state_flat<T, I>; using state_intl = gpu::matrix_state_interleaved<T, I>; - using state_fine = gpu::matrix_state_fine<T, I>; using gpu_array = memory::device_vector<T>; @@ -288,7 +286,6 @@ TEST(matrix, backends) // Make the reference matrix and the gpu matrix auto flat = state_flat(p, cell_cv_divs, Cm, g, area); // flat auto intl = state_intl(p, cell_cv_divs, Cm, g, area); // interleaved - auto fine = state_fine(p, cell_cv_divs, Cm, g, area); // interleaved // Set the integration times for the cells to be between 0.01 and 0.02 ms. std::vector<T> dt(num_mtx, 0); @@ -303,27 +300,90 @@ TEST(matrix, backends) flat.assemble(gpu_dt, gpu_v, gpu_i); intl.assemble(gpu_dt, gpu_v, gpu_i); - fine.assemble(gpu_dt, gpu_v, gpu_i); flat.solve(); intl.solve(); - fine.solve(); // Compare the results. // We expect exact equality for the two gpu matrix implementations because both // perform the same operations in the same order on the same inputs. 
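
Bitwise equality between the two back ends is only possible because both execute identical floating-point operations in identical order; the reverted fine solver accumulates with atomics, so its result was compared against a tolerance instead (the max_diff_fine check removed just below). As a standalone illustration, unrelated to Arbor's code, of why operation order matters:

    #include <cstdio>

    int main() {
        // Floating-point addition is not associative: reordering the same
        // terms can change the rounded result, so bitwise equality is only
        // guaranteed when the operation order is identical.
        const double a = 1e16, b = -1e16, c = 1.0;
        const double left  = (a + b) + c;  // (0) + 1 == 1.0
        const double right = a + (b + c);  // b + c rounds back to -1e16, so 0.0
        std::printf("left=%.17g right=%.17g\n", left, right);
        return 0;
    }
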
std::vector<double> x_flat = assign_from(on_host(flat.solution())); std::vector<double> x_intl = assign_from(on_host(intl.solution())); - // as the fine algorithm contains atomics the solution might be slightly - // different from flat and interleaved - std::vector<double> x_fine = assign_from(on_host(fine.solution())); + EXPECT_EQ(x_flat, x_intl); +} - auto max_diff_fine = - util::max_value( - util::transform_view( - util::count_along(x_flat), - [&](unsigned i) {return std::abs(x_flat[i] - x_fine[i]);})); +/* - EXPECT_EQ(x_flat, x_intl); - EXPECT_LE(max_diff_fine, 1e-12); +// Test for special zero diagonal behaviour. (see `test_matrix.cpp`.) +TEST(matrix, zero_diagonal) +{ + using util::assign; + + using value_type = gpu::backend::value_type; + using size_type = gpu::backend::size_type; + using matrix_type = gpu::backend::matrix_state; + using vvec = std::vector<value_type>; + + // Combined matrix may have zero-blocks, corresponding to a zero dt. + // Zero-blocks are indicated by zero value in the diagonal (the off-diagonal + // elements should be ignored). + // These submatrices should leave the rhs as-is when solved. + + // Three matrices, sizes 3, 3 and 2, with no branching. + std::vector<size_type> p = {0, 0, 1, 3, 3, 5, 5}; + std::vector<size_type> c = {0, 3, 5, 7}; + + // Face conductances. + std::vector<value_type> g = {0, 1, 1, 0, 1, 0, 2}; + + // dt of 1e-3. + std::vector<value_type> dt(3, 1.0e-3); + + // Capacitances. + std::vector<value_type> Cm = {1, 1, 1, 1, 1, 2, 3}; + + // Intial voltage of zero; currents alone determine rhs. + std::vector<value_type> v(7, 0.0); + std::vector<value_type> i = {-3, -5, -7, -6, -9, -16, -32}; + + // Expected matrix and rhs: + // u = [ 0 -1 -1 0 -1 0 -2] + // d = [ 2 3 2 2 2 4 5] + // b = [ 3 5 7 2 4 16 32] + // + // Expected solution: + // x = [ 4 5 6 7 8 9 10] + + matrix_type m(p, c, Cm, g); + auto gpu_dt = on_gpu(dt); + auto gpu_v = on_gpu(v); + auto gpu_i = on_gpu(i); + m.assemble(gpu_dt, gpu_v, gpu_i); + m.solve(); + + vvec x; + assign(x, on_host(m.solution())); + std::vector<value_type> expected = {4, 5, 6, 7, 8, 9, 10}; + + EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x)); + + // Set dt of 2nd (middle) submatrix to zero. Solution + // should then return voltage values for that submatrix. + + dt[1] = 0; + gpu_dt = on_gpu(dt); + + v[3] = 20; + v[4] = 30; + gpu_v = on_gpu(v); + + m.assemble(gpu_dt, gpu_v, gpu_i); + m.solve(); + + assign(x, on_host(m.solution())); + expected = {4, 5, 6, 20, 30, 9, 10}; + + EXPECT_TRUE(testing::seq_almost_eq<double>(expected, x)); } + +*/ diff --git a/test/unit/test_matrix_cpuvsgpu.cpp b/test/unit/test_matrix_cpuvsgpu.cpp index 861c83dd..27390686 100644 --- a/test/unit/test_matrix_cpuvsgpu.cpp +++ b/test/unit/test_matrix_cpuvsgpu.cpp @@ -69,25 +69,22 @@ TEST(matrix, assemble) // 6 // \. // 7 - // p_3: 1 branch, 1 compartment - // - // 0 // The parent indexes that define the two matrix structures std::vector<std::vector<I>> - p_base = { {0,0,1,2,2,4}, {0,0,1,2,3,3,2,6}, {0} }; + p_base = { {0,0,1,2,2,4}, {0,0,1,2,3,3,2,6} }; // Make a set of matrices based on repeating this pattern. // We assign the patterns round-robin, i.e. so that the input // matrices will have alternating sizes of 6 and 8, which will // test the solver with variable matrix size, and exercise // solvers that reorder matrices according to size. 
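
For reference, a much-simplified sketch of the zero-diagonal convention exercised by the commented-out zero_diagonal test above: each cell occupies a contiguous block of compartments delimited by the cell index, each block is a symmetric Hines system, and a block whose diagonal was assembled as zero is skipped so its right-hand side passes through unchanged. This is only an illustration under those assumptions, not the GPU backend's implementation; in particular, testing the block's first diagonal entry for zero, and the names used here, are assumptions of the sketch.

    #include <cstddef>
    #include <vector>

    using vec  = std::vector<double>;
    using ivec = std::vector<std::size_t>;

    // Solves, in place, one symmetric Hines system per cell: d is the diagonal,
    // u the off-diagonal coupling to the parent, p the parent index and
    // cell_index the [first, last) compartment range of each cell (assumed
    // non-empty). On return rhs holds the solution; zero blocks are untouched.
    void hines_solve(vec& rhs, vec& d, const vec& u, const ivec& p, const ivec& cell_index) {
        for (std::size_t cell = 0; cell + 1 < cell_index.size(); ++cell) {
            const auto first = cell_index[cell];
            const auto last  = cell_index[cell+1];

            if (d[first] == 0) continue;   // zero block: rhs passes through

            // eliminate towards the root (parents have smaller indices)
            for (auto i = last-1; i > first; --i) {
                const auto factor = u[i] / d[i];
                d[p[i]]   -= factor * u[i];
                rhs[p[i]] -= factor * rhs[i];
            }
            rhs[first] /= d[first];

            // back-substitute away from the root
            for (auto i = first+1; i < last; ++i) {
                rhs[i] = (rhs[i] - u[i]*rhs[p[i]]) / d[i];
            }
        }
    }

With the middle block's diagonal zeroed, its rhs entries come back untouched, which is the behaviour the expected solution {4, 5, 6, 20, 30, 9, 10} in that test encodes.
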
- const int num_mtx = 100; + const int num_mtx = 8; std::vector<I> p; std::vector<I> cell_index; for (auto m=0; m<num_mtx; ++m) { - auto &p_ref = p_base[m%p_base.size()]; + auto &p_ref = p_base[m%2]; auto first = p.size(); for (auto i: p_ref) { p.push_back(i + first); diff --git a/test/unit/test_tree.cpp b/test/unit/test_tree.cpp index 094e77f1..e23cc8ed 100644 --- a/test/unit/test_tree.cpp +++ b/test/unit/test_tree.cpp @@ -9,7 +9,6 @@ using namespace arb; using int_type = tree::int_type; -using iarray = tree::iarray; TEST(tree, from_segment_index) { auto no_parent = tree::no_parent; @@ -175,85 +174,3 @@ TEST(tree, from_segment_index) { } } -TEST(tree, depth_from_root) { - // tree with single branch corresponding to the root node - // this is equivalent to a single compartment model - // CASE 1 : single root node in parent_index - { - std::vector<int_type> parent_index = {0}; - iarray expected = {0u}; - EXPECT_EQ(expected, depth_from_root(tree(parent_index))); - } - - { - // 0 - // / \. - // 1 2 - std::vector<int_type> parent_index = {0, 0, 0}; - iarray expected = {0u, 1u, 1u}; - EXPECT_EQ(expected, depth_from_root(tree(parent_index))); - } - { - // 0-1-2-3 - std::vector<int_type> parent_index = {0, 0, 1, 2}; - iarray expected = {0u, 1u, 2u, 3u}; - EXPECT_EQ(expected, depth_from_root(tree(parent_index))); - } - { - // - // 0 - // /|\. - // 1 2 3 - // - std::vector<int_type> parent_index = {0, 0, 0, 0}; - iarray expected = {0u, 1u, 1u, 1u}; - EXPECT_EQ(expected, depth_from_root(tree(parent_index))); - } - { - // - // 0 - // /|\. - // 1 2 3 - // / \. - // 4 5 - // - std::vector<int_type> parent_index = {0, 0, 0, 0, 3, 3}; - iarray expected = {0u, 1u, 1u, 1u, 2u, 2u}; - EXPECT_EQ(expected, depth_from_root(tree(parent_index))); - } - { - // - // 0 - // / - // 1 - // / \. - // 2 3 - std::vector<int_type> parent_index = {0,0,1,1}; - iarray expected = {0u, 1u, 2u, 2u}; - EXPECT_EQ(expected, depth_from_root(tree(parent_index))); - } - { - // - // 0 - // /|\. - // 1 4 5 - // / \. - // 2 3 - std::vector<int_type> parent_index = {0,0,1,1,0,0}; - iarray expected = {0u, 1u, 2u, 2u, 1u, 1u}; - EXPECT_EQ(expected, depth_from_root(tree(parent_index))); - } - { - // 0 - // / \. - // 1 2 - // / \. - // 3 4 - // / \. - // 5 6 - std::vector<int_type> parent_index = {0,0,0,1,1,4,4}; - iarray expected = {0u, 1u, 1u, 2u, 2u, 3u, 3u}; - EXPECT_EQ(expected, depth_from_root(tree(parent_index))); - } -} - -- GitLab