From a88f9c0201b0d72e1945e152ec51b48d503427c4 Mon Sep 17 00:00:00 2001
From: thorstenhater <24411438+thorstenhater@users.noreply.github.com>
Date: Wed, 14 Oct 2020 10:53:08 +0200
Subject: [PATCH] Gpu/optimise matrix fine (#1028)

Memory access optimization for the fine matrix GPU solver.
---
 arbor/backends/gpu/matrix_fine.cu        | 92 ++++++++++--------------
 arbor/backends/gpu/matrix_fine.hpp       |  3 +-
 arbor/backends/gpu/matrix_state_fine.hpp | 20 +++---
 3 files changed, 52 insertions(+), 63 deletions(-)

diff --git a/arbor/backends/gpu/matrix_fine.cu b/arbor/backends/gpu/matrix_fine.cu
index 9e785676..228d9642 100644
--- a/arbor/backends/gpu/matrix_fine.cu
+++ b/arbor/backends/gpu/matrix_fine.cu
@@ -42,7 +42,7 @@ void scatter(const T* __restrict__ const from,
     }
 }
 
-/// GPU implementatin of Hines matrix assembly.
+/// GPU implementation of Hines matrix assembly.
 /// Fine layout.
 /// For a given time step size dt:
 ///     - use the precomputed alpha and alpha_d values to construct the diagonal
@@ -59,35 +59,26 @@ void assemble_matrix_fine(
     const T* __restrict__ const conductivity,
     const T* __restrict__ const cv_capacitance,
     const T* __restrict__ const area,
-    const I* __restrict__ const cv_to_cell,
+    const I* __restrict__ const cv_to_intdom,
     const T* __restrict__ const dt_intdom,
-    const I* __restrict__ const cell_to_intdom,
     const I* __restrict__ const perm,
     unsigned n)
 {
     const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
-
-    if (tid<n) {
-        auto cid = cv_to_cell[tid];
-        auto dt = dt_intdom[cell_to_intdom[cid]];
-
-        if (dt>0) {
-            // The 1e-3 is a constant of proportionality required to ensure that the
-            // conductance (gi) values have units μS (micro-Siemens).
-            // See the model documentation in docs/model for more information.
-            T oodt_factor = T(1e-3)/dt;
-            T area_factor = T(1e-3)*area[tid];
-
-            const auto gi = oodt_factor*cv_capacitance[tid] + area_factor*conductivity[tid];
-            const auto pid = perm[tid];
-            d[pid] = gi + invariant_d[tid];
-            rhs[pid] = gi*voltage[tid] - area_factor*current[tid];
-        }
-        else {
-            const auto pid = perm[tid];
-            d[pid] = 0;
-            rhs[pid] = voltage[tid];
-        }
+    if (tid < n) {
+        // The 1e-3 is a constant of proportionality required to ensure that the
+        // conductance (gi) values have units μS (micro-Siemens).
+        // See the model documentation in docs/model for more information.
+        const auto dt = dt_intdom[cv_to_intdom[tid]];
+        const auto p = dt > 0;
+        const auto pid = perm[tid];
+        const auto area_factor = T(1e-3)*area[tid];
+        const auto gi = T(1e-3)*cv_capacitance[tid]/dt + area_factor*conductivity[tid];
+        const auto r_d = gi + invariant_d[tid];
+        const auto r_rhs = gi*voltage[tid] - area_factor*current[tid];
+
+        d[pid] = p ? r_d : 0;
+        rhs[pid] = p ? r_rhs : voltage[tid];
     }
 }
 
@@ -110,8 +101,7 @@ void solve_matrix_fine(
     const fvm_index_type* __restrict__ const level_lengths,
     const fvm_index_type* __restrict__ const level_parents,
     const fvm_index_type* __restrict__ const block_index,
-    fvm_index_type* __restrict__ const num_matrix, // number of packed matrices = number of cells
-    fvm_index_type* __restrict__ const padded_size)
+    const fvm_index_type* __restrict__ const num_matrix) // number of packed matrices = number of cells
 {
     const auto tid = threadIdx.x;
     const auto bid = blockIdx.x;
@@ -135,6 +125,7 @@ void solve_matrix_fine(
 
         const unsigned width = lvl_meta.num_branches;
 
+        // Perform backward substitution for each branch on this level.
         // One thread per branch.
         if (tid < width) {
@@ -150,32 +141,31 @@ void solve_matrix_fine(
             // all the matrices at the same time. When a cell finishes, we put a
             // `0` on the diagonal to mark that it should not be solved for.
             if (d[pos]!=0) {
-                // each branch perform substitution
-                T factor = u[pos] / d[pos];
                 for (unsigned i=0; i<len-1; ++i) {
                     const unsigned next_pos = pos + width;
-                    d[next_pos] -= factor * u[pos];
-                    rhs[next_pos] -= factor * rhs[pos];
-
-                    factor = u[next_pos] / d[next_pos];
+                    const auto d_next = d[next_pos];
+                    const auto rhs_next = rhs[next_pos];
+                    const T factor = -u[pos]/d[pos];
+                    d[next_pos] = fma(factor, u[pos], d_next);
+                    rhs[next_pos] = fma(factor, rhs[pos], rhs_next);
                     pos = next_pos;
                 }
-
                 // Update d and rhs at the parent node of this branch.
                 // A parent may have more than one contributing to it, so we use
                 // atomic updates to avoid races conditions.
                 const unsigned parent_index = next_lvl_meta.matrix_data_index;
                 const unsigned p = parent_index + lvl_parents[tid];
-                //d[p] -= factor * u[pos];
-                gpu_atomic_add(d +p, -factor*u[pos]);
-                //rhs[p] -= factor * rhs[pos];
-                gpu_atomic_add(rhs+p, -factor*rhs[pos]);
+
+                const T factor = -u[pos] / d[pos];
+                gpu_atomic_add(d + p, factor*u[pos]);
+                gpu_atomic_add(rhs + p, factor*rhs[pos]);
             }
         }
         __syncthreads();
     }
 
+    // Solve the root
     {
         // The levels are sorted such that the root is the last level
         const auto& last_lvl_meta = block_level_meta[num_levels-1];
@@ -188,14 +178,14 @@ void solve_matrix_fine(
         unsigned pos = last_lvl_meta.matrix_data_index + tid;
 
         if (d[pos]!=0) {
-            // backward
             for (unsigned i=0; i<len-1; ++i) {
-                T factor = u[pos] / d[pos];
                 const unsigned next_pos = pos + width;
-                d[next_pos] -= factor * u[pos];
-                rhs[next_pos] -= factor * rhs[pos];
-
+                const T factor = -u[pos] / d[pos];
+                const auto rhs_next = rhs[next_pos];
+                const auto d_next = d[next_pos];
+                d[next_pos] = fma(factor, u[pos], d_next);
+                rhs[next_pos] = fma(factor, rhs[pos], rhs_next);
                 pos = next_pos;
             }
 
@@ -233,15 +223,14 @@ void solve_matrix_fine(
         // Perform forward-substitution for each branch on this level.
         // One thread per branch.
        if (tid < width) {
-            // Load the rhs value for the parent node of this branch.
-            const unsigned p = parent_index + lvl_parents[tid];
-            T rhsp = rhs[p];
-
             // Find the index of the first node in this branch.
             const unsigned len = lvl_lengths[tid];
             unsigned pos = lvl_meta.matrix_data_index + (len-1)*width + tid;
 
             if (d[pos]!=0) {
+                // Load the rhs value for the parent node of this branch.
+                const unsigned p = parent_index + lvl_parents[tid];
+                T rhsp = rhs[p];
                 // each branch perform substitution
                 for (unsigned i=0; i<len; ++i) {
                     rhsp = rhs[pos] - u[pos]*rhsp;
@@ -280,7 +269,6 @@ void scatter(
     kernels::scatter<<<griddim, blockdim>>>(from, to, p, n);
 }
 
-
 void assemble_matrix_fine(
     fvm_value_type* d,
     fvm_value_type* rhs,
@@ -290,9 +278,8 @@ void assemble_matrix_fine(
     const fvm_value_type* conductivity,
     const fvm_value_type* cv_capacitance,
     const fvm_value_type* area,
-    const fvm_index_type* cv_to_cell,
+    const fvm_index_type* cv_to_intdom,
     const fvm_value_type* dt_intdom,
-    const fvm_index_type* cell_to_intdom,
     const fvm_index_type* perm,
     unsigned n)
 {
@@ -301,8 +288,7 @@ void assemble_matrix_fine(
     kernels::assemble_matrix_fine<<<num_blocks, block_dim>>>(
         d, rhs, invariant_d, voltage, current, conductivity,
         cv_capacitance, area,
-        cv_to_cell, dt_intdom, cell_to_intdom,
-        perm, n);
+        cv_to_intdom, dt_intdom, perm, n);
 }
 
 // Example:
@@ -337,7 +323,7 @@ void solve_matrix_fine(
 {
     kernels::solve_matrix_fine<<<num_blocks, blocksize>>>(
         rhs, d, u, level_meta, level_lengths, level_parents, block_index,
-        num_cells, padded_size);
+        num_cells);
 }
 
 } // namespace gpu
diff --git a/arbor/backends/gpu/matrix_fine.hpp b/arbor/backends/gpu/matrix_fine.hpp
index a892ce5b..74b34906 100644
--- a/arbor/backends/gpu/matrix_fine.hpp
+++ b/arbor/backends/gpu/matrix_fine.hpp
@@ -34,9 +34,8 @@ void assemble_matrix_fine(
     const fvm_value_type* conductivity,
     const fvm_value_type* cv_capacitance,
     const fvm_value_type* area,
-    const fvm_index_type* cv_to_cell,
+    const fvm_index_type* cv_to_intdom,
     const fvm_value_type* dt_intdom,
-    const fvm_index_type* cell_to_intdom,
     const fvm_index_type* perm,
     unsigned n);
 
diff --git a/arbor/backends/gpu/matrix_state_fine.hpp b/arbor/backends/gpu/matrix_state_fine.hpp
index 3b393a85..4d0d46c5 100644
--- a/arbor/backends/gpu/matrix_state_fine.hpp
+++ b/arbor/backends/gpu/matrix_state_fine.hpp
@@ -85,7 +85,8 @@ public:
 
     using metadata_array = memory::device_vector<level_metadata>;
 
-    iarray cv_to_cell;
+    // Maps control volume to integration domain
+    iarray cv_to_intdom;
 
     array d; // [μS]
     array u; // [μS]
@@ -98,8 +99,8 @@ public:
     // Invariant part of the matrix diagonal
     array invariant_d; // [μS]
 
-    // Maps cell to integration domain
-    iarray cell_to_intdom;
+    // Solution in unpacked format
+    array solution_;
 
     // Maximum number of branches in each level per block
     unsigned max_branches_per_level;
@@ -449,16 +450,20 @@ public:
         // to be stored in flat format
         cv_capacitance = memory::make_const_view(cap);
         invariant_d = memory::make_const_view(invariant_d_tmp);
-        cell_to_intdom = memory::make_const_view(cell_intdom);
 
-        // calculte the cv -> cell mappings
+        // calculate the cv -> cell mappings
        std::vector<size_type> cv_to_cell_tmp(matrix_size);
         size_type ci = 0;
         for (auto cv_span: util::partition_view(cell_cv_divs)) {
             util::fill(util::subrange_view(cv_to_cell_tmp, cv_span), ci);
             ++ci;
         }
-        cv_to_cell = memory::make_const_view(cv_to_cell_tmp);
+
+        std::vector<size_type> cv_to_intdom_tmp(matrix_size);
+        for (auto i = 0ul; i < cv_to_cell_tmp.size(); ++i) {
+            cv_to_intdom_tmp[i] = cell_intdom[cv_to_cell_tmp[i]];
+        }
+        cv_to_intdom = memory::make_const_view(cv_to_intdom_tmp);
     }
 
     // Assemble the matrix
@@ -477,9 +482,8 @@ public:
            conductivity.data(),
             cv_capacitance.data(),
             cv_area.data(),
-            cv_to_cell.data(),
+            cv_to_intdom.data(),
             dt_intdom.data(),
-            cell_to_intdom.data(),
             perm.data(),
             size());
     }
-- 
GitLab
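The assembly kernel in this patch replaces the old divergent dt>0 if/else with straight-line code and a single predicated store per output (d[pid] = p ? r_d : 0), which keeps all threads of a warp on the same instruction stream and issues exactly one write per element. Below is a minimal standalone sketch of that idiom; the kernel name predicated_store_demo and its toy inputs are hypothetical and not part of Arbor or of this patch, and the sketch only illustrates the branch-free store pattern, not the real matrix assembly.

// Minimal sketch (hypothetical, for illustration only): compute both candidate
// results unconditionally, then let a per-thread predicate select the value
// that is written, so there is no divergent else-branch and only one store.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void predicated_store_demo(double* out,
                                      const double* x,
                                      const double* dt,
                                      unsigned n) {
    const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
    if (tid < n) {
        const bool active = dt[tid] > 0;        // predicate, analogous to the dt>0 test
        const double r = x[tid]/dt[tid] + 1.0;  // computed even when inactive; result is simply discarded
        out[tid] = active ? r : x[tid];         // one predicated store per element
    }
}

int main() {
    constexpr unsigned n = 4;
    double hx[n]  = {1.0, 2.0, 3.0, 4.0};
    double hdt[n] = {0.025, 0.0, 0.025, 0.0};   // dt == 0 marks an inactive entry
    double hout[n];

    double *dx, *ddt, *dout;
    cudaMalloc(&dx,   n*sizeof(double));
    cudaMalloc(&ddt,  n*sizeof(double));
    cudaMalloc(&dout, n*sizeof(double));
    cudaMemcpy(dx,  hx,  n*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(ddt, hdt, n*sizeof(double), cudaMemcpyHostToDevice);

    predicated_store_demo<<<1, 128>>>(dout, dx, ddt, n);
    cudaMemcpy(hout, dout, n*sizeof(double), cudaMemcpyDeviceToHost);

    for (unsigned i = 0; i < n; ++i) {
        printf("out[%u] = %g\n", i, hout[i]);
    }
    cudaFree(dx); cudaFree(ddt); cudaFree(dout);
    return 0;
}

Entries with dt == 0 simply get their input value written back, mirroring how the patched assemble_matrix_fine falls back to rhs[pid] = voltage[tid] and d[pid] = 0 when the time step is zero.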