Unverified commit a88f9c02, authored by thorstenhater, committed by GitHub

Gpu/optimise matrix fine (#1028)

Memory access optimization for the fine matrix GPU solver: precompute the cv -> intdom map on the host so the assembly kernel performs one gather instead of two dependent ones, replace the dt > 0 branch with predicated stores, rewrite the substitution updates as fused multiply-adds, and drop the padded_size parameter from solve_matrix_fine.
parent f953783d
@@ -42,7 +42,7 @@ void scatter(const T* __restrict__ const from,
}
}
/// GPU implementatin of Hines matrix assembly.
/// GPU implementation of Hines matrix assembly.
/// Fine layout.
/// For a given time step size dt:
/// - use the precomputed alpha and alpha_d values to construct the diagonal
@@ -59,35 +59,26 @@ void assemble_matrix_fine(
const T* __restrict__ const conductivity,
const T* __restrict__ const cv_capacitance,
const T* __restrict__ const area,
const I* __restrict__ const cv_to_cell,
const I* __restrict__ const cv_to_intdom,
const T* __restrict__ const dt_intdom,
const I* __restrict__ const cell_to_intdom,
const I* __restrict__ const perm,
unsigned n)
{
const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
if (tid<n) {
auto cid = cv_to_cell[tid];
auto dt = dt_intdom[cell_to_intdom[cid]];
if (dt>0) {
// The 1e-3 is a constant of proportionality required to ensure that the
// conductance (gi) values have units μS (micro-Siemens).
// See the model documentation in docs/model for more information.
T oodt_factor = T(1e-3)/dt;
T area_factor = T(1e-3)*area[tid];
const auto gi = oodt_factor*cv_capacitance[tid] + area_factor*conductivity[tid];
const auto pid = perm[tid];
d[pid] = gi + invariant_d[tid];
rhs[pid] = gi*voltage[tid] - area_factor*current[tid];
}
else {
const auto pid = perm[tid];
d[pid] = 0;
rhs[pid] = voltage[tid];
}
if (tid < n) {
// The 1e-3 is a constant of proportionality required to ensure that the
// conductance (gi) values have units μS (micro-Siemens).
// See the model documentation in docs/model for more information.
const auto dt = dt_intdom[cv_to_intdom[tid]];
const auto p = dt > 0;
const auto pid = perm[tid];
const auto area_factor = T(1e-3)*area[tid];
const auto gi = T(1e-3)*cv_capacitance[tid]/dt + area_factor*conductivity[tid];
const auto r_d = gi + invariant_d[tid];
const auto r_rhs = gi*voltage[tid] - area_factor*current[tid];
d[pid] = p ? r_d : 0;
rhs[pid] = p ? r_rhs : voltage[tid];
}
}
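The rewritten kernel body above is branch-free: every thread computes both candidate results unconditionally and a predicate decides which value is stored, so a warp no longer diverges on the dt > 0 test. Dividing by a zero dt yields an IEEE infinity or NaN, but the select discards it, exactly as in the kernel. A minimal standalone sketch of the same pattern follows; the kernel and parameter names are hypothetical, not part of Arbor's API, and the ternary on already-computed values is what the compiler typically lowers to a select instruction:

// Predicated assembly: uniform arithmetic, conditional stores only.
template <typename T>
__global__ void predicated_assemble(T* __restrict__ out_d,
                                    T* __restrict__ out_rhs,
                                    const T* __restrict__ cap,
                                    const T* __restrict__ volt,
                                    const T* __restrict__ dt,
                                    unsigned n) {
    const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
    if (tid < n) {
        const bool p = dt[tid] > 0;       // predicate replaces the branch
        const T g    = cap[tid]/dt[tid];  // inf/nan when dt == 0; discarded below
        out_d[tid]   = p ? g : T(0);      // select, no divergent control flow
        out_rhs[tid] = p ? g*volt[tid] : volt[tid];
    }
}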
@@ -110,8 +101,7 @@ void solve_matrix_fine(
const fvm_index_type* __restrict__ const level_lengths,
const fvm_index_type* __restrict__ const level_parents,
const fvm_index_type* __restrict__ const block_index,
fvm_index_type* __restrict__ const num_matrix, // number of packed matrices = number of cells
fvm_index_type* __restrict__ const padded_size)
const fvm_index_type* __restrict__ const num_matrix) // number of packed matrices = number of cells
{
const auto tid = threadIdx.x;
const auto bid = blockIdx.x;
@@ -135,6 +125,7 @@ void solve_matrix_fine(
const unsigned width = lvl_meta.num_branches;
// Perform backward substitution for each branch on this level.
// One thread per branch.
if (tid < width) {
@@ -150,32 +141,31 @@ void solve_matrix_fine(
// all the matrices at the same time. When a cell finishes, we put a
// `0` on the diagonal to mark that it should not be solved for.
if (d[pos]!=0) {
// each branch performs substitution
T factor = u[pos] / d[pos];
for (unsigned i=0; i<len-1; ++i) {
const unsigned next_pos = pos + width;
d[next_pos] -= factor * u[pos];
rhs[next_pos] -= factor * rhs[pos];
factor = u[next_pos] / d[next_pos];
const auto d_next = d[next_pos];
const auto rhs_next = rhs[next_pos];
const T factor = -u[pos]/d[pos];
d[next_pos] = fma(factor, u[pos], d_next);
rhs[next_pos] = fma(factor, rhs[pos], rhs_next);
pos = next_pos;
}
// Update d and rhs at the parent node of this branch.
// A parent may have more than one branch contributing to it, so we
// use atomic updates to avoid race conditions.
const unsigned parent_index = next_lvl_meta.matrix_data_index;
const unsigned p = parent_index + lvl_parents[tid];
//d[p] -= factor * u[pos];
gpu_atomic_add(d +p, -factor*u[pos]);
//rhs[p] -= factor * rhs[pos];
gpu_atomic_add(rhs+p, -factor*rhs[pos]);
const T factor = -u[pos] / d[pos];
gpu_atomic_add(d + p, factor*u[pos]);
gpu_atomic_add(rhs + p, factor*rhs[pos]);
}
}
__syncthreads();
}
// Solve the root
{
// The levels are sorted such that the root is the last level
const auto& last_lvl_meta = block_level_meta[num_levels-1];
@@ -188,14 +178,14 @@ void solve_matrix_fine(
unsigned pos = last_lvl_meta.matrix_data_index + tid;
if (d[pos]!=0) {
// backward
for (unsigned i=0; i<len-1; ++i) {
T factor = u[pos] / d[pos];
const unsigned next_pos = pos + width;
d[next_pos] -= factor * u[pos];
rhs[next_pos] -= factor * rhs[pos];
const T factor = -u[pos] / d[pos];
const auto rhs_next = rhs[next_pos];
const auto d_next = d[next_pos];
d[next_pos] = fma(factor, u[pos], d_next);
rhs[next_pos] = fma(factor, rhs[pos], rhs_next);
pos = next_pos;
}
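Both substitution loops above now negate the factor once (factor = -u[pos]/d[pos]) so that each row update becomes a fused multiply-add: fma(a, b, c) computes a*b + c in a single instruction with one rounding. A reduced device-side sketch of one elimination step, assuming the value type instantiates to double and using illustrative names:

// One Thomas-style elimination step along a branch, written with fma.
// d, u, rhs and the width-strided indexing mirror the kernel above.
__device__ void eliminate_step(double* d, const double* u, double* rhs,
                               unsigned& pos, unsigned width) {
    const unsigned next = pos + width;     // next row of this branch
    const double factor = -u[pos]/d[pos];  // negated once so fma adds
    d[next]   = fma(factor, u[pos],   d[next]);   // d'   = d   - (u/d)*u
    rhs[next] = fma(factor, rhs[pos], rhs[next]); // rhs' = rhs - (u/d)*rhs
    pos = next;
}

The kernel additionally reads d[next_pos] and rhs[next_pos] into locals, which makes the independent global loads explicit before the dependent division and fma.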
@@ -233,15 +223,14 @@ void solve_matrix_fine(
// Perform forward-substitution for each branch on this level.
// One thread per branch.
if (tid < width) {
// Load the rhs value for the parent node of this branch.
const unsigned p = parent_index + lvl_parents[tid];
T rhsp = rhs[p];
// Find the index of the first node in this branch.
const unsigned len = lvl_lengths[tid];
unsigned pos = lvl_meta.matrix_data_index + (len-1)*width + tid;
if (d[pos]!=0) {
// Load the rhs value for the parent node of this branch.
const unsigned p = parent_index + lvl_parents[tid];
T rhsp = rhs[p];
// each branch performs substitution
for (unsigned i=0; i<len; ++i) {
rhsp = rhs[pos] - u[pos]*rhsp;
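The hunk is truncated here, but the recurrence being computed is standard forward substitution from the already-solved parent value toward the branch tip: after elimination each row reads d*x + u*x_parent = rhs, so x = (rhs - u*x_parent)/d. A sequential sketch of that recurrence in plain C++ (hypothetical free function; the kernel's width-strided, per-thread indexing is elided):

// Forward substitution along one branch: x_i = (rhs_i - u_i*x_parent)/d_i.
// The solution overwrites rhs in place here.
void solve_branch(double* rhs, const double* u, const double* d,
                  unsigned first, unsigned len, double x_parent) {
    double xp = x_parent;
    for (unsigned i = 0; i < len; ++i) {
        const unsigned pos = first + i;
        xp = (rhs[pos] - u[pos]*xp)/d[pos]; // solve this node from its parent
        rhs[pos] = xp;
    }
}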
@@ -280,7 +269,6 @@ void scatter(
kernels::scatter<<<griddim, blockdim>>>(from, to, p, n);
}
void assemble_matrix_fine(
fvm_value_type* d,
fvm_value_type* rhs,
@@ -290,9 +278,8 @@ void assemble_matrix_fine(
const fvm_value_type* conductivity,
const fvm_value_type* cv_capacitance,
const fvm_value_type* area,
const fvm_index_type* cv_to_cell,
const fvm_index_type* cv_to_intdom,
const fvm_value_type* dt_intdom,
const fvm_index_type* cell_to_intdom,
const fvm_index_type* perm,
unsigned n)
{
@@ -301,8 +288,7 @@ void assemble_matrix_fine(
kernels::assemble_matrix_fine<<<num_blocks, block_dim>>>(
d, rhs, invariant_d, voltage, current, conductivity, cv_capacitance, area,
cv_to_cell, dt_intdom, cell_to_intdom,
perm, n);
cv_to_intdom, dt_intdom, perm, n);
}
// Example:
@@ -337,7 +323,7 @@ void solve_matrix_fine(
{
kernels::solve_matrix_fine<<<num_blocks, blocksize>>>(
rhs, d, u, level_meta, level_lengths, level_parents, block_index,
num_cells, padded_size);
num_cells);
}
} // namespace gpu
@@ -34,9 +34,8 @@ void assemble_matrix_fine(
const fvm_value_type* conductivity,
const fvm_value_type* cv_capacitance,
const fvm_value_type* area,
const fvm_index_type* cv_to_cell,
const fvm_index_type* cv_to_intdom,
const fvm_value_type* dt_intdom,
const fvm_index_type* cell_to_intdom,
const fvm_index_type* perm,
unsigned n);
@@ -85,7 +85,8 @@ public:
using metadata_array = memory::device_vector<level_metadata>;
iarray cv_to_cell;
// Maps control volume to integration domain
iarray cv_to_intdom;
array d; // [μS]
array u; // [μS]
@@ -98,8 +99,8 @@ public:
// Invariant part of the matrix diagonal
array invariant_d; // [μS]
// Maps cell to integration domain
iarray cell_to_intdom;
// Solution in unpacked format
array solution_;
// Maximum number of branches in each level per block
unsigned max_branches_per_level;
@@ -449,16 +450,20 @@ public:
// to be stored in flat format
cv_capacitance = memory::make_const_view(cap);
invariant_d = memory::make_const_view(invariant_d_tmp);
cell_to_intdom = memory::make_const_view(cell_intdom);
// calculte the cv -> cell mappings
// calculate the cv -> cell mappings
std::vector<size_type> cv_to_cell_tmp(matrix_size);
size_type ci = 0;
for (auto cv_span: util::partition_view(cell_cv_divs)) {
util::fill(util::subrange_view(cv_to_cell_tmp, cv_span), ci);
++ci;
}
cv_to_cell = memory::make_const_view(cv_to_cell_tmp);
std::vector<size_type> cv_to_intdom_tmp(matrix_size);
for (auto i = 0ul; i < cv_to_cell_tmp.size(); ++i) {
cv_to_intdom_tmp[i] = cell_intdom[cv_to_cell_tmp[i]];
}
cv_to_intdom = memory::make_const_view(cv_to_intdom_tmp);
}
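This host-side block is the core of the memory-access optimization: previously each GPU thread performed two dependent gathers per time step (cv_to_cell, then cell_to_intdom); now the composition of the two maps is precomputed once so the kernel issues a single gather. The same idea in isolation, as a small C++ sketch with illustrative names:

#include <cstddef>
#include <vector>

// Compose two index maps on the host so the device needs one lookup.
std::vector<int> compose_index_maps(const std::vector<int>& cv_to_cell,
                                    const std::vector<int>& cell_to_intdom) {
    std::vector<int> cv_to_intdom(cv_to_cell.size());
    for (std::size_t i = 0; i < cv_to_cell.size(); ++i) {
        cv_to_intdom[i] = cell_to_intdom[cv_to_cell[i]];
    }
    return cv_to_intdom;
}

Both maps are fixed for the lifetime of the matrix state, so the composition costs one loop at construction time and saves a dependent global-memory load per thread on every assembly.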
// Assemble the matrix
@@ -477,9 +482,8 @@ public:
conductivity.data(),
cv_capacitance.data(),
cv_area.data(),
cv_to_cell.data(),
cv_to_intdom.data(),
dt_intdom.data(),
cell_to_intdom.data(),
perm.data(),
size());
}