arbor-sim / arbor · Commits

Commit a88f9c02 (unverified)
Authored 4 years ago by thorstenhater; committed via GitHub 4 years ago.
Gpu/optimise matrix fine (#1028)
Memory access optimization for the fine matrix GPU solver.
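In outline: assemble_matrix_fine now looks up the per-CV time step through a single precomputed cv_to_intdom map instead of chaining cv_to_cell with cell_to_intdom, and replaces its divergent if/else on dt > 0 with predicated selects; solve_matrix_fine pre-loads the values it is about to update and expresses each elimination update as a fused multiply-add, folding the sign into the factor so the parent-row atomics need no extra negation.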
parent f953783d

No related branches, tags, or merge requests contain this commit.
Changes: 3 changed files with 52 additions and 63 deletions.

    arbor/backends/gpu/matrix_fine.cu            39 additions, 53 deletions
    arbor/backends/gpu/matrix_fine.hpp            1 addition,   2 deletions
    arbor/backends/gpu/matrix_state_fine.hpp     12 additions,  8 deletions
arbor/backends/gpu/matrix_fine.cu (+39 −53)
@@ -42,7 +42,7 @@ void scatter(const T* __restrict__ const from,
     }
 }
 
-/// GPU implementatin of Hines matrix assembly.
+/// GPU implementation of Hines matrix assembly.
 /// Fine layout.
 /// For a given time step size dt:
 /// - use the precomputed alpha and alpha_d values to construct the diagonal
@@ -59,35 +59,26 @@ void assemble_matrix_fine(
     const T* __restrict__ const conductivity,
     const T* __restrict__ const cv_capacitance,
     const T* __restrict__ const area,
-    const I* __restrict__ const cv_to_cell,
+    const I* __restrict__ const cv_to_intdom,
     const T* __restrict__ const dt_intdom,
-    const I* __restrict__ const cell_to_intdom,
     const I* __restrict__ const perm,
     unsigned n)
 {
     const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
 
     if (tid < n) {
-        auto cid = cv_to_cell[tid];
-        auto dt = dt_intdom[cell_to_intdom[cid]];
-
-        if (dt > 0) {
-            // The 1e-3 is a constant of proportionality required to ensure that the
-            // conductance (gi) values have units μS (micro-Siemens).
-            // See the model documentation in docs/model for more information.
-            T oodt_factor = T(1e-3)/dt;
-            T area_factor = T(1e-3)*area[tid];
-
-            const auto gi = oodt_factor*cv_capacitance[tid] + area_factor*conductivity[tid];
-            const auto pid = perm[tid];
-            d[pid]   = gi + invariant_d[tid];
-            rhs[pid] = gi*voltage[tid] - area_factor*current[tid];
-        }
-        else {
-            const auto pid = perm[tid];
-            d[pid]   = 0;
-            rhs[pid] = voltage[tid];
-        }
+        // The 1e-3 is a constant of proportionality required to ensure that the
+        // conductance (gi) values have units μS (micro-Siemens).
+        // See the model documentation in docs/model for more information.
+        const auto dt  = dt_intdom[cv_to_intdom[tid]];
+        const auto p   = dt > 0;
+        const auto pid = perm[tid];
+        const auto area_factor = T(1e-3)*area[tid];
+        const auto gi    = T(1e-3)*cv_capacitance[tid]/dt + area_factor*conductivity[tid];
+        const auto r_d   = gi + invariant_d[tid];
+        const auto r_rhs = gi*voltage[tid] - area_factor*current[tid];
+        d[pid]   = p ? r_d   : 0;
+        rhs[pid] = p ? r_rhs : voltage[tid];
     }
 }
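The rewritten kernel body computes both candidate results unconditionally and picks one with a predicated select instead of branching on dt > 0, which keeps all threads of a warp on the same path. A minimal standalone sketch of that pattern, with hypothetical names (toy_assemble, xs, out) that are not Arbor code:

    // Sketch of the predicated-select pattern used by the new
    // assemble_matrix_fine body. Compile with: nvcc -o toy toy.cu
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void toy_assemble(const double* __restrict__ xs,
                                 double* __restrict__ out,
                                 unsigned n) {
        const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
        if (tid < n) {
            const double x = xs[tid];
            // Evaluate both candidates up front; the ternary compiles to a
            // conditional select rather than a divergent branch. The value
            // on the untaken side (inf when x == 0) is simply discarded.
            const bool   p = x > 0;
            const double r = 1.0/x + 2.0*x;
            out[tid] = p ? r : 0.0;
        }
    }

    int main() {
        const unsigned n = 4;
        double xs_h[] = {2.0, 0.0, 4.0, -1.0}, out_h[4];
        double *xs_d, *out_d;
        cudaMalloc(&xs_d,  n*sizeof(double));
        cudaMalloc(&out_d, n*sizeof(double));
        cudaMemcpy(xs_d, xs_h, n*sizeof(double), cudaMemcpyHostToDevice);
        toy_assemble<<<1, 32>>>(xs_d, out_d, n);
        cudaMemcpy(out_h, out_d, n*sizeof(double), cudaMemcpyDeviceToHost);
        for (unsigned i = 0; i < n; ++i) printf("out[%u] = %g\n", i, out_h[i]);
        cudaFree(xs_d); cudaFree(out_d);
    }

As in the hunk above, doing the arithmetic for the dt == 0 case and throwing it away is generally cheaper on a GPU than letting the warp diverge.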
@@ -110,8 +101,7 @@ void solve_matrix_fine(
     const fvm_index_type* __restrict__ const level_lengths,
     const fvm_index_type* __restrict__ const level_parents,
     const fvm_index_type* __restrict__ const block_index,
-    fvm_index_type* __restrict__ const num_matrix, // number of packed matrices = number of cells
-    fvm_index_type* __restrict__ const padded_size)
+    const fvm_index_type* __restrict__ const num_matrix) // number of packed matrices = number of cells
 {
     const auto tid = threadIdx.x;
     const auto bid = blockIdx.x;
@@ -135,6 +125,7 @@ void solve_matrix_fine(
         const unsigned width = lvl_meta.num_branches;
 
+        // Perform backward substitution for each branch on this level.
         // One thread per branch.
         if (tid < width) {
@@ -150,32 +141,31 @@ void solve_matrix_fine(
             // all the matrices at the same time. When a cell finishes, we put a
             // `0` on the diagonal to mark that it should not be solved for.
             if (d[pos] != 0) {
                 // each branch performs substitution
-                T factor = u[pos] / d[pos];
                 for (unsigned i = 0; i < len - 1; ++i) {
                     const unsigned next_pos = pos + width;
-                    d[next_pos]   -= factor * u[pos];
-                    rhs[next_pos] -= factor * rhs[pos];
-
-                    factor = u[next_pos] / d[next_pos];
+                    const auto d_next   = d[next_pos];
+                    const auto rhs_next = rhs[next_pos];
+
+                    const T factor = -u[pos]/d[pos];
+                    d[next_pos]   = fma(factor, u[pos],   d_next);
+                    rhs[next_pos] = fma(factor, rhs[pos], rhs_next);
                     pos = next_pos;
                 }
 
                 // Update d and rhs at the parent node of this branch.
                 // A parent may have more than one branch contributing to it, so we
                 // use atomic updates to avoid race conditions.
                 const unsigned parent_index = next_lvl_meta.matrix_data_index;
                 const unsigned p = parent_index + lvl_parents[tid];
-                //d[p]   -= factor * u[pos];
-                gpu_atomic_add(d + p, -factor*u[pos]);
-                //rhs[p] -= factor * rhs[pos];
-                gpu_atomic_add(rhs + p, -factor*rhs[pos]);
+
+                const T factor = -u[pos]/d[pos];
+                gpu_atomic_add(d + p,   factor*u[pos]);
+                gpu_atomic_add(rhs + p, factor*rhs[pos]);
             }
         }
         __syncthreads();
     }
 
     // Solve the root
     {
         // The levels are sorted such that the root is the last level
         const auto& last_lvl_meta = block_level_meta[num_levels - 1];
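Three of the access-pattern changes in this hunk can be seen in isolation: the values being updated are loaded before the division so memory latency overlaps the arithmetic; the sign is folded into factor so each update is a single fma (one rounding instead of a multiply followed by a subtract); and contributions to a shared parent row are accumulated atomically. A compressed, hypothetical sketch — toy_step is not Arbor code, and plain atomicAdd stands in for Arbor's gpu_atomic_add wrapper:

    // One elimination step plus a parent-row contribution, in the style of
    // the new solve_matrix_fine. Compile with: nvcc -arch=sm_60 -o toy toy.cu
    // (atomicAdd on double requires compute capability 6.0 or newer).
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void toy_step(double* d, double* u, double* rhs,
                             unsigned pos, unsigned next_pos, unsigned parent) {
        // Issue the loads of the values we are about to update first, so
        // they overlap the dependent division below.
        const double d_next   = d[next_pos];
        const double rhs_next = rhs[next_pos];

        // factor carries the minus sign: x -= f*y becomes x = fma(-f, y, x).
        const double factor = -u[pos]/d[pos];
        d[next_pos]   = fma(factor, u[pos],   d_next);
        rhs[next_pos] = fma(factor, rhs[pos], rhs_next);

        // The branch tip contributes to its parent row; several branches
        // (threads) may share a parent, hence the atomics.
        const double tip_factor = -u[next_pos]/d[next_pos];
        atomicAdd(d + parent,   tip_factor*u[next_pos]);
        atomicAdd(rhs + parent, tip_factor*rhs[next_pos]);
    }

    int main() {
        // Rows 0 and 1 form a two-node branch; row 2 is its parent.
        double d_h[]   = {4.0, 3.0, 5.0};
        double u_h[]   = {1.0, 2.0, 0.0};
        double rhs_h[] = {8.0, 6.0, 10.0};
        double *d_d, *u_d, *rhs_d;
        cudaMalloc(&d_d,   sizeof d_h);
        cudaMalloc(&u_d,   sizeof u_h);
        cudaMalloc(&rhs_d, sizeof rhs_h);
        cudaMemcpy(d_d,   d_h,   sizeof d_h,   cudaMemcpyHostToDevice);
        cudaMemcpy(u_d,   u_h,   sizeof u_h,   cudaMemcpyHostToDevice);
        cudaMemcpy(rhs_d, rhs_h, sizeof rhs_h, cudaMemcpyHostToDevice);
        toy_step<<<1, 1>>>(d_d, u_d, rhs_d, 0, 1, 2);
        cudaMemcpy(d_h,   d_d,   sizeof d_h,   cudaMemcpyDeviceToHost);
        cudaMemcpy(rhs_h, rhs_d, sizeof rhs_h, cudaMemcpyDeviceToHost);
        printf("d   = [%g %g %g]\n", d_h[0], d_h[1], d_h[2]);
        printf("rhs = [%g %g %g]\n", rhs_h[0], rhs_h[1], rhs_h[2]);
        cudaFree(d_d); cudaFree(u_d); cudaFree(rhs_d);
    }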
@@ -188,14 +178,14 @@ void solve_matrix_fine(
         unsigned pos = last_lvl_meta.matrix_data_index + tid;
 
         if (d[pos] != 0) {
             // backward
             for (unsigned i = 0; i < len - 1; ++i) {
-                T factor = u[pos] / d[pos];
                 const unsigned next_pos = pos + width;
-                d[next_pos]   -= factor * u[pos];
-                rhs[next_pos] -= factor * rhs[pos];
+                const T factor = -u[pos]/d[pos];
+                const auto rhs_next = rhs[next_pos];
+                const auto d_next   = d[next_pos];
+                d[next_pos]   = fma(factor, u[pos],   d_next);
+                rhs[next_pos] = fma(factor, rhs[pos], rhs_next);
                 pos = next_pos;
             }
@@ -233,15 +223,14 @@ void solve_matrix_fine(
         // Perform forward-substitution for each branch on this level.
         // One thread per branch.
         if (tid < width) {
-            // Load the rhs value for the parent node of this branch.
-            const unsigned p = parent_index + lvl_parents[tid];
-            T rhsp = rhs[p];
-
             // Find the index of the first node in this branch.
             const unsigned len = lvl_lengths[tid];
             unsigned pos = lvl_meta.matrix_data_index + (len-1)*width + tid;
 
             if (d[pos] != 0) {
+                // Load the rhs value for the parent node of this branch.
+                const unsigned p = parent_index + lvl_parents[tid];
+                T rhsp = rhs[p];
                 // each branch performs substitution
                 for (unsigned i = 0; i < len; ++i) {
                     rhsp = rhs[pos] - u[pos]*rhsp;
@@ -280,7 +269,6 @@ void scatter(
     kernels::scatter<<<griddim, blockdim>>>(from, to, p, n);
 }
 
 void assemble_matrix_fine(
     fvm_value_type* d,
     fvm_value_type* rhs,
@@ -290,9 +278,8 @@ void assemble_matrix_fine(
     const fvm_value_type* conductivity,
     const fvm_value_type* cv_capacitance,
     const fvm_value_type* area,
-    const fvm_index_type* cv_to_cell,
+    const fvm_index_type* cv_to_intdom,
     const fvm_value_type* dt_intdom,
-    const fvm_index_type* cell_to_intdom,
     const fvm_index_type* perm,
     unsigned n)
 {
@@ -301,8 +288,7 @@ void assemble_matrix_fine(
     kernels::assemble_matrix_fine<<<num_blocks, block_dim>>>(
         d, rhs, invariant_d, voltage, current, conductivity, cv_capacitance, area,
-        cv_to_cell, dt_intdom, cell_to_intdom, perm, n);
+        cv_to_intdom, dt_intdom, perm, n);
 }
 
 // Example:
@@ -337,7 +323,7 @@ void solve_matrix_fine(
 {
     kernels::solve_matrix_fine<<<num_blocks, blocksize>>>(
         rhs, d, u, level_meta, level_lengths, level_parents, block_index,
-        num_cells, padded_size);
+        num_cells);
 }
 
 } // namespace gpu
arbor/backends/gpu/matrix_fine.hpp (+1 −2)
@@ -34,9 +34,8 @@ void assemble_matrix_fine(
     const fvm_value_type* conductivity,
     const fvm_value_type* cv_capacitance,
     const fvm_value_type* area,
-    const fvm_index_type* cv_to_cell,
+    const fvm_index_type* cv_to_intdom,
     const fvm_value_type* dt_intdom,
-    const fvm_index_type* cell_to_intdom,
     const fvm_index_type* perm,
     unsigned n);
arbor/backends/gpu/matrix_state_fine.hpp (+12 −8)
@@ -85,7 +85,8 @@ public:
     using metadata_array = memory::device_vector<level_metadata>;
 
-    iarray cv_to_cell;
+    // Maps control volume to integration domain
+    iarray cv_to_intdom;
 
     array d;   // [μS]
     array u;   // [μS]
@@ -98,8 +99,8 @@ public:
     // Invariant part of the matrix diagonal
     array invariant_d; // [μS]
 
-    // Maps cell to integration domain
-    iarray cell_to_intdom;
+    // Solution in unpacked format
+    array solution_;
 
     // Maximum number of branches in each level per block
     unsigned max_branches_per_level;
@@ -449,16 +450,20 @@ public:
         // to be stored in flat format
         cv_capacitance = memory::make_const_view(cap);
         invariant_d = memory::make_const_view(invariant_d_tmp);
-        cell_to_intdom = memory::make_const_view(cell_intdom);
 
-        // calculte the cv -> cell mappings
+        // calculate the cv -> cell mappings
         std::vector<size_type> cv_to_cell_tmp(matrix_size);
         size_type ci = 0;
         for (auto cv_span: util::partition_view(cell_cv_divs)) {
             util::fill(util::subrange_view(cv_to_cell_tmp, cv_span), ci);
             ++ci;
         }
-        cv_to_cell = memory::make_const_view(cv_to_cell_tmp);
+
+        std::vector<size_type> cv_to_intdom_tmp(matrix_size);
+        for (auto i = 0ul; i < cv_to_cell_tmp.size(); ++i) {
+            cv_to_intdom_tmp[i] = cell_intdom[cv_to_cell_tmp[i]];
+        }
+        cv_to_intdom = memory::make_const_view(cv_to_intdom_tmp);
     }
 
     // Assemble the matrix
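The constructor change composes the two host-side maps (cv -> cell, then cell -> intdom) into one cv -> intdom array up front, so the assembly kernel performs a single gather, dt_intdom[cv_to_intdom[tid]], instead of two dependent ones. A host-only C++ sketch of the same composition, with hypothetical data and names:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Compose index maps on the host so the device needs a single lookup.
    // Stand-ins for Arbor's cv_to_cell_tmp and cell_intdom vectors.
    std::vector<std::size_t> compose(const std::vector<std::size_t>& cv_to_cell,
                                     const std::vector<std::size_t>& cell_to_intdom) {
        std::vector<std::size_t> cv_to_intdom(cv_to_cell.size());
        for (std::size_t i = 0; i < cv_to_cell.size(); ++i) {
            cv_to_intdom[i] = cell_to_intdom[cv_to_cell[i]];
        }
        return cv_to_intdom;
    }

    int main() {
        // Five CVs over three cells; cells 0 and 1 share integration domain 0.
        std::vector<std::size_t> cv_to_cell     = {0, 0, 1, 2, 2};
        std::vector<std::size_t> cell_to_intdom = {0, 0, 1};
        // Before: dt = dt_intdom[cell_to_intdom[cv_to_cell[i]]]  (two gathers)
        // After:  dt = dt_intdom[cv_to_intdom[i]]                (one gather)
        auto cv_to_intdom = compose(cv_to_cell, cell_to_intdom);
        for (auto v: cv_to_intdom) printf("%zu ", v);  // prints: 0 0 0 1 1
        printf("\n");
    }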
@@ -477,9 +482,8 @@ public:
             conductivity.data(), cv_capacitance.data(), cv_area.data(),
-            cv_to_cell.data(), dt_intdom.data(), cell_to_intdom.data(),
-            perm.data(), size());
+            cv_to_intdom.data(), dt_intdom.data(), perm.data(), size());
     }