Commit 5ace929e authored by thorstenhater, committed by GitHub

Gpu/restrict all the things (#1026)

Make all pointer arguments to kernels `__restrict__` to avoid unnecessary loads: the qualifier promises the compiler that the buffers do not alias, so values already held in registers need not be re-fetched after a store through another pointer. A sketch of the pattern follows the table below.

The effect on the busyring benchmark (with the `pas` mechanism swapped for `hh`) at 8192 cells on a V100 GPU, `model-run` time in seconds:
| Baseline (s) | After (s) |
|--------------|-----------|
| 2.347        | 2.268     |
| 2.345        | 2.262     |
| 2.321        | 2.276     |
| 2.323        | 2.267     |
| 2.330        | 2.249     |
| min: 2.321   | min: 2.249 |
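
For illustration only (this is not code from the commit; the `axpy` kernel and its buffer names are invented), a minimal sketch of the pattern being applied: every pointer parameter of a kernel is qualified with `__restrict__` (and `const` where the data is read-only), which promises the compiler that the buffers do not overlap and so licenses it to keep loaded operands in registers.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Sketch of the commit's pattern: every pointer parameter is __restrict__,
// and read-only buffers are additionally const. The compiler may then keep
// a[i] and b[i] in registers without re-reading them after the store to out.
template <typename T>
__global__
void axpy(const T* __restrict__ const a,
          const T* __restrict__ const b,
          T* __restrict__ const out,
          T alpha,
          unsigned n) {
    unsigned i = threadIdx.x + blockDim.x*blockIdx.x;
    if (i < n) {
        out[i] = alpha*a[i] + b[i];
    }
}

int main() {
    const unsigned n = 1 << 10;
    float *a, *b, *out;
    cudaMallocManaged(&a, n*sizeof(float));
    cudaMallocManaged(&b, n*sizeof(float));
    cudaMallocManaged(&out, n*sizeof(float));
    for (unsigned i = 0; i < n; ++i) { a[i] = 1.0f; b[i] = 2.0f; }

    axpy<<<(n + 127)/128, 128>>>(a, b, out, 3.0f, n);
    cudaDeviceSynchronize();

    printf("out[0] = %f\n", out[0]); // expect 5.0
    cudaFree(a); cudaFree(b); cudaFree(out);
    return 0;
}
```

The promise only holds if the buffers passed at launch really are disjoint; `__restrict__` does not make the compiler check this, it merely enables the optimization.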
parent efa85c86
```diff
@@ -17,7 +17,10 @@ namespace kernels {
 // to[i] = from[p[i]]
 template <typename T, typename I>
 __global__
-void gather(const T* from, T* to, const I* p, unsigned n) {
+void gather(const T* __restrict__ const from,
+            T* __restrict__ const to,
+            const I* __restrict__ const p,
+            unsigned n) {
     unsigned i = threadIdx.x + blockDim.x*blockIdx.x;
     if (i<n) {
@@ -28,7 +31,10 @@ void gather(const T* from, T* to, const I* p, unsigned n) {
 // to[p[i]] = from[i]
 template <typename T, typename I>
 __global__
-void scatter(const T* from, T* to, const I* p, unsigned n) {
+void scatter(const T* __restrict__ const from,
+             T* __restrict__ const to,
+             const I* __restrict__ const p,
+             unsigned n) {
     unsigned i = threadIdx.x + blockDim.x*blockIdx.x;
     if (i<n) {
@@ -45,18 +51,18 @@ void scatter(const T* from, T* to, const I* p, unsigned n) {
 template <typename T, typename I>
 __global__
 void assemble_matrix_fine(
-    T* d,
-    T* rhs,
-    const T* invariant_d,
-    const T* voltage,
-    const T* current,
-    const T* conductivity,
-    const T* cv_capacitance,
-    const T* area,
-    const I* cv_to_cell,
-    const T* dt_intdom,
-    const I* cell_to_intdom,
-    const I* perm,
+    T* __restrict__ const d,
+    T* __restrict__ const rhs,
+    const T* __restrict__ const invariant_d,
+    const T* __restrict__ const voltage,
+    const T* __restrict__ const current,
+    const T* __restrict__ const conductivity,
+    const T* __restrict__ const cv_capacitance,
+    const T* __restrict__ const area,
+    const I* __restrict__ const cv_to_cell,
+    const T* __restrict__ const dt_intdom,
+    const I* __restrict__ const cell_to_intdom,
+    const I* __restrict__ const perm,
     unsigned n)
 {
     const unsigned tid = threadIdx.x + blockDim.x*blockIdx.x;
@@ -97,15 +103,15 @@ void assemble_matrix_fine(
 template <typename T>
 __global__
 void solve_matrix_fine(
-    T* rhs,
-    T* d,
-    const T* u,
-    const level_metadata* level_meta,
-    const fvm_index_type* level_lengths,
-    const fvm_index_type* level_parents,
-    const fvm_index_type* block_index,
-    fvm_index_type* num_matrix, // number of packed matrices = number of cells
-    fvm_index_type* padded_size)
+    T* __restrict__ const rhs,
+    T* __restrict__ const d,
+    const T* __restrict__ const u,
+    const level_metadata* __restrict__ const level_meta,
+    const fvm_index_type* __restrict__ const level_lengths,
+    const fvm_index_type* __restrict__ const level_parents,
+    const fvm_index_type* __restrict__ const block_index,
+    fvm_index_type* __restrict__ const num_matrix, // number of packed matrices = number of cells
+    fvm_index_type* __restrict__ const padded_size)
 {
     const auto tid = threadIdx.x;
     const auto bid = blockIdx.x;
```

```diff
@@ -11,10 +11,10 @@ namespace kernels {
 template <typename T, typename I>
 __global__ void mark_until_after(
     unsigned n,
-    I* mark,
-    const I* span_end,
-    const T* ev_time,
-    const T* t_until)
+    I* __restrict__ const mark,
+    const I* __restrict__ const span_end,
+    const T* __restrict__ const ev_time,
+    const T* __restrict__ const t_until)
 {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
@@ -31,10 +31,10 @@ namespace kernels {
 template <typename T, typename I>
 __global__ void mark_until(
     unsigned n,
-    I* mark,
-    const I* span_end,
-    const T* ev_time,
-    const T* t_until)
+    I* __restrict__ const mark,
+    const I* __restrict__ const span_end,
+    const T* __restrict__ const ev_time,
+    const T* __restrict__ const t_until)
 {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
@@ -51,10 +51,10 @@ namespace kernels {
 template <typename I>
 __global__ void drop_marked_events(
     unsigned n,
-    I* n_nonempty,
-    I* span_begin,
-    const I* span_end,
-    const I* mark)
+    I* __restrict__ const n_nonempty,
+    I* __restrict__ const span_begin,
+    const I* __restrict__ const span_end,
+    const I* __restrict__ const mark)
 {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
@@ -69,10 +69,10 @@ namespace kernels {
 template <typename T, typename I>
 __global__ void event_time_if_before(
     unsigned n,
-    const I* span_begin,
-    const I* span_end,
-    const T* ev_time,
-    T* t_until)
+    const I* __restrict__ const span_begin,
+    const I* __restrict__ const span_end,
+    const T* __restrict__ const ev_time,
+    T* __restrict__ const t_until)
 {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
```

```diff
@@ -14,7 +14,11 @@ namespace gpu {
 namespace kernel {
 template <typename T>
-__global__ void update_time_to_impl(unsigned n, T* time_to, const T* time, T dt, T tmax) {
+__global__ void update_time_to_impl(unsigned n,
+                                    T* __restrict__ const time_to,
+                                    const T* __restrict__ const time,
+                                    T dt,
+                                    T tmax) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         auto t = time[i]+dt;
@@ -23,7 +27,10 @@ __global__ void update_time_to_impl(unsigned n, T* time_to, const T* time, T dt,
 }
 template <typename T, typename I>
-__global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* voltage, I* current_density) {
+__global__ void add_gj_current_impl(unsigned n,
+                                    const T* __restrict__ const gj_info,
+                                    const I* __restrict__ const voltage,
+                                    I* __restrict__ const current_density) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         auto gj = gj_info[i];
@@ -35,7 +42,9 @@ __global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* volta
 // Vector/scalar addition: x[i] += v ∀i
 template <typename T>
-__global__ void add_scalar(unsigned n, T* x, fvm_value_type v) {
+__global__ void add_scalar(unsigned n,
+                           T* __restrict__ const x,
+                           fvm_value_type v) {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<n) {
         x[i] += v;
@@ -44,11 +53,11 @@ __global__ void add_scalar(unsigned n, T* x, fvm_value_type v) {
 template <typename T, typename I>
 __global__ void set_dt_impl( T* __restrict__ dt_intdom,
-                             const T* time_to,
-                             const T* time,
+                             const T* __restrict__ time_to,
+                             const T* __restrict__ time,
                              const unsigned ncomp,
                              T* __restrict__ dt_comp,
-                             const I* cv_to_intdom) {
+                             const I* __restrict__ cv_to_intdom) {
     auto idx = blockIdx.x*blockDim.x + threadIdx.x;
     if (idx < ncomp) {
         const auto ind = cv_to_intdom[idx];
@@ -60,7 +69,9 @@ __global__ void set_dt_impl( T* __restrict__ dt_intdom,
 __global__ void take_samples_impl(
     multi_event_stream_state<raw_probe_info> s,
-    const fvm_value_type* time, fvm_value_type* sample_time, fvm_value_type* sample_value)
+    const fvm_value_type* __restrict__ const time,
+    fvm_value_type* __restrict__ const sample_time,
+    fvm_value_type* __restrict__ const sample_value)
 {
     unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
     if (i<s.n) {
```

```diff
@@ -30,10 +30,15 @@ inline T lerp(T a, T b, T u) {
 __global__
 void test_thresholds_impl(
     int size,
-    const fvm_index_type* cv_to_intdom, const fvm_value_type* t_after, const fvm_value_type* t_before,
+    const fvm_index_type* __restrict__ const cv_to_intdom,
+    const fvm_value_type* __restrict__ const t_after,
+    const fvm_value_type* __restrict__ const t_before,
     stack_storage<threshold_crossing>& stack,
-    fvm_index_type* is_crossed, fvm_value_type* prev_values,
-    const fvm_index_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds)
+    fvm_index_type* __restrict__ const is_crossed,
+    fvm_value_type* __restrict__ const prev_values,
+    const fvm_index_type* __restrict__ const cv_index,
+    const fvm_value_type* __restrict__ const values,
+    const fvm_value_type* __restrict__ const thresholds)
 {
     int i = threadIdx.x + blockIdx.x*blockDim.x;
@@ -73,8 +78,11 @@ void test_thresholds_impl(
 __global__
 extern void reset_crossed_impl(
-    int size, fvm_index_type* is_crossed,
-    const fvm_index_type* cv_index, const fvm_value_type* values, const fvm_value_type* thresholds)
+    int size,
+    fvm_index_type* __restrict__ const is_crossed,
+    const fvm_index_type* __restrict__ const cv_index,
+    const fvm_value_type* __restrict__ const values,
+    const fvm_value_type* __restrict__ const thresholds)
 {
     int i = threadIdx.x + blockIdx.x*blockDim.x;
     if (i<size) {
```

```diff
@@ -7,7 +7,7 @@ namespace gpu {
 template <typename T, typename I>
 __global__
-void fill_kernel(T* v, T value, I n) {
+void fill_kernel(T* __restrict__ const v, T value, I n) {
     auto tid = threadIdx.x + blockDim.x*blockIdx.x;
     if(tid < n) {
```