From bd1e56a562a4c64d40f9e67ffedd64a177d25d71 Mon Sep 17 00:00:00 2001
From: Sam Yates <yates@cscs.ch>
Date: Wed, 24 May 2017 16:34:50 +0200
Subject: [PATCH] Add required thread synchronization to matrix kernel. (#280)

There is a potential data race in the `assemble_matrix_interleaved` kernel: threads in a different warp can overwrite the `buffer_v` and `buffer_i` values before those values have been used to update the `d` and `rhs` vectors.

This race has been exercised in the asynchronous event delivery branch.

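* Add `__syncthreads()` to the `assemble_matrix_interleaved` kernel after the `d` and `rhs` update.
* Add `__syncthreads()` to `flat_to_interleaved` and `interleaved_to_flat` in `interleave.hpp` after the store from `buffer` to `out`.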
---
 src/backends/gpu/kernels/assemble_matrix.hpp | 2 ++
 src/backends/gpu/kernels/interleave.hpp      | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/backends/gpu/kernels/assemble_matrix.hpp b/src/backends/gpu/kernels/assemble_matrix.hpp
index 85248468..69d19733 100644
--- a/src/backends/gpu/kernels/assemble_matrix.hpp
+++ b/src/backends/gpu/kernels/assemble_matrix.hpp
@@ -94,6 +94,8 @@ void assemble_matrix_interleaved(
             rhs[store_pos] = gi*buffer_v[blk_pos] - buffer_i[blk_pos];
         }
 
+        __syncthreads();
+
         store_pos += LoadWidth*BlockWidth;
         load_pos  += LoadWidth;
     }
diff --git a/src/backends/gpu/kernels/interleave.hpp b/src/backends/gpu/kernels/interleave.hpp
index 3762368a..7488f38b 100644
--- a/src/backends/gpu/kernels/interleave.hpp
+++ b/src/backends/gpu/kernels/interleave.hpp
@@ -55,6 +55,7 @@ void flat_to_interleaved(
         if (i+blk_row<padded_size) {
             out[store_pos] = buffer[blk_pos];
         }
+        __syncthreads();
         load_pos  += LoadWidth;
         store_pos += LoadWidth*BlockWidth;
     }
@@ -100,6 +101,7 @@ void interleaved_to_flat(
         if (do_store && store_pos<end) {
             out[store_pos] = buffer[lid];
         }
+        __syncthreads();
         load_pos  += LoadWidth*BlockWidth;
         store_pos += LoadWidth;
     }
-- 
GitLab