diff --git a/src/backends/gpu/kernels/assemble_matrix.hpp b/src/backends/gpu/kernels/assemble_matrix.hpp index 8524846859e3e657a6de13cfa5d71beccac35946..69d19733dcd9e36eaf5c5fbc24f806fd7fed1f50 100644 --- a/src/backends/gpu/kernels/assemble_matrix.hpp +++ b/src/backends/gpu/kernels/assemble_matrix.hpp @@ -94,6 +94,8 @@ void assemble_matrix_interleaved( rhs[store_pos] = gi*buffer_v[blk_pos] - buffer_i[blk_pos]; } + __syncthreads(); + store_pos += LoadWidth*BlockWidth; load_pos += LoadWidth; } diff --git a/src/backends/gpu/kernels/interleave.hpp b/src/backends/gpu/kernels/interleave.hpp index 3762368af4c2575f3e7e25b11467837cf3264872..7488f38b333e5d7b232cf8a40913d33c3c7b873d 100644 --- a/src/backends/gpu/kernels/interleave.hpp +++ b/src/backends/gpu/kernels/interleave.hpp @@ -55,6 +55,7 @@ void flat_to_interleaved( if (i+blk_row<padded_size) { out[store_pos] = buffer[blk_pos]; } + __syncthreads(); load_pos += LoadWidth; store_pos += LoadWidth*BlockWidth; } @@ -100,6 +101,7 @@ void interleaved_to_flat( if (do_store && store_pos<end) { out[store_pos] = buffer[lid]; } + __syncthreads(); load_pos += LoadWidth*BlockWidth; store_pos += LoadWidth; }