diff --git a/src/backends/gpu/kernels/assemble_matrix.hpp b/src/backends/gpu/kernels/assemble_matrix.hpp
index 8524846859e3e657a6de13cfa5d71beccac35946..69d19733dcd9e36eaf5c5fbc24f806fd7fed1f50 100644
--- a/src/backends/gpu/kernels/assemble_matrix.hpp
+++ b/src/backends/gpu/kernels/assemble_matrix.hpp
@@ -94,6 +94,8 @@ void assemble_matrix_interleaved(
             rhs[store_pos] = gi*buffer_v[blk_pos] - buffer_i[blk_pos];
         }
 
+        __syncthreads(); // Barrier: every thread in the block finishes its buffer_v/buffer_i reads above before any thread advances. NOTE(review): presumably the next iteration refills these buffers (loop head not visible in this hunk) - confirm.
+
         store_pos += LoadWidth*BlockWidth;
         load_pos  += LoadWidth;
     }
diff --git a/src/backends/gpu/kernels/interleave.hpp b/src/backends/gpu/kernels/interleave.hpp
index 3762368af4c2575f3e7e25b11467837cf3264872..7488f38b333e5d7b232cf8a40913d33c3c7b873d 100644
--- a/src/backends/gpu/kernels/interleave.hpp
+++ b/src/backends/gpu/kernels/interleave.hpp
@@ -55,6 +55,7 @@ void flat_to_interleaved(
         if (i+blk_row<padded_size) {
             out[store_pos] = buffer[blk_pos];
         }
+        __syncthreads(); // Barrier: every thread in the block finishes reading buffer[blk_pos] above before any thread advances. NOTE(review): presumably buffer is block-shared and rewritten at the top of the next iteration (not visible in this hunk) - confirm.
         load_pos  += LoadWidth;
         store_pos += LoadWidth*BlockWidth;
     }
@@ -100,6 +101,7 @@ void interleaved_to_flat(
         if (do_store && store_pos<end) {
             out[store_pos] = buffer[lid];
         }
+        __syncthreads(); // Barrier: every thread in the block finishes reading buffer[lid] above before any thread advances. NOTE(review): presumably buffer is block-shared and rewritten at the top of the next iteration (not visible in this hunk) - confirm.
         load_pos  += LoadWidth*BlockWidth;
         store_pos += LoadWidth;
     }