diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb6800b491e2425afda9e1d52ca8a24c96212f68..ba243a643eeff1db995de7283fc579e6c10add86 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -119,8 +119,7 @@ if(NMC_WITH_CUDA)
     # and device compiler when targeting the GPU.
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_CUDA)
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_GPU)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_35)
-    #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
 
     add_definitions(-DNMC_HAVE_GPU)
     include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
diff --git a/modcc/cudaprinter.cpp b/modcc/cudaprinter.cpp
index 23246ea69295163bc883494f9e11d49787471a09..d151c2d03687bd8747a88036c513ea668ac162a5 100644
--- a/modcc/cudaprinter.cpp
+++ b/modcc/cudaprinter.cpp
@@ -42,6 +42,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     text_.add_line();
     text_.add_line("#include <mechanism.hpp>");
     text_.add_line("#include <algorithms.hpp>");
+    text_.add_line("#include <backends/gpu_intrinsics.hpp>");
     text_.add_line("#include <util/pprintf.hpp>");
     text_.add_line();
 
@@ -104,21 +105,6 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     {
         increase_indentation();
 
-        text_.add_line("__device__");
-        text_.add_line("inline double atomicAdd(double* address, double val) {");
-        text_.increase_indentation();
-        text_.add_line("using I = unsigned long long int;");
-        text_.add_line("I* address_as_ull = (I*)address;");
-        text_.add_line("I old = *address_as_ull, assumed;");
-        text_.add_line("do {");
-        text_.add_line("assumed = old;");
-        text_.add_line("old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));");
-        text_.add_line("} while (assumed != old);");
-        text_.add_line("return __longlong_as_double(old);");
-        text_.decrease_indentation();
-        text_.add_line("}");
-        text_.add_line();
-
         // forward declarations of procedures
         for(auto const &var : m.symbols()) {
             if( var.second->kind()==symbolKind::procedure &&
@@ -799,7 +785,7 @@ void CUDAPrinter::print_APIMethod_body(APIMethod* e) {
             in->accept(this);
         }
         else {
-            text_ << (out->op()==tok::plus ? "atomicAdd" : "atomicSub") << "(&";
+            text_ << (out->op()==tok::plus ? "cuda_atomic_add" : "cuda_atomic_sub") << "(&";
             out->accept(this);
             text_ << ", ";
             in->accept(this);
diff --git a/src/backends/gpu_intrinsics.hpp b/src/backends/gpu_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ee9e53d85cd0d034e3c86f3b652d8eb5420681f
--- /dev/null
+++ b/src/backends/gpu_intrinsics.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+// Wrappers around CUDA atomic addition intrinsics.
+// CUDA 8 introduced support for atomicAdd with double precision, but only for
+// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable
+// atomic addition interface that chooses the appropriate implementation.
+
+#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition)
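+// (In host compilation passes __CUDA_ARCH__ is undefined and evaluates to 0 in
+// the #if, so this branch is also selected there; that is harmless because the
+// functions are __device__-only.)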
+    __device__
+    inline double cuda_atomic_add(double* address, double val) {
+        using I = unsigned long long int;
+        I* address_as_ull = (I*)address;
+        I old = *address_as_ull, assumed;
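+        // CAS loop: reinterpret the double as a 64-bit integer and retry the
+        // compare-and-swap until no other thread modifies *address in between.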
+        do {
+            assumed = old;
+            old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+        } while (assumed != old);
+        return __longlong_as_double(old);
+    }
+#else // use the built-in atomicAdd for double precision from Pascal onwards
+    __device__
+    inline double cuda_atomic_add(double* address, double val) {
+        return atomicAdd(address, val);
+    }
+#endif
+
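+// CUDA provides no floating-point atomicSub intrinsic, so subtraction is
+// implemented as atomic addition of the negated value.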
+__device__
+inline double cuda_atomic_sub(double* address, double val) {
+    return cuda_atomic_add(address, -val);
+}
+
+__device__
+inline float cuda_atomic_add(float* address, float val) {
+    return atomicAdd(address, val);
+}
+
+__device__
+inline float cuda_atomic_sub(float* address, float val) {
+    return atomicAdd(address, -val);
+}
+
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 4f774670128bca75a559fdcbdeb4baedf03117ab..4e51c3720ec1089a88dc8c88bad3d68ec9bdec86 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -17,6 +17,7 @@ build_modules(
 # Unit test sources
 
 set(TEST_CUDA_SOURCES
+    test_atomics.cu
     test_cell_group.cu
     test_gpu_stack.cu
     test_matrix.cu
diff --git a/tests/unit/test_atomics.cu b/tests/unit/test_atomics.cu
new file mode 100644
index 0000000000000000000000000000000000000000..694300e6239ed6c95ff0a90e72740e4e7014c60f
--- /dev/null
+++ b/tests/unit/test_atomics.cu
@@ -0,0 +1,53 @@
+#include "../gtest.h"
+
+#include <backends/gpu_intrinsics.hpp>
+#include <memory/managed_ptr.hpp>
+
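+// Test kernels: every thread in the launch atomically accumulates
+// (threadIdx.x+1) into the single value at *x.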
+namespace kernels {
+    template <typename T>
+    __global__
+    void test_atomic_add(T* x) {
+        cuda_atomic_add(x, threadIdx.x+1);
+    }
+
+    template <typename T>
+    __global__
+    void test_atomic_sub(T* x) {
+        cuda_atomic_sub(x, threadIdx.x+1);
+    }
+}
+
+// test atomic addition wrapper for single and double precision
+TEST(gpu_intrinsics, cuda_atomic_add) {
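+    // one block of 128 threads: the expected total is 1+2+...+128 = 128*129/2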
+    int expected = (128*129)/2;
+
+    auto f = nest::mc::memory::make_managed_ptr<float>(0.f);
+    kernels::test_atomic_add<<<1, 128>>>(f.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(float(expected), *f);
+
+    auto d = nest::mc::memory::make_managed_ptr<double>(0.);
+    kernels::test_atomic_add<<<1, 128>>>(d.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(double(expected), *d);
+}
+
+// test atomic subtraction wrapper for single and double precision
+TEST(gpu_intrinsics, cuda_atomic_sub) {
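+    // one block of 128 threads: the expected total is -(1+2+...+128) = -(128*129)/2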
+    int expected = -(128*129)/2;
+
+    auto f = nest::mc::memory::make_managed_ptr<float>(0.f);
+    kernels::test_atomic_sub<<<1, 128>>>(f.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(float(expected), *f);
+
+    auto d = nest::mc::memory::make_managed_ptr<double>(0.);
+    kernels::test_atomic_sub<<<1, 128>>>(d.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(double(expected), *d);
+}
+