From 5e3865cf2091dbbde10e519442c59dae9948c753 Mon Sep 17 00:00:00 2001
From: Ben Cumming <bcumming@cscs.ch>
Date: Thu, 29 Nov 2018 14:36:48 +0100
Subject: [PATCH] Fix thread-GPU affinity bug. (#656)

Ensure that all threads use the same GPU, which wasn't the case before.

* add `gpu_context::set_gpu()` method that ensures all subsequent GPU calls from the calling thread run on the GPU of `gpu_context`.
* `fvm_lowered_cell_impl` now calls the `set_gpu` method on construction and `advance`.
* Also changed GPU memory allocation errors in `arb::memory` to throw `arb_exception` instead of calling `std::terminate`. Errors due to poor GPU configuration can now be caught by the calling application, and unit tests fail gracefully, allowing other tests to run.

Fixes #655
---
 arbor/fvm_lowered_cell_impl.hpp | 12 ++++++++++++
 arbor/gpu_context.cpp           | 29 +++++++++++++++++++++++------
 arbor/gpu_context.hpp           |  3 +++
 arbor/memory/cuda_wrappers.cpp  | 26 +++++++++++---------------
 4 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/arbor/fvm_lowered_cell_impl.hpp b/arbor/fvm_lowered_cell_impl.hpp
index f2a45485..d033f464 100644
--- a/arbor/fvm_lowered_cell_impl.hpp
+++ b/arbor/fvm_lowered_cell_impl.hpp
@@ -114,6 +114,14 @@ private:
     static unsigned dt_steps(value_type t0, value_type t1, value_type dt) {
         return t0>=t1? 0: 1+(unsigned)((t1-t0)/dt);
     }
+
+    // Sets the GPU used for CUDA calls from the thread that calls it.
+    // The GPU will be the one in the execution context context_.
+    // If not called, the thread may attempt to launch on a different GPU,
+    // leading to crashes.
+    void set_gpu() {
+        if (context_.gpu->has_gpu()) context_.gpu->set_gpu();
+    }
 };
 
 template <typename Backend>
@@ -152,6 +160,8 @@ fvm_integration_result fvm_lowered_cell_impl<Backend>::integrate(
 {
     using util::as_const;
 
+    set_gpu();
+
     // Integration setup
     PE(advance_integrate_setup);
     threshold_watcher_.clear_crossings();
@@ -301,6 +311,8 @@ void fvm_lowered_cell_impl<B>::initialize(
     using util::value_by_key;
     using util::keys;
 
+    set_gpu();
+
     std::vector<mc_cell> cells;
     const std::size_t ncell = gids.size();
 
diff --git a/arbor/gpu_context.cpp b/arbor/gpu_context.cpp
index 34b078ea..a71b5a90 100644
--- a/arbor/gpu_context.cpp
+++ b/arbor/gpu_context.cpp
@@ -38,7 +38,12 @@ bool gpu_context::has_gpu() const {
 
 #ifndef ARB_HAVE_GPU
 
+void gpu_context::set_gpu() const {
+    throw arbor_exception("Arbor must be compiled with CUDA support to set a GPU.");
+}
+
 void gpu_context::synchronize_for_managed_access() const {}
+
 gpu_context::gpu_context(int) {
     throw arbor_exception("Arbor must be compiled with CUDA support to select a GPU.");
 }
@@ -52,13 +57,12 @@ gpu_context::gpu_context(int gpu_id) {
         throw arbor_exception("Invalid GPU id " + std::to_string(gpu_id));
     }
 
-    // Set the device
-    status = cudaSetDevice(gpu_id);
-    if (status!=cudaSuccess) {
-        throw arbor_exception("Unable to select GPU id " + std::to_string(gpu_id));
-    }
-
+    // Set the device.
+    // The device will also have to be set for every host thread that uses the
+    // GPU, however performing this call here is a good check that the GPU can
+    // be set and initialized.
     id_ = gpu_id;
+    set_gpu();
 
     // Record the device attributes
     attributes_ = 0;
@@ -76,6 +80,19 @@ void gpu_context::synchronize_for_managed_access() const {
     }
 }
 
+void gpu_context::set_gpu() const {
+    if (!has_gpu()) {
+        throw arbor_exception(
+            "Call to gpu_context::set_gpu() when there is no GPU selected.");
+    }
+    auto status = cudaSetDevice(id_);
+    if (status != cudaSuccess) {
+        throw arbor_exception(
+            "Unable to select GPU id " + std::to_string(id_)
+            + ": " + cudaGetErrorName(status));
+    }
+}
+
 #endif
 
 } // namespace arb
diff --git a/arbor/gpu_context.hpp b/arbor/gpu_context.hpp
index 2b1e4f49..44de4749 100644
--- a/arbor/gpu_context.hpp
+++ b/arbor/gpu_context.hpp
@@ -17,6 +17,9 @@ public:
     bool has_atomic_double() const;
     void synchronize_for_managed_access() const;
     bool has_gpu() const;
+    // Calls cudaSetDevice(id), so that GPU calls from the calling thread will
+    // be executed on the GPU.
+    void set_gpu() const;
 };
 
 using gpu_context_handle = std::shared_ptr<gpu_context>;
diff --git a/arbor/memory/cuda_wrappers.cpp b/arbor/memory/cuda_wrappers.cpp
index 7e694d7e..d31c82b4 100644
--- a/arbor/memory/cuda_wrappers.cpp
+++ b/arbor/memory/cuda_wrappers.cpp
@@ -1,6 +1,8 @@
 #include <cstdlib>
 #include <string>
 
+#include <arbor/arbexcept.hpp>
+
 #include "util.hpp"
 
 #ifdef ARB_HAVE_GPU
@@ -8,8 +10,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-#define LOG_CUDA_ERROR(error, msg)\
-LOG_ERROR("memory:: "+std::string(__func__)+" "+std::string((msg))+": "+cudaGetErrorString(error))
+#define HANDLE_CUDA_ERROR(error, msg)\
+throw arbor_exception("CUDA memory:: "+std::string(__func__)+" "+std::string((msg))+": "+cudaGetErrorString(error));
 
 namespace arb {
 namespace memory {
@@ -18,29 +20,25 @@ using std::to_string;
 
 void cuda_memcpy_d2d(void* dest, const void* src, std::size_t n) {
     if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyDeviceToDevice)) {
-        LOG_CUDA_ERROR(error, "n="+to_string(n));
-        abort();
+        HANDLE_CUDA_ERROR(error, "n="+to_string(n));
     }
 }
 
 void cuda_memcpy_d2h(void* dest, const void* src, std::size_t n) {
     if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyDeviceToHost)) {
-        LOG_CUDA_ERROR(error, "n="+to_string(n));
-        abort();
+        HANDLE_CUDA_ERROR(error, "n="+to_string(n));
     }
 }
 
 void cuda_memcpy_h2d(void* dest, const void* src, std::size_t n) {
     if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyHostToDevice)) {
-        LOG_CUDA_ERROR(error, "n="+to_string(n));
-        abort();
+        HANDLE_CUDA_ERROR(error, "n="+to_string(n));
     }
 }
 
 void* cuda_host_register(void* ptr, std::size_t size) {
     if (auto error = cudaHostRegister(ptr, size, cudaHostRegisterPortable)) {
-        LOG_CUDA_ERROR(error, "unable to register host memory");
-        return nullptr;
+        HANDLE_CUDA_ERROR(error, "unable to register host memory");
     }
     return ptr;
 }
@@ -53,8 +51,7 @@ void* cuda_malloc(std::size_t n) {
     void* ptr;
 
     if (auto error = cudaMalloc(&ptr, n)) {
-        LOG_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes");
-        ptr = nullptr;
+        HANDLE_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes");
     }
     return ptr;
 }
@@ -63,15 +60,14 @@ void* cuda_malloc_managed(std::size_t n) {
     void* ptr;
 
     if (auto error = cudaMallocManaged(&ptr, n)) {
-        LOG_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes");
-        ptr = nullptr;
+        HANDLE_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes of managed memory");
     }
     return ptr;
 }
 
 void cuda_free(void* ptr) {
     if (auto error = cudaFree(ptr)) {
-        LOG_CUDA_ERROR(error, "");
+        HANDLE_CUDA_ERROR(error, "");
     }
 }
 
-- 
GitLab