From 5e3865cf2091dbbde10e519442c59dae9948c753 Mon Sep 17 00:00:00 2001
From: Ben Cumming <bcumming@cscs.ch>
Date: Thu, 29 Nov 2018 14:36:48 +0100
Subject: [PATCH] Fix thread-GPU affinity bug. (#656)
Ensure that all threads use the same GPU, which wasn't the case before.
* add `gpu_context::set_gpu()` method that ensures all subsequent GPU calls from the calling thread run on the GPU of `gpu_context`.
* `fvm_lowered_cell_impl` now calls the `set_gpu` method on construction and `advance`.
* Also changed GPU memory allocation errors in `arb::memory` to throw `arbor_exception` instead of calling `abort()` on error. Now errors due to poor GPU configuration can be caught by the calling application, and unit tests fail gracefully and allow other tests to run.
Fixes #655
---
arbor/fvm_lowered_cell_impl.hpp | 12 ++++++++++++
arbor/gpu_context.cpp | 29 +++++++++++++++++++++++------
arbor/gpu_context.hpp | 3 +++
arbor/memory/cuda_wrappers.cpp | 26 +++++++++++---------------
4 files changed, 49 insertions(+), 21 deletions(-)
diff --git a/arbor/fvm_lowered_cell_impl.hpp b/arbor/fvm_lowered_cell_impl.hpp
index f2a45485..d033f464 100644
--- a/arbor/fvm_lowered_cell_impl.hpp
+++ b/arbor/fvm_lowered_cell_impl.hpp
@@ -114,6 +114,14 @@ private:
static unsigned dt_steps(value_type t0, value_type t1, value_type dt) {
return t0>=t1? 0: 1+(unsigned)((t1-t0)/dt);
}
+
+ // Sets the GPU used for CUDA calls from the thread that calls it.
+ // The GPU will be the one in the execution context context_.
+ // If not called, the thread may attempt to launch on a different GPU,
+ // leading to crashes.
+ void set_gpu() {
+ if (context_.gpu->has_gpu()) context_.gpu->set_gpu();
+ }
};
template <typename Backend>
@@ -152,6 +160,8 @@ fvm_integration_result fvm_lowered_cell_impl<Backend>::integrate(
{
using util::as_const;
+ set_gpu();
+
// Integration setup
PE(advance_integrate_setup);
threshold_watcher_.clear_crossings();
@@ -301,6 +311,8 @@ void fvm_lowered_cell_impl<B>::initialize(
using util::value_by_key;
using util::keys;
+ set_gpu();
+
std::vector<mc_cell> cells;
const std::size_t ncell = gids.size();
diff --git a/arbor/gpu_context.cpp b/arbor/gpu_context.cpp
index 34b078ea..a71b5a90 100644
--- a/arbor/gpu_context.cpp
+++ b/arbor/gpu_context.cpp
@@ -38,7 +38,12 @@ bool gpu_context::has_gpu() const {
#ifndef ARB_HAVE_GPU
+void gpu_context::set_gpu() const {
+ throw arbor_exception("Arbor must be compiled with CUDA support to set a GPU.");
+}
+
void gpu_context::synchronize_for_managed_access() const {}
+
gpu_context::gpu_context(int) {
throw arbor_exception("Arbor must be compiled with CUDA support to select a GPU.");
}
@@ -52,13 +57,12 @@ gpu_context::gpu_context(int gpu_id) {
throw arbor_exception("Invalid GPU id " + std::to_string(gpu_id));
}
- // Set the device
- status = cudaSetDevice(gpu_id);
- if (status!=cudaSuccess) {
- throw arbor_exception("Unable to select GPU id " + std::to_string(gpu_id));
- }
-
+ // Set the device.
+ // The device will also have to be set for every host thread that uses the
+ // GPU, however performing this call here is a good check that the GPU can
+ // be set and initialized.
id_ = gpu_id;
+ set_gpu();
// Record the device attributes
attributes_ = 0;
@@ -76,6 +80,19 @@ void gpu_context::synchronize_for_managed_access() const {
}
}
+void gpu_context::set_gpu() const {
+ if (!has_gpu()) {
+ throw arbor_exception(
+ "Call to gpu_context::set_gpu() when there is no GPU selected.");
+ }
+ auto status = cudaSetDevice(id_);
+ if (status != cudaSuccess) {
+ throw arbor_exception(
+ "Unable to select GPU id " + std::to_string(id_)
+ + ": " + cudaGetErrorName(status));
+ }
+}
+
#endif
} // namespace arb
diff --git a/arbor/gpu_context.hpp b/arbor/gpu_context.hpp
index 2b1e4f49..44de4749 100644
--- a/arbor/gpu_context.hpp
+++ b/arbor/gpu_context.hpp
@@ -17,6 +17,9 @@ public:
bool has_atomic_double() const;
void synchronize_for_managed_access() const;
bool has_gpu() const;
+ // Calls cudaSetDevice(id), so that GPU calls from the calling thread will
+ // be executed on the GPU.
+ void set_gpu() const;
};
using gpu_context_handle = std::shared_ptr<gpu_context>;
diff --git a/arbor/memory/cuda_wrappers.cpp b/arbor/memory/cuda_wrappers.cpp
index 7e694d7e..d31c82b4 100644
--- a/arbor/memory/cuda_wrappers.cpp
+++ b/arbor/memory/cuda_wrappers.cpp
@@ -1,6 +1,8 @@
#include <cstdlib>
#include <string>
+#include <arbor/arbexcept.hpp>
+
#include "util.hpp"
#ifdef ARB_HAVE_GPU
@@ -8,8 +10,8 @@
#include <cuda.h>
#include <cuda_runtime.h>
-#define LOG_CUDA_ERROR(error, msg)\
-LOG_ERROR("memory:: "+std::string(__func__)+" "+std::string((msg))+": "+cudaGetErrorString(error))
+#define HANDLE_CUDA_ERROR(error, msg)\
+throw arbor_exception("CUDA memory:: "+std::string(__func__)+" "+std::string((msg))+": "+cudaGetErrorString(error));
namespace arb {
namespace memory {
@@ -18,29 +20,25 @@ using std::to_string;
void cuda_memcpy_d2d(void* dest, const void* src, std::size_t n) {
if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyDeviceToDevice)) {
- LOG_CUDA_ERROR(error, "n="+to_string(n));
- abort();
+ HANDLE_CUDA_ERROR(error, "n="+to_string(n));
}
}
void cuda_memcpy_d2h(void* dest, const void* src, std::size_t n) {
if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyDeviceToHost)) {
- LOG_CUDA_ERROR(error, "n="+to_string(n));
- abort();
+ HANDLE_CUDA_ERROR(error, "n="+to_string(n));
}
}
void cuda_memcpy_h2d(void* dest, const void* src, std::size_t n) {
if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyHostToDevice)) {
- LOG_CUDA_ERROR(error, "n="+to_string(n));
- abort();
+ HANDLE_CUDA_ERROR(error, "n="+to_string(n));
}
}
void* cuda_host_register(void* ptr, std::size_t size) {
if (auto error = cudaHostRegister(ptr, size, cudaHostRegisterPortable)) {
- LOG_CUDA_ERROR(error, "unable to register host memory");
- return nullptr;
+ HANDLE_CUDA_ERROR(error, "unable to register host memory");
}
return ptr;
}
@@ -53,8 +51,7 @@ void* cuda_malloc(std::size_t n) {
void* ptr;
if (auto error = cudaMalloc(&ptr, n)) {
- LOG_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes");
- ptr = nullptr;
+ HANDLE_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes");
}
return ptr;
}
@@ -63,15 +60,14 @@ void* cuda_malloc_managed(std::size_t n) {
void* ptr;
if (auto error = cudaMallocManaged(&ptr, n)) {
- LOG_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes");
- ptr = nullptr;
+ HANDLE_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes of managed memory");
}
return ptr;
}
void cuda_free(void* ptr) {
if (auto error = cudaFree(ptr)) {
- LOG_CUDA_ERROR(error, "");
+ HANDLE_CUDA_ERROR(error, "");
}
}
--
GitLab