pytorch/c10/cuda/CUDAFunctions.cpp

#include <c10/cuda/CUDAFunctions.h>
#include <c10/macros/Macros.h>
#include <limits>
namespace c10::cuda {
namespace {
// returns -1 on failure
int32_t driver_version() {
int driver_version = -1;
C10_CUDA_IGNORE_ERROR(cudaDriverGetVersion(&driver_version));
return driver_version;
}
int device_count_impl(bool fail_if_no_driver) {
int count = 0;
auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDeviceCount(&count));
if (err == cudaSuccess) {
return count;
}
// Clear out the error state, so we don't spuriously trigger someone else.
// (This shouldn't really matter, since we won't be running very much CUDA
// code in this regime.)
cudaError_t last_err C10_UNUSED = cudaGetLastError();
switch (err) {
case cudaErrorNoDevice:
// Zero devices is ok here
count = 0;
break;
case cudaErrorInsufficientDriver: {
auto version = driver_version();
if (version <= 0) {
if (!fail_if_no_driver) {
// No CUDA driver means no devices
count = 0;
break;
}
TORCH_CHECK(
false,
"Found no NVIDIA driver on your system. Please check that you "
"have an NVIDIA GPU and installed a driver from "
"http://www.nvidia.com/Download/index.aspx");
} else {
TORCH_CHECK(
false,
"The NVIDIA driver on your system is too old (found version ",
version,
"). Please update your GPU driver by downloading and installing "
"a new version from the URL: "
"http://www.nvidia.com/Download/index.aspx Alternatively, go to: "
"https://pytorch.org to install a PyTorch version that has been "
"compiled with your version of the CUDA driver.");
}
} break;
case cudaErrorInitializationError:
TORCH_CHECK(
false,
"CUDA driver initialization failed, you might not "
"have a CUDA gpu.");
break;
case cudaErrorUnknown:
TORCH_CHECK(
false,
"CUDA unknown error - this may be due to an "
"incorrectly set up environment, e.g. changing env "
"variable CUDA_VISIBLE_DEVICES after program start. "
"Setting the available devices to be zero.");
break;
#if C10_ASAN_ENABLED
case cudaErrorMemoryAllocation:
// In ASAN mode, we know that a cudaErrorMemoryAllocation error will
// pop up if compiled with NVCC (clang-cuda is fine)
TORCH_CHECK(
false,
"Got 'out of memory' error while trying to initialize CUDA. "
"CUDA with nvcc does not work well with ASAN and it's probably "
"the reason. We will simply shut down CUDA support. If you "
"would like to use GPUs, turn off ASAN.");
break;
#endif // C10_ASAN_ENABLED
default:
TORCH_CHECK(
false,
"Unexpected error from cudaGetDeviceCount(). Did you run "
"some cuda functions before calling NumCudaDevices() "
"that might have already set an error? Error ",
err,
": ",
cudaGetErrorString(err));
}
return count;
}
} // namespace
DeviceIndex device_count() noexcept {
// initialize number of devices only once
static int count = []() {
try {
auto result = device_count_impl(/*fail_if_no_driver=*/false);
TORCH_INTERNAL_ASSERT(
result <= std::numeric_limits<DeviceIndex>::max(),
"Too many CUDA devices, DeviceIndex overflowed");
return result;
} catch (const c10::Error& ex) {
// We don't want to fail, but still log the warning
// msg() returns the message without the stack trace
TORCH_WARN("CUDA initialization: ", ex.msg());
return 0;
}
}();
return static_cast<DeviceIndex>(count);
}
DeviceIndex device_count_ensure_non_zero() {
// Call the implementation every time to throw the exception
int count = device_count_impl(/*fail_if_no_driver=*/true);
// Having zero GPUs doesn't produce a warning in `device_count`, but we fail here
TORCH_CHECK(count, "No CUDA GPUs are available");
return static_cast<DeviceIndex>(count);
}
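// A brief usage sketch of the two entry points above (illustrative only; these
// call sites are not part of this file):
//
// ```
// // Probing: never throws, caches the result, and reports 0 devices when CUDA
// // is unusable.
// if (c10::cuda::device_count() == 0) {
//   // fall back to CPU
// }
//
// // Asserting: re-runs the check every time and throws a c10::Error when no
// // usable GPU is found.
// c10::DeviceIndex n = c10::cuda::device_count_ensure_non_zero();
// ```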
DeviceIndex current_device() {
int cur_device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&cur_device));
return static_cast<DeviceIndex>(cur_device);
}
void set_device(DeviceIndex device) {
C10_CUDA_CHECK(c10::cuda::SetDevice(static_cast<int>(device)));
}
void device_synchronize() {
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_device_synchronization();
}
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
// This function has to be called by callers that perform CUDA synchronizing
// operations, so that the proper error or warning is raised.
void warn_or_error_on_sync() {
if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_ERROR) {
TORCH_CHECK(false, "called a synchronizing CUDA operation");
} else if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_WARN) {
TORCH_WARN("called a synchronizing CUDA operation");
}
}
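// A hedged sketch of the expected call pattern (the wrapper below is
// hypothetical; the real call sites live in the stream/event synchronization
// paths of the library):
//
// ```
// void synchronize_stream_example(cudaStream_t stream) {
//   c10::cuda::warn_or_error_on_sync(); // warn or throw per the sync debug mode
//   C10_CUDA_CHECK(cudaStreamSynchronize(stream));
// }
// ```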
c10::optional<DeviceIndex> getDeviceIndexWithPrimaryContext() {
// check current device first
auto current_device_index = current_device();
if (current_device_index >= 0) {
if (hasPrimaryContext(current_device_index)) {
return current_device_index;
}
}
for (const auto device_index : c10::irange(at::cuda::device_count())) {
if (device_index == current_device_index)
continue;
if (hasPrimaryContext(device_index)) {
return device_index;
}
}
return c10::nullopt;
}
namespace _internal {
bool dummyHasPrimaryContext(C10_UNUSED DeviceIndex device_index) {
TORCH_CHECK(false, "Should never been called");
}
bool (*hasPrimaryContext)(DeviceIndex) = dummyHasPrimaryContext;
// Private API to be called from CUDAHooks.cpp
C10_CUDA_API void setHasPrimaryContext(bool (*func)(DeviceIndex)) {
hasPrimaryContext = func ? func : dummyHasPrimaryContext;
}
} // namespace _internal
bool hasPrimaryContext(DeviceIndex device_index) {
return _internal::hasPrimaryContext(device_index);
}
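// A hedged sketch of how the hook above is expected to be installed (the exact
// registration lives in CUDAHooks.cpp; the lambda body and helper name here are
// assumptions for illustration only):
//
// ```
// // somewhere in CUDAHooks.cpp, during CUDA hook initialization
// c10::cuda::_internal::setHasPrimaryContext([](c10::DeviceIndex device_index) {
//   // hypothetical helper that queries whether a primary context exists
//   return queryPrimaryContext(device_index);
// });
// ```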
// Wrappers for raw CUDA device management functions
cudaError_t GetDeviceCount(int* dev_count) {
return cudaGetDeviceCount(dev_count);
}
// This is a codepath for CUDA 12 that comes with a critical change in the
// behavior of `cudaSetDevice`. Unlike previous CUDA versions, which allocate
// the context lazily, CUDA 12.x eagerly allocates the primary context the
// moment `cudaSetDevice` is called. This can have dramatic consequences and
// pollute device memory in distributed runs. To avoid unnecessary context
// creation, a new function called `MaybeSetDevice` was introduced. This
// function is to be called in the device guard destructor and at the exit of
// the torch.cuda.device context manager. The behavior of `MaybeSetDevice` is
// quite simple: it calls `cudaSetDevice` if a context already exists on the
// targeted device; otherwise it simply saves the device index. This way we can
// keep PyTorch backward compatible for applications like this:
//
// ```
// import torch
// x = torch.empty(1, device="cuda:1") # no CUDA context on cuda:0 after this call
// y = torch.empty(1, device="cuda")   # CUDA context is created on cuda:0
// ```
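// A hedged sketch of the intended interplay on CUDA 12 (the guard type below is
// hypothetical; the real device guards live elsewhere in the codebase):
//
// ```
// {
//   ExampleDeviceGuard g(1);      // ctor: c10::cuda::SetDevice(1)
// }                               // dtor: c10::cuda::MaybeSetDevice(prev_device)
//                                 //   -> if `prev_device` has no context yet, the
//                                 //      index is only recorded; no context is made
// // Code that later needs the recorded device applies it explicitly:
// c10::cuda::SetTargetDevice();   // no-op unless an index was recorded
// ```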
#if CUDA_VERSION >= 12000
thread_local int targetDeviceIndex = -1;
cudaError_t GetDevice(int* device) {
if (targetDeviceIndex >= 0) {
*device = targetDeviceIndex;
return cudaSuccess;
}
return cudaGetDevice(device);
}
cudaError_t SetDevice(int device) {
TORCH_CHECK(device >= 0, "device id must be non-negative!");
targetDeviceIndex = -1;
int cur_device = -1;
C10_CUDA_CHECK(cudaGetDevice(&cur_device));
if (device == cur_device) {
return cudaSuccess;
}
return cudaSetDevice(device);
}
cudaError_t MaybeSetDevice(int device) {
if (hasPrimaryContext(device)) {
return c10::cuda::SetDevice(device);
}
targetDeviceIndex = device;
return cudaSuccess;
}
// This function always initializes the CUDA context
// on to_device
int ExchangeDevice(int to_device) {
int cur_device = targetDeviceIndex;
targetDeviceIndex = -1;
if (cur_device < 0) {
C10_CUDA_CHECK(cudaGetDevice(&cur_device));
if (to_device == cur_device) {
return cur_device;
}
}
C10_CUDA_CHECK(cudaSetDevice(to_device));
return cur_device;
}
// This function does not initialize the CUDA context
// on to_device if it does not already exist
int MaybeExchangeDevice(int to_device) {
int cur_device = -1;
C10_CUDA_CHECK(cudaGetDevice(&cur_device));
if (to_device == cur_device) {
return cur_device;
}
if (hasPrimaryContext(to_device)) {
C10_CUDA_CHECK(cudaSetDevice(to_device));
} else {
targetDeviceIndex = to_device;
}
return cur_device;
}
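// An illustrative contrast between the two exchange helpers above (a sketch,
// not a prescribed call site):
//
// ```
// int prev1 = c10::cuda::ExchangeDevice(1);       // switches to device 1 and
//                                                 // initializes its CUDA context
// int prev2 = c10::cuda::MaybeExchangeDevice(2);  // switches only if device 2 already
//                                                 // has a primary context; otherwise
//                                                 // the index is recorded for later
// ```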
void SetTargetDevice() {
if (targetDeviceIndex >= 0) {
C10_CUDA_CHECK(c10::cuda::SetDevice(targetDeviceIndex));
}
}
#else
cudaError_t GetDevice(int* device) {
return cudaGetDevice(device);
}
cudaError_t SetDevice(int device) {
TORCH_CHECK(device >= 0, "device id must be non-negative!");
int cur_device = -1;
C10_CUDA_CHECK(cudaGetDevice(&cur_device));
if (device == cur_device) {
return cudaSuccess;
}
return cudaSetDevice(device);
}
cudaError_t MaybeSetDevice(int device) {
return c10::cuda::SetDevice(device);
}
int ExchangeDevice(int to_device) {
int cur_device = -1;
C10_CUDA_CHECK(c10::cuda::GetDevice(&cur_device));
if (to_device == cur_device) {
return cur_device;
}
C10_CUDA_CHECK(cudaSetDevice(to_device));
return cur_device;
}
int MaybeExchangeDevice(int to_device) {
return c10::cuda::ExchangeDevice(to_device);
}
void SetTargetDevice() {
// no-op on CUDA version < 12.x
}
#endif
} // namespace c10::cuda