pytorch/c10/cuda/CUDADeviceAssertionHost.h

159 lines
6.2 KiB
C++

#pragma once
#include <c10/cuda/CUDAMacros.h>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#ifdef USE_CUDA
#define TORCH_USE_CUDA_DSA
#endif
/// Number of assertion failure messages we can store. If this is too small
/// threads will fail silently.
constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;
namespace c10 {
namespace cuda {
/// Holds information about any device-side assertions that fail.
/// Held in managed memory and access by both the CPU and the GPU.
struct DeviceAssertionData {
/// Stringification of the assertion
char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN];
/// File the assertion was in
char filename[C10_CUDA_DSA_MAX_STR_LEN];
/// Name of the function the assertion was in
char function_name[C10_CUDA_DSA_MAX_STR_LEN];
/// Line number the assertion was at
int line_number;
/// Number uniquely identifying the kernel launch that triggered the assertion
uint32_t caller;
/// block_id of the thread that failed the assertion
int32_t block_id[3];
/// third_id of the thread that failed the assertion
int32_t thread_id[3];
};
/// Used to hold assertions generated by the device
/// Held in managed memory and access by both the CPU and the GPU.
struct DeviceAssertionsData {
/// Total number of assertions found; a subset of thse will be recorded
/// in `assertions`
int32_t assertion_count;
/// An array of assertions that will be written to in a race-free manner
DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT];
};
/// Use to hold info about kernel launches so that we can run kernels
/// asynchronously and still associate launches with device-side
/// assertion failures
struct CUDAKernelLaunchInfo {
/// Filename of the code where the kernel was launched from
const char* launch_filename;
/// Function from which the kernel was launched
const char* launch_function;
/// Line number of where the code was launched from
uint32_t launch_linenum;
/// Backtrace of where the kernel was launched from, only populated if
/// CUDAKernelLaunchRegistry::gather_launch_stacktrace is True
std::string launch_stacktrace;
/// Kernel that was launched
const char* kernel_name;
/// Device the kernel was launched on
int device;
/// Stream the kernel was launched on
int32_t stream;
/// A number that uniquely identifies the kernel launch
uint64_t generation_number;
};
/// Circular buffer used to hold information about kernel launches
/// this is later used to reconstruct how a device-side kernel assertion failure
/// occurred CUDAKernelLaunchRegistry is used as a singleton
class C10_CUDA_API CUDAKernelLaunchRegistry {
private:
/// Assume that this is the max number of kernel launches that might ever be
/// enqueued across all streams on a single device
static constexpr int max_kernel_launches = 1024;
/// How many kernel launch infos we've inserted. Used to ensure that circular
/// queue doesn't provide false information by always increasing, but also to
/// mark where we are inserting into the queue
#ifdef TORCH_USE_CUDA_DSA
uint64_t generation_number = 0;
#endif
/// Shared mutex between writer and accessor to ensure multi-threaded safety.
mutable std::mutex read_write_mutex;
/// Used to ensure prevent race conditions in GPU memory allocation
mutable std::mutex gpu_alloc_mutex;
/// Pointer to managed memory keeping track of device-side assertions. There
/// is one entry for each possible device the process might work with. Unused
/// entries are nullptrs. We could also use an unordered_set here, but this
/// vector design will be faster and the wasted memory is small since we
/// expect the number of GPUs per node will always be small
std::vector<
std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
uvm_assertions;
/// A single circular buffer holds information about every kernel launch the
/// process makes across all devices.
std::vector<CUDAKernelLaunchInfo> kernel_launches;
bool check_env_for_enable_launch_stacktracing() const;
bool check_env_for_dsa_enabled() const;
public:
CUDAKernelLaunchRegistry();
/// Register a new kernel launch and obtain a generation number back to be
/// passed to the kernel
uint32_t insert(
const char* launch_filename,
const char* launch_function,
const uint32_t launch_linenum,
const char* kernel_name,
const int32_t stream_id);
/// Get copies of the kernel launch registry and each device's assertion
/// failure buffer so they can be inspected without raising race conditions
std::
pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
snapshot() const;
/// Get a pointer to the current device's assertion failure buffer. If no such
/// buffer exists then one is created. This means that the first kernel launch
/// made on each device will be slightly slower because memory allocations are
/// required
DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
/// Gets the global singleton of the registry
static CUDAKernelLaunchRegistry& get_singleton_ref();
/// If not all devices support DSA, we disable it
const bool do_all_devices_support_managed_memory = false;
/// Whether or not to gather stack traces when launching kernels
bool gather_launch_stacktrace = false;
/// Whether or not host-side DSA is enabled or disabled at run-time
/// Note: Device-side code cannot be enabled/disabled at run-time
bool enabled_at_runtime = false;
/// Whether or not a device has indicated a failure
bool has_failed() const;
#ifdef TORCH_USE_CUDA_DSA
const bool enabled_at_compile_time = true;
#else
const bool enabled_at_compile_time = false;
#endif
};
std::string c10_retrieve_device_side_assertion_info();
} // namespace cuda
} // namespace c10
// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
// requires the same input arguments. We introduce the following macro to
// standardize these.
#define TORCH_DSA_KERNEL_ARGS \
[[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \
[[maybe_unused]] uint32_t assertion_caller_id
// This macro can be used to pass the DSA arguments onward to another
// function
#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id