// pytorch/caffe2/operators/fully_connected_op_gpu.cc

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/fully_connected_op.h"
namespace caffe2 {
namespace {
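// Dispatches the forward FC op to the DoRunWithType instantiation that matches
// the input dtype: fp32 inputs always use fp32 math; fp16 inputs use fp16 math
// only when float16_compute is set and the device's compute capability is at
// least kFp16CUDADevicePropMajor, otherwise fp32 math is used.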
template <class FullyConnectedOp>
bool RunFullyConnectedOpOnCUDADevice(
    const bool float16_compute,
    FullyConnectedOp* op) {
  if (op->Input(0).template IsType<float>()) {
    return op->template DoRunWithType<
        float, // X
        float, // W
        float, // B
        float, // Y
        float>(); // Math
  } else if (op->Input(0).template IsType<at::Half>()) {
    if (float16_compute) {
      const cudaDeviceProp& prop = GetDeviceProperty(0);
      if (prop.major >= kFp16CUDADevicePropMajor) {
        return op->template DoRunWithType<
            at::Half, // X
            at::Half, // W
            at::Half, // B
            at::Half, // Y
            at::Half>(); // Math
      } else {
        LOG(INFO) << "CUDA Device does not support FP16 computation, "
                     "falling back to FP32.";
        return op->template DoRunWithType<
            at::Half, // X
            at::Half, // W
            at::Half, // B
            at::Half, // Y
            float>(); // Math
      }
    } else {
      return op->template DoRunWithType<
          at::Half, // X
          at::Half, // W
          at::Half, // B
          at::Half, // Y
          float>(); // Math
    }
  } else {
    CAFFE_THROW("Unsupported type");
  }
  return false;
}
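
// Same dispatch logic as above, specialized for the gradient op's
// inputs and outputs (X, W, dY, B, dX, dW, dB).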
template <class FullyConnectedGradientOp>
bool RunFullyConnectedGradientOpOnCUDADevice(
    const bool float16_compute,
    FullyConnectedGradientOp* op) {
  if (op->Input(0).template IsType<float>()) {
    return op->template DoRunWithType<
        float, // X
        float, // W
        float, // dY
        float, // B
        float, // dX
        float, // dW
        float, // dB
        float>(); // Math
  } else if (op->Input(0).template IsType<at::Half>()) {
    if (float16_compute) {
      const cudaDeviceProp& prop = GetDeviceProperty(0);
      if (prop.major >= kFp16CUDADevicePropMajor) {
        return op->template DoRunWithType<
            at::Half, // X
            at::Half, // W
            at::Half, // dY
            at::Half, // B
            at::Half, // dX
            at::Half, // dW
            at::Half, // dB
            at::Half>(); // Math
      } else {
        LOG(INFO) << "CUDA Device does not support FP16 computation, "
                     "falling back to FP32.";
        return op->template DoRunWithType<
            at::Half, // X
            at::Half, // W
            at::Half, // dY
            at::Half, // B
            at::Half, // dX
            at::Half, // dW
            at::Half, // dB
            float>(); // Math
      }
    } else {
      return op->template DoRunWithType<
          at::Half, // X
          at::Half, // W
          at::Half, // dY
          at::Half, // B
          at::Half, // dX
          at::Half, // dW
          at::Half, // dB
          float>(); // Math
    }
  } else {
    CAFFE_THROW("Unsupported type");
  }
  return false;
}
} // namespace

// The Run*OnCUDADevice helpers above take a pointer to the current op;
// DoRunWithType then instantiates the kernel with the correct input/output
// and math types.
template <>
bool FullyConnectedOp<CUDAContext>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(float16_compute_, this);
}

template <>
bool FullyConnectedOp<
    CUDAContext,
    DefaultEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(float16_compute_, this);
}

template <>
bool FullyConnectedGradientOp<CUDAContext>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this);
}

template <>
bool FullyConnectedGradientOp<
    CUDAContext,
    DefaultEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this);
}
#if !defined(USE_ROCM)
// These specializations must be defined; otherwise the TensorCore FC ops
// would fall back to the default FC implementation, which lacks fp16
// support.
template <>
bool FullyConnectedOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(false /* float16_compute */, this);
}

template <>
bool FullyConnectedOp<
    CUDAContext,
    TensorCoreEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(false /* float16_compute */, this);
}

template <>
bool FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(
      false /* float16_compute */, this);
}

template <>
bool FullyConnectedGradientOp<
    CUDAContext,
    TensorCoreEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(
      false /* float16_compute */, this);
}
#endif
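
// Register the default CUDA implementations of the FC forward and gradient
// operators, plus the transposed-weight (FCTransposed) variants.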
REGISTER_CUDA_OPERATOR(FC, FullyConnectedOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(FCGradient, FullyConnectedGradientOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(
    FCTransposed,
    FullyConnectedOp<
        CUDAContext,
        DefaultEngine,
        false /* don't transpose weight */>);
REGISTER_CUDA_OPERATOR(
    FCTransposedGradient,
    FullyConnectedGradientOp<
        CUDAContext,
        DefaultEngine,
        false /* don't transpose weight */>);
#if !defined(USE_ROCM)
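// TensorCore variants, registered under the TENSORCORE engine; a net can
// select them via the op's engine field. Not built on ROCm.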
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FC,
    TENSORCORE,
    FullyConnectedOp<CUDAContext, TensorCoreEngine>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FCGradient,
    TENSORCORE,
    FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FCTransposed,
    TENSORCORE,
    FullyConnectedOp<
        CUDAContext,
        TensorCoreEngine,
        false /* don't transpose weight */>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FCTransposedGradient,
    TENSORCORE,
    FullyConnectedGradientOp<
        CUDAContext,
        TensorCoreEngine,
        false /* don't transpose weight */>);
#endif
} // namespace caffe2