// pytorch/caffe2/operators/dropout_op_cudnn.cc

#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
namespace caffe2 {
// cudnnRestoreDropoutDescriptor is needed for correctness and
// doesn't exist prior to cuDNN v7
#if CUDNN_VERSION_MIN(7,0,0)
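
// Forward dropout via cudnnDropoutForward. Output 0 is Y; output 1 ("mask")
// is the cuDNN reserve space as a raw byte tensor. The RNG states are kept
// in a workspace-private scratch blob so the gradient op can share them.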
class CuDNNDropoutOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);

  explicit CuDNNDropoutOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
        is_test_(
            OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
        states_initialized_(false),
        random_seed_(operator_def.device_option().random_seed()) {
    CAFFE_ENFORCE_GE(ratio_, 0);
    CAFFE_ENFORCE_LT(ratio_, 1);
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
    CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&dropout_desc_));
    CUDNN_ENFORCE(cudnnDropoutGetStatesSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        reinterpret_cast<size_t*>(&states_size_in_bytes_)));
    if (!is_test_) {
      scratch_blob_ =
          ws->CreateBlob(scratch_blob_name(operator_def.output(1)));
      CAFFE_ENFORCE(scratch_blob_);
    }
  }

  ~CuDNNDropoutOp() noexcept override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyDropoutDescriptor(dropout_desc_));
  }

  template <typename T, typename M>
  bool DoRunWithType();

  bool RunOnDevice() override;

  static string scratch_blob_name(string mask_blob_name) {
    return "cudnn_dropout_scratch_" + mask_blob_name;
  }

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnDropoutDescriptor_t dropout_desc_;

  vector<int64_t> cudnn_input_dims_;

  float ratio_;
  bool is_test_;

  Blob* scratch_blob_ = nullptr;

  size_t states_size_in_bytes_, reserve_space_size_in_bytes_;
  // Input: X, Output: Y, mask_and_states

  // track whether states have been initialized - only needs to happen once
  bool states_initialized_;

  // random seed
  unsigned long long random_seed_;
};
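
// Gradient of dropout via cudnnDropoutBackward. Reads the forward op's
// states blob back out of the workspace instead of creating its own, so the
// same dropout mask is applied to dY.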
class CuDNNDropoutGradientOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);

  explicit CuDNNDropoutGradientOp(
      const OperatorDef& operator_def,
      Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
        is_test_(
            OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
        states_initialized_(false),
        random_seed_(operator_def.device_option().random_seed()) {
    CAFFE_ENFORCE_GE(ratio_, 0);
    CAFFE_ENFORCE_LT(ratio_, 1);
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
    CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&dropout_desc_));
    CUDNN_ENFORCE(cudnnDropoutGetStatesSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        reinterpret_cast<size_t*>(&states_size_in_bytes_)));
    // Share scratch with the forward op
    scratch_blob_ =
        ws->GetBlob(CuDNNDropoutOp::scratch_blob_name(operator_def.input(1)));
    CAFFE_ENFORCE(scratch_blob_);
  }

  ~CuDNNDropoutGradientOp() noexcept override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyDropoutDescriptor(dropout_desc_));
  }

  template <typename T, typename M>
  bool DoRunWithType();

  bool RunOnDevice() override;

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnDropoutDescriptor_t dropout_desc_;

  vector<int64_t> cudnn_input_dims_;

  Blob* scratch_blob_;

  float ratio_;
  bool is_test_;

  size_t states_size_in_bytes_, reserve_space_size_in_bytes_;
  // Input: dY, mask_and_states, Output: dX

  // only need to initialize states once (size is static)
  bool states_initialized_;

  unsigned long long random_seed_;
};
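
// T is the tensor data type and M the math type used by the RunOnDevice
// dispatchers below; both dispatch at::Half data with float math, and M is
// not otherwise referenced in these bodies.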
template <typename T, typename M>
bool CuDNNDropoutOp::DoRunWithType() {
  const auto& X = Input(0);
  auto* Y = Output(0);

  auto size_prod = 1;
  for (auto dim : X.sizes()) {
    size_prod *= dim;
  }
  // now actually run the computation
  if (is_test_) {
    if (Y != &X) {
      context_.CopySameDevice<T>(
          X.numel(), X.template data<T>(), Y->template mutable_data<T>());
    }
    return true;
  } else {
    // Reshape tensor descriptors if necessary
    if (X.sizes() != cudnn_input_dims_) {
      CAFFE_ENFORCE(scratch_blob_);
      Tensor* states = BlobGetMutableTensor(scratch_blob_, CUDA);
      cudnn_input_dims_ = X.sizes().vec();
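      // Dropout is elementwise, so the whole input is folded into the N
      // dimension of a (size_prod, 1, 1, 1) descriptor.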
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          data_desc_,
          GetCudnnTensorFormat(StorageOrder::NCHW),
          cudnnTypeWrapper<T>::type,
          size_prod,
          1,
          1,
          1));
      // get the reserve space we need
      CUDNN_ENFORCE(cudnnDropoutGetReserveSpaceSize(
          data_desc_, &reserve_space_size_in_bytes_));
      states->Resize(states_size_in_bytes_);
      if (!states_initialized_) {
        // set the dropout descriptor (note: need to allocate the states data
        // before acquiring the mutex)
        uint8_t* states_data = states->template mutable_data<uint8_t>();
        {
          // Need to protect as clashes with NCCL
          std::lock_guard<std::mutex> lk(CUDAContext::mutex());
          CUDNN_ENFORCE(cudnnSetDropoutDescriptor(
              dropout_desc_,
              cudnn_wrapper_.inline_cudnn_handle(),
              ratio_,
              states_data,
              states_size_in_bytes_,
              random_seed_));
        }
        states_initialized_ = true;
      }
    }
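    // The mask output is sized in bytes, not elements: it is the opaque
    // reserve space that cudnnDropoutBackward needs to see again.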
    auto* mask = Output(
        1,
        {static_cast<int64_t>(reserve_space_size_in_bytes_)},
        at::dtype<uint8_t>());
    CUDNN_ENFORCE(cudnnDropoutForward(
        cudnn_wrapper_.inline_cudnn_handle(),
        dropout_desc_,
        data_desc_,
        X.template data<T>(),
        data_desc_,
        Y->template mutable_data<T>(),
        mask->template mutable_data<uint8_t>(),
        reserve_space_size_in_bytes_));
  }
  return true;
}

bool CuDNNDropoutOp::RunOnDevice() {
  // dispatch based on contents of tensor(s)
  const auto& X = Input(0);
  auto* Y = Output(0);
  Y->ResizeLike(X);

  if (X.IsType<float>()) {
    return DoRunWithType<float, float>();
  } else if (X.IsType<at::Half>()) {
    return DoRunWithType<at::Half, float>();
  }
  return false;
}

template <typename T, typename M>
bool CuDNNDropoutGradientOp::DoRunWithType() {
  const auto& dY = Input(0);
  const auto& mask = Input(1);
  const Tensor& states = scratch_blob_->Get<Tensor>();
  auto* dX = Output(0);

  auto size_prod = 1;
  for (auto dim : dY.sizes()) {
    size_prod *= dim;
  }
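  // Restore rather than re-seed: cudnnRestoreDropoutDescriptor reattaches
  // the descriptor to the states buffer the forward op initialized, without
  // re-randomizing it, so backward matches the recorded mask.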
  if (!states_initialized_) {
    // set the dropout descriptor
    {
      // Need to protect as clashes with NCCL
      std::lock_guard<std::mutex> lk(CUDAContext::mutex());
      CUDNN_ENFORCE(cudnnRestoreDropoutDescriptor(
          dropout_desc_,
          cudnn_wrapper_.inline_cudnn_handle(),
          ratio_,
          const_cast<uint8_t*>(states.data<uint8_t>()),
          states_size_in_bytes_,
          random_seed_));
    }
    states_initialized_ = true;
  }

  if (dY.sizes() != cudnn_input_dims_) {
    cudnn_input_dims_ = dY.sizes().vec();
    CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
        data_desc_,
        GetCudnnTensorFormat(StorageOrder::NCHW),
        cudnnTypeWrapper<T>::type,
        size_prod,
        1,
        1,
        1));
    // get the reserve space we need
    CUDNN_ENFORCE(cudnnDropoutGetReserveSpaceSize(
        data_desc_, &reserve_space_size_in_bytes_));
  }
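  // cudnnDropoutBackward consumes the reserve space that forward filled in
  // (the "mask" input); raw_data() returns const void*, hence the const_cast.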
  // run the computation
  void* mask_data = const_cast<void*>(mask.raw_data());
  CUDNN_ENFORCE(cudnnDropoutBackward(
      cudnn_wrapper_.inline_cudnn_handle(),
      dropout_desc_,
      data_desc_,
      dY.data<T>(),
      data_desc_,
      dX->template mutable_data<T>(),
      mask_data,
      reserve_space_size_in_bytes_));
  return true;
}

bool CuDNNDropoutGradientOp::RunOnDevice() {
  // dispatch based on contents of tensor(s)
  const auto& dY = Input(0);
  auto* dX = Output(0);
  dX->ResizeLike(dY);

  if (dY.IsType<float>()) {
    return DoRunWithType<float, float>();
  } else if (dY.IsType<at::Half>()) {
    return DoRunWithType<at::Half, float>();
  }
  return false;
}
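
// Registered under the CUDNN engine: these are selected over the default
// CUDA Dropout ops when the OperatorDef specifies engine "CUDNN".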
namespace {
REGISTER_CUDNN_OPERATOR(Dropout, CuDNNDropoutOp);
REGISTER_CUDNN_OPERATOR(DropoutGrad, CuDNNDropoutGradientOp);
} // namespace

#endif // CUDNN_VERSION_MIN(7, 0, 0)

} // namespace caffe2