// pytorch/caffe2/operators/inference_lstm_op.h

#ifndef LSTM_OP_H_
#define LSTM_OP_H_

#include <algorithm>
#include <sstream>
#include <unordered_map>
#include <vector>
#include "caffe2/core/blob_serialization.h"
#include "caffe2/core/export_caffe2_op_to_c10.h"
#include <c10/util/irange.h>
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "lstm_utils.h"

C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(LSTMOp);

namespace caffe2 {
namespace {

// State carried by one LSTM layer/direction, stored as a two-element tuple.
// LSTMCell reads element 0 as `hx` and element 1 as `cx`, and returns its
// result as (hy, cy) in the same order.
using t_tuple = std::tuple<Tensor, Tensor>;

struct CellParams {
  CellParams(
      const Tensor& _w_ih,
      const Tensor& _w_hh,
      const Tensor& _b_ih,
      const Tensor& _b_hh,
      CPUContext* _context) {
    initParams(_w_ih, _w_hh, _b_ih, _b_hh, _context);
  }

  CellParams(const CellParams& rhs) {
    initParams(rhs.w_ih, rhs.w_hh, rhs.b_ih, rhs.b_hh, rhs.context);
  }

  CellParams& operator=(const CellParams& rhs) {
    initParams(rhs.w_ih, rhs.w_hh, rhs.b_ih, rhs.b_hh, rhs.context);
    return *this;
  }

  void initParams(
      const Tensor& _w_ih,
      const Tensor& _w_hh,
      const Tensor& _b_ih,
      const Tensor& _b_hh,
      CPUContext* _context) {
    w_ih = copy_ctor(_w_ih);
    w_hh = copy_ctor(_w_hh);
    b_ih = copy_ctor(_b_ih);
    b_hh = copy_ctor(_b_hh);
    context = _context;
  }

  Tensor w_ih;
  Tensor w_hh;
  Tensor b_ih; /* optional */
  Tensor b_hh; /* optional */
  CPUContext* context;

  Tensor linear_ih(const Tensor& input) const {
    return linear(input, w_ih, b_ih, context);
  }
  Tensor linear_hh(const Tensor& h) const {
    return linear(h, w_hh, b_hh, context);
  }
};

// Functor that advances an LSTM by exactly one step.
struct LSTMCell {
  explicit LSTMCell(CPUContext* context) : context_(context) {}

  // input:  input tensor for this step
  // hidden: (hx, cx) state coming into this step
  // params: weights/biases of the cell
  // Returns the new state as (hy, cy).
  t_tuple operator()(
      const Tensor& input,
      const t_tuple& hidden,
      const CellParams& params) const {
    const Tensor& h_prev = std::get<0>(hidden);
    const Tensor& c_prev = std::get<1>(hidden);

    // Sum of the input and hidden projections, then split into four equal
    // chunks along dimension 1 -- one chunk per gate.
    auto input_term = params.linear_ih(input);
    auto hidden_term = params.linear_hh(h_prev);
    auto gate_preacts = add(input_term, hidden_term, context_);
    auto gate_chunks = chunk(gate_preacts, 4, 1, context_);

    // Chunk order: input gate, forget gate, cell candidate, output gate.
    auto in_gate = sigmoid(gate_chunks[0]);
    auto forget_gate = sigmoid(gate_chunks[1]);
    auto cell_candidate = tanh(gate_chunks[2], context_);
    auto out_gate = sigmoid(gate_chunks[3]);

    // cy = forget_gate * cx + in_gate * cell_candidate
    auto retained = mul(forget_gate, c_prev, context_);
    auto written = mul(in_gate, cell_candidate, context_);
    auto c_next = add(retained, written, context_);

    // hy = out_gate * tanh(cy)
    auto squashed = tanh(c_next, context_);
    auto h_next = mul(out_gate, squashed, context_);

    return std::make_tuple(std::move(h_next), std::move(c_next));
  }

  CPUContext* context_;
};

// Result of running a layer (or a stack of layers): the produced outputs and
// the hidden state left behind afterwards. Both constructor arguments are
// stored through copy_ctor() rather than copied directly.
template <typename output_type, typename hidden_type>
struct LayerOutput {
  output_type outputs;
  hidden_type final_hidden;

  LayerOutput(const output_type& _outputs, const hidden_type& _hidden)
      : outputs(copy_ctor(_outputs)), final_hidden(copy_ctor(_hidden)) {}
};

// Abstract interface of a recurrent layer. An implementation maps
// (input, incoming hidden state, parameters) to a LayerOutput holding the
// layer's output tensor and its final hidden state.
template <typename hidden_type, typename param_type>
struct Layer {
  using output_type = LayerOutput<Tensor, hidden_type>;

  virtual ~Layer() = default;

  // Runs the layer over `input`, starting from `input_hidden`.
  virtual output_type operator()(
      const Tensor& input,
      const hidden_type& input_hidden,
      const param_type& params) const = 0;
};

// Unidirectional LSTM layer: feeds a whole sequence through one LSTMCell,
// step by step, threading the hidden state from each step into the next.
struct FullLSTMLayer : Layer<t_tuple, CellParams> {
  FullLSTMLayer(LSTMCell& cell, CPUContext* context)
      : cell_(cell), context_(context) {}

  // Sequence given as one tensor per step. Returns the hy of every step (in
  // step order) together with the state left after the last step.
  LayerOutput<std::vector<Tensor>, t_tuple> operator()(
      const std::vector<Tensor>& step_inputs,
      const std::tuple<Tensor, Tensor>& input_hidden,
      const CellParams& params) const {
    auto state = copy_ctor(input_hidden);
    std::vector<Tensor> per_step_h;

    for (const Tensor& step_input : step_inputs) {
      state = cell_(step_input, state, params);
      per_step_h.push_back(copy_ctor(std::get<0>(state)));
    }

    return {per_step_h, state};
  }

  // Sequence given as a single tensor: it is unbound along dimension 0, run
  // through the per-step overload, and the step outputs are stacked back
  // along dimension 0.
  LayerOutput<Tensor, t_tuple> operator()(
      const Tensor& inputs,
      const std::tuple<Tensor, Tensor>& input_hidden,
      const CellParams& params) const override {
    auto per_step = (*this)(unbind(inputs, 0, context_), input_hidden, params);
    auto stacked = stack(per_step.outputs, 0, context_);
    return {stacked, per_step.final_hidden};
  }

  LSTMCell cell_;
  CPUContext* context_;
};

// Bidirectional LSTM layer built from one FullLSTMLayer that is run twice:
// once over the sequence as given and once over the reversed sequence. The
// `.first` members of the hidden/param pairs drive the forward run, the
// `.second` members drive the reversed run.
struct FullBidirectionalLSTMLayer
    : Layer<std::pair<t_tuple, t_tuple>, std::pair<CellParams, CellParams>> {
  using bidir_hidden_type = std::pair<t_tuple, t_tuple>;
  using param_type = std::pair<CellParams, CellParams>;
  using output_type = LayerOutput<Tensor, bidir_hidden_type>;

  FullBidirectionalLSTMLayer(LSTMCell& cell, CPUContext* context)
      : layer_(cell, context), context_(context) {}

  // Returns the two directions' outputs concatenated along the last
  // dimension, plus the pair (forward final state, reverse final state).
  output_type operator()(
      const Tensor& input,
      const bidir_hidden_type& input_hidden,
      const param_type& params) const override {
    // Forward direction.
    auto forward_steps = unbind(input, 0, context_);
    auto forward_pass =
        layer_(forward_steps, input_hidden.first, params.first);
    auto forward_out = stack(forward_pass.outputs, 0, context_);

    // Reverse direction: run over the flipped sequence, then flip the step
    // outputs back so they line up with the forward ones.
    auto backward_steps = reverse(std::move(forward_steps));
    auto backward_pass =
        layer_(backward_steps, input_hidden.second, params.second);
    std::reverse(backward_pass.outputs.begin(), backward_pass.outputs.end());
    auto backward_out = stack(backward_pass.outputs, 0, context_);

    std::vector<Tensor> both_directions;
    both_directions.push_back(copy_ctor(forward_out));
    both_directions.push_back(copy_ctor(backward_out));

    auto merged = cat(both_directions, forward_out.dim() - 1, context_);
    return {merged,
            std::make_pair(
                std::move(forward_pass.final_hidden),
                std::move(backward_pass.final_hidden))};
  }

  // Consumes `x` and returns its elements in reverse order.
  inline std::vector<Tensor> reverse(std::vector<Tensor>&& x) const {
    std::vector<Tensor> flipped(std::move(x));
    std::reverse(flipped.begin(), flipped.end());
    return flipped;
  }

 private:
  FullLSTMLayer layer_;
  CPUContext* context_;
};

// Runs `layer` `num_layers` times in sequence. Layer l receives hiddens[l]
// and weights[l]; the output of each layer is the input of the next.
//
// Enforces that both `hiddens` and `weights` hold exactly `num_layers`
// entries. Returns the output of the last layer and the final hidden state of
// every layer, in layer order.
template <typename hidden_type, typename weight_type>
LayerOutput<Tensor, std::vector<hidden_type>> apply_layer_stack(
    const Layer<hidden_type, weight_type>& layer,
    const Tensor& input,
    const std::vector<hidden_type>& hiddens,
    const std::vector<weight_type>& weights,
    int64_t num_layers) {
  CAFFE_ENFORCE(
      num_layers == hiddens.size(),
      "Expected more hidden states in stacked_rnn");
  CAFFE_ENFORCE(
      num_layers == weights.size(), "Expected more weights in stacked_rnn");

  Tensor current = input.UnsafeSharedInstance();
  std::vector<hidden_type> collected_hiddens;
  collected_hiddens.reserve(num_layers);

  for (const auto idx : c10::irange(num_layers)) {
    auto step = layer(current, hiddens[idx], weights[idx]);
    collected_hiddens.push_back(std::move(step.final_hidden));
    // This layer's output becomes the next layer's input.
    current = std::move(step.outputs);
  }

  return {current, collected_hiddens};
}

std::tuple<Tensor, Tensor, Tensor> _lstm_impl(
    const Tensor& input,
    const std::vector<CellParams>& params,
    const Tensor& hx,
    const Tensor& cx,
    int64_t num_layers,
    bool bidirectional,
    CPUContext* context) {
  using stack_output = LayerOutput<Tensor, std::vector<t_tuple>>;
  auto layer_hx = unbind(hx, 0, context);
  auto layer_cx = unbind(cx, 0, context);
  int64_t total_layers = layer_hx.size();
  std::vector<std::tuple<Tensor, Tensor>> hiddens;
  hiddens.reserve(total_layers);
  for (const auto i : c10::irange(total_layers)) {
    hiddens.emplace_back(std::move(layer_hx[i]), std::move(layer_cx[i]));
  }
  LSTMCell cell(context);
  std::shared_ptr<stack_output> stack_output_ptr;
  if (bidirectional) {
    auto bidir_result = apply_layer_stack(
        FullBidirectionalLSTMLayer{cell, context},
        input,
        pair_vec(hiddens),
        pair_vec(params),
        num_layers);
    stack_output_ptr.reset(new stack_output(
        bidir_result.outputs,
        unpair_vec(std::move(bidir_result.final_hidden))));
  } else {
    auto result = apply_layer_stack(
        FullLSTMLayer{cell, context}, input, hiddens, params, num_layers);
    stack_output_ptr = std::make_shared<stack_output>(std::move(result));
  }

  std::vector<Tensor> hy, cy;
  hy.reserve(total_layers);
  cy.reserve(total_layers);
  for (auto& hidden : stack_output_ptr->final_hidden) {
    hy.push_back(std::move(std::get<0>(hidden)));
    cy.push_back(std::move(std::get<1>(hidden)));
  }
  return std::make_tuple(
      std::move(stack_output_ptr->outputs),
      stack(hy, 0, context),
      stack(cy, 0, context));
}

// Parses a flat list of parameter tensors into a list of CellParams
std::vector<CellParams> gather_params(
    const std::vector<Tensor>& params,
    bool has_biases,
    CPUContext* context) {
  Tensor undefined;
  std::vector<CellParams> result;
  if (has_biases) {
    CAFFE_ENFORCE_EQ(
        params.size() % 4, 0, "got an incorrect number of LSTM parameters");
    for (size_t i = 0; i < params.size(); i += 4) {
      result.emplace_back(
          params[i], params[i + 1], params[i + 2], params[i + 3], context);
    }
  } else {
    CAFFE_ENFORCE_EQ(
        params.size() % 2, 0, "got an incorrect number of LSTM parameters");
    for (size_t i = 0; i < params.size(); i += 2) {
      result.emplace_back(
          params[i], params[i + 1], undefined, undefined, context);
    }
  }
  return result;
}

// CPU operator wrapping the LSTM inference implementation. Only the
// configuration is captured here; RunOnDevice() is declared in this header
// and defined elsewhere.
//
// Operator arguments (with defaults):
//   num_layers    (1)     number of stacked layers
//   bidirectional (false) whether a bidirectional LSTM is requested
//   has_biases    (true)  whether bias tensors are supplied with the weights
//   batch_first   (false) presumably selects batch-major input layout --
//                         TODO confirm against RunOnDevice()
class InferenceLSTMOp : public Operator<CPUContext> {
 public:
  template <class... Args>
  explicit InferenceLSTMOp(Args&&... args)
      : Operator(std::forward<Args>(args)...),
        num_layers_(OperatorBase::GetSingleArgument<int64_t>("num_layers", 1)),
        bidirectional_(
            OperatorBase::GetSingleArgument<bool>("bidirectional", false)),
        has_biases_(OperatorBase::GetSingleArgument<bool>("has_biases", true)),
        batch_first_(
            OperatorBase::GetSingleArgument<bool>("batch_first", false)) {}

  bool RunOnDevice() override;

 protected:
  int64_t num_layers_;
  bool bidirectional_;
  bool has_biases_;
  bool batch_first_;
};

} // namespace
} // namespace caffe2
#endif // LSTM_OP_H_