pytorch/caffe2/sgd/lars_op.cc

#include "caffe2/sgd/lars_op.h"
namespace caffe2 {
template <>
void LarsOp<float, CPUContext>::ComputeLearningRate(
    const float* wd,
    const float* trust,
    const float* lr_max,
    float offset,
    float lr_min,
    float* X_norm,
    float* dX_norm,
    float* lr_rescaled) {
  float val = 1.0f;
  // Guard against dividing by a zero parameter norm; fall back to a neutral scale of 1.
  if (*X_norm > 0) {
    val = (*trust) / (*dX_norm / *X_norm + (*wd) + offset);
  }
  // Clip the rescaled learning rate to the range [lr_min, *lr_max].
  *lr_rescaled = fmaxf(fminf(val, *lr_max), lr_min);
}
REGISTER_CPU_OPERATOR(Lars, LarsOp<float, CPUContext>);
OPERATOR_SCHEMA(Lars)
    .NumInputs(5)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Implements Layer-wise Adaptive Rate Scaling (LARS) with clipping. Before adding
weight decay, given a parameter tensor X and its gradient dX, the local learning
rate for X is

  local_lr = trust * norm(X) / ( norm(dX) + wd * norm(X) + offset * norm(X) )
           = trust / ( norm(dX) / norm(X) + wd + offset ),

where offset is a preset hyper-parameter used to avoid numerical issues, and
trust indicates how much we trust the layer to change its parameters during one
update. This implementation uses the L2 norm, and the computed local learning
rate is clipped based on the upper bound lr_max and the lower bound lr_min:

  local_lr = min(local_lr, lr_max) and local_lr = max(local_lr, lr_min)
)DOC")
    .Input(0, "X", "Parameter tensor")
    .Input(1, "dX", "Gradient tensor")
    .Input(2, "wd", "Weight decay")
    .Input(3, "trust", "Trust coefficient")
    .Input(4, "lr_max", "Upper bound of the learning rate")
    .Output(0, "lr_rescaled", "Rescaled local learning rate")
    .Arg("offset", "Rescaling offset parameter")
    .Arg("lr_min", "Minimum learning rate for clipping");
SHOULD_NOT_DO_GRADIENT(Lars);
} // namespace caffe2
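
For reference, here is a minimal standalone sketch of the same local learning-rate
computation (hypothetical and illustrative only, not part of the caffe2 sources): a
scalar helper that mirrors the formula and clipping used by ComputeLearningRate,
run on made-up sample norms and hyper-parameters.

// lars_local_lr_example.cc -- hypothetical sketch, not part of caffe2.
#include <algorithm>
#include <cstdio>

// Mirrors the schema doc:
//   local_lr = trust / (norm(dX) / norm(X) + wd + offset),
// clipped to [lr_min, lr_max].
float ComputeLocalLr(
    float X_norm,
    float dX_norm,
    float wd,
    float trust,
    float offset,
    float lr_min,
    float lr_max) {
  float val = 1.0f;
  // Guard against a zero parameter norm, as the operator does.
  if (X_norm > 0) {
    val = trust / (dX_norm / X_norm + wd + offset);
  }
  return std::max(std::min(val, lr_max), lr_min);
}

int main() {
  // Sample values chosen purely for illustration.
  const float X_norm = 10.0f;  // pretend ||X||_2
  const float dX_norm = 2.0f;  // pretend ||dX||_2
  const float lr = ComputeLocalLr(
      X_norm,
      dX_norm,
      /*wd=*/1e-4f,
      /*trust=*/0.001f,
      /*offset=*/1e-5f,
      /*lr_min=*/1e-6f,
      /*lr_max=*/1.0f);
  // 0.001 / (0.2 + 1e-4 + 1e-5) is roughly 0.005, which lies inside [lr_min, lr_max].
  std::printf("local_lr = %f\n", lr);
  return 0;
}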