#include "caffe2/sgd/lars_op.h"

namespace caffe2 {

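// Computes the LARS local learning rate on CPU. X_norm and dX_norm are the
// precomputed L2 norms of the parameter tensor and its gradient; the result
// written to lr_rescaled is trust / (norm(dX)/norm(X) + wd + offset), clipped
// to the range [lr_min, *lr_max].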
template <>
void LarsOp<float, CPUContext>::ComputeLearningRate(
    const float* wd,
    const float* trust,
    const float* lr_max,
    float offset,
    float lr_min,
    float* X_norm,
    float* dX_norm,
    float* lr_rescaled) {
  float val = 1.0;

  if (*X_norm > 0) {
    val = (*trust) / (*dX_norm / *X_norm + (*wd) + offset);
  }
  *lr_rescaled = fmaxf(fminf(val, *lr_max), lr_min);
}
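// Hypothetical worked example (illustrative values, not from the original
// source): with *trust = 0.001, *dX_norm / *X_norm = 0.02, *wd = 1e-4 and
// offset = 1e-5, val = 0.001 / (0.02 + 1e-4 + 1e-5), which is about 0.0497;
// with *lr_max = 1.0 and lr_min = 0.0 the clipping leaves it unchanged, so
// *lr_rescaled is about 0.0497.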

REGISTER_CPU_OPERATOR(Lars, LarsOp<float, CPUContext>);

OPERATOR_SCHEMA(Lars)
    .NumInputs(5)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Implements Layer-wise Adaptive Rate Scaling (LARS) with clipping. Before adding weight
decay, given a parameter tensor X and its gradient dX, the local learning rate
for X is

local_lr = trust * norm(X) / ( norm(dX) + wd * norm(X) + offset * norm(X) )
         = trust / ( norm(dX) / norm(X) + wd + offset ),

where offset is a preset hyper-parameter that avoids numerical issues, and trust
indicates how much we trust the layer to change its parameters during one update.
In this implementation, we use the L2 norm, and the computed local learning rate is
clipped to the upper bound lr_max and the lower bound lr_min:

local_lr = min(local_lr, lr_max) and local_lr = max(local_lr, lr_min)

)DOC")
    .Input(0, "X", "Parameter tensor")
    .Input(1, "dX", "Gradient tensor")
    .Input(2, "wd", "Weight decay")
    .Input(3, "trust", "Trust")
    .Input(4, "lr_max", "Upper bound of learning rate")
    .Output(0, "lr_rescaled", "Rescaled local learning rate")
    .Arg("offset", "rescaling offset parameter")
    .Arg("lr_min", "minimum learning rate for clipping");

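// No gradient is registered for Lars: the operator only rescales the learning
// rate, so requesting its gradient is treated as an error.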
SHOULD_NOT_DO_GRADIENT(Lars);
} // namespace caffe2