// caffe2/operators/h_softmax_op.cc
#include "caffe2/operators/h_softmax_op.h"
#include <queue>
#include <stack>
namespace caffe2 {
template <>
float HSoftmaxOp<float, CPUContext>::RunForwardSingle(const float* X,
    const float* W, const float* b, int target, float* int_output,
    const float* bias_multiplier, int dim_out, int dim_in,
    int& int_output_offset) {
  // W * x
  float* fc_output_data = int_output + int_output_offset;

  math::Gemm<float, CPUContext>(CblasNoTrans, CblasTrans, 1, dim_out, dim_in,
      1, X, W, 0, fc_output_data, &context_);
  math::Gemv<float, CPUContext>(CblasNoTrans, dim_out, 1, 1,
      b, bias_multiplier, 1, fc_output_data, &context_);

  int_output_offset += dim_out;

  // Softmax
  float* softmax_output_data = int_output + int_output_offset;

  if (!scale_.has_value()) {
    scale_ = caffe2::empty({1}, at::dtype<float>().device(CPU));
  }
  if (!sum_multiplier_.has_value()) {
    sum_multiplier_ = caffe2::empty({dim_out}, at::dtype<float>().device(CPU));
    math::Set<float, CPUContext>(dim_out, 1.f,
        sum_multiplier_->mutable_data<float>(), &context_);
  } else if (sum_multiplier_->numel() != dim_out) {
    sum_multiplier_->Resize(dim_out);
    math::Set<float, CPUContext>(dim_out, 1.f,
        sum_multiplier_->mutable_data<float>(), &context_);
  }
  math::RowwiseMax<float, CPUContext>(1, dim_out, fc_output_data,
      scale_->mutable_data<float>(), &context_);

  // Put the intermediate result X - max(X) into Y
  context_.template CopyFromCPU<float>(
      dim_out, fc_output_data, softmax_output_data);
  // Subtract the max (stored in scale_) for numerical stability
  math::Gemv<float, CPUContext>(CblasNoTrans, dim_out, 1, -1,
      sum_multiplier_->data<float>(), scale_->data<float>(), 1,
      softmax_output_data, &context_);
  // Exponentiation
  math::Exp<float, CPUContext>(dim_out, softmax_output_data,
      softmax_output_data, &context_);
  math::Gemv<float, CPUContext>(CblasNoTrans, 1, dim_out, 1,
      softmax_output_data, sum_multiplier_->data<float>(), 0,
      scale_->mutable_data<float>(), &context_);
  // Normalize by the sum (now stored in scale_)
  const float scale = *(scale_->data<float>());
  for (int j = 0; j < dim_out; ++j) {
    softmax_output_data[j] /= scale;
  }

  int_output_offset += dim_out;

  if (target < 0) {
    return -1;
  }
  // Return the cross entropy loss
  return -log(std::max(softmax_output_data[target], kLOG_THRESHOLD()));
}
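
// For reference, a minimal scalar sketch of what the Gemm/Gemv calls above
// compute for one node: a max-shifted softmax followed by the negative log
// likelihood of `target`. `node_loss_sketch` is a hypothetical helper for
// illustration only and is not part of the op:
//
//   float node_loss_sketch(const float* logits, int dim_out, int target) {
//     float mx = *std::max_element(logits, logits + dim_out);
//     std::vector<float> y(dim_out);
//     float sum = 0.f;
//     for (int j = 0; j < dim_out; ++j) {
//       y[j] = std::exp(logits[j] - mx); // exp(x_j - max(x)), for stability
//       sum += y[j];
//     }
//     // y[j] / sum is the softmax probability of class j
//     return -std::log(std::max(y[target] / sum, kLOG_THRESHOLD()));
//   }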

// Implementation for the CPU context.
template <>
bool HSoftmaxOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  const auto& W = Input(1);
  const auto& b = Input(2);
  auto& label = Input(3);

  // Batch size
  int M = X.dim() > 1 ? X.dim32(0) : 1;
  // Input feature dimension
  size_t K = X.numel() / M;
  CAFFE_ENFORCE_GE(W.dim(), 2); // N*K
  CAFFE_ENFORCE_EQ(b.dim(), 1); // N
  CAFFE_ENFORCE_EQ(K, W.numel() / (W.dim32(0)));
  // Sum of output dimensions of all hierarchy nodes
  int N = W.dim32(0);
  CAFFE_ENFORCE_EQ(N, b.dim32(0));
  auto* Y = Output(0, {M}, at::dtype<float>());
  auto* Ydata = Y->template mutable_data<float>();
  math::Set<float, CPUContext>(M, 0.f, Ydata, &context_);
  const auto* labeldata = label.data<int>();

  auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
  int int_output_size = getIntermediateOutputSize(labeldata, M, hierarchy);
  auto* intermediate_output = Output(1, {int_output_size}, at::dtype<float>());
  float* int_output_data = intermediate_output->template mutable_data<float>();
  int int_output_offset = 0;

  if (!bias_multiplier_.has_value()) {
    bias_multiplier_ = caffe2::empty({M}, at::dtype<float>().device(CPU));
    math::Set<float, CPUContext>(M, static_cast<float>(1),
        bias_multiplier_->mutable_data<float>(), &context_);
  } else if (bias_multiplier_->numel() != M) {
    bias_multiplier_->Resize(M);
    math::Set<float, CPUContext>(M, static_cast<float>(1),
        bias_multiplier_->mutable_data<float>(), &context_);
  }

  for (int sample = 0; sample < M; ++sample) {
    int word_id = labeldata[sample];
    const PathProto& path = hierarchy[word_id];
    for (const PathNodeProto& node : path.path_nodes()) {
      // Offset of node's weight matrix in W
      int w_offset = node.index();
      // Number of output dimensions in node's weight matrix
      int w_length = node.length();
      int target = node.target();
      // Accumulate the per-node losses (negative log probabilities)
      Ydata[sample] += RunForwardSingle(X.data<float>() + sample*K,
          W.data<float>() + w_offset*K, b.data<float>() + w_offset, target,
          int_output_data, bias_multiplier_->data<float>()+sample, w_length, K,
          int_output_offset);
    }
  }
  return true;
}
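
// Note on the accumulation above: for a path (n_1, ..., n_L) from the root to
// the label, the hierarchical model factorizes
//   P(label | x) = prod_{i=1..L} softmax(W_{n_i} x + b_{n_i})[target_i],
// so Ydata[sample] ends up holding
//   -log P(label | x) = sum_{i=1..L} -log softmax(W_{n_i} x + b_{n_i})[target_i],
// i.e. the sum of the per-node losses returned by RunForwardSingle.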

template <>
void HSoftmaxGradientOp<float, CPUContext>::RunBackwardSingle(const float* X,
    const float* dY, const float* W, int target,
    const float* int_output, float* dX, float* dW, float* db,
    float* dint_output, int dim_in, int dim_out, int& int_output_offset) {
  // Cross entropy
  // dX_entropy is the dX for the cross entropy layer
  float* dX_entropy = dint_output + int_output_offset - dim_out;
  // X_entropy is the X for the cross entropy layer and Y for the softmax layer
  const float* X_entropy = int_output + int_output_offset - dim_out;
  math::Set<float, CPUContext>(dim_out, 0.f, dX_entropy, &context_);
  dX_entropy[target] = - (*dY) / std::max(X_entropy[target], kLOG_THRESHOLD());

  int_output_offset -= dim_out;

  // Softmax
  if (!scale_.has_value()) {
    scale_ = caffe2::empty({1}, at::dtype<float>().device(CPU));
  }
  float* scaledata = scale_->mutable_data<float>();

  if (!sum_multiplier_.has_value()) {
    sum_multiplier_ = caffe2::empty({dim_out}, at::dtype<float>().device(CPU));
    math::Set<float, CPUContext>(dim_out, 1.f,
        sum_multiplier_->mutable_data<float>(), &context_);
  } else if (sum_multiplier_->numel() != dim_out) {
    sum_multiplier_->Resize(dim_out);
    math::Set<float, CPUContext>(dim_out, 1.f,
        sum_multiplier_->mutable_data<float>(), &context_);
  }

  float* dX_softmax = dint_output + int_output_offset - dim_out;
  context_.CopyFromCPU<float>(dim_out, dX_entropy, dX_softmax);

  math::Dot<float, CPUContext>(dim_out, X_entropy, dX_entropy, scaledata,
      &context_);
  math::Gemv<float, CPUContext>(CblasTrans, 1, dim_out, -1,
      sum_multiplier_->data<float>(), scaledata, 1, dX_softmax, &context_);
  math::Mul<float, CPUContext>(dim_out, dX_softmax, X_entropy, dX_softmax,
      &context_);

  int_output_offset -= dim_out;

  // FC
  if (!bias_multiplier_.has_value()) {
    // If the helper bias multiplier has not been created, reshape and fill
    // it with 1
    bias_multiplier_ = caffe2::empty({1}, at::dtype<float>().device(CPU));
    math::Set<float, CPUContext>(1, static_cast<float>(1),
        bias_multiplier_->template mutable_data<float>(), &context_);
  }

  // Compute dW and add incrementally
  // dW = dW + dX_softmax' * X
  math::Gemm<float, CPUContext>(CblasTrans, CblasNoTrans, dim_out, dim_in, 1,
      1, dX_softmax, X, 1, dW, &context_);

  // Compute db and add incrementally
  // db = db + dX_softmax * bias_multiplier_
  math::Gemv<float, CPUContext>(CblasTrans, 1, dim_out, 1, dX_softmax,
      bias_multiplier_->template data<float>(), 1, db, &context_);

  // Compute dX and add incrementally
  // dX = dX + W' * dX_softmax
  math::Gemv<float, CPUContext>(CblasTrans, dim_out, dim_in,
      1, W, dX_softmax, 1, dX, &context_);
}
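
// The Dot/Gemv/Mul sequence above is the standard softmax backward identity:
// with y = softmax(z) and upstream gradient g = dL/dy,
//   dL/dz_j = y_j * (g_j - sum_k g_k * y_k),
// where Dot computes the inner product sum_k g_k * y_k into scaledata, Gemv
// broadcasts and subtracts it from the copy of g held in dX_softmax, and Mul
// multiplies elementwise by y (X_entropy).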

// Implementation for the CPU context.
template <>
bool HSoftmaxGradientOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  const auto& W = Input(1);
  const auto& b = Input(2);
  auto& label = Input(3);
  auto& intermediate_output = Input(4);
  auto& dY = Input(5);

  auto* dX = Output(0, X.sizes(), at::dtype<float>());
  auto* dW = Output(1, W.sizes(), at::dtype<float>());
  auto* db = Output(2, b.sizes(), at::dtype<float>());
  auto* dX_intermediate_output =
      Output(3, intermediate_output.sizes(), at::dtype<float>());

  float* dX_data = dX->template mutable_data<float>();
  float* dW_data = dW->template mutable_data<float>();
  float* db_data = db->template mutable_data<float>();
  float* dOutput_data = dX_intermediate_output->template mutable_data<float>();

  math::Set<float, CPUContext>(X.numel(), 0.f, dX_data, &context_);
  math::Set<float, CPUContext>(W.numel(), 0.f, dW_data, &context_);
  math::Set<float, CPUContext>(b.numel(), 0.f, db_data, &context_);
  math::Set<float, CPUContext>(
      intermediate_output.numel(), 0.f, dOutput_data, &context_);

  // Batch size
  int M = X.dim() > 1 ? X.dim32(0) : 1;
  // Input feature dimension
  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  int K = X.numel() / M;
  const auto* labeldata = label.data<int>();

  auto hierarchy = getHierarchyForLabels(M, labeldata, hierarchy_all_map_);
  int output_offset = getIntermediateOutputSize(labeldata, M, hierarchy);

  // Traverse backward so the intermediate_output generated by HSoftmaxOp is
  // consumed sequentially in reverse order
  for (int sample = M - 1; sample >= 0; sample--) {
    int word_id = labeldata[sample];
    PathProto path = hierarchy[word_id];
    for (auto node = path.path_nodes().rbegin();
         node != path.path_nodes().rend(); node++) {
      int w_offset = node->index();
      int w_length = node->length();
      int target = node->target();
      RunBackwardSingle(X.data<float>() + sample*K, dY.data<float>() + sample,
          W.data<float>() + w_offset*K, target,
          intermediate_output.data<float>(),
          dX_data + sample*K, dW_data + w_offset*K, db_data + w_offset,
          dOutput_data, K, w_length, output_offset);
    }
  }
  return true;
}
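
// A note on the offset bookkeeping: the forward pass writes, per node, one
// dim_out block of FC output followed by one dim_out block of softmax output.
// Starting output_offset at the total intermediate size and walking samples
// and path nodes in reverse lets RunBackwardSingle peel those blocks off the
// end (it decrements int_output_offset by dim_out twice per node), so the
// reads line up exactly with the forward writes.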

// Implementation for the CPU context.
template <>
bool HSoftmaxSearchOp<float, CPUContext>::pruning(
    const float* X,
    int sample,
    int K,
    const float* W,
    const float* b,
    const NodeProto& src_node,
    NodeProto& dst_node,
    float parent_score,
    float beam) {
  int w_length = src_node.children_size() + src_node.word_ids_size();
  Tensor intermediate_data{CPU};
  intermediate_data.Resize(2 * w_length);
  float* int_output_data = intermediate_data.template mutable_data<float>();
  int int_output_offset = 0;
  int w_offset = src_node.offset();
  RunForwardSingle(
      X + K * sample,
      W + w_offset * K,
      b + w_offset,
      -1,
      int_output_data,
      bias_multiplier_->template data<float>() + sample,
      w_length,
      K,
      int_output_offset);

  float* softmax_output_data = int_output_data + w_length;
  // Convert the softmax probabilities into cumulative path scores
  // (negative log probabilities accumulated from the root)
  for (int i = 0; i < w_length; i++) {
    softmax_output_data[i] =
        -log(std::max(softmax_output_data[i], kLOG_THRESHOLD())) +
        parent_score;
  }
  for (int i = 0; i < src_node.children_size(); i++) {
    if (softmax_output_data[i] < parent_score + beam) {
      dst_node.add_children();
      int idx = dst_node.children_size() - 1;
      CAFFE_ENFORCE(
          src_node.children(i).has_offset(),
          "HSM Search requires the field offset in NodeProto");
      dst_node.mutable_children(idx)->set_offset(src_node.children(i).offset());
      CAFFE_ENFORCE(
          src_node.children(i).has_name(),
          "HSM Search requires the field name in NodeProto");
      dst_node.mutable_children(idx)->set_name(src_node.children(i).name());
      dst_node.add_scores(softmax_output_data[i]);
      pruning(
          X,
          sample,
          K,
          W,
          b,
          src_node.children(i),
          *dst_node.mutable_children(idx),
          softmax_output_data[i],
          beam);
    }
  }

  for (int i = src_node.children_size(); i < w_length; i++) {
    if (softmax_output_data[i] < parent_score + beam) {
      dst_node.add_word_ids(src_node.word_ids(i - src_node.children_size()));
      dst_node.add_scores(softmax_output_data[i]);
    }
  }
  return true;
}
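
// A small numeric illustration of the beam rule above (values hypothetical):
// a candidate is kept iff score < parent_score + beam, and since
// score = parent_score - log(p), this reduces to -log(p) < beam. With
// beam = 1.5, a child with softmax probability p = 0.5 (-log(p) ~ 0.69) is
// expanded, while one with p = 0.1 (-log(p) ~ 2.30) is pruned. Scores are
// cumulative negative log probabilities, so smaller means more probable.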

template <>
bool HSoftmaxSearchOp<float, CPUContext>::extractNodes(
    const NodeProto& node,
    std::vector<std::pair<string, float>>& info) {
  int i = 0;

  for (const auto& n : node.children()) {
    info.emplace_back(std::make_pair(n.name(), node.scores(i++)));
  }
  for (const int n : node.word_ids()) {
    info.emplace_back(std::make_pair(c10::to_string(n), node.scores(i++)));
  }

  for (const auto& n : node.children()) {
    extractNodes(n, info);
  }
  return true;
}
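
// Note: extractNodes pairs names with scores through the shared `scores`
// field, reading children first and then word_ids, which matches the order in
// which pruning() appends scores; the recursion then flattens the whole
// surviving subtree into `info`.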

// Implementation for the CPU context.
template <>
bool HSoftmaxSearchOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  const auto& W = Input(1);
  const auto& b = Input(2);

  // Batch size
  int M = X.dim() > 1 ? X.dim32(0) : 1;
  // Input feature dimension
  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  int K = X.numel() / M;
  CAFFE_ENFORCE(W.dim() == 2, "Weight must be a matrix."); // N*K
  CAFFE_ENFORCE(b.dim() == 1, "Bias must be a vector."); // N
  CAFFE_ENFORCE(K == W.numel() / (W.dim32(0)), "feature dimension mismatch.");
  // Sum of output dimensions of all hierarchy nodes
  int N = W.dim32(0);
  CAFFE_ENFORCE(N == b.dim32(0), "mismatch between Weight and Bias.");
  auto* Y_names = Output(0, {M, top_n_}, at::dtype<string>());
  auto* Y_scores = Output(1, {M, top_n_}, at::dtype<float>());

  if (!bias_multiplier_.has_value()) {
    bias_multiplier_ = caffe2::empty({M}, at::dtype<float>().device(CPU));
    math::Set<float, CPUContext>(M, static_cast<float>(1),
        bias_multiplier_->mutable_data<float>(), &context_);
  } else if (bias_multiplier_->numel() != M) {
    bias_multiplier_->Resize(M);
    math::Set<float, CPUContext>(M, static_cast<float>(1),
        bias_multiplier_->mutable_data<float>(), &context_);
  }
  for (int sample = 0; sample < M; ++sample) {
    CAFFE_ENFORCE(
        tree_.root_node().has_offset(),
        "HSM Search requires the field offset in NodeProto");
    CAFFE_ENFORCE(
        tree_.root_node().has_name(),
        "HSM Search requires the field name in NodeProto");

    NodeProto dst_node;
    dst_node.set_offset(tree_.root_node().offset());
    dst_node.set_name(tree_.root_node().name());
    pruning(
        X.data<float>(),
        sample,
        K,
        W.data<float>(),
        b.data<float>(),
        tree_.root_node(),
        dst_node,
        0,
        beam_);

    std::vector<std::pair<string, float>> info;
    extractNodes(dst_node, info);

    // Save the results for each sample.
    // Sort ascending by score: scores are cumulative negative log
    // probabilities, so the smallest scores are the most probable nodes.
    std::partial_sort(
        info.begin(),
        info.begin() + std::min<size_t>(top_n_, info.size()),
        info.end(),
        [&](const std::pair<string, float>& a,
            const std::pair<string, float>& b) {
          return a.second < b.second;
        });
    auto* y_name_data =
        Y_names->template mutable_data<string>() + sample * top_n_;
    auto* y_score_data =
        Y_scores->template mutable_data<float>() + sample * top_n_;
    for (int i = 0; i < top_n_; i++) {
      // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
      if (i < info.size()) {
        y_name_data[i] = info[i].first;
        y_score_data[i] = info[i].second;
      } else {
        y_score_data[i] = 0;
      }
    }
  }
  return true;
}

template <typename T, class Context>
bool HuffmanTreeHierarchyOp<T, Context>::RunOnDevice() {
  const auto& Y = Input(0);
  CAFFE_ENFORCE_EQ(Y.dim(), 1, "Input labels must be a vector.");
  const auto y_data = Y.template data<T>();
  auto treeOutput = Output(0, {1}, at::dtype<string>());

  std::vector<int> labelCounts;
  labelCounts.resize(num_classes_, 0);
  for (int i = 0; i < Y.dim32(0); ++i) {
    // Labels are in range [0, num_classes)
    const int label_index = y_data[i];
    CAFFE_ENFORCE_LT(
        label_index,
        num_classes_,
        "Found an input label ",
        label_index,
        " not in range [",
        0,
        ",",
        num_classes_,
        ")");
    labelCounts[label_index]++;
  }
  std::priority_queue<Node, std::vector<Node>, NodeComparator> nodes;
  std::vector<Node> huffmanTree;
  std::vector<int> labelIndices;
  labelIndices.resize(num_classes_);

  for (int i = 0; i < num_classes_; ++i) {
    Node node(i, labelCounts[i]);
    nodes.push(node);
  }

  // Extract the node with the minimum count and insert it in the tree array.
  auto get_next_node = [&nodes, &huffmanTree, &labelIndices]() {
    auto node = nodes.top();
    int node_index = huffmanTree.size();
    if (node.label != -1) {
      labelIndices[node.label] = node_index;
    }
    nodes.pop();
    huffmanTree.push_back(node);
    return std::pair<int, Node>(node_index, node);
  };

  // Merge two nodes and insert the result in the queue.
  auto merge_nodes = [&nodes](
      const std::pair<int, Node>& node_l, const std::pair<int, Node>& node_r) {
    Node node(-1, node_l.second.count + node_r.second.count);
    node.left_ch_index = node_l.first;
    node.right_ch_index = node_r.first;
    nodes.push(node);
  };

  // Main loop for bottom-up Huffman tree construction.
  while (!nodes.empty()) {
    auto lNode = get_next_node();
    if (!nodes.empty()) {
      auto rNode = get_next_node();
      merge_nodes(lNode, rNode);
    }
  }

  auto is_leaf_node = [&huffmanTree](const int node_index) {
    return huffmanTree[node_index].left_ch_index == -1 &&
        huffmanTree[node_index].right_ch_index == -1;
  };
  auto get_node_label = [&huffmanTree](const int node_index) {
    return huffmanTree[node_index].label;
  };
  // Build the Huffman tree proto.
  int current_offset = 0;
  std::function<void(int, NodeProto*)> build_tree = [&](
      const int node_index, NodeProto* node) {
    if (is_leaf_node(node_index) || node_index == -1) {
      return;
    }
    const int left_ch_index = huffmanTree[node_index].left_ch_index;
    const int right_ch_index = huffmanTree[node_index].right_ch_index;
    if (left_ch_index != -1) {
      if (is_leaf_node(left_ch_index)) {
        node->add_word_ids(get_node_label(left_ch_index));
      } else {
        auto* ch_node = node->add_children();
        ch_node->set_offset(current_offset);
        current_offset += 2;
        build_tree(left_ch_index, ch_node);
      }
    }
    if (right_ch_index != -1) {
      if (is_leaf_node(right_ch_index)) {
        node->add_word_ids(get_node_label(right_ch_index));
        current_offset++;
      } else {
        auto* ch_node = node->add_children();
        ch_node->set_offset(current_offset);
        current_offset += 2;
        build_tree(right_ch_index, ch_node);
      }
    }
  };

  // The last element inserted in the tree array is the root.
  const int rootNodeIndex = huffmanTree.size() - 1;
  NodeProto rootNode;
  rootNode.set_offset(current_offset);
  current_offset += 2;
  build_tree(rootNodeIndex, &rootNode);

  TreeProto treeProto;
  *treeProto.mutable_root_node() = rootNode;
  treeProto.SerializeToString(treeOutput->template mutable_data<string>());
  return true;
}
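
// A worked example (hypothetical counts): four labels {0,1,2,3} with counts
// {4,2,1,1}. The min-count queue merges 1+1 -> 2, then 2+2 -> 4, then
// 4+4 -> 8, which becomes the root. In the resulting TreeProto the root holds
// word_id 0 plus one child subtree; that child holds word_id 1 plus one child
// containing word_ids 2 and 3, so frequent labels sit on short paths.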

namespace {

REGISTER_CPU_OPERATOR(HSoftmax, HSoftmaxOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(HSoftmaxGradient,
                      HSoftmaxGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(HSoftmaxSearch, HSoftmaxSearchOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    HuffmanTreeHierarchy,
    HuffmanTreeHierarchyOp<int64_t, CPUContext>);
OPERATOR_SCHEMA(HSoftmax)
    .NumInputs(4)
    .NumOutputs(2)
    .SetDoc(R"DOC(
Hierarchical softmax is an operator which approximates the softmax operator
while giving significant training speed gains and reasonably comparable
performance. In this operator, instead of calculating the probabilities of all
the classes, we calculate the probability of each step in the path from root to
the target word in the hierarchy.

The operator takes a 2-D tensor (Tensor) containing a batch of input vectors, a
set of parameters represented by the weight matrix and bias terms, and a 1-D
tensor (Tensor) holding labels, i.e. the indices of the target class. The
hierarchy has to be specified as an argument to the operator.

The operator returns a 1-D tensor holding the computed negative log probability
of the target class and a 2-D tensor of intermediate outputs (from the weight
matrix and softmax from each step in the path from root to target class) which
will be used by the gradient operator to compute gradients for all samples in
the batch.
)DOC")
    .Arg(
        "hierarchy",
        "Serialized HierarchyProto string containing a list of "
        "vocabulary words and their paths from the root of the hierarchy to "
        "the leaf")
    .Input(0, "X", "Input data from previous layer")
    .Input(
        1,
        "W",
        "2D blob containing 'stacked' fully connected weight "
        "matrices. Each node in the hierarchy contributes one FC weight matrix "
        "if it has children nodes. Dimension is N*D, where D is the input "
        "dimension of the data (X) and N is the sum of all output dimensions, "
        "i.e. the total number of nodes (excluding the root)")
    .Input(2, "b", "1D blob with N parameters")
    .Input(3, "labels", "int word_id of the target word")
    .Output(0, "Y", "1-D tensor of negative log probability outputs, one per sample")
    .Output(
        1,
        "intermediate_output",
        "Extra blob to store the intermediate "
        "FC and softmax outputs for each node in the hierarchical path of a "
        "word. The outputs from samples are stored in consecutive blocks in "
        "the forward pass and are used in reverse order in the backward "
        "gradientOp pass");
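
// A hedged usage sketch of the schema above via the C++ operator API (the
// blob names and the `serialized_hierarchy` variable are hypothetical; a real
// call needs X/W/b/labels blobs fed into the workspace and a serialized
// HierarchyProto string):
//
//   caffe2::Workspace ws;
//   caffe2::OperatorDef def;
//   def.set_type("HSoftmax");
//   def.add_input("X");       // M x D float
//   def.add_input("W");       // N x D float
//   def.add_input("b");       // N float
//   def.add_input("labels");  // M int
//   def.add_output("Y");
//   def.add_output("intermediate_output");
//   auto* arg = def.add_arg();
//   arg->set_name("hierarchy");
//   arg->set_s(serialized_hierarchy);  // HierarchyProto::SerializeAsString()
//   auto op = caffe2::CreateOperator(def, &ws);
//   op->Run();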

// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-non-const-global-variables)
OPERATOR_SCHEMA(HSoftmaxGradient).NumInputs(6).NumOutputs(4);

class GetHSoftmaxGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "HSoftmaxGradient", "",
        // X, W, b, label, intermediate output, dY
        vector<string>{I(0), I(1), I(2), I(3), O(1), GO(0)},
        // dX, dW, db, dintermediate_output
        vector<string>{GI(0), GI(1), GI(2), GO(1)});
  }
};
REGISTER_GRADIENT(HSoftmax, GetHSoftmaxGradient);

OPERATOR_SCHEMA(HSoftmaxSearch)
    .NumInputs(3)
    .NumOutputs(2)
    .SetDoc(R"DOC(
HSoftmaxSearch is an operator to generate the most probable paths given a
well-trained model and an input vector. A greedy algorithm is used for pruning
the search tree.
)DOC")
    .Arg(
        "tree",
        "Serialized TreeProto string containing a tree "
        "including all intermediate nodes and leaves. All nodes must have "
        "names for correct outputs")
    .Arg(
        "beam",
        "beam used for pruning the tree. The pruning rule is that "
        "only children whose score is smaller than the parent's score plus "
        "the beam will be expanded. ")
    .Arg("topN", "Number of nodes in outputs")
    .Input(0, "X", "Input data from previous layer")
    .Input(1, "W", "The weight matrix trained by the HSoftmax operator")
    .Input(2, "b", "The bias trained by the HSoftmax operator")
    .Output(
        0,
        "Y_names",
        "The names of the selected nodes and leaves. "
        "For nodes, it will be the name defined in the tree. "
        "For leaves, it will be the index of the word in the tree.")
    .Output(1, "Y_scores", "The corresponding scores of Y_names");
SHOULD_NOT_DO_GRADIENT(HSoftmaxSearch);

OPERATOR_SCHEMA(HuffmanTreeHierarchy)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
HuffmanTreeHierarchy is an operator to generate a Huffman tree hierarchy given
the input labels. It returns the tree as a serialized TreeProto.
)DOC")
    .Arg("num_classes", "The number of classes used to build the hierarchy.")
    .Input(0, "Labels", "The labels vector")
    .Output(0, "Hierarch", "Huffman coding hierarchy of the labels");
SHOULD_NOT_DO_GRADIENT(HuffmanTreeHierarchy);
} // namespace
} // namespace caffe2