Commit d953611

Softmax grad op (#3164)
* init softmax grad op
* add compute code
* export Backward to python
* update test, export op.type to python
* update python test, fix compute bug
* update unit test
* use eigen
* optimize eigen code
* add gpu test
* register softmax_grad GPU kernel and fix test bug
* typo
* follow comments
1 parent 809793c commit d953611

File tree: 6 files changed (+147, -30 lines)

  paddle/framework/operator.h
  paddle/operators/softmax_op.cc
  paddle/operators/softmax_op.cu
  paddle/operators/softmax_op.h
  paddle/operators/type_alias.h
  python/paddle/v2/framework/tests/test_softmax_op.py

paddle/framework/operator.h

Lines changed: 4 additions & 0 deletions
@@ -55,6 +55,10 @@ class OperatorBase {
   /// e.g. Variable "x@GRAD" is the gradient of varibale "x".
   static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }

+  static std::string GRAD_VAR_NAME(const std::string& name) {
+    return name + GRAD_VAR_SUFFIX();
+  }
+
   /// Variables with this suffix are supposed to be filled up with zeros.
   static std::string ZERO_VAR_SUFFIX() { return "@ZERO"; }

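The new GRAD_VAR_NAME helper simply appends the existing GRAD_VAR_SUFFIX to a variable name, so SoftmaxOpGrad below can refer to "Y@GRAD" and "X@GRAD" without hard-coding the suffix. A minimal standalone sketch of this naming convention (the two helpers are re-declared here outside the real OperatorBase class, purely for illustration):

#include <cassert>
#include <string>

// Standalone mirror of the OperatorBase helpers (sketch, not the real class).
static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }
static std::string GRAD_VAR_NAME(const std::string& name) {
  return name + GRAD_VAR_SUFFIX();
}

int main() {
  // "Y" maps to "Y@GRAD" and "X" to "X@GRAD", the gradient variable
  // names that SoftmaxOpGrad and the Python test use below.
  assert(GRAD_VAR_NAME("Y") == "Y@GRAD");
  assert(GRAD_VAR_NAME("X") == "X@GRAD");
  return 0;
}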

paddle/operators/softmax_op.cc

Lines changed: 31 additions & 18 deletions
@@ -1,16 +1,17 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at

-http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */

-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
 #include "paddle/operators/softmax_op.h"

 namespace paddle {
@@ -19,12 +20,13 @@ namespace operators {
 class SoftmaxOp : public OperatorWithKernel {
  protected:
   void InferShape(const InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.InputSize() == 1, "Only one input is need for softmax");
-    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims().size() == 2,
+    PADDLE_ENFORCE(ctx.InputSize() == 1UL,
+                   "Only one input is need for softmax");
+    PADDLE_ENFORCE(ctx.Input<Tensor>("X")->dims().size() == 2UL,
                    "The input of softmax op must be matrix");
-    PADDLE_ENFORCE(ctx.OutputSize() == 1,
+    PADDLE_ENFORCE(ctx.OutputSize() == 1UL,
                    "Only one output is need for softmax");
-    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
+    ctx.Output<Tensor>("Y")->Resize(ctx.Input<Tensor>("X")->dims());
   }
 };

@@ -40,16 +42,27 @@ class SoftmaxOpMaker : public OpProtoAndCheckerMaker {

 class SoftmaxOpGrad : public OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {}
-  std::string DebugString() const override {
-    LOG(INFO) << "SoftmaxOpGrad";
-    return "";
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 3UL,
+                   "Input of SoftmaxOpGrad should be 3, X, Y, YG");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1UL,
+                   "Output of SoftmaxOpGrad should be 1");
+    PADDLE_ENFORCE(ctx.InputVar("Y") != nullptr, "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx.InputVar(GRAD_VAR_NAME("Y")) != nullptr,
+                   "Input(Y@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx.Input<Tensor>("Y")->dims() ==
+                       ctx.Input<Tensor>(GRAD_VAR_NAME("Y"))->dims(),
+                   "the shape of Input(0) and Input(1) should be the same");
+    ctx.Output<Tensor>(GRAD_VAR_NAME("X"))
+        ->Resize(ctx.Input<Tensor>("Y")->dims());
   }
 };

 }  // namespace operators
 }  // namespace paddle

 REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
-REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<ops::CPUPlace, float>);
+REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad);
+REGISTER_OP_CPU_KERNEL(softmax_grad,
+                       ops::SoftmaxGradKernel<ops::CPUPlace, float>);

paddle/operators/softmax_op.cu

Lines changed: 1 addition & 0 deletions
@@ -3,3 +3,4 @@
 #include "paddle/operators/softmax_op.h"

 REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel<ops::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(softmax_grad, ops::SoftmaxGradKernel<ops::GPUPlace, float>);

paddle/operators/softmax_op.h

Lines changed: 47 additions & 11 deletions
@@ -1,19 +1,22 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at

-http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0

-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */

 #pragma once

+#include "paddle/framework/ddim.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/tensor.h"
 #include "paddle/operators/type_alias.h"

 namespace paddle {
@@ -23,8 +26,8 @@ template <typename Place, typename T>
 class SoftmaxKernel : public OpKernel {
  public:
   void Compute(const ExecutionContext& context) const override {
-    auto input = context.Input<Tensor>(0);
-    auto output = context.Output<Tensor>(0);
+    auto input = context.Input<Tensor>("X");
+    auto output = context.Output<Tensor>("Y");
     output->mutable_data<T>(context.GetPlace());

     auto logits = EigenMatrix<T>::From(*input);
@@ -57,5 +60,38 @@ class SoftmaxKernel : public OpKernel {
                                    .broadcast(one_by_class));
   }
 };
+
+template <typename Place, typename T>
+class SoftmaxGradKernel : public OpKernel {
+ public:
+  void Compute(const ExecutionContext& context) const override {
+    std::shared_ptr<Tensor> scale_ = std::make_shared<Tensor>();
+
+    auto Y = context.Input<Tensor>("Y");
+    auto dY = context.Input<Tensor>(OperatorBase::GRAD_VAR_NAME("Y"));
+    auto dX = context.Output<Tensor>(OperatorBase::GRAD_VAR_NAME("X"));
+    dX->mutable_data<T>(context.GetPlace());
+
+    const int batch_size = Y->dims()[0];
+    const int class_num = Y->dims()[1];
+
+    Eigen::DSizes<int, 1> along_class(1);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, class_num);
+
+    auto Y_eigen = EigenMatrix<T>::From(*Y);
+    auto dY_eigen = EigenMatrix<T>::From(*dY);
+    auto dX_eigen = EigenMatrix<T>::From(*dX);
+    auto place = context.GetEigenDevice<Place>();
+
+    auto dot = (Y_eigen * dY_eigen)
+                   .sum(along_class)
+                   .eval()
+                   .reshape(batch_by_one)
+                   .broadcast(one_by_class);
+    dX_eigen.device(place) = (dY_eigen - dot) * Y_eigen;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
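For reference, the formula implemented by SoftmaxGradKernel (and mirrored by label_softmax_grad in the Python test below) is the row-wise Jacobian-vector product of softmax. For one row with y = softmax(x) and upstream gradient dy:

\frac{\partial y_j}{\partial x_i} = y_j(\delta_{ij} - y_i)
\qquad\Longrightarrow\qquad
dx_i = \sum_j dy_j \, y_j (\delta_{ij} - y_i)
     = y_i \Big( dy_i - \sum_j y_j \, dy_j \Big)

In other words dX = (dY - dot) * Y, where dot is the per-row sum of Y * dY broadcast back across the class dimension, which is exactly the Eigen expression above.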

paddle/operators/type_alias.h

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ namespace paddle {
 namespace operators {

 using OpKernel = framework::OpKernel;
+using OperatorBase = framework::OperatorBase;
 using InferShapeContext = framework::InferShapeContext;
 using ExecutionContext = framework::ExecutionContext;
 using Variable = framework::Variable;

python/paddle/v2/framework/tests/test_softmax_op.py

Lines changed: 63 additions & 1 deletion
@@ -1,6 +1,10 @@
 import unittest
-from op_test_util import OpTestMeta
+
 import numpy as np
+import paddle.v2.framework.core as core
+import paddle.v2.framework.create_op_creation_methods as creation
+
+from op_test_util import OpTestMeta


 def stable_softmax(x):
@@ -19,5 +23,63 @@ def setUp(self):
         self.Y = np.apply_along_axis(stable_softmax, 1, self.X)


+class TestSoftmaxGradOp(unittest.TestCase):
+    def test_softmax_grad(self):
+        op = creation.op_creations.softmax(X="X", Y="Y")
+        backward_op = core.Operator.backward(op, set())
+        self.assertEqual(backward_op.type(), "softmax_grad")
+        expected = '''Op(softmax_grad), inputs:(X, Y, Y@GRAD), outputs:(X@GRAD).'''
+        self.assertEqual(expected, str(backward_op))
+
+        batch_size = 3
+        class_num = 5
+        # Initialize X and add 1e-2 for numerical stability
+        Y = np.random.rand(batch_size, class_num).astype(np.float32)
+        Y = Y + 1e-2
+        dY = np.random.rand(batch_size, class_num).astype(np.float32)
+
+        # Reference implementation of cross entropy with soft labels
+        def label_softmax_grad(Y, dY):
+            dX = Y * 0.0
+            for i in range(batch_size):
+                d = np.dot(Y[i, :], dY[i, :])
+                dX[i, :] = Y[i, :] * (dY[i, :] - d)
+            return dX
+
+        expected = label_softmax_grad(Y, dY)
+
+        scope = core.Scope()
+        places = []
+        places.append(core.CPUPlace())
+        if core.is_compile_gpu():
+            places.append(core.GPUPlace(0))
+
+        for place in places:
+            y = scope.new_var("Y")
+            y_tensor = y.get_tensor()
+            y_tensor.set_dims([batch_size, class_num])
+            y_tensor.alloc_float(place)
+            y_tensor.set(Y, place)
+
+            dy = scope.new_var("Y@GRAD")
+            dy_tensor = dy.get_tensor()
+            dy_tensor.set_dims([batch_size, class_num])
+            dy_tensor.alloc_float(place)
+            dy_tensor.set(dY, place)
+
+            x = scope.new_var("X")
+            dx = scope.new_var("X@GRAD")
+
+            tensor = scope.find_var("X@GRAD").get_tensor()
+            backward_op.infer_shape(scope)
+            self.assertEqual([batch_size, class_num], tensor.shape())
+
+            ctx = core.DeviceContext.create(place)
+            backward_op.run(scope, ctx)
+            actual = np.array(tensor)
+
+            np.testing.assert_almost_equal(actual, expected, decimal=3)
+
+
 if __name__ == '__main__':
     unittest.main()
