From 3a328371981d91210f229bf41428092869fb1d56 Mon Sep 17 00:00:00 2001
From: ShiYongtao <847615435@qq.com>
Date: Mon, 7 Jun 2021 14:23:11 +0800
Subject: [PATCH 1/7] add mseloss module

---
 docs/source/experimental.rst                |  1 +
 oneflow/python/nn/modules/loss.py           | 96 +++++++++++++++++++++
 oneflow/python/ops/nn_ops.py                |  1 +
 oneflow/python/test/modules/test_mseloss.py | 67 ++++++++++++++
 4 files changed, 165 insertions(+)
 create mode 100644 oneflow/python/test/modules/test_mseloss.py

diff --git a/docs/source/experimental.rst b/docs/source/experimental.rst
index 3383d8c5eef..fbcde93007a 100644
--- a/docs/source/experimental.rst
+++ b/docs/source/experimental.rst
@@ -85,6 +85,7 @@ Experimental features
 .. autofunction:: oneflow.experimental.nn.Linear
 .. autofunction:: oneflow.experimental.nn.CrossEntropyLoss
 .. autofunction:: oneflow.experimental.nn.NLLLoss
+.. autofunction:: oneflow.experimental.nn.MSELoss
 .. autofunction:: oneflow.experimental.masked_fill
 .. autofunction:: oneflow.experimental.Tensor.masked_fill
 .. autofunction:: oneflow.experimental.sum
diff --git a/oneflow/python/nn/modules/loss.py b/oneflow/python/nn/modules/loss.py
index abdc723ac2c..8095162e9d3 100644
--- a/oneflow/python/nn/modules/loss.py
+++ b/oneflow/python/nn/modules/loss.py
@@ -18,6 +18,7 @@
 import oneflow as flow
 from oneflow.python.oneflow_export import oneflow_export, experimental_api
 from oneflow.python.nn.module import Module
+from oneflow.python.nn.modules.math_ops import Subtract, Square, Sum, Mean
 
 
 @oneflow_export("nn.CrossEntropyLoss")
@@ -296,6 +297,101 @@ def forward(self, input, target):
             return res.mean()
 
 
+@oneflow_export("nn.MSELoss")
+@experimental_api
+class MSELoss(Module):
+    r"""The interface is consistent with PyTorch.
+    The documentation is referenced from:
+        https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html?highlight=mseloss#torch.nn.MSELoss
+
+    Creates a criterion that measures the mean squared error (squared L2 norm) between
+    each element in the input :math:`x` and target :math:`y`.
+
+    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
+    .. math::
+        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
+        l_n = \left( x_n - y_n \right)^2,
+
+    where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
+    (default ``'mean'``), then:
+
+    .. math::
+        \ell(x, y) =
+        \begin{cases}
+            \operatorname{mean}(L), &  \text{if reduction} = \text{`mean';}\\
+            \operatorname{sum}(L),  &  \text{if reduction} = \text{`sum'.}
+        \end{cases}
+
+    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
+    of :math:`n` elements each.
+
+    The mean operation still operates over all the elements, and divides by :math:`n`.
+
+    The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
+
+    Args:
+        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
+            the losses are averaged over each loss element in the batch. Note that for
+            some losses, there are multiple elements per sample. If the field :attr:`size_average`
+            is set to ``False``, the losses are instead summed for each minibatch. Ignored
+            when :attr:`reduce` is ``False``. Default: ``True``
+        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
+            losses are averaged or summed over observations for each minibatch depending
+            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
+            batch element instead and ignores :attr:`size_average`. Default: ``True``
+        reduction (string, optional): Specifies the reduction to apply to the output:
+            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
+            ``'mean'``: the sum of the output will be divided by the number of
+            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
+            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
+            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
+    Shape:
+        - Input: :math:`(N, *)` where :math:`*` means, any number of additional
+          dimensions
+        - Target: :math:`(N, *)`, same shape as the input
+
+    For example:
+
+    .. code-block:: python
+
+        >>> import oneflow.experimental as flow
+        >>> import numpy as np
+        >>> flow.enable_eager_execution()
+
+    """
+
+    def __init__(self, reduction: str = "mean", size_average=True, reduce=True) -> None:
+        super().__init__()
+        if size_average is not None and not size_average:
+            raise ValueError("Argument size_average is not supported yet")
+        if reduce is not None and not reduce:
+            raise ValueError("Argument reduce is not supported yet")
+        assert reduction in [
+            "sum",
+            "none",
+            "mean",
+            None,
+        ], "only 'sum', 'mean' and None supported by now"
+
+        self.reduction = reduction
+        self.square_op = Square()
+        self.subtract_op = Subtract()
+        self.sum_op = Sum()
+        self.mean_op = Mean()
+
+    def forward(self, input, target):
+        mean_squared_difference = self.square_op(self.subtract_op(input, target))
+        if self.reduction == "mean":
+            return self.mean_op(mean_squared_difference)
+        elif self.reduction == "sum":
+            return self.sum_op(mean_squared_difference)
+        else:
+            # reduction is "none" or None: return the unreduced element-wise loss
+            return mean_squared_difference
+
+
 if __name__ == "__main__":
     import doctest
 
diff --git a/oneflow/python/ops/nn_ops.py b/oneflow/python/ops/nn_ops.py
index 44a94feaa07..e0c354421d5 100644
--- a/oneflow/python/ops/nn_ops.py
+++ b/oneflow/python/ops/nn_ops.py
@@ -3917,6 +3917,7 @@ def bce_with_logits_loss_job(input: tp.Numpy.Placeholder(shape=(2, 3)),
 
 
 @oneflow_export("nn.MSELoss")
+@stable_api
 def mse_loss(
     input: oneflow._oneflow_internal.BlobDesc,
     target: oneflow._oneflow_internal.BlobDesc,
diff --git a/oneflow/python/test/modules/test_mseloss.py b/oneflow/python/test/modules/test_mseloss.py
new file mode 100644
index 00000000000..3e2e9ad68c5
--- /dev/null
+++ b/oneflow/python/test/modules/test_mseloss.py
@@ -0,0 +1,67 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+
+import oneflow.experimental as flow
+from test_util import GenArgList
+
+
+def np_mseloss(np_input, np_target):
+    np_mse = np.square(np_target - np_input)
+    np_mse_mean = np.mean(np_mse)
+    np_mse_sum = np.sum(np_mse)
+
+    return {
+        "none": np_mse,
+        "mean": np_mse_mean,
+        "sum": np_mse_sum,
+    }
+
+
+def _test_mseloss(test_case, device, reduction):
+    x = np.random.randn(2, 3)
+    y = np.random.randn(2, 3)
+    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+
+    loss = flow.nn.MSELoss(reduction=reduction)
+    loss = loss.to(device)
+    of_out = loss(input, target)
+    np_out = np_mseloss(x, y)[reduction]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-4, 1e-4))
+
+
+@unittest.skipIf(
+    not flow.unittest.env.eager_execution_enabled(),
+    ".numpy() doesn't work in lazy mode",
+)
+class TestMSELossModule(flow.unittest.TestCase):
+    def test_mseloss(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [
+            _test_mseloss,
+        ]
+        arg_dict["device"] = ["cpu"]
+        arg_dict["reduction"] = ["none"]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()

From a4a4e8880b698160add4466eb2d69fa7ac883dbb Mon Sep 17 00:00:00 2001
From: ShiYongtao <847615435@qq.com>
Date: Mon, 7 Jun 2021 17:37:00 +0800
Subject: [PATCH 2/7] add mseloss testcase

---
 oneflow/python/nn/modules/loss.py           | 21 ++++++
 oneflow/python/test/modules/test_mseloss.py | 81 ++++++++++++++++++---
 2 files changed, 92 insertions(+), 10 deletions(-)

diff --git a/oneflow/python/nn/modules/loss.py b/oneflow/python/nn/modules/loss.py
index 8095162e9d3..e0efb72ae46 100644
--- a/oneflow/python/nn/modules/loss.py
+++ b/oneflow/python/nn/modules/loss.py
@@ -360,6 +360,27 @@ class MSELoss(Module):
         >>> import numpy as np
         >>> flow.enable_eager_execution()
 
+        >>> input = flow.Tensor(
+        ... [[-0.02557137, 0.03101675, 1.37493674],
+        ... [0.25599439, -1.08372561, -0.21006816]], dtype=flow.float32)
+        >>> #1111
+        >>> target = flow.Tensor(
+        ... [[-1.53105064, -0.68137555, 0.5931354],
+        ... [-0.49158347, 0.93673637, 0.1324141]], dtype=flow.float32)
+        >>> m = flow.nn.MSELoss(reduction="none")
+        >>> out = m(input, target)
+        >>> print(out.numpy())
+        [[2.266468   0.50750285 0.61121327]
+         [0.55887264 4.082267   0.1172941 ]]
+        >>> m = flow.nn.MSELoss(reduction="mean")
+        >>> out = m(input, target)
+        >>> print(out.numpy())
+        [1.3572696]
+        >>> m = flow.nn.MSELoss(reduction="sum")
+        >>> out = m(input, target)
+        >>> print(out.numpy())
+        [8.143618]
+
     """
 
     def __init__(self, reduction: str = "mean", size_average=True, reduce=True) -> None:
diff --git a/oneflow/python/test/modules/test_mseloss.py b/oneflow/python/test/modules/test_mseloss.py
index 3e2e9ad68c5..81677c67d42 100644
--- a/oneflow/python/test/modules/test_mseloss.py
+++ b/oneflow/python/test/modules/test_mseloss.py
@@ -22,7 +22,7 @@
 from test_util import GenArgList
 
 
-def np_mseloss(np_input, np_target):
+def _np_mseloss(np_input, np_target):
     np_mse = np.square(np_target - np_input)
     np_mse_mean = np.mean(np_mse)
     np_mse_sum = np.sum(np_mse)
@@ -34,17 +34,76 @@ def np_mseloss(np_input, np_target):
     }
 
 
-def _test_mseloss(test_case, device, reduction):
-    x = np.random.randn(2, 3)
-    y = np.random.randn(2, 3)
-    input = flow.Tensor(x, dtype=flow.float32, device=flow.device(device))
+def _np_mseloss_grad(np_input, np_target):
+    elem_cnt = np_input.size
+    np_mse_grad_sum = -2 * (np_target - np_input)
+    np_mse_grad_mean = np_mse_grad_sum / elem_cnt
+
+    return {
+        "none": np_mse_grad_sum,
+        "mean": np_mse_grad_mean,
+        "sum": np_mse_grad_sum,
+    }
+
+
+def _test_mseloss_backward(test_case, device, reduction):
+    x = np.random.randn(3, 5)
+    y = np.random.randn(3, 5)
+    input = flow.Tensor(
+        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+
+    loss = flow.nn.MSELoss(reduction=reduction)
+    loss = loss.to(device)
+    of_out = loss(input, target)
+    np_out = _np_mseloss(x, y)[reduction]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-5, 1e-5))
+
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = _np_mseloss_grad(x, y)[reduction]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
+
+
+def _test_mseloss_high_dim_input_backward(test_case, device, reduction):
+    x = np.random.randn(3, 2, 4, 16, 5)
+    y = np.random.randn(3, 2, 4, 16, 5)
+    input = flow.Tensor(
+        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
     target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
 
     loss = flow.nn.MSELoss(reduction=reduction)
     loss = loss.to(device)
     of_out = loss(input, target)
-    np_out = np_mseloss(x, y)[reduction]
-    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-4, 1e-4))
+    np_out = _np_mseloss(x, y)[reduction]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-5, 1e-5))
+
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = _np_mseloss_grad(x, y)[reduction]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
+
+
+def _test_mseloss_one_elem_input_backward(test_case, device, reduction):
+    x = np.array([0]).astype(np.float)
+    y = np.array([-1]).astype(np.float)
+    input = flow.Tensor(
+        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
+    )
+    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
+
+    loss = flow.nn.MSELoss(reduction=reduction)
+    loss = loss.to(device)
+    of_out = loss(input, target)
+    np_out = _np_mseloss(x, y)[reduction]
+    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-5, 1e-5))
+
+    of_out = of_out.sum()
+    of_out.backward()
+    np_grad = _np_mseloss_grad(x, y)[reduction]
+    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
 
 
 @unittest.skipIf(
@@ -55,10 +114,12 @@ class TestMSELossModule(flow.unittest.TestCase):
     def test_mseloss(test_case):
         arg_dict = OrderedDict()
         arg_dict["test_fun"] = [
-            _test_mseloss,
+            _test_mseloss_backward,
+            _test_mseloss_high_dim_input_backward,
+            _test_mseloss_one_elem_input_backward,
         ]
-        arg_dict["device"] = ["cpu"]
-        arg_dict["reduction"] = ["none"]
+        arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["reduction"] = ["none", "mean", "sum"]
         for arg in GenArgList(arg_dict):
             arg[0](test_case, *arg[1:])
 

From 7da3e82cca9ff3fbcdfe5eb9348f7c18c1aff242 Mon Sep 17 00:00:00 2001
From: ShiYongtao <847615435@qq.com>
Date: Tue, 8 Jun 2021 09:45:47 +0800
Subject: [PATCH 3/7] delete debug code

---
 oneflow/python/nn/modules/loss.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/oneflow/python/nn/modules/loss.py b/oneflow/python/nn/modules/loss.py
index e0efb72ae46..ef7ec33dd10 100644
--- a/oneflow/python/nn/modules/loss.py
+++ b/oneflow/python/nn/modules/loss.py
@@ -363,7 +363,6 @@ class MSELoss(Module):
         >>> input = flow.Tensor(
         ... [[-0.02557137, 0.03101675, 1.37493674],
         ... [0.25599439, -1.08372561, -0.21006816]], dtype=flow.float32)
-        >>> #1111
         >>> target = flow.Tensor(
         ... [[-1.53105064, -0.68137555, 0.5931354],
         ... [-0.49158347, 0.93673637, 0.1324141]], dtype=flow.float32)

From f5982f0339375c2f8bef2bbdc84f43b6d0a8d69e Mon Sep 17 00:00:00 2001
From: ShiYongtao <847615435@qq.com>
Date: Tue, 8 Jun 2021 10:25:33 +0800
Subject: [PATCH 4/7] parameterize mseloss testcase over input shapes

---
 oneflow/python/nn/modules/loss.py           |  2 +-
 oneflow/python/test/modules/test_mseloss.py | 55 ++++-----------------
 2 files changed, 11 insertions(+), 46 deletions(-)

diff --git a/oneflow/python/nn/modules/loss.py b/oneflow/python/nn/modules/loss.py
index ef7ec33dd10..af790c7fba1 100644
--- a/oneflow/python/nn/modules/loss.py
+++ b/oneflow/python/nn/modules/loss.py
@@ -393,7 +393,7 @@ def __init__(self, reduction: str = "mean", size_average=True, reduce=True) -> N
             "none",
             "mean",
             None,
-        ], "only 'sum', 'mean' and None supported by now"
+        ], "reduction parameter only support 'sum'/'mean'/'none'/None for now!"
 
         self.reduction = reduction
         self.square_op = Square()
diff --git a/oneflow/python/test/modules/test_mseloss.py b/oneflow/python/test/modules/test_mseloss.py
index 81677c67d42..8621b181b38 100644
--- a/oneflow/python/test/modules/test_mseloss.py
+++ b/oneflow/python/test/modules/test_mseloss.py
@@ -46,49 +46,9 @@ def _np_mseloss_grad(np_input, np_target):
     }
 
 
-def _test_mseloss_backward(test_case, device, reduction):
-    x = np.random.randn(3, 5)
-    y = np.random.randn(3, 5)
-    input = flow.Tensor(
-        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
-    )
-    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
-
-    loss = flow.nn.MSELoss(reduction=reduction)
-    loss = loss.to(device)
-    of_out = loss(input, target)
-    np_out = _np_mseloss(x, y)[reduction]
-    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-5, 1e-5))
-
-    of_out = of_out.sum()
-    of_out.backward()
-    np_grad = _np_mseloss_grad(x, y)[reduction]
-    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
-
-
-def _test_mseloss_high_dim_input_backward(test_case, device, reduction):
-    x = np.random.randn(3, 2, 4, 16, 5)
-    y = np.random.randn(3, 2, 4, 16, 5)
-    input = flow.Tensor(
-        x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
-    )
-    target = flow.Tensor(y, dtype=flow.float32, device=flow.device(device))
-
-    loss = flow.nn.MSELoss(reduction=reduction)
-    loss = loss.to(device)
-    of_out = loss(input, target)
-    np_out = _np_mseloss(x, y)[reduction]
-    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-5, 1e-5))
-
-    of_out = of_out.sum()
-    of_out.backward()
-    np_grad = _np_mseloss_grad(x, y)[reduction]
-    test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
-
-
-def _test_mseloss_one_elem_input_backward(test_case, device, reduction):
-    x = np.array([0]).astype(np.float)
-    y = np.array([-1]).astype(np.float)
+def _test_mseloss_backward(test_case, device, shape, reduction):
+    x = np.random.randn(*shape)
+    y = np.random.randn(*shape)
     input = flow.Tensor(
         x, dtype=flow.float32, requires_grad=True, device=flow.device(device)
     )
@@ -115,10 +75,15 @@ def test_mseloss(test_case):
         arg_dict = OrderedDict()
         arg_dict["test_fun"] = [
             _test_mseloss_backward,
-            _test_mseloss_high_dim_input_backward,
-            _test_mseloss_one_elem_input_backward,
         ]
         arg_dict["device"] = ["cpu", "cuda"]
+        arg_dict["shape"] = [
+            (3, 5),
+            (10, 9, 21),
+            (14, 22, 9, 21),
+            (3, 2, 4, 16, 5),
+            (1,),
+        ]
         arg_dict["reduction"] = ["none", "mean", "sum"]
         for arg in GenArgList(arg_dict):
             arg[0](test_case, *arg[1:])

From e54a270468c52c2544612ec7ff0357630f89c612 Mon Sep 17 00:00:00 2001
From: ShiYongtao <847615435@qq.com>
Date: Tue, 8 Jun 2021 10:36:21 +0800
Subject: [PATCH 5/7] rename mseloss testcase

---
 oneflow/python/test/modules/test_mseloss.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/oneflow/python/test/modules/test_mseloss.py b/oneflow/python/test/modules/test_mseloss.py
index 8621b181b38..3f21e55b411 100644
--- a/oneflow/python/test/modules/test_mseloss.py
+++ b/oneflow/python/test/modules/test_mseloss.py
@@ -46,7 +46,7 @@ def _np_mseloss_grad(np_input, np_target):
     }
 
 
-def _test_mseloss_backward(test_case, device, shape, reduction):
+def _test_mseloss_impl(test_case, device, shape, reduction):
     x = np.random.randn(*shape)
     y = np.random.randn(*shape)
     input = flow.Tensor(
@@ -74,7 +74,7 @@ class TestMSELossModule(flow.unittest.TestCase):
     def test_mseloss(test_case):
         arg_dict = OrderedDict()
         arg_dict["test_fun"] = [
-            _test_mseloss_backward,
+            _test_mseloss_impl,
         ]
         arg_dict["device"] = ["cpu", "cuda"]
         arg_dict["shape"] = [

From 65dcf77baa15d947a11e0c21187ae0bb3270cb26 Mon Sep 17 00:00:00 2001
From: ShiYongtao <847615435@qq.com>
Date: Tue, 8 Jun 2021 13:58:52 +0800
Subject: [PATCH 6/7] fix docstring warning

---
 oneflow/python/nn/modules/loss.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/oneflow/python/nn/modules/loss.py b/oneflow/python/nn/modules/loss.py
index af790c7fba1..e2fd600e3ca 100644
--- a/oneflow/python/nn/modules/loss.py
+++ b/oneflow/python/nn/modules/loss.py
@@ -16,6 +16,7 @@
 from typing import Optional
 
 import oneflow as flow
+from oneflow.python.framework.tensor import Tensor
 from oneflow.python.oneflow_export import oneflow_export, experimental_api
 from oneflow.python.nn.module import Module
 from oneflow.python.nn.modules.math_ops import Subtract, Square, Sum, Mean
@@ -302,7 +303,7 @@ def forward(self, input, target):
 class MSELoss(Module):
     r"""The interface is consistent with PyTorch.
     The documentation is referenced from:
-        https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html?highlight=mseloss#torch.nn.MSELoss
+    https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html?highlight=mseloss#torch.nn.MSELoss
 
     Creates a criterion that measures the mean squared error (squared L2 norm) between
     each element in the input :math:`x` and target :math:`y`.
@@ -382,18 +383,20 @@ class MSELoss(Module):
 
     """
 
-    def __init__(self, reduction: str = "mean", size_average=True, reduce=True) -> None:
+    def __init__(
+        self, reduction: str = "mean", size_average: bool = True, reduce: bool = True
+    ) -> None:
         super().__init__()
-        if size_average is not None and not size_average:
+        if size_average is False:
             raise ValueError("Argument size_average is not supported yet")
-        if reduce is not None and not reduce:
+        if reduce is False:
             raise ValueError("Argument reduce is not supported yet")
         assert reduction in [
             "sum",
             "none",
             "mean",
             None,
-        ], "reduction parameter only support 'sum'/'mean'/'none'/None for now!"
+        ], "Argument reduction only support 'sum'/'mean'/'none'/None for now!"
 
         self.reduction = reduction
         self.square_op = Square()
@@ -401,7 +404,7 @@ def __init__(self, reduction: str = "mean", size_average=True, reduce=True) -> N
         self.sum_op = Sum()
         self.mean_op = Mean()
 
-    def forward(self, input, target):
+    def forward(self, input: Tensor, target: Tensor) -> Tensor:
         mean_squared_difference = self.square_op(self.subtract_op(input, target))
         if self.reduction == "mean":
             return self.mean_op(mean_squared_difference)

From c36e3e164322ad3681c79c026ebecadbb2be6e6a Mon Sep 17 00:00:00 2001
From: ShiYongtao <847615435@qq.com>
Date: Tue, 8 Jun 2021 18:50:00 +0800
Subject: [PATCH 7/7] format docstring

---
 oneflow/python/nn/modules/loss.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/oneflow/python/nn/modules/loss.py b/oneflow/python/nn/modules/loss.py
index ca7d54f2d37..2a92f8c6ba3 100644
--- a/oneflow/python/nn/modules/loss.py
+++ b/oneflow/python/nn/modules/loss.py
@@ -304,24 +304,33 @@ class MSELoss(Module):
     r"""The interface is consistent with PyTorch.
     The documentation is referenced from:
     https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html?highlight=mseloss#torch.nn.MSELoss
+
     Creates a criterion that measures the mean squared error (squared L2 norm) between
     each element in the input :math:`x` and target :math:`y`.
+
     The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
+
     .. math::
         \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
         l_n = \left( x_n - y_n \right)^2,
+
     where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
     (default ``'mean'``), then:
+
     .. math::
         \ell(x, y) =
         \begin{cases}
             \operatorname{mean}(L), &  \text{if reduction} = \text{`mean';}\\
             \operatorname{sum}(L),  &  \text{if reduction} = \text{`sum'.}
         \end{cases}
+
     :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
     of :math:`n` elements each.
+
     The mean operation still operates over all the elements, and divides by :math:`n`.
+
     The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
+
     Args:
         size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
             the losses are averaged over each loss element in the batch. Note that for
@@ -338,15 +347,20 @@ class MSELoss(Module):
             elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
             and :attr:`reduce` are in the process of being deprecated, and in the meantime,
             specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
+
     Shape:
         - Input: :math:`(N, *)` where :math:`*` means, any number of additional
           dimensions
         - Target: :math:`(N, *)`, same shape as the input
+
     For example:
+
     .. code-block:: python
+
         >>> import oneflow.experimental as flow
         >>> import numpy as np
         >>> flow.enable_eager_execution()
+
         >>> input = flow.Tensor(
         ... [[-0.02557137, 0.03101675, 1.37493674],
         ... [0.25599439, -1.08372561, -0.21006816]], dtype=flow.float32)
@@ -366,6 +380,7 @@ class MSELoss(Module):
         >>> out = m(input, target)
         >>> print(out.numpy())
         [8.143618]
+
     """
 
     def __init__(