diff --git a/python/paddle/distribution/kl.py b/python/paddle/distribution/kl.py index 6a0838e588c41..ce3b828eaebad 100644 --- a/python/paddle/distribution/kl.py +++ b/python/paddle/distribution/kl.py @@ -38,11 +38,11 @@ def kl_divergence(p, q): KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x Args: - p (Distribution): ``Distribution`` object. - q (Distribution): ``Distribution`` object. + p (Distribution): ``Distribution`` object. Inherits from the Distribution Base class. + q (Distribution): ``Distribution`` object. Inherits from the Distribution Base class. Returns: - Tensor: Batchwise KL-divergence between distribution p and q. + Tensor, Batchwise KL-divergence between distribution p and q. Examples: @@ -71,8 +71,8 @@ def register_kl(cls_p, cls_q): implemention funciton by the decorator. Args: - cls_p(Distribution): Subclass derived from ``Distribution``. - cls_q(Distribution): Subclass derived from ``Distribution``. + cls_p (Distribution): The Distribution type of Instance p. Subclass derived from ``Distribution``. + cls_q (Distribution): The Distribution type of Instance q. Subclass derived from ``Distribution``. Examples: .. code-block:: python diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py index 8a9e5cd7372a7..c9235dc940665 100644 --- a/python/paddle/distribution/normal.py +++ b/python/paddle/distribution/normal.py @@ -36,7 +36,7 @@ class Normal(distribution.Distribution): .. math:: - pdf(x; \mu, \sigma) = \\frac{1}{Z}e^{\\frac {-0.5 (x - \mu)^2} {\sigma^2} } + pdf(x; \mu, \sigma) = \frac{1}{Z}e^{\frac {-0.5 (x - \mu)^2} {\sigma^2} } .. math:: @@ -49,43 +49,43 @@ class Normal(distribution.Distribution): * :math:`Z`: is the normalization constant. Args: - loc(int|float|list|tuple|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. - scale(int|float|list|tuple|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. + loc(int|float|list|tuple|numpy.ndarray|Tensor): The mean of normal distribution.The data type is float32 and float64. + scale(int|float|list|tuple|numpy.ndarray|Tensor): The std of normal distribution.The data type is float32 and float64. name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Examples: .. code-block:: python - import paddle - from paddle.distribution import Normal - - # Define a single scalar Normal distribution. - dist = Normal(loc=0., scale=3.) - # Define a batch of two scalar valued Normals. - # The first has mean 1 and standard deviation 11, the second 2 and 22. - dist = Normal(loc=[1., 2.], scale=[11., 22.]) - # Get 3 samples, returning a 3 x 2 tensor. - dist.sample([3]) - - # Define a batch of two scalar valued Normals. - # Both have mean 1, but different standard deviations. 
- dist = Normal(loc=1., scale=[11., 22.]) - - # Complete example - value_tensor = paddle.to_tensor([0.8], dtype="float32") - - normal_a = Normal([0.], [1.]) - normal_b = Normal([0.5], [2.]) - sample = normal_a.sample([2]) - # a random tensor created by normal distribution with shape: [2, 1] - entropy = normal_a.entropy() - # [1.4189385] with shape: [1] - lp = normal_a.log_prob(value_tensor) - # [-1.2389386] with shape: [1] - p = normal_a.probs(value_tensor) - # [0.28969154] with shape: [1] - kl = normal_a.kl_divergence(normal_b) - # [0.34939718] with shape: [1] + import paddle + from paddle.distribution import Normal + + # Define a single scalar Normal distribution. + dist = Normal(loc=0., scale=3.) + # Define a batch of two scalar valued Normals. + # The first has mean 1 and standard deviation 11, the second 2 and 22. + dist = Normal(loc=[1., 2.], scale=[11., 22.]) + # Get 3 samples, returning a 3 x 2 tensor. + dist.sample([3]) + + # Define a batch of two scalar valued Normals. + # Both have mean 1, but different standard deviations. + dist = Normal(loc=1., scale=[11., 22.]) + + # Complete example + value_tensor = paddle.to_tensor([0.8], dtype="float32") + + normal_a = Normal([0.], [1.]) + normal_b = Normal([0.5], [2.]) + sample = normal_a.sample([2]) + # a random tensor created by normal distribution with shape: [2, 1] + entropy = normal_a.entropy() + # [1.4189385] with shape: [1] + lp = normal_a.log_prob(value_tensor) + # [-1.2389386] with shape: [1] + p = normal_a.probs(value_tensor) + # [0.28969154] with shape: [1] + kl = normal_a.kl_divergence(normal_b) + # [0.34939718] with shape: [1] """ def __init__(self, loc, scale, name=None): @@ -132,11 +132,11 @@ def sample(self, shape, seed=0): """Generate samples of the specified shape. Args: - shape (list): 1D `int32`. Shape of the generated samples. - seed (int): Python integer number. + shape (list): 1D `int32`. Shape of the generated samples. + seed (int): Python integer number. Returns: - Tensor: A tensor with prepended dimensions shape.The data type is float32. + Tensor, A tensor with prepended dimensions shape.The data type is float32. """ if not _non_static_mode(): @@ -177,14 +177,14 @@ def entropy(self): .. math:: - entropy(\sigma) = 0.5 \\log (2 \pi e \sigma^2) + entropy(\sigma) = 0.5 \log (2 \pi e \sigma^2) In the above equation: * :math:`scale = \sigma`: is the std. Returns: - Tensor: Shannon entropy of normal distribution.The data type is float32. + Tensor, Shannon entropy of normal distribution.The data type is float32. """ name = self.name + '_entropy' @@ -221,10 +221,10 @@ def probs(self, value): """Probability density/mass function. Args: - value (Tensor): The input tensor. + value (Tensor): The input tensor. Returns: - Tensor: probability.The data type is same with value. + Tensor, probability. The data type is same with value. """ name = self.name + '_probs' @@ -243,11 +243,11 @@ def kl_divergence(self, other): .. math:: - KL\_divergence(\mu_0, \sigma_0; \mu_1, \sigma_1) = 0.5 (ratio^2 + (\\frac{diff}{\sigma_1})^2 - 1 - 2 \\ln {ratio}) + KL\_divergence(\mu_0, \sigma_0; \mu_1, \sigma_1) = 0.5 (ratio^2 + (\frac{diff}{\sigma_1})^2 - 1 - 2 \ln {ratio}) .. math:: - ratio = \\frac{\sigma_0}{\sigma_1} + ratio = \frac{\sigma_0}{\sigma_1} .. math:: @@ -266,7 +266,7 @@ def kl_divergence(self, other): other (Normal): instance of Normal. Returns: - Tensor: kl-divergence between two normal distributions.The data type is float32. + Tensor, kl-divergence between two normal distributions.The data type is float32. 
""" if not _non_static_mode(): diff --git a/python/paddle/distribution/transform.py b/python/paddle/distribution/transform.py index efa3248965157..890b7c737aa71 100644 --- a/python/paddle/distribution/transform.py +++ b/python/paddle/distribution/transform.py @@ -58,7 +58,7 @@ class Transform(object): Suppose :math:`X` is a K-dimensional random variable with probability density function :math:`p_X(x)`. A new random variable :math:`Y = f(X)` may be defined by transforming :math:`X` with a suitably well-behaved funciton - :math:`f`. It suffices for what follows to note that if f is one-to-one and + :math:`f`. It suffices for what follows to note that if `f` is one-to-one and its inverse :math:`f^{-1}` have a well-defined Jacobian, then the density of :math:`Y` is @@ -1001,8 +1001,9 @@ class StackTransform(Transform): specific axis. Args: - transforms(Sequence[Transform]): The sequence of transformations. - axis(int): The axis along which will be transformed. + transforms (Sequence[Transform]): The sequence of transformations. + axis (int, optional): The axis along which will be transformed. default + value is 0. Examples: @@ -1010,7 +1011,6 @@ class StackTransform(Transform): import paddle - x = paddle.stack( (paddle.to_tensor([1., 2., 3.]), paddle.to_tensor([1, 2., 3.])), 1) t = paddle.distribution.StackTransform( @@ -1023,11 +1023,13 @@ class StackTransform(Transform): # [[2.71828175 , 1. ], # [7.38905621 , 4. ], # [20.08553696, 9. ]]) + print(t.inverse(t.forward(x))) # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, # [[1., 1.], # [2., 2.], # [3., 3.]]) + print(t.forward_log_det_jacobian(x)) # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, # [[1. , 0.69314718], diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py index 7c085da315686..961d846a527c6 100644 --- a/python/paddle/distribution/uniform.py +++ b/python/paddle/distribution/uniform.py @@ -37,7 +37,7 @@ class Uniform(distribution.Distribution): .. math:: - pdf(x; a, b) = \\frac{1}{Z}, \ a <=x 1 : scale_factor = (in_size-1.0)/(out_size-1.0) else: scale_factor = float(in_size/out_size) - Linear interpolation: + # Linear interpolation: if: align_corners = False , align_mode = 0 input : (N,C,W_in) @@ -243,7 +245,7 @@ def interpolate(x, output: (N,C,W_out) where: W_out = W_{in} * scale_{factor} - Nearest neighbor interpolation: + # Nearest neighbor interpolation: align_corners = False input : (N,C,H_in,W_in) @@ -251,7 +253,7 @@ def interpolate(x, H_out = floor (H_{in} * scale_{factor}) W_out = floor (W_{in} * scale_{factor}) - Bilinear interpolation: + # Bilinear interpolation: if: align_corners = False , align_mode = 0 input : (N,C,H_in,W_in) @@ -264,7 +266,7 @@ def interpolate(x, H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - Bicubic interpolation: + # Bicubic interpolation: if: align_corners = False input : (N,C,H_in,W_in) @@ -277,7 +279,7 @@ def interpolate(x, H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - Trilinear interpolation: + # Trilinear interpolation: if: align_corners = False , align_mode = 0 input : (N,C,D_in,H_in,W_in) @@ -907,15 +909,16 @@ def dropout(x, training (bool, optional): A flag indicating whether it is in train phrase or not. Default True. mode(str, optional): ['upscale_in_train'(default) | 'downscale_in_infer']. - 1. upscale_in_train(default), upscale the output at training time + 1. 
upscale_in_train(default), upscale the output at training time + + - train: out = input * mask / ( 1.0 - dropout_prob ) + - inference: out = input - - train: out = input * mask / ( 1.0 - dropout_prob ) - - inference: out = input + 2. downscale_in_infer, downscale the output at inference - 2. downscale_in_infer, downscale the output at inference + - train: out = input * mask + - inference: out = input * (1.0 - dropout_prob) - - train: out = input * mask - - inference: out = input * (1.0 - dropout_prob) name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1776,12 +1779,12 @@ def linear(x, weight, bias=None, name=None): def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): r""" Label smoothing is a mechanism to regularize the classifier layer and is called - label-smoothing regularization (LSR). + label-smoothing regularization (LSR).Label smoothing is proposed to encourage + the model to be less confident, since optimizing the log-likelihood of the + correct label directly may cause overfitting and reduce the ability of the + model to adapt. - Label smoothing is proposed to encourage the model to be less confident, - since optimizing the log-likelihood of the correct label directly may - cause overfitting and reduce the ability of the model to adapt. Label - smoothing replaces the ground-truth label :math:`y` with the weighted sum + Label smoothing replaces the ground-truth label :math:`y` with the weighted sum of itself and some fixed distribution :math:`\mu`. For class :math:`k`, i.e. diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 3f5637fa3922b..3e89ef519e9d4 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -854,15 +854,18 @@ def hsigmoid_loss(input, """ The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity and speed up the model training, especially the training of language model. + Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on the path, and sum them to get a total cost. - Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` + + Comparing to softmax, hsigmoid can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` represents the number of classes or the size of word dict. - The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural - Network Language Model `_. For the custom - tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): + The API supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural + Network Language Model `_. + + For the custom tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. @@ -1731,9 +1734,7 @@ def margin_cross_entropy(logits, .. 
hint:: The API supports single GPU and multi GPU, and don't supports CPU. - For data parallel mode, set ``group=False``. - For model parallel mode, set ``group=None`` or the group instance return by paddle.distributed.new_group. And logits.shape[-1] can be different at each rank. @@ -1756,12 +1757,12 @@ def margin_cross_entropy(logits, Default value is `'mean'`. Returns: - ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \ - `return_softmax` is False, otherwise the tuple \ - (loss, softmax), softmax is shard_softmax when \ - using model parallel, otherwise softmax is in \ - the same shape with input logits. If ``reduction == None``, \ - the shape of loss is ``[N, 1]``, otherwise the shape is ``[1]``. + Tensor|tuple[Tensor, Tensor], return the cross entropy loss if + `return_softmax` is False, otherwise the tuple (loss, softmax), + softmax is shard_softmax when using model parallel, otherwise + softmax is in the same shape with input logits. If + ``reduction == None``, the shape of loss is ``[N, 1]``, otherwise + the shape is ``[1]``. Examples: diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 03ba72fdda344..f278ad22244ff 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -31,7 +31,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): r""" - This op normalizes ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes + Normalize ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes .. math:: @@ -45,7 +45,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): Parameters: x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64. - p (float|int, optional): The exponent value in the norm formulation. Default: 2 + p (float|int, optional): The exponent value in the norm formulation. Default: 2. axis (int, optional): The axis on which to apply normalization. If `axis < 0`, the dimension to normalization is `x.ndim + axis`. -1 is the last dimension. epsilon (float, optional): Small float added to denominator to avoid dividing by zero. Default is 1e-12. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 847ba013a0f38..7f7a18d0a2ed2 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -124,7 +124,7 @@ def grid_sample(x, align_corners=True, name=None): """ - This operation samples input X by using bilinear interpolation or + Sample input X by using bilinear interpolation or nearest interpolation based on flow field grid, which is usually generated by :code:`affine_grid` . When the input X is 4-D Tensor, the grid of shape [N, H, W, 2] is the concatenation of (x, y) @@ -209,6 +209,7 @@ def grid_sample(x, None by default. Returns: + Tensor, The shape of output is [N, C, grid_H, grid_W] or [N, C, grid_D, grid_H, grid_W] in which `grid_D` is the depth of grid, `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor. 
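A minimal usage sketch of ``paddle.nn.functional.grid_sample`` (not part of the patch hunks above) illustrating the return shape that the edited docstring describes; the tensor sizes below are illustrative assumptions only.

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    # Input feature map with shape [N, C, H, W].
    x = paddle.rand([1, 3, 8, 8])
    # Sampling grid with shape [N, grid_H, grid_W, 2]; coordinates normalized to [-1, 1].
    grid = paddle.rand([1, 5, 6, 2]) * 2 - 1

    out = F.grid_sample(x, grid, mode='bilinear',
                        padding_mode='zeros', align_corners=True)
    print(out.shape)  # [1, 3, 5, 6], i.e. [N, C, grid_H, grid_W] as documented
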
diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 785e3bc1c2dd7..dcfeebcdc32f9 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -132,12 +132,12 @@ def make_scheduler(*, skip_first(int, optional): The number of first steps to drop, not participate in the state transform, and at ProfilerState.CLOSED state. Default value is 0. Returns: - A scheduler function, conforms to above state transform setting. The function will takes one parameter step_num, and returns corresponding ProfilerState. + A scheduler function, conforms to above state transform setting. The function will takes one parameter `step_num`, and returns corresponding ProfilerState. Examples: - 1. profiling range [2, 5] + 1. profiling range [2, 5]. - Assume batch 0: closed, batch 1: ready, batch [2, 5] record + Assume batch 0: closed, batch 1: ready, batch [2, 5] record. .. code-block:: python :name: code-example1 @@ -146,9 +146,9 @@ def make_scheduler(*, profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1) - 2. profiling range [3,6], [9,12], [15,18]... + 2. profiling range [3,6], [9,12], [15,18]. - Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat + Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat. .. code-block:: python :name: code-example2 @@ -196,12 +196,12 @@ def export_chrome_tracing(dir_name: str, worker_name: Optional[str] = None) -> Callable: r""" Return a callable, used for outputing tracing data to chrome tracing format file. - The output file will be saved in directory ``dir_name``, and file name will be set as worker_name. - if worker_name is not set, the default name is [hostname]_[pid]. + The output file will be saved in directory ``dir_name``, and file name will be set as `worker_name`. + if `worker_name` is not set, the default name is `[hostname]_[pid]`. Args: dir_name(str): Directory to save profiling data. - worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. + worker_name(str, optional): Prefix of the file name saved, default is `[hostname]_[pid]`. Returns: A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file. @@ -246,12 +246,12 @@ def export_protobuf(dir_name: str, worker_name: Optional[str] = None) -> Callable: r""" Return a callable, used for outputing tracing data to protobuf file. - The output file will be saved in directory ``dir_name``, and file name will be set as worker_name. - if worker_name is not set, the default name is [hostname]_[pid]. + The output file will be saved in directory ``dir_name``, and file name will be set as ``worker_name``. + if ``worker_name`` is not set, the default name is `[hostname]_[pid]`. Args: dir_name(str): Directory to save profiling data. - worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. + worker_name(str, optional): Prefix of the file name saved, default is `[hostname]_[pid]`. Returns: A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file. @@ -317,7 +317,7 @@ class Profiler: If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). 
on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing. - This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing ` (./profiler_log/). + This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing `. timer_only (bool, optional): If it is True, the cost of Dataloader and every step of the model will be count without profiling. Otherwise, the model will be timed and profiled. Default: False. record_shapes (bool, optional): If it is True, collect op's input shape information. Default: False. @@ -339,7 +339,7 @@ class Profiler: #train() p.step() - 2. profiling range [2,4], [7, 9], [11,13] + 2. profiling range [2,4], [7, 9], [11,13]. .. code-block:: python :name: code-example2 @@ -354,7 +354,7 @@ class Profiler: #train() p.step() - 3. Use profiler without context manager, and use default parameters + 3. Use profiler without context manager, and use default parameters. .. code-block:: python :name: code-example3 @@ -369,7 +369,7 @@ class Profiler: p.stop() p.summary() - 4. Use profiler to get throughput and cost of the model + 4. Use profiler to get throughput and cost of the model. .. code-block:: python :name: code-example-timer1 @@ -399,8 +399,7 @@ def forward(self, image, label=None): dataset = RandomDataset(20 * 4) simple_net = SimpleNet() - opt = paddle.optimizer.SGD(learning_rate=1e-3, - parameters=simple_net.parameters()) + opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=simple_net.parameters()) BATCH_SIZE = 4 loader = paddle.io.DataLoader( dataset, @@ -531,7 +530,7 @@ def start(self): prof.stop() ''' - # Timing only without profiling + # Timing only without profiling. benchmark().begin() if not self.timer_only or self.emit_nvtx: utils._is_profiler_used = True @@ -584,7 +583,7 @@ def stop(self): if self.profile_memory: disable_memory_recorder() # self.current_state -> CLOSED - # In this situation, RECORD state is regarded as RECORD_AND_RETURN + # In this situation, RECORD state is regarded as RECORD_AND_RETURN. if self.record_event: self.record_event.end() self.record_event = None @@ -607,7 +606,7 @@ def step(self, num_samples: Optional[int] = None): Args: num_samples (int|None, optional): Specifies the batch size of every step of the model - that is used to compute throughput when timer_only is True. Default: None. + that is used to compute throughput when `timer_only` is True. Default: None. Examples: .. code-block:: python @@ -645,7 +644,7 @@ def step_info(self, unit=None): r""" Get statistics for current step. If the function is called at certain iteration intervals, the result is the average of all steps between the previous call and - this call. Statistics are as follows: + this call. Statistics are as follows: 1. reader_cost: the cost of loading data measured in seconds. @@ -751,7 +750,7 @@ def export(self, path="", format="json"): Args: path(str): file path of the output. - format(str, optional): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf, default value is "json". + format(str, optional): output format, can be chosen from ['json', 'pb'], 'json' for chrome tracing and 'pb' for protobuf, default value is 'json'. 
Examples: diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 6eeea876a9c90..fe05aaeb81f9e 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -36,8 +36,10 @@ class RecordEvent(ContextDecorator): Interface for recording a time range by user defined. Args: - name(str): Name of the record event - event_type(TracerEventType, optional): Optional, default value is TracerEventType.PythonUserDefined. It is reserved for internal purpose, and it is better not to specify this parameter. + name (str): Name of the record event. + event_type (TracerEventType, optional): Optional, default value is + `TracerEventType.PythonUserDefined`. It is reserved for internal + purpose, and it is better not to specify this parameter. Examples: .. code-block:: python @@ -59,7 +61,7 @@ class RecordEvent(ContextDecorator): record_event.end() **Note**: - RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of RECORD. + RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of `RECORD`. """ def __init__( @@ -134,7 +136,7 @@ def load_profiler_result(filename: str): filename(str): Name of the exported protobuf file of profiler data. Returns: - ProfilerResult object, which stores profiling data. + ``ProfilerResult`` object, which stores profiling data. Examples: .. code-block:: python diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index e9369b5da380b..15dde6a6e3b8e 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -4122,9 +4122,8 @@ def lerp_(x, y, weight, name=None): def erfinv(x, name=None): r""" - The inverse error function of x. + The inverse error function of x. Please refer to :ref:`api_paddle_erf` - Equation: .. math:: erfinv(erf(x)) = x. @@ -4237,7 +4236,6 @@ def deg2rad(x, name=None): r""" Convert each of the elements of input x from degrees to angles in radians. - Equation: .. math:: deg2rad(x)=\pi * x / 180 @@ -4253,7 +4251,6 @@ def deg2rad(x, name=None): .. code-block:: python import paddle - import numpy as np x1 = paddle.to_tensor([180.0, -180.0, 360.0, -360.0, 90.0, -90.0]) result1 = paddle.deg2rad(x1) @@ -4679,18 +4676,18 @@ def angle(x, name=None): return out def heaviside(x, y, name=None): - """ + r""" Computes the Heaviside step function determined by corresponding element in y for each element in x. The equation is .. math:: heaviside(x, y)= \left\{ - \\begin{array}{lcl} - 0,& &\\text{if} \ x < 0, \\\\ - y,& &\\text{if} \ x = 0, \\\\ - 1,& &\\text{if} \ x > 0. + \begin{array}{lcl} + 0,& &\text{if} \ x < 0, \\ + y,& &\text{if} \ x = 0, \\ + 1,& &\text{if} \ x > 0. \end{array} - \\right. + \right. Note: ``paddle.heaviside`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. @@ -4716,7 +4713,7 @@ def heaviside(x, y, name=None): paddle.heaviside(x, y) # [[0. , 0.20000000, 1. ], # [0. , 1. , 0.30000001]] - """ + """ op_type = 'elementwise_heaviside' axis = -1 act = None diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 1d87e4857cd0d..a14266412c4b3 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -293,7 +293,7 @@ def CUDAExtension(sources, *args, **kwargs): **kwargs(dict[option], optional): Specify other arguments same as ``setuptools.Extension`` . 
Returns: - setuptools.Extension: An instance of setuptools.Extension + setuptools.Extension: An instance of setuptools.Extension. """ kwargs = normalize_extension_kwargs(kwargs, use_cuda=True) # Note(Aurelius84): While using `setup` and `jit`, the Extension `name` will
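Since the final hunk touches the ``CUDAExtension`` docstring, a short sketch of how its return value is typically passed to ``setup`` may help; the package name and source file names are illustrative placeholders, not part of the patch.

.. code-block:: python

    # Sketch of wiring CUDAExtension into a setup.py; 'custom_relu_example',
    # 'relu_op.cc' and 'relu_op.cu' are hypothetical placeholder names.
    from paddle.utils.cpp_extension import CUDAExtension, setup

    setup(
        name='custom_relu_example',
        ext_modules=CUDAExtension(
            sources=['relu_op.cc', 'relu_op.cu']
        )
    )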