From 14fcb57290a7e9cd81795f424332a720a36ccee5 Mon Sep 17 00:00:00 2001 From: PommesPeter <434596665@qq.com> Date: Thu, 26 Oct 2023 23:09:07 +0800 Subject: [PATCH 01/15] :sparkles: Refactor: added basic code --- python/paddle/__init__.py | 3 + python/paddle/tensor/__init__.py | 6 ++ python/paddle/tensor/random.py | 119 +++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 842151d83b332..6f3acf150373a 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -909,4 +909,7 @@ 'polygamma_', 'hypot', 'hypot_', + 'bernoulli_', + 'log_normal', + 'log_normal_', ] diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index c8bfe99f91e6b..033f45dce1e97 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -346,6 +346,9 @@ from .random import randperm # noqa: F401 from .random import poisson # noqa: F401 from .random import exponential_ # noqa: F401 +from .random import bernoulli_ # noqa: F401 +from .random import log_normal # noqa: F401 +from .random import log_normal_ # noqa: F401 from .search import argmax # noqa: F401 from .search import argmin # noqa: F401 from .search import argsort # noqa: F401 @@ -721,6 +724,9 @@ 'asinh_', 'diag', 'normal_', + 'bernoulli_', + 'log_normal', + 'log_normal_', ] # this list used in math_op_patch.py for magic_method bind diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 479e7a7ea09cc..af8f64133a844 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1387,3 +1387,122 @@ def exponential_(x, lam=1.0, name=None): attrs={"lambda": lam}, ) return x + + +@dygraph_only +def bernoulli_(x, p=0.5, name=None): + r""" + This inplace OP fill input Tensor ``x`` with random number from a Bernoulli Distribution with probability ``p``. + + Args: + x (Tensor): Input tensor. The data type should be float32, float64. + p (float, optional): probability :math:`p` parameter of Bernoulli Distribution. Default: 0.5. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + Returns: + - x (Tensor): Input Tensor ``x``. + Examples: + .. code-block:: python + """ + if 0 <= p and p <= 1: + raise ValueError(f"bernoulli_ expects p to be in [0, 1], but got p={p}") + + check_variable_and_dtype( + x, "x", ["float32", "float64"], "exponential" + ) + + uniform_(x, 0, 1) + return (x < p).astype(x.dtype) + pass + + +def log_normal(mean=1.0, std=2.0, shape=None, dtype=None): + r""" + Returns a Tensor filled with random values sampled from a log normal + distribution, with ``mean``, ``std``, ``shape`` and ``dtype``. + The Log Normal Distribution is defined as follows: + + .. math:: + f(x) = \frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(\ln{x}-\mu)^2}{2\sigma^2}} + Args: + mean (float|Tensor, optional): The mean of the output Tensor's normal distribution. + If ``mean`` is float, all elements of the output Tensor shared the same mean. + If ``mean`` is a Tensor(data type supports float32, float64), it has per-element means. + Default is 0.0 + std (float|Tensor, optional): The standard deviation of the output Tensor's normal distribution. + If ``std`` is float, all elements of the output Tensor shared the same standard deviation. + If ``std`` is a Tensor(data type supports float32, float64), it has per-element standard deviations. 
+            Default is 1.0
+        shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` .
+            If ``shape`` is a list or tuple, each element of it should be an integer or 0-D Tensor with shape [].
+            If ``shape`` is a Tensor, it should be a 1-D Tensor which represents a list. If ``mean`` or ``std``
+            is a Tensor, the shape of the output Tensor is the same as ``mean`` or ``std`` , attr ``shape`` is ignored.
+            Default is None
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+    Returns:
+        - out (Tensor): A Tensor filled with random values sampled from a log normal distribution with ``mean`` and ``std`` .
+    Examples:
+        .. code-block:: python
+            >>> import paddle
+            >>> out1 = paddle.log_normal(shape=[2, 3])
+            >>> print(out1)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.85107994, -0.85490644, -1.35941815],
+            [-0.55500370, 0.20964541, 2.24193954]])
+            >>> # doctest: -SKIP
+            >>> mean_tensor = paddle.to_tensor([1.0, 2.0, 3.0])
+            >>> out2 = paddle.log_normal(mean=mean_tensor)
+            >>> print(out2)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [1.05411839, 3.71514320, 3.42665267])
+            >>> # doctest: -SKIP
+            >>> std_tensor = paddle.to_tensor([1.0, 2.0, 3.0])
+            >>> out3 = paddle.log_normal(mean=mean_tensor, std=std_tensor)
+            >>> print(out3)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [0.48646951, 0.00815189, 3.74022293])
+            >>> # doctest: -SKIP
+    """
+    pass
+
+
+def log_normal_(x, mean=1.0, std=2.0):
+    r"""
+    This inplace OP fills the input Tensor ``x`` with random numbers from a Log Normal Distribution
+    with ``mean`` and ``std``. The Log Normal Distribution is defined as follows:
+
+    .. math::
+        f(x) = \frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(\ln{x}-\mu)^2}{2\sigma^2}}
+
+    Args:
+        x (Tensor): The input tensor to be filled with random values.
+        mean (float|Tensor, optional): The mean of the output Tensor's normal distribution.
+            If ``mean`` is float, all elements of the output Tensor share the same mean.
+            If ``mean`` is a Tensor(data type supports float32, float64), it has per-element means.
+            Default is 0.0
+        std (float|Tensor, optional): The standard deviation of the output Tensor's normal distribution.
+            If ``std`` is float, all elements of the output Tensor share the same standard deviation.
+            If ``std`` is a Tensor(data type supports float32, float64), it has per-element standard deviations.
+            Default is 1.0
+        name(str, optional): The default value is None. Normally there is no
+            need for the user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+    Returns:
+        A Tensor filled with random values sampled from a log normal distribution with ``mean`` and ``std`` .
+    Examples:
+        ..
code-block:: python + >>> import paddle + >>> x = paddle.randn([3, 4]) + >>> x.log_normal_() + >>> # doctest: +SKIP('random check') + >>> print(x) + Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[ 0.06132207, 1.11349595, 0.41906244, -0.24858207], + [-1.85169315, -1.50370061, 1.73954511, 0.13331604], + [ 1.66359663, -0.55764782, -0.59911072, -0.57773495]]) + """ + pass \ No newline at end of file From 87515f3d69e509d63995aa445e963920d8e90568 Mon Sep 17 00:00:00 2001 From: PommesPeter <434596665@qq.com> Date: Tue, 7 Nov 2023 00:59:13 +0800 Subject: [PATCH 02/15] :sparkles: Feature: added log_normal and log_normal_ --- python/paddle/__init__.py | 3 +++ python/paddle/tensor/random.py | 46 +++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 6f3acf150373a..d32228ff7d9c4 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -424,6 +424,9 @@ randint, randint_like, randperm, + bernoulli_, + log_normal, + log_normal_, ) from .tensor.search import ( argmax, diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index af8f64133a844..a2cde16f3a175 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1393,7 +1393,7 @@ def exponential_(x, lam=1.0, name=None): def bernoulli_(x, p=0.5, name=None): r""" This inplace OP fill input Tensor ``x`` with random number from a Bernoulli Distribution with probability ``p``. - + Args: x (Tensor): Input tensor. The data type should be float32, float64. p (float, optional): probability :math:`p` parameter of Bernoulli Distribution. Default: 0.5. @@ -1408,23 +1408,22 @@ def bernoulli_(x, p=0.5, name=None): if 0 <= p and p <= 1: raise ValueError(f"bernoulli_ expects p to be in [0, 1], but got p={p}") - check_variable_and_dtype( - x, "x", ["float32", "float64"], "exponential" - ) + check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential") uniform_(x, 0, 1) return (x < p).astype(x.dtype) pass -def log_normal(mean=1.0, std=2.0, shape=None, dtype=None): +def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None): r""" - Returns a Tensor filled with random values sampled from a log normal - distribution, with ``mean``, ``std``, ``shape`` and ``dtype``. + Returns a Tensor filled with random values sampled from a Log Normal + Distribution, with ``mean``, ``std``, ``shape`` and ``dtype``. The Log Normal Distribution is defined as follows: - + .. math:: f(x) = \frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(\ln{x}-\mu)^2}{2\sigma^2}} + Args: mean (float|Tensor, optional): The mean of the output Tensor's normal distribution. If ``mean`` is float, all elements of the output Tensor shared the same mean. @@ -1441,7 +1440,7 @@ def log_normal(mean=1.0, std=2.0, shape=None, dtype=None): Default is None name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - - out (Tensor): A Tensor filled with random values sampled from a normal distribution with ``mean`` and ``std`` . + - out (Tensor): A Tensor filled with random values sampled from a log normal distribution with ``mean`` and ``std`` . Examples: .. 
code-block:: python >>> import paddle @@ -1467,17 +1466,33 @@ def log_normal(mean=1.0, std=2.0, shape=None, dtype=None): [0.48646951, 0.00815189, 3.74022293]) >>> # doctest: -SKIP """ - pass + op_type_for_check = 'gaussian/standard_normal/randn/normal' + supported_dtypes = ['float32', 'float64', 'float16', 'uint16', 'bfloat16'] + + if dtype is None: + dtype = paddle.framework.get_default_dtype() + if dtype not in supported_dtypes: + raise TypeError( + "{} only supports {}, but the default dtype is {}".format( + op_type_for_check, supported_dtypes, dtype + ) + ) + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + + distribution = gaussian(shape, dtype, mean, std, seed) + return paddle.exp(distribution) -def log_normal_(x, mean=1.0, std=2.0): +@dygraph_only +def log_normal_(x, mean=0.0, std=1.0, seed=0, name=None): r""" - This inplace OP fill input Tensor ``x`` with random number from a Log Normal Distribution + This inplace OP fill input Tensor ``x`` with random number from a Log Normal Distribution with ``mean`` and ``std``. The Log Normal Distribution is defined as follows: - + .. math:: f(x) = \frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(\ln{x}-\mu)^2}{2\sigma^2}} - + Args: x (Tensor): The input tensor to be filled with random values. mean (float|Tensor, optional): The mean of the output Tensor's normal distribution. @@ -1505,4 +1520,5 @@ def log_normal_(x, mean=1.0, std=2.0): [-1.85169315, -1.50370061, 1.73954511, 0.13331604], [ 1.66359663, -0.55764782, -0.59911072, -0.57773495]]) """ - pass \ No newline at end of file + + return gaussian_(x, mean, std, seed).exp_() From c31782f89405495e952a616e31dca78d9f0c18ed Mon Sep 17 00:00:00 2001 From: PommesPeter <434596665@qq.com> Date: Thu, 23 Nov 2023 13:22:37 +0800 Subject: [PATCH 03/15] :test_tube: Test: added unittest case --- python/paddle/tensor/random.py | 4 +- test/legacy_test/test_inplace.py | 13 +++++ test/legacy_test/test_log_normal_op.py | 74 ++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 test/legacy_test/test_log_normal_op.py diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 9c1733a3a9a6b..36f93e0526197 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1495,7 +1495,7 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - distribution = gaussian(shape, dtype, mean, std, seed) + distribution = gaussian(shape, mean=mean, std=std, seed=seed, dtype=dtype) return paddle.exp(distribution) @@ -1536,4 +1536,4 @@ def log_normal_(x, mean=0.0, std=1.0, seed=0, name=None): [ 1.66359663, -0.55764782, -0.59911072, -0.57773495]]) """ - return gaussian_(x, mean, std, seed).exp_() + return gaussian_(x, mean=mean, std=std, seed=seed).exp_() diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py index cac243f5e8682..0490a340304c7 100644 --- a/test/legacy_test/test_inplace.py +++ b/test/legacy_test/test_inplace.py @@ -1616,5 +1616,18 @@ def non_inplace_api_processing(self, var): return paddle.index_fill(var, self.index, self.axis, self.value) +class TestDygraphInplaceIndexFill(TestDygraphInplace): + def init_data(self): + self.shape = (20, 40) + self.x = np.random.random(self.shape) + self.dtype = "float32" + self.mean = 0 + self.std = 1 + self.seed = 100 + + def inplace_api_processing(self, var): + return paddle.log_normal_(self.x, self.shape, self.mean, 
self.std, self.seed) + + if __name__ == '__main__': unittest.main() diff --git a/test/legacy_test/test_log_normal_op.py b/test/legacy_test/test_log_normal_op.py new file mode 100644 index 0000000000000..5083c9f18e955 --- /dev/null +++ b/test/legacy_test/test_log_normal_op.py @@ -0,0 +1,74 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.base import core + +SEED = 100 +np.random.seed(SEED) +paddle.seed(SEED) + + +def output_log_normal(shape, mean, std): + return np.exp(np.random.normal(mean, std, shape)) + + +class TestLogNormalAPI(unittest.TestCase): + DTYPE = "float64" + SHAPE = [2, 4] + MEAN = 0 + STD = 1 + + def setUp(self): + self.x = output_log_normal(self.SHAPE, self.MEAN, self.STD) + self.place = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def test_api_static(self): + def run(place): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + out = paddle.log_normal() + exe = paddle.static.Executor(place) + res = exe.run( + paddle.static.default_main_program(), + feed={}, + fetch_list=[out], + ) + return res[0] + + for place in self.place: + res = run(place) + self.assertTrue(np.allclose(res, self.x)) + + def test_api_dygraph(self): + def run(place): + paddle.disable_static(place) + out = paddle.log_normal(self.SHAPE, self.MEAN, self.STD, seed=SEED) + + out_ref = output_log_normal(self.SHAPE, self.MEAN, self.STD) + np.testing.assert_allclose(out.numpy(), out_ref, rtol=1e-5) + paddle.enable_static() + + for place in self.place: + run(place) + + +if __name__ == "__main__": + unittest.main() From 7aed63d83ad67611a0bc66ce76d648800425bb3c Mon Sep 17 00:00:00 2001 From: PommesPeter <434596665@qq.com> Date: Tue, 19 Dec 2023 13:19:18 +0800 Subject: [PATCH 04/15] :recycle: Refactor: updated log_normal and bernoulli_ --- .../ops_signature/generated_fused_sig.cc | 1022 ++ .../operators/ops_signature/generated_sig.cc | 9755 +++++++++++++++++ .../ops_signature/generated_sparse_sig.cc | 2735 +++++ .../ops_signature/generated_static_sig.cc | 1585 +++ .../conv2d/generated_tmp/conv2d_bias_act.cu | 4349 ++++++++ .../generated_tmp/conv2d_bias_residual.cu | 2389 ++++ .../conv2d_depthwise_bias_act.cu | 3502 ++++++ .../fpA_intB_gemm/autogen_tmp/arch_define.h | 4 + ...m_kernelLauncher_bf16_sm80_stages2_bias.cu | 439 + ...kernelLauncher_bf16_sm80_stages2_noBias.cu | 439 + ...m_kernelLauncher_bf16_sm80_stages3_bias.cu | 439 + ...kernelLauncher_bf16_sm80_stages3_noBias.cu | 439 + ...m_kernelLauncher_bf16_sm80_stages4_bias.cu | 439 + ...kernelLauncher_bf16_sm80_stages4_noBias.cu | 439 + ...m_kernelLauncher_bf16_sm80_stages5_bias.cu | 439 + ...kernelLauncher_bf16_sm80_stages5_noBias.cu | 439 + ...m_kernelLauncher_fp16_sm80_stages2_bias.cu | 439 + ...kernelLauncher_fp16_sm80_stages2_noBias.cu | 439 + ...m_kernelLauncher_fp16_sm80_stages3_bias.cu | 439 + 
...kernelLauncher_fp16_sm80_stages3_noBias.cu | 439 + ...m_kernelLauncher_fp16_sm80_stages4_bias.cu | 439 + ...kernelLauncher_fp16_sm80_stages4_noBias.cu | 439 + ...m_kernelLauncher_fp16_sm80_stages5_bias.cu | 439 + ...kernelLauncher_fp16_sm80_stages5_noBias.cu | 439 + python/paddle/tensor/random.py | 50 +- test/legacy_test/test_inplace.py | 24 +- 26 files changed, 32428 insertions(+), 11 deletions(-) create mode 100644 paddle/fluid/operators/ops_signature/generated_fused_sig.cc create mode 100644 paddle/fluid/operators/ops_signature/generated_sig.cc create mode 100644 paddle/fluid/operators/ops_signature/generated_sparse_sig.cc create mode 100644 paddle/fluid/operators/ops_signature/generated_static_sig.cc create mode 100644 paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_act.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_residual.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_depthwise_bias_act.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/arch_define.h create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_bias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_noBias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_bias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_noBias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_bias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_noBias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_bias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_noBias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_bias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_noBias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_bias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_noBias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_bias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_noBias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_bias.cu create mode 100644 
paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_noBias.cu diff --git a/paddle/fluid/operators/ops_signature/generated_fused_sig.cc b/paddle/fluid/operators/ops_signature/generated_fused_sig.cc new file mode 100644 index 0000000000000..6a21ac83c4714 --- /dev/null +++ b/paddle/fluid/operators/ops_signature/generated_fused_sig.cc @@ -0,0 +1,1022 @@ +// this file is generated by paddle/phi/op/yaml/generator/generate_op.py, do not edit. +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AddActXpuOpArgumentMapping: + +return KernelSignature("add_act_xpu", {"x", "x_max", "y", "y_max"}, {"act_type"}, {"out", "out_max"}); +****************************************************************** +*/ + +KernelSignature AddActXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "x_max", "y", "y_max"}; + paddle::small_vector attrs; + attrs.emplace_back("act_type"); + paddle::small_vector outputs {"out", "out_max"}; + return KernelSignature("add_act_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AddLayernormXpuOpArgumentMapping: + +return KernelSignature("add_layernorm_xpu", {"x", "y", "scale", "bias"}, {"begin_norm_axis", "epsilon"}, {"out"}); +****************************************************************** +*/ + +KernelSignature AddLayernormXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "scale", "bias"}; + paddle::small_vector attrs; + attrs.emplace_back("begin_norm_axis"); + attrs.emplace_back("epsilon"); + paddle::small_vector outputs {"out"}; + return KernelSignature("add_layernorm_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AddcmulXpuOpArgumentMapping: + +return KernelSignature("addcmul_xpu", {"x", "y", "w"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature AddcmulXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "w"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + return KernelSignature("addcmul_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BlockMultiheadAttentionOpArgumentMapping: + +return KernelSignature("block_multihead_attention", {"qkv", "key_cache", "value_cache", "seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time", "padding_offsets", "cum_offsets", "cu_seqlens_q", "cu_seqlens_k", "block_tables", "pre_key_cache", "pre_value_cache", "rope_emb", "mask", "tgt_mask"}, {"max_seq_len", "block_size", "use_neox_style"}, {"fmha_out", "qkv_out", "key_cache_out", "value_cache_out"}); +****************************************************************** +*/ + 
+KernelSignature BlockMultiheadAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"qkv", "key_cache", "value_cache", "seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time", "padding_offsets", "cum_offsets", "cu_seqlens_q", "cu_seqlens_k", "block_tables", "pre_key_cache", "pre_value_cache", "rope_emb", "mask", "tgt_mask"}; + paddle::small_vector attrs; + attrs.emplace_back("max_seq_len"); + attrs.emplace_back("block_size"); + attrs.emplace_back("use_neox_style"); + paddle::small_vector outputs {"fmha_out", "qkv_out", "key_cache_out", "value_cache_out"}; + return KernelSignature("block_multihead_attention", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BnActXpuOpArgumentMapping: + +return KernelSignature("bn_act_xpu", {"x", "mean", "variance", "scale", "bias"}, {"momentum", "epsilon", "data_layout", "act_type"}, {"out"}); +****************************************************************** +*/ + +KernelSignature BnActXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "mean", "variance", "scale", "bias"}; + paddle::small_vector attrs; + attrs.emplace_back("momentum"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("data_layout"); + attrs.emplace_back("act_type"); + paddle::small_vector outputs {"out"}; + return KernelSignature("bn_act_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv1dXpuOpArgumentMapping: + +return KernelSignature("conv1d_xpu", {"x", "x_max", "filter", "filter_max", "bias", "branch", "branch_max"}, {"paddings", "padding_algorithm", "dilations", "strides", "groups", "act_type", "act_param"}, {"out", "out_max"}); +****************************************************************** +*/ + +KernelSignature Conv1dXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "x_max", "filter", "filter_max", "bias", "branch", "branch_max"}; + paddle::small_vector attrs; + attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("dilations"); + attrs.emplace_back("strides"); + attrs.emplace_back("groups"); + attrs.emplace_back("act_type"); + attrs.emplace_back("act_param"); + paddle::small_vector outputs {"out", "out_max"}; + return KernelSignature("conv1d_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv2dTransposeXpuOpArgumentMapping: + +return KernelSignature("conv2d_transpose_xpu", {"x", "x_max", "filter", "filter_max", "bias"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format", "has_bias", "with_act", "act_type"}, {"out", "out_max"}); +return KernelSignature("conv2d_transpose_xpu", {"x", "x_max", "filter", "filter_max", "bias"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format", "has_bias", "with_act", "act_type"}, {"out", "out_max"}); +return 
KernelSignature("conv2d_transpose_xpu", {"x", "x_max", "filter", "filter_max", "bias"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format", "has_bias", "with_act", "act_type"}, {"out", "out_max"}); +****************************************************************** +*/ + +KernelSignature Conv2dTransposeXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "x_max", "filter", "filter_max", "bias"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_padding"); + attrs.emplace_back( + ctx.HasInput("OutputSizeTensor") + ? "OutputSizeTensor" + : ctx.InputSize("OutputSizeTensorList") > 0 + ? "OutputSizeTensorList" + : "output_size"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + attrs.emplace_back("has_bias"); + attrs.emplace_back("with_act"); + attrs.emplace_back("act_type"); + paddle::small_vector outputs {"out", "out_max"}; + return KernelSignature("conv2d_transpose_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv2dXpuOpArgumentMapping: + +return KernelSignature("conv2d_xpu", {"x", "x_max", "filter", "filter_max", "bias", "branch", "branch_max", "scale_max", "out_max_in"}, {"paddings", "dilations", "strides", "padding_algorithm", "groups", "act_type", "act_param", "out_dtype"}, {"out", "out_max"}); +****************************************************************** +*/ + +KernelSignature Conv2dXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "x_max", "filter", "filter_max", "bias", "branch", "branch_max", "scale_max", "out_max_in"}; + paddle::small_vector attrs; + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + attrs.emplace_back("strides"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("act_type"); + attrs.emplace_back("act_param"); + attrs.emplace_back("out_dtype"); + paddle::small_vector outputs {"out", "out_max"}; + return KernelSignature("conv2d_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DequantizeXpuOpArgumentMapping: + +return KernelSignature("dequantize_xpu", {"x"}, {"out_dtype", "scale"}, {"y"}); +****************************************************************** +*/ + +KernelSignature DequantizeXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("out_dtype"); + attrs.emplace_back("scale"); + paddle::small_vector outputs {"y"}; + return KernelSignature("dequantize_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EmbeddingWithEltwiseAddXpuOpArgumentMapping: + +return KernelSignature("embedding_with_eltwise_add_xpu", {"ids", "tables", "mask"}, {"padding_idx"}, {"out", "seq_lod", 
"max_seq_len"}); +****************************************************************** +*/ + +KernelSignature EmbeddingWithEltwiseAddXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"ids", "tables", "mask"}; + paddle::small_vector attrs; + attrs.emplace_back("padding_idx"); + paddle::small_vector outputs {"out", "seq_lod", "max_seq_len"}; + return KernelSignature("embedding_with_eltwise_add_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FastLayernormXpuOpArgumentMapping: + +return KernelSignature("fast_layernorm_xpu", {"x", "scale", "bias"}, {"begin_norm_axis", "epsilon"}, {"out"}); +****************************************************************** +*/ + +KernelSignature FastLayernormXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "scale", "bias"}; + paddle::small_vector attrs; + attrs.emplace_back("begin_norm_axis"); + attrs.emplace_back("epsilon"); + paddle::small_vector outputs {"out"}; + return KernelSignature("fast_layernorm_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FastWhereXpuOpArgumentMapping: + +return KernelSignature("fast_where_xpu", {"condition", "x", "y"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature FastWhereXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"condition", "x", "y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + return KernelSignature("fast_where_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FcOpArgumentMapping: + +return KernelSignature("fc", {"Input", "W", "Bias"}, {"in_num_col_dims", "activation_type", "use_mkldnn", "padding_weights", "use_quantizer", "mkldnn_data_type", "Scale_in", "Scale_weights", "Scale_out", "force_fp32_output"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FcOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "W", "Bias"}; + paddle::small_vector attrs; + attrs.emplace_back("in_num_col_dims"); + attrs.emplace_back("activation_type"); + attrs.emplace_back("use_mkldnn"); + attrs.emplace_back("padding_weights"); + attrs.emplace_back("use_quantizer"); + attrs.emplace_back("mkldnn_data_type"); + attrs.emplace_back("Scale_in"); + attrs.emplace_back("Scale_weights"); + attrs.emplace_back("Scale_out"); + attrs.emplace_back("force_fp32_output"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("fc", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FcXpuOpArgumentMapping: + +return KernelSignature("fc_xpu", {"x", "x_max", "w", "w_max", "bias", "scale_max", "out_max_in"}, {"in_num_col_dims", "transpose_x", "alpha", "beta", 
"act_type", "act_alpha", "out_dtype"}, {"out", "out_max"}); +****************************************************************** +*/ + +KernelSignature FcXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "x_max", "w", "w_max", "bias", "scale_max", "out_max_in"}; + paddle::small_vector attrs; + attrs.emplace_back("in_num_col_dims"); + attrs.emplace_back("transpose_x"); + attrs.emplace_back("alpha"); + attrs.emplace_back("beta"); + attrs.emplace_back("act_type"); + attrs.emplace_back("act_alpha"); + attrs.emplace_back("out_dtype"); + paddle::small_vector outputs {"out", "out_max"}; + return KernelSignature("fc_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedBiasActOpArgumentMapping: + +return KernelSignature("fused_bias_act", {"x", "bias", "dequant_scales", "shift", "smooth"}, {"act_method", "compute_dtype", "quant_scale", "quant_round_type", "quant_max_bound", "quant_min_bound"}, {"out"}); +****************************************************************** +*/ + +KernelSignature FusedBiasActOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "bias", "dequant_scales", "shift", "smooth"}; + paddle::small_vector attrs; + attrs.emplace_back("act_method"); + attrs.emplace_back("compute_dtype"); + attrs.emplace_back("quant_scale"); + attrs.emplace_back("quant_round_type"); + attrs.emplace_back("quant_max_bound"); + attrs.emplace_back("quant_min_bound"); + paddle::small_vector outputs {"out"}; + return KernelSignature("fused_bias_act", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedBiasDropoutResidualLayerNormOpArgumentMapping: + +return KernelSignature("fused_bias_dropout_residual_layer_norm", {"X", "Residual", "Bias", "LnScale", "LnBias"}, {"dropout_rate", "is_test", "dropout_fix_seed", "dropout_seed", "dropout_implementation", "ln_epsilon"}, {"Y", "BiasDropoutResidualOut", "DropoutMaskOut", "LnMean", "LnVariance"}); +****************************************************************** +*/ + +KernelSignature FusedBiasDropoutResidualLayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Residual", "Bias", "LnScale", "LnBias"}; + paddle::small_vector attrs; + attrs.emplace_back("dropout_rate"); + attrs.emplace_back("is_test"); + attrs.emplace_back("dropout_fix_seed"); + attrs.emplace_back("dropout_seed"); + attrs.emplace_back("dropout_implementation"); + attrs.emplace_back("ln_epsilon"); + paddle::small_vector outputs {"Y", "BiasDropoutResidualOut", "DropoutMaskOut", "LnMean", "LnVariance"}; + return KernelSignature("fused_bias_dropout_residual_layer_norm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedBiasResidualLayernormOpArgumentMapping: + +return KernelSignature("fused_bias_residual_layernorm", {"x", "bias", "residual", "norm_weight", "norm_bias"}, {"epsilon", "residual_alpha", "begin_norm_axis", "quant_scale", "quant_round_type", "quant_max_bound", 
"quant_min_bound"}, {"out", "residual_out", "mean", "variance"}); +****************************************************************** +*/ + +KernelSignature FusedBiasResidualLayernormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "bias", "residual", "norm_weight", "norm_bias"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + attrs.emplace_back("residual_alpha"); + attrs.emplace_back("begin_norm_axis"); + attrs.emplace_back("quant_scale"); + attrs.emplace_back("quant_round_type"); + attrs.emplace_back("quant_max_bound"); + attrs.emplace_back("quant_min_bound"); + paddle::small_vector outputs {"out", "residual_out", "mean", "variance"}; + return KernelSignature("fused_bias_residual_layernorm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedConv2dAddActOpArgumentMapping: + +return KernelSignature("fused_conv2d_add_act", {"Input", "Filter", "Bias", "ResidualData"}, {"strides", "paddings", "padding_algorithm", "dilations", "groups", "data_format", "activation", "split_channels", "exhaustive_search", "workspace_size_MB", "fuse_alpha"}, {"Output", "Outputs"}); +****************************************************************** +*/ + +KernelSignature FusedConv2dAddActOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "Bias", "ResidualData"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("dilations"); + attrs.emplace_back("groups"); + attrs.emplace_back("data_format"); + attrs.emplace_back("activation"); + attrs.emplace_back("split_channels"); + attrs.emplace_back("exhaustive_search"); + attrs.emplace_back("workspace_size_MB"); + attrs.emplace_back("fuse_alpha"); + paddle::small_vector outputs {"Output", "Outputs"}; + return KernelSignature("fused_conv2d_add_act", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedDconvDreluDbnOpArgumentMapping: + +return KernelSignature("fused_dconv_drelu_dbn", {"grad_output", "weight", "grad_output_add", "residual_input", "bn1_eqscale", "bn1_eqbias", "conv_input", "bn1_mean", "bn1_inv_std", "bn1_gamma", "bn1_beta", "bn1_input", "bn2_mean", "bn2_inv_std", "bn2_gamma", "bn2_beta", "bn2_input"}, {"paddings", "dilations", "strides", "padding_algorithm", "groups", "data_format", "fuse_shortcut", "fuse_dual", "fuse_add", "exhaustive_search"}, {"grad_weight", "grad_bn1_input", "grad_bn1_gamma", "grad_bn1_beta", "grad_bn2_input", "grad_bn2_gamma", "grad_bn2_beta"}); +****************************************************************** +*/ + +KernelSignature FusedDconvDreluDbnOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"grad_output", "weight", "grad_output_add", "residual_input", "bn1_eqscale", "bn1_eqbias", "conv_input", "bn1_mean", "bn1_inv_std", "bn1_gamma", "bn1_beta", "bn1_input", "bn2_mean", "bn2_inv_std", "bn2_gamma", "bn2_beta", "bn2_input"}; + paddle::small_vector attrs; + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + attrs.emplace_back("strides"); + 
attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("data_format"); + attrs.emplace_back("fuse_shortcut"); + attrs.emplace_back("fuse_dual"); + attrs.emplace_back("fuse_add"); + attrs.emplace_back("exhaustive_search"); + paddle::small_vector outputs {"grad_weight", "grad_bn1_input", "grad_bn1_gamma", "grad_bn1_beta", "grad_bn2_input", "grad_bn2_gamma", "grad_bn2_beta"}; + return KernelSignature("fused_dconv_drelu_dbn", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedDropoutAddOpArgumentMapping: + +return KernelSignature("fused_dropout_add", {"x", "y", "seed_tensor"}, {"p", "is_test", "mode", "seed", "fix_seed"}, {"out", "seed_offset"}); +return KernelSignature("fused_dropout_add", {"x", "y", "seed_tensor"}, {"PTensor", "is_test", "mode", "seed", "fix_seed"}, {"out", "seed_offset"}); +****************************************************************** +*/ + +KernelSignature FusedDropoutAddOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "seed_tensor"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("PTensor") ? "PTensor" : "p"); + attrs.emplace_back("is_test"); + attrs.emplace_back("mode"); + attrs.emplace_back("seed"); + attrs.emplace_back("fix_seed"); + paddle::small_vector outputs {"out", "seed_offset"}; + return KernelSignature("fused_dropout_add", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedEmbeddingEltwiseLayernormOpArgumentMapping: + +return KernelSignature("fused_embedding_eltwise_layernorm", {"Ids", "Embs", "Bias", "Scale"}, {"epsilon"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FusedEmbeddingEltwiseLayernormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Ids", "Embs", "Bias", "Scale"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("fused_embedding_eltwise_layernorm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedFcElementwiseLayernormOpArgumentMapping: + +return KernelSignature("fused_fc_elementwise_layernorm", {"X", "W", "Y", "Bias0", "Scale", "Bias1"}, {"x_num_col_dims", "activation_type", "epsilon", "begin_norm_axis"}, {"Out", "Mean", "Variance"}); +****************************************************************** +*/ + +KernelSignature FusedFcElementwiseLayernormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "W", "Y", "Bias0", "Scale", "Bias1"}; + paddle::small_vector attrs; + attrs.emplace_back("x_num_col_dims"); + attrs.emplace_back("activation_type"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("begin_norm_axis"); + paddle::small_vector outputs {"Out", "Mean", "Variance"}; + return KernelSignature("fused_fc_elementwise_layernorm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedLinearParamGradAddOpArgumentMapping: + +return KernelSignature("fused_linear_param_grad_add", {"x", "dout", "dweight", "dbias"}, {"multi_precision", "has_bias"}, {"dweight_out", "dbias_out"}); +****************************************************************** +*/ + +KernelSignature FusedLinearParamGradAddOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "dout", "dweight", "dbias"}; + paddle::small_vector attrs; + attrs.emplace_back("multi_precision"); + attrs.emplace_back("has_bias"); + paddle::small_vector outputs {"dweight_out", "dbias_out"}; + return KernelSignature("fused_linear_param_grad_add", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedMultiTransformerInt8XpuOpArgumentMapping: + +return KernelSignature("fused_multi_transformer_int8_xpu", {"x", "ln_scale", "ln_bias", "qkv_in_max", "qkvw", "qkv_bias", "qkv_scales", "out_linear_in_max", "out_linear_w", "out_linear_bias", "out_linear_scales", "ffn_ln_scale", "ffn_ln_bias", "ffn1_in_max", "ffn1_weight", "ffn1_bias", "ffn1_scales", "ffn2_in_max", "ffn2_weight", "ffn2_bias", "ffn2_scales", "cache_kv", "pre_caches", "rotary_pos_emb", "time_step", "seq_lengths", "src_mask", "gather_index", "max_buffer"}, {"pre_layer_norm", "rotary_emb_dims", "epsilon", "dropout_rate", "is_test", "dropout_implementation", "act_method", "trans_qkvw", "ring_id", "gather_axis"}, {"out", "cache_kv_out"}); +****************************************************************** +*/ + +KernelSignature FusedMultiTransformerInt8XpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "ln_scale", "ln_bias", "qkv_in_max", "qkvw", "qkv_bias", "qkv_scales", "out_linear_in_max", "out_linear_w", "out_linear_bias", "out_linear_scales", "ffn_ln_scale", "ffn_ln_bias", "ffn1_in_max", "ffn1_weight", "ffn1_bias", "ffn1_scales", "ffn2_in_max", "ffn2_weight", "ffn2_bias", "ffn2_scales", "cache_kv", "pre_caches", "rotary_pos_emb", "time_step", "seq_lengths", "src_mask", "gather_index", "max_buffer"}; + paddle::small_vector attrs; + attrs.emplace_back("pre_layer_norm"); + attrs.emplace_back("rotary_emb_dims"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("dropout_rate"); + attrs.emplace_back("is_test"); + attrs.emplace_back("dropout_implementation"); + attrs.emplace_back("act_method"); + attrs.emplace_back("trans_qkvw"); + attrs.emplace_back("ring_id"); + attrs.emplace_back("gather_axis"); + paddle::small_vector outputs {"out", "cache_kv_out"}; + return KernelSignature("fused_multi_transformer_int8_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedMultiTransformerXpuOpArgumentMapping: + +return KernelSignature("fused_multi_transformer_xpu", {"x", "ln_scale", "ln_bias", "qkvw", "qkvw_max", "qkv_bias", "out_linear_w", "out_linear_wmax", "out_linear_bias", "ffn_ln_scale", "ffn_ln_bias", "ffn1_weight", "ffn1_weight_max", "ffn1_bias", "ffn2_weight", "ffn2_weight_max", "ffn2_bias", "cache_kv", "pre_caches", 
"rotary_pos_emb", "time_step", "seq_lengths", "src_mask", "gather_index", "max_buffer"}, {"pre_layer_norm", "rotary_emb_dims", "epsilon", "dropout_rate", "is_test", "dropout_implementation", "act_method", "trans_qkvw", "ring_id", "gather_axis"}, {"out", "cache_kv_out"}); +****************************************************************** +*/ + +KernelSignature FusedMultiTransformerXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "ln_scale", "ln_bias", "qkvw", "qkvw_max", "qkv_bias", "out_linear_w", "out_linear_wmax", "out_linear_bias", "ffn_ln_scale", "ffn_ln_bias", "ffn1_weight", "ffn1_weight_max", "ffn1_bias", "ffn2_weight", "ffn2_weight_max", "ffn2_bias", "cache_kv", "pre_caches", "rotary_pos_emb", "time_step", "seq_lengths", "src_mask", "gather_index", "max_buffer"}; + paddle::small_vector attrs; + attrs.emplace_back("pre_layer_norm"); + attrs.emplace_back("rotary_emb_dims"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("dropout_rate"); + attrs.emplace_back("is_test"); + attrs.emplace_back("dropout_implementation"); + attrs.emplace_back("act_method"); + attrs.emplace_back("trans_qkvw"); + attrs.emplace_back("ring_id"); + attrs.emplace_back("gather_axis"); + paddle::small_vector outputs {"out", "cache_kv_out"}; + return KernelSignature("fused_multi_transformer_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedRotaryPositionEmbeddingOpArgumentMapping: + +return KernelSignature("fused_rotary_position_embedding", {"q", "k", "v", "sin", "cos", "position_ids"}, {"use_neox_rotary_style"}, {"out_q", "out_k", "out_v"}); +****************************************************************** +*/ + +KernelSignature FusedRotaryPositionEmbeddingOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"q", "k", "v", "sin", "cos", "position_ids"}; + paddle::small_vector attrs; + attrs.emplace_back("use_neox_rotary_style"); + paddle::small_vector outputs {"out_q", "out_k", "out_v"}; + return KernelSignature("fused_rotary_position_embedding", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedScaleBiasAddReluOpArgumentMapping: + +return KernelSignature("fused_scale_bias_add_relu", {"x1", "scale1", "bias1", "x2", "scale2", "bias2"}, {"fuse_dual", "exhaustive_search"}, {"out"}); +****************************************************************** +*/ + +KernelSignature FusedScaleBiasAddReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x1", "scale1", "bias1", "x2", "scale2", "bias2"}; + paddle::small_vector attrs; + attrs.emplace_back("fuse_dual"); + attrs.emplace_back("exhaustive_search"); + paddle::small_vector outputs {"out"}; + return KernelSignature("fused_scale_bias_add_relu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedScaleBiasReluConvBnOpArgumentMapping: + +return KernelSignature("fused_scale_bias_relu_conv_bn", {"x", "w", "scale", "bias", "bn_scale", "bn_bias", 
"input_running_mean", "input_running_var"}, {"paddings", "dilations", "strides", "padding_algorithm", "groups", "data_format", "momentum", "epsilon", "fuse_prologue", "exhaustive_search", "accumulation_count"}, {"out", "out_running_mean", "out_running_var", "saved_mean", "saved_var", "eq_scale", "eq_bias"}); +****************************************************************** +*/ + +KernelSignature FusedScaleBiasReluConvBnOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "w", "scale", "bias", "bn_scale", "bn_bias", "input_running_mean", "input_running_var"}; + paddle::small_vector attrs; + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + attrs.emplace_back("strides"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("data_format"); + attrs.emplace_back("momentum"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("fuse_prologue"); + attrs.emplace_back("exhaustive_search"); + attrs.emplace_back("accumulation_count"); + paddle::small_vector outputs {"out", "out_running_mean", "out_running_var", "saved_mean", "saved_var", "eq_scale", "eq_bias"}; + return KernelSignature("fused_scale_bias_relu_conv_bn", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusionGruOpArgumentMapping: + +return KernelSignature("fusion_gru", {"X", "H0", "WeightX", "WeightH", "Bias"}, {"activation", "gate_activation", "is_reverse", "use_seq", "origin_mode", "use_mkldnn", "mkldnn_data_type", "Scale_data", "Shift_data", "Scale_weights", "force_fp32_output"}, {"ReorderedH0", "XX", "BatchedInput", "BatchedOut", "Hidden"}); +****************************************************************** +*/ + +KernelSignature FusionGruOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "H0", "WeightX", "WeightH", "Bias"}; + paddle::small_vector attrs; + attrs.emplace_back("activation"); + attrs.emplace_back("gate_activation"); + attrs.emplace_back("is_reverse"); + attrs.emplace_back("use_seq"); + attrs.emplace_back("origin_mode"); + attrs.emplace_back("use_mkldnn"); + attrs.emplace_back("mkldnn_data_type"); + attrs.emplace_back("Scale_data"); + attrs.emplace_back("Shift_data"); + attrs.emplace_back("Scale_weights"); + attrs.emplace_back("force_fp32_output"); + paddle::small_vector outputs {"ReorderedH0", "XX", "BatchedInput", "BatchedOut", "Hidden"}; + return KernelSignature("fusion_gru", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusionRepeatedFcReluOpArgumentMapping: + +return KernelSignature("fusion_repeated_fc_relu", {"X", "W", "Bias"}, {}, {"ReluOut", "Out"}); +****************************************************************** +*/ + +KernelSignature FusionRepeatedFcReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "W", "Bias"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"ReluOut", "Out"}; + return KernelSignature("fusion_repeated_fc_relu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 
'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusionSeqconvEltaddReluOpArgumentMapping: + +return KernelSignature("fusion_seqconv_eltadd_relu", {"X", "Filter", "Bias"}, {"contextLength", "contextStart", "contextStride"}, {"Out", "ColMat"}); +****************************************************************** +*/ + +KernelSignature FusionSeqconvEltaddReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Filter", "Bias"}; + paddle::small_vector attrs; + attrs.emplace_back("contextLength"); + attrs.emplace_back("contextStart"); + attrs.emplace_back("contextStride"); + paddle::small_vector outputs {"Out", "ColMat"}; + return KernelSignature("fusion_seqconv_eltadd_relu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusionSeqexpandConcatFcOpArgumentMapping: + +return KernelSignature("fusion_seqexpand_concat_fc", {"X", "FCWeight", "FCBias"}, {"fc_activation"}, {"Out", "FCOut"}); +****************************************************************** +*/ + +KernelSignature FusionSeqexpandConcatFcOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "FCWeight", "FCBias"}; + paddle::small_vector attrs; + attrs.emplace_back("fc_activation"); + paddle::small_vector outputs {"Out", "FCOut"}; + return KernelSignature("fusion_seqexpand_concat_fc", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusionSquaredMatSubOpArgumentMapping: + +return KernelSignature("fusion_squared_mat_sub", {"X", "Y"}, {"scalar"}, {"SquaredX", "SquaredY", "SquaredXY", "Out"}); +****************************************************************** +*/ + +KernelSignature FusionSquaredMatSubOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("scalar"); + paddle::small_vector outputs {"SquaredX", "SquaredY", "SquaredXY", "Out"}; + return KernelSignature("fusion_squared_mat_sub", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusionTransposeFlattenConcatOpArgumentMapping: + +return KernelSignature("fusion_transpose_flatten_concat", {"X"}, {"trans_axis", "flatten_axis", "concat_axis"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FusionTransposeFlattenConcatOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("trans_axis"); + attrs.emplace_back("flatten_axis"); + attrs.emplace_back("concat_axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("fusion_transpose_flatten_concat", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by GenerateSequenceXpuOpArgumentMapping: + +return 
KernelSignature("generate_sequence_xpu", {"x"}, {"dtype"}, {"out"}); +****************************************************************** +*/ + +KernelSignature GenerateSequenceXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"out"}; + return KernelSignature("generate_sequence_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LayerNormActXpuOpArgumentMapping: + +return KernelSignature("layer_norm_act_xpu", {"x", "scale", "bias"}, {"begin_norm_axis", "epsilon", "act_type", "act_param"}, {"out"}); +****************************************************************** +*/ + +KernelSignature LayerNormActXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "scale", "bias"}; + paddle::small_vector attrs; + attrs.emplace_back("begin_norm_axis"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("act_type"); + attrs.emplace_back("act_param"); + paddle::small_vector outputs {"out"}; + return KernelSignature("layer_norm_act_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MultiEncoderXpuOpArgumentMapping: + +return KernelSignature("multi_encoder_xpu", {"x", "fc_weight", "fc_weight_max", "fc_bias", "ln_scale", "ln_bias", "mask", "seq_lod", "max_seq_len"}, {"layer_num", "norm_before", "hidden_dim", "head_num", "size_per_head", "ffn_hidden_dim_scale", "act_type", "relative_type", "slice_idx"}, {"out", "x_fp16", "out_fp16"}); +****************************************************************** +*/ + +KernelSignature MultiEncoderXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "fc_weight", "fc_weight_max", "fc_bias", "ln_scale", "ln_bias", "mask", "seq_lod", "max_seq_len"}; + paddle::small_vector attrs; + attrs.emplace_back("layer_num"); + attrs.emplace_back("norm_before"); + attrs.emplace_back("hidden_dim"); + attrs.emplace_back("head_num"); + attrs.emplace_back("size_per_head"); + attrs.emplace_back("ffn_hidden_dim_scale"); + attrs.emplace_back("act_type"); + attrs.emplace_back("relative_type"); + attrs.emplace_back("slice_idx"); + paddle::small_vector outputs {"out", "x_fp16", "out_fp16"}; + return KernelSignature("multi_encoder_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MultiheadMatmulOpArgumentMapping: + +return KernelSignature("multihead_matmul", {"Input", "W", "Bias", "BiasQK"}, {"transpose_Q", "transpose_K", "transpose_V", "alpha", "head_number"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MultiheadMatmulOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "W", "Bias", "BiasQK"}; + paddle::small_vector attrs; + attrs.emplace_back("transpose_Q"); + attrs.emplace_back("transpose_K"); + attrs.emplace_back("transpose_V"); + attrs.emplace_back("alpha"); + 
attrs.emplace_back("head_number"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("multihead_matmul", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by QuantizeXpuOpArgumentMapping: + +return KernelSignature("quantize_xpu", {"x"}, {"out_dtype", "scale"}, {"y"}); +****************************************************************** +*/ + +KernelSignature QuantizeXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("out_dtype"); + attrs.emplace_back("scale"); + paddle::small_vector outputs {"y"}; + return KernelSignature("quantize_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SelfDpAttentionOpArgumentMapping: + +return KernelSignature("self_dp_attention", {"X"}, {"alpha", "head_number"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SelfDpAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); + attrs.emplace_back("head_number"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("self_dp_attention", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SkipLayernormOpArgumentMapping: + +return KernelSignature("skip_layernorm", {"X", "Y", "Scale", "Bias"}, {"epsilon", "begin_norm_axis"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SkipLayernormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Scale", "Bias"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + attrs.emplace_back("begin_norm_axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("skip_layernorm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SqueezeExcitationBlockOpArgumentMapping: + +return KernelSignature("squeeze_excitation_block", {"x", "filter", "filter_max", "bias", "branch"}, {"act_type", "act_param", "filter_dims"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SqueezeExcitationBlockOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "filter", "filter_max", "bias", "branch"}; + paddle::small_vector attrs; + attrs.emplace_back("act_type"); + attrs.emplace_back("act_param"); + attrs.emplace_back("filter_dims"); + paddle::small_vector outputs {"out"}; + return KernelSignature("squeeze_excitation_block", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures 
returned by VariableLengthMemoryEfficientAttentionOpArgumentMapping: + +return KernelSignature("variable_length_memory_efficient_attention", {"query", "key", "value", "seq_lens", "kv_seq_lens", "mask"}, {"scale", "causal", "pre_cache_length"}, {"out"}); +****************************************************************** +*/ + +KernelSignature VariableLengthMemoryEfficientAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"query", "key", "value", "seq_lens", "kv_seq_lens", "mask"}; + paddle::small_vector attrs; + attrs.emplace_back("scale"); + attrs.emplace_back("causal"); + attrs.emplace_back("pre_cache_length"); + paddle::small_vector outputs {"out"}; + return KernelSignature("variable_length_memory_efficient_attention", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by YoloBoxXpuOpArgumentMapping: + +return KernelSignature("yolo_box_xpu", {"x", "x_max", "grid", "stride", "anchor_grid"}, {"offset"}, {"out", "out_max"}); +****************************************************************** +*/ + +KernelSignature YoloBoxXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "x_max", "grid", "stride", "anchor_grid"}; + paddle::small_vector attrs; + attrs.emplace_back("offset"); + paddle::small_vector outputs {"out", "out_max"}; + return KernelSignature("yolo_box_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedBiasDropoutResidualLayerNormGradOpArgumentMapping: + +return KernelSignature("fused_bias_dropout_residual_layer_norm_grad", {"Y@GRAD", "X", "Residual", "Bias", "LnScale", "LnBias", "LnMean", "LnVariance", "BiasDropoutResidualOut", "DropoutMaskOut"}, {"dropout_rate", "is_test", "dropout_fix_seed", "dropout_seed", "dropout_implementation", "ln_epsilon"}, {"X@GRAD", "Residual@GRAD", "Bias@GRAD", "LnScale@GRAD", "LnBias@GRAD"}); +****************************************************************** +*/ + +KernelSignature FusedBiasDropoutResidualLayerNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Y@GRAD", "X", "Residual", "Bias", "LnScale", "LnBias", "LnMean", "LnVariance", "BiasDropoutResidualOut", "DropoutMaskOut"}; + paddle::small_vector attrs; + attrs.emplace_back("dropout_rate"); + attrs.emplace_back("is_test"); + attrs.emplace_back("dropout_fix_seed"); + attrs.emplace_back("dropout_seed"); + attrs.emplace_back("dropout_implementation"); + attrs.emplace_back("ln_epsilon"); + paddle::small_vector outputs {"X@GRAD", "Residual@GRAD", "Bias@GRAD", "LnScale@GRAD", "LnBias@GRAD"}; + return KernelSignature("fused_bias_dropout_residual_layer_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedDropoutAddGradOpArgumentMapping: + +return KernelSignature("fused_dropout_add_grad", {"seed_offset", "out@GRAD"}, {"p", "is_test", "mode", "fix_seed"}, {"x@GRAD", "y@GRAD"}); +return KernelSignature("fused_dropout_add_grad", {"seed_offset", "out@GRAD"}, {"PTensor", "is_test", 
"mode", "fix_seed"}, {"x@GRAD", "y@GRAD"}); +****************************************************************** +*/ + +KernelSignature FusedDropoutAddGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"seed_offset", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("PTensor") ? "PTensor" : "p"); + attrs.emplace_back("is_test"); + attrs.emplace_back("mode"); + attrs.emplace_back("fix_seed"); + paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; + return KernelSignature("fused_dropout_add_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FusedRotaryPositionEmbeddingGradOpArgumentMapping: + +return KernelSignature("fused_rotary_position_embedding_grad", {"sin", "cos", "position_ids", "out_q@GRAD", "out_k@GRAD", "out_v@GRAD"}, {"use_neox_rotary_style"}, {"q@GRAD", "k@GRAD", "v@GRAD"}); +****************************************************************** +*/ + +KernelSignature FusedRotaryPositionEmbeddingGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"sin", "cos", "position_ids", "out_q@GRAD", "out_k@GRAD", "out_v@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("use_neox_rotary_style"); + paddle::small_vector outputs {"q@GRAD", "k@GRAD", "v@GRAD"}; + return KernelSignature("fused_rotary_position_embedding_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(add_act_xpu, phi::AddActXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(add_layernorm_xpu, phi::AddLayernormXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(addcmul_xpu, phi::AddcmulXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(block_multihead_attention, phi::BlockMultiheadAttentionOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bn_act_xpu, phi::BnActXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv1d_xpu, phi::Conv1dXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_xpu, phi::Conv2dTransposeXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_xpu, phi::Conv2dXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dequantize_xpu, phi::DequantizeXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(embedding_with_eltwise_add_xpu, phi::EmbeddingWithEltwiseAddXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fast_layernorm_xpu, phi::FastLayernormXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fast_where_xpu, phi::FastWhereXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fc, phi::FcOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fc_xpu, phi::FcXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_bias_act, phi::FusedBiasActOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_bias_dropout_residual_layer_norm, phi::FusedBiasDropoutResidualLayerNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_bias_residual_layernorm, phi::FusedBiasResidualLayernormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_conv2d_add_act, phi::FusedConv2dAddActOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_dconv_drelu_dbn, phi::FusedDconvDreluDbnOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_dropout_add, phi::FusedDropoutAddOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_embedding_eltwise_layernorm, phi::FusedEmbeddingEltwiseLayernormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_fc_elementwise_layernorm, 
phi::FusedFcElementwiseLayernormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_linear_param_grad_add, phi::FusedLinearParamGradAddOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_multi_transformer_int8_xpu, phi::FusedMultiTransformerInt8XpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_multi_transformer_xpu, phi::FusedMultiTransformerXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_rotary_position_embedding, phi::FusedRotaryPositionEmbeddingOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_scale_bias_add_relu, phi::FusedScaleBiasAddReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_scale_bias_relu_conv_bn, phi::FusedScaleBiasReluConvBnOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fusion_gru, phi::FusionGruOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fusion_repeated_fc_relu, phi::FusionRepeatedFcReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fusion_seqconv_eltadd_relu, phi::FusionSeqconvEltaddReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fusion_seqexpand_concat_fc, phi::FusionSeqexpandConcatFcOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fusion_squared_mat_sub, phi::FusionSquaredMatSubOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fusion_transpose_flatten_concat, phi::FusionTransposeFlattenConcatOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(generate_sequence_xpu, phi::GenerateSequenceXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(layer_norm_act_xpu, phi::LayerNormActXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multi_encoder_xpu, phi::MultiEncoderXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multihead_matmul, phi::MultiheadMatmulOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(quantize_xpu, phi::QuantizeXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(self_dp_attention, phi::SelfDpAttentionOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(skip_layernorm, phi::SkipLayernormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(squeeze_excitation_block, phi::SqueezeExcitationBlockOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(variable_length_memory_efficient_attention, phi::VariableLengthMemoryEfficientAttentionOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(yolo_box_xpu, phi::YoloBoxXpuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_bias_dropout_residual_layer_norm_grad, phi::FusedBiasDropoutResidualLayerNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_dropout_add_grad, phi::FusedDropoutAddGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_rotary_position_embedding_grad, phi::FusedRotaryPositionEmbeddingGradOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/generated_sig.cc b/paddle/fluid/operators/ops_signature/generated_sig.cc new file mode 100644 index 0000000000000..b33adeafd0471 --- /dev/null +++ b/paddle/fluid/operators/ops_signature/generated_sig.cc @@ -0,0 +1,9755 @@ +// this file is generated by paddle/phi/op/yaml/generator/generate_op.py, do not edit. 
+#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AbsOpArgumentMapping: + +return KernelSignature("abs", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AbsOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("abs", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AccuracyOpArgumentMapping: + +return KernelSignature("accuracy", {"Out", "Indices", "Label"}, {}, {"Accuracy", "Correct", "Total"}); +****************************************************************** +*/ + +KernelSignature AccuracyOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Indices", "Label"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Accuracy", "Correct", "Total"}; + return KernelSignature("accuracy", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AcosOpArgumentMapping: + +return KernelSignature("acos", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AcosOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("acos", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AcoshOpArgumentMapping: + +return KernelSignature("acosh", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AcoshOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("acosh", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AdagradOpArgumentMapping: + +return KernelSignature("adagrad", {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}, {"epsilon", "multi_precision"}, {"ParamOut", "MomentOut", "MasterParamOut"}); +return KernelSignature("adagrad_dense_param_sparse_grad", {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}, {"epsilon", "multi_precision"}, {"ParamOut", "MomentOut", "MasterParamOut"}); +****************************************************************** +*/ + +KernelSignature AdagradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}; + paddle::small_vector attrs; + 
attrs.emplace_back("epsilon"); + attrs.emplace_back("multi_precision"); + paddle::small_vector outputs {"ParamOut", "MomentOut", "MasterParamOut"}; + if ( ctx.IsDenseTensorInput("Param") && + ctx.IsDenseTensorInput("Grad") && + ctx.IsDenseTensorInput("Moment") && + ctx.IsDenseTensorInput("LearningRate") && + ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { + return KernelSignature("adagrad", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsDenseTensorInput("Param") && + ctx.IsSelectedRowsInput("Grad") && + ctx.IsDenseTensorInput("Moment") && + ctx.IsDenseTensorInput("LearningRate") && + ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { + return KernelSignature("adagrad_dense_param_sparse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AdamaxOpArgumentMapping: + +return KernelSignature("adamax", {"Param", "Grad", "LearningRate", "Moment", "InfNorm", "Beta1Pow", "MasterParam"}, {"beta1", "beta2", "epsilon", "multi_precision"}, {"ParamOut", "MomentOut", "InfNormOut", "MasterParamOut"}); +****************************************************************** +*/ + +KernelSignature AdamaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Param", "Grad", "LearningRate", "Moment", "InfNorm", "Beta1Pow", "MasterParam"}; + paddle::small_vector attrs; + attrs.emplace_back("beta1"); + attrs.emplace_back("beta2"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("multi_precision"); + paddle::small_vector outputs {"ParamOut", "MomentOut", "InfNormOut", "MasterParamOut"}; + return KernelSignature("adamax", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AdamwOpArgumentMapping: + +return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"beta1", "beta2", "epsilon", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"beta1", "beta2", "EpsilonTensor", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"beta1", "Beta2Tensor", "epsilon", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", 
"MasterParam", "SkipUpdate"}, {"beta1", "Beta2Tensor", "EpsilonTensor", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"Beta1Tensor", "beta2", "epsilon", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"Beta1Tensor", "beta2", "EpsilonTensor", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"Beta1Tensor", "Beta2Tensor", "epsilon", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"Beta1Tensor", "Beta2Tensor", "EpsilonTensor", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +****************************************************************** +*/ + +KernelSignature AdamwOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor" : "beta1"); + attrs.emplace_back(ctx.HasInput("Beta2Tensor") ? "Beta2Tensor" : "beta2"); + attrs.emplace_back(ctx.HasInput("EpsilonTensor") ? 
"EpsilonTensor" : "epsilon"); + attrs.emplace_back("lr_ratio"); + attrs.emplace_back("coeff"); + attrs.emplace_back("with_decay"); + attrs.emplace_back("lazy_mode"); + attrs.emplace_back("min_row_size_to_use_multithread"); + attrs.emplace_back("multi_precision"); + attrs.emplace_back("use_global_beta_pow"); + paddle::small_vector outputs {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}; + return KernelSignature("adamw", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AddmmOpArgumentMapping: + +return KernelSignature("addmm", {"Input", "X", "Y"}, {"Beta", "Alpha"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AddmmOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("Beta"); + attrs.emplace_back("Alpha"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("addmm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AffineGridOpArgumentMapping: + +return KernelSignature("affine_grid", {"Theta"}, {"output_shape", "align_corners"}, {"Output"}); +return KernelSignature("affine_grid", {"Theta"}, {"OutputShape", "align_corners"}, {"Output"}); +****************************************************************** +*/ + +KernelSignature AffineGridOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Theta"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("OutputShape") + ? "OutputShape" + : "output_shape"); + + attrs.emplace_back("align_corners"); + paddle::small_vector outputs {"Output"}; + return KernelSignature("affine_grid", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AllcloseOpArgumentMapping: + +return KernelSignature("allclose", {"Input", "Other"}, {"rtol", "atol", "equal_nan"}, {"Out"}); +return KernelSignature("allclose", {"Input", "Other"}, {"rtol", "Atol", "equal_nan"}, {"Out"}); +return KernelSignature("allclose", {"Input", "Other"}, {"Rtol", "atol", "equal_nan"}, {"Out"}); +return KernelSignature("allclose", {"Input", "Other"}, {"Rtol", "Atol", "equal_nan"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AllcloseOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Other"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("Rtol") ? "Rtol" : "rtol"); + attrs.emplace_back(ctx.HasInput("Atol") ? 
"Atol" : "atol"); + attrs.emplace_back("equal_nan"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("allclose", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AngleOpArgumentMapping: + +return KernelSignature("angle", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AngleOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("angle", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ArgmaxOpArgumentMapping: + +return KernelSignature("argmax", {"X"}, {"axis", "keepdims", "flatten", "dtype"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ArgMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("keepdims"); + attrs.emplace_back("flatten"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("argmax", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ArgminOpArgumentMapping: + +return KernelSignature("argmin", {"X"}, {"axis", "keepdims", "flatten", "dtype"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ArgMinOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("keepdims"); + attrs.emplace_back("flatten"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("argmin", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ArgsortOpArgumentMapping: + +return KernelSignature("argsort", {"X"}, {"axis", "descending"}, {"Out", "Indices"}); +****************************************************************** +*/ + +KernelSignature ArgsortOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("descending"); + paddle::small_vector outputs {"Out", "Indices"}; + return KernelSignature("argsort", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AsComplexOpArgumentMapping: + +return KernelSignature("as_complex", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AsComplexOpArgumentMapping(const ArgumentMappingContext& ctx) { + 
paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("as_complex", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AsRealOpArgumentMapping: + +return KernelSignature("as_real", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AsRealOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("as_real", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AsStridedOpArgumentMapping: + +return KernelSignature("as_strided", {"input"}, {"dims", "stride", "offset"}, {"out"}); +****************************************************************** +*/ + +KernelSignature AsStridedOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input"}; + paddle::small_vector attrs; + attrs.emplace_back("dims"); + attrs.emplace_back("stride"); + attrs.emplace_back("offset"); + paddle::small_vector outputs {"out"}; + return KernelSignature("as_strided", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AsinOpArgumentMapping: + +return KernelSignature("asin", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AsinOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("asin", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AsinhOpArgumentMapping: + +return KernelSignature("asinh", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AsinhOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("asinh", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AtanOpArgumentMapping: + +return KernelSignature("atan", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AtanOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("atan", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 
'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Atan2OpArgumentMapping: + +return KernelSignature("atan2", {"X1", "X2"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature Atan2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X1", "X2"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("atan2", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AtanhOpArgumentMapping: + +return KernelSignature("atanh", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature AtanhOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("atanh", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AucOpArgumentMapping: + +return KernelSignature("auc", {"Predict", "Label", "StatPos", "StatNeg", "InsTagWeight"}, {"curve", "num_thresholds", "slide_steps"}, {"AUC", "StatPosOut", "StatNegOut"}); +****************************************************************** +*/ + +KernelSignature AucOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Predict", "Label", "StatPos", "StatNeg", "InsTagWeight"}; + paddle::small_vector attrs; + attrs.emplace_back("curve"); + attrs.emplace_back("num_thresholds"); + attrs.emplace_back("slide_steps"); + paddle::small_vector outputs {"AUC", "StatPosOut", "StatNegOut"}; + return KernelSignature("auc", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AverageAccumulatesOpArgumentMapping: + +return KernelSignature("average_accumulates", {"param", "in_sum_1", "in_sum_2", "in_sum_3", "in_num_accumulates", "in_old_num_accumulates", "in_num_updates"}, {"average_window", "max_average_window", "min_average_window"}, {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", "out_old_num_accumulates", "out_num_updates"}); +****************************************************************** +*/ + +KernelSignature AverageAccumulatesOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"param", "in_sum_1", "in_sum_2", "in_sum_3", "in_num_accumulates", "in_old_num_accumulates", "in_num_updates"}; + paddle::small_vector attrs; + attrs.emplace_back("average_window"); + attrs.emplace_back("max_average_window"); + attrs.emplace_back("min_average_window"); + paddle::small_vector outputs {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", "out_old_num_accumulates", "out_num_updates"}; + return KernelSignature("average_accumulates", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BceLossOpArgumentMapping: + 
+return KernelSignature("bce_loss", {"X", "Label"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BceLossOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Label"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("bce_loss", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BernoulliOpArgumentMapping: + +return KernelSignature("bernoulli", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BernoulliOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("bernoulli", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BicubicInterpOpArgumentMapping: + +return KernelSignature("bicubic_interp", {"X", "OutSize", "SizeTensor", "Scale"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BicubicInterpV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale"}; + paddle::small_vector attrs; + attrs.emplace_back("data_layout"); + attrs.emplace_back("out_d"); + attrs.emplace_back("out_h"); + attrs.emplace_back("out_w"); + attrs.emplace_back("scale"); + attrs.emplace_back("interp_method"); + attrs.emplace_back("align_corners"); + attrs.emplace_back("align_mode"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("bicubic_interp", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BilinearOpArgumentMapping: + +return KernelSignature("bilinear", {"X", "Y", "Weight", "Bias"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BilinearTensorProductOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Weight", "Bias"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("bilinear", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BilinearInterpOpArgumentMapping: + +return KernelSignature("bilinear_interp", {"X", "OutSize", "SizeTensor", "Scale"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BilinearInterpV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale"}; + paddle::small_vector attrs; + 
attrs.emplace_back("data_layout"); + attrs.emplace_back("out_d"); + attrs.emplace_back("out_h"); + attrs.emplace_back("out_w"); + attrs.emplace_back("scale"); + attrs.emplace_back("interp_method"); + attrs.emplace_back("align_corners"); + attrs.emplace_back("align_mode"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("bilinear_interp", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BincountOpArgumentMapping: + +return KernelSignature("bincount", {"X", "Weights"}, {"minlength"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BincountOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Weights"}; + paddle::small_vector attrs; + attrs.emplace_back("minlength"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("bincount", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BitwiseAndOpArgumentMapping: + +return KernelSignature("bitwise_and", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BitwiseAndOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("bitwise_and", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BitwiseNotOpArgumentMapping: + +return KernelSignature("bitwise_not", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BitwiseNotOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("bitwise_not", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BitwiseOrOpArgumentMapping: + +return KernelSignature("bitwise_or", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BitwiseOrOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("bitwise_or", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BitwiseXorOpArgumentMapping: + +return KernelSignature("bitwise_xor", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BitwiseXorOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector 
attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("bitwise_xor", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BmmOpArgumentMapping: + +return KernelSignature("bmm", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BmmOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("bmm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BoxCoderOpArgumentMapping: + +return KernelSignature("box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}, {"code_type", "box_normalized", "axis", "variance"}, {"OutputBox"}); +****************************************************************** +*/ + +KernelSignature BoxCoderOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"PriorBox", "PriorBoxVar", "TargetBox"}; + paddle::small_vector attrs; + attrs.emplace_back("code_type"); + attrs.emplace_back("box_normalized"); + attrs.emplace_back("axis"); + attrs.emplace_back("variance"); + paddle::small_vector outputs {"OutputBox"}; + return KernelSignature("box_coder", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BroadcastTensorsOpArgumentMapping: + +return KernelSignature("broadcast_tensors", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature BroadcastTensorsOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("broadcast_tensors", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CeilOpArgumentMapping: + +return KernelSignature("ceil", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature CeilOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("ceil", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CeluOpArgumentMapping: + +return KernelSignature("celu", {"X"}, {"alpha"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature CeluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("celu", 
std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CheckFiniteAndUnscaleOpArgumentMapping: + +return KernelSignature("check_finite_and_unscale", {"X", "Scale"}, {}, {"Out", "FoundInfinite"}); +****************************************************************** +*/ + +KernelSignature CheckFiniteAndUnscaleOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Scale"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out", "FoundInfinite"}; + return KernelSignature("check_finite_and_unscale", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CheckNumericsOpArgumentMapping: + +return KernelSignature("check_numerics", {"tensor"}, {"op_type", "var_name", "check_nan_inf_level", "stack_height_limit", "output_dir"}, {"stats", "values"}); +****************************************************************** +*/ + +KernelSignature CheckNumericsOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"tensor"}; + paddle::small_vector attrs; + attrs.emplace_back("op_type"); + attrs.emplace_back("var_name"); + attrs.emplace_back("check_nan_inf_level"); + attrs.emplace_back("stack_height_limit"); + attrs.emplace_back("output_dir"); + paddle::small_vector outputs {"stats", "values"}; + return KernelSignature("check_numerics", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CholeskyOpArgumentMapping: + +return KernelSignature("cholesky", {"X"}, {"upper"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature CholeskyOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("upper"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("cholesky", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CholeskySolveOpArgumentMapping: + +return KernelSignature("cholesky_solve", {"X", "Y"}, {"upper"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature CholeskySolveOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("upper"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("cholesky_solve", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ClassCenterSampleOpArgumentMapping: + +return KernelSignature("class_center_sample", {"Label"}, {"num_classes", "num_samples", "ring_id", "rank", "nranks", "fix_seed", "seed"}, {"RemappedLabel", "SampledLocalClassCenter"}); 
+****************************************************************** +*/ + +KernelSignature ClassCenterSampleOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Label"}; + paddle::small_vector attrs; + attrs.emplace_back("num_classes"); + attrs.emplace_back("num_samples"); + attrs.emplace_back("ring_id"); + attrs.emplace_back("rank"); + attrs.emplace_back("nranks"); + attrs.emplace_back("fix_seed"); + attrs.emplace_back("seed"); + paddle::small_vector outputs {"RemappedLabel", "SampledLocalClassCenter"}; + return KernelSignature("class_center_sample", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ClipOpArgumentMapping: + +return KernelSignature("clip", {"X"}, {"min", "max"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ClipOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("Min") ? "Min" : "min"); + attrs.emplace_back(ctx.HasInput("Max") ? "Max" : "max"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("clip", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ClipByNormOpArgumentMapping: + +return KernelSignature("clip_by_norm", {"X"}, {"max_norm"}, {"Out"}); +return KernelSignature("clip_by_norm_sr", {"X"}, {"max_norm"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ClipByNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("max_norm"); + paddle::small_vector outputs {"Out"}; + if ( ctx.IsDenseTensorInput("X")) { + return KernelSignature("clip_by_norm", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsSelectedRowsInput("X")) { + return KernelSignature("clip_by_norm_sr", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CoalesceTensorOpArgumentMapping: + +return KernelSignature("coalesce_tensor", {"Input"}, {"dtype", "copy_data", "set_constant", "persist_output", "constant", "use_align", "align_size", "user_defined_size_of_dtype", "concated_shapes", "concated_ranks"}, {"Output", "FusedOutput"}); +****************************************************************** +*/ + +KernelSignature CoalesceTensorOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input"}; + paddle::small_vector attrs; + attrs.emplace_back("dtype"); + attrs.emplace_back("copy_data"); + attrs.emplace_back("set_constant"); + attrs.emplace_back("persist_output"); + attrs.emplace_back("constant"); + attrs.emplace_back("use_align"); + attrs.emplace_back("align_size"); + attrs.emplace_back("user_defined_size_of_dtype"); + attrs.emplace_back("concated_shapes"); + attrs.emplace_back("concated_ranks"); + paddle::small_vector outputs {"Output", "FusedOutput"}; + 
return KernelSignature("coalesce_tensor", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ComplexOpArgumentMapping: + +return KernelSignature("complex", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ComplexOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("complex", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ConcatOpArgumentMapping: + +return KernelSignature("concat", {"X"}, {"axis"}, {"Out"}); +return KernelSignature("concat", {"X"}, {"AxisTensor"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ConcatOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("AxisTensor") ? "AxisTensor" : "axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("concat", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ConjOpArgumentMapping: + +return KernelSignature("conj", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ConjOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("conj", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv2dOpArgumentMapping: + +return KernelSignature("conv2d", {"Input", "Filter"}, {"strides", "paddings", "padding_algorithm", "dilations", "groups", "data_format"}, {"Output"}); +****************************************************************** +*/ + +KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("dilations"); + attrs.emplace_back("groups"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Output"}; + return KernelSignature("conv2d", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv3dOpArgumentMapping: + +return KernelSignature("conv3d", {"Input", "Filter"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); +****************************************************************** +*/ + +KernelSignature 
Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Output"}; + return KernelSignature("conv3d", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv3dTransposeOpArgumentMapping: + +return KernelSignature("conv3d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); +****************************************************************** +*/ + +KernelSignature Conv3dTransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_padding"); + attrs.emplace_back("output_size"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Output"}; + return KernelSignature("conv3d_transpose", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CosOpArgumentMapping: + +return KernelSignature("cos", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature CosOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("cos", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CoshOpArgumentMapping: + +return KernelSignature("cosh", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature CoshOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("cosh", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CropOpArgumentMapping: + +return KernelSignature("crop", {"X"}, {"shape", "offsets"}, {"Out"}); +return KernelSignature("crop", {"X"}, {"shape", "Offsets"}, {"Out"}); +return KernelSignature("crop", {"X"}, {"shape", "OffsetsTensor"}, {"Out"}); +return KernelSignature("crop", {"X"}, {"Shape", "offsets"}, {"Out"}); +return KernelSignature("crop", {"X"}, {"Shape", "Offsets"}, {"Out"}); +return KernelSignature("crop", {"X"}, {"Shape", "OffsetsTensor"}, {"Out"}); +return KernelSignature("crop", {"X"}, {"ShapeTensor", "offsets"}, {"Out"}); +return 
KernelSignature("crop", {"X"}, {"ShapeTensor", "Offsets"}, {"Out"}); +return KernelSignature("crop", {"X"}, {"ShapeTensor", "OffsetsTensor"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature CropTensorOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("Shape") + ? "Shape" + : ctx.InputSize("ShapeTensor") > 0 + ? "ShapeTensor" + : "shape"); + attrs.emplace_back( + ctx.HasInput("Offsets") + ? "Offsets" + : ctx.InputSize("OffsetsTensor") > 0 + ? "OffsetsTensor" + : "offsets"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("crop", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CrossOpArgumentMapping: + +return KernelSignature("cross", {"X", "Y"}, {"dim"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature CrossOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("cross", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CrossEntropyWithSoftmaxOpArgumentMapping: + +return KernelSignature("cross_entropy_with_softmax", {"Logits", "Label"}, {"soft_label", "use_softmax", "numeric_stable_mode", "ignore_index", "axis"}, {"Softmax", "Loss"}); +****************************************************************** +*/ + +KernelSignature SoftmaxWithCrossEntropyOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Logits", "Label"}; + paddle::small_vector attrs; + attrs.emplace_back("soft_label"); + attrs.emplace_back("use_softmax"); + attrs.emplace_back("numeric_stable_mode"); + attrs.emplace_back("ignore_index"); + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Softmax", "Loss"}; + return KernelSignature("cross_entropy_with_softmax", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CummaxOpArgumentMapping: + +return KernelSignature("cummax", {"x"}, {"axis", "dtype"}, {"out", "indices"}); +****************************************************************** +*/ + +KernelSignature CummaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"out", "indices"}; + return KernelSignature("cummax", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CumminOpArgumentMapping: + +return KernelSignature("cummin", {"x"}, {"axis", "dtype"}, {"out", "indices"}); +****************************************************************** +*/ + +KernelSignature 
CumminOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"out", "indices"}; + return KernelSignature("cummin", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CumprodOpArgumentMapping: + +return KernelSignature("cumprod", {"X"}, {"dim"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature CumprodOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("cumprod", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CumsumOpArgumentMapping: + +return KernelSignature("cumsum", {"X"}, {"axis", "flatten", "exclusive", "reverse"}, {"Out"}); +return KernelSignature("cumsum", {"X"}, {"AxisTensor", "flatten", "exclusive", "reverse"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature CumsumOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("flatten"); + attrs.emplace_back("exclusive"); + attrs.emplace_back("reverse"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("cumsum", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DataOpArgumentMapping: + +return KernelSignature("data", {}, {"name", "shape", "dtype", "place"}, {"out"}); +return KernelSignature("data", {}, {"name", "ShapeTensor", "dtype", "place"}, {"out"}); +return KernelSignature("data", {}, {"name", "ShapeTensorList", "dtype", "place"}, {"out"}); +****************************************************************** +*/ + +KernelSignature DataOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {}; + paddle::small_vector attrs; + attrs.emplace_back("name"); + attrs.emplace_back( + ctx.HasInput("ShapeTensor") + ? "ShapeTensor" + : ctx.InputSize("ShapeTensorList") > 0 + ? 
"ShapeTensorList" + : "shape"); + attrs.emplace_back("dtype"); + + paddle::small_vector outputs {"out"}; + return KernelSignature("data", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DepthwiseConv2dOpArgumentMapping: + +return KernelSignature("depthwise_conv2d", {"Input", "Filter"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); +****************************************************************** +*/ + +KernelSignature DepthwiseConv2dOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Output"}; + return KernelSignature("depthwise_conv2d", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DetOpArgumentMapping: + +return KernelSignature("determinant", {"Input"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature DeterminantOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("determinant", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DiagOpArgumentMapping: + +return KernelSignature("diag", {"X"}, {"offset", "padding_value"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature DiagV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("offset"); + attrs.emplace_back("padding_value"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("diag", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DiagEmbedOpArgumentMapping: + +return KernelSignature("diag_embed", {"Input"}, {"offset", "dim1", "dim2"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature DiagEmbedOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input"}; + paddle::small_vector attrs; + attrs.emplace_back("offset"); + attrs.emplace_back("dim1"); + attrs.emplace_back("dim2"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("diag_embed", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DiagonalOpArgumentMapping: + +return KernelSignature("diagonal", 
{"Input"}, {"offset", "axis1", "axis2"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature DiagonalOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input"}; + paddle::small_vector attrs; + attrs.emplace_back("offset"); + attrs.emplace_back("axis1"); + attrs.emplace_back("axis2"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("diagonal", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DigammaOpArgumentMapping: + +return KernelSignature("digamma", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature DigammaOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("digamma", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DirichletOpArgumentMapping: + +return KernelSignature("dirichlet", {"Alpha"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature DirichletOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Alpha"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("dirichlet", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DistOpArgumentMapping: + +return KernelSignature("dist", {"X", "Y"}, {"p"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature DistOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("p"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("dist", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DotOpArgumentMapping: + +return KernelSignature("dot", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature DotOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("dot", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EditDistanceOpArgumentMapping: + +return KernelSignature("edit_distance", {"Hyps", "Refs", "HypsLength", "RefsLength"}, {"normalized"}, {"SequenceNum", "Out"}); +****************************************************************** +*/ + +KernelSignature EditDistanceOpArgumentMapping(const ArgumentMappingContext& ctx) { 
+ paddle::small_vector inputs {"Hyps", "Refs", "HypsLength", "RefsLength"}; + paddle::small_vector attrs; + attrs.emplace_back("normalized"); + paddle::small_vector outputs {"SequenceNum", "Out"}; + return KernelSignature("edit_distance", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EigOpArgumentMapping: + +return KernelSignature("eig", {"X"}, {}, {"Eigenvalues", "Eigenvectors"}); +****************************************************************** +*/ + +KernelSignature EigOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Eigenvalues", "Eigenvectors"}; + return KernelSignature("eig", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EighOpArgumentMapping: + +return KernelSignature("eigh", {"X"}, {"UPLO"}, {"Eigenvalues", "Eigenvectors"}); +****************************************************************** +*/ + +KernelSignature EighOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("UPLO"); + paddle::small_vector outputs {"Eigenvalues", "Eigenvectors"}; + return KernelSignature("eigh", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EigvalsOpArgumentMapping: + +return KernelSignature("eigvals", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature EigvalsOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("eigvals", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EigvalshOpArgumentMapping: + +return KernelSignature("eigvalsh", {"X"}, {"UPLO", "is_test"}, {"Eigenvalues", "Eigenvectors"}); +****************************************************************** +*/ + +KernelSignature EigvalshOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("UPLO"); + attrs.emplace_back("is_test"); + paddle::small_vector outputs {"Eigenvalues", "Eigenvectors"}; + return KernelSignature("eigvalsh", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EluOpArgumentMapping: + +return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); 
+ paddle::small_vector outputs {"Out"}; + return KernelSignature("elu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EqualAllOpArgumentMapping: + +return KernelSignature("equal_all", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature EqualAllOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("equal_all", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ErfOpArgumentMapping: + +return KernelSignature("erf", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ErfOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("erf", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ErfinvOpArgumentMapping: + +return KernelSignature("erfinv", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ErfinvOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("erfinv", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ExpOpArgumentMapping: + +return KernelSignature("exp", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ExpOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("exp", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ExpandAsOpArgumentMapping: + +return KernelSignature("expand_as", {"X", "Y"}, {"target_shape"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ExpandAsV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("target_shape"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("expand_as", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Expm1OpArgumentMapping: + +return 
KernelSignature("expm1", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature Expm1OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("expm1", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FftC2cOpArgumentMapping: + +return KernelSignature("fft_c2c", {"X"}, {"axes", "normalization", "forward"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FftC2cOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axes"); + attrs.emplace_back("normalization"); + attrs.emplace_back("forward"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("fft_c2c", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FftC2rOpArgumentMapping: + +return KernelSignature("fft_c2r", {"X"}, {"axes", "normalization", "forward", "last_dim_size"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FftC2rOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axes"); + attrs.emplace_back("normalization"); + attrs.emplace_back("forward"); + attrs.emplace_back("last_dim_size"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("fft_c2r", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FftR2cOpArgumentMapping: + +return KernelSignature("fft_r2c", {"X"}, {"axes", "normalization", "forward", "onesided"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FftR2cOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axes"); + attrs.emplace_back("normalization"); + attrs.emplace_back("forward"); + attrs.emplace_back("onesided"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("fft_r2c", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FillOpArgumentMapping: + +return KernelSignature("fill", {"X"}, {"value"}, {"Out"}); +return KernelSignature("fill", {"X"}, {"ValueTensor"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FillAnyOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("value"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("fill", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FillDiagonalOpArgumentMapping: + +return KernelSignature("fill_diagonal", {"X"}, {"value", "offset", "wrap"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FillDiagonalOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("value"); + attrs.emplace_back("offset"); + attrs.emplace_back("wrap"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("fill_diagonal", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FillDiagonalTensorOpArgumentMapping: + +return KernelSignature("fill_diagonal_tensor", {"X", "Y"}, {"offset", "dim1", "dim2"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FillDiagonalTensorOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("offset"); + attrs.emplace_back("dim1"); + attrs.emplace_back("dim2"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("fill_diagonal_tensor", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FlashAttnOpArgumentMapping: + +return KernelSignature("flash_attn", {"q", "k", "v", "fixed_seed_offset", "attn_mask"}, {"dropout", "causal", "return_softmax", "is_test", "rng_name"}, {"out", "softmax", "softmax_lse", "seed_offset"}); +****************************************************************** +*/ + +KernelSignature FlashAttnOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"q", "k", "v", "fixed_seed_offset", "attn_mask"}; + paddle::small_vector attrs; + attrs.emplace_back("dropout"); + attrs.emplace_back("causal"); + attrs.emplace_back("return_softmax"); + attrs.emplace_back("is_test"); + attrs.emplace_back("rng_name"); + paddle::small_vector outputs {"out", "softmax", "softmax_lse", "seed_offset"}; + return KernelSignature("flash_attn", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FlashAttnUnpaddedOpArgumentMapping: + +return KernelSignature("flash_attn_unpadded", {"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k", "fixed_seed_offset", "attn_mask"}, {"max_seqlen_q", "max_seqlen_k", "scale", "dropout", "causal", "return_softmax", "is_test", "rng_name"}, {"out", "softmax", "softmax_lse", "seed_offset"}); +****************************************************************** +*/ + +KernelSignature FlashAttnUnpaddedOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k", "fixed_seed_offset", "attn_mask"}; + paddle::small_vector attrs; + attrs.emplace_back("max_seqlen_q"); + attrs.emplace_back("max_seqlen_k"); + attrs.emplace_back("scale"); + attrs.emplace_back("dropout"); + 
attrs.emplace_back("causal"); + attrs.emplace_back("return_softmax"); + attrs.emplace_back("is_test"); + attrs.emplace_back("rng_name"); + paddle::small_vector outputs {"out", "softmax", "softmax_lse", "seed_offset"}; + return KernelSignature("flash_attn_unpadded", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FlipOpArgumentMapping: + +return KernelSignature("flip", {"X"}, {"axis"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FlipOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("flip", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FloorOpArgumentMapping: + +return KernelSignature("floor", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FloorOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("floor", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FoldOpArgumentMapping: + +return KernelSignature("fold", {"X"}, {"output_sizes", "kernel_sizes", "strides", "paddings", "dilations"}, {"Y"}); +****************************************************************** +*/ + +KernelSignature FoldOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("output_sizes"); + attrs.emplace_back("kernel_sizes"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + paddle::small_vector outputs {"Y"}; + return KernelSignature("fold", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FrameOpArgumentMapping: + +return KernelSignature("frame", {"X"}, {"frame_length", "hop_length", "axis"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FrameOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("frame_length"); + attrs.emplace_back("hop_length"); + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("frame", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FullIntArrayOpArgumentMapping: + +return KernelSignature("full_int_array", {}, {"value", "dtype", "place"}, {"out"}); 
+******************************************************************
+*/
+
+KernelSignature FullIntArrayOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {};
+  paddle::small_vector attrs;
+  attrs.emplace_back("value");
+  attrs.emplace_back("dtype");
+
+  paddle::small_vector outputs {"out"};
+  return KernelSignature("full_int_array", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by GatherOpArgumentMapping:
+
+return KernelSignature("gather", {"X", "Index"}, {"axis"}, {"Out"});
+return KernelSignature("gather", {"X", "Index"}, {"Axis"}, {"Out"});
+******************************************************************
+*/
+
+KernelSignature GatherOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Index"};
+  paddle::small_vector attrs;
+  attrs.emplace_back(ctx.HasInput("Axis") ? "Axis" : "axis");
+  paddle::small_vector outputs {"Out"};
+  return KernelSignature("gather", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by GatherNdOpArgumentMapping:
+
+return KernelSignature("gather_nd", {"X", "Index"}, {}, {"Out"});
+******************************************************************
+*/
+
+KernelSignature GatherNdOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Index"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"Out"};
+  return KernelSignature("gather_nd", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by GatherTreeOpArgumentMapping:
+
+return KernelSignature("gather_tree", {"Ids", "Parents"}, {}, {"Out"});
+******************************************************************
+*/
+
+KernelSignature GatherTreeOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"Ids", "Parents"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"Out"};
+  return KernelSignature("gather_tree", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by GaussianInplaceOpArgumentMapping:
+
+return KernelSignature("gaussian_inplace", {"x"}, {"mean", "std", "seed"}, {"out"});
+******************************************************************
+*/
+
+KernelSignature GaussianInplaceOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"x"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("mean");
+  attrs.emplace_back("std");
+  attrs.emplace_back("seed");
+  paddle::small_vector outputs {"out"};
+  return KernelSignature("gaussian_inplace", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by GeluOpArgumentMapping:
+
+return KernelSignature("gelu", {"X"}, {"approximate"}, {"Out"});
+******************************************************************
+*/
+
+KernelSignature GeluOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("approximate");
+  paddle::small_vector outputs {"Out"};
+  return KernelSignature("gelu", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by GenerateProposalsOpArgumentMapping:
+
+return KernelSignature("generate_proposals", {"Scores", "BboxDeltas", "ImShape", "Anchors", "Variances"}, {"pre_nms_topN", "post_nms_topN", "nms_thresh", "min_size", "eta", "pixel_offset"}, {"RpnRois", "RpnRoiProbs", "RpnRoisNum"});
+******************************************************************
+*/
+
+KernelSignature GenerateProposalsV2OpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"Scores", "BboxDeltas", "ImShape", "Anchors", "Variances"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("pre_nms_topN");
+  attrs.emplace_back("post_nms_topN");
+  attrs.emplace_back("nms_thresh");
+  attrs.emplace_back("min_size");
+  attrs.emplace_back("eta");
+  attrs.emplace_back("pixel_offset");
+  paddle::small_vector outputs {"RpnRois", "RpnRoiProbs", "RpnRoisNum"};
+  return KernelSignature("generate_proposals", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by GridSampleOpArgumentMapping:
+
+return KernelSignature("grid_sample", {"X", "Grid"}, {"mode", "padding_mode", "align_corners"}, {"Output"});
+******************************************************************
+*/
+
+KernelSignature GridSamplerOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Grid"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("mode");
+  attrs.emplace_back("padding_mode");
+  attrs.emplace_back("align_corners");
+  paddle::small_vector outputs {"Output"};
+  return KernelSignature("grid_sample", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by GroupNormOpArgumentMapping:
+
+return KernelSignature("group_norm", {"X", "Scale", "Bias"}, {"epsilon", "groups", "data_layout"}, {"Y", "Mean", "Variance"});
+******************************************************************
+*/
+
+KernelSignature GroupNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Scale", "Bias"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("epsilon");
+  attrs.emplace_back("groups");
+  attrs.emplace_back("data_layout");
+  paddle::small_vector outputs {"Y", "Mean", "Variance"};
+  return KernelSignature("group_norm", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by GumbelSoftmaxOpArgumentMapping:
+
+return KernelSignature("gumbel_softmax", {"X"}, {"temperature", "hard", "axis"}, {"Out"});
+******************************************************************
+*/
+
+KernelSignature GumbelSoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("temperature");
+  attrs.emplace_back("hard");
+  attrs.emplace_back("axis");
+  paddle::small_vector outputs {"Out"};
+  return KernelSignature("gumbel_softmax", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by HardshrinkOpArgumentMapping:
+
+return KernelSignature("hard_shrink", {"X"}, {"threshold"}, {"Out"});
+******************************************************************
+*/
+
+KernelSignature HardShrinkOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("threshold");
+  paddle::small_vector outputs {"Out"};
+  return KernelSignature("hard_shrink", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by HardsigmoidOpArgumentMapping:
+
+return KernelSignature("hardsigmoid", {"X"}, {"slope", "offset"}, {"Out"});
+******************************************************************
+*/
+
+KernelSignature HardSigmoidOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("slope");
+  attrs.emplace_back("offset");
+  paddle::small_vector outputs {"Out"};
+  return KernelSignature("hardsigmoid", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by HardtanhOpArgumentMapping:
+
+return KernelSignature("hardtanh", {"X"}, {"t_min", "t_max"}, {"Out"});
+******************************************************************
+*/
+
+KernelSignature BreluOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("t_min");
+  attrs.emplace_back("t_max");
+  paddle::small_vector outputs {"Out"};
+  return KernelSignature("hardtanh", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by HeavisideOpArgumentMapping:
+
+return KernelSignature("heaviside", {"X", "Y"}, {}, {"Out"});
+******************************************************************
+*/
+
+KernelSignature ElementwiseHeavisideOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Y"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"Out"};
+  return KernelSignature("heaviside", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by HistogramOpArgumentMapping:
+
+return KernelSignature("histogram", {"X"}, {"bins", "min", "max"}, {"Out"});
+****************************************************************** +*/ + +KernelSignature HistogramOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("bins"); + attrs.emplace_back("min"); + attrs.emplace_back("max"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("histogram", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by HuberLossOpArgumentMapping: + +return KernelSignature("huber_loss", {"X", "Y"}, {"delta"}, {"Out", "Residual"}); +****************************************************************** +*/ + +KernelSignature HuberLossOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("delta"); + paddle::small_vector outputs {"Out", "Residual"}; + return KernelSignature("huber_loss", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by I0OpArgumentMapping: + +return KernelSignature("i0", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature I0OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + return KernelSignature("i0", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by I0eOpArgumentMapping: + +return KernelSignature("i0e", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature I0eOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + return KernelSignature("i0e", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by I1OpArgumentMapping: + +return KernelSignature("i1", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature I1OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + return KernelSignature("i1", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by I1eOpArgumentMapping: + +return KernelSignature("i1e", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature I1eOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + return KernelSignature("i1e", std::move(inputs), 
std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ImagOpArgumentMapping: + +return KernelSignature("imag", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ImagOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("imag", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IndexAddOpArgumentMapping: + +return KernelSignature("index_add", {"X", "Index", "AddValue"}, {"axis"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature IndexAddOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Index", "AddValue"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("index_add", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IndexPutOpArgumentMapping: + +return KernelSignature("index_put", {"x", "indices", "value"}, {"accumulate"}, {"out"}); +****************************************************************** +*/ + +KernelSignature IndexPutOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "indices", "value"}; + paddle::small_vector attrs; + attrs.emplace_back("accumulate"); + paddle::small_vector outputs {"out"}; + return KernelSignature("index_put", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IndexSampleOpArgumentMapping: + +return KernelSignature("index_sample", {"X", "Index"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature IndexSampleOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Index"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("index_sample", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IndexSelectOpArgumentMapping: + +return KernelSignature("index_select", {"X", "Index"}, {"dim"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature IndexSelectOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Index"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("index_select", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 
'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IndexSelectStridedOpArgumentMapping: + +return KernelSignature("index_select_strided", {"x"}, {"index", "axis"}, {"out"}); +****************************************************************** +*/ + +KernelSignature IndexSelectStridedOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("index"); + attrs.emplace_back("axis"); + paddle::small_vector outputs {"out"}; + return KernelSignature("index_select_strided", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by InstanceNormOpArgumentMapping: + +return KernelSignature("instance_norm", {"X", "Scale", "Bias"}, {"epsilon"}, {"Y", "SavedMean", "SavedVariance"}); +****************************************************************** +*/ + +KernelSignature InstanceNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Scale", "Bias"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + paddle::small_vector outputs {"Y", "SavedMean", "SavedVariance"}; + return KernelSignature("instance_norm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by InverseOpArgumentMapping: + +return KernelSignature("inverse", {"Input"}, {}, {"Output"}); +****************************************************************** +*/ + +KernelSignature InverseOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Output"}; + return KernelSignature("inverse", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IsEmptyOpArgumentMapping: + +return KernelSignature("is_empty", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature IsEmptyOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("is_empty", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IscloseOpArgumentMapping: + +return KernelSignature("isclose", {"Input", "Other"}, {"rtol", "atol", "equal_nan"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature IscloseOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Other"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("Rtol") ? "Rtol" : "rtol"); + attrs.emplace_back(ctx.HasInput("Atol") ? 
"Atol" : "atol"); + attrs.emplace_back("equal_nan"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("isclose", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IsfiniteOpArgumentMapping: + +return KernelSignature("isfinite", {"X"}, {}, {"Out"}); +return KernelSignature("isfinite_sr", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature IsfiniteV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + if ( ctx.IsDenseTensorInput("X")) { + return KernelSignature("isfinite", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsSelectedRowsInput("X")) { + return KernelSignature("isfinite_sr", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IsinfOpArgumentMapping: + +return KernelSignature("isinf", {"X"}, {}, {"Out"}); +return KernelSignature("isinf_sr", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature IsinfV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + if ( ctx.IsDenseTensorInput("X")) { + return KernelSignature("isinf", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsSelectedRowsInput("X")) { + return KernelSignature("isinf_sr", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IsnanOpArgumentMapping: + +return KernelSignature("isnan", {"X"}, {}, {"Out"}); +return KernelSignature("isnan_sr", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature IsnanV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + if ( ctx.IsDenseTensorInput("X")) { + return KernelSignature("isnan", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsSelectedRowsInput("X")) { + return KernelSignature("isnan_sr", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by KldivLossOpArgumentMapping: + +return KernelSignature("kldiv_loss", {"X", "Target"}, {"reduction"}, {"Loss"}); +****************************************************************** +*/ + +KernelSignature KldivLossOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Target"}; + paddle::small_vector attrs; + attrs.emplace_back("reduction"); + 
paddle::small_vector outputs {"Loss"}; + return KernelSignature("kldiv_loss", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by KronOpArgumentMapping: + +return KernelSignature("kron", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature KronOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("kron", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by KthvalueOpArgumentMapping: + +return KernelSignature("kthvalue", {"X"}, {"k", "axis", "keepdim"}, {"Out", "Indices"}); +****************************************************************** +*/ + +KernelSignature KthvalueOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("k"); + attrs.emplace_back("axis"); + attrs.emplace_back("keepdim"); + paddle::small_vector outputs {"Out", "Indices"}; + return KernelSignature("kthvalue", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LabelSmoothOpArgumentMapping: + +return KernelSignature("label_smooth", {"X", "PriorDist"}, {"epsilon"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LabelSmoothOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "PriorDist"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("label_smooth", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LambOpArgumentMapping: + +return KernelSignature("lamb", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"weight_decay", "beta1", "beta2", "epsilon", "always_adapt", "multi_precision"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("lamb_sr", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"weight_decay", "beta1", "beta2", "epsilon", "always_adapt", "multi_precision"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +****************************************************************** +*/ + +KernelSignature LambOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}; + paddle::small_vector attrs; + attrs.emplace_back("weight_decay"); + attrs.emplace_back("beta1"); + attrs.emplace_back("beta2"); + attrs.emplace_back("epsilon"); + 
attrs.emplace_back("always_adapt"); + attrs.emplace_back("multi_precision"); + paddle::small_vector outputs {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}; + if ( ctx.IsDenseTensorInput("Param") && + ctx.IsDenseTensorInput("Grad") && + ctx.IsDenseTensorInput("LearningRate") && + ctx.IsDenseTensorInput("Moment1") && + ctx.IsDenseTensorInput("Moment2") && + ctx.IsDenseTensorInput("Beta1Pow") && + ctx.IsDenseTensorInput("Beta2Pow") && + ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam"))) && + ((ctx.HasInput("SkipUpdate") && ctx.IsDenseTensorInput("SkipUpdate")) || (!ctx.HasInput("SkipUpdate")))) { + return KernelSignature("lamb", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsDenseTensorInput("Param") && + ctx.IsSelectedRowsInput("Grad") && + ctx.IsDenseTensorInput("LearningRate") && + ctx.IsDenseTensorInput("Moment1") && + ctx.IsDenseTensorInput("Moment2") && + ctx.IsDenseTensorInput("Beta1Pow") && + ctx.IsDenseTensorInput("Beta2Pow") && + ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam"))) && + ((ctx.HasInput("SkipUpdate") && ctx.IsDenseTensorInput("SkipUpdate")) || (!ctx.HasInput("SkipUpdate")))) { + return KernelSignature("lamb_sr", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LayerNormOpArgumentMapping: + +return KernelSignature("layer_norm", {"X", "Scale", "Bias"}, {"epsilon", "begin_norm_axis"}, {"Y", "Mean", "Variance"}); +****************************************************************** +*/ + +KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Scale", "Bias"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + attrs.emplace_back("begin_norm_axis"); + paddle::small_vector outputs {"Y", "Mean", "Variance"}; + return KernelSignature("layer_norm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LeakyReluOpArgumentMapping: + +return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("leaky_relu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LerpOpArgumentMapping: + +return KernelSignature("lerp", {"X", "Y", "Weight"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LerpOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Weight"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("lerp", std::move(inputs), 
std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LgammaOpArgumentMapping: + +return KernelSignature("lgamma", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LgammaOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("lgamma", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LinearInterpOpArgumentMapping: + +return KernelSignature("linear_interp", {"X", "OutSize", "SizeTensor", "Scale"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LinearInterpV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale"}; + paddle::small_vector attrs; + attrs.emplace_back("data_layout"); + attrs.emplace_back("out_d"); + attrs.emplace_back("out_h"); + attrs.emplace_back("out_w"); + attrs.emplace_back("scale"); + attrs.emplace_back("interp_method"); + attrs.emplace_back("align_corners"); + attrs.emplace_back("align_mode"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("linear_interp", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LlmInt8LinearOpArgumentMapping: + +return KernelSignature("llm_int8_linear", {"x", "weight", "bias", "weight_scale"}, {"threshold"}, {"out"}); +****************************************************************** +*/ + +KernelSignature LlmInt8LinearOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "weight", "bias", "weight_scale"}; + paddle::small_vector attrs; + attrs.emplace_back("threshold"); + paddle::small_vector outputs {"out"}; + return KernelSignature("llm_int8_linear", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogOpArgumentMapping: + +return KernelSignature("log", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LogOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("log", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Log10OpArgumentMapping: + +return KernelSignature("log10", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature Log10OpArgumentMapping(const ArgumentMappingContext& ctx) { + 
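+  /* log10 follows the simplest pattern in this file: one dense input "X", no
+     attributes, one output "Out". Each mapping function here only builds the
+     three name vectors and moves them into a KernelSignature; hooking that
+     signature up to the op happens elsewhere (presumably via the usual
+     PD_REGISTER_ARG_MAPPING_FN(log10, ...) registration, not shown in this hunk). */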
paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("log10", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Log1pOpArgumentMapping: + +return KernelSignature("log1p", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature Log1pOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("log1p", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Log2OpArgumentMapping: + +return KernelSignature("log2", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature Log2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("log2", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogLossOpArgumentMapping: + +return KernelSignature("log_loss", {"Predicted", "Labels"}, {"epsilon"}, {"Loss"}); +****************************************************************** +*/ + +KernelSignature LogLossOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Predicted", "Labels"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + paddle::small_vector outputs {"Loss"}; + return KernelSignature("log_loss", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogSoftmaxOpArgumentMapping: + +return KernelSignature("log_softmax", {"X"}, {"axis"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LogSoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("log_softmax", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogcumsumexpOpArgumentMapping: + +return KernelSignature("logcumsumexp", {"X"}, {"axis", "flatten", "exclusive", "reverse"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LogcumsumexpOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("flatten"); + attrs.emplace_back("exclusive"); + attrs.emplace_back("reverse"); + paddle::small_vector outputs {"Out"}; + return 
KernelSignature("logcumsumexp", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogicalAndOpArgumentMapping: + +return KernelSignature("logical_and", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LogicalAndOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("logical_and", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogicalNotOpArgumentMapping: + +return KernelSignature("logical_not", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LogicalNotOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("logical_not", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogicalOrOpArgumentMapping: + +return KernelSignature("logical_or", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LogicalOrOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("logical_or", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogicalXorOpArgumentMapping: + +return KernelSignature("logical_xor", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LogicalXorOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("logical_xor", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogitOpArgumentMapping: + +return KernelSignature("logit", {"X"}, {"eps"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LogitOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("eps"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("logit", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogsigmoidOpArgumentMapping: + +return 
KernelSignature("logsigmoid", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LogsigmoidOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("logsigmoid", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LstsqOpArgumentMapping: + +return KernelSignature("lstsq", {"X", "Y"}, {"rcond", "driver"}, {"Solution", "Residuals", "Rank", "SingularValues"}); +return KernelSignature("lstsq", {"X", "Y"}, {"RcondTensor", "driver"}, {"Solution", "Residuals", "Rank", "SingularValues"}); +****************************************************************** +*/ + +KernelSignature LstsqOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("rcond"); + attrs.emplace_back("driver"); + paddle::small_vector outputs {"Solution", "Residuals", "Rank", "SingularValues"}; + return KernelSignature("lstsq", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LuOpArgumentMapping: + +return KernelSignature("lu", {"X"}, {"pivots"}, {"Out", "Pivots", "Infos"}); +****************************************************************** +*/ + +KernelSignature LuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("pivots"); + paddle::small_vector outputs {"Out", "Pivots", "Infos"}; + return KernelSignature("lu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LuUnpackOpArgumentMapping: + +return KernelSignature("lu_unpack", {"X", "Pivots"}, {"unpack_ludata", "unpack_pivots"}, {"Pmat", "L", "U"}); +****************************************************************** +*/ + +KernelSignature LuUnpackOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Pivots"}; + paddle::small_vector attrs; + attrs.emplace_back("unpack_ludata"); + attrs.emplace_back("unpack_pivots"); + paddle::small_vector outputs {"Pmat", "L", "U"}; + return KernelSignature("lu_unpack", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MarginCrossEntropyOpArgumentMapping: + +return KernelSignature("margin_cross_entropy", {"Logits", "Label"}, {"return_softmax", "ring_id", "rank", "nranks", "margin1", "margin2", "margin3", "scale"}, {"Softmax", "Loss"}); +****************************************************************** +*/ + +KernelSignature MarginCrossEntropyOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Logits", "Label"}; + paddle::small_vector attrs; + attrs.emplace_back("return_softmax"); + attrs.emplace_back("ring_id"); + 
attrs.emplace_back("rank"); + attrs.emplace_back("nranks"); + attrs.emplace_back("margin1"); + attrs.emplace_back("margin2"); + attrs.emplace_back("margin3"); + attrs.emplace_back("scale"); + paddle::small_vector outputs {"Softmax", "Loss"}; + return KernelSignature("margin_cross_entropy", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MaskedMultiheadAttentionOpArgumentMapping: + +return KernelSignature("masked_multihead_attention", {"x", "cache_kv", "bias", "src_mask", "cum_offsets", "sequence_lengths", "rotary_tensor", "beam_cache_offset", "qkv_out_scale", "out_shift", "out_smooth"}, {"seq_len", "rotary_emb_dims", "use_neox_rotary_style", "compute_dtype", "out_scale", "quant_round_type", "quant_max_bound", "quant_min_bound"}, {"out", "cache_kv_out", "beam_cache_offset_out"}); +****************************************************************** +*/ + +KernelSignature MaskedMultiheadAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "cache_kv", "bias", "src_mask", "cum_offsets", "sequence_lengths", "rotary_tensor", "beam_cache_offset", "qkv_out_scale", "out_shift", "out_smooth"}; + paddle::small_vector attrs; + attrs.emplace_back("seq_len"); + attrs.emplace_back("rotary_emb_dims"); + attrs.emplace_back("use_neox_rotary_style"); + attrs.emplace_back("compute_dtype"); + attrs.emplace_back("out_scale"); + attrs.emplace_back("quant_round_type"); + attrs.emplace_back("quant_max_bound"); + attrs.emplace_back("quant_min_bound"); + paddle::small_vector outputs {"out", "cache_kv_out", "beam_cache_offset_out"}; + return KernelSignature("masked_multihead_attention", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MaskedSelectOpArgumentMapping: + +return KernelSignature("masked_select", {"X", "Mask"}, {}, {"Y"}); +****************************************************************** +*/ + +KernelSignature MaskedSelectOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Mask"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Y"}; + return KernelSignature("masked_select", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MatrixNmsOpArgumentMapping: + +return KernelSignature("matrix_nms", {"BBoxes", "Scores"}, {"score_threshold", "nms_top_k", "keep_top_k", "post_threshold", "use_gaussian", "gaussian_sigma", "background_label", "normalized"}, {"Out", "Index", "RoisNum"}); +****************************************************************** +*/ + +KernelSignature MatrixNmsOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"BBoxes", "Scores"}; + paddle::small_vector attrs; + attrs.emplace_back("score_threshold"); + attrs.emplace_back("nms_top_k"); + attrs.emplace_back("keep_top_k"); + attrs.emplace_back("post_threshold"); + attrs.emplace_back("use_gaussian"); + attrs.emplace_back("gaussian_sigma"); + attrs.emplace_back("background_label"); + attrs.emplace_back("normalized"); + 
paddle::small_vector outputs {"Out", "Index", "RoisNum"}; + return KernelSignature("matrix_nms", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MatrixPowerOpArgumentMapping: + +return KernelSignature("matrix_power", {"X"}, {"n"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MatrixPowerOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("n"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("matrix_power", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MaxPool2dWithIndexOpArgumentMapping: + +return KernelSignature("max_pool2d_with_index", {"X"}, {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, {"Out", "Mask"}); +****************************************************************** +*/ + +KernelSignature MaxPool2dWithIndexOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("global_pooling"); + attrs.emplace_back("adaptive"); + paddle::small_vector outputs {"Out", "Mask"}; + return KernelSignature("max_pool2d_with_index", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MaxPool3dWithIndexOpArgumentMapping: + +return KernelSignature("max_pool3d_with_index", {"X"}, {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, {"Out", "Mask"}); +****************************************************************** +*/ + +KernelSignature MaxPool3dWithIndexOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("global_pooling"); + attrs.emplace_back("adaptive"); + paddle::small_vector outputs {"Out", "Mask"}; + return KernelSignature("max_pool3d_with_index", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MaxoutOpArgumentMapping: + +return KernelSignature("maxout", {"X"}, {"groups", "axis"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MaxoutOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("groups"); + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("maxout", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures 
returned by MeanAllOpArgumentMapping: + +return KernelSignature("mean_all", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MeanOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("mean_all", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MemoryEfficientAttentionOpArgumentMapping: + +return KernelSignature("memory_efficient_attention", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "causal_diagonal", "seqlen_k"}, {"max_seqlen_q", "max_seqlen_k", "causal", "dropout_p", "scale", "is_test"}, {"output", "logsumexp", "seed_and_offset"}); +return KernelSignature("memory_efficient_attention", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "causal_diagonal", "seqlen_k"}, {"max_seqlen_q", "MaxSeqlenKTensor", "causal", "dropout_p", "scale", "is_test"}, {"output", "logsumexp", "seed_and_offset"}); +return KernelSignature("memory_efficient_attention", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "causal_diagonal", "seqlen_k"}, {"MaxSeqlenQTensor", "max_seqlen_k", "causal", "dropout_p", "scale", "is_test"}, {"output", "logsumexp", "seed_and_offset"}); +return KernelSignature("memory_efficient_attention", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "causal_diagonal", "seqlen_k"}, {"MaxSeqlenQTensor", "MaxSeqlenKTensor", "causal", "dropout_p", "scale", "is_test"}, {"output", "logsumexp", "seed_and_offset"}); +****************************************************************** +*/ + +KernelSignature MemoryEfficientAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "causal_diagonal", "seqlen_k"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("MaxSeqlenQTensor") ? "MaxSeqlenQTensor" : "max_seqlen_q"); + attrs.emplace_back(ctx.HasInput("MaxSeqlenKTensor") ? 
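+      /* max_seqlen_q and max_seqlen_k are resolved independently: each slot
+         takes the optional "MaxSeqlen*Tensor" input when present, falling back
+         to the scalar attribute, which yields the four variants listed above. */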
"MaxSeqlenKTensor" : "max_seqlen_k"); + attrs.emplace_back("causal"); + attrs.emplace_back("dropout_p"); + attrs.emplace_back("scale"); + attrs.emplace_back("is_test"); + paddle::small_vector outputs {"output", "logsumexp", "seed_and_offset"}; + return KernelSignature("memory_efficient_attention", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MergeSelectedRowsOpArgumentMapping: + +return KernelSignature("merge_selected_rows", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MergeSelectedRowsOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("merge_selected_rows", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MergedAdamOpArgumentMapping: + +return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"beta1", "beta2", "epsilon", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"beta1", "beta2", "EpsilonTensor", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"beta1", "Beta2Tensor", "epsilon", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"beta1", "Beta2Tensor", "EpsilonTensor", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"Beta1Tensor", "beta2", "epsilon", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"Beta1Tensor", "beta2", "EpsilonTensor", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"Beta1Tensor", "Beta2Tensor", "epsilon", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"Beta1Tensor", 
"Beta2Tensor", "EpsilonTensor", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); +****************************************************************** +*/ + +KernelSignature MergedAdamOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}; + paddle::small_vector attrs; + attrs.emplace_back("beta1"); + attrs.emplace_back("beta2"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("multi_precision"); + attrs.emplace_back("use_global_beta_pow"); + paddle::small_vector outputs {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}; + return KernelSignature("merged_adam", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MergedMomentumOpArgumentMapping: + +return KernelSignature("merged_momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, {"mu", "use_nesterov", "regularization_method", "regularization_coeff", "multi_precision", "rescale_grad"}, {"ParamOut", "VelocityOut", "MasterParamOut"}); +****************************************************************** +*/ + +KernelSignature MergedMomentumOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}; + paddle::small_vector attrs; + attrs.emplace_back("mu"); + attrs.emplace_back("use_nesterov"); + attrs.emplace_back("regularization_method"); + attrs.emplace_back("regularization_coeff"); + attrs.emplace_back("multi_precision"); + attrs.emplace_back("rescale_grad"); + paddle::small_vector outputs {"ParamOut", "VelocityOut", "MasterParamOut"}; + return KernelSignature("merged_momentum", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MeshgridOpArgumentMapping: + +return KernelSignature("meshgrid", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MeshgridOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("meshgrid", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ModeOpArgumentMapping: + +return KernelSignature("mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"}); +****************************************************************** +*/ + +KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("keepdim"); + paddle::small_vector outputs {"Out", "Indices"}; + return KernelSignature("mode", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' 
+All possible KernelSignatures returned by MomentumOpArgumentMapping: + +return KernelSignature("momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, {"mu", "use_nesterov", "regularization_method", "regularization_coeff", "multi_precision", "rescale_grad"}, {"ParamOut", "VelocityOut", "MasterParamOut"}); +return KernelSignature("momentum_dense_param_sparse_grad", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, {"mu", "use_nesterov", "regularization_method", "regularization_coeff", "multi_precision", "rescale_grad"}, {"ParamOut", "VelocityOut", "MasterParamOut"}); +****************************************************************** +*/ + +KernelSignature MomentumOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}; + paddle::small_vector attrs; + attrs.emplace_back("mu"); + attrs.emplace_back("use_nesterov"); + attrs.emplace_back("regularization_method"); + attrs.emplace_back("regularization_coeff"); + attrs.emplace_back("multi_precision"); + attrs.emplace_back("rescale_grad"); + paddle::small_vector outputs {"ParamOut", "VelocityOut", "MasterParamOut"}; + if ( ctx.IsDenseTensorInput("Param") && + ctx.IsDenseTensorInput("Grad") && + ctx.IsDenseTensorInput("Velocity") && + ctx.IsDenseTensorInput("LearningRate") && + ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { + return KernelSignature("momentum", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsDenseTensorInput("Param") && + ctx.IsSelectedRowsInput("Grad") && + ctx.IsDenseTensorInput("Velocity") && + ctx.IsDenseTensorInput("LearningRate") && + ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { + return KernelSignature("momentum_dense_param_sparse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MultiDotOpArgumentMapping: + +return KernelSignature("multi_dot", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MultiDotOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("multi_dot", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MulticlassNms3OpArgumentMapping: + +return KernelSignature("multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}, {"score_threshold", "nms_top_k", "keep_top_k", "nms_threshold", "normalized", "nms_eta", "background_label"}, {"Out", "Index", "NmsRoisNum"}); +****************************************************************** +*/ + +KernelSignature MulticlassNms3OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"BBoxes", "Scores", "RoisNum"}; + paddle::small_vector attrs; + attrs.emplace_back("score_threshold"); + attrs.emplace_back("nms_top_k"); + attrs.emplace_back("keep_top_k"); + attrs.emplace_back("nms_threshold"); + attrs.emplace_back("normalized"); + 
attrs.emplace_back("nms_eta"); + attrs.emplace_back("background_label"); + paddle::small_vector outputs {"Out", "Index", "NmsRoisNum"}; + return KernelSignature("multiclass_nms3", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MultinomialOpArgumentMapping: + +return KernelSignature("multinomial", {"X"}, {"num_samples", "replacement"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MultinomialOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("num_samples"); + attrs.emplace_back("replacement"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("multinomial", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MultiplexOpArgumentMapping: + +return KernelSignature("multiplex", {"X", "Ids"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MultiplexOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Ids"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("multiplex", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MvOpArgumentMapping: + +return KernelSignature("mv", {"X", "Vec"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MvOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Vec"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("mv", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NanmedianOpArgumentMapping: + +return KernelSignature("nanmedian", {"X"}, {"axis", "keepdim"}, {"Out", "MedianIndex"}); +return KernelSignature("nanmedian", {"X"}, {"AxisTensorList", "keepdim"}, {"Out", "MedianIndex"}); +****************************************************************** +*/ + +KernelSignature NanmedianOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("AxisTensor") + ? "AxisTensor" + : ctx.InputSize("AxisTensorList") > 0 + ? 
"AxisTensorList" + : "axis"); + attrs.emplace_back("keepdim"); + paddle::small_vector outputs {"Out", "MedianIndex"}; + return KernelSignature("nanmedian", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NearestInterpOpArgumentMapping: + +return KernelSignature("nearest_interp", {"X", "OutSize", "SizeTensor", "Scale"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature NearestInterpV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale"}; + paddle::small_vector attrs; + attrs.emplace_back("data_layout"); + attrs.emplace_back("out_d"); + attrs.emplace_back("out_h"); + attrs.emplace_back("out_w"); + attrs.emplace_back("scale"); + attrs.emplace_back("interp_method"); + attrs.emplace_back("align_corners"); + attrs.emplace_back("align_mode"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("nearest_interp", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NextafterOpArgumentMapping: + +return KernelSignature("nextafter", {"x", "y"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature NextafterOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + return KernelSignature("nextafter", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NllLossOpArgumentMapping: + +return KernelSignature("nll_loss", {"X", "Label", "Weight"}, {"ignore_index", "reduction"}, {"Out", "Total_weight"}); +****************************************************************** +*/ + +KernelSignature NllLossOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Label", "Weight"}; + paddle::small_vector attrs; + attrs.emplace_back("ignore_index"); + attrs.emplace_back("reduction"); + paddle::small_vector outputs {"Out", "Total_weight"}; + return KernelSignature("nll_loss", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NmsOpArgumentMapping: + +return KernelSignature("nms", {"Boxes"}, {"iou_threshold"}, {"KeepBoxesIdxs"}); +****************************************************************** +*/ + +KernelSignature NmsOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Boxes"}; + paddle::small_vector attrs; + attrs.emplace_back("iou_threshold"); + paddle::small_vector outputs {"KeepBoxesIdxs"}; + return KernelSignature("nms", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 
'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NonzeroOpArgumentMapping: + +return KernelSignature("nonzero", {"Condition"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature WhereIndexOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Condition"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("nonzero", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NpuIdentityOpArgumentMapping: + +return KernelSignature("npu_identity", {"x"}, {"format"}, {"out"}); +****************************************************************** +*/ + +KernelSignature NpuIdentityOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("format"); + paddle::small_vector outputs {"out"}; + return KernelSignature("npu_identity", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NumelOpArgumentMapping: + +return KernelSignature("numel", {"Input"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SizeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("numel", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by OverlapAddOpArgumentMapping: + +return KernelSignature("overlap_add", {"X"}, {"hop_length", "axis"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature OverlapAddOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("hop_length"); + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("overlap_add", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PNormOpArgumentMapping: + +return KernelSignature("p_norm", {"X"}, {"porder", "axis", "epsilon", "keepdim", "asvector"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature PNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("porder"); + attrs.emplace_back("axis"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("keepdim"); + attrs.emplace_back("asvector"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("p_norm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All 
possible KernelSignatures returned by Pad3dOpArgumentMapping: + +return KernelSignature("pad3d", {"X"}, {"paddings", "mode", "value", "data_format"}, {"Out"}); +return KernelSignature("pad3d", {"X"}, {"Paddings", "mode", "value", "data_format"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature Pad3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("Paddings") + ? "Paddings" + : "paddings"); + + attrs.emplace_back("mode"); + attrs.emplace_back("value"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("pad3d", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PixelShuffleOpArgumentMapping: + +return KernelSignature("pixel_shuffle", {"X"}, {"upscale_factor", "data_format"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature PixelShuffleOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("upscale_factor"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("pixel_shuffle", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PixelUnshuffleOpArgumentMapping: + +return KernelSignature("pixel_unshuffle", {"X"}, {"downscale_factor", "data_format"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature PixelUnshuffleOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("downscale_factor"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("pixel_unshuffle", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PoissonOpArgumentMapping: + +return KernelSignature("poisson", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature PoissonOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("poisson", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PolygammaOpArgumentMapping: + +return KernelSignature("polygamma", {"x"}, {"n"}, {"out"}); +****************************************************************** +*/ + +KernelSignature PolygammaOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("n"); + paddle::small_vector outputs {"out"}; + return KernelSignature("polygamma", 
std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PowOpArgumentMapping: + +return KernelSignature("pow", {"X"}, {"factor"}, {"Out"}); +return KernelSignature("pow", {"X"}, {"FactorTensor"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature PowOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("FactorTensor") ? "FactorTensor" : "factor"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("pow", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PreluOpArgumentMapping: + +return KernelSignature("prelu", {"X", "Alpha"}, {"data_format", "mode"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature PreluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Alpha"}; + paddle::small_vector attrs; + attrs.emplace_back("data_format"); + attrs.emplace_back("mode"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("prelu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PriorBoxOpArgumentMapping: + +return KernelSignature("prior_box", {"Input", "Image"}, {"min_sizes", "max_sizes", "aspect_ratios", "variances", "flip", "clip", "step_w", "step_h", "offset", "min_max_aspect_ratios_order"}, {"Boxes", "Variances"}); +****************************************************************** +*/ + +KernelSignature PriorBoxOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Image"}; + paddle::small_vector attrs; + attrs.emplace_back("min_sizes"); + attrs.emplace_back("max_sizes"); + attrs.emplace_back("aspect_ratios"); + attrs.emplace_back("variances"); + attrs.emplace_back("flip"); + attrs.emplace_back("clip"); + attrs.emplace_back("step_w"); + attrs.emplace_back("step_h"); + attrs.emplace_back("offset"); + attrs.emplace_back("min_max_aspect_ratios_order"); + paddle::small_vector outputs {"Boxes", "Variances"}; + return KernelSignature("prior_box", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PsroiPoolOpArgumentMapping: + +return KernelSignature("psroi_pool", {"X", "ROIs", "RoisNum"}, {"pooled_height", "pooled_width", "output_channels", "spatial_scale"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature PsroiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "ROIs", "RoisNum"}; + paddle::small_vector attrs; + attrs.emplace_back("pooled_height"); + attrs.emplace_back("pooled_width"); + attrs.emplace_back("output_channels"); + attrs.emplace_back("spatial_scale"); + paddle::small_vector outputs {"Out"}; + return 
KernelSignature("psroi_pool", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PutAlongAxisOpArgumentMapping: + +return KernelSignature("put_along_axis", {"Input", "Index", "Value"}, {"Axis", "Reduce"}, {"Result"}); +****************************************************************** +*/ + +KernelSignature PutAlongAxisOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Index", "Value"}; + paddle::small_vector attrs; + attrs.emplace_back("Axis"); + attrs.emplace_back("Reduce"); + paddle::small_vector outputs {"Result"}; + return KernelSignature("put_along_axis", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by QrOpArgumentMapping: + +return KernelSignature("qr", {"X"}, {"mode"}, {"Q", "R"}); +****************************************************************** +*/ + +KernelSignature QrOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("mode"); + paddle::small_vector outputs {"Q", "R"}; + return KernelSignature("qr", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RealOpArgumentMapping: + +return KernelSignature("real", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature RealOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("real", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ReciprocalOpArgumentMapping: + +return KernelSignature("reciprocal", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ReciprocalOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("reciprocal", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ReindexGraphOpArgumentMapping: + +return KernelSignature("graph_reindex", {"X", "Neighbors", "Count", "HashTable_Value", "HashTable_Index"}, {}, {"Reindex_Src", "Reindex_Dst", "Out_Nodes"}); +****************************************************************** +*/ + +KernelSignature GraphReindexOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Neighbors", "Count", "HashTable_Value", "HashTable_Index"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Reindex_Src", "Reindex_Dst", "Out_Nodes"}; + return KernelSignature("graph_reindex", 
std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ReluOpArgumentMapping: + +return KernelSignature("relu", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("relu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Relu6OpArgumentMapping: + +return KernelSignature("relu6", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature Relu6OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("relu6", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RenormOpArgumentMapping: + +return KernelSignature("renorm", {"X"}, {"p", "axis", "max_norm"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature RenormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("p"); + attrs.emplace_back("axis"); + attrs.emplace_back("max_norm"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("renorm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RmsNormOpArgumentMapping: + +return KernelSignature("rms_norm", {"x", "bias", "residual", "norm_weight", "norm_bias"}, {"epsilon", "begin_norm_axis", "quant_scale", "quant_round_type", "quant_max_bound", "quant_min_bound"}, {"out", "residual_out"}); +****************************************************************** +*/ + +KernelSignature RmsNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "bias", "residual", "norm_weight", "norm_bias"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + attrs.emplace_back("begin_norm_axis"); + attrs.emplace_back("quant_scale"); + attrs.emplace_back("quant_round_type"); + attrs.emplace_back("quant_max_bound"); + attrs.emplace_back("quant_min_bound"); + paddle::small_vector outputs {"out", "residual_out"}; + return KernelSignature("rms_norm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RmspropOpArgumentMapping: + +return KernelSignature("rmsprop", {"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad", "MasterParam"}, {"epsilon", "decay", "momentum", "centered", "multi_precision"}, {"ParamOut", "MomentOut", 
"MeanSquareOut", "MeanGradOut", "MasterParamOut"}); +return KernelSignature("rmsprop_dense_param_sparse_grad", {"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad", "MasterParam"}, {"epsilon", "decay", "momentum", "centered", "multi_precision"}, {"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut", "MasterParamOut"}); +****************************************************************** +*/ + +KernelSignature RmspropOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad", "MasterParam"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + attrs.emplace_back("decay"); + attrs.emplace_back("momentum"); + attrs.emplace_back("centered"); + attrs.emplace_back("multi_precision"); + paddle::small_vector outputs {"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut", "MasterParamOut"}; + if ( ctx.IsDenseTensorInput("Param") && + ctx.IsDenseTensorInput("MeanSquare") && + ctx.IsDenseTensorInput("Grad") && + ctx.IsDenseTensorInput("Moment") && + ctx.IsDenseTensorInput("LearningRate") && + ((ctx.HasInput("MeanGrad") && ctx.IsDenseTensorInput("MeanGrad")) || (!ctx.HasInput("MeanGrad"))) && + ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { + return KernelSignature("rmsprop", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsDenseTensorInput("Param") && + ctx.IsDenseTensorInput("MeanSquare") && + ctx.IsSelectedRowsInput("Grad") && + ctx.IsDenseTensorInput("Moment") && + ctx.IsDenseTensorInput("LearningRate") && + ((ctx.HasInput("MeanGrad") && ctx.IsDenseTensorInput("MeanGrad")) || (!ctx.HasInput("MeanGrad"))) && + ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { + return KernelSignature("rmsprop_dense_param_sparse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RoiAlignOpArgumentMapping: + +return KernelSignature("roi_align", {"X", "ROIs", "RoisNum"}, {"pooled_height", "pooled_width", "spatial_scale", "sampling_ratio", "aligned"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "ROIs", "RoisNum"}; + paddle::small_vector attrs; + attrs.emplace_back("pooled_height"); + attrs.emplace_back("pooled_width"); + attrs.emplace_back("spatial_scale"); + attrs.emplace_back("sampling_ratio"); + attrs.emplace_back("aligned"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("roi_align", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RoiPoolOpArgumentMapping: + +return KernelSignature("roi_pool", {"X", "ROIs", "RoisNum"}, {"pooled_height", "pooled_width", "spatial_scale"}, {"Out", "Argmax"}); +****************************************************************** +*/ + +KernelSignature RoiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "ROIs", "RoisNum"}; + 
paddle::small_vector attrs; + attrs.emplace_back("pooled_height"); + attrs.emplace_back("pooled_width"); + attrs.emplace_back("spatial_scale"); + paddle::small_vector outputs {"Out", "Argmax"}; + return KernelSignature("roi_pool", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RollOpArgumentMapping: + +return KernelSignature("roll", {"X"}, {"shifts", "axis"}, {"Out"}); +return KernelSignature("roll", {"X"}, {"ShiftsTensor", "axis"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("ShiftsTensor") + ? "ShiftsTensor" + : "shifts"); + + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("roll", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RoundOpArgumentMapping: + +return KernelSignature("round", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature RoundOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("round", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RsqrtOpArgumentMapping: + +return KernelSignature("rsqrt", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature RsqrtOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("rsqrt", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ScaleOpArgumentMapping: + +return KernelSignature("scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); +return KernelSignature("scale", {"X"}, {"ScaleTensor", "bias", "bias_after_scale"}, {"Out"}); +return KernelSignature("scale_sr", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); +return KernelSignature("scale_sr", {"X"}, {"ScaleTensor", "bias", "bias_after_scale"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("ScaleTensor") ? 
"ScaleTensor" : "scale"); + attrs.emplace_back("bias"); + attrs.emplace_back("bias_after_scale"); + paddle::small_vector outputs {"Out"}; + if ( ctx.IsDenseTensorInput("X")) { + return KernelSignature("scale", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsSelectedRowsInput("X")) { + return KernelSignature("scale_sr", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ScatterOpArgumentMapping: + +return KernelSignature("scatter", {"X", "Ids", "Updates"}, {"overwrite"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ScatterOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Ids", "Updates"}; + paddle::small_vector attrs; + attrs.emplace_back("overwrite"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("scatter", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ScatterNdAddOpArgumentMapping: + +return KernelSignature("scatter_nd_add", {"X", "Index", "Updates"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ScatterNdAddOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Index", "Updates"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("scatter_nd_add", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SearchsortedOpArgumentMapping: + +return KernelSignature("searchsorted", {"SortedSequence", "Values"}, {"out_int32", "right"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SearchsortedOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"SortedSequence", "Values"}; + paddle::small_vector attrs; + attrs.emplace_back("out_int32"); + attrs.emplace_back("right"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("searchsorted", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SegmentPoolOpArgumentMapping: + +return KernelSignature("segment_pool", {"X", "SegmentIds"}, {"pooltype"}, {"Out", "SummedIds"}); +****************************************************************** +*/ + +KernelSignature SegmentPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "SegmentIds"}; + paddle::small_vector attrs; + attrs.emplace_back("pooltype"); + paddle::small_vector outputs {"Out", "SummedIds"}; + return KernelSignature("segment_pool", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible 
KernelSignatures returned by SeluOpArgumentMapping: + +return KernelSignature("selu", {"X"}, {"scale", "alpha"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SeluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("scale"); + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("selu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SendURecvOpArgumentMapping: + +return KernelSignature("send_u_recv", {"X", "Src_index", "Dst_index"}, {"reduce_op", "out_size"}, {"Out", "Dst_count"}); +return KernelSignature("send_u_recv", {"X", "Src_index", "Dst_index"}, {"reduce_op", "Out_size"}, {"Out", "Dst_count"}); +****************************************************************** +*/ + +KernelSignature GraphSendRecvOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Src_index", "Dst_index"}; + paddle::small_vector attrs; + attrs.emplace_back("reduce_op"); + attrs.emplace_back( + ctx.HasInput("Out_size") + ? "Out_size" + : "out_size"); + + paddle::small_vector outputs {"Out", "Dst_count"}; + return KernelSignature("send_u_recv", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SendUeRecvOpArgumentMapping: + +return KernelSignature("send_ue_recv", {"X", "Y", "Src_index", "Dst_index"}, {"message_op", "reduce_op", "out_size"}, {"Out", "Dst_count"}); +return KernelSignature("send_ue_recv", {"X", "Y", "Src_index", "Dst_index"}, {"message_op", "reduce_op", "Out_size"}, {"Out", "Dst_count"}); +****************************************************************** +*/ + +KernelSignature GraphSendUeRecvOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Src_index", "Dst_index"}; + paddle::small_vector attrs; + attrs.emplace_back("message_op"); + attrs.emplace_back("reduce_op"); + attrs.emplace_back( + ctx.HasInput("Out_size") + ? 
"Out_size" + : "out_size"); + + paddle::small_vector outputs {"Out", "Dst_count"}; + return KernelSignature("send_ue_recv", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SendUvOpArgumentMapping: + +return KernelSignature("send_uv", {"x", "y", "src_index", "dst_index"}, {"message_op"}, {"out"}); +****************************************************************** +*/ + +KernelSignature GraphSendUvOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "src_index", "dst_index"}; + paddle::small_vector attrs; + attrs.emplace_back("message_op"); + paddle::small_vector outputs {"out"}; + return KernelSignature("send_uv", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SgdOpArgumentMapping: + +return KernelSignature("sgd", {"Param", "LearningRate", "Grad", "MasterParam"}, {"multi_precision"}, {"ParamOut", "MasterParamOut"}); +return KernelSignature("sgd_dense_param_sparse_grad", {"Param", "LearningRate", "Grad", "MasterParam"}, {"multi_precision"}, {"ParamOut", "MasterParamOut"}); +return KernelSignature("sgd_sparse_param_sparse_grad", {"Param", "LearningRate", "Grad", "MasterParam"}, {"multi_precision"}, {"ParamOut", "MasterParamOut"}); +****************************************************************** +*/ + +KernelSignature SgdOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Param", "LearningRate", "Grad", "MasterParam"}; + paddle::small_vector attrs; + attrs.emplace_back("multi_precision"); + paddle::small_vector outputs {"ParamOut", "MasterParamOut"}; + if ( ctx.IsDenseTensorInput("Param") && + ctx.IsDenseTensorInput("LearningRate") && + ctx.IsDenseTensorInput("Grad") && + ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { + return KernelSignature("sgd", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsDenseTensorInput("Param") && + ctx.IsDenseTensorInput("LearningRate") && + ctx.IsSelectedRowsInput("Grad") && + ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { + return KernelSignature("sgd_dense_param_sparse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsSelectedRowsInput("Param") && + ctx.IsDenseTensorInput("LearningRate") && + ctx.IsSelectedRowsInput("Grad") && + ((ctx.HasInput("MasterParam") && ctx.IsSelectedRowsInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { + return KernelSignature("sgd_sparse_param_sparse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ShapeOpArgumentMapping: + +return KernelSignature("shape", {"Input"}, {}, {"Out"}); +return KernelSignature("shape_sr", {"Input"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ShapeOpArgumentMapping(const ArgumentMappingContext& ctx) { + 
paddle::small_vector inputs {"Input"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + if ( ctx.IsDenseTensorInput("Input")) { + return KernelSignature("shape", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsSelectedRowsInput("Input")) { + return KernelSignature("shape_sr", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ShardIndexOpArgumentMapping: + +return KernelSignature("shard_index", {"X"}, {"index_num", "nshards", "shard_id", "ignore_value"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ShardIndexOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("index_num"); + attrs.emplace_back("nshards"); + attrs.emplace_back("shard_id"); + attrs.emplace_back("ignore_value"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("shard_index", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SigmoidOpArgumentMapping: + +return KernelSignature("sigmoid", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SigmoidOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("sigmoid", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SigmoidCrossEntropyWithLogitsOpArgumentMapping: + +return KernelSignature("sigmoid_cross_entropy_with_logits", {"X", "Label", "pos_weight"}, {"normalize", "ignore_index"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SigmoidCrossEntropyWithLogitsOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Label", "pos_weight"}; + paddle::small_vector attrs; + attrs.emplace_back("normalize"); + attrs.emplace_back("ignore_index"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("sigmoid_cross_entropy_with_logits", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SignOpArgumentMapping: + +return KernelSignature("sign", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SignOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("sign", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible 
KernelSignatures returned by SiluOpArgumentMapping: + +return KernelSignature("silu", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SiluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("silu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SinOpArgumentMapping: + +return KernelSignature("sin", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SinOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("sin", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SinhOpArgumentMapping: + +return KernelSignature("sinh", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SinhOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("sinh", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SlogdetOpArgumentMapping: + +return KernelSignature("slogdet", {"Input"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SlogdeterminantOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("slogdet", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SoftplusOpArgumentMapping: + +return KernelSignature("softplus", {"X"}, {"beta", "threshold"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SoftplusOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("beta"); + attrs.emplace_back("threshold"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("softplus", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SoftshrinkOpArgumentMapping: + +return KernelSignature("softshrink", {"X"}, {"lambda"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SoftshrinkOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + 
attrs.emplace_back("lambda"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("softshrink", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SoftsignOpArgumentMapping: + +return KernelSignature("softsign", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SoftsignOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("softsign", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SolveOpArgumentMapping: + +return KernelSignature("solve", {"X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SolveOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("solve", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SpectralNormOpArgumentMapping: + +return KernelSignature("spectral_norm", {"Weight", "U", "V"}, {"dim", "power_iters", "eps"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SpectralNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Weight", "U", "V"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + attrs.emplace_back("power_iters"); + attrs.emplace_back("eps"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("spectral_norm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SqrtOpArgumentMapping: + +return KernelSignature("sqrt", {"X"}, {}, {"Out"}); +return KernelSignature("sqrt_sr", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SqrtOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + if ( ctx.IsDenseTensorInput("X")) { + return KernelSignature("sqrt", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsSelectedRowsInput("X")) { + return KernelSignature("sqrt_sr", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SquareOpArgumentMapping: + +return KernelSignature("square", {"X"}, {}, {"Out"}); +return KernelSignature("square_sr", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + 
+KernelSignature SquareOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + if ( ctx.IsDenseTensorInput("X")) { + return KernelSignature("square", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsSelectedRowsInput("X")) { + return KernelSignature("square_sr", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SquaredL2NormOpArgumentMapping: + +return KernelSignature("squared_l2_norm", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SquaredL2NormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("squared_l2_norm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SqueezeOpArgumentMapping: + +return KernelSignature("squeeze", {"X"}, {"axes"}, {"Out", "XShape"}); +return KernelSignature("squeeze", {"X"}, {"AxisTensor"}, {"Out", "XShape"}); +return KernelSignature("squeeze", {"X"}, {"AxisTensorList"}, {"Out", "XShape"}); +****************************************************************** +*/ + +KernelSignature Squeeze2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axes"); + paddle::small_vector outputs {"Out", "XShape"}; + return KernelSignature("squeeze", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by StackOpArgumentMapping: + +return KernelSignature("stack", {"X"}, {"axis"}, {"Y"}); +****************************************************************** +*/ + +KernelSignature StackOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Y"}; + return KernelSignature("stack", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by StanhOpArgumentMapping: + +return KernelSignature("stanh", {"X"}, {"scale_a", "scale_b"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature StanhOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("scale_a"); + attrs.emplace_back("scale_b"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("stanh", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures 
returned by SvdOpArgumentMapping: + +return KernelSignature("svd", {"X"}, {"full_matrices"}, {"U", "S", "VH"}); +****************************************************************** +*/ + +KernelSignature SvdOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("full_matrices"); + paddle::small_vector outputs {"U", "S", "VH"}; + return KernelSignature("svd", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TakeAlongAxisOpArgumentMapping: + +return KernelSignature("take_along_axis", {"Input", "Index"}, {"Axis"}, {"Result"}); +****************************************************************** +*/ + +KernelSignature TakeAlongAxisOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Index"}; + paddle::small_vector attrs; + attrs.emplace_back("Axis"); + paddle::small_vector outputs {"Result"}; + return KernelSignature("take_along_axis", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TanOpArgumentMapping: + +return KernelSignature("tan", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature TanOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("tan", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TanhOpArgumentMapping: + +return KernelSignature("tanh", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature TanhOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("tanh", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TanhShrinkOpArgumentMapping: + +return KernelSignature("tanh_shrink", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature TanhShrinkOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("tanh_shrink", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TemporalShiftOpArgumentMapping: + +return KernelSignature("temporal_shift", {"X"}, {"seg_num", "shift_ratio", "data_format"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature TemporalShiftOpArgumentMapping(const 
ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("seg_num"); + attrs.emplace_back("shift_ratio"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("temporal_shift", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TensorUnfoldOpArgumentMapping: + +return KernelSignature("tensor_unfold", {"input"}, {"axis", "size", "step"}, {"out"}); +****************************************************************** +*/ + +KernelSignature TensorUnfoldOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("size"); + attrs.emplace_back("step"); + paddle::small_vector outputs {"out"}; + return KernelSignature("tensor_unfold", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ThresholdedReluOpArgumentMapping: + +return KernelSignature("thresholded_relu", {"X"}, {"threshold"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ThresholdedReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("threshold"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("thresholded_relu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TopPSamplingOpArgumentMapping: + +return KernelSignature("top_p_sampling", {"x", "ps", "threshold"}, {"seed"}, {"out", "ids"}); +****************************************************************** +*/ + +KernelSignature TopPSamplingOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "ps", "threshold"}; + paddle::small_vector attrs; + attrs.emplace_back("seed"); + paddle::small_vector outputs {"out", "ids"}; + return KernelSignature("top_p_sampling", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TopkOpArgumentMapping: + +return KernelSignature("topk", {"X"}, {"k", "axis", "largest", "sorted"}, {"Out", "Indices"}); +****************************************************************** +*/ + +KernelSignature TopKV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("K") ? 
"K" : "k"); + attrs.emplace_back("axis"); + attrs.emplace_back("largest"); + attrs.emplace_back("sorted"); + paddle::small_vector outputs {"Out", "Indices"}; + return KernelSignature("topk", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TraceOpArgumentMapping: + +return KernelSignature("trace", {"Input"}, {"offset", "axis1", "axis2"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature TraceOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input"}; + paddle::small_vector attrs; + attrs.emplace_back("offset"); + attrs.emplace_back("axis1"); + attrs.emplace_back("axis2"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("trace", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TriangularSolveOpArgumentMapping: + +return KernelSignature("triangular_solve", {"X", "Y"}, {"upper", "transpose", "unitriangular"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature TriangularSolveOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("upper"); + attrs.emplace_back("transpose"); + attrs.emplace_back("unitriangular"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("triangular_solve", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TrilinearInterpOpArgumentMapping: + +return KernelSignature("trilinear_interp", {"X", "OutSize", "SizeTensor", "Scale"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature TrilinearInterpV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale"}; + paddle::small_vector attrs; + attrs.emplace_back("data_layout"); + attrs.emplace_back("out_d"); + attrs.emplace_back("out_h"); + attrs.emplace_back("out_w"); + attrs.emplace_back("scale"); + attrs.emplace_back("interp_method"); + attrs.emplace_back("align_corners"); + attrs.emplace_back("align_mode"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("trilinear_interp", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TruncOpArgumentMapping: + +return KernelSignature("trunc", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature TruncOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("trunc", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UnbindOpArgumentMapping: + +return KernelSignature("unbind", {"X"}, {"axis"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature UnbindOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("unbind", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UnfoldOpArgumentMapping: + +return KernelSignature("unfold", {"X"}, {"kernel_sizes", "strides", "paddings", "dilations"}, {"Y"}); +****************************************************************** +*/ + +KernelSignature UnfoldOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("kernel_sizes"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + paddle::small_vector outputs {"Y"}; + return KernelSignature("unfold", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UniformInplaceOpArgumentMapping: + +return KernelSignature("uniform_inplace", {"X"}, {"min", "max", "seed", "diag_num", "diag_step", "diag_val"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature UniformRandomInplaceOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("min"); + attrs.emplace_back("max"); + attrs.emplace_back("seed"); + attrs.emplace_back("diag_num"); + attrs.emplace_back("diag_step"); + attrs.emplace_back("diag_val"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("uniform_inplace", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UniqueConsecutiveOpArgumentMapping: + +return KernelSignature("unique_consecutive", {"X"}, {"return_inverse", "return_counts", "axis", "dtype"}, {"Out", "Index", "Counts"}); +****************************************************************** +*/ + +KernelSignature UniqueConsecutiveOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("return_inverse"); + attrs.emplace_back("return_counts"); + attrs.emplace_back("axis"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"Out", "Index", "Counts"}; + return KernelSignature("unique_consecutive", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Unpool3dOpArgumentMapping: + +return KernelSignature("unpool3d", {"X", "Indices"}, {"ksize", "strides", 
"paddings", "output_size", "data_format"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature Unpool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Indices"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_size"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("unpool3d", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UnsqueezeOpArgumentMapping: + +return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"Out", "XShape"}); +return KernelSignature("unsqueeze", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); +return KernelSignature("unsqueeze", {"X"}, {"AxesTensorList"}, {"Out", "XShape"}); +****************************************************************** +*/ + +KernelSignature Unsqueeze2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("AxesTensor") + ? "AxesTensor" + : ctx.InputSize("AxesTensorList") > 0 + ? "AxesTensorList" + : "axes"); + paddle::small_vector outputs {"Out", "XShape"}; + return KernelSignature("unsqueeze", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UnstackOpArgumentMapping: + +return KernelSignature("unstack", {"X"}, {"axis", "num"}, {"Y"}); +****************************************************************** +*/ + +KernelSignature UnstackOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("num"); + paddle::small_vector outputs {"Y"}; + return KernelSignature("unstack", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UpdateLossScalingOpArgumentMapping: + +return KernelSignature("update_loss_scaling", {"X", "FoundInfinite", "PrevLossScaling", "InGoodSteps", "InBadSteps"}, {"incr_every_n_steps", "decr_every_n_nan_or_inf", "incr_ratio", "decr_ratio", "stop_update"}, {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}); +return KernelSignature("update_loss_scaling", {"X", "FoundInfinite", "PrevLossScaling", "InGoodSteps", "InBadSteps"}, {"incr_every_n_steps", "decr_every_n_nan_or_inf", "incr_ratio", "decr_ratio", "StopUpdate"}, {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}); +****************************************************************** +*/ + +KernelSignature UpdateLossScalingOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "FoundInfinite", "PrevLossScaling", "InGoodSteps", "InBadSteps"}; + paddle::small_vector attrs; + attrs.emplace_back("incr_every_n_steps"); + attrs.emplace_back("decr_every_n_nan_or_inf"); + attrs.emplace_back("incr_ratio"); + attrs.emplace_back("decr_ratio"); + attrs.emplace_back(ctx.HasInput("StopUpdate") ? 
"StopUpdate" : "stop_update"); + paddle::small_vector outputs {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}; + return KernelSignature("update_loss_scaling", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ViewDtypeOpArgumentMapping: + +return KernelSignature("view_dtype", {"input"}, {"dtype"}, {"out"}); +****************************************************************** +*/ + +KernelSignature ViewDtypeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input"}; + paddle::small_vector attrs; + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"out"}; + return KernelSignature("view_dtype", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ViewShapeOpArgumentMapping: + +return KernelSignature("view_shape", {"input"}, {"dims"}, {"out"}); +****************************************************************** +*/ + +KernelSignature ViewShapeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input"}; + paddle::small_vector attrs; + attrs.emplace_back("dims"); + paddle::small_vector outputs {"out"}; + return KernelSignature("view_shape", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ViterbiDecodeOpArgumentMapping: + +return KernelSignature("viterbi_decode", {"Input", "Transition", "Length"}, {"include_bos_eos_tag"}, {"Scores", "Path"}); +****************************************************************** +*/ + +KernelSignature ViterbiDecodeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Transition", "Length"}; + paddle::small_vector attrs; + attrs.emplace_back("include_bos_eos_tag"); + paddle::small_vector outputs {"Scores", "Path"}; + return KernelSignature("viterbi_decode", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WarpctcOpArgumentMapping: + +return KernelSignature("warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}, {"blank", "norm_by_times"}, {"Loss", "WarpCTCGrad"}); +****************************************************************** +*/ + +KernelSignature WarpctcOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Logits", "Label", "LogitsLength", "LabelLength"}; + paddle::small_vector attrs; + attrs.emplace_back("blank"); + attrs.emplace_back("norm_by_times"); + paddle::small_vector outputs {"Loss", "WarpCTCGrad"}; + return KernelSignature("warpctc", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WarprnntOpArgumentMapping: + +return KernelSignature("warprnnt", {"input", "label", "input_lengths", "label_lengths"}, {"blank", 
"fastemit_lambda"}, {"loss", "warprnntgrad"}); +****************************************************************** +*/ + +KernelSignature WarprnntOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input", "label", "input_lengths", "label_lengths"}; + paddle::small_vector attrs; + attrs.emplace_back("blank"); + attrs.emplace_back("fastemit_lambda"); + paddle::small_vector outputs {"loss", "warprnntgrad"}; + return KernelSignature("warprnnt", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WeightDequantizeOpArgumentMapping: + +return KernelSignature("weight_dequantize", {"x", "scale"}, {"algo", "out_dtype"}, {"out"}); +****************************************************************** +*/ + +KernelSignature WeightDequantizeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "scale"}; + paddle::small_vector attrs; + attrs.emplace_back("algo"); + attrs.emplace_back("out_dtype"); + paddle::small_vector outputs {"out"}; + return KernelSignature("weight_dequantize", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WeightOnlyLinearOpArgumentMapping: + +return KernelSignature("weight_only_linear", {"x", "weight", "bias", "weight_scale"}, {"weight_dtype", "arch"}, {"out"}); +****************************************************************** +*/ + +KernelSignature WeightOnlyLinearOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "weight", "bias", "weight_scale"}; + paddle::small_vector attrs; + attrs.emplace_back("weight_dtype"); + attrs.emplace_back("arch"); + paddle::small_vector outputs {"out"}; + return KernelSignature("weight_only_linear", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WeightQuantizeOpArgumentMapping: + +return KernelSignature("weight_quantize", {"x"}, {"algo", "arch"}, {"out", "scale"}); +****************************************************************** +*/ + +KernelSignature WeightQuantizeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("algo"); + attrs.emplace_back("arch"); + paddle::small_vector outputs {"out", "scale"}; + return KernelSignature("weight_quantize", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WeightedSampleNeighborsOpArgumentMapping: + +return KernelSignature("weighted_sample_neighbors", {"row", "colptr", "edge_weight", "input_nodes", "eids"}, {"sample_size", "return_eids"}, {"out_neighbors", "out_count", "out_eids"}); +****************************************************************** +*/ + +KernelSignature WeightedSampleNeighborsOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"row", "colptr", "edge_weight", "input_nodes", "eids"}; + 
paddle::small_vector attrs; + attrs.emplace_back("sample_size"); + attrs.emplace_back("return_eids"); + paddle::small_vector outputs {"out_neighbors", "out_count", "out_eids"}; + return KernelSignature("weighted_sample_neighbors", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WhereOpArgumentMapping: + +return KernelSignature("where", {"Condition", "X", "Y"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature WhereOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Condition", "X", "Y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("where", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by YoloBoxOpArgumentMapping: + +return KernelSignature("yolo_box", {"X", "ImgSize"}, {"anchors", "class_num", "conf_thresh", "downsample_ratio", "clip_bbox", "scale_x_y", "iou_aware", "iou_aware_factor"}, {"Boxes", "Scores"}); +****************************************************************** +*/ + +KernelSignature YoloBoxOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "ImgSize"}; + paddle::small_vector attrs; + attrs.emplace_back("anchors"); + attrs.emplace_back("class_num"); + attrs.emplace_back("conf_thresh"); + attrs.emplace_back("downsample_ratio"); + attrs.emplace_back("clip_bbox"); + attrs.emplace_back("scale_x_y"); + attrs.emplace_back("iou_aware"); + attrs.emplace_back("iou_aware_factor"); + paddle::small_vector outputs {"Boxes", "Scores"}; + return KernelSignature("yolo_box", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by YoloLossOpArgumentMapping: + +return KernelSignature("yolo_loss", {"X", "GTBox", "GTLabel", "GTScore"}, {"anchors", "anchor_mask", "class_num", "ignore_thresh", "downsample_ratio", "use_label_smooth", "scale_x_y"}, {"Loss", "ObjectnessMask", "GTMatchMask"}); +****************************************************************** +*/ + +KernelSignature Yolov3LossOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "GTBox", "GTLabel", "GTScore"}; + paddle::small_vector attrs; + attrs.emplace_back("anchors"); + attrs.emplace_back("anchor_mask"); + attrs.emplace_back("class_num"); + attrs.emplace_back("ignore_thresh"); + attrs.emplace_back("downsample_ratio"); + attrs.emplace_back("use_label_smooth"); + attrs.emplace_back("scale_x_y"); + paddle::small_vector outputs {"Loss", "ObjectnessMask", "GTMatchMask"}; + return KernelSignature("yolo_loss", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AbsDoubleGradOpArgumentMapping: + +return KernelSignature("abs_double_grad", {"x", "grad_x@GRAD"}, {}, {"grad_out@GRAD"}); +****************************************************************** +*/ + 
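+// Descriptive note (added for readability; mirrors the generated code below):
+// as elsewhere in this file, names ending in "@GRAD" denote the gradient of
+// the corresponding forward input/output, so this mapping simply forwards
+// {"x", "grad_x@GRAD"} and {"grad_out@GRAD"}, with no attributes, to the phi
+// "abs_double_grad" kernel named in the NOTE block above.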
+KernelSignature AbsDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "grad_x@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"grad_out@GRAD"}; + return KernelSignature("abs_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AbsGradOpArgumentMapping: + +return KernelSignature("abs_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature AbsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("abs_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AcosGradOpArgumentMapping: + +return KernelSignature("acos_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature AcosGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("acos_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AcoshGradOpArgumentMapping: + +return KernelSignature("acosh_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature AcoshGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("acosh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AddmmGradOpArgumentMapping: + +return KernelSignature("addmm_grad", {"Input", "X", "Y", "Out@GRAD"}, {"Alpha", "Beta"}, {"Input@GRAD", "X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature AddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("Alpha"); + attrs.emplace_back("Beta"); + paddle::small_vector outputs {"Input@GRAD", "X@GRAD", "Y@GRAD"}; + return KernelSignature("addmm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AffineGridGradOpArgumentMapping: + +return KernelSignature("affine_grid_grad", {"Output@GRAD"}, {"output_shape", "align_corners"}, {"Theta@GRAD"}); +return KernelSignature("affine_grid_grad", {"Output@GRAD"}, {"OutputShape", "align_corners"}, 
{"Theta@GRAD"}); +****************************************************************** +*/ + +KernelSignature AffineGridGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Output@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("OutputShape") + ? "OutputShape" + : "output_shape"); + + attrs.emplace_back("align_corners"); + paddle::small_vector outputs {"Theta@GRAD"}; + return KernelSignature("affine_grid_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AngleGradOpArgumentMapping: + +return KernelSignature("angle_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature AngleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("angle_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ArgsortGradOpArgumentMapping: + +return KernelSignature("argsort_grad", {"Indices", "X", "Out@GRAD"}, {"axis", "descending"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ArgsortGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Indices", "X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("descending"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("argsort_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AsStridedGradOpArgumentMapping: + +return KernelSignature("as_strided_grad", {"input", "out@GRAD"}, {"dims", "stride", "offset"}, {"input@GRAD"}); +****************************************************************** +*/ + +KernelSignature AsStridedGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dims"); + attrs.emplace_back("stride"); + attrs.emplace_back("offset"); + paddle::small_vector outputs {"input@GRAD"}; + return KernelSignature("as_strided_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AsinGradOpArgumentMapping: + +return KernelSignature("asin_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature AsinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("asin_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The 
following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AsinhGradOpArgumentMapping: + +return KernelSignature("asinh_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature AsinhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("asinh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Atan2GradOpArgumentMapping: + +return KernelSignature("atan2_grad", {"X1", "X2", "Out@GRAD"}, {}, {"X1@GRAD", "X2@GRAD"}); +****************************************************************** +*/ + +KernelSignature Atan2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X1", "X2", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X1@GRAD", "X2@GRAD"}; + return KernelSignature("atan2_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AtanGradOpArgumentMapping: + +return KernelSignature("atan_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature AtanGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("atan_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AtanhGradOpArgumentMapping: + +return KernelSignature("atanh_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature AtanhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("atanh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BceLossGradOpArgumentMapping: + +return KernelSignature("bce_loss_grad", {"X", "Label", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature BceLossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Label", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("bce_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BicubicInterpGradOpArgumentMapping: + +return 
KernelSignature("bicubic_interp_grad", {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature BicubicInterpV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("data_layout"); + attrs.emplace_back("out_d"); + attrs.emplace_back("out_h"); + attrs.emplace_back("out_w"); + attrs.emplace_back("scale"); + attrs.emplace_back("interp_method"); + attrs.emplace_back("align_corners"); + attrs.emplace_back("align_mode"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("bicubic_interp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BilinearGradOpArgumentMapping: + +return KernelSignature("bilinear_grad", {"X", "Y", "Weight", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD", "Weight@GRAD", "Bias@GRAD"}); +****************************************************************** +*/ + +KernelSignature BilinearTensorProductGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Weight", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD", "Weight@GRAD", "Bias@GRAD"}; + return KernelSignature("bilinear_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BilinearInterpGradOpArgumentMapping: + +return KernelSignature("bilinear_interp_grad", {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature BilinearInterpV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("data_layout"); + attrs.emplace_back("out_d"); + attrs.emplace_back("out_h"); + attrs.emplace_back("out_w"); + attrs.emplace_back("scale"); + attrs.emplace_back("interp_method"); + attrs.emplace_back("align_corners"); + attrs.emplace_back("align_mode"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("bilinear_interp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BmmGradOpArgumentMapping: + +return KernelSignature("bmm_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature BmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("bmm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BroadcastTensorsGradOpArgumentMapping: + +return KernelSignature("broadcast_tensors_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature BroadcastTensorsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("broadcast_tensors_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CeilGradOpArgumentMapping: + +return KernelSignature("ceil_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature CeilGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("ceil_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CeluDoubleGradOpArgumentMapping: + +return KernelSignature("celu_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {"alpha"}, {"X@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature CeluGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; + return KernelSignature("celu_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CeluGradOpArgumentMapping: + +return KernelSignature("celu_grad", {"X", "Out@GRAD"}, {"alpha"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature CeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("celu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CholeskyGradOpArgumentMapping: + +return KernelSignature("cholesky_grad", {"Out", "Out@GRAD"}, {"upper"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature CholeskyGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("upper"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("cholesky_grad", std::move(inputs), std::move(attrs), 
std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CholeskySolveGradOpArgumentMapping: + +return KernelSignature("cholesky_solve_grad", {"X", "Y", "Out", "Out@GRAD"}, {"upper"}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature CholeskySolveGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("upper"); + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("cholesky_solve_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ClipDoubleGradOpArgumentMapping: + +return KernelSignature("clip_grad", {"X", "grad_x@GRAD"}, {"min", "max"}, {"grad_out@GRAD"}); +return KernelSignature("clip_grad", {"X", "grad_x@GRAD"}, {"min", "Max"}, {"grad_out@GRAD"}); +return KernelSignature("clip_grad", {"X", "grad_x@GRAD"}, {"Min", "max"}, {"grad_out@GRAD"}); +return KernelSignature("clip_grad", {"X", "grad_x@GRAD"}, {"Min", "Max"}, {"grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature ClipDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_x@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("Min") ? "Min" : "min"); + attrs.emplace_back(ctx.HasInput("Max") ? "Max" : "max"); + paddle::small_vector outputs {"grad_out@GRAD"}; + return KernelSignature("clip_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ClipGradOpArgumentMapping: + +return KernelSignature("clip_grad", {"X", "Out@GRAD"}, {"min", "max"}, {"X@GRAD"}); +return KernelSignature("clip_grad", {"X", "Out@GRAD"}, {"min", "Max"}, {"X@GRAD"}); +return KernelSignature("clip_grad", {"X", "Out@GRAD"}, {"Min", "max"}, {"X@GRAD"}); +return KernelSignature("clip_grad", {"X", "Out@GRAD"}, {"Min", "Max"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ClipGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("Min") ? "Min" : "min"); + attrs.emplace_back(ctx.HasInput("Max") ? 
"Max" : "max"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("clip_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ComplexGradOpArgumentMapping: + +return KernelSignature("complex_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature ComplexGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("complex_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ConcatGradOpArgumentMapping: + +return KernelSignature("concat_grad", {"X", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); +return KernelSignature("concat_grad", {"X", "Out@GRAD"}, {"AxisTensor"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ConcatGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("AxisTensor") ? "AxisTensor" : "axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("concat_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv2dGradOpArgumentMapping: + +return KernelSignature("conv2d_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "padding_algorithm", "dilations", "groups", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); +****************************************************************** +*/ + +KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("dilations"); + attrs.emplace_back("groups"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; + return KernelSignature("conv2d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv2dGradGradOpArgumentMapping: + +return KernelSignature("conv2d_double_grad", {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "padding_algorithm", "dilations", "groups", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature Conv2dGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + 
attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("dilations"); + attrs.emplace_back("groups"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}; + return KernelSignature("conv2d_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv3dDoubleGradOpArgumentMapping: + +return KernelSignature("conv3d_double_grad", {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature Conv3dGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}; + return KernelSignature("conv3d_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv3dGradOpArgumentMapping: + +return KernelSignature("conv3d_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); +****************************************************************** +*/ + +KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; + return KernelSignature("conv3d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv3dTransposeGradOpArgumentMapping: + +return KernelSignature("conv3d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); +****************************************************************** +*/ + +KernelSignature Conv3dTransposeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_padding"); + attrs.emplace_back("output_size"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + 
attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; + return KernelSignature("conv3d_transpose_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CosDoubleGradOpArgumentMapping: + +return KernelSignature("cos_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {}, {"X@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature CosDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; + return KernelSignature("cos_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CosGradOpArgumentMapping: + +return KernelSignature("cos_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature CosGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("cos_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CosTripleGradOpArgumentMapping: + +return KernelSignature("cos_triple_grad", {"X", "grad_out_forward", "grad_x_grad_forward", "grad_x@GRAD", "grad_out_grad@GRAD"}, {}, {"X@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}); +****************************************************************** +*/ + +KernelSignature CosTripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_out_forward", "grad_x_grad_forward", "grad_x@GRAD", "grad_out_grad@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}; + return KernelSignature("cos_triple_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CoshGradOpArgumentMapping: + +return KernelSignature("cosh_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature CoshGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("cosh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CropGradOpArgumentMapping: + +return KernelSignature("crop_grad", {"X", "Out@GRAD"}, {"offsets"}, {"X@GRAD"}); +return KernelSignature("crop_grad", {"X", "Out@GRAD"}, 
{"Offsets"}, {"X@GRAD"}); +return KernelSignature("crop_grad", {"X", "Out@GRAD"}, {"OffsetsTensor"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature CropTensorGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("Offsets") + ? "Offsets" + : ctx.InputSize("OffsetsTensor") > 0 + ? "OffsetsTensor" + : "offsets"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("crop_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CrossEntropyWithSoftmaxGradOpArgumentMapping: + +return KernelSignature("cross_entropy_with_softmax_grad", {"Label", "Softmax", "Loss@GRAD"}, {"soft_label", "use_softmax", "numeric_stable_mode", "ignore_index", "axis"}, {"Logits@GRAD"}); +****************************************************************** +*/ + +KernelSignature SoftmaxWithCrossEntropyGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Label", "Softmax", "Loss@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("soft_label"); + attrs.emplace_back("use_softmax"); + attrs.emplace_back("numeric_stable_mode"); + attrs.emplace_back("ignore_index"); + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Logits@GRAD"}; + return KernelSignature("cross_entropy_with_softmax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CrossGradOpArgumentMapping: + +return KernelSignature("cross_grad", {"X", "Y", "Out@GRAD"}, {"dim"}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature CrossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("cross_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CummaxGradOpArgumentMapping: + +return KernelSignature("cummax_grad", {"x", "indices", "out@GRAD"}, {"axis", "dtype"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature CummaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "indices", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"x@GRAD"}; + return KernelSignature("cummax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CumminGradOpArgumentMapping: + +return KernelSignature("cummin_grad", {"x", "indices", "out@GRAD"}, {"axis", "dtype"}, {"x@GRAD"}); 
+****************************************************************** +*/ + +KernelSignature CumminGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "indices", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"x@GRAD"}; + return KernelSignature("cummin_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CumprodGradOpArgumentMapping: + +return KernelSignature("cumprod_grad", {"X", "Out", "Out@GRAD"}, {"dim"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature CumprodGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("cumprod_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by CumsumGradOpArgumentMapping: + +return KernelSignature("cumsum_grad", {"X", "Out@GRAD"}, {"axis", "flatten", "exclusive", "reverse"}, {"X@GRAD"}); +return KernelSignature("cumsum_grad", {"X", "Out@GRAD"}, {"AxisTensor", "flatten", "exclusive", "reverse"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature CumsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("flatten"); + attrs.emplace_back("exclusive"); + attrs.emplace_back("reverse"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("cumsum_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DepthwiseConv2dDoubleGradOpArgumentMapping: + +return KernelSignature("depthwise_conv2d_double_grad", {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature DepthwiseConv2dGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}; + return KernelSignature("depthwise_conv2d_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by 
DepthwiseConv2dGradOpArgumentMapping: + +return KernelSignature("depthwise_conv2d_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); +****************************************************************** +*/ + +KernelSignature DepthwiseConv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; + return KernelSignature("depthwise_conv2d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DetGradOpArgumentMapping: + +return KernelSignature("determinant_grad", {"Input", "Out", "Out@GRAD"}, {}, {"Input@GRAD"}); +****************************************************************** +*/ + +KernelSignature DeterminantGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Input@GRAD"}; + return KernelSignature("determinant_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DiagGradOpArgumentMapping: + +return KernelSignature("diag_grad", {"X", "Out@GRAD"}, {"offset"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature DiagV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("offset"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("diag_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DiagonalGradOpArgumentMapping: + +return KernelSignature("diagonal_grad", {"Input", "Out@GRAD"}, {"offset", "axis1", "axis2"}, {"Input@GRAD"}); +****************************************************************** +*/ + +KernelSignature DiagonalGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("offset"); + attrs.emplace_back("axis1"); + attrs.emplace_back("axis2"); + paddle::small_vector outputs {"Input@GRAD"}; + return KernelSignature("diagonal_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DigammaGradOpArgumentMapping: + +return KernelSignature("digamma_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature DigammaGradOpArgumentMapping(const 
ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Out@GRAD"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("digamma_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by DistGradOpArgumentMapping:
+
+return KernelSignature("dist_grad", {"X", "Y", "Out", "Out@GRAD"}, {"p"}, {"X@GRAD", "Y@GRAD"});
+******************************************************************
+*/
+
+KernelSignature DistGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Y", "Out", "Out@GRAD"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("p");
+  paddle::small_vector outputs {"X@GRAD", "Y@GRAD"};
+  return KernelSignature("dist_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by DotGradOpArgumentMapping:
+
+return KernelSignature("dot_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"});
+******************************************************************
+*/
+
+KernelSignature DotGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Y", "Out@GRAD"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"X@GRAD", "Y@GRAD"};
+  return KernelSignature("dot_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by EigGradOpArgumentMapping:
+
+return KernelSignature("eig_grad", {"Eigenvalues", "Eigenvectors", "Eigenvalues@GRAD", "Eigenvectors@GRAD"}, {}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature EigGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"Eigenvalues", "Eigenvectors", "Eigenvalues@GRAD", "Eigenvectors@GRAD"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("eig_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by EighGradOpArgumentMapping:
+
+return KernelSignature("eigh_grad", {"Eigenvalues", "Eigenvectors", "Eigenvalues@GRAD", "Eigenvectors@GRAD"}, {}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature EighGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"Eigenvalues", "Eigenvectors", "Eigenvalues@GRAD", "Eigenvectors@GRAD"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("eigh_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by EigvalshGradOpArgumentMapping:
+
+return KernelSignature("eigvalsh_grad", {"Eigenvectors", "Eigenvalues@GRAD"}, {"UPLO", "is_test"}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature EigvalshGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"Eigenvectors", "Eigenvalues@GRAD"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("UPLO");
+  attrs.emplace_back("is_test");
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("eigvalsh_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by EluDoubleGradOpArgumentMapping:
+
+return KernelSignature("elu_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {"alpha"}, {"X@GRAD", "grad_out@GRAD"});
+******************************************************************
+*/
+
+KernelSignature EluGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("alpha");
+  paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"};
+  return KernelSignature("elu_double_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by EluGradOpArgumentMapping:
+
+return KernelSignature("elu_grad", {"X", "Out", "Out@GRAD"}, {"alpha"}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Out", "Out@GRAD"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("alpha");
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("elu_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by ErfGradOpArgumentMapping:
+
+return KernelSignature("erf_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature ErfGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Out@GRAD"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("erf_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by ErfinvGradOpArgumentMapping:
+
+return KernelSignature("erfinv_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature ErfinvGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"Out", "Out@GRAD"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("erfinv_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by ExpGradOpArgumentMapping:
+
+return KernelSignature("exp_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature ExpGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"Out", "Out@GRAD"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("exp_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by ExpandAsGradOpArgumentMapping:
+
+return KernelSignature("expand_as_grad", {"X", "Out@GRAD"}, {"target_shape"}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature ExpandAsV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"X", "Out@GRAD"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("target_shape");
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("expand_as_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by Expm1GradOpArgumentMapping:
+
+return KernelSignature("expm1_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature Expm1GradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"Out", "Out@GRAD"};
+  paddle::small_vector attrs;
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("expm1_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by FftC2cGradOpArgumentMapping:
+
+return KernelSignature("fft_c2c_grad", {"Out@GRAD"}, {"axes", "normalization", "forward"}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature FftC2cGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"Out@GRAD"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("axes");
+  attrs.emplace_back("normalization");
+  attrs.emplace_back("forward");
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("fft_c2c_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by FftC2rGradOpArgumentMapping:
+
+return KernelSignature("fft_c2r_grad", {"Out@GRAD"}, {"axes", "normalization", "forward", "last_dim_size"}, {"X@GRAD"});
+******************************************************************
+*/
+
+KernelSignature FftC2rGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector inputs {"Out@GRAD"};
+  paddle::small_vector attrs;
+  attrs.emplace_back("axes");
+  attrs.emplace_back("normalization");
+  attrs.emplace_back("forward");
+  attrs.emplace_back("last_dim_size");
+  paddle::small_vector outputs {"X@GRAD"};
+  return KernelSignature("fft_c2r_grad", std::move(inputs), std::move(attrs), std::move(outputs));
+}
+
+/*
+******************************************************************
+NOTE: The
following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FftR2cGradOpArgumentMapping: + +return KernelSignature("fft_r2c_grad", {"X", "Out@GRAD"}, {"axes", "normalization", "forward", "onesided"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature FftR2cGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axes"); + attrs.emplace_back("normalization"); + attrs.emplace_back("forward"); + attrs.emplace_back("onesided"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("fft_r2c_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FillDiagonalGradOpArgumentMapping: + +return KernelSignature("fill_diagonal_grad", {"Out@GRAD"}, {"value", "offset", "wrap"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature FillDiagonalGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("value"); + attrs.emplace_back("offset"); + attrs.emplace_back("wrap"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("fill_diagonal_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FillDiagonalTensorGradOpArgumentMapping: + +return KernelSignature("fill_diagonal_tensor_grad", {"Out@GRAD"}, {"offset", "dim1", "dim2"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature FillDiagonalTensorGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("offset"); + attrs.emplace_back("dim1"); + attrs.emplace_back("dim2"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("fill_diagonal_tensor_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FillGradOpArgumentMapping: + +return KernelSignature("fill_grad", {"Out@GRAD"}, {"value"}, {"X@GRAD"}); +return KernelSignature("fill_grad", {"Out@GRAD"}, {"ValueTensor"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature FillAnyGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("value"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("fill_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FlashAttnGradOpArgumentMapping: + +return KernelSignature("flash_attn_grad", {"q", "k", "v", "out", "softmax_lse", "seed_offset", "attn_mask", "out@GRAD"}, {"dropout", "causal"}, {"q@GRAD", 
"k@GRAD", "v@GRAD"}); +****************************************************************** +*/ + +KernelSignature FlashAttnGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"q", "k", "v", "out", "softmax_lse", "seed_offset", "attn_mask", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dropout"); + attrs.emplace_back("causal"); + paddle::small_vector outputs {"q@GRAD", "k@GRAD", "v@GRAD"}; + return KernelSignature("flash_attn_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FlashAttnUnpaddedGradOpArgumentMapping: + +return KernelSignature("flash_attn_unpadded_grad", {"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k", "out", "softmax_lse", "seed_offset", "attn_mask", "out@GRAD"}, {"max_seqlen_q", "max_seqlen_k", "scale", "dropout", "causal"}, {"q@GRAD", "k@GRAD", "v@GRAD"}); +****************************************************************** +*/ + +KernelSignature FlashAttnUnpaddedGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k", "out", "softmax_lse", "seed_offset", "attn_mask", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("max_seqlen_q"); + attrs.emplace_back("max_seqlen_k"); + attrs.emplace_back("scale"); + attrs.emplace_back("dropout"); + attrs.emplace_back("causal"); + paddle::small_vector outputs {"q@GRAD", "k@GRAD", "v@GRAD"}; + return KernelSignature("flash_attn_unpadded_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FloorGradOpArgumentMapping: + +return KernelSignature("floor_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature FloorGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("floor_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FmaxGradOpArgumentMapping: + +return KernelSignature("fmax_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature ElementwiseFmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("fmax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FminGradOpArgumentMapping: + +return KernelSignature("fmin_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature ElementwiseFminGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + 
paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("fmin_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FoldGradOpArgumentMapping: + +return KernelSignature("fold_grad", {"X", "Y@GRAD"}, {"output_sizes", "kernel_sizes", "strides", "paddings", "dilations"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature FoldGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("output_sizes"); + attrs.emplace_back("kernel_sizes"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("fold_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FrameGradOpArgumentMapping: + +return KernelSignature("frame_grad", {"X", "Out@GRAD"}, {"frame_length", "hop_length", "axis"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature FrameGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("frame_length"); + attrs.emplace_back("hop_length"); + attrs.emplace_back("axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("frame_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by GatherGradOpArgumentMapping: + +return KernelSignature("gather_grad", {"X", "Index", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); +return KernelSignature("gather_grad", {"X", "Index", "Out@GRAD"}, {"Axis"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature GatherGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Index", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("Axis") ? 
"Axis" : "axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("gather_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by GatherNdGradOpArgumentMapping: + +return KernelSignature("gather_nd_grad", {"X", "Index", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature GatherNdGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Index", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("gather_nd_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by GaussianInplaceGradOpArgumentMapping: + +return KernelSignature("gaussian_inplace_grad", {"out@GRAD"}, {"mean", "std", "seed"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature GaussianInplaceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("mean"); + attrs.emplace_back("std"); + attrs.emplace_back("seed"); + paddle::small_vector outputs {"x@GRAD"}; + return KernelSignature("gaussian_inplace_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by GeluGradOpArgumentMapping: + +return KernelSignature("gelu_grad", {"X", "Out@GRAD"}, {"approximate"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature GeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("approximate"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("gelu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by GridSampleGradOpArgumentMapping: + +return KernelSignature("grid_sample_grad", {"X", "Grid", "Output@GRAD"}, {"mode", "padding_mode", "align_corners"}, {"X@GRAD", "Grid@GRAD"}); +****************************************************************** +*/ + +KernelSignature GridSamplerGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Grid", "Output@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("mode"); + attrs.emplace_back("padding_mode"); + attrs.emplace_back("align_corners"); + paddle::small_vector outputs {"X@GRAD", "Grid@GRAD"}; + return KernelSignature("grid_sample_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by GroupNormGradOpArgumentMapping: + +return KernelSignature("group_norm_grad", {"X", "Scale", "Bias", "Y", 
"Mean", "Variance", "Y@GRAD"}, {"epsilon", "groups", "data_layout"}, {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); +****************************************************************** +*/ + +KernelSignature GroupNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Scale", "Bias", "Y", "Mean", "Variance", "Y@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + attrs.emplace_back("groups"); + attrs.emplace_back("data_layout"); + paddle::small_vector outputs {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}; + return KernelSignature("group_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by GumbelSoftmaxGradOpArgumentMapping: + +return KernelSignature("gumbel_softmax_grad", {"Out", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature GumbelSoftmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("gumbel_softmax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by HardshrinkGradOpArgumentMapping: + +return KernelSignature("hard_shrink_grad", {"X", "Out@GRAD"}, {"threshold"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature HardShrinkGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("threshold"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("hard_shrink_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by HardsigmoidGradOpArgumentMapping: + +return KernelSignature("hardsigmoid_grad", {"Out", "Out@GRAD"}, {"slope", "offset"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature HardSigmoidGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("slope"); + attrs.emplace_back("offset"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("hardsigmoid_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by HardtanhGradOpArgumentMapping: + +return KernelSignature("hardtanh_grad", {"X", "Out@GRAD"}, {"t_min", "t_max"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature BreluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("t_min"); + attrs.emplace_back("t_max"); + paddle::small_vector outputs 
{"X@GRAD"}; + return KernelSignature("hardtanh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by HeavisideGradOpArgumentMapping: + +return KernelSignature("heaviside_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature ElementwiseHeavisideGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("heaviside_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by HuberLossGradOpArgumentMapping: + +return KernelSignature("huber_loss_grad", {"Residual", "Out@GRAD"}, {"delta"}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature HuberLossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Residual", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("delta"); + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("huber_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by I0GradOpArgumentMapping: + +return KernelSignature("i0_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature I0GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + return KernelSignature("i0_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by I0eGradOpArgumentMapping: + +return KernelSignature("i0e_grad", {"x", "out", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature I0eGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + return KernelSignature("i0e_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by I1GradOpArgumentMapping: + +return KernelSignature("i1_grad", {"x", "out", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature I1GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + return KernelSignature("i1_grad", std::move(inputs), 
std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by I1eGradOpArgumentMapping: + +return KernelSignature("i1e_grad", {"x", "out", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature I1eGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + return KernelSignature("i1e_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ImagGradOpArgumentMapping: + +return KernelSignature("imag_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ImagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("imag_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IndexAddGradOpArgumentMapping: + +return KernelSignature("index_add_grad", {"Index", "AddValue", "Out@GRAD"}, {"axis"}, {"X@GRAD", "AddValue@GRAD"}); +****************************************************************** +*/ + +KernelSignature IndexAddGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Index", "AddValue", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"X@GRAD", "AddValue@GRAD"}; + return KernelSignature("index_add_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IndexPutGradOpArgumentMapping: + +return KernelSignature("index_put_grad", {"x", "indices", "value", "out@GRAD"}, {"accumulate"}, {"x@GRAD", "value@GRAD"}); +****************************************************************** +*/ + +KernelSignature IndexPutGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "indices", "value", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("accumulate"); + paddle::small_vector outputs {"x@GRAD", "value@GRAD"}; + return KernelSignature("index_put_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IndexSampleGradOpArgumentMapping: + +return KernelSignature("index_sample_grad", {"X", "Index", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature IndexSampleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Index", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + 
return KernelSignature("index_sample_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IndexSelectGradOpArgumentMapping: + +return KernelSignature("index_select_grad", {"X", "Index", "Out@GRAD"}, {"dim"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature IndexSelectGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Index", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("index_select_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by IndexSelectStridedGradOpArgumentMapping: + +return KernelSignature("index_select_strided_grad", {"x", "out@GRAD"}, {"index", "axis"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature IndexSelectStridedGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("index"); + attrs.emplace_back("axis"); + paddle::small_vector outputs {"x@GRAD"}; + return KernelSignature("index_select_strided_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by InstanceNormDoubleGradOpArgumentMapping: + +return KernelSignature("instance_norm_double_grad", {"x", "fwd_scale", "saved_mean", "saved_variance", "grad_y", "grad_x@GRAD", "grad_scale@GRAD", "grad_bias@GRAD"}, {"epsilon"}, {"x@GRAD", "fwd_scale@GRAD", "grad_y@GRAD"}); +****************************************************************** +*/ + +KernelSignature InstanceNormDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "fwd_scale", "saved_mean", "saved_variance", "grad_y", "grad_x@GRAD", "grad_scale@GRAD", "grad_bias@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + paddle::small_vector outputs {"x@GRAD", "fwd_scale@GRAD", "grad_y@GRAD"}; + return KernelSignature("instance_norm_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by InstanceNormGradOpArgumentMapping: + +return KernelSignature("instance_norm_grad", {"X", "Scale", "SavedMean", "SavedVariance", "Y@GRAD"}, {"epsilon"}, {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); +****************************************************************** +*/ + +KernelSignature InstanceNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Scale", "SavedMean", "SavedVariance", "Y@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + paddle::small_vector outputs {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}; + return KernelSignature("instance_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by InverseGradOpArgumentMapping: + +return KernelSignature("inverse_grad", {"Output", "Output@GRAD"}, {}, {"Input@GRAD"}); +****************************************************************** +*/ + +KernelSignature InverseGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Output", "Output@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Input@GRAD"}; + return KernelSignature("inverse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by KldivLossGradOpArgumentMapping: + +return KernelSignature("kldiv_loss_grad", {"X", "Target", "Loss@GRAD"}, {"reduction"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature KldivLossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Target", "Loss@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("reduction"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("kldiv_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by KronGradOpArgumentMapping: + +return KernelSignature("kron_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature KronGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("kron_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by KthvalueGradOpArgumentMapping: + +return KernelSignature("kthvalue_grad", {"X", "Indices", "Out@GRAD"}, {"k", "axis", "keepdim"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature KthvalueGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Indices", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("k"); + attrs.emplace_back("axis"); + attrs.emplace_back("keepdim"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("kthvalue_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LabelSmoothGradOpArgumentMapping: + +return KernelSignature("label_smooth_grad", {"Out@GRAD"}, {"epsilon"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature LabelSmoothGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + paddle::small_vector outputs 
{"X@GRAD"}; + return KernelSignature("label_smooth_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LayerNormGradOpArgumentMapping: + +return KernelSignature("layer_norm_grad", {"X", "Scale", "Bias", "Mean", "Variance", "Y@GRAD"}, {"epsilon", "begin_norm_axis"}, {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); +****************************************************************** +*/ + +KernelSignature LayerNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Scale", "Bias", "Mean", "Variance", "Y@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + attrs.emplace_back("begin_norm_axis"); + paddle::small_vector outputs {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}; + return KernelSignature("layer_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LeakyReluDoubleGradOpArgumentMapping: + +return KernelSignature("leaky_relu_double_grad", {"X", "grad_x@GRAD"}, {"alpha"}, {"grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature LeakyReluGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_x@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"grad_out@GRAD"}; + return KernelSignature("leaky_relu_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LeakyReluGradOpArgumentMapping: + +return KernelSignature("leaky_relu_grad", {"X", "Out@GRAD"}, {"alpha"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature LeakyReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("leaky_relu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LerpGradOpArgumentMapping: + +return KernelSignature("lerp_grad", {"X", "Y", "Weight", "Out", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature LerpGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Weight", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("lerp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LgammaGradOpArgumentMapping: + +return KernelSignature("lgamma_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); 
+****************************************************************** +*/ + +KernelSignature LgammaGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("lgamma_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LinearInterpGradOpArgumentMapping: + +return KernelSignature("linear_interp_grad", {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature LinearInterpV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("data_layout"); + attrs.emplace_back("out_d"); + attrs.emplace_back("out_h"); + attrs.emplace_back("out_w"); + attrs.emplace_back("scale"); + attrs.emplace_back("interp_method"); + attrs.emplace_back("align_corners"); + attrs.emplace_back("align_mode"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("linear_interp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Log10GradOpArgumentMapping: + +return KernelSignature("log10_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature Log10GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("log10_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Log1pGradOpArgumentMapping: + +return KernelSignature("log1p_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature Log1pGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("log1p_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Log2GradOpArgumentMapping: + +return KernelSignature("log2_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature Log2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("log2_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogDoubleGradOpArgumentMapping: + +return KernelSignature("log_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {}, {"X@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature LogGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; + return KernelSignature("log_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogGradOpArgumentMapping: + +return KernelSignature("log_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature LogGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("log_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogLossGradOpArgumentMapping: + +return KernelSignature("log_loss_grad", {"Predicted", "Labels", "Loss@GRAD"}, {"epsilon"}, {"Predicted@GRAD"}); +****************************************************************** +*/ + +KernelSignature LogLossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Predicted", "Labels", "Loss@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("epsilon"); + paddle::small_vector outputs {"Predicted@GRAD"}; + return KernelSignature("log_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogSoftmaxGradOpArgumentMapping: + +return KernelSignature("log_softmax_grad", {"Out", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature LogSoftmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("log_softmax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogcumsumexpGradOpArgumentMapping: + +return KernelSignature("logcumsumexp_grad", {"X", "Out", "Out@GRAD"}, {"axis", "flatten", "exclusive", "reverse"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature LogcumsumexpGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + 
attrs.emplace_back("flatten"); + attrs.emplace_back("exclusive"); + attrs.emplace_back("reverse"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("logcumsumexp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogitGradOpArgumentMapping: + +return KernelSignature("logit_grad", {"X", "Out@GRAD"}, {"eps"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature LogitGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("eps"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("logit_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LogsigmoidGradOpArgumentMapping: + +return KernelSignature("logsigmoid_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature LogsigmoidGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("logsigmoid_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LuGradOpArgumentMapping: + +return KernelSignature("lu_grad", {"X", "Out", "Pivots", "Out@GRAD"}, {"pivots"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature LuGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Pivots", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("pivots"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("lu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LuUnpackGradOpArgumentMapping: + +return KernelSignature("lu_unpack_grad", {"X", "Pivots", "L", "U", "Pmat", "L@GRAD", "U@GRAD"}, {"unpack_ludata", "unpack_pivots"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature LuUnpackGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Pivots", "L", "U", "Pmat", "L@GRAD", "U@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("unpack_ludata"); + attrs.emplace_back("unpack_pivots"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("lu_unpack_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MarginCrossEntropyGradOpArgumentMapping: + +return KernelSignature("margin_cross_entropy_grad", {"Logits", "Label", "Softmax", "Loss@GRAD"}, 
{"return_softmax", "ring_id", "rank", "nranks", "margin1", "margin2", "margin3", "scale"}, {"Logits@GRAD"}); +****************************************************************** +*/ + +KernelSignature MarginCrossEntropyGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Logits", "Label", "Softmax", "Loss@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("return_softmax"); + attrs.emplace_back("ring_id"); + attrs.emplace_back("rank"); + attrs.emplace_back("nranks"); + attrs.emplace_back("margin1"); + attrs.emplace_back("margin2"); + attrs.emplace_back("margin3"); + attrs.emplace_back("scale"); + paddle::small_vector outputs {"Logits@GRAD"}; + return KernelSignature("margin_cross_entropy_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MaskedSelectGradOpArgumentMapping: + +return KernelSignature("masked_select_grad", {"X", "Mask", "Y@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature MaskedSelectGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Mask", "Y@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("masked_select_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MatrixPowerGradOpArgumentMapping: + +return KernelSignature("matrix_power_grad", {"X", "Out", "Out@GRAD"}, {"n"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature MatrixPowerGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("n"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("matrix_power_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MaxPool2dWithIndexGradOpArgumentMapping: + +return KernelSignature("max_pool2d_with_index_grad", {"X", "Mask", "Out@GRAD"}, {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature MaxPool2dWithIndexGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Mask", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("global_pooling"); + attrs.emplace_back("adaptive"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("max_pool2d_with_index_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MaxPool3dWithIndexGradOpArgumentMapping: + +return KernelSignature("max_pool3d_with_index_grad", {"X", "Mask", "Out@GRAD"}, {"ksize", 
"strides", "paddings", "global_pooling", "adaptive"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature MaxPool3dWithIndexGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Mask", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("global_pooling"); + attrs.emplace_back("adaptive"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("max_pool3d_with_index_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MaxoutGradOpArgumentMapping: + +return KernelSignature("maxout_grad", {"X", "Out", "Out@GRAD"}, {"groups", "axis"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature MaxoutGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("groups"); + attrs.emplace_back("axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("maxout_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MeanAllGradOpArgumentMapping: + +return KernelSignature("mean_all_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature MeanGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("mean_all_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MemoryEfficientAttentionGradOpArgumentMapping: + +return KernelSignature("memory_efficient_attention_grad", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "output", "logsumexp", "seed_and_offset", "output@GRAD"}, {"max_seqlen_q", "max_seqlen_k", "causal", "dropout_p", "scale"}, {"query@GRAD", "key@GRAD", "value@GRAD", "bias@GRAD"}); +return KernelSignature("memory_efficient_attention_grad", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "output", "logsumexp", "seed_and_offset", "output@GRAD"}, {"max_seqlen_q", "MaxSeqlenKTensor", "causal", "dropout_p", "scale"}, {"query@GRAD", "key@GRAD", "value@GRAD", "bias@GRAD"}); +return KernelSignature("memory_efficient_attention_grad", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "output", "logsumexp", "seed_and_offset", "output@GRAD"}, {"MaxSeqlenQTensor", "max_seqlen_k", "causal", "dropout_p", "scale"}, {"query@GRAD", "key@GRAD", "value@GRAD", "bias@GRAD"}); +return KernelSignature("memory_efficient_attention_grad", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "output", "logsumexp", "seed_and_offset", "output@GRAD"}, {"MaxSeqlenQTensor", "MaxSeqlenKTensor", "causal", "dropout_p", "scale"}, {"query@GRAD", "key@GRAD", "value@GRAD", 
"bias@GRAD"}); +****************************************************************** +*/ + +KernelSignature MemoryEfficientAttentionGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "output", "logsumexp", "seed_and_offset", "output@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("MaxSeqlenQTensor") ? "MaxSeqlenQTensor" : "max_seqlen_q"); + attrs.emplace_back(ctx.HasInput("MaxSeqlenKTensor") ? "MaxSeqlenKTensor" : "max_seqlen_k"); + attrs.emplace_back("causal"); + attrs.emplace_back("dropout_p"); + attrs.emplace_back("scale"); + paddle::small_vector outputs {"query@GRAD", "key@GRAD", "value@GRAD", "bias@GRAD"}; + return KernelSignature("memory_efficient_attention_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MeshgridGradOpArgumentMapping: + +return KernelSignature("meshgrid_grad", {"X", "outputs@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature MeshgridGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "outputs@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("meshgrid_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ModeGradOpArgumentMapping: + +return KernelSignature("mode_grad", {"X", "Indices", "Out@GRAD"}, {"axis", "keepdim"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Indices", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("keepdim"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("mode_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MultiDotGradOpArgumentMapping: + +return KernelSignature("multi_dot_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature MultiDotGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("multi_dot_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MultiplexGradOpArgumentMapping: + +return KernelSignature("multiplex_grad", {"Ids", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature MultiplexGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Ids", "Out@GRAD"}; + paddle::small_vector attrs; + 
paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("multiplex_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MvGradOpArgumentMapping: + +return KernelSignature("mv_grad", {"X", "Vec", "Out@GRAD"}, {}, {"X@GRAD", "Vec@GRAD"}); +****************************************************************** +*/ + +KernelSignature MvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Vec", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Vec@GRAD"}; + return KernelSignature("mv_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NanmedianGradOpArgumentMapping: + +return KernelSignature("nanmedian_grad", {"X", "MedianIndex", "Out@GRAD"}, {"axis", "keepdim"}, {"X@GRAD"}); +return KernelSignature("nanmedian_grad", {"X", "MedianIndex", "Out@GRAD"}, {"AxisTensorList", "keepdim"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature NanmedianGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "MedianIndex", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("AxisTensor") + ? "AxisTensor" + : ctx.InputSize("AxisTensorList") > 0 + ? "AxisTensorList" + : "axis"); + attrs.emplace_back("keepdim"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("nanmedian_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NearestInterpGradOpArgumentMapping: + +return KernelSignature("nearest_interp_grad", {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature NearestInterpV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("data_layout"); + attrs.emplace_back("out_d"); + attrs.emplace_back("out_h"); + attrs.emplace_back("out_w"); + attrs.emplace_back("scale"); + attrs.emplace_back("interp_method"); + attrs.emplace_back("align_corners"); + attrs.emplace_back("align_mode"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("nearest_interp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NllLossGradOpArgumentMapping: + +return KernelSignature("nll_loss_grad", {"X", "Label", "Weight", "Total_weight", "Out@GRAD"}, {"ignore_index", "reduction"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature NllLossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", 
"Label", "Weight", "Total_weight", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("ignore_index"); + attrs.emplace_back("reduction"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("nll_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by OverlapAddGradOpArgumentMapping: + +return KernelSignature("overlap_add_grad", {"X", "Out@GRAD"}, {"hop_length", "axis"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature OverlapAddGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("hop_length"); + attrs.emplace_back("axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("overlap_add_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PNormGradOpArgumentMapping: + +return KernelSignature("p_norm_grad", {"X", "Out", "Out@GRAD"}, {"porder", "axis", "epsilon", "keepdim", "asvector"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature PNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("porder"); + attrs.emplace_back("axis"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("keepdim"); + attrs.emplace_back("asvector"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("p_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Pad3dDoubleGradOpArgumentMapping: + +return KernelSignature("pad3d", {"grad_x@GRAD"}, {"paddings", "mode", "value", "data_format"}, {"grad_out@GRAD"}); +return KernelSignature("pad3d", {"grad_x@GRAD"}, {"Paddings", "mode", "value", "data_format"}, {"grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature Pad3dDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"grad_x@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("Paddings") + ? 
"Paddings" + : "paddings"); + + attrs.emplace_back("mode"); + attrs.emplace_back("value"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"grad_out@GRAD"}; + return KernelSignature("pad3d", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Pad3dGradOpArgumentMapping: + +return KernelSignature("pad3d_grad", {"X", "Out@GRAD"}, {"paddings", "mode", "value", "data_format"}, {"X@GRAD"}); +return KernelSignature("pad3d_grad", {"X", "Out@GRAD"}, {"Paddings", "mode", "value", "data_format"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature Pad3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("Paddings") + ? "Paddings" + : "paddings"); + + attrs.emplace_back("mode"); + attrs.emplace_back("value"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("pad3d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PixelShuffleGradOpArgumentMapping: + +return KernelSignature("pixel_shuffle_grad", {"Out@GRAD"}, {"upscale_factor", "data_format"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature PixelShuffleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("upscale_factor"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("pixel_shuffle_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PixelUnshuffleGradOpArgumentMapping: + +return KernelSignature("pixel_unshuffle_grad", {"Out@GRAD"}, {"downscale_factor", "data_format"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature PixelUnshuffleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("downscale_factor"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("pixel_unshuffle_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PoissonGradOpArgumentMapping: + +return KernelSignature("poisson_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature PoissonGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("poisson_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PolygammaGradOpArgumentMapping: + +return KernelSignature("polygamma_grad", {"x", "out@GRAD"}, {"n"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature PolygammaGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("n"); + paddle::small_vector outputs {"x@GRAD"}; + return KernelSignature("polygamma_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PowDoubleGradOpArgumentMapping: + +return KernelSignature("pow_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {"factor"}, {"X@GRAD", "grad_out@GRAD"}); +return KernelSignature("pow_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {"FactorTensor"}, {"X@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature PowDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("FactorTensor") ? "FactorTensor" : "factor"); + paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; + return KernelSignature("pow_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PowGradOpArgumentMapping: + +return KernelSignature("pow_grad", {"X", "Out@GRAD"}, {"factor"}, {"X@GRAD"}); +return KernelSignature("pow_grad", {"X", "Out@GRAD"}, {"FactorTensor"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature PowGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("FactorTensor") ? "FactorTensor" : "factor"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("pow_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PowTripleGradOpArgumentMapping: + +return KernelSignature("pow_triple_grad", {"X", "grad_out", "grad_grad_x", "grad_x@GRAD", "grad_grad_out@GRAD"}, {"factor"}, {"X@GRAD", "grad_out@GRAD", "grad_grad_x@GRAD"}); +return KernelSignature("pow_triple_grad", {"X", "grad_out", "grad_grad_x", "grad_x@GRAD", "grad_grad_out@GRAD"}, {"FactorTensor"}, {"X@GRAD", "grad_out@GRAD", "grad_grad_x@GRAD"}); +****************************************************************** +*/ + +KernelSignature PowTripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_out", "grad_grad_x", "grad_x@GRAD", "grad_grad_out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("FactorTensor") ? 
"FactorTensor" : "factor"); + paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD", "grad_grad_x@GRAD"}; + return KernelSignature("pow_triple_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PreluGradOpArgumentMapping: + +return KernelSignature("prelu_grad", {"X", "Alpha", "Out@GRAD"}, {"data_format", "mode"}, {"X@GRAD", "Alpha@GRAD"}); +****************************************************************** +*/ + +KernelSignature PreluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Alpha", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("data_format"); + attrs.emplace_back("mode"); + paddle::small_vector outputs {"X@GRAD", "Alpha@GRAD"}; + return KernelSignature("prelu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PsroiPoolGradOpArgumentMapping: + +return KernelSignature("psroi_pool_grad", {"X", "ROIs", "RoisNum", "Out@GRAD"}, {"pooled_height", "pooled_width", "output_channels", "spatial_scale"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature PsroiPoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "ROIs", "RoisNum", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("pooled_height"); + attrs.emplace_back("pooled_width"); + attrs.emplace_back("output_channels"); + attrs.emplace_back("spatial_scale"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("psroi_pool_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PutAlongAxisGradOpArgumentMapping: + +return KernelSignature("put_along_axis_grad", {"Input", "Index", "Result@GRAD"}, {"Axis", "Reduce"}, {"Input@GRAD", "Value@GRAD"}); +****************************************************************** +*/ + +KernelSignature PutAlongAxisGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Index", "Result@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("Axis"); + attrs.emplace_back("Reduce"); + paddle::small_vector outputs {"Input@GRAD", "Value@GRAD"}; + return KernelSignature("put_along_axis_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by QrGradOpArgumentMapping: + +return KernelSignature("qr_grad", {"X", "Q", "R", "Q@GRAD", "R@GRAD"}, {"mode"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature QrGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Q", "R", "Q@GRAD", "R@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("mode"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("qr_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RealGradOpArgumentMapping: + +return KernelSignature("real_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature RealGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("real_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ReciprocalGradOpArgumentMapping: + +return KernelSignature("reciprocal_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ReciprocalGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("reciprocal_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Relu6GradOpArgumentMapping: + +return KernelSignature("relu6_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature Relu6GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("relu6_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ReluDoubleGradOpArgumentMapping: + +return KernelSignature("relu_double_grad", {"Out", "grad_x@GRAD"}, {}, {"grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature ReluGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "grad_x@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"grad_out@GRAD"}; + return KernelSignature("relu_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ReluGradOpArgumentMapping: + +return KernelSignature("relu_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("relu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures 
returned by RenormGradOpArgumentMapping: + +return KernelSignature("renorm_grad", {"X", "Out@GRAD"}, {"p", "axis", "max_norm"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature RenormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("p"); + attrs.emplace_back("axis"); + attrs.emplace_back("max_norm"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("renorm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RoiAlignGradOpArgumentMapping: + +return KernelSignature("roi_align_grad", {"X", "ROIs", "RoisNum", "Out@GRAD"}, {"pooled_height", "pooled_width", "spatial_scale", "sampling_ratio", "aligned"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature RoiAlignGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "ROIs", "RoisNum", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("pooled_height"); + attrs.emplace_back("pooled_width"); + attrs.emplace_back("spatial_scale"); + attrs.emplace_back("sampling_ratio"); + attrs.emplace_back("aligned"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("roi_align_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RoiPoolGradOpArgumentMapping: + +return KernelSignature("roi_pool_grad", {"X", "ROIs", "RoisNum", "Argmax", "Out@GRAD"}, {"pooled_height", "pooled_width", "spatial_scale"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature RoiPoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "ROIs", "RoisNum", "Argmax", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("pooled_height"); + attrs.emplace_back("pooled_width"); + attrs.emplace_back("spatial_scale"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("roi_pool_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RollGradOpArgumentMapping: + +return KernelSignature("roll_grad", {"X", "Out@GRAD"}, {"shifts", "axis"}, {"X@GRAD"}); +return KernelSignature("roll_grad", {"X", "Out@GRAD"}, {"ShiftsTensor", "axis"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("ShiftsTensor") + ? 
"ShiftsTensor" + : "shifts"); + + attrs.emplace_back("axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("roll_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RoundGradOpArgumentMapping: + +return KernelSignature("round_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature RoundGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("round_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RsqrtDoubleGradOpArgumentMapping: + +return KernelSignature("rsqrt_double_grad", {"Out", "grad_x", "grad_x@GRAD"}, {}, {"Out@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature RsqrtGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "grad_x", "grad_x@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out@GRAD", "grad_out@GRAD"}; + return KernelSignature("rsqrt_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RsqrtGradOpArgumentMapping: + +return KernelSignature("rsqrt_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature RsqrtGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("rsqrt_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ScatterGradOpArgumentMapping: + +return KernelSignature("scatter_grad", {"Ids", "Updates", "Out@GRAD"}, {"overwrite"}, {"X@GRAD", "Updates@GRAD"}); +****************************************************************** +*/ + +KernelSignature ScatterGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Ids", "Updates", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("overwrite"); + paddle::small_vector outputs {"X@GRAD", "Updates@GRAD"}; + return KernelSignature("scatter_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ScatterNdAddGradOpArgumentMapping: + +return KernelSignature("scatter_nd_add_grad", {"Index", "Updates", "Out@GRAD"}, {}, {"X@GRAD", "Updates@GRAD"}); +****************************************************************** +*/ + +KernelSignature ScatterNdAddGradOpArgumentMapping(const 
ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Index", "Updates", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Updates@GRAD"}; + return KernelSignature("scatter_nd_add_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SegmentPoolGradOpArgumentMapping: + +return KernelSignature("segment_pool_grad", {"X", "SegmentIds", "Out", "SummedIds", "Out@GRAD"}, {"pooltype"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SegmentPoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "SegmentIds", "Out", "SummedIds", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("pooltype"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("segment_pool_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SeluGradOpArgumentMapping: + +return KernelSignature("selu_grad", {"Out", "Out@GRAD"}, {"scale", "alpha"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("scale"); + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("selu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SendURecvGradOpArgumentMapping: + +return KernelSignature("send_u_recv_grad", {"X", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, {"reduce_op"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature GraphSendRecvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("reduce_op"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("send_u_recv_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SendUeRecvGradOpArgumentMapping: + +return KernelSignature("send_ue_recv_grad", {"X", "Y", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, {"message_op", "reduce_op"}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature GraphSendUeRecvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("message_op"); + attrs.emplace_back("reduce_op"); + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("send_ue_recv_grad", std::move(inputs), 
std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SendUvGradOpArgumentMapping: + +return KernelSignature("send_uv_grad", {"x", "y", "src_index", "dst_index", "out@GRAD"}, {"message_op"}, {"x@GRAD", "y@GRAD"}); +****************************************************************** +*/ + +KernelSignature GraphSendUvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "src_index", "dst_index", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("message_op"); + paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; + return KernelSignature("send_uv_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SigmoidCrossEntropyWithLogitsGradOpArgumentMapping: + +return KernelSignature("sigmoid_cross_entropy_with_logits_grad", {"X", "Label", "pos_weight", "Out@GRAD"}, {"normalize", "ignore_index"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SigmoidCrossEntropyWithLogitsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Label", "pos_weight", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("normalize"); + attrs.emplace_back("ignore_index"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("sigmoid_cross_entropy_with_logits_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SigmoidDoubleGradOpArgumentMapping: + +return KernelSignature("sigmoid_double_grad", {"Out", "fwd_grad_out", "grad_x@GRAD"}, {}, {"Out@GRAD", "fwd_grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature SigmoidGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "fwd_grad_out", "grad_x@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out@GRAD", "fwd_grad_out@GRAD"}; + return KernelSignature("sigmoid_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SigmoidGradOpArgumentMapping: + +return KernelSignature("sigmoid_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SigmoidGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("sigmoid_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SigmoidTripleGradOpArgumentMapping: + +return KernelSignature("sigmoid_triple_grad", {"Out", "fwd_grad_out", "grad_grad_x", "grad_out@GRAD", 
"grad_grad_out@GRAD"}, {}, {"Out@GRAD", "fwd_grad_out@GRAD", "grad_grad_x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SigmoidTripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "fwd_grad_out", "grad_grad_x", "grad_out@GRAD", "grad_grad_out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out@GRAD", "fwd_grad_out@GRAD", "grad_grad_x@GRAD"}; + return KernelSignature("sigmoid_triple_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SiluGradOpArgumentMapping: + +return KernelSignature("silu_grad", {"X", "Out", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SiluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("silu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SinDoubleGradOpArgumentMapping: + +return KernelSignature("sin_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {}, {"X@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature SinDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; + return KernelSignature("sin_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SinGradOpArgumentMapping: + +return KernelSignature("sin_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("sin_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SinTripleGradOpArgumentMapping: + +return KernelSignature("sin_triple_grad", {"X", "grad_out_forward", "grad_x_grad_forward", "grad_x@GRAD", "grad_out_grad@GRAD"}, {}, {"X@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}); +****************************************************************** +*/ + +KernelSignature SinTripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_out_forward", "grad_x_grad_forward", "grad_x@GRAD", "grad_out_grad@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}; + return KernelSignature("sin_triple_grad", std::move(inputs), 
std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SinhGradOpArgumentMapping: + +return KernelSignature("sinh_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SinhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("sinh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SlogdetGradOpArgumentMapping: + +return KernelSignature("slogdet_grad", {"Input", "Out", "Out@GRAD"}, {}, {"Input@GRAD"}); +****************************************************************** +*/ + +KernelSignature SlogdeterminantGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Input@GRAD"}; + return KernelSignature("slogdet_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SoftplusDoubleGradOpArgumentMapping: + +return KernelSignature("softplus_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {"beta", "threshold"}, {"X@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature SoftplusDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("beta"); + attrs.emplace_back("threshold"); + paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; + return KernelSignature("softplus_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SoftplusGradOpArgumentMapping: + +return KernelSignature("softplus_grad", {"X", "Out@GRAD"}, {"beta", "threshold"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SoftplusGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("beta"); + attrs.emplace_back("threshold"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("softplus_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SoftshrinkGradOpArgumentMapping: + +return KernelSignature("softshrink_grad", {"X", "Out@GRAD"}, {"lambda"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SoftshrinkGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + 
paddle::small_vector attrs; + attrs.emplace_back("lambda"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("softshrink_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SoftsignGradOpArgumentMapping: + +return KernelSignature("softsign_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SoftsignGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("softsign_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SolveGradOpArgumentMapping: + +return KernelSignature("solve_grad", {"X", "Y", "Out", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature SolveGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("solve_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SpectralNormGradOpArgumentMapping: + +return KernelSignature("spectral_norm_grad", {"Weight", "U", "V", "Out@GRAD"}, {"dim", "power_iters", "eps"}, {"Weight@GRAD"}); +****************************************************************** +*/ + +KernelSignature SpectralNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Weight", "U", "V", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + attrs.emplace_back("power_iters"); + attrs.emplace_back("eps"); + paddle::small_vector outputs {"Weight@GRAD"}; + return KernelSignature("spectral_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SqrtDoubleGradOpArgumentMapping: + +return KernelSignature("sqrt_double_grad", {"Out", "grad_x", "grad_x@GRAD"}, {}, {"Out@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature SqrtGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "grad_x", "grad_x@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out@GRAD", "grad_out@GRAD"}; + return KernelSignature("sqrt_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SqrtGradOpArgumentMapping: + +return KernelSignature("sqrt_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); 
+****************************************************************** +*/ + +KernelSignature SqrtGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("sqrt_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SquareDoubleGradOpArgumentMapping: + +return KernelSignature("square_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {}, {"X@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature SquareGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; + return KernelSignature("square_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SquareGradOpArgumentMapping: + +return KernelSignature("square_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SquareGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("square_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SquaredL2NormGradOpArgumentMapping: + +return KernelSignature("squared_l2_norm_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SquaredL2NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("squared_l2_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SqueezeGradOpArgumentMapping: + +return KernelSignature("squeeze_grad", {"XShape", "Out@GRAD"}, {"axes"}, {"X@GRAD"}); +return KernelSignature("squeeze_grad", {"XShape", "Out@GRAD"}, {"AxisTensor"}, {"X@GRAD"}); +return KernelSignature("squeeze_grad", {"XShape", "Out@GRAD"}, {"AxisTensorList"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature Squeeze2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"XShape", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axes"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("squeeze_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 
'get_compat_kernel_signature.py' +All possible KernelSignatures returned by StackGradOpArgumentMapping: + +return KernelSignature("stack_grad", {"Y@GRAD"}, {"axis"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature StackGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Y@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("stack_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by StanhGradOpArgumentMapping: + +return KernelSignature("stanh_grad", {"X", "Out@GRAD"}, {"scale_a", "scale_b"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature StanhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("scale_a"); + attrs.emplace_back("scale_b"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("stanh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SvdGradOpArgumentMapping: + +return KernelSignature("svd_grad", {"X", "U", "VH", "S", "U@GRAD", "VH@GRAD", "S@GRAD"}, {"full_matrices"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SvdGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "U", "VH", "S", "U@GRAD", "VH@GRAD", "S@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("full_matrices"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("svd_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TakeAlongAxisGradOpArgumentMapping: + +return KernelSignature("take_along_axis_grad", {"Input", "Index", "Result@GRAD"}, {"Axis"}, {"Input@GRAD"}); +****************************************************************** +*/ + +KernelSignature TakeAlongAxisGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Index", "Result@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("Axis"); + paddle::small_vector outputs {"Input@GRAD"}; + return KernelSignature("take_along_axis_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TanGradOpArgumentMapping: + +return KernelSignature("tan_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature TanGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("tan_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TanhDoubleGradOpArgumentMapping: + +return KernelSignature("tanh_double_grad", {"Out", "grad_out", "grad_x@GRAD"}, {}, {"Out@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature TanhGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "grad_out", "grad_x@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out@GRAD", "grad_out@GRAD"}; + return KernelSignature("tanh_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TanhGradOpArgumentMapping: + +return KernelSignature("tanh_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature TanhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("tanh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TanhShrinkGradOpArgumentMapping: + +return KernelSignature("tanh_shrink_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature TanhShrinkGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("tanh_shrink_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TanhTripleGradOpArgumentMapping: + +return KernelSignature("tanh_triple_grad", {"Out", "grad_out_forward", "grad_x_grad_forward", "grad_out_new@GRAD", "grad_out_grad@GRAD"}, {}, {"Out@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}); +****************************************************************** +*/ + +KernelSignature TanhTripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "grad_out_forward", "grad_x_grad_forward", "grad_out_new@GRAD", "grad_out_grad@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}; + return KernelSignature("tanh_triple_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TemporalShiftGradOpArgumentMapping: + +return KernelSignature("temporal_shift_grad", {"Out@GRAD"}, {"seg_num", "shift_ratio", "data_format"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature TemporalShiftGradOpArgumentMapping(const ArgumentMappingContext& 
ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("seg_num"); + attrs.emplace_back("shift_ratio"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("temporal_shift_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TensorUnfoldGradOpArgumentMapping: + +return KernelSignature("tensor_unfold_grad", {"input", "out@GRAD"}, {"axis", "size", "step"}, {"input@GRAD"}); +****************************************************************** +*/ + +KernelSignature TensorUnfoldGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("size"); + attrs.emplace_back("step"); + paddle::small_vector outputs {"input@GRAD"}; + return KernelSignature("tensor_unfold_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ThresholdedReluGradOpArgumentMapping: + +return KernelSignature("thresholded_relu_grad", {"X", "Out@GRAD"}, {"threshold"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ThresholdedReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("threshold"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("thresholded_relu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TopkGradOpArgumentMapping: + +return KernelSignature("topk_grad", {"X", "Indices", "Out@GRAD"}, {"k", "axis", "largest", "sorted"}, {"X@GRAD"}); +return KernelSignature("topk_grad", {"X", "Indices", "Out@GRAD"}, {"K", "axis", "largest", "sorted"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature TopKV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Indices", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("K") ? 
"K" : "k"); + attrs.emplace_back("axis"); + attrs.emplace_back("largest"); + attrs.emplace_back("sorted"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("topk_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TraceGradOpArgumentMapping: + +return KernelSignature("trace_grad", {"Input", "Out@GRAD"}, {"offset", "axis1", "axis2"}, {"Input@GRAD"}); +****************************************************************** +*/ + +KernelSignature TraceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("offset"); + attrs.emplace_back("axis1"); + attrs.emplace_back("axis2"); + paddle::small_vector outputs {"Input@GRAD"}; + return KernelSignature("trace_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TriangularSolveGradOpArgumentMapping: + +return KernelSignature("triangular_solve_grad", {"X", "Y", "Out", "Out@GRAD"}, {"upper", "transpose", "unitriangular"}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature TriangularSolveGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("upper"); + attrs.emplace_back("transpose"); + attrs.emplace_back("unitriangular"); + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("triangular_solve_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TrilinearInterpGradOpArgumentMapping: + +return KernelSignature("trilinear_interp_grad", {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature TrilinearInterpV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("data_layout"); + attrs.emplace_back("out_d"); + attrs.emplace_back("out_h"); + attrs.emplace_back("out_w"); + attrs.emplace_back("scale"); + attrs.emplace_back("interp_method"); + attrs.emplace_back("align_corners"); + attrs.emplace_back("align_mode"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("trilinear_interp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TruncGradOpArgumentMapping: + +return KernelSignature("trunc_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature TruncGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + 
paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("trunc_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UnfoldGradOpArgumentMapping: + +return KernelSignature("unfold_grad", {"X", "Y@GRAD"}, {"kernel_sizes", "strides", "paddings", "dilations"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature UnfoldGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("kernel_sizes"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("unfold_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UniformInplaceGradOpArgumentMapping: + +return KernelSignature("uniform_inplace_grad", {"Out@GRAD"}, {"min", "max", "seed", "diag_num", "diag_step", "diag_val"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature UniformRandomInplaceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("min"); + attrs.emplace_back("max"); + attrs.emplace_back("seed"); + attrs.emplace_back("diag_num"); + attrs.emplace_back("diag_step"); + attrs.emplace_back("diag_val"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("uniform_inplace_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UnsqueezeGradOpArgumentMapping: + +return KernelSignature("unsqueeze_grad", {"XShape", "Out@GRAD"}, {"axes"}, {"X@GRAD"}); +return KernelSignature("unsqueeze_grad", {"XShape", "Out@GRAD"}, {"AxesTensor"}, {"X@GRAD"}); +return KernelSignature("unsqueeze_grad", {"XShape", "Out@GRAD"}, {"AxesTensorList"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature Unsqueeze2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"XShape", "Out@GRAD"}; + paddle::small_vector attrs; + + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("unsqueeze_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UnstackGradOpArgumentMapping: + +return KernelSignature("unstack_grad", {"Y@GRAD"}, {"axis"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature UnstackGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Y@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("unstack_grad", 
std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ViewDtypeGradOpArgumentMapping: + +return KernelSignature("view_dtype_grad", {"input", "out@GRAD"}, {"dtype"}, {"input@GRAD"}); +****************************************************************** +*/ + +KernelSignature ViewDtypeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"input@GRAD"}; + return KernelSignature("view_dtype_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ViewShapeGradOpArgumentMapping: + +return KernelSignature("view_shape_grad", {"input", "out@GRAD"}, {"dims"}, {"input@GRAD"}); +****************************************************************** +*/ + +KernelSignature ViewShapeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dims"); + paddle::small_vector outputs {"input@GRAD"}; + return KernelSignature("view_shape_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WarpctcGradOpArgumentMapping: + +return KernelSignature("warpctc_grad", {"Logits", "LogitsLength", "WarpCTCGrad", "Loss@GRAD"}, {"blank", "norm_by_times"}, {"Logits@GRAD"}); +****************************************************************** +*/ + +KernelSignature WarpctcGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Logits", "LogitsLength", "WarpCTCGrad", "Loss@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("blank"); + attrs.emplace_back("norm_by_times"); + paddle::small_vector outputs {"Logits@GRAD"}; + return KernelSignature("warpctc_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WarprnntGradOpArgumentMapping: + +return KernelSignature("warprnnt_grad", {"input", "input_lengths", "warprnntgrad", "loss@GRAD"}, {"blank", "fastemit_lambda"}, {"input@GRAD"}); +****************************************************************** +*/ + +KernelSignature WarprnntGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input", "input_lengths", "warprnntgrad", "loss@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("blank"); + attrs.emplace_back("fastemit_lambda"); + paddle::small_vector outputs {"input@GRAD"}; + return KernelSignature("warprnnt_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WeightOnlyLinearGradOpArgumentMapping: + +return KernelSignature("weight_only_linear_grad", {"x", "weight", "bias", 
"weight_scale", "out@GRAD"}, {"weight_dtype", "arch"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature WeightOnlyLinearGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "weight", "bias", "weight_scale", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("weight_dtype"); + attrs.emplace_back("arch"); + paddle::small_vector outputs {"x@GRAD"}; + return KernelSignature("weight_only_linear_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by WhereGradOpArgumentMapping: + +return KernelSignature("where_grad", {"Condition", "X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature WhereGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Condition", "X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("where_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by YoloLossGradOpArgumentMapping: + +return KernelSignature("yolo_loss_grad", {"X", "GTBox", "GTLabel", "GTScore", "ObjectnessMask", "GTMatchMask", "Loss@GRAD"}, {"anchors", "anchor_mask", "class_num", "ignore_thresh", "downsample_ratio", "use_label_smooth", "scale_x_y"}, {"X@GRAD", "GTBox@GRAD", "GTLabel@GRAD", "GTScore@GRAD"}); +****************************************************************** +*/ + +KernelSignature Yolov3LossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "GTBox", "GTLabel", "GTScore", "ObjectnessMask", "GTMatchMask", "Loss@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("anchors"); + attrs.emplace_back("anchor_mask"); + attrs.emplace_back("class_num"); + attrs.emplace_back("ignore_thresh"); + attrs.emplace_back("downsample_ratio"); + attrs.emplace_back("use_label_smooth"); + attrs.emplace_back("scale_x_y"); + paddle::small_vector outputs {"X@GRAD", "GTBox@GRAD", "GTLabel@GRAD", "GTScore@GRAD"}; + return KernelSignature("yolo_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Unpool3dGradOpArgumentMapping: + +return KernelSignature("unpool3d_grad", {"X", "Indices", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "output_size", "data_format"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature Unpool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Indices", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_size"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("unpool3d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +} // namespace phi + 
+PD_REGISTER_ARG_MAPPING_FN(abs, phi::AbsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(accuracy, phi::AccuracyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(acos, phi::AcosOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(acosh, phi::AcoshOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(adagrad, phi::AdagradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(adamax, phi::AdamaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(adamw, phi::AdamwOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(addmm, phi::AddmmOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(affine_grid, phi::AffineGridOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(allclose, phi::AllcloseOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(angle, phi::AngleOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(arg_max, argmax); +PD_REGISTER_ARG_MAPPING_FN(arg_max, phi::ArgMaxOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(arg_min, argmin); +PD_REGISTER_ARG_MAPPING_FN(arg_min, phi::ArgMinOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(argsort, phi::ArgsortOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(as_complex, phi::AsComplexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(as_real, phi::AsRealOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(as_strided, phi::AsStridedOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(asin, phi::AsinOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(asinh, phi::AsinhOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(atan, phi::AtanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(atan2, phi::Atan2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(atanh, phi::AtanhOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(auc, phi::AucOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(average_accumulates, phi::AverageAccumulatesOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bce_loss, phi::BceLossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bernoulli, phi::BernoulliOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(bicubic_interp_v2, bicubic_interp); +PD_REGISTER_ARG_MAPPING_FN(bicubic_interp_v2, phi::BicubicInterpV2OpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(bilinear_tensor_product, bilinear); +PD_REGISTER_ARG_MAPPING_FN(bilinear_tensor_product, phi::BilinearTensorProductOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(bilinear_interp_v2, bilinear_interp); +PD_REGISTER_ARG_MAPPING_FN(bilinear_interp_v2, phi::BilinearInterpV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bincount, phi::BincountOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bitwise_and, phi::BitwiseAndOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bitwise_not, phi::BitwiseNotOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bitwise_or, phi::BitwiseOrOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bitwise_xor, phi::BitwiseXorOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bmm, phi::BmmOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(box_coder, phi::BoxCoderOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors, phi::BroadcastTensorsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(ceil, phi::CeilOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(celu, phi::CeluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(check_finite_and_unscale, phi::CheckFiniteAndUnscaleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(check_numerics, phi::CheckNumericsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cholesky, phi::CholeskyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cholesky_solve, phi::CholeskySolveOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(class_center_sample, phi::ClassCenterSampleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(clip, 
phi::ClipOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(clip_by_norm, phi::ClipByNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(coalesce_tensor, phi::CoalesceTensorOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(complex, phi::ComplexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(concat, phi::ConcatOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conj, phi::ConjOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv3d, phi::Conv3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv3d_transpose, phi::Conv3dTransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cos, phi::CosOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cosh, phi::CoshOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(crop_tensor, crop); +PD_REGISTER_ARG_MAPPING_FN(crop_tensor, phi::CropTensorOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cross, phi::CrossOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(softmax_with_cross_entropy, cross_entropy_with_softmax); +PD_REGISTER_ARG_MAPPING_FN(softmax_with_cross_entropy, phi::SoftmaxWithCrossEntropyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cummax, phi::CummaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cummin, phi::CumminOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cumprod, phi::CumprodOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cumsum, phi::CumsumOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(data, phi::DataOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d, phi::DepthwiseConv2dOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(determinant, det); +PD_REGISTER_ARG_MAPPING_FN(determinant, phi::DeterminantOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag); +PD_REGISTER_ARG_MAPPING_FN(diag_v2, phi::DiagV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(diag_embed, phi::DiagEmbedOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(diagonal, phi::DiagonalOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(digamma, phi::DigammaOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dirichlet, phi::DirichletOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dist, phi::DistOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dot, phi::DotOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(edit_distance, phi::EditDistanceOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(eig, phi::EigOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(eigh, phi::EighOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(eigvals, phi::EigvalsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(eigvalsh, phi::EigvalshOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(equal_all, phi::EqualAllOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(erf, phi::ErfOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(erfinv, phi::ErfinvOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(exp, phi::ExpOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(expand_as_v2, expand_as); +PD_REGISTER_ARG_MAPPING_FN(expand_as_v2, phi::ExpandAsV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(expm1, phi::Expm1OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fft_c2c, phi::FftC2cOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fft_c2r, phi::FftC2rOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fft_r2c, phi::FftR2cOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(fill_any, fill); +PD_REGISTER_ARG_MAPPING_FN(fill_any, phi::FillAnyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fill_diagonal, phi::FillDiagonalOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fill_diagonal_tensor, phi::FillDiagonalTensorOpArgumentMapping); 
+PD_REGISTER_ARG_MAPPING_FN(flash_attn, phi::FlashAttnOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(flash_attn_unpadded, phi::FlashAttnUnpaddedOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(flip, phi::FlipOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(floor, phi::FloorOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fold, phi::FoldOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(frame, phi::FrameOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(full_int_array, phi::FullIntArrayOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gather, phi::GatherOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gather_nd, phi::GatherNdOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gather_tree, phi::GatherTreeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gaussian_inplace, phi::GaussianInplaceOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gelu, phi::GeluOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(generate_proposals_v2, generate_proposals); +PD_REGISTER_ARG_MAPPING_FN(generate_proposals_v2, phi::GenerateProposalsV2OpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(grid_sampler, grid_sample); +PD_REGISTER_ARG_MAPPING_FN(grid_sampler, phi::GridSamplerOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(group_norm, phi::GroupNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gumbel_softmax, phi::GumbelSoftmaxOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(hard_shrink, hardshrink); +PD_REGISTER_ARG_MAPPING_FN(hard_shrink, phi::HardShrinkOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(hard_sigmoid, hardsigmoid); +PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid, phi::HardSigmoidOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(brelu, hardtanh); +PD_REGISTER_ARG_MAPPING_FN(brelu, phi::BreluOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_heaviside, heaviside); +PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside, phi::ElementwiseHeavisideOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(histogram, phi::HistogramOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(huber_loss, phi::HuberLossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(i0, phi::I0OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(i0e, phi::I0eOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(i1, phi::I1OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(i1e, phi::I1eOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(imag, phi::ImagOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(index_add, phi::IndexAddOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(index_put, phi::IndexPutOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(index_sample, phi::IndexSampleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(index_select, phi::IndexSelectOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(index_select_strided, phi::IndexSelectStridedOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(inverse, phi::InverseOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(is_empty, phi::IsEmptyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(isclose, phi::IscloseOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(isfinite_v2, isfinite); +PD_REGISTER_ARG_MAPPING_FN(isfinite_v2, phi::IsfiniteV2OpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(isinf_v2, isinf); +PD_REGISTER_ARG_MAPPING_FN(isinf_v2, phi::IsinfV2OpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(isnan_v2, isnan); +PD_REGISTER_ARG_MAPPING_FN(isnan_v2, phi::IsnanV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(kldiv_loss, phi::KldivLossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(kron, phi::KronOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(kthvalue, 
phi::KthvalueOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(label_smooth, phi::LabelSmoothOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lamb, phi::LambOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(layer_norm, phi::LayerNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lerp, phi::LerpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lgamma, phi::LgammaOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(linear_interp_v2, linear_interp); +PD_REGISTER_ARG_MAPPING_FN(linear_interp_v2, phi::LinearInterpV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(llm_int8_linear, phi::LlmInt8LinearOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log, phi::LogOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log10, phi::Log10OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log1p, phi::Log1pOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log2, phi::Log2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log_loss, phi::LogLossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log_softmax, phi::LogSoftmaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logcumsumexp, phi::LogcumsumexpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logical_and, phi::LogicalAndOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logical_not, phi::LogicalNotOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logical_or, phi::LogicalOrOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logical_xor, phi::LogicalXorOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logit, phi::LogitOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logsigmoid, phi::LogsigmoidOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lstsq, phi::LstsqOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lu, phi::LuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lu_unpack, phi::LuUnpackOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(margin_cross_entropy, phi::MarginCrossEntropyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(masked_multihead_attention, phi::MaskedMultiheadAttentionOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(masked_select, phi::MaskedSelectOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(matrix_nms, phi::MatrixNmsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(matrix_power, phi::MatrixPowerOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index, phi::MaxPool2dWithIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index, phi::MaxPool3dWithIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(maxout, phi::MaxoutOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(mean, mean_all); +PD_REGISTER_ARG_MAPPING_FN(mean, phi::MeanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(memory_efficient_attention, phi::MemoryEfficientAttentionOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(merge_selected_rows, phi::MergeSelectedRowsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(merged_adam, phi::MergedAdamOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(merged_momentum, phi::MergedMomentumOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(meshgrid, phi::MeshgridOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(momentum, phi::MomentumOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multi_dot, phi::MultiDotOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multiclass_nms3, phi::MulticlassNms3OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multinomial, phi::MultinomialOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multiplex, phi::MultiplexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mv, phi::MvOpArgumentMapping); 
+PD_REGISTER_ARG_MAPPING_FN(nanmedian, phi::NanmedianOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(nearest_interp_v2, nearest_interp); +PD_REGISTER_ARG_MAPPING_FN(nearest_interp_v2, phi::NearestInterpV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nextafter, phi::NextafterOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nll_loss, phi::NllLossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nms, phi::NmsOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(where_index, nonzero); +PD_REGISTER_ARG_MAPPING_FN(where_index, phi::WhereIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(npu_identity, phi::NpuIdentityOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(size, numel); +PD_REGISTER_ARG_MAPPING_FN(size, phi::SizeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(overlap_add, phi::OverlapAddOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(p_norm, phi::PNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pad3d, phi::Pad3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pixel_shuffle, phi::PixelShuffleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pixel_unshuffle, phi::PixelUnshuffleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(poisson, phi::PoissonOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(polygamma, phi::PolygammaOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pow, phi::PowOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(prelu, phi::PreluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(prior_box, phi::PriorBoxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(psroi_pool, phi::PsroiPoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(put_along_axis, phi::PutAlongAxisOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(qr, phi::QrOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(real, phi::RealOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reciprocal, phi::ReciprocalOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(graph_reindex, reindex_graph); +PD_REGISTER_ARG_MAPPING_FN(graph_reindex, phi::GraphReindexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(relu, phi::ReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(relu6, phi::Relu6OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(renorm, phi::RenormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rms_norm, phi::RmsNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rmsprop, phi::RmspropOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::RoiAlignOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_pool, phi::RoiPoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roll, phi::RollOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(round, phi::RoundOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rsqrt, phi::RsqrtOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(scale, phi::ScaleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(scatter, phi::ScatterOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(scatter_nd_add, phi::ScatterNdAddOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(searchsorted, phi::SearchsortedOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(segment_pool, phi::SegmentPoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(selu, phi::SeluOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(graph_send_recv, send_u_recv); +PD_REGISTER_ARG_MAPPING_FN(graph_send_recv, phi::GraphSendRecvOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(graph_send_ue_recv, send_ue_recv); +PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv, phi::GraphSendUeRecvOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(graph_send_uv, send_uv); +PD_REGISTER_ARG_MAPPING_FN(graph_send_uv, phi::GraphSendUvOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sgd, phi::SgdOpArgumentMapping); 
+PD_REGISTER_ARG_MAPPING_FN(shape, phi::ShapeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(shard_index, phi::ShardIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid, phi::SigmoidOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_cross_entropy_with_logits, phi::SigmoidCrossEntropyWithLogitsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sign, phi::SignOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(silu, phi::SiluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sin, phi::SinOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sinh, phi::SinhOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(slogdeterminant, slogdet); +PD_REGISTER_ARG_MAPPING_FN(slogdeterminant, phi::SlogdeterminantOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softplus, phi::SoftplusOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softshrink, phi::SoftshrinkOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softsign, phi::SoftsignOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(solve, phi::SolveOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(spectral_norm, phi::SpectralNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sqrt, phi::SqrtOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(square, phi::SquareOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(squared_l2_norm, phi::SquaredL2NormOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(squeeze2, squeeze); +PD_REGISTER_ARG_MAPPING_FN(squeeze2, phi::Squeeze2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(stack, phi::StackOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(stanh, phi::StanhOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(svd, phi::SvdOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(take_along_axis, phi::TakeAlongAxisOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tan, phi::TanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh, phi::TanhOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_shrink, phi::TanhShrinkOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(temporal_shift, phi::TemporalShiftOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tensor_unfold, phi::TensorUnfoldOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(thresholded_relu, phi::ThresholdedReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(top_p_sampling, phi::TopPSamplingOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(top_k_v2, topk); +PD_REGISTER_ARG_MAPPING_FN(top_k_v2, phi::TopKV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trace, phi::TraceOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(triangular_solve, phi::TriangularSolveOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(trilinear_interp_v2, trilinear_interp); +PD_REGISTER_ARG_MAPPING_FN(trilinear_interp_v2, phi::TrilinearInterpV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trunc, phi::TruncOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unbind, phi::UnbindOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unfold, phi::UnfoldOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(uniform_random_inplace, uniform_inplace); +PD_REGISTER_ARG_MAPPING_FN(uniform_random_inplace, phi::UniformRandomInplaceOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unique_consecutive, phi::UniqueConsecutiveOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unpool3d, phi::Unpool3dOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2, unsqueeze); +PD_REGISTER_ARG_MAPPING_FN(unsqueeze2, phi::Unsqueeze2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unstack, phi::UnstackOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(update_loss_scaling, phi::UpdateLossScalingOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(view_dtype, phi::ViewDtypeOpArgumentMapping); 
+PD_REGISTER_ARG_MAPPING_FN(view_shape, phi::ViewShapeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(viterbi_decode, phi::ViterbiDecodeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(warpctc, phi::WarpctcOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(warprnnt, phi::WarprnntOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(weight_dequantize, phi::WeightDequantizeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(weight_only_linear, phi::WeightOnlyLinearOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(weight_quantize, phi::WeightQuantizeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(weighted_sample_neighbors, phi::WeightedSampleNeighborsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(where, phi::WhereOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(yolo_box, phi::YoloBoxOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(yolov3_loss, yolo_loss); +PD_REGISTER_ARG_MAPPING_FN(yolov3_loss, phi::Yolov3LossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(abs_double_grad, phi::AbsDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(abs_grad, phi::AbsGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(acos_grad, phi::AcosGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(acosh_grad, phi::AcoshGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(addmm_grad, phi::AddmmGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(affine_grid_grad, phi::AffineGridGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(angle_grad, phi::AngleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(argsort_grad, phi::ArgsortGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(as_strided_grad, phi::AsStridedGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(asin_grad, phi::AsinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(asinh_grad, phi::AsinhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(atan2_grad, phi::Atan2GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(atan_grad, phi::AtanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bce_loss_grad, phi::BceLossGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(bicubic_interp_v2_grad, bicubic_interp_grad); +PD_REGISTER_ARG_MAPPING_FN(bicubic_interp_v2_grad, phi::BicubicInterpV2GradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(bilinear_tensor_product_grad, bilinear_grad); +PD_REGISTER_ARG_MAPPING_FN(bilinear_tensor_product_grad, phi::BilinearTensorProductGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(bilinear_interp_v2_grad, bilinear_interp_grad); +PD_REGISTER_ARG_MAPPING_FN(bilinear_interp_v2_grad, phi::BilinearInterpV2GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bmm_grad, phi::BmmGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors_grad, phi::BroadcastTensorsGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(ceil_grad, phi::CeilGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(celu_grad_grad, celu_double_grad); +PD_REGISTER_ARG_MAPPING_FN(celu_grad_grad, phi::CeluGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(celu_grad, phi::CeluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cholesky_grad, phi::CholeskyGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cholesky_solve_grad, phi::CholeskySolveGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(clip_double_grad, phi::ClipDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(clip_grad, phi::ClipGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(complex_grad, phi::ComplexGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(concat_grad, phi::ConcatGradOpArgumentMapping); 
+PD_REGISTER_ARG_MAPPING_FN(conv2d_grad, phi::Conv2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_grad_grad, phi::Conv2dGradGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(conv3d_grad_grad, conv3d_double_grad); +PD_REGISTER_ARG_MAPPING_FN(conv3d_grad_grad, phi::Conv3dGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv3d_grad, phi::Conv3dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv3d_transpose_grad, phi::Conv3dTransposeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cos_double_grad, phi::CosDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cos_triple_grad, phi::CosTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cosh_grad, phi::CoshGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(crop_tensor_grad, crop_grad); +PD_REGISTER_ARG_MAPPING_FN(crop_tensor_grad, phi::CropTensorGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(softmax_with_cross_entropy_grad, cross_entropy_with_softmax_grad); +PD_REGISTER_ARG_MAPPING_FN(softmax_with_cross_entropy_grad, phi::SoftmaxWithCrossEntropyGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cross_grad, phi::CrossGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cummax_grad, phi::CummaxGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cummin_grad, phi::CumminGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cumprod_grad, phi::CumprodGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cumsum_grad, phi::CumsumGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(depthwise_conv2d_grad_grad, depthwise_conv2d_double_grad); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad_grad, phi::DepthwiseConv2dGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad, phi::DepthwiseConv2dGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(determinant_grad, det_grad); +PD_REGISTER_ARG_MAPPING_FN(determinant_grad, phi::DeterminantGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(diag_v2_grad, diag_grad); +PD_REGISTER_ARG_MAPPING_FN(diag_v2_grad, phi::DiagV2GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(diagonal_grad, phi::DiagonalGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(digamma_grad, phi::DigammaGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dist_grad, phi::DistGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dot_grad, phi::DotGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(eig_grad, phi::EigGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(eigh_grad, phi::EighGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(eigvalsh_grad, phi::EigvalshGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); +PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(erf_grad, phi::ErfGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(erfinv_grad, phi::ErfinvGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(exp_grad, phi::ExpGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(expand_as_v2_grad, expand_as_grad); +PD_REGISTER_ARG_MAPPING_FN(expand_as_v2_grad, phi::ExpandAsV2GradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(expand_v2_double_grad, expand_double_grad); +PD_REGISTER_ARG_MAPPING_FN(expm1_grad, phi::Expm1GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fft_c2c_grad, phi::FftC2cGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fft_c2r_grad, phi::FftC2rGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fft_r2c_grad, 
phi::FftR2cGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fill_diagonal_grad, phi::FillDiagonalGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fill_diagonal_tensor_grad, phi::FillDiagonalTensorGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(fill_any_grad, fill_grad); +PD_REGISTER_ARG_MAPPING_FN(fill_any_grad, phi::FillAnyGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(flash_attn_grad, phi::FlashAttnGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(flash_attn_unpadded_grad, phi::FlashAttnUnpaddedGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(floor_grad, phi::FloorGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmax_grad, fmax_grad); +PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad, phi::ElementwiseFmaxGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmin_grad, fmin_grad); +PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad, phi::ElementwiseFminGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fold_grad, phi::FoldGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(frame_grad, phi::FrameGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gather_grad, phi::GatherGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gather_nd_grad, phi::GatherNdGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gaussian_inplace_grad, phi::GaussianInplaceGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gelu_grad, phi::GeluGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(grid_sampler_grad, grid_sample_grad); +PD_REGISTER_ARG_MAPPING_FN(grid_sampler_grad, phi::GridSamplerGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(group_norm_grad, phi::GroupNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gumbel_softmax_grad, phi::GumbelSoftmaxGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(hard_shrink_grad, hardshrink_grad); +PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad, phi::HardShrinkGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(hard_sigmoid_grad, hardsigmoid_grad); +PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad, phi::HardSigmoidGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(brelu_grad, hardtanh_grad); +PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::BreluGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_heaviside_grad, heaviside_grad); +PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside_grad, phi::ElementwiseHeavisideGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(huber_loss_grad, phi::HuberLossGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(i0_grad, phi::I0GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(i0e_grad, phi::I0eGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(i1_grad, phi::I1GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(i1e_grad, phi::I1eGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(imag_grad, phi::ImagGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(index_add_grad, phi::IndexAddGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(index_put_grad, phi::IndexPutGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(index_sample_grad, phi::IndexSampleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(index_select_grad, phi::IndexSelectGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(index_select_strided_grad, phi::IndexSelectStridedGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(instance_norm_double_grad, phi::InstanceNormDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad, phi::InstanceNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(inverse_grad, phi::InverseGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(kldiv_loss_grad, 
phi::KldivLossGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(kron_grad, phi::KronGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(label_smooth_grad, phi::LabelSmoothGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(layer_norm_grad, phi::LayerNormGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad, phi::LeakyReluGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad, phi::LeakyReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lerp_grad, phi::LerpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lgamma_grad, phi::LgammaGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(linear_interp_v2_grad, linear_interp_grad); +PD_REGISTER_ARG_MAPPING_FN(linear_interp_v2_grad, phi::LinearInterpV2GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(log_grad_grad, log_double_grad); +PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log_loss_grad, phi::LogLossGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(log_softmax_grad, phi::LogSoftmaxGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logcumsumexp_grad, phi::LogcumsumexpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logit_grad, phi::LogitGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad, phi::LogsigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lu_grad, phi::LuGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lu_unpack_grad, phi::LuUnpackGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(margin_cross_entropy_grad, phi::MarginCrossEntropyGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(masked_select_grad, phi::MaskedSelectGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(matrix_power_grad, phi::MatrixPowerGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index_grad, phi::MaxPool2dWithIndexGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index_grad, phi::MaxPool3dWithIndexGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(maxout_grad, phi::MaxoutGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(mean_grad, mean_all_grad); +PD_REGISTER_ARG_MAPPING_FN(mean_grad, phi::MeanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(memory_efficient_attention_grad, phi::MemoryEfficientAttentionGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(meshgrid_grad, phi::MeshgridGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multi_dot_grad, phi::MultiDotGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multiplex_grad, phi::MultiplexGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mv_grad, phi::MvGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nanmedian_grad, phi::NanmedianGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(nearest_interp_v2_grad, nearest_interp_grad); +PD_REGISTER_ARG_MAPPING_FN(nearest_interp_v2_grad, phi::NearestInterpV2GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nll_loss_grad, phi::NllLossGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(overlap_add_grad, phi::OverlapAddGradOpArgumentMapping); 
+PD_REGISTER_ARG_MAPPING_FN(p_norm_grad, phi::PNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pad3d_double_grad, phi::Pad3dDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pad3d_grad, phi::Pad3dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pixel_shuffle_grad, phi::PixelShuffleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pixel_unshuffle_grad, phi::PixelUnshuffleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(poisson_grad, phi::PoissonGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(polygamma_grad, phi::PolygammaGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pow_double_grad, phi::PowDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pow_grad, phi::PowGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pow_triple_grad, phi::PowTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(prelu_grad, phi::PreluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(psroi_pool_grad, phi::PsroiPoolGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(put_along_axis_grad, phi::PutAlongAxisGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(qr_grad, phi::QrGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(real_grad, phi::RealGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reciprocal_grad, phi::ReciprocalGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(relu6_grad, phi::Relu6GradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); +PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad, phi::ReluGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(renorm_grad, phi::RenormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_align_grad, phi::RoiAlignGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_pool_grad, phi::RoiPoolGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roll_grad, phi::RollGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(round_grad, phi::RoundGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(rsqrt_grad_grad, rsqrt_double_grad); +PD_REGISTER_ARG_MAPPING_FN(rsqrt_grad_grad, phi::RsqrtGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rsqrt_grad, phi::RsqrtGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(scatter_grad, phi::ScatterGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(scatter_nd_add_grad, phi::ScatterNdAddGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(segment_pool_grad, phi::SegmentPoolGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(selu_grad, phi::SeluGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(graph_send_recv_grad, send_u_recv_grad); +PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad, phi::GraphSendRecvGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(graph_send_ue_recv_grad, send_ue_recv_grad); +PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv_grad, phi::GraphSendUeRecvGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(graph_send_uv_grad, send_uv_grad); +PD_REGISTER_ARG_MAPPING_FN(graph_send_uv_grad, phi::GraphSendUvGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_cross_entropy_with_logits_grad, phi::SigmoidCrossEntropyWithLogitsGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad, phi::SigmoidGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad, phi::SigmoidTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping); 
+PD_REGISTER_ARG_MAPPING_FN(sin_double_grad, phi::SinDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sin_grad, phi::SinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sin_triple_grad, phi::SinTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sinh_grad, phi::SinhGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(slogdeterminant_grad, slogdet_grad); +PD_REGISTER_ARG_MAPPING_FN(slogdeterminant_grad, phi::SlogdeterminantGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softplus_double_grad, phi::SoftplusDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softplus_grad, phi::SoftplusGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softshrink_grad, phi::SoftshrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softsign_grad, phi::SoftsignGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(solve_grad, phi::SolveGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(spectral_norm_grad, phi::SpectralNormGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(sqrt_grad_grad, sqrt_double_grad); +PD_REGISTER_ARG_MAPPING_FN(sqrt_grad_grad, phi::SqrtGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sqrt_grad, phi::SqrtGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(square_grad_grad, square_double_grad); +PD_REGISTER_ARG_MAPPING_FN(square_grad_grad, phi::SquareGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(square_grad, phi::SquareGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(squared_l2_norm_grad, phi::SquaredL2NormGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(squeeze2_double_grad, squeeze_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(squeeze2_grad, squeeze_grad); +PD_REGISTER_ARG_MAPPING_FN(squeeze2_grad, phi::Squeeze2GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(stack_grad, phi::StackGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(stanh_grad, phi::StanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(svd_grad, phi::SvdGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(take_along_axis_grad, phi::TakeAlongAxisGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); +PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad, phi::TanhGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_shrink_grad, phi::TanhShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad, phi::TanhTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(temporal_shift_grad, phi::TemporalShiftGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tensor_unfold_grad, phi::TensorUnfoldGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, phi::ThresholdedReluGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(top_k_v2_grad, topk_grad); +PD_REGISTER_ARG_MAPPING_FN(top_k_v2_grad, phi::TopKV2GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trace_grad, phi::TraceGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(triangular_solve_grad, phi::TriangularSolveGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(trilinear_interp_v2_grad, trilinear_interp_grad); +PD_REGISTER_ARG_MAPPING_FN(trilinear_interp_v2_grad, phi::TrilinearInterpV2GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trunc_grad, phi::TruncGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unfold_grad, phi::UnfoldGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(uniform_random_inplace_grad, uniform_inplace_grad); +PD_REGISTER_ARG_MAPPING_FN(uniform_random_inplace_grad, 
phi::UniformRandomInplaceGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_double_grad, unsqueeze_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_grad, unsqueeze_grad); +PD_REGISTER_ARG_MAPPING_FN(unsqueeze2_grad, phi::Unsqueeze2GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unstack_grad, phi::UnstackGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(view_dtype_grad, phi::ViewDtypeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(view_shape_grad, phi::ViewShapeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(warpctc_grad, phi::WarpctcGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(warprnnt_grad, phi::WarprnntGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(weight_only_linear_grad, phi::WeightOnlyLinearGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(where_grad, phi::WhereGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(yolov3_loss_grad, yolo_loss_grad); +PD_REGISTER_ARG_MAPPING_FN(yolov3_loss_grad, phi::Yolov3LossGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unpool3d_grad, phi::Unpool3dGradOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/generated_sparse_sig.cc b/paddle/fluid/operators/ops_signature/generated_sparse_sig.cc new file mode 100644 index 0000000000000..5f487e09b0e4e --- /dev/null +++ b/paddle/fluid/operators/ops_signature/generated_sparse_sig.cc @@ -0,0 +1,2735 @@ +// this file is generated by paddle/phi/api/yaml/generator/generate_op.py, do not edit. +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAbsOpArgumentMapping: + +return KernelSignature("abs_coo", {"x"}, {}, {"out"}); +return KernelSignature("abs_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseAbsOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "abs_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "abs_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAcosOpArgumentMapping: + +return KernelSignature("acos_coo", {"x"}, {}, {"out"}); +return KernelSignature("acos_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseAcosOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "acos_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "acos_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by 
SparseAcoshOpArgumentMapping: + +return KernelSignature("acosh_coo", {"x"}, {}, {"out"}); +return KernelSignature("acosh_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseAcoshOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "acosh_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "acosh_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAddOpArgumentMapping: + +return KernelSignature("add_coo_coo", {"x", "y"}, {}, {"out"}); +return KernelSignature("add_csr_csr", {"x", "y"}, {}, {"out"}); +return KernelSignature("add_coo_dense", {"x", "y"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseAddOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { + kernel_name = "add_coo_coo"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { + kernel_name = "add_csr_csr"; + } + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y")) { + kernel_name = "add_coo_dense"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAsinOpArgumentMapping: + +return KernelSignature("asin_coo", {"x"}, {}, {"out"}); +return KernelSignature("asin_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseAsinOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "asin_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "asin_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAsinhOpArgumentMapping: + +return KernelSignature("asinh_coo", {"x"}, {}, {"out"}); +return KernelSignature("asinh_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseAsinhOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "asinh_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + 
kernel_name = "asinh_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAtanOpArgumentMapping: + +return KernelSignature("atan_coo", {"x"}, {}, {"out"}); +return KernelSignature("atan_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseAtanOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "atan_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "atan_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAtanhOpArgumentMapping: + +return KernelSignature("atanh_coo", {"x"}, {}, {"out"}); +return KernelSignature("atanh_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseAtanhOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "atanh_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "atanh_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseBatchNormOpArgumentMapping: + +return KernelSignature("batch_norm_coo", {"x", "mean", "variance", "scale", "bias"}, {"is_test", "momentum", "epsilon", "data_layout", "use_global_stats", "trainable_statistics"}, {"out", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space"}); +****************************************************************** +*/ + +KernelSignature SparseBatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "mean", "variance", "scale", "bias"}; + paddle::small_vector attrs; + attrs.emplace_back("is_test"); + attrs.emplace_back("momentum"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("data_layout"); + attrs.emplace_back("use_global_stats"); + attrs.emplace_back("trainable_statistics"); + paddle::small_vector outputs {"out", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("mean") && ctx.IsDenseTensorInput("variance") && ctx.IsDenseTensorInput("scale") && ctx.IsDenseTensorInput("bias")) { + kernel_name = "batch_norm_coo"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 
'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseCastOpArgumentMapping: + +return KernelSignature("cast_coo", {"x"}, {"index_dtype", "value_dtype"}, {"out"}); +return KernelSignature("cast_csr", {"x"}, {"index_dtype", "value_dtype"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseCastOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("index_dtype"); + attrs.emplace_back("value_dtype"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "cast_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "cast_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseConv3dOpArgumentMapping: + +return KernelSignature("conv3d_coo", {"x", "kernel"}, {"paddings", "dilations", "strides", "groups", "subm", "key"}, {"out", "rulebook", "counter"}); +****************************************************************** +*/ + +KernelSignature SparseConv3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "kernel"}; + paddle::small_vector attrs; + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + attrs.emplace_back("strides"); + attrs.emplace_back("groups"); + attrs.emplace_back("subm"); + attrs.emplace_back("key"); + paddle::small_vector outputs {"out", "rulebook", "counter"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("kernel")) { + kernel_name = "conv3d_coo"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseDivideOpArgumentMapping: + +return KernelSignature("divide_coo_coo", {"x", "y"}, {}, {"out"}); +return KernelSignature("divide_csr_csr", {"x", "y"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseDivideOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { + kernel_name = "divide_coo_coo"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { + kernel_name = "divide_csr_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseDivideScalarOpArgumentMapping: + +return KernelSignature("divide_scalar_coo", {"x"}, {"scalar"}, {"out"}); +return KernelSignature("divide_scalar_csr", {"x"}, {"scalar"}, {"out"}); +****************************************************************** +*/ + +KernelSignature 
SparseDivideScalarOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("scalar"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "divide_scalar_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "divide_scalar_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseExpm1OpArgumentMapping: + +return KernelSignature("expm1_coo", {"x"}, {}, {"out"}); +return KernelSignature("expm1_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseExpm1OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "expm1_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "expm1_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseIsnanOpArgumentMapping: + +return KernelSignature("isnan_coo", {"x"}, {}, {"out"}); +return KernelSignature("isnan_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseIsnanOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "isnan_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "isnan_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseLeakyReluOpArgumentMapping: + +return KernelSignature("leaky_relu_coo", {"x"}, {"alpha"}, {"out"}); +return KernelSignature("leaky_relu_csr", {"x"}, {"alpha"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseLeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "leaky_relu_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "leaky_relu_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseLog1pOpArgumentMapping: + +return 
KernelSignature("log1p_coo", {"x"}, {}, {"out"}); +return KernelSignature("log1p_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseLog1pOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "log1p_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "log1p_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseMultiplyOpArgumentMapping: + +return KernelSignature("multiply_coo_coo", {"x", "y"}, {}, {"out"}); +return KernelSignature("multiply_csr_csr", {"x", "y"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseMultiplyOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { + kernel_name = "multiply_coo_coo"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { + kernel_name = "multiply_csr_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparsePowOpArgumentMapping: + +return KernelSignature("pow_coo", {"x"}, {"factor"}, {"out"}); +return KernelSignature("pow_csr", {"x"}, {"factor"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparsePowOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("factor"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "pow_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "pow_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseReluOpArgumentMapping: + +return KernelSignature("relu_coo", {"x"}, {}, {"out"}); +return KernelSignature("relu_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "relu_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "relu_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* 
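(Editor's note, not part of the generated file.) Every unary mapping in this file follows the same layout-dispatch pattern: the ArgumentMappingContext only reports how the input named "x" is stored, and the function translates that into a concrete kernel name, `*_coo` for a SparseCooTensor input or `*_csr` for a SparseCsrTensor input, leaving the placeholder "unregistered" when neither matches. A minimal hand-written sketch of that pattern for a hypothetical unary op (op and kernel names here are illustrative only, and the sketch assumes the same headers and `phi` namespace this file already uses):

// Editor's sketch, illustrative only; "example_coo"/"example_csr" stand in
// for any of the generated unary sparse kernels above.
KernelSignature SparseExampleOpArgumentMapping(const ArgumentMappingContext& ctx) {
  paddle::small_vector<const char*> inputs {"x"};
  paddle::small_vector<const char*> attrs;
  paddle::small_vector<const char*> outputs {"out"};

  const char* kernel_name = "unregistered";  // fallback when no layout matches
  if (ctx.IsSparseCooTensorInput("x")) {
    kernel_name = "example_coo";
  }
  if (ctx.IsSparseCsrTensorInput("x")) {
    kernel_name = "example_csr";
  }
  return KernelSignature(kernel_name, std::move(inputs), std::move(attrs), std::move(outputs));
}

As in generated_sig.cc above, each mapping is presumably hooked up with a PD_REGISTER_ARG_MAPPING_FN call further down in this generated file (outside this hunk).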
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseRelu6OpArgumentMapping: + +return KernelSignature("relu6_coo", {"x"}, {}, {"out"}); +return KernelSignature("relu6_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseRelu6OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "relu6_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "relu6_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseReshapeOpArgumentMapping: + +return KernelSignature("reshape_coo", {"x"}, {"shape"}, {"out"}); +return KernelSignature("reshape_coo", {"x"}, {"ShapeTensor"}, {"out"}); +return KernelSignature("reshape_coo", {"x"}, {"ShapeTensorList"}, {"out"}); +return KernelSignature("reshape_csr", {"x"}, {"shape"}, {"out"}); +return KernelSignature("reshape_csr", {"x"}, {"ShapeTensor"}, {"out"}); +return KernelSignature("reshape_csr", {"x"}, {"ShapeTensorList"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseReshapeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("ShapeTensor") + ? "ShapeTensor" + : ctx.InputSize("ShapeTensorList") > 0 + ? 
"ShapeTensorList" + : "shape"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "reshape_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "reshape_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseScaleOpArgumentMapping: + +return KernelSignature("scale_coo", {"x"}, {"scale", "bias", "bias_after_scale"}, {"out"}); +return KernelSignature("scale_csr", {"x"}, {"scale", "bias", "bias_after_scale"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("scale"); + attrs.emplace_back("bias"); + attrs.emplace_back("bias_after_scale"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "scale_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "scale_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSinOpArgumentMapping: + +return KernelSignature("sin_coo", {"x"}, {}, {"out"}); +return KernelSignature("sin_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseSinOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "sin_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "sin_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSinhOpArgumentMapping: + +return KernelSignature("sinh_coo", {"x"}, {}, {"out"}); +return KernelSignature("sinh_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseSinhOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "sinh_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "sinh_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSoftmaxOpArgumentMapping: + +return KernelSignature("softmax_coo", {"x"}, {"axis"}, {"out"}); +return 
KernelSignature("softmax_csr", {"x"}, {"axis"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseSoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "softmax_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "softmax_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSparseCooTensorOpArgumentMapping: + +return KernelSignature("sparse_coo_tensor", {"values", "indices"}, {"shape"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseSparseCooTensorOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"values", "indices"}; + paddle::small_vector attrs; + attrs.emplace_back("shape"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsDenseTensorInput("values") && ctx.IsDenseTensorInput("indices")) { + kernel_name = "sparse_coo_tensor"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSqrtOpArgumentMapping: + +return KernelSignature("sqrt_coo", {"x"}, {}, {"out"}); +return KernelSignature("sqrt_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseSqrtOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "sqrt_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "sqrt_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSquareOpArgumentMapping: + +return KernelSignature("square_coo", {"x"}, {}, {"out"}); +return KernelSignature("square_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseSquareOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "square_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "square_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible 
KernelSignatures returned by SparseSubtractOpArgumentMapping: + +return KernelSignature("subtract_coo_coo", {"x", "y"}, {}, {"out"}); +return KernelSignature("subtract_csr_csr", {"x", "y"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseSubtractOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { + kernel_name = "subtract_coo_coo"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { + kernel_name = "subtract_csr_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSumOpArgumentMapping: + +return KernelSignature("sum_coo", {"x"}, {"axis", "dtype", "keepdim"}, {"out"}); +return KernelSignature("sum_coo", {"x"}, {"AxisTensor", "dtype", "keepdim"}, {"out"}); +return KernelSignature("sum_coo", {"x"}, {"AxisTensorList", "dtype", "keepdim"}, {"out"}); +return KernelSignature("sum_csr", {"x"}, {"axis", "dtype", "keepdim"}, {"out"}); +return KernelSignature("sum_csr", {"x"}, {"AxisTensor", "dtype", "keepdim"}, {"out"}); +return KernelSignature("sum_csr", {"x"}, {"AxisTensorList", "dtype", "keepdim"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseSumOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("AxisTensor") + ? "AxisTensor" + : ctx.InputSize("AxisTensorList") > 0 + ? 
"AxisTensorList" + : "axis"); + attrs.emplace_back("dtype"); + attrs.emplace_back("keepdim"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "sum_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "sum_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSyncBatchNormOpArgumentMapping: + +return KernelSignature("sync_batch_norm_coo", {"x", "mean", "variance", "scale", "bias"}, {"is_test", "momentum", "epsilon", "data_layout", "use_global_stats", "trainable_statistics"}, {"out", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space"}); +****************************************************************** +*/ + +KernelSignature SparseSyncBatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "mean", "variance", "scale", "bias"}; + paddle::small_vector attrs; + attrs.emplace_back("is_test"); + attrs.emplace_back("momentum"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("data_layout"); + attrs.emplace_back("use_global_stats"); + attrs.emplace_back("trainable_statistics"); + paddle::small_vector outputs {"out", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("mean") && ctx.IsDenseTensorInput("variance") && ctx.IsDenseTensorInput("scale") && ctx.IsDenseTensorInput("bias")) { + kernel_name = "sync_batch_norm_coo"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseTanOpArgumentMapping: + +return KernelSignature("tan_coo", {"x"}, {}, {"out"}); +return KernelSignature("tan_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseTanOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "tan_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "tan_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseTanhOpArgumentMapping: + +return KernelSignature("tanh_coo", {"x"}, {}, {"out"}); +return KernelSignature("tanh_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseTanhOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "tanh_coo"; + } + if 
(ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "tanh_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseToDenseOpArgumentMapping: + +return KernelSignature("coo_to_dense", {"x"}, {}, {"out"}); +return KernelSignature("csr_to_dense", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseToDenseOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "coo_to_dense"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "csr_to_dense"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseToSparseCooOpArgumentMapping: + +return KernelSignature("dense_to_coo", {"x"}, {"sparse_dim"}, {"out"}); +return KernelSignature("csr_to_coo", {"x"}, {"sparse_dim"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseToSparseCooOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("sparse_dim"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsDenseTensorInput("x")) { + kernel_name = "dense_to_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "csr_to_coo"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseToSparseCsrOpArgumentMapping: + +return KernelSignature("dense_to_csr", {"x"}, {}, {"out"}); +return KernelSignature("coo_to_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseToSparseCsrOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsDenseTensorInput("x")) { + kernel_name = "dense_to_csr"; + } + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "coo_to_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseTransposeOpArgumentMapping: + +return KernelSignature("transpose_coo", {"x"}, {"perm"}, {"out"}); +return KernelSignature("transpose_csr", {"x"}, {"perm"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseTransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; 
+ paddle::small_vector attrs; + attrs.emplace_back("perm"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "transpose_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "transpose_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseValuesOpArgumentMapping: + +return KernelSignature("values_coo", {"x"}, {}, {"out"}); +return KernelSignature("values_csr", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseValuesOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "values_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "values_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAddmmOpArgumentMapping: + +return KernelSignature("addmm_csr_dense", {"input", "x", "y"}, {"beta", "alpha"}, {"out"}); +return KernelSignature("addmm_csr_csr", {"input", "x", "y"}, {"beta", "alpha"}, {"out"}); +return KernelSignature("addmm_coo_dense", {"input", "x", "y"}, {"beta", "alpha"}, {"out"}); +return KernelSignature("addmm_coo_coo", {"input", "x", "y"}, {"beta", "alpha"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseAddmmOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input", "x", "y"}; + paddle::small_vector attrs; + attrs.emplace_back("beta"); + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsDenseTensorInput("input") && ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("y")) { + kernel_name = "addmm_csr_dense"; + } + if (ctx.IsSparseCsrTensorInput("input") && ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { + kernel_name = "addmm_csr_csr"; + } + if (ctx.IsDenseTensorInput("input") && ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y")) { + kernel_name = "addmm_coo_dense"; + } + if (ctx.IsSparseCooTensorInput("input") && ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { + kernel_name = "addmm_coo_coo"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseCoalesceOpArgumentMapping: + +return KernelSignature("coalesce_coo", {"x"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseCoalesceOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + 
+ const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "coalesce_coo"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseFullLikeOpArgumentMapping: + +return KernelSignature("full_like_coo", {"x"}, {"value", "dtype"}, {"out"}); +return KernelSignature("full_like_coo", {"x"}, {"ValueTensor", "dtype"}, {"out"}); +return KernelSignature("full_like_csr", {"x"}, {"value", "dtype"}, {"out"}); +return KernelSignature("full_like_csr", {"x"}, {"ValueTensor", "dtype"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseFullLikeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("ValueTensor") ? "ValueTensor" : "value"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "full_like_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "full_like_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseFusedAttentionOpArgumentMapping: + +return KernelSignature("fused_attention_csr", {"query", "key", "value", "sparse_mask", "key_padding_mask", "attn_mask"}, {}, {"out", "softmax"}); +****************************************************************** +*/ + +KernelSignature SparseFusedAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"query", "key", "value", "sparse_mask", "key_padding_mask", "attn_mask"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out", "softmax"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsDenseTensorInput("query") && ctx.IsDenseTensorInput("key") && ctx.IsDenseTensorInput("value") && ctx.IsSparseCsrTensorInput("sparse_mask") && ctx.IsDenseTensorInput("key_padding_mask") && ctx.IsDenseTensorInput("attn_mask")) { + kernel_name = "fused_attention_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseMaskedMatmulOpArgumentMapping: + +return KernelSignature("masked_matmul_csr", {"x", "y", "mask"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseMaskedMatmulOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "mask"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsDenseTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsSparseCsrTensorInput("mask")) { + kernel_name = "masked_matmul_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* 
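(Editor's note, not part of the generated file.) Mappings whose attributes can also be supplied as tensors, such as SparseFullLike above and SparseReshape, SparseSum and SparseSlice elsewhere in this file, pick the attribute source with a fixed priority: a single auxiliary tensor input first, then a non-empty tensor-list input, then the plain attribute. A small helper-style sketch of that selection, using the names from the reshape mapping (the helper itself is hypothetical; the generated code inlines the conditional directly into `attrs.emplace_back`):

// Editor's sketch, illustrative only: the priority the generated
// reshape/sum/slice mappings use when an attribute may arrive as a tensor.
const char* PickShapeAttrName(const ArgumentMappingContext& ctx) {
  if (ctx.HasInput("ShapeTensor")) {           // a single tensor input wins
    return "ShapeTensor";
  }
  if (ctx.InputSize("ShapeTensorList") > 0) {  // then a non-empty tensor list
    return "ShapeTensorList";
  }
  return "shape";                              // plain attribute as the fallback
}

SparseFullLike above uses the two-way form of the same idea: `ctx.HasInput("ValueTensor") ? "ValueTensor" : "value"`.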
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseMatmulOpArgumentMapping: + +return KernelSignature("matmul_csr_dense", {"x", "y"}, {}, {"out"}); +return KernelSignature("matmul_csr_csr", {"x", "y"}, {}, {"out"}); +return KernelSignature("matmul_coo_dense", {"x", "y"}, {}, {"out"}); +return KernelSignature("matmul_coo_coo", {"x", "y"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseMatmulOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("y")) { + kernel_name = "matmul_csr_dense"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { + kernel_name = "matmul_csr_csr"; + } + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y")) { + kernel_name = "matmul_coo_dense"; + } + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { + kernel_name = "matmul_coo_coo"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseMaxpoolOpArgumentMapping: + +return KernelSignature("maxpool_coo", {"x"}, {"kernel_sizes", "paddings", "dilations", "strides"}, {"out", "rulebook", "counter"}); +****************************************************************** +*/ + +KernelSignature SparseMaxpoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back("kernel_sizes"); + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + attrs.emplace_back("strides"); + paddle::small_vector outputs {"out", "rulebook", "counter"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "maxpool_coo"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseMvOpArgumentMapping: + +return KernelSignature("mv_coo", {"x", "vec"}, {}, {"out"}); +return KernelSignature("mv_csr", {"x", "vec"}, {}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseMvOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "vec"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("vec")) { + kernel_name = "mv_coo"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("vec")) { + kernel_name = "mv_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by 
SparseSliceOpArgumentMapping: + +return KernelSignature("slice_coo", {"x"}, {"axes", "starts", "ends"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"axes", "starts", "EndsTensor"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"axes", "starts", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensor", "ends"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensor", "EndsTensor"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensor", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensorList", "ends"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensorList", "EndsTensor"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensorList", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "starts", "ends"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "starts", "EndsTensor"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "starts", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensor", "ends"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensor", "EndsTensor"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensor", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensorList", "ends"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensorList", "EndsTensor"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensorList", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "starts", "ends"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "starts", "EndsTensor"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "starts", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensor", "ends"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensor", "EndsTensor"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensor", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensorList", "ends"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensorList", "EndsTensor"}, {"out"}); +return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensorList", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"axes", "starts", "ends"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"axes", "starts", "EndsTensor"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"axes", "starts", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensor", "ends"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensor", "EndsTensor"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensor", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensorList", "ends"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensorList", "EndsTensor"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensorList", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_csr", 
{"x"}, {"AxesTensor", "starts", "ends"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "starts", "EndsTensor"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "starts", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensor", "ends"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensor", "EndsTensor"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensor", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensorList", "ends"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensorList", "EndsTensor"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensorList", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "starts", "ends"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "starts", "EndsTensor"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "starts", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensor", "ends"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensor", "EndsTensor"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensor", "EndsTensorList"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensorList", "ends"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensorList", "EndsTensor"}, {"out"}); +return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensorList", "EndsTensorList"}, {"out"}); +****************************************************************** +*/ + +KernelSignature SparseSliceOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("AxesTensor") + ? "AxesTensor" + : ctx.InputSize("AxesTensorList") > 0 + ? "AxesTensorList" + : "axes"); + attrs.emplace_back( + ctx.HasInput("StartsTensor") + ? "StartsTensor" + : ctx.InputSize("StartsTensorList") > 0 + ? "StartsTensorList" + : "starts"); + attrs.emplace_back( + ctx.HasInput("EndsTensor") + ? "EndsTensor" + : ctx.InputSize("EndsTensorList") > 0 + ? 
"EndsTensorList" + : "ends"); + paddle::small_vector outputs {"out"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x")) { + kernel_name = "slice_coo"; + } + if (ctx.IsSparseCsrTensorInput("x")) { + kernel_name = "slice_csr"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAbsGradOpArgumentMapping: + +return KernelSignature("abs_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("abs_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseAbsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "abs_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "abs_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAcosGradOpArgumentMapping: + +return KernelSignature("acos_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("acos_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseAcosGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "acos_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "acos_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAcoshGradOpArgumentMapping: + +return KernelSignature("acosh_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("acosh_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseAcoshGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "acosh_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "acosh_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAddGradOpArgumentMapping: + +return KernelSignature("add_coo_coo_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +return KernelSignature("add_csr_csr_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +return KernelSignature("add_coo_dense_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseAddGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "add_coo_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "add_csr_csr_grad"; + } + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "add_coo_dense_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseAddmmGradOpArgumentMapping: + +return KernelSignature("addmm_csr_dense_grad", {"input", "x", "y", "out@GRAD"}, {"alpha", "beta"}, {"input@GRAD", "x@GRAD", "y@GRAD"}); +return KernelSignature("addmm_csr_csr_grad", {"input", "x", "y", "out@GRAD"}, {"alpha", "beta"}, {"input@GRAD", "x@GRAD", "y@GRAD"}); +return KernelSignature("addmm_coo_dense_grad", {"input", "x", "y", "out@GRAD"}, {"alpha", "beta"}, {"input@GRAD", "x@GRAD", "y@GRAD"}); +return KernelSignature("addmm_coo_coo_grad", {"input", "x", "y", "out@GRAD"}, {"alpha", "beta"}, {"input@GRAD", "x@GRAD", "y@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseAddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"input", "x", "y", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); + attrs.emplace_back("beta"); + paddle::small_vector outputs {"input@GRAD", "x@GRAD", "y@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsDenseTensorInput("input") && ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsDenseTensorInput("out_grad")) { + kernel_name = "addmm_csr_dense_grad"; + } + if (ctx.IsSparseCsrTensorInput("input") && ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "addmm_csr_csr_grad"; + } + if (ctx.IsDenseTensorInput("input") && ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsDenseTensorInput("out_grad")) { + kernel_name = "addmm_coo_dense_grad"; + } + if (ctx.IsSparseCooTensorInput("input") && ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "addmm_coo_coo_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* 
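(Editor's note, not part of the generated file.) The backward mappings mirror their forward counterparts, with one naming wrinkle worth calling out: gradient tensors are spelled with the `@GRAD` suffix inside the KernelSignature ("out@GRAD", "x@GRAD"), but the context is queried with the snake_case form ("out_grad"). A minimal sketch of a unary backward mapping in the same style (op and kernel names are illustrative only):

// Editor's sketch, illustrative only: a unary backward mapping in the style
// of the abs/acos grad mappings above. Note the two spellings of the gradient
// name: "out@GRAD" inside the signature, "out_grad" when probing the context.
KernelSignature SparseExampleGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
  paddle::small_vector<const char*> inputs {"x", "out@GRAD"};
  paddle::small_vector<const char*> attrs;
  paddle::small_vector<const char*> outputs {"x@GRAD"};

  const char* kernel_name = "unregistered";
  if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) {
    kernel_name = "example_coo_grad";
  }
  if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) {
    kernel_name = "example_csr_grad";
  }
  return KernelSignature(kernel_name, std::move(inputs), std::move(attrs), std::move(outputs));
}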
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by SparseAsinGradOpArgumentMapping:
+
+return KernelSignature("asin_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"});
+return KernelSignature("asin_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"});
+******************************************************************
+*/
+
+KernelSignature SparseAsinGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector<const char*> inputs {"x", "out@GRAD"};
+  paddle::small_vector<const char*> attrs;
+  paddle::small_vector<const char*> outputs {"x@GRAD"};
+
+  const char* kernel_name = "unregistered";
+
+  if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) {
+    kernel_name = "asin_coo_grad";
+  }
+  if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) {
+    kernel_name = "asin_csr_grad";
+  }
+  KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs));
+  return sig;
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by SparseAsinhGradOpArgumentMapping:
+
+return KernelSignature("asinh_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"});
+return KernelSignature("asinh_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"});
+******************************************************************
+*/
+
+KernelSignature SparseAsinhGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector<const char*> inputs {"x", "out@GRAD"};
+  paddle::small_vector<const char*> attrs;
+  paddle::small_vector<const char*> outputs {"x@GRAD"};
+
+  const char* kernel_name = "unregistered";
+
+  if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) {
+    kernel_name = "asinh_coo_grad";
+  }
+  if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) {
+    kernel_name = "asinh_csr_grad";
+  }
+  KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs));
+  return sig;
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by SparseAtanGradOpArgumentMapping:
+
+return KernelSignature("atan_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"});
+return KernelSignature("atan_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"});
+******************************************************************
+*/
+
+KernelSignature SparseAtanGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  paddle::small_vector<const char*> inputs {"x", "out@GRAD"};
+  paddle::small_vector<const char*> attrs;
+  paddle::small_vector<const char*> outputs {"x@GRAD"};
+
+  const char* kernel_name = "unregistered";
+
+  if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) {
+    kernel_name = "atan_coo_grad";
+  }
+  if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) {
+    kernel_name = "atan_csr_grad";
+  }
+  KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs));
+  return sig;
+}
+
+/*
+******************************************************************
+NOTE: The following codes are for 'get_compat_kernel_signature.py'
+All possible KernelSignatures returned by SparseAtanhGradOpArgumentMapping:
+
+return KernelSignature("atanh_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"});
+return KernelSignature("atanh_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"});
+****************************************************************** +*/ + +KernelSignature SparseAtanhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "atanh_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "atanh_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseBatchNormGradOpArgumentMapping: + +return KernelSignature("batch_norm_coo_grad", {"x", "scale", "bias", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space", "out@GRAD"}, {"momentum", "epsilon", "data_layout", "is_test", "use_global_stats", "trainable_statistics"}, {"x@GRAD", "scale@GRAD", "bias@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseBatchNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "scale", "bias", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("momentum"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("data_layout"); + attrs.emplace_back("is_test"); + attrs.emplace_back("use_global_stats"); + attrs.emplace_back("trainable_statistics"); + paddle::small_vector outputs {"x@GRAD", "scale@GRAD", "bias@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("scale") && ctx.IsDenseTensorInput("bias") && ctx.IsDenseTensorInput("mean_out") && ctx.IsDenseTensorInput("variance_out") && ctx.IsDenseTensorInput("saved_mean") && ctx.IsDenseTensorInput("saved_variance") && ctx.IsDenseTensorInput("reserve_space") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "batch_norm_coo_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseCastGradOpArgumentMapping: + +return KernelSignature("cast_coo_grad", {"x", "out@GRAD"}, {"value_dtype"}, {"x@GRAD"}); +return KernelSignature("cast_csr_grad", {"x", "out@GRAD"}, {"value_dtype"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseCastGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("value_dtype"); + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "cast_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "cast_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseConv3dGradOpArgumentMapping: + +return KernelSignature("conv3d_coo_grad", {"x", "kernel", "out", "rulebook", "counter", "out@GRAD"}, {"paddings", "dilations", "strides", "groups", "subm", "key"}, {"x@GRAD", "kernel@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseConv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "kernel", "out", "rulebook", "counter", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + attrs.emplace_back("strides"); + attrs.emplace_back("groups"); + attrs.emplace_back("subm"); + attrs.emplace_back("key"); + paddle::small_vector outputs {"x@GRAD", "kernel@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("kernel") && ctx.IsSparseCooTensorInput("out") && ctx.IsDenseTensorInput("rulebook") && ctx.IsDenseTensorInput("counter") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "conv3d_coo_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseDivideGradOpArgumentMapping: + +return KernelSignature("divide_coo_coo_grad", {"x", "y", "out", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +return KernelSignature("divide_csr_csr_grad", {"x", "y", "out", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseDivideGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "out", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "divide_coo_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "divide_csr_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseExpm1GradOpArgumentMapping: + +return KernelSignature("expm1_coo_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("expm1_csr_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseExpm1GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"out", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "expm1_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("out") && 
ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "expm1_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseLeakyReluGradOpArgumentMapping: + +return KernelSignature("leaky_relu_coo_grad", {"x", "out@GRAD"}, {"alpha"}, {"x@GRAD"}); +return KernelSignature("leaky_relu_csr_grad", {"x", "out@GRAD"}, {"alpha"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseLeakyReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("alpha"); + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "leaky_relu_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "leaky_relu_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseLog1pGradOpArgumentMapping: + +return KernelSignature("log1p_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("log1p_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseLog1pGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "log1p_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "log1p_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseMaskedMatmulGradOpArgumentMapping: + +return KernelSignature("masked_matmul_csr_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseMaskedMatmulGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsDenseTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "masked_matmul_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseMatmulGradOpArgumentMapping: 
+ +return KernelSignature("matmul_csr_dense_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +return KernelSignature("matmul_csr_csr_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +return KernelSignature("matmul_coo_dense_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +return KernelSignature("matmul_coo_coo_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseMatmulGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsDenseTensorInput("out_grad")) { + kernel_name = "matmul_csr_dense_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "matmul_csr_csr_grad"; + } + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsDenseTensorInput("out_grad")) { + kernel_name = "matmul_coo_dense_grad"; + } + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "matmul_coo_coo_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseMaxpoolGradOpArgumentMapping: + +return KernelSignature("maxpool_coo_grad", {"x", "rulebook", "counter", "out", "out@GRAD"}, {"kernel_sizes"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseMaxpoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "rulebook", "counter", "out", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("kernel_sizes"); + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("rulebook") && ctx.IsDenseTensorInput("counter") && ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "maxpool_coo_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseMultiplyGradOpArgumentMapping: + +return KernelSignature("multiply_coo_coo_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +return KernelSignature("multiply_csr_csr_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseMultiplyGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "multiply_coo_coo_grad"; + } + if 
(ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "multiply_csr_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseMvGradOpArgumentMapping: + +return KernelSignature("mv_coo_grad", {"x", "vec", "out@GRAD"}, {}, {"x@GRAD", "vec@GRAD"}); +return KernelSignature("mv_csr_grad", {"x", "vec", "out@GRAD"}, {}, {"x@GRAD", "vec@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseMvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "vec", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD", "vec@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("vec") && ctx.IsDenseTensorInput("out_grad")) { + kernel_name = "mv_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("vec") && ctx.IsDenseTensorInput("out_grad")) { + kernel_name = "mv_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparsePowGradOpArgumentMapping: + +return KernelSignature("pow_coo_grad", {"x", "out@GRAD"}, {"factor"}, {"x@GRAD"}); +return KernelSignature("pow_csr_grad", {"x", "out@GRAD"}, {"factor"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparsePowGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("factor"); + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "pow_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "pow_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseRelu6GradOpArgumentMapping: + +return KernelSignature("relu6_coo_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("relu6_csr_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseRelu6GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"out", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "relu6_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "relu6_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), 
std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseReluGradOpArgumentMapping: + +return KernelSignature("relu_coo_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("relu_csr_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"out", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "relu_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "relu_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseReshapeGradOpArgumentMapping: + +return KernelSignature("reshape_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("reshape_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseReshapeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "reshape_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "reshape_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSinGradOpArgumentMapping: + +return KernelSignature("sin_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("sin_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseSinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "sin_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "sin_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSinhGradOpArgumentMapping: + +return KernelSignature("sinh_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +return 
KernelSignature("sinh_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseSinhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "sinh_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "sinh_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSoftmaxGradOpArgumentMapping: + +return KernelSignature("softmax_coo_grad", {"out", "out@GRAD"}, {"axis"}, {"x@GRAD"}); +return KernelSignature("softmax_csr_grad", {"out", "out@GRAD"}, {"axis"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseSoftmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"out", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "softmax_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "softmax_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSparseCooTensorGradOpArgumentMapping: + +return KernelSignature("sparse_coo_tensor_grad", {"indices", "out@GRAD"}, {}, {"values@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseSparseCooTensorGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"indices", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"values@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsDenseTensorInput("indices") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "sparse_coo_tensor_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSqrtGradOpArgumentMapping: + +return KernelSignature("sqrt_coo_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("sqrt_csr_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseSqrtGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"out", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("out") && 
ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "sqrt_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "sqrt_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSquareGradOpArgumentMapping: + +return KernelSignature("square_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("square_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseSquareGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "square_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "square_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSubtractGradOpArgumentMapping: + +return KernelSignature("subtract_coo_coo_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +return KernelSignature("subtract_csr_csr_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseSubtractGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "y", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "subtract_coo_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "subtract_csr_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSumGradOpArgumentMapping: + +return KernelSignature("sum_coo_grad", {"x", "out@GRAD"}, {"axis", "keepdim"}, {"x@GRAD"}); +return KernelSignature("sum_coo_grad", {"x", "out@GRAD"}, {"AxisTensor", "keepdim"}, {"x@GRAD"}); +return KernelSignature("sum_coo_grad", {"x", "out@GRAD"}, {"AxisTensorList", "keepdim"}, {"x@GRAD"}); +return KernelSignature("sum_csr_grad", {"x", "out@GRAD"}, {"axis", "keepdim"}, {"x@GRAD"}); +return KernelSignature("sum_csr_grad", {"x", "out@GRAD"}, {"AxisTensor", "keepdim"}, {"x@GRAD"}); +return KernelSignature("sum_csr_grad", {"x", "out@GRAD"}, {"AxisTensorList", "keepdim"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseSumGradOpArgumentMapping(const ArgumentMappingContext& ctx) 
{ + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("AxisTensor") + ? "AxisTensor" + : ctx.InputSize("AxisTensorList") > 0 + ? "AxisTensorList" + : "axis"); + attrs.emplace_back("keepdim"); + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "sum_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "sum_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSyncBatchNormGradOpArgumentMapping: + +return KernelSignature("sync_batch_norm_coo_grad", {"x", "scale", "bias", "saved_mean", "saved_variance", "reserve_space", "out@GRAD"}, {"momentum", "epsilon", "data_layout", "is_test", "use_global_stats", "trainable_statistics"}, {"x@GRAD", "scale@GRAD", "bias@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseSyncBatchNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "scale", "bias", "saved_mean", "saved_variance", "reserve_space", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("momentum"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("data_layout"); + attrs.emplace_back("is_test"); + attrs.emplace_back("use_global_stats"); + attrs.emplace_back("trainable_statistics"); + paddle::small_vector outputs {"x@GRAD", "scale@GRAD", "bias@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("scale") && ctx.IsDenseTensorInput("bias") && ctx.IsDenseTensorInput("saved_mean") && ctx.IsDenseTensorInput("saved_variance") && ctx.IsDenseTensorInput("reserve_space") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "sync_batch_norm_coo_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseTanGradOpArgumentMapping: + +return KernelSignature("tan_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("tan_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseTanGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "tan_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "tan_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by 
SparseTanhGradOpArgumentMapping: + +return KernelSignature("tanh_coo_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); +return KernelSignature("tanh_csr_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseTanhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"out", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "tanh_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "tanh_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseToDenseGradOpArgumentMapping: + +return KernelSignature("coo_to_dense_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseToDenseGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("out_grad")) { + kernel_name = "coo_to_dense_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseToSparseCooGradOpArgumentMapping: + +return KernelSignature("coo_to_dense", {"out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseToSparseCooGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "coo_to_dense"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseTransposeGradOpArgumentMapping: + +return KernelSignature("transpose_coo_grad", {"out@GRAD"}, {"perm"}, {"x@GRAD"}); +return KernelSignature("transpose_csr_grad", {"out@GRAD"}, {"perm"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseTransposeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("perm"); + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "transpose_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "transpose_csr_grad"; + } + KernelSignature sig (kernel_name, 
std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseValuesGradOpArgumentMapping: + +return KernelSignature("values_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseValuesGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("out_grad")) { + kernel_name = "values_coo_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseFusedAttentionGradOpArgumentMapping: + +return KernelSignature("fused_attention_csr_grad", {"query", "key", "value", "softmax", "out@GRAD"}, {}, {"query@GRAD", "key@GRAD", "value@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseFusedAttentionGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"query", "key", "value", "softmax", "out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"query@GRAD", "key@GRAD", "value@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsDenseTensorInput("query") && ctx.IsDenseTensorInput("key") && ctx.IsDenseTensorInput("value") && ctx.IsSparseCsrTensorInput("softmax") && ctx.IsDenseTensorInput("out_grad")) { + kernel_name = "fused_attention_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SparseSliceGradOpArgumentMapping: + +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "starts", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "starts", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "starts", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "EndsTensor"}, {"x@GRAD"}); +return 
KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "starts", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "starts", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "starts", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); +return 
KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "ends"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); +return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); +****************************************************************** +*/ + +KernelSignature SparseSliceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("AxesTensor") + ? "AxesTensor" + : ctx.InputSize("AxesTensorList") > 0 + ? "AxesTensorList" + : "axes"); + attrs.emplace_back( + ctx.HasInput("StartsTensor") + ? "StartsTensor" + : ctx.InputSize("StartsTensorList") > 0 + ? "StartsTensorList" + : "starts"); + attrs.emplace_back( + ctx.HasInput("EndsTensor") + ? "EndsTensor" + : ctx.InputSize("EndsTensorList") > 0 + ? 
"EndsTensorList" + : "ends"); + paddle::small_vector outputs {"x@GRAD"}; + + const char* kernel_name = "unregistered"; + + if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { + kernel_name = "slice_coo_grad"; + } + if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { + kernel_name = "slice_csr_grad"; + } + KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(sparse_abs, phi::SparseAbsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_acos, phi::SparseAcosOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_acosh, phi::SparseAcoshOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_add, phi::SparseAddOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_asin, phi::SparseAsinOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_asinh, phi::SparseAsinhOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_atan, phi::SparseAtanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_atanh, phi::SparseAtanhOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_batch_norm, phi::SparseBatchNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_cast, phi::SparseCastOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_conv3d, phi::SparseConv3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_divide, phi::SparseDivideOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_divide_scalar, phi::SparseDivideScalarOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_expm1, phi::SparseExpm1OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_isnan, phi::SparseIsnanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_leaky_relu, phi::SparseLeakyReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_log1p, phi::SparseLog1pOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_multiply, phi::SparseMultiplyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_pow, phi::SparsePowOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_relu, phi::SparseReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_relu6, phi::SparseRelu6OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_reshape, phi::SparseReshapeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_scale, phi::SparseScaleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sin, phi::SparseSinOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sinh, phi::SparseSinhOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_softmax, phi::SparseSoftmaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sparse_coo_tensor, phi::SparseSparseCooTensorOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sqrt, phi::SparseSqrtOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_square, phi::SparseSquareOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_subtract, phi::SparseSubtractOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sum, phi::SparseSumOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sync_batch_norm, phi::SparseSyncBatchNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_tan, phi::SparseTanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_tanh, phi::SparseTanhOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_to_dense, phi::SparseToDenseOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_to_sparse_coo, phi::SparseToSparseCooOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_to_sparse_csr, phi::SparseToSparseCsrOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_transpose, 
phi::SparseTransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_values, phi::SparseValuesOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_addmm, phi::SparseAddmmOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_coalesce, phi::SparseCoalesceOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_full_like, phi::SparseFullLikeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_fused_attention, phi::SparseFusedAttentionOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_masked_matmul, phi::SparseMaskedMatmulOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_matmul, phi::SparseMatmulOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_maxpool, phi::SparseMaxpoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_mv, phi::SparseMvOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_slice, phi::SparseSliceOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_abs_grad, phi::SparseAbsGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_acos_grad, phi::SparseAcosGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_acosh_grad, phi::SparseAcoshGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_add_grad, phi::SparseAddGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_addmm_grad, phi::SparseAddmmGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_asin_grad, phi::SparseAsinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_asinh_grad, phi::SparseAsinhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_atan_grad, phi::SparseAtanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_atanh_grad, phi::SparseAtanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_batch_norm_grad, phi::SparseBatchNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_cast_grad, phi::SparseCastGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_conv3d_grad, phi::SparseConv3dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_divide_grad, phi::SparseDivideGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_expm1_grad, phi::SparseExpm1GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_leaky_relu_grad, phi::SparseLeakyReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_log1p_grad, phi::SparseLog1pGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_masked_matmul_grad, phi::SparseMaskedMatmulGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_matmul_grad, phi::SparseMatmulGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_maxpool_grad, phi::SparseMaxpoolGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_multiply_grad, phi::SparseMultiplyGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_mv_grad, phi::SparseMvGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_pow_grad, phi::SparsePowGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_relu6_grad, phi::SparseRelu6GradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_relu_grad, phi::SparseReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_reshape_grad, phi::SparseReshapeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sin_grad, phi::SparseSinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sinh_grad, phi::SparseSinhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_softmax_grad, phi::SparseSoftmaxGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sparse_coo_tensor_grad, phi::SparseSparseCooTensorGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sqrt_grad, phi::SparseSqrtGradOpArgumentMapping); 
+PD_REGISTER_ARG_MAPPING_FN(sparse_square_grad, phi::SparseSquareGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_subtract_grad, phi::SparseSubtractGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sum_grad, phi::SparseSumGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_sync_batch_norm_grad, phi::SparseSyncBatchNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_tan_grad, phi::SparseTanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_tanh_grad, phi::SparseTanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_to_dense_grad, phi::SparseToDenseGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_to_sparse_coo_grad, phi::SparseToSparseCooGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_transpose_grad, phi::SparseTransposeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_values_grad, phi::SparseValuesGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_fused_attention_grad, phi::SparseFusedAttentionGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sparse_slice_grad, phi::SparseSliceGradOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/generated_static_sig.cc b/paddle/fluid/operators/ops_signature/generated_static_sig.cc new file mode 100644 index 0000000000000..8e3ffbef1ffde --- /dev/null +++ b/paddle/fluid/operators/ops_signature/generated_static_sig.cc @@ -0,0 +1,1585 @@ +// this file is generated by paddle/phi/op/yaml/generator/generate_op.py, do not edit. +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AllGatherOpArgumentMapping: + +return KernelSignature("all_gather", {"x"}, {"ring_id", "nranks"}, {"out"}); +****************************************************************** +*/ + +KernelSignature AllGatherOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + + attrs.emplace_back("nranks"); + paddle::small_vector outputs {"out"}; + return KernelSignature("all_gather", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AllReduceOpArgumentMapping: + +return KernelSignature("all_reduce", {"x"}, {"ring_id", "reduce_type"}, {"out"}); +****************************************************************** +*/ + +KernelSignature AllReduceOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + + attrs.emplace_back("reduce_type"); + paddle::small_vector outputs {"out"}; + return KernelSignature("all_reduce", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AllToAllOpArgumentMapping: + +return KernelSignature("all_to_all", {"x"}, {"ring_id"}, {"out"}); +****************************************************************** +*/ + +KernelSignature AllToAllOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + + paddle::small_vector outputs {"out"}; + return 
KernelSignature("all_to_all", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ArangeOpArgumentMapping: + +return KernelSignature("arange_tensor", {"Start", "End", "Step"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature RangeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Start", "End", "Step"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("arange_tensor", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by BroadcastOpArgumentMapping: + +return KernelSignature("broadcast", {"x"}, {"ring_id", "root"}, {"out"}); +****************************************************************** +*/ + +KernelSignature BroadcastOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + + attrs.emplace_back("root"); + paddle::small_vector outputs {"out"}; + return KernelSignature("broadcast", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv2dTransposeOpArgumentMapping: + +return KernelSignature("conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); +return KernelSignature("conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); +return KernelSignature("conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); +****************************************************************** +*/ + +KernelSignature Conv2dTransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", }; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_padding"); + attrs.emplace_back("output_size"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Output"}; + return KernelSignature("conv2d_transpose", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DecodeJpegOpArgumentMapping: + +return KernelSignature("decode_jpeg", {"X"}, {"mode"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature DecodeJpegOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("mode"); + paddle::small_vector outputs {"Out"}; + return 
KernelSignature("decode_jpeg", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DeformableConvOpArgumentMapping: + +return KernelSignature("deformable_conv", {"Input", "Offset", "Filter", "Mask"}, {"strides", "paddings", "dilations", "deformable_groups", "groups", "im2col_step"}, {"Output"}); +****************************************************************** +*/ + +KernelSignature DeformableConvOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Offset", "Filter", "Mask"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + attrs.emplace_back("deformable_groups"); + attrs.emplace_back("groups"); + attrs.emplace_back("im2col_step"); + paddle::small_vector outputs {"Output"}; + return KernelSignature("deformable_conv", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DepthwiseConv2dTransposeOpArgumentMapping: + +return KernelSignature("depthwise_conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); +return KernelSignature("depthwise_conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); +return KernelSignature("depthwise_conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); +****************************************************************** +*/ + +KernelSignature DepthwiseConv2dTransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", }; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_padding"); + attrs.emplace_back("output_size"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Output"}; + return KernelSignature("depthwise_conv2d_transpose", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DistConcatOpArgumentMapping: + +return KernelSignature("dist_concat", {"x"}, {"ring_id", "nranks"}, {"out"}); +****************************************************************** +*/ + +KernelSignature DistConcatOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + + attrs.emplace_back("nranks"); + paddle::small_vector outputs {"out"}; + return KernelSignature("dist_concat", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by 
EinsumOpArgumentMapping: + +return KernelSignature("einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache", "XShape"}); +****************************************************************** +*/ + +KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Operands"}; + paddle::small_vector attrs; + attrs.emplace_back("equation"); + paddle::small_vector outputs {"Out", "InnerCache", "XShape"}; + return KernelSignature("einsum", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EmbeddingOpArgumentMapping: + +return KernelSignature("embedding", {"Ids", "W"}, {"padding_idx"}, {"Out"}); +return KernelSignature("sparse_weight_embedding", {"Ids", "W"}, {"padding_idx"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LookupTableV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Ids", "W"}; + paddle::small_vector attrs; + attrs.emplace_back("padding_idx"); + paddle::small_vector outputs {"Out"}; + if ( ctx.IsDenseTensorInput("Ids") && + ctx.IsDenseTensorInput("W")) { + return KernelSignature("embedding", std::move(inputs), std::move(attrs), std::move(outputs)); + } + else if ( ctx.IsDenseTensorInput("Ids") && + ctx.IsSelectedRowsInput("W")) { + return KernelSignature("sparse_weight_embedding", std::move(inputs), std::move(attrs), std::move(outputs)); + } +else { return KernelSignature("unregistered", {}, {}, {}); } +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EmptyOpArgumentMapping: + +return KernelSignature("empty", {}, {"shape", "dtype"}, {"Out"}); +return KernelSignature("empty", {}, {"ShapeTensor", "dtype"}, {"Out"}); +return KernelSignature("empty", {}, {"ShapeTensorList", "dtype"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature EmptyOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("ShapeTensor") + ? "ShapeTensor" + : ctx.InputSize("ShapeTensorList") > 0 + ? 
"ShapeTensorList" + : "shape"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("empty", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EqualOpArgumentMapping: + +return KernelSignature("equal_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature EqualOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + + paddle::small_vector outputs {"Out"}; + return KernelSignature("equal_raw", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ExponentialOpArgumentMapping: + +return KernelSignature("exponential", {"X"}, {"lambda"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature ExponentialOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("lambda"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("exponential", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EyeOpArgumentMapping: + +return KernelSignature("eye", {}, {"num_rows", "num_columns", "dtype"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature EyeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {}; + paddle::small_vector attrs; + attrs.emplace_back("num_rows"); + attrs.emplace_back("num_columns"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("eye", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FrobeniusNormOpArgumentMapping: + +return KernelSignature("frobenius_norm", {"X"}, {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"Out"}); +return KernelSignature("frobenius_norm", {"X"}, {"AxisTensor", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"Out"}); +return KernelSignature("frobenius_norm", {"X"}, {"AxisTensorList", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FrobeniusNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + attrs.emplace_back("keep_dim"); + attrs.emplace_back("reduce_all"); + + + paddle::small_vector outputs {"Out"}; + return KernelSignature("frobenius_norm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures 
returned by FullLikeOpArgumentMapping: + +return KernelSignature("full_like", {"X"}, {"value", "dtype"}, {"Out"}); +return KernelSignature("full_like", {"X"}, {"ValueTensor", "dtype"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature FillAnyLikeOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("value"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("full_like", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by GreaterEqualOpArgumentMapping: + +return KernelSignature("greater_equal_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature GreaterEqualOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + + paddle::small_vector outputs {"Out"}; + return KernelSignature("greater_equal_raw", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by GreaterThanOpArgumentMapping: + +return KernelSignature("greater_than_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature GreaterThanOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + + paddle::small_vector outputs {"Out"}; + return KernelSignature("greater_than_raw", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LessEqualOpArgumentMapping: + +return KernelSignature("less_equal_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LessEqualOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + + paddle::small_vector outputs {"Out"}; + return KernelSignature("less_equal_raw", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LessThanOpArgumentMapping: + +return KernelSignature("less_than_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LessThanOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + + paddle::small_vector outputs {"Out"}; + return KernelSignature("less_than_raw", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following 
codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by LinspaceOpArgumentMapping: + +return KernelSignature("linspace", {"Start", "Stop", "Num"}, {"dtype"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature LinspaceOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Start", "Stop", "Num"}; + paddle::small_vector attrs; + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("linspace", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MatmulOpArgumentMapping: + +return KernelSignature("matmul", {"X", "Y"}, {"trans_x", "trans_y"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature MatmulV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("trans_x"); + attrs.emplace_back("trans_y"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("matmul", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NormOpArgumentMapping: + +return KernelSignature("norm", {"X"}, {"axis", "epsilon", "is_test"}, {"Out", "Norm"}); +****************************************************************** +*/ + +KernelSignature NormOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("is_test"); + paddle::small_vector outputs {"Out", "Norm"}; + return KernelSignature("norm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by NotEqualOpArgumentMapping: + +return KernelSignature("not_equal_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature NotEqualOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + + paddle::small_vector outputs {"Out"}; + return KernelSignature("not_equal_raw", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by OneHotOpArgumentMapping: + +return KernelSignature("one_hot_raw", {"X"}, {"depth", "dtype", "allow_out_of_range"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature OneHotV2OpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back(ctx.HasInput("depth_tensor") ? 
"depth_tensor" : "depth"); + attrs.emplace_back("dtype"); + attrs.emplace_back("allow_out_of_range"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("one_hot_raw", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PRecvOpArgumentMapping: + +return KernelSignature("p_recv", {}, {"ring_id", "peer", "dtype", "dynamic_shape"}, {"out"}); +****************************************************************** +*/ + +KernelSignature PRecvOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {}; + paddle::small_vector attrs; + + attrs.emplace_back("peer"); + attrs.emplace_back("dtype"); + attrs.emplace_back("dynamic_shape"); + paddle::small_vector outputs {"out"}; + return KernelSignature("p_recv", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by PRecvArrayOpArgumentMapping: + +return KernelSignature("p_recv_array", {}, {"ring_id", "peer", "dtype", "out_shape"}, {"out"}); +****************************************************************** +*/ + +KernelSignature PRecvArrayOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {}; + paddle::small_vector attrs; + + attrs.emplace_back("peer"); + attrs.emplace_back("dtype"); + attrs.emplace_back("out_shape"); + paddle::small_vector outputs {"out"}; + return KernelSignature("p_recv_array", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Pool2dOpArgumentMapping: + +return KernelSignature("pool2d", {"X"}, {"ksize", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm", "use_cudnn"}, {"Out"}); +return KernelSignature("pool2d", {"X"}, {"KernelSizeTensor", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm", "use_cudnn"}, {"Out"}); +return KernelSignature("pool2d", {"X"}, {"KernelSizeTensorList", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm", "use_cudnn"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature Pool2dOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("ceil_mode"); + attrs.emplace_back("exclusive"); + attrs.emplace_back("data_format"); + attrs.emplace_back("pooling_type"); + attrs.emplace_back("global_pooling"); + attrs.emplace_back("adaptive"); + attrs.emplace_back("padding_algorithm"); + + paddle::small_vector outputs {"Out"}; + return KernelSignature("pool2d", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by 
Pool3dOpArgumentMapping: + +return KernelSignature("pool3d", {"X"}, {"ksize", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm", "use_cudnn"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature Pool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("ceil_mode"); + attrs.emplace_back("exclusive"); + attrs.emplace_back("data_format"); + attrs.emplace_back("pooling_type"); + attrs.emplace_back("global_pooling"); + attrs.emplace_back("adaptive"); + attrs.emplace_back("padding_algorithm"); + + paddle::small_vector outputs {"Out"}; + return KernelSignature("pool3d", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by QuantLinearOpArgumentMapping: + +return KernelSignature("quant_linear", {"x", "w", "bias"}, {"in_num_col_dims", "activation_type", "padding_weights", "scale_in", "scale_weights", "quant_round_type", "quant_max_bound", "quant_min_bound"}, {"out"}); +****************************************************************** +*/ + +KernelSignature QuantLinearOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x", "w", "bias"}; + paddle::small_vector attrs; + attrs.emplace_back("in_num_col_dims"); + attrs.emplace_back("activation_type"); + attrs.emplace_back("padding_weights"); + attrs.emplace_back("scale_in"); + attrs.emplace_back("scale_weights"); + attrs.emplace_back("quant_round_type"); + attrs.emplace_back("quant_max_bound"); + attrs.emplace_back("quant_min_bound"); + paddle::small_vector outputs {"out"}; + return KernelSignature("quant_linear", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RandpermOpArgumentMapping: + +return KernelSignature("randperm", {}, {"n", "dtype"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature RandpermOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {}; + paddle::small_vector attrs; + attrs.emplace_back("n"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("randperm", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ReduceOpArgumentMapping: + +return KernelSignature("reduce", {"x"}, {"ring_id", "root_id", "reduce_type"}, {"out"}); +****************************************************************** +*/ + +KernelSignature ReduceOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + + attrs.emplace_back("root_id"); + attrs.emplace_back("reduce_type"); + paddle::small_vector outputs {"out"}; + return KernelSignature("reduce", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* 
+****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ReduceScatterOpArgumentMapping: + +return KernelSignature("reduce_scatter", {"x"}, {"ring_id", "nranks"}, {"out"}); +****************************************************************** +*/ + +KernelSignature ReduceScatterOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + + attrs.emplace_back("nranks"); + paddle::small_vector outputs {"out"}; + return KernelSignature("reduce_scatter", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RnnOpArgumentMapping: + +return KernelSignature("rnn", {"Input", "PreState", "WeightList", "SequenceLength"}, {"dropout_prob", "is_bidirec", "input_size", "hidden_size", "num_layers", "mode", "seed", "is_test"}, {"Out", "DropoutState", "State", "Reserve"}); +****************************************************************** +*/ + +KernelSignature RnnOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "PreState", "WeightList", "SequenceLength"}; + paddle::small_vector attrs; + attrs.emplace_back("dropout_prob"); + attrs.emplace_back("is_bidirec"); + attrs.emplace_back("input_size"); + attrs.emplace_back("hidden_size"); + attrs.emplace_back("num_layers"); + attrs.emplace_back("mode"); + attrs.emplace_back("seed"); + attrs.emplace_back("is_test"); + paddle::small_vector outputs {"Out", "DropoutState", "State", "Reserve"}; + return KernelSignature("rnn", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ShadowOutputOpArgumentMapping: + +return KernelSignature("shadow_output", {"x"}, {"name"}, {"out"}); +****************************************************************** +*/ + +KernelSignature ShadowOutputOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x"}; + paddle::small_vector attrs; + + paddle::small_vector outputs {"out"}; + return KernelSignature("shadow_output", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ShareBufferOpArgumentMapping: + +return KernelSignature("share_buffer", {"X"}, {"share_dims_and_dtype"}, {"Out", "XOut"}); +****************************************************************** +*/ + +KernelSignature ShareBufferOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("share_dims_and_dtype"); + paddle::small_vector outputs {"Out", "XOut"}; + return KernelSignature("share_buffer", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SoftmaxOpArgumentMapping: + +return KernelSignature("softmax", {"X"}, {"axis"}, {"Out"}); 
+****************************************************************** +*/ + +KernelSignature SoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("softmax", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SwishOpArgumentMapping: + +return KernelSignature("swish", {"X"}, {}, {"Out"}); +****************************************************************** +*/ + +KernelSignature SwishOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"Out"}; + return KernelSignature("swish", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TrilIndicesOpArgumentMapping: + +return KernelSignature("tril_indices", {}, {"rows", "cols", "offset", "dtype"}, {"out"}); +****************************************************************** +*/ + +KernelSignature TrilIndicesOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {}; + paddle::small_vector attrs; + attrs.emplace_back("rows"); + attrs.emplace_back("cols"); + attrs.emplace_back("offset"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"out"}; + return KernelSignature("tril_indices", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TrilTriuOpArgumentMapping: + +return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X"}; + paddle::small_vector attrs; + attrs.emplace_back("diagonal"); + attrs.emplace_back("lower"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("tril_triu", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TriuIndicesOpArgumentMapping: + +return KernelSignature("triu_indices", {}, {"row", "col", "offset", "dtype"}, {"out"}); +****************************************************************** +*/ + +KernelSignature TriuIndicesOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {}; + paddle::small_vector attrs; + attrs.emplace_back("row"); + attrs.emplace_back("col"); + attrs.emplace_back("offset"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"out"}; + return KernelSignature("triu_indices", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TruncatedGaussianRandomOpArgumentMapping: + +return 
KernelSignature("truncated_gaussian_random", {}, {"shape", "mean", "std", "seed", "dtype"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature TruncatedGaussianRandomOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {}; + paddle::small_vector attrs; + attrs.emplace_back("shape"); + attrs.emplace_back("mean"); + attrs.emplace_back("std"); + attrs.emplace_back("seed"); + attrs.emplace_back("dtype"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("truncated_gaussian_random", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UnpoolOpArgumentMapping: + +return KernelSignature("unpool", {"X", "Indices"}, {"ksize", "unpooling_type", "strides", "paddings", "output_size", "data_format"}, {"Out"}); +return KernelSignature("unpool", {"X", "Indices"}, {"ksize", "unpooling_type", "strides", "paddings", "OutputSizeTensor", "data_format"}, {"Out"}); +return KernelSignature("unpool", {"X", "Indices"}, {"ksize", "unpooling_type", "strides", "paddings", "OutputSizeTensorList", "data_format"}, {"Out"}); +****************************************************************** +*/ + +KernelSignature UnpoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Indices"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_size"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Out"}; + return KernelSignature("unpool", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AmaxGradOpArgumentMapping: + +return KernelSignature("amax_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("amax_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("amax_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ReduceAmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("AxisTensor") + ? "AxisTensor" + : ctx.InputSize("AxisTensorList") > 0 + ? 
"AxisTensorList" + : "dim"); + attrs.emplace_back("keep_dim"); + attrs.emplace_back("reduce_all"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("amax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by AminGradOpArgumentMapping: + +return KernelSignature("amin_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("amin_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("amin_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ReduceAminGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back( + ctx.HasInput("AxisTensor") + ? "AxisTensor" + : ctx.InputSize("AxisTensorList") > 0 + ? "AxisTensorList" + : "dim"); + attrs.emplace_back("keep_dim"); + attrs.emplace_back("reduce_all"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("amin_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv2dTransposeDoubleGradOpArgumentMapping: + +return KernelSignature("conv2d_transpose_double_grad", {"Input", "Filter", "grad_out", "grad_x@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); +return KernelSignature("conv2d_transpose_double_grad", {"Input", "Filter", "grad_out", "grad_x@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); +return KernelSignature("conv2d_transpose_double_grad", {"Input", "Filter", "grad_out", "grad_x@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature Conv2dTransposeGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "grad_out", "grad_x@GRAD", "grad_filter@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_padding"); + attrs.emplace_back("output_size"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}; + return KernelSignature("conv2d_transpose_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Conv2dTransposeGradOpArgumentMapping: + +return 
KernelSignature("conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); +return KernelSignature("conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); +return KernelSignature("conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); +****************************************************************** +*/ + +KernelSignature Conv2dTransposeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_padding"); + attrs.emplace_back("output_size"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; + return KernelSignature("conv2d_transpose_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DeformableConvGradOpArgumentMapping: + +return KernelSignature("deformable_conv_grad", {"Input", "Offset", "Filter", "Mask", "Output@GRAD"}, {"strides", "paddings", "dilations", "deformable_groups", "groups", "im2col_step"}, {"Input@GRAD", "Offset@GRAD", "Filter@GRAD", "Mask@GRAD"}); +****************************************************************** +*/ + +KernelSignature DeformableConvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Offset", "Filter", "Mask", "Output@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("dilations"); + attrs.emplace_back("deformable_groups"); + attrs.emplace_back("groups"); + attrs.emplace_back("im2col_step"); + paddle::small_vector outputs {"Input@GRAD", "Offset@GRAD", "Filter@GRAD", "Mask@GRAD"}; + return KernelSignature("deformable_conv_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by DepthwiseConv2dTransposeGradOpArgumentMapping: + +return KernelSignature("depthwise_conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); +return KernelSignature("depthwise_conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); +return KernelSignature("depthwise_conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format"}, 
{"Input@GRAD", "Filter@GRAD"}); +****************************************************************** +*/ + +KernelSignature DepthwiseConv2dTransposeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_padding"); + attrs.emplace_back("output_size"); + attrs.emplace_back("padding_algorithm"); + attrs.emplace_back("groups"); + attrs.emplace_back("dilations"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; + return KernelSignature("depthwise_conv2d_transpose_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by EinsumGradOpArgumentMapping: + +return KernelSignature("einsum_grad", {"x_shape", "InnerCache", "Out@GRAD"}, {"equation"}, {"Operands@GRAD"}); +****************************************************************** +*/ + +KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"x_shape", "InnerCache", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("equation"); + paddle::small_vector outputs {"Operands@GRAD"}; + return KernelSignature("einsum_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ElementwisePowGradOpArgumentMapping: + +return KernelSignature("elementwise_pow_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature ElementwisePowGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("elementwise_pow_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by FrobeniusNormGradOpArgumentMapping: + +return KernelSignature("frobenius_norm_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"X@GRAD"}); +return KernelSignature("frobenius_norm_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"X@GRAD"}); +return KernelSignature("frobenius_norm_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature FrobeniusNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + attrs.emplace_back("keep_dim"); + attrs.emplace_back("reduce_all"); + + + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("frobenius_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The 
following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by HardswishGradOpArgumentMapping: + +return KernelSignature("hardswish_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature HardSwishGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("hardswish_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MatmulDoubleGradOpArgumentMapping: + +return KernelSignature("matmul_double_grad", {"X", "Y", "grad_out", "grad_x@GRAD", "grad_y@GRAD"}, {"trans_x", "trans_y"}, {"X@GRAD", "Y@GRAD", "grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature MatmulV2GradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "grad_out", "grad_x@GRAD", "grad_y@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("trans_x"); + attrs.emplace_back("trans_y"); + paddle::small_vector outputs {"X@GRAD", "Y@GRAD", "grad_out@GRAD"}; + return KernelSignature("matmul_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MatmulGradOpArgumentMapping: + +return KernelSignature("matmul_grad", {"X", "Y", "Out@GRAD"}, {"trans_x", "trans_y"}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature MatmulV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("trans_x"); + attrs.emplace_back("trans_y"); + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("matmul_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MatmulTripleGradOpArgumentMapping: + +return KernelSignature("matmul_triple_grad", {"X", "Y", "grad_out", "grad_grad_x", "grad_grad_y", "grad_x@GRAD", "grad_y@GRAD", "grad_grad_out@GRAD"}, {"trans_x", "trans_y"}, {"X@GRAD", "Y@GRAD", "grad_out@GRAD", "grad_grad_x@GRAD", "grad_grad_y@GRAD"}); +****************************************************************** +*/ + +KernelSignature MatmulV2TripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "grad_out", "grad_grad_x", "grad_grad_y", "grad_x@GRAD", "grad_y@GRAD", "grad_grad_out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("trans_x"); + attrs.emplace_back("trans_y"); + paddle::small_vector outputs {"X@GRAD", "Y@GRAD", "grad_out@GRAD", "grad_grad_x@GRAD", "grad_grad_y@GRAD"}; + return KernelSignature("matmul_triple_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible 
KernelSignatures returned by MaxGradOpArgumentMapping: + +return KernelSignature("max_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("max_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("max_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ReduceMaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + attrs.emplace_back("keep_dim"); + attrs.emplace_back("reduce_all"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("max_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MaximumGradOpArgumentMapping: + +return KernelSignature("maximum_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature ElementwiseMaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("maximum_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MinGradOpArgumentMapping: + +return KernelSignature("min_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("min_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("min_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ReduceMinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + attrs.emplace_back("keep_dim"); + attrs.emplace_back("reduce_all"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("min_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by MinimumGradOpArgumentMapping: + +return KernelSignature("minimum_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +****************************************************************** +*/ + +KernelSignature ElementwiseMinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; + return KernelSignature("minimum_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by 
NormGradOpArgumentMapping: + +return KernelSignature("norm_grad", {"X", "Norm", "Out@GRAD"}, {"axis", "epsilon", "is_test"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Norm", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + attrs.emplace_back("epsilon"); + attrs.emplace_back("is_test"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Pool2dDoubleGradOpArgumentMapping: + +return KernelSignature("pool2d_double_grad", {"grad_x@GRAD"}, {"ksize", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"grad_out@GRAD"}); +return KernelSignature("pool2d_double_grad", {"grad_x@GRAD"}, {"KernelSizeTensor", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"grad_out@GRAD"}); +return KernelSignature("pool2d_double_grad", {"grad_x@GRAD"}, {"KernelSizeTensorList", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"grad_out@GRAD"}); +****************************************************************** +*/ + +KernelSignature Pool2dDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"grad_x@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("ceil_mode"); + attrs.emplace_back("exclusive"); + attrs.emplace_back("data_format"); + attrs.emplace_back("pooling_type"); + attrs.emplace_back("global_pooling"); + attrs.emplace_back("adaptive"); + attrs.emplace_back("padding_algorithm"); + paddle::small_vector outputs {"grad_out@GRAD"}; + return KernelSignature("pool2d_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Pool2dGradOpArgumentMapping: + +return KernelSignature("pool2d_grad", {"X", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"X@GRAD"}); +return KernelSignature("pool2d_grad", {"X", "Out", "Out@GRAD"}, {"KernelSizeTensor", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"X@GRAD"}); +return KernelSignature("pool2d_grad", {"X", "Out", "Out@GRAD"}, {"KernelSizeTensorList", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature Pool2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + 
attrs.emplace_back("ceil_mode"); + attrs.emplace_back("exclusive"); + attrs.emplace_back("data_format"); + attrs.emplace_back("pooling_type"); + attrs.emplace_back("global_pooling"); + attrs.emplace_back("adaptive"); + attrs.emplace_back("padding_algorithm"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("pool2d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by Pool3dGradOpArgumentMapping: + +return KernelSignature("pool3d_grad", {"X", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature Pool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("ceil_mode"); + attrs.emplace_back("exclusive"); + attrs.emplace_back("data_format"); + attrs.emplace_back("pooling_type"); + attrs.emplace_back("global_pooling"); + attrs.emplace_back("adaptive"); + attrs.emplace_back("padding_algorithm"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("pool3d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by ProdGradOpArgumentMapping: + +return KernelSignature("prod_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("prod_grad", {"X", "Out", "Out@GRAD"}, {"DimsTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("prod_grad", {"X", "Out", "Out@GRAD"}, {"DimsTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ReduceProdGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + attrs.emplace_back("keep_dim"); + attrs.emplace_back("reduce_all"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("prod_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by RnnGradOpArgumentMapping: + +return KernelSignature("rnn_grad", {"Input", "PreState", "WeightList", "SequenceLength", "Out", "DropoutState", "Reserve", "Out@GRAD", "State@GRAD"}, {"dropout_prob", "is_bidirec", "input_size", "hidden_size", "num_layers", "mode", "seed", "is_test"}, {"Input@GRAD", "PreState@GRAD", "WeightList@GRAD"}); +****************************************************************** +*/ + +KernelSignature RnnGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Input", "PreState", "WeightList", "SequenceLength", "Out", "DropoutState", "Reserve", "Out@GRAD", "State@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dropout_prob"); + 
attrs.emplace_back("is_bidirec"); + attrs.emplace_back("input_size"); + attrs.emplace_back("hidden_size"); + attrs.emplace_back("num_layers"); + attrs.emplace_back("mode"); + attrs.emplace_back("seed"); + attrs.emplace_back("is_test"); + paddle::small_vector outputs {"Input@GRAD", "PreState@GRAD", "WeightList@GRAD"}; + return KernelSignature("rnn_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SoftmaxGradOpArgumentMapping: + +return KernelSignature("softmax_grad", {"Out", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SoftmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("axis"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("softmax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SumGradOpArgumentMapping: + +return KernelSignature("sum_grad", {"X", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("sum_grad", {"X", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); +return KernelSignature("sum_grad", {"X", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature ReduceSumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("dim"); + attrs.emplace_back("keep_dim"); + attrs.emplace_back("reduce_all"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("sum_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by SwishGradOpArgumentMapping: + +return KernelSignature("swish_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature SwishGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Out@GRAD"}; + paddle::small_vector attrs; + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("swish_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by TrilTriuGradOpArgumentMapping: + +return KernelSignature("tril_triu_grad", {"Out@GRAD"}, {"diagonal", "lower"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature TrilTriuGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("diagonal"); + attrs.emplace_back("lower"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("tril_triu_grad", std::move(inputs), std::move(attrs), 
std::move(outputs)); +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by UnpoolGradOpArgumentMapping: + +return KernelSignature("unpool_grad", {"X", "Indices", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "output_size", "data_format"}, {"X@GRAD"}); +return KernelSignature("unpool_grad", {"X", "Indices", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "OutputSizeTensor", "data_format"}, {"X@GRAD"}); +return KernelSignature("unpool_grad", {"X", "Indices", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "OutputSizeTensorList", "data_format"}, {"X@GRAD"}); +****************************************************************** +*/ + +KernelSignature UnpoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector inputs {"X", "Indices", "Out", "Out@GRAD"}; + paddle::small_vector attrs; + attrs.emplace_back("ksize"); + attrs.emplace_back("strides"); + attrs.emplace_back("paddings"); + attrs.emplace_back("output_size"); + attrs.emplace_back("data_format"); + paddle::small_vector outputs {"X@GRAD"}; + return KernelSignature("unpool_grad", std::move(inputs), std::move(attrs), std::move(outputs)); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(all_gather, phi::AllGatherOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(all_reduce, phi::AllReduceOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(all_to_all, phi::AllToAllOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(range, arange); +PD_REGISTER_ARG_MAPPING_FN(range, phi::RangeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(broadcast, phi::BroadcastOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose, phi::Conv2dTransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(decode_jpeg, phi::DecodeJpegOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(deformable_conv, phi::DeformableConvOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_transpose, phi::DepthwiseConv2dTransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dist_concat, phi::DistConcatOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(einsum, phi::EinsumOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(lookup_table_v2, embedding); +PD_REGISTER_ARG_MAPPING_FN(lookup_table_v2, phi::LookupTableV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(empty, phi::EmptyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(equal, phi::EqualOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(exponential, phi::ExponentialOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(eye, phi::EyeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(frobenius_norm, phi::FrobeniusNormOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(fill_any_like, full_like); +PD_REGISTER_ARG_MAPPING_FN(fill_any_like, phi::FillAnyLikeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(greater_equal, phi::GreaterEqualOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(greater_than, phi::GreaterThanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(less_equal, phi::LessEqualOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(less_than, phi::LessThanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(linspace, phi::LinspaceOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2, matmul); +PD_REGISTER_ARG_MAPPING_FN(matmul_v2, phi::MatmulV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(norm, phi::NormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(not_equal, phi::NotEqualOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(one_hot_v2, one_hot); 
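The registration macros above and below wire each legacy fluid operator to its phi replacement: PD_REGISTER_BASE_KERNEL_NAME(op, kernel) records which phi kernel name the old operator resolves to, while PD_REGISTER_ARG_MAPPING_FN(op, fn) registers the function that builds the corresponding KernelSignature. As a minimal sketch of such a pairing (the "my_scale" operator and its mapping are hypothetical and not part of this patch):

KernelSignature MyScaleOpArgumentMapping(const ArgumentMappingContext& ctx) {
  // inputs, attributes and outputs are listed by name, in the order the phi kernel declares them
  return KernelSignature("scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"});
}
PD_REGISTER_BASE_KERNEL_NAME(my_scale, scale);
PD_REGISTER_ARG_MAPPING_FN(my_scale, phi::MyScaleOpArgumentMapping);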
+PD_REGISTER_ARG_MAPPING_FN(one_hot_v2, phi::OneHotV2OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(p_recv, phi::PRecvOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(p_recv_array, phi::PRecvArrayOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d, phi::Pool2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool3d, phi::Pool3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(quant_linear, phi::QuantLinearOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(randperm, phi::RandpermOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce, phi::ReduceOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_scatter, phi::ReduceScatterOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rnn, phi::RnnOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(shadow_output, phi::ShadowOutputOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(share_buffer, phi::ShareBufferOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softmax, phi::SoftmaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(swish, phi::SwishOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tril_indices, phi::TrilIndicesOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(triu_indices, phi::TriuIndicesOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(truncated_gaussian_random, phi::TruncatedGaussianRandomOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unpool, phi::UnpoolOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(reduce_amax_grad, amax_grad); +PD_REGISTER_ARG_MAPPING_FN(reduce_amax_grad, phi::ReduceAmaxGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(reduce_amin_grad, amin_grad); +PD_REGISTER_ARG_MAPPING_FN(reduce_amin_grad, phi::ReduceAminGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(conv2d_transpose_grad_grad, conv2d_transpose_double_grad); +PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_grad_grad, phi::Conv2dTransposeGradGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_grad, phi::Conv2dTransposeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(deformable_conv_grad, phi::DeformableConvGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_transpose_grad, phi::DepthwiseConv2dTransposeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(einsum_grad, phi::EinsumGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_pow_grad, phi::ElementwisePowGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(exponential_grad, exponential__grad); +PD_REGISTER_ARG_MAPPING_FN(frobenius_norm_grad, phi::FrobeniusNormGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(hard_swish_grad, hardswish_grad); +PD_REGISTER_ARG_MAPPING_FN(hard_swish_grad, phi::HardSwishGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad_grad, matmul_double_grad); +PD_REGISTER_ARG_MAPPING_FN(matmul_v2_grad_grad, phi::MatmulV2GradGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad, matmul_grad); +PD_REGISTER_ARG_MAPPING_FN(matmul_v2_grad, phi::MatmulV2GradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_triple_grad, matmul_triple_grad); +PD_REGISTER_ARG_MAPPING_FN(matmul_v2_triple_grad, phi::MatmulV2TripleGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(reduce_max_grad, max_grad); +PD_REGISTER_ARG_MAPPING_FN(reduce_max_grad, phi::ReduceMaxGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_max_grad, maximum_grad); +PD_REGISTER_ARG_MAPPING_FN(elementwise_max_grad, phi::ElementwiseMaxGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(reduce_min_grad, min_grad); +PD_REGISTER_ARG_MAPPING_FN(reduce_min_grad, 
phi::ReduceMinGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_min_grad, minimum_grad); +PD_REGISTER_ARG_MAPPING_FN(elementwise_min_grad, phi::ElementwiseMinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(norm_grad, phi::NormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d_double_grad, phi::Pool2dDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d_grad, phi::Pool2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool3d_grad, phi::Pool3dGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(reduce_prod_grad, prod_grad); +PD_REGISTER_ARG_MAPPING_FN(reduce_prod_grad, phi::ReduceProdGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rnn_grad, phi::RnnGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softmax_grad, phi::SoftmaxGradOpArgumentMapping); +PD_REGISTER_BASE_KERNEL_NAME(reduce_sum_grad, sum_grad); +PD_REGISTER_ARG_MAPPING_FN(reduce_sum_grad, phi::ReduceSumGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(swish_grad, phi::SwishGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unpool_grad, phi::UnpoolGradOpArgumentMapping); diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_act.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_act.cu new file mode 100644 index 0000000000000..c8bc7b420c82d --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_act.cu @@ -0,0 +1,4349 @@ + +// Generated by conv2d_bias_act.py - Do not edit. + +#include +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/epilogue/thread/linear_combination_leaky_relu.h" +#include "cutlass/epilogue/thread/linear_combination_silu.h" +#include "cutlass/epilogue/thread/linear_combination_bias_relu.h" +#include "cutlass/epilogue/thread/linear_combination_sigmoid.h" +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" + + +namespace phi { +namespace fusion { +namespace cutlass_internal { + +cutlass::Status conv2d_bias_sm750(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize 
problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sm751(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = 
tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sm752(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sm753(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, + 
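        // Epilogue functor: LinearCombination computes alpha * accumulator + beta * source
        // (the broadcast bias passed below) in float, writing half results 8 elements per access.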
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sm754(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = 
params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sm755(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + 
ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sm756(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sm757(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + 
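        // Tile hierarchy for this configuration: 64x128x64 threadblock tile,
        // 64x64x32 warp tile, and 16x8x8 Tensor Core MMA instruction shape.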
cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sm758(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + 
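  // oh/ow are the operator's pre-computed output spatial dimensions; together with
  // the dilations they fully determine the Conv2dProblemSize built below.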
int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +std::vector> + conv2d_bias_sm75_all_func = {conv2d_bias_sm750, +conv2d_bias_sm751, +conv2d_bias_sm752, +conv2d_bias_sm753, +conv2d_bias_sm754, +conv2d_bias_sm755, +conv2d_bias_sm756, +conv2d_bias_sm757, +conv2d_bias_sm758, +}; + +std::map, int> map_problem_conv2d_bias_sm75; +std::mutex conv2d_bias_sm75_mutex; + +void conv2d_bias_sm75(const ConvAllParams& params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + //int pad_h0 = params.pad_h0; + //int pad_w0 = params.pad_w0; + int groups = params.groups; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + std::vector problem_size = { + batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; + + if (map_problem_conv2d_bias_sm75.count(problem_size)) { + conv2d_bias_sm75_all_func[map_problem_conv2d_bias_sm75.at(problem_size)]( + params); + return; + } + + int best_config_index = ProfileToGetBestConfig( + conv2d_bias_sm75_all_func, params, CONV2D_BIAS); + + std::lock_guard guard(conv2d_bias_sm75_mutex); + + map_problem_conv2d_bias_sm75[problem_size] = best_config_index; + conv2d_bias_sm75_all_func[best_config_index](params); +} + +cutlass::Status conv2d_bias_relu_sm750(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int 
batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_relu_sm751(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * 
iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_relu_sm752(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_relu_sm753(const 
ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_relu_sm754(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = 
params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_relu_sm755(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename 
ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_relu_sm756(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); 
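  // Standard CUTLASS device-level flow: can_implement() validates the problem,
  // initialize() binds arguments and workspace, and operator() launches the
  // implicit-GEMM convolution on the provided stream; each step is status-checked.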
+ CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_relu_sm757(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_relu_sm758(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + 
      cutlass::conv::device::ImplicitGemmConvolution<kernel_base>;
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+  int groups = params.groups;
+  int kc = ic / groups;
+
+  int oh = params.oh;
+  int ow = params.ow;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+  int split_k_slices = 1;
+
+  cutlass::conv::Conv2dProblemSize problem_size(
+      {batch, ih, iw, ic},
+      {oc, kh, kw, ic / groups},
+      {pad_h0, 0, pad_w0, 0},
+      {stride_h, stride_w},
+      {dilation_h, dilation_w},
+      {batch, oh, ow, oc},
+      cutlass::conv::Mode::kCrossCorrelation,
+      split_k_slices,
+      groups);
+
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
+      {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}},
+      {(cutlass::half_t *)(bias), {0, 0, 0}},
+      {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}},
+      {1.f, 1.f}};
+
+  ImplicitGemm implicit_gemm_op;
+  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
+
+  auto ctx = params.ctx;
+  auto stream = ctx->stream();
+  phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
+      ctx->GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  void *workspace = tmp_gpu_ptrs_data->ptr();
+
+  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op.initialize(arguments, workspace);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op(stream);
+  CUTLASS_CHECK(status);
+  return status;
+}
+
+std::vector<std::function<cutlass::Status(const ConvAllParams&)>>
+    conv2d_bias_relu_sm75_all_func = {conv2d_bias_relu_sm750,
+                                      conv2d_bias_relu_sm751,
+                                      conv2d_bias_relu_sm752,
+                                      conv2d_bias_relu_sm753,
+                                      conv2d_bias_relu_sm754,
+                                      conv2d_bias_relu_sm755,
+                                      conv2d_bias_relu_sm756,
+                                      conv2d_bias_relu_sm757,
+                                      conv2d_bias_relu_sm758};
+
+std::map<std::vector<int>, int> map_problem_conv2d_bias_relu_sm75;
+std::mutex conv2d_bias_relu_sm75_mutex;
+
+void conv2d_bias_relu_sm75(const ConvAllParams& params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  // int pad_h0 = params.pad_h0;
+  // int pad_w0 = params.pad_w0;
+  int groups = params.groups;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w};
+
+  if (map_problem_conv2d_bias_relu_sm75.count(problem_size)) {
+    conv2d_bias_relu_sm75_all_func[map_problem_conv2d_bias_relu_sm75.at(
+        problem_size)](params);
+    return;
+  }
+
+  int best_config_index = ProfileToGetBestConfig(
+      conv2d_bias_relu_sm75_all_func, params, CONV2D_BIAS_RELU);
+
+  std::lock_guard<std::mutex> guard(conv2d_bias_relu_sm75_mutex);
+
+  map_problem_conv2d_bias_relu_sm75[problem_size] = best_config_index;
+  conv2d_bias_relu_sm75_all_func[best_config_index](params);
+}
+
+cutlass::Status conv2d_bias_silu_sm750(const ConvAllParams& params) {
+  using kernel_base =
+      typename cutlass::conv::kernel::DefaultConv2dFprop<
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          float,
+          cutlass::arch::OpClassTensorOp,
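+          // Accumulate in float on Turing (SM75) Tensor Cores; the three
+          // GemmShape parameters below are the threadblock, warp, and MMA
+          // instruction tile sizes that distinguish the tuning variants.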
cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_sm751(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = 
params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_sm752(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + 
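+  // {1.f, 1.f} are the epilogue's alpha/beta scalars: alpha scales the conv
+  // accumulator and beta scales the broadcast bias (its strides are {0, 0, 0})
+  // before the activation is applied.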
ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_sm753(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_sm754(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + 
cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_sm755(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int 
kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_sm756(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, 
+ {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_sm757(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_sm758(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + 
          cutlass::layout::TensorNHWC,
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          float,
+          cutlass::arch::OpClassTensorOp,
+          cutlass::arch::Sm75,
+          cutlass::gemm::GemmShape<128, 64, 32>,
+          cutlass::gemm::GemmShape<64, 32, 32>,
+          cutlass::gemm::GemmShape<16, 8, 8>,
+          cutlass::epilogue::thread::LinearCombinationSilu<cutlass::half_t, 8, float, float>,
+          cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>,
+          2,
+          cutlass::arch::OpMultiplyAdd,
+          cutlass::conv::IteratorAlgorithm::kOptimized,
+          cutlass::conv::StrideSupport::kStrided,
+          8,
+          8>::Kernel;
+
+  using ImplicitGemm =
+      cutlass::conv::device::ImplicitGemmConvolution<kernel_base>;
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+  int groups = params.groups;
+  int kc = ic / groups;
+
+  int oh = params.oh;
+  int ow = params.ow;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+  int split_k_slices = 1;
+
+  cutlass::conv::Conv2dProblemSize problem_size(
+      {batch, ih, iw, ic},
+      {oc, kh, kw, ic / groups},
+      {pad_h0, 0, pad_w0, 0},
+      {stride_h, stride_w},
+      {dilation_h, dilation_w},
+      {batch, oh, ow, oc},
+      cutlass::conv::Mode::kCrossCorrelation,
+      split_k_slices,
+      groups);
+
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
+      {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}},
+      {(cutlass::half_t *)(bias), {0, 0, 0}},
+      {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}},
+      {1.f, 1.f}};
+
+  ImplicitGemm implicit_gemm_op;
+  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
+
+  auto ctx = params.ctx;
+  auto stream = ctx->stream();
+  phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
+      ctx->GetPlace(),
+      bytes,
+      phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  void *workspace = tmp_gpu_ptrs_data->ptr();
+
+  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op.initialize(arguments, workspace);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op(stream);
+  CUTLASS_CHECK(status);
+  return status;
+}
+
+std::vector<std::function<cutlass::Status(const ConvAllParams&)>>
+    conv2d_bias_silu_sm75_all_func = {conv2d_bias_silu_sm750,
+                                      conv2d_bias_silu_sm751,
+                                      conv2d_bias_silu_sm752,
+                                      conv2d_bias_silu_sm753,
+                                      conv2d_bias_silu_sm754,
+                                      conv2d_bias_silu_sm755,
+                                      conv2d_bias_silu_sm756,
+                                      conv2d_bias_silu_sm757,
+                                      conv2d_bias_silu_sm758};
+
+std::map<std::vector<int>, int> map_problem_conv2d_bias_silu_sm75;
+std::mutex conv2d_bias_silu_sm75_mutex;
+
+void conv2d_bias_silu_sm75(const ConvAllParams& params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  // int pad_h0 = params.pad_h0;
+  // int pad_w0 = params.pad_w0;
+  int groups = params.groups;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w};
+
+  if (map_problem_conv2d_bias_silu_sm75.count(problem_size)) {
+    conv2d_bias_silu_sm75_all_func[map_problem_conv2d_bias_silu_sm75.at(
+        problem_size)](params);
+    return;
+  }
+
+  int
best_config_index = ProfileToGetBestConfig( + conv2d_bias_silu_sm75_all_func, params, CONV2D_BIAS_SILU); + + std::lock_guard guard(conv2d_bias_silu_sm75_mutex); + + map_problem_conv2d_bias_silu_sm75[problem_size] = best_config_index; + conv2d_bias_silu_sm75_all_func[best_config_index](params); +} + +cutlass::Status conv2d_bias_leaky_relu_sm750(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f, alpha}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_leaky_relu_sm751(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + 
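+          // Epilogue functor: fused bias add + LeakyReLU computed in float,
+          // with 8 half elements (128 bits) per vectorized memory access.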
cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f, alpha}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_leaky_relu_sm752(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = 
params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f, alpha}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_leaky_relu_sm753(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f, alpha}}; + + ImplicitGemm 
implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_leaky_relu_sm754(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f, alpha}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_leaky_relu_sm755(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + 
cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f, alpha}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_leaky_relu_sm756(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = 
params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f, alpha}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_leaky_relu_sm757(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + float alpha = params.alpha; typename ImplicitGemm::Arguments 
arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f, alpha}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_leaky_relu_sm758(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f, alpha}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status 
= implicit_gemm_op(stream);
+  CUTLASS_CHECK(status);
+  return status;
+}
+
+std::vector<std::function<cutlass::Status(const ConvAllParams&)>>
+    conv2d_bias_leaky_relu_sm75_all_func = {conv2d_bias_leaky_relu_sm750,
+                                            conv2d_bias_leaky_relu_sm751,
+                                            conv2d_bias_leaky_relu_sm752,
+                                            conv2d_bias_leaky_relu_sm753,
+                                            conv2d_bias_leaky_relu_sm754,
+                                            conv2d_bias_leaky_relu_sm755,
+                                            conv2d_bias_leaky_relu_sm756,
+                                            conv2d_bias_leaky_relu_sm757,
+                                            conv2d_bias_leaky_relu_sm758};
+
+std::map<std::vector<int>, int> map_problem_conv2d_bias_leaky_relu_sm75;
+std::mutex conv2d_bias_leaky_relu_sm75_mutex;
+
+void conv2d_bias_leaky_relu_sm75(const ConvAllParams& params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  // int pad_h0 = params.pad_h0;
+  // int pad_w0 = params.pad_w0;
+  int groups = params.groups;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w};
+
+  if (map_problem_conv2d_bias_leaky_relu_sm75.count(problem_size)) {
+    conv2d_bias_leaky_relu_sm75_all_func
+        [map_problem_conv2d_bias_leaky_relu_sm75.at(problem_size)](params);
+    return;
+  }
+
+  int best_config_index = ProfileToGetBestConfig(
+      conv2d_bias_leaky_relu_sm75_all_func, params, CONV2D_BIAS_LEAKY_RELU);
+
+  std::lock_guard<std::mutex> guard(conv2d_bias_leaky_relu_sm75_mutex);
+
+  map_problem_conv2d_bias_leaky_relu_sm75[problem_size] = best_config_index;
+  conv2d_bias_leaky_relu_sm75_all_func[best_config_index](params);
+}
+
+cutlass::Status conv2d_bias_sigmoid_sm750(const ConvAllParams& params) {
+  using kernel_base =
+      typename cutlass::conv::kernel::DefaultConv2dFprop<
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          cutlass::half_t,
+          cutlass::layout::TensorNHWC,
+          float,
+          cutlass::arch::OpClassTensorOp,
+          cutlass::arch::Sm75,
+          cutlass::gemm::GemmShape<64, 64, 64>,
+          cutlass::gemm::GemmShape<32, 32, 64>,
+          cutlass::gemm::GemmShape<16, 8, 8>,
+          cutlass::epilogue::thread::LinearCombinationSigmoid<cutlass::half_t, 8, float, float>,
+          cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>,
+          2,
+          cutlass::arch::OpMultiplyAdd,
+          cutlass::conv::IteratorAlgorithm::kOptimized,
+          cutlass::conv::StrideSupport::kStrided,
+          8,
+          8>::Kernel;
+
+  using ImplicitGemm =
+      cutlass::conv::device::ImplicitGemmConvolution<kernel_base>;
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+  int groups = params.groups;
+  int kc = ic / groups;
+
+  int oh = params.oh;
+  int ow = params.ow;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+  int split_k_slices = 1;
+
+  cutlass::conv::Conv2dProblemSize problem_size(
+      {batch, ih, iw, ic},
+      {oc, kh, kw, ic / groups},
+      {pad_h0, 0, pad_w0, 0},
+      {stride_h, stride_w},
+      {dilation_h, dilation_w},
+      {batch, oh, ow, oc},
+      cutlass::conv::Mode::kCrossCorrelation,
+      split_k_slices,
+      groups);
+
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
+      {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}},
+      {(cutlass::half_t *)(bias), {0, 0, 0}},
+      {(cutlass::half_t
*)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sigmoid_sm751(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sigmoid_sm752(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + 
cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sigmoid_sm753(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + 
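+  // Problem description (shapes, padding, strides, dilations, groups) is read
+  // from ConvAllParams here and feeds Conv2dProblemSize below; every sm75
+  // variant in this file repeats this same setup.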
int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sigmoid_sm754(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + 
{(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sigmoid_sm755(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sigmoid_sm756(const 
ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sigmoid_sm757(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half 
*weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_sigmoid_sm758(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + 
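+  // The tensor refs below use NHWC strides {C, C*W, C*W*H}; the bias ref uses
+  // zero strides so the per-channel bias vector broadcasts over N, H and W,
+  // and {1.f, 1.f} are the epilogue's alpha/beta scalars.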
typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)(bias), {0, 0, 0}}, + {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +std::vector> + conv2d_bias_sigmoid_sm75_all_func = {conv2d_bias_sigmoid_sm750, +conv2d_bias_sigmoid_sm751, +conv2d_bias_sigmoid_sm752, +conv2d_bias_sigmoid_sm753, +conv2d_bias_sigmoid_sm754, +conv2d_bias_sigmoid_sm755, +conv2d_bias_sigmoid_sm756, +conv2d_bias_sigmoid_sm757, +conv2d_bias_sigmoid_sm758, +}; + +std::map, int> map_problem_conv2d_bias_sigmoid_sm75; +std::mutex conv2d_bias_sigmoid_sm75_mutex; + +void conv2d_bias_sigmoid_sm75(const ConvAllParams& params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + //int pad_h0 = params.pad_h0; + //int pad_w0 = params.pad_w0; + int groups = params.groups; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + std::vector problem_size = { + batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; + + if (map_problem_conv2d_bias_sigmoid_sm75.count(problem_size)) { + conv2d_bias_sigmoid_sm75_all_func[map_problem_conv2d_bias_sigmoid_sm75.at(problem_size)]( + params); + return; + } + + int best_config_index = ProfileToGetBestConfig( + conv2d_bias_sigmoid_sm75_all_func, params, CONV2D_BIAS_SIGMOID); + + std::lock_guard guard(conv2d_bias_sigmoid_sm75_mutex); + + map_problem_conv2d_bias_sigmoid_sm75[problem_size] = best_config_index; + conv2d_bias_sigmoid_sm75_all_func[best_config_index](params); +} + +void Conv2dBias(const ConvAllParams& params) { + + if (params.sm_version == 75) + { + conv2d_bias_sm75(params); + } + +} + +void Conv2dBiasRelu(const ConvAllParams& params) { + + if (params.sm_version == 75) + { + conv2d_bias_relu_sm75(params); + } + +} + +void Conv2dBiasSilu(const ConvAllParams& params) { + + if (params.sm_version == 75) + { + conv2d_bias_silu_sm75(params); + } + +} + +void Conv2dBiasLeakyRelu(const ConvAllParams& params) { + + if (params.sm_version == 75) + { + conv2d_bias_leaky_relu_sm75(params); + } + +} + +void Conv2dBiasSigmoid(const ConvAllParams& params) { + + if (params.sm_version == 75) + { + conv2d_bias_sigmoid_sm75(params); + } + +} + +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_residual.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_residual.cu new file mode 100644 index 0000000000000..15729531e5dc8 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_residual.cu @@ -0,0 +1,2389 @@ + +// Generated by conv2d_bias_residual.py - Do not edit. 
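+// This generated file follows the same pattern as the activation-only kernels:
+// each conv2d_bias_*_add_sm75N function instantiates an implicit-GEMM fprop
+// kernel (DefaultConv2dFpropWithBroadcast with a LinearCombinationResidualBlock
+// epilogue) with a different threadblock/warp tile shape, and the per-fusion
+// dispatcher profiles all variants once per problem size, caches the winning
+// index in a mutex-guarded map, and reuses it on subsequent calls.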
+ +#include +#include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h" +#include "cutlass/epilogue/thread/linear_combination_residual_block.h" +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" + +namespace phi { +namespace fusion { +namespace cutlass_internal { + +cutlass::Status conv2d_bias_silu_add_sm750(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm751(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + 
cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm752(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + 
cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm753(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + 
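+  // kc is the per-group input-channel count; it becomes the innermost stride
+  // of the filter tensor ref ({kc, kc*kw, kc*kw*kh}) in the arguments below.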
int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm754(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw 
* ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm755(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + 
phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm756(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm757(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + 
cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm758(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + 
cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm759(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = 
params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm7510(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, 
+ {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_silu_add_sm7511(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + 
ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +std::vector> + conv2d_bias_silu_add_sm75_all_func = {conv2d_bias_silu_add_sm750, +conv2d_bias_silu_add_sm751, +conv2d_bias_silu_add_sm752, +conv2d_bias_silu_add_sm753, +conv2d_bias_silu_add_sm754, +conv2d_bias_silu_add_sm755, +conv2d_bias_silu_add_sm756, +conv2d_bias_silu_add_sm757, +conv2d_bias_silu_add_sm758, +conv2d_bias_silu_add_sm759, +conv2d_bias_silu_add_sm7510, +conv2d_bias_silu_add_sm7511, +}; + +std::map, int> map_problem_conv2d_bias_silu_add_sm75; +std::mutex conv2d_bias_silu_add_sm75_mutex; + +void conv2d_bias_silu_add_sm75(const ConvAllParams& params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + //int pad_h0 = params.pad_h0; + //int pad_w0 = params.pad_w0; + int groups = params.groups; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + std::vector problem_size = { + batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; + + if (map_problem_conv2d_bias_silu_add_sm75.count(problem_size)) { + conv2d_bias_silu_add_sm75_all_func[map_problem_conv2d_bias_silu_add_sm75.at(problem_size)]( + params); + return; + } + + int best_config_index = ProfileToGetBestConfig( + conv2d_bias_silu_add_sm75_all_func, params, CONV2D_BIAS_SILU_ADD); + + std::lock_guard guard(conv2d_bias_silu_add_sm75_mutex); + + map_problem_conv2d_bias_silu_add_sm75[problem_size] = best_config_index; + conv2d_bias_silu_add_sm75_all_func[best_config_index](params); +} + +cutlass::Status conv2d_bias_add_relu_sm750(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize 
problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm751(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, 
+ cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm752(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, 
workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm753(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm754(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + 
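+      // 16x8x8 Tensor Core instruction shape for SM75; the LinearCombinationResidualBlock
+      // epilogue below fuses the broadcast bias, the residual add and ReLU,
+      // i.e. out = ReLU((conv + bias) + residual).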
cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm755(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + 
const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm756(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize 
problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm757(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, 
+ cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm758(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, 
workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm759(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm7510(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + 
cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_bias_add_relu_sm7511(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16,8,8>, + cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided, + 8, + 8 + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::ImplicitGemmConvolution; + const half *input = params.input; + const half *weight = params.weight; 
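+  // The rest of the problem description (shapes, padding, strides, dilations) is read
+  // straight from ConvAllParams; kc = ic / groups is the per-group filter channel count
+  // used for the weight tensor strides.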
+ const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + + const half *residual = params.residual; + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + cutlass::conv::SplitKMode::kSerial, + (cutlass::half_t *)(bias), nullptr, + 0, oc}; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +std::vector> + conv2d_bias_add_relu_sm75_all_func = {conv2d_bias_add_relu_sm750, +conv2d_bias_add_relu_sm751, +conv2d_bias_add_relu_sm752, +conv2d_bias_add_relu_sm753, +conv2d_bias_add_relu_sm754, +conv2d_bias_add_relu_sm755, +conv2d_bias_add_relu_sm756, +conv2d_bias_add_relu_sm757, +conv2d_bias_add_relu_sm758, +conv2d_bias_add_relu_sm759, +conv2d_bias_add_relu_sm7510, +conv2d_bias_add_relu_sm7511, +}; + +std::map, int> map_problem_conv2d_bias_add_relu_sm75; +std::mutex conv2d_bias_add_relu_sm75_mutex; + +void conv2d_bias_add_relu_sm75(const ConvAllParams& params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + //int pad_h0 = params.pad_h0; + //int pad_w0 = params.pad_w0; + int groups = params.groups; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + std::vector problem_size = { + batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; + + if (map_problem_conv2d_bias_add_relu_sm75.count(problem_size)) { + conv2d_bias_add_relu_sm75_all_func[map_problem_conv2d_bias_add_relu_sm75.at(problem_size)]( + params); + return; + } + + int best_config_index = ProfileToGetBestConfig( + conv2d_bias_add_relu_sm75_all_func, params, CONV2D_BIAS_ADD_RELU); + + std::lock_guard guard(conv2d_bias_add_relu_sm75_mutex); + + map_problem_conv2d_bias_add_relu_sm75[problem_size] = best_config_index; + conv2d_bias_add_relu_sm75_all_func[best_config_index](params); +} + +void Conv2dBiasSiluAdd(const ConvAllParams& params) { + + if (params.sm_version == 75) + { + conv2d_bias_silu_add_sm75(params); + } + +} + +void 
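+// Fused conv2d + bias + residual-add + ReLU entry point. Like the SiLU variant above, the
+// sm75 dispatcher profiles every tile configuration once per problem size, caches the best
+// index in a mutex-guarded map, and reuses that kernel on subsequent calls.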
Conv2dBiasAddRelu(const ConvAllParams& params) { + + if (params.sm_version == 75) + { + conv2d_bias_add_relu_sm75(params); + } + +} + +} // namespace cutlass_internal +} // namespace fusion +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_depthwise_bias_act.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_depthwise_bias_act.cu new file mode 100644 index 0000000000000..c6a6dd6a9e129 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_depthwise_bias_act.cu @@ -0,0 +1,3502 @@ + +// Generated by conv2d_depthwise_bias_act.py - Do not edit. + +#include +#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/conv/kernel/default_depthwise_fprop.h" +#include "cutlass/epilogue/thread/linear_combination_silu.h" +#include "cutlass/conv/device/direct_convolution.h" + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +namespace phi { +namespace fusion { +namespace cutlass_internal { + +cutlass::Status conv2d_depthwise_bias_0(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,9>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,16,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, 
{kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_1(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,9>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,32,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + 
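+  // Allocate whatever scratch workspace CUTLASS reports for this kernel from the device
+  // allocator on the current stream; the filter_workspace allocated above presumably holds
+  // the reordered filter layout that the depthwise DirectConvolution kernel expects.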
+ auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_2(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,9>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,16,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = 
implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_3(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,9>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,32,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_4(const ConvAllParams& params) { + using 
kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,25>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,16,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_5(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + 
cutlass::gemm::GemmShape<64,32,25>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,32,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_6(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,25>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,16,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, + 
cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_7(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,25>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,32,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = 
+      cutlass::conv::device::DirectConvolution<kernel_base>;
+  const half *input = params.input;
+  const half *weight = params.weight;
+  const half *bias = params.bias;
+  half *output = params.output;
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  int pad_h0 = params.pad_h0;
+  int pad_w0 = params.pad_w0;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+  int groups = params.groups;
+  int kc = ic / groups;
+
+  int oh = params.oh;
+  int ow = params.ow;
+  int dilation_h = params.dilation_h;
+  int dilation_w = params.dilation_w;
+  int split_k_slices = (oh * ow + 63) / 64;
+
+  cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic},
+      {oc, kh, kw, ic / groups},
+      {pad_h0, 0, pad_w0, 0},
+      {stride_h, stride_w},
+      {dilation_h, dilation_w},
+      {batch, oh, ow, oc},
+      cutlass::conv::Mode::kCrossCorrelation,
+      split_k_slices,
+      groups);
+
+size_t filter_size = oc * kh * kw * kc * sizeof(half);
+phi::Allocator::AllocationPtr filter_gpu_ptrs_data =
+    phi::memory_utils::Alloc(
+        params.ctx->GetPlace(),
+        filter_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(params.ctx->stream())));
+void *filter_workspace = filter_gpu_ptrs_data->ptr();
+
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}},
+      {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}},
+      {(cutlass::half_t *)bias, {0, 0, 0}},
+      {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}},
+      {1.f, 1.f},
+      {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}},
+  };
+
+  ImplicitGemm implicit_gemm_op;
+  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
+
+  auto ctx = params.ctx;
+  auto stream = ctx->stream();
+  phi::Allocator::AllocationPtr tmp_gpu_ptrs_data =
+      phi::memory_utils::Alloc(
+          ctx->GetPlace(),
+          bytes,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+  void *workspace = tmp_gpu_ptrs_data->ptr();
+
+  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op.initialize(arguments, workspace);
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op(stream);
+  CUTLASS_CHECK(status);
+  return status;
+}
+
+std::vector<std::function<cutlass::Status(const ConvAllParams &)>>
+    Conv2dDepthwiseBias_all_func = {conv2d_depthwise_bias_0,
+conv2d_depthwise_bias_1,
+conv2d_depthwise_bias_2,
+conv2d_depthwise_bias_3,
+conv2d_depthwise_bias_4,
+conv2d_depthwise_bias_5,
+conv2d_depthwise_bias_6,
+conv2d_depthwise_bias_7,
+};
+
+std::map<std::vector<int>, int> map_problem_Conv2dDepthwiseBias;
+std::mutex Conv2dDepthwiseBias_mutex;
+
+void Conv2dDepthwiseBias(const ConvAllParams& params) {
+  int batch = params.batch;
+  int ic = params.ic;
+  int ih = params.ih;
+  int iw = params.iw;
+  int kh = params.kh;
+  int kw = params.kw;
+  int oc = params.oc;
+  //int pad_h0 = params.pad_h0;
+  //int pad_w0 = params.pad_w0;
+  int groups = params.groups;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w};
+
+  if (map_problem_Conv2dDepthwiseBias.count(problem_size)) {
+    Conv2dDepthwiseBias_all_func[map_problem_Conv2dDepthwiseBias.at(problem_size)](
+        params);
+    return;
+  }
+
+  int best_config_index = ProfileToGetBestConfig(
+      Conv2dDepthwiseBias_all_func, params, CONV2D_DEPTHWISE_BIAS);
+
+  std::lock_guard<std::mutex> guard(Conv2dDepthwiseBias_mutex);
+
+  map_problem_Conv2dDepthwiseBias[problem_size] = best_config_index;
+  Conv2dDepthwiseBias_all_func[best_config_index](params);
+}
+
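+// Dispatch pattern shared by the generated entry points in this file
+// (Conv2dDepthwiseBias above and the *Relu/*Sigmoid/*Silu variants below):
+// the problem shape (batch, channels, spatial sizes, kernel size, groups,
+// strides) is used as a key into a std::map; on a cache miss,
+// ProfileToGetBestConfig selects the fastest generated kernel variant and the
+// winning index is stored under a mutex, so later calls with the same shape
+// dispatch directly to the cached kernel without re-profiling.
+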
+cutlass::Status conv2d_depthwise_bias_relu_0(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,9>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,16,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_relu_1(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + 
cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,9>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,32,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_relu_2(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,9>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,16,9>, + cutlass::gemm::GemmShape<1,1,1>, + 
cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_relu_3(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,9>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,32,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + 
cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_relu_4(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,25>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,16,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = 
params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_relu_5(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,25>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,32,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int 
groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_relu_6(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,25>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,16,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / 
groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_relu_7(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,25>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,32,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + 
phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +std::vector> + Conv2dDepthwiseBiasRelu_all_func = {conv2d_depthwise_bias_relu_0, +conv2d_depthwise_bias_relu_1, +conv2d_depthwise_bias_relu_2, +conv2d_depthwise_bias_relu_3, +conv2d_depthwise_bias_relu_4, +conv2d_depthwise_bias_relu_5, +conv2d_depthwise_bias_relu_6, +conv2d_depthwise_bias_relu_7, +}; + +std::map, int> map_problem_Conv2dDepthwiseBiasRelu; +std::mutex Conv2dDepthwiseBiasRelu_mutex; + +void Conv2dDepthwiseBiasRelu(const ConvAllParams& params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + //int pad_h0 = params.pad_h0; + //int pad_w0 = params.pad_w0; + int groups = params.groups; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + std::vector problem_size = { + batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; + + if (map_problem_Conv2dDepthwiseBiasRelu.count(problem_size)) { + Conv2dDepthwiseBiasRelu_all_func[map_problem_Conv2dDepthwiseBiasRelu.at(problem_size)]( + params); + return; + } + + int best_config_index = ProfileToGetBestConfig( + Conv2dDepthwiseBiasRelu_all_func, params, CONV2D_DEPTHWISE_BIAS_RELU); + + std::lock_guard guard(Conv2dDepthwiseBiasRelu_mutex); + + map_problem_Conv2dDepthwiseBiasRelu[problem_size] = best_config_index; + Conv2dDepthwiseBiasRelu_all_func[best_config_index](params); +} + +cutlass::Status conv2d_depthwise_bias_sigmoid_0(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,9>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,16,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using 
ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_sigmoid_1(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,9>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,32,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = 
params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_sigmoid_2(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,9>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,16,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = 
params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_sigmoid_3(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,9>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,32,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + 
cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_sigmoid_4(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,25>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,16,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + 
phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_sigmoid_5(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,25>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,32,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + 
{(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_sigmoid_6(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,25>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,16,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = 
params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_sigmoid_7(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,25>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,32,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = 
implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +std::vector> + Conv2dDepthwiseBiasSigmoid_all_func = {conv2d_depthwise_bias_sigmoid_0, +conv2d_depthwise_bias_sigmoid_1, +conv2d_depthwise_bias_sigmoid_2, +conv2d_depthwise_bias_sigmoid_3, +conv2d_depthwise_bias_sigmoid_4, +conv2d_depthwise_bias_sigmoid_5, +conv2d_depthwise_bias_sigmoid_6, +conv2d_depthwise_bias_sigmoid_7, +}; + +std::map, int> map_problem_Conv2dDepthwiseBiasSigmoid; +std::mutex Conv2dDepthwiseBiasSigmoid_mutex; + +void Conv2dDepthwiseBiasSigmoid(const ConvAllParams& params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + //int pad_h0 = params.pad_h0; + //int pad_w0 = params.pad_w0; + int groups = params.groups; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + + std::vector problem_size = { + batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; + + if (map_problem_Conv2dDepthwiseBiasSigmoid.count(problem_size)) { + Conv2dDepthwiseBiasSigmoid_all_func[map_problem_Conv2dDepthwiseBiasSigmoid.at(problem_size)]( + params); + return; + } + + int best_config_index = ProfileToGetBestConfig( + Conv2dDepthwiseBiasSigmoid_all_func, params, CONV2D_DEPTHWISE_BIAS_SIGMOID); + + std::lock_guard guard(Conv2dDepthwiseBiasSigmoid_mutex); + + map_problem_Conv2dDepthwiseBiasSigmoid[problem_size] = best_config_index; + Conv2dDepthwiseBiasSigmoid_all_func[best_config_index](params); +} + +cutlass::Status conv2d_depthwise_bias_silu_0(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,9>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,16,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + 
{batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_silu_1(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,9>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,32,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + 
phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_silu_2(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,9>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,16,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t 
*)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_silu_3(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,9>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<3,3>, + + cutlass::gemm::GemmShape<16,32,9>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = 
ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_silu_4(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,25>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,16,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + 
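// Every per-configuration function in this file runs the same CUTLASS
+  // device-API sequence: get_workspace_size() sizes whatever scratch the
+  // kernel needs, can_implement() rejects problem shapes this tile
+  // configuration cannot cover, initialize() binds the arguments and
+  // workspace, and operator()(stream) launches the kernel. The separate
+  // filter_workspace buffer allocated above gives the direct-convolution
+  // kernel scratch space for a reordered copy of the filter. +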
status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_silu_5(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,32,25>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,32,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<1,1>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_silu_6(const ConvAllParams& params) { + using kernel_base = + typename 
cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<64,16,25>, + cutlass::conv::TensorNHWCShape<1,8,8,16>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,16,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +cutlass::Status conv2d_depthwise_bias_silu_7(const ConvAllParams& params) { + using kernel_base = + typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm70, + 
cutlass::gemm::GemmShape<64,32,25>, + cutlass::conv::TensorNHWCShape<1,8,8,32>, + cutlass::MatrixShape<5,5>, + + cutlass::gemm::GemmShape<16,32,25>, + cutlass::gemm::GemmShape<1,1,1>, + cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, + cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, + cutlass::conv::StrideSupport::kStrided, + cutlass::MatrixShape<2,2>, + cutlass::MatrixShape<1, 1> + >::Kernel; + + using ImplicitGemm = + cutlass::conv::device::DirectConvolution; + const half *input = params.input; + const half *weight = params.weight; + const half *bias = params.bias; + half *output = params.output; + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + int pad_h0 = params.pad_h0; + int pad_w0 = params.pad_w0; + int stride_h = params.stride_h; + int stride_w = params.stride_w; + int groups = params.groups; + int kc = ic / groups; + + int oh = params.oh; + int ow = params.ow; + int dilation_h = params.dilation_h; + int dilation_w = params.dilation_w; + int split_k_slices = (oh * ow + 63) / 64; + + cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, + {oc, kh, kw, ic / groups}, + {pad_h0, 0, pad_w0, 0}, + {stride_h, stride_w}, + {dilation_h, dilation_w}, + {batch, oh, ow, oc}, + cutlass::conv::Mode::kCrossCorrelation, + split_k_slices, + groups); + +size_t filter_size = oc * kh * kw * kc * sizeof(half); +phi::Allocator::AllocationPtr filter_gpu_ptrs_data = + phi::memory_utils::Alloc( + params.ctx->GetPlace(), + filter_size, + phi::Stream(reinterpret_cast(params.ctx->stream()))); +void *filter_workspace = filter_gpu_ptrs_data->ptr(); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, + {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, + {(cutlass::half_t *)bias, {0, 0, 0}}, + {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, + {1.f, 1.f}, + {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, + }; + + ImplicitGemm implicit_gemm_op; + size_t bytes = implicit_gemm_op.get_workspace_size(arguments); + + auto ctx = params.ctx; + auto stream = ctx->stream(); + phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = + phi::memory_utils::Alloc( + ctx->GetPlace(), + bytes, + phi::Stream(reinterpret_cast(stream))); + void *workspace = tmp_gpu_ptrs_data->ptr(); + + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + status = implicit_gemm_op.initialize(arguments, workspace); + CUTLASS_CHECK(status); + status = implicit_gemm_op(stream); + CUTLASS_CHECK(status); + return status; +} + +std::vector> + Conv2dDepthwiseBiasSilu_all_func = {conv2d_depthwise_bias_silu_0, +conv2d_depthwise_bias_silu_1, +conv2d_depthwise_bias_silu_2, +conv2d_depthwise_bias_silu_3, +conv2d_depthwise_bias_silu_4, +conv2d_depthwise_bias_silu_5, +conv2d_depthwise_bias_silu_6, +conv2d_depthwise_bias_silu_7, +}; + +std::map, int> map_problem_Conv2dDepthwiseBiasSilu; +std::mutex Conv2dDepthwiseBiasSilu_mutex; + +void Conv2dDepthwiseBiasSilu(const ConvAllParams& params) { + int batch = params.batch; + int ic = params.ic; + int ih = params.ih; + int iw = params.iw; + int kh = params.kh; + int kw = params.kw; + int oc = params.oc; + //int pad_h0 = params.pad_h0; + //int pad_w0 = params.pad_w0; + int groups = 
params.groups;
+  int stride_h = params.stride_h;
+  int stride_w = params.stride_w;
+
+  std::vector<int> problem_size = {
+      batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w};
+
+  if (map_problem_Conv2dDepthwiseBiasSilu.count(problem_size)) {
+    Conv2dDepthwiseBiasSilu_all_func[map_problem_Conv2dDepthwiseBiasSilu.at(problem_size)](
+        params);
+    return;
+  }
+
+  int best_config_index = ProfileToGetBestConfig(
+      Conv2dDepthwiseBiasSilu_all_func, params, CONV2D_DEPTHWISE_BIAS_SILU);
+
+  std::lock_guard<std::mutex> guard(Conv2dDepthwiseBiasSilu_mutex);
+
+  map_problem_Conv2dDepthwiseBiasSilu[problem_size] = best_config_index;
+  Conv2dDepthwiseBiasSilu_all_func[best_config_index](params);
+}
+
+}  // namespace cutlass_internal
+}  // namespace fusion
+}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/arch_define.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/arch_define.h
new file mode 100644
index 0000000000000..95537512437e7
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/arch_define.h
@@ -0,0 +1,4 @@
+
+// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit.
+
+#define USE_FPAINTB_GEMM_WITH_SM80
diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_bias.cu
new file mode 100644
index 0000000000000..de7e712e91f73
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_bias.cu
@@ -0,0 +1,439 @@
+
+// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit.
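Before moving on to the generated GEMM launchers below: the two dispatchers that close the file above (Conv2dDepthwiseBiasSigmoid and Conv2dDepthwiseBiasSilu) share a tune-once-then-cache pattern. The first time a problem shape is seen, ProfileToGetBestConfig times every candidate kernel and the winning index is memoized under a key built from the shape integers; later calls with the same shape reuse the cached index directly. A minimal sketch of that pattern, with hypothetical names (ParamsSketch, CachedDispatch) standing in for the Paddle types:

// Minimal sketch of the tune-once-then-cache dispatch; not the Paddle code.
#include <functional>
#include <map>
#include <mutex>
#include <vector>

struct ParamsSketch {};  // stands in for ConvAllParams

using KernelFn = std::function<void(const ParamsSketch&)>;

void CachedDispatch(const std::vector<KernelFn>& candidates,
                    const std::vector<int>& shape_key,
                    const ParamsSketch& params,
                    std::map<std::vector<int>, int>& cache,
                    std::mutex& mu,
                    const std::function<int()>& profile_all_candidates) {
  {
    std::lock_guard<std::mutex> guard(mu);
    auto it = cache.find(shape_key);
    if (it != cache.end()) {             // shape already tuned: reuse the winner
      candidates[it->second](params);
      return;
    }
  }
  int best = profile_all_candidates();   // run and time every candidate once
  std::lock_guard<std::mutex> guard(mu);
  cache[shape_key] = best;               // remember the winner for this shape
  candidates[best](params);
}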
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + 
cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* 
weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_noBias.cu new file mode 100644 index 0000000000000..9e094fbb407d1 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_noBias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
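Each of the autogen_tmp launcher files added here (the *_bias.cu variant above and this *_noBias.cu one) explicitly instantiates generic_mixed_gemm_kernelLauncher_template for a single combination of activation type (bf16), quantized weight type (uint8_t or cutlass::uint4b_t), target architecture, epilogue, and stage count, repeated over a fixed menu of threadblock/warp tile shapes. Splitting the instantiations across files keeps the compile time and memory of any one translation unit bounded. Below is a sketch of how a caller could forward to one of these specializations at run time; the integer selector and its mapping to tile shapes are hypothetical, while the function name, argument list, and tile shapes are taken from this file:

// Sketch only: run-time selection between two of the tile shapes instantiated here.
template <typename Arch, typename Epilogue, int Stages>
void launch_bf16_int8_by_tile_sketch(int tile_id,
                                     const __nv_bfloat16* A,
                                     const uint8_t* B,
                                     const __nv_bfloat16* weight_scales,
                                     const __nv_bfloat16* biases,
                                     __nv_bfloat16* C,
                                     int m, int n, int k,
                                     CutlassGemmConfig gemm_config,
                                     char* workspace, size_t workspace_bytes,
                                     cudaStream_t stream, int* occupancy) {
  if (tile_id == 0) {
    generic_mixed_gemm_kernelLauncher_template<
        __nv_bfloat16, uint8_t, Arch, Epilogue,
        cutlass::gemm::GemmShape<16, 128, 64>,
        cutlass::gemm::GemmShape<16, 32, 64>, Stages>(
        A, B, weight_scales, biases, C, m, n, k, gemm_config, workspace,
        workspace_bytes, stream, occupancy);
  } else {  // one branch per generated (threadblock, warp) tile pair
    generic_mixed_gemm_kernelLauncher_template<
        __nv_bfloat16, uint8_t, Arch, Epilogue,
        cutlass::gemm::GemmShape<128, 128, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>, Stages>(
        A, B, weight_scales, biases, C, m, n, k, gemm_config, workspace,
        workspace_bytes, stream, occupancy);
  }
}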
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + 
EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const 
cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_bias.cu new file mode 100644 index 0000000000000..3dda2e10c076d --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_bias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
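The *_bias.cu and *_noBias.cu variants differ only in the epilogue functor the kernels are instantiated with: EpilogueOpBias adds the broadcast bias vector while combining accumulators, EpilogueOpNoBias omits it. Both are defined in fpA_intB_gemm_template.h; purely as an illustration of what such epilogues typically look like when assembled from stock CUTLASS functors (the element types and vector width below are placeholders, not the definitions used by this patch):

// Illustrative epilogue aliases built from the standard CUTLASS functor.
#include "cutlass/epilogue/thread/linear_combination.h"

using ElementOut = cutlass::bfloat16_t;  // placeholder output element type
using ElementAcc = float;                // placeholder accumulator type
constexpr int kVecWidth = 128 / cutlass::sizeof_bits<ElementOut>::value;

// Bias-style epilogue: the "source" operand is treated as a bias, so beta
// scaling is disabled and the bias is simply added to alpha * accumulator.
using IllustrativeEpilogueBias = cutlass::epilogue::thread::LinearCombination<
    ElementOut, kVecWidth, ElementAcc, ElementAcc,
    cutlass::epilogue::thread::ScaleType::NoBetaScaling>;

// No-bias epilogue: a plain linear combination of the accumulators.
using IllustrativeEpilogueNoBias = cutlass::epilogue::thread::LinearCombination<
    ElementOut, kVecWidth, ElementAcc, ElementAcc>;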
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + 
cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* 
weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_noBias.cu new file mode 100644 index 0000000000000..ef063b6b55bfb --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_noBias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
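The stages2 and stages3 file pairs differ only in the last template argument, the number of software-pipeline stages in the GEMM mainloop: each additional stage keeps one more global-to-shared tile load in flight, improving latency hiding at the cost of one extra tile of shared memory per threadblock. A back-of-the-envelope estimate for the largest tile instantiated in these files, ignoring quantization scales, padding, and epilogue storage (illustration only):

// Rough shared-memory cost of the mainloop staging buffers per threadblock.
#include <cstdio>

constexpr long long kTileM = 128, kTileN = 256, kTileK = 64;
constexpr long long kBytesA = 2;  // bf16 activations
constexpr long long kBytesB = 1;  // uint8_t (int8) weights

constexpr long long BytesPerStage() {
  return kTileM * kTileK * kBytesA + kTileK * kTileN * kBytesB;
}

int main() {
  // Largest tile here (128x256x64): one stage is ~32 KB, so 2 stages need
  // ~64 KB and 3 stages ~96 KB -- the 3-stage variants rely on the large
  // opt-in shared-memory carve-out that SM80-class GPUs provide.
  std::printf("per stage: %lld KB, 2 stages: %lld KB, 3 stages: %lld KB\n",
              BytesPerStage() / 1024, 2 * BytesPerStage() / 1024,
              3 * BytesPerStage() / 1024);
  return 0;
}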
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + 
EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const 
cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_bias.cu new file mode 100644 index 0000000000000..8e83a67d39ac3 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_bias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
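The paired *_bias.cu / *_noBias.cu files differ only in which epilogue functor they instantiate. The two lines below are a functional summary (assumed) of the element-wise math the epilogues apply to each accumulator before it is written to C; they are not the actual CUTLASS definitions of EpilogueOpBias and EpilogueOpNoBias.

// Assumed element-wise behavior; biases[j] is broadcast along output column j.
inline float epilogue_with_bias(float accum, float bias) { return accum + bias; }  // EpilogueOpBias
inline float epilogue_no_bias(float accum) { return accum; }                       // EpilogueOpNoBias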
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + 
cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* 
weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_noBias.cu new file mode 100644 index 0000000000000..037a9df710d29 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_noBias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
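All of these launchers implement the same weight-only-quantized GEMM; only the tile shapes, stage count, and epilogue vary. For reference, the scalar computation the fused kernel is expected to reproduce is sketched below. This is my summary based on the argument names A, B, weight_scales and biases, with per-output-column scales and a plain row-major layout assumed for readability (the real kernels use CUTLASS interleaved layouts); it is not code from this patch.

#include <cstdint>
#include <vector>

// C[i][j] = sum_k A[i][k] * (float(B[k][j]) * weight_scales[j])  (+ biases[j] with EpilogueOpBias)
void reference_mixed_gemm(const std::vector<float>& A,       // m*k activations, already in float
                          const std::vector<int8_t>& B,      // k*n quantized weights
                          const std::vector<float>& scales,  // n per-column dequant scales
                          const float* biases,               // n entries, or nullptr (no-bias epilogue)
                          std::vector<float>& C, int m, int n, int k) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = biases ? biases[j] : 0.0f;
      for (int kk = 0; kk < k; ++kk) {
        acc += A[i * k + kk] * (static_cast<float>(B[kk * n + j]) * scales[j]);
      }
      C[i * n + j] = acc;
    }
  }
}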
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + 
EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const 
cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_bias.cu new file mode 100644 index 0000000000000..1707f4580d253 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_bias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
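At run time something has to pick one of these explicit instantiations from the CutlassGemmConfig a caller supplies. The dispatcher sketched below is hypothetical: TileConfig and dispatch_to_tile are names invented here for illustration, and the real selection logic lives in fpA_intB_gemm_template.h. The sketch only shows the switch-over-tile-shape pattern that generated specializations like these are typically paired with, and it assumes the same header include as the generated files.

// Hypothetical run-time dispatch over the generated instantiations.
enum class TileConfig {
  CtaShape16x128x64_WarpShape16x32x64,
  CtaShape32x128x64_WarpShape32x32x64,
  CtaShape64x128x64_WarpShape64x64x64,
  CtaShape128x128x64_WarpShape64x64x64,
  CtaShape128x256x64_WarpShape64x64x64,
};

template <typename T, typename WeightType, typename Arch, typename EpilogueOp, int Stages>
void dispatch_to_tile(TileConfig tile, const T* A, const WeightType* B, const T* scales,
                      const T* biases, T* C, int m, int n, int k, CutlassGemmConfig cfg,
                      char* ws, size_t ws_bytes, cudaStream_t stream, int* occupancy) {
  using cutlass::gemm::GemmShape;
  switch (tile) {
    case TileConfig::CtaShape16x128x64_WarpShape16x32x64:
      generic_mixed_gemm_kernelLauncher_template<T, WeightType, Arch, EpilogueOp,
          GemmShape<16, 128, 64>, GemmShape<16, 32, 64>, Stages>(
          A, B, scales, biases, C, m, n, k, cfg, ws, ws_bytes, stream, occupancy);
      break;
    case TileConfig::CtaShape128x128x64_WarpShape64x64x64:
      generic_mixed_gemm_kernelLauncher_template<T, WeightType, Arch, EpilogueOp,
          GemmShape<128, 128, 64>, GemmShape<64, 64, 64>, Stages>(
          A, B, scales, biases, C, m, n, k, cfg, ws, ws_bytes, stream, occupancy);
      break;
    // ... the remaining tile shapes follow the same pattern ...
    default:
      break;
  }
}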
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + 
cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* 
weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_noBias.cu new file mode 100644 index 0000000000000..6190532c83f51 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_noBias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
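The trailing integer template argument in each specialization is the CUTLASS stage count. A short note on what it controls, kept as a code comment so the assumptions stay explicit:

// The last template argument is the software pipeline depth: how many K-slices
// of the A and B tiles are kept in flight in shared memory (multistage cp.async
// pipelining on SM80). Deeper pipelines hide more global-memory latency but use
// more shared memory, which can reduce occupancy, so the generator emits several
// depths per tile shape, presumably so a profiling pass can choose among them.
// Stage counts that appear in this patch's generated files:
constexpr int kGeneratedStageCounts[] = {2, 3, 4, 5};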
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + 
EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const uint8_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + uint8_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<16, 128, 64>, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<32, 128, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const 
cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const __nv_bfloat16* A, + const cutlass::uint4b_t* B, + const __nv_bfloat16* weight_scales, + const __nv_bfloat16* biases, + __nv_bfloat16* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<__nv_bfloat16, + cutlass::uint4b_t, + cutlass::arch::Sm80, + EpilogueOpNoBias, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_bias.cu new file mode 100644 index 0000000000000..b9961a387e02d --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_bias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
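The fp16 files that follow mirror the bf16 ones one-for-one, with half in place of __nv_bfloat16 and stage counts starting at 2. A sketch of the expected first specialization in generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_bias.cu is given below; the leading template arguments are inferred from the bf16 pattern above (the warp tile <16, 32, 64> pairs with the threadblock tile <16, 128, 64> throughout), so treat this as an assumption rather than a verbatim quote of the generated file.

// Assumed form of the fp16 / int8 / SM80 / bias / 2-stage specialization,
// mirroring the bf16 files above (not copied verbatim from the patch).
template <>
void generic_mixed_gemm_kernelLauncher_template<half,
                                                uint8_t,
                                                cutlass::arch::Sm80,
                                                EpilogueOpBias,
                                                cutlass::gemm::GemmShape<16, 128, 64>,
                                                cutlass::gemm::GemmShape<16, 32, 64>,
                                                2>(
    const half* A, const uint8_t* B, const half* weight_scales, const half* biases,
    half* C, int m, int n, int k, CutlassGemmConfig gemm_config, char* workspace,
    size_t workspace_bytes, cudaStream_t stream, int* occupancy) {
  generic_mixed_gemm_kernelLauncher<half, uint8_t, cutlass::arch::Sm80, EpilogueOpBias,
                                    cutlass::gemm::GemmShape<16, 128, 64>,
                                    cutlass::gemm::GemmShape<16, 32, 64>, 2>(
      A, B, weight_scales, biases, C, m, n, k, gemm_config, workspace,
      workspace_bytes, stream, occupancy);
}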
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); 
+} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_noBias.cu new file mode 100644 index 0000000000000..65c5476ee32ee --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_noBias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); 
+} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 2>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_bias.cu new file mode 100644 index 0000000000000..8412cb4a9a29a --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_bias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); 
+} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_noBias.cu new file mode 100644 index 0000000000000..ffda7f835c359 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_noBias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); 
+} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 3>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_bias.cu new file mode 100644 index 0000000000000..f5c6440565500 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_bias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); 
+} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_noBias.cu new file mode 100644 index 0000000000000..bed2b479bf58e --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_noBias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); 
+} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 4>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_bias.cu new file mode 100644 index 0000000000000..a9cbad5330d9a --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_bias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); 
+} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_noBias.cu new file mode 100644 index 0000000000000..2fdeec9273b6c --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_noBias.cu @@ -0,0 +1,439 @@ + +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const uint8_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<16, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); 
+} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<32, 32, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +template<> +void generic_mixed_gemm_kernelLauncher_template, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + const half* A, + const cutlass::uint4b_t* B, + const half* weight_scales, + const half* biases, + half* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher, + cutlass::gemm::GemmShape<64, 64, 64>, + 5>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} + +} // namespace phi + diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 22c92ab9ebeb5..41f71a5a835fd 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1363,7 +1363,7 @@ def exponential_(x, lam=1.0, name=None): f(x) = \lambda e^{-\lambda x} Args: - x(Tensor): Input tensor. The data type should be float32, float64. + x (Tensor): Input tensor. The data type should be float32, float64. lam(float, optional): :math:`\lambda` parameter of Exponential Distribution. Default, 1.0. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please @@ -1419,15 +1419,26 @@ def bernoulli_(x, p=0.5, name=None): - x (Tensor): Input Tensor ``x``. Examples: .. 
code-block:: python
+
+            >>> import paddle
+            >>> x = paddle.empty((3, 4)).uniform_(0, 1)
+            >>> x.bernoulli_()
+            >>> # doctest: +SKIP('random check')
+            >>> print(x)
+            Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[1., 1., 1., 1.],
+            [1., 0., 0., 1.],
+            [0., 1., 1., 0.]])
+            >>> # doctest: -SKIP
+
     """
-    if 0 <= p and p <= 1:
+    if not (0 <= p <= 1):
         raise ValueError(f"bernoulli_ expects p to be in [0, 1], but got p={p}")
 
-    check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential")
+    check_variable_and_dtype(x, "x", ["float32", "float64"], "bernoulli_")
 
-    uniform_(x, 0, 1)
-    return (x < p).astype(x.dtype)
-    pass
+    uniform_(x, min=0., max=1.)
+    return x.set_value((x < p).astype(x.dtype))
 
 
 def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None):
@@ -1436,8 +1447,10 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None):
     Distribution, with ``mean``, ``std``, ``shape`` and ``dtype``.
     The Log Normal Distribution is defined as follows:
 
-    .. math::
-        f(x) = \frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(\ln{x}-\mu)^2}{2\sigma^2}}
+    Equation:
+        .. math::
+
+            f(x) = \frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(\ln{x}-\mu)^2}{2\sigma^2}}
 
     Args:
         mean (float|Tensor, optional): The mean of the output Tensor's normal distribution.
@@ -1458,6 +1471,8 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None):
         - out (Tensor): A Tensor filled with random values sampled from a log normal distribution with ``mean`` and ``std`` .
     Examples:
         .. code-block:: python
+
+            :name: log_normal-example-1
             >>> import paddle
             >>> out1 = paddle.log_normal(shape=[2, 3])
             >>> print(out1)
@@ -1466,6 +1481,9 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None):
             [[-0.85107994, -0.85490644, -1.35941815],
             [-0.55500370, 0.20964541, 2.24193954]])
             >>> # doctest: -SKIP
+
+            :name: log_normal-example-2
+            >>> import paddle
             >>> mean_tensor = paddle.to_tensor([1.0, 2.0, 3.0])
             >>> out2 = paddle.log_normal(mean=mean_tensor)
             >>> print(out2)
@@ -1473,6 +1491,9 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None):
             Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
             [1.05411839, 3.71514320, 3.42665267])
             >>> # doctest: -SKIP
+
+            :name: log_normal-example-3
+            >>> import paddle
             >>> std_tensor = paddle.to_tensor([1.0, 2.0, 3.0])
             >>> out3 = paddle.log_normal(mean=mean_tensor, std=std_tensor)
             >>> print(out3)
@@ -1480,6 +1501,7 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None):
             Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
             [0.48646951, 0.00815189, 3.74022293])
             >>> # doctest: -SKIP
+
     """
     op_type_for_check = 'gaussian/standard_normal/randn/normal'
     supported_dtypes = ['float32', 'float64', 'float16', 'uint16', 'bfloat16']
@@ -1494,8 +1516,11 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None):
     )
     if not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
+
+    n_mean = paddle.log(mean ** 2 / paddle.sqrt(mean ** 2 + std ** 2))
+    n_std = paddle.sqrt(paddle.log(1 + (std ** 2 / mean ** 2)))
 
-    distribution = gaussian(shape, mean=mean, std=std, seed=seed, dtype=dtype)
+    distribution = gaussian(shape, mean=n_mean, std=n_std, seed=seed, dtype=dtype)
 
     return paddle.exp(distribution)
 
@@ -1505,8 +1530,10 @@ def log_normal_(x, mean=0.0, std=1.0, seed=0, name=None):
     This inplace OP fill input Tensor ``x`` with random number from a Log Normal Distribution
     with ``mean`` and ``std``. The Log Normal Distribution is defined as follows:
 
-    .. math::
-        f(x) = \frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(\ln{x}-\mu)^2}{2\sigma^2}}
+    Equation:
+        .. math::
+
+            f(x) = \frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(\ln{x}-\mu)^2}{2\sigma^2}}
 
     Args:
         x (Tensor): The input tensor to be filled with random values.
@@ -1525,6 +1552,7 @@ def log_normal_(x, mean=0.0, std=1.0, seed=0, name=None):
         A Tensor filled with random values sampled from a normal distribution with ``mean`` and ``std`` .
     Examples:
         .. code-block:: python
+
            >>> import paddle
           >>> x = paddle.randn([3, 4])
            >>> x.log_normal_()
@@ -1534,6 +1562,10 @@ def log_normal_(x, mean=0.0, std=1.0, seed=0, name=None):
            [[ 0.06132207,  1.11349595,  0.41906244, -0.24858207],
            [-1.85169315, -1.50370061,  1.73954511,  0.13331604],
            [ 1.66359663, -0.55764782, -0.59911072, -0.57773495]])
+           >>> # doctest: -SKIP
+
     """
+    n_mean = paddle.log(mean ** 2 / paddle.sqrt(mean ** 2 + std ** 2))
+    n_std = paddle.sqrt(paddle.log(1 + (std ** 2 / mean ** 2)))
 
-    return gaussian_(x, mean=mean, std=std, seed=seed).exp_()
+    return gaussian_(x, mean=n_mean, std=n_std, seed=seed).exp_()
 
diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py
index fd91763e1a2df..52c2514f29ea5 100644
--- a/test/legacy_test/test_inplace.py
+++ b/test/legacy_test/test_inplace.py
@@ -1632,7 +1632,24 @@ def non_inplace_api_processing(self, var):
         return paddle.index_fill(var, self.index, self.axis, self.value)
 
 
-class TestDygraphInplaceIndexFill(TestDygraphInplace):
+class TestDygraphInplaceBernoulli(TestDygraphInplace):
+    def init_data(self):
+        self.shape = (20, 40)
+        self.x = np.random.random(self.shape)
+        self.dtype = "float32"
+        self.mean = 0
+        self.std = 1
+        self.seed = 100
+        self.p = 0.5
+
+    def inplace_api_processing(self, var):
+        return paddle.bernoulli_(var, p=self.p)
+
+    def non_inplace_api_processing(self, var):
+        return paddle.bernoulli(paddle.zeros(self.shape) + self.p)
+
+
+class TestDygraphInplaceLogNormal(TestDygraphInplace):
     def init_data(self):
         self.shape = (20, 40)
         self.x = np.random.random(self.shape)
@@ -1642,7 +1659,10 @@ def init_data(self):
         self.seed = 100
 
     def inplace_api_processing(self, var):
-        return paddle.log_normal_(self.x, self.shape, self.mean, self.std, self.seed)
+        return paddle.log_normal_(var, self.mean, self.std, self.seed)
+
+    def non_inplace_api_processing(self, var):
+        return paddle.log_normal(self.shape, self.mean, self.std, self.seed)
 
 
 if __name__ == '__main__':

From 59bb88f46110bbde5a3e37dcf22d151d56b10dee Mon Sep 17 00:00:00 2001
From: PommesPeter <434596665@qq.com>
Date: Tue, 19 Dec 2023 13:20:43 +0800
Subject: [PATCH 05/15] :mute: Remove: deleted extra files

---
 .../ops_signature/generated_fused_sig.cc      |  1022 --
 .../operators/ops_signature/generated_sig.cc  |  9755 -----------------
 .../ops_signature/generated_sparse_sig.cc     |  2735 -----
 .../ops_signature/generated_static_sig.cc     |  1585 ---
 4 files changed, 15097 deletions(-)
 delete mode 100644 paddle/fluid/operators/ops_signature/generated_fused_sig.cc
 delete mode 100644 paddle/fluid/operators/ops_signature/generated_sig.cc
 delete mode 100644 paddle/fluid/operators/ops_signature/generated_sparse_sig.cc
 delete mode 100644 paddle/fluid/operators/ops_signature/generated_static_sig.cc

diff --git a/paddle/fluid/operators/ops_signature/generated_fused_sig.cc b/paddle/fluid/operators/ops_signature/generated_fused_sig.cc
deleted file mode 100644
index 6a21ac83c4714..0000000000000
--- a/paddle/fluid/operators/ops_signature/generated_fused_sig.cc
+++ /dev/null
@@ -1,1022 +0,0 @@
-// this file is generated by paddle/phi/op/yaml/generator/generate_op.py, do not edit.
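Both log_normal and log_normal_ above derive the parameters of the underlying normal
distribution from the requested log-normal ``mean`` and ``std`` by moment matching:
mu = ln(mean^2 / sqrt(mean^2 + std^2)) and sigma = sqrt(ln(1 + std^2 / mean^2)), which is
exactly what the n_mean / n_std lines compute. A minimal standalone sanity check of that
conversion (a sketch only, not part of this diff; it assumes NumPy, and the helper name
and sample values are illustrative):

    import numpy as np

    def underlying_normal_params(mean, std):
        # Moment matching: pick (mu, sigma) of the underlying normal so that
        # exp(N(mu, sigma)) has the requested mean and standard deviation.
        mu = np.log(mean ** 2 / np.sqrt(mean ** 2 + std ** 2))
        sigma = np.sqrt(np.log(1.0 + std ** 2 / mean ** 2))
        return mu, sigma

    mu, sigma = underlying_normal_params(2.0, 3.0)
    samples = np.exp(np.random.default_rng(0).normal(mu, sigma, size=1_000_000))
    print(samples.mean(), samples.std())  # should be close to 2.0 and 3.0 respectively
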
-#include "paddle/phi/core/compat/op_utils.h" -#include "paddle/utils/small_vector.h" - -namespace phi { - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AddActXpuOpArgumentMapping: - -return KernelSignature("add_act_xpu", {"x", "x_max", "y", "y_max"}, {"act_type"}, {"out", "out_max"}); -****************************************************************** -*/ - -KernelSignature AddActXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "x_max", "y", "y_max"}; - paddle::small_vector attrs; - attrs.emplace_back("act_type"); - paddle::small_vector outputs {"out", "out_max"}; - return KernelSignature("add_act_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AddLayernormXpuOpArgumentMapping: - -return KernelSignature("add_layernorm_xpu", {"x", "y", "scale", "bias"}, {"begin_norm_axis", "epsilon"}, {"out"}); -****************************************************************** -*/ - -KernelSignature AddLayernormXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "scale", "bias"}; - paddle::small_vector attrs; - attrs.emplace_back("begin_norm_axis"); - attrs.emplace_back("epsilon"); - paddle::small_vector outputs {"out"}; - return KernelSignature("add_layernorm_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AddcmulXpuOpArgumentMapping: - -return KernelSignature("addcmul_xpu", {"x", "y", "w"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature AddcmulXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "w"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - return KernelSignature("addcmul_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BlockMultiheadAttentionOpArgumentMapping: - -return KernelSignature("block_multihead_attention", {"qkv", "key_cache", "value_cache", "seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time", "padding_offsets", "cum_offsets", "cu_seqlens_q", "cu_seqlens_k", "block_tables", "pre_key_cache", "pre_value_cache", "rope_emb", "mask", "tgt_mask"}, {"max_seq_len", "block_size", "use_neox_style"}, {"fmha_out", "qkv_out", "key_cache_out", "value_cache_out"}); -****************************************************************** -*/ - -KernelSignature BlockMultiheadAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"qkv", "key_cache", "value_cache", "seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time", "padding_offsets", "cum_offsets", "cu_seqlens_q", "cu_seqlens_k", "block_tables", "pre_key_cache", "pre_value_cache", "rope_emb", "mask", "tgt_mask"}; - paddle::small_vector attrs; - attrs.emplace_back("max_seq_len"); - attrs.emplace_back("block_size"); - 
attrs.emplace_back("use_neox_style"); - paddle::small_vector outputs {"fmha_out", "qkv_out", "key_cache_out", "value_cache_out"}; - return KernelSignature("block_multihead_attention", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BnActXpuOpArgumentMapping: - -return KernelSignature("bn_act_xpu", {"x", "mean", "variance", "scale", "bias"}, {"momentum", "epsilon", "data_layout", "act_type"}, {"out"}); -****************************************************************** -*/ - -KernelSignature BnActXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "mean", "variance", "scale", "bias"}; - paddle::small_vector attrs; - attrs.emplace_back("momentum"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("data_layout"); - attrs.emplace_back("act_type"); - paddle::small_vector outputs {"out"}; - return KernelSignature("bn_act_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv1dXpuOpArgumentMapping: - -return KernelSignature("conv1d_xpu", {"x", "x_max", "filter", "filter_max", "bias", "branch", "branch_max"}, {"paddings", "padding_algorithm", "dilations", "strides", "groups", "act_type", "act_param"}, {"out", "out_max"}); -****************************************************************** -*/ - -KernelSignature Conv1dXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "x_max", "filter", "filter_max", "bias", "branch", "branch_max"}; - paddle::small_vector attrs; - attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("dilations"); - attrs.emplace_back("strides"); - attrs.emplace_back("groups"); - attrs.emplace_back("act_type"); - attrs.emplace_back("act_param"); - paddle::small_vector outputs {"out", "out_max"}; - return KernelSignature("conv1d_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv2dTransposeXpuOpArgumentMapping: - -return KernelSignature("conv2d_transpose_xpu", {"x", "x_max", "filter", "filter_max", "bias"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format", "has_bias", "with_act", "act_type"}, {"out", "out_max"}); -return KernelSignature("conv2d_transpose_xpu", {"x", "x_max", "filter", "filter_max", "bias"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format", "has_bias", "with_act", "act_type"}, {"out", "out_max"}); -return KernelSignature("conv2d_transpose_xpu", {"x", "x_max", "filter", "filter_max", "bias"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format", "has_bias", "with_act", "act_type"}, {"out", "out_max"}); -****************************************************************** -*/ - -KernelSignature Conv2dTransposeXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "x_max", "filter", "filter_max", "bias"}; - 
paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_padding"); - attrs.emplace_back( - ctx.HasInput("OutputSizeTensor") - ? "OutputSizeTensor" - : ctx.InputSize("OutputSizeTensorList") > 0 - ? "OutputSizeTensorList" - : "output_size"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - attrs.emplace_back("has_bias"); - attrs.emplace_back("with_act"); - attrs.emplace_back("act_type"); - paddle::small_vector outputs {"out", "out_max"}; - return KernelSignature("conv2d_transpose_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv2dXpuOpArgumentMapping: - -return KernelSignature("conv2d_xpu", {"x", "x_max", "filter", "filter_max", "bias", "branch", "branch_max", "scale_max", "out_max_in"}, {"paddings", "dilations", "strides", "padding_algorithm", "groups", "act_type", "act_param", "out_dtype"}, {"out", "out_max"}); -****************************************************************** -*/ - -KernelSignature Conv2dXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "x_max", "filter", "filter_max", "bias", "branch", "branch_max", "scale_max", "out_max_in"}; - paddle::small_vector attrs; - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - attrs.emplace_back("strides"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("act_type"); - attrs.emplace_back("act_param"); - attrs.emplace_back("out_dtype"); - paddle::small_vector outputs {"out", "out_max"}; - return KernelSignature("conv2d_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DequantizeXpuOpArgumentMapping: - -return KernelSignature("dequantize_xpu", {"x"}, {"out_dtype", "scale"}, {"y"}); -****************************************************************** -*/ - -KernelSignature DequantizeXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("out_dtype"); - attrs.emplace_back("scale"); - paddle::small_vector outputs {"y"}; - return KernelSignature("dequantize_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EmbeddingWithEltwiseAddXpuOpArgumentMapping: - -return KernelSignature("embedding_with_eltwise_add_xpu", {"ids", "tables", "mask"}, {"padding_idx"}, {"out", "seq_lod", "max_seq_len"}); -****************************************************************** -*/ - -KernelSignature EmbeddingWithEltwiseAddXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"ids", "tables", "mask"}; - paddle::small_vector attrs; - attrs.emplace_back("padding_idx"); - paddle::small_vector outputs {"out", "seq_lod", "max_seq_len"}; - return KernelSignature("embedding_with_eltwise_add_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FastLayernormXpuOpArgumentMapping: - -return KernelSignature("fast_layernorm_xpu", {"x", "scale", "bias"}, {"begin_norm_axis", "epsilon"}, {"out"}); -****************************************************************** -*/ - -KernelSignature FastLayernormXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "scale", "bias"}; - paddle::small_vector attrs; - attrs.emplace_back("begin_norm_axis"); - attrs.emplace_back("epsilon"); - paddle::small_vector outputs {"out"}; - return KernelSignature("fast_layernorm_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FastWhereXpuOpArgumentMapping: - -return KernelSignature("fast_where_xpu", {"condition", "x", "y"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature FastWhereXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"condition", "x", "y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - return KernelSignature("fast_where_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FcOpArgumentMapping: - -return KernelSignature("fc", {"Input", "W", "Bias"}, {"in_num_col_dims", "activation_type", "use_mkldnn", "padding_weights", "use_quantizer", "mkldnn_data_type", "Scale_in", "Scale_weights", "Scale_out", "force_fp32_output"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FcOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "W", "Bias"}; - paddle::small_vector attrs; - attrs.emplace_back("in_num_col_dims"); - attrs.emplace_back("activation_type"); - attrs.emplace_back("use_mkldnn"); - attrs.emplace_back("padding_weights"); - attrs.emplace_back("use_quantizer"); - attrs.emplace_back("mkldnn_data_type"); - attrs.emplace_back("Scale_in"); - attrs.emplace_back("Scale_weights"); - attrs.emplace_back("Scale_out"); - attrs.emplace_back("force_fp32_output"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("fc", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FcXpuOpArgumentMapping: - -return KernelSignature("fc_xpu", {"x", "x_max", "w", "w_max", "bias", "scale_max", "out_max_in"}, {"in_num_col_dims", "transpose_x", "alpha", "beta", "act_type", "act_alpha", "out_dtype"}, {"out", "out_max"}); -****************************************************************** -*/ - -KernelSignature FcXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "x_max", "w", "w_max", "bias", "scale_max", "out_max_in"}; - paddle::small_vector attrs; - attrs.emplace_back("in_num_col_dims"); - attrs.emplace_back("transpose_x"); - attrs.emplace_back("alpha"); - attrs.emplace_back("beta"); - attrs.emplace_back("act_type"); - 
attrs.emplace_back("act_alpha"); - attrs.emplace_back("out_dtype"); - paddle::small_vector outputs {"out", "out_max"}; - return KernelSignature("fc_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedBiasActOpArgumentMapping: - -return KernelSignature("fused_bias_act", {"x", "bias", "dequant_scales", "shift", "smooth"}, {"act_method", "compute_dtype", "quant_scale", "quant_round_type", "quant_max_bound", "quant_min_bound"}, {"out"}); -****************************************************************** -*/ - -KernelSignature FusedBiasActOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "bias", "dequant_scales", "shift", "smooth"}; - paddle::small_vector attrs; - attrs.emplace_back("act_method"); - attrs.emplace_back("compute_dtype"); - attrs.emplace_back("quant_scale"); - attrs.emplace_back("quant_round_type"); - attrs.emplace_back("quant_max_bound"); - attrs.emplace_back("quant_min_bound"); - paddle::small_vector outputs {"out"}; - return KernelSignature("fused_bias_act", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedBiasDropoutResidualLayerNormOpArgumentMapping: - -return KernelSignature("fused_bias_dropout_residual_layer_norm", {"X", "Residual", "Bias", "LnScale", "LnBias"}, {"dropout_rate", "is_test", "dropout_fix_seed", "dropout_seed", "dropout_implementation", "ln_epsilon"}, {"Y", "BiasDropoutResidualOut", "DropoutMaskOut", "LnMean", "LnVariance"}); -****************************************************************** -*/ - -KernelSignature FusedBiasDropoutResidualLayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Residual", "Bias", "LnScale", "LnBias"}; - paddle::small_vector attrs; - attrs.emplace_back("dropout_rate"); - attrs.emplace_back("is_test"); - attrs.emplace_back("dropout_fix_seed"); - attrs.emplace_back("dropout_seed"); - attrs.emplace_back("dropout_implementation"); - attrs.emplace_back("ln_epsilon"); - paddle::small_vector outputs {"Y", "BiasDropoutResidualOut", "DropoutMaskOut", "LnMean", "LnVariance"}; - return KernelSignature("fused_bias_dropout_residual_layer_norm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedBiasResidualLayernormOpArgumentMapping: - -return KernelSignature("fused_bias_residual_layernorm", {"x", "bias", "residual", "norm_weight", "norm_bias"}, {"epsilon", "residual_alpha", "begin_norm_axis", "quant_scale", "quant_round_type", "quant_max_bound", "quant_min_bound"}, {"out", "residual_out", "mean", "variance"}); -****************************************************************** -*/ - -KernelSignature FusedBiasResidualLayernormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "bias", "residual", "norm_weight", "norm_bias"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - attrs.emplace_back("residual_alpha"); - attrs.emplace_back("begin_norm_axis"); - attrs.emplace_back("quant_scale"); - 
attrs.emplace_back("quant_round_type"); - attrs.emplace_back("quant_max_bound"); - attrs.emplace_back("quant_min_bound"); - paddle::small_vector outputs {"out", "residual_out", "mean", "variance"}; - return KernelSignature("fused_bias_residual_layernorm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedConv2dAddActOpArgumentMapping: - -return KernelSignature("fused_conv2d_add_act", {"Input", "Filter", "Bias", "ResidualData"}, {"strides", "paddings", "padding_algorithm", "dilations", "groups", "data_format", "activation", "split_channels", "exhaustive_search", "workspace_size_MB", "fuse_alpha"}, {"Output", "Outputs"}); -****************************************************************** -*/ - -KernelSignature FusedConv2dAddActOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "Bias", "ResidualData"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("dilations"); - attrs.emplace_back("groups"); - attrs.emplace_back("data_format"); - attrs.emplace_back("activation"); - attrs.emplace_back("split_channels"); - attrs.emplace_back("exhaustive_search"); - attrs.emplace_back("workspace_size_MB"); - attrs.emplace_back("fuse_alpha"); - paddle::small_vector outputs {"Output", "Outputs"}; - return KernelSignature("fused_conv2d_add_act", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedDconvDreluDbnOpArgumentMapping: - -return KernelSignature("fused_dconv_drelu_dbn", {"grad_output", "weight", "grad_output_add", "residual_input", "bn1_eqscale", "bn1_eqbias", "conv_input", "bn1_mean", "bn1_inv_std", "bn1_gamma", "bn1_beta", "bn1_input", "bn2_mean", "bn2_inv_std", "bn2_gamma", "bn2_beta", "bn2_input"}, {"paddings", "dilations", "strides", "padding_algorithm", "groups", "data_format", "fuse_shortcut", "fuse_dual", "fuse_add", "exhaustive_search"}, {"grad_weight", "grad_bn1_input", "grad_bn1_gamma", "grad_bn1_beta", "grad_bn2_input", "grad_bn2_gamma", "grad_bn2_beta"}); -****************************************************************** -*/ - -KernelSignature FusedDconvDreluDbnOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"grad_output", "weight", "grad_output_add", "residual_input", "bn1_eqscale", "bn1_eqbias", "conv_input", "bn1_mean", "bn1_inv_std", "bn1_gamma", "bn1_beta", "bn1_input", "bn2_mean", "bn2_inv_std", "bn2_gamma", "bn2_beta", "bn2_input"}; - paddle::small_vector attrs; - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - attrs.emplace_back("strides"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("data_format"); - attrs.emplace_back("fuse_shortcut"); - attrs.emplace_back("fuse_dual"); - attrs.emplace_back("fuse_add"); - attrs.emplace_back("exhaustive_search"); - paddle::small_vector outputs {"grad_weight", "grad_bn1_input", "grad_bn1_gamma", "grad_bn1_beta", "grad_bn2_input", "grad_bn2_gamma", "grad_bn2_beta"}; - return KernelSignature("fused_dconv_drelu_dbn", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedDropoutAddOpArgumentMapping: - -return KernelSignature("fused_dropout_add", {"x", "y", "seed_tensor"}, {"p", "is_test", "mode", "seed", "fix_seed"}, {"out", "seed_offset"}); -return KernelSignature("fused_dropout_add", {"x", "y", "seed_tensor"}, {"PTensor", "is_test", "mode", "seed", "fix_seed"}, {"out", "seed_offset"}); -****************************************************************** -*/ - -KernelSignature FusedDropoutAddOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "seed_tensor"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("PTensor") ? "PTensor" : "p"); - attrs.emplace_back("is_test"); - attrs.emplace_back("mode"); - attrs.emplace_back("seed"); - attrs.emplace_back("fix_seed"); - paddle::small_vector outputs {"out", "seed_offset"}; - return KernelSignature("fused_dropout_add", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedEmbeddingEltwiseLayernormOpArgumentMapping: - -return KernelSignature("fused_embedding_eltwise_layernorm", {"Ids", "Embs", "Bias", "Scale"}, {"epsilon"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FusedEmbeddingEltwiseLayernormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Ids", "Embs", "Bias", "Scale"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("fused_embedding_eltwise_layernorm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedFcElementwiseLayernormOpArgumentMapping: - -return KernelSignature("fused_fc_elementwise_layernorm", {"X", "W", "Y", "Bias0", "Scale", "Bias1"}, {"x_num_col_dims", "activation_type", "epsilon", "begin_norm_axis"}, {"Out", "Mean", "Variance"}); -****************************************************************** -*/ - -KernelSignature FusedFcElementwiseLayernormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "W", "Y", "Bias0", "Scale", "Bias1"}; - paddle::small_vector attrs; - attrs.emplace_back("x_num_col_dims"); - attrs.emplace_back("activation_type"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("begin_norm_axis"); - paddle::small_vector outputs {"Out", "Mean", "Variance"}; - return KernelSignature("fused_fc_elementwise_layernorm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedLinearParamGradAddOpArgumentMapping: - -return KernelSignature("fused_linear_param_grad_add", {"x", "dout", "dweight", "dbias"}, {"multi_precision", "has_bias"}, {"dweight_out", "dbias_out"}); -****************************************************************** -*/ - -KernelSignature FusedLinearParamGradAddOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector 
inputs {"x", "dout", "dweight", "dbias"}; - paddle::small_vector attrs; - attrs.emplace_back("multi_precision"); - attrs.emplace_back("has_bias"); - paddle::small_vector outputs {"dweight_out", "dbias_out"}; - return KernelSignature("fused_linear_param_grad_add", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedMultiTransformerInt8XpuOpArgumentMapping: - -return KernelSignature("fused_multi_transformer_int8_xpu", {"x", "ln_scale", "ln_bias", "qkv_in_max", "qkvw", "qkv_bias", "qkv_scales", "out_linear_in_max", "out_linear_w", "out_linear_bias", "out_linear_scales", "ffn_ln_scale", "ffn_ln_bias", "ffn1_in_max", "ffn1_weight", "ffn1_bias", "ffn1_scales", "ffn2_in_max", "ffn2_weight", "ffn2_bias", "ffn2_scales", "cache_kv", "pre_caches", "rotary_pos_emb", "time_step", "seq_lengths", "src_mask", "gather_index", "max_buffer"}, {"pre_layer_norm", "rotary_emb_dims", "epsilon", "dropout_rate", "is_test", "dropout_implementation", "act_method", "trans_qkvw", "ring_id", "gather_axis"}, {"out", "cache_kv_out"}); -****************************************************************** -*/ - -KernelSignature FusedMultiTransformerInt8XpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "ln_scale", "ln_bias", "qkv_in_max", "qkvw", "qkv_bias", "qkv_scales", "out_linear_in_max", "out_linear_w", "out_linear_bias", "out_linear_scales", "ffn_ln_scale", "ffn_ln_bias", "ffn1_in_max", "ffn1_weight", "ffn1_bias", "ffn1_scales", "ffn2_in_max", "ffn2_weight", "ffn2_bias", "ffn2_scales", "cache_kv", "pre_caches", "rotary_pos_emb", "time_step", "seq_lengths", "src_mask", "gather_index", "max_buffer"}; - paddle::small_vector attrs; - attrs.emplace_back("pre_layer_norm"); - attrs.emplace_back("rotary_emb_dims"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("dropout_rate"); - attrs.emplace_back("is_test"); - attrs.emplace_back("dropout_implementation"); - attrs.emplace_back("act_method"); - attrs.emplace_back("trans_qkvw"); - attrs.emplace_back("ring_id"); - attrs.emplace_back("gather_axis"); - paddle::small_vector outputs {"out", "cache_kv_out"}; - return KernelSignature("fused_multi_transformer_int8_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedMultiTransformerXpuOpArgumentMapping: - -return KernelSignature("fused_multi_transformer_xpu", {"x", "ln_scale", "ln_bias", "qkvw", "qkvw_max", "qkv_bias", "out_linear_w", "out_linear_wmax", "out_linear_bias", "ffn_ln_scale", "ffn_ln_bias", "ffn1_weight", "ffn1_weight_max", "ffn1_bias", "ffn2_weight", "ffn2_weight_max", "ffn2_bias", "cache_kv", "pre_caches", "rotary_pos_emb", "time_step", "seq_lengths", "src_mask", "gather_index", "max_buffer"}, {"pre_layer_norm", "rotary_emb_dims", "epsilon", "dropout_rate", "is_test", "dropout_implementation", "act_method", "trans_qkvw", "ring_id", "gather_axis"}, {"out", "cache_kv_out"}); -****************************************************************** -*/ - -KernelSignature FusedMultiTransformerXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "ln_scale", "ln_bias", "qkvw", "qkvw_max", "qkv_bias", "out_linear_w", "out_linear_wmax", 
"out_linear_bias", "ffn_ln_scale", "ffn_ln_bias", "ffn1_weight", "ffn1_weight_max", "ffn1_bias", "ffn2_weight", "ffn2_weight_max", "ffn2_bias", "cache_kv", "pre_caches", "rotary_pos_emb", "time_step", "seq_lengths", "src_mask", "gather_index", "max_buffer"}; - paddle::small_vector attrs; - attrs.emplace_back("pre_layer_norm"); - attrs.emplace_back("rotary_emb_dims"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("dropout_rate"); - attrs.emplace_back("is_test"); - attrs.emplace_back("dropout_implementation"); - attrs.emplace_back("act_method"); - attrs.emplace_back("trans_qkvw"); - attrs.emplace_back("ring_id"); - attrs.emplace_back("gather_axis"); - paddle::small_vector outputs {"out", "cache_kv_out"}; - return KernelSignature("fused_multi_transformer_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedRotaryPositionEmbeddingOpArgumentMapping: - -return KernelSignature("fused_rotary_position_embedding", {"q", "k", "v", "sin", "cos", "position_ids"}, {"use_neox_rotary_style"}, {"out_q", "out_k", "out_v"}); -****************************************************************** -*/ - -KernelSignature FusedRotaryPositionEmbeddingOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"q", "k", "v", "sin", "cos", "position_ids"}; - paddle::small_vector attrs; - attrs.emplace_back("use_neox_rotary_style"); - paddle::small_vector outputs {"out_q", "out_k", "out_v"}; - return KernelSignature("fused_rotary_position_embedding", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedScaleBiasAddReluOpArgumentMapping: - -return KernelSignature("fused_scale_bias_add_relu", {"x1", "scale1", "bias1", "x2", "scale2", "bias2"}, {"fuse_dual", "exhaustive_search"}, {"out"}); -****************************************************************** -*/ - -KernelSignature FusedScaleBiasAddReluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x1", "scale1", "bias1", "x2", "scale2", "bias2"}; - paddle::small_vector attrs; - attrs.emplace_back("fuse_dual"); - attrs.emplace_back("exhaustive_search"); - paddle::small_vector outputs {"out"}; - return KernelSignature("fused_scale_bias_add_relu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedScaleBiasReluConvBnOpArgumentMapping: - -return KernelSignature("fused_scale_bias_relu_conv_bn", {"x", "w", "scale", "bias", "bn_scale", "bn_bias", "input_running_mean", "input_running_var"}, {"paddings", "dilations", "strides", "padding_algorithm", "groups", "data_format", "momentum", "epsilon", "fuse_prologue", "exhaustive_search", "accumulation_count"}, {"out", "out_running_mean", "out_running_var", "saved_mean", "saved_var", "eq_scale", "eq_bias"}); -****************************************************************** -*/ - -KernelSignature FusedScaleBiasReluConvBnOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "w", "scale", "bias", "bn_scale", "bn_bias", "input_running_mean", 
"input_running_var"}; - paddle::small_vector attrs; - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - attrs.emplace_back("strides"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("data_format"); - attrs.emplace_back("momentum"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("fuse_prologue"); - attrs.emplace_back("exhaustive_search"); - attrs.emplace_back("accumulation_count"); - paddle::small_vector outputs {"out", "out_running_mean", "out_running_var", "saved_mean", "saved_var", "eq_scale", "eq_bias"}; - return KernelSignature("fused_scale_bias_relu_conv_bn", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusionGruOpArgumentMapping: - -return KernelSignature("fusion_gru", {"X", "H0", "WeightX", "WeightH", "Bias"}, {"activation", "gate_activation", "is_reverse", "use_seq", "origin_mode", "use_mkldnn", "mkldnn_data_type", "Scale_data", "Shift_data", "Scale_weights", "force_fp32_output"}, {"ReorderedH0", "XX", "BatchedInput", "BatchedOut", "Hidden"}); -****************************************************************** -*/ - -KernelSignature FusionGruOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "H0", "WeightX", "WeightH", "Bias"}; - paddle::small_vector attrs; - attrs.emplace_back("activation"); - attrs.emplace_back("gate_activation"); - attrs.emplace_back("is_reverse"); - attrs.emplace_back("use_seq"); - attrs.emplace_back("origin_mode"); - attrs.emplace_back("use_mkldnn"); - attrs.emplace_back("mkldnn_data_type"); - attrs.emplace_back("Scale_data"); - attrs.emplace_back("Shift_data"); - attrs.emplace_back("Scale_weights"); - attrs.emplace_back("force_fp32_output"); - paddle::small_vector outputs {"ReorderedH0", "XX", "BatchedInput", "BatchedOut", "Hidden"}; - return KernelSignature("fusion_gru", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusionRepeatedFcReluOpArgumentMapping: - -return KernelSignature("fusion_repeated_fc_relu", {"X", "W", "Bias"}, {}, {"ReluOut", "Out"}); -****************************************************************** -*/ - -KernelSignature FusionRepeatedFcReluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "W", "Bias"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"ReluOut", "Out"}; - return KernelSignature("fusion_repeated_fc_relu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusionSeqconvEltaddReluOpArgumentMapping: - -return KernelSignature("fusion_seqconv_eltadd_relu", {"X", "Filter", "Bias"}, {"contextLength", "contextStart", "contextStride"}, {"Out", "ColMat"}); -****************************************************************** -*/ - -KernelSignature FusionSeqconvEltaddReluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Filter", "Bias"}; - paddle::small_vector attrs; - attrs.emplace_back("contextLength"); - attrs.emplace_back("contextStart"); - 
attrs.emplace_back("contextStride"); - paddle::small_vector outputs {"Out", "ColMat"}; - return KernelSignature("fusion_seqconv_eltadd_relu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusionSeqexpandConcatFcOpArgumentMapping: - -return KernelSignature("fusion_seqexpand_concat_fc", {"X", "FCWeight", "FCBias"}, {"fc_activation"}, {"Out", "FCOut"}); -****************************************************************** -*/ - -KernelSignature FusionSeqexpandConcatFcOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "FCWeight", "FCBias"}; - paddle::small_vector attrs; - attrs.emplace_back("fc_activation"); - paddle::small_vector outputs {"Out", "FCOut"}; - return KernelSignature("fusion_seqexpand_concat_fc", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusionSquaredMatSubOpArgumentMapping: - -return KernelSignature("fusion_squared_mat_sub", {"X", "Y"}, {"scalar"}, {"SquaredX", "SquaredY", "SquaredXY", "Out"}); -****************************************************************** -*/ - -KernelSignature FusionSquaredMatSubOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("scalar"); - paddle::small_vector outputs {"SquaredX", "SquaredY", "SquaredXY", "Out"}; - return KernelSignature("fusion_squared_mat_sub", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusionTransposeFlattenConcatOpArgumentMapping: - -return KernelSignature("fusion_transpose_flatten_concat", {"X"}, {"trans_axis", "flatten_axis", "concat_axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FusionTransposeFlattenConcatOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("trans_axis"); - attrs.emplace_back("flatten_axis"); - attrs.emplace_back("concat_axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("fusion_transpose_flatten_concat", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GenerateSequenceXpuOpArgumentMapping: - -return KernelSignature("generate_sequence_xpu", {"x"}, {"dtype"}, {"out"}); -****************************************************************** -*/ - -KernelSignature GenerateSequenceXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"out"}; - return KernelSignature("generate_sequence_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All 
possible KernelSignatures returned by LayerNormActXpuOpArgumentMapping: - -return KernelSignature("layer_norm_act_xpu", {"x", "scale", "bias"}, {"begin_norm_axis", "epsilon", "act_type", "act_param"}, {"out"}); -****************************************************************** -*/ - -KernelSignature LayerNormActXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "scale", "bias"}; - paddle::small_vector attrs; - attrs.emplace_back("begin_norm_axis"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("act_type"); - attrs.emplace_back("act_param"); - paddle::small_vector outputs {"out"}; - return KernelSignature("layer_norm_act_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MultiEncoderXpuOpArgumentMapping: - -return KernelSignature("multi_encoder_xpu", {"x", "fc_weight", "fc_weight_max", "fc_bias", "ln_scale", "ln_bias", "mask", "seq_lod", "max_seq_len"}, {"layer_num", "norm_before", "hidden_dim", "head_num", "size_per_head", "ffn_hidden_dim_scale", "act_type", "relative_type", "slice_idx"}, {"out", "x_fp16", "out_fp16"}); -****************************************************************** -*/ - -KernelSignature MultiEncoderXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "fc_weight", "fc_weight_max", "fc_bias", "ln_scale", "ln_bias", "mask", "seq_lod", "max_seq_len"}; - paddle::small_vector attrs; - attrs.emplace_back("layer_num"); - attrs.emplace_back("norm_before"); - attrs.emplace_back("hidden_dim"); - attrs.emplace_back("head_num"); - attrs.emplace_back("size_per_head"); - attrs.emplace_back("ffn_hidden_dim_scale"); - attrs.emplace_back("act_type"); - attrs.emplace_back("relative_type"); - attrs.emplace_back("slice_idx"); - paddle::small_vector outputs {"out", "x_fp16", "out_fp16"}; - return KernelSignature("multi_encoder_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MultiheadMatmulOpArgumentMapping: - -return KernelSignature("multihead_matmul", {"Input", "W", "Bias", "BiasQK"}, {"transpose_Q", "transpose_K", "transpose_V", "alpha", "head_number"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MultiheadMatmulOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "W", "Bias", "BiasQK"}; - paddle::small_vector attrs; - attrs.emplace_back("transpose_Q"); - attrs.emplace_back("transpose_K"); - attrs.emplace_back("transpose_V"); - attrs.emplace_back("alpha"); - attrs.emplace_back("head_number"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("multihead_matmul", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by QuantizeXpuOpArgumentMapping: - -return KernelSignature("quantize_xpu", {"x"}, {"out_dtype", "scale"}, {"y"}); -****************************************************************** -*/ - -KernelSignature QuantizeXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector 
inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("out_dtype"); - attrs.emplace_back("scale"); - paddle::small_vector outputs {"y"}; - return KernelSignature("quantize_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SelfDpAttentionOpArgumentMapping: - -return KernelSignature("self_dp_attention", {"X"}, {"alpha", "head_number"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SelfDpAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - attrs.emplace_back("head_number"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("self_dp_attention", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SkipLayernormOpArgumentMapping: - -return KernelSignature("skip_layernorm", {"X", "Y", "Scale", "Bias"}, {"epsilon", "begin_norm_axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SkipLayernormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Scale", "Bias"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - attrs.emplace_back("begin_norm_axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("skip_layernorm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SqueezeExcitationBlockOpArgumentMapping: - -return KernelSignature("squeeze_excitation_block", {"x", "filter", "filter_max", "bias", "branch"}, {"act_type", "act_param", "filter_dims"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SqueezeExcitationBlockOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "filter", "filter_max", "bias", "branch"}; - paddle::small_vector attrs; - attrs.emplace_back("act_type"); - attrs.emplace_back("act_param"); - attrs.emplace_back("filter_dims"); - paddle::small_vector outputs {"out"}; - return KernelSignature("squeeze_excitation_block", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by VariableLengthMemoryEfficientAttentionOpArgumentMapping: - -return KernelSignature("variable_length_memory_efficient_attention", {"query", "key", "value", "seq_lens", "kv_seq_lens", "mask"}, {"scale", "causal", "pre_cache_length"}, {"out"}); -****************************************************************** -*/ - -KernelSignature VariableLengthMemoryEfficientAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"query", "key", "value", "seq_lens", "kv_seq_lens", "mask"}; - paddle::small_vector attrs; - attrs.emplace_back("scale"); - attrs.emplace_back("causal"); - 
attrs.emplace_back("pre_cache_length"); - paddle::small_vector outputs {"out"}; - return KernelSignature("variable_length_memory_efficient_attention", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by YoloBoxXpuOpArgumentMapping: - -return KernelSignature("yolo_box_xpu", {"x", "x_max", "grid", "stride", "anchor_grid"}, {"offset"}, {"out", "out_max"}); -****************************************************************** -*/ - -KernelSignature YoloBoxXpuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "x_max", "grid", "stride", "anchor_grid"}; - paddle::small_vector attrs; - attrs.emplace_back("offset"); - paddle::small_vector outputs {"out", "out_max"}; - return KernelSignature("yolo_box_xpu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedBiasDropoutResidualLayerNormGradOpArgumentMapping: - -return KernelSignature("fused_bias_dropout_residual_layer_norm_grad", {"Y@GRAD", "X", "Residual", "Bias", "LnScale", "LnBias", "LnMean", "LnVariance", "BiasDropoutResidualOut", "DropoutMaskOut"}, {"dropout_rate", "is_test", "dropout_fix_seed", "dropout_seed", "dropout_implementation", "ln_epsilon"}, {"X@GRAD", "Residual@GRAD", "Bias@GRAD", "LnScale@GRAD", "LnBias@GRAD"}); -****************************************************************** -*/ - -KernelSignature FusedBiasDropoutResidualLayerNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Y@GRAD", "X", "Residual", "Bias", "LnScale", "LnBias", "LnMean", "LnVariance", "BiasDropoutResidualOut", "DropoutMaskOut"}; - paddle::small_vector attrs; - attrs.emplace_back("dropout_rate"); - attrs.emplace_back("is_test"); - attrs.emplace_back("dropout_fix_seed"); - attrs.emplace_back("dropout_seed"); - attrs.emplace_back("dropout_implementation"); - attrs.emplace_back("ln_epsilon"); - paddle::small_vector outputs {"X@GRAD", "Residual@GRAD", "Bias@GRAD", "LnScale@GRAD", "LnBias@GRAD"}; - return KernelSignature("fused_bias_dropout_residual_layer_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedDropoutAddGradOpArgumentMapping: - -return KernelSignature("fused_dropout_add_grad", {"seed_offset", "out@GRAD"}, {"p", "is_test", "mode", "fix_seed"}, {"x@GRAD", "y@GRAD"}); -return KernelSignature("fused_dropout_add_grad", {"seed_offset", "out@GRAD"}, {"PTensor", "is_test", "mode", "fix_seed"}, {"x@GRAD", "y@GRAD"}); -****************************************************************** -*/ - -KernelSignature FusedDropoutAddGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"seed_offset", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("PTensor") ? 
"PTensor" : "p"); - attrs.emplace_back("is_test"); - attrs.emplace_back("mode"); - attrs.emplace_back("fix_seed"); - paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; - return KernelSignature("fused_dropout_add_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FusedRotaryPositionEmbeddingGradOpArgumentMapping: - -return KernelSignature("fused_rotary_position_embedding_grad", {"sin", "cos", "position_ids", "out_q@GRAD", "out_k@GRAD", "out_v@GRAD"}, {"use_neox_rotary_style"}, {"q@GRAD", "k@GRAD", "v@GRAD"}); -****************************************************************** -*/ - -KernelSignature FusedRotaryPositionEmbeddingGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"sin", "cos", "position_ids", "out_q@GRAD", "out_k@GRAD", "out_v@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("use_neox_rotary_style"); - paddle::small_vector outputs {"q@GRAD", "k@GRAD", "v@GRAD"}; - return KernelSignature("fused_rotary_position_embedding_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(add_act_xpu, phi::AddActXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(add_layernorm_xpu, phi::AddLayernormXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(addcmul_xpu, phi::AddcmulXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(block_multihead_attention, phi::BlockMultiheadAttentionOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bn_act_xpu, phi::BnActXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv1d_xpu, phi::Conv1dXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_xpu, phi::Conv2dTransposeXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv2d_xpu, phi::Conv2dXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(dequantize_xpu, phi::DequantizeXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(embedding_with_eltwise_add_xpu, phi::EmbeddingWithEltwiseAddXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fast_layernorm_xpu, phi::FastLayernormXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fast_where_xpu, phi::FastWhereXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fc, phi::FcOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fc_xpu, phi::FcXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_bias_act, phi::FusedBiasActOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_bias_dropout_residual_layer_norm, phi::FusedBiasDropoutResidualLayerNormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_bias_residual_layernorm, phi::FusedBiasResidualLayernormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_conv2d_add_act, phi::FusedConv2dAddActOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_dconv_drelu_dbn, phi::FusedDconvDreluDbnOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_dropout_add, phi::FusedDropoutAddOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_embedding_eltwise_layernorm, phi::FusedEmbeddingEltwiseLayernormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_fc_elementwise_layernorm, phi::FusedFcElementwiseLayernormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_linear_param_grad_add, phi::FusedLinearParamGradAddOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_multi_transformer_int8_xpu, phi::FusedMultiTransformerInt8XpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_multi_transformer_xpu, 
phi::FusedMultiTransformerXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_rotary_position_embedding, phi::FusedRotaryPositionEmbeddingOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_scale_bias_add_relu, phi::FusedScaleBiasAddReluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_scale_bias_relu_conv_bn, phi::FusedScaleBiasReluConvBnOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fusion_gru, phi::FusionGruOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fusion_repeated_fc_relu, phi::FusionRepeatedFcReluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fusion_seqconv_eltadd_relu, phi::FusionSeqconvEltaddReluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fusion_seqexpand_concat_fc, phi::FusionSeqexpandConcatFcOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fusion_squared_mat_sub, phi::FusionSquaredMatSubOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fusion_transpose_flatten_concat, phi::FusionTransposeFlattenConcatOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(generate_sequence_xpu, phi::GenerateSequenceXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(layer_norm_act_xpu, phi::LayerNormActXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(multi_encoder_xpu, phi::MultiEncoderXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(multihead_matmul, phi::MultiheadMatmulOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(quantize_xpu, phi::QuantizeXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(self_dp_attention, phi::SelfDpAttentionOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(skip_layernorm, phi::SkipLayernormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(squeeze_excitation_block, phi::SqueezeExcitationBlockOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(variable_length_memory_efficient_attention, phi::VariableLengthMemoryEfficientAttentionOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(yolo_box_xpu, phi::YoloBoxXpuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_bias_dropout_residual_layer_norm_grad, phi::FusedBiasDropoutResidualLayerNormGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_dropout_add_grad, phi::FusedDropoutAddGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fused_rotary_position_embedding_grad, phi::FusedRotaryPositionEmbeddingGradOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/generated_sig.cc b/paddle/fluid/operators/ops_signature/generated_sig.cc deleted file mode 100644 index b33adeafd0471..0000000000000 --- a/paddle/fluid/operators/ops_signature/generated_sig.cc +++ /dev/null @@ -1,9755 +0,0 @@ -// this file is generated by paddle/phi/op/yaml/generator/generate_op.py, do not edit. 
-#include "paddle/phi/core/compat/op_utils.h" -#include "paddle/utils/small_vector.h" - -namespace phi { - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AbsOpArgumentMapping: - -return KernelSignature("abs", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AbsOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("abs", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AccuracyOpArgumentMapping: - -return KernelSignature("accuracy", {"Out", "Indices", "Label"}, {}, {"Accuracy", "Correct", "Total"}); -****************************************************************** -*/ - -KernelSignature AccuracyOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Indices", "Label"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Accuracy", "Correct", "Total"}; - return KernelSignature("accuracy", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AcosOpArgumentMapping: - -return KernelSignature("acos", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AcosOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("acos", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AcoshOpArgumentMapping: - -return KernelSignature("acosh", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AcoshOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("acosh", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AdagradOpArgumentMapping: - -return KernelSignature("adagrad", {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}, {"epsilon", "multi_precision"}, {"ParamOut", "MomentOut", "MasterParamOut"}); -return KernelSignature("adagrad_dense_param_sparse_grad", {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}, {"epsilon", "multi_precision"}, {"ParamOut", "MomentOut", "MasterParamOut"}); -****************************************************************** -*/ - -KernelSignature AdagradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}; - paddle::small_vector attrs; - 
attrs.emplace_back("epsilon"); - attrs.emplace_back("multi_precision"); - paddle::small_vector outputs {"ParamOut", "MomentOut", "MasterParamOut"}; - if ( ctx.IsDenseTensorInput("Param") && - ctx.IsDenseTensorInput("Grad") && - ctx.IsDenseTensorInput("Moment") && - ctx.IsDenseTensorInput("LearningRate") && - ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { - return KernelSignature("adagrad", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsDenseTensorInput("Param") && - ctx.IsSelectedRowsInput("Grad") && - ctx.IsDenseTensorInput("Moment") && - ctx.IsDenseTensorInput("LearningRate") && - ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { - return KernelSignature("adagrad_dense_param_sparse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AdamaxOpArgumentMapping: - -return KernelSignature("adamax", {"Param", "Grad", "LearningRate", "Moment", "InfNorm", "Beta1Pow", "MasterParam"}, {"beta1", "beta2", "epsilon", "multi_precision"}, {"ParamOut", "MomentOut", "InfNormOut", "MasterParamOut"}); -****************************************************************** -*/ - -KernelSignature AdamaxOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Param", "Grad", "LearningRate", "Moment", "InfNorm", "Beta1Pow", "MasterParam"}; - paddle::small_vector attrs; - attrs.emplace_back("beta1"); - attrs.emplace_back("beta2"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("multi_precision"); - paddle::small_vector outputs {"ParamOut", "MomentOut", "InfNormOut", "MasterParamOut"}; - return KernelSignature("adamax", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AdamwOpArgumentMapping: - -return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"beta1", "beta2", "epsilon", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"beta1", "beta2", "EpsilonTensor", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"beta1", "Beta2Tensor", "epsilon", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", 
"MasterParam", "SkipUpdate"}, {"beta1", "Beta2Tensor", "EpsilonTensor", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"Beta1Tensor", "beta2", "epsilon", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"Beta1Tensor", "beta2", "EpsilonTensor", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"Beta1Tensor", "Beta2Tensor", "epsilon", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"Beta1Tensor", "Beta2Tensor", "EpsilonTensor", "lr_ratio", "coeff", "with_decay", "lazy_mode", "min_row_size_to_use_multithread", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -****************************************************************** -*/ - -KernelSignature AdamwOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor" : "beta1"); - attrs.emplace_back(ctx.HasInput("Beta2Tensor") ? "Beta2Tensor" : "beta2"); - attrs.emplace_back(ctx.HasInput("EpsilonTensor") ? 
"EpsilonTensor" : "epsilon"); - attrs.emplace_back("lr_ratio"); - attrs.emplace_back("coeff"); - attrs.emplace_back("with_decay"); - attrs.emplace_back("lazy_mode"); - attrs.emplace_back("min_row_size_to_use_multithread"); - attrs.emplace_back("multi_precision"); - attrs.emplace_back("use_global_beta_pow"); - paddle::small_vector outputs {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}; - return KernelSignature("adamw", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AddmmOpArgumentMapping: - -return KernelSignature("addmm", {"Input", "X", "Y"}, {"Beta", "Alpha"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AddmmOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("Beta"); - attrs.emplace_back("Alpha"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("addmm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AffineGridOpArgumentMapping: - -return KernelSignature("affine_grid", {"Theta"}, {"output_shape", "align_corners"}, {"Output"}); -return KernelSignature("affine_grid", {"Theta"}, {"OutputShape", "align_corners"}, {"Output"}); -****************************************************************** -*/ - -KernelSignature AffineGridOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Theta"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("OutputShape") - ? "OutputShape" - : "output_shape"); - - attrs.emplace_back("align_corners"); - paddle::small_vector outputs {"Output"}; - return KernelSignature("affine_grid", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AllcloseOpArgumentMapping: - -return KernelSignature("allclose", {"Input", "Other"}, {"rtol", "atol", "equal_nan"}, {"Out"}); -return KernelSignature("allclose", {"Input", "Other"}, {"rtol", "Atol", "equal_nan"}, {"Out"}); -return KernelSignature("allclose", {"Input", "Other"}, {"Rtol", "atol", "equal_nan"}, {"Out"}); -return KernelSignature("allclose", {"Input", "Other"}, {"Rtol", "Atol", "equal_nan"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AllcloseOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Other"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("Rtol") ? "Rtol" : "rtol"); - attrs.emplace_back(ctx.HasInput("Atol") ? 
"Atol" : "atol"); - attrs.emplace_back("equal_nan"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("allclose", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AngleOpArgumentMapping: - -return KernelSignature("angle", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AngleOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("angle", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ArgmaxOpArgumentMapping: - -return KernelSignature("argmax", {"X"}, {"axis", "keepdims", "flatten", "dtype"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ArgMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("keepdims"); - attrs.emplace_back("flatten"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("argmax", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ArgminOpArgumentMapping: - -return KernelSignature("argmin", {"X"}, {"axis", "keepdims", "flatten", "dtype"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ArgMinOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("keepdims"); - attrs.emplace_back("flatten"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("argmin", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ArgsortOpArgumentMapping: - -return KernelSignature("argsort", {"X"}, {"axis", "descending"}, {"Out", "Indices"}); -****************************************************************** -*/ - -KernelSignature ArgsortOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("descending"); - paddle::small_vector outputs {"Out", "Indices"}; - return KernelSignature("argsort", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AsComplexOpArgumentMapping: - -return KernelSignature("as_complex", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AsComplexOpArgumentMapping(const ArgumentMappingContext& ctx) { - 
paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("as_complex", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AsRealOpArgumentMapping: - -return KernelSignature("as_real", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AsRealOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("as_real", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AsStridedOpArgumentMapping: - -return KernelSignature("as_strided", {"input"}, {"dims", "stride", "offset"}, {"out"}); -****************************************************************** -*/ - -KernelSignature AsStridedOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input"}; - paddle::small_vector attrs; - attrs.emplace_back("dims"); - attrs.emplace_back("stride"); - attrs.emplace_back("offset"); - paddle::small_vector outputs {"out"}; - return KernelSignature("as_strided", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AsinOpArgumentMapping: - -return KernelSignature("asin", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AsinOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("asin", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AsinhOpArgumentMapping: - -return KernelSignature("asinh", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AsinhOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("asinh", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AtanOpArgumentMapping: - -return KernelSignature("atan", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AtanOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("atan", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 
'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Atan2OpArgumentMapping: - -return KernelSignature("atan2", {"X1", "X2"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature Atan2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X1", "X2"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("atan2", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AtanhOpArgumentMapping: - -return KernelSignature("atanh", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature AtanhOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("atanh", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AucOpArgumentMapping: - -return KernelSignature("auc", {"Predict", "Label", "StatPos", "StatNeg", "InsTagWeight"}, {"curve", "num_thresholds", "slide_steps"}, {"AUC", "StatPosOut", "StatNegOut"}); -****************************************************************** -*/ - -KernelSignature AucOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Predict", "Label", "StatPos", "StatNeg", "InsTagWeight"}; - paddle::small_vector attrs; - attrs.emplace_back("curve"); - attrs.emplace_back("num_thresholds"); - attrs.emplace_back("slide_steps"); - paddle::small_vector outputs {"AUC", "StatPosOut", "StatNegOut"}; - return KernelSignature("auc", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AverageAccumulatesOpArgumentMapping: - -return KernelSignature("average_accumulates", {"param", "in_sum_1", "in_sum_2", "in_sum_3", "in_num_accumulates", "in_old_num_accumulates", "in_num_updates"}, {"average_window", "max_average_window", "min_average_window"}, {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", "out_old_num_accumulates", "out_num_updates"}); -****************************************************************** -*/ - -KernelSignature AverageAccumulatesOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"param", "in_sum_1", "in_sum_2", "in_sum_3", "in_num_accumulates", "in_old_num_accumulates", "in_num_updates"}; - paddle::small_vector attrs; - attrs.emplace_back("average_window"); - attrs.emplace_back("max_average_window"); - attrs.emplace_back("min_average_window"); - paddle::small_vector outputs {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", "out_old_num_accumulates", "out_num_updates"}; - return KernelSignature("average_accumulates", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BceLossOpArgumentMapping: - 
-return KernelSignature("bce_loss", {"X", "Label"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BceLossOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Label"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("bce_loss", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BernoulliOpArgumentMapping: - -return KernelSignature("bernoulli", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BernoulliOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("bernoulli", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BicubicInterpOpArgumentMapping: - -return KernelSignature("bicubic_interp", {"X", "OutSize", "SizeTensor", "Scale"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BicubicInterpV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale"}; - paddle::small_vector attrs; - attrs.emplace_back("data_layout"); - attrs.emplace_back("out_d"); - attrs.emplace_back("out_h"); - attrs.emplace_back("out_w"); - attrs.emplace_back("scale"); - attrs.emplace_back("interp_method"); - attrs.emplace_back("align_corners"); - attrs.emplace_back("align_mode"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("bicubic_interp", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BilinearOpArgumentMapping: - -return KernelSignature("bilinear", {"X", "Y", "Weight", "Bias"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BilinearTensorProductOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Weight", "Bias"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("bilinear", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BilinearInterpOpArgumentMapping: - -return KernelSignature("bilinear_interp", {"X", "OutSize", "SizeTensor", "Scale"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BilinearInterpV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale"}; - paddle::small_vector attrs; - 
attrs.emplace_back("data_layout"); - attrs.emplace_back("out_d"); - attrs.emplace_back("out_h"); - attrs.emplace_back("out_w"); - attrs.emplace_back("scale"); - attrs.emplace_back("interp_method"); - attrs.emplace_back("align_corners"); - attrs.emplace_back("align_mode"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("bilinear_interp", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BincountOpArgumentMapping: - -return KernelSignature("bincount", {"X", "Weights"}, {"minlength"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BincountOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Weights"}; - paddle::small_vector attrs; - attrs.emplace_back("minlength"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("bincount", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BitwiseAndOpArgumentMapping: - -return KernelSignature("bitwise_and", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BitwiseAndOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("bitwise_and", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BitwiseNotOpArgumentMapping: - -return KernelSignature("bitwise_not", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BitwiseNotOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("bitwise_not", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BitwiseOrOpArgumentMapping: - -return KernelSignature("bitwise_or", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BitwiseOrOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("bitwise_or", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BitwiseXorOpArgumentMapping: - -return KernelSignature("bitwise_xor", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BitwiseXorOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector 
attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("bitwise_xor", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BmmOpArgumentMapping: - -return KernelSignature("bmm", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BmmOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("bmm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BoxCoderOpArgumentMapping: - -return KernelSignature("box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}, {"code_type", "box_normalized", "axis", "variance"}, {"OutputBox"}); -****************************************************************** -*/ - -KernelSignature BoxCoderOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"PriorBox", "PriorBoxVar", "TargetBox"}; - paddle::small_vector attrs; - attrs.emplace_back("code_type"); - attrs.emplace_back("box_normalized"); - attrs.emplace_back("axis"); - attrs.emplace_back("variance"); - paddle::small_vector outputs {"OutputBox"}; - return KernelSignature("box_coder", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BroadcastTensorsOpArgumentMapping: - -return KernelSignature("broadcast_tensors", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BroadcastTensorsOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("broadcast_tensors", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CeilOpArgumentMapping: - -return KernelSignature("ceil", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature CeilOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("ceil", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CeluOpArgumentMapping: - -return KernelSignature("celu", {"X"}, {"alpha"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature CeluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("celu", 
std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CheckFiniteAndUnscaleOpArgumentMapping: - -return KernelSignature("check_finite_and_unscale", {"X", "Scale"}, {}, {"Out", "FoundInfinite"}); -****************************************************************** -*/ - -KernelSignature CheckFiniteAndUnscaleOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Scale"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out", "FoundInfinite"}; - return KernelSignature("check_finite_and_unscale", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CheckNumericsOpArgumentMapping: - -return KernelSignature("check_numerics", {"tensor"}, {"op_type", "var_name", "check_nan_inf_level", "stack_height_limit", "output_dir"}, {"stats", "values"}); -****************************************************************** -*/ - -KernelSignature CheckNumericsOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"tensor"}; - paddle::small_vector attrs; - attrs.emplace_back("op_type"); - attrs.emplace_back("var_name"); - attrs.emplace_back("check_nan_inf_level"); - attrs.emplace_back("stack_height_limit"); - attrs.emplace_back("output_dir"); - paddle::small_vector outputs {"stats", "values"}; - return KernelSignature("check_numerics", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CholeskyOpArgumentMapping: - -return KernelSignature("cholesky", {"X"}, {"upper"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature CholeskyOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("upper"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("cholesky", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CholeskySolveOpArgumentMapping: - -return KernelSignature("cholesky_solve", {"X", "Y"}, {"upper"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature CholeskySolveOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("upper"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("cholesky_solve", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ClassCenterSampleOpArgumentMapping: - -return KernelSignature("class_center_sample", {"Label"}, {"num_classes", "num_samples", "ring_id", "rank", "nranks", "fix_seed", "seed"}, {"RemappedLabel", "SampledLocalClassCenter"}); 
-****************************************************************** -*/ - -KernelSignature ClassCenterSampleOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Label"}; - paddle::small_vector attrs; - attrs.emplace_back("num_classes"); - attrs.emplace_back("num_samples"); - attrs.emplace_back("ring_id"); - attrs.emplace_back("rank"); - attrs.emplace_back("nranks"); - attrs.emplace_back("fix_seed"); - attrs.emplace_back("seed"); - paddle::small_vector outputs {"RemappedLabel", "SampledLocalClassCenter"}; - return KernelSignature("class_center_sample", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ClipOpArgumentMapping: - -return KernelSignature("clip", {"X"}, {"min", "max"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ClipOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("Min") ? "Min" : "min"); - attrs.emplace_back(ctx.HasInput("Max") ? "Max" : "max"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("clip", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ClipByNormOpArgumentMapping: - -return KernelSignature("clip_by_norm", {"X"}, {"max_norm"}, {"Out"}); -return KernelSignature("clip_by_norm_sr", {"X"}, {"max_norm"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ClipByNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("max_norm"); - paddle::small_vector outputs {"Out"}; - if ( ctx.IsDenseTensorInput("X")) { - return KernelSignature("clip_by_norm", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsSelectedRowsInput("X")) { - return KernelSignature("clip_by_norm_sr", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CoalesceTensorOpArgumentMapping: - -return KernelSignature("coalesce_tensor", {"Input"}, {"dtype", "copy_data", "set_constant", "persist_output", "constant", "use_align", "align_size", "user_defined_size_of_dtype", "concated_shapes", "concated_ranks"}, {"Output", "FusedOutput"}); -****************************************************************** -*/ - -KernelSignature CoalesceTensorOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input"}; - paddle::small_vector attrs; - attrs.emplace_back("dtype"); - attrs.emplace_back("copy_data"); - attrs.emplace_back("set_constant"); - attrs.emplace_back("persist_output"); - attrs.emplace_back("constant"); - attrs.emplace_back("use_align"); - attrs.emplace_back("align_size"); - attrs.emplace_back("user_defined_size_of_dtype"); - attrs.emplace_back("concated_shapes"); - attrs.emplace_back("concated_ranks"); - paddle::small_vector outputs {"Output", "FusedOutput"}; - 
return KernelSignature("coalesce_tensor", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ComplexOpArgumentMapping: - -return KernelSignature("complex", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ComplexOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("complex", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ConcatOpArgumentMapping: - -return KernelSignature("concat", {"X"}, {"axis"}, {"Out"}); -return KernelSignature("concat", {"X"}, {"AxisTensor"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ConcatOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("AxisTensor") ? "AxisTensor" : "axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("concat", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ConjOpArgumentMapping: - -return KernelSignature("conj", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ConjOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("conj", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv2dOpArgumentMapping: - -return KernelSignature("conv2d", {"Input", "Filter"}, {"strides", "paddings", "padding_algorithm", "dilations", "groups", "data_format"}, {"Output"}); -****************************************************************** -*/ - -KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("dilations"); - attrs.emplace_back("groups"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Output"}; - return KernelSignature("conv2d", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv3dOpArgumentMapping: - -return KernelSignature("conv3d", {"Input", "Filter"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); -****************************************************************** -*/ - -KernelSignature 
Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Output"}; - return KernelSignature("conv3d", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv3dTransposeOpArgumentMapping: - -return KernelSignature("conv3d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); -****************************************************************** -*/ - -KernelSignature Conv3dTransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_padding"); - attrs.emplace_back("output_size"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Output"}; - return KernelSignature("conv3d_transpose", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CosOpArgumentMapping: - -return KernelSignature("cos", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature CosOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("cos", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CoshOpArgumentMapping: - -return KernelSignature("cosh", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature CoshOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("cosh", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CropOpArgumentMapping: - -return KernelSignature("crop", {"X"}, {"shape", "offsets"}, {"Out"}); -return KernelSignature("crop", {"X"}, {"shape", "Offsets"}, {"Out"}); -return KernelSignature("crop", {"X"}, {"shape", "OffsetsTensor"}, {"Out"}); -return KernelSignature("crop", {"X"}, {"Shape", "offsets"}, {"Out"}); -return KernelSignature("crop", {"X"}, {"Shape", "Offsets"}, {"Out"}); -return KernelSignature("crop", {"X"}, {"Shape", "OffsetsTensor"}, {"Out"}); -return KernelSignature("crop", {"X"}, {"ShapeTensor", "offsets"}, {"Out"}); -return 
KernelSignature("crop", {"X"}, {"ShapeTensor", "Offsets"}, {"Out"}); -return KernelSignature("crop", {"X"}, {"ShapeTensor", "OffsetsTensor"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature CropTensorOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("Shape") - ? "Shape" - : ctx.InputSize("ShapeTensor") > 0 - ? "ShapeTensor" - : "shape"); - attrs.emplace_back( - ctx.HasInput("Offsets") - ? "Offsets" - : ctx.InputSize("OffsetsTensor") > 0 - ? "OffsetsTensor" - : "offsets"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("crop", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CrossOpArgumentMapping: - -return KernelSignature("cross", {"X", "Y"}, {"dim"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature CrossOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("cross", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CrossEntropyWithSoftmaxOpArgumentMapping: - -return KernelSignature("cross_entropy_with_softmax", {"Logits", "Label"}, {"soft_label", "use_softmax", "numeric_stable_mode", "ignore_index", "axis"}, {"Softmax", "Loss"}); -****************************************************************** -*/ - -KernelSignature SoftmaxWithCrossEntropyOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Logits", "Label"}; - paddle::small_vector attrs; - attrs.emplace_back("soft_label"); - attrs.emplace_back("use_softmax"); - attrs.emplace_back("numeric_stable_mode"); - attrs.emplace_back("ignore_index"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Softmax", "Loss"}; - return KernelSignature("cross_entropy_with_softmax", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CummaxOpArgumentMapping: - -return KernelSignature("cummax", {"x"}, {"axis", "dtype"}, {"out", "indices"}); -****************************************************************** -*/ - -KernelSignature CummaxOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"out", "indices"}; - return KernelSignature("cummax", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CumminOpArgumentMapping: - -return KernelSignature("cummin", {"x"}, {"axis", "dtype"}, {"out", "indices"}); -****************************************************************** -*/ - -KernelSignature 
CumminOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"out", "indices"}; - return KernelSignature("cummin", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CumprodOpArgumentMapping: - -return KernelSignature("cumprod", {"X"}, {"dim"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature CumprodOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("cumprod", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CumsumOpArgumentMapping: - -return KernelSignature("cumsum", {"X"}, {"axis", "flatten", "exclusive", "reverse"}, {"Out"}); -return KernelSignature("cumsum", {"X"}, {"AxisTensor", "flatten", "exclusive", "reverse"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature CumsumOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("flatten"); - attrs.emplace_back("exclusive"); - attrs.emplace_back("reverse"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("cumsum", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DataOpArgumentMapping: - -return KernelSignature("data", {}, {"name", "shape", "dtype", "place"}, {"out"}); -return KernelSignature("data", {}, {"name", "ShapeTensor", "dtype", "place"}, {"out"}); -return KernelSignature("data", {}, {"name", "ShapeTensorList", "dtype", "place"}, {"out"}); -****************************************************************** -*/ - -KernelSignature DataOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {}; - paddle::small_vector attrs; - attrs.emplace_back("name"); - attrs.emplace_back( - ctx.HasInput("ShapeTensor") - ? "ShapeTensor" - : ctx.InputSize("ShapeTensorList") > 0 - ? 
"ShapeTensorList" - : "shape"); - attrs.emplace_back("dtype"); - - paddle::small_vector outputs {"out"}; - return KernelSignature("data", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DepthwiseConv2dOpArgumentMapping: - -return KernelSignature("depthwise_conv2d", {"Input", "Filter"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); -****************************************************************** -*/ - -KernelSignature DepthwiseConv2dOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Output"}; - return KernelSignature("depthwise_conv2d", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DetOpArgumentMapping: - -return KernelSignature("determinant", {"Input"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature DeterminantOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("determinant", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DiagOpArgumentMapping: - -return KernelSignature("diag", {"X"}, {"offset", "padding_value"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature DiagV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("offset"); - attrs.emplace_back("padding_value"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("diag", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DiagEmbedOpArgumentMapping: - -return KernelSignature("diag_embed", {"Input"}, {"offset", "dim1", "dim2"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature DiagEmbedOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input"}; - paddle::small_vector attrs; - attrs.emplace_back("offset"); - attrs.emplace_back("dim1"); - attrs.emplace_back("dim2"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("diag_embed", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DiagonalOpArgumentMapping: - -return KernelSignature("diagonal", 
{"Input"}, {"offset", "axis1", "axis2"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature DiagonalOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input"}; - paddle::small_vector attrs; - attrs.emplace_back("offset"); - attrs.emplace_back("axis1"); - attrs.emplace_back("axis2"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("diagonal", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DigammaOpArgumentMapping: - -return KernelSignature("digamma", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature DigammaOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("digamma", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DirichletOpArgumentMapping: - -return KernelSignature("dirichlet", {"Alpha"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature DirichletOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Alpha"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("dirichlet", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DistOpArgumentMapping: - -return KernelSignature("dist", {"X", "Y"}, {"p"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature DistOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("p"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("dist", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DotOpArgumentMapping: - -return KernelSignature("dot", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature DotOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("dot", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EditDistanceOpArgumentMapping: - -return KernelSignature("edit_distance", {"Hyps", "Refs", "HypsLength", "RefsLength"}, {"normalized"}, {"SequenceNum", "Out"}); -****************************************************************** -*/ - -KernelSignature EditDistanceOpArgumentMapping(const ArgumentMappingContext& ctx) { 
- paddle::small_vector inputs {"Hyps", "Refs", "HypsLength", "RefsLength"}; - paddle::small_vector attrs; - attrs.emplace_back("normalized"); - paddle::small_vector outputs {"SequenceNum", "Out"}; - return KernelSignature("edit_distance", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EigOpArgumentMapping: - -return KernelSignature("eig", {"X"}, {}, {"Eigenvalues", "Eigenvectors"}); -****************************************************************** -*/ - -KernelSignature EigOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Eigenvalues", "Eigenvectors"}; - return KernelSignature("eig", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EighOpArgumentMapping: - -return KernelSignature("eigh", {"X"}, {"UPLO"}, {"Eigenvalues", "Eigenvectors"}); -****************************************************************** -*/ - -KernelSignature EighOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("UPLO"); - paddle::small_vector outputs {"Eigenvalues", "Eigenvectors"}; - return KernelSignature("eigh", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EigvalsOpArgumentMapping: - -return KernelSignature("eigvals", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature EigvalsOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("eigvals", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EigvalshOpArgumentMapping: - -return KernelSignature("eigvalsh", {"X"}, {"UPLO", "is_test"}, {"Eigenvalues", "Eigenvectors"}); -****************************************************************** -*/ - -KernelSignature EigvalshOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("UPLO"); - attrs.emplace_back("is_test"); - paddle::small_vector outputs {"Eigenvalues", "Eigenvectors"}; - return KernelSignature("eigvalsh", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EluOpArgumentMapping: - -return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); 
- paddle::small_vector outputs {"Out"}; - return KernelSignature("elu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EqualAllOpArgumentMapping: - -return KernelSignature("equal_all", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature EqualAllOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("equal_all", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ErfOpArgumentMapping: - -return KernelSignature("erf", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ErfOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("erf", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ErfinvOpArgumentMapping: - -return KernelSignature("erfinv", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ErfinvOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("erfinv", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ExpOpArgumentMapping: - -return KernelSignature("exp", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ExpOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("exp", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ExpandAsOpArgumentMapping: - -return KernelSignature("expand_as", {"X", "Y"}, {"target_shape"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ExpandAsV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("target_shape"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("expand_as", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Expm1OpArgumentMapping: - -return 
KernelSignature("expm1", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature Expm1OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("expm1", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FftC2cOpArgumentMapping: - -return KernelSignature("fft_c2c", {"X"}, {"axes", "normalization", "forward"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FftC2cOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axes"); - attrs.emplace_back("normalization"); - attrs.emplace_back("forward"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("fft_c2c", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FftC2rOpArgumentMapping: - -return KernelSignature("fft_c2r", {"X"}, {"axes", "normalization", "forward", "last_dim_size"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FftC2rOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axes"); - attrs.emplace_back("normalization"); - attrs.emplace_back("forward"); - attrs.emplace_back("last_dim_size"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("fft_c2r", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FftR2cOpArgumentMapping: - -return KernelSignature("fft_r2c", {"X"}, {"axes", "normalization", "forward", "onesided"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FftR2cOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axes"); - attrs.emplace_back("normalization"); - attrs.emplace_back("forward"); - attrs.emplace_back("onesided"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("fft_r2c", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FillOpArgumentMapping: - -return KernelSignature("fill", {"X"}, {"value"}, {"Out"}); -return KernelSignature("fill", {"X"}, {"ValueTensor"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FillAnyOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("value"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("fill", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FillDiagonalOpArgumentMapping: - -return KernelSignature("fill_diagonal", {"X"}, {"value", "offset", "wrap"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FillDiagonalOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("value"); - attrs.emplace_back("offset"); - attrs.emplace_back("wrap"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("fill_diagonal", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FillDiagonalTensorOpArgumentMapping: - -return KernelSignature("fill_diagonal_tensor", {"X", "Y"}, {"offset", "dim1", "dim2"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FillDiagonalTensorOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("offset"); - attrs.emplace_back("dim1"); - attrs.emplace_back("dim2"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("fill_diagonal_tensor", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FlashAttnOpArgumentMapping: - -return KernelSignature("flash_attn", {"q", "k", "v", "fixed_seed_offset", "attn_mask"}, {"dropout", "causal", "return_softmax", "is_test", "rng_name"}, {"out", "softmax", "softmax_lse", "seed_offset"}); -****************************************************************** -*/ - -KernelSignature FlashAttnOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"q", "k", "v", "fixed_seed_offset", "attn_mask"}; - paddle::small_vector attrs; - attrs.emplace_back("dropout"); - attrs.emplace_back("causal"); - attrs.emplace_back("return_softmax"); - attrs.emplace_back("is_test"); - attrs.emplace_back("rng_name"); - paddle::small_vector outputs {"out", "softmax", "softmax_lse", "seed_offset"}; - return KernelSignature("flash_attn", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FlashAttnUnpaddedOpArgumentMapping: - -return KernelSignature("flash_attn_unpadded", {"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k", "fixed_seed_offset", "attn_mask"}, {"max_seqlen_q", "max_seqlen_k", "scale", "dropout", "causal", "return_softmax", "is_test", "rng_name"}, {"out", "softmax", "softmax_lse", "seed_offset"}); -****************************************************************** -*/ - -KernelSignature FlashAttnUnpaddedOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k", "fixed_seed_offset", "attn_mask"}; - paddle::small_vector attrs; - attrs.emplace_back("max_seqlen_q"); - attrs.emplace_back("max_seqlen_k"); - attrs.emplace_back("scale"); - attrs.emplace_back("dropout"); - 
attrs.emplace_back("causal"); - attrs.emplace_back("return_softmax"); - attrs.emplace_back("is_test"); - attrs.emplace_back("rng_name"); - paddle::small_vector outputs {"out", "softmax", "softmax_lse", "seed_offset"}; - return KernelSignature("flash_attn_unpadded", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FlipOpArgumentMapping: - -return KernelSignature("flip", {"X"}, {"axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FlipOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("flip", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FloorOpArgumentMapping: - -return KernelSignature("floor", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FloorOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("floor", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FoldOpArgumentMapping: - -return KernelSignature("fold", {"X"}, {"output_sizes", "kernel_sizes", "strides", "paddings", "dilations"}, {"Y"}); -****************************************************************** -*/ - -KernelSignature FoldOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("output_sizes"); - attrs.emplace_back("kernel_sizes"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - paddle::small_vector outputs {"Y"}; - return KernelSignature("fold", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FrameOpArgumentMapping: - -return KernelSignature("frame", {"X"}, {"frame_length", "hop_length", "axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FrameOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("frame_length"); - attrs.emplace_back("hop_length"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("frame", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FullIntArrayOpArgumentMapping: - -return KernelSignature("full_int_array", {}, {"value", "dtype", "place"}, {"out"}); 
-****************************************************************** -*/ - -KernelSignature FullIntArrayOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {}; - paddle::small_vector attrs; - attrs.emplace_back("value"); - attrs.emplace_back("dtype"); - - paddle::small_vector outputs {"out"}; - return KernelSignature("full_int_array", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GatherOpArgumentMapping: - -return KernelSignature("gather", {"X", "Index"}, {"axis"}, {"Out"}); -return KernelSignature("gather", {"X", "Index"}, {"Axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature GatherOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Index"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("Axis") ? "Axis" : "axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("gather", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GatherNdOpArgumentMapping: - -return KernelSignature("gather_nd", {"X", "Index"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature GatherNdOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Index"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("gather_nd", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GatherTreeOpArgumentMapping: - -return KernelSignature("gather_tree", {"Ids", "Parents"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature GatherTreeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Ids", "Parents"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("gather_tree", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GaussianInplaceOpArgumentMapping: - -return KernelSignature("gaussian_inplace", {"x"}, {"mean", "std", "seed"}, {"out"}); -****************************************************************** -*/ - -KernelSignature GaussianInplaceOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("mean"); - attrs.emplace_back("std"); - attrs.emplace_back("seed"); - paddle::small_vector outputs {"out"}; - return KernelSignature("gaussian_inplace", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GeluOpArgumentMapping: - -return KernelSignature("gelu", {"X"}, 
{"approximate"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature GeluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("approximate"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("gelu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GenerateProposalsOpArgumentMapping: - -return KernelSignature("generate_proposals", {"Scores", "BboxDeltas", "ImShape", "Anchors", "Variances"}, {"pre_nms_topN", "post_nms_topN", "nms_thresh", "min_size", "eta", "pixel_offset"}, {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}); -****************************************************************** -*/ - -KernelSignature GenerateProposalsV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Scores", "BboxDeltas", "ImShape", "Anchors", "Variances"}; - paddle::small_vector attrs; - attrs.emplace_back("pre_nms_topN"); - attrs.emplace_back("post_nms_topN"); - attrs.emplace_back("nms_thresh"); - attrs.emplace_back("min_size"); - attrs.emplace_back("eta"); - attrs.emplace_back("pixel_offset"); - paddle::small_vector outputs {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}; - return KernelSignature("generate_proposals", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GridSampleOpArgumentMapping: - -return KernelSignature("grid_sample", {"X", "Grid"}, {"mode", "padding_mode", "align_corners"}, {"Output"}); -****************************************************************** -*/ - -KernelSignature GridSamplerOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Grid"}; - paddle::small_vector attrs; - attrs.emplace_back("mode"); - attrs.emplace_back("padding_mode"); - attrs.emplace_back("align_corners"); - paddle::small_vector outputs {"Output"}; - return KernelSignature("grid_sample", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GroupNormOpArgumentMapping: - -return KernelSignature("group_norm", {"X", "Scale", "Bias"}, {"epsilon", "groups", "data_layout"}, {"Y", "Mean", "Variance"}); -****************************************************************** -*/ - -KernelSignature GroupNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Scale", "Bias"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - attrs.emplace_back("groups"); - attrs.emplace_back("data_layout"); - paddle::small_vector outputs {"Y", "Mean", "Variance"}; - return KernelSignature("group_norm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GumbelSoftmaxOpArgumentMapping: - -return KernelSignature("gumbel_softmax", {"X"}, {"temperature", "hard", "axis"}, {"Out"}); 
-****************************************************************** -*/ - -KernelSignature GumbelSoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("temperature"); - attrs.emplace_back("hard"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("gumbel_softmax", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HardshrinkOpArgumentMapping: - -return KernelSignature("hard_shrink", {"X"}, {"threshold"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature HardShrinkOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("threshold"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("hard_shrink", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HardsigmoidOpArgumentMapping: - -return KernelSignature("hardsigmoid", {"X"}, {"slope", "offset"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature HardSigmoidOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("slope"); - attrs.emplace_back("offset"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("hardsigmoid", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HardtanhOpArgumentMapping: - -return KernelSignature("hardtanh", {"X"}, {"t_min", "t_max"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature BreluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("t_min"); - attrs.emplace_back("t_max"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("hardtanh", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HeavisideOpArgumentMapping: - -return KernelSignature("heaviside", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ElementwiseHeavisideOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("heaviside", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HistogramOpArgumentMapping: - -return KernelSignature("histogram", {"X"}, {"bins", "min", "max"}, {"Out"}); 
-****************************************************************** -*/ - -KernelSignature HistogramOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("bins"); - attrs.emplace_back("min"); - attrs.emplace_back("max"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("histogram", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HuberLossOpArgumentMapping: - -return KernelSignature("huber_loss", {"X", "Y"}, {"delta"}, {"Out", "Residual"}); -****************************************************************** -*/ - -KernelSignature HuberLossOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("delta"); - paddle::small_vector outputs {"Out", "Residual"}; - return KernelSignature("huber_loss", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by I0OpArgumentMapping: - -return KernelSignature("i0", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature I0OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - return KernelSignature("i0", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by I0eOpArgumentMapping: - -return KernelSignature("i0e", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature I0eOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - return KernelSignature("i0e", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by I1OpArgumentMapping: - -return KernelSignature("i1", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature I1OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - return KernelSignature("i1", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by I1eOpArgumentMapping: - -return KernelSignature("i1e", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature I1eOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - return KernelSignature("i1e", std::move(inputs), 
std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ImagOpArgumentMapping: - -return KernelSignature("imag", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ImagOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("imag", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IndexAddOpArgumentMapping: - -return KernelSignature("index_add", {"X", "Index", "AddValue"}, {"axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature IndexAddOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Index", "AddValue"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("index_add", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IndexPutOpArgumentMapping: - -return KernelSignature("index_put", {"x", "indices", "value"}, {"accumulate"}, {"out"}); -****************************************************************** -*/ - -KernelSignature IndexPutOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "indices", "value"}; - paddle::small_vector attrs; - attrs.emplace_back("accumulate"); - paddle::small_vector outputs {"out"}; - return KernelSignature("index_put", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IndexSampleOpArgumentMapping: - -return KernelSignature("index_sample", {"X", "Index"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature IndexSampleOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Index"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("index_sample", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IndexSelectOpArgumentMapping: - -return KernelSignature("index_select", {"X", "Index"}, {"dim"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature IndexSelectOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Index"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("index_select", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 
'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IndexSelectStridedOpArgumentMapping: - -return KernelSignature("index_select_strided", {"x"}, {"index", "axis"}, {"out"}); -****************************************************************** -*/ - -KernelSignature IndexSelectStridedOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("index"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"out"}; - return KernelSignature("index_select_strided", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by InstanceNormOpArgumentMapping: - -return KernelSignature("instance_norm", {"X", "Scale", "Bias"}, {"epsilon"}, {"Y", "SavedMean", "SavedVariance"}); -****************************************************************** -*/ - -KernelSignature InstanceNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Scale", "Bias"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - paddle::small_vector outputs {"Y", "SavedMean", "SavedVariance"}; - return KernelSignature("instance_norm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by InverseOpArgumentMapping: - -return KernelSignature("inverse", {"Input"}, {}, {"Output"}); -****************************************************************** -*/ - -KernelSignature InverseOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Output"}; - return KernelSignature("inverse", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IsEmptyOpArgumentMapping: - -return KernelSignature("is_empty", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature IsEmptyOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("is_empty", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IscloseOpArgumentMapping: - -return KernelSignature("isclose", {"Input", "Other"}, {"rtol", "atol", "equal_nan"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature IscloseOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Other"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("Rtol") ? "Rtol" : "rtol"); - attrs.emplace_back(ctx.HasInput("Atol") ? 
"Atol" : "atol"); - attrs.emplace_back("equal_nan"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("isclose", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IsfiniteOpArgumentMapping: - -return KernelSignature("isfinite", {"X"}, {}, {"Out"}); -return KernelSignature("isfinite_sr", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature IsfiniteV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - if ( ctx.IsDenseTensorInput("X")) { - return KernelSignature("isfinite", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsSelectedRowsInput("X")) { - return KernelSignature("isfinite_sr", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IsinfOpArgumentMapping: - -return KernelSignature("isinf", {"X"}, {}, {"Out"}); -return KernelSignature("isinf_sr", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature IsinfV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - if ( ctx.IsDenseTensorInput("X")) { - return KernelSignature("isinf", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsSelectedRowsInput("X")) { - return KernelSignature("isinf_sr", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IsnanOpArgumentMapping: - -return KernelSignature("isnan", {"X"}, {}, {"Out"}); -return KernelSignature("isnan_sr", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature IsnanV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - if ( ctx.IsDenseTensorInput("X")) { - return KernelSignature("isnan", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsSelectedRowsInput("X")) { - return KernelSignature("isnan_sr", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by KldivLossOpArgumentMapping: - -return KernelSignature("kldiv_loss", {"X", "Target"}, {"reduction"}, {"Loss"}); -****************************************************************** -*/ - -KernelSignature KldivLossOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Target"}; - paddle::small_vector attrs; - attrs.emplace_back("reduction"); - 
paddle::small_vector outputs {"Loss"}; - return KernelSignature("kldiv_loss", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by KronOpArgumentMapping: - -return KernelSignature("kron", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature KronOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("kron", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by KthvalueOpArgumentMapping: - -return KernelSignature("kthvalue", {"X"}, {"k", "axis", "keepdim"}, {"Out", "Indices"}); -****************************************************************** -*/ - -KernelSignature KthvalueOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("k"); - attrs.emplace_back("axis"); - attrs.emplace_back("keepdim"); - paddle::small_vector outputs {"Out", "Indices"}; - return KernelSignature("kthvalue", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LabelSmoothOpArgumentMapping: - -return KernelSignature("label_smooth", {"X", "PriorDist"}, {"epsilon"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LabelSmoothOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "PriorDist"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("label_smooth", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LambOpArgumentMapping: - -return KernelSignature("lamb", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"weight_decay", "beta1", "beta2", "epsilon", "always_adapt", "multi_precision"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("lamb_sr", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}, {"weight_decay", "beta1", "beta2", "epsilon", "always_adapt", "multi_precision"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -****************************************************************** -*/ - -KernelSignature LambOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam", "SkipUpdate"}; - paddle::small_vector attrs; - attrs.emplace_back("weight_decay"); - attrs.emplace_back("beta1"); - attrs.emplace_back("beta2"); - attrs.emplace_back("epsilon"); - 
attrs.emplace_back("always_adapt"); - attrs.emplace_back("multi_precision"); - paddle::small_vector outputs {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}; - if ( ctx.IsDenseTensorInput("Param") && - ctx.IsDenseTensorInput("Grad") && - ctx.IsDenseTensorInput("LearningRate") && - ctx.IsDenseTensorInput("Moment1") && - ctx.IsDenseTensorInput("Moment2") && - ctx.IsDenseTensorInput("Beta1Pow") && - ctx.IsDenseTensorInput("Beta2Pow") && - ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam"))) && - ((ctx.HasInput("SkipUpdate") && ctx.IsDenseTensorInput("SkipUpdate")) || (!ctx.HasInput("SkipUpdate")))) { - return KernelSignature("lamb", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsDenseTensorInput("Param") && - ctx.IsSelectedRowsInput("Grad") && - ctx.IsDenseTensorInput("LearningRate") && - ctx.IsDenseTensorInput("Moment1") && - ctx.IsDenseTensorInput("Moment2") && - ctx.IsDenseTensorInput("Beta1Pow") && - ctx.IsDenseTensorInput("Beta2Pow") && - ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam"))) && - ((ctx.HasInput("SkipUpdate") && ctx.IsDenseTensorInput("SkipUpdate")) || (!ctx.HasInput("SkipUpdate")))) { - return KernelSignature("lamb_sr", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LayerNormOpArgumentMapping: - -return KernelSignature("layer_norm", {"X", "Scale", "Bias"}, {"epsilon", "begin_norm_axis"}, {"Y", "Mean", "Variance"}); -****************************************************************** -*/ - -KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Scale", "Bias"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - attrs.emplace_back("begin_norm_axis"); - paddle::small_vector outputs {"Y", "Mean", "Variance"}; - return KernelSignature("layer_norm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LeakyReluOpArgumentMapping: - -return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("leaky_relu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LerpOpArgumentMapping: - -return KernelSignature("lerp", {"X", "Y", "Weight"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LerpOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Weight"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("lerp", std::move(inputs), 
std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LgammaOpArgumentMapping: - -return KernelSignature("lgamma", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LgammaOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("lgamma", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LinearInterpOpArgumentMapping: - -return KernelSignature("linear_interp", {"X", "OutSize", "SizeTensor", "Scale"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LinearInterpV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale"}; - paddle::small_vector attrs; - attrs.emplace_back("data_layout"); - attrs.emplace_back("out_d"); - attrs.emplace_back("out_h"); - attrs.emplace_back("out_w"); - attrs.emplace_back("scale"); - attrs.emplace_back("interp_method"); - attrs.emplace_back("align_corners"); - attrs.emplace_back("align_mode"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("linear_interp", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LlmInt8LinearOpArgumentMapping: - -return KernelSignature("llm_int8_linear", {"x", "weight", "bias", "weight_scale"}, {"threshold"}, {"out"}); -****************************************************************** -*/ - -KernelSignature LlmInt8LinearOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "weight", "bias", "weight_scale"}; - paddle::small_vector attrs; - attrs.emplace_back("threshold"); - paddle::small_vector outputs {"out"}; - return KernelSignature("llm_int8_linear", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogOpArgumentMapping: - -return KernelSignature("log", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LogOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("log", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Log10OpArgumentMapping: - -return KernelSignature("log10", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature Log10OpArgumentMapping(const ArgumentMappingContext& ctx) { - 
paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("log10", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Log1pOpArgumentMapping: - -return KernelSignature("log1p", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature Log1pOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("log1p", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Log2OpArgumentMapping: - -return KernelSignature("log2", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature Log2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("log2", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogLossOpArgumentMapping: - -return KernelSignature("log_loss", {"Predicted", "Labels"}, {"epsilon"}, {"Loss"}); -****************************************************************** -*/ - -KernelSignature LogLossOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Predicted", "Labels"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - paddle::small_vector outputs {"Loss"}; - return KernelSignature("log_loss", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogSoftmaxOpArgumentMapping: - -return KernelSignature("log_softmax", {"X"}, {"axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LogSoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("log_softmax", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogcumsumexpOpArgumentMapping: - -return KernelSignature("logcumsumexp", {"X"}, {"axis", "flatten", "exclusive", "reverse"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LogcumsumexpOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("flatten"); - attrs.emplace_back("exclusive"); - attrs.emplace_back("reverse"); - paddle::small_vector outputs {"Out"}; - return 
KernelSignature("logcumsumexp", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogicalAndOpArgumentMapping: - -return KernelSignature("logical_and", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LogicalAndOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("logical_and", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogicalNotOpArgumentMapping: - -return KernelSignature("logical_not", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LogicalNotOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("logical_not", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogicalOrOpArgumentMapping: - -return KernelSignature("logical_or", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LogicalOrOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("logical_or", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogicalXorOpArgumentMapping: - -return KernelSignature("logical_xor", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LogicalXorOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("logical_xor", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogitOpArgumentMapping: - -return KernelSignature("logit", {"X"}, {"eps"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LogitOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("eps"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("logit", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogsigmoidOpArgumentMapping: - -return 
KernelSignature("logsigmoid", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LogsigmoidOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("logsigmoid", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LstsqOpArgumentMapping: - -return KernelSignature("lstsq", {"X", "Y"}, {"rcond", "driver"}, {"Solution", "Residuals", "Rank", "SingularValues"}); -return KernelSignature("lstsq", {"X", "Y"}, {"RcondTensor", "driver"}, {"Solution", "Residuals", "Rank", "SingularValues"}); -****************************************************************** -*/ - -KernelSignature LstsqOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("rcond"); - attrs.emplace_back("driver"); - paddle::small_vector outputs {"Solution", "Residuals", "Rank", "SingularValues"}; - return KernelSignature("lstsq", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LuOpArgumentMapping: - -return KernelSignature("lu", {"X"}, {"pivots"}, {"Out", "Pivots", "Infos"}); -****************************************************************** -*/ - -KernelSignature LuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("pivots"); - paddle::small_vector outputs {"Out", "Pivots", "Infos"}; - return KernelSignature("lu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LuUnpackOpArgumentMapping: - -return KernelSignature("lu_unpack", {"X", "Pivots"}, {"unpack_ludata", "unpack_pivots"}, {"Pmat", "L", "U"}); -****************************************************************** -*/ - -KernelSignature LuUnpackOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Pivots"}; - paddle::small_vector attrs; - attrs.emplace_back("unpack_ludata"); - attrs.emplace_back("unpack_pivots"); - paddle::small_vector outputs {"Pmat", "L", "U"}; - return KernelSignature("lu_unpack", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MarginCrossEntropyOpArgumentMapping: - -return KernelSignature("margin_cross_entropy", {"Logits", "Label"}, {"return_softmax", "ring_id", "rank", "nranks", "margin1", "margin2", "margin3", "scale"}, {"Softmax", "Loss"}); -****************************************************************** -*/ - -KernelSignature MarginCrossEntropyOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Logits", "Label"}; - paddle::small_vector attrs; - attrs.emplace_back("return_softmax"); - attrs.emplace_back("ring_id"); - 
attrs.emplace_back("rank"); - attrs.emplace_back("nranks"); - attrs.emplace_back("margin1"); - attrs.emplace_back("margin2"); - attrs.emplace_back("margin3"); - attrs.emplace_back("scale"); - paddle::small_vector outputs {"Softmax", "Loss"}; - return KernelSignature("margin_cross_entropy", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MaskedMultiheadAttentionOpArgumentMapping: - -return KernelSignature("masked_multihead_attention", {"x", "cache_kv", "bias", "src_mask", "cum_offsets", "sequence_lengths", "rotary_tensor", "beam_cache_offset", "qkv_out_scale", "out_shift", "out_smooth"}, {"seq_len", "rotary_emb_dims", "use_neox_rotary_style", "compute_dtype", "out_scale", "quant_round_type", "quant_max_bound", "quant_min_bound"}, {"out", "cache_kv_out", "beam_cache_offset_out"}); -****************************************************************** -*/ - -KernelSignature MaskedMultiheadAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "cache_kv", "bias", "src_mask", "cum_offsets", "sequence_lengths", "rotary_tensor", "beam_cache_offset", "qkv_out_scale", "out_shift", "out_smooth"}; - paddle::small_vector attrs; - attrs.emplace_back("seq_len"); - attrs.emplace_back("rotary_emb_dims"); - attrs.emplace_back("use_neox_rotary_style"); - attrs.emplace_back("compute_dtype"); - attrs.emplace_back("out_scale"); - attrs.emplace_back("quant_round_type"); - attrs.emplace_back("quant_max_bound"); - attrs.emplace_back("quant_min_bound"); - paddle::small_vector outputs {"out", "cache_kv_out", "beam_cache_offset_out"}; - return KernelSignature("masked_multihead_attention", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MaskedSelectOpArgumentMapping: - -return KernelSignature("masked_select", {"X", "Mask"}, {}, {"Y"}); -****************************************************************** -*/ - -KernelSignature MaskedSelectOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Mask"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Y"}; - return KernelSignature("masked_select", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MatrixNmsOpArgumentMapping: - -return KernelSignature("matrix_nms", {"BBoxes", "Scores"}, {"score_threshold", "nms_top_k", "keep_top_k", "post_threshold", "use_gaussian", "gaussian_sigma", "background_label", "normalized"}, {"Out", "Index", "RoisNum"}); -****************************************************************** -*/ - -KernelSignature MatrixNmsOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"BBoxes", "Scores"}; - paddle::small_vector attrs; - attrs.emplace_back("score_threshold"); - attrs.emplace_back("nms_top_k"); - attrs.emplace_back("keep_top_k"); - attrs.emplace_back("post_threshold"); - attrs.emplace_back("use_gaussian"); - attrs.emplace_back("gaussian_sigma"); - attrs.emplace_back("background_label"); - attrs.emplace_back("normalized"); - 
paddle::small_vector outputs {"Out", "Index", "RoisNum"}; - return KernelSignature("matrix_nms", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MatrixPowerOpArgumentMapping: - -return KernelSignature("matrix_power", {"X"}, {"n"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MatrixPowerOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("n"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("matrix_power", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MaxPool2dWithIndexOpArgumentMapping: - -return KernelSignature("max_pool2d_with_index", {"X"}, {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, {"Out", "Mask"}); -****************************************************************** -*/ - -KernelSignature MaxPool2dWithIndexOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("global_pooling"); - attrs.emplace_back("adaptive"); - paddle::small_vector outputs {"Out", "Mask"}; - return KernelSignature("max_pool2d_with_index", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MaxPool3dWithIndexOpArgumentMapping: - -return KernelSignature("max_pool3d_with_index", {"X"}, {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, {"Out", "Mask"}); -****************************************************************** -*/ - -KernelSignature MaxPool3dWithIndexOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("global_pooling"); - attrs.emplace_back("adaptive"); - paddle::small_vector outputs {"Out", "Mask"}; - return KernelSignature("max_pool3d_with_index", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MaxoutOpArgumentMapping: - -return KernelSignature("maxout", {"X"}, {"groups", "axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MaxoutOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("groups"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("maxout", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures 
returned by MeanAllOpArgumentMapping: - -return KernelSignature("mean_all", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MeanOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("mean_all", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MemoryEfficientAttentionOpArgumentMapping: - -return KernelSignature("memory_efficient_attention", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "causal_diagonal", "seqlen_k"}, {"max_seqlen_q", "max_seqlen_k", "causal", "dropout_p", "scale", "is_test"}, {"output", "logsumexp", "seed_and_offset"}); -return KernelSignature("memory_efficient_attention", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "causal_diagonal", "seqlen_k"}, {"max_seqlen_q", "MaxSeqlenKTensor", "causal", "dropout_p", "scale", "is_test"}, {"output", "logsumexp", "seed_and_offset"}); -return KernelSignature("memory_efficient_attention", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "causal_diagonal", "seqlen_k"}, {"MaxSeqlenQTensor", "max_seqlen_k", "causal", "dropout_p", "scale", "is_test"}, {"output", "logsumexp", "seed_and_offset"}); -return KernelSignature("memory_efficient_attention", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "causal_diagonal", "seqlen_k"}, {"MaxSeqlenQTensor", "MaxSeqlenKTensor", "causal", "dropout_p", "scale", "is_test"}, {"output", "logsumexp", "seed_and_offset"}); -****************************************************************** -*/ - -KernelSignature MemoryEfficientAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "causal_diagonal", "seqlen_k"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("MaxSeqlenQTensor") ? "MaxSeqlenQTensor" : "max_seqlen_q"); - attrs.emplace_back(ctx.HasInput("MaxSeqlenKTensor") ? 
"MaxSeqlenKTensor" : "max_seqlen_k"); - attrs.emplace_back("causal"); - attrs.emplace_back("dropout_p"); - attrs.emplace_back("scale"); - attrs.emplace_back("is_test"); - paddle::small_vector outputs {"output", "logsumexp", "seed_and_offset"}; - return KernelSignature("memory_efficient_attention", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MergeSelectedRowsOpArgumentMapping: - -return KernelSignature("merge_selected_rows", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MergeSelectedRowsOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("merge_selected_rows", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MergedAdamOpArgumentMapping: - -return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"beta1", "beta2", "epsilon", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"beta1", "beta2", "EpsilonTensor", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"beta1", "Beta2Tensor", "epsilon", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"beta1", "Beta2Tensor", "EpsilonTensor", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"Beta1Tensor", "beta2", "epsilon", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"Beta1Tensor", "beta2", "EpsilonTensor", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"Beta1Tensor", "Beta2Tensor", "epsilon", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -return KernelSignature("merged_adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}, {"Beta1Tensor", 
"Beta2Tensor", "EpsilonTensor", "multi_precision", "use_global_beta_pow"}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}); -****************************************************************** -*/ - -KernelSignature MergedAdamOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}; - paddle::small_vector attrs; - attrs.emplace_back("beta1"); - attrs.emplace_back("beta2"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("multi_precision"); - attrs.emplace_back("use_global_beta_pow"); - paddle::small_vector outputs {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}; - return KernelSignature("merged_adam", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MergedMomentumOpArgumentMapping: - -return KernelSignature("merged_momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, {"mu", "use_nesterov", "regularization_method", "regularization_coeff", "multi_precision", "rescale_grad"}, {"ParamOut", "VelocityOut", "MasterParamOut"}); -****************************************************************** -*/ - -KernelSignature MergedMomentumOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}; - paddle::small_vector attrs; - attrs.emplace_back("mu"); - attrs.emplace_back("use_nesterov"); - attrs.emplace_back("regularization_method"); - attrs.emplace_back("regularization_coeff"); - attrs.emplace_back("multi_precision"); - attrs.emplace_back("rescale_grad"); - paddle::small_vector outputs {"ParamOut", "VelocityOut", "MasterParamOut"}; - return KernelSignature("merged_momentum", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MeshgridOpArgumentMapping: - -return KernelSignature("meshgrid", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MeshgridOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("meshgrid", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ModeOpArgumentMapping: - -return KernelSignature("mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"}); -****************************************************************** -*/ - -KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("keepdim"); - paddle::small_vector outputs {"Out", "Indices"}; - return KernelSignature("mode", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' 
-All possible KernelSignatures returned by MomentumOpArgumentMapping: - -return KernelSignature("momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, {"mu", "use_nesterov", "regularization_method", "regularization_coeff", "multi_precision", "rescale_grad"}, {"ParamOut", "VelocityOut", "MasterParamOut"}); -return KernelSignature("momentum_dense_param_sparse_grad", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, {"mu", "use_nesterov", "regularization_method", "regularization_coeff", "multi_precision", "rescale_grad"}, {"ParamOut", "VelocityOut", "MasterParamOut"}); -****************************************************************** -*/ - -KernelSignature MomentumOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}; - paddle::small_vector attrs; - attrs.emplace_back("mu"); - attrs.emplace_back("use_nesterov"); - attrs.emplace_back("regularization_method"); - attrs.emplace_back("regularization_coeff"); - attrs.emplace_back("multi_precision"); - attrs.emplace_back("rescale_grad"); - paddle::small_vector outputs {"ParamOut", "VelocityOut", "MasterParamOut"}; - if ( ctx.IsDenseTensorInput("Param") && - ctx.IsDenseTensorInput("Grad") && - ctx.IsDenseTensorInput("Velocity") && - ctx.IsDenseTensorInput("LearningRate") && - ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { - return KernelSignature("momentum", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsDenseTensorInput("Param") && - ctx.IsSelectedRowsInput("Grad") && - ctx.IsDenseTensorInput("Velocity") && - ctx.IsDenseTensorInput("LearningRate") && - ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { - return KernelSignature("momentum_dense_param_sparse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MultiDotOpArgumentMapping: - -return KernelSignature("multi_dot", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MultiDotOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("multi_dot", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MulticlassNms3OpArgumentMapping: - -return KernelSignature("multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}, {"score_threshold", "nms_top_k", "keep_top_k", "nms_threshold", "normalized", "nms_eta", "background_label"}, {"Out", "Index", "NmsRoisNum"}); -****************************************************************** -*/ - -KernelSignature MulticlassNms3OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"BBoxes", "Scores", "RoisNum"}; - paddle::small_vector attrs; - attrs.emplace_back("score_threshold"); - attrs.emplace_back("nms_top_k"); - attrs.emplace_back("keep_top_k"); - attrs.emplace_back("nms_threshold"); - attrs.emplace_back("normalized"); - 
attrs.emplace_back("nms_eta"); - attrs.emplace_back("background_label"); - paddle::small_vector outputs {"Out", "Index", "NmsRoisNum"}; - return KernelSignature("multiclass_nms3", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MultinomialOpArgumentMapping: - -return KernelSignature("multinomial", {"X"}, {"num_samples", "replacement"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MultinomialOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("num_samples"); - attrs.emplace_back("replacement"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("multinomial", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MultiplexOpArgumentMapping: - -return KernelSignature("multiplex", {"X", "Ids"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MultiplexOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Ids"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("multiplex", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MvOpArgumentMapping: - -return KernelSignature("mv", {"X", "Vec"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MvOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Vec"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("mv", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NanmedianOpArgumentMapping: - -return KernelSignature("nanmedian", {"X"}, {"axis", "keepdim"}, {"Out", "MedianIndex"}); -return KernelSignature("nanmedian", {"X"}, {"AxisTensorList", "keepdim"}, {"Out", "MedianIndex"}); -****************************************************************** -*/ - -KernelSignature NanmedianOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("AxisTensor") - ? "AxisTensor" - : ctx.InputSize("AxisTensorList") > 0 - ? 
"AxisTensorList" - : "axis"); - attrs.emplace_back("keepdim"); - paddle::small_vector outputs {"Out", "MedianIndex"}; - return KernelSignature("nanmedian", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NearestInterpOpArgumentMapping: - -return KernelSignature("nearest_interp", {"X", "OutSize", "SizeTensor", "Scale"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature NearestInterpV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale"}; - paddle::small_vector attrs; - attrs.emplace_back("data_layout"); - attrs.emplace_back("out_d"); - attrs.emplace_back("out_h"); - attrs.emplace_back("out_w"); - attrs.emplace_back("scale"); - attrs.emplace_back("interp_method"); - attrs.emplace_back("align_corners"); - attrs.emplace_back("align_mode"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("nearest_interp", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NextafterOpArgumentMapping: - -return KernelSignature("nextafter", {"x", "y"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature NextafterOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - return KernelSignature("nextafter", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NllLossOpArgumentMapping: - -return KernelSignature("nll_loss", {"X", "Label", "Weight"}, {"ignore_index", "reduction"}, {"Out", "Total_weight"}); -****************************************************************** -*/ - -KernelSignature NllLossOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Label", "Weight"}; - paddle::small_vector attrs; - attrs.emplace_back("ignore_index"); - attrs.emplace_back("reduction"); - paddle::small_vector outputs {"Out", "Total_weight"}; - return KernelSignature("nll_loss", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NmsOpArgumentMapping: - -return KernelSignature("nms", {"Boxes"}, {"iou_threshold"}, {"KeepBoxesIdxs"}); -****************************************************************** -*/ - -KernelSignature NmsOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Boxes"}; - paddle::small_vector attrs; - attrs.emplace_back("iou_threshold"); - paddle::small_vector outputs {"KeepBoxesIdxs"}; - return KernelSignature("nms", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 
'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NonzeroOpArgumentMapping: - -return KernelSignature("nonzero", {"Condition"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature WhereIndexOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Condition"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("nonzero", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NpuIdentityOpArgumentMapping: - -return KernelSignature("npu_identity", {"x"}, {"format"}, {"out"}); -****************************************************************** -*/ - -KernelSignature NpuIdentityOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("format"); - paddle::small_vector outputs {"out"}; - return KernelSignature("npu_identity", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NumelOpArgumentMapping: - -return KernelSignature("numel", {"Input"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SizeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("numel", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by OverlapAddOpArgumentMapping: - -return KernelSignature("overlap_add", {"X"}, {"hop_length", "axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature OverlapAddOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("hop_length"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("overlap_add", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PNormOpArgumentMapping: - -return KernelSignature("p_norm", {"X"}, {"porder", "axis", "epsilon", "keepdim", "asvector"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature PNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("porder"); - attrs.emplace_back("axis"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("keepdim"); - attrs.emplace_back("asvector"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("p_norm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All 
possible KernelSignatures returned by Pad3dOpArgumentMapping: - -return KernelSignature("pad3d", {"X"}, {"paddings", "mode", "value", "data_format"}, {"Out"}); -return KernelSignature("pad3d", {"X"}, {"Paddings", "mode", "value", "data_format"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature Pad3dOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("Paddings") - ? "Paddings" - : "paddings"); - - attrs.emplace_back("mode"); - attrs.emplace_back("value"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("pad3d", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PixelShuffleOpArgumentMapping: - -return KernelSignature("pixel_shuffle", {"X"}, {"upscale_factor", "data_format"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature PixelShuffleOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("upscale_factor"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("pixel_shuffle", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PixelUnshuffleOpArgumentMapping: - -return KernelSignature("pixel_unshuffle", {"X"}, {"downscale_factor", "data_format"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature PixelUnshuffleOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("downscale_factor"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("pixel_unshuffle", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PoissonOpArgumentMapping: - -return KernelSignature("poisson", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature PoissonOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("poisson", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PolygammaOpArgumentMapping: - -return KernelSignature("polygamma", {"x"}, {"n"}, {"out"}); -****************************************************************** -*/ - -KernelSignature PolygammaOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("n"); - paddle::small_vector outputs {"out"}; - return KernelSignature("polygamma", 
std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PowOpArgumentMapping: - -return KernelSignature("pow", {"X"}, {"factor"}, {"Out"}); -return KernelSignature("pow", {"X"}, {"FactorTensor"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature PowOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("FactorTensor") ? "FactorTensor" : "factor"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("pow", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PreluOpArgumentMapping: - -return KernelSignature("prelu", {"X", "Alpha"}, {"data_format", "mode"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature PreluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Alpha"}; - paddle::small_vector attrs; - attrs.emplace_back("data_format"); - attrs.emplace_back("mode"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("prelu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PriorBoxOpArgumentMapping: - -return KernelSignature("prior_box", {"Input", "Image"}, {"min_sizes", "max_sizes", "aspect_ratios", "variances", "flip", "clip", "step_w", "step_h", "offset", "min_max_aspect_ratios_order"}, {"Boxes", "Variances"}); -****************************************************************** -*/ - -KernelSignature PriorBoxOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Image"}; - paddle::small_vector attrs; - attrs.emplace_back("min_sizes"); - attrs.emplace_back("max_sizes"); - attrs.emplace_back("aspect_ratios"); - attrs.emplace_back("variances"); - attrs.emplace_back("flip"); - attrs.emplace_back("clip"); - attrs.emplace_back("step_w"); - attrs.emplace_back("step_h"); - attrs.emplace_back("offset"); - attrs.emplace_back("min_max_aspect_ratios_order"); - paddle::small_vector outputs {"Boxes", "Variances"}; - return KernelSignature("prior_box", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PsroiPoolOpArgumentMapping: - -return KernelSignature("psroi_pool", {"X", "ROIs", "RoisNum"}, {"pooled_height", "pooled_width", "output_channels", "spatial_scale"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature PsroiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "ROIs", "RoisNum"}; - paddle::small_vector attrs; - attrs.emplace_back("pooled_height"); - attrs.emplace_back("pooled_width"); - attrs.emplace_back("output_channels"); - attrs.emplace_back("spatial_scale"); - paddle::small_vector outputs {"Out"}; - return 
KernelSignature("psroi_pool", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PutAlongAxisOpArgumentMapping: - -return KernelSignature("put_along_axis", {"Input", "Index", "Value"}, {"Axis", "Reduce"}, {"Result"}); -****************************************************************** -*/ - -KernelSignature PutAlongAxisOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Index", "Value"}; - paddle::small_vector attrs; - attrs.emplace_back("Axis"); - attrs.emplace_back("Reduce"); - paddle::small_vector outputs {"Result"}; - return KernelSignature("put_along_axis", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by QrOpArgumentMapping: - -return KernelSignature("qr", {"X"}, {"mode"}, {"Q", "R"}); -****************************************************************** -*/ - -KernelSignature QrOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("mode"); - paddle::small_vector outputs {"Q", "R"}; - return KernelSignature("qr", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RealOpArgumentMapping: - -return KernelSignature("real", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature RealOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("real", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ReciprocalOpArgumentMapping: - -return KernelSignature("reciprocal", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ReciprocalOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("reciprocal", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ReindexGraphOpArgumentMapping: - -return KernelSignature("graph_reindex", {"X", "Neighbors", "Count", "HashTable_Value", "HashTable_Index"}, {}, {"Reindex_Src", "Reindex_Dst", "Out_Nodes"}); -****************************************************************** -*/ - -KernelSignature GraphReindexOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Neighbors", "Count", "HashTable_Value", "HashTable_Index"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Reindex_Src", "Reindex_Dst", "Out_Nodes"}; - return KernelSignature("graph_reindex", 
std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ReluOpArgumentMapping: - -return KernelSignature("relu", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ReluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("relu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Relu6OpArgumentMapping: - -return KernelSignature("relu6", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature Relu6OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("relu6", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RenormOpArgumentMapping: - -return KernelSignature("renorm", {"X"}, {"p", "axis", "max_norm"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature RenormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("p"); - attrs.emplace_back("axis"); - attrs.emplace_back("max_norm"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("renorm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RmsNormOpArgumentMapping: - -return KernelSignature("rms_norm", {"x", "bias", "residual", "norm_weight", "norm_bias"}, {"epsilon", "begin_norm_axis", "quant_scale", "quant_round_type", "quant_max_bound", "quant_min_bound"}, {"out", "residual_out"}); -****************************************************************** -*/ - -KernelSignature RmsNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "bias", "residual", "norm_weight", "norm_bias"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - attrs.emplace_back("begin_norm_axis"); - attrs.emplace_back("quant_scale"); - attrs.emplace_back("quant_round_type"); - attrs.emplace_back("quant_max_bound"); - attrs.emplace_back("quant_min_bound"); - paddle::small_vector outputs {"out", "residual_out"}; - return KernelSignature("rms_norm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RmspropOpArgumentMapping: - -return KernelSignature("rmsprop", {"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad", "MasterParam"}, {"epsilon", "decay", "momentum", "centered", "multi_precision"}, {"ParamOut", "MomentOut", 
"MeanSquareOut", "MeanGradOut", "MasterParamOut"}); -return KernelSignature("rmsprop_dense_param_sparse_grad", {"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad", "MasterParam"}, {"epsilon", "decay", "momentum", "centered", "multi_precision"}, {"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut", "MasterParamOut"}); -****************************************************************** -*/ - -KernelSignature RmspropOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Param", "MeanSquare", "Grad", "Moment", "LearningRate", "MeanGrad", "MasterParam"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - attrs.emplace_back("decay"); - attrs.emplace_back("momentum"); - attrs.emplace_back("centered"); - attrs.emplace_back("multi_precision"); - paddle::small_vector outputs {"ParamOut", "MomentOut", "MeanSquareOut", "MeanGradOut", "MasterParamOut"}; - if ( ctx.IsDenseTensorInput("Param") && - ctx.IsDenseTensorInput("MeanSquare") && - ctx.IsDenseTensorInput("Grad") && - ctx.IsDenseTensorInput("Moment") && - ctx.IsDenseTensorInput("LearningRate") && - ((ctx.HasInput("MeanGrad") && ctx.IsDenseTensorInput("MeanGrad")) || (!ctx.HasInput("MeanGrad"))) && - ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { - return KernelSignature("rmsprop", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsDenseTensorInput("Param") && - ctx.IsDenseTensorInput("MeanSquare") && - ctx.IsSelectedRowsInput("Grad") && - ctx.IsDenseTensorInput("Moment") && - ctx.IsDenseTensorInput("LearningRate") && - ((ctx.HasInput("MeanGrad") && ctx.IsDenseTensorInput("MeanGrad")) || (!ctx.HasInput("MeanGrad"))) && - ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { - return KernelSignature("rmsprop_dense_param_sparse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RoiAlignOpArgumentMapping: - -return KernelSignature("roi_align", {"X", "ROIs", "RoisNum"}, {"pooled_height", "pooled_width", "spatial_scale", "sampling_ratio", "aligned"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "ROIs", "RoisNum"}; - paddle::small_vector attrs; - attrs.emplace_back("pooled_height"); - attrs.emplace_back("pooled_width"); - attrs.emplace_back("spatial_scale"); - attrs.emplace_back("sampling_ratio"); - attrs.emplace_back("aligned"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("roi_align", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RoiPoolOpArgumentMapping: - -return KernelSignature("roi_pool", {"X", "ROIs", "RoisNum"}, {"pooled_height", "pooled_width", "spatial_scale"}, {"Out", "Argmax"}); -****************************************************************** -*/ - -KernelSignature RoiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "ROIs", "RoisNum"}; - 
paddle::small_vector attrs; - attrs.emplace_back("pooled_height"); - attrs.emplace_back("pooled_width"); - attrs.emplace_back("spatial_scale"); - paddle::small_vector outputs {"Out", "Argmax"}; - return KernelSignature("roi_pool", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RollOpArgumentMapping: - -return KernelSignature("roll", {"X"}, {"shifts", "axis"}, {"Out"}); -return KernelSignature("roll", {"X"}, {"ShiftsTensor", "axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("ShiftsTensor") - ? "ShiftsTensor" - : "shifts"); - - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("roll", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RoundOpArgumentMapping: - -return KernelSignature("round", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature RoundOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("round", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RsqrtOpArgumentMapping: - -return KernelSignature("rsqrt", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature RsqrtOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("rsqrt", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ScaleOpArgumentMapping: - -return KernelSignature("scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); -return KernelSignature("scale", {"X"}, {"ScaleTensor", "bias", "bias_after_scale"}, {"Out"}); -return KernelSignature("scale_sr", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); -return KernelSignature("scale_sr", {"X"}, {"ScaleTensor", "bias", "bias_after_scale"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("ScaleTensor") ? 
"ScaleTensor" : "scale"); - attrs.emplace_back("bias"); - attrs.emplace_back("bias_after_scale"); - paddle::small_vector outputs {"Out"}; - if ( ctx.IsDenseTensorInput("X")) { - return KernelSignature("scale", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsSelectedRowsInput("X")) { - return KernelSignature("scale_sr", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ScatterOpArgumentMapping: - -return KernelSignature("scatter", {"X", "Ids", "Updates"}, {"overwrite"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ScatterOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Ids", "Updates"}; - paddle::small_vector attrs; - attrs.emplace_back("overwrite"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("scatter", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ScatterNdAddOpArgumentMapping: - -return KernelSignature("scatter_nd_add", {"X", "Index", "Updates"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ScatterNdAddOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Index", "Updates"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("scatter_nd_add", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SearchsortedOpArgumentMapping: - -return KernelSignature("searchsorted", {"SortedSequence", "Values"}, {"out_int32", "right"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SearchsortedOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"SortedSequence", "Values"}; - paddle::small_vector attrs; - attrs.emplace_back("out_int32"); - attrs.emplace_back("right"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("searchsorted", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SegmentPoolOpArgumentMapping: - -return KernelSignature("segment_pool", {"X", "SegmentIds"}, {"pooltype"}, {"Out", "SummedIds"}); -****************************************************************** -*/ - -KernelSignature SegmentPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "SegmentIds"}; - paddle::small_vector attrs; - attrs.emplace_back("pooltype"); - paddle::small_vector outputs {"Out", "SummedIds"}; - return KernelSignature("segment_pool", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible 
KernelSignatures returned by SeluOpArgumentMapping: - -return KernelSignature("selu", {"X"}, {"scale", "alpha"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SeluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("scale"); - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("selu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SendURecvOpArgumentMapping: - -return KernelSignature("send_u_recv", {"X", "Src_index", "Dst_index"}, {"reduce_op", "out_size"}, {"Out", "Dst_count"}); -return KernelSignature("send_u_recv", {"X", "Src_index", "Dst_index"}, {"reduce_op", "Out_size"}, {"Out", "Dst_count"}); -****************************************************************** -*/ - -KernelSignature GraphSendRecvOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Src_index", "Dst_index"}; - paddle::small_vector attrs; - attrs.emplace_back("reduce_op"); - attrs.emplace_back( - ctx.HasInput("Out_size") - ? "Out_size" - : "out_size"); - - paddle::small_vector outputs {"Out", "Dst_count"}; - return KernelSignature("send_u_recv", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SendUeRecvOpArgumentMapping: - -return KernelSignature("send_ue_recv", {"X", "Y", "Src_index", "Dst_index"}, {"message_op", "reduce_op", "out_size"}, {"Out", "Dst_count"}); -return KernelSignature("send_ue_recv", {"X", "Y", "Src_index", "Dst_index"}, {"message_op", "reduce_op", "Out_size"}, {"Out", "Dst_count"}); -****************************************************************** -*/ - -KernelSignature GraphSendUeRecvOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Src_index", "Dst_index"}; - paddle::small_vector attrs; - attrs.emplace_back("message_op"); - attrs.emplace_back("reduce_op"); - attrs.emplace_back( - ctx.HasInput("Out_size") - ? 
"Out_size" - : "out_size"); - - paddle::small_vector outputs {"Out", "Dst_count"}; - return KernelSignature("send_ue_recv", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SendUvOpArgumentMapping: - -return KernelSignature("send_uv", {"x", "y", "src_index", "dst_index"}, {"message_op"}, {"out"}); -****************************************************************** -*/ - -KernelSignature GraphSendUvOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "src_index", "dst_index"}; - paddle::small_vector attrs; - attrs.emplace_back("message_op"); - paddle::small_vector outputs {"out"}; - return KernelSignature("send_uv", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SgdOpArgumentMapping: - -return KernelSignature("sgd", {"Param", "LearningRate", "Grad", "MasterParam"}, {"multi_precision"}, {"ParamOut", "MasterParamOut"}); -return KernelSignature("sgd_dense_param_sparse_grad", {"Param", "LearningRate", "Grad", "MasterParam"}, {"multi_precision"}, {"ParamOut", "MasterParamOut"}); -return KernelSignature("sgd_sparse_param_sparse_grad", {"Param", "LearningRate", "Grad", "MasterParam"}, {"multi_precision"}, {"ParamOut", "MasterParamOut"}); -****************************************************************** -*/ - -KernelSignature SgdOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Param", "LearningRate", "Grad", "MasterParam"}; - paddle::small_vector attrs; - attrs.emplace_back("multi_precision"); - paddle::small_vector outputs {"ParamOut", "MasterParamOut"}; - if ( ctx.IsDenseTensorInput("Param") && - ctx.IsDenseTensorInput("LearningRate") && - ctx.IsDenseTensorInput("Grad") && - ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { - return KernelSignature("sgd", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsDenseTensorInput("Param") && - ctx.IsDenseTensorInput("LearningRate") && - ctx.IsSelectedRowsInput("Grad") && - ((ctx.HasInput("MasterParam") && ctx.IsDenseTensorInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { - return KernelSignature("sgd_dense_param_sparse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsSelectedRowsInput("Param") && - ctx.IsDenseTensorInput("LearningRate") && - ctx.IsSelectedRowsInput("Grad") && - ((ctx.HasInput("MasterParam") && ctx.IsSelectedRowsInput("MasterParam")) || (!ctx.HasInput("MasterParam")))) { - return KernelSignature("sgd_sparse_param_sparse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ShapeOpArgumentMapping: - -return KernelSignature("shape", {"Input"}, {}, {"Out"}); -return KernelSignature("shape_sr", {"Input"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ShapeOpArgumentMapping(const ArgumentMappingContext& ctx) { - 
paddle::small_vector inputs {"Input"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - if ( ctx.IsDenseTensorInput("Input")) { - return KernelSignature("shape", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsSelectedRowsInput("Input")) { - return KernelSignature("shape_sr", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ShardIndexOpArgumentMapping: - -return KernelSignature("shard_index", {"X"}, {"index_num", "nshards", "shard_id", "ignore_value"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ShardIndexOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("index_num"); - attrs.emplace_back("nshards"); - attrs.emplace_back("shard_id"); - attrs.emplace_back("ignore_value"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("shard_index", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SigmoidOpArgumentMapping: - -return KernelSignature("sigmoid", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SigmoidOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("sigmoid", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SigmoidCrossEntropyWithLogitsOpArgumentMapping: - -return KernelSignature("sigmoid_cross_entropy_with_logits", {"X", "Label", "pos_weight"}, {"normalize", "ignore_index"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SigmoidCrossEntropyWithLogitsOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Label", "pos_weight"}; - paddle::small_vector attrs; - attrs.emplace_back("normalize"); - attrs.emplace_back("ignore_index"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("sigmoid_cross_entropy_with_logits", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SignOpArgumentMapping: - -return KernelSignature("sign", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SignOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("sign", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible 
KernelSignatures returned by SiluOpArgumentMapping: - -return KernelSignature("silu", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SiluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("silu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SinOpArgumentMapping: - -return KernelSignature("sin", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SinOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("sin", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SinhOpArgumentMapping: - -return KernelSignature("sinh", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SinhOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("sinh", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SlogdetOpArgumentMapping: - -return KernelSignature("slogdet", {"Input"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SlogdeterminantOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("slogdet", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SoftplusOpArgumentMapping: - -return KernelSignature("softplus", {"X"}, {"beta", "threshold"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SoftplusOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("beta"); - attrs.emplace_back("threshold"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("softplus", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SoftshrinkOpArgumentMapping: - -return KernelSignature("softshrink", {"X"}, {"lambda"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SoftshrinkOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - 
attrs.emplace_back("lambda"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("softshrink", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SoftsignOpArgumentMapping: - -return KernelSignature("softsign", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SoftsignOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("softsign", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SolveOpArgumentMapping: - -return KernelSignature("solve", {"X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SolveOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("solve", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SpectralNormOpArgumentMapping: - -return KernelSignature("spectral_norm", {"Weight", "U", "V"}, {"dim", "power_iters", "eps"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SpectralNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Weight", "U", "V"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - attrs.emplace_back("power_iters"); - attrs.emplace_back("eps"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("spectral_norm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SqrtOpArgumentMapping: - -return KernelSignature("sqrt", {"X"}, {}, {"Out"}); -return KernelSignature("sqrt_sr", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SqrtOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - if ( ctx.IsDenseTensorInput("X")) { - return KernelSignature("sqrt", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsSelectedRowsInput("X")) { - return KernelSignature("sqrt_sr", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SquareOpArgumentMapping: - -return KernelSignature("square", {"X"}, {}, {"Out"}); -return KernelSignature("square_sr", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - 
-KernelSignature SquareOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - if ( ctx.IsDenseTensorInput("X")) { - return KernelSignature("square", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsSelectedRowsInput("X")) { - return KernelSignature("square_sr", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SquaredL2NormOpArgumentMapping: - -return KernelSignature("squared_l2_norm", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SquaredL2NormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("squared_l2_norm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SqueezeOpArgumentMapping: - -return KernelSignature("squeeze", {"X"}, {"axes"}, {"Out", "XShape"}); -return KernelSignature("squeeze", {"X"}, {"AxisTensor"}, {"Out", "XShape"}); -return KernelSignature("squeeze", {"X"}, {"AxisTensorList"}, {"Out", "XShape"}); -****************************************************************** -*/ - -KernelSignature Squeeze2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axes"); - paddle::small_vector outputs {"Out", "XShape"}; - return KernelSignature("squeeze", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by StackOpArgumentMapping: - -return KernelSignature("stack", {"X"}, {"axis"}, {"Y"}); -****************************************************************** -*/ - -KernelSignature StackOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Y"}; - return KernelSignature("stack", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by StanhOpArgumentMapping: - -return KernelSignature("stanh", {"X"}, {"scale_a", "scale_b"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature StanhOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("scale_a"); - attrs.emplace_back("scale_b"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("stanh", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures 
returned by SvdOpArgumentMapping: - -return KernelSignature("svd", {"X"}, {"full_matrices"}, {"U", "S", "VH"}); -****************************************************************** -*/ - -KernelSignature SvdOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("full_matrices"); - paddle::small_vector outputs {"U", "S", "VH"}; - return KernelSignature("svd", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TakeAlongAxisOpArgumentMapping: - -return KernelSignature("take_along_axis", {"Input", "Index"}, {"Axis"}, {"Result"}); -****************************************************************** -*/ - -KernelSignature TakeAlongAxisOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Index"}; - paddle::small_vector attrs; - attrs.emplace_back("Axis"); - paddle::small_vector outputs {"Result"}; - return KernelSignature("take_along_axis", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TanOpArgumentMapping: - -return KernelSignature("tan", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature TanOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("tan", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TanhOpArgumentMapping: - -return KernelSignature("tanh", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature TanhOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("tanh", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TanhShrinkOpArgumentMapping: - -return KernelSignature("tanh_shrink", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature TanhShrinkOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("tanh_shrink", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TemporalShiftOpArgumentMapping: - -return KernelSignature("temporal_shift", {"X"}, {"seg_num", "shift_ratio", "data_format"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature TemporalShiftOpArgumentMapping(const 
ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("seg_num"); - attrs.emplace_back("shift_ratio"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("temporal_shift", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TensorUnfoldOpArgumentMapping: - -return KernelSignature("tensor_unfold", {"input"}, {"axis", "size", "step"}, {"out"}); -****************************************************************** -*/ - -KernelSignature TensorUnfoldOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("size"); - attrs.emplace_back("step"); - paddle::small_vector outputs {"out"}; - return KernelSignature("tensor_unfold", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ThresholdedReluOpArgumentMapping: - -return KernelSignature("thresholded_relu", {"X"}, {"threshold"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ThresholdedReluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("threshold"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("thresholded_relu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TopPSamplingOpArgumentMapping: - -return KernelSignature("top_p_sampling", {"x", "ps", "threshold"}, {"seed"}, {"out", "ids"}); -****************************************************************** -*/ - -KernelSignature TopPSamplingOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "ps", "threshold"}; - paddle::small_vector attrs; - attrs.emplace_back("seed"); - paddle::small_vector outputs {"out", "ids"}; - return KernelSignature("top_p_sampling", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TopkOpArgumentMapping: - -return KernelSignature("topk", {"X"}, {"k", "axis", "largest", "sorted"}, {"Out", "Indices"}); -****************************************************************** -*/ - -KernelSignature TopKV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("K") ? 
"K" : "k"); - attrs.emplace_back("axis"); - attrs.emplace_back("largest"); - attrs.emplace_back("sorted"); - paddle::small_vector outputs {"Out", "Indices"}; - return KernelSignature("topk", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TraceOpArgumentMapping: - -return KernelSignature("trace", {"Input"}, {"offset", "axis1", "axis2"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature TraceOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input"}; - paddle::small_vector attrs; - attrs.emplace_back("offset"); - attrs.emplace_back("axis1"); - attrs.emplace_back("axis2"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("trace", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TriangularSolveOpArgumentMapping: - -return KernelSignature("triangular_solve", {"X", "Y"}, {"upper", "transpose", "unitriangular"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature TriangularSolveOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("upper"); - attrs.emplace_back("transpose"); - attrs.emplace_back("unitriangular"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("triangular_solve", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TrilinearInterpOpArgumentMapping: - -return KernelSignature("trilinear_interp", {"X", "OutSize", "SizeTensor", "Scale"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature TrilinearInterpV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale"}; - paddle::small_vector attrs; - attrs.emplace_back("data_layout"); - attrs.emplace_back("out_d"); - attrs.emplace_back("out_h"); - attrs.emplace_back("out_w"); - attrs.emplace_back("scale"); - attrs.emplace_back("interp_method"); - attrs.emplace_back("align_corners"); - attrs.emplace_back("align_mode"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("trilinear_interp", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TruncOpArgumentMapping: - -return KernelSignature("trunc", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature TruncOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("trunc", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UnbindOpArgumentMapping: - -return KernelSignature("unbind", {"X"}, {"axis"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature UnbindOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("unbind", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UnfoldOpArgumentMapping: - -return KernelSignature("unfold", {"X"}, {"kernel_sizes", "strides", "paddings", "dilations"}, {"Y"}); -****************************************************************** -*/ - -KernelSignature UnfoldOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("kernel_sizes"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - paddle::small_vector outputs {"Y"}; - return KernelSignature("unfold", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UniformInplaceOpArgumentMapping: - -return KernelSignature("uniform_inplace", {"X"}, {"min", "max", "seed", "diag_num", "diag_step", "diag_val"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature UniformRandomInplaceOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("min"); - attrs.emplace_back("max"); - attrs.emplace_back("seed"); - attrs.emplace_back("diag_num"); - attrs.emplace_back("diag_step"); - attrs.emplace_back("diag_val"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("uniform_inplace", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UniqueConsecutiveOpArgumentMapping: - -return KernelSignature("unique_consecutive", {"X"}, {"return_inverse", "return_counts", "axis", "dtype"}, {"Out", "Index", "Counts"}); -****************************************************************** -*/ - -KernelSignature UniqueConsecutiveOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("return_inverse"); - attrs.emplace_back("return_counts"); - attrs.emplace_back("axis"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"Out", "Index", "Counts"}; - return KernelSignature("unique_consecutive", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Unpool3dOpArgumentMapping: - -return KernelSignature("unpool3d", {"X", "Indices"}, {"ksize", "strides", 
"paddings", "output_size", "data_format"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature Unpool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Indices"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_size"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("unpool3d", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UnsqueezeOpArgumentMapping: - -return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"Out", "XShape"}); -return KernelSignature("unsqueeze", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); -return KernelSignature("unsqueeze", {"X"}, {"AxesTensorList"}, {"Out", "XShape"}); -****************************************************************** -*/ - -KernelSignature Unsqueeze2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("AxesTensor") - ? "AxesTensor" - : ctx.InputSize("AxesTensorList") > 0 - ? "AxesTensorList" - : "axes"); - paddle::small_vector outputs {"Out", "XShape"}; - return KernelSignature("unsqueeze", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UnstackOpArgumentMapping: - -return KernelSignature("unstack", {"X"}, {"axis", "num"}, {"Y"}); -****************************************************************** -*/ - -KernelSignature UnstackOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("num"); - paddle::small_vector outputs {"Y"}; - return KernelSignature("unstack", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UpdateLossScalingOpArgumentMapping: - -return KernelSignature("update_loss_scaling", {"X", "FoundInfinite", "PrevLossScaling", "InGoodSteps", "InBadSteps"}, {"incr_every_n_steps", "decr_every_n_nan_or_inf", "incr_ratio", "decr_ratio", "stop_update"}, {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}); -return KernelSignature("update_loss_scaling", {"X", "FoundInfinite", "PrevLossScaling", "InGoodSteps", "InBadSteps"}, {"incr_every_n_steps", "decr_every_n_nan_or_inf", "incr_ratio", "decr_ratio", "StopUpdate"}, {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}); -****************************************************************** -*/ - -KernelSignature UpdateLossScalingOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "FoundInfinite", "PrevLossScaling", "InGoodSteps", "InBadSteps"}; - paddle::small_vector attrs; - attrs.emplace_back("incr_every_n_steps"); - attrs.emplace_back("decr_every_n_nan_or_inf"); - attrs.emplace_back("incr_ratio"); - attrs.emplace_back("decr_ratio"); - attrs.emplace_back(ctx.HasInput("StopUpdate") ? 
"StopUpdate" : "stop_update"); - paddle::small_vector outputs {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}; - return KernelSignature("update_loss_scaling", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ViewDtypeOpArgumentMapping: - -return KernelSignature("view_dtype", {"input"}, {"dtype"}, {"out"}); -****************************************************************** -*/ - -KernelSignature ViewDtypeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input"}; - paddle::small_vector attrs; - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"out"}; - return KernelSignature("view_dtype", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ViewShapeOpArgumentMapping: - -return KernelSignature("view_shape", {"input"}, {"dims"}, {"out"}); -****************************************************************** -*/ - -KernelSignature ViewShapeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input"}; - paddle::small_vector attrs; - attrs.emplace_back("dims"); - paddle::small_vector outputs {"out"}; - return KernelSignature("view_shape", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ViterbiDecodeOpArgumentMapping: - -return KernelSignature("viterbi_decode", {"Input", "Transition", "Length"}, {"include_bos_eos_tag"}, {"Scores", "Path"}); -****************************************************************** -*/ - -KernelSignature ViterbiDecodeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Transition", "Length"}; - paddle::small_vector attrs; - attrs.emplace_back("include_bos_eos_tag"); - paddle::small_vector outputs {"Scores", "Path"}; - return KernelSignature("viterbi_decode", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WarpctcOpArgumentMapping: - -return KernelSignature("warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}, {"blank", "norm_by_times"}, {"Loss", "WarpCTCGrad"}); -****************************************************************** -*/ - -KernelSignature WarpctcOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Logits", "Label", "LogitsLength", "LabelLength"}; - paddle::small_vector attrs; - attrs.emplace_back("blank"); - attrs.emplace_back("norm_by_times"); - paddle::small_vector outputs {"Loss", "WarpCTCGrad"}; - return KernelSignature("warpctc", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WarprnntOpArgumentMapping: - -return KernelSignature("warprnnt", {"input", "label", "input_lengths", "label_lengths"}, {"blank", 
"fastemit_lambda"}, {"loss", "warprnntgrad"}); -****************************************************************** -*/ - -KernelSignature WarprnntOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input", "label", "input_lengths", "label_lengths"}; - paddle::small_vector attrs; - attrs.emplace_back("blank"); - attrs.emplace_back("fastemit_lambda"); - paddle::small_vector outputs {"loss", "warprnntgrad"}; - return KernelSignature("warprnnt", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WeightDequantizeOpArgumentMapping: - -return KernelSignature("weight_dequantize", {"x", "scale"}, {"algo", "out_dtype"}, {"out"}); -****************************************************************** -*/ - -KernelSignature WeightDequantizeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "scale"}; - paddle::small_vector attrs; - attrs.emplace_back("algo"); - attrs.emplace_back("out_dtype"); - paddle::small_vector outputs {"out"}; - return KernelSignature("weight_dequantize", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WeightOnlyLinearOpArgumentMapping: - -return KernelSignature("weight_only_linear", {"x", "weight", "bias", "weight_scale"}, {"weight_dtype", "arch"}, {"out"}); -****************************************************************** -*/ - -KernelSignature WeightOnlyLinearOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "weight", "bias", "weight_scale"}; - paddle::small_vector attrs; - attrs.emplace_back("weight_dtype"); - attrs.emplace_back("arch"); - paddle::small_vector outputs {"out"}; - return KernelSignature("weight_only_linear", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WeightQuantizeOpArgumentMapping: - -return KernelSignature("weight_quantize", {"x"}, {"algo", "arch"}, {"out", "scale"}); -****************************************************************** -*/ - -KernelSignature WeightQuantizeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("algo"); - attrs.emplace_back("arch"); - paddle::small_vector outputs {"out", "scale"}; - return KernelSignature("weight_quantize", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WeightedSampleNeighborsOpArgumentMapping: - -return KernelSignature("weighted_sample_neighbors", {"row", "colptr", "edge_weight", "input_nodes", "eids"}, {"sample_size", "return_eids"}, {"out_neighbors", "out_count", "out_eids"}); -****************************************************************** -*/ - -KernelSignature WeightedSampleNeighborsOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"row", "colptr", "edge_weight", "input_nodes", "eids"}; - 
paddle::small_vector attrs; - attrs.emplace_back("sample_size"); - attrs.emplace_back("return_eids"); - paddle::small_vector outputs {"out_neighbors", "out_count", "out_eids"}; - return KernelSignature("weighted_sample_neighbors", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WhereOpArgumentMapping: - -return KernelSignature("where", {"Condition", "X", "Y"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature WhereOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Condition", "X", "Y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("where", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by YoloBoxOpArgumentMapping: - -return KernelSignature("yolo_box", {"X", "ImgSize"}, {"anchors", "class_num", "conf_thresh", "downsample_ratio", "clip_bbox", "scale_x_y", "iou_aware", "iou_aware_factor"}, {"Boxes", "Scores"}); -****************************************************************** -*/ - -KernelSignature YoloBoxOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "ImgSize"}; - paddle::small_vector attrs; - attrs.emplace_back("anchors"); - attrs.emplace_back("class_num"); - attrs.emplace_back("conf_thresh"); - attrs.emplace_back("downsample_ratio"); - attrs.emplace_back("clip_bbox"); - attrs.emplace_back("scale_x_y"); - attrs.emplace_back("iou_aware"); - attrs.emplace_back("iou_aware_factor"); - paddle::small_vector outputs {"Boxes", "Scores"}; - return KernelSignature("yolo_box", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by YoloLossOpArgumentMapping: - -return KernelSignature("yolo_loss", {"X", "GTBox", "GTLabel", "GTScore"}, {"anchors", "anchor_mask", "class_num", "ignore_thresh", "downsample_ratio", "use_label_smooth", "scale_x_y"}, {"Loss", "ObjectnessMask", "GTMatchMask"}); -****************************************************************** -*/ - -KernelSignature Yolov3LossOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "GTBox", "GTLabel", "GTScore"}; - paddle::small_vector attrs; - attrs.emplace_back("anchors"); - attrs.emplace_back("anchor_mask"); - attrs.emplace_back("class_num"); - attrs.emplace_back("ignore_thresh"); - attrs.emplace_back("downsample_ratio"); - attrs.emplace_back("use_label_smooth"); - attrs.emplace_back("scale_x_y"); - paddle::small_vector outputs {"Loss", "ObjectnessMask", "GTMatchMask"}; - return KernelSignature("yolo_loss", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AbsDoubleGradOpArgumentMapping: - -return KernelSignature("abs_double_grad", {"x", "grad_x@GRAD"}, {}, {"grad_out@GRAD"}); -****************************************************************** -*/ - 
-KernelSignature AbsDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "grad_x@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"grad_out@GRAD"}; - return KernelSignature("abs_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AbsGradOpArgumentMapping: - -return KernelSignature("abs_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature AbsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("abs_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AcosGradOpArgumentMapping: - -return KernelSignature("acos_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature AcosGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("acos_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AcoshGradOpArgumentMapping: - -return KernelSignature("acosh_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature AcoshGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("acosh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AddmmGradOpArgumentMapping: - -return KernelSignature("addmm_grad", {"Input", "X", "Y", "Out@GRAD"}, {"Alpha", "Beta"}, {"Input@GRAD", "X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature AddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("Alpha"); - attrs.emplace_back("Beta"); - paddle::small_vector outputs {"Input@GRAD", "X@GRAD", "Y@GRAD"}; - return KernelSignature("addmm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AffineGridGradOpArgumentMapping: - -return KernelSignature("affine_grid_grad", {"Output@GRAD"}, {"output_shape", "align_corners"}, {"Theta@GRAD"}); -return KernelSignature("affine_grid_grad", {"Output@GRAD"}, {"OutputShape", "align_corners"}, 
{"Theta@GRAD"}); -****************************************************************** -*/ - -KernelSignature AffineGridGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Output@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("OutputShape") - ? "OutputShape" - : "output_shape"); - - attrs.emplace_back("align_corners"); - paddle::small_vector outputs {"Theta@GRAD"}; - return KernelSignature("affine_grid_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AngleGradOpArgumentMapping: - -return KernelSignature("angle_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature AngleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("angle_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ArgsortGradOpArgumentMapping: - -return KernelSignature("argsort_grad", {"Indices", "X", "Out@GRAD"}, {"axis", "descending"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ArgsortGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Indices", "X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("descending"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("argsort_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AsStridedGradOpArgumentMapping: - -return KernelSignature("as_strided_grad", {"input", "out@GRAD"}, {"dims", "stride", "offset"}, {"input@GRAD"}); -****************************************************************** -*/ - -KernelSignature AsStridedGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dims"); - attrs.emplace_back("stride"); - attrs.emplace_back("offset"); - paddle::small_vector outputs {"input@GRAD"}; - return KernelSignature("as_strided_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AsinGradOpArgumentMapping: - -return KernelSignature("asin_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature AsinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("asin_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The 
following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AsinhGradOpArgumentMapping: - -return KernelSignature("asinh_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature AsinhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("asinh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Atan2GradOpArgumentMapping: - -return KernelSignature("atan2_grad", {"X1", "X2", "Out@GRAD"}, {}, {"X1@GRAD", "X2@GRAD"}); -****************************************************************** -*/ - -KernelSignature Atan2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X1", "X2", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X1@GRAD", "X2@GRAD"}; - return KernelSignature("atan2_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AtanGradOpArgumentMapping: - -return KernelSignature("atan_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature AtanGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("atan_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AtanhGradOpArgumentMapping: - -return KernelSignature("atanh_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature AtanhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("atanh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BceLossGradOpArgumentMapping: - -return KernelSignature("bce_loss_grad", {"X", "Label", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature BceLossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Label", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("bce_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BicubicInterpGradOpArgumentMapping: - -return 
KernelSignature("bicubic_interp_grad", {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature BicubicInterpV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("data_layout"); - attrs.emplace_back("out_d"); - attrs.emplace_back("out_h"); - attrs.emplace_back("out_w"); - attrs.emplace_back("scale"); - attrs.emplace_back("interp_method"); - attrs.emplace_back("align_corners"); - attrs.emplace_back("align_mode"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("bicubic_interp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BilinearGradOpArgumentMapping: - -return KernelSignature("bilinear_grad", {"X", "Y", "Weight", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD", "Weight@GRAD", "Bias@GRAD"}); -****************************************************************** -*/ - -KernelSignature BilinearTensorProductGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Weight", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD", "Weight@GRAD", "Bias@GRAD"}; - return KernelSignature("bilinear_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BilinearInterpGradOpArgumentMapping: - -return KernelSignature("bilinear_interp_grad", {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature BilinearInterpV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("data_layout"); - attrs.emplace_back("out_d"); - attrs.emplace_back("out_h"); - attrs.emplace_back("out_w"); - attrs.emplace_back("scale"); - attrs.emplace_back("interp_method"); - attrs.emplace_back("align_corners"); - attrs.emplace_back("align_mode"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("bilinear_interp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BmmGradOpArgumentMapping: - -return KernelSignature("bmm_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature BmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("bmm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BroadcastTensorsGradOpArgumentMapping: - -return KernelSignature("broadcast_tensors_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature BroadcastTensorsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("broadcast_tensors_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CeilGradOpArgumentMapping: - -return KernelSignature("ceil_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature CeilGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("ceil_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CeluDoubleGradOpArgumentMapping: - -return KernelSignature("celu_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {"alpha"}, {"X@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature CeluGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; - return KernelSignature("celu_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CeluGradOpArgumentMapping: - -return KernelSignature("celu_grad", {"X", "Out@GRAD"}, {"alpha"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature CeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("celu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CholeskyGradOpArgumentMapping: - -return KernelSignature("cholesky_grad", {"Out", "Out@GRAD"}, {"upper"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature CholeskyGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("upper"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("cholesky_grad", std::move(inputs), std::move(attrs), 
std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CholeskySolveGradOpArgumentMapping: - -return KernelSignature("cholesky_solve_grad", {"X", "Y", "Out", "Out@GRAD"}, {"upper"}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature CholeskySolveGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("upper"); - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("cholesky_solve_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ClipDoubleGradOpArgumentMapping: - -return KernelSignature("clip_grad", {"X", "grad_x@GRAD"}, {"min", "max"}, {"grad_out@GRAD"}); -return KernelSignature("clip_grad", {"X", "grad_x@GRAD"}, {"min", "Max"}, {"grad_out@GRAD"}); -return KernelSignature("clip_grad", {"X", "grad_x@GRAD"}, {"Min", "max"}, {"grad_out@GRAD"}); -return KernelSignature("clip_grad", {"X", "grad_x@GRAD"}, {"Min", "Max"}, {"grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature ClipDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_x@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("Min") ? "Min" : "min"); - attrs.emplace_back(ctx.HasInput("Max") ? "Max" : "max"); - paddle::small_vector outputs {"grad_out@GRAD"}; - return KernelSignature("clip_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ClipGradOpArgumentMapping: - -return KernelSignature("clip_grad", {"X", "Out@GRAD"}, {"min", "max"}, {"X@GRAD"}); -return KernelSignature("clip_grad", {"X", "Out@GRAD"}, {"min", "Max"}, {"X@GRAD"}); -return KernelSignature("clip_grad", {"X", "Out@GRAD"}, {"Min", "max"}, {"X@GRAD"}); -return KernelSignature("clip_grad", {"X", "Out@GRAD"}, {"Min", "Max"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ClipGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("Min") ? "Min" : "min"); - attrs.emplace_back(ctx.HasInput("Max") ? 
"Max" : "max"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("clip_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ComplexGradOpArgumentMapping: - -return KernelSignature("complex_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature ComplexGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("complex_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ConcatGradOpArgumentMapping: - -return KernelSignature("concat_grad", {"X", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); -return KernelSignature("concat_grad", {"X", "Out@GRAD"}, {"AxisTensor"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ConcatGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("AxisTensor") ? "AxisTensor" : "axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("concat_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv2dGradOpArgumentMapping: - -return KernelSignature("conv2d_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "padding_algorithm", "dilations", "groups", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); -****************************************************************** -*/ - -KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("dilations"); - attrs.emplace_back("groups"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; - return KernelSignature("conv2d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv2dGradGradOpArgumentMapping: - -return KernelSignature("conv2d_double_grad", {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "padding_algorithm", "dilations", "groups", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature Conv2dGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - 
attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("dilations"); - attrs.emplace_back("groups"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}; - return KernelSignature("conv2d_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv3dDoubleGradOpArgumentMapping: - -return KernelSignature("conv3d_double_grad", {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature Conv3dGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}; - return KernelSignature("conv3d_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv3dGradOpArgumentMapping: - -return KernelSignature("conv3d_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); -****************************************************************** -*/ - -KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; - return KernelSignature("conv3d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv3dTransposeGradOpArgumentMapping: - -return KernelSignature("conv3d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); -****************************************************************** -*/ - -KernelSignature Conv3dTransposeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_padding"); - attrs.emplace_back("output_size"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - 
attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; - return KernelSignature("conv3d_transpose_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CosDoubleGradOpArgumentMapping: - -return KernelSignature("cos_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {}, {"X@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature CosDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; - return KernelSignature("cos_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CosGradOpArgumentMapping: - -return KernelSignature("cos_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature CosGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("cos_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CosTripleGradOpArgumentMapping: - -return KernelSignature("cos_triple_grad", {"X", "grad_out_forward", "grad_x_grad_forward", "grad_x@GRAD", "grad_out_grad@GRAD"}, {}, {"X@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}); -****************************************************************** -*/ - -KernelSignature CosTripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out_forward", "grad_x_grad_forward", "grad_x@GRAD", "grad_out_grad@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}; - return KernelSignature("cos_triple_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CoshGradOpArgumentMapping: - -return KernelSignature("cosh_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature CoshGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("cosh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CropGradOpArgumentMapping: - -return KernelSignature("crop_grad", {"X", "Out@GRAD"}, {"offsets"}, {"X@GRAD"}); -return KernelSignature("crop_grad", {"X", "Out@GRAD"}, 
{"Offsets"}, {"X@GRAD"}); -return KernelSignature("crop_grad", {"X", "Out@GRAD"}, {"OffsetsTensor"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature CropTensorGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("Offsets") - ? "Offsets" - : ctx.InputSize("OffsetsTensor") > 0 - ? "OffsetsTensor" - : "offsets"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("crop_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CrossEntropyWithSoftmaxGradOpArgumentMapping: - -return KernelSignature("cross_entropy_with_softmax_grad", {"Label", "Softmax", "Loss@GRAD"}, {"soft_label", "use_softmax", "numeric_stable_mode", "ignore_index", "axis"}, {"Logits@GRAD"}); -****************************************************************** -*/ - -KernelSignature SoftmaxWithCrossEntropyGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Label", "Softmax", "Loss@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("soft_label"); - attrs.emplace_back("use_softmax"); - attrs.emplace_back("numeric_stable_mode"); - attrs.emplace_back("ignore_index"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Logits@GRAD"}; - return KernelSignature("cross_entropy_with_softmax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CrossGradOpArgumentMapping: - -return KernelSignature("cross_grad", {"X", "Y", "Out@GRAD"}, {"dim"}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature CrossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("cross_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CummaxGradOpArgumentMapping: - -return KernelSignature("cummax_grad", {"x", "indices", "out@GRAD"}, {"axis", "dtype"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature CummaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "indices", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"x@GRAD"}; - return KernelSignature("cummax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CumminGradOpArgumentMapping: - -return KernelSignature("cummin_grad", {"x", "indices", "out@GRAD"}, {"axis", "dtype"}, {"x@GRAD"}); 
-****************************************************************** -*/ - -KernelSignature CumminGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "indices", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"x@GRAD"}; - return KernelSignature("cummin_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CumprodGradOpArgumentMapping: - -return KernelSignature("cumprod_grad", {"X", "Out", "Out@GRAD"}, {"dim"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature CumprodGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("cumprod_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by CumsumGradOpArgumentMapping: - -return KernelSignature("cumsum_grad", {"X", "Out@GRAD"}, {"axis", "flatten", "exclusive", "reverse"}, {"X@GRAD"}); -return KernelSignature("cumsum_grad", {"X", "Out@GRAD"}, {"AxisTensor", "flatten", "exclusive", "reverse"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature CumsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("flatten"); - attrs.emplace_back("exclusive"); - attrs.emplace_back("reverse"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("cumsum_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DepthwiseConv2dDoubleGradOpArgumentMapping: - -return KernelSignature("depthwise_conv2d_double_grad", {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature DepthwiseConv2dGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "grad_out", "grad_input@GRAD", "grad_filter@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}; - return KernelSignature("depthwise_conv2d_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by 
DepthwiseConv2dGradOpArgumentMapping: - -return KernelSignature("depthwise_conv2d_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); -****************************************************************** -*/ - -KernelSignature DepthwiseConv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; - return KernelSignature("depthwise_conv2d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DetGradOpArgumentMapping: - -return KernelSignature("determinant_grad", {"Input", "Out", "Out@GRAD"}, {}, {"Input@GRAD"}); -****************************************************************** -*/ - -KernelSignature DeterminantGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Input@GRAD"}; - return KernelSignature("determinant_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DiagGradOpArgumentMapping: - -return KernelSignature("diag_grad", {"X", "Out@GRAD"}, {"offset"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature DiagV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("offset"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("diag_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DiagonalGradOpArgumentMapping: - -return KernelSignature("diagonal_grad", {"Input", "Out@GRAD"}, {"offset", "axis1", "axis2"}, {"Input@GRAD"}); -****************************************************************** -*/ - -KernelSignature DiagonalGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("offset"); - attrs.emplace_back("axis1"); - attrs.emplace_back("axis2"); - paddle::small_vector outputs {"Input@GRAD"}; - return KernelSignature("diagonal_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DigammaGradOpArgumentMapping: - -return KernelSignature("digamma_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature DigammaGradOpArgumentMapping(const 
ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("digamma_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DistGradOpArgumentMapping: - -return KernelSignature("dist_grad", {"X", "Y", "Out", "Out@GRAD"}, {"p"}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature DistGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("p"); - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("dist_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DotGradOpArgumentMapping: - -return KernelSignature("dot_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature DotGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("dot_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EigGradOpArgumentMapping: - -return KernelSignature("eig_grad", {"Eigenvalues", "Eigenvectors", "Eigenvalues@GRAD", "Eigenvectors@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature EigGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Eigenvalues", "Eigenvectors", "Eigenvalues@GRAD", "Eigenvectors@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("eig_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EighGradOpArgumentMapping: - -return KernelSignature("eigh_grad", {"Eigenvalues", "Eigenvectors", "Eigenvalues@GRAD", "Eigenvectors@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature EighGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Eigenvalues", "Eigenvectors", "Eigenvalues@GRAD", "Eigenvectors@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("eigh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EigvalshGradOpArgumentMapping: - -return KernelSignature("eigvalsh_grad", {"Eigenvectors", "Eigenvalues@GRAD"}, {"UPLO", "is_test"}, {"X@GRAD"}); 
-****************************************************************** -*/ - -KernelSignature EigvalshGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Eigenvectors", "Eigenvalues@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("UPLO"); - attrs.emplace_back("is_test"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("eigvalsh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EluDoubleGradOpArgumentMapping: - -return KernelSignature("elu_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {"alpha"}, {"X@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature EluGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; - return KernelSignature("elu_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EluGradOpArgumentMapping: - -return KernelSignature("elu_grad", {"X", "Out", "Out@GRAD"}, {"alpha"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("elu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ErfGradOpArgumentMapping: - -return KernelSignature("erf_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ErfGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("erf_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ErfinvGradOpArgumentMapping: - -return KernelSignature("erfinv_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ErfinvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("erfinv_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ExpGradOpArgumentMapping: - -return KernelSignature("exp_grad", {"Out", 
"Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ExpGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("exp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ExpandAsGradOpArgumentMapping: - -return KernelSignature("expand_as_grad", {"X", "Out@GRAD"}, {"target_shape"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ExpandAsV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("target_shape"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("expand_as_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Expm1GradOpArgumentMapping: - -return KernelSignature("expm1_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Expm1GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("expm1_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FftC2cGradOpArgumentMapping: - -return KernelSignature("fft_c2c_grad", {"Out@GRAD"}, {"axes", "normalization", "forward"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature FftC2cGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axes"); - attrs.emplace_back("normalization"); - attrs.emplace_back("forward"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("fft_c2c_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FftC2rGradOpArgumentMapping: - -return KernelSignature("fft_c2r_grad", {"Out@GRAD"}, {"axes", "normalization", "forward", "last_dim_size"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature FftC2rGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axes"); - attrs.emplace_back("normalization"); - attrs.emplace_back("forward"); - attrs.emplace_back("last_dim_size"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("fft_c2r_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The 
following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FftR2cGradOpArgumentMapping: - -return KernelSignature("fft_r2c_grad", {"X", "Out@GRAD"}, {"axes", "normalization", "forward", "onesided"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature FftR2cGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axes"); - attrs.emplace_back("normalization"); - attrs.emplace_back("forward"); - attrs.emplace_back("onesided"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("fft_r2c_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FillDiagonalGradOpArgumentMapping: - -return KernelSignature("fill_diagonal_grad", {"Out@GRAD"}, {"value", "offset", "wrap"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature FillDiagonalGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("value"); - attrs.emplace_back("offset"); - attrs.emplace_back("wrap"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("fill_diagonal_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FillDiagonalTensorGradOpArgumentMapping: - -return KernelSignature("fill_diagonal_tensor_grad", {"Out@GRAD"}, {"offset", "dim1", "dim2"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature FillDiagonalTensorGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("offset"); - attrs.emplace_back("dim1"); - attrs.emplace_back("dim2"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("fill_diagonal_tensor_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FillGradOpArgumentMapping: - -return KernelSignature("fill_grad", {"Out@GRAD"}, {"value"}, {"X@GRAD"}); -return KernelSignature("fill_grad", {"Out@GRAD"}, {"ValueTensor"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature FillAnyGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("value"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("fill_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FlashAttnGradOpArgumentMapping: - -return KernelSignature("flash_attn_grad", {"q", "k", "v", "out", "softmax_lse", "seed_offset", "attn_mask", "out@GRAD"}, {"dropout", "causal"}, {"q@GRAD", 
"k@GRAD", "v@GRAD"}); -****************************************************************** -*/ - -KernelSignature FlashAttnGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"q", "k", "v", "out", "softmax_lse", "seed_offset", "attn_mask", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dropout"); - attrs.emplace_back("causal"); - paddle::small_vector outputs {"q@GRAD", "k@GRAD", "v@GRAD"}; - return KernelSignature("flash_attn_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FlashAttnUnpaddedGradOpArgumentMapping: - -return KernelSignature("flash_attn_unpadded_grad", {"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k", "out", "softmax_lse", "seed_offset", "attn_mask", "out@GRAD"}, {"max_seqlen_q", "max_seqlen_k", "scale", "dropout", "causal"}, {"q@GRAD", "k@GRAD", "v@GRAD"}); -****************************************************************** -*/ - -KernelSignature FlashAttnUnpaddedGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k", "out", "softmax_lse", "seed_offset", "attn_mask", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("max_seqlen_q"); - attrs.emplace_back("max_seqlen_k"); - attrs.emplace_back("scale"); - attrs.emplace_back("dropout"); - attrs.emplace_back("causal"); - paddle::small_vector outputs {"q@GRAD", "k@GRAD", "v@GRAD"}; - return KernelSignature("flash_attn_unpadded_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FloorGradOpArgumentMapping: - -return KernelSignature("floor_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature FloorGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("floor_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FmaxGradOpArgumentMapping: - -return KernelSignature("fmax_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature ElementwiseFmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("fmax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FminGradOpArgumentMapping: - -return KernelSignature("fmin_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature ElementwiseFminGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - 
paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("fmin_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FoldGradOpArgumentMapping: - -return KernelSignature("fold_grad", {"X", "Y@GRAD"}, {"output_sizes", "kernel_sizes", "strides", "paddings", "dilations"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature FoldGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("output_sizes"); - attrs.emplace_back("kernel_sizes"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("fold_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FrameGradOpArgumentMapping: - -return KernelSignature("frame_grad", {"X", "Out@GRAD"}, {"frame_length", "hop_length", "axis"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature FrameGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("frame_length"); - attrs.emplace_back("hop_length"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("frame_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GatherGradOpArgumentMapping: - -return KernelSignature("gather_grad", {"X", "Index", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); -return KernelSignature("gather_grad", {"X", "Index", "Out@GRAD"}, {"Axis"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature GatherGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Index", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("Axis") ? 
"Axis" : "axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("gather_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GatherNdGradOpArgumentMapping: - -return KernelSignature("gather_nd_grad", {"X", "Index", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature GatherNdGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Index", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("gather_nd_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GaussianInplaceGradOpArgumentMapping: - -return KernelSignature("gaussian_inplace_grad", {"out@GRAD"}, {"mean", "std", "seed"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature GaussianInplaceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("mean"); - attrs.emplace_back("std"); - attrs.emplace_back("seed"); - paddle::small_vector outputs {"x@GRAD"}; - return KernelSignature("gaussian_inplace_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GeluGradOpArgumentMapping: - -return KernelSignature("gelu_grad", {"X", "Out@GRAD"}, {"approximate"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature GeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("approximate"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("gelu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GridSampleGradOpArgumentMapping: - -return KernelSignature("grid_sample_grad", {"X", "Grid", "Output@GRAD"}, {"mode", "padding_mode", "align_corners"}, {"X@GRAD", "Grid@GRAD"}); -****************************************************************** -*/ - -KernelSignature GridSamplerGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Grid", "Output@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("mode"); - attrs.emplace_back("padding_mode"); - attrs.emplace_back("align_corners"); - paddle::small_vector outputs {"X@GRAD", "Grid@GRAD"}; - return KernelSignature("grid_sample_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GroupNormGradOpArgumentMapping: - -return KernelSignature("group_norm_grad", {"X", "Scale", "Bias", "Y", 
"Mean", "Variance", "Y@GRAD"}, {"epsilon", "groups", "data_layout"}, {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); -****************************************************************** -*/ - -KernelSignature GroupNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Scale", "Bias", "Y", "Mean", "Variance", "Y@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - attrs.emplace_back("groups"); - attrs.emplace_back("data_layout"); - paddle::small_vector outputs {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}; - return KernelSignature("group_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GumbelSoftmaxGradOpArgumentMapping: - -return KernelSignature("gumbel_softmax_grad", {"Out", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature GumbelSoftmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("gumbel_softmax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HardshrinkGradOpArgumentMapping: - -return KernelSignature("hard_shrink_grad", {"X", "Out@GRAD"}, {"threshold"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature HardShrinkGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("threshold"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("hard_shrink_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HardsigmoidGradOpArgumentMapping: - -return KernelSignature("hardsigmoid_grad", {"Out", "Out@GRAD"}, {"slope", "offset"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature HardSigmoidGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("slope"); - attrs.emplace_back("offset"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("hardsigmoid_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HardtanhGradOpArgumentMapping: - -return KernelSignature("hardtanh_grad", {"X", "Out@GRAD"}, {"t_min", "t_max"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature BreluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("t_min"); - attrs.emplace_back("t_max"); - paddle::small_vector outputs 
{"X@GRAD"}; - return KernelSignature("hardtanh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HeavisideGradOpArgumentMapping: - -return KernelSignature("heaviside_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature ElementwiseHeavisideGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("heaviside_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HuberLossGradOpArgumentMapping: - -return KernelSignature("huber_loss_grad", {"Residual", "Out@GRAD"}, {"delta"}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature HuberLossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Residual", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("delta"); - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("huber_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by I0GradOpArgumentMapping: - -return KernelSignature("i0_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature I0GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - return KernelSignature("i0_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by I0eGradOpArgumentMapping: - -return KernelSignature("i0e_grad", {"x", "out", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature I0eGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - return KernelSignature("i0e_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by I1GradOpArgumentMapping: - -return KernelSignature("i1_grad", {"x", "out", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature I1GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - return KernelSignature("i1_grad", std::move(inputs), 
std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by I1eGradOpArgumentMapping: - -return KernelSignature("i1e_grad", {"x", "out", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature I1eGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - return KernelSignature("i1e_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ImagGradOpArgumentMapping: - -return KernelSignature("imag_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ImagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("imag_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IndexAddGradOpArgumentMapping: - -return KernelSignature("index_add_grad", {"Index", "AddValue", "Out@GRAD"}, {"axis"}, {"X@GRAD", "AddValue@GRAD"}); -****************************************************************** -*/ - -KernelSignature IndexAddGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Index", "AddValue", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"X@GRAD", "AddValue@GRAD"}; - return KernelSignature("index_add_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IndexPutGradOpArgumentMapping: - -return KernelSignature("index_put_grad", {"x", "indices", "value", "out@GRAD"}, {"accumulate"}, {"x@GRAD", "value@GRAD"}); -****************************************************************** -*/ - -KernelSignature IndexPutGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "indices", "value", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("accumulate"); - paddle::small_vector outputs {"x@GRAD", "value@GRAD"}; - return KernelSignature("index_put_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IndexSampleGradOpArgumentMapping: - -return KernelSignature("index_sample_grad", {"X", "Index", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature IndexSampleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Index", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - 
return KernelSignature("index_sample_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IndexSelectGradOpArgumentMapping: - -return KernelSignature("index_select_grad", {"X", "Index", "Out@GRAD"}, {"dim"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature IndexSelectGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Index", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("index_select_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by IndexSelectStridedGradOpArgumentMapping: - -return KernelSignature("index_select_strided_grad", {"x", "out@GRAD"}, {"index", "axis"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature IndexSelectStridedGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("index"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"x@GRAD"}; - return KernelSignature("index_select_strided_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by InstanceNormDoubleGradOpArgumentMapping: - -return KernelSignature("instance_norm_double_grad", {"x", "fwd_scale", "saved_mean", "saved_variance", "grad_y", "grad_x@GRAD", "grad_scale@GRAD", "grad_bias@GRAD"}, {"epsilon"}, {"x@GRAD", "fwd_scale@GRAD", "grad_y@GRAD"}); -****************************************************************** -*/ - -KernelSignature InstanceNormDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "fwd_scale", "saved_mean", "saved_variance", "grad_y", "grad_x@GRAD", "grad_scale@GRAD", "grad_bias@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - paddle::small_vector outputs {"x@GRAD", "fwd_scale@GRAD", "grad_y@GRAD"}; - return KernelSignature("instance_norm_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by InstanceNormGradOpArgumentMapping: - -return KernelSignature("instance_norm_grad", {"X", "Scale", "SavedMean", "SavedVariance", "Y@GRAD"}, {"epsilon"}, {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); -****************************************************************** -*/ - -KernelSignature InstanceNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Scale", "SavedMean", "SavedVariance", "Y@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - paddle::small_vector outputs {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}; - return KernelSignature("instance_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by InverseGradOpArgumentMapping: - -return KernelSignature("inverse_grad", {"Output", "Output@GRAD"}, {}, {"Input@GRAD"}); -****************************************************************** -*/ - -KernelSignature InverseGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Output", "Output@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Input@GRAD"}; - return KernelSignature("inverse_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by KldivLossGradOpArgumentMapping: - -return KernelSignature("kldiv_loss_grad", {"X", "Target", "Loss@GRAD"}, {"reduction"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature KldivLossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Target", "Loss@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("reduction"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("kldiv_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by KronGradOpArgumentMapping: - -return KernelSignature("kron_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature KronGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("kron_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by KthvalueGradOpArgumentMapping: - -return KernelSignature("kthvalue_grad", {"X", "Indices", "Out@GRAD"}, {"k", "axis", "keepdim"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature KthvalueGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Indices", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("k"); - attrs.emplace_back("axis"); - attrs.emplace_back("keepdim"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("kthvalue_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LabelSmoothGradOpArgumentMapping: - -return KernelSignature("label_smooth_grad", {"Out@GRAD"}, {"epsilon"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature LabelSmoothGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - paddle::small_vector outputs 
{"X@GRAD"}; - return KernelSignature("label_smooth_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LayerNormGradOpArgumentMapping: - -return KernelSignature("layer_norm_grad", {"X", "Scale", "Bias", "Mean", "Variance", "Y@GRAD"}, {"epsilon", "begin_norm_axis"}, {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); -****************************************************************** -*/ - -KernelSignature LayerNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Scale", "Bias", "Mean", "Variance", "Y@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - attrs.emplace_back("begin_norm_axis"); - paddle::small_vector outputs {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}; - return KernelSignature("layer_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LeakyReluDoubleGradOpArgumentMapping: - -return KernelSignature("leaky_relu_double_grad", {"X", "grad_x@GRAD"}, {"alpha"}, {"grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature LeakyReluGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_x@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"grad_out@GRAD"}; - return KernelSignature("leaky_relu_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LeakyReluGradOpArgumentMapping: - -return KernelSignature("leaky_relu_grad", {"X", "Out@GRAD"}, {"alpha"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature LeakyReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("leaky_relu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LerpGradOpArgumentMapping: - -return KernelSignature("lerp_grad", {"X", "Y", "Weight", "Out", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature LerpGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Weight", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("lerp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LgammaGradOpArgumentMapping: - -return KernelSignature("lgamma_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); 
-****************************************************************** -*/ - -KernelSignature LgammaGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("lgamma_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LinearInterpGradOpArgumentMapping: - -return KernelSignature("linear_interp_grad", {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature LinearInterpV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("data_layout"); - attrs.emplace_back("out_d"); - attrs.emplace_back("out_h"); - attrs.emplace_back("out_w"); - attrs.emplace_back("scale"); - attrs.emplace_back("interp_method"); - attrs.emplace_back("align_corners"); - attrs.emplace_back("align_mode"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("linear_interp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Log10GradOpArgumentMapping: - -return KernelSignature("log10_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Log10GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("log10_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Log1pGradOpArgumentMapping: - -return KernelSignature("log1p_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Log1pGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("log1p_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Log2GradOpArgumentMapping: - -return KernelSignature("log2_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Log2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("log2_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogDoubleGradOpArgumentMapping: - -return KernelSignature("log_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {}, {"X@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature LogGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; - return KernelSignature("log_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogGradOpArgumentMapping: - -return KernelSignature("log_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature LogGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("log_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogLossGradOpArgumentMapping: - -return KernelSignature("log_loss_grad", {"Predicted", "Labels", "Loss@GRAD"}, {"epsilon"}, {"Predicted@GRAD"}); -****************************************************************** -*/ - -KernelSignature LogLossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Predicted", "Labels", "Loss@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("epsilon"); - paddle::small_vector outputs {"Predicted@GRAD"}; - return KernelSignature("log_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogSoftmaxGradOpArgumentMapping: - -return KernelSignature("log_softmax_grad", {"Out", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature LogSoftmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("log_softmax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogcumsumexpGradOpArgumentMapping: - -return KernelSignature("logcumsumexp_grad", {"X", "Out", "Out@GRAD"}, {"axis", "flatten", "exclusive", "reverse"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature LogcumsumexpGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - 
attrs.emplace_back("flatten"); - attrs.emplace_back("exclusive"); - attrs.emplace_back("reverse"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("logcumsumexp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogitGradOpArgumentMapping: - -return KernelSignature("logit_grad", {"X", "Out@GRAD"}, {"eps"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature LogitGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("eps"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("logit_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LogsigmoidGradOpArgumentMapping: - -return KernelSignature("logsigmoid_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature LogsigmoidGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("logsigmoid_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LuGradOpArgumentMapping: - -return KernelSignature("lu_grad", {"X", "Out", "Pivots", "Out@GRAD"}, {"pivots"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature LuGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Pivots", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("pivots"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("lu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LuUnpackGradOpArgumentMapping: - -return KernelSignature("lu_unpack_grad", {"X", "Pivots", "L", "U", "Pmat", "L@GRAD", "U@GRAD"}, {"unpack_ludata", "unpack_pivots"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature LuUnpackGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Pivots", "L", "U", "Pmat", "L@GRAD", "U@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("unpack_ludata"); - attrs.emplace_back("unpack_pivots"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("lu_unpack_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MarginCrossEntropyGradOpArgumentMapping: - -return KernelSignature("margin_cross_entropy_grad", {"Logits", "Label", "Softmax", "Loss@GRAD"}, 
{"return_softmax", "ring_id", "rank", "nranks", "margin1", "margin2", "margin3", "scale"}, {"Logits@GRAD"}); -****************************************************************** -*/ - -KernelSignature MarginCrossEntropyGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Logits", "Label", "Softmax", "Loss@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("return_softmax"); - attrs.emplace_back("ring_id"); - attrs.emplace_back("rank"); - attrs.emplace_back("nranks"); - attrs.emplace_back("margin1"); - attrs.emplace_back("margin2"); - attrs.emplace_back("margin3"); - attrs.emplace_back("scale"); - paddle::small_vector outputs {"Logits@GRAD"}; - return KernelSignature("margin_cross_entropy_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MaskedSelectGradOpArgumentMapping: - -return KernelSignature("masked_select_grad", {"X", "Mask", "Y@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature MaskedSelectGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Mask", "Y@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("masked_select_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MatrixPowerGradOpArgumentMapping: - -return KernelSignature("matrix_power_grad", {"X", "Out", "Out@GRAD"}, {"n"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature MatrixPowerGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("n"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("matrix_power_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MaxPool2dWithIndexGradOpArgumentMapping: - -return KernelSignature("max_pool2d_with_index_grad", {"X", "Mask", "Out@GRAD"}, {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature MaxPool2dWithIndexGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Mask", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("global_pooling"); - attrs.emplace_back("adaptive"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("max_pool2d_with_index_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MaxPool3dWithIndexGradOpArgumentMapping: - -return KernelSignature("max_pool3d_with_index_grad", {"X", "Mask", "Out@GRAD"}, {"ksize", 
"strides", "paddings", "global_pooling", "adaptive"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature MaxPool3dWithIndexGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Mask", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("global_pooling"); - attrs.emplace_back("adaptive"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("max_pool3d_with_index_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MaxoutGradOpArgumentMapping: - -return KernelSignature("maxout_grad", {"X", "Out", "Out@GRAD"}, {"groups", "axis"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature MaxoutGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("groups"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("maxout_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MeanAllGradOpArgumentMapping: - -return KernelSignature("mean_all_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature MeanGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("mean_all_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MemoryEfficientAttentionGradOpArgumentMapping: - -return KernelSignature("memory_efficient_attention_grad", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "output", "logsumexp", "seed_and_offset", "output@GRAD"}, {"max_seqlen_q", "max_seqlen_k", "causal", "dropout_p", "scale"}, {"query@GRAD", "key@GRAD", "value@GRAD", "bias@GRAD"}); -return KernelSignature("memory_efficient_attention_grad", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "output", "logsumexp", "seed_and_offset", "output@GRAD"}, {"max_seqlen_q", "MaxSeqlenKTensor", "causal", "dropout_p", "scale"}, {"query@GRAD", "key@GRAD", "value@GRAD", "bias@GRAD"}); -return KernelSignature("memory_efficient_attention_grad", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "output", "logsumexp", "seed_and_offset", "output@GRAD"}, {"MaxSeqlenQTensor", "max_seqlen_k", "causal", "dropout_p", "scale"}, {"query@GRAD", "key@GRAD", "value@GRAD", "bias@GRAD"}); -return KernelSignature("memory_efficient_attention_grad", {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "output", "logsumexp", "seed_and_offset", "output@GRAD"}, {"MaxSeqlenQTensor", "MaxSeqlenKTensor", "causal", "dropout_p", "scale"}, {"query@GRAD", "key@GRAD", "value@GRAD", 
"bias@GRAD"}); -****************************************************************** -*/ - -KernelSignature MemoryEfficientAttentionGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"query", "key", "value", "bias", "cu_seqlens_q", "cu_seqlens_k", "output", "logsumexp", "seed_and_offset", "output@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("MaxSeqlenQTensor") ? "MaxSeqlenQTensor" : "max_seqlen_q"); - attrs.emplace_back(ctx.HasInput("MaxSeqlenKTensor") ? "MaxSeqlenKTensor" : "max_seqlen_k"); - attrs.emplace_back("causal"); - attrs.emplace_back("dropout_p"); - attrs.emplace_back("scale"); - paddle::small_vector outputs {"query@GRAD", "key@GRAD", "value@GRAD", "bias@GRAD"}; - return KernelSignature("memory_efficient_attention_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MeshgridGradOpArgumentMapping: - -return KernelSignature("meshgrid_grad", {"X", "outputs@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature MeshgridGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "outputs@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("meshgrid_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ModeGradOpArgumentMapping: - -return KernelSignature("mode_grad", {"X", "Indices", "Out@GRAD"}, {"axis", "keepdim"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Indices", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("keepdim"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("mode_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MultiDotGradOpArgumentMapping: - -return KernelSignature("multi_dot_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature MultiDotGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("multi_dot_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MultiplexGradOpArgumentMapping: - -return KernelSignature("multiplex_grad", {"Ids", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature MultiplexGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Ids", "Out@GRAD"}; - paddle::small_vector attrs; - 
paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("multiplex_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MvGradOpArgumentMapping: - -return KernelSignature("mv_grad", {"X", "Vec", "Out@GRAD"}, {}, {"X@GRAD", "Vec@GRAD"}); -****************************************************************** -*/ - -KernelSignature MvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Vec", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Vec@GRAD"}; - return KernelSignature("mv_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NanmedianGradOpArgumentMapping: - -return KernelSignature("nanmedian_grad", {"X", "MedianIndex", "Out@GRAD"}, {"axis", "keepdim"}, {"X@GRAD"}); -return KernelSignature("nanmedian_grad", {"X", "MedianIndex", "Out@GRAD"}, {"AxisTensorList", "keepdim"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature NanmedianGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "MedianIndex", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("AxisTensor") - ? "AxisTensor" - : ctx.InputSize("AxisTensorList") > 0 - ? "AxisTensorList" - : "axis"); - attrs.emplace_back("keepdim"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("nanmedian_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NearestInterpGradOpArgumentMapping: - -return KernelSignature("nearest_interp_grad", {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature NearestInterpV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("data_layout"); - attrs.emplace_back("out_d"); - attrs.emplace_back("out_h"); - attrs.emplace_back("out_w"); - attrs.emplace_back("scale"); - attrs.emplace_back("interp_method"); - attrs.emplace_back("align_corners"); - attrs.emplace_back("align_mode"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("nearest_interp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NllLossGradOpArgumentMapping: - -return KernelSignature("nll_loss_grad", {"X", "Label", "Weight", "Total_weight", "Out@GRAD"}, {"ignore_index", "reduction"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature NllLossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", 
"Label", "Weight", "Total_weight", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("ignore_index"); - attrs.emplace_back("reduction"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("nll_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by OverlapAddGradOpArgumentMapping: - -return KernelSignature("overlap_add_grad", {"X", "Out@GRAD"}, {"hop_length", "axis"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature OverlapAddGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("hop_length"); - attrs.emplace_back("axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("overlap_add_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PNormGradOpArgumentMapping: - -return KernelSignature("p_norm_grad", {"X", "Out", "Out@GRAD"}, {"porder", "axis", "epsilon", "keepdim", "asvector"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature PNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("porder"); - attrs.emplace_back("axis"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("keepdim"); - attrs.emplace_back("asvector"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("p_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Pad3dDoubleGradOpArgumentMapping: - -return KernelSignature("pad3d", {"grad_x@GRAD"}, {"paddings", "mode", "value", "data_format"}, {"grad_out@GRAD"}); -return KernelSignature("pad3d", {"grad_x@GRAD"}, {"Paddings", "mode", "value", "data_format"}, {"grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature Pad3dDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"grad_x@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("Paddings") - ? 
"Paddings" - : "paddings"); - - attrs.emplace_back("mode"); - attrs.emplace_back("value"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"grad_out@GRAD"}; - return KernelSignature("pad3d", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Pad3dGradOpArgumentMapping: - -return KernelSignature("pad3d_grad", {"X", "Out@GRAD"}, {"paddings", "mode", "value", "data_format"}, {"X@GRAD"}); -return KernelSignature("pad3d_grad", {"X", "Out@GRAD"}, {"Paddings", "mode", "value", "data_format"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Pad3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("Paddings") - ? "Paddings" - : "paddings"); - - attrs.emplace_back("mode"); - attrs.emplace_back("value"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("pad3d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PixelShuffleGradOpArgumentMapping: - -return KernelSignature("pixel_shuffle_grad", {"Out@GRAD"}, {"upscale_factor", "data_format"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature PixelShuffleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("upscale_factor"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("pixel_shuffle_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PixelUnshuffleGradOpArgumentMapping: - -return KernelSignature("pixel_unshuffle_grad", {"Out@GRAD"}, {"downscale_factor", "data_format"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature PixelUnshuffleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("downscale_factor"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("pixel_unshuffle_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PoissonGradOpArgumentMapping: - -return KernelSignature("poisson_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature PoissonGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("poisson_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PolygammaGradOpArgumentMapping: - -return KernelSignature("polygamma_grad", {"x", "out@GRAD"}, {"n"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature PolygammaGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("n"); - paddle::small_vector outputs {"x@GRAD"}; - return KernelSignature("polygamma_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PowDoubleGradOpArgumentMapping: - -return KernelSignature("pow_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {"factor"}, {"X@GRAD", "grad_out@GRAD"}); -return KernelSignature("pow_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {"FactorTensor"}, {"X@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature PowDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("FactorTensor") ? "FactorTensor" : "factor"); - paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; - return KernelSignature("pow_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PowGradOpArgumentMapping: - -return KernelSignature("pow_grad", {"X", "Out@GRAD"}, {"factor"}, {"X@GRAD"}); -return KernelSignature("pow_grad", {"X", "Out@GRAD"}, {"FactorTensor"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature PowGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("FactorTensor") ? "FactorTensor" : "factor"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("pow_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PowTripleGradOpArgumentMapping: - -return KernelSignature("pow_triple_grad", {"X", "grad_out", "grad_grad_x", "grad_x@GRAD", "grad_grad_out@GRAD"}, {"factor"}, {"X@GRAD", "grad_out@GRAD", "grad_grad_x@GRAD"}); -return KernelSignature("pow_triple_grad", {"X", "grad_out", "grad_grad_x", "grad_x@GRAD", "grad_grad_out@GRAD"}, {"FactorTensor"}, {"X@GRAD", "grad_out@GRAD", "grad_grad_x@GRAD"}); -****************************************************************** -*/ - -KernelSignature PowTripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out", "grad_grad_x", "grad_x@GRAD", "grad_grad_out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("FactorTensor") ? 
"FactorTensor" : "factor"); - paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD", "grad_grad_x@GRAD"}; - return KernelSignature("pow_triple_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PreluGradOpArgumentMapping: - -return KernelSignature("prelu_grad", {"X", "Alpha", "Out@GRAD"}, {"data_format", "mode"}, {"X@GRAD", "Alpha@GRAD"}); -****************************************************************** -*/ - -KernelSignature PreluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Alpha", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("data_format"); - attrs.emplace_back("mode"); - paddle::small_vector outputs {"X@GRAD", "Alpha@GRAD"}; - return KernelSignature("prelu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PsroiPoolGradOpArgumentMapping: - -return KernelSignature("psroi_pool_grad", {"X", "ROIs", "RoisNum", "Out@GRAD"}, {"pooled_height", "pooled_width", "output_channels", "spatial_scale"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature PsroiPoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "ROIs", "RoisNum", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("pooled_height"); - attrs.emplace_back("pooled_width"); - attrs.emplace_back("output_channels"); - attrs.emplace_back("spatial_scale"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("psroi_pool_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PutAlongAxisGradOpArgumentMapping: - -return KernelSignature("put_along_axis_grad", {"Input", "Index", "Result@GRAD"}, {"Axis", "Reduce"}, {"Input@GRAD", "Value@GRAD"}); -****************************************************************** -*/ - -KernelSignature PutAlongAxisGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Index", "Result@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("Axis"); - attrs.emplace_back("Reduce"); - paddle::small_vector outputs {"Input@GRAD", "Value@GRAD"}; - return KernelSignature("put_along_axis_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by QrGradOpArgumentMapping: - -return KernelSignature("qr_grad", {"X", "Q", "R", "Q@GRAD", "R@GRAD"}, {"mode"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature QrGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Q", "R", "Q@GRAD", "R@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("mode"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("qr_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RealGradOpArgumentMapping: - -return KernelSignature("real_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature RealGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("real_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ReciprocalGradOpArgumentMapping: - -return KernelSignature("reciprocal_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ReciprocalGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("reciprocal_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Relu6GradOpArgumentMapping: - -return KernelSignature("relu6_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Relu6GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("relu6_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ReluDoubleGradOpArgumentMapping: - -return KernelSignature("relu_double_grad", {"Out", "grad_x@GRAD"}, {}, {"grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature ReluGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "grad_x@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"grad_out@GRAD"}; - return KernelSignature("relu_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ReluGradOpArgumentMapping: - -return KernelSignature("relu_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("relu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures 
returned by RenormGradOpArgumentMapping: - -return KernelSignature("renorm_grad", {"X", "Out@GRAD"}, {"p", "axis", "max_norm"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature RenormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("p"); - attrs.emplace_back("axis"); - attrs.emplace_back("max_norm"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("renorm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RoiAlignGradOpArgumentMapping: - -return KernelSignature("roi_align_grad", {"X", "ROIs", "RoisNum", "Out@GRAD"}, {"pooled_height", "pooled_width", "spatial_scale", "sampling_ratio", "aligned"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature RoiAlignGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "ROIs", "RoisNum", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("pooled_height"); - attrs.emplace_back("pooled_width"); - attrs.emplace_back("spatial_scale"); - attrs.emplace_back("sampling_ratio"); - attrs.emplace_back("aligned"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("roi_align_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RoiPoolGradOpArgumentMapping: - -return KernelSignature("roi_pool_grad", {"X", "ROIs", "RoisNum", "Argmax", "Out@GRAD"}, {"pooled_height", "pooled_width", "spatial_scale"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature RoiPoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "ROIs", "RoisNum", "Argmax", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("pooled_height"); - attrs.emplace_back("pooled_width"); - attrs.emplace_back("spatial_scale"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("roi_pool_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RollGradOpArgumentMapping: - -return KernelSignature("roll_grad", {"X", "Out@GRAD"}, {"shifts", "axis"}, {"X@GRAD"}); -return KernelSignature("roll_grad", {"X", "Out@GRAD"}, {"ShiftsTensor", "axis"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("ShiftsTensor") - ? 
"ShiftsTensor" - : "shifts"); - - attrs.emplace_back("axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("roll_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RoundGradOpArgumentMapping: - -return KernelSignature("round_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature RoundGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("round_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RsqrtDoubleGradOpArgumentMapping: - -return KernelSignature("rsqrt_double_grad", {"Out", "grad_x", "grad_x@GRAD"}, {}, {"Out@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature RsqrtGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "grad_x", "grad_x@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out@GRAD", "grad_out@GRAD"}; - return KernelSignature("rsqrt_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RsqrtGradOpArgumentMapping: - -return KernelSignature("rsqrt_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature RsqrtGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("rsqrt_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ScatterGradOpArgumentMapping: - -return KernelSignature("scatter_grad", {"Ids", "Updates", "Out@GRAD"}, {"overwrite"}, {"X@GRAD", "Updates@GRAD"}); -****************************************************************** -*/ - -KernelSignature ScatterGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Ids", "Updates", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("overwrite"); - paddle::small_vector outputs {"X@GRAD", "Updates@GRAD"}; - return KernelSignature("scatter_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ScatterNdAddGradOpArgumentMapping: - -return KernelSignature("scatter_nd_add_grad", {"Index", "Updates", "Out@GRAD"}, {}, {"X@GRAD", "Updates@GRAD"}); -****************************************************************** -*/ - -KernelSignature ScatterNdAddGradOpArgumentMapping(const 
ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Index", "Updates", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Updates@GRAD"}; - return KernelSignature("scatter_nd_add_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SegmentPoolGradOpArgumentMapping: - -return KernelSignature("segment_pool_grad", {"X", "SegmentIds", "Out", "SummedIds", "Out@GRAD"}, {"pooltype"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SegmentPoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "SegmentIds", "Out", "SummedIds", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("pooltype"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("segment_pool_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SeluGradOpArgumentMapping: - -return KernelSignature("selu_grad", {"Out", "Out@GRAD"}, {"scale", "alpha"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("scale"); - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("selu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SendURecvGradOpArgumentMapping: - -return KernelSignature("send_u_recv_grad", {"X", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, {"reduce_op"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature GraphSendRecvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("reduce_op"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("send_u_recv_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SendUeRecvGradOpArgumentMapping: - -return KernelSignature("send_ue_recv_grad", {"X", "Y", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, {"message_op", "reduce_op"}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature GraphSendUeRecvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("message_op"); - attrs.emplace_back("reduce_op"); - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("send_ue_recv_grad", std::move(inputs), 
std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SendUvGradOpArgumentMapping: - -return KernelSignature("send_uv_grad", {"x", "y", "src_index", "dst_index", "out@GRAD"}, {"message_op"}, {"x@GRAD", "y@GRAD"}); -****************************************************************** -*/ - -KernelSignature GraphSendUvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "src_index", "dst_index", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("message_op"); - paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; - return KernelSignature("send_uv_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SigmoidCrossEntropyWithLogitsGradOpArgumentMapping: - -return KernelSignature("sigmoid_cross_entropy_with_logits_grad", {"X", "Label", "pos_weight", "Out@GRAD"}, {"normalize", "ignore_index"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SigmoidCrossEntropyWithLogitsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Label", "pos_weight", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("normalize"); - attrs.emplace_back("ignore_index"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("sigmoid_cross_entropy_with_logits_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SigmoidDoubleGradOpArgumentMapping: - -return KernelSignature("sigmoid_double_grad", {"Out", "fwd_grad_out", "grad_x@GRAD"}, {}, {"Out@GRAD", "fwd_grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature SigmoidGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "fwd_grad_out", "grad_x@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out@GRAD", "fwd_grad_out@GRAD"}; - return KernelSignature("sigmoid_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SigmoidGradOpArgumentMapping: - -return KernelSignature("sigmoid_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SigmoidGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("sigmoid_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SigmoidTripleGradOpArgumentMapping: - -return KernelSignature("sigmoid_triple_grad", {"Out", "fwd_grad_out", "grad_grad_x", "grad_out@GRAD", 
"grad_grad_out@GRAD"}, {}, {"Out@GRAD", "fwd_grad_out@GRAD", "grad_grad_x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SigmoidTripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "fwd_grad_out", "grad_grad_x", "grad_out@GRAD", "grad_grad_out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out@GRAD", "fwd_grad_out@GRAD", "grad_grad_x@GRAD"}; - return KernelSignature("sigmoid_triple_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SiluGradOpArgumentMapping: - -return KernelSignature("silu_grad", {"X", "Out", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SiluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("silu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SinDoubleGradOpArgumentMapping: - -return KernelSignature("sin_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {}, {"X@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature SinDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; - return KernelSignature("sin_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SinGradOpArgumentMapping: - -return KernelSignature("sin_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("sin_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SinTripleGradOpArgumentMapping: - -return KernelSignature("sin_triple_grad", {"X", "grad_out_forward", "grad_x_grad_forward", "grad_x@GRAD", "grad_out_grad@GRAD"}, {}, {"X@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}); -****************************************************************** -*/ - -KernelSignature SinTripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out_forward", "grad_x_grad_forward", "grad_x@GRAD", "grad_out_grad@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}; - return KernelSignature("sin_triple_grad", std::move(inputs), 
std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SinhGradOpArgumentMapping: - -return KernelSignature("sinh_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SinhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("sinh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SlogdetGradOpArgumentMapping: - -return KernelSignature("slogdet_grad", {"Input", "Out", "Out@GRAD"}, {}, {"Input@GRAD"}); -****************************************************************** -*/ - -KernelSignature SlogdeterminantGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Input@GRAD"}; - return KernelSignature("slogdet_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SoftplusDoubleGradOpArgumentMapping: - -return KernelSignature("softplus_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {"beta", "threshold"}, {"X@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature SoftplusDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("beta"); - attrs.emplace_back("threshold"); - paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; - return KernelSignature("softplus_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SoftplusGradOpArgumentMapping: - -return KernelSignature("softplus_grad", {"X", "Out@GRAD"}, {"beta", "threshold"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SoftplusGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("beta"); - attrs.emplace_back("threshold"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("softplus_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SoftshrinkGradOpArgumentMapping: - -return KernelSignature("softshrink_grad", {"X", "Out@GRAD"}, {"lambda"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SoftshrinkGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - 
paddle::small_vector attrs; - attrs.emplace_back("lambda"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("softshrink_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SoftsignGradOpArgumentMapping: - -return KernelSignature("softsign_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SoftsignGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("softsign_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SolveGradOpArgumentMapping: - -return KernelSignature("solve_grad", {"X", "Y", "Out", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature SolveGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("solve_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SpectralNormGradOpArgumentMapping: - -return KernelSignature("spectral_norm_grad", {"Weight", "U", "V", "Out@GRAD"}, {"dim", "power_iters", "eps"}, {"Weight@GRAD"}); -****************************************************************** -*/ - -KernelSignature SpectralNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Weight", "U", "V", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - attrs.emplace_back("power_iters"); - attrs.emplace_back("eps"); - paddle::small_vector outputs {"Weight@GRAD"}; - return KernelSignature("spectral_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SqrtDoubleGradOpArgumentMapping: - -return KernelSignature("sqrt_double_grad", {"Out", "grad_x", "grad_x@GRAD"}, {}, {"Out@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature SqrtGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "grad_x", "grad_x@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out@GRAD", "grad_out@GRAD"}; - return KernelSignature("sqrt_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SqrtGradOpArgumentMapping: - -return KernelSignature("sqrt_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); 
-****************************************************************** -*/ - -KernelSignature SqrtGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("sqrt_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SquareDoubleGradOpArgumentMapping: - -return KernelSignature("square_double_grad", {"X", "grad_out", "grad_x@GRAD"}, {}, {"X@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature SquareGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "grad_out", "grad_x@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "grad_out@GRAD"}; - return KernelSignature("square_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SquareGradOpArgumentMapping: - -return KernelSignature("square_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SquareGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("square_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SquaredL2NormGradOpArgumentMapping: - -return KernelSignature("squared_l2_norm_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SquaredL2NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("squared_l2_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SqueezeGradOpArgumentMapping: - -return KernelSignature("squeeze_grad", {"XShape", "Out@GRAD"}, {"axes"}, {"X@GRAD"}); -return KernelSignature("squeeze_grad", {"XShape", "Out@GRAD"}, {"AxisTensor"}, {"X@GRAD"}); -return KernelSignature("squeeze_grad", {"XShape", "Out@GRAD"}, {"AxisTensorList"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Squeeze2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"XShape", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axes"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("squeeze_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 
'get_compat_kernel_signature.py' -All possible KernelSignatures returned by StackGradOpArgumentMapping: - -return KernelSignature("stack_grad", {"Y@GRAD"}, {"axis"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature StackGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Y@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("stack_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by StanhGradOpArgumentMapping: - -return KernelSignature("stanh_grad", {"X", "Out@GRAD"}, {"scale_a", "scale_b"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature StanhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("scale_a"); - attrs.emplace_back("scale_b"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("stanh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SvdGradOpArgumentMapping: - -return KernelSignature("svd_grad", {"X", "U", "VH", "S", "U@GRAD", "VH@GRAD", "S@GRAD"}, {"full_matrices"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SvdGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "U", "VH", "S", "U@GRAD", "VH@GRAD", "S@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("full_matrices"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("svd_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TakeAlongAxisGradOpArgumentMapping: - -return KernelSignature("take_along_axis_grad", {"Input", "Index", "Result@GRAD"}, {"Axis"}, {"Input@GRAD"}); -****************************************************************** -*/ - -KernelSignature TakeAlongAxisGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Index", "Result@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("Axis"); - paddle::small_vector outputs {"Input@GRAD"}; - return KernelSignature("take_along_axis_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TanGradOpArgumentMapping: - -return KernelSignature("tan_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature TanGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("tan_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TanhDoubleGradOpArgumentMapping: - -return KernelSignature("tanh_double_grad", {"Out", "grad_out", "grad_x@GRAD"}, {}, {"Out@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature TanhGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "grad_out", "grad_x@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out@GRAD", "grad_out@GRAD"}; - return KernelSignature("tanh_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TanhGradOpArgumentMapping: - -return KernelSignature("tanh_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature TanhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("tanh_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TanhShrinkGradOpArgumentMapping: - -return KernelSignature("tanh_shrink_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature TanhShrinkGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("tanh_shrink_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TanhTripleGradOpArgumentMapping: - -return KernelSignature("tanh_triple_grad", {"Out", "grad_out_forward", "grad_x_grad_forward", "grad_out_new@GRAD", "grad_out_grad@GRAD"}, {}, {"Out@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}); -****************************************************************** -*/ - -KernelSignature TanhTripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "grad_out_forward", "grad_x_grad_forward", "grad_out_new@GRAD", "grad_out_grad@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out@GRAD", "grad_out_forward@GRAD", "grad_x_grad_forward@GRAD"}; - return KernelSignature("tanh_triple_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TemporalShiftGradOpArgumentMapping: - -return KernelSignature("temporal_shift_grad", {"Out@GRAD"}, {"seg_num", "shift_ratio", "data_format"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature TemporalShiftGradOpArgumentMapping(const ArgumentMappingContext& 
ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("seg_num"); - attrs.emplace_back("shift_ratio"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("temporal_shift_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TensorUnfoldGradOpArgumentMapping: - -return KernelSignature("tensor_unfold_grad", {"input", "out@GRAD"}, {"axis", "size", "step"}, {"input@GRAD"}); -****************************************************************** -*/ - -KernelSignature TensorUnfoldGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("size"); - attrs.emplace_back("step"); - paddle::small_vector outputs {"input@GRAD"}; - return KernelSignature("tensor_unfold_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ThresholdedReluGradOpArgumentMapping: - -return KernelSignature("thresholded_relu_grad", {"X", "Out@GRAD"}, {"threshold"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ThresholdedReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("threshold"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("thresholded_relu_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TopkGradOpArgumentMapping: - -return KernelSignature("topk_grad", {"X", "Indices", "Out@GRAD"}, {"k", "axis", "largest", "sorted"}, {"X@GRAD"}); -return KernelSignature("topk_grad", {"X", "Indices", "Out@GRAD"}, {"K", "axis", "largest", "sorted"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature TopKV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Indices", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("K") ? 
"K" : "k"); - attrs.emplace_back("axis"); - attrs.emplace_back("largest"); - attrs.emplace_back("sorted"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("topk_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TraceGradOpArgumentMapping: - -return KernelSignature("trace_grad", {"Input", "Out@GRAD"}, {"offset", "axis1", "axis2"}, {"Input@GRAD"}); -****************************************************************** -*/ - -KernelSignature TraceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("offset"); - attrs.emplace_back("axis1"); - attrs.emplace_back("axis2"); - paddle::small_vector outputs {"Input@GRAD"}; - return KernelSignature("trace_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TriangularSolveGradOpArgumentMapping: - -return KernelSignature("triangular_solve_grad", {"X", "Y", "Out", "Out@GRAD"}, {"upper", "transpose", "unitriangular"}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature TriangularSolveGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("upper"); - attrs.emplace_back("transpose"); - attrs.emplace_back("unitriangular"); - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("triangular_solve_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TrilinearInterpGradOpArgumentMapping: - -return KernelSignature("trilinear_interp_grad", {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}, {"data_layout", "out_d", "out_h", "out_w", "scale", "interp_method", "align_corners", "align_mode"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature TrilinearInterpV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "OutSize", "SizeTensor", "Scale", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("data_layout"); - attrs.emplace_back("out_d"); - attrs.emplace_back("out_h"); - attrs.emplace_back("out_w"); - attrs.emplace_back("scale"); - attrs.emplace_back("interp_method"); - attrs.emplace_back("align_corners"); - attrs.emplace_back("align_mode"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("trilinear_interp_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TruncGradOpArgumentMapping: - -return KernelSignature("trunc_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature TruncGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - 
paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("trunc_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UnfoldGradOpArgumentMapping: - -return KernelSignature("unfold_grad", {"X", "Y@GRAD"}, {"kernel_sizes", "strides", "paddings", "dilations"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature UnfoldGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("kernel_sizes"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("unfold_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UniformInplaceGradOpArgumentMapping: - -return KernelSignature("uniform_inplace_grad", {"Out@GRAD"}, {"min", "max", "seed", "diag_num", "diag_step", "diag_val"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature UniformRandomInplaceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("min"); - attrs.emplace_back("max"); - attrs.emplace_back("seed"); - attrs.emplace_back("diag_num"); - attrs.emplace_back("diag_step"); - attrs.emplace_back("diag_val"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("uniform_inplace_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UnsqueezeGradOpArgumentMapping: - -return KernelSignature("unsqueeze_grad", {"XShape", "Out@GRAD"}, {"axes"}, {"X@GRAD"}); -return KernelSignature("unsqueeze_grad", {"XShape", "Out@GRAD"}, {"AxesTensor"}, {"X@GRAD"}); -return KernelSignature("unsqueeze_grad", {"XShape", "Out@GRAD"}, {"AxesTensorList"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Unsqueeze2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"XShape", "Out@GRAD"}; - paddle::small_vector attrs; - - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("unsqueeze_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UnstackGradOpArgumentMapping: - -return KernelSignature("unstack_grad", {"Y@GRAD"}, {"axis"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature UnstackGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Y@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("unstack_grad", 
std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ViewDtypeGradOpArgumentMapping: - -return KernelSignature("view_dtype_grad", {"input", "out@GRAD"}, {"dtype"}, {"input@GRAD"}); -****************************************************************** -*/ - -KernelSignature ViewDtypeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"input@GRAD"}; - return KernelSignature("view_dtype_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ViewShapeGradOpArgumentMapping: - -return KernelSignature("view_shape_grad", {"input", "out@GRAD"}, {"dims"}, {"input@GRAD"}); -****************************************************************** -*/ - -KernelSignature ViewShapeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dims"); - paddle::small_vector outputs {"input@GRAD"}; - return KernelSignature("view_shape_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WarpctcGradOpArgumentMapping: - -return KernelSignature("warpctc_grad", {"Logits", "LogitsLength", "WarpCTCGrad", "Loss@GRAD"}, {"blank", "norm_by_times"}, {"Logits@GRAD"}); -****************************************************************** -*/ - -KernelSignature WarpctcGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Logits", "LogitsLength", "WarpCTCGrad", "Loss@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("blank"); - attrs.emplace_back("norm_by_times"); - paddle::small_vector outputs {"Logits@GRAD"}; - return KernelSignature("warpctc_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WarprnntGradOpArgumentMapping: - -return KernelSignature("warprnnt_grad", {"input", "input_lengths", "warprnntgrad", "loss@GRAD"}, {"blank", "fastemit_lambda"}, {"input@GRAD"}); -****************************************************************** -*/ - -KernelSignature WarprnntGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input", "input_lengths", "warprnntgrad", "loss@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("blank"); - attrs.emplace_back("fastemit_lambda"); - paddle::small_vector outputs {"input@GRAD"}; - return KernelSignature("warprnnt_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WeightOnlyLinearGradOpArgumentMapping: - -return KernelSignature("weight_only_linear_grad", {"x", "weight", "bias", 
"weight_scale", "out@GRAD"}, {"weight_dtype", "arch"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature WeightOnlyLinearGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "weight", "bias", "weight_scale", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("weight_dtype"); - attrs.emplace_back("arch"); - paddle::small_vector outputs {"x@GRAD"}; - return KernelSignature("weight_only_linear_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by WhereGradOpArgumentMapping: - -return KernelSignature("where_grad", {"Condition", "X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature WhereGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Condition", "X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("where_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by YoloLossGradOpArgumentMapping: - -return KernelSignature("yolo_loss_grad", {"X", "GTBox", "GTLabel", "GTScore", "ObjectnessMask", "GTMatchMask", "Loss@GRAD"}, {"anchors", "anchor_mask", "class_num", "ignore_thresh", "downsample_ratio", "use_label_smooth", "scale_x_y"}, {"X@GRAD", "GTBox@GRAD", "GTLabel@GRAD", "GTScore@GRAD"}); -****************************************************************** -*/ - -KernelSignature Yolov3LossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "GTBox", "GTLabel", "GTScore", "ObjectnessMask", "GTMatchMask", "Loss@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("anchors"); - attrs.emplace_back("anchor_mask"); - attrs.emplace_back("class_num"); - attrs.emplace_back("ignore_thresh"); - attrs.emplace_back("downsample_ratio"); - attrs.emplace_back("use_label_smooth"); - attrs.emplace_back("scale_x_y"); - paddle::small_vector outputs {"X@GRAD", "GTBox@GRAD", "GTLabel@GRAD", "GTScore@GRAD"}; - return KernelSignature("yolo_loss_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Unpool3dGradOpArgumentMapping: - -return KernelSignature("unpool3d_grad", {"X", "Indices", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "output_size", "data_format"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Unpool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Indices", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_size"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("unpool3d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -} // namespace phi - 
-PD_REGISTER_ARG_MAPPING_FN(abs, phi::AbsOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(accuracy, phi::AccuracyOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(acos, phi::AcosOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(acosh, phi::AcoshOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(adagrad, phi::AdagradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(adamax, phi::AdamaxOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(adamw, phi::AdamwOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(addmm, phi::AddmmOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(affine_grid, phi::AffineGridOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(allclose, phi::AllcloseOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(angle, phi::AngleOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(arg_max, argmax); -PD_REGISTER_ARG_MAPPING_FN(arg_max, phi::ArgMaxOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(arg_min, argmin); -PD_REGISTER_ARG_MAPPING_FN(arg_min, phi::ArgMinOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(argsort, phi::ArgsortOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(as_complex, phi::AsComplexOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(as_real, phi::AsRealOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(as_strided, phi::AsStridedOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(asin, phi::AsinOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(asinh, phi::AsinhOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(atan, phi::AtanOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(atan2, phi::Atan2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(atanh, phi::AtanhOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(auc, phi::AucOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(average_accumulates, phi::AverageAccumulatesOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bce_loss, phi::BceLossOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bernoulli, phi::BernoulliOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(bicubic_interp_v2, bicubic_interp); -PD_REGISTER_ARG_MAPPING_FN(bicubic_interp_v2, phi::BicubicInterpV2OpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(bilinear_tensor_product, bilinear); -PD_REGISTER_ARG_MAPPING_FN(bilinear_tensor_product, phi::BilinearTensorProductOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(bilinear_interp_v2, bilinear_interp); -PD_REGISTER_ARG_MAPPING_FN(bilinear_interp_v2, phi::BilinearInterpV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bincount, phi::BincountOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bitwise_and, phi::BitwiseAndOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bitwise_not, phi::BitwiseNotOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bitwise_or, phi::BitwiseOrOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bitwise_xor, phi::BitwiseXorOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bmm, phi::BmmOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(box_coder, phi::BoxCoderOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors, phi::BroadcastTensorsOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(ceil, phi::CeilOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(celu, phi::CeluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(check_finite_and_unscale, phi::CheckFiniteAndUnscaleOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(check_numerics, phi::CheckNumericsOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cholesky, phi::CholeskyOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cholesky_solve, phi::CholeskySolveOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(class_center_sample, phi::ClassCenterSampleOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(clip, 
phi::ClipOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(clip_by_norm, phi::ClipByNormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(coalesce_tensor, phi::CoalesceTensorOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(complex, phi::ComplexOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(concat, phi::ConcatOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conj, phi::ConjOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv3d, phi::Conv3dOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv3d_transpose, phi::Conv3dTransposeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cos, phi::CosOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cosh, phi::CoshOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(crop_tensor, crop); -PD_REGISTER_ARG_MAPPING_FN(crop_tensor, phi::CropTensorOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cross, phi::CrossOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(softmax_with_cross_entropy, cross_entropy_with_softmax); -PD_REGISTER_ARG_MAPPING_FN(softmax_with_cross_entropy, phi::SoftmaxWithCrossEntropyOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cummax, phi::CummaxOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cummin, phi::CumminOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cumprod, phi::CumprodOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cumsum, phi::CumsumOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(data, phi::DataOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d, phi::DepthwiseConv2dOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(determinant, det); -PD_REGISTER_ARG_MAPPING_FN(determinant, phi::DeterminantOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag); -PD_REGISTER_ARG_MAPPING_FN(diag_v2, phi::DiagV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(diag_embed, phi::DiagEmbedOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(diagonal, phi::DiagonalOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(digamma, phi::DigammaOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(dirichlet, phi::DirichletOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(dist, phi::DistOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(dot, phi::DotOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(edit_distance, phi::EditDistanceOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(eig, phi::EigOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(eigh, phi::EighOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(eigvals, phi::EigvalsOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(eigvalsh, phi::EigvalshOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(equal_all, phi::EqualAllOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(erf, phi::ErfOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(erfinv, phi::ErfinvOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(exp, phi::ExpOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(expand_as_v2, expand_as); -PD_REGISTER_ARG_MAPPING_FN(expand_as_v2, phi::ExpandAsV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(expm1, phi::Expm1OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fft_c2c, phi::FftC2cOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fft_c2r, phi::FftC2rOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fft_r2c, phi::FftR2cOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(fill_any, fill); -PD_REGISTER_ARG_MAPPING_FN(fill_any, phi::FillAnyOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fill_diagonal, phi::FillDiagonalOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fill_diagonal_tensor, phi::FillDiagonalTensorOpArgumentMapping); 
-PD_REGISTER_ARG_MAPPING_FN(flash_attn, phi::FlashAttnOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(flash_attn_unpadded, phi::FlashAttnUnpaddedOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(flip, phi::FlipOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(floor, phi::FloorOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fold, phi::FoldOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(frame, phi::FrameOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(full_int_array, phi::FullIntArrayOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gather, phi::GatherOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gather_nd, phi::GatherNdOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gather_tree, phi::GatherTreeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gaussian_inplace, phi::GaussianInplaceOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gelu, phi::GeluOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(generate_proposals_v2, generate_proposals); -PD_REGISTER_ARG_MAPPING_FN(generate_proposals_v2, phi::GenerateProposalsV2OpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(grid_sampler, grid_sample); -PD_REGISTER_ARG_MAPPING_FN(grid_sampler, phi::GridSamplerOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(group_norm, phi::GroupNormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gumbel_softmax, phi::GumbelSoftmaxOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(hard_shrink, hardshrink); -PD_REGISTER_ARG_MAPPING_FN(hard_shrink, phi::HardShrinkOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(hard_sigmoid, hardsigmoid); -PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid, phi::HardSigmoidOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(brelu, hardtanh); -PD_REGISTER_ARG_MAPPING_FN(brelu, phi::BreluOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(elementwise_heaviside, heaviside); -PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside, phi::ElementwiseHeavisideOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(histogram, phi::HistogramOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(huber_loss, phi::HuberLossOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(i0, phi::I0OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(i0e, phi::I0eOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(i1, phi::I1OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(i1e, phi::I1eOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(imag, phi::ImagOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(index_add, phi::IndexAddOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(index_put, phi::IndexPutOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(index_sample, phi::IndexSampleOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(index_select, phi::IndexSelectOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(index_select_strided, phi::IndexSelectStridedOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(inverse, phi::InverseOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(is_empty, phi::IsEmptyOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(isclose, phi::IscloseOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(isfinite_v2, isfinite); -PD_REGISTER_ARG_MAPPING_FN(isfinite_v2, phi::IsfiniteV2OpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(isinf_v2, isinf); -PD_REGISTER_ARG_MAPPING_FN(isinf_v2, phi::IsinfV2OpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(isnan_v2, isnan); -PD_REGISTER_ARG_MAPPING_FN(isnan_v2, phi::IsnanV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(kldiv_loss, phi::KldivLossOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(kron, phi::KronOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(kthvalue, 
phi::KthvalueOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(label_smooth, phi::LabelSmoothOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(lamb, phi::LambOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(layer_norm, phi::LayerNormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(lerp, phi::LerpOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(lgamma, phi::LgammaOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(linear_interp_v2, linear_interp); -PD_REGISTER_ARG_MAPPING_FN(linear_interp_v2, phi::LinearInterpV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(llm_int8_linear, phi::LlmInt8LinearOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log, phi::LogOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log10, phi::Log10OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log1p, phi::Log1pOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log2, phi::Log2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log_loss, phi::LogLossOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log_softmax, phi::LogSoftmaxOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logcumsumexp, phi::LogcumsumexpOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logical_and, phi::LogicalAndOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logical_not, phi::LogicalNotOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logical_or, phi::LogicalOrOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logical_xor, phi::LogicalXorOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logit, phi::LogitOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logsigmoid, phi::LogsigmoidOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(lstsq, phi::LstsqOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(lu, phi::LuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(lu_unpack, phi::LuUnpackOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(margin_cross_entropy, phi::MarginCrossEntropyOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(masked_multihead_attention, phi::MaskedMultiheadAttentionOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(masked_select, phi::MaskedSelectOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(matrix_nms, phi::MatrixNmsOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(matrix_power, phi::MatrixPowerOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index, phi::MaxPool2dWithIndexOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index, phi::MaxPool3dWithIndexOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(maxout, phi::MaxoutOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(mean, mean_all); -PD_REGISTER_ARG_MAPPING_FN(mean, phi::MeanOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(memory_efficient_attention, phi::MemoryEfficientAttentionOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(merge_selected_rows, phi::MergeSelectedRowsOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(merged_adam, phi::MergedAdamOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(merged_momentum, phi::MergedMomentumOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(meshgrid, phi::MeshgridOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(momentum, phi::MomentumOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(multi_dot, phi::MultiDotOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(multiclass_nms3, phi::MulticlassNms3OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(multinomial, phi::MultinomialOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(multiplex, phi::MultiplexOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(mv, phi::MvOpArgumentMapping); 
-PD_REGISTER_ARG_MAPPING_FN(nanmedian, phi::NanmedianOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(nearest_interp_v2, nearest_interp); -PD_REGISTER_ARG_MAPPING_FN(nearest_interp_v2, phi::NearestInterpV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(nextafter, phi::NextafterOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(nll_loss, phi::NllLossOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(nms, phi::NmsOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(where_index, nonzero); -PD_REGISTER_ARG_MAPPING_FN(where_index, phi::WhereIndexOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(npu_identity, phi::NpuIdentityOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(size, numel); -PD_REGISTER_ARG_MAPPING_FN(size, phi::SizeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(overlap_add, phi::OverlapAddOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(p_norm, phi::PNormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pad3d, phi::Pad3dOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pixel_shuffle, phi::PixelShuffleOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pixel_unshuffle, phi::PixelUnshuffleOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(poisson, phi::PoissonOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(polygamma, phi::PolygammaOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pow, phi::PowOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(prelu, phi::PreluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(prior_box, phi::PriorBoxOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(psroi_pool, phi::PsroiPoolOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(put_along_axis, phi::PutAlongAxisOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(qr, phi::QrOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(real, phi::RealOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(reciprocal, phi::ReciprocalOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(graph_reindex, reindex_graph); -PD_REGISTER_ARG_MAPPING_FN(graph_reindex, phi::GraphReindexOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(relu, phi::ReluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(relu6, phi::Relu6OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(renorm, phi::RenormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(rms_norm, phi::RmsNormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(rmsprop, phi::RmspropOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::RoiAlignOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(roi_pool, phi::RoiPoolOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(roll, phi::RollOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(round, phi::RoundOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(rsqrt, phi::RsqrtOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(scale, phi::ScaleOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(scatter, phi::ScatterOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(scatter_nd_add, phi::ScatterNdAddOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(searchsorted, phi::SearchsortedOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(segment_pool, phi::SegmentPoolOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(selu, phi::SeluOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(graph_send_recv, send_u_recv); -PD_REGISTER_ARG_MAPPING_FN(graph_send_recv, phi::GraphSendRecvOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(graph_send_ue_recv, send_ue_recv); -PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv, phi::GraphSendUeRecvOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(graph_send_uv, send_uv); -PD_REGISTER_ARG_MAPPING_FN(graph_send_uv, phi::GraphSendUvOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sgd, phi::SgdOpArgumentMapping); 
-PD_REGISTER_ARG_MAPPING_FN(shape, phi::ShapeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(shard_index, phi::ShardIndexOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sigmoid, phi::SigmoidOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sigmoid_cross_entropy_with_logits, phi::SigmoidCrossEntropyWithLogitsOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sign, phi::SignOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(silu, phi::SiluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sin, phi::SinOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sinh, phi::SinhOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(slogdeterminant, slogdet); -PD_REGISTER_ARG_MAPPING_FN(slogdeterminant, phi::SlogdeterminantOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(softplus, phi::SoftplusOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(softshrink, phi::SoftshrinkOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(softsign, phi::SoftsignOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(solve, phi::SolveOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(spectral_norm, phi::SpectralNormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sqrt, phi::SqrtOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(square, phi::SquareOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(squared_l2_norm, phi::SquaredL2NormOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(squeeze2, squeeze); -PD_REGISTER_ARG_MAPPING_FN(squeeze2, phi::Squeeze2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(stack, phi::StackOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(stanh, phi::StanhOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(svd, phi::SvdOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(take_along_axis, phi::TakeAlongAxisOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tan, phi::TanOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tanh, phi::TanhOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tanh_shrink, phi::TanhShrinkOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(temporal_shift, phi::TemporalShiftOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tensor_unfold, phi::TensorUnfoldOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(thresholded_relu, phi::ThresholdedReluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(top_p_sampling, phi::TopPSamplingOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(top_k_v2, topk); -PD_REGISTER_ARG_MAPPING_FN(top_k_v2, phi::TopKV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(trace, phi::TraceOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(triangular_solve, phi::TriangularSolveOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(trilinear_interp_v2, trilinear_interp); -PD_REGISTER_ARG_MAPPING_FN(trilinear_interp_v2, phi::TrilinearInterpV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(trunc, phi::TruncOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unbind, phi::UnbindOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unfold, phi::UnfoldOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(uniform_random_inplace, uniform_inplace); -PD_REGISTER_ARG_MAPPING_FN(uniform_random_inplace, phi::UniformRandomInplaceOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unique_consecutive, phi::UniqueConsecutiveOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unpool3d, phi::Unpool3dOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2, unsqueeze); -PD_REGISTER_ARG_MAPPING_FN(unsqueeze2, phi::Unsqueeze2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unstack, phi::UnstackOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(update_loss_scaling, phi::UpdateLossScalingOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(view_dtype, phi::ViewDtypeOpArgumentMapping); 
-PD_REGISTER_ARG_MAPPING_FN(view_shape, phi::ViewShapeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(viterbi_decode, phi::ViterbiDecodeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(warpctc, phi::WarpctcOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(warprnnt, phi::WarprnntOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(weight_dequantize, phi::WeightDequantizeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(weight_only_linear, phi::WeightOnlyLinearOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(weight_quantize, phi::WeightQuantizeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(weighted_sample_neighbors, phi::WeightedSampleNeighborsOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(where, phi::WhereOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(yolo_box, phi::YoloBoxOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(yolov3_loss, yolo_loss); -PD_REGISTER_ARG_MAPPING_FN(yolov3_loss, phi::Yolov3LossOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(abs_double_grad, phi::AbsDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(abs_grad, phi::AbsGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(acos_grad, phi::AcosGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(acosh_grad, phi::AcoshGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(addmm_grad, phi::AddmmGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(affine_grid_grad, phi::AffineGridGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(angle_grad, phi::AngleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(argsort_grad, phi::ArgsortGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(as_strided_grad, phi::AsStridedGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(asin_grad, phi::AsinGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(asinh_grad, phi::AsinhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(atan2_grad, phi::Atan2GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(atan_grad, phi::AtanGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bce_loss_grad, phi::BceLossGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(bicubic_interp_v2_grad, bicubic_interp_grad); -PD_REGISTER_ARG_MAPPING_FN(bicubic_interp_v2_grad, phi::BicubicInterpV2GradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(bilinear_tensor_product_grad, bilinear_grad); -PD_REGISTER_ARG_MAPPING_FN(bilinear_tensor_product_grad, phi::BilinearTensorProductGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(bilinear_interp_v2_grad, bilinear_interp_grad); -PD_REGISTER_ARG_MAPPING_FN(bilinear_interp_v2_grad, phi::BilinearInterpV2GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bmm_grad, phi::BmmGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors_grad, phi::BroadcastTensorsGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(ceil_grad, phi::CeilGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(celu_grad_grad, celu_double_grad); -PD_REGISTER_ARG_MAPPING_FN(celu_grad_grad, phi::CeluGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(celu_grad, phi::CeluGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cholesky_grad, phi::CholeskyGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cholesky_solve_grad, phi::CholeskySolveGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(clip_double_grad, phi::ClipDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(clip_grad, phi::ClipGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(complex_grad, phi::ComplexGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(concat_grad, phi::ConcatGradOpArgumentMapping); 
-PD_REGISTER_ARG_MAPPING_FN(conv2d_grad, phi::Conv2dGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv2d_grad_grad, phi::Conv2dGradGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(conv3d_grad_grad, conv3d_double_grad); -PD_REGISTER_ARG_MAPPING_FN(conv3d_grad_grad, phi::Conv3dGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv3d_grad, phi::Conv3dGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv3d_transpose_grad, phi::Conv3dTransposeGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cos_double_grad, phi::CosDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cos_triple_grad, phi::CosTripleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cosh_grad, phi::CoshGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(crop_tensor_grad, crop_grad); -PD_REGISTER_ARG_MAPPING_FN(crop_tensor_grad, phi::CropTensorGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(softmax_with_cross_entropy_grad, cross_entropy_with_softmax_grad); -PD_REGISTER_ARG_MAPPING_FN(softmax_with_cross_entropy_grad, phi::SoftmaxWithCrossEntropyGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cross_grad, phi::CrossGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cummax_grad, phi::CummaxGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cummin_grad, phi::CumminGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cumprod_grad, phi::CumprodGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(cumsum_grad, phi::CumsumGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(depthwise_conv2d_grad_grad, depthwise_conv2d_double_grad); -PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad_grad, phi::DepthwiseConv2dGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad, phi::DepthwiseConv2dGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(determinant_grad, det_grad); -PD_REGISTER_ARG_MAPPING_FN(determinant_grad, phi::DeterminantGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(diag_v2_grad, diag_grad); -PD_REGISTER_ARG_MAPPING_FN(diag_v2_grad, phi::DiagV2GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(diagonal_grad, phi::DiagonalGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(digamma_grad, phi::DigammaGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(dist_grad, phi::DistGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(dot_grad, phi::DotGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(eig_grad, phi::EigGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(eigh_grad, phi::EighGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(eigvalsh_grad, phi::EigvalshGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); -PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(erf_grad, phi::ErfGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(erfinv_grad, phi::ErfinvGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(exp_grad, phi::ExpGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(expand_as_v2_grad, expand_as_grad); -PD_REGISTER_ARG_MAPPING_FN(expand_as_v2_grad, phi::ExpandAsV2GradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(expand_v2_double_grad, expand_double_grad); -PD_REGISTER_ARG_MAPPING_FN(expm1_grad, phi::Expm1GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fft_c2c_grad, phi::FftC2cGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fft_c2r_grad, phi::FftC2rGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fft_r2c_grad, 
phi::FftR2cGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fill_diagonal_grad, phi::FillDiagonalGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fill_diagonal_tensor_grad, phi::FillDiagonalTensorGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(fill_any_grad, fill_grad); -PD_REGISTER_ARG_MAPPING_FN(fill_any_grad, phi::FillAnyGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(flash_attn_grad, phi::FlashAttnGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(flash_attn_unpadded_grad, phi::FlashAttnUnpaddedGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(floor_grad, phi::FloorGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmax_grad, fmax_grad); -PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad, phi::ElementwiseFmaxGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmin_grad, fmin_grad); -PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad, phi::ElementwiseFminGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(fold_grad, phi::FoldGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(frame_grad, phi::FrameGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gather_grad, phi::GatherGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gather_nd_grad, phi::GatherNdGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gaussian_inplace_grad, phi::GaussianInplaceGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gelu_grad, phi::GeluGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(grid_sampler_grad, grid_sample_grad); -PD_REGISTER_ARG_MAPPING_FN(grid_sampler_grad, phi::GridSamplerGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(group_norm_grad, phi::GroupNormGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(gumbel_softmax_grad, phi::GumbelSoftmaxGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(hard_shrink_grad, hardshrink_grad); -PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad, phi::HardShrinkGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(hard_sigmoid_grad, hardsigmoid_grad); -PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad, phi::HardSigmoidGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(brelu_grad, hardtanh_grad); -PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::BreluGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(elementwise_heaviside_grad, heaviside_grad); -PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside_grad, phi::ElementwiseHeavisideGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(huber_loss_grad, phi::HuberLossGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(i0_grad, phi::I0GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(i0e_grad, phi::I0eGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(i1_grad, phi::I1GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(i1e_grad, phi::I1eGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(imag_grad, phi::ImagGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(index_add_grad, phi::IndexAddGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(index_put_grad, phi::IndexPutGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(index_sample_grad, phi::IndexSampleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(index_select_grad, phi::IndexSelectGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(index_select_strided_grad, phi::IndexSelectStridedGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(instance_norm_double_grad, phi::InstanceNormDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad, phi::InstanceNormGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(inverse_grad, phi::InverseGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(kldiv_loss_grad, 
phi::KldivLossGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(kron_grad, phi::KronGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(label_smooth_grad, phi::LabelSmoothGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(layer_norm_grad, phi::LayerNormGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); -PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad, phi::LeakyReluGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad, phi::LeakyReluGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(lerp_grad, phi::LerpGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(lgamma_grad, phi::LgammaGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(linear_interp_v2_grad, linear_interp_grad); -PD_REGISTER_ARG_MAPPING_FN(linear_interp_v2_grad, phi::LinearInterpV2GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(log_grad_grad, log_double_grad); -PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log_loss_grad, phi::LogLossGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(log_softmax_grad, phi::LogSoftmaxGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logcumsumexp_grad, phi::LogcumsumexpGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logit_grad, phi::LogitGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad, phi::LogsigmoidGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(lu_grad, phi::LuGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(lu_unpack_grad, phi::LuUnpackGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(margin_cross_entropy_grad, phi::MarginCrossEntropyGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(masked_select_grad, phi::MaskedSelectGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(matrix_power_grad, phi::MatrixPowerGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index_grad, phi::MaxPool2dWithIndexGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index_grad, phi::MaxPool3dWithIndexGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(maxout_grad, phi::MaxoutGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(mean_grad, mean_all_grad); -PD_REGISTER_ARG_MAPPING_FN(mean_grad, phi::MeanGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(memory_efficient_attention_grad, phi::MemoryEfficientAttentionGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(meshgrid_grad, phi::MeshgridGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(multi_dot_grad, phi::MultiDotGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(multiplex_grad, phi::MultiplexGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(mv_grad, phi::MvGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(nanmedian_grad, phi::NanmedianGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(nearest_interp_v2_grad, nearest_interp_grad); -PD_REGISTER_ARG_MAPPING_FN(nearest_interp_v2_grad, phi::NearestInterpV2GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(nll_loss_grad, phi::NllLossGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(overlap_add_grad, phi::OverlapAddGradOpArgumentMapping); 
-PD_REGISTER_ARG_MAPPING_FN(p_norm_grad, phi::PNormGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pad3d_double_grad, phi::Pad3dDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pad3d_grad, phi::Pad3dGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pixel_shuffle_grad, phi::PixelShuffleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pixel_unshuffle_grad, phi::PixelUnshuffleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(poisson_grad, phi::PoissonGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(polygamma_grad, phi::PolygammaGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pow_double_grad, phi::PowDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pow_grad, phi::PowGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pow_triple_grad, phi::PowTripleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(prelu_grad, phi::PreluGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(psroi_pool_grad, phi::PsroiPoolGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(put_along_axis_grad, phi::PutAlongAxisGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(qr_grad, phi::QrGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(real_grad, phi::RealGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(reciprocal_grad, phi::ReciprocalGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(relu6_grad, phi::Relu6GradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); -PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad, phi::ReluGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(renorm_grad, phi::RenormGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(roi_align_grad, phi::RoiAlignGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(roi_pool_grad, phi::RoiPoolGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(roll_grad, phi::RollGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(round_grad, phi::RoundGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(rsqrt_grad_grad, rsqrt_double_grad); -PD_REGISTER_ARG_MAPPING_FN(rsqrt_grad_grad, phi::RsqrtGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(rsqrt_grad, phi::RsqrtGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(scatter_grad, phi::ScatterGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(scatter_nd_add_grad, phi::ScatterNdAddGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(segment_pool_grad, phi::SegmentPoolGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(selu_grad, phi::SeluGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(graph_send_recv_grad, send_u_recv_grad); -PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad, phi::GraphSendRecvGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(graph_send_ue_recv_grad, send_ue_recv_grad); -PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv_grad, phi::GraphSendUeRecvGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(graph_send_uv_grad, send_uv_grad); -PD_REGISTER_ARG_MAPPING_FN(graph_send_uv_grad, phi::GraphSendUvGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sigmoid_cross_entropy_with_logits_grad, phi::SigmoidCrossEntropyWithLogitsGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad); -PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad, phi::SigmoidGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad, phi::SigmoidTripleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping); 
-PD_REGISTER_ARG_MAPPING_FN(sin_double_grad, phi::SinDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sin_grad, phi::SinGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sin_triple_grad, phi::SinTripleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sinh_grad, phi::SinhGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(slogdeterminant_grad, slogdet_grad); -PD_REGISTER_ARG_MAPPING_FN(slogdeterminant_grad, phi::SlogdeterminantGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(softplus_double_grad, phi::SoftplusDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(softplus_grad, phi::SoftplusGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(softshrink_grad, phi::SoftshrinkGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(softsign_grad, phi::SoftsignGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(solve_grad, phi::SolveGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(spectral_norm_grad, phi::SpectralNormGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(sqrt_grad_grad, sqrt_double_grad); -PD_REGISTER_ARG_MAPPING_FN(sqrt_grad_grad, phi::SqrtGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sqrt_grad, phi::SqrtGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(square_grad_grad, square_double_grad); -PD_REGISTER_ARG_MAPPING_FN(square_grad_grad, phi::SquareGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(square_grad, phi::SquareGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(squared_l2_norm_grad, phi::SquaredL2NormGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(squeeze2_double_grad, squeeze_double_grad); -PD_REGISTER_BASE_KERNEL_NAME(squeeze2_grad, squeeze_grad); -PD_REGISTER_ARG_MAPPING_FN(squeeze2_grad, phi::Squeeze2GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(stack_grad, phi::StackGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(stanh_grad, phi::StanhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(svd_grad, phi::SvdGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(take_along_axis_grad, phi::TakeAlongAxisGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); -PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad, phi::TanhGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tanh_shrink_grad, phi::TanhShrinkGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad, phi::TanhTripleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(temporal_shift_grad, phi::TemporalShiftGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tensor_unfold_grad, phi::TensorUnfoldGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, phi::ThresholdedReluGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(top_k_v2_grad, topk_grad); -PD_REGISTER_ARG_MAPPING_FN(top_k_v2_grad, phi::TopKV2GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(trace_grad, phi::TraceGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(triangular_solve_grad, phi::TriangularSolveGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(trilinear_interp_v2_grad, trilinear_interp_grad); -PD_REGISTER_ARG_MAPPING_FN(trilinear_interp_v2_grad, phi::TrilinearInterpV2GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(trunc_grad, phi::TruncGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unfold_grad, phi::UnfoldGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(uniform_random_inplace_grad, uniform_inplace_grad); -PD_REGISTER_ARG_MAPPING_FN(uniform_random_inplace_grad, 
phi::UniformRandomInplaceGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_double_grad, unsqueeze_double_grad); -PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_grad, unsqueeze_grad); -PD_REGISTER_ARG_MAPPING_FN(unsqueeze2_grad, phi::Unsqueeze2GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unstack_grad, phi::UnstackGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(view_dtype_grad, phi::ViewDtypeGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(view_shape_grad, phi::ViewShapeGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(warpctc_grad, phi::WarpctcGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(warprnnt_grad, phi::WarprnntGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(weight_only_linear_grad, phi::WeightOnlyLinearGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(where_grad, phi::WhereGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(yolov3_loss_grad, yolo_loss_grad); -PD_REGISTER_ARG_MAPPING_FN(yolov3_loss_grad, phi::Yolov3LossGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unpool3d_grad, phi::Unpool3dGradOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/generated_sparse_sig.cc b/paddle/fluid/operators/ops_signature/generated_sparse_sig.cc deleted file mode 100644 index 5f487e09b0e4e..0000000000000 --- a/paddle/fluid/operators/ops_signature/generated_sparse_sig.cc +++ /dev/null @@ -1,2735 +0,0 @@ -// this file is generated by paddle/phi/api/yaml/generator/generate_op.py, do not edit. -#include "paddle/phi/core/compat/op_utils.h" -#include "paddle/utils/small_vector.h" - -namespace phi { - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAbsOpArgumentMapping: - -return KernelSignature("abs_coo", {"x"}, {}, {"out"}); -return KernelSignature("abs_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseAbsOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "abs_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "abs_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAcosOpArgumentMapping: - -return KernelSignature("acos_coo", {"x"}, {}, {"out"}); -return KernelSignature("acos_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseAcosOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "acos_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "acos_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by 
SparseAcoshOpArgumentMapping: - -return KernelSignature("acosh_coo", {"x"}, {}, {"out"}); -return KernelSignature("acosh_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseAcoshOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "acosh_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "acosh_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAddOpArgumentMapping: - -return KernelSignature("add_coo_coo", {"x", "y"}, {}, {"out"}); -return KernelSignature("add_csr_csr", {"x", "y"}, {}, {"out"}); -return KernelSignature("add_coo_dense", {"x", "y"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseAddOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { - kernel_name = "add_coo_coo"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { - kernel_name = "add_csr_csr"; - } - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y")) { - kernel_name = "add_coo_dense"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAsinOpArgumentMapping: - -return KernelSignature("asin_coo", {"x"}, {}, {"out"}); -return KernelSignature("asin_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseAsinOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "asin_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "asin_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAsinhOpArgumentMapping: - -return KernelSignature("asinh_coo", {"x"}, {}, {"out"}); -return KernelSignature("asinh_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseAsinhOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "asinh_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - 
kernel_name = "asinh_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAtanOpArgumentMapping: - -return KernelSignature("atan_coo", {"x"}, {}, {"out"}); -return KernelSignature("atan_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseAtanOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "atan_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "atan_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAtanhOpArgumentMapping: - -return KernelSignature("atanh_coo", {"x"}, {}, {"out"}); -return KernelSignature("atanh_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseAtanhOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "atanh_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "atanh_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseBatchNormOpArgumentMapping: - -return KernelSignature("batch_norm_coo", {"x", "mean", "variance", "scale", "bias"}, {"is_test", "momentum", "epsilon", "data_layout", "use_global_stats", "trainable_statistics"}, {"out", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space"}); -****************************************************************** -*/ - -KernelSignature SparseBatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "mean", "variance", "scale", "bias"}; - paddle::small_vector attrs; - attrs.emplace_back("is_test"); - attrs.emplace_back("momentum"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("data_layout"); - attrs.emplace_back("use_global_stats"); - attrs.emplace_back("trainable_statistics"); - paddle::small_vector outputs {"out", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("mean") && ctx.IsDenseTensorInput("variance") && ctx.IsDenseTensorInput("scale") && ctx.IsDenseTensorInput("bias")) { - kernel_name = "batch_norm_coo"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 
'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseCastOpArgumentMapping: - -return KernelSignature("cast_coo", {"x"}, {"index_dtype", "value_dtype"}, {"out"}); -return KernelSignature("cast_csr", {"x"}, {"index_dtype", "value_dtype"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseCastOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("index_dtype"); - attrs.emplace_back("value_dtype"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "cast_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "cast_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseConv3dOpArgumentMapping: - -return KernelSignature("conv3d_coo", {"x", "kernel"}, {"paddings", "dilations", "strides", "groups", "subm", "key"}, {"out", "rulebook", "counter"}); -****************************************************************** -*/ - -KernelSignature SparseConv3dOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "kernel"}; - paddle::small_vector attrs; - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - attrs.emplace_back("strides"); - attrs.emplace_back("groups"); - attrs.emplace_back("subm"); - attrs.emplace_back("key"); - paddle::small_vector outputs {"out", "rulebook", "counter"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("kernel")) { - kernel_name = "conv3d_coo"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseDivideOpArgumentMapping: - -return KernelSignature("divide_coo_coo", {"x", "y"}, {}, {"out"}); -return KernelSignature("divide_csr_csr", {"x", "y"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseDivideOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { - kernel_name = "divide_coo_coo"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { - kernel_name = "divide_csr_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseDivideScalarOpArgumentMapping: - -return KernelSignature("divide_scalar_coo", {"x"}, {"scalar"}, {"out"}); -return KernelSignature("divide_scalar_csr", {"x"}, {"scalar"}, {"out"}); -****************************************************************** -*/ - -KernelSignature 
SparseDivideScalarOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("scalar"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "divide_scalar_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "divide_scalar_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseExpm1OpArgumentMapping: - -return KernelSignature("expm1_coo", {"x"}, {}, {"out"}); -return KernelSignature("expm1_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseExpm1OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "expm1_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "expm1_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseIsnanOpArgumentMapping: - -return KernelSignature("isnan_coo", {"x"}, {}, {"out"}); -return KernelSignature("isnan_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseIsnanOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "isnan_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "isnan_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseLeakyReluOpArgumentMapping: - -return KernelSignature("leaky_relu_coo", {"x"}, {"alpha"}, {"out"}); -return KernelSignature("leaky_relu_csr", {"x"}, {"alpha"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseLeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "leaky_relu_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "leaky_relu_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseLog1pOpArgumentMapping: - -return 
KernelSignature("log1p_coo", {"x"}, {}, {"out"}); -return KernelSignature("log1p_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseLog1pOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "log1p_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "log1p_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseMultiplyOpArgumentMapping: - -return KernelSignature("multiply_coo_coo", {"x", "y"}, {}, {"out"}); -return KernelSignature("multiply_csr_csr", {"x", "y"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseMultiplyOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { - kernel_name = "multiply_coo_coo"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { - kernel_name = "multiply_csr_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparsePowOpArgumentMapping: - -return KernelSignature("pow_coo", {"x"}, {"factor"}, {"out"}); -return KernelSignature("pow_csr", {"x"}, {"factor"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparsePowOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("factor"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "pow_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "pow_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseReluOpArgumentMapping: - -return KernelSignature("relu_coo", {"x"}, {}, {"out"}); -return KernelSignature("relu_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseReluOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "relu_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "relu_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseRelu6OpArgumentMapping: - -return KernelSignature("relu6_coo", {"x"}, {}, {"out"}); -return KernelSignature("relu6_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseRelu6OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "relu6_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "relu6_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseReshapeOpArgumentMapping: - -return KernelSignature("reshape_coo", {"x"}, {"shape"}, {"out"}); -return KernelSignature("reshape_coo", {"x"}, {"ShapeTensor"}, {"out"}); -return KernelSignature("reshape_coo", {"x"}, {"ShapeTensorList"}, {"out"}); -return KernelSignature("reshape_csr", {"x"}, {"shape"}, {"out"}); -return KernelSignature("reshape_csr", {"x"}, {"ShapeTensor"}, {"out"}); -return KernelSignature("reshape_csr", {"x"}, {"ShapeTensorList"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseReshapeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("ShapeTensor") - ? "ShapeTensor" - : ctx.InputSize("ShapeTensorList") > 0 - ? 
"ShapeTensorList" - : "shape"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "reshape_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "reshape_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseScaleOpArgumentMapping: - -return KernelSignature("scale_coo", {"x"}, {"scale", "bias", "bias_after_scale"}, {"out"}); -return KernelSignature("scale_csr", {"x"}, {"scale", "bias", "bias_after_scale"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("scale"); - attrs.emplace_back("bias"); - attrs.emplace_back("bias_after_scale"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "scale_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "scale_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSinOpArgumentMapping: - -return KernelSignature("sin_coo", {"x"}, {}, {"out"}); -return KernelSignature("sin_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseSinOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "sin_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "sin_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSinhOpArgumentMapping: - -return KernelSignature("sinh_coo", {"x"}, {}, {"out"}); -return KernelSignature("sinh_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseSinhOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "sinh_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "sinh_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSoftmaxOpArgumentMapping: - -return KernelSignature("softmax_coo", {"x"}, {"axis"}, {"out"}); -return 
KernelSignature("softmax_csr", {"x"}, {"axis"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseSoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "softmax_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "softmax_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSparseCooTensorOpArgumentMapping: - -return KernelSignature("sparse_coo_tensor", {"values", "indices"}, {"shape"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseSparseCooTensorOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"values", "indices"}; - paddle::small_vector attrs; - attrs.emplace_back("shape"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsDenseTensorInput("values") && ctx.IsDenseTensorInput("indices")) { - kernel_name = "sparse_coo_tensor"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSqrtOpArgumentMapping: - -return KernelSignature("sqrt_coo", {"x"}, {}, {"out"}); -return KernelSignature("sqrt_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseSqrtOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "sqrt_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "sqrt_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSquareOpArgumentMapping: - -return KernelSignature("square_coo", {"x"}, {}, {"out"}); -return KernelSignature("square_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseSquareOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "square_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "square_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible 
KernelSignatures returned by SparseSubtractOpArgumentMapping: - -return KernelSignature("subtract_coo_coo", {"x", "y"}, {}, {"out"}); -return KernelSignature("subtract_csr_csr", {"x", "y"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseSubtractOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { - kernel_name = "subtract_coo_coo"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { - kernel_name = "subtract_csr_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSumOpArgumentMapping: - -return KernelSignature("sum_coo", {"x"}, {"axis", "dtype", "keepdim"}, {"out"}); -return KernelSignature("sum_coo", {"x"}, {"AxisTensor", "dtype", "keepdim"}, {"out"}); -return KernelSignature("sum_coo", {"x"}, {"AxisTensorList", "dtype", "keepdim"}, {"out"}); -return KernelSignature("sum_csr", {"x"}, {"axis", "dtype", "keepdim"}, {"out"}); -return KernelSignature("sum_csr", {"x"}, {"AxisTensor", "dtype", "keepdim"}, {"out"}); -return KernelSignature("sum_csr", {"x"}, {"AxisTensorList", "dtype", "keepdim"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseSumOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("AxisTensor") - ? "AxisTensor" - : ctx.InputSize("AxisTensorList") > 0 - ? 
"AxisTensorList" - : "axis"); - attrs.emplace_back("dtype"); - attrs.emplace_back("keepdim"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "sum_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "sum_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSyncBatchNormOpArgumentMapping: - -return KernelSignature("sync_batch_norm_coo", {"x", "mean", "variance", "scale", "bias"}, {"is_test", "momentum", "epsilon", "data_layout", "use_global_stats", "trainable_statistics"}, {"out", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space"}); -****************************************************************** -*/ - -KernelSignature SparseSyncBatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "mean", "variance", "scale", "bias"}; - paddle::small_vector attrs; - attrs.emplace_back("is_test"); - attrs.emplace_back("momentum"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("data_layout"); - attrs.emplace_back("use_global_stats"); - attrs.emplace_back("trainable_statistics"); - paddle::small_vector outputs {"out", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("mean") && ctx.IsDenseTensorInput("variance") && ctx.IsDenseTensorInput("scale") && ctx.IsDenseTensorInput("bias")) { - kernel_name = "sync_batch_norm_coo"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseTanOpArgumentMapping: - -return KernelSignature("tan_coo", {"x"}, {}, {"out"}); -return KernelSignature("tan_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseTanOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "tan_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "tan_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseTanhOpArgumentMapping: - -return KernelSignature("tanh_coo", {"x"}, {}, {"out"}); -return KernelSignature("tanh_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseTanhOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "tanh_coo"; - } - if 
(ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "tanh_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseToDenseOpArgumentMapping: - -return KernelSignature("coo_to_dense", {"x"}, {}, {"out"}); -return KernelSignature("csr_to_dense", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseToDenseOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "coo_to_dense"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "csr_to_dense"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseToSparseCooOpArgumentMapping: - -return KernelSignature("dense_to_coo", {"x"}, {"sparse_dim"}, {"out"}); -return KernelSignature("csr_to_coo", {"x"}, {"sparse_dim"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseToSparseCooOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("sparse_dim"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsDenseTensorInput("x")) { - kernel_name = "dense_to_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "csr_to_coo"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseToSparseCsrOpArgumentMapping: - -return KernelSignature("dense_to_csr", {"x"}, {}, {"out"}); -return KernelSignature("coo_to_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseToSparseCsrOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsDenseTensorInput("x")) { - kernel_name = "dense_to_csr"; - } - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "coo_to_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseTransposeOpArgumentMapping: - -return KernelSignature("transpose_coo", {"x"}, {"perm"}, {"out"}); -return KernelSignature("transpose_csr", {"x"}, {"perm"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseTransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; 
- paddle::small_vector attrs; - attrs.emplace_back("perm"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "transpose_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "transpose_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseValuesOpArgumentMapping: - -return KernelSignature("values_coo", {"x"}, {}, {"out"}); -return KernelSignature("values_csr", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseValuesOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "values_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "values_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAddmmOpArgumentMapping: - -return KernelSignature("addmm_csr_dense", {"input", "x", "y"}, {"beta", "alpha"}, {"out"}); -return KernelSignature("addmm_csr_csr", {"input", "x", "y"}, {"beta", "alpha"}, {"out"}); -return KernelSignature("addmm_coo_dense", {"input", "x", "y"}, {"beta", "alpha"}, {"out"}); -return KernelSignature("addmm_coo_coo", {"input", "x", "y"}, {"beta", "alpha"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseAddmmOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input", "x", "y"}; - paddle::small_vector attrs; - attrs.emplace_back("beta"); - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsDenseTensorInput("input") && ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("y")) { - kernel_name = "addmm_csr_dense"; - } - if (ctx.IsSparseCsrTensorInput("input") && ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { - kernel_name = "addmm_csr_csr"; - } - if (ctx.IsDenseTensorInput("input") && ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y")) { - kernel_name = "addmm_coo_dense"; - } - if (ctx.IsSparseCooTensorInput("input") && ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { - kernel_name = "addmm_coo_coo"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseCoalesceOpArgumentMapping: - -return KernelSignature("coalesce_coo", {"x"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseCoalesceOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - 
- const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "coalesce_coo"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseFullLikeOpArgumentMapping: - -return KernelSignature("full_like_coo", {"x"}, {"value", "dtype"}, {"out"}); -return KernelSignature("full_like_coo", {"x"}, {"ValueTensor", "dtype"}, {"out"}); -return KernelSignature("full_like_csr", {"x"}, {"value", "dtype"}, {"out"}); -return KernelSignature("full_like_csr", {"x"}, {"ValueTensor", "dtype"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseFullLikeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("ValueTensor") ? "ValueTensor" : "value"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "full_like_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "full_like_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseFusedAttentionOpArgumentMapping: - -return KernelSignature("fused_attention_csr", {"query", "key", "value", "sparse_mask", "key_padding_mask", "attn_mask"}, {}, {"out", "softmax"}); -****************************************************************** -*/ - -KernelSignature SparseFusedAttentionOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"query", "key", "value", "sparse_mask", "key_padding_mask", "attn_mask"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out", "softmax"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsDenseTensorInput("query") && ctx.IsDenseTensorInput("key") && ctx.IsDenseTensorInput("value") && ctx.IsSparseCsrTensorInput("sparse_mask") && ctx.IsDenseTensorInput("key_padding_mask") && ctx.IsDenseTensorInput("attn_mask")) { - kernel_name = "fused_attention_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseMaskedMatmulOpArgumentMapping: - -return KernelSignature("masked_matmul_csr", {"x", "y", "mask"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseMaskedMatmulOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "mask"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsDenseTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsSparseCsrTensorInput("mask")) { - kernel_name = "masked_matmul_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseMatmulOpArgumentMapping: - -return KernelSignature("matmul_csr_dense", {"x", "y"}, {}, {"out"}); -return KernelSignature("matmul_csr_csr", {"x", "y"}, {}, {"out"}); -return KernelSignature("matmul_coo_dense", {"x", "y"}, {}, {"out"}); -return KernelSignature("matmul_coo_coo", {"x", "y"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseMatmulOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("y")) { - kernel_name = "matmul_csr_dense"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y")) { - kernel_name = "matmul_csr_csr"; - } - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y")) { - kernel_name = "matmul_coo_dense"; - } - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y")) { - kernel_name = "matmul_coo_coo"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseMaxpoolOpArgumentMapping: - -return KernelSignature("maxpool_coo", {"x"}, {"kernel_sizes", "paddings", "dilations", "strides"}, {"out", "rulebook", "counter"}); -****************************************************************** -*/ - -KernelSignature SparseMaxpoolOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back("kernel_sizes"); - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - attrs.emplace_back("strides"); - paddle::small_vector outputs {"out", "rulebook", "counter"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "maxpool_coo"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseMvOpArgumentMapping: - -return KernelSignature("mv_coo", {"x", "vec"}, {}, {"out"}); -return KernelSignature("mv_csr", {"x", "vec"}, {}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseMvOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "vec"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("vec")) { - kernel_name = "mv_coo"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("vec")) { - kernel_name = "mv_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by 
SparseSliceOpArgumentMapping: - -return KernelSignature("slice_coo", {"x"}, {"axes", "starts", "ends"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"axes", "starts", "EndsTensor"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"axes", "starts", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensor", "ends"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensor", "EndsTensor"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensor", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensorList", "ends"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensorList", "EndsTensor"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"axes", "StartsTensorList", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "starts", "ends"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "starts", "EndsTensor"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "starts", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensor", "ends"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensor", "EndsTensor"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensor", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensorList", "ends"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensorList", "EndsTensor"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensor", "StartsTensorList", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "starts", "ends"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "starts", "EndsTensor"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "starts", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensor", "ends"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensor", "EndsTensor"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensor", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensorList", "ends"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensorList", "EndsTensor"}, {"out"}); -return KernelSignature("slice_coo", {"x"}, {"AxesTensorList", "StartsTensorList", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"axes", "starts", "ends"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"axes", "starts", "EndsTensor"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"axes", "starts", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensor", "ends"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensor", "EndsTensor"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensor", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensorList", "ends"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensorList", "EndsTensor"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"axes", "StartsTensorList", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_csr", 
{"x"}, {"AxesTensor", "starts", "ends"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "starts", "EndsTensor"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "starts", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensor", "ends"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensor", "EndsTensor"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensor", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensorList", "ends"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensorList", "EndsTensor"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensor", "StartsTensorList", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "starts", "ends"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "starts", "EndsTensor"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "starts", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensor", "ends"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensor", "EndsTensor"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensor", "EndsTensorList"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensorList", "ends"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensorList", "EndsTensor"}, {"out"}); -return KernelSignature("slice_csr", {"x"}, {"AxesTensorList", "StartsTensorList", "EndsTensorList"}, {"out"}); -****************************************************************** -*/ - -KernelSignature SparseSliceOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("AxesTensor") - ? "AxesTensor" - : ctx.InputSize("AxesTensorList") > 0 - ? "AxesTensorList" - : "axes"); - attrs.emplace_back( - ctx.HasInput("StartsTensor") - ? "StartsTensor" - : ctx.InputSize("StartsTensorList") > 0 - ? "StartsTensorList" - : "starts"); - attrs.emplace_back( - ctx.HasInput("EndsTensor") - ? "EndsTensor" - : ctx.InputSize("EndsTensorList") > 0 - ? 
"EndsTensorList" - : "ends"); - paddle::small_vector outputs {"out"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x")) { - kernel_name = "slice_coo"; - } - if (ctx.IsSparseCsrTensorInput("x")) { - kernel_name = "slice_csr"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAbsGradOpArgumentMapping: - -return KernelSignature("abs_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("abs_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseAbsGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "abs_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "abs_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAcosGradOpArgumentMapping: - -return KernelSignature("acos_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("acos_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseAcosGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "acos_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "acos_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAcoshGradOpArgumentMapping: - -return KernelSignature("acosh_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("acosh_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseAcoshGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "acosh_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "acosh_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAddGradOpArgumentMapping: - -return KernelSignature("add_coo_coo_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -return KernelSignature("add_csr_csr_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -return KernelSignature("add_coo_dense_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseAddGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "add_coo_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "add_csr_csr_grad"; - } - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "add_coo_dense_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAddmmGradOpArgumentMapping: - -return KernelSignature("addmm_csr_dense_grad", {"input", "x", "y", "out@GRAD"}, {"alpha", "beta"}, {"input@GRAD", "x@GRAD", "y@GRAD"}); -return KernelSignature("addmm_csr_csr_grad", {"input", "x", "y", "out@GRAD"}, {"alpha", "beta"}, {"input@GRAD", "x@GRAD", "y@GRAD"}); -return KernelSignature("addmm_coo_dense_grad", {"input", "x", "y", "out@GRAD"}, {"alpha", "beta"}, {"input@GRAD", "x@GRAD", "y@GRAD"}); -return KernelSignature("addmm_coo_coo_grad", {"input", "x", "y", "out@GRAD"}, {"alpha", "beta"}, {"input@GRAD", "x@GRAD", "y@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseAddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"input", "x", "y", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - attrs.emplace_back("beta"); - paddle::small_vector outputs {"input@GRAD", "x@GRAD", "y@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsDenseTensorInput("input") && ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsDenseTensorInput("out_grad")) { - kernel_name = "addmm_csr_dense_grad"; - } - if (ctx.IsSparseCsrTensorInput("input") && ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "addmm_csr_csr_grad"; - } - if (ctx.IsDenseTensorInput("input") && ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsDenseTensorInput("out_grad")) { - kernel_name = "addmm_coo_dense_grad"; - } - if (ctx.IsSparseCooTensorInput("input") && ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "addmm_coo_coo_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAsinGradOpArgumentMapping: - -return KernelSignature("asin_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("asin_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseAsinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "asin_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "asin_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAsinhGradOpArgumentMapping: - -return KernelSignature("asinh_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("asinh_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseAsinhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "asinh_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "asinh_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAtanGradOpArgumentMapping: - -return KernelSignature("atan_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("atan_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseAtanGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "atan_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "atan_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseAtanhGradOpArgumentMapping: - -return KernelSignature("atanh_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("atanh_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); 
-****************************************************************** -*/ - -KernelSignature SparseAtanhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "atanh_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "atanh_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseBatchNormGradOpArgumentMapping: - -return KernelSignature("batch_norm_coo_grad", {"x", "scale", "bias", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space", "out@GRAD"}, {"momentum", "epsilon", "data_layout", "is_test", "use_global_stats", "trainable_statistics"}, {"x@GRAD", "scale@GRAD", "bias@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseBatchNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "scale", "bias", "mean_out", "variance_out", "saved_mean", "saved_variance", "reserve_space", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("momentum"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("data_layout"); - attrs.emplace_back("is_test"); - attrs.emplace_back("use_global_stats"); - attrs.emplace_back("trainable_statistics"); - paddle::small_vector outputs {"x@GRAD", "scale@GRAD", "bias@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("scale") && ctx.IsDenseTensorInput("bias") && ctx.IsDenseTensorInput("mean_out") && ctx.IsDenseTensorInput("variance_out") && ctx.IsDenseTensorInput("saved_mean") && ctx.IsDenseTensorInput("saved_variance") && ctx.IsDenseTensorInput("reserve_space") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "batch_norm_coo_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseCastGradOpArgumentMapping: - -return KernelSignature("cast_coo_grad", {"x", "out@GRAD"}, {"value_dtype"}, {"x@GRAD"}); -return KernelSignature("cast_csr_grad", {"x", "out@GRAD"}, {"value_dtype"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseCastGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("value_dtype"); - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "cast_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "cast_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseConv3dGradOpArgumentMapping: - -return KernelSignature("conv3d_coo_grad", {"x", "kernel", "out", "rulebook", "counter", "out@GRAD"}, {"paddings", "dilations", "strides", "groups", "subm", "key"}, {"x@GRAD", "kernel@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseConv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "kernel", "out", "rulebook", "counter", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - attrs.emplace_back("strides"); - attrs.emplace_back("groups"); - attrs.emplace_back("subm"); - attrs.emplace_back("key"); - paddle::small_vector outputs {"x@GRAD", "kernel@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("kernel") && ctx.IsSparseCooTensorInput("out") && ctx.IsDenseTensorInput("rulebook") && ctx.IsDenseTensorInput("counter") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "conv3d_coo_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseDivideGradOpArgumentMapping: - -return KernelSignature("divide_coo_coo_grad", {"x", "y", "out", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -return KernelSignature("divide_csr_csr_grad", {"x", "y", "out", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseDivideGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "out", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "divide_coo_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "divide_csr_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseExpm1GradOpArgumentMapping: - -return KernelSignature("expm1_coo_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("expm1_csr_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseExpm1GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"out", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "expm1_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("out") && 
ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "expm1_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseLeakyReluGradOpArgumentMapping: - -return KernelSignature("leaky_relu_coo_grad", {"x", "out@GRAD"}, {"alpha"}, {"x@GRAD"}); -return KernelSignature("leaky_relu_csr_grad", {"x", "out@GRAD"}, {"alpha"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseLeakyReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("alpha"); - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "leaky_relu_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "leaky_relu_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseLog1pGradOpArgumentMapping: - -return KernelSignature("log1p_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("log1p_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseLog1pGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "log1p_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "log1p_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseMaskedMatmulGradOpArgumentMapping: - -return KernelSignature("masked_matmul_csr_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseMaskedMatmulGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsDenseTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "masked_matmul_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseMatmulGradOpArgumentMapping: 
- -return KernelSignature("matmul_csr_dense_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -return KernelSignature("matmul_csr_csr_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -return KernelSignature("matmul_coo_dense_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -return KernelSignature("matmul_coo_coo_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseMatmulGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsDenseTensorInput("out_grad")) { - kernel_name = "matmul_csr_dense_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "matmul_csr_csr_grad"; - } - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("y") && ctx.IsDenseTensorInput("out_grad")) { - kernel_name = "matmul_coo_dense_grad"; - } - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "matmul_coo_coo_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseMaxpoolGradOpArgumentMapping: - -return KernelSignature("maxpool_coo_grad", {"x", "rulebook", "counter", "out", "out@GRAD"}, {"kernel_sizes"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseMaxpoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "rulebook", "counter", "out", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("kernel_sizes"); - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("rulebook") && ctx.IsDenseTensorInput("counter") && ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "maxpool_coo_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseMultiplyGradOpArgumentMapping: - -return KernelSignature("multiply_coo_coo_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -return KernelSignature("multiply_csr_csr_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseMultiplyGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "multiply_coo_coo_grad"; - } - if 
(ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "multiply_csr_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseMvGradOpArgumentMapping: - -return KernelSignature("mv_coo_grad", {"x", "vec", "out@GRAD"}, {}, {"x@GRAD", "vec@GRAD"}); -return KernelSignature("mv_csr_grad", {"x", "vec", "out@GRAD"}, {}, {"x@GRAD", "vec@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseMvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "vec", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD", "vec@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("vec") && ctx.IsDenseTensorInput("out_grad")) { - kernel_name = "mv_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsDenseTensorInput("vec") && ctx.IsDenseTensorInput("out_grad")) { - kernel_name = "mv_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparsePowGradOpArgumentMapping: - -return KernelSignature("pow_coo_grad", {"x", "out@GRAD"}, {"factor"}, {"x@GRAD"}); -return KernelSignature("pow_csr_grad", {"x", "out@GRAD"}, {"factor"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparsePowGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("factor"); - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "pow_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "pow_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseRelu6GradOpArgumentMapping: - -return KernelSignature("relu6_coo_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("relu6_csr_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseRelu6GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"out", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "relu6_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "relu6_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), 
std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseReluGradOpArgumentMapping: - -return KernelSignature("relu_coo_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("relu_csr_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"out", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "relu_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "relu_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseReshapeGradOpArgumentMapping: - -return KernelSignature("reshape_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("reshape_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseReshapeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "reshape_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "reshape_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSinGradOpArgumentMapping: - -return KernelSignature("sin_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("sin_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseSinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "sin_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "sin_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSinhGradOpArgumentMapping: - -return KernelSignature("sinh_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return 
KernelSignature("sinh_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseSinhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "sinh_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "sinh_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSoftmaxGradOpArgumentMapping: - -return KernelSignature("softmax_coo_grad", {"out", "out@GRAD"}, {"axis"}, {"x@GRAD"}); -return KernelSignature("softmax_csr_grad", {"out", "out@GRAD"}, {"axis"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseSoftmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"out", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "softmax_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "softmax_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSparseCooTensorGradOpArgumentMapping: - -return KernelSignature("sparse_coo_tensor_grad", {"indices", "out@GRAD"}, {}, {"values@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseSparseCooTensorGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"indices", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"values@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsDenseTensorInput("indices") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "sparse_coo_tensor_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSqrtGradOpArgumentMapping: - -return KernelSignature("sqrt_coo_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("sqrt_csr_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseSqrtGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"out", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("out") && 
ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "sqrt_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "sqrt_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSquareGradOpArgumentMapping: - -return KernelSignature("square_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("square_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseSquareGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "square_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "square_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSubtractGradOpArgumentMapping: - -return KernelSignature("subtract_coo_coo_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -return KernelSignature("subtract_csr_csr_grad", {"x", "y", "out@GRAD"}, {}, {"x@GRAD", "y@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseSubtractGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "y", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD", "y@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("y") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "subtract_coo_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("y") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "subtract_csr_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSumGradOpArgumentMapping: - -return KernelSignature("sum_coo_grad", {"x", "out@GRAD"}, {"axis", "keepdim"}, {"x@GRAD"}); -return KernelSignature("sum_coo_grad", {"x", "out@GRAD"}, {"AxisTensor", "keepdim"}, {"x@GRAD"}); -return KernelSignature("sum_coo_grad", {"x", "out@GRAD"}, {"AxisTensorList", "keepdim"}, {"x@GRAD"}); -return KernelSignature("sum_csr_grad", {"x", "out@GRAD"}, {"axis", "keepdim"}, {"x@GRAD"}); -return KernelSignature("sum_csr_grad", {"x", "out@GRAD"}, {"AxisTensor", "keepdim"}, {"x@GRAD"}); -return KernelSignature("sum_csr_grad", {"x", "out@GRAD"}, {"AxisTensorList", "keepdim"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseSumGradOpArgumentMapping(const ArgumentMappingContext& ctx) 
{ - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("AxisTensor") - ? "AxisTensor" - : ctx.InputSize("AxisTensorList") > 0 - ? "AxisTensorList" - : "axis"); - attrs.emplace_back("keepdim"); - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "sum_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "sum_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSyncBatchNormGradOpArgumentMapping: - -return KernelSignature("sync_batch_norm_coo_grad", {"x", "scale", "bias", "saved_mean", "saved_variance", "reserve_space", "out@GRAD"}, {"momentum", "epsilon", "data_layout", "is_test", "use_global_stats", "trainable_statistics"}, {"x@GRAD", "scale@GRAD", "bias@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseSyncBatchNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "scale", "bias", "saved_mean", "saved_variance", "reserve_space", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("momentum"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("data_layout"); - attrs.emplace_back("is_test"); - attrs.emplace_back("use_global_stats"); - attrs.emplace_back("trainable_statistics"); - paddle::small_vector outputs {"x@GRAD", "scale@GRAD", "bias@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("scale") && ctx.IsDenseTensorInput("bias") && ctx.IsDenseTensorInput("saved_mean") && ctx.IsDenseTensorInput("saved_variance") && ctx.IsDenseTensorInput("reserve_space") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "sync_batch_norm_coo_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseTanGradOpArgumentMapping: - -return KernelSignature("tan_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("tan_csr_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseTanGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "tan_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "tan_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by 
SparseTanhGradOpArgumentMapping: - -return KernelSignature("tanh_coo_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); -return KernelSignature("tanh_csr_grad", {"out", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseTanhGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"out", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("out") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "tanh_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("out") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "tanh_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseToDenseGradOpArgumentMapping: - -return KernelSignature("coo_to_dense_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseToDenseGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("out_grad")) { - kernel_name = "coo_to_dense_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseToSparseCooGradOpArgumentMapping: - -return KernelSignature("coo_to_dense", {"out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseToSparseCooGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "coo_to_dense"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseTransposeGradOpArgumentMapping: - -return KernelSignature("transpose_coo_grad", {"out@GRAD"}, {"perm"}, {"x@GRAD"}); -return KernelSignature("transpose_csr_grad", {"out@GRAD"}, {"perm"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseTransposeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("perm"); - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "transpose_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "transpose_csr_grad"; - } - KernelSignature sig (kernel_name, 
std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseValuesGradOpArgumentMapping: - -return KernelSignature("values_coo_grad", {"x", "out@GRAD"}, {}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseValuesGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsDenseTensorInput("out_grad")) { - kernel_name = "values_coo_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseFusedAttentionGradOpArgumentMapping: - -return KernelSignature("fused_attention_csr_grad", {"query", "key", "value", "softmax", "out@GRAD"}, {}, {"query@GRAD", "key@GRAD", "value@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseFusedAttentionGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"query", "key", "value", "softmax", "out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"query@GRAD", "key@GRAD", "value@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsDenseTensorInput("query") && ctx.IsDenseTensorInput("key") && ctx.IsDenseTensorInput("value") && ctx.IsSparseCsrTensorInput("softmax") && ctx.IsDenseTensorInput("out_grad")) { - kernel_name = "fused_attention_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SparseSliceGradOpArgumentMapping: - -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "starts", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "starts", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "starts", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "EndsTensor"}, {"x@GRAD"}); -return 
KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_coo_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "starts", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "starts", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "starts", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"axes", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "starts", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); -return 
KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensor", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "starts", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensor", "EndsTensorList"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "ends"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "EndsTensor"}, {"x@GRAD"}); -return KernelSignature("slice_csr_grad", {"x", "out@GRAD"}, {"AxesTensorList", "StartsTensorList", "EndsTensorList"}, {"x@GRAD"}); -****************************************************************** -*/ - -KernelSignature SparseSliceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("AxesTensor") - ? "AxesTensor" - : ctx.InputSize("AxesTensorList") > 0 - ? "AxesTensorList" - : "axes"); - attrs.emplace_back( - ctx.HasInput("StartsTensor") - ? "StartsTensor" - : ctx.InputSize("StartsTensorList") > 0 - ? "StartsTensorList" - : "starts"); - attrs.emplace_back( - ctx.HasInput("EndsTensor") - ? "EndsTensor" - : ctx.InputSize("EndsTensorList") > 0 - ? 
"EndsTensorList" - : "ends"); - paddle::small_vector outputs {"x@GRAD"}; - - const char* kernel_name = "unregistered"; - - if (ctx.IsSparseCooTensorInput("x") && ctx.IsSparseCooTensorInput("out_grad")) { - kernel_name = "slice_coo_grad"; - } - if (ctx.IsSparseCsrTensorInput("x") && ctx.IsSparseCsrTensorInput("out_grad")) { - kernel_name = "slice_csr_grad"; - } - KernelSignature sig (kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); - return sig; -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(sparse_abs, phi::SparseAbsOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_acos, phi::SparseAcosOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_acosh, phi::SparseAcoshOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_add, phi::SparseAddOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_asin, phi::SparseAsinOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_asinh, phi::SparseAsinhOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_atan, phi::SparseAtanOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_atanh, phi::SparseAtanhOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_batch_norm, phi::SparseBatchNormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_cast, phi::SparseCastOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_conv3d, phi::SparseConv3dOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_divide, phi::SparseDivideOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_divide_scalar, phi::SparseDivideScalarOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_expm1, phi::SparseExpm1OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_isnan, phi::SparseIsnanOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_leaky_relu, phi::SparseLeakyReluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_log1p, phi::SparseLog1pOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_multiply, phi::SparseMultiplyOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_pow, phi::SparsePowOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_relu, phi::SparseReluOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_relu6, phi::SparseRelu6OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_reshape, phi::SparseReshapeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_scale, phi::SparseScaleOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sin, phi::SparseSinOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sinh, phi::SparseSinhOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_softmax, phi::SparseSoftmaxOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sparse_coo_tensor, phi::SparseSparseCooTensorOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sqrt, phi::SparseSqrtOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_square, phi::SparseSquareOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_subtract, phi::SparseSubtractOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sum, phi::SparseSumOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sync_batch_norm, phi::SparseSyncBatchNormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_tan, phi::SparseTanOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_tanh, phi::SparseTanhOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_to_dense, phi::SparseToDenseOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_to_sparse_coo, phi::SparseToSparseCooOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_to_sparse_csr, phi::SparseToSparseCsrOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_transpose, 
phi::SparseTransposeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_values, phi::SparseValuesOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_addmm, phi::SparseAddmmOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_coalesce, phi::SparseCoalesceOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_full_like, phi::SparseFullLikeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_fused_attention, phi::SparseFusedAttentionOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_masked_matmul, phi::SparseMaskedMatmulOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_matmul, phi::SparseMatmulOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_maxpool, phi::SparseMaxpoolOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_mv, phi::SparseMvOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_slice, phi::SparseSliceOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_abs_grad, phi::SparseAbsGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_acos_grad, phi::SparseAcosGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_acosh_grad, phi::SparseAcoshGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_add_grad, phi::SparseAddGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_addmm_grad, phi::SparseAddmmGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_asin_grad, phi::SparseAsinGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_asinh_grad, phi::SparseAsinhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_atan_grad, phi::SparseAtanGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_atanh_grad, phi::SparseAtanhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_batch_norm_grad, phi::SparseBatchNormGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_cast_grad, phi::SparseCastGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_conv3d_grad, phi::SparseConv3dGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_divide_grad, phi::SparseDivideGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_expm1_grad, phi::SparseExpm1GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_leaky_relu_grad, phi::SparseLeakyReluGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_log1p_grad, phi::SparseLog1pGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_masked_matmul_grad, phi::SparseMaskedMatmulGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_matmul_grad, phi::SparseMatmulGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_maxpool_grad, phi::SparseMaxpoolGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_multiply_grad, phi::SparseMultiplyGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_mv_grad, phi::SparseMvGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_pow_grad, phi::SparsePowGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_relu6_grad, phi::SparseRelu6GradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_relu_grad, phi::SparseReluGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_reshape_grad, phi::SparseReshapeGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sin_grad, phi::SparseSinGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sinh_grad, phi::SparseSinhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_softmax_grad, phi::SparseSoftmaxGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sparse_coo_tensor_grad, phi::SparseSparseCooTensorGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sqrt_grad, phi::SparseSqrtGradOpArgumentMapping); 
-PD_REGISTER_ARG_MAPPING_FN(sparse_square_grad, phi::SparseSquareGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_subtract_grad, phi::SparseSubtractGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sum_grad, phi::SparseSumGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_sync_batch_norm_grad, phi::SparseSyncBatchNormGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_tan_grad, phi::SparseTanGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_tanh_grad, phi::SparseTanhGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_to_dense_grad, phi::SparseToDenseGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_to_sparse_coo_grad, phi::SparseToSparseCooGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_transpose_grad, phi::SparseTransposeGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_values_grad, phi::SparseValuesGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_fused_attention_grad, phi::SparseFusedAttentionGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(sparse_slice_grad, phi::SparseSliceGradOpArgumentMapping); diff --git a/paddle/fluid/operators/ops_signature/generated_static_sig.cc b/paddle/fluid/operators/ops_signature/generated_static_sig.cc deleted file mode 100644 index 8e3ffbef1ffde..0000000000000 --- a/paddle/fluid/operators/ops_signature/generated_static_sig.cc +++ /dev/null @@ -1,1585 +0,0 @@ -// this file is generated by paddle/phi/op/yaml/generator/generate_op.py, do not edit. -#include "paddle/phi/core/compat/op_utils.h" -#include "paddle/utils/small_vector.h" - -namespace phi { - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AllGatherOpArgumentMapping: - -return KernelSignature("all_gather", {"x"}, {"ring_id", "nranks"}, {"out"}); -****************************************************************** -*/ - -KernelSignature AllGatherOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - - attrs.emplace_back("nranks"); - paddle::small_vector outputs {"out"}; - return KernelSignature("all_gather", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AllReduceOpArgumentMapping: - -return KernelSignature("all_reduce", {"x"}, {"ring_id", "reduce_type"}, {"out"}); -****************************************************************** -*/ - -KernelSignature AllReduceOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - - attrs.emplace_back("reduce_type"); - paddle::small_vector outputs {"out"}; - return KernelSignature("all_reduce", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AllToAllOpArgumentMapping: - -return KernelSignature("all_to_all", {"x"}, {"ring_id"}, {"out"}); -****************************************************************** -*/ - -KernelSignature AllToAllOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - - paddle::small_vector outputs {"out"}; - return 
KernelSignature("all_to_all", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ArangeOpArgumentMapping: - -return KernelSignature("arange_tensor", {"Start", "End", "Step"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature RangeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Start", "End", "Step"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("arange_tensor", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by BroadcastOpArgumentMapping: - -return KernelSignature("broadcast", {"x"}, {"ring_id", "root"}, {"out"}); -****************************************************************** -*/ - -KernelSignature BroadcastOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - - attrs.emplace_back("root"); - paddle::small_vector outputs {"out"}; - return KernelSignature("broadcast", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv2dTransposeOpArgumentMapping: - -return KernelSignature("conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); -return KernelSignature("conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); -return KernelSignature("conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); -****************************************************************** -*/ - -KernelSignature Conv2dTransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", }; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_padding"); - attrs.emplace_back("output_size"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Output"}; - return KernelSignature("conv2d_transpose", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DecodeJpegOpArgumentMapping: - -return KernelSignature("decode_jpeg", {"X"}, {"mode"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature DecodeJpegOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("mode"); - paddle::small_vector outputs {"Out"}; - return 
KernelSignature("decode_jpeg", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DeformableConvOpArgumentMapping: - -return KernelSignature("deformable_conv", {"Input", "Offset", "Filter", "Mask"}, {"strides", "paddings", "dilations", "deformable_groups", "groups", "im2col_step"}, {"Output"}); -****************************************************************** -*/ - -KernelSignature DeformableConvOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Offset", "Filter", "Mask"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - attrs.emplace_back("deformable_groups"); - attrs.emplace_back("groups"); - attrs.emplace_back("im2col_step"); - paddle::small_vector outputs {"Output"}; - return KernelSignature("deformable_conv", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DepthwiseConv2dTransposeOpArgumentMapping: - -return KernelSignature("depthwise_conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); -return KernelSignature("depthwise_conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); -return KernelSignature("depthwise_conv2d_transpose", {"Input", "Filter"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format"}, {"Output"}); -****************************************************************** -*/ - -KernelSignature DepthwiseConv2dTransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", }; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_padding"); - attrs.emplace_back("output_size"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Output"}; - return KernelSignature("depthwise_conv2d_transpose", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DistConcatOpArgumentMapping: - -return KernelSignature("dist_concat", {"x"}, {"ring_id", "nranks"}, {"out"}); -****************************************************************** -*/ - -KernelSignature DistConcatOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - - attrs.emplace_back("nranks"); - paddle::small_vector outputs {"out"}; - return KernelSignature("dist_concat", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by 
EinsumOpArgumentMapping: - -return KernelSignature("einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache", "XShape"}); -****************************************************************** -*/ - -KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Operands"}; - paddle::small_vector attrs; - attrs.emplace_back("equation"); - paddle::small_vector outputs {"Out", "InnerCache", "XShape"}; - return KernelSignature("einsum", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EmbeddingOpArgumentMapping: - -return KernelSignature("embedding", {"Ids", "W"}, {"padding_idx"}, {"Out"}); -return KernelSignature("sparse_weight_embedding", {"Ids", "W"}, {"padding_idx"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LookupTableV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Ids", "W"}; - paddle::small_vector attrs; - attrs.emplace_back("padding_idx"); - paddle::small_vector outputs {"Out"}; - if ( ctx.IsDenseTensorInput("Ids") && - ctx.IsDenseTensorInput("W")) { - return KernelSignature("embedding", std::move(inputs), std::move(attrs), std::move(outputs)); - } - else if ( ctx.IsDenseTensorInput("Ids") && - ctx.IsSelectedRowsInput("W")) { - return KernelSignature("sparse_weight_embedding", std::move(inputs), std::move(attrs), std::move(outputs)); - } -else { return KernelSignature("unregistered", {}, {}, {}); } -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EmptyOpArgumentMapping: - -return KernelSignature("empty", {}, {"shape", "dtype"}, {"Out"}); -return KernelSignature("empty", {}, {"ShapeTensor", "dtype"}, {"Out"}); -return KernelSignature("empty", {}, {"ShapeTensorList", "dtype"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature EmptyOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("ShapeTensor") - ? "ShapeTensor" - : ctx.InputSize("ShapeTensorList") > 0 - ? 
"ShapeTensorList" - : "shape"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("empty", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EqualOpArgumentMapping: - -return KernelSignature("equal_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature EqualOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - - paddle::small_vector outputs {"Out"}; - return KernelSignature("equal_raw", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ExponentialOpArgumentMapping: - -return KernelSignature("exponential", {"X"}, {"lambda"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature ExponentialOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("lambda"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("exponential", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EyeOpArgumentMapping: - -return KernelSignature("eye", {}, {"num_rows", "num_columns", "dtype"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature EyeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {}; - paddle::small_vector attrs; - attrs.emplace_back("num_rows"); - attrs.emplace_back("num_columns"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("eye", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FrobeniusNormOpArgumentMapping: - -return KernelSignature("frobenius_norm", {"X"}, {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"Out"}); -return KernelSignature("frobenius_norm", {"X"}, {"AxisTensor", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"Out"}); -return KernelSignature("frobenius_norm", {"X"}, {"AxisTensorList", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FrobeniusNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - attrs.emplace_back("keep_dim"); - attrs.emplace_back("reduce_all"); - - - paddle::small_vector outputs {"Out"}; - return KernelSignature("frobenius_norm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures 
returned by FullLikeOpArgumentMapping: - -return KernelSignature("full_like", {"X"}, {"value", "dtype"}, {"Out"}); -return KernelSignature("full_like", {"X"}, {"ValueTensor", "dtype"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature FillAnyLikeOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("value"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("full_like", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GreaterEqualOpArgumentMapping: - -return KernelSignature("greater_equal_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature GreaterEqualOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - - paddle::small_vector outputs {"Out"}; - return KernelSignature("greater_equal_raw", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by GreaterThanOpArgumentMapping: - -return KernelSignature("greater_than_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature GreaterThanOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - - paddle::small_vector outputs {"Out"}; - return KernelSignature("greater_than_raw", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LessEqualOpArgumentMapping: - -return KernelSignature("less_equal_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LessEqualOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - - paddle::small_vector outputs {"Out"}; - return KernelSignature("less_equal_raw", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LessThanOpArgumentMapping: - -return KernelSignature("less_than_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LessThanOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - - paddle::small_vector outputs {"Out"}; - return KernelSignature("less_than_raw", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following 
codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by LinspaceOpArgumentMapping: - -return KernelSignature("linspace", {"Start", "Stop", "Num"}, {"dtype"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature LinspaceOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Start", "Stop", "Num"}; - paddle::small_vector attrs; - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("linspace", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MatmulOpArgumentMapping: - -return KernelSignature("matmul", {"X", "Y"}, {"trans_x", "trans_y"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature MatmulV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("trans_x"); - attrs.emplace_back("trans_y"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("matmul", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NormOpArgumentMapping: - -return KernelSignature("norm", {"X"}, {"axis", "epsilon", "is_test"}, {"Out", "Norm"}); -****************************************************************** -*/ - -KernelSignature NormOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("is_test"); - paddle::small_vector outputs {"Out", "Norm"}; - return KernelSignature("norm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by NotEqualOpArgumentMapping: - -return KernelSignature("not_equal_raw", {"X", "Y"}, {"axis", "force_cpu"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature NotEqualOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - - paddle::small_vector outputs {"Out"}; - return KernelSignature("not_equal_raw", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by OneHotOpArgumentMapping: - -return KernelSignature("one_hot_raw", {"X"}, {"depth", "dtype", "allow_out_of_range"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature OneHotV2OpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back(ctx.HasInput("depth_tensor") ? 
"depth_tensor" : "depth"); - attrs.emplace_back("dtype"); - attrs.emplace_back("allow_out_of_range"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("one_hot_raw", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PRecvOpArgumentMapping: - -return KernelSignature("p_recv", {}, {"ring_id", "peer", "dtype", "dynamic_shape"}, {"out"}); -****************************************************************** -*/ - -KernelSignature PRecvOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {}; - paddle::small_vector attrs; - - attrs.emplace_back("peer"); - attrs.emplace_back("dtype"); - attrs.emplace_back("dynamic_shape"); - paddle::small_vector outputs {"out"}; - return KernelSignature("p_recv", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by PRecvArrayOpArgumentMapping: - -return KernelSignature("p_recv_array", {}, {"ring_id", "peer", "dtype", "out_shape"}, {"out"}); -****************************************************************** -*/ - -KernelSignature PRecvArrayOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {}; - paddle::small_vector attrs; - - attrs.emplace_back("peer"); - attrs.emplace_back("dtype"); - attrs.emplace_back("out_shape"); - paddle::small_vector outputs {"out"}; - return KernelSignature("p_recv_array", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Pool2dOpArgumentMapping: - -return KernelSignature("pool2d", {"X"}, {"ksize", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm", "use_cudnn"}, {"Out"}); -return KernelSignature("pool2d", {"X"}, {"KernelSizeTensor", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm", "use_cudnn"}, {"Out"}); -return KernelSignature("pool2d", {"X"}, {"KernelSizeTensorList", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm", "use_cudnn"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature Pool2dOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("ceil_mode"); - attrs.emplace_back("exclusive"); - attrs.emplace_back("data_format"); - attrs.emplace_back("pooling_type"); - attrs.emplace_back("global_pooling"); - attrs.emplace_back("adaptive"); - attrs.emplace_back("padding_algorithm"); - - paddle::small_vector outputs {"Out"}; - return KernelSignature("pool2d", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by 
Pool3dOpArgumentMapping: - -return KernelSignature("pool3d", {"X"}, {"ksize", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm", "use_cudnn"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature Pool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("ceil_mode"); - attrs.emplace_back("exclusive"); - attrs.emplace_back("data_format"); - attrs.emplace_back("pooling_type"); - attrs.emplace_back("global_pooling"); - attrs.emplace_back("adaptive"); - attrs.emplace_back("padding_algorithm"); - - paddle::small_vector outputs {"Out"}; - return KernelSignature("pool3d", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by QuantLinearOpArgumentMapping: - -return KernelSignature("quant_linear", {"x", "w", "bias"}, {"in_num_col_dims", "activation_type", "padding_weights", "scale_in", "scale_weights", "quant_round_type", "quant_max_bound", "quant_min_bound"}, {"out"}); -****************************************************************** -*/ - -KernelSignature QuantLinearOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x", "w", "bias"}; - paddle::small_vector attrs; - attrs.emplace_back("in_num_col_dims"); - attrs.emplace_back("activation_type"); - attrs.emplace_back("padding_weights"); - attrs.emplace_back("scale_in"); - attrs.emplace_back("scale_weights"); - attrs.emplace_back("quant_round_type"); - attrs.emplace_back("quant_max_bound"); - attrs.emplace_back("quant_min_bound"); - paddle::small_vector outputs {"out"}; - return KernelSignature("quant_linear", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RandpermOpArgumentMapping: - -return KernelSignature("randperm", {}, {"n", "dtype"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature RandpermOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {}; - paddle::small_vector attrs; - attrs.emplace_back("n"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("randperm", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ReduceOpArgumentMapping: - -return KernelSignature("reduce", {"x"}, {"ring_id", "root_id", "reduce_type"}, {"out"}); -****************************************************************** -*/ - -KernelSignature ReduceOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - - attrs.emplace_back("root_id"); - attrs.emplace_back("reduce_type"); - paddle::small_vector outputs {"out"}; - return KernelSignature("reduce", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* 
-****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ReduceScatterOpArgumentMapping: - -return KernelSignature("reduce_scatter", {"x"}, {"ring_id", "nranks"}, {"out"}); -****************************************************************** -*/ - -KernelSignature ReduceScatterOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - - attrs.emplace_back("nranks"); - paddle::small_vector outputs {"out"}; - return KernelSignature("reduce_scatter", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RnnOpArgumentMapping: - -return KernelSignature("rnn", {"Input", "PreState", "WeightList", "SequenceLength"}, {"dropout_prob", "is_bidirec", "input_size", "hidden_size", "num_layers", "mode", "seed", "is_test"}, {"Out", "DropoutState", "State", "Reserve"}); -****************************************************************** -*/ - -KernelSignature RnnOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "PreState", "WeightList", "SequenceLength"}; - paddle::small_vector attrs; - attrs.emplace_back("dropout_prob"); - attrs.emplace_back("is_bidirec"); - attrs.emplace_back("input_size"); - attrs.emplace_back("hidden_size"); - attrs.emplace_back("num_layers"); - attrs.emplace_back("mode"); - attrs.emplace_back("seed"); - attrs.emplace_back("is_test"); - paddle::small_vector outputs {"Out", "DropoutState", "State", "Reserve"}; - return KernelSignature("rnn", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ShadowOutputOpArgumentMapping: - -return KernelSignature("shadow_output", {"x"}, {"name"}, {"out"}); -****************************************************************** -*/ - -KernelSignature ShadowOutputOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x"}; - paddle::small_vector attrs; - - paddle::small_vector outputs {"out"}; - return KernelSignature("shadow_output", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ShareBufferOpArgumentMapping: - -return KernelSignature("share_buffer", {"X"}, {"share_dims_and_dtype"}, {"Out", "XOut"}); -****************************************************************** -*/ - -KernelSignature ShareBufferOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("share_dims_and_dtype"); - paddle::small_vector outputs {"Out", "XOut"}; - return KernelSignature("share_buffer", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SoftmaxOpArgumentMapping: - -return KernelSignature("softmax", {"X"}, {"axis"}, {"Out"}); 
-****************************************************************** -*/ - -KernelSignature SoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("softmax", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SwishOpArgumentMapping: - -return KernelSignature("swish", {"X"}, {}, {"Out"}); -****************************************************************** -*/ - -KernelSignature SwishOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"Out"}; - return KernelSignature("swish", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TrilIndicesOpArgumentMapping: - -return KernelSignature("tril_indices", {}, {"rows", "cols", "offset", "dtype"}, {"out"}); -****************************************************************** -*/ - -KernelSignature TrilIndicesOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {}; - paddle::small_vector attrs; - attrs.emplace_back("rows"); - attrs.emplace_back("cols"); - attrs.emplace_back("offset"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"out"}; - return KernelSignature("tril_indices", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TrilTriuOpArgumentMapping: - -return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X"}; - paddle::small_vector attrs; - attrs.emplace_back("diagonal"); - attrs.emplace_back("lower"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("tril_triu", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TriuIndicesOpArgumentMapping: - -return KernelSignature("triu_indices", {}, {"row", "col", "offset", "dtype"}, {"out"}); -****************************************************************** -*/ - -KernelSignature TriuIndicesOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {}; - paddle::small_vector attrs; - attrs.emplace_back("row"); - attrs.emplace_back("col"); - attrs.emplace_back("offset"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"out"}; - return KernelSignature("triu_indices", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TruncatedGaussianRandomOpArgumentMapping: - -return 
KernelSignature("truncated_gaussian_random", {}, {"shape", "mean", "std", "seed", "dtype"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature TruncatedGaussianRandomOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {}; - paddle::small_vector attrs; - attrs.emplace_back("shape"); - attrs.emplace_back("mean"); - attrs.emplace_back("std"); - attrs.emplace_back("seed"); - attrs.emplace_back("dtype"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("truncated_gaussian_random", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UnpoolOpArgumentMapping: - -return KernelSignature("unpool", {"X", "Indices"}, {"ksize", "unpooling_type", "strides", "paddings", "output_size", "data_format"}, {"Out"}); -return KernelSignature("unpool", {"X", "Indices"}, {"ksize", "unpooling_type", "strides", "paddings", "OutputSizeTensor", "data_format"}, {"Out"}); -return KernelSignature("unpool", {"X", "Indices"}, {"ksize", "unpooling_type", "strides", "paddings", "OutputSizeTensorList", "data_format"}, {"Out"}); -****************************************************************** -*/ - -KernelSignature UnpoolOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Indices"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_size"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Out"}; - return KernelSignature("unpool", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AmaxGradOpArgumentMapping: - -return KernelSignature("amax_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("amax_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("amax_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ReduceAmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("AxisTensor") - ? "AxisTensor" - : ctx.InputSize("AxisTensorList") > 0 - ? 
"AxisTensorList" - : "dim"); - attrs.emplace_back("keep_dim"); - attrs.emplace_back("reduce_all"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("amax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by AminGradOpArgumentMapping: - -return KernelSignature("amin_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("amin_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("amin_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ReduceAminGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back( - ctx.HasInput("AxisTensor") - ? "AxisTensor" - : ctx.InputSize("AxisTensorList") > 0 - ? "AxisTensorList" - : "dim"); - attrs.emplace_back("keep_dim"); - attrs.emplace_back("reduce_all"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("amin_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv2dTransposeDoubleGradOpArgumentMapping: - -return KernelSignature("conv2d_transpose_double_grad", {"Input", "Filter", "grad_out", "grad_x@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); -return KernelSignature("conv2d_transpose_double_grad", {"Input", "Filter", "grad_out", "grad_x@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); -return KernelSignature("conv2d_transpose_double_grad", {"Input", "Filter", "grad_out", "grad_x@GRAD", "grad_filter@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature Conv2dTransposeGradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "grad_out", "grad_x@GRAD", "grad_filter@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_padding"); - attrs.emplace_back("output_size"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD", "grad_out@GRAD"}; - return KernelSignature("conv2d_transpose_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Conv2dTransposeGradOpArgumentMapping: - -return 
KernelSignature("conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); -return KernelSignature("conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); -return KernelSignature("conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); -****************************************************************** -*/ - -KernelSignature Conv2dTransposeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_padding"); - attrs.emplace_back("output_size"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; - return KernelSignature("conv2d_transpose_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DeformableConvGradOpArgumentMapping: - -return KernelSignature("deformable_conv_grad", {"Input", "Offset", "Filter", "Mask", "Output@GRAD"}, {"strides", "paddings", "dilations", "deformable_groups", "groups", "im2col_step"}, {"Input@GRAD", "Offset@GRAD", "Filter@GRAD", "Mask@GRAD"}); -****************************************************************** -*/ - -KernelSignature DeformableConvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Offset", "Filter", "Mask", "Output@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("dilations"); - attrs.emplace_back("deformable_groups"); - attrs.emplace_back("groups"); - attrs.emplace_back("im2col_step"); - paddle::small_vector outputs {"Input@GRAD", "Offset@GRAD", "Filter@GRAD", "Mask@GRAD"}; - return KernelSignature("deformable_conv_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by DepthwiseConv2dTransposeGradOpArgumentMapping: - -return KernelSignature("depthwise_conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "output_size", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); -return KernelSignature("depthwise_conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensor", "padding_algorithm", "groups", "dilations", "data_format"}, {"Input@GRAD", "Filter@GRAD"}); -return KernelSignature("depthwise_conv2d_transpose_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", "output_padding", "OutputSizeTensorList", "padding_algorithm", "groups", "dilations", "data_format"}, 
{"Input@GRAD", "Filter@GRAD"}); -****************************************************************** -*/ - -KernelSignature DepthwiseConv2dTransposeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "Filter", "Output@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_padding"); - attrs.emplace_back("output_size"); - attrs.emplace_back("padding_algorithm"); - attrs.emplace_back("groups"); - attrs.emplace_back("dilations"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"Input@GRAD", "Filter@GRAD"}; - return KernelSignature("depthwise_conv2d_transpose_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by EinsumGradOpArgumentMapping: - -return KernelSignature("einsum_grad", {"x_shape", "InnerCache", "Out@GRAD"}, {"equation"}, {"Operands@GRAD"}); -****************************************************************** -*/ - -KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"x_shape", "InnerCache", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("equation"); - paddle::small_vector outputs {"Operands@GRAD"}; - return KernelSignature("einsum_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ElementwisePowGradOpArgumentMapping: - -return KernelSignature("elementwise_pow_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature ElementwisePowGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("elementwise_pow_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by FrobeniusNormGradOpArgumentMapping: - -return KernelSignature("frobenius_norm_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"X@GRAD"}); -return KernelSignature("frobenius_norm_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"X@GRAD"}); -return KernelSignature("frobenius_norm_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature FrobeniusNormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - attrs.emplace_back("keep_dim"); - attrs.emplace_back("reduce_all"); - - - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("frobenius_norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The 
following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by HardswishGradOpArgumentMapping: - -return KernelSignature("hardswish_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature HardSwishGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("hardswish_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MatmulDoubleGradOpArgumentMapping: - -return KernelSignature("matmul_double_grad", {"X", "Y", "grad_out", "grad_x@GRAD", "grad_y@GRAD"}, {"trans_x", "trans_y"}, {"X@GRAD", "Y@GRAD", "grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature MatmulV2GradGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "grad_out", "grad_x@GRAD", "grad_y@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("trans_x"); - attrs.emplace_back("trans_y"); - paddle::small_vector outputs {"X@GRAD", "Y@GRAD", "grad_out@GRAD"}; - return KernelSignature("matmul_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MatmulGradOpArgumentMapping: - -return KernelSignature("matmul_grad", {"X", "Y", "Out@GRAD"}, {"trans_x", "trans_y"}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature MatmulV2GradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("trans_x"); - attrs.emplace_back("trans_y"); - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("matmul_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MatmulTripleGradOpArgumentMapping: - -return KernelSignature("matmul_triple_grad", {"X", "Y", "grad_out", "grad_grad_x", "grad_grad_y", "grad_x@GRAD", "grad_y@GRAD", "grad_grad_out@GRAD"}, {"trans_x", "trans_y"}, {"X@GRAD", "Y@GRAD", "grad_out@GRAD", "grad_grad_x@GRAD", "grad_grad_y@GRAD"}); -****************************************************************** -*/ - -KernelSignature MatmulV2TripleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "grad_out", "grad_grad_x", "grad_grad_y", "grad_x@GRAD", "grad_y@GRAD", "grad_grad_out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("trans_x"); - attrs.emplace_back("trans_y"); - paddle::small_vector outputs {"X@GRAD", "Y@GRAD", "grad_out@GRAD", "grad_grad_x@GRAD", "grad_grad_y@GRAD"}; - return KernelSignature("matmul_triple_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible 
KernelSignatures returned by MaxGradOpArgumentMapping: - -return KernelSignature("max_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("max_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("max_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ReduceMaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - attrs.emplace_back("keep_dim"); - attrs.emplace_back("reduce_all"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("max_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MaximumGradOpArgumentMapping: - -return KernelSignature("maximum_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature ElementwiseMaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("maximum_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MinGradOpArgumentMapping: - -return KernelSignature("min_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("min_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("min_grad", {"X", "Out", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ReduceMinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - attrs.emplace_back("keep_dim"); - attrs.emplace_back("reduce_all"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("min_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by MinimumGradOpArgumentMapping: - -return KernelSignature("minimum_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -****************************************************************** -*/ - -KernelSignature ElementwiseMinGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Y", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD", "Y@GRAD"}; - return KernelSignature("minimum_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by 
NormGradOpArgumentMapping: - -return KernelSignature("norm_grad", {"X", "Norm", "Out@GRAD"}, {"axis", "epsilon", "is_test"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Norm", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - attrs.emplace_back("epsilon"); - attrs.emplace_back("is_test"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("norm_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Pool2dDoubleGradOpArgumentMapping: - -return KernelSignature("pool2d_double_grad", {"grad_x@GRAD"}, {"ksize", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"grad_out@GRAD"}); -return KernelSignature("pool2d_double_grad", {"grad_x@GRAD"}, {"KernelSizeTensor", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"grad_out@GRAD"}); -return KernelSignature("pool2d_double_grad", {"grad_x@GRAD"}, {"KernelSizeTensorList", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"grad_out@GRAD"}); -****************************************************************** -*/ - -KernelSignature Pool2dDoubleGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"grad_x@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("ceil_mode"); - attrs.emplace_back("exclusive"); - attrs.emplace_back("data_format"); - attrs.emplace_back("pooling_type"); - attrs.emplace_back("global_pooling"); - attrs.emplace_back("adaptive"); - attrs.emplace_back("padding_algorithm"); - paddle::small_vector outputs {"grad_out@GRAD"}; - return KernelSignature("pool2d_double_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Pool2dGradOpArgumentMapping: - -return KernelSignature("pool2d_grad", {"X", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"X@GRAD"}); -return KernelSignature("pool2d_grad", {"X", "Out", "Out@GRAD"}, {"KernelSizeTensor", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"X@GRAD"}); -return KernelSignature("pool2d_grad", {"X", "Out", "Out@GRAD"}, {"KernelSizeTensorList", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Pool2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - 
attrs.emplace_back("ceil_mode"); - attrs.emplace_back("exclusive"); - attrs.emplace_back("data_format"); - attrs.emplace_back("pooling_type"); - attrs.emplace_back("global_pooling"); - attrs.emplace_back("adaptive"); - attrs.emplace_back("padding_algorithm"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("pool2d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by Pool3dGradOpArgumentMapping: - -return KernelSignature("pool3d_grad", {"X", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "ceil_mode", "exclusive", "data_format", "pooling_type", "global_pooling", "adaptive", "padding_algorithm"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature Pool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("ceil_mode"); - attrs.emplace_back("exclusive"); - attrs.emplace_back("data_format"); - attrs.emplace_back("pooling_type"); - attrs.emplace_back("global_pooling"); - attrs.emplace_back("adaptive"); - attrs.emplace_back("padding_algorithm"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("pool3d_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by ProdGradOpArgumentMapping: - -return KernelSignature("prod_grad", {"X", "Out", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("prod_grad", {"X", "Out", "Out@GRAD"}, {"DimsTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("prod_grad", {"X", "Out", "Out@GRAD"}, {"DimsTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ReduceProdGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - attrs.emplace_back("keep_dim"); - attrs.emplace_back("reduce_all"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("prod_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by RnnGradOpArgumentMapping: - -return KernelSignature("rnn_grad", {"Input", "PreState", "WeightList", "SequenceLength", "Out", "DropoutState", "Reserve", "Out@GRAD", "State@GRAD"}, {"dropout_prob", "is_bidirec", "input_size", "hidden_size", "num_layers", "mode", "seed", "is_test"}, {"Input@GRAD", "PreState@GRAD", "WeightList@GRAD"}); -****************************************************************** -*/ - -KernelSignature RnnGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Input", "PreState", "WeightList", "SequenceLength", "Out", "DropoutState", "Reserve", "Out@GRAD", "State@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dropout_prob"); - 
attrs.emplace_back("is_bidirec"); - attrs.emplace_back("input_size"); - attrs.emplace_back("hidden_size"); - attrs.emplace_back("num_layers"); - attrs.emplace_back("mode"); - attrs.emplace_back("seed"); - attrs.emplace_back("is_test"); - paddle::small_vector outputs {"Input@GRAD", "PreState@GRAD", "WeightList@GRAD"}; - return KernelSignature("rnn_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SoftmaxGradOpArgumentMapping: - -return KernelSignature("softmax_grad", {"Out", "Out@GRAD"}, {"axis"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SoftmaxGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("axis"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("softmax_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SumGradOpArgumentMapping: - -return KernelSignature("sum_grad", {"X", "Out@GRAD"}, {"dim", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("sum_grad", {"X", "Out@GRAD"}, {"AxisTensor", "keep_dim", "reduce_all"}, {"X@GRAD"}); -return KernelSignature("sum_grad", {"X", "Out@GRAD"}, {"AxisTensorList", "keep_dim", "reduce_all"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature ReduceSumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("dim"); - attrs.emplace_back("keep_dim"); - attrs.emplace_back("reduce_all"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("sum_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by SwishGradOpArgumentMapping: - -return KernelSignature("swish_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature SwishGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Out@GRAD"}; - paddle::small_vector attrs; - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("swish_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by TrilTriuGradOpArgumentMapping: - -return KernelSignature("tril_triu_grad", {"Out@GRAD"}, {"diagonal", "lower"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature TrilTriuGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("diagonal"); - attrs.emplace_back("lower"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("tril_triu_grad", std::move(inputs), std::move(attrs), 
std::move(outputs)); -} - -/* -****************************************************************** -NOTE: The following codes are for 'get_compat_kernel_signature.py' -All possible KernelSignatures returned by UnpoolGradOpArgumentMapping: - -return KernelSignature("unpool_grad", {"X", "Indices", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "output_size", "data_format"}, {"X@GRAD"}); -return KernelSignature("unpool_grad", {"X", "Indices", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "OutputSizeTensor", "data_format"}, {"X@GRAD"}); -return KernelSignature("unpool_grad", {"X", "Indices", "Out", "Out@GRAD"}, {"ksize", "strides", "paddings", "OutputSizeTensorList", "data_format"}, {"X@GRAD"}); -****************************************************************** -*/ - -KernelSignature UnpoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - paddle::small_vector inputs {"X", "Indices", "Out", "Out@GRAD"}; - paddle::small_vector attrs; - attrs.emplace_back("ksize"); - attrs.emplace_back("strides"); - attrs.emplace_back("paddings"); - attrs.emplace_back("output_size"); - attrs.emplace_back("data_format"); - paddle::small_vector outputs {"X@GRAD"}; - return KernelSignature("unpool_grad", std::move(inputs), std::move(attrs), std::move(outputs)); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(all_gather, phi::AllGatherOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(all_reduce, phi::AllReduceOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(all_to_all, phi::AllToAllOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(range, arange); -PD_REGISTER_ARG_MAPPING_FN(range, phi::RangeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(broadcast, phi::BroadcastOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose, phi::Conv2dTransposeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(decode_jpeg, phi::DecodeJpegOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(deformable_conv, phi::DeformableConvOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_transpose, phi::DepthwiseConv2dTransposeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(dist_concat, phi::DistConcatOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(einsum, phi::EinsumOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(lookup_table_v2, embedding); -PD_REGISTER_ARG_MAPPING_FN(lookup_table_v2, phi::LookupTableV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(empty, phi::EmptyOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(equal, phi::EqualOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(exponential, phi::ExponentialOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(eye, phi::EyeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(frobenius_norm, phi::FrobeniusNormOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(fill_any_like, full_like); -PD_REGISTER_ARG_MAPPING_FN(fill_any_like, phi::FillAnyLikeOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(greater_equal, phi::GreaterEqualOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(greater_than, phi::GreaterThanOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(less_equal, phi::LessEqualOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(less_than, phi::LessThanOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(linspace, phi::LinspaceOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(matmul_v2, matmul); -PD_REGISTER_ARG_MAPPING_FN(matmul_v2, phi::MatmulV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(norm, phi::NormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(not_equal, phi::NotEqualOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(one_hot_v2, one_hot); 
-PD_REGISTER_ARG_MAPPING_FN(one_hot_v2, phi::OneHotV2OpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(p_recv, phi::PRecvOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(p_recv_array, phi::PRecvArrayOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pool2d, phi::Pool2dOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pool3d, phi::Pool3dOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(quant_linear, phi::QuantLinearOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(randperm, phi::RandpermOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(reduce, phi::ReduceOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(reduce_scatter, phi::ReduceScatterOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(rnn, phi::RnnOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(shadow_output, phi::ShadowOutputOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(share_buffer, phi::ShareBufferOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(softmax, phi::SoftmaxOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(swish, phi::SwishOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tril_indices, phi::TrilIndicesOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(triu_indices, phi::TriuIndicesOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(truncated_gaussian_random, phi::TruncatedGaussianRandomOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unpool, phi::UnpoolOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(reduce_amax_grad, amax_grad); -PD_REGISTER_ARG_MAPPING_FN(reduce_amax_grad, phi::ReduceAmaxGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(reduce_amin_grad, amin_grad); -PD_REGISTER_ARG_MAPPING_FN(reduce_amin_grad, phi::ReduceAminGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(conv2d_transpose_grad_grad, conv2d_transpose_double_grad); -PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_grad_grad, phi::Conv2dTransposeGradGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(conv2d_transpose_grad, phi::Conv2dTransposeGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(deformable_conv_grad, phi::DeformableConvGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_transpose_grad, phi::DepthwiseConv2dTransposeGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(einsum_grad, phi::EinsumGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(elementwise_pow_grad, phi::ElementwisePowGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(exponential_grad, exponential__grad); -PD_REGISTER_ARG_MAPPING_FN(frobenius_norm_grad, phi::FrobeniusNormGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(hard_swish_grad, hardswish_grad); -PD_REGISTER_ARG_MAPPING_FN(hard_swish_grad, phi::HardSwishGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad_grad, matmul_double_grad); -PD_REGISTER_ARG_MAPPING_FN(matmul_v2_grad_grad, phi::MatmulV2GradGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad, matmul_grad); -PD_REGISTER_ARG_MAPPING_FN(matmul_v2_grad, phi::MatmulV2GradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_triple_grad, matmul_triple_grad); -PD_REGISTER_ARG_MAPPING_FN(matmul_v2_triple_grad, phi::MatmulV2TripleGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(reduce_max_grad, max_grad); -PD_REGISTER_ARG_MAPPING_FN(reduce_max_grad, phi::ReduceMaxGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(elementwise_max_grad, maximum_grad); -PD_REGISTER_ARG_MAPPING_FN(elementwise_max_grad, phi::ElementwiseMaxGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(reduce_min_grad, min_grad); -PD_REGISTER_ARG_MAPPING_FN(reduce_min_grad, 
phi::ReduceMinGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(elementwise_min_grad, minimum_grad); -PD_REGISTER_ARG_MAPPING_FN(elementwise_min_grad, phi::ElementwiseMinGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(norm_grad, phi::NormGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pool2d_double_grad, phi::Pool2dDoubleGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pool2d_grad, phi::Pool2dGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(pool3d_grad, phi::Pool3dGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(reduce_prod_grad, prod_grad); -PD_REGISTER_ARG_MAPPING_FN(reduce_prod_grad, phi::ReduceProdGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(rnn_grad, phi::RnnGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(softmax_grad, phi::SoftmaxGradOpArgumentMapping); -PD_REGISTER_BASE_KERNEL_NAME(reduce_sum_grad, sum_grad); -PD_REGISTER_ARG_MAPPING_FN(reduce_sum_grad, phi::ReduceSumGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(swish_grad, phi::SwishGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(unpool_grad, phi::UnpoolGradOpArgumentMapping); From 9b332fd4fb1bd34fdec9b2e3fd5e1b06b901b6b5 Mon Sep 17 00:00:00 2001 From: PommesPeter <434596665@qq.com> Date: Tue, 19 Dec 2023 13:27:30 +0800 Subject: [PATCH 06/15] :mute: Remove: deleted extra files --- .../conv2d/generated_tmp/conv2d_bias_act.cu | 4349 ----------------- .../generated_tmp/conv2d_bias_residual.cu | 2389 --------- .../conv2d_depthwise_bias_act.cu | 3502 ------------- .../fpA_intB_gemm/autogen_tmp/arch_define.h | 4 - ...m_kernelLauncher_bf16_sm80_stages2_bias.cu | 439 -- ...kernelLauncher_bf16_sm80_stages2_noBias.cu | 439 -- ...m_kernelLauncher_bf16_sm80_stages3_bias.cu | 439 -- ...kernelLauncher_bf16_sm80_stages3_noBias.cu | 439 -- ...m_kernelLauncher_bf16_sm80_stages4_bias.cu | 439 -- ...kernelLauncher_bf16_sm80_stages4_noBias.cu | 439 -- ...m_kernelLauncher_bf16_sm80_stages5_bias.cu | 439 -- ...kernelLauncher_bf16_sm80_stages5_noBias.cu | 439 -- ...m_kernelLauncher_fp16_sm80_stages2_bias.cu | 439 -- ...kernelLauncher_fp16_sm80_stages2_noBias.cu | 439 -- ...m_kernelLauncher_fp16_sm80_stages3_bias.cu | 439 -- ...kernelLauncher_fp16_sm80_stages3_noBias.cu | 439 -- ...m_kernelLauncher_fp16_sm80_stages4_bias.cu | 439 -- ...kernelLauncher_fp16_sm80_stages4_noBias.cu | 439 -- .../fpA_intB_gemm/fpA_intB_gemm.h | 158 - .../fpA_intB_gemm/fpA_intB_gemm_template.cu | 681 --- .../fpA_intB_gemm/fpA_intB_gemm_template.h | 519 -- .../generic_mixed_gemm_kernelLauncher.py | 228 - python/paddle/tensor/__init__.py | 8 +- python/paddle/tensor/random.py | 20 +- test/legacy_test/test_inplace.py | 4 +- third_party/flashattn | 2 +- 26 files changed, 19 insertions(+), 17991 deletions(-) delete mode 100644 paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_act.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_residual.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_depthwise_bias_act.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/arch_define.h delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_bias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_noBias.cu delete mode 100644 
paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_bias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_noBias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_bias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_noBias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_bias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_noBias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_bias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_noBias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_bias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_noBias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_bias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_noBias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_act.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_act.cu deleted file mode 100644 index c8bc7b420c82d..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_act.cu +++ /dev/null @@ -1,4349 +0,0 @@ - -// Generated by conv2d_bias_act.py - Do not edit. 
- -#include -#include "cutlass/conv/kernel/default_conv2d_fprop.h" -#include "cutlass/epilogue/thread/linear_combination_leaky_relu.h" -#include "cutlass/epilogue/thread/linear_combination_silu.h" -#include "cutlass/epilogue/thread/linear_combination_bias_relu.h" -#include "cutlass/epilogue/thread/linear_combination_sigmoid.h" -#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" - - -namespace phi { -namespace fusion { -namespace cutlass_internal { - -cutlass::Status conv2d_bias_sm750(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sm751(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 64>, - 
cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sm752(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - 
int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sm753(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); 
- - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sm754(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sm755(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - 
cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sm756(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int 
stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sm757(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; 
- size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sm758(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -std::vector> - conv2d_bias_sm75_all_func = {conv2d_bias_sm750, -conv2d_bias_sm751, -conv2d_bias_sm752, -conv2d_bias_sm753, -conv2d_bias_sm754, -conv2d_bias_sm755, -conv2d_bias_sm756, -conv2d_bias_sm757, -conv2d_bias_sm758, -}; - -std::map, int> map_problem_conv2d_bias_sm75; -std::mutex 
conv2d_bias_sm75_mutex;
-
-void conv2d_bias_sm75(const ConvAllParams& params) {
-  int batch = params.batch;
-  int ic = params.ic;
-  int ih = params.ih;
-  int iw = params.iw;
-  int kh = params.kh;
-  int kw = params.kw;
-  int oc = params.oc;
-  //int pad_h0 = params.pad_h0;
-  //int pad_w0 = params.pad_w0;
-  int groups = params.groups;
-  int stride_h = params.stride_h;
-  int stride_w = params.stride_w;
-
-  std::vector<int> problem_size = {
-      batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w};
-
-  if (map_problem_conv2d_bias_sm75.count(problem_size)) {
-    conv2d_bias_sm75_all_func[map_problem_conv2d_bias_sm75.at(problem_size)](
-        params);
-    return;
-  }
-
-  int best_config_index = ProfileToGetBestConfig(
-      conv2d_bias_sm75_all_func, params, CONV2D_BIAS);
-
-  std::lock_guard<std::mutex> guard(conv2d_bias_sm75_mutex);
-
-  map_problem_conv2d_bias_sm75[problem_size] = best_config_index;
-  conv2d_bias_sm75_all_func[best_config_index](params);
-}
-
-cutlass::Status conv2d_bias_relu_sm750(const ConvAllParams& params) {
-  using kernel_base =
-  typename cutlass::conv::kernel::DefaultConv2dFprop<
-    cutlass::half_t,
-    cutlass::layout::TensorNHWC,
-    cutlass::half_t,
-    cutlass::layout::TensorNHWC,
-    cutlass::half_t,
-    cutlass::layout::TensorNHWC,
-    float,
-    cutlass::arch::OpClassTensorOp,
-    cutlass::arch::Sm75,
-    cutlass::gemm::GemmShape<64, 64, 64>,
-    cutlass::gemm::GemmShape<32, 32, 64>,
-    cutlass::gemm::GemmShape<16,8,8>,
-    cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>,
-    2,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kOptimized,
-    cutlass::conv::StrideSupport::kStrided,
-    8,
-    8
-  >::Kernel;
-
-  using ImplicitGemm =
-      cutlass::conv::device::ImplicitGemmConvolution<kernel_base>;
-  const half *input = params.input;
-  const half *weight = params.weight;
-  const half *bias = params.bias;
-  half *output = params.output;
-  int batch = params.batch;
-  int ic = params.ic;
-  int ih = params.ih;
-  int iw = params.iw;
-  int kh = params.kh;
-  int kw = params.kw;
-  int oc = params.oc;
-  int pad_h0 = params.pad_h0;
-  int pad_w0 = params.pad_w0;
-  int stride_h = params.stride_h;
-  int stride_w = params.stride_w;
-  int groups = params.groups;
-  int kc = ic / groups;
-
-  int oh = params.oh;
-  int ow = params.ow;
-  int dilation_h = params.dilation_h;
-  int dilation_w = params.dilation_w;
-  int split_k_slices = 1;
-
-  cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic},
-                                                {oc, kh, kw, ic / groups},
-                                                {pad_h0, 0, pad_w0, 0},
-                                                {stride_h, stride_w},
-                                                {dilation_h, dilation_w},
-                                                {batch, oh, ow, oc},
-                                                cutlass::conv::Mode::kCrossCorrelation,
-                                                split_k_slices,
-                                                groups);
-
-  typename ImplicitGemm::Arguments arguments{
-      problem_size,
-      {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
-      {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}},
-      {(cutlass::half_t *)(bias), {0, 0, 0}},
-      {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}},
-      {1.f, 1.f}};
-
-  ImplicitGemm implicit_gemm_op;
-  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
-
-  auto ctx = params.ctx;
-  auto stream = ctx->stream();
-  phi::Allocator::AllocationPtr tmp_gpu_ptrs_data =
-      phi::memory_utils::Alloc(
-          ctx->GetPlace(),
-          bytes,
-          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
-  void *workspace = tmp_gpu_ptrs_data->ptr();
-
-  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
-  CUTLASS_CHECK(status);
-  status = implicit_gemm_op.initialize(arguments, workspace);
-  CUTLASS_CHECK(status);
-  status =
implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_relu_sm751(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_relu_sm752(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using 
ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_relu_sm753(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, 
dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_relu_sm754(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - 
status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_relu_sm755(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_relu_sm756(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - 
cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_relu_sm757(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize 
problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_relu_sm758(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = 
tmp_gpu_ptrs_data->ptr();
-
-  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
-  CUTLASS_CHECK(status);
-  status = implicit_gemm_op.initialize(arguments, workspace);
-  CUTLASS_CHECK(status);
-  status = implicit_gemm_op(stream);
-  CUTLASS_CHECK(status);
-  return status;
-}
-
-std::vector<std::function<cutlass::Status(const ConvAllParams&)>>
-    conv2d_bias_relu_sm75_all_func = {conv2d_bias_relu_sm750,
-conv2d_bias_relu_sm751,
-conv2d_bias_relu_sm752,
-conv2d_bias_relu_sm753,
-conv2d_bias_relu_sm754,
-conv2d_bias_relu_sm755,
-conv2d_bias_relu_sm756,
-conv2d_bias_relu_sm757,
-conv2d_bias_relu_sm758,
-};
-
-std::map<std::vector<int>, int> map_problem_conv2d_bias_relu_sm75;
-std::mutex conv2d_bias_relu_sm75_mutex;
-
-void conv2d_bias_relu_sm75(const ConvAllParams& params) {
-  int batch = params.batch;
-  int ic = params.ic;
-  int ih = params.ih;
-  int iw = params.iw;
-  int kh = params.kh;
-  int kw = params.kw;
-  int oc = params.oc;
-  //int pad_h0 = params.pad_h0;
-  //int pad_w0 = params.pad_w0;
-  int groups = params.groups;
-  int stride_h = params.stride_h;
-  int stride_w = params.stride_w;
-
-  std::vector<int> problem_size = {
-      batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w};
-
-  if (map_problem_conv2d_bias_relu_sm75.count(problem_size)) {
-    conv2d_bias_relu_sm75_all_func[map_problem_conv2d_bias_relu_sm75.at(problem_size)](
-        params);
-    return;
-  }
-
-  int best_config_index = ProfileToGetBestConfig(
-      conv2d_bias_relu_sm75_all_func, params, CONV2D_BIAS_RELU);
-
-  std::lock_guard<std::mutex> guard(conv2d_bias_relu_sm75_mutex);
-
-  map_problem_conv2d_bias_relu_sm75[problem_size] = best_config_index;
-  conv2d_bias_relu_sm75_all_func[best_config_index](params);
-}
-
-cutlass::Status conv2d_bias_silu_sm750(const ConvAllParams& params) {
-  using kernel_base =
-  typename cutlass::conv::kernel::DefaultConv2dFprop<
-    cutlass::half_t,
-    cutlass::layout::TensorNHWC,
-    cutlass::half_t,
-    cutlass::layout::TensorNHWC,
-    cutlass::half_t,
-    cutlass::layout::TensorNHWC,
-    float,
-    cutlass::arch::OpClassTensorOp,
-    cutlass::arch::Sm75,
-    cutlass::gemm::GemmShape<64, 64, 64>,
-    cutlass::gemm::GemmShape<32, 32, 64>,
-    cutlass::gemm::GemmShape<16,8,8>,
-    cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>,
-    2,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kOptimized,
-    cutlass::conv::StrideSupport::kStrided,
-    8,
-    8
-  >::Kernel;
-
-  using ImplicitGemm =
-      cutlass::conv::device::ImplicitGemmConvolution<kernel_base>;
-  const half *input = params.input;
-  const half *weight = params.weight;
-  const half *bias = params.bias;
-  half *output = params.output;
-  int batch = params.batch;
-  int ic = params.ic;
-  int ih = params.ih;
-  int iw = params.iw;
-  int kh = params.kh;
-  int kw = params.kw;
-  int oc = params.oc;
-  int pad_h0 = params.pad_h0;
-  int pad_w0 = params.pad_w0;
-  int stride_h = params.stride_h;
-  int stride_w = params.stride_w;
-  int groups = params.groups;
-  int kc = ic / groups;
-
-  int oh = params.oh;
-  int ow = params.ow;
-  int dilation_h = params.dilation_h;
-  int dilation_w = params.dilation_w;
-  int split_k_slices = 1;
-
-  cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic},
-                                                {oc, kh, kw, ic / groups},
-                                                {pad_h0, 0, pad_w0, 0},
-                                                {stride_h, stride_w},
-                                                {dilation_h, dilation_w},
-                                                {batch, oh, ow, oc},
-                                                cutlass::conv::Mode::kCrossCorrelation,
-                                                split_k_slices,
-                                                groups);
-
-  typename ImplicitGemm::Arguments arguments{
-      problem_size,
-      {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}},
-      {(cutlass::half_t *)(weight), {kc, kc *
kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_sm751(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_sm752(const ConvAllParams& params) { - using kernel_base = - typename 
cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_sm753(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half 
*output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_sm754(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - 
{(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_sm755(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - 
-cutlass::Status conv2d_bias_silu_sm756(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_sm757(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half 
*input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_sm758(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - 
split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -std::vector> - conv2d_bias_silu_sm75_all_func = {conv2d_bias_silu_sm750, -conv2d_bias_silu_sm751, -conv2d_bias_silu_sm752, -conv2d_bias_silu_sm753, -conv2d_bias_silu_sm754, -conv2d_bias_silu_sm755, -conv2d_bias_silu_sm756, -conv2d_bias_silu_sm757, -conv2d_bias_silu_sm758, -}; - -std::map, int> map_problem_conv2d_bias_silu_sm75; -std::mutex conv2d_bias_silu_sm75_mutex; - -void conv2d_bias_silu_sm75(const ConvAllParams& params) { - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - //int pad_h0 = params.pad_h0; - //int pad_w0 = params.pad_w0; - int groups = params.groups; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - - std::vector problem_size = { - batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; - - if (map_problem_conv2d_bias_silu_sm75.count(problem_size)) { - conv2d_bias_silu_sm75_all_func[map_problem_conv2d_bias_silu_sm75.at(problem_size)]( - params); - return; - } - - int best_config_index = ProfileToGetBestConfig( - conv2d_bias_silu_sm75_all_func, params, CONV2D_BIAS_SILU); - - std::lock_guard guard(conv2d_bias_silu_sm75_mutex); - - map_problem_conv2d_bias_silu_sm75[problem_size] = best_config_index; - conv2d_bias_silu_sm75_all_func[best_config_index](params); -} - -cutlass::Status conv2d_bias_leaky_relu_sm750(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int 
stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f, alpha}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_leaky_relu_sm751(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - 
{1.f, 1.f, alpha}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_leaky_relu_sm752(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f, alpha}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_leaky_relu_sm753(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - 
cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f, alpha}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_leaky_relu_sm754(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = 
params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f, alpha}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_leaky_relu_sm755(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - float alpha = params.alpha; typename 
ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f, alpha}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_leaky_relu_sm756(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f, alpha}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - 
CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_leaky_relu_sm757(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f, alpha}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_leaky_relu_sm758(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationLeakyRelu< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - 
cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - float alpha = params.alpha; typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f, alpha}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -std::vector> - conv2d_bias_leaky_relu_sm75_all_func = {conv2d_bias_leaky_relu_sm750, -conv2d_bias_leaky_relu_sm751, -conv2d_bias_leaky_relu_sm752, -conv2d_bias_leaky_relu_sm753, -conv2d_bias_leaky_relu_sm754, -conv2d_bias_leaky_relu_sm755, -conv2d_bias_leaky_relu_sm756, -conv2d_bias_leaky_relu_sm757, -conv2d_bias_leaky_relu_sm758, -}; - -std::map, int> map_problem_conv2d_bias_leaky_relu_sm75; -std::mutex conv2d_bias_leaky_relu_sm75_mutex; - -void conv2d_bias_leaky_relu_sm75(const ConvAllParams& params) { - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - //int pad_h0 = params.pad_h0; - //int pad_w0 = params.pad_w0; - int groups = params.groups; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - - std::vector problem_size = { - batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; - - if (map_problem_conv2d_bias_leaky_relu_sm75.count(problem_size)) { - conv2d_bias_leaky_relu_sm75_all_func[map_problem_conv2d_bias_leaky_relu_sm75.at(problem_size)]( - params); - return; - } - - int best_config_index = ProfileToGetBestConfig( - conv2d_bias_leaky_relu_sm75_all_func, params, CONV2D_BIAS_LEAKY_RELU); - - std::lock_guard guard(conv2d_bias_leaky_relu_sm75_mutex); - - map_problem_conv2d_bias_leaky_relu_sm75[problem_size] = best_config_index; - conv2d_bias_leaky_relu_sm75_all_func[best_config_index](params); -} - -cutlass::Status 
conv2d_bias_sigmoid_sm750(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sigmoid_sm751(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input 
= params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sigmoid_sm752(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - 
split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sigmoid_sm753(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - 
CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sigmoid_sm754(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sigmoid_sm755(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - 
cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sigmoid_sm756(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / 
groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sigmoid_sm757(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status 
status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_sigmoid_sm758(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, float, float>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -std::vector> - conv2d_bias_sigmoid_sm75_all_func = {conv2d_bias_sigmoid_sm750, -conv2d_bias_sigmoid_sm751, -conv2d_bias_sigmoid_sm752, -conv2d_bias_sigmoid_sm753, -conv2d_bias_sigmoid_sm754, -conv2d_bias_sigmoid_sm755, -conv2d_bias_sigmoid_sm756, -conv2d_bias_sigmoid_sm757, -conv2d_bias_sigmoid_sm758, -}; - -std::map, int> map_problem_conv2d_bias_sigmoid_sm75; -std::mutex conv2d_bias_sigmoid_sm75_mutex; - -void conv2d_bias_sigmoid_sm75(const ConvAllParams& params) { - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - 
int oc = params.oc; - //int pad_h0 = params.pad_h0; - //int pad_w0 = params.pad_w0; - int groups = params.groups; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - - std::vector<int> problem_size = { - batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; - - if (map_problem_conv2d_bias_sigmoid_sm75.count(problem_size)) { - conv2d_bias_sigmoid_sm75_all_func[map_problem_conv2d_bias_sigmoid_sm75.at(problem_size)]( - params); - return; - } - - int best_config_index = ProfileToGetBestConfig( - conv2d_bias_sigmoid_sm75_all_func, params, CONV2D_BIAS_SIGMOID); - - std::lock_guard<std::mutex> guard(conv2d_bias_sigmoid_sm75_mutex); - - map_problem_conv2d_bias_sigmoid_sm75[problem_size] = best_config_index; - conv2d_bias_sigmoid_sm75_all_func[best_config_index](params); -} - -void Conv2dBias(const ConvAllParams& params) { - - if (params.sm_version == 75) - { - conv2d_bias_sm75(params); - } - -} - -void Conv2dBiasRelu(const ConvAllParams& params) { - - if (params.sm_version == 75) - { - conv2d_bias_relu_sm75(params); - } - -} - -void Conv2dBiasSilu(const ConvAllParams& params) { - - if (params.sm_version == 75) - { - conv2d_bias_silu_sm75(params); - } - -} - -void Conv2dBiasLeakyRelu(const ConvAllParams& params) { - - if (params.sm_version == 75) - { - conv2d_bias_leaky_relu_sm75(params); - } - -} - -void Conv2dBiasSigmoid(const ConvAllParams& params) { - - if (params.sm_version == 75) - { - conv2d_bias_sigmoid_sm75(params); - } - -} - -} // namespace cutlass_internal -} // namespace fusion -} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_residual.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_residual.cu deleted file mode 100644 index 15729531e5dc8..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_bias_residual.cu +++ /dev/null @@ -1,2389 +0,0 @@ - -// Generated by conv2d_bias_residual.py - Do not edit.
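The conv2d_bias_* dispatchers above and the conv2d_bias_residual.cu kernels deleted in this hunk all wrap the same runtime-dispatch idea: each fused activation keeps a table of candidate CUTLASS kernels (one per threadblock/warp tile configuration), profiles every candidate the first time a given convolution shape is seen, and caches the index of the winner so later calls with that shape go straight to the fastest kernel. The C++ sketch below illustrates only that caching pattern under stated assumptions; Params, KernelFunc, ProfileCandidates, g_candidates, and Dispatch are illustrative placeholders, not the actual Paddle or CUTLASS symbols, and the stub profiler simply returns the first candidate instead of timing anything.

// Minimal sketch of the "profile once per problem size, then reuse the cached
// winner" dispatch used by the generated conv2d_bias_* files.
// All identifiers here are placeholders for illustration only.
#include <functional>
#include <map>
#include <mutex>
#include <vector>

struct Params {  // stand-in for the real convolution parameter struct
  int batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w;
};

using KernelFunc = std::function<void(const Params&)>;

std::vector<KernelFunc> g_candidates;          // one entry per tile configuration
std::map<std::vector<int>, int> g_best_index;  // problem signature -> winning index
std::mutex g_mutex;                            // guards first-time profiling and insertion

// Stand-in for a real profiler that would time every candidate on `p` and
// return the index of the fastest one; here it just picks the first.
int ProfileCandidates(const std::vector<KernelFunc>& funcs, const Params& p) {
  (void)funcs;
  (void)p;
  return 0;
}

void Dispatch(const Params& p) {
  // The cache key mirrors the fields the generated dispatchers hash on.
  std::vector<int> key = {p.batch, p.ic, p.ih, p.iw,       p.kh,
                          p.kw,    p.oc, p.groups, p.stride_h, p.stride_w};

  {
    std::lock_guard<std::mutex> guard(g_mutex);
    auto it = g_best_index.find(key);
    if (it != g_best_index.end()) {
      g_candidates[it->second](p);  // fast path: this shape was already profiled
      return;
    }
  }

  // First time this shape is seen: profile, cache the result, then run the winner.
  int best = ProfileCandidates(g_candidates, p);
  std::lock_guard<std::mutex> guard(g_mutex);
  g_best_index[key] = best;
  g_candidates[best](p);
}

int main() {
  // Register one dummy candidate so Dispatch has something to run.
  g_candidates.push_back([](const Params&) { /* a real kernel launch goes here */ });
  Params p{1, 64, 56, 56, 3, 3, 64, 1, 1, 1};
  Dispatch(p);  // cold path: profiles (stubbed) and caches the winner
  Dispatch(p);  // warm path: hits the cache
  return 0;
}

One deliberate difference from the generated dispatchers is worth noting: the sketch takes the lock on the lookup path as well, whereas the generated code reads the map unlocked and locks only around profiling and insertion.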
- -#include -#include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h" -#include "cutlass/epilogue/thread/linear_combination_residual_block.h" -#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" - -namespace phi { -namespace fusion { -namespace cutlass_internal { - -cutlass::Status conv2d_bias_silu_add_sm750(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm751(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - 
cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm752(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - 
cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm753(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - 
int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm754(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw 
* ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm755(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - 
phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm756(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm757(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - 
cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm758(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - 
cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm759(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = 
params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm7510(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 64, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, 
- {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_silu_add_sm7511(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::SiLu, cutlass::plus, cutlass::epilogue::thread::Identity>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - 
ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -std::vector> - conv2d_bias_silu_add_sm75_all_func = {conv2d_bias_silu_add_sm750, -conv2d_bias_silu_add_sm751, -conv2d_bias_silu_add_sm752, -conv2d_bias_silu_add_sm753, -conv2d_bias_silu_add_sm754, -conv2d_bias_silu_add_sm755, -conv2d_bias_silu_add_sm756, -conv2d_bias_silu_add_sm757, -conv2d_bias_silu_add_sm758, -conv2d_bias_silu_add_sm759, -conv2d_bias_silu_add_sm7510, -conv2d_bias_silu_add_sm7511, -}; - -std::map, int> map_problem_conv2d_bias_silu_add_sm75; -std::mutex conv2d_bias_silu_add_sm75_mutex; - -void conv2d_bias_silu_add_sm75(const ConvAllParams& params) { - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - //int pad_h0 = params.pad_h0; - //int pad_w0 = params.pad_w0; - int groups = params.groups; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - - std::vector problem_size = { - batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; - - if (map_problem_conv2d_bias_silu_add_sm75.count(problem_size)) { - conv2d_bias_silu_add_sm75_all_func[map_problem_conv2d_bias_silu_add_sm75.at(problem_size)]( - params); - return; - } - - int best_config_index = ProfileToGetBestConfig( - conv2d_bias_silu_add_sm75_all_func, params, CONV2D_BIAS_SILU_ADD); - - std::lock_guard guard(conv2d_bias_silu_add_sm75_mutex); - - map_problem_conv2d_bias_silu_add_sm75[problem_size] = best_config_index; - conv2d_bias_silu_add_sm75_all_func[best_config_index](params); -} - -cutlass::Status conv2d_bias_add_relu_sm750(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize 
problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm751(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, 
- cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm752(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, 
workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm753(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm754(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - 
cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm755(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - 
const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm756(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize 
problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm757(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, 
- cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm758(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, 
workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm759(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm7510(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 64, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - 
cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_bias_add_relu_sm7511(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultConv2dFpropWithBroadcast< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - float, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16,8,8>, - cutlass::epilogue::thread::LinearCombinationResidualBlock< cutlass::half_t, float, float, cutlass::half_t, 8, cutlass::epilogue::thread::Identity, cutlass::plus, cutlass::epilogue::thread::ReLu>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kOptimized, - cutlass::conv::StrideSupport::kStrided, - 8, - 8 - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - const half *input = params.input; - const half *weight = params.weight; 
- const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = 1; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - - const half *residual = params.residual; - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)residual, {oc, oc * ow, oc * ow * oh}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - cutlass::conv::SplitKMode::kSerial, - (cutlass::half_t *)(bias), nullptr, - 0, oc}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -std::vector> - conv2d_bias_add_relu_sm75_all_func = {conv2d_bias_add_relu_sm750, -conv2d_bias_add_relu_sm751, -conv2d_bias_add_relu_sm752, -conv2d_bias_add_relu_sm753, -conv2d_bias_add_relu_sm754, -conv2d_bias_add_relu_sm755, -conv2d_bias_add_relu_sm756, -conv2d_bias_add_relu_sm757, -conv2d_bias_add_relu_sm758, -conv2d_bias_add_relu_sm759, -conv2d_bias_add_relu_sm7510, -conv2d_bias_add_relu_sm7511, -}; - -std::map, int> map_problem_conv2d_bias_add_relu_sm75; -std::mutex conv2d_bias_add_relu_sm75_mutex; - -void conv2d_bias_add_relu_sm75(const ConvAllParams& params) { - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - //int pad_h0 = params.pad_h0; - //int pad_w0 = params.pad_w0; - int groups = params.groups; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - - std::vector problem_size = { - batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; - - if (map_problem_conv2d_bias_add_relu_sm75.count(problem_size)) { - conv2d_bias_add_relu_sm75_all_func[map_problem_conv2d_bias_add_relu_sm75.at(problem_size)]( - params); - return; - } - - int best_config_index = ProfileToGetBestConfig( - conv2d_bias_add_relu_sm75_all_func, params, CONV2D_BIAS_ADD_RELU); - - std::lock_guard guard(conv2d_bias_add_relu_sm75_mutex); - - map_problem_conv2d_bias_add_relu_sm75[problem_size] = best_config_index; - conv2d_bias_add_relu_sm75_all_func[best_config_index](params); -} - -void Conv2dBiasSiluAdd(const ConvAllParams& params) { - - if (params.sm_version == 75) - { - conv2d_bias_silu_add_sm75(params); - } - -} - -void 
Conv2dBiasAddRelu(const ConvAllParams& params) {
-
-  if (params.sm_version == 75)
-  {
-    conv2d_bias_add_relu_sm75(params);
-  }
-
-}
-
-}  // namespace cutlass_internal
-}  // namespace fusion
-}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_depthwise_bias_act.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_depthwise_bias_act.cu
deleted file mode 100644
index c6a6dd6a9e129..0000000000000
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp/conv2d_depthwise_bias_act.cu
+++ /dev/null
@@ -1,3502 +0,0 @@
-
-// Generated by conv2d_depthwise_bias_act.py - Do not edit.
-
-#include
-#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
-#include
-#include
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/device/gemm.h"
-#include "cutlass/conv/kernel/default_depthwise_fprop.h"
-#include "cutlass/epilogue/thread/linear_combination_silu.h"
-#include "cutlass/conv/device/direct_convolution.h"
-
-#include "cutlass/conv/device/implicit_gemm_convolution.h"
-#include "cutlass/conv/kernel/default_conv2d_fprop.h"
-namespace phi {
-namespace fusion {
-namespace cutlass_internal {
-
-cutlass::Status conv2d_depthwise_bias_0(const ConvAllParams& params) {
-  using kernel_base =
-      typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop<
-    cutlass::half_t,
-    cutlass::layout::TensorNHWC,
-    cutlass::half_t,
-    cutlass::layout::TensorNHWC,
-    cutlass::half_t,
-    cutlass::layout::TensorNHWC,
-    cutlass::half_t,
-    cutlass::arch::OpClassSimt,
-    cutlass::arch::Sm70,
-    cutlass::gemm::GemmShape<64,16,9>,
-    cutlass::conv::TensorNHWCShape<1,8,8,16>,
-    cutlass::MatrixShape<3,3>,
-
-    cutlass::gemm::GemmShape<16,16,9>,
-    cutlass::gemm::GemmShape<1,1,1>,
-    cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>,
-    cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>,
-    2,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kFixedStrideDilation,
-    cutlass::conv::StrideSupport::kStrided,
-    cutlass::MatrixShape<1,1>,
-    cutlass::MatrixShape<1, 1>
-    >::Kernel;
-
-  using ImplicitGemm =
-      cutlass::conv::device::DirectConvolution;
-  const half *input = params.input;
-  const half *weight = params.weight;
-  const half *bias = params.bias;
-  half *output = params.output;
-  int batch = params.batch;
-  int ic = params.ic;
-  int ih = params.ih;
-  int iw = params.iw;
-  int kh = params.kh;
-  int kw = params.kw;
-  int oc = params.oc;
-  int pad_h0 = params.pad_h0;
-  int pad_w0 = params.pad_w0;
-  int stride_h = params.stride_h;
-  int stride_w = params.stride_w;
-  int groups = params.groups;
-  int kc = ic / groups;
-
-  int oh = params.oh;
-  int ow = params.ow;
-  int dilation_h = params.dilation_h;
-  int dilation_w = params.dilation_w;
-  int split_k_slices = (oh * ow + 63) / 64;
-
-  cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic},
-                                                {oc, kh, kw, ic / groups},
-                                                {pad_h0, 0, pad_w0, 0},
-                                                {stride_h, stride_w},
-                                                {dilation_h, dilation_w},
-                                                {batch, oh, ow, oc},
-                                                cutlass::conv::Mode::kCrossCorrelation,
-                                                split_k_slices,
-                                                groups);
-
-size_t filter_size = oc * kh * kw * kc * sizeof(half);
-phi::Allocator::AllocationPtr filter_gpu_ptrs_data =
-    phi::memory_utils::Alloc(
-        params.ctx->GetPlace(),
-        filter_size,
-        phi::Stream(reinterpret_cast(params.ctx->stream())));
-void *filter_workspace = filter_gpu_ptrs_data->ptr();
-
-  typename ImplicitGemm::Arguments arguments{
-      problem_size,
-      {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}},
-      {(cutlass::half_t
*)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_1(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,9>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,32,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = 
implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_2(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,9>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,16,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status 
status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_3(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,9>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,32,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_4(const ConvAllParams& params) { - 
using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,25>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,16,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_5(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - 
cutlass::gemm::GemmShape<64,32,25>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,32,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_6(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,25>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,16,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, - 
cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_7(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,25>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,32,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombination< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = 
- cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -std::vector> - Conv2dDepthwiseBias_all_func = {conv2d_depthwise_bias_0, -conv2d_depthwise_bias_1, -conv2d_depthwise_bias_2, -conv2d_depthwise_bias_3, -conv2d_depthwise_bias_4, -conv2d_depthwise_bias_5, -conv2d_depthwise_bias_6, -conv2d_depthwise_bias_7, -}; - -std::map, int> map_problem_Conv2dDepthwiseBias; -std::mutex Conv2dDepthwiseBias_mutex; - -void Conv2dDepthwiseBias(const ConvAllParams& params) { - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - //int pad_h0 = params.pad_h0; - //int pad_w0 = params.pad_w0; - int groups = params.groups; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - - std::vector problem_size = { - batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w}; - - if (map_problem_Conv2dDepthwiseBias.count(problem_size)) { - Conv2dDepthwiseBias_all_func[map_problem_Conv2dDepthwiseBias.at(problem_size)]( - params); - return; - } - - int best_config_index = ProfileToGetBestConfig( - Conv2dDepthwiseBias_all_func, params, CONV2D_DEPTHWISE_BIAS); - - std::lock_guard guard(Conv2dDepthwiseBias_mutex); - - map_problem_Conv2dDepthwiseBias[problem_size] = best_config_index; - Conv2dDepthwiseBias_all_func[best_config_index](params); -} - 
-cutlass::Status conv2d_depthwise_bias_relu_0(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,9>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,16,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_relu_1(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - 
cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,9>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,32,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_relu_2(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,9>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,16,9>, - cutlass::gemm::GemmShape<1,1,1>, - 
cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_relu_3(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,9>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,32,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - 
cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_relu_4(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,25>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,16,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = 
params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_relu_5(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,25>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,32,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int 
groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_relu_6(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,25>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,16,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / 
groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_relu_7(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,25>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,32,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationRelu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - 
phi::memory_utils::Alloc(
-          params.ctx->GetPlace(),
-          filter_size,
-          phi::Stream(reinterpret_cast<phi::StreamId>(params.ctx->stream())));
-  void *filter_workspace = filter_gpu_ptrs_data->ptr();
-
-  typename ImplicitGemm::Arguments arguments{
-      problem_size,
-      {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}},
-      {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}},
-      {(cutlass::half_t *)bias, {0, 0, 0}},
-      {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}},
-      {1.f, 1.f},
-      {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}},
-  };
-
-  ImplicitGemm implicit_gemm_op;
-  size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
-
-  auto ctx = params.ctx;
-  auto stream = ctx->stream();
-  phi::Allocator::AllocationPtr tmp_gpu_ptrs_data =
-      phi::memory_utils::Alloc(
-          ctx->GetPlace(),
-          bytes,
-          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
-  void *workspace = tmp_gpu_ptrs_data->ptr();
-
-  cutlass::Status status = implicit_gemm_op.can_implement(arguments);
-  CUTLASS_CHECK(status);
-  status = implicit_gemm_op.initialize(arguments, workspace);
-  CUTLASS_CHECK(status);
-  status = implicit_gemm_op(stream);
-  CUTLASS_CHECK(status);
-  return status;
-}
-
-std::vector<std::function<cutlass::Status(ConvAllParams)>>
-    Conv2dDepthwiseBiasRelu_all_func = {conv2d_depthwise_bias_relu_0,
-conv2d_depthwise_bias_relu_1,
-conv2d_depthwise_bias_relu_2,
-conv2d_depthwise_bias_relu_3,
-conv2d_depthwise_bias_relu_4,
-conv2d_depthwise_bias_relu_5,
-conv2d_depthwise_bias_relu_6,
-conv2d_depthwise_bias_relu_7,
-};
-
-std::map<std::vector<int>, int> map_problem_Conv2dDepthwiseBiasRelu;
-std::mutex Conv2dDepthwiseBiasRelu_mutex;
-
-void Conv2dDepthwiseBiasRelu(const ConvAllParams& params) {
-  int batch = params.batch;
-  int ic = params.ic;
-  int ih = params.ih;
-  int iw = params.iw;
-  int kh = params.kh;
-  int kw = params.kw;
-  int oc = params.oc;
-  //int pad_h0 = params.pad_h0;
-  //int pad_w0 = params.pad_w0;
-  int groups = params.groups;
-  int stride_h = params.stride_h;
-  int stride_w = params.stride_w;
-
-  std::vector<int> problem_size = {
-      batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w};
-
-  if (map_problem_Conv2dDepthwiseBiasRelu.count(problem_size)) {
-    Conv2dDepthwiseBiasRelu_all_func[map_problem_Conv2dDepthwiseBiasRelu.at(problem_size)](
-        params);
-    return;
-  }
-
-  int best_config_index = ProfileToGetBestConfig(
-      Conv2dDepthwiseBiasRelu_all_func, params, CONV2D_DEPTHWISE_BIAS_RELU);
-
-  std::lock_guard<std::mutex> guard(Conv2dDepthwiseBiasRelu_mutex);
-
-  map_problem_Conv2dDepthwiseBiasRelu[problem_size] = best_config_index;
-  Conv2dDepthwiseBiasRelu_all_func[best_config_index](params);
-}
-
-cutlass::Status conv2d_depthwise_bias_sigmoid_0(const ConvAllParams& params) {
-  using kernel_base =
-      typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop<
-          cutlass::half_t,
-          cutlass::layout::TensorNHWC,
-          cutlass::half_t,
-          cutlass::layout::TensorNHWC,
-          cutlass::half_t,
-          cutlass::layout::TensorNHWC,
-          cutlass::half_t,
-          cutlass::arch::OpClassSimt,
-          cutlass::arch::Sm70,
-          cutlass::gemm::GemmShape<64,16,9>,
-          cutlass::conv::TensorNHWCShape<1,8,8,16>,
-          cutlass::MatrixShape<3,3>,
-
-          cutlass::gemm::GemmShape<16,16,9>,
-          cutlass::gemm::GemmShape<1,1,1>,
-          cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>,
-          cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>,
-          2,
-          cutlass::arch::OpMultiplyAdd,
-          cutlass::conv::IteratorAlgorithm::kFixedStrideDilation,
-          cutlass::conv::StrideSupport::kStrided,
-          cutlass::MatrixShape<1,1>,
-          cutlass::MatrixShape<1, 1>
-      >::Kernel;
-
-  using
ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_sigmoid_1(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,9>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,32,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = 
params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_sigmoid_2(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,9>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,16,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = 
params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_sigmoid_3(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,9>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,32,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - 
cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_sigmoid_4(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,25>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,16,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - 
phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_sigmoid_5(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,25>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,32,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - 
{(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_sigmoid_6(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,25>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,16,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = 
params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_sigmoid_7(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,25>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,32,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSigmoid< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = 
implicit_gemm_op.can_implement(arguments);
-  CUTLASS_CHECK(status);
-  status = implicit_gemm_op.initialize(arguments, workspace);
-  CUTLASS_CHECK(status);
-  status = implicit_gemm_op(stream);
-  CUTLASS_CHECK(status);
-  return status;
-}
-
-std::vector<std::function<cutlass::Status(ConvAllParams)>>
-    Conv2dDepthwiseBiasSigmoid_all_func = {conv2d_depthwise_bias_sigmoid_0,
-conv2d_depthwise_bias_sigmoid_1,
-conv2d_depthwise_bias_sigmoid_2,
-conv2d_depthwise_bias_sigmoid_3,
-conv2d_depthwise_bias_sigmoid_4,
-conv2d_depthwise_bias_sigmoid_5,
-conv2d_depthwise_bias_sigmoid_6,
-conv2d_depthwise_bias_sigmoid_7,
-};
-
-std::map<std::vector<int>, int> map_problem_Conv2dDepthwiseBiasSigmoid;
-std::mutex Conv2dDepthwiseBiasSigmoid_mutex;
-
-void Conv2dDepthwiseBiasSigmoid(const ConvAllParams& params) {
-  int batch = params.batch;
-  int ic = params.ic;
-  int ih = params.ih;
-  int iw = params.iw;
-  int kh = params.kh;
-  int kw = params.kw;
-  int oc = params.oc;
-  //int pad_h0 = params.pad_h0;
-  //int pad_w0 = params.pad_w0;
-  int groups = params.groups;
-  int stride_h = params.stride_h;
-  int stride_w = params.stride_w;
-
-  std::vector<int> problem_size = {
-      batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w};
-
-  if (map_problem_Conv2dDepthwiseBiasSigmoid.count(problem_size)) {
-    Conv2dDepthwiseBiasSigmoid_all_func[map_problem_Conv2dDepthwiseBiasSigmoid.at(problem_size)](
-        params);
-    return;
-  }
-
-  int best_config_index = ProfileToGetBestConfig(
-      Conv2dDepthwiseBiasSigmoid_all_func, params, CONV2D_DEPTHWISE_BIAS_SIGMOID);
-
-  std::lock_guard<std::mutex> guard(Conv2dDepthwiseBiasSigmoid_mutex);
-
-  map_problem_Conv2dDepthwiseBiasSigmoid[problem_size] = best_config_index;
-  Conv2dDepthwiseBiasSigmoid_all_func[best_config_index](params);
-}
-
-cutlass::Status conv2d_depthwise_bias_silu_0(const ConvAllParams& params) {
-  using kernel_base =
-      typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop<
-          cutlass::half_t,
-          cutlass::layout::TensorNHWC,
-          cutlass::half_t,
-          cutlass::layout::TensorNHWC,
-          cutlass::half_t,
-          cutlass::layout::TensorNHWC,
-          cutlass::half_t,
-          cutlass::arch::OpClassSimt,
-          cutlass::arch::Sm70,
-          cutlass::gemm::GemmShape<64,16,9>,
-          cutlass::conv::TensorNHWCShape<1,8,8,16>,
-          cutlass::MatrixShape<3,3>,
-
-          cutlass::gemm::GemmShape<16,16,9>,
-          cutlass::gemm::GemmShape<1,1,1>,
-          cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>,
-          cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>,
-          2,
-          cutlass::arch::OpMultiplyAdd,
-          cutlass::conv::IteratorAlgorithm::kFixedStrideDilation,
-          cutlass::conv::StrideSupport::kStrided,
-          cutlass::MatrixShape<1,1>,
-          cutlass::MatrixShape<1, 1>
-      >::Kernel;
-
-  using ImplicitGemm =
-      cutlass::conv::device::DirectConvolution<kernel_base>;
-  const half *input = params.input;
-  const half *weight = params.weight;
-  const half *bias = params.bias;
-  half *output = params.output;
-  int batch = params.batch;
-  int ic = params.ic;
-  int ih = params.ih;
-  int iw = params.iw;
-  int kh = params.kh;
-  int kw = params.kw;
-  int oc = params.oc;
-  int pad_h0 = params.pad_h0;
-  int pad_w0 = params.pad_w0;
-  int stride_h = params.stride_h;
-  int stride_w = params.stride_w;
-  int groups = params.groups;
-  int kc = ic / groups;
-
-  int oh = params.oh;
-  int ow = params.ow;
-  int dilation_h = params.dilation_h;
-  int dilation_w = params.dilation_w;
-  int split_k_slices = (oh * ow + 63) / 64;
-
-  cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic},
-                                                {oc, kh, kw, ic / groups},
-                                                {pad_h0, 0, pad_w0, 0},
-                                                {stride_h, stride_w},
-                                                {dilation_h, dilation_w},
-
{batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_silu_1(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,9>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,32,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - 
phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_silu_2(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,9>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,16,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t 
*)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_silu_3(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,9>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<3,3>, - - cutlass::gemm::GemmShape<16,32,9>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = 
ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_silu_4(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,25>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,16,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - 
status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_silu_5(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,32,25>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,32,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<1,1>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_silu_6(const ConvAllParams& params) { - using kernel_base = - typename 
cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - cutlass::gemm::GemmShape<64,16,25>, - cutlass::conv::TensorNHWCShape<1,8,8,16>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,16,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -cutlass::Status conv2d_depthwise_bias_silu_7(const ConvAllParams& params) { - using kernel_base = - typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConvFprop< - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::layout::TensorNHWC, - cutlass::half_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm70, - 
cutlass::gemm::GemmShape<64,32,25>, - cutlass::conv::TensorNHWCShape<1,8,8,32>, - cutlass::MatrixShape<5,5>, - - cutlass::gemm::GemmShape<16,32,25>, - cutlass::gemm::GemmShape<1,1,1>, - cutlass::epilogue::thread::LinearCombinationSilu< cutlass::half_t, 8, cutlass::half_t, float>, - cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<1,1,8,8>, - 2, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::IteratorAlgorithm::kFixedStrideDilation, - cutlass::conv::StrideSupport::kStrided, - cutlass::MatrixShape<2,2>, - cutlass::MatrixShape<1, 1> - >::Kernel; - - using ImplicitGemm = - cutlass::conv::device::DirectConvolution; - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w0; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - int groups = params.groups; - int kc = ic / groups; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - int split_k_slices = (oh * ow + 63) / 64; - - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic / groups}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - cutlass::conv::Mode::kCrossCorrelation, - split_k_slices, - groups); - -size_t filter_size = oc * kh * kw * kc * sizeof(half); -phi::Allocator::AllocationPtr filter_gpu_ptrs_data = - phi::memory_utils::Alloc( - params.ctx->GetPlace(), - filter_size, - phi::Stream(reinterpret_cast(params.ctx->stream()))); -void *filter_workspace = filter_gpu_ptrs_data->ptr(); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)input, {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)weight, {kc, kc * kw, kc * kw * kh}}, - {(cutlass::half_t *)bias, {0, 0, 0}}, - {(cutlass::half_t *)output, {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}, - {(cutlass::half_t *)filter_workspace, {kc, kc * kw, kc * kw * kh}}, - }; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = - phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -std::vector> - Conv2dDepthwiseBiasSilu_all_func = {conv2d_depthwise_bias_silu_0, -conv2d_depthwise_bias_silu_1, -conv2d_depthwise_bias_silu_2, -conv2d_depthwise_bias_silu_3, -conv2d_depthwise_bias_silu_4, -conv2d_depthwise_bias_silu_5, -conv2d_depthwise_bias_silu_6, -conv2d_depthwise_bias_silu_7, -}; - -std::map, int> map_problem_Conv2dDepthwiseBiasSilu; -std::mutex Conv2dDepthwiseBiasSilu_mutex; - -void Conv2dDepthwiseBiasSilu(const ConvAllParams& params) { - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - //int pad_h0 = params.pad_h0; - //int pad_w0 = params.pad_w0; - int groups = 
params.groups;
-  int stride_h = params.stride_h;
-  int stride_w = params.stride_w;
-
-  std::vector<int> problem_size = {
-      batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w};
-
-  if (map_problem_Conv2dDepthwiseBiasSilu.count(problem_size)) {
-    Conv2dDepthwiseBiasSilu_all_func[map_problem_Conv2dDepthwiseBiasSilu.at(problem_size)](
-        params);
-    return;
-  }
-
-  int best_config_index = ProfileToGetBestConfig(
-      Conv2dDepthwiseBiasSilu_all_func, params, CONV2D_DEPTHWISE_BIAS_SILU);
-
-  std::lock_guard<std::mutex> guard(Conv2dDepthwiseBiasSilu_mutex);
-
-  map_problem_Conv2dDepthwiseBiasSilu[problem_size] = best_config_index;
-  Conv2dDepthwiseBiasSilu_all_func[best_config_index](params);
-}
-
-}  // namespace cutlass_internal
-}  // namespace fusion
-}  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/arch_define.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/arch_define.h
deleted file mode 100644
index 95537512437e7..0000000000000
--- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/arch_define.h
+++ /dev/null
@@ -1,4 +0,0 @@
-
-// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit.
-
-#define USE_FPAINTB_GEMM_WITH_SM80
diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_bias.cu
deleted file mode 100644
index de7e712e91f73..0000000000000
--- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_bias.cu
+++ /dev/null
@@ -1,439 +0,0 @@
-
-// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit.
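The Conv2dDepthwiseBiasSilu dispatcher shown just above follows a simple autotuning pattern: the shape-defining integers of the convolution form a key, the first time a key is seen every candidate kernel is profiled and the index of the fastest one is cached, and later calls with the same key reuse that index directly. The following is a minimal, self-contained sketch of that pattern only; Params, KernelFn, ProfileAll and Dispatch are hypothetical stand-ins rather than the actual Paddle symbols, and the locking is simplified compared with the code above.

#include <chrono>
#include <cstddef>
#include <functional>
#include <limits>
#include <map>
#include <mutex>
#include <vector>

struct Params {  // hypothetical stand-in for ConvAllParams
  int batch, ic, ih, iw, kh, kw, oc, groups, stride_h, stride_w;
};
using KernelFn = std::function<void(const Params&)>;

// Time every candidate once and return the index of the fastest one
// (the role played by ProfileToGetBestConfig in the real code).
inline int ProfileAll(const std::vector<KernelFn>& kernels, const Params& p) {
  int best = 0;
  double best_ms = std::numeric_limits<double>::max();
  for (std::size_t i = 0; i < kernels.size(); ++i) {
    auto t0 = std::chrono::steady_clock::now();
    kernels[i](p);
    auto t1 = std::chrono::steady_clock::now();
    double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
    if (ms < best_ms) {
      best_ms = ms;
      best = static_cast<int>(i);
    }
  }
  return best;
}

inline void Dispatch(const std::vector<KernelFn>& kernels, const Params& p) {
  static std::map<std::vector<int>, int> best_index_cache;
  static std::mutex cache_mutex;

  const std::vector<int> key = {p.batch, p.ic, p.ih, p.iw, p.kh,
                                p.kw, p.oc, p.groups, p.stride_h, p.stride_w};

  {
    std::lock_guard<std::mutex> guard(cache_mutex);
    auto it = best_index_cache.find(key);
    if (it != best_index_cache.end()) {
      kernels[it->second](p);  // reuse the previously profiled winner
      return;
    }
  }

  int best = ProfileAll(kernels, p);  // first time this problem size is seen

  std::lock_guard<std::mutex> guard(cache_mutex);
  best_index_cache[key] = best;
  kernels[best](p);
}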
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - 
cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* 
weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_noBias.cu deleted file mode 100644 index 9e094fbb407d1..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages2_noBias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
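Every autogenerated launcher file removed here has the same shape: one fixed combination of activation type, weight type, architecture, epilogue, tile shapes and pipeline stage count is pinned down as an explicit specialization of generic_mixed_gemm_kernelLauncher_template, whose body does nothing but forward to the generic launcher. Spreading the instantiations over many small translation units keeps per-file compile time and memory bounded and lets the build skip configurations it does not need. The sketch below illustrates the pattern with simplified, hypothetical names and template parameters; it is not the real Paddle signature.

#include <cstdio>

// Heavyweight generic launcher: in the real code this is where the CUTLASS
// kernel type is assembled and run for one concrete configuration.
template <typename T, int TileM, int Stages>
void generic_launcher(const T* a, const T* b, T* c, int n) {
  std::printf("launching TileM=%d Stages=%d, n=%d\n", TileM, Stages, n);
  (void)a;
  (void)b;
  (void)c;
}

// Thin dispatch template, declared once in a shared header and defined only
// through explicit specializations in the generated .cu files.
template <typename T, int TileM, int Stages>
void launcher_template(const T* a, const T* b, T* c, int n);

// A generated file contains nothing but specializations like this one,
// each forwarding a single fixed configuration to the generic launcher.
template <>
void launcher_template<float, 128, 3>(const float* a, const float* b,
                                      float* c, int n) {
  generic_launcher<float, 128, 3>(a, b, c, n);
}

int main() {
  launcher_template<float, 128, 3>(nullptr, nullptr, nullptr, 0);
  return 0;
}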
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - 
EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const 
cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_bias.cu deleted file mode 100644 index 3dda2e10c076d..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_bias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - 
cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* 
weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_noBias.cu deleted file mode 100644 index ef063b6b55bfb..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages3_noBias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
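The depthwise-conv launchers deleted earlier in this hunk all run the same CUTLASS sequence: construct the Arguments, call can_implement, allocate get_workspace_size bytes of workspace, initialize, then invoke the operator on the stream, checking the returned cutlass::Status at every step. A condensed sketch of that sequence follows; the device operator and its Arguments come from CUTLASS, while the plain cudaMalloc workspace is a simplification of the phi::memory_utils::Alloc call used in the real code, and RunCutlassOp itself is a hypothetical helper, not an existing Paddle function.

#include <cuda_runtime.h>

#include "cutlass/cutlass.h"

// Sketch of the launch sequence repeated by the deleted conv launchers:
// can_implement -> get_workspace_size -> initialize -> operator()(stream).
template <typename DeviceOp>
cutlass::Status RunCutlassOp(const typename DeviceOp::Arguments& args,
                             cudaStream_t stream) {
  DeviceOp op;

  cutlass::Status status = op.can_implement(args);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  void* workspace = nullptr;
  size_t bytes = op.get_workspace_size(args);
  if (bytes > 0 && cudaMalloc(&workspace, bytes) != cudaSuccess) {
    return cutlass::Status::kErrorInternal;
  }

  status = op.initialize(args, workspace);
  if (status == cutlass::Status::kSuccess) {
    status = op(stream);  // enqueue the kernel on the given stream
  }

  // Note: cudaFree synchronizes the device, which is acceptable for a sketch
  // but not something the real, fully asynchronous launchers do.
  if (workspace != nullptr) {
    cudaFree(workspace);
  }
  return status;
}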
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - 
EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const 
cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_bias.cu deleted file mode 100644 index 8e83a67d39ac3..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_bias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - 
cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* 
weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_noBias.cu deleted file mode 100644 index 037a9df710d29..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages4_noBias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - 
EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const 
cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_bias.cu deleted file mode 100644 index 1707f4580d253..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_bias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - 
cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* 
weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_noBias.cu deleted file mode 100644 index 6190532c83f51..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_bf16_sm80_stages5_noBias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - 
EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const uint8_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - uint8_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<16, 128, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<32, 128, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const 
cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const __nv_bfloat16* A, - const cutlass::uint4b_t* B, - const __nv_bfloat16* weight_scales, - const __nv_bfloat16* biases, - __nv_bfloat16* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<__nv_bfloat16, - cutlass::uint4b_t, - cutlass::arch::Sm80, - EpilogueOpNoBias, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_bias.cu deleted file mode 100644 index b9961a387e02d..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_bias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); 
-} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_noBias.cu deleted file mode 100644 index 65c5476ee32ee..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages2_noBias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); 
-} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 2>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_bias.cu deleted file mode 100644 index 8412cb4a9a29a..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_bias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); 
-} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_noBias.cu deleted file mode 100644 index ffda7f835c359..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages3_noBias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); 
-} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 3>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_bias.cu deleted file mode 100644 index f5c6440565500..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_bias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); 
-} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_noBias.cu deleted file mode 100644 index bed2b479bf58e..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages4_noBias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); 
-} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 4>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h deleted file mode 100644 index 15c5267ae0f9d..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/ft_gemm_configs.h" -// #include "src/fastertransformer/utils/allocator.h" -#include "cuda_runtime_api.h" // NOLINT - -namespace phi { - -/* - This runner only supports: - T in {half, __nv_bfloat} WeightType in {uint8_t, cutlass::uint4b_t} - - Activations, biases, scales and outputs are all assumed to be row-major. - - However, it is assumed that B is in a special format governed by - cutlass_extensions/gemm/kernel/mixed_gemm_B_layout. In this case, B must be - preprocessed using the cutlass weight only quant preprocessors. The weight - preprocessor will instantiate the layout and preprocess based on the - instantiation, so layout changes should only require modifications to - mix_gemm_B_layout.h. -*/ - -template -class CutlassFpAIntBGemmRunner { - public: - CutlassFpAIntBGemmRunner(); - ~CutlassFpAIntBGemmRunner(); - - void gemm(const T* A, - const WeightType* B, - const T* weight_scales, - T* C, - int m, - int n, - int k, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream); - - void gemm_bias_act(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - std::string activation_type, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream); - - // Returns desired workspace size in bytes. - int getWorkspaceSize(const int m, const int n, const int k); - - private: - template - void dispatch_to_arch(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream, - int* occupancy = nullptr); - - template - void run_gemm(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream); - - private: - static constexpr int split_k_limit = 7; - - int sm_; - int multi_processor_count_; -}; - -// This allocation is present to help with compiling with other structures in -// FT. It will throw an error in all functions because this runner assumes the -// weight type and the activation type are different. We allow empty classes to -// be created, but any calls to gemm or gemm_bias_act will throw an error. 
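For orientation, a minimal caller-side sketch of the runner interface declared above. This is an illustration only, not code from this patch: the helper name run_fp16_int8_gemm, the device pointers, and the problem sizes are assumed for the example, and B is presumed to be already quantized and preprocessed into the interleaved layout described in the comment above.

#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"

// Hypothetical usage of CutlassFpAIntBGemmRunner<half, uint8_t>: query the
// worst-case split-k workspace size, allocate it, then launch the
// weight-only GEMM on the caller's stream.
void run_fp16_int8_gemm(const half* d_A, const uint8_t* d_B_preprocessed,
                        const half* d_scales, half* d_C,
                        int m, int n, int k, cudaStream_t stream) {
  phi::CutlassFpAIntBGemmRunner<half, uint8_t> runner;
  int ws_bytes = runner.getWorkspaceSize(m, n, k);  // split-k scratch estimate
  char* d_ws = nullptr;
  cudaMalloc(&d_ws, ws_bytes);
  // Internally the runner profiles its candidate tile configs and dispatches
  // the best one for this (m, n, k).
  runner.gemm(d_A, d_B_preprocessed, d_scales, d_C, m, n, k,
              d_ws, ws_bytes, stream);
  cudaFree(d_ws);
}

The same pattern applies to the uint4 and bfloat16 instantiations; only the template arguments of the runner change.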
-template -class CutlassFpAIntBGemmRunner { - public: - CutlassFpAIntBGemmRunner() = default; - ~CutlassFpAIntBGemmRunner() = default; - - void gemm(const float* A, - const WeightType* B, - const float* weight_scales, - float* C, - int m, - int n, - int k, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream); - - void gemm_bias_act(const float* A, - const WeightType* B, - const float* weight_scales, - const float* biases, - float* C, - int m, - int n, - int k, - std::string activation_type, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream); - - int getWorkspaceSize(const int m, const int n, const int k); -}; -} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu deleted file mode 100644 index 2f566d4dbc35e..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu +++ /dev/null @@ -1,681 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -#include "paddle/common/errors.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/arch_define.h" -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic pop - -namespace phi { - -template -void dispatch_gemm_config(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - switch (gemm_config.stages) { - case 2: - using DispatcherStages2 = dispatch_stages; - DispatcherStages2::dispatch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case 3: - using DispatcherStages3 = dispatch_stages; - DispatcherStages3::dispatch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case 4: - using DispatcherStages4 = dispatch_stages; - DispatcherStages4::dispatch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case 5: - using DispatcherStages5 = dispatch_stages; - DispatcherStages5::dispatch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - default: - std::string err_msg = "dispatch_gemm_config does not support stages " + - std::to_string(gemm_config.stages); - throw std::runtime_error("[dispatch_gemm_config] " + err_msg); - break; - } -} - -template -void dispatch_gemm_to_cutlass(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - char* workspace, - size_t workspace_bytes, - CutlassGemmConfig gemm_config, - cudaStream_t stream, - int* occupancy) { - // VLOG(3)<<__PRETTY_FUNCTION__; - // Note that SIMT configs are omitted here since they are not supported for - // fpA_intB. We also only instantiate configs here where threadblockShapeM == - // warpShapeM since those usually perform the best for mixed type gemms. 
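// Dispatch overview (summarizing the functions in this file, not a line of the
// original source): dispatch_to_arch picks the SM70/SM75/SM80 path, which calls
// dispatch_gemm_to_cutlass or dispatch_gemm_to_cutlass_sm7x to switch on
// tile_config, which calls dispatch_gemm_config to switch on pipeline stages,
// which finally reaches dispatch_stages and generic_mixed_gemm_kernelLauncher,
// where the CUTLASS kernel is instantiated for the chosen shapes.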
- switch (gemm_config.tile_config) { -#if defined(USE_FPAINTB_GEMM_WITH_SM80) || defined(USE_FPAINTB_GEMM_WITH_SM86) - case CutlassTileConfig::CtaShape16x128x64_WarpShape16x32x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<16, 32, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; -#endif - case CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<32, 32, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case CutlassTileConfig::CtaShape64x128x64_WarpShape64x64x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<64, 64, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; -#if defined(USE_FPAINTB_GEMM_WITH_SM80) || defined(USE_FPAINTB_GEMM_WITH_SM86) - case CutlassTileConfig::CtaShape128x128x64_WarpShape64x64x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<64, 64, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - // config for M_16000_N_12288_K_6144 in encoder - case CutlassTileConfig::CtaShape128x256x64_WarpShape64x64x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<64, 64, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; -#endif - case CutlassTileConfig::Undefined: - throw std::runtime_error( - "[fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined."); - break; - case CutlassTileConfig::ChooseWithHeuristic: - throw std::runtime_error( - "[fpA_intB][dispatch_gemm_to_cutlass] gemm config should have " - "already been set by heuristic."); - break; - default: - throw std::runtime_error( - "[fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed " - "type GEMM."); - break; - } -} - -template -void dispatch_gemm_to_cutlass_sm7x(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - char* workspace, - size_t workspace_bytes, - CutlassGemmConfig gemm_config, - cudaStream_t stream, - int* occupancy) { - // VLOG(3)<<__PRETTY_FUNCTION__; - // Note that SIMT configs are omitted here since they are not supported for - // fpA_intB. We also only instantiate configs here where threadblockShapeM == - // warpShapeM since those usually perform the best for mixed type gemms. 
- switch (gemm_config.tile_config) { - case CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<32, 32, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case CutlassTileConfig::CtaShape64x128x64_WarpShape64x64x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<64, 64, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case CutlassTileConfig::Undefined: - throw std::runtime_error( - "[fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined."); - break; - case CutlassTileConfig::ChooseWithHeuristic: - throw std::runtime_error( - "[fpA_intB][dispatch_gemm_to_cutlass] gemm config should have " - "already been set by heuristic."); - break; - default: - throw std::runtime_error( - "[fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed " - "type GEMM."); - break; - } -} - -template -CutlassFpAIntBGemmRunner::CutlassFpAIntBGemmRunner() { - // VLOG(3)<<__PRETTY_FUNCTION__; - int device{-1}; - check_cuda_error(cudaGetDevice(&device)); - sm_ = getSMVersion(); - check_cuda_error(cudaDeviceGetAttribute( - &multi_processor_count_, cudaDevAttrMultiProcessorCount, device)); -} - -template -CutlassFpAIntBGemmRunner::~CutlassFpAIntBGemmRunner() { - // VLOG(3)<<__PRETTY_FUNCTION__; -} - -template -template -void CutlassFpAIntBGemmRunner::dispatch_to_arch( - const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - // VLOG(3)<<__PRETTY_FUNCTION__; - if (sm_ >= 70 && sm_ < 75) { -#if defined(USE_FPAINTB_GEMM_WITH_SM70) - dispatch_gemm_to_cutlass_sm7x(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - gemm_config, - stream, - occupancy); -#else - throw std::runtime_error( - "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " - "CUTLASS mixed type GEMM"); -#endif - } else if (sm_ >= 75 && sm_ < 80) { -#if defined(USE_FPAINTB_GEMM_WITH_SM75) - dispatch_gemm_to_cutlass_sm7x(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - gemm_config, - stream, - occupancy); -#else - throw std::runtime_error( - "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " - "CUTLASS mixed type GEMM"); -#endif - } else if (sm_ >= 80 && sm_ < 90) { -#if defined(USE_FPAINTB_GEMM_WITH_SM80) - dispatch_gemm_to_cutlass( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - gemm_config, - stream, - occupancy); -#else - throw std::runtime_error( - "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " - "CUTLASS mixed type GEMM"); -#endif - } else { - throw std::runtime_error( - "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " - "CUTLASS mixed type GEMM"); - } -} - -template -template -void CutlassFpAIntBGemmRunner::run_gemm( - const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream) { - // VLOG(3)<<__PRETTY_FUNCTION__; - static constexpr bool is_weight_only = !std::is_same::value; - const bool is_weight_only_encoder = m >= 512 ? 
true : false; - std::vector candidate_configs = - get_candidate_configs(sm_, is_weight_only, is_weight_only_encoder, false); - std::vector occupancies(candidate_configs.size()); - - for (size_t ii = 0; ii < candidate_configs.size(); ++ii) { - dispatch_to_arch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - candidate_configs[ii], - workspace_ptr, - workspace_bytes, - stream, - &occupancies[ii]); - } - // Standard GEMM, so 1 "expert". We use the same function for MoE and regular - // FFN. - static constexpr int num_experts = 1; - CutlassGemmConfig chosen_config = - estimate_best_config_from_occupancies(candidate_configs, - occupancies, - m, - n, - k, - num_experts, - split_k_limit, - workspace_bytes, - multi_processor_count_, - is_weight_only); - - dispatch_to_arch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - chosen_config, - workspace_ptr, - workspace_bytes, - stream); -} - -template -void CutlassFpAIntBGemmRunner::gemm_bias_act( - const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - std::string activation_type, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream) { - if (activation_type == "gelu") { - PADDLE_THROW(phi::errors::Unimplemented( - "Activation_type = gelu for fpA_intB gemm is not instantiated.")); - } else if (activation_type == "relu") { - PADDLE_THROW(phi::errors::Unimplemented( - "Activation_type = relu for fpA_intB gemm is not instantiated.")); - } else if (activation_type == "none") { - run_gemm(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - stream); - } else { - throw std::runtime_error(("Invalid activation type.")); - } -} - -template -void CutlassFpAIntBGemmRunner::gemm(const T* A, - const WeightType* B, - const T* weight_scales, - T* C, - int m, - int n, - int k, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream) { - // VLOG(3)<<__PRETTY_FUNCTION__; - run_gemm(A, - B, - weight_scales, - nullptr, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - stream); -} - -template -int CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, - const int n, - const int k) { - // VLOG(3)<<__PRETTY_FUNCTION__; // These are the min tile sizes for each - // config, which would launch the maximum number of blocks - const int max_grid_m = (m + 31) / 32; - const int max_grid_n = (n + 127) / 128; - // We need 4 bytes per block in the worst case. We launch split_k_limit in z - // dim. 
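// Worked example of this estimate (using split_k_limit = 7 from fpA_intB_gemm.h):
// for the m = 16000, n = 12288 encoder shape mentioned earlier in this file,
// max_grid_m = (16000 + 31) / 32 = 500 and max_grid_n = (12288 + 127) / 128 = 96,
// so the function returns 500 * 96 * 7 * 4 = 1,344,000 bytes, roughly 1.3 MB of
// scratch space.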
- return max_grid_m * max_grid_n * split_k_limit * 4; -} - -// =============================== Specialization T == WeightType -// ======================================= -template -void CutlassFpAIntBGemmRunner::gemm_bias_act( - const float* A, - const WeightType* B, - const float* weight_scales, - const float* biases, - float* C, - int m, - int n, - int k, - std::string activation_type, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream) { - throw std::runtime_error( - ("Attempting to run mixed gemm bias act when the types are the same is " - "an error.")); -} - -template -void CutlassFpAIntBGemmRunner::gemm( - const float* A, - const WeightType* B, - const float* weight_scales, - float* C, - int m, - int n, - int k, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream) { - throw std::runtime_error(( - "Attempting to run mixed gemm when the types are the same is an error.")); -} - -template -int CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, - const int n, - const int k) { - return 0; -} - -template class CutlassFpAIntBGemmRunner; -template class CutlassFpAIntBGemmRunner; -#ifdef PADDLE_CUDA_BF16 -template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t>; -#endif -template class CutlassFpAIntBGemmRunner; -template class CutlassFpAIntBGemmRunner; -#ifdef PADDLE_CUDA_BF16 -template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t>; -#endif -} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h deleted file mode 100644 index 8ae1047c43afc..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ /dev/null @@ -1,519 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma once - -#include "cutlass/gemm/device/gemm_universal_base.h" -#include "cutlass/gemm/kernel/default_gemm.h" -#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/compute_occupancy.h" - -#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/epilogue_helpers.h" -#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/ft_gemm_configs.h" -#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h" -#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h" -#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h" -#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/threadblock/default_mma.h" -#pragma GCC diagnostic pop - -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/cutlass_heuristic.h" -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/arch_define.h" -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" -#include "paddle/phi/kernels/fusion/cutlass/utils/cuda_utils.h" -namespace phi { - -template -void generic_mixed_gemm_kernelLauncher(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - static_assert(cutlass::platform::is_same::value || -#ifdef PADDLE_CUDA_BF16 - cutlass::platform::is_same::value || -#endif - cutlass::platform::is_same::value, - "Specialized for bfloat16, half, float"); - - static_assert( - cutlass::platform::is_same::value || - cutlass::platform::is_same::value || - cutlass::platform::is_same::value, - ""); - - // The cutlass type for the input elements. This is needed to convert to - // cutlass::half_t if necessary. - using ElementType_ = typename cutlass::platform::conditional< - cutlass::platform::is_same::value, - cutlass::half_t, - T>::type; -#ifdef PADDLE_CUDA_BF16 - using ElementType = typename cutlass::platform::conditional< - cutlass::platform::is_same::value, - cutlass::bfloat16_t, - ElementType_>::type; -#endif - using CutlassWeightType_ = typename cutlass::platform::conditional< - cutlass::platform::is_same::value, - cutlass::half_t, - WeightType>::type; - -#ifdef PADDLE_CUDA_BF16 - using CutlassWeightType = typename cutlass::platform::conditional< - cutlass::platform::is_same::value, - cutlass::bfloat16_t, - CutlassWeightType_>::type; -#endif - - // We need separate config for each architecture since we will target - // different tensorcore instructions. For float, we do not target TCs. 
- using MixedGemmArchTraits = cutlass::gemm::kernel:: - MixedGemmArchTraits; - using ElementAccumulator = typename MixedGemmArchTraits::AccType; - - using EpilogueOp = typename Epilogue::Op; - if (gemm_config.split_k_style == SplitKStyle::NO_SPLIT_K) { - using GemmKernel_ = typename cutlass::gemm::kernel::DefaultGemm< - ElementType, - cutlass::layout::RowMajor, - MixedGemmArchTraits::ElementsPerAccessA, - CutlassWeightType, - typename MixedGemmArchTraits::LayoutB, - MixedGemmArchTraits::ElementsPerAccessB, - ElementType, - cutlass::layout::RowMajor, - ElementAccumulator, - cutlass::arch::OpClassTensorOp, - arch, - ThreadblockShape, - WarpShape, - typename MixedGemmArchTraits::InstructionShape, - EpilogueOp, - typename cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, - Stages, - true, - typename MixedGemmArchTraits::Operator>::GemmKernel; - - using GemmKernel = cutlass::gemm::kernel::GemmFpAIntB< - typename GemmKernel_::Mma, - typename GemmKernel_::Epilogue, - typename GemmKernel_::ThreadblockSwizzle, - arch, // Ensure top level arch is used for dispatch - GemmKernel_::kSplitKSerial>; - - if (occupancy != nullptr) { - *occupancy = compute_occupancy_for_kernel(); - return; - } - - using Gemm = cutlass::gemm::device::GemmUniversalBase; - - const int ldb = - cutlass::platform::is_same::value - ? n - : k * GemmKernel::kInterleave; - - typename Gemm::Arguments args( - {m, n, k}, - {reinterpret_cast(const_cast(A)), k}, - {reinterpret_cast(const_cast(B)), ldb}, - {reinterpret_cast(const_cast(weight_scales)), 0}, - {reinterpret_cast(const_cast(biases)), 0}, - {reinterpret_cast(C), n}, - gemm_config.split_k_factor, - {ElementAccumulator(1.f), ElementAccumulator(0.f)}); - - // This assertion is enabled because because for the column interleaved - // layout, K MUST be a multiple of threadblockK. The reason for this is that - // the default pitchlinear iterators are used to handle walking over the - // interleaved matrix. The way masking in handled in these do not map to the - // interleaved layout. We need to write our own predicated iterator in order - // to relax this limitation. - if (GemmKernel::kInterleave > 1 && - ((k % MixedGemmArchTraits::ThreadblockK) || - ((k / gemm_config.split_k_factor) % - MixedGemmArchTraits::ThreadblockK))) { - throw std::runtime_error( - "Temp assertion: k must be multiple of threadblockK"); - } - - Gemm gemm; - if (gemm.get_workspace_size(args) > workspace_bytes) { - // TODO(wangbojun) here to reset the split-k in gemm args, but no work for - // now to run bf16 mixgemm, we have set the split-k factor to 1 - VLOG(1) << "Requested split-k but workspace size insufficient. Falling " - "back to non-split-k implementation."; - VLOG(1) << "need workspace sizoe of: " << gemm.get_workspace_size(args) - << ", but got " << workspace_bytes; - VLOG(1) << "args.batch_stride_D:" << args.batch_stride_D; - VLOG(1) << "args.batch_count:" << args.batch_count; - // If requested split-k factor will require more workspace bytes, revert - // to standard gemm. - // - args.batch_count = 1; - } - - auto can_implement = gemm.can_implement(args); - if (can_implement != cutlass::Status::kSuccess) { - std::string err_msg = - "fpA_intB cutlass kernel will fail for params. Error: " + - std::string(cutlassGetStatusString(can_implement)); - throw std::runtime_error("[fpA_intB Runner] " + err_msg); - } - - auto init_status = gemm.initialize(args, workspace, stream); - if (init_status != cutlass::Status::kSuccess) { - std::string err_msg = - "Failed to initialize cutlass fpA_intB gemm. 
Error: " + - std::string(cutlassGetStatusString(init_status)); - throw std::runtime_error("[fpA_intB Runner] " + err_msg); - } - - auto run_status = gemm.run(stream); - if (run_status != cutlass::Status::kSuccess) { - std::string err_msg = "Failed to run cutlass fpA_intB gemm. Error: " + - std::string(cutlassGetStatusString(run_status)); - throw std::runtime_error("[fpA_intB Runner] " + err_msg); - } - } else { - // for stream-k, we set gemm_config.split_k_factor = 1 to use default load - // balance. - gemm_config.split_k_factor = 1; - using GemmKernel_ = typename cutlass::gemm::kernel::DefaultGemmUniversal< - ElementType, - cutlass::layout::RowMajor, - cutlass::ComplexTransform::kNone, - MixedGemmArchTraits::ElementsPerAccessA, - CutlassWeightType, - typename MixedGemmArchTraits::LayoutB, - cutlass::ComplexTransform::kNone, - MixedGemmArchTraits::ElementsPerAccessB, - ElementType, - cutlass::layout::RowMajor, - ElementAccumulator, - cutlass::arch::OpClassTensorOp, - arch, - ThreadblockShape, - WarpShape, - typename MixedGemmArchTraits::InstructionShape, - EpilogueOp, - typename cutlass::gemm::threadblock::ThreadblockSwizzleStreamK, - Stages, - typename MixedGemmArchTraits::Operator, - cutlass::gemm::SharedMemoryClearOption::kNone>::GemmKernel; - using GemmKernel = cutlass::gemm::kernel::GemmFpAIntBSplitK< - typename GemmKernel_::Mma, - typename GemmKernel_::Epilogue, - typename GemmKernel_::ThreadblockSwizzle, - arch // Ensure top level arch is used for dispatch - >; - - if (occupancy != nullptr) { - *occupancy = compute_occupancy_for_kernel2(); - return; - } - - using Gemm = cutlass::gemm::device::GemmUniversalBase; - - const int ldb = - cutlass::platform::is_same::value - ? n - : k * GemmKernel::kInterleave; - typename Gemm::Arguments args( - cutlass::gemm::GemmUniversalMode::kGemm, - {m, n, k}, - {reinterpret_cast(const_cast(A)), k}, - {reinterpret_cast(const_cast(B)), ldb}, - {reinterpret_cast(const_cast(weight_scales)), 0}, - {reinterpret_cast(const_cast(biases)), 0}, - {reinterpret_cast(C), n}, - gemm_config.split_k_factor, - {ElementAccumulator(1.f), ElementAccumulator(0.f)}); - - // This assertion is enabled because because for the column interleaved - // layout, K MUST be a multiple of threadblockK. The reason for this is that - // the default pitchlinear iterators are used to handle walking over the - // interleaved matrix. The way masking in handled in these do not map to the - // interleaved layout. We need to write our own predicated iterator in order - // to relax this limitation. - if (GemmKernel::kInterleave > 1 && - ((k % MixedGemmArchTraits::ThreadblockK) || - ((k / gemm_config.split_k_factor) % - MixedGemmArchTraits::ThreadblockK))) { - throw std::runtime_error( - "Temp assertion: k must be multiple of threadblockK"); - } - - Gemm gemm; - if (gemm.get_workspace_size(args) > workspace_bytes) { - VLOG(1) << "Requested split-k but workspace size insufficient. Falling " - "back to non-split-k implementation."; - VLOG(1) << "Requested workspace_size: " << gemm.get_workspace_size(args); - VLOG(1) << "get workspace_size: " << workspace_bytes; - // If requested split-k factor will require more workspace bytes, revert - // to standard gemm. - args.batch_count = 1; - } - - auto can_implement = gemm.can_implement(args); - if (can_implement != cutlass::Status::kSuccess) { - std::string err_msg = - "fpA_intB cutlass kernel will fail for params. 
Error: " + - std::string(cutlassGetStatusString(can_implement)); - throw std::runtime_error("[fpA_intB_gemm Error][fpA_intB Runner] " + - err_msg); - } - - auto init_status = gemm.initialize(args, workspace, stream); - if (init_status != cutlass::Status::kSuccess) { - std::string err_msg = - "Failed to initialize cutlass fpA_intB gemm. Error: " + - std::string(cutlassGetStatusString(init_status)); - throw std::runtime_error("[fpA_intB_gemm Error][fpA_intB Runner] " + - err_msg); - } - - auto run_status = gemm.run(stream); - if (run_status != cutlass::Status::kSuccess) { - std::string err_msg = "Failed to run cutlass fpA_intB gemm. Error: " + - std::string(cutlassGetStatusString(run_status)); - throw std::runtime_error("[fpA_intB_gemm Error][fpA_intB Runner] " + - err_msg); - } - } -} - -template -void generic_mixed_gemm_kernelLauncher_template(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy); - -template -struct dispatch_stages { - static void dispatch(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy = nullptr) { - // VLOG(3)<<__PRETTY_FUNCTION__; - std::string err_msg = "Cutlass fpA_intB gemm. Not instantiates for arch " + - std::to_string(arch::kMinComputeCapability) + - " with stages set to " + std::to_string(Stages); - throw std::runtime_error("[dispatch_stages::dispatch] " + err_msg); - } -}; -template -struct dispatch_stages { - static void dispatch(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy = nullptr) { - // VLOG(3)<<__PRETTY_FUNCTION__; - - generic_mixed_gemm_kernelLauncher_template(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - } -}; - -#if defined(USE_FPAINTB_GEMM_WITH_SM80) -template -struct dispatch_stages 2)>::type> { - static void dispatch(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy = nullptr) { - generic_mixed_gemm_kernelLauncher_template(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - } -}; -#endif - -template -void dispatch_gemm_config(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy); - -template -void dispatch_gemm_to_cutlass(const T* A, - const WeightType* B, - const T* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - char* workspace, - size_t workspace_bytes, - CutlassGemmConfig gemm_config, - cudaStream_t stream, - int* occupancy); - -} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py deleted 
file mode 100644 index ad7f1e65591ce..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import re - -# this is a file's header part -CommonHead = ''' -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. - -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { -''' - -CommonTail = ''' -} // namespace phi - -''' -DispatchGemmConfigInstanceDeclare = """ -template<> -void generic_mixed_gemm_kernelLauncher_template<{T}, - {WeightType}, - {arch}, - {EpilogueTag}, - {ThreadblockShape}, - {WarpShape}, - {Stages}>( - const {T}* A, - const {WeightType}* B, - const {T}* weight_scales, - const {T}* biases, - {T}* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher<{T}, - {WeightType}, - {arch}, - {EpilogueTag}, - {ThreadblockShape}, - {WarpShape}, - {Stages}>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} -""" - -DefineHeader = """ -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
- -""" - -DefaultArch = [70, 75, 80] -epilogue_tags = ["bias", "biasFtGelu", "biasReLU", "noBias"] - -WeightTypes = ["uint8_t", "cutlass::uint4b_t"] -ThreadblockShapes = [ - "cutlass::gemm::GemmShape<16, 128, 64>", - "cutlass::gemm::GemmShape<32, 128, 64>", - "cutlass::gemm::GemmShape<64, 128, 64>", - "cutlass::gemm::GemmShape<128, 128, 64>", - "cutlass::gemm::GemmShape<128, 256, 64>", -] -WarpShapes = [ - "cutlass::gemm::GemmShape<16, 32, 64>", - "cutlass::gemm::GemmShape<32, 32, 64>", - "cutlass::gemm::GemmShape<64, 64, 64>", - "cutlass::gemm::GemmShape<64, 64, 64>", - "cutlass::gemm::GemmShape<64, 64, 64>", -] - -ThreadblockShapes_sm70 = [ - "cutlass::gemm::GemmShape<32, 128, 64>", - "cutlass::gemm::GemmShape<64, 128, 64>", -] -WarpShapes_sm70 = [ - "cutlass::gemm::GemmShape<32, 32, 64>", - "cutlass::gemm::GemmShape<64, 64, 64>", -] -StagesList = {70: [2], 75: [2], 80: [2, 3, 4, 5]} - -ElementTypes = {"fp16": "half", "bf16": "__nv_bfloat16"} -Archs = { - 70: "cutlass::arch::Sm70", - 75: "cutlass::arch::Sm75", - 80: "cutlass::arch::Sm80", -} -EpilogueTags = { - "bias": "EpilogueOpBias", - "noBias": "EpilogueOpNoBias", - # "biasFtGelu": "EpilogueOpBiasFtGelu", - # "biasReLU": "EpilogueOpBiasReLU", -} - - -def SubstituteTemplate(template, values): - text = template - changed = True - while changed: - changed = False - for key, value in values.items(): - regex = "\\{%s\\}" % key - newtext = re.sub(regex, value, text) - if newtext != text: - changed = True - text = newtext - return text - - -def find_arch_range(archs): - compile_archs = [] - for arch in archs: - if arch >= 70 and arch < 75: - compile_archs.append(70) - elif arch >= 75 and arch < 80: - compile_archs.append(75) - elif arch >= 80 and arch < 90: - compile_archs.append(80) - compile_archs = list(set(compile_archs)) - compile_archs.sort() - return compile_archs - - -def convert_to_arch_list(archs): - archs = archs.lower().strip() - if archs == "all": - return DefaultArch - - archs = [int(s.strip()) for s in archs.split(';') if s.strip()] - archs = list(set(archs)) - return find_arch_range(archs) - - -def parse_args(): - parser = argparse.ArgumentParser( - description="The argument for generating the generic_mixed_gemm_kernelLauncher instance." 
- ) - parser.add_argument( - "--cuda_arch", - type=convert_to_arch_list, - default=convert_to_arch_list("All"), - help="The CUDA architecture to be generated.", - ) - args = parser.parse_args() - return args - - -# generate source cu -def generate_source_cu( - element_type: str, arch: int, epilogue_tag: str, stages: int -): - all_code = CommonHead - ThreadblockShapes_arch = ThreadblockShapes - WarpShapes_arch = WarpShapes - if arch < 80: - ThreadblockShapes_arch = ThreadblockShapes_sm70 - WarpShapes_arch = WarpShapes_sm70 - for WeightType in WeightTypes: - for i in range(len(ThreadblockShapes_arch)): - value_dict = { - "T": ElementTypes[element_type], - "WeightType": WeightType, - "arch": Archs[arch], - "EpilogueTag": EpilogueTags[epilogue_tag], - "ThreadblockShape": ThreadblockShapes_arch[i], - "WarpShape": WarpShapes_arch[i], - "Stages": str(stages), - } - all_code += SubstituteTemplate( - DispatchGemmConfigInstanceDeclare, value_dict - ) - all_code += CommonTail - return all_code - - -if __name__ == "__main__": - args = parse_args() - archs = args.cuda_arch - header_all = DefineHeader - header_name = "autogen_tmp/arch_define.h" - if archs: - for arch in archs: - define_line = "#define USE_FPAINTB_GEMM_WITH_SM%s\n" % str(arch) - header_all += define_line - with open(header_name, "w") as f: - f.write(header_all) - f.close() - if archs: - for element_type in ElementTypes.keys(): - for arch in archs: - for epilogue_tag in EpilogueTags.keys(): - for stages in StagesList[arch]: - file_name = "autogen_tmp/generic_mixed_gemm_kernelLauncher_{}_sm{}_stages{}_{}.cu".format( - element_type, arch, stages, epilogue_tag - ) - all_code = generate_source_cu( - element_type, arch, epilogue_tag, stages - ) - with open(file_name, "w") as f: - f.write(all_code) - f.close() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index dc20eced15269..4c1ca640d20e4 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -380,8 +380,11 @@ vander, ) from .random import ( # noqa: F401 + bernoulli_, binomial, exponential_, + log_normal, + log_normal_, multinomial, normal, normal_, @@ -394,9 +397,6 @@ standard_normal, uniform, uniform_, - bernoulli_, - log_normal, - log_normal_, ) from .search import ( # noqa: F401 argmax, @@ -798,4 +798,4 @@ ('__or__', 'bitwise_or'), ('__xor__', 'bitwise_xor'), ('__invert__', 'bitwise_not'), -] \ No newline at end of file +] diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 6903f729ec1e6..ca93c47f61e46 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1519,7 +1519,7 @@ def bernoulli_(x, p=0.5, name=None): - x (Tensor): Input Tensor ``x``. Examples: .. code-block:: python - + >>> import paddle >>> x = paddle.empty((3, 4)).uniform_(0, 1) >>> x.bernoulli_() @@ -1537,7 +1537,7 @@ def bernoulli_(x, p=0.5, name=None): check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential") - uniform_(x, min=0., max=1.) + uniform_(x, min=0.0, max=1.0) return x.set_value((x < p).astype(x.dtype)) @@ -1571,7 +1571,7 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None): - out (Tensor): A Tensor filled with random values sampled from a log normal distribution with ``mean`` and ``std`` . Examples: .. 
code-block:: python - + :name: log_normal-example-1 >>> import paddle >>> out1 = paddle.log_normal(shape=[2, 3]) @@ -1616,11 +1616,13 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None): ) if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - - n_mean = paddle.log(mean ** 2 / paddle.sqrt(mean ** 2 + std ** 2)) - n_std = paddle.sqrt(paddle.log(1 + (std ** 2 / mean ** 2))) - distribution = gaussian(shape, mean=n_mean, std=n_std, seed=seed, dtype=dtype) + n_mean = paddle.log(mean**2 / paddle.sqrt(mean**2 + std**2)) + n_std = paddle.sqrt(paddle.log(1 + (std**2 / mean**2))) + + distribution = gaussian( + shape, mean=n_mean, std=n_std, seed=seed, dtype=dtype + ) return paddle.exp(distribution) @@ -1665,7 +1667,7 @@ def log_normal_(x, mean=0.0, std=1.0, seed=0, name=None): >>> # doctest: -SKIP """ - n_mean = paddle.log(mean ** 2 / paddle.sqrt(mean ** 2 + std ** 2)) - n_std = paddle.sqrt(paddle.log(1 + (std ** 2 / mean ** 2))) + n_mean = paddle.log(mean**2 / paddle.sqrt(mean**2 + std**2)) + n_std = paddle.sqrt(paddle.log(1 + (std**2 / mean**2))) return gaussian_(x, mean=mean, std=std, seed=seed).exp_() diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py index 41b2999db19aa..746786f9367c6 100644 --- a/test/legacy_test/test_inplace.py +++ b/test/legacy_test/test_inplace.py @@ -1707,7 +1707,9 @@ def init_data(self): self.seed = 100 def inplace_api_processing(self, var): - return paddle.log_normal_(var, self.shape, self.mean, self.std, self.seed) + return paddle.log_normal_( + var, self.shape, self.mean, self.std, self.seed + ) def non_inplace_api_processing(self, var): return paddle.log_normal(var, self.index, self.axis, self.value) diff --git a/third_party/flashattn b/third_party/flashattn index a96f802471445..b74460b385b69 160000 --- a/third_party/flashattn +++ b/third_party/flashattn @@ -1 +1 @@ -Subproject commit a96f8024714455fb86a326e20c3b7f700ec50772 +Subproject commit b74460b385b691d881ff2d3a1adbcefdcac574a3 From 17840aded797233517b23e81b7d90a3fc91c68a7 Mon Sep 17 00:00:00 2001 From: PommesPeter <434596665@qq.com> Date: Tue, 19 Dec 2023 13:54:16 +0800 Subject: [PATCH 07/15] :white_check_mark: Test: added test cases --- python/paddle/tensor/random.py | 2 +- test/legacy_test/test_log_normal_op.py | 252 +++++++++++++++++++++---- 2 files changed, 214 insertions(+), 40 deletions(-) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index ca93c47f61e46..22d5bfcf3c527 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1670,4 +1670,4 @@ def log_normal_(x, mean=0.0, std=1.0, seed=0, name=None): n_mean = paddle.log(mean**2 / paddle.sqrt(mean**2 + std**2)) n_std = paddle.sqrt(paddle.log(1 + (std**2 / mean**2))) - return gaussian_(x, mean=mean, std=std, seed=seed).exp_() + return gaussian_(x, mean=n_mean, std=n_std, seed=seed).exp_() diff --git a/test/legacy_test/test_log_normal_op.py b/test/legacy_test/test_log_normal_op.py index 5083c9f18e955..cac82a5a77998 100644 --- a/test/legacy_test/test_log_normal_op.py +++ b/test/legacy_test/test_log_normal_op.py @@ -15,9 +15,12 @@ import unittest import numpy as np +from op_test import paddle_static_guard import paddle +from paddle import base from paddle.base import core +from paddle.tensor import random SEED = 100 np.random.seed(SEED) @@ -29,45 +32,216 @@ def output_log_normal(shape, mean, std): class TestLogNormalAPI(unittest.TestCase): - DTYPE = "float64" - SHAPE = [2, 4] - MEAN = 0 - STD = 1 - - def 
setUp(self): - self.x = output_log_normal(self.SHAPE, self.MEAN, self.STD) - self.place = [paddle.CPUPlace()] - if core.is_compiled_with_cuda(): - self.place.append(paddle.CUDAPlace(0)) - - def test_api_static(self): - def run(place): - paddle.enable_static() - with paddle.static.program_guard(paddle.static.Program()): - out = paddle.log_normal() - exe = paddle.static.Executor(place) - res = exe.run( - paddle.static.default_main_program(), - feed={}, - fetch_list=[out], - ) - return res[0] - - for place in self.place: - res = run(place) - self.assertTrue(np.allclose(res, self.x)) - - def test_api_dygraph(self): - def run(place): - paddle.disable_static(place) - out = paddle.log_normal(self.SHAPE, self.MEAN, self.STD, seed=SEED) - - out_ref = output_log_normal(self.SHAPE, self.MEAN, self.STD) - np.testing.assert_allclose(out.numpy(), out_ref, rtol=1e-5) - paddle.enable_static() - - for place in self.place: - run(place) + def test_static_api(self): + with paddle_static_guard(): + positive_2_int32 = paddle.tensor.fill_constant([1], "int32", 2000) + + positive_2_int64 = paddle.tensor.fill_constant([1], "int64", 500) + shape_tensor_int32 = paddle.static.data( + name="shape_tensor_int32", shape=[2], dtype="int32" + ) + + shape_tensor_int64 = paddle.static.data( + name="shape_tensor_int64", shape=[2], dtype="int64" + ) + + out_1 = random.log_normal( + shape=[2000, 500], dtype="float32", mean=0.0, std=1.0, seed=10 + ) + + out_2 = random.log_normal( + shape=[2000, positive_2_int32], + dtype="float32", + mean=0.0, + std=1.0, + seed=10, + ) + + out_3 = random.log_normal( + shape=[2000, positive_2_int64], + dtype="float32", + mean=0.0, + std=1.0, + seed=10, + ) + + out_4 = random.log_normal( + shape=shape_tensor_int32, + dtype="float32", + mean=0.0, + std=1.0, + seed=10, + ) + + out_5 = random.log_normal( + shape=shape_tensor_int64, + dtype="float32", + mean=0.0, + std=1.0, + seed=10, + ) + + out_6 = random.log_normal( + shape=shape_tensor_int64, + dtype=np.float32, + mean=0.0, + std=1.0, + seed=10, + ) + + exe = base.Executor(place=base.CPUPlace()) + res_1, res_2, res_3, res_4, res_5, res_6 = exe.run( + base.default_main_program(), + feed={ + "shape_tensor_int32": np.array([2000, 500]).astype("int32"), + "shape_tensor_int64": np.array([2000, 500]).astype("int64"), + }, + fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6], + ) + + self.assertAlmostEqual(np.mean(res_1), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_1), 1.0, delta=0.1) + self.assertAlmostEqual(np.mean(res_2), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_2), 1.0, delta=0.1) + self.assertAlmostEqual(np.mean(res_3), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_3), 1.0, delta=0.1) + self.assertAlmostEqual(np.mean(res_4), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_5), 1.0, delta=0.1) + self.assertAlmostEqual(np.mean(res_5), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_5), 1.0, delta=0.1) + self.assertAlmostEqual(np.mean(res_6), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_6), 1.0, delta=0.1) + + def test_default_dtype(self): + def test_default_fp16(): + paddle.framework.set_default_dtype('float16') + out = paddle.tensor.random.log_normal([2, 3]) + self.assertEqual(out.dtype, base.core.VarDesc.VarType.FP16) + + def test_default_fp32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.log_normal([2, 3]) + self.assertEqual(out.dtype, base.core.VarDesc.VarType.FP32) + + def test_default_fp64(): + paddle.framework.set_default_dtype('float64') + out = 
paddle.tensor.random.log_normal([2, 3]) + self.assertEqual(out.dtype, base.core.VarDesc.VarType.FP64) + + if paddle.is_compiled_with_cuda(): + paddle.set_device('gpu') + test_default_fp16() + test_default_fp64() + test_default_fp32() + + +class TestStandardNormalDtype(unittest.TestCase): + def test_default_dtype(self): + def test_default_fp16(): + paddle.framework.set_default_dtype('float16') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, base.core.VarDesc.VarType.FP16) + + def test_default_fp32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, base.core.VarDesc.VarType.FP32) + + def test_default_fp64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, base.core.VarDesc.VarType.FP64) + + if paddle.is_compiled_with_cuda(): + paddle.set_device('gpu') + test_default_fp16() + test_default_fp64() + test_default_fp32() + + +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + # Different GPU generatte different random value. Only test V100 here. + if "V100" not in paddle.device.cuda.get_device_name(): + return + + def _check_random_value(dtype, expect, expect_mean, expect_std): + x = paddle.randn([32, 3, 1024, 1024], dtype=dtype) + actual = x.numpy() + np.testing.assert_allclose( + actual[2, 1, 512, 1000:1010], expect, rtol=1e-05 + ) + self.assertTrue(np.mean(actual), expect_mean) + self.assertTrue(np.std(actual), expect_std) + + print("Test Fixed Random number on V100 GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + expect = [ + -0.79037829, + -0.54411126, + -0.32266671, + 0.35791815, + 1.44169267, + -0.87785644, + -1.23909874, + -2.18194139, + 0.49489656, + 0.40703062, + ] + expect_mean = ( + -0.0000053026194133403266873214888799115129813799285329878330230713 + ) + expect_std = 0.99999191058126390974081232343451119959354400634765625 + _check_random_value( + core.VarDesc.VarType.FP64, expect, expect_mean, expect_std + ) + + expect = [ + -0.7988942, + 1.8644791, + 0.02782744, + 1.3692524, + 0.6419724, + 0.12436751, + 0.12058455, + -1.9984808, + 1.5635862, + 0.18506318, + ] + expect_mean = -0.00004762359094456769526004791259765625 + expect_std = 0.999975681304931640625 + _check_random_value( + core.VarDesc.VarType.FP32, expect, expect_mean, expect_std + ) + + +class TestLogNormalErrors(unittest.TestCase): + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + mean = [1, 2, 3] + self.assertRaises(TypeError, paddle.log_normal, mean) + + std = [1, 2, 3] + self.assertRaises(TypeError, paddle.log_normal, std=std) + + mean = paddle.static.data('Mean', [100], 'int32') + self.assertRaises(TypeError, paddle.log_normal, mean) + + std = paddle.static.data('Std', [100], 'int32') + self.assertRaises(TypeError, paddle.log_normal, mean=1.0, std=std) + + self.assertRaises(TypeError, paddle.log_normal, shape=1) + + self.assertRaises(TypeError, paddle.log_normal, shape=[1.0]) + + shape = paddle.static.data('Shape', [100], 'float32') + self.assertRaises(TypeError, paddle.log_normal, shape=shape) if __name__ == "__main__": From c6fc4ed37237de2e369de2c5fe6e5aa328084449 Mon Sep 17 00:00:00 2001 From: PommesPeter Date: Tue, 19 Dec 2023 14:00:20 +0800 Subject: [PATCH 08/15] :pencil2: Fix: 
fixed a error --- ...m_kernelLauncher_fp16_sm80_stages5_bias.cu | 439 ----------- ...kernelLauncher_fp16_sm80_stages5_noBias.cu | 439 ----------- .../fpA_intB_gemm/fpA_intB_gemm.h | 158 ++++ .../fpA_intB_gemm/fpA_intB_gemm_template.cu | 681 ++++++++++++++++++ .../fpA_intB_gemm/fpA_intB_gemm_template.h | 519 +++++++++++++ .../generic_mixed_gemm_kernelLauncher.py | 228 ++++++ 6 files changed, 1586 insertions(+), 878 deletions(-) delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_bias.cu delete mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_noBias.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h create mode 100644 paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_bias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_bias.cu deleted file mode 100644 index a9cbad5330d9a..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_bias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. - -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - 
workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void 
generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_noBias.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_noBias.cu deleted file mode 100644 index 2fdeec9273b6c..0000000000000 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen_tmp/generic_mixed_gemm_kernelLauncher_fp16_sm80_stages5_noBias.cu +++ /dev/null @@ -1,439 +0,0 @@ - -// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. - -#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" - -namespace phi { - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - 
workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const uint8_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<16, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<32, 32, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -template<> -void generic_mixed_gemm_kernelLauncher_template, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - const half* A, - const cutlass::uint4b_t* B, - const half* weight_scales, - const half* biases, - half* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace, - size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - generic_mixed_gemm_kernelLauncher, - cutlass::gemm::GemmShape<64, 64, 64>, - 5>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); -} - -} // namespace phi - diff --git 
a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h new file mode 100644 index 0000000000000..f86ccba9d4617 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/ft_gemm_configs.h" +// #include "src/fastertransformer/utils/allocator.h" +#include "cuda_runtime_api.h" // NOLINT + +namespace phi { + +/* + This runner only supports: + T in {half, __nv_bfloat} WeightType in {uint8_t, cutlass::uint4b_t} + + Activations, biases, scales and outputs are all assumed to be row-major. + + However, it is assumed that B is in a special format governed by + cutlass_extensions/gemm/kernel/mixed_gemm_B_layout. In this case, B must be + preprocessed using the cutlass weight only quant preprocessors. The weight + preprocessor will instantiate the layout and preprocess based on the + instantiation, so layout changes should only require modifications to + mix_gemm_B_layout.h. +*/ + +template +class CutlassFpAIntBGemmRunner { + public: + CutlassFpAIntBGemmRunner(); + ~CutlassFpAIntBGemmRunner(); + + void gemm(const T* A, + const WeightType* B, + const T* weight_scales, + T* C, + int m, + int n, + int k, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream); + + void gemm_bias_act(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + std::string activation_type, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream); + + // Returns desired workspace size in bytes. 
+ int getWorkspaceSize(const int m, const int n, const int k); + + private: + template + void dispatch_to_arch(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream, + int* occupancy = nullptr); + + template + void run_gemm(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream); + + private: + static constexpr int split_k_limit = 7; + + int sm_; + int multi_processor_count_; +}; + +// This allocation is present to help with compiling with other structures in +// FT. It will throw an error in all functions because this runner assumes the +// weight type and the activation type are different. We allow empty classes to +// be created, but any calls to gemm or gemm_bias_act will throw an error. +template +class CutlassFpAIntBGemmRunner { + public: + CutlassFpAIntBGemmRunner() = default; + ~CutlassFpAIntBGemmRunner() = default; + + void gemm(const float* A, + const WeightType* B, + const float* weight_scales, + float* C, + int m, + int n, + int k, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream); + + void gemm_bias_act(const float* A, + const WeightType* B, + const float* weight_scales, + const float* biases, + float* C, + int m, + int n, + int k, + std::string activation_type, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream); + + int getWorkspaceSize(const int m, const int n, const int k); +}; +} // namespace phi \ No newline at end of file diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu new file mode 100644 index 0000000000000..869b4689a3ac0 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu @@ -0,0 +1,681 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" +#include "paddle/common/errors.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/arch_define.h" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic pop + +namespace phi { + +template +void dispatch_gemm_config(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + switch (gemm_config.stages) { + case 2: + using DispatcherStages2 = dispatch_stages; + DispatcherStages2::dispatch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case 3: + using DispatcherStages3 = dispatch_stages; + DispatcherStages3::dispatch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case 4: + using DispatcherStages4 = dispatch_stages; + DispatcherStages4::dispatch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case 5: + using DispatcherStages5 = dispatch_stages; + DispatcherStages5::dispatch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + default: + std::string err_msg = "dispatch_gemm_config does not support stages " + + std::to_string(gemm_config.stages); + throw std::runtime_error("[dispatch_gemm_config] " + err_msg); + break; + } +} + +template +void dispatch_gemm_to_cutlass(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + char* workspace, + size_t workspace_bytes, + CutlassGemmConfig gemm_config, + cudaStream_t stream, + int* occupancy) { + // VLOG(3)<<__PRETTY_FUNCTION__; + // Note that SIMT configs are omitted here since they are not supported for + // fpA_intB. We also only instantiate configs here where threadblockShapeM == + // warpShapeM since those usually perform the best for mixed type gemms. 
+ switch (gemm_config.tile_config) { +#if defined(USE_FPAINTB_GEMM_WITH_SM80) || defined(USE_FPAINTB_GEMM_WITH_SM86) + case CutlassTileConfig::CtaShape16x128x64_WarpShape16x32x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<16, 32, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; +#endif + case CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<32, 32, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case CutlassTileConfig::CtaShape64x128x64_WarpShape64x64x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<64, 64, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; +#if defined(USE_FPAINTB_GEMM_WITH_SM80) || defined(USE_FPAINTB_GEMM_WITH_SM86) + case CutlassTileConfig::CtaShape128x128x64_WarpShape64x64x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<64, 64, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + // config for M_16000_N_12288_K_6144 in encoder + case CutlassTileConfig::CtaShape128x256x64_WarpShape64x64x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<64, 64, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; +#endif + case CutlassTileConfig::Undefined: + throw std::runtime_error( + "[fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined."); + break; + case CutlassTileConfig::ChooseWithHeuristic: + throw std::runtime_error( + "[fpA_intB][dispatch_gemm_to_cutlass] gemm config should have " + "already been set by heuristic."); + break; + default: + throw std::runtime_error( + "[fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed " + "type GEMM."); + break; + } +} + +template +void dispatch_gemm_to_cutlass_sm7x(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + char* workspace, + size_t workspace_bytes, + CutlassGemmConfig gemm_config, + cudaStream_t stream, + int* occupancy) { + // VLOG(3)<<__PRETTY_FUNCTION__; + // Note that SIMT configs are omitted here since they are not supported for + // fpA_intB. We also only instantiate configs here where threadblockShapeM == + // warpShapeM since those usually perform the best for mixed type gemms. 
+ switch (gemm_config.tile_config) { + case CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<32, 32, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case CutlassTileConfig::CtaShape64x128x64_WarpShape64x64x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<64, 64, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case CutlassTileConfig::Undefined: + throw std::runtime_error( + "[fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined."); + break; + case CutlassTileConfig::ChooseWithHeuristic: + throw std::runtime_error( + "[fpA_intB][dispatch_gemm_to_cutlass] gemm config should have " + "already been set by heuristic."); + break; + default: + throw std::runtime_error( + "[fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed " + "type GEMM."); + break; + } +} + +template +CutlassFpAIntBGemmRunner::CutlassFpAIntBGemmRunner() { + // VLOG(3)<<__PRETTY_FUNCTION__; + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + sm_ = getSMVersion(); + check_cuda_error(cudaDeviceGetAttribute( + &multi_processor_count_, cudaDevAttrMultiProcessorCount, device)); +} + +template +CutlassFpAIntBGemmRunner::~CutlassFpAIntBGemmRunner() { + // VLOG(3)<<__PRETTY_FUNCTION__; +} + +template +template +void CutlassFpAIntBGemmRunner::dispatch_to_arch( + const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + // VLOG(3)<<__PRETTY_FUNCTION__; + if (sm_ >= 70 && sm_ < 75) { +#if defined(USE_FPAINTB_GEMM_WITH_SM70) + dispatch_gemm_to_cutlass_sm7x(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + gemm_config, + stream, + occupancy); +#else + throw std::runtime_error( + "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " + "CUTLASS mixed type GEMM"); +#endif + } else if (sm_ >= 75 && sm_ < 80) { +#if defined(USE_FPAINTB_GEMM_WITH_SM75) + dispatch_gemm_to_cutlass_sm7x(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + gemm_config, + stream, + occupancy); +#else + throw std::runtime_error( + "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " + "CUTLASS mixed type GEMM"); +#endif + } else if (sm_ >= 80 && sm_ < 90) { +#if defined(USE_FPAINTB_GEMM_WITH_SM80) + dispatch_gemm_to_cutlass( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + gemm_config, + stream, + occupancy); +#else + throw std::runtime_error( + "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " + "CUTLASS mixed type GEMM"); +#endif + } else { + throw std::runtime_error( + "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " + "CUTLASS mixed type GEMM"); + } +} + +template +template +void CutlassFpAIntBGemmRunner::run_gemm( + const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream) { + // VLOG(3)<<__PRETTY_FUNCTION__; + static constexpr bool is_weight_only = !std::is_same::value; + const bool is_weight_only_encoder = m >= 512 ? 
true : false; + std::vector candidate_configs = + get_candidate_configs(sm_, is_weight_only, is_weight_only_encoder, false); + std::vector occupancies(candidate_configs.size()); + + for (size_t ii = 0; ii < candidate_configs.size(); ++ii) { + dispatch_to_arch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + candidate_configs[ii], + workspace_ptr, + workspace_bytes, + stream, + &occupancies[ii]); + } + // Standard GEMM, so 1 "expert". We use the same function for MoE and regular + // FFN. + static constexpr int num_experts = 1; + CutlassGemmConfig chosen_config = + estimate_best_config_from_occupancies(candidate_configs, + occupancies, + m, + n, + k, + num_experts, + split_k_limit, + workspace_bytes, + multi_processor_count_, + is_weight_only); + + dispatch_to_arch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + chosen_config, + workspace_ptr, + workspace_bytes, + stream); +} + +template +void CutlassFpAIntBGemmRunner::gemm_bias_act( + const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + std::string activation_type, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream) { + if (activation_type == "gelu") { + PADDLE_THROW(phi::errors::Unimplemented( + "Activation_type = gelu for fpA_intB gemm is not instantiated.")); + } else if (activation_type == "relu") { + PADDLE_THROW(phi::errors::Unimplemented( + "Activation_type = relu for fpA_intB gemm is not instantiated.")); + } else if (activation_type == "none") { + run_gemm(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + stream); + } else { + throw std::runtime_error(("Invalid activation type.")); + } +} + +template +void CutlassFpAIntBGemmRunner::gemm(const T* A, + const WeightType* B, + const T* weight_scales, + T* C, + int m, + int n, + int k, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream) { + // VLOG(3)<<__PRETTY_FUNCTION__; + run_gemm(A, + B, + weight_scales, + nullptr, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + stream); +} + +template +int CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, + const int n, + const int k) { + // VLOG(3)<<__PRETTY_FUNCTION__; // These are the min tile sizes for each + // config, which would launch the maximum number of blocks + const int max_grid_m = (m + 31) / 32; + const int max_grid_n = (n + 127) / 128; + // We need 4 bytes per block in the worst case. We launch split_k_limit in z + // dim. 
+ return max_grid_m * max_grid_n * split_k_limit * 4; +} + +// =============================== Specialization T == WeightType +// ======================================= +template +void CutlassFpAIntBGemmRunner::gemm_bias_act( + const float* A, + const WeightType* B, + const float* weight_scales, + const float* biases, + float* C, + int m, + int n, + int k, + std::string activation_type, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream) { + throw std::runtime_error( + ("Attempting to run mixed gemm bias act when the types are the same is " + "an error.")); +} + +template +void CutlassFpAIntBGemmRunner::gemm( + const float* A, + const WeightType* B, + const float* weight_scales, + float* C, + int m, + int n, + int k, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream) { + throw std::runtime_error(( + "Attempting to run mixed gemm when the types are the same is an error.")); +} + +template +int CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, + const int n, + const int k) { + return 0; +} + +template class CutlassFpAIntBGemmRunner; +template class CutlassFpAIntBGemmRunner; +#ifdef PADDLE_CUDA_BF16 +template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t>; +#endif +template class CutlassFpAIntBGemmRunner; +template class CutlassFpAIntBGemmRunner; +#ifdef PADDLE_CUDA_BF16 +template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t>; +#endif +} // namespace phi \ No newline at end of file diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h new file mode 100644 index 0000000000000..95f681655c7ea --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -0,0 +1,519 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma once + +#include "cutlass/gemm/device/gemm_universal_base.h" +#include "cutlass/gemm/kernel/default_gemm.h" +#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/compute_occupancy.h" + +#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/epilogue_helpers.h" +#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/ft_gemm_configs.h" +#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h" +#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h" +#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/kernel/fpA_intB_gemm_split_k.h" +#include "paddle/phi/kernels/fusion/cutlass/cutlass_extensions/gemm/threadblock/default_mma.h" +#pragma GCC diagnostic pop + +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/cutlass_heuristic.h" +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/arch_define.h" +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" +#include "paddle/phi/kernels/fusion/cutlass/utils/cuda_utils.h" +namespace phi { + +template +void generic_mixed_gemm_kernelLauncher(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + static_assert(cutlass::platform::is_same::value || +#ifdef PADDLE_CUDA_BF16 + cutlass::platform::is_same::value || +#endif + cutlass::platform::is_same::value, + "Specialized for bfloat16, half, float"); + + static_assert( + cutlass::platform::is_same::value || + cutlass::platform::is_same::value || + cutlass::platform::is_same::value, + ""); + + // The cutlass type for the input elements. This is needed to convert to + // cutlass::half_t if necessary. + using ElementType_ = typename cutlass::platform::conditional< + cutlass::platform::is_same::value, + cutlass::half_t, + T>::type; +#ifdef PADDLE_CUDA_BF16 + using ElementType = typename cutlass::platform::conditional< + cutlass::platform::is_same::value, + cutlass::bfloat16_t, + ElementType_>::type; +#endif + using CutlassWeightType_ = typename cutlass::platform::conditional< + cutlass::platform::is_same::value, + cutlass::half_t, + WeightType>::type; + +#ifdef PADDLE_CUDA_BF16 + using CutlassWeightType = typename cutlass::platform::conditional< + cutlass::platform::is_same::value, + cutlass::bfloat16_t, + CutlassWeightType_>::type; +#endif + + // We need separate config for each architecture since we will target + // different tensorcore instructions. For float, we do not target TCs. 
+ using MixedGemmArchTraits = cutlass::gemm::kernel:: + MixedGemmArchTraits; + using ElementAccumulator = typename MixedGemmArchTraits::AccType; + + using EpilogueOp = typename Epilogue::Op; + if (gemm_config.split_k_style == SplitKStyle::NO_SPLIT_K) { + using GemmKernel_ = typename cutlass::gemm::kernel::DefaultGemm< + ElementType, + cutlass::layout::RowMajor, + MixedGemmArchTraits::ElementsPerAccessA, + CutlassWeightType, + typename MixedGemmArchTraits::LayoutB, + MixedGemmArchTraits::ElementsPerAccessB, + ElementType, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + arch, + ThreadblockShape, + WarpShape, + typename MixedGemmArchTraits::InstructionShape, + EpilogueOp, + typename cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + Stages, + true, + typename MixedGemmArchTraits::Operator>::GemmKernel; + + using GemmKernel = cutlass::gemm::kernel::GemmFpAIntB< + typename GemmKernel_::Mma, + typename GemmKernel_::Epilogue, + typename GemmKernel_::ThreadblockSwizzle, + arch, // Ensure top level arch is used for dispatch + GemmKernel_::kSplitKSerial>; + + if (occupancy != nullptr) { + *occupancy = compute_occupancy_for_kernel(); + return; + } + + using Gemm = cutlass::gemm::device::GemmUniversalBase; + + const int ldb = + cutlass::platform::is_same::value + ? n + : k * GemmKernel::kInterleave; + + typename Gemm::Arguments args( + {m, n, k}, + {reinterpret_cast(const_cast(A)), k}, + {reinterpret_cast(const_cast(B)), ldb}, + {reinterpret_cast(const_cast(weight_scales)), 0}, + {reinterpret_cast(const_cast(biases)), 0}, + {reinterpret_cast(C), n}, + gemm_config.split_k_factor, + {ElementAccumulator(1.f), ElementAccumulator(0.f)}); + + // This assertion is enabled because because for the column interleaved + // layout, K MUST be a multiple of threadblockK. The reason for this is that + // the default pitchlinear iterators are used to handle walking over the + // interleaved matrix. The way masking in handled in these do not map to the + // interleaved layout. We need to write our own predicated iterator in order + // to relax this limitation. + if (GemmKernel::kInterleave > 1 && + ((k % MixedGemmArchTraits::ThreadblockK) || + ((k / gemm_config.split_k_factor) % + MixedGemmArchTraits::ThreadblockK))) { + throw std::runtime_error( + "Temp assertion: k must be multiple of threadblockK"); + } + + Gemm gemm; + if (gemm.get_workspace_size(args) > workspace_bytes) { + // TODO(wangbojun) here to reset the split-k in gemm args, but no work for + // now to run bf16 mixgemm, we have set the split-k factor to 1 + VLOG(1) << "Requested split-k but workspace size insufficient. Falling " + "back to non-split-k implementation."; + VLOG(1) << "need workspace sizoe of: " << gemm.get_workspace_size(args) + << ", but got " << workspace_bytes; + VLOG(1) << "args.batch_stride_D:" << args.batch_stride_D; + VLOG(1) << "args.batch_count:" << args.batch_count; + // If requested split-k factor will require more workspace bytes, revert + // to standard gemm. + // + args.batch_count = 1; + } + + auto can_implement = gemm.can_implement(args); + if (can_implement != cutlass::Status::kSuccess) { + std::string err_msg = + "fpA_intB cutlass kernel will fail for params. Error: " + + std::string(cutlassGetStatusString(can_implement)); + throw std::runtime_error("[fpA_intB Runner] " + err_msg); + } + + auto init_status = gemm.initialize(args, workspace, stream); + if (init_status != cutlass::Status::kSuccess) { + std::string err_msg = + "Failed to initialize cutlass fpA_intB gemm. 
Error: " + + std::string(cutlassGetStatusString(init_status)); + throw std::runtime_error("[fpA_intB Runner] " + err_msg); + } + + auto run_status = gemm.run(stream); + if (run_status != cutlass::Status::kSuccess) { + std::string err_msg = "Failed to run cutlass fpA_intB gemm. Error: " + + std::string(cutlassGetStatusString(run_status)); + throw std::runtime_error("[fpA_intB Runner] " + err_msg); + } + } else { + // for stream-k, we set gemm_config.split_k_factor = 1 to use default load + // balance. + gemm_config.split_k_factor = 1; + using GemmKernel_ = typename cutlass::gemm::kernel::DefaultGemmUniversal< + ElementType, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + MixedGemmArchTraits::ElementsPerAccessA, + CutlassWeightType, + typename MixedGemmArchTraits::LayoutB, + cutlass::ComplexTransform::kNone, + MixedGemmArchTraits::ElementsPerAccessB, + ElementType, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + arch, + ThreadblockShape, + WarpShape, + typename MixedGemmArchTraits::InstructionShape, + EpilogueOp, + typename cutlass::gemm::threadblock::ThreadblockSwizzleStreamK, + Stages, + typename MixedGemmArchTraits::Operator, + cutlass::gemm::SharedMemoryClearOption::kNone>::GemmKernel; + using GemmKernel = cutlass::gemm::kernel::GemmFpAIntBSplitK< + typename GemmKernel_::Mma, + typename GemmKernel_::Epilogue, + typename GemmKernel_::ThreadblockSwizzle, + arch // Ensure top level arch is used for dispatch + >; + + if (occupancy != nullptr) { + *occupancy = compute_occupancy_for_kernel2(); + return; + } + + using Gemm = cutlass::gemm::device::GemmUniversalBase; + + const int ldb = + cutlass::platform::is_same::value + ? n + : k * GemmKernel::kInterleave; + typename Gemm::Arguments args( + cutlass::gemm::GemmUniversalMode::kGemm, + {m, n, k}, + {reinterpret_cast(const_cast(A)), k}, + {reinterpret_cast(const_cast(B)), ldb}, + {reinterpret_cast(const_cast(weight_scales)), 0}, + {reinterpret_cast(const_cast(biases)), 0}, + {reinterpret_cast(C), n}, + gemm_config.split_k_factor, + {ElementAccumulator(1.f), ElementAccumulator(0.f)}); + + // This assertion is enabled because because for the column interleaved + // layout, K MUST be a multiple of threadblockK. The reason for this is that + // the default pitchlinear iterators are used to handle walking over the + // interleaved matrix. The way masking in handled in these do not map to the + // interleaved layout. We need to write our own predicated iterator in order + // to relax this limitation. + if (GemmKernel::kInterleave > 1 && + ((k % MixedGemmArchTraits::ThreadblockK) || + ((k / gemm_config.split_k_factor) % + MixedGemmArchTraits::ThreadblockK))) { + throw std::runtime_error( + "Temp assertion: k must be multiple of threadblockK"); + } + + Gemm gemm; + if (gemm.get_workspace_size(args) > workspace_bytes) { + VLOG(1) << "Requested split-k but workspace size insufficient. Falling " + "back to non-split-k implementation."; + VLOG(1) << "Requested workspace_size: " << gemm.get_workspace_size(args); + VLOG(1) << "get workspace_size: " << workspace_bytes; + // If requested split-k factor will require more workspace bytes, revert + // to standard gemm. + args.batch_count = 1; + } + + auto can_implement = gemm.can_implement(args); + if (can_implement != cutlass::Status::kSuccess) { + std::string err_msg = + "fpA_intB cutlass kernel will fail for params. 
Error: " + + std::string(cutlassGetStatusString(can_implement)); + throw std::runtime_error("[fpA_intB_gemm Error][fpA_intB Runner] " + + err_msg); + } + + auto init_status = gemm.initialize(args, workspace, stream); + if (init_status != cutlass::Status::kSuccess) { + std::string err_msg = + "Failed to initialize cutlass fpA_intB gemm. Error: " + + std::string(cutlassGetStatusString(init_status)); + throw std::runtime_error("[fpA_intB_gemm Error][fpA_intB Runner] " + + err_msg); + } + + auto run_status = gemm.run(stream); + if (run_status != cutlass::Status::kSuccess) { + std::string err_msg = "Failed to run cutlass fpA_intB gemm. Error: " + + std::string(cutlassGetStatusString(run_status)); + throw std::runtime_error("[fpA_intB_gemm Error][fpA_intB Runner] " + + err_msg); + } + } +} + +template +void generic_mixed_gemm_kernelLauncher_template(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy); + +template +struct dispatch_stages { + static void dispatch(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy = nullptr) { + // VLOG(3)<<__PRETTY_FUNCTION__; + std::string err_msg = "Cutlass fpA_intB gemm. Not instantiates for arch " + + std::to_string(arch::kMinComputeCapability) + + " with stages set to " + std::to_string(Stages); + throw std::runtime_error("[dispatch_stages::dispatch] " + err_msg); + } +}; +template +struct dispatch_stages { + static void dispatch(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy = nullptr) { + // VLOG(3)<<__PRETTY_FUNCTION__; + + generic_mixed_gemm_kernelLauncher_template(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + } +}; + +#if defined(USE_FPAINTB_GEMM_WITH_SM80) +template +struct dispatch_stages 2)>::type> { + static void dispatch(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy = nullptr) { + generic_mixed_gemm_kernelLauncher_template(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + } +}; +#endif + +template +void dispatch_gemm_config(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy); + +template +void dispatch_gemm_to_cutlass(const T* A, + const WeightType* B, + const T* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + char* workspace, + size_t workspace_bytes, + CutlassGemmConfig gemm_config, + cudaStream_t stream, + int* occupancy); + +} // namespace phi \ No newline at end of file diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py 
b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py new file mode 100644 index 0000000000000..12bb74fb04e8c --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py @@ -0,0 +1,228 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import re + +# this is a file's header part +CommonHead = ''' +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. + +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { +''' + +CommonTail = ''' +} // namespace phi + +''' +DispatchGemmConfigInstanceDeclare = """ +template<> +void generic_mixed_gemm_kernelLauncher_template<{T}, + {WeightType}, + {arch}, + {EpilogueTag}, + {ThreadblockShape}, + {WarpShape}, + {Stages}>( + const {T}* A, + const {WeightType}* B, + const {T}* weight_scales, + const {T}* biases, + {T}* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<{T}, + {WeightType}, + {arch}, + {EpilogueTag}, + {ThreadblockShape}, + {WarpShape}, + {Stages}>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} +""" + +DefineHeader = """ +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
+ +""" + +DefaultArch = [70, 75, 80] +epilogue_tags = ["bias", "biasFtGelu", "biasReLU", "noBias"] + +WeightTypes = ["uint8_t", "cutlass::uint4b_t"] +ThreadblockShapes = [ + "cutlass::gemm::GemmShape<16, 128, 64>", + "cutlass::gemm::GemmShape<32, 128, 64>", + "cutlass::gemm::GemmShape<64, 128, 64>", + "cutlass::gemm::GemmShape<128, 128, 64>", + "cutlass::gemm::GemmShape<128, 256, 64>", +] +WarpShapes = [ + "cutlass::gemm::GemmShape<16, 32, 64>", + "cutlass::gemm::GemmShape<32, 32, 64>", + "cutlass::gemm::GemmShape<64, 64, 64>", + "cutlass::gemm::GemmShape<64, 64, 64>", + "cutlass::gemm::GemmShape<64, 64, 64>", +] + +ThreadblockShapes_sm70 = [ + "cutlass::gemm::GemmShape<32, 128, 64>", + "cutlass::gemm::GemmShape<64, 128, 64>", +] +WarpShapes_sm70 = [ + "cutlass::gemm::GemmShape<32, 32, 64>", + "cutlass::gemm::GemmShape<64, 64, 64>", +] +StagesList = {70: [2], 75: [2], 80: [2, 3, 4, 5]} + +ElementTypes = {"fp16": "half", "bf16": "__nv_bfloat16"} +Archs = { + 70: "cutlass::arch::Sm70", + 75: "cutlass::arch::Sm75", + 80: "cutlass::arch::Sm80", +} +EpilogueTags = { + "bias": "EpilogueOpBias", + "noBias": "EpilogueOpNoBias", + # "biasFtGelu": "EpilogueOpBiasFtGelu", + # "biasReLU": "EpilogueOpBiasReLU", +} + + +def SubstituteTemplate(template, values): + text = template + changed = True + while changed: + changed = False + for key, value in values.items(): + regex = "\\{%s\\}" % key + newtext = re.sub(regex, value, text) + if newtext != text: + changed = True + text = newtext + return text + + +def find_arch_range(archs): + compile_archs = [] + for arch in archs: + if arch >= 70 and arch < 75: + compile_archs.append(70) + elif arch >= 75 and arch < 80: + compile_archs.append(75) + elif arch >= 80 and arch < 90: + compile_archs.append(80) + compile_archs = list(set(compile_archs)) + compile_archs.sort() + return compile_archs + + +def convert_to_arch_list(archs): + archs = archs.lower().strip() + if archs == "all": + return DefaultArch + + archs = [int(s.strip()) for s in archs.split(';') if s.strip()] + archs = list(set(archs)) + return find_arch_range(archs) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="The argument for generating the generic_mixed_gemm_kernelLauncher instance." 
+ ) + parser.add_argument( + "--cuda_arch", + type=convert_to_arch_list, + default=convert_to_arch_list("All"), + help="The CUDA architecture to be generated.", + ) + args = parser.parse_args() + return args + + +# generate source cu +def generate_source_cu( + element_type: str, arch: int, epilogue_tag: str, stages: int +): + all_code = CommonHead + ThreadblockShapes_arch = ThreadblockShapes + WarpShapes_arch = WarpShapes + if arch < 80: + ThreadblockShapes_arch = ThreadblockShapes_sm70 + WarpShapes_arch = WarpShapes_sm70 + for WeightType in WeightTypes: + for i in range(len(ThreadblockShapes_arch)): + value_dict = { + "T": ElementTypes[element_type], + "WeightType": WeightType, + "arch": Archs[arch], + "EpilogueTag": EpilogueTags[epilogue_tag], + "ThreadblockShape": ThreadblockShapes_arch[i], + "WarpShape": WarpShapes_arch[i], + "Stages": str(stages), + } + all_code += SubstituteTemplate( + DispatchGemmConfigInstanceDeclare, value_dict + ) + all_code += CommonTail + return all_code + + +if __name__ == "__main__": + args = parse_args() + archs = args.cuda_arch + header_all = DefineHeader + header_name = "autogen_tmp/arch_define.h" + if archs: + for arch in archs: + define_line = "#define USE_FPAINTB_GEMM_WITH_SM%s\n" % str(arch) + header_all += define_line + with open(header_name, "w") as f: + f.write(header_all) + f.close() + if archs: + for element_type in ElementTypes.keys(): + for arch in archs: + for epilogue_tag in EpilogueTags.keys(): + for stages in StagesList[arch]: + file_name = "autogen_tmp/generic_mixed_gemm_kernelLauncher_{}_sm{}_stages{}_{}.cu".format( + element_type, arch, stages, epilogue_tag + ) + all_code = generate_source_cu( + element_type, arch, epilogue_tag, stages + ) + with open(file_name, "w") as f: + f.write(all_code) + f.close() \ No newline at end of file From aaaa9e05948c6286dc55659d025ecc2686d9017a Mon Sep 17 00:00:00 2001 From: PommesPeter Date: Tue, 19 Dec 2023 14:01:12 +0800 Subject: [PATCH 09/15] :zap: Fix: fixed code format --- .../cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h | 2 +- .../cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu | 2 +- .../cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h | 2 +- .../fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h index f86ccba9d4617..15c5267ae0f9d 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h @@ -155,4 +155,4 @@ class CutlassFpAIntBGemmRunner { int getWorkspaceSize(const int m, const int n, const int k); }; -} // namespace phi \ No newline at end of file +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu index 869b4689a3ac0..2f566d4dbc35e 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu @@ -678,4 +678,4 @@ template class CutlassFpAIntBGemmRunner; #ifdef PADDLE_CUDA_BF16 template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t>; #endif -} // namespace phi \ No newline at end of file +} // namespace phi diff --git 
a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h index 95f681655c7ea..8ae1047c43afc 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -516,4 +516,4 @@ void dispatch_gemm_to_cutlass(const T* A, cudaStream_t stream, int* occupancy); -} // namespace phi \ No newline at end of file +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py index 12bb74fb04e8c..ad7f1e65591ce 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py @@ -225,4 +225,4 @@ def generate_source_cu( ) with open(file_name, "w") as f: f.write(all_code) - f.close() \ No newline at end of file + f.close() From 64f9e5989a14ba2541012a2d1ec8eabad8bcf6de Mon Sep 17 00:00:00 2001 From: PommesPeter Date: Tue, 19 Dec 2023 14:21:06 +0800 Subject: [PATCH 10/15] :pencil2: Fix: fixed code format --- test/legacy_test/test_inplace.py | 4 +++- third_party/flashattn | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py index 746786f9367c6..23a2deac8a5f7 100644 --- a/test/legacy_test/test_inplace.py +++ b/test/legacy_test/test_inplace.py @@ -1712,7 +1712,9 @@ def inplace_api_processing(self, var): ) def non_inplace_api_processing(self, var): - return paddle.log_normal(var, self.index, self.axis, self.value) + return paddle.log_normal( + var, self.shape, self.mean, self.std, self.seed + ) if __name__ == '__main__': diff --git a/third_party/flashattn b/third_party/flashattn index b74460b385b69..a96f802471445 160000 --- a/third_party/flashattn +++ b/third_party/flashattn @@ -1 +1 @@ -Subproject commit b74460b385b691d881ff2d3a1adbcefdcac574a3 +Subproject commit a96f8024714455fb86a326e20c3b7f700ec50772 From 428ec01696cd24713a5e0fab4ccfc29c9c784db6 Mon Sep 17 00:00:00 2001 From: PommesPeter Date: Tue, 19 Dec 2023 16:38:56 +0800 Subject: [PATCH 11/15] fix: fixed code format --- python/paddle/tensor/random.py | 95 ++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 21 deletions(-) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 22d5bfcf3c527..5f9b76d0bef92 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1532,7 +1532,7 @@ def bernoulli_(x, p=0.5, name=None): >>> # doctest: -SKIP """ - if 0 <= p and p <= 1: + if not (0 <= p and p <= 1): raise ValueError(f"bernoulli_ expects p to be in [0, 1], but got p={p}") check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential") @@ -1541,7 +1541,7 @@ def bernoulli_(x, p=0.5, name=None): return x.set_value((x < p).astype(x.dtype)) -def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None): +def log_normal(mean=1.0, std=1.0, shape=None, dtype=None, name=None): r""" Returns a Tensor filled with random values sampled from a Log Normal Distribution, with ``mean``, ``std``, ``shape`` and ``dtype``. 
@@ -1594,6 +1594,7 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None): :name: log_normal-example-3 >>> import paddle + >>> mean_tensor = paddle.to_tensor([1.0, 2.0, 3.0]) >>> std_tensor = paddle.to_tensor([1.0, 2.0, 3.0]) >>> out3 = paddle.log_normal(mean=mean_tensor, std=std_tensor) >>> print(out3) @@ -1603,31 +1604,78 @@ def log_normal(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None): >>> # doctest: -SKIP """ - op_type_for_check = 'gaussian/standard_normal/randn/normal' - supported_dtypes = ['float32', 'float64', 'float16', 'uint16', 'bfloat16'] - - if dtype is None: - dtype = paddle.framework.get_default_dtype() - if dtype not in supported_dtypes: - raise TypeError( - "{} only supports {}, but the default dtype is {}".format( - op_type_for_check, supported_dtypes, dtype - ) + if not in_dynamic_or_pir_mode(): + check_type(mean, 'mean', (list, tuple, Variable), 'log_normal') + check_type(std, 'std', (list, tuple, Variable), 'log_normal') + if isinstance(mean, Variable): + check_dtype( + mean.dtype, + 'mean', + ['float32', 'float64'], + 'log_normal', + "If mean is a Tensor, it's data type only support float32, float64", ) - if not isinstance(dtype, core.VarDesc.VarType): - dtype = convert_np_dtype_to_dtype_(dtype) + if isinstance(std, Variable): + check_dtype( + std.dtype, + 'std', + ['float16', 'float32', 'float64'], + 'log_normal', + "If std is a Tensor, it's data type only support float32, float64", + ) + if shape is not None: + check_shape(shape, 'log_normal') - n_mean = paddle.log(mean**2 / paddle.sqrt(mean**2 + std**2)) - n_std = paddle.sqrt(paddle.log(1 + (std**2 / mean**2))) + def normalize_mean_std(mean, std): + n_mean = paddle.log(mean**2 / paddle.sqrt(mean**2 + std**2)) + n_std = paddle.sqrt(paddle.log(1 + (std**2 / mean**2))) + return n_mean, n_std + + if isinstance(mean, Variable): + check_dtype( + mean.dtype, + 'mean', + ['float16', 'float32', 'float64'], + 'log_normal', + "If mean is a Tensor, it's data type only support float32, float64", + ) + if isinstance(std, Variable): + check_dtype( + std.dtype, + 'std', + ['float16', 'float32', 'float64'], + 'log_normal', + "If std is a Tensor, it's data type only support float32, float64", + ) + if std.dtype != mean.dtype: + std = paddle.cast(std, mean.dtype) + mean_shape = paddle.shape(mean) + std = paddle.reshape(std, mean_shape) + else: + std = paddle.to_tensor(std) + n_mean, n_std = normalize_mean_std(mean, std) + distribution = normal( + shape=paddle.shape(mean), mean=n_mean, std=n_std, name=name + ) + elif isinstance(std, Variable): + mean = paddle.to_tensor(mean) + n_mean, n_std = normalize_mean_std(mean, std) + distribution = normal( + shape=paddle.shape(std), mean=n_mean, std=n_std, name=name + ) + else: + mean = paddle.to_tensor(mean) + std = paddle.to_tensor(std) + n_mean, n_std = normalize_mean_std(mean, std) + distribution = normal( + mean=n_mean, std=n_std, shape=shape, name=name + ) - distribution = gaussian( - shape, mean=n_mean, std=n_std, seed=seed, dtype=dtype - ) return paddle.exp(distribution) @dygraph_only -def log_normal_(x, mean=0.0, std=1.0, seed=0, name=None): +def log_normal_(x, mean=0.0, std=1.0, name=None): r""" This inplace OP fill input Tensor ``x`` with random number from a Log Normal Distribution with ``mean`` and ``std``. 
The Log Normal Distribution is defined as follows: @@ -1667,7 +1715,12 @@ def log_normal_(x, mean=0.0, std=1.0, seed=0, name=None): >>> # doctest: -SKIP """ + if not isinstance(mean, Variable): + mean = paddle.to_tensor(mean) + if not isinstance(std, Variable): + std = paddle.to_tensor(std) + n_mean = paddle.log(mean**2 / paddle.sqrt(mean**2 + std**2)) n_std = paddle.sqrt(paddle.log(1 + (std**2 / mean**2))) - return gaussian_(x, mean=n_mean, std=n_std, seed=seed).exp_() + return normal_(x, mean=n_mean, std=n_std).exp_() From f6a855d87a8de2ab92b8dd66715dcc629e356685 Mon Sep 17 00:00:00 2001 From: PommesPeter Date: Wed, 20 Dec 2023 15:23:50 +0800 Subject: [PATCH 12/15] :art: Formated code style --- python/paddle/tensor/random.py | 16 +++++++--------- test/legacy_test/test_inplace.py | 4 ++-- test/legacy_test/test_log_normal_op.py | 24 +++++------------------- 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 5f9b76d0bef92..dd293b85ca0c8 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1633,12 +1633,12 @@ def normalize_mean_std(mean, std): if isinstance(mean, Variable): check_dtype( - mean.dtype, - 'mean', - ['float16', 'float32', 'float64'], - 'log_normal', - "If mean is a Tensor, it's data type only support float32, float64", - ) + mean.dtype, + 'mean', + ['float16', 'float32', 'float64'], + 'log_normal', + "If mean is a Tensor, it's data type only support float32, float64", + ) if isinstance(std, Variable): check_dtype( std.dtype, @@ -1667,9 +1667,7 @@ def normalize_mean_std(mean, std): mean = paddle.to_tensor(mean) std = paddle.to_tensor(std) n_mean, n_std = normalize_mean_std(mean, std) - distribution = normal( - mean=n_mean, std=n_std, shape=shape, name=name - ) + distribution = normal(mean=n_mean, std=n_std, shape=shape, name=name) return paddle.exp(distribution) diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py index 23a2deac8a5f7..5227d425a0428 100644 --- a/test/legacy_test/test_inplace.py +++ b/test/legacy_test/test_inplace.py @@ -1683,7 +1683,7 @@ def test_backward_error(self): class TestDygraphInplaceBernoulli(TestDygraphInplace): def init_data(self): self.shape = (20, 40) - self.x = np.random.random(self.shape) + self.input_var_numpy = np.random.random(self.shape) self.dtype = "float32" self.mean = 0 self.std = 1 @@ -1700,7 +1700,7 @@ def non_inplace_api_processing(self, var): class TestDygraphInplaceLogNormal(TestDygraphInplace): def init_data(self): self.shape = (20, 40) - self.x = np.random.random(self.shape) + self.input_var_numpy = np.random.random(self.shape) self.dtype = "float32" self.mean = 0 self.std = 1 diff --git a/test/legacy_test/test_log_normal_op.py b/test/legacy_test/test_log_normal_op.py index cac82a5a77998..474c1d2daafdc 100644 --- a/test/legacy_test/test_log_normal_op.py +++ b/test/legacy_test/test_log_normal_op.py @@ -46,7 +46,7 @@ def test_static_api(self): ) out_1 = random.log_normal( - shape=[2000, 500], dtype="float32", mean=0.0, std=1.0, seed=10 + shape=[2000, 500], dtype="float32", mean=0.0, std=1.0 ) out_2 = random.log_normal( @@ -54,7 +54,6 @@ def test_static_api(self): dtype="float32", mean=0.0, std=1.0, - seed=10, ) out_3 = random.log_normal( @@ -62,31 +61,18 @@ def test_static_api(self): dtype="float32", mean=0.0, std=1.0, - seed=10, ) out_4 = random.log_normal( - shape=shape_tensor_int32, - dtype="float32", - mean=0.0, - std=1.0, - seed=10, + shape=shape_tensor_int32, dtype="float32", mean=0.0, 
            )

            out_5 = random.log_normal(
-                shape=shape_tensor_int64,
-                dtype="float32",
-                mean=0.0,
-                std=1.0,
-                seed=10,
+                shape=shape_tensor_int64, dtype="float32", mean=0.0, std=1.0
            )

            out_6 = random.log_normal(
-                shape=shape_tensor_int64,
-                dtype=np.float32,
-                mean=0.0,
-                std=1.0,
-                seed=10,
+                shape=shape_tensor_int64, dtype=np.float32, mean=0.0, std=1.0
            )

            exe = base.Executor(place=base.CPUPlace())
@@ -181,7 +167,7 @@ def _check_random_value(dtype, expect, expect_mean, expect_std):
    print("Test Fixed Random number on V100 GPU------>")
    paddle.disable_static()
    paddle.set_device('gpu')
-    paddle.seed(2021)
+    paddle.seed(100)
    expect = [
        -0.79037829,
        -0.54411126,

From b1532e071096302d43636ee1fd1e9a03a008484b Mon Sep 17 00:00:00 2001
From: PommesPeter
Date: Wed, 20 Dec 2023 17:24:31 +0800
Subject: [PATCH 13/15] :bug: Fix: fixed log_normal_

---
 python/paddle/tensor/random.py   | 24 +++++++++++++++++-------
 test/legacy_test/test_inplace.py |  8 ++------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index dd293b85ca0c8..a2c77e455fa06 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -1654,20 +1654,30 @@ def normalize_mean_std(mean, std):
         else:
             std = paddle.to_tensor(std)
         n_mean, n_std = normalize_mean_std(mean, std)
-        distribution = normal(
-            shape=paddle.shape(mean), mean=n_mean, std=n_std, name=name
+        distribution = gaussian(
+            shape=paddle.shape(mean),
+            mean=n_mean,
+            std=n_std,
+            dtype=dtype,
+            name=name,
         )
     elif isinstance(std, Variable):
         mean = paddle.to_tensor(mean)
         n_mean, n_std = normalize_mean_std(mean, std)
-        distribution = normal(
-            shape=paddle.shape(std), mean=n_mean, std=n_std, name=name
+        distribution = gaussian(
+            shape=paddle.shape(std),
+            mean=n_mean,
+            std=n_std,
+            dtype=dtype,
+            name=name,
         )
     else:
         mean = paddle.to_tensor(mean)
         std = paddle.to_tensor(std)
         n_mean, n_std = normalize_mean_std(mean, std)
-        distribution = normal(mean=n_mean, std=n_std, shape=shape, name=name)
+        distribution = gaussian(
+            mean=n_mean, std=n_std, shape=shape, dtype=dtype, name=name
+        )

     return paddle.exp(distribution)

@@ -1713,9 +1723,9 @@ def log_normal_(x, mean=0.0, std=1.0, name=None):
            >>> # doctest: -SKIP

    """
-    if not isinstance(mean, Variable):
+    if not isinstance(mean, Variable) or not isinstance(mean, float):
         mean = paddle.to_tensor(mean)
-    if not isinstance(std, Variable):
+    if not isinstance(std, Variable) or not isinstance(std, float):
         std = paddle.to_tensor(std)

     n_mean = paddle.log(mean**2 / paddle.sqrt(mean**2 + std**2))
     n_std = paddle.sqrt(paddle.log(1 + (std**2 / mean**2)))
diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py
index 5227d425a0428..7b3aeb008a00b 100644
--- a/test/legacy_test/test_inplace.py
+++ b/test/legacy_test/test_inplace.py
@@ -1707,14 +1707,10 @@ def init_data(self):
         self.seed = 100

     def inplace_api_processing(self, var):
-        return paddle.log_normal_(
-            var, self.shape, self.mean, self.std, self.seed
-        )
+        return paddle.log_normal_(var, self.shape, self.mean, self.std)

     def non_inplace_api_processing(self, var):
-        return paddle.log_normal(
-            var, self.shape, self.mean, self.std, self.seed
-        )
+        return paddle.log_normal(var, self.shape, self.mean, self.std)


 if __name__ == '__main__':
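
As a quick plausibility check of the moment matching that the preceding patches rely on, a standalone NumPy
sketch can reproduce the same transformation and confirm the sample statistics (illustrative only; the helper
name ``moment_matched_params`` is hypothetical and does not appear in the patch):

    import numpy as np

    def moment_matched_params(m, s):
        # Invert the log normal moment identities: given a target sample
        # mean m and standard deviation s, return (mu, sigma) for the
        # underlying normal distribution.
        mu = np.log(m**2 / np.sqrt(m**2 + s**2))
        sigma = np.sqrt(np.log(1.0 + s**2 / m**2))
        return mu, sigma

    rng = np.random.default_rng(2023)
    mu, sigma = moment_matched_params(2.0, 1.0)
    samples = np.exp(rng.normal(mu, sigma, size=1_000_000))
    print(samples.mean(), samples.std())  # should be close to 2.0 and 1.0
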
From 4cb7630ce6736367ad1d7de54602fc67189f50bf Mon Sep 17 00:00:00 2001
From: PommesPeter
Date: Wed, 20 Dec 2023 23:28:14 +0800
Subject: [PATCH 14/15] :art: Fix: fixed log_normal_

---
 test/legacy_test/test_log_normal_op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/legacy_test/test_log_normal_op.py b/test/legacy_test/test_log_normal_op.py
index 474c1d2daafdc..dc5eace984f37 100644
--- a/test/legacy_test/test_log_normal_op.py
+++ b/test/legacy_test/test_log_normal_op.py
@@ -27,7 +27,7 @@
 paddle.seed(SEED)


-def output_log_normal(shape, mean, std):
+def ref_log_normal(shape, mean, std):
     return np.exp(np.random.normal(mean, std, shape))


From 3d3dea252d7e94ee1a8bf3160a549bfae6bdcca2 Mon Sep 17 00:00:00 2001
From: PommesPeter
Date: Fri, 22 Dec 2023 11:40:48 +0800
Subject: [PATCH 15/15] :pencil2: Fix: fixed log_normal_

---
 python/paddle/tensor/random.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index a2c77e455fa06..e55003a19a786 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -1724,9 +1724,9 @@ def log_normal_(x, mean=0.0, std=1.0, name=None):

    """
     if not isinstance(mean, Variable) or not isinstance(mean, float):
-        mean = paddle.to_tensor(mean)
+        mean = paddle.to_tensor(mean, dtype=paddle.float64)
     if not isinstance(std, Variable) or not isinstance(std, float):
-        std = paddle.to_tensor(std)
+        std = paddle.to_tensor(std, dtype=paddle.float64)

     n_mean = paddle.log(mean**2 / paddle.sqrt(mean**2 + std**2))
     n_std = paddle.sqrt(paddle.log(1 + (std**2 / mean**2)))