From a47738f0b1a0e0669d1bcc046d018f25c61afb65 Mon Sep 17 00:00:00 2001 From: yucai Date: Wed, 4 Jun 2025 02:05:20 -0700 Subject: [PATCH 1/6] add dummy gamma --- src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 935ab99f7..1c09ad48d 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -1418,7 +1418,18 @@ void group_norm_backward_kernel_impl( .build(); gpu_kernel(iter, GroupNormBackwardC1Functor()); } + else { + Tensor dummy_gamma= at::ones({1, G, D}, X.options().dtype(kAccType)); + auto iter = TensorIteratorConfig() + .check_all_same_dtype(std::is_same::value) + .add_output(c1) + .add_owned_const_input(rstd.view({N, G, 1})) + .add_owned_const_input(dummy_gamma.view({1, G, D})) + .build(); + gpu_kernel(iter, GroupNormBackwardC1Functor()); + } + std::cout << "ComputeBackwardFusedParamsFunctor---" << std::endl; wg_size = (C / G) < get_group_reduce_group_size(simd) ? simd : get_group_reduce_group_size(simd); @@ -1440,7 +1451,8 @@ void group_norm_backward_kernel_impl( c2_data, c3_data); - if (gamma.defined()) { + // if (gamma.defined()) { + std::cout << "gamma.defined()---" << std::endl; auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .resize_outputs(false) @@ -1452,24 +1464,12 @@ void group_norm_backward_kernel_impl( .add_owned_const_input(c3.view({N * G, 1, 1})) .build(); gpu_kernel(iter, GroupNormBackwardDXFunctor()); - } else { - auto iter = TensorIteratorConfig() - .check_all_same_dtype(std::is_same::value) - .resize_outputs(false) - .add_owned_output(dX.view({N * G, D * HxW})) - .add_owned_const_input(dY.view({N * G, D * HxW})) - .add_owned_const_input(X.view({N * G, D * HxW})) - .add_owned_const_input(rstd.view({N * G, 1})) - .add_owned_const_input(c2.view({N * G, 1})) - .add_owned_const_input(c3.view({N * G, 1})) - .build(); - gpu_kernel(iter, GroupNormBackwardDXFunctor()); - } } if (dgamma.defined() || dbeta.defined()) { T* dgamma_data = dgamma.defined() ? dgamma.mutable_data_ptr() : nullptr; T* dbeta_data = dbeta.defined() ? dbeta.mutable_data_ptr() : nullptr; + std::cout << "dgamma.defined() || dbeta.defined()---" << std::endl; if (N <= 128) { // For small batch size, do colwise reduce directly. auto caller = GammaBetaBackwardPlainFunctor( From ff893a0e828f0226760d6c0c29bf2736da9413dd Mon Sep 17 00:00:00 2001 From: yucai Date: Wed, 4 Jun 2025 02:57:56 -0700 Subject: [PATCH 2/6] revise --- src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 1c09ad48d..9100d0aea 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -1406,6 +1407,7 @@ void group_norm_backward_kernel_impl( Tensor c1 = at::empty({0}, X.options().dtype(kAccType)); Tensor c2 = at::empty({N, G}, X.options().dtype(kAccType)); Tensor c3 = at::empty({N, G}, X.options().dtype(kAccType)); + Tensor dummy_gamma = at::empty({1, G, D}, X.options().dtype(kAccType)); T_ACC* c2_data = c2.mutable_data_ptr(); T_ACC* c3_data = c3.mutable_data_ptr(); @@ -1417,9 +1419,7 @@ void group_norm_backward_kernel_impl( .add_owned_const_input(gamma.view({1, G, D})) .build(); gpu_kernel(iter, GroupNormBackwardC1Functor()); - } - else { - Tensor dummy_gamma= at::ones({1, G, D}, X.options().dtype(kAccType)); + } else { auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .add_output(c1) @@ -1452,18 +1452,18 @@ void group_norm_backward_kernel_impl( c3_data); // if (gamma.defined()) { - std::cout << "gamma.defined()---" << std::endl; - auto iter = TensorIteratorConfig() - .check_all_same_dtype(std::is_same::value) - .resize_outputs(false) - .add_owned_output(dX.view({N * G, D, HxW})) - .add_owned_const_input(dY.view({N * G, D, HxW})) - .add_owned_const_input(X.view({N * G, D, HxW})) - .add_owned_const_input(c1.view({N * G, D, 1})) - .add_owned_const_input(c2.view({N * G, 1, 1})) - .add_owned_const_input(c3.view({N * G, 1, 1})) - .build(); - gpu_kernel(iter, GroupNormBackwardDXFunctor()); + std::cout << "gamma.defined()---" << std::endl; + auto iter = TensorIteratorConfig() + .check_all_same_dtype(std::is_same::value) + .resize_outputs(false) + .add_owned_output(dX.view({N * G, D, HxW})) + .add_owned_const_input(dY.view({N * G, D, HxW})) + .add_owned_const_input(X.view({N * G, D, HxW})) + .add_owned_const_input(c1.view({N * G, D, 1})) + .add_owned_const_input(c2.view({N * G, 1, 1})) + .add_owned_const_input(c3.view({N * G, 1, 1})) + .build(); + gpu_kernel(iter, GroupNormBackwardDXFunctor()); } if (dgamma.defined() || dbeta.defined()) { From 21e37fd6e2605a1c353b69fc915bc3d25046a0bf Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Mon, 9 Jun 2025 10:50:52 +0800 Subject: [PATCH 3/6] Update GroupNormKernels.cpp --- src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 9100d0aea..1e23bee0a 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -1407,7 +1407,8 @@ void group_norm_backward_kernel_impl( Tensor c1 = at::empty({0}, X.options().dtype(kAccType)); Tensor c2 = at::empty({N, G}, X.options().dtype(kAccType)); Tensor c3 = at::empty({N, G}, X.options().dtype(kAccType)); - Tensor dummy_gamma = at::empty({1, G, D}, X.options().dtype(kAccType)); + Tensor dummy_gamma = at::ones_like( + gamma, X.options().dtype(kAccType), at::MemoryFormat::Preserve); T_ACC* c2_data = c2.mutable_data_ptr(); T_ACC* c3_data = c3.mutable_data_ptr(); @@ -1429,7 +1430,7 @@ void group_norm_backward_kernel_impl( gpu_kernel(iter, GroupNormBackwardC1Functor()); } - std::cout << "ComputeBackwardFusedParamsFunctor---" << std::endl; + // std::cout << "ComputeBackwardFusedParamsFunctor---" << std::endl; wg_size = (C / G) < get_group_reduce_group_size(simd) ? simd : get_group_reduce_group_size(simd); @@ -1451,8 +1452,7 @@ void group_norm_backward_kernel_impl( c2_data, c3_data); - // if (gamma.defined()) { - std::cout << "gamma.defined()---" << std::endl; + // std::cout << "gamma.defined()---" << std::endl; auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .resize_outputs(false) @@ -1469,7 +1469,7 @@ void group_norm_backward_kernel_impl( if (dgamma.defined() || dbeta.defined()) { T* dgamma_data = dgamma.defined() ? dgamma.mutable_data_ptr() : nullptr; T* dbeta_data = dbeta.defined() ? dbeta.mutable_data_ptr() : nullptr; - std::cout << "dgamma.defined() || dbeta.defined()---" << std::endl; + // std::cout << "dgamma.defined() || dbeta.defined()---" << std::endl; if (N <= 128) { // For small batch size, do colwise reduce directly. auto caller = GammaBetaBackwardPlainFunctor( From dfd8aa9274ad9053011512b6b27b7b7702e608ad Mon Sep 17 00:00:00 2001 From: yucai Date: Mon, 9 Jun 2025 19:30:59 -0700 Subject: [PATCH 4/6] fix err --- src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 1e23bee0a..41799d76b 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -3,10 +3,11 @@ #include #include #include +#include #include #include #include -#include +#include #include #include #include @@ -1407,12 +1408,12 @@ void group_norm_backward_kernel_impl( Tensor c1 = at::empty({0}, X.options().dtype(kAccType)); Tensor c2 = at::empty({N, G}, X.options().dtype(kAccType)); Tensor c3 = at::empty({N, G}, X.options().dtype(kAccType)); - Tensor dummy_gamma = at::ones_like( - gamma, X.options().dtype(kAccType), at::MemoryFormat::Preserve); T_ACC* c2_data = c2.mutable_data_ptr(); T_ACC* c3_data = c3.mutable_data_ptr(); - + Tensor dummy_gamma = at::ones({1, G, D}, X.options().dtype(kAccType)); + std::cout << "dummy---------" << dummy_gamma.device() << std::endl; if (gamma.defined()) { + std::cout << "gamma.defined()---" << std::endl; auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .add_output(c1) @@ -1421,6 +1422,7 @@ void group_norm_backward_kernel_impl( .build(); gpu_kernel(iter, GroupNormBackwardC1Functor()); } else { + std::cout << "gamma not defined---" << std::endl; auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .add_output(c1) @@ -1430,7 +1432,7 @@ void group_norm_backward_kernel_impl( gpu_kernel(iter, GroupNormBackwardC1Functor()); } - // std::cout << "ComputeBackwardFusedParamsFunctor---" << std::endl; + std::cout << "ComputeBackwardFusedParamsFunctor---" << std::endl; wg_size = (C / G) < get_group_reduce_group_size(simd) ? simd : get_group_reduce_group_size(simd); @@ -1452,7 +1454,6 @@ void group_norm_backward_kernel_impl( c2_data, c3_data); - // std::cout << "gamma.defined()---" << std::endl; auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .resize_outputs(false) @@ -1469,7 +1470,7 @@ void group_norm_backward_kernel_impl( if (dgamma.defined() || dbeta.defined()) { T* dgamma_data = dgamma.defined() ? dgamma.mutable_data_ptr() : nullptr; T* dbeta_data = dbeta.defined() ? dbeta.mutable_data_ptr() : nullptr; - // std::cout << "dgamma.defined() || dbeta.defined()---" << std::endl; + std::cout << "dgamma.defined() || dbeta.defined()---" << std::endl; if (N <= 128) { // For small batch size, do colwise reduce directly. auto caller = GammaBetaBackwardPlainFunctor( @@ -1484,6 +1485,8 @@ void group_norm_backward_kernel_impl( dbeta_data); sycl_kernel_submit(sycl::range<1>(C), queue, caller); } else { + std::cout << "dgamma not defined() && dbeta not defined()---" + << std::endl; // The algorithm for colwise reduction here is to accumulate each // (subgroup_size) cols to a (subgroup_size^2) tile and write the tile // to shared memory. Then do subgroup reduce for each col in the tile. From 1ca5264e4d9fc1d0fd20caedeef95c87381e0234 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 10 Jun 2025 10:32:02 +0800 Subject: [PATCH 5/6] Update GroupNormKernels.cpp --- src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 41799d76b..dd8b7aeb6 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -1411,9 +1410,7 @@ void group_norm_backward_kernel_impl( T_ACC* c2_data = c2.mutable_data_ptr(); T_ACC* c3_data = c3.mutable_data_ptr(); Tensor dummy_gamma = at::ones({1, G, D}, X.options().dtype(kAccType)); - std::cout << "dummy---------" << dummy_gamma.device() << std::endl; if (gamma.defined()) { - std::cout << "gamma.defined()---" << std::endl; auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .add_output(c1) @@ -1422,7 +1419,6 @@ void group_norm_backward_kernel_impl( .build(); gpu_kernel(iter, GroupNormBackwardC1Functor()); } else { - std::cout << "gamma not defined---" << std::endl; auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .add_output(c1) @@ -1432,7 +1428,6 @@ void group_norm_backward_kernel_impl( gpu_kernel(iter, GroupNormBackwardC1Functor()); } - std::cout << "ComputeBackwardFusedParamsFunctor---" << std::endl; wg_size = (C / G) < get_group_reduce_group_size(simd) ? simd : get_group_reduce_group_size(simd); From 5b5be4d7fb1ad63f22649c895a8ea9ccbe43f929 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Wed, 11 Jun 2025 10:23:59 +0800 Subject: [PATCH 6/6] Update GroupNormKernels.cpp --- src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index dd8b7aeb6..4038007e6 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -1465,7 +1465,6 @@ void group_norm_backward_kernel_impl( if (dgamma.defined() || dbeta.defined()) { T* dgamma_data = dgamma.defined() ? dgamma.mutable_data_ptr() : nullptr; T* dbeta_data = dbeta.defined() ? dbeta.mutable_data_ptr() : nullptr; - std::cout << "dgamma.defined() || dbeta.defined()---" << std::endl; if (N <= 128) { // For small batch size, do colwise reduce directly. auto caller = GammaBetaBackwardPlainFunctor( @@ -1480,8 +1479,6 @@ void group_norm_backward_kernel_impl( dbeta_data); sycl_kernel_submit(sycl::range<1>(C), queue, caller); } else { - std::cout << "dgamma not defined() && dbeta not defined()---" - << std::endl; // The algorithm for colwise reduction here is to accumulate each // (subgroup_size) cols to a (subgroup_size^2) tile and write the tile // to shared memory. Then do subgroup reduce for each col in the tile.