@@ -132,19 +132,22 @@ class CollectiveEpilogue<
// NOTE: this is a diff view. The leading digits on each line are the old/new
// line-number gutter; a `-` line is the pre-change form and a `+` line is the
// post-change form of the same statement.
//
// Shape used for the thread layout of the tiled copies below: (_1, SubgroupSize).
132132 using CopyThreadShape = Shape<_1, Int<SubgroupSize>>;
133133
// Copy traits instantiated with GmemTiledCopyD and InternalStrideD.
134134 using Trait_D = Copy_Traits<GmemTiledCopyD, InternalStrideD>;
// Added by this change: gives a name to the value layout, i.e. the layout made
// from shape_div(Trait_D::BlockShape, CopyThreadShape), so the same type can be
// referred to again elsewhere instead of repeating the expression.
135+ using val_layout_store_D = decltype (make_layout(shape_div(typename Trait_D::BlockShape{}, CopyThreadShape{})));
// Tiled-copy type for D: built by make_tiled_copy from the Copy_Atom over
// Trait_D/ElementD, a Layout of CopyThreadShape, and the value layout.
// The removed (-) line spelled the value layout inline; the added (+) line
// passes the named alias, which is the decltype of that same expression.
135136 using XE_Copy_D = decltype (make_tiled_copy(Copy_Atom<Trait_D, ElementD>{},
136137 Layout<CopyThreadShape>{},
137- make_layout (shape_div( typename Trait_D::BlockShape{}, CopyThreadShape{})) ));
138+ val_layout_store_D{} ));
138139private:
// True when ElementC is not void.
139140 constexpr static bool is_source_supported = not cute::is_void_v<ElementC>;
// True when neither ElementD nor CopyOpR2G is void.
140141 constexpr static bool is_destination_supported = not cute::is_void_v<ElementD> && not cute::is_void_v<CopyOpR2G>;
141142
// Element type used for the C copy: ElementC when a source is supported,
// otherwise ElementD is substituted so the aliases below still name a type.
142143 using NonVoidElementC = conditional_t <is_source_supported, ElementC, ElementD>;
// Copy traits instantiated with GmemTiledCopyC and InternalStrideC.
143144 using Trait_C = Copy_Traits<GmemTiledCopyC, InternalStrideC>;
// Traits used for the C copy: Trait_C when a source is supported, else Trait_D.
144145 using NonVoidTrait_C = conditional_t <is_source_supported, Trait_C, Trait_D>;
// Added by this change: value layout for the C copy, made from
// shape_div(NonVoidTrait_C::BlockShape, CopyThreadShape).
146+ using val_layout_load_C = decltype (make_layout(shape_div(typename NonVoidTrait_C::BlockShape{}, CopyThreadShape{})));
// Added by this change: selects val_layout_load_C when a source is supported,
// else val_layout_store_D.
// NOTE(review): val_layout_load_C is already computed from NonVoidTrait_C,
// which is Trait_D when is_source_supported is false, so in that case both
// branches look like the same type and this conditional appears redundant —
// confirm before simplifying.
147+ using NonVoidValLayoutLoad_C = conditional_t <is_source_supported, val_layout_load_C, val_layout_store_D>;
// Tiled-copy type for C: built by make_tiled_copy from the Copy_Atom over
// NonVoidTrait_C/NonVoidElementC, a Layout of CopyThreadShape, and the value
// layout. The removed (-) line spelled the value layout inline; the added (+)
// line passes the NonVoidValLayoutLoad_C alias instead.
145148 using XE_Copy_C = decltype (make_tiled_copy(Copy_Atom<NonVoidTrait_C, NonVoidElementC>{},
146149 Layout<CopyThreadShape>{},
147- make_layout (shape_div( typename NonVoidTrait_C::BlockShape{}, CopyThreadShape{})) ));
150+ NonVoidValLayoutLoad_C{} ));
148151public:
149152
// Alias for a cute::tuple with no elements.
150153 using EmptyType = cute::tuple<>;
0 commit comments