forked from NVIDIA/cutlass
Intel gpu backend gemm pipeline #89
Merged

aacostadiaz merged 36 commits into intel:sycl-develop from Jiaxingla:intel_gpu_backend_pipeline on Aug 2, 2024
Changes from all commits (36 commits):
07f36e5 apply patch of gemm pipeline (Jiaxingla)
d4cf3eb fix format of copyright (Jiaxingla)
665f9be replace the macro of cache flush and idx (Jiaxingla)
59c0ce4 auto format (Jiaxingla)
bdadf1e auto format (Jiaxingla)
9e23cd6 fix comments about prefetch (Jiaxingla)
60adb24 fix comments of enum and sycl macro (Jiaxingla)
1e3f855 update from tensor library repo (Jiaxingla)
8e951d1 fix format (Jiaxingla)
c92adb3 rm redundancy code (Jiaxingla)
6bdda75 resolve conflict (Jiaxingla)
3496593 revert the change of nv hpp (Jiaxingla)
69d5c2a Restore invalid changes (Jiaxingla)
962766b refine gemm interface will codeplay epilogue (Jiaxingla)
7739df6 fix the issue of batch gemm (Jiaxingla)
5b1f514 rm epilogue and revert gemm example (Jiaxingla)
f5e23e8 only keep code changes of gemm (Jiaxingla)
1c57c36 comments clean (Jiaxingla)
13ae1a1 rebase other examples (Jiaxingla)
fdb7244 rm vnni_matrix func (Jiaxingla)
d09da29 code clean (Jiaxingla)
5a3d227 define N-major tensor (Jiaxingla)
b50574a delete useless header (Jiaxingla)
2c6d1ba more comments (Jiaxingla)
c97ccd8 modify comments (Jiaxingla)
ede5c03 Update pvc_gemm (Jiaxingla)
f9aae6f Update mma_xe (Jiaxingla)
7878a7c more comments (Jiaxingla)
4c42645 code clean (Jiaxingla)
abbbe4f fix typo (Jiaxingla)
8e9a84f revert the change of copy_atom (Jiaxingla)
ea30c83 rename enum of LSC_LDCC (Jiaxingla)
043fbea fix typo (Jiaxingla)
abf38bd scope enums (Jiaxingla)
5193329 modify commment of copy (Jiaxingla)
b854995 remove useless copy (Jiaxingla)
```diff
@@ -55,24 +55,6 @@ static void fill_matrix(std::vector<T> &vector)
     return static_cast<T>( (rand() / double(RAND_MAX)) );
   });
 }
 
-template <typename T>
-static void vnni_matrix(
-    T* dst, const T* src,
-    int batch, int numRows, int numCols, int factor)
-{
-  for (int b = 0; b < batch; b++) {
-    for (int r = 0; r < numRows / factor; r++) {
-      for (int c = 0; c < numCols; c++) {
-        for (int k = 0; k < factor; k++) {
-          dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] =
-              src[((b * (numRows / factor) + r) * factor + k) * numCols + c];
-        }
-      }
-    }
-  }
-}
-
 using namespace cute;
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
```
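For context on the deletion above: `vnni_matrix` pre-packed the B operand on the host by interleaving each group of `factor` consecutive rows, producing the VNNI layout that, to my understanding, the Xe matrix hardware consumes for 16-bit operands. Later in this diff the example switches B's copy atom to `XE_2D_U16x16x16x2x2_V`, which apparently performs that transform during the load, making host-side packing redundant. Here is a standalone sketch of what the helper did (plain C++; the 4x4 matrix, integer elements, and `factor = 2` are illustrative choices, not from the PR):

```cpp
// Standalone sketch of the removed vnni_matrix() helper, applied to a toy
// 4x4 integer matrix with factor = 2. No CUTLASS dependencies.
#include <cstdio>
#include <vector>

template <typename T>
static void vnni_matrix(T* dst, const T* src,
                        int batch, int numRows, int numCols, int factor) {
  // Interleave each group of `factor` consecutive rows of src, column by
  // column, so that the `factor` elements of a column become adjacent in dst.
  for (int b = 0; b < batch; b++)
    for (int r = 0; r < numRows / factor; r++)
      for (int c = 0; c < numCols; c++)
        for (int k = 0; k < factor; k++)
          dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] =
              src[((b * (numRows / factor) + r) * factor + k) * numCols + c];
}

int main() {
  const int rows = 4, cols = 4, factor = 2;
  std::vector<int> src = { 0,  1,  2,  3,
                           4,  5,  6,  7,
                           8,  9, 10, 11,
                          12, 13, 14, 15};
  std::vector<int> dst(src.size());
  vnni_matrix(dst.data(), src.data(), /*batch=*/1, rows, cols, factor);
  // Each packed row holds cols * factor elements (one interleaved row pair).
  for (int r = 0; r < rows / factor; r++) {
    for (int c = 0; c < cols * factor; c++)
      std::printf("%3d", dst[r * cols * factor + c]);
    std::printf("\n");
  }
  return 0;
}
```

With `factor = 2`, the first row pair becomes `0 4 1 5 2 6 3 7`: the two elements of each column now sit adjacently in memory, which is exactly the packing the `_V` copy atom is expected to produce on the fly.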
```diff
@@ -89,7 +71,7 @@ struct Options {
   Options():
     help(false),
     error(false),
-    m(4096), n(4096), k(4096), l(1), iterations(100),
+    m(4096), n(4096), k(4096), l(1), iterations(20),
     alpha(1.f), beta(0.f)
   { }
 
@@ -108,7 +90,7 @@ struct Options {
     cmd.get_cmd_line_argument("l", l, 1);
     cmd.get_cmd_line_argument("alpha", alpha, 1.f);
     cmd.get_cmd_line_argument("beta", beta, 0.f);
-    cmd.get_cmd_line_argument("iterations", iterations, 100);
+    cmd.get_cmd_line_argument("iterations", iterations, 20);
   }
 
   /// Prints the usage statement.
```
```diff
@@ -170,7 +152,6 @@ struct ExampleRunner {
   cutlass::DeviceAllocation<ElementA> block_A;
   cutlass::DeviceAllocation<ElementB> block_B;
-  cutlass::DeviceAllocation<ElementB> block_B_vnni;
   cutlass::DeviceAllocation<ElementC> block_C;
   cutlass::DeviceAllocation<ElementOutput> block_D;
   cutlass::DeviceAllocation<ElementOutput> block_ref_D;
 
@@ -231,7 +212,6 @@ struct ExampleRunner {
   block_A.reset(M * K * L);
   block_B.reset(K * N * L);
-  block_B_vnni.reset(K * N * L);
   block_C.reset(M * N * L);
   block_D.reset(M * N * L);
   block_ref_D.reset(M * N * L);
```
```diff
@@ -247,11 +227,9 @@ struct ExampleRunner {
   fill_matrix(a);
   fill_matrix(b);
   fill_matrix(c);
-  vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2);
 
   syclcompat::memcpy(block_A.get(), a.data(), a.size() * sizeof(ElementA));
   syclcompat::memcpy(block_B.get(), b.data(), b.size() * sizeof(ElementB));
-  syclcompat::memcpy(block_B_vnni.get(), b_vnni.data(), b.size() * sizeof(ElementB));
   syclcompat::memcpy(block_C.get(), c.data(), c.size() * sizeof(ElementC));
   syclcompat::memcpy(block_D.get(), d.data(), d.size() * sizeof(ElementC));
 }
 
@@ -272,7 +250,7 @@ struct ExampleRunner {
   typename Gemm::GemmKernel::Arguments arguments{
     cutlass::gemm::GemmUniversalMode::kGemm,
     problem_size,
-    {block_A.get(), stride_A, block_B_vnni.get(), stride_B},
+    {block_A.get(), stride_A, block_B.get(), stride_B},
     {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
     hw_info
   };
```
```diff
@@ -362,14 +340,14 @@ int main(int argc, const char** argv)
   using LayoutD = cutlass::layout::RowMajor;
 
   using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N;
-  using GmemTiledCopyB = XE_2D_U16x16x16x2x1_LD_N;
+  using GmemTiledCopyB = XE_2D_U16x16x16x2x2_V;
 
   // Workgroup-level tile
-  using TileShape = Shape<_32, _256, _32>;
+  using TileShape = Shape<_256, _256, _32>;
 
-  using TiledMma = TiledMMA<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TN>,
+  using TiledMma = TiledMMA<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>,
                             Layout<Shape<_1,_1,_1>>,
                             Tile<_32,_64,_32>>; // Subgroup level-tile
 
   using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVCUnpredicated;
   using EpilogueDispatchPolicy = cutlass::epilogue::IntelPVCEpilogue;
```

A reviewer commented on the `GmemTiledCopyA` line: "adding some explanation for naming conventions of copy function".
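Two details in that last hunk deserve a note. First, the copy atom for B changes from a plain load (`_LD_N`) to `XE_2D_U16x16x16x2x2_V`. A plausible reading of the name, and it is only my assumption, is: 16-bit elements (`U16`), a 16x16 base block fetched as a 2x2 array of blocks, with the `_V` suffix marking a VNNI-transforming load. The reviewer's comment above asks for exactly this convention to be documented, so treat the reading as unconfirmed. Second, the workgroup tile grows from 32x256x32 to 256x256x32 while the subgroup-level tile stays at 32x64x32. A quick arithmetic check of what that implies (plain C++; the subgroup-count interpretation is an assumption based on `Tile<_32,_64,_32>`, not something the PR states):

```cpp
// Sanity-check the tile-shape change. Tile sizes are copied from the diff;
// the subgroups-per-workgroup interpretation is an assumption.
#include <cstdio>

int main() {
  // Workgroup-level tile (M, N): old Shape<_32,_256,_32> vs. new Shape<_256,_256,_32>
  const int wg_m_old = 32,  wg_n_old = 256;
  const int wg_m_new = 256, wg_n_new = 256;
  // Subgroup-level tile (M, N): Tile<_32,_64,_32>, unchanged by the PR
  const int sg_m = 32, sg_n = 64;

  std::printf("subgroups per workgroup (old): %d x %d = %d\n",
              wg_m_old / sg_m, wg_n_old / sg_n,
              (wg_m_old / sg_m) * (wg_n_old / sg_n));  // 1 x 4 = 4
  std::printf("subgroups per workgroup (new): %d x %d = %d\n",
              wg_m_new / sg_m, wg_n_new / sg_n,
              (wg_m_new / sg_m) * (wg_n_new / sg_n));  // 8 x 4 = 32
  return 0;
}
```

If that interpretation holds, each workgroup now runs 32 subgroups instead of 4, an eightfold increase that tracks the eightfold growth of the tile's M dimension.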