From 061f58f240a2fe1ffc903e3027d18a9578c13082 Mon Sep 17 00:00:00 2001 From: Jussi Enkovaara Date: Wed, 27 Nov 2024 08:48:13 +0200 Subject: [PATCH 1/3] Fence needed for sensible timing measurement --- exercises/kokkos/05-laplacian/solution-functor/laplacian.cpp | 1 + exercises/kokkos/05-laplacian/solution-lambda/laplacian.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/exercises/kokkos/05-laplacian/solution-functor/laplacian.cpp b/exercises/kokkos/05-laplacian/solution-functor/laplacian.cpp index 14b1575..14cbd90 100644 --- a/exercises/kokkos/05-laplacian/solution-functor/laplacian.cpp +++ b/exercises/kokkos/05-laplacian/solution-functor/laplacian.cpp @@ -61,6 +61,7 @@ int main(int argc, char** argv) Kokkos::MDRangePolicy >({1, 1}, {nx-1, ny-1}), laplFunctor(A, L, dx, dy)); + Kokkos::fence(); double t1 = timer.seconds(); // Check the result diff --git a/exercises/kokkos/05-laplacian/solution-lambda/laplacian.cpp b/exercises/kokkos/05-laplacian/solution-lambda/laplacian.cpp index 0a9fcd1..f6bf022 100644 --- a/exercises/kokkos/05-laplacian/solution-lambda/laplacian.cpp +++ b/exercises/kokkos/05-laplacian/solution-lambda/laplacian.cpp @@ -50,6 +50,7 @@ int main(int argc, char** argv) (A(i,j-1) - 2.0*A(i,j) + A(i,j+1)) * inv_dy2; }); + Kokkos::fence(); double t1 = timer.seconds(); // Check the result From c087ba6de179de8e93ec630c7ce9db593168a8e8 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 27 Nov 2024 10:23:06 +0200 Subject: [PATCH 2/3] Remove reference to non-existing image --- docs/06-kokkos.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/06-kokkos.md b/docs/06-kokkos.md index c9a51b8..b138d5d 100644 --- a/docs/06-kokkos.md +++ b/docs/06-kokkos.md @@ -53,10 +53,6 @@ lang: en - Execution units may have distinct memories -
-![](img/kokkos-node-doc.png){.center width=70%} -
- # Execution and Memory Spaces - Kokkos uses an execution space model to abstract the details of parallel hardware From c4321cc5a6e933ea7b6c6dffd33bb785195b5558 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 27 Nov 2024 10:32:18 +0200 Subject: [PATCH 3/3] Fix typo --- docs/03-memory-access-hierarchy.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/03-memory-access-hierarchy.md b/docs/03-memory-access-hierarchy.md index 0662141..46809b6 100644 --- a/docs/03-memory-access-hierarchy.md +++ b/docs/03-memory-access-hierarchy.md @@ -30,7 +30,7 @@ lang: en
- Accessible by all threads in a grid - Slow, latency of eg. 600-700 cycles - - Still, high bandwidth compared to CPU memory (1600 TB/s in AMD MI250X) + - Still, high bandwidth compared to CPU memory (1600 GB/s for a single GCD of AMD MI250X) - Can be controlled by host (via pointer operations) - Lifetime of the program