From 5dbdf848f4906aef2af331a39eb512f0669f7d5c Mon Sep 17 00:00:00 2001 From: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> Date: Tue, 12 Mar 2024 18:39:20 -0500 Subject: [PATCH 1/4] Fix issue with gpuScan using 32 bit types This caused invalid memory alignment issues with CUDA. Work around that for now by using an inline proc Signed-off-by: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> --- modules/standard/GPU.chpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 901ae50166ce..61b8b717b369 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -795,7 +795,7 @@ module GPU // This function requires that startIdx and endIdx are within the bounds of the array // it checks that only if boundsChecking is true (i.e. NOT with --fast or --no-checks) - private proc serialScan(ref arr : [] ?t, startIdx = arr.domain.low, endIdx = arr.domain.high) { + private inline proc serialScan(ref arr : [] ?t, startIdx = arr.domain.low, endIdx = arr.domain.high) { // Convert this count array into a prefix sum // This is the same as the count array, but each element is the sum of all previous elements // This is an exclusive scan From 415fb6b157e48b9b4958addd34385c41ac94dbbf Mon Sep 17 00:00:00 2001 From: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> Date: Tue, 12 Mar 2024 18:40:27 -0500 Subject: [PATCH 2/4] Expand gpuScan testing to more types Signed-off-by: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> --- test/gpu/native/studies/sort/scanTest.chpl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/gpu/native/studies/sort/scanTest.chpl b/test/gpu/native/studies/sort/scanTest.chpl index 092ae192c9cf..b47c5d5a53c1 100644 --- a/test/gpu/native/studies/sort/scanTest.chpl +++ b/test/gpu/native/studies/sort/scanTest.chpl @@ -28,8 +28,15 @@ if multiDimArray { var timer: Time.stopwatch; 
testType(uint); +testType(uint(32)); +testType(uint(16)); +testType(uint(8)); testType(int); +testType(int(32)); +testType(int(16)); +testType(int(8)); testType(real); +// testType(real(32)); // Causes overflow (I think?) proc testType(type t) { var cpuArr: [low..#arrSize] t; From afb699f6f7f1e4962ca004d2640ce47ccd541b20 Mon Sep 17 00:00:00 2001 From: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> Date: Tue, 12 Mar 2024 18:40:52 -0500 Subject: [PATCH 3/4] Cleanup gpu-sort-performance-explorer Signed-off-by: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> --- .../sort/gpu-sort-performance-explorer.chpl | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/test/gpu/native/studies/sort/gpu-sort-performance-explorer.chpl b/test/gpu/native/studies/sort/gpu-sort-performance-explorer.chpl index 167ca9c2a940..32900bf8a99d 100644 --- a/test/gpu/native/studies/sort/gpu-sort-performance-explorer.chpl +++ b/test/gpu/native/studies/sort/gpu-sort-performance-explorer.chpl @@ -15,24 +15,20 @@ config const chunkSize = 6250; config const bitsAtATime = 8; config const inputDataScheme = 1; - -config const parallel = true; - -config param reverse = false; config type eltType = uint(32); config const seed = NPBRandom.oddTimeSeed(); var methods = ["default", "gpuCub", "gpuRadix"]; -proc testsort(ref input, method) { +proc testsort(ref gpuArr, method) { if method == "gpuCub" { - GPU.gpuExternSort(input); + GPU.gpuExternSort(gpuArr); } else if method == "gpuRadix" { - GPU.parallelRadixSort(input, bitsAtATime, chunkSize, false); + GPU.parallelRadixSort(gpuArr, bitsAtATime, chunkSize); } else if method == "default" { - GPU.gpuSort(input); + GPU.gpuSort(gpuArr); } else { halt("Unknown sorting method " + method); } @@ -145,11 +141,11 @@ proc testsize(size:int) { for i in 1..ntrials { input = makeInput(array); on here.gpus[0]{ - var arr = input; // Copy to gpu + var gpuArr = input; // Copy to gpu t.start(); - 
testsort(arr, m); + testsort(gpuArr, m); t.stop(); - input = arr; // Copy back to cpu + input = gpuArr; // Copy back to cpu } } var mibs = mibibytes * ntrials / t.elapsed(); From 9567f60b5cc07f997dad44cf130d7d7e92745cf1 Mon Sep 17 00:00:00 2001 From: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> Date: Tue, 12 Mar 2024 18:58:53 -0500 Subject: [PATCH 4/4] Add future for misaligned address issue Signed-off-by: Shreyas Khandekar <60454060+ShreyasKhandekar@users.noreply.github.com> --- .../native/studies/sort/cuda12_misaligned.bad | 1 + .../native/studies/sort/cuda12_misaligned.chpl | 17 +++++++++++++++++ .../studies/sort/cuda12_misaligned.future | 1 + .../native/studies/sort/cuda12_misaligned.good | 0 4 files changed, 19 insertions(+) create mode 100644 test/gpu/native/studies/sort/cuda12_misaligned.bad create mode 100644 test/gpu/native/studies/sort/cuda12_misaligned.chpl create mode 100644 test/gpu/native/studies/sort/cuda12_misaligned.future create mode 100644 test/gpu/native/studies/sort/cuda12_misaligned.good diff --git a/test/gpu/native/studies/sort/cuda12_misaligned.bad b/test/gpu/native/studies/sort/cuda12_misaligned.bad new file mode 100644 index 000000000000..c3fa35cb75e5 --- /dev/null +++ b/test/gpu/native/studies/sort/cuda12_misaligned.bad @@ -0,0 +1 @@ +internal error: gpu-nvidia.c:292: Error calling CUDA function: misaligned address (Code: 716) diff --git a/test/gpu/native/studies/sort/cuda12_misaligned.chpl b/test/gpu/native/studies/sort/cuda12_misaligned.chpl new file mode 100644 index 000000000000..cea76f2fb24e --- /dev/null +++ b/test/gpu/native/studies/sort/cuda12_misaligned.chpl @@ -0,0 +1,17 @@ +on here.gpus[0] { + var a: [1..10] uint(32); + @assertOnGpu + foreach chunk in 0..1 { + serialScan(a); // Exclusive scan in serial + } +} + + proc serialScan(ref arr : [] uint(32)) { + // Calculate the prefix sum + var sum : uint(32); + var temp : uint(32) = arr[1]; + arr[1] = sum; + sum += temp; + +} + diff --git 
a/test/gpu/native/studies/sort/cuda12_misaligned.future b/test/gpu/native/studies/sort/cuda12_misaligned.future new file mode 100644 index 000000000000..685849b48bf8 --- /dev/null +++ b/test/gpu/native/studies/sort/cuda12_misaligned.future @@ -0,0 +1 @@ +# issue: \ No newline at end of file diff --git a/test/gpu/native/studies/sort/cuda12_misaligned.good b/test/gpu/native/studies/sort/cuda12_misaligned.good new file mode 100644 index 000000000000..e69de29bb2d1