Skip to content

Commit

Permalink
Fix bugs with 32 bit types being passed to gpuScan (#24600)
Browse files Browse the repository at this point in the history
There were failures in `gpu-sort-performance-explorer` caused by the
in-house radix sort using `gpuScan`, which was untested for
`uint(32)` types.
Upon testing, it was revealed that passing any type that is not a 64-bit
type caused an error, as seen in
#24602.

This PR fixes `gpuScan` to work with non-64-bit types by using the
workaround described in the issue above. Why this works is not known yet :)

While there, we also expand the `gpuScan` testing and clean up some code.

[Reviewed by @vasslitvinov]
  • Loading branch information
ShreyasKhandekar authored Mar 13, 2024
2 parents 3ec1a94 + 9567f60 commit 1408aca
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 12 deletions.
2 changes: 1 addition & 1 deletion modules/standard/GPU.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,7 @@ module GPU

// This function requires that startIdx and endIdx are within the bounds of the array
// it checks that only if boundsChecking is true (i.e. NOT with --fast or --no-checks)
private proc serialScan(ref arr : [] ?t, startIdx = arr.domain.low, endIdx = arr.domain.high) {
private inline proc serialScan(ref arr : [] ?t, startIdx = arr.domain.low, endIdx = arr.domain.high) {
// Convert this count array into a prefix sum
// This is the same as the count array, but each element is the sum of all previous elements
// This is an exclusive scan
Expand Down
1 change: 1 addition & 0 deletions test/gpu/native/studies/sort/cuda12_misaligned.bad
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
internal error: gpu-nvidia.c:292: Error calling CUDA function: misaligned address (Code: 716)
17 changes: 17 additions & 0 deletions test/gpu/native/studies/sort/cuda12_misaligned.chpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Reproducer driver: allocates a 10-element uint(32) array on the first GPU
// sublocale and calls serialScan on it from inside a foreach loop.
// The accompanying .bad file records the failure this test currently hits:
// a CUDA "misaligned address (Code: 716)" internal error.
on here.gpus[0] {
var a: [1..10] uint(32);
// NOTE(review): @assertOnGpu is expected to require that the loop below
// actually runs as a GPU kernel -- confirm against the Chapel GPU docs.
@assertOnGpu
// The loop index `chunk` is unused; every iteration passes the same array.
foreach chunk in 0..1 {
serialScan(a); // Exclusive scan in serial
}
}

// Minimal stand-in for a serial exclusive scan, reduced to a single step so
// the failure can be reproduced with as little code as possible.
// It only touches arr[1]: the element is read into `temp`, overwritten with
// `sum`, and then `temp` is accumulated into `sum`. There is no loop and
// nothing is returned.
//
// arr: uint(32) array passed by `ref`, so the write to arr[1] is visible to
//      the caller.
proc serialScan(ref arr : [] uint(32)) {
// Calculate the prefix sum
// `sum` has no explicit initializer (Chapel default-initializes it to 0).
var sum : uint(32);
var temp : uint(32) = arr[1];
arr[1] = sum;
// `sum` is updated but not read afterwards in this reduced version.
sum += temp;

}

1 change: 1 addition & 0 deletions test/gpu/native/studies/sort/cuda12_misaligned.future
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# issue:
Empty file.
18 changes: 7 additions & 11 deletions test/gpu/native/studies/sort/gpu-sort-performance-explorer.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,20 @@ config const chunkSize = 6250;
config const bitsAtATime = 8;

config const inputDataScheme = 1;

config const parallel = true;

config param reverse = false;
config type eltType = uint(32);

config const seed = NPBRandom.oddTimeSeed();

var methods = ["default", "gpuCub", "gpuRadix"];

proc testsort(ref input, method) {
proc testsort(ref gpuArr, method) {

if method == "gpuCub" {
GPU.gpuExternSort(input);
GPU.gpuExternSort(gpuArr);
} else if method == "gpuRadix" {
GPU.parallelRadixSort(input, bitsAtATime, chunkSize, false);
GPU.parallelRadixSort(gpuArr, bitsAtATime, chunkSize);
} else if method == "default" {
GPU.gpuSort(input);
GPU.gpuSort(gpuArr);
} else {
halt("Unknown sorting method " + method);
}
Expand Down Expand Up @@ -145,11 +141,11 @@ proc testsize(size:int) {
for i in 1..ntrials {
input = makeInput(array);
on here.gpus[0]{
var arr = input; // Copy to gpu
var gpuArr = input; // Copy to gpu
t.start();
testsort(arr, m);
testsort(gpuArr, m);
t.stop();
input = arr; // Copy back to cpu
input = gpuArr; // Copy back to cpu
}
}
var mibs = mibibytes * ntrials / t.elapsed();
Expand Down
7 changes: 7 additions & 0 deletions test/gpu/native/studies/sort/scanTest.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,15 @@ if multiDimArray {
var timer: Time.stopwatch;

testType(uint);
testType(uint(32));
testType(uint(16));
testType(uint(8));
testType(int);
testType(int(32));
testType(int(16));
testType(int(8));
testType(real);
// testType(real(32)); // Causes overflow (I think?)

proc testType(type t) {
var cpuArr: [low..#arrSize] t;
Expand Down

0 comments on commit 1408aca

Please sign in to comment.