5
5
#include < parlay/sequence.h>
6
6
#include < parlay/primitives.h>
7
7
8
- template <typename InIt, typename OutIt, typename KeyIt>
9
- parlay::sequence<int >
10
- counting_sort (const InIt& begin, const InIt& end,
11
- OutIt out, const KeyIt& keys,
8
+ // **************************************************************
9
+ // Counting sort
10
+ // A parallel version of counting sort. It breaks the input into
11
+ // partitions and for each partition, in parallel, it counts how many
12
+ // of each key there are. It then uses scan to calculate the offsets
13
+ // for each bucket in each partition, and does a final pass placing
14
+ // all keys in their correct position.
15
+ // **************************************************************
16
+
17
+ using counter_type = unsigned long ;
18
+
19
+ // **************************************************************
20
+ // Input:
21
+ // begin and end iterators for the values to be rearranged
22
+ // begin iterator for the output (value_type must be the same as the input's)
23
+ // begin iterator for the keys (range must be same length as values)
24
+ // num_buckets : number of buckets (should be smallish, e.g. 256)
25
+ // Output:
26
+ // Offsets within output of each key. Will be of length
27
+ // num_buckets+1 since last entry will contain total size
28
+ // (i.e. end-begin).
29
+ // **************************************************************
30
+ template <typename InIt, typename OutIt, typename KeyIt>
31
+ parlay::sequence<counter_type>
32
+ counting_sort (const InIt& begin, const InIt& end,
33
+ OutIt out, const KeyIt& keys,
12
34
long num_buckets) {
13
35
long n = end - begin;
14
- long num_parts = n / (num_buckets * 64 ) + 1 ;
36
+ if (n == 0 ) return parlay::sequence<counter_type>(1 , 0 );
37
+ long num_parts = std::min (1000l , n / (num_buckets * 64 ) + 1 );
15
38
long part_size = (n - 1 )/num_parts + 1 ;
16
39
17
40
// first count buckets within each partition
18
- auto counts = parlay::sequence<int >::uninitialized (num_buckets * num_parts);
41
+ auto counts = parlay::sequence<counter_type >::uninitialized (num_buckets * num_parts);
19
42
parlay::parallel_for (0 , num_parts, [&] (long i) {
20
43
long start = i * part_size;
21
44
long end = std::min<long >(start + part_size, n);
22
- for (int j = 0 ; j < num_buckets; j++) counts[i*num_buckets + j] = 0 ;
23
- for (size_t j = start; j < end; j++) counts[i*num_buckets + keys[j]]++;
45
+ for (long j = 0 ; j < num_buckets; j++) counts[i*num_buckets + j] = 0 ;
46
+ for (long j = start; j < end; j++) counts[i*num_buckets + keys[j]]++;
24
47
}, 1 );
25
48
26
49
// transpose the counts if more than one part
27
- parlay::sequence<int > trans_counts;
28
- if (num_parts > 1 ) {
29
- trans_counts = parlay::sequence<int >::uninitialized (num_buckets * num_parts);
30
- parlay::parallel_for (0 , num_buckets, [&] (long i) {
31
- for (size_t j = 0 ; j < num_parts; j++)
50
+ parlay::sequence<counter_type > trans_counts;
51
+ if (num_parts > 1 ) {
52
+ trans_counts = parlay::sequence<counter_type >::uninitialized (num_buckets * num_parts);
53
+ parlay::parallel_for (0 , num_buckets, [&] (long i) {
54
+ for (size_t j = 0 ; j < num_parts; j++)
32
55
trans_counts[i* num_parts + j] = counts[j * num_buckets + i];}, 1 );
33
56
} else trans_counts = std::move (counts);
34
57
@@ -39,19 +62,32 @@ counting_sort(const InIt& begin, const InIt& end,
39
62
parlay::parallel_for (0 , num_parts, [&] (long i) {
40
63
long start = i * part_size;
41
64
long end = std::min<long >(start + part_size, n);
42
- int local_offsets[ num_buckets] ;
65
+ parlay::sequence<counter_type> local_offsets ( num_buckets) ;
43
66
44
67
// transpose back
45
- for (int j = 0 ; j < num_buckets; j++)
68
+ for (long j = 0 ; j < num_buckets; j++)
46
69
local_offsets[j] = trans_counts[num_parts * j + i];
47
70
48
71
// copy to output
49
- for (size_t j = start; j < end; j++) {
50
- int k = local_offsets[keys[j]]++;
72
+ for (long j = start; j < end; j++) {
73
+ counter_type k = local_offsets[keys[j]]++;
74
+ // prefetching speeds up the code
75
+ #if defined(__GNUC__) || defined(__clang__)
51
76
__builtin_prefetch (((char *) &out[k]) + 64 );
77
+ #endif
52
78
out[k] = begin[j];
53
79
}}, 1 );
54
80
55
- return parlay::tabulate (num_buckets, [&] (long i) {
56
- return trans_counts[i * num_parts];});
81
+ return parlay::tabulate (num_buckets+1 , [&] (long i) {
82
+ return (i == num_buckets) ? (counter_type) n : trans_counts[i * num_parts];});
83
+ }
84
+
85
+ // A version that uses ranges as inputs and generates its own output sequence
86
+ template <typename InRange, typename KeysRange>
87
+ auto counting_sort (const InRange& in, const KeysRange& keys,
88
+ long num_buckets) {
89
+ auto out = parlay::sequence<typename InRange::value_type>::uninitialized (in.size ());
90
+ auto offsets = counting_sort (in.begin (), in.end (), out.begin (), keys.begin (),
91
+ num_buckets);
92
+ return std::pair (std::move (out), std::move (offsets));
57
93
}
0 commit comments