-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.cpp
122 lines (104 loc) · 4.02 KB
/
test.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#include "CLI/App.hpp"
#include "CLI/Formatter.hpp"
#include "CLI/Config.hpp"
#include "tatami/tatami.hpp"
#include <random>
#include <vector>
#include <chrono>
#include <memory>
#include <immintrin.h>
template<typename T, bool simd>
void run(int NR, int NC, int nthreads) {
std::vector<T> contents(NR * NC);
std::mt19937_64 rng(42);
std::normal_distribution<T> dist;
for (auto& x : contents) {
x = dist(rng);
}
tatami::DenseRowMatrix<T, int> mat(NR, NC, std::move(contents));
constexpr int alignment = 256;
constexpr int jump = alignment / sizeof(T) / 8;
std::vector<T> vec(NC + alignment);
auto vaptr = reinterpret_cast<void*>(vec.data());
size_t space = vec.size();
std::align(alignment, sizeof(T), vaptr, space);
auto aptr = reinterpret_cast<T*>(vaptr);
for (size_t c = 0; c < NC; ++c) {
aptr[c] = dist(rng);
}
std::vector<T> output(nthreads);
auto start = std::chrono::high_resolution_clock::now();
tatami::parallelize([&](int t, int start, int len) -> void {
auto wrk = mat.dense_row();
std::vector<T> buffer(NC + alignment);
auto vbptr = reinterpret_cast<void*>(buffer.data());
size_t space = buffer.size();
std::align(alignment, sizeof(T), vbptr, space);
auto bptr = reinterpret_cast<T*>(vbptr);
T total = 0;
int upto = NC - NC % jump;
for (int i = start, end = start + len; i < end; ++i) {
auto xptr = wrk->fetch(i, bptr);
tatami::copy_n(xptr, NC, bptr);
if constexpr(!simd) {
for (int j = 0; j < NC; ++j) {
auto x = xptr[j];
auto a = aptr[j];
bptr[j] = x / a + x * a;
}
} else {
for (int j = 0; j < upto; j += jump) {
if constexpr(std::is_same<T, double>::value) {
auto x = _mm256_loadu_pd(xptr + j);
auto a = _mm256_loadu_pd(aptr + j);
_mm256_store_pd(bptr + j, _mm256_add_pd(_mm256_div_pd(x, a), _mm256_mul_pd(x, a)));
} else {
auto x = _mm256_loadu_ps(xptr + j);
auto a = _mm256_loadu_ps(aptr + j);
_mm256_store_ps(bptr + j, _mm256_add_ps(_mm256_div_ps(x, a), _mm256_mul_ps(x, a)));
}
}
for (int j = upto; j < NC; ++j) {
auto x = xptr[j];
auto a = aptr[j];
bptr[j] = x / a + x * a;
}
}
// Just an operation to make sure that the results are used,
// so the entire loop doesn't get optimized away.
total += std::accumulate(bptr, bptr + NC, 0.0);
}
output[t] += total;
}, NR, nthreads);
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
std::cout << "Access time: " << duration.count() << ", yielding " << std::accumulate(output.begin(), output.end(), 0.0) << std::endl;
}
int main(int argc, char** argv) {
CLI::App app{"SIMD performance testing"};
bool simd = false;
app.add_flag("-s,--simd", simd, "Use SIMD");
bool use_float = false;
app.add_flag("-f,--float", use_float, "Use single-precision floats");
int nthreads;
app.add_option("-t,--threads", nthreads, "Number of threads")->default_val(1);
int NR;
app.add_option("-r,--nrow", NR, "Number of rows")->default_val(10000);
int NC;
app.add_option("-c,--ncol", NC, "Number of columns")->default_val(10000);
CLI11_PARSE(app, argc, argv);
if (use_float) {
if (simd) {
run<float, true>(NR, NC, nthreads);
} else {
run<float, false>(NR, NC, nthreads);
}
} else {
if (simd) {
run<double, true>(NR, NC, nthreads);
} else {
run<double, false>(NR, NC, nthreads);
}
}
return 0;
}