forked from unisa-hpc/sycl-bench
-
Notifications
You must be signed in to change notification settings - Fork 2
/
matmul_hdp.cpp
151 lines (129 loc) · 5.05 KB
/
matmul_hdp.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#include <cstdio>
#include <vector>
#include <common.h>
//#define PRINT_MATRIX
//#define MATRIX_TEST
// Performs chained matrix multiply of the form (AB)(CD)
// Uses two intermediate buffers and one for the result
template <typename T>
class MatmulHDP;
template <typename T>
void multiply(cl::sycl::queue& queue, cl::sycl::buffer<T, 2>& mat_a, cl::sycl::buffer<T, 2>& mat_b,
cl::sycl::buffer<T, 2>& mat_c, const size_t mat_size, const size_t local_size) {
queue.submit([&](cl::sycl::handler& cgh) {
auto a = mat_a.template get_access<cl::sycl::access::mode::read>(cgh);
auto b = mat_b.template get_access<cl::sycl::access::mode::read>(cgh);
auto c = mat_c.template get_access<cl::sycl::access::mode::discard_write>(cgh);
size_t workgroup_size = local_size;
size_t num_workgroups = (mat_size + workgroup_size - 1) / workgroup_size;
//std::cout << "Problem size=" <<mat_size<< "x"<<mat_size << " # Work Groups=" <<num_workgroups<<"x"<<num_workgroups<<"("<<(num_workgroups*num_workgroups)<<")"<< " of " << workgroup_size<<"x"<<workgroup_size << " big" << std::endl;
cgh.parallel_for_work_group<class MatmulHDP<T>>(cl::sycl::range<2>(num_workgroups, num_workgroups),cl::sycl::range<2>(workgroup_size, workgroup_size), [=](cl::sycl::group<2> group)
{
//auto group_id = group.get_linear();
group.parallel_for_work_item([&](cl::sycl::h_item<2> item) {
//printf("group: %zu local(x):%zu local(y): %zu global(x): %zu global(y): %zu\n",group_id,item.get_local_id(0),item.get_local_id(1),item.get_global_id(0),item.get_global_id(1));
auto sum = 0;
for(size_t k = 0; k < mat_size; ++k) {
const auto a_ik = a[{item.get_global_id(0), k}];
const auto b_kj = b[{k, item.get_global_id(1)}];
sum += a_ik * b_kj;
}
c[{item.get_global_id(0),item.get_global_id(1)}] = sum;
});
});
});
}
template <typename T>
class MatmulHDP {
protected:
std::vector<T> mat_a;
std::vector<T> mat_b;
std::vector<T> mat_res;
BenchmarkArgs args;
int mat_size;
PrefetchedBuffer<T, 2> mat_a_buf;
PrefetchedBuffer<T, 2> mat_b_buf;
PrefetchedBuffer<T, 2> mat_res_buf;
public:
MatmulHDP(const BenchmarkArgs &_args) : args(_args) {
mat_size = args.problem_size;
}
void setup() {
#ifdef MATRIX_TEST
//good test for 2x2 matrix:
//mat_a = {1,2,3,4};
//mat_b = {2,0,1,2};
//result= {4, 4, 10, 8};
//or: {2,4,7,10} with:
//mat_a = {2,0,1,2};
//mat_b = {1,2,3,4};
args.problem_size = 2;
args.local_size = 1;
mat_size = 2;
mat_a = std::vector<T>({1,2,3,4});
mat_b = std::vector<T>({2,0,1,2});
mat_res = std::vector<T>(mat_size * mat_size);
mat_a_buf.initialize(args.device_queue, mat_a.data(), cl::sycl::range<2>(mat_size, mat_size));
mat_b_buf.initialize(args.device_queue, mat_b.data(), cl::sycl::range<2>(mat_size, mat_size));
mat_res_buf.initialize(args.device_queue, mat_res.data(), cl::sycl::range<2>(mat_size, mat_size));
return void();
#endif
mat_a = std::vector<T>(mat_size * mat_size);
mat_b = std::vector<T>(mat_size * mat_size);
mat_res = std::vector<T>(mat_size * mat_size);
// Initialize matrices to the identity
for(size_t i = 0; i < mat_size; ++i) {
for(size_t j = 0; j < mat_size; ++j) {
mat_a[i * mat_size + j] = i == j;
mat_b[i * mat_size + j] = i == j;
}
}
mat_a_buf.initialize(args.device_queue, mat_a.data(), cl::sycl::range<2>(mat_size, mat_size));
mat_b_buf.initialize(args.device_queue, mat_b.data(), cl::sycl::range<2>(mat_size, mat_size));
mat_res_buf.initialize(args.device_queue, mat_res.data(), cl::sycl::range<2>(mat_size, mat_size));
}
void run() {
multiply(args.device_queue, mat_a_buf.get(), mat_b_buf.get(), mat_res_buf.get(), mat_size, args.local_size);
}
static std::string getBenchmarkName() {
std::stringstream name;
name << "MatMul_HDP_";
name << ReadableTypename<T>::name;
return name.str();
}
bool verify(VerificationSetting &ver) {
// Triggers writeback
mat_res_buf.reset();
#ifdef PRINT_MATRIX
for(size_t i = 0; i < mat_size; ++i) {
for(size_t j = 0; j < mat_size; ++j) {
const T kernel_value = mat_res[i * mat_size + j];
printf("%f\t", kernel_value);
}
printf("\n");
}
#endif
#ifdef MATRIX_TEST
return(true);
#endif
bool verification_passed = true;
for(size_t i = 0; i < mat_size; ++i) {
for(size_t j = 0; j < mat_size; ++j) {
const T kernel_value = mat_res[i * mat_size + j];
const T host_value = i == j;
if(kernel_value != host_value) {
fprintf(stderr, "VERIFICATION FAILED for element %ld,%ld: %f != %f\n", i, j, kernel_value, host_value);
verification_passed = false;
break;
}
}
if(!verification_passed) { break; }
}
return verification_passed;
}
};
int main(int argc, char** argv) {
BenchmarkApp app(argc, argv);
// float
app.run< MatmulHDP<float> >();
}