forked from stan-dev/perf-math
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tbatching.cpp
105 lines (88 loc) · 2.89 KB
/
tbatching.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#include <benchmark/benchmark.h>
#include <Eigen/Dense>
#include <future>
#include <thread>
#include <iostream>
// Optimizer barrier: tells the compiler that the memory reachable through
// `p` may be read/written by the empty asm statement, so stores feeding it
// cannot be dead-code-eliminated (same idea as benchmark::DoNotOptimize).
static void escape(void *p) {
asm volatile("" : : "g"(p) : "memory");
}
// Optimizer barrier: the "memory" clobber forces the compiler to assume all
// memory may have changed, so preceding writes cannot be elided or reordered
// past this point (companion to escape(); benchmark::ClobberMemory analogue).
static void clobber() {
asm volatile("" : : : "memory");
}
using matrix_d = Eigen::MatrixXd;
// Benchmarks batching work into `num_chunks` async tasks, each of which
// computes `chunk_size` Gram-matrix products (m_d' * m_d) of a fixed
// 50x50 random matrix; results are concatenated column-wise into `output`.
// Args: range(0) = number of chunks, range(1) = products per chunk.
static void BM_ChunkedThreads(benchmark::State& state) {
  using Eigen::MatrixXd;
  auto m_d = MatrixXd::Random(50, 50).eval();
  auto output = matrix_d(0, 0);
  // Worker: produce `size` independent products of the shared input.
  auto execute_chunk = [&](int size) -> std::vector<matrix_d> {
    std::vector<matrix_d> chunk_f_out;
    chunk_f_out.reserve(size);
    for (int i = 0; i < size; i++) {
      chunk_f_out.push_back(m_d.transpose() * m_d);
    }
    return chunk_f_out;
  };
  const int num_chunks = state.range(0);
  const int chunk_size = state.range(1);
  for (auto _ : state) {
    escape(m_d.data());
    // Actual task: one future per chunk, chunk_size jobs inside each.
    std::vector<std::future<std::vector<matrix_d>>> futures;
    futures.reserve(num_chunks);
    for (int i = 0; i < num_chunks; i++) {
      futures.emplace_back(std::async(std::launch::async, execute_chunk, chunk_size));
    }
    int offset = 0;
    bool resized = false;
    for (auto& f : futures) {
      const std::vector<matrix_d>& chunk_result = f.get();
      if (!resized) {
        // BUG FIX: size for ALL results up front. The total output is
        // num_chunks * chunk_size matrices laid side by side; the old code
        // omitted the chunk_size factor, under-allocating for chunk_size > 1
        // and forcing repeated O(n) conservativeResize copies below —
        // overhead BM_BareThreads (which presizes fully) never pays.
        output.resize(chunk_result[0].rows(),
                      num_chunks * chunk_size * chunk_result[0].cols());
        resized = true;
      }
      for (const auto& job_result : chunk_result) {
        const int num_job_outputs = job_result.cols();
        // Safety net only; should not trigger now that output is presized.
        if (output.cols() < offset + num_job_outputs) {
          output.conservativeResize(Eigen::NoChange,
                                    2 * (offset + num_job_outputs));
        }
        output.block(0, offset, output.rows(), num_job_outputs) = job_result;
        offset += num_job_outputs;
      }
    }
    clobber();
  }
}
// Args are {num_chunks, chunk_size}: 120 jobs in 10 batches, a single job,
// and 10k jobs in 100 batches.
BENCHMARK(BM_ChunkedThreads)->Args({10, 12})->Args({1, 1})->Args({100, 100});
// Benchmarks the unbatched strategy: one std::async task per matrix product
// (num_chunks * chunk_size tasks total), with the destination buffer sized
// up front so result assembly is a plain block copy.
// Args: range(0) = number of chunks, range(1) = products per chunk.
static void BM_BareThreads(benchmark::State& state) {
  using Eigen::MatrixXd;
  auto m_d = MatrixXd::Random(50, 50).eval();
  // A single task: the Gram-matrix product of the shared input.
  auto run_one = [&]() -> matrix_d { return m_d.transpose() * m_d; };
  const int num_chunks = state.range(0);
  const int chunk_size = state.range(1);
  const int num_tasks = num_chunks * chunk_size;
  // Preallocated once: every result is m_d.cols() columns wide.
  matrix_d output(m_d.rows(), m_d.cols() * num_tasks);
  for (auto _ : state) {
    escape(m_d.data());
    // Actual task: launch every job as its own future — no batching.
    std::vector<std::future<matrix_d>> pending;
    pending.reserve(num_tasks);
    for (int task = 0; task < num_tasks; ++task) {
      pending.emplace_back(std::async(std::launch::async, run_one));
    }
    // Stitch the results side by side into the preallocated output.
    int col = 0;
    for (auto& fut : pending) {
      const matrix_d result = fut.get();
      const int width = result.cols();
      output.block(0, col, output.rows(), width) = result;
      col += width;
    }
    clobber();
  }
}
// Same {num_chunks, chunk_size} grids as BM_ChunkedThreads for a direct
// comparison; chunk counts only determine the total task count here.
BENCHMARK(BM_BareThreads)->Args({10, 12})->Args({1, 1})->Args({100, 100});
BENCHMARK_MAIN();