diff --git a/content/posts/2023/02/benchmarking/benchmarking_the_world.tar b/content/posts/2023/02/benchmarking/benchmarking_the_world.tar
new file mode 100644
index 0000000..55c5ea5
Binary files /dev/null and b/content/posts/2023/02/benchmarking/benchmarking_the_world.tar differ

diff --git a/content/posts/2023/02/benchmarking/create_src_archive b/content/posts/2023/02/benchmarking/create_src_archive
new file mode 100755
index 0000000..2c35c41
--- /dev/null
+++ b/content/posts/2023/02/benchmarking/create_src_archive
@@ -0,0 +1,6 @@

#!/bin/sh
tar -cf benchmarking_the_world.tar \
  --transform 's,^,benchmarking_the_world/,' \
  *.hpp \
  *.cpp \
  meson.build

diff --git a/content/posts/2023/02/benchmarking/google_micro.cpp b/content/posts/2023/02/benchmarking/google_micro.cpp
new file mode 100644
index 0000000..3ba53e8
--- /dev/null
+++ b/content/posts/2023/02/benchmarking/google_micro.cpp
@@ -0,0 +1,13 @@

#include <benchmark/benchmark.h>
#include "sum.hpp"

static void BM_Sum(benchmark::State& state) {
    for (auto _ : state) {
        benchmark::DoNotOptimize(sum(state.range(0), state.range(1)));
    }
    state.SetComplexityN(state.range(0) + state.range(1));
}

BENCHMARK(BM_Sum)->Args({1, 2})->Args({10, 20})->Complexity();

BENCHMARK_MAIN();

diff --git a/content/posts/2023/02/benchmarking/index.md b/content/posts/2023/02/benchmarking/index.md
new file mode 100644
index 0000000..1ba3cc0
--- /dev/null
+++ b/content/posts/2023/02/benchmarking/index.md
@@ -0,0 +1,434 @@

+++
title = "Benchmarking The World"
date = "2023-02-28"
authors = ["philipp.david"]
tags = ["Efficiency", "EPEA 2022", "Benchmarking"]
+++

Performance analysis is an important part of programming and computer science in general, and it is often done with the goal of increasing performance. Benchmarking is how we find out how fast a program is in the first place, and how we then verify whether or not a change actually improves performance.

In this post, we will explore different types of benchmarking tools and how they can help you improve the performance of your software. We will cover microbenchmarking tools, macrobenchmarking tools and whole-system benchmarking tools.

<!--more-->

## Setup

Before we begin, please note that all sample code shown in this blog post can be downloaded [here](benchmarking_the_world.tar). If you want to follow along with the commands, you will have to install a working C++17 compiler, [Meson](https://mesonbuild.com/) and [ninja](https://ninja-build.org/).

## Microbenchmarking Tools

Microbenchmarking is the process of measuring the performance of small code snippets, usually at the function level. It is essential for identifying performance issues in critical parts of your codebase. As such, it is often employed by implementors of programming languages to measure and improve the performance of standard library functions, such as operations on vectors.

### DIY Microbenchmarks

To illustrate how one would write a microbenchmark manually, let's consider the following C++ function, which calculates the sum of two integers:

```cpp {linenos=true}
int sum(int a, int b) {
    return a + b;
}
```

To benchmark this function, we can measure the time it takes to execute for different input values. Here's an example of how we can do that manually:

```cpp {linenos=true}
#include <chrono>
#include <iostream>
#include "sum.hpp"

using namespace std::chrono;

const int iterations = 100000000;

int main() {
    // Benchmark sum(1, 2)
    auto start = steady_clock::now();
    for (int i = 0; i < iterations; i++) {
        sum(1, 2);
    }
    auto end = steady_clock::now();
    auto duration = duration_cast<microseconds>(end - start).count();
    std::cout << "sum(1, 2) took " << duration << " microseconds" << std::endl;

    // Benchmark sum(10, 20)
    start = steady_clock::now();
    for (int i = 0; i < iterations; i++) {
        sum(10, 20);
    }
    end = steady_clock::now();
    duration = duration_cast<microseconds>(end - start).count();
    std::cout << "sum(10, 20) took " << duration << " microseconds" << std::endl;

    return 0;
}
```

This code tries to measure the time it takes to execute the `sum` function 100 million times for each of two different input values. To compile it, we put everything into a file named [`simple_micro.cpp`](simple_micro.cpp) and use our build system of choice; for this post, that is the Meson build system. Having created a simple build description file, [`meson.build`](meson.build), we build the code and then run the executable:

```sh {linenos=true}
% meson setup builddir
% cd builddir
% ninja simple_micro
% ./simple_micro
sum(1, 2) took 0 microseconds
sum(10, 20) took 0 microseconds
```

However, as we can see, the execution time for both function calls is suspiciously low. This happens because we never use the results of `sum`, so the compiler is free to optimize all calls to it away entirely. When writing your own microbenchmarking code from scratch, mistakes like this one are very easy to make.
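A crude countermeasure (our own sketch, not part of the original sources) is to make the results observable, for example by accumulating the return values and printing the total:

```cpp {linenos=true}
// Hypothetical variant of simple_micro.cpp: summing the return values forces
// the compiler to keep the calls, because the total is printed afterwards.
#include <iostream>
#include "sum.hpp"

int main() {
    const int iterations = 100000000;
    long long total = 0;
    for (int i = 0; i < iterations; i++) {
        total += sum(1, 2);
    }
    std::cout << "total: " << total << '\n'; // observable side effect
    return 0;
}
```

Note that this only rules out dead-code elimination; a sufficiently clever compiler may still constant-fold the entire loop, which is exactly the kind of subtlety that is easy to miss.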
Therefore, using a robust, battle-tested microbenchmarking library is highly recommended.

### Google Benchmark

One such library is [Google Benchmark](https://github.com/google/benchmark). It provides a simple and reliable way to measure the performance of your code. It takes care of warm-up and statistical analysis, and it can prevent excessive compiler optimizations, amongst other factors that can affect the accuracy of the results.

Here's a simple example of how we can use Google Benchmark to measure the performance of our `sum` function:

```cpp {linenos=true}
#include <benchmark/benchmark.h>
#include "sum.hpp"

static void BM_Sum(benchmark::State& state) {
    for (auto _ : state) {
        benchmark::DoNotOptimize(sum(state.range(0), state.range(1)));
    }
    state.SetComplexityN(state.range(0) + state.range(1));
}

BENCHMARK(BM_Sum)->Args({1, 2})->Args({10, 20})->Complexity();

BENCHMARK_MAIN();
```

This code creates a benchmark function `BM_Sum` that takes a benchmark `State` as an argument, which provides an iterator that runs our benchmark an appropriate number of times. The `state` also contains the arguments that are passed to the benchmark function, which can be accessed using the `state.range` function. Here, the first two arguments are simply passed on to `sum`. Looking at the `sum` call again, we can see that it is wrapped in a call to `DoNotOptimize`, which does exactly what it says: it prevents the compiler from optimizing out the call to `sum`, and it does so without adding extra overhead.

We also set the complexity of the benchmark to be proportional to the sum of the input values, which lets `benchmark` estimate the complexity of our function in Big O notation. For those familiar with this concept, this can be an enormous help in comparing the performance of algorithms.
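As an aside, the complexity fit is easier to appreciate with a function whose runtime actually grows with its input; for `sum`, the fitted curve is constant, which is why the output below reports `(1)`. Here is a quick sketch of a linear case (our own example; `BM_LoopSum` is not part of the post's sources):

```cpp {linenos=true}
#include <benchmark/benchmark.h>

// Hypothetical O(N) benchmark: the runtime grows linearly with the argument.
static void BM_LoopSum(benchmark::State& state) {
    for (auto _ : state) {
        long long total = 0;
        for (long i = 0; i < state.range(0); ++i) {
            benchmark::DoNotOptimize(total += i);
        }
    }
    state.SetComplexityN(state.range(0));
}

// benchmark::oN pins the fit to a linear model instead of the best fit.
BENCHMARK(BM_LoopSum)->RangeMultiplier(8)->Range(8, 1 << 15)->Complexity(benchmark::oN);
```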
Returning to our example: finally, we use the `BENCHMARK` macro to register the `BM_Sum` function with Google Benchmark and pass two sets of arguments to it: `{1, 2}` and `{10, 20}`. To execute the benchmark, we use the `BENCHMARK_MAIN` macro to generate a `main` function that runs all registered benchmarks.

Now that we understand what our sample program does, let's try running it. First of all, we need to ensure that Google Benchmark is installed. Many operating systems ship it in their package repositories, and [Repology](https://repology.org/project/benchmark/versions) helps us find the correct package name to install.

After installing the prerequisites, we navigate to our `builddir` and run `ninja reconfigure` to detect the newly installed library. After that, we can finally build and run our benchmark:

```sh {linenos=true}
% ninja google_micro
...
% ./google_micro
2023-02-28T09:34:02+01:00
Running ./google_micro
Run on (4 X 4600 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x4)
  L1 Instruction 32 KiB (x4)
  L2 Unified 256 KiB (x4)
  L3 Unified 6144 KiB (x1)
Load Average: 0.46, 0.32, 0.30
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may
be noisy and will incur extra overhead.
-------------------------------------------------------
Benchmark           Time             CPU   Iterations
-------------------------------------------------------
BM_Sum/1/2      0.655 ns        0.654 ns   1000000000
BM_Sum/10/20    0.655 ns        0.654 ns   1000000000
BM_Sum_BigO      0.65 (1)        0.65 (1)
BM_Sum_RMS          0 %             0 %
```

As we can see, this provides much more information than our DIY implementation and even warns us that CPU frequency scaling might be causing inconsistent measurements.

Google Benchmark provides many more features that make it easy to write accurate and reliable microbenchmarks. This small overview should help you get started with it relatively quickly.

## Macrobenchmarking Tools

Contrary to microbenchmarking, macrobenchmarking is the process of measuring the performance of larger code sections, usually at the program level. It is useful for identifying performance issues that are not tied to specific functions or algorithms, as well as for comparing different versions of a program, or even the same version compiled with different compiler flags.

### POSIX time

One way to perform a macrobenchmark is with the POSIX `time` utility. A very simple example would be to find out how long `ls` takes to list our directory contents:

```sh {linenos=true}
% time ls
build.ninja            google_micro.p  meson-private
compile_commands.json  meson-info      monte_carlo.p
google_micro           meson-logs      simple_micro.p

real	0m0.001s
user	0m0.001s
sys	0m0.000s
```

This was a benchmark, just not a very good one. The major problem with using `time` like this is that the benchmarked program is run only once, so the result is dominated by random error.
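With plain `time`, a crude workaround (our own sketch, not from the original post) is to run the command many times in a loop and average manually:

```sh {linenos=true}
% time sh -c 'for i in $(seq 100); do ls > /dev/null; done'
```

Dividing the reported times by 100 gives a rough per-invocation estimate, although the shell loop itself adds overhead, which is one more reason to prefer a dedicated tool.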
We also need a more interesting program to benchmark, since the runtime of `ls` is so short that it is hard to measure reliably even with better tools.

Here's a rather simple program we can benchmark that approximates the number Pi using a [Monte Carlo method](https://en.wikipedia.org/wiki/Monte_Carlo_method). It draws random points in the square spanning -1 to 1 in both dimensions; since the fraction of points that lands inside the unit circle approaches Pi/4, multiplying that fraction by four yields an estimate of Pi:

```cpp {linenos=true}
#include <iostream>
#include <random>

int main() {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<> dis(-1.0, 1.0);

    const size_t num_points = 10000000;
    size_t num_points_inside_circle = 0;

    for (size_t i = 0; i < num_points; ++i) {
        const double x = dis(gen);
        const double y = dis(gen);
        if (x * x + y * y <= 1.0) {
            ++num_points_inside_circle;
        }
    }

    const double pi = 4.0 * num_points_inside_circle / num_points;
    std::cout << "Pi: " << pi << '\n';

    return 0;
}
```

To build it, we can simply issue `ninja monte_carlo` in our `builddir`.

### hyperfine

Now that we have a more interesting program to benchmark, we need a better way to perform macrobenchmarks. The preferred way to do this is to use a dedicated tool like [`hyperfine`](https://github.com/sharkdp/hyperfine)[^1]. `hyperfine` is a command-line benchmarking tool written in Rust that provides a simple and flexible way to measure the performance of your programs. Compared to `time`, `hyperfine` actively tries to minimize random error and warns about outliers. It can also output its results in many useful formats, such as Markdown and raw JSON.

To get `hyperfine`, once again check whether it is provided by your distribution's repositories at [Repology](https://repology.org/project/hyperfine/versions). If you have a working Rust installation on your computer, you can also simply install the latest version with `cargo install hyperfine`.

Here's how we can now benchmark our `monte_carlo` program:

```sh {linenos=true}
% hyperfine ./monte_carlo
Benchmark 1: ./monte_carlo
  Time (mean ± σ):     235.8 ms ±   0.9 ms    [User: 234.0 ms, System: 0.7 ms]
  Range (min … max):   234.5 ms … 237.3 ms    12 runs
```

This is much more helpful than the `time` output! We can clearly see the mean runtime of our program along with the standard deviation, as well as the minimum and maximum runtimes. Our program was run a total of 12 times, until `hyperfine` was satisfied with the results. We even got a nice progress bar while the benchmark was running!

Although the default settings are already helpful, `hyperfine` provides many more options that make it easy to customize the benchmarking process. For example, you can set the number of runs, the warm-up time, and the maximum execution time of the benchmark. You can also control the environment variables, the working directory, and the input arguments of the benchmark.
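For instance (an illustrative invocation of ours, not one of the post's measurements), we can demand exactly 50 timed runs preceded by five warm-up runs:

```sh {linenos=true}
% hyperfine --warmup 5 --runs 50 ./monte_carlo
```

The warm-up runs let caches and other system state settle before any timing starts, which matters especially for I/O-heavy programs.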
Using these advanced features, we can try to find the best compiler flags for `monte_carlo`. The relevant options in this case are `--setup`, to rebuild our program with different compiler flags, and `--parameter-list`, to specify the different optimization levels we want to try. Putting it all together, we end up with something along these lines:

```sh {linenos=true}
% hyperfine \
    --parameter-list opt 2,3,fast \
    --parameter-list march x86-64,native \
    --setup 'meson configure -Dcpp_args="-O{opt} -march={march}" && ninja' \
    --warmup 2 \
    --export-markdown monte_carlo_results.md \
    --command-name 'cpp -O{opt} -march={march}' \
    ./monte_carlo

Benchmark 1: cpp -O2 -march=x86-64
  Time (mean ± σ):     234.6 ms ±   0.0 ms    [User: 233.7 ms, System: 0.7 ms]
  Range (min … max):   234.5 ms … 234.7 ms    12 runs

Benchmark 2: cpp -O3 -march=x86-64
  Time (mean ± σ):     235.2 ms ±   1.0 ms    [User: 233.9 ms, System: 0.8 ms]
  Range (min … max):   234.2 ms … 237.4 ms    12 runs

Benchmark 3: cpp -Ofast -march=x86-64
  Time (mean ± σ):     230.0 ms ±   0.4 ms    [User: 229.4 ms, System: 0.3 ms]
  Range (min … max):   229.7 ms … 230.9 ms    13 runs

Benchmark 4: cpp -O2 -march=native
  Time (mean ± σ):     174.9 ms ±   0.4 ms    [User: 174.1 ms, System: 0.6 ms]
  Range (min … max):   174.2 ms … 175.5 ms    17 runs

Benchmark 5: cpp -O3 -march=native
  Time (mean ± σ):     142.1 ms ±   5.4 ms    [User: 140.8 ms, System: 0.7 ms]
  Range (min … max):   139.5 ms … 164.4 ms    20 runs

Benchmark 6: cpp -Ofast -march=native
  Time (mean ± σ):     132.9 ms ±   0.7 ms    [User: 131.9 ms, System: 0.6 ms]
  Range (min … max):   132.0 ms … 134.3 ms    22 runs

Summary
  'cpp -Ofast -march=native' ran
    1.07 ± 0.04 times faster than 'cpp -O3 -march=native'
    1.32 ± 0.01 times faster than 'cpp -O2 -march=native'
    1.73 ± 0.01 times faster than 'cpp -Ofast -march=x86-64'
    1.77 ± 0.01 times faster than 'cpp -O2 -march=x86-64'
    1.77 ± 0.01 times faster than 'cpp -O3 -march=x86-64'
```

Looking at the results, it's obvious that a great speedup can be achieved simply by changing compiler flags, even for a program as simple as `monte_carlo`. Of course, this does not allow general conclusions about these flags: they may well worsen performance on other machines or for other programs, so each case needs its own benchmark.

Apart from the command-line output, we also made `hyperfine` generate this nice Markdown table:

| Command | Mean [ms] | Min [ms] | Max [ms] | Relative |
|:---|---:|---:|---:|---:|
| `cpp -O2 -march=x86-64` | 234.6 ± 0.0 | 234.5 | 234.7 | 1.77 ± 0.01 |
| `cpp -O3 -march=x86-64` | 235.2 ± 1.0 | 234.2 | 237.4 | 1.77 ± 0.01 |
| `cpp -Ofast -march=x86-64` | 230.0 ± 0.4 | 229.7 | 230.9 | 1.73 ± 0.01 |
| `cpp -O2 -march=native` | 174.9 ± 0.4 | 174.2 | 175.5 | 1.32 ± 0.01 |
| `cpp -O3 -march=native` | 142.1 ± 5.4 | 139.5 | 164.4 | 1.07 ± 0.04 |
| `cpp -Ofast -march=native` | 132.9 ± 0.7 | 132.0 | 134.3 | 1.00 |

With this knowledge and the excellent `hyperfine` manpage, you should be able to tackle even more complex benchmarks with ease.
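Besides Markdown, `hyperfine` can also export raw JSON via `--export-json`, which records every individual timing. As a sketch (the file name is ours, and `jq` is assumed to be installed), extracting the mean runtime in seconds could look like this:

```sh {linenos=true}
% hyperfine --export-json results.json ./monte_carlo
% jq '.results[0].mean' results.json
```

This makes it straightforward to feed benchmark results into plots or automated regression checks.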
## Whole-System Benchmarking Tools

Whole-system benchmarking is the process of measuring the performance of an entire system, including the operating system, the hardware, and the software stack. Besides comparing different computer systems, it is useful for identifying performance issues that are related to the system configuration, such as memory usage, disk I/O, and network latency.

Since we need to run many different benchmarking programs to get a decent picture of the many aspects of a system's performance, installing and running all of them manually quickly becomes both cumbersome and tiresome.

One way to deal with this is to use the [Phoronix Test Suite](https://www.phoronix-test-suite.com/), a cross-platform benchmarking tool written in PHP that provides a comprehensive collection of benchmarks and entire test suites. The Phoronix Test Suite, from now on referred to as `PTS`, automates most aspects of benchmarking a system: from installing dependencies, downloading and compiling tests, to generating and uploading results to [OpenBenchmarking.org](https://openbenchmarking.org/).

Here's an example of how we can use the `PTS` to run a quick benchmark testing the `lz4` compression program:

```sh {linenos=true}
% phoronix-test-suite benchmark compress-lz4
...
    Would you like to save these test results (Y/n): y
    Enter a name for the result file: ...
    Enter a unique name to describe this test run / configuration: ...
...
Current Description: ...
New Description: ...
...
    Would you like to upload the results to OpenBenchmarking.org (y/n): y
    Results Uploaded To: ...
```

Since this command will attempt to install missing dependencies using the system package manager, I suggest running the `PTS` on a live system or inside a [container](https://hub.docker.com/r/phoronix/pts/tags). Before the benchmark starts, the `PTS` will ask for common options such as the test name and description. After completion, it will ask whether or not to upload the results to [OpenBenchmarking.org](https://openbenchmarking.org/). All of these questions can be answered in advance for fully automatic test runs.
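If you want to script this (a sketch; the exact prompts may differ between `PTS` versions), the batch mode records your answers once and reuses them:

```sh {linenos=true}
% phoronix-test-suite batch-setup
% phoronix-test-suite batch-benchmark compress-lz4
```

`batch-setup` asks the usual questions (save results, upload them, and so on) a single time, and `batch-benchmark` then runs the given test without any interactive prompts.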
We have published the results of a sample test run [here](https://openbenchmarking.org/result/2302281-NE-20230228P45), and if you wish to compare your own system to these results, you can use the following command to run the same tests and attach your results to this run:

```sh {linenos=true}
% phoronix-test-suite benchmark 2302281-NE-20230228P45
```

The `PTS` offers so many more features that covering them all would be out of scope for this blog post, but we still hope that this small introduction helped you.

## Conclusion

In this blog post, we have explored different tools for benchmarking: microbenchmarking tools, macrobenchmarking tools and whole-system benchmarking tools. We have shown how to write microbenchmarks using Google Benchmark, how to perform macrobenchmarks with `hyperfine` (and use it to find the best compiler flags for a simple program along the way), and how to run whole-system benchmarks with the Phoronix Test Suite. We hope that this blog post has been useful for you and that it inspires you to improve the performance of your own code.

Happy benchmarking!

[^1]: Peter, D. (2022). hyperfine (Version 1.15.0) [Computer software]. https://github.com/sharkdp/hyperfine

diff --git a/content/posts/2023/02/benchmarking/meson.build b/content/posts/2023/02/benchmarking/meson.build
new file mode 100644
index 0000000..d1bf000
--- /dev/null
+++ b/content/posts/2023/02/benchmarking/meson.build
@@ -0,0 +1,16 @@

project(
  'benchmarks',
  'cpp',
  default_options : [
    'b_lto=true',
    'buildtype=release',
    'warning_level=3',
    'cpp_std=c++17',
  ],
)
executable('simple_micro', 'simple_micro.cpp')
executable('monte_carlo', 'monte_carlo.cpp')
benchmark = dependency('benchmark', version : '>=1.3.0', required : false)
if benchmark.found()
  executable('google_micro', 'google_micro.cpp', dependencies : benchmark)
endif

diff --git a/content/posts/2023/02/benchmarking/monte_carlo.cpp b/content/posts/2023/02/benchmarking/monte_carlo.cpp
new file mode 100644
index 0000000..e89449e
--- /dev/null
+++ b/content/posts/2023/02/benchmarking/monte_carlo.cpp
@@ -0,0 +1,24 @@

#include <iostream>
#include <random>

int main() {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<> dis(-1.0, 1.0);

    const size_t num_points = 10000000;
    size_t num_points_inside_circle = 0;

    for (size_t i = 0; i < num_points; ++i) {
        const double x = dis(gen);
        const double y = dis(gen);
        if (x * x + y * y <= 1.0) {
            ++num_points_inside_circle;
        }
    }

    const double pi = 4.0 * num_points_inside_circle / num_points;
    std::cout << "Pi: " << pi << '\n';

    return 0;
}

diff --git a/content/posts/2023/02/benchmarking/monte_carlo_results.md b/content/posts/2023/02/benchmarking/monte_carlo_results.md
new file mode 100644
index 0000000..11f5d95
--- /dev/null
+++ b/content/posts/2023/02/benchmarking/monte_carlo_results.md
@@ -0,0 +1,8 @@

| Command | Mean [ms] | Min [ms] | Max [ms] | Relative |
|:---|---:|---:|---:|---:|
| `cpp -O2 -march=x86-64` | 234.6 ± 0.0 | 234.5 | 234.7 | 1.77 ± 0.01 |
| `cpp -O3 -march=x86-64` | 235.2 ± 1.0 | 234.2 | 237.4 | 1.77 ± 0.01 |
| `cpp -Ofast -march=x86-64` | 230.0 ± 0.4 | 229.7 | 230.9 | 1.73 ± 0.01 |
| `cpp -O2 -march=native` | 174.9 ± 0.4 | 174.2 | 175.5 | 1.32 ± 0.01 |
| `cpp -O3 -march=native` | 142.1 ± 5.4 | 139.5 | 164.4 | 1.07 ± 0.04 |
| `cpp -Ofast -march=native` | 132.9 ± 0.7 | 132.0 | 134.3 | 1.00 |

diff --git a/content/posts/2023/02/benchmarking/simple_micro.cpp b/content/posts/2023/02/benchmarking/simple_micro.cpp
new file mode 100644
index 0000000..72bc422
--- /dev/null
+++ b/content/posts/2023/02/benchmarking/simple_micro.cpp
@@ -0,0 +1,29 @@

#include <chrono>
#include <iostream>
#include "sum.hpp"

using namespace std::chrono;

const int iterations = 100000000;

int main() {
    // Benchmark sum(1, 2)
    auto start = steady_clock::now();
    for (int i = 0; i < iterations; i++) {
        sum(1, 2);
    }
    auto end = steady_clock::now();
    auto duration = duration_cast<microseconds>(end - start).count();
    std::cout << "sum(1, 2) took " << duration << " microseconds" << std::endl;

    // Benchmark sum(10, 20)
    start = steady_clock::now();
    for (int i = 0; i < iterations; i++) {
        sum(10, 20);
    }
    end = steady_clock::now();
    duration = duration_cast<microseconds>(end - start).count();
    std::cout << "sum(10, 20) took " << duration << " microseconds" << std::endl;

    return 0;
}

diff --git a/content/posts/2023/02/benchmarking/sum.hpp b/content/posts/2023/02/benchmarking/sum.hpp
new file mode 100644
index 0000000..22dfe63
--- /dev/null
+++ b/content/posts/2023/02/benchmarking/sum.hpp
@@ -0,0 +1,5 @@

#pragma once

int sum(int a, int b) {
    return a + b;
}
diff --git a/data/authors/philipp.david.yml b/data/authors/philipp.david.yml
new file mode 100644
index 0000000..936c57b
--- /dev/null
+++ b/data/authors/philipp.david.yml
@@ -0,0 +1 @@

name: Philipp David