Written by:
Last updated: 21 July 2022
The reason why multithreading is important today can be explained by the following diagram:
Computers have gotten faster and more complex, but single threaded performance is not improving anywhere near as quickly as multi threaded performance.
In this lecture we’ll cover the broad strokes for the language features that C++ provides, but not how to use multithreading correctly for performance (CS3210) or the theory required to use the lower level language features (CS3211). You should take CS3210 and CS3211 instead! (Thanks Prof. Cristina!)
Reference: OpenMP: tutorials and articles
To use OpenMP,
-fopenmp, similar to how we compiled
with -fsanitize=thread for ThreadSanitizer.
#pragma omp ...
to appropriate places in your code.
There is a lot that can be done with OpenMP. Here, we’ll just demonstrate two very basic examples and leave more complicated use cases to CS3210 :)
#include <omp.h>
#include <cstdio>
// Spawns an OpenMP thread team; every thread in the team runs the
// parallel block once and prints its own ID and the team size.
int main() {
  // Note that blocks { ... } immediately
  // follow parallel directives!
  // In this case, think of `parallel`
  // similar to `for (...)`
  #pragma omp parallel
  {
    // This thread's index within the team (0..TOTAL-1).
    int ID = omp_get_thread_num();
    // Number of threads in the team.
    int TOTAL = omp_get_num_threads();
    // printf rather than std::cout — presumably so each line is
    // emitted in one call and output doesn't interleave; confirm.
    printf("Hello from thread %d/%d\n", ID, TOTAL);
  }
}
$ make -B hello-openmp.clang++.out
clang++ -g -O3 -std=c++20 -fopenmp -Wpedantic -Wall -Wextra -Wconversion -Wno-unused-variable -Wno-unused-but-set-variable hello-openmp.cpp -o hello-openmp.clang++.out
$ ./hello-openmp.clang++.out
Hello from thread 6/8
Hello from thread 1/8
Hello from thread 4/8
Hello from thread 7/8
Hello from thread 2/8
Hello from thread 0/8
Hello from thread 5/8
Hello from thread 3/8
parallel construct
Here will be our starting point: Godbolt link
#include <algorithm>
#include <array>
#include <execution>
#include <iostream>
#include <random>
#include <ranges>
#include <vector>
// A cell is either dead (0) or alive (1).
using CellT = bool;
// Count of live neighbours, always in 0..8.
using NeighboursT = int;
// C++23 feature, we'll just make a UDL for it
size_t operator""_uz(unsigned long long int x);
// Calculates the number of live neighbours (out of 8)
//
// Neighbours of x:
// 1 2 3
// 4 x 5
// 6 7 8
NeighboursT neighbours(auto* board, size_t i, size_t j);
// One board row: 1024 cells plus a 1-cell dead border on each side.
// The alignas(128) member pads each Row — presumably so rows
// written by different threads don't share a cache line (false
// sharing); confirm against the target's cache-line size.
struct Row : std::array<CellT, 1026> {
  alignas(128) struct { } padding; };
// Double-buffered boards: read from one, write the other, swap
// each generation. 1024x1024 playing field + border.
std::array<Row, 1026> board1;
std::array<Row, 1026> board2;
// Runs 2048 generations of Conway's Game of Life on a 1024x1024
// board seeded with a glider, then prints the 3x3 patch at the
// centre of the board.
int main() {
  // Double buffering: read curr_board, write next_board.
  auto* curr_board = &board1;
  auto* next_board = &board2;
  std::array<std::array<CellT, 3>, 3> glider{{
      {{0, 1, 0}},
      {{0, 0, 1}},
      {{1, 1, 1}},
  }};
  // Copy the glider into the top-left of the playing field; the
  // +1 offsets skip the dead border row/column.
  for (size_t i = 0; i < 3; ++i) {
    for (size_t j = 0; j < 3; ++j) {
      (*curr_board)[i + 1][j + 1] = glider[i][j];
    }
  }
  for (size_t iteration = 0; iteration < 2048; ++iteration) {
    // Live cells are indexed 1..1024; 0 and 1025 are the border.
    for (size_t i = 1; i < 1024_uz + 1; ++i) {
      for (size_t j = 1; j < 1024_uz + 1; ++j) {
        NeighboursT num_neighbours = neighbours(curr_board, i, j);
        if (num_neighbours == 2) {
          // Exactly 2 neighbours: the cell keeps its state.
          (*next_board)[i][j] = (*curr_board)[i][j];
        } else {
          // 3 neighbours: alive; anything else: dead.
          (*next_board)[i][j] = num_neighbours == 3;
        }
      }
    }
    std::swap(curr_board, next_board);
  }
  // Dump the 3x3 patch at the board centre (border offset +1).
  for (size_t i = 512; i < 512 + 3; ++i) {
    for (size_t j = 512; j < 512 + 3; ++j) {
      std::cout << (*curr_board)[i + 1][j + 1] << ' ';
    }
    std::cout << '\n';
  }
  return 0;
}
// Stand-in for the C++23 `uz` literal suffix: converts an
// unsigned long long literal into a size_t.
size_t operator""_uz(unsigned long long int x) {
  return static_cast<size_t>(x);
}
// Counts how many of the 8 cells surrounding (i, j) are alive.
// Starts from minus the centre cell, then adds the whole 3x3
// neighbourhood, so only the 8 true neighbours remain counted.
// Callers pass i, j >= 1, so i - 1 / j - 1 never wrap.
NeighboursT neighbours(auto* board, size_t i, size_t j) {
  NeighboursT live = -static_cast<NeighboursT>((*board)[i][j]);
  for (size_t di = 0; di < 3; ++di) {
    for (size_t dj = 0; dj < 3; ++dj) {
      live += (*board)[i - 1 + di][j - 1 + dj];
    }
  }
  return live;
}
The main loop we will be focusing on is the following:
// The sequential reference version: 2048 generations, each cell
// computed from the previous generation's board, then the two
// boards are swapped (double buffering).
for (size_t iteration = 0; iteration < 2048; ++iteration) {
  for (size_t i = 1; i < 1024_uz + 1; ++i) {
    for (size_t j = 1; j < 1024_uz + 1; ++j) {
      NeighboursT num_neighbours = neighbours(curr_board, i, j);
      if (num_neighbours == 2) {
        // Exactly 2 live neighbours: state is unchanged.
        (*next_board)[i][j] = (*curr_board)[i][j];
      } else {
        // 3 live neighbours: alive; otherwise: dead.
        (*next_board)[i][j] = num_neighbours == 3;
      }
    }
  }
  std::swap(curr_board, next_board);
}
In this situation, we have what’s known as an embarrassingly parallel problem. While each iteration has to be done sequentially, notice that the update function on each individual cell can be carried out in any order and in parallel, and no communication needs to happen between the cells.
The easiest way to make this parallel is to use OpenMP, which helps us spawn multiple threads with 0 effort.
// Manual work-splitting: every OpenMP thread computes a contiguous
// band of rows chosen from its own thread ID.
for (size_t iteration = 0; iteration < 2048; ++iteration) {
  #pragma omp parallel
  {
    size_t ID = (size_t) omp_get_thread_num();
    size_t TOTAL = (size_t) omp_get_num_threads();
    // Ceiling division: all 1024 rows are covered even when
    // TOTAL does not divide 1024 evenly.
    size_t rows_per_thread = (1024_uz + TOTAL - 1) / TOTAL;
    // Using the thread ID,
    // only compute on the rows we're responsible for
    for (size_t i = ID * rows_per_thread + 1;
         // min() clamps the last band to the board edge.
         i < std::min(1024_uz, (ID + 1) * rows_per_thread) + 1;
         ++i) {
      for (size_t j = 1; j < 1024_uz + 1; ++j) {
        NeighboursT num_neighbours = neighbours(curr_board, i, j);
        if (num_neighbours == 2) {
          (*next_board)[i][j] = (*curr_board)[i][j];
        } else {
          (*next_board)[i][j] = num_neighbours == 3;
        }
      }
    }
  }
  // After the parallel region (and its implicit barrier), every
  // thread has finished writing, so the swap is safe.
  std::swap(curr_board, next_board);
}
parallel construct
But with OpenMP, there’s actually an even easier way, which is to let OpenMP do the splitting of the work for us.
Using a parallel for construct, we can tell OpenMP to run the
loop but split up each iteration to be run by a different thread.
// `parallel for`: OpenMP splits the iterations of the row loop
// across the thread team automatically — no manual ID math.
for (size_t iteration = 0; iteration < 2048; ++iteration) {
  #pragma omp parallel for
  for (size_t i = 1; i < 1024_uz + 1; ++i) {
    for (size_t j = 1; j < 1024_uz + 1; ++j) {
      NeighboursT num_neighbours = neighbours(curr_board, i, j);
      if (num_neighbours == 2) {
        (*next_board)[i][j] = (*curr_board)[i][j];
      } else {
        (*next_board)[i][j] = num_neighbours == 3;
      }
    }
  }
  std::swap(curr_board, next_board);
}
parallel for construct
By default, it will pick a sensible way to split up the work. We can intervene and add additional arguments to tell it how to schedule the tasks, like so:
// Same as `parallel for`, but with an explicit schedule:
// schedule(static, 1) hands rows out round-robin, one at a time,
// instead of letting OpenMP pick the chunking.
for (size_t iteration = 0; iteration < 2048; ++iteration) {
  #pragma omp parallel for schedule(static, 1)
  for (size_t i = 1; i < 1024_uz + 1; ++i) {
    for (size_t j = 1; j < 1024_uz + 1; ++j) {
      NeighboursT num_neighbours = neighbours(curr_board, i, j);
      if (num_neighbours == 2) {
        (*next_board)[i][j] = (*curr_board)[i][j];
      } else {
        (*next_board)[i][j] = num_neighbours == 3;
      }
    }
  }
  std::swap(curr_board, next_board);
}
Let’s benchmark!
$ hyperfine ./*.out --warmup 2
Benchmark 1: ./demo-loop.clang++.out
Time (mean ± σ): 719.9 ms ± 9.1 ms [User: 716.0 ms, System: 1.9 ms]
Range (min … max): 711.5 ms … 743.3 ms 10 runs
Benchmark 2: ./demo-loop.g++.out
Time (mean ± σ): 3.061 s ± 0.017 s [User: 3.054 s, System: 0.003 s]
Range (min … max): 3.040 s … 3.090 s 10 runs
Benchmark 3: ./demo-openmp-parallel-for.clang++.out
Time (mean ± σ): 236.3 ms ± 34.3 ms [User: 1777.2 ms, System: 9.0 ms]
Range (min … max): 207.0 ms … 316.3 ms 11 runs
Benchmark 4: ./demo-openmp-parallel-for.g++.out
Time (mean ± σ): 2.151 s ± 0.135 s [User: 16.176 s, System: 0.033 s]
Range (min … max): 1.833 s … 2.349 s 10 runs
Benchmark 5: ./demo-openmp-parallel.clang++.out
Time (mean ± σ): 304.5 ms ± 44.2 ms [User: 2276.1 ms, System: 11.8 ms]
Range (min … max): 270.0 ms … 415.4 ms 10 runs
Benchmark 6: ./demo-openmp-parallel.g++.out
Time (mean ± σ): 1.158 s ± 0.019 s [User: 8.830 s, System: 0.009 s]
Range (min … max): 1.135 s … 1.195 s 10 runs
Benchmark 7: ./demo-openmp-schedule.clang++.out
Time (mean ± σ): 337.8 ms ± 23.6 ms [User: 2513.8 ms, System: 18.1 ms]
Range (min … max): 298.5 ms … 368.5 ms 10 runs
Benchmark 8: ./demo-openmp-schedule.g++.out
Time (mean ± σ): 1.210 s ± 0.036 s [User: 9.185 s, System: 0.011 s]
Range (min … max): 1.172 s … 1.287 s 10 runs
Summary
'./demo-openmp-parallel-for.clang++.out' ran
1.29 ± 0.26 times faster than './demo-openmp-parallel.clang++.out'
1.43 ± 0.23 times faster than './demo-openmp-schedule.clang++.out'
3.05 ± 0.44 times faster than './demo-loop.clang++.out'
4.90 ± 0.72 times faster than './demo-openmp-parallel.g++.out'
5.12 ± 0.76 times faster than './demo-openmp-schedule.g++.out'
9.10 ± 1.44 times faster than './demo-openmp-parallel-for.g++.out'
12.95 ± 1.88 times faster than './demo-loop.g++.out'
Firstly, the good news is that the parallel versions run faster than not parallelizing at all.
Secondly, we can see that clang++ does a much better job at this benchmark
than g++, but we also see that OpenMP’s
parallel for construct on clang++ was superior to the other
methods we’ve tried, such as using parallel construct and
then manually splitting up the tasks, or using an inferior scheduling
strategy.
<algorithm> Execution policies
Reference: cppreference Execution policies
Purists may not agree with using OpenMP as it is not a core C++ feature,
but rather just a widely supported compiler extension. That’s why the
syntax to use OpenMP requires using spooky
#pragma directives.
C++ does in fact also support built-in parallelism (and also other things such as vectorisation). This is still not guaranteed by the language, but is a language feature that allows standard libraries to implement parallel algorithms or compilers to enable certain optimizations.
This is done through tag parameters called execution policies that you can add to most STL algorithms that tells the standard library / compiler to allow parallelism / vectorisation. There are 4 execution policies:
sequenced_policy: (Default) Everything should run
sequentially, if there are callbacks, then the body of each callback is
run in sequence, with no parallel calls or unsequenced calls.
parallel_policy: Parallel algorithms may be used, and if
there are callbacks, these may be run in parallel.
unsequenced_policy: Vectorisation can be used, and if there
are callbacks, these may be run unsequenced. Basically similar
requirements to parallel policy, but only one thread is used.
parallel_unsequenced_policy: Both of the above,
parallelisation and vectorisation can both be used.
Just like how std::piecewise_construct_t has a
std::piecewise_construct object that you can pass in, the
objects we use here are
std::execution::{seq,par,unseq,par_unseq} respectively.
The classic example to demonstrate this is with the algorithm
std::sort.
#include <algorithm>
#include <iostream>
#include <random>
#include <vector>
// C++23 feature, we'll just make a UDL for it
size_t operator""_uz(unsigned long long int x);
// Fills a vector with 2^24 fixed-seed pseudo-random numbers and
// sorts it, printing the smallest element so the work can't be
// optimised away.
int main() {
  // We'll make a vector containing a bunch of random numbers
  // Use a fixed seed so it's consistent across benchmarks
  std::mt19937_64 rng{420};
  // Uniform over [0, 2^63] inclusive.
  std::uniform_int_distribution dist{0_uz, 1_uz << 63};
  std::vector<size_t> nums;
  const size_t NUM_NUMS = 1 << 24;
  // Reserve up front: one allocation instead of repeated growth.
  nums.reserve(NUM_NUMS);
  for (size_t i = 0; i < NUM_NUMS; ++i) {
    nums.push_back(dist(rng));
  }
  // Then sort it
  std::sort(nums.begin(), nums.end());
  std::cout << "smallest number is: " << nums[0] << std::endl;
}
// Stand-in for the C++23 `uz` literal suffix: converts an
// unsigned long long literal into a size_t.
size_t operator""_uz(unsigned long long int x) {
  return static_cast<size_t>(x);
}
We can add the execution policy tag as the first parameter to
std::sort:
std::sort(std::execution::par_unseq, nums.begin(), nums.end());
To compile this, you might need to link a library like TBB or provide a
flag like -fopenmp. Here, we added -ltbb to our
linker flags to link with TBB.
Let’s benchmark!
$ hyperfine ./*.out --warmup 5
Benchmark 1: ./sorting-par.clang++.out
Time (mean ± σ): 583.3 ms ± 26.3 ms [User: 2378.1 ms, System: 59.8 ms]
Range (min … max): 555.9 ms … 638.9 ms 10 runs
Benchmark 2: ./sorting-par.g++.out
Time (mean ± σ): 650.6 ms ± 19.0 ms [User: 2302.9 ms, System: 54.4 ms]
Range (min … max): 620.8 ms … 677.2 ms 10 runs
Benchmark 3: ./sorting-parunseq.clang++.out
Time (mean ± σ): 620.6 ms ± 14.8 ms [User: 2630.9 ms, System: 56.0 ms]
Range (min … max): 599.8 ms … 646.6 ms 10 runs
Benchmark 4: ./sorting-parunseq.g++.out
Time (mean ± σ): 664.5 ms ± 13.6 ms [User: 2337.7 ms, System: 52.6 ms]
Range (min … max): 644.8 ms … 680.3 ms 10 runs
Benchmark 5: ./sorting-unseq.clang++.out
Time (mean ± σ): 1.365 s ± 0.014 s [User: 1.348 s, System: 0.011 s]
Range (min … max): 1.352 s … 1.401 s 10 runs
Benchmark 6: ./sorting-unseq.g++.out
Time (mean ± σ): 1.488 s ± 0.039 s [User: 1.466 s, System: 0.015 s]
Range (min … max): 1.431 s … 1.542 s 10 runs
Benchmark 7: ./sorting.clang++.out
Time (mean ± σ): 1.353 s ± 0.008 s [User: 1.334 s, System: 0.013 s]
Range (min … max): 1.338 s … 1.365 s 10 runs
Benchmark 8: ./sorting.g++.out
Time (mean ± σ): 1.423 s ± 0.008 s [User: 1.402 s, System: 0.018 s]
Range (min … max): 1.411 s … 1.438 s 10 runs
Summary
'./sorting-par.clang++.out' ran
1.06 ± 0.05 times faster than './sorting-parunseq.clang++.out'
1.12 ± 0.06 times faster than './sorting-par.g++.out'
1.14 ± 0.06 times faster than './sorting-parunseq.g++.out'
2.32 ± 0.11 times faster than './sorting.clang++.out'
2.34 ± 0.11 times faster than './sorting-unseq.clang++.out'
2.44 ± 0.11 times faster than './sorting.g++.out'
2.55 ± 0.13 times faster than './sorting-unseq.g++.out'
std::sort
Seems like this time, par execution policy was the fastest,
closely tied to par_unseq. The single-threaded versions of
std::sort both performed roughly 2 times slower than the
parallel ones, with clang++ being a little faster than g++ across all
benchmarks.
Using the std::for_each algorithm, we can effectively convert
for loops to an algorithm call, which means we can now use execution
policies with it!
In order to do this however, we need to give a pair of iterators to
std::for_each. In C++20, we should be able to do this with
std::ranges::iota_view, but standard library support is not
complete for std::ranges, so instead I implemented my own
makeshift version called ValIter, which simply wraps a
size_t in a way that looks like an iterator, and returns the
size_t when dereferenced.
for_each-ified game of life code snippet
#include <algorithm>
#include <array>
#include <concepts>
#include <execution>
#include <iostream>
#include <random>
#include <ranges>
#include <vector>
// A cell is either dead (0) or alive (1).
using CellT = bool;
// Count of live neighbours, always in 0..8.
using NeighboursT = int;
// See std::ranges::iota_view
// This is an iterator that returns T
// (dereferencing yields the wrapped integer; each increment
// advances the value by Stride).
template <typename T = size_t, T Stride = 1>
requires std::integral<T>
struct ValIter;
// C++23 feature, we'll just make a UDL for it
size_t operator""_uz(unsigned long long int x);
// Calculates the number of live neighbours (out of 8)
//
// Neighbours of x:
// 1 2 3
// 4 x 5
// 6 7 8
NeighboursT neighbours(auto* board, size_t i, size_t j);
// One board row padded via alignas(128) — presumably so rows
// written by different threads don't share a cache line; confirm.
struct Row : std::array<CellT, 1026> {
  alignas(128) struct { } padding; };
// Double-buffered 1024x1024 boards with a 1-cell dead border.
std::array<Row, 1026> board1;
std::array<Row, 1026> board2;
// The Game of Life main, with both loops expressed as
// std::for_each over ValIter so an execution policy could be
// added. Templated only so ValIter becomes a dependent name.
template <typename st = size_t>
int my_main() {
  // Magic: Make ValIter a dependent name so it can compile
  // even though ValIter isn't defined yet.
  using ValIter = ValIter<st>;
  // __restrict: promise the two board pointers never alias,
  // which lets the optimiser vectorise more aggressively.
  auto* __restrict curr_board = &board1;
  auto* __restrict next_board = &board2;
  std::array<std::array<CellT, 3>, 3> glider{{
      {{0, 1, 0}},
      {{0, 0, 1}},
      {{1, 1, 1}},
  }};
  // Seed the glider at the top-left; +1 skips the dead border.
  for (size_t i = 0; i < 3; ++i) {
    for (size_t j = 0; j < 3; ++j) {
      (*curr_board)[i + 1][j + 1] = glider[i][j];
    }
  }
  // Compile-time check that ValIter satisfies the concept that
  // the parallel algorithms require.
  static_assert(std::random_access_iterator<ValIter>);
  for (size_t iteration = 0; iteration < 2048; ++iteration) {
    // Note that we capture by value here because we only
    // capture pointers (curr_board, next_board) or indices (i).
    // It would be silly to capture a reference to a pointer,
    // as that would effectively be a double pointer,
    // and in fact, we get much worse performance on clang++
    // when capturing by reference.
    //
    // Here, capturing by value is really cheap,
    // so there's no downside to doing so.
    std::for_each( //
        ValIter{1_uz},
        ValIter{1024_uz + 1},
        [=](size_t i) {
          std::for_each( //
              ValIter{1_uz},
              ValIter{1024_uz + 1},
              [=](size_t j) {
                NeighboursT num_neighbours = neighbours(curr_board, i, j);
                if (num_neighbours == 2) {
                  // 2 neighbours: unchanged; 3: alive; else: dead.
                  (*next_board)[i][j] = (*curr_board)[i][j];
                } else {
                  (*next_board)[i][j] = num_neighbours == 3;
                }
              });
        });
    std::swap(curr_board, next_board);
  }
  // Dump the 3x3 patch at the board centre.
  for (size_t i = 512; i < 512 + 3; ++i) {
    for (size_t j = 512; j < 512 + 3; ++j) {
      std::cout << (*curr_board)[i + 1][j + 1] << ' ';
    }
    std::cout << '\n';
  }
  return 0;
}
// Random-access iterator over the integers themselves: *it yields
// the wrapped value, and ++it advances it by Stride. A stand-in
// for std::ranges::iota_view's iterator.
template <typename T, T Stride>
requires std::integral<T>
struct ValIter {
  // Signed counterpart of T; serves as the difference type.
  using ST = std::make_signed_t<T>;
  T value;
  using iterator_category = std::random_access_iterator_tag;
  using value_type = T;
  // Yields the value itself (a prvalue, not a reference).
  T operator*() const {
    return value;
  }
  // Fixed: scale the offset by Stride so that it[n] == *(it + n),
  // as the random-access iterator contract requires. (Previously
  // the offset was unscaled, which was only correct for the
  // Stride == 1 instantiations.)
  T operator[](ST inc) const {
    return value + static_cast<T>(inc) * Stride;
  }
  ValIter& operator++() {
    value += Stride;
    return *this;
  }
  // Post-increment: return the pre-step snapshot.
  ValIter operator++(int) {
    ValIter it{value};
    value += Stride;
    return it;
  }
  ValIter& operator--() {
    value -= Stride;
    return *this;
  }
  ValIter operator--(int) {
    ValIter it{value};
    value -= Stride;
    return it;
  }
  ValIter& operator+=(ST i) {
    value += i * Stride;
    return *this;
  }
  ValIter& operator-=(ST i) {
    value -= i * Stride;
    return *this;
  }
  // Distance between two iterators, measured in strides.
  ST operator-(ValIter other) const {
    return static_cast<ST>((value - other.value) / Stride);
  }
  friend ValIter operator-(ValIter i, ST dec) {
    return ValIter{i.value - static_cast<T>(dec) * Stride};
  }
  friend ValIter operator+(ValIter i, ST inc) {
    return ValIter{i.value + static_cast<T>(inc) * Stride};
  }
  friend ValIter operator+(ST inc, ValIter i) {
    return ValIter{i.value + static_cast<T>(inc) * Stride};
  }
  // Ordering/equality compare the wrapped values directly.
  friend std::strong_ordering operator<=>(ValIter lhs,
                                          ValIter rhs) = default;
  friend bool operator==(ValIter lhs, ValIter rhs) = default;
};
// Stand-in for the C++23 `uz` literal suffix: converts an
// unsigned long long literal into a size_t.
size_t operator""_uz(unsigned long long int x) {
  return static_cast<size_t>(x);
}
// Calculates the number of live neighbours (out of 8): sums the
// whole 3x3 block around (i, j), then subtracts the centre cell.
// Callers pass i, j >= 1, so i - 1 / j - 1 never wrap.
NeighboursT neighbours(auto* board, size_t i, size_t j) {
  NeighboursT num_neighbours = 0;
  for (size_t ni = i - 1; ni <= i + 1; ++ni) {
    for (size_t nj = j - 1; nj <= j + 1; ++nj) {
      num_neighbours += (*board)[ni][nj];
    }
  }
  // Remove the centre cell: it isn't its own neighbour.
  num_neighbours -= (*board)[i][j];
  return num_neighbours;
}
// Entry point: defers to the templated my_main so that ValIter
// could be named (as a dependent name) before its definition.
int main() {
  return my_main();
}
// The same main loop in isolation: nested std::for_each over
// ValIter ranges, ready for an execution-policy argument.
for (size_t iteration = 0; iteration < 2048; ++iteration) {
  // Note that we capture by value here because we only
  // capture pointers (curr_board, next_board) or indices (i).
  // It would be silly to capture a reference to a pointer,
  // as that would effectively be a double pointer,
  // and in fact, we get much worse performance on clang++
  // when capturing by reference.
  //
  // Here, capturing by value is really cheap,
  // so there's no downside to doing so.
  std::for_each( //
      ValIter{1_uz},
      ValIter{1024_uz + 1},
      [=](size_t i) {
        std::for_each( //
            ValIter{1_uz},
            ValIter{1024_uz + 1},
            [=](size_t j) {
              NeighboursT num_neighbours = neighbours(curr_board, i, j);
              if (num_neighbours == 2) {
                (*next_board)[i][j] = (*curr_board)[i][j];
              } else {
                (*next_board)[i][j] = num_neighbours == 3;
              }
            });
      });
  std::swap(curr_board, next_board);
}
for_each form
Let’s benchmark!
$ hyperfine ./*.clang++.out --warmup 5
Benchmark 1: ./demo-foreach-par.clang++.out
Time (mean ± σ): 224.5 ms ± 11.6 ms [User: 1493.9 ms, System: 44.3 ms]
Range (min … max): 211.0 ms … 243.4 ms 13 runs
Benchmark 2: ./demo-foreach-parunseq.clang++.out
Time (mean ± σ): 223.9 ms ± 8.5 ms [User: 1520.4 ms, System: 31.6 ms]
Range (min … max): 216.8 ms … 246.2 ms 13 runs
Benchmark 3: ./demo-foreach-unseq.clang++.out
Time (mean ± σ): 712.9 ms ± 5.7 ms [User: 710.4 ms, System: 1.5 ms]
Range (min … max): 706.3 ms … 725.3 ms 10 runs
Benchmark 4: ./demo-foreach.clang++.out
Time (mean ± σ): 709.8 ms ± 5.2 ms [User: 707.4 ms, System: 1.5 ms]
Range (min … max): 705.7 ms … 723.4 ms 10 runs
Benchmark 5: ./demo-loop.clang++.out
Time (mean ± σ): 707.9 ms ± 5.9 ms [User: 704.6 ms, System: 2.6 ms]
Range (min … max): 702.6 ms … 723.4 ms 10 runs
Summary
'./demo-foreach-parunseq.clang++.out' ran
1.00 ± 0.06 times faster than './demo-foreach-par.clang++.out'
3.16 ± 0.12 times faster than './demo-loop.clang++.out'
3.17 ± 0.12 times faster than './demo-foreach.clang++.out'
3.18 ± 0.12 times faster than './demo-foreach-unseq.clang++.out'
I omitted the g++ versions here because they perform much worse. Here, we can see that the single threaded versions perform roughly 3 times slower than the parallel versions.
<thread> for multi-threading
The most manual way of achieving parallelism that we will show today is using threads directly.
To use the <thread> library, we need to link with
pthreads, the backend used for C++ threads on Linux. We do that by
compiling and linking with -pthread.
Note that simply linking with -lpthread is not sufficient. We
need to both link with the pthreads library, but also tell the compiler to
properly interface with the library, and this is done with
-pthread. This is important due to the C++ memory model,
which we briefly mention later and properly explain in CS3211.
#include <cstdio>
#include <thread>
#include <vector>
// Launches 8 threads, each printing its index and the thread
// count, then joins them all before returning.
int main() {
  const size_t TOTAL = 8;
  std::vector<std::thread> threads;
  threads.reserve(TOTAL);
  // Spawn one thread per ID; the lambda captures by value since a
  // reference to the loop variable would dangle.
  size_t ID = 0;
  while (ID < TOTAL) {
    threads.emplace_back([ID, TOTAL]() { //
      printf("Hello from thread %zu/%zu\n", ID, TOTAL);
    });
    ++ID;
  }
  // Every std::thread must be join()ed (or detach()ed) before it
  // is destroyed, otherwise the program crashes — so wait for all
  // of them here.
  for (auto& thr : threads) {
    thr.join();
  }
}
$ make -B hello-thread.clang++.out
clang++ -g -O3 -std=c++20 -pthread -Wpedantic -Wall -Wextra -Wconversion -Wno-unused-variable -Wno-unused-but-set-variable hello-thread.cpp -o hello-thread.clang++.out
$ ./hello-thread.clang++.out
Hello from thread 0/8
Hello from thread 2/8
Hello from thread 3/8
Hello from thread 4/8
Hello from thread 5/8
Hello from thread 6/8
Hello from thread 7/8
Hello from thread 1/8
We can use threads to parallelise game of life. However, starting and stopping threads every iteration is very slow, due to the relatively large cost of creating threads.
What we can use instead is a thread pool. We create a number of threads (say, 8 threads) at the start of the program, and communicate with these 8 threads to get them to do the calculations for us.
We will get each thread to wait until jobs show up in a job queue, and when one shows up, take it and execute the job. Then all our main loop needs to do is divide the work of each iteration into parts, submit these jobs to the job queue, and wait until they’re done.
Here is a list of references if you would like to understand how to write your own task pool:
std::promise,
std::future: Similar to the equivalent classes in Java.
std::packaged_task: Basically std::function but combined with a
std::promise.
std::mutex,
std::condition_variable: Required in order to synchronise the job queue.
#include <algorithm>
#include <array>
#include <condition_variable>
#include <execution>
#include <future>
#include <iostream>
#include <optional>
#include <queue>
#include <random>
#include <ranges>
#include <thread>
#include <vector>
// A cell is either dead (0) or alive (1).
using CellT = bool;
// Count of live neighbours, always in 0..8.
using NeighboursT = int;
// C++23 feature, we'll just make a UDL for it
size_t operator""_uz(unsigned long long int x);
// Calculates the number of live neighbours (out of 8)
//
// Neighbours of x:
// 1 2 3
// 4 x 5
// 6 7 8
NeighboursT neighbours(auto* board, size_t i, size_t j);
// One board row padded via alignas(128) — presumably so rows
// written by different threads don't share a cache line; confirm.
struct Row : std::array<CellT, 1026> {
  alignas(128) struct { } padding; };
// Double-buffered 1024x1024 boards with a 1-cell dead border.
std::array<Row, 1026> board1;
std::array<Row, 1026> board2;
// A multi-producer multi-consumer blocking queue ("channel").
// Consumers block in pop() until an element arrives or the
// channel is closed; producers push()/push_range(), then close().
template <typename T>
struct Channel {
  std::mutex mut{};
  std::condition_variable cond{};
  std::queue<T> elems{};
  // Once set, pop() returns nullopt when the queue is drained,
  // and any further push/close throws.
  bool closed = false;
  // Blocks until an element is available or the channel is
  // closed; returns nullopt only when closed and drained.
  std::optional<T> pop() {
    std::unique_lock lock{mut};
    cond.wait(lock, [this]() { return !elems.empty() || closed; });
    if (!elems.empty()) {
      // We exited the wait due to having an element to pop
      T elem = std::move(elems.front());
      elems.pop();
      if (!elems.empty()) {
        // NOTE(review): early unlock only when more elements
        // remain — presumably so the return below runs outside
        // the critical section; otherwise ~unique_lock releases
        // the mutex. Confirm this asymmetry is intentional.
        lock.unlock();
      }
      return elem;
    }
    // We exited the wait due to closed being true
    return std::nullopt;
  }
  // Enqueues one element and wakes all waiters.
  // Throws std::runtime_error if the channel is already closed.
  void push(T elem) {
    {
      std::scoped_lock lock{mut};
      if (closed) {
        throw std::runtime_error("Push into closed channel is a panic");
      }
      elems.push(std::move(elem));
    }
    // Notify after releasing the lock so woken threads can
    // acquire it immediately.
    cond.notify_all();
  }
  // Moves [begin, end) into the queue under a single lock
  // acquisition, then wakes all waiters. Throws if closed.
  template <typename Iter>
  void push_range(Iter begin, Iter end) {
    {
      std::scoped_lock lock{mut};
      if (closed) {
        throw std::runtime_error("Push into closed channel is a panic");
      }
      for (; begin != end; ++begin) {
        elems.push(std::move(*begin));
      }
    }
    cond.notify_all();
  }
  // Marks the channel closed and wakes all waiters so blocked
  // pop() calls can observe it. Throws if already closed.
  void close() {
    {
      std::scoped_lock lock{mut};
      if (closed) {
        throw std::runtime_error("Closing closed channel is a panic");
      }
      closed = true;
    }
    cond.notify_all();
  }
};
// Owns one thread that drains tasks from a shared channel until
// the channel is closed and empty.
struct Worker {
  // Non-owning: the Pool keeps the channel alive.
  Channel<std::packaged_task<void()>>& chan;
  // Starts immediately on construction. The lambda captures
  // `this`, so a Worker must not be moved after construction.
  std::thread worker_thread{[this]() {
    while (true) {
      if (auto task = chan.pop(); !task) {
        // Channel closed and drained: exit the worker thread.
        return;
      } else {
        (*task)();
      }
    }
  }};
  ~Worker() {
    // Blocks until the thread observes the closed channel.
    worker_thread.join();
  }
};
// A fixed-size (8-thread) pool fed by a single shared channel.
struct Pool {
  Channel<std::packaged_task<void()>> chan;
  // All eight workers share the one channel by reference.
  std::array<Worker, 8> workers{
      {{chan}, {chan}, {chan}, {chan}, {chan}, {chan}, {chan}, {chan}} //
  };
  void submit_task(std::packaged_task<void()> task) {
    chan.push(std::move(task));
  }
  // Submits [begin, end) under a single lock acquisition.
  template <typename Iter>
  void submit_task_range(Iter begin, Iter end) {
    chan.push_range(begin, end);
  }
  ~Pool() {
    // Close first; members are destroyed in reverse declaration
    // order, so the workers join (after draining remaining
    // tasks) before `chan` itself is destroyed.
    chan.close();
  }
};
// Global pool: worker threads start before main() runs and are
// joined during static destruction at exit.
Pool pool;
// Game of Life again, but each generation is split into 8 row
// bands submitted as jobs to the global thread pool.
int main() {
  auto* curr_board = &board1;
  auto* next_board = &board2;
  std::array<std::array<CellT, 3>, 3> glider{{
      {{0, 1, 0}},
      {{0, 0, 1}},
      {{1, 1, 1}},
  }};
  // Seed the glider at the top-left; +1 skips the dead border.
  for (size_t i = 0; i < 3; ++i) {
    for (size_t j = 0; j < 3; ++j) {
      (*curr_board)[i + 1][j + 1] = glider[i][j];
    }
  }
  for (size_t iteration = 0; iteration < 2048; ++iteration) {
    std::vector<std::packaged_task<void()>> tasks;
    // Futures let the main thread block until every job is done.
    std::vector<std::future<void>> futures;
    tasks.reserve(8);
    futures.reserve(8);
    for (size_t job_id = 0; job_id < 8; ++job_id) {
      // Job covers rows [job_id*128 + 1, (job_id+1)*128].
      // (Relies on 8 dividing 1024 exactly.)
      // NOTE(review): `mutable` looks unnecessary — the body does
      // not modify its by-value captures.
      tasks.emplace_back([=]() mutable {
        for (size_t i = job_id * (1024_uz / 8) + 1;
             i < (job_id + 1) * (1024_uz / 8) + 1;
             ++i) {
          for (size_t j = 1; j < 1024_uz + 1; ++j) {
            NeighboursT num_neighbours = neighbours(curr_board, i, j);
            if (num_neighbours == 2) {
              (*next_board)[i][j] = (*curr_board)[i][j];
            } else {
              (*next_board)[i][j] = num_neighbours == 3;
            }
          }
        }
      });
      // Grab the future before the task is moved into the pool.
      futures.push_back(tasks.back().get_future());
    }
    // One lock acquisition for all 8 jobs.
    pool.submit_task_range(tasks.begin(), tasks.end());
    // Barrier: wait for every job before swapping the boards.
    for (const auto& f : futures) {
      f.wait();
    }
    std::swap(curr_board, next_board);
  }
  // Dump the 3x3 patch at the board centre.
  for (size_t i = 512; i < 512 + 3; ++i) {
    for (size_t j = 512; j < 512 + 3; ++j) {
      std::cout << (*curr_board)[i + 1][j + 1] << ' ';
    }
    std::cout << '\n';
  }
  return 0;
}
// Stand-in for the C++23 `uz` literal suffix: converts an
// unsigned long long literal into a size_t.
size_t operator""_uz(unsigned long long int x) {
  return static_cast<size_t>(x);
}
// Calculates the number of live neighbours (out of 8): sums the
// whole 3x3 block around (i, j), then subtracts the centre cell.
// Callers pass i, j >= 1, so i - 1 / j - 1 never wrap.
NeighboursT neighbours(auto* board, size_t i, size_t j) {
  NeighboursT num_neighbours = 0;
  for (size_t ni = i - 1; ni <= i + 1; ++ni) {
    for (size_t nj = j - 1; nj <= j + 1; ++nj) {
      num_neighbours += (*board)[ni][nj];
    }
  }
  // Remove the centre cell: it isn't its own neighbour.
  num_neighbours -= (*board)[i][j];
  return num_neighbours;
}
Let’s benchmark!
$ hyperfine ./*.clang++.out --warmup 5
Benchmark 1: ./demo-loop.clang++.out
Time (mean ± σ): 680.7 ms ± 6.7 ms [User: 678.3 ms, System: 1.6 ms]
Range (min … max): 671.3 ms … 691.1 ms 10 runs
Benchmark 2: ./demo-thread.clang++.out
Time (mean ± σ): 225.3 ms ± 12.7 ms [User: 1281.0 ms, System: 57.9 ms]
Range (min … max): 212.9 ms … 253.9 ms 12 runs
Summary
'./demo-thread.clang++.out' ran
3.02 ± 0.17 times faster than './demo-loop.clang++.out'
We got roughly a 3 times speedup, similar to what we got with OpenMP.
References:
We showed how execution policies allow us to select what algorithm
implementation to use. In previous lectures, we also showed use of
function objects to change the behaviour of stuff like
std::map, or std::sort.
Similarly, allocators can be used with the STL containers to change the allocation and deallocation algorithms used by such containers.
#include <benchmark/benchmark.h>
#include <deque>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <numeric>
#include <ranges>
#include <sstream>
#include <stdexcept>
#include <vector>
// CRTP base providing evaluation and stream (de)serialisation for
// a Lisp-style prefix arithmetic expression tree.
//
// Derived must provide three data members:
//   Op op        — which operation this node performs
//   args         — a sequence of Derived sub-expressions
//   double value — the literal value when op == VALUE
// (The container type of `args` is Derived's choice; this is how
// the allocator variants below swap allocators without changing
// any of the logic here.)
template <typename Derived>
struct ExprCommon {
  enum Op {
    VALUE,   // literal number
    PLUS,    // (+ a b ...)
    MINUS,   // (- a): negation; (- a b ...): subtraction
    TIMES,   // (* a b ...)
    DIVIDE,  // (/ a): reciprocal; (/ a b ...): division
  };
  // Printable character for each Op, indexed by the enum value.
  static constexpr char opcodes[] = {
      '?',
      '+',
      '-',
      '*',
      '/',
  };
  // Recursively evaluates the tree. Throws std::runtime_error on
  // malformed trees (empty '-'/'/' argument lists, bad opcode).
  double evaluate() const {
    const Derived& self = static_cast<const Derived&>(*this);
    switch (self.op) {
      case VALUE: {
        return self.value;
      }
      case PLUS: {
        // Sum of all arguments; empty sum is 0.
        double res = 0.0;
        for (const Derived& e : self.args) {
          res += e.evaluate();
        }
        return res;
      }
      case MINUS: {
        if (self.args.size() == 0) {
          throw std::runtime_error("Bad number of arguments to -");
        }
        if (self.args.size() == 1) {
          // Unary -
          // aka negation
          return -self.args[0].evaluate();
        }
        // First arg minus the remaining self.args
        auto it = self.args.cbegin();
        double res = it->evaluate();
        ++it;
        for (; it != self.args.cend(); ++it) {
          res -= it->evaluate();
        }
        return res;
      }
      case TIMES: {
        // Product of all arguments; empty product is 1.
        double res = 1.0;
        for (const Derived& e : self.args) {
          res *= e.evaluate();
        }
        return res;
      }
      case DIVIDE: {
        if (self.args.size() == 0) {
          throw std::runtime_error("Bad number of arguments to /");
        }
        if (self.args.size() == 1) {
          // Unary /
          // aka reciprocal
          return 1.0 / self.args[0].evaluate();
        }
        // First arg divide the remaining self.args
        auto it = self.args.cbegin();
        double res = it->evaluate();
        ++it;
        for (; it != self.args.cend(); ++it) {
          res /= it->evaluate();
        }
        return res;
      }
      default:
        throw std::runtime_error("Invalid opcode in evaluate");
    }
  }
  // Prints in the same prefix form that operator>> parses.
  friend std::ostream& operator<<(std::ostream& os, const Derived& self) {
    if (self.op == self.VALUE) {
      return os << self.value;
    }
    os << '(' << opcodes[self.op];
    // Fixed: was `self.self.args`, which fails to compile as soon
    // as this operator is instantiated (Derived has no member
    // named `self`).
    for (const Derived& e : self.args) {
      os << ' ' << e;
    }
    os << ')';
    return os;
  }
  // Parses either a bare number or a parenthesised prefix
  // expression `(op arg arg ...)`. Throws on an unknown opcode.
  friend std::istream& operator>>(std::istream& is, Derived& self) {
    is >> std::ws;
    if (is.peek() != '(') {
      // Not a list: must be a numeric literal.
      self.op = self.VALUE;
      is >> self.value;
      return is;
    }
    // Need to parse complex expression
    is.get();
    char op;
    is >> std::ws >> op;
    switch (op) {
      case '+':
        self.op = PLUS;
        break;
      case '-':
        self.op = MINUS;
        break;
      case '*':
        self.op = TIMES;
        break;
      case '/':
        self.op = DIVIDE;
        break;
      default:
        throw std::runtime_error("Invalid opcode");
    }
    // Recursively read arguments until the closing ')'.
    while (true) {
      is >> std::ws;
      if (is.peek() == ')') {
        is.get();
        return is;
      }
      self.args.emplace_back();
      is >> self.args.back();
    }
  }
};
// Baseline expression node: sub-expressions live in a std::vector
// using the default (heap) allocator.
struct Expr : ExprCommon<Expr> {
  Op op = VALUE;
  std::vector<Expr> args{};
  double value = 0.0;
};
// A minimal monotonic (bump) allocator over a single global
// arena: allocate() bumps a cursor, deallocate() is a no-op, and
// release() rewinds the whole arena in one go.
//
// Note: the arena is a static member of a class *template*, so
// each distinct T gets its own independent arena.
template <typename T>
struct GlobalMonotonicAllocator {
  // Arena size in bytes (was a magic number repeated twice).
  static constexpr size_t buffer_size = 200'000;
  static inline char buffer[buffer_size];
  static inline char* next_allocation = buffer;
  using value_type = T;
  // Bump-allocates space for n objects of T.
  // Fixed: the bounds check now happens *before* advancing the
  // cursor (the old version formed an out-of-bounds pointer
  // first), and std::align rounds the cursor up to alignof(T)
  // instead of assuming the current offset is suitably aligned.
  T* allocate(size_t n) {
    // Guard against sizeof(T) * n wrapping around.
    if (n > buffer_size / sizeof(T)) {
      throw std::bad_alloc();
    }
    size_t bytes = sizeof(T) * n;
    void* candidate = next_allocation;
    size_t space = static_cast<size_t>(buffer + buffer_size - next_allocation);
    if (std::align(alignof(T), bytes, candidate, space) == nullptr) {
      throw std::bad_alloc();
    }
    next_allocation = static_cast<char*>(candidate) + bytes;
    return static_cast<T*>(candidate);
  }
  void deallocate(T*, size_t) {
    // Monotonic means deallocate is no-op
  }
  static void release() {
    // deallocate all allocations so far in one go
    next_allocation = buffer;
  }
  // Stateless: all instances share the one static arena, so any
  // two allocators of the same T are interchangeable.
  friend bool operator==(const GlobalMonotonicAllocator&,
                         const GlobalMonotonicAllocator&) {
    return true;
  }
};
// Expression node whose args vector allocates from the global
// monotonic arena instead of the heap. No extra constructors are
// needed because the allocator is stateless.
struct GlobalMonotonicExpr : ExprCommon<GlobalMonotonicExpr> {
  Op op = VALUE;
  std::vector<GlobalMonotonicExpr,
              GlobalMonotonicAllocator<GlobalMonotonicExpr>>
      args{};
  double value = 0.0;
};
// A monotonic bump allocator whose arena is held by pointer, so
// different allocator objects can use different buffers (unlike
// GlobalMonotonicAllocator, whose arena is a static member).
template <typename T>
struct StatefulMonotonicAllocator {
  // The arena: [begin, end), with next_allocation as the cursor.
  struct Buffer {
    char* begin;
    char* end;
    char* next_allocation;
  };
  // Non-owning: the caller keeps the Buffer (and its memory)
  // alive for the lifetime of the allocator.
  Buffer* buffer;
  using value_type = T;
  T* allocate(size_t n) {
    // NOTE(review): the bound is checked only *after* advancing
    // the cursor, and no alignment adjustment is made for T —
    // this relies on sequential same-T allocations staying
    // suitably aligned; confirm.
    T* ptr = reinterpret_cast<T*>(buffer->next_allocation);
    buffer->next_allocation += sizeof(T) * n;
    if (buffer->next_allocation > buffer->end) {
      throw std::bad_alloc();
    }
    return ptr;
  }
  void deallocate(T*, size_t) {
    // Monotonic means deallocate is no-op
  }
  void release() {
    // deallocate all allocations so far in one go
    buffer->next_allocation = buffer->begin;
  }
  // Defaulted: two allocators compare equal iff they point at the
  // same Buffer, i.e. memory from one can be handled by the other.
  friend bool operator==(const StatefulMonotonicAllocator&,
                         const StatefulMonotonicAllocator&) = default;
  // Also, when a vector tries to emplace back, it will ask our allocator
  // to construct. Here's our chance to copy over the allocator.
  template <typename... Args>
  void construct(T* ptr, Args&&... args) {
    // Fancy C++20 version
    // Constructs T at ptr, threading *this through uses-allocator
    // construction so nested containers inherit this allocator.
    std::uninitialized_construct_using_allocator(
        ptr, *this, std::forward<Args>(args)...);
    // Cursed pre-C++20 version
    // Don't do drugs, kids
    // std::apply(
    //     [=]<typename... Xs>(Xs&&... xs) {
    //       std::construct_at(ptr, std::forward<Xs>(xs)...);
    //     },
    //     std::uses_allocator_construction_args<T>(
    //         *this, std::forward<Args>(args)...));
  }
};
// Expression node whose args vector uses the stateful monotonic
// allocator. The (T, allocator) constructor overloads implement
// the uses-allocator protocol so the allocator propagates down
// the whole expression tree.
struct StatefulMonotonicExpr : ExprCommon<StatefulMonotonicExpr> {
  using allocator_type =
      StatefulMonotonicAllocator<StatefulMonotonicExpr>;
  Op op = VALUE;
  std::vector<StatefulMonotonicExpr, allocator_type> args{};
  double value = 0.0;
  // We need to forward the allocator to the inner vector of exprs
  StatefulMonotonicExpr() = default;
  // Allocator-extended default construction.
  explicit StatefulMonotonicExpr(allocator_type alloc)
      : op{VALUE}, args{alloc} {}
  StatefulMonotonicExpr(const StatefulMonotonicExpr&) = default;
  StatefulMonotonicExpr(StatefulMonotonicExpr&&) = default;
  // Allocator-extended copy: the new args vector uses `alloc`.
  StatefulMonotonicExpr(const StatefulMonotonicExpr& other,
                        allocator_type alloc)
      : op{other.op}, args{other.args, alloc}, value{other.value} {}
  // Allocator-extended move.
  StatefulMonotonicExpr(StatefulMonotonicExpr&& other,
                        allocator_type alloc)
      : op{other.op},
        args{std::move(other.args), alloc},
        value{other.value} {}
  StatefulMonotonicExpr& operator=(const StatefulMonotonicExpr&) =
      default;
  StatefulMonotonicExpr& operator=(StatefulMonotonicExpr&&) = default;
  // Sanity check: the uses-allocator protocol must kick in for
  // this type, or the allocator would silently not propagate.
  static_assert(
      std::uses_allocator_v<StatefulMonotonicExpr, allocator_type>);
};
static void BM_Default(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
Expr e{};
ss.str(s);
ss >> e;
Expr copy{e};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_Default);
static void BM_GlobalMonotonic(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
GlobalMonotonicExpr e;
ss.str(s);
ss >> e;
GlobalMonotonicExpr copy{e};
benchmark::DoNotOptimize(copy.evaluate());
GlobalMonotonicAllocator<GlobalMonotonicExpr>::release();
}
}
}
BENCHMARK(BM_GlobalMonotonic);
static void BM_LocalMonotonic(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<char[]>(200'000);
StatefulMonotonicAllocator<StatefulMonotonicExpr>::Buffer buf{
pbuf.get(), pbuf.get() + 200'000, pbuf.get()};
StatefulMonotonicAllocator<StatefulMonotonicExpr> alloc{&buf};
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
StatefulMonotonicExpr e{alloc};
ss.str(s);
ss >> e;
StatefulMonotonicExpr copy{e, alloc};
benchmark::DoNotOptimize(copy.evaluate());
alloc.release();
}
}
}
BENCHMARK(BM_LocalMonotonic);
BENCHMARK_MAIN();
        #include <benchmark/benchmark.h>
        #include <array>
        #include <deque>
        #include <fstream>
        #include <iostream>
        #include <memory>
        #include <memory_resource>
        #include <mutex>
        #include <new>
        #include <numeric>
        #include <ranges>
        #include <sstream>
        #include <stdexcept>
        #include <vector>
struct Expr {
enum Op {
VALUE,
PLUS,
MINUS,
TIMES,
DIVIDE,
} op = VALUE;
std::vector<Expr> args{};
double value = 0.0;
double evaluate() const {
switch (op) {
case VALUE: {
return value;
}
case PLUS: {
double res = 0.0;
for (const Expr& e : args) {
res += e.evaluate();
}
return res;
}
case MINUS: {
if (args.size() == 0) {
throw std::runtime_error("Bad number of arguments to -");
}
if (args.size() == 1) {
// Unary -
// aka negation
return -args[0].evaluate();
}
// First arg minus the remaining args
auto it = args.cbegin();
double res = it->evaluate();
++it;
for (; it != args.cend(); ++it) {
res -= it->evaluate();
}
return res;
}
case TIMES: {
double res = 1.0;
for (const Expr& e : args) {
res *= e.evaluate();
}
return res;
}
case DIVIDE: {
if (args.size() == 0) {
throw std::runtime_error("Bad number of arguments to /");
}
if (args.size() == 1) {
// Unary /
// aka reciprocal
return 1.0 / args[0].evaluate();
}
// First arg divide the remaining args
auto it = args.cbegin();
double res = it->evaluate();
++it;
for (; it != args.cend(); ++it) {
res /= it->evaluate();
}
return res;
}
default:
throw std::runtime_error("Invalid opcode in evaluate");
}
}
static constexpr char opcodes[] = {
'?',
'+',
'-',
'*',
'/',
};
friend std::ostream& operator<<(std::ostream& os, const Expr& self) {
if (self.op == VALUE) {
return os << self.value;
}
os << '(' << opcodes[self.op];
for (const Expr& e : self.args) {
os << ' ' << e;
}
os << ')';
return os;
}
friend std::istream& operator>>(std::istream& is, Expr& self) {
is >> std::ws;
if (is.peek() != '(') {
self.op = VALUE;
is >> self.value;
return is;
}
// Need to parse complex expression
is.get();
char op;
is >> std::ws >> op;
switch (op) {
case '+':
self.op = PLUS;
break;
case '-':
self.op = MINUS;
break;
case '*':
self.op = TIMES;
break;
case '/':
self.op = DIVIDE;
break;
default:
throw std::runtime_error("Invalid opcode");
}
while (true) {
is >> std::ws;
if (is.peek() == ')') {
is.get();
return is;
}
self.args.emplace_back();
is >> self.args.back();
}
}
};
struct PMRExpr {
enum Op {
VALUE,
PLUS,
MINUS,
TIMES,
DIVIDE,
} op;
std::pmr::vector<PMRExpr> args;
double value;
using allocator_type = std::pmr::polymorphic_allocator<Expr>;
// Default ctor and allocator aware version
PMRExpr() : op{VALUE}, args{}, value{0.0} {}
explicit PMRExpr(const allocator_type& alloc)
: op{VALUE}, args{alloc}, value{0.0} {}
// Note the explicit!
// We don't want allocators to implicitly convert to Exprs.
// Without explicit, the following would compile:
//
// std::pmr::monotonic_buffer_resource resource;
// std::pmr::polymorphic_allocator<Expr> alloc{&resource};
// Expr e = alloc;
// Copy and allocator aware version
PMRExpr(const PMRExpr& other) = default;
PMRExpr& operator=(const PMRExpr& other) = default;
PMRExpr(const PMRExpr& other, const allocator_type& alloc)
: op{other.op}, args{other.args, alloc}, value{other.value} {}
// Move and allocator aware version
PMRExpr(PMRExpr&& other) = default;
PMRExpr& operator=(PMRExpr&& other) = default;
PMRExpr(PMRExpr&& other, const allocator_type& alloc)
: op{std::move(other.op)},
args{std::move(other.args), alloc},
value{std::move(other.value)} {}
double evaluate() const {
switch (op) {
case VALUE: {
return value;
}
case PLUS: {
double res = 0.0;
for (const PMRExpr& e : args) {
res += e.evaluate();
}
return res;
}
case MINUS: {
if (args.size() == 0) {
throw std::runtime_error("Bad number of arguments to -");
}
if (args.size() == 1) {
// Unary -
// aka negation
return -args[0].evaluate();
}
// First arg minus the remaining args
auto it = args.cbegin();
double res = it->evaluate();
++it;
for (; it != args.cend(); ++it) {
res -= it->evaluate();
}
return res;
}
case TIMES: {
double res = 1.0;
for (const PMRExpr& e : args) {
res *= e.evaluate();
}
return res;
}
case DIVIDE: {
if (args.size() == 0) {
throw std::runtime_error("Bad number of arguments to /");
}
if (args.size() == 1) {
// Unary /
// aka reciprocal
return 1.0 / args[0].evaluate();
}
// First arg divide the remaining args
auto it = args.cbegin();
double res = it->evaluate();
++it;
for (; it != args.cend(); ++it) {
res /= it->evaluate();
}
return res;
}
default:
throw std::runtime_error("Invalid opcode in evaluate");
}
}
static constexpr char opcodes[] = {
'?',
'+',
'-',
'*',
'/',
};
friend std::ostream& operator<<(std::ostream& os, const PMRExpr& self) {
if (self.op == VALUE) {
return os << self.value;
}
os << '(' << opcodes[self.op];
for (const PMRExpr& e : self.args) {
os << ' ' << e;
}
os << ')';
return os;
}
friend std::istream& operator>>(std::istream& is, PMRExpr& self) {
is >> std::ws;
if (is.peek() != '(') {
self.op = VALUE;
is >> self.value;
return is;
}
// Need to parse complex expression
is.get();
char op;
is >> std::ws >> op;
switch (op) {
case '+':
self.op = PLUS;
break;
case '-':
self.op = MINUS;
break;
case '*':
self.op = TIMES;
break;
case '/':
self.op = DIVIDE;
break;
default:
throw std::runtime_error("Invalid opcode");
}
while (true) {
is >> std::ws;
if (is.peek() == ')') {
is.get();
return is;
}
self.args.emplace_back();
is >> self.args.back();
}
}
};
struct NoDestroyPMRExpr {
enum Op {
VALUE,
PLUS,
MINUS,
TIMES,
DIVIDE,
} op;
alignas(alignof(std::pmr::vector<NoDestroyPMRExpr>)) char args_storage
[sizeof(std::pmr::vector<NoDestroyPMRExpr>)];
std::pmr::vector<NoDestroyPMRExpr>& args() {
return reinterpret_cast<std::pmr::vector<NoDestroyPMRExpr>&>(
args_storage);
};
const std::pmr::vector<NoDestroyPMRExpr>& args() const {
return reinterpret_cast<const std::pmr::vector<NoDestroyPMRExpr>&>(
args_storage);
};
double value;
using allocator_type = std::pmr::polymorphic_allocator<Expr>;
// Default ctor and allocator aware version
NoDestroyPMRExpr() : op{VALUE}, value{0.0} {
std::construct_at(&args());
}
explicit NoDestroyPMRExpr(const allocator_type& alloc)
: op{VALUE}, value{0.0} {
std::construct_at(&args(), alloc);
}
// Note the explicit!
// We don't want allocators to implicitly convert to Exprs.
// Without explicit, the following would compile:
//
// std::pmr::monotonic_buffer_resource resource;
// std::pmr::polymorphic_allocator<Expr> alloc{&resource};
// Expr e = alloc;
// Copy and allocator aware version
NoDestroyPMRExpr(const NoDestroyPMRExpr& other) = default;
NoDestroyPMRExpr& operator=(const NoDestroyPMRExpr& other) = default;
NoDestroyPMRExpr(const NoDestroyPMRExpr& other,
const allocator_type& alloc)
: op{other.op}, value{other.value} {
std::construct_at(&args(), other.args(), alloc);
}
// Move and allocator aware version
NoDestroyPMRExpr(NoDestroyPMRExpr&& other) = default;
NoDestroyPMRExpr& operator=(NoDestroyPMRExpr&& other) = default;
NoDestroyPMRExpr(NoDestroyPMRExpr&& other, const allocator_type& alloc)
: op{std::move(other.op)}, value{std::move(other.value)} {
std::construct_at(&args(), std::move(other.args()), alloc);
}
double evaluate() const {
switch (op) {
case VALUE: {
return value;
}
case PLUS: {
double res = 0.0;
for (const NoDestroyPMRExpr& e : args()) {
res += e.evaluate();
}
return res;
}
case MINUS: {
if (args().size() == 0) {
throw std::runtime_error("Bad number of arguments to -");
}
if (args().size() == 1) {
// Unary -
// aka negation
return -args()[0].evaluate();
}
// First arg minus the remaining args()
auto it = args().cbegin();
double res = it->evaluate();
++it;
for (; it != args().cend(); ++it) {
res -= it->evaluate();
}
return res;
}
case TIMES: {
double res = 1.0;
for (const NoDestroyPMRExpr& e : args()) {
res *= e.evaluate();
}
return res;
}
case DIVIDE: {
if (args().size() == 0) {
throw std::runtime_error("Bad number of arguments to /");
}
if (args().size() == 1) {
// Unary /
// aka reciprocal
return 1.0 / args()[0].evaluate();
}
// First arg divide the remaining args()
auto it = args().cbegin();
double res = it->evaluate();
++it;
for (; it != args().cend(); ++it) {
res /= it->evaluate();
}
return res;
}
default:
throw std::runtime_error("Invalid opcode in evaluate");
}
}
static constexpr char opcodes[] = {
'?',
'+',
'-',
'*',
'/',
};
friend std::ostream& operator<<(std::ostream& os,
const NoDestroyPMRExpr& self) {
if (self.op == VALUE) {
return os << self.value;
}
os << '(' << opcodes[self.op];
for (const NoDestroyPMRExpr& e : self.args()) {
os << ' ' << e;
}
os << ')';
return os;
}
friend std::istream& operator>>(std::istream& is,
NoDestroyPMRExpr& self) {
is >> std::ws;
if (is.peek() != '(') {
self.op = VALUE;
is >> self.value;
return is;
}
// Need to parse complex expression
is.get();
char op;
is >> std::ws >> op;
switch (op) {
case '+':
self.op = PLUS;
break;
case '-':
self.op = MINUS;
break;
case '*':
self.op = TIMES;
break;
case '/':
self.op = DIVIDE;
break;
default:
throw std::runtime_error("Invalid opcode");
}
while (true) {
is >> std::ws;
if (is.peek() == ')') {
is.get();
return is;
}
self.args().emplace_back();
is >> self.args().back();
}
}
};
static void BM_Default(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
Expr e{};
ss.str(s);
ss >> e;
Expr copy{e};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_Default);
static void BM_Monotonic_1M(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 1'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource resource{buf.begin(),
buf.size()};
PMRExpr e{&resource};
ss.str(s);
ss >> e;
PMRExpr copy{e, &resource};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_Monotonic_1M);
static void BM_Monotonic_2M(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 2'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource resource{buf.begin(),
buf.size()};
PMRExpr e{&resource};
ss.str(s);
ss >> e;
PMRExpr copy{e, &resource};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_Monotonic_2M);
static void BM_Monotonic_4M(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 4'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource resource{buf.begin(),
buf.size()};
PMRExpr e{&resource};
ss.str(s);
ss >> e;
PMRExpr copy{e, &resource};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_Monotonic_4M);
static void BM_Monotonic_8M(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 8'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource resource{buf.begin(),
buf.size()};
PMRExpr e{&resource};
ss.str(s);
ss >> e;
PMRExpr copy{e, &resource};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_Monotonic_8M);
static void BM_Monotonic_12M(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 12'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource resource{buf.begin(),
buf.size()};
PMRExpr e{&resource};
ss.str(s);
ss >> e;
PMRExpr copy{e, &resource};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_Monotonic_12M);
static void BM_MonotonicNoDestroyTop_1M(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 1'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource resource{buf.begin(),
buf.size()};
std::pmr::polymorphic_allocator<Expr> alloc{&resource};
// This way, we allocate a PMRExpr into the buffer resource,
// but we DON'T properly `delete` it!
// This is fine in this very specific case as we're making use
// of certain assumptions about how std::vector is implemented,
// namely that it's ok if the destructor isn't called if all the
// subobjects will be released when the allocator goes out of
// scope.
PMRExpr* pe = alloc.allocate_object<PMRExpr>();
std::construct_at(pe, &resource);
PMRExpr& e = *pe;
ss.str(s);
ss >> e;
PMRExpr* pcopy = alloc.allocate_object<PMRExpr>();
std::construct_at(pcopy, e, &resource);
PMRExpr& copy = *pcopy;
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_MonotonicNoDestroyTop_1M);
static void BM_MonotonicNoDestroyTop_8M(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 8'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource resource{buf.begin(),
buf.size()};
std::pmr::polymorphic_allocator<Expr> alloc{&resource};
// This way, we allocate a PMRExpr into the buffer resource,
// but we DON'T properly `delete` it!
// This is fine in this very specific case as we're making use
// of certain assumptions about how std::vector is implemented,
// namely that it's ok if the destructor isn't called if all the
// subobjects will be released when the allocator goes out of
// scope.
PMRExpr* pe = alloc.allocate_object<PMRExpr>();
std::construct_at(pe, &resource);
PMRExpr& e = *pe;
ss.str(s);
ss >> e;
PMRExpr* pcopy = alloc.allocate_object<PMRExpr>();
std::construct_at(pcopy, e, &resource);
PMRExpr& copy = *pcopy;
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_MonotonicNoDestroyTop_8M);
static void BM_MonotonicNoDestroy_1M(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 1'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource resource{buf.begin(),
buf.size()};
NoDestroyPMRExpr e{&resource};
ss.str(s);
ss >> e;
NoDestroyPMRExpr copy{e, &resource};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_MonotonicNoDestroy_1M);
static void BM_MonotonicNoDestroy_8M(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 8'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource resource{buf.begin(),
buf.size()};
NoDestroyPMRExpr e{&resource};
ss.str(s);
ss >> e;
NoDestroyPMRExpr copy{e, &resource};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_MonotonicNoDestroy_8M);
static void BM_Pool(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::unsynchronized_pool_resource resource{};
PMRExpr e{&resource};
ss.str(s);
ss >> e;
PMRExpr copy{e, &resource};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_Pool);
static void BM_MonotonicPool(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 8'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource buf_resource{buf.begin(),
buf.size()};
std::pmr::unsynchronized_pool_resource resource{&buf_resource};
PMRExpr e{&resource};
ss.str(s);
ss >> e;
PMRExpr copy{e, &resource};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_MonotonicPool);
static void BM_MonotonicPoolNoDestroy(benchmark::State& state) {
std::ostringstream sstr;
sstr << std::ifstream("test-0.700.in").rdbuf();
std::string s = sstr.str();
std::unique_ptr pbuf = std::make_unique<std::array<char, 8'000'000>>();
auto& buf = *pbuf;
for (auto _ : state) {
std::istringstream ss;
for (size_t i = 0; i < 1024; ++i) {
std::pmr::monotonic_buffer_resource buf_resource{buf.begin(),
buf.size()};
std::pmr::unsynchronized_pool_resource resource{&buf_resource};
NoDestroyPMRExpr e{&resource};
ss.str(s);
ss >> e;
NoDestroyPMRExpr copy{e, &resource};
benchmark::DoNotOptimize(copy.evaluate());
}
}
}
BENCHMARK(BM_MonotonicPoolNoDestroy);
BENCHMARK_MAIN();
        © 21 July 2022, Thomas Tan, All Rights Reserved