Backstory
int sum(const vector<int> &v) {
int result = 0;
for (size_t i = 0; i < v.size(); ++i)
result += v[i];
return result;
}
int sum(const vector<int> &v) {
int result = 0;
for (int x : v) result += x;
return result;
}
Is one better than the other?
2012, C++11isms, range-for
WARNING
Reading assembly alone can be misleading
Always measure too
Google Benchmark
quick-bench.com
Shout out to google benchmarking tool. Microbenchmarks, perils.
But always measure end-to-end perf if you can too.
Where were we?
int sum(const vector<int> &v) {
int result = 0;
for (size_t i = 0; i < v.size(); ++i)
result += v[i];
return result;
}
int sum(const vector<int> &v) {
int result = 0;
for (int x : v) result += x;
return result;
}
Which is better?
Compiler Explorer v0.1
$ g++ /tmp/test.cc -O2 -c -S -o - -masm=intel \
| c++filt \
| grep -vE '\s+\.'
sum(std::vector<int, std::allocator<int> > const&):
.LFB786:
mov rcx, QWORD PTR [rdi]
mov rax, QWORD PTR 8[rdi]
sub rax, rcx
shr rax, 2
mov rsi, rax
...
Compiler Explorer v0.1
Not very pretty
To the web!
Demo
/// g71:-O2 -std=c++1z -march=haswell
// setup
#include <numeric>
#include <vector>
using namespace std;
int sum(const vector<int> &v) {
int result = 0;
for (size_t i = 0; i < v.size(); ++i)
result += v[i];
return result;
}
Pop out
Show code
Show optimizer off
Show diff
Walkthrough
; rdi = const vector<int> *
mov rdx, QWORD PTR [rdi] ; rdx = *rdi ≡ begin()
mov rcx, QWORD PTR [rdi+8] ; rcx = *(rdi+8) ≡ end()
template<typename T> struct _Vector_impl {
T *_M_start;
T *_M_finish;
T *_M_end_of_storage;
};
Traditional
sub rcx, rdx ; rcx = end-begin
mov rax, rcx
shr rax, 2 ; (end-begin)/4
je .L4
add rcx, rdx
xor eax, eax
size_t size() const noexcept {
return _M_finish - _M_start;
}
Range
xor eax, eax
cmp rdx, rcx ; begin==end?
je .L4
auto __begin = begin(v);
auto __end = end(v);
for (auto __it = __begin;
__it != __end;
++it)
Walkthrough
; rcx ≡ end, rdx = begin, eax = 0
.L3:
add eax, DWORD PTR [rdx] ; eax += *rdx
add rdx, 4 ; rdx += sizeof(int)
cmp rdx, rcx ; is rdx == end?
jne .L3 ; if not, loop
ret ; we're done
Backstory
So, which approach is best?
Also
Optimizer settings are important
std::accumulate
What has my compiler done for me lately?
Multiplication
/// g71:-O2 -std=c++1z -march=haswell
int mulByY(int x, int y) {
return x * y;
}
mulByY(int, int):
mov eax, edi
imul eax, esi
ret
Multiplication
1101 (13)
x 0101 (5)
--------
1101
0000
1101
+ 0000
--------
01000001 (65)
That's a lot of additions!
Haswell 32-bit multiply - 4 cycles
Multiplication
/// g71:-O2 -std=c++1z -march=haswell
int mulByConstant(int x) { return x * 2; }
Multiplication
/// g71:-O2 -std=c++1z -march=haswell
int mulBy65599(int a) {
return (a << 16) + (a << 6) - a;
// ^ ^
// a * 65536 |
// a * 64
// 65536a + 64a - 1a = 65599a
}
-march=i486 -m32
shows up what you asked
Division
int divByY(int x, int y) {
return x / y;
}
int modByY(int x, int y) {
return x % y;
}
divByY(int, int):
mov eax, edi
cdq
idiv esi
ret
modByY(int, int):
mov eax, edi
cdq
idiv esi
mov eax, edx
ret
Haswell 32-bit divide - 22-29 cycles!
cdq
does sign expansion into registers. div
divides eax:edx with the
operand, results in
eax (divisor) and edx (dividend).
cdq
sign extends eax into edx, ready for a div
Division
/// g71:-O2 -std=c++1z -march=haswell
unsigned divByConstant(unsigned x) { return x / 2; }
Division
mov eax, edi ; eax = x
mov edx, 0xaaaaaaab
mul edx ; eax:edx = x * 0xaaaaaaab
mov eax, edx ; (x * 0xaaaaaaab) >> 32
; ≡ (x * 0xaaaaaaab) / 0x10000000
; ≡ x * 0.6666666667
shr eax ; x * 0.333333333
ret
Modulus
int modBy3(unsigned x) {
return x % 3;
}
mov eax, edi
mov edx, 0xaaaaaaab
mul edx
mov eax, edx
shr eax
lea eax, [rdx+rdx*2]
sub edi, eax
mov eax, edi
ret
Counting bits
/// g71:-O2 -std=c++1z -march=haswell
int countSetBits(int a) {
int count = 0;
while (a) {
count++;
a &= (a-1);
}
return count;
}
Summation
/// g71:-O2 -std=c++1z -march=haswell
constexpr int sumTo(int x) {
int sum = 0;
for (int i = 0; i <= x; ++i)
sum += i;
return sum;
}
int main(int argc, const char *argv[]) {
return sumTo(20);
}
Show code
Modify code to show how to make it depend on argc/argv
Show clang's cleverness
Show clang's weirdness if starting at 1 instead of 0
Sum(x)
\[
\begin{aligned}
\sum{x} &≡ \frac{x(x + 1)}{2} \\
&≡ x + \frac{x(x - 1)}{2}
\end{aligned}
\]
What has my compiler done for me lately?
A lot!