Thanks for all the comments so far. I am sorry that I have used a bad example in my original question, that almost everyone would say: "Oh, you should use memcopy!" But that is not what my question is about.
My question is more generic about how manual loop unrolling should be done. Consider this example this time, by summing all elements in an array:
#include <stdlib.h>
double sum (size_t n, double *x) {
size_t nr = n & 1;
double *end = x + (n - nr);
double sum_x = 0.0;
for (; x < end; x++) sum_x += *x;
if (nr) sum_x += *x;
return sum_x;
}
The compiler generated assembly admits a similar behaviour (to what is shown by the array-copying example in my original question)
sum:
movq %rdi, %rcx
andl $1, %ecx
subq %rcx, %rdi
leaq (%rsi,%rdi,8), %rdx
cmpq %rdx, %rsi
jnb .L5
movq %rsi, %rax
pxor %xmm0, %xmm0
.L3:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rdx
ja .L3
movq %rsi, %rax
notq %rax
addq %rax, %rdx
shrq $3, %rdx
leaq 8(%rsi,%rdx,8), %rsi
.L2:
testq %rcx, %rcx
je .L1
addsd (%rsi), %xmm0
.L1:
ret
.L5:
pxor %xmm0, %xmm0
jmp .L2
However, if I now schedule the "fractional" part ahead of the main loop (as I later dig out in an answer I posted), the compiler does much better job.
#include <stdlib.h>
double sum (size_t n, double *x) {
size_t nr = n & 1;
double *end = x + n;
double sum_x = 0.0;
if (nr) sum_x += *x;
for (x += nr; x < end; x++) sum_x += *x;
return sum_x;
}
sum:
leaq (%rsi,%rdi,8), %rdx
pxor %xmm0, %xmm0
andl $1, %edi
je .L2
addsd (%rsi), %xmm0
.L2:
leaq (%rsi,%rdi,8), %rax
cmpq %rax, %rdx
jbe .L1
.L4:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rdx
ja .L4
.L1:
ret
I have only used a compiler flag -O2. So as Peter said, the compiler generated assembly should be close to C source code. Then the question is, why does a compiler do better in the latter case?
This is not really a performance-related question. It is just something I unconsciously found (and can't explain) when checking compiler's assembly output for C code from a C project I have been writing. Thanks again. Thank Peter for proposing a better title for the question.
Original question:
The following small C function copies a, a vector of n entries to b. A manual loop unrolling of depth 2 is applied.
#include <stddef.h>
void foo (ptrdiff_t n, double *a, double *b) {
ptrdiff_t i = 0;
ptrdiff_t nr = n & 1;
n -= nr; // `n` is an even integer
while (i < n) {
b[i] = a[i];
b[i + 1] = a[i + 1];
i += 2;
} // `i = n` when the loop ends
if (nr) b[i] = a[i];
}
It gives the x64 assembly under gcc -O2 (any gcc version 5.4+). However, I find the part of the output as commented weird. Why does the compiler ever generate them?
foo:
movq %rdi, %rcx
xorl %eax, %eax
andl $1, %ecx
subq %rcx, %rdi
testq %rdi, %rdi
jle .L11
.L12:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rdi // `i` in %rax, `n` in %rdi
jg .L12 // the loop ends, with `i = n`, BELOW IS WEIRD
subq $1, %rdi // n = n - 1;
shrq %rdi // n = n / 2;
leaq 2(%rdi,%rdi), %rax // i = 2 * n + 2; (this is just `i = n`, isn't it?)
.L11:
testq %rcx, %rcx
je .L10
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
.L10:
ret
A similar version using size_t instead of ptrdiff_t gives something similar:
#include <stdlib.h>
void bar (size_t n, double *a, double *b) {
size_t i = 0;
size_t nr = n & 1;
n -= nr; // `n` is an even integer
while (i < n) {
b[i] = a[i];
b[i + 1] = a[i + 1];
i += 2;
} // `i = n` when the loop ends
if (nr) b[i] = a[i];
}
bar:
movq %rdi, %rcx
andl $1, %ecx
subq %rcx, %rdi
je .L20
xorl %eax, %eax
.L21:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rdi // `i` in %rax, `n` in %rdi
ja .L21 // the loop ends, with `i = n`, BUT BELOW IS WEIRD
subq $1, %rdi // n = n - 1;
andq $-2, %rdi // n = n & (-2);
addq $2, %rdi // n = n + 2; (this is just `i = n`, isn't it?)
.L20:
testq %rcx, %rcx
je .L19
movsd (%rsi,%rdi,8), %xmm0
movsd %xmm0, (%rdx,%rdi,8)
.L19:
ret
And here is another equivalence,
#include <stdlib.h>
void baz (size_t n, double *a, double *b) {
size_t nr = n & 1;
n -= nr;
double *b_end = b + n;
while (b < b_end) {
b[0] = a[0];
b[1] = a[1];
a += 2;
b += 2;
} // `b = b_end` when the loop ends
if (nr) b[0] = a[0];
}
but the following assembly looks more odd (though produced under -O2). Now n, a and b are all copied, and when the loop ends, we take 5 lines of code just to end up with b_copy = 0?!
baz: // initially, `n` in %rdi, `a` in %rsi, `b` in %rdx
movq %rdi, %r8 // n_copy = n;
andl $1, %r8d // nr = n_copy & 1;
subq %r8, %rdi // n_copy -= nr;
leaq (%rdx,%rdi,8), %rdi // b_end = b + n;
cmpq %rdi, %rdx // if (b >= b_end) jump to .L31
jnb .L31
movq %rdx, %rax // b_copy = b;
movq %rsi, %rcx // a_copy = a;
.L32:
movsd (%rcx), %xmm0
addq $16, %rax
addq $16, %rcx
movsd %xmm0, -16(%rax)
movsd -8(%rcx), %xmm0
movsd %xmm0, -8(%rax)
cmpq %rax, %rdi // `b_copy` in %rax, `b_end` in %rdi
ja .L32 // the loop ends, with `b_copy = b_end`
movq %rdx, %rax // b_copy = b;
notq %rax // b_copy = ~b_copy;
addq %rax, %rdi // b_end = b_end + b_copy;
andq $-16, %rdi // b_end = b_end & (-16);
leaq 16(%rdi), %rax // b_copy = b_end + 16;
addq %rax, %rsi // a += b_copy; (isn't `b_copy` just 0?)
addq %rax, %rdx // b += b_copy;
.L31:
testq %r8, %r8 // if (nr == 0) jump to .L30
je .L30
movsd (%rsi), %xmm0 // xmm0 = a[0];
movsd %xmm0, (%rdx) // b[0] = xmm0;
.L30:
ret
Can anyone explain what the compiler has in mind in all three cases?
Looks like if I unroll the loop in the following manner, a compiler can generate neater code.
#include <stdlib.h>
#include <stddef.h>
void foo (ptrdiff_t n, double *a, double *b) {
ptrdiff_t i = n & 1;
if (i) b[0] = a[0];
while (i < n) {
b[i] = a[i];
b[i + 1] = a[i + 1];
i += 2;
}
}
void bar (size_t n, double *a, double *b) {
size_t i = n & 1;
if (i) b[0] = a[0];
while (i < n) {
b[i] = a[i];
b[i + 1] = a[i + 1];
i += 2;
}
}
void baz (size_t n, double *a, double *b) {
size_t nr = n & 1;
double *b_end = b + n;
if (nr) b[0] = a[0];
b += nr;
while (b < b_end) {
b[0] = a[0];
b[1] = a[1];
a += 2;
b += 2;
}
}
foo:
movq %rdi, %rax
andl $1, %eax
je .L9
movsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
cmpq %rax, %rdi
jle .L11
.L4:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
.L9:
cmpq %rax, %rdi
jg .L4
.L11:
ret
bar:
movq %rdi, %rax
andl $1, %eax
je .L20
movsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
cmpq %rax, %rdi
jbe .L21
.L15:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
.L20:
cmpq %rax, %rdi
ja .L15
.L21:
ret
baz:
leaq (%rdx,%rdi,8), %rcx
andl $1, %edi
je .L23
movsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
.L23:
leaq (%rdx,%rdi,8), %rax
cmpq %rax, %rcx
jbe .L22
.L25:
movsd (%rsi), %xmm0
addq $16, %rax
addq $16, %rsi
movsd %xmm0, -16(%rax)
movsd -8(%rsi), %xmm0
movsd %xmm0, -8(%rax)
cmpq %rax, %rcx
ja .L25
.L22:
ret
If you're asking why the assembly is relatively large, its because the compiler can't assume what you may know.
For example, if you know the source array will not be modified during the copy, tell the compiler so by adding a const qualifier to the pointed at source data.
void foo (ptrdiff_t n, double *a, double const *b)
Further, if you know the two memory ranges will never overlap, add a restrict qualifier to each of the two pointers.
void foo (ptrdiff_t n, double *restrict a, double const *restrict b)
Ultimately, if you want the most optimized copy (compiler vendors spend a LOT of time on this), use memcpy for non-overlapping ranges, and memmove for overlapping ranges.
I have this class assignment that i can't seem to figure out.
the point is to convert this assembly to C (Code assembled with GNU assembler AT&T syntax):
.section .rdata,"dr"
LC0:
.ascii "%d\12\0"
.text
.globl _main
_main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $32, %esp
movl 12(%ebp), %eax
addl $4, %eax
movl (%eax), %eax
movl %eax, (%esp)
call _atoi
movl %eax, 24(%esp)
cmpl $4, 24(%esp)
je L2
cmpl $6, 24(%esp)
jle L3
cmpl $9, 24(%esp)
jg L3
L2:
movl 24(%esp), %eax
addl $20, %eax
movl %eax, 28(%esp)
jmp L4
L3:
cmpl $0, 24(%esp)
jne L5
movl $44, 28(%esp)
jmp L4
L5:
cmpl $-1, 24(%esp)
jne L6
movl $-44, 28(%esp)
jmp L4
L6:
movl $99, 28(%esp)
L4:
movl 28(%esp), %eax
movl %eax, 4(%esp)
movl $LC0, (%esp)
call _printf
movl $0, %eax
leave
ret
however I always get lost on L2. it seems that no matter if we get into the first if statement or not we will always execute the label L2, which doesn't make any sense. I tried to make sense of it all and the closest I got was this C code:
#include <stdio.h>
int main(int argc, char *argv[]){
int y = 0;
int x = atoi(*++argv);
if (x != 4){
if (x > 6 && x <= 9){
y = 20 + x;
}
else if (x == 0){
y = 44;
}
else if (x == -1){
y = -44;
}
else {
y = 99;
}
}
else{
y = 20 + x;
}
printf("%d %d", y, x);
}
can someone please help with this confusing issue i'm having,
thanks
In decoding compiler output it often helps to rewrite the assembly code in a denser, but still very low-level form (pseudo code), using only elementary, low-level transformations. The point is to group/combine small numbers of instructions in a way that is difficult to get wrong but that exposes the inner logic of the code fragment better. The next step is to eliminate redundant stores and temporaries (like EAX used for storing x + 20 to y). I'm skipping the first step here but it can be dangerous to do that in more complicated code.
The sequence of conditionals then becomes:
x equ [esp + 24]
y equ [esp + 28]
if (x == 4) goto L2;
if (x <= 6) goto L3;
if (x > 9) goto L3;
L2: y = x + 20; goto L4;
L3: if (x != 0) goto L5;
y = 44; goto L4;
L5: if (x != -1) goto L6;
y = -44; goto L4;
L6: y = 99;
L4: printf("%d\f", y);
The first three conditionals form a conspicuous pattern employed by compilers to evaluate complex conditions. The compiler inverted the second and third terms of that conditional to use its 'jump around' solution fragment; inverting again allows you to code the original condition (jumps to L2/'then' are ORs, jumps to L3/'else' are AND NOTs):
if (x == 4 || !(x <= 6) && !(x > 9))
->
if (x == 4 || (x > 6) && (x <= 9))
and Bob's your uncle. The other conditionals could have been the result of a chained if or of a switch statement, it's difficult to tell. But that hardly matters. Hence your decompilation was almost perfect already, you just missed a tiny beat. In an intermediate step the C-ified conditionals look like this:
if (x == 4 || x > 6 && x <= 9)
{
y = x + 20;
}
else // L3
{
if (x == 0)
{
y = 44;
}
else // L5
{
if (x == -1)
{
y = -44;
}
else // L6
{
y = 99;
}
}
}
This can then be tightened to:
if (x == 4 || x > 6 && x <= 9)
{
y = x + 20;
}
else if (x == 0)
{
y = 44;
}
else if (x == -1)
{
y = -44;
}
else
{
y = 99;
}
P.S.: the value of (argv + 1) is not stored back to argv, just dereferenced. Hence it's atoi(*(argv + 1)) or atoi(argv[1]).
I was practicing some assembly code to C and need some help with two questions. Based on the GCC objdump it seems okay but I want to make sure I can do this WITHOUT a computer (still kind of new to assembly code)
Question 1 :
q1:
pushl %ebp
movl %esp, %ebp
subl $4, %esp
cmpl $0, 8(%ebp)\\ compare variable1 to zero
jle .L2 \\jump if less than or equal to zero
movl $1, -4(%ebp)\\ ?? variable2 = 1??
jmp .L4\\else
.L2:
movl $0, -4(%ebp)\\ variable2 = 0
.L4:
movl -4(%ebp), %eax\\ variable2 = variable1
leave
ret
what I got was
int main(int x, int z)
{
if (x < 0)
z = 0;
else
z = x;
}
But I was not sure what the purpose of movl $1, -4(%ebp) was.
Question 2 :
fn:
pushl %ebp
movl $1, %eax
movl %esp, %ebp
movl 8(%ebp), %edx
cmpl $1, %edx\\ compare variable1 to 1
jle .L4\\ less than or equal jump.
.L5:
imull %edx, %eax\\ multiply variable1 by variable 2
subl $1, %edx\\ variable1 -1
cmpl $1, %edx\\ compare variable1 with 1
jne .L5 Loop if not equal
.L4:
popl %ebp\\ return value
ret
How I interpreted the information
int main(int x)
{
int result;
if (x <= 1){
for (result=1; x != 1; x = x-1)
result *= x;}
else{return result;}
}
Not sure if my logic is correct on either of those.
Q1 you have one argument 8(%ebp) and one local variable at -4(%ebp). Return value will be in %eax. Knowing this, the function looks more like:
int foo(int arg)
{
int local;
if (arg <= 0) {
local = 0;
} else {
local = 1;
}
return local;
}
Q2 popl %ebp // return value that's not the return value, that's restoring the saved %ebp of the caller (that was pushed in the beginning). Also, the condition in the loop should use > not !=. You are missing an if (x > 1) conditional around the for loop. (Thanks to Mooing Duck for pointing this out.) Also, technically it's a do-while loop. Otherwise you got this function right.
int factorial(int x)
{
int result = 1;
if (x > 1) {
do {
result *= x;
x -= 1;
} while(x != 1);
}
return result;
}
I'm reading a book Computer Systems: A Programmer's Perspective (2nd Edition)
and Practice Problem 3.23 are little confused me:
A function fun_b has the following overall structure:
int fun_b(unsigned x) {
int val = 0;
int i;
for ( ____;_____;_____) {
}
return val;
}
The gcc C compiler generates the following assembly code:
x at %ebp+8
1 movl 8(%ebp), %ebx
2 movl $0, %eax
3 movl $0, %ecx
.L13:
5 leal (%eax,%eax), %edx
6 movl %ebx, %eax
7 andl $1, %eax
8 orl %edx, %eax
9 shrl %ebx Shift right by 1
10 addl $1, %ecx
11 cmpl $32, %ecx
12 jne .L13
Reverse engineer the operation of this code and then do the following:
A. Use the assembly-code version to fill in the missing parts of the C code.
My solution.
int fun_b(unsigned x) {
int val = 0;
int i;
for ( i = 0 ;i < 32;i++) {
val += val; //because leal (%eax,%eax), edx --> %edx = %eax + %eax
val = val | x & 0x1;
x >>= 1;
}
return val;
}
Book's solution.
int fun_b(unsigned x) {
int val = 0;
int i;
for (i = 0; i < 32; i++) {
val = (val << 1) | (x & 0x1);
x >>= 1;
}
return val;
}
Please, explain to me why leal function has non typical behavior in this function.
And I dont understand how this assembly code is yielding this statement val = (val << 1) | (x & 0x1)
In your code:
val += val;
val = val | x & 0x1;
Here, val += val which is equivalent to (val*2) which is effectively equal to val left shifted by 1.
But I think your solution is correct only if the assembly code was something like:
x at %ebp+8
1 movl 8(%ebp), %ebx
2 movl $0, %eax
3 movl $0, %ecx
.L13:
5 addl %eax, %eax
6 movl %ebx, %edx
7 andl $1, %edx
8 orl %edx, %eax
9 shrl %ebx # shift right by 1
10 addl $1, %ecx
11 cmpl $32, %ecx
12 jne .L13
Because if val + val was a separate statement, compiler usually places it in eax register rather than in edx (i'm not sure this is the case always). So, for the code you have given, the possible solutions are:
val = (val << 1) | (x & 0x1);
or
val = (val + val) | (x & 0x1);
or
val = (val * 2) | (x & 0x1);
x >>= 1; means multiplying x by 2 which in binary is shifting to the left or adding 0 at the right side
x >>= 1; == x * 2; == x +=x;
The requirement is like following:
/* length must be >= 18 */
int calcActualLength(int length) {
int remainder = (length - 18) % 8;
if (remainder == 0)
return length;
return length + 8 - remainder;
}
using bit-wise operator, I could refactor the 1st line
int remainder = (length - 2) & 7;
Can it be further optimized?
((length+5)&~7)+2
int calcActualLength(int length) {
int remainder = (length - 18) % 8;
if (remainder == 0)
return length;
return length + 8 - remainder;
}
==>
int HELPER_calcActualLength(int length) {
int remainder = length % 8;
if (remainder == 0)
return length;
return length + 8 - remainder;
}
int calcActualLength(int length) {
return 18 + HELPER_calcActualLength(length - 18);
}
And HELPER_calcActualLength() equals to ROUNDUP_8() in the semantics when the argument >= 0
And more simpler ROUNDUP_8() can be:
#define ROUNDUP_8(x) (((x)+7)&~7)
int calcActualLength(int length) {
return 18 + ROUNDUP_8(length - 18);
}
==> 2 + ROUNDUP_8(length - 18 + 16);
==> 2 + ROUNDUP_8(length - 2);
==> 2 + (((length - 2)+7)&~7)
==> ((length+5)&~7)+2
Original code produces the following 64-bit assembly when compiling with gcc -O3:
movl %edi, %eax
leal -18(%rax), %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $29, %edx
addl %edx, %ecx
andl $7, %ecx
subl %edx, %ecx
je .L2
addl $8, %eax
subl %ecx, %eax
.L2:
rep
As suggested in the comments to your question, changing the argument to unsigned int allows for greater optimisations and results in the following assembly:
leal -18(%rdi), %edx
movl %edi, %eax
andl $7, %edx
je .L3
leal 8(%rdi), %eax
subl %edx, %eax
.L3:
rep
Rounding up to a multiple of 8 can be performed by adding 7 and masking with ~7. It works like this: if the last three bits are not all zero, then adding 7 carries into the 4-th bit, otherwise no carry occurs. So your function could be simplified to:
return (((length - 18) + 7) & ~7) + 18;
or simpler:
return ((length - 11) & ~7) + 18;
GCC compiles the last line to simply:
leal -11(%rdi), %eax
andl $-8, %eax
addl $18, %eax
Note that the lea (Load Effective Address) instruciton is often "abused" for its ability to compute simple linear combinations like reg1 + size*reg2 + offset