Manual conversion of a simple C program into assembly - c

Take the following C program that has two functions:
// main.c
int times_two(int num) {
    /* Double the argument by adding it to itself. */
    int doubled = num + num;
    return doubled;
}
int main(void) {
    /* a is used twice: as the argument and as the addend. */
    int a = 2;
    int num = times_two(a);
    num += a;
    return num;
}
Is the following a more-or-less accurate way to represent that in x86 assembly? I know this is verbose and I have a bunch of extra push/pop and such on the stacks that I don't need, but does the following more or less faithfully follow the C code?
# Syscall number placed in %eax before the syscall instruction below.
.equ SYS_EXIT, 60

# Process entry point: run main, then pass its return value to the
# exit syscall as the process status.
.globl _start
_start:
    call    main
    movl    %eax, %edi          # first syscall argument = value main returned
    movl    $SYS_EXIT, %eax     # select the exit syscall
    syscall
# int main(void) { int a = 2; int num = times_two(a) + a; return num; }
# Out: %eax = times_two(a) + a
# Frame: a is a 4-byte int at -4(%rbp). 16 bytes are reserved (not 8) so
# that %rsp is 16-byte aligned at the call, as the System V AMD64 ABI requires.
main:
    # function() {...
    push    %rbp
    mov     %rsp, %rbp
    # int a = 2
    sub     $16, %rsp
    movl    $2, -4(%rbp)
    # times_two(a)
    movl    -4(%rbp), %edi      # 32-bit load: an 8-byte load from -4(%rbp) would also read 4 bytes of the saved %rbp
    call    times_two
    # ... + a
    addl    -4(%rbp), %eax
    # ...} // (return value already in eax)
    add     $16, %rsp
    pop     %rbp
    ret
# int times_two(int num) { return num + num; }
# In:  %edi = num
# Out: %eax = num + num
times_two:
    push    %rbp
    mov     %rsp, %rbp
    # return num + num
    # A single lea computes the sum. No prior zeroing of %eax is needed
    # because writing %eax replaces its old contents entirely.
    lea     (%rdi,%rdi), %eax
    pop     %rbp
    ret
If not, what may I be screwing up or missing/doing wrong?

Related

mov -8(%rbp), %rax not updating %rax value

Below is my assembly for the C code
int main() {
    int a = 1, b = 2;
    int sum = a + b;
    return sum;
}
# int main() { int a = 1; int b = 2; return a + b; }
# Stack layout once both locals are pushed (each push lowers %rsp by 8):
#    0(%rbp)  caller's saved %rbp
#   -8(%rbp)  a = 1
#  -16(%rbp)  b = 2
.globl main
main:
    push    %rbp
    mov     %rsp, %rbp
    mov     $1, %rax
    push    %rax                # a lands at -8(%rbp)
    mov     $2, %rax
    push    %rax                # b lands at -16(%rbp)
    mov     -8(%rbp), %rax      # load a (0(%rbp) would be the saved %rbp, not a local)
    push    %rax                # park a while b is loaded
    mov     -16(%rbp), %rax     # load b (-8(%rbp) holds a, which is why it never read 2)
    pop     %rcx                # %rcx = a
    add     %rcx, %rax          # %rax = b + a
    mov     %rbp, %rsp          # drop the locals
    pop     %rbp
    ret
I am fairly sure this is the correct mapping of the C code to assembly. The issue I am facing, which I observed while running the assembled binary through GDB, is that:
mov -8(%rbp), %rax
doesn't update the value of %rax to 2.
The command I used for assembling the assembly code is:
gcc data/stage_5/valid/multiple_vars.s -o data/stage_5/valid/multiple_vars

GCC emits a label that's not jumped to by anything outside that label?

Taking the following C code
#include <stdio.h>
void test(unsigned char buffer[], int size) {
    /* Print the first `size` bytes of buffer, one character at a time. */
    int i = 0;
    while (i < size) {
        printf("%c", buffer[i]);
        i++;
    }
}
void main() {
    /* Exactly five bytes: the array has no room for a terminating NUL. */
    unsigned char buffer[5] = "Hello";
    test(buffer, (int)sizeof buffer);
}
and compiling it with the flags -fno-stack-protector -fno-asynchronous-unwind-tables -fno-unroll-loops for clarity produces the following assembly for the test() function:
# void test(unsigned char buffer[], int size), as emitted by GCC.
# In: %rdi = buffer, %esi = size.
# %rbx walks the buffer; %rbp holds the one-past-the-end address.
# The '@' in putchar@PLT and @progbits is restored here: in x86 GAS a '#'
# starts a comment and would silently cut those operands short.
test:
    testl   %esi, %esi
    jle     .L6                     # size <= 0: nothing to print
    pushq   %rbp
    leal    -1(%rsi), %eax          # %eax = size - 1
    pushq   %rbx
    leaq    1(%rdi,%rax), %rbp      # %rbp = buffer + size (end address)
    movq    %rdi, %rbx              # %rbx = address of the current byte
    subq    $8, %rsp                # two pushes + 8 keep %rsp 16-byte aligned at the call
    .p2align 4,,10
    .p2align 3
.L3:                                # loop top: entered by fall-through, re-entered by jne
    movzbl  (%rbx), %edi            # argument = current byte, zero-extended
    addq    $1, %rbx
    call    putchar@PLT
    cmpq    %rbp, %rbx
    jne     .L3                     # repeat until the end address is reached
    addq    $8, %rsp
    popq    %rbx
    popq    %rbp
    ret
    .p2align 4,,10
    .p2align 3
.L6:
    ret
    .size test, .-test
    .section .text.startup,"ax",@progbits
    .p2align 4
It seems to me like the L3 label here is completely useless since it is never jumped to or entered. (Except by jne .L3, but that instruction is inside of the L3 label already).
Can anyone explain how and why this assembly still produces the expected effect?
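If you read the assembly from the top, you will see that execution simply falls through into .L3 after the setup code: a label is only a name for an address, not a scope that has to be jumped into. The jne .L3 then jumps back to it, which is your for loop in C.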

Translating assembly

So I'm learning how to convert the assembly into readable C code. The assembly is as follows...
Assume the compiler places the C variables as follows: a at -4(%rbp), b at -8(%rbp), and c at -12(%rbp).
# main as emitted by the compiler. Variable slots as stated in the text:
# a at -4(%rbp), b at -8(%rbp), c at -12(%rbp).
.file "main.c"
.text
.globl main
.type main, @function
main:
    endbr64
    pushq   %rbp
    movq    %rsp, %rbp
    movl    $10, -12(%rbp)          # c = 10
    movl    $20, -4(%rbp)           # a = 20
    movl    $1, -8(%rbp)            # b = 1
.L4:                                # loop test
    cmpl    $1, -12(%rbp)
    je      .L7                     # leave the loop once c == 1
    movl    -8(%rbp), %eax          # %eax is only a scratch register: load b,
    imull   -12(%rbp), %eax         # multiply by c,
    movl    %eax, -8(%rbp)          # and store back: b = b * c
    subl    $1, -12(%rbp)           # c = c - 1
    jmp     .L4
.L7:
    nop
    movl    -4(%rbp), %eax
    imull   -8(%rbp), %eax
    movl    %eax, -4(%rbp)          # a = a * b
    movl    $0, %eax                # return 0
    popq    %rbp
    ret
This is what I have so far.
int c = 10;
int a = 20;
int b = 1;
// The assembly leaves the loop when c == 1 (cmpl $1 / je .L7); counting down from 10, c > 1 is equivalent.
for(c = 10; c > 1; c--)
{
// x stands in for %eax, the scratch register that holds the product before it is stored back to b.
int x = b;
x = c * x;
b = x;
}
// NOTE(review): the instructions after .L7 (a = a * b, then return 0) are not translated here yet.
Not completely sure how correct that is. The part that confuses me the most is the appearance (from what seems like out of nowhere) of eax. When eax appears, should I just assume that it is some other random variable? (hence the integer x I introduced)

Fast modulo 10 in c

I am looking for a fast modulo 10 algorithm because I need to speed up my program which does many modulo operations in cycles.
I have checked out this page which compares some alternatives.
As far as I understand it correctly, T3 was the fastest of all.
My question is, what would x % y look like using the T3 technique?
I copied T3 technique here for simplicity in case the link gets down.
/* y acts as a wrap-around counter used in place of a modulo operation. */
for (int x = 0; x < max; x++) {
    if (y > (threshold - 1)) {
        total += x;
        y = 0;              /* reset */
    }
    ++y;
}
Regarding the comments: if this is not really faster than the regular mod, I am looking for a modulo that is at least 2 times faster than using %.
I have seen many examples with use power of two, but since 10 is not, how can I get it to work?
Edit:
For my program, let's say I have 2 for cycles where n=1 000 000 and m=1000.
Looks like this:
for (i = 1; i <= n; i++) {
// i % 10 only takes the values 0..9, so the index is always one of ten multiples of m.
D[(i%10)*m] = i;
for (j = 1; j <= m; j++) {
...
}
}
Here's the fastest modulo-10 function you can write:
/* Returns x modulo 10, written with a plain % and a constant divisor. */
unsigned mod10(unsigned x)
{
return x % 10;
}
And here's what it looks like once compiled:
# x % 10 without a divide instruction: form q from a multiply and shifts, then compute x - 10*q.
movsxd rax, edi                  # rax = 32-bit argument, sign-extended
imul rcx, rax, 1717986919        # rcx = x * 0x66666667
mov rdx, rcx
shr rdx, 63                      # rdx = sign bit of the product
sar rcx, 34                      # rcx = product >> 34 (arithmetic shift)
add ecx, edx                     # q = shifted product + sign bit
add ecx, ecx                     # 2*q
lea ecx, [rcx + 4*rcx]           # 10*q
sub eax, ecx                     # eax = x - 10*q
ret
# NOTE(review): the sign extension and arithmetic shift treat the argument as signed, unlike the `unsigned` mod10 source shown with this listing; confirm which source produced it.
Note the lack of division/modulus instructions, the mysterious constants, the use of an instruction which was originally intended for complex array indexing, etc. Needless to say, the compiler knows a lot of tricks to make your program as fast as possible. You'll rarely beat it on tasks like this.
You likely can't beat the compiler.
Debug build
// int foo = x % 10;
010341C5 mov eax,dword ptr [x]      ; eax = x
010341C8 cdq                        ; sign-extend eax into edx:eax, the dividend for idiv
010341C9 mov ecx,0Ah                ; divisor = 10
010341CE idiv eax,ecx               ; quotient -> eax, remainder -> edx
010341D0 mov dword ptr [foo],edx    ; foo = remainder
Retail build (doing some ninja math there...)
// int foo = x % 10;
00BD100E mov eax,66666667h          ; multiplier constant
00BD1013 imul esi                   ; edx:eax = esi * eax (x is presumably held in esi)
00BD1015 sar edx,2                  ; edx = high half of the product >> 2
00BD1018 mov ecx,edx
00BD101A shr ecx,1Fh                ; ecx = sign bit of edx
00BD101D add ecx,edx                ; ecx = q
00BD101F lea eax,[ecx+ecx*4]        ; eax = 5*q
00BD1022 add eax,eax                ; eax = 10*q
00BD1024 sub esi,eax                ; esi = esi - 10*q, i.e. the remainder
The code isn’t a direct substitute for modulo, it substitutes modulo in that situation. You can write your own mod by analogy (for a, b > 0):
int mod(int a, int b) {
    /* Repeated subtraction; intended for a, b > 0 only. */
    for (; a >= b; a -= b) {
    }
    return a;
}
… but whether that’s faster than % is highly questionable.
This will work for (multiword) values larger than the machineword (but assuming a binary computer ...):
#include <stdio.h>
/*
 * val % 10 computed one hex digit at a time.
 * The lowest nibble counts as itself and every higher nibble is weighted
 * by 6; the accumulator is reduced below 10 by repeated subtraction.
 */
unsigned long mod10(unsigned long val)
{
    unsigned acc = val & 0xf;

    while (acc >= 10) {
        acc -= 10;
    }
    val >>= 4;
    while (val != 0) {
        acc += 6 * (val & 0xf);
        while (acc >= 10) {
            acc -= 10;
        }
        val >>= 4;
    }
    return acc;
}
/*
 * Reads an unsigned decimal number from the first command-line argument
 * and prints it together with its value modulo 10.
 * Returns 0 on success, 1 when the argument is missing or not a number.
 */
int main (int argc, char **argv)
{
    unsigned long val;
    unsigned res;

    /* argv[1] is NULL when no argument is given; also reject non-numeric input. */
    if (argc < 2 || sscanf(argv[1], "%lu", &val) != 1) {
        fprintf(stderr, "usage: %s <unsigned integer>\n", argc > 0 ? argv[0] : "mod10");
        return 1;
    }
    res = mod10(val);
    printf("%lu -->%u\n", val,res);
    return 0;
}
UPDATE:
With some extra effort, you could get the algorithm free of multiplications, and with the proper amount of optimisation we can even get the recursive call inlined:
/*
 * val % 10 without multiply or divide operators in the source.
 * The lowest hex digit counts as itself; every higher hex digit n adds
 * 6*n, built from shifts as 4n + 2n. The running sum is reduced by
 * calling mod10_1 again on the small accumulator.
 */
static unsigned long mod10_1(unsigned long val)
{
    unsigned char res=0; //just to show that we don't need a big accumulator
    res =val &0xf; // res can never be > 15
    if (res>=10) { res -= 10; }
    for(val >>= 4; val; val >>= 4){
        /* Must be +, not |: the two shifted values can share set bits
           (n = 3: 12 | 6 == 14, but 12 + 6 == 18). */
        res += ((val&0xf)<<2) + ((val&0xf)<<1);
        res= mod10_1(res); // the recursive call
    }
    return res;
}
And the result for mod10_1 appears to be mul/div free and almost without branches:
# Compiler output for mod10_1: val arrives in %rdi, the result is returned in %eax.
mod10_1:
.LFB25:
.cfi_startproc
movl %edi, %eax
andl $15, %eax                   # res = val & 0xf
leal -10(%rax), %edx             # candidate res - 10
cmpb $10, %al
cmovnb %edx, %eax                # if (res >= 10) res -= 10, selected without a branch
movq %rdi, %rdx
shrq $4, %rdx                    # val >>= 4
testq %rdx, %rdx
je .L12                          # no higher nibbles left: return res
pushq %r12                       # %r12, %rbp and %rbx are used as scratch in the loop below
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
.L4:                             # one iteration per remaining nibble of val (kept in %rdx)
movl %edx, %ecx
andl $15, %ecx                   # n = val & 0xf
leal (%rcx,%rcx,2), %ecx         # 3*n
leal (%rax,%rcx,2), %eax         # res += 6*n
# NOTE(review): there is no call instruction before the loop test; the
# cmov/set sequence that follows is presumably the inlined mod10_1(res) call.
movl %eax, %ecx
movzbl %al, %esi
andl $15, %ecx
leal -10(%rcx), %r9d
cmpb $9, %cl
cmovbe %ecx, %r9d
shrq $4, %rsi
leal (%rsi,%rsi,2), %ecx
leal (%r9,%rcx,2), %ecx
movl %ecx, %edi
movzbl %cl, %ecx
andl $15, %edi
testq %rsi, %rsi
setne %r10b
cmpb $9, %dil
leal -10(%rdi), %eax
seta %sil
testb %r10b, %sil
cmove %edi, %eax
shrq $4, %rcx
andl $1, %r10d
leal (%rcx,%rcx,2), %r8d
movl %r10d, %r11d
leal (%rax,%r8,2), %r8d
movl %r8d, %edi
andl $15, %edi
testq %rcx, %rcx
setne %sil
leal -10(%rdi), %ecx
andl %esi, %r11d
cmpb $9, %dil
seta %bl
testb %r11b, %bl
cmovne %ecx, %edi
andl $1, %r11d
andl $240, %r8d
leal 6(%rdi), %ebx
setne %cl
movl %r11d, %r8d
andl %ecx, %r8d
leal -4(%rdi), %ebp
cmpb $9, %bl
seta %r12b
testb %r8b, %r12b
cmovne %ebp, %ebx
andl $1, %r8d
cmovne %ebx, %edi
xorl $1, %ecx
andl %r11d, %ecx
orb %r8b, %cl
cmovne %edi, %eax
xorl $1, %esi
andl %r10d, %esi
orb %sil, %cl
cmove %r9d, %eax
shrq $4, %rdx                    # val >>= 4
testq %rdx, %rdx
jne .L4                          # more nibbles to process
popq %rbx
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
movzbl %al, %eax                 # widen the 8-bit accumulator for the return value
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.L12:                            # early exit taken when val had no nibbles above the lowest
movzbl %al, %eax
ret
.cfi_endproc
.LFE25:
.size mod10_1, .-mod10_1
.p2align 4,,15
.globl mod10
.type mod10, #function

Writing a function in 64 bit assembly

I am trying to write a function (max) in 64 bit assembly and I don't know what I am doing wrong, maybe some of you guys can determine what I am doing wrong :/
Here's the function:
int max(int a, int b) {
    /* Yield whichever of `a' and `b' is greater (`b' when they are equal). */
    return (a > b) ? a : b;
}
And here is my assembly code (with comments):
// Asker's attempt at int max(int a, int b): a is taken from %rdi, b from %rsi, result left in %rax.
// NOTE(review): a and b are 32-bit ints but every test/cmp below works on the full 64-bit
// registers; whether bits 32-63 carry the sign of the int depends on the caller - confirm against the ABI.
push %rbp
mov %rsp, %rbp
mov %rdi, %rax // rax = a
mov %rsi, %rcx // rcx = b
test %rax, %rax // Checking if first parameter is signed
js .signedRAX
test %rcx, %rcx // Checking if second parameter is signed
js .signedRCX
jmp .compare // Reached only when neither sign bit is set
.signedRAX:
test %rcx, %rcx // Checking if both are signed
js .signedRAXandRCX
mov %rcx, %rax // If not then return the positive number (rax = b)
jmp .end // finish the function
.signedRCX:
jmp .end // If only the second parameter is signed then jump
.signedRAXandRCX: // straight to end of function and return %rax
cmp %rax, %rcx // If both are signed compare which one is the max (flags from rcx - rax)
jl .end // b < a: keep rax = a
mov %rcx, %rax // otherwise return b
jmp .end
.compare:
cmp %rax, %rcx // If both are positive then compare which one is
jg .end // the max
// NOTE(review): cmp %rax, %rcx sets flags from rcx - rax, so jg is taken when b > a and
// returns a; the fall-through returns b when b <= a. This path yields the smaller value.
mov %rcx, %rax
.end:
mov %rbp, %rsp
pop %rbp
ret
I am getting the wrong output when comparing two parameters that are both signed and then both positive.
You are making this much too complicated.
If I input your program to gcc -S, I get
# gcc -S output for max. This is 32-bit code: both arguments are read from the stack.
max:
.LFB0:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
movl 8(%ebp), %eax               # eax = first argument (a)
cmpl 12(%ebp), %eax              # flags from a - b
jle .L2                          # a <= b: take the else branch
movl 8(%ebp), %eax               # return a
jmp .L3
.L2:
movl 12(%ebp), %eax              # return b
.L3:
popl %ebp
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE0:
If I take over your "ABI" and way of passing arguments,
I get
# int max(int a, int b): returns the larger of a and b.
# ABI: System V AMD64.  In: %edi = a, %esi = b.  Out: %eax.
# The ints are 32 bits wide, so the 32-bit register names are used throughout;
# an l-suffixed instruction with 64-bit registers does not assemble.
max:
    push    %rbp
    mov     %rsp, %rbp
    movl    %edi, %eax          # eax = a
    movl    %esi, %ecx          # ecx = b
    cmpl    %ecx, %eax          # AT&T order: flags from eax - ecx, i.e. a - b
    jge     .L2                 # a >= b: a is already in eax
    movl    %ecx, %eax          # a < b: return b
.L2:
    mov     %rbp, %rsp
    pop     %rbp
    ret
Here is C-like pseudocode equivalent to the assembly from the question. You can see that for a >= 0 and b < 0 it returns b, and for a < 0 and b >= 0 it returns a. That's incorrect. There may be other errors in the code, because such a simple operation is encoded in a really messy way. It's really hard to see anything in the code. Don't make simple things so complex. Follow the KISS principle.
// NOTE(review): the quoted assembly is AT&T syntax (source operand first), so
// `mov %rcx, %rax` copies b into the return register and `cmp %rax, %rcx` tests rcx - rax.
// This transcription reads the operands the other way round, so the roles of a and b
// below are swapped relative to what the assembly actually executes.
// test %rax, %rax
// js .signedRAX
if (a >= 0) {
// test %rcx, %rcx
// js .signedRCX
if (b >= 0) {
// .compare
// cmp %rax, %rcx // If both are positive then compare which one is
// jg .end // the max
if (a > b) {
b = a;
}
return b;
} else {
// .signedRCX
return b;
}
} else {
// .signedRAX
// test %rcx, %rcx // Checking if both are signed
// js .signedRAXandRCX
if (b >= 0) {
b = a;
return b;
} else {
// .signedRAXandRCX
// cmp %rax, %rcx // If both are signed compare which one is the max
// jl .end
if (a < b) {
b = a;
}
return b;
}
}

Resources