I'm doing an x86 assembly project for class in which we're supposed to implement a heap of personnel records. The call heap_swap line is giving me trouble: if I uncomment it, the program segfaults, yet heap_swap itself works fine no matter how I test it. I've really racked my brain and would appreciate any help anyone can give!
sift_up1:
# ecx = i
# rdx = address to heap
# r9 = address to heap[i]
# rax = offset of id
# r8 = address for heap[i].id_number
# r10d = heap[i].id_number
# r11d = index of parent
# rdi = address for parent id number
# ebx = heap[parent].id_number
pushq %rbp
movq %rsp, %rbp
subq $32, %rsp
pushq %rbx #a section to keep track of all the callee saved registers
pushq %rdi #that need to be restored
leaq offset_of_id(%rip), %rax #put the address of the id offset into a register
leaq heap(%rip), %rdx
jmp LOOP_TOP
LOOP_TOP:
cmpl $0, %ecx #Check if i=0, if so jump to exit loop
je EXIT_LOOP
movl $8, %r9d
imull %ecx, %r9d #finding heap[i]
addq (%rdx), %r9
movq %r9, %r8 #r8 contains heap[i]
addq (%rax), %r8 #add id offset, it becomes heap[i].id_number
movl (%r8), %r10d #dereference id_number and place it into r10d
movl %ecx, %r11d #find the index of the parent of i
subl $1, %r11d
shrl $1, %r11d
movl $8, %edi
imull %r11d, %edi
addq (%rdx), %rdi #rdi holds the address of heap[parent]
addq (%rax), %rdi #rdi holds the address of heap[parent].id_number
movl (%rdi), %ebx #ebx holds the heap[parent].id_number
cmpl %ebx, %r10d
jle EXIT_LOOP
pushq %rdx
movq %r11, %rdx #put the indexes in the correct parameter registers
# call heap_swap #call heap_swap
popq %rdx
movl %r11d, %ecx #modify i
jmp LOOP_TOP #jump to loop top
I'm getting a segmentation fault in the function, apparently due to the instruction movq -8(%rbp), %rax, the one before the printf. I can't understand why.
Note: this is not gcc-generated assembly, but output from a compiler I am writing. The code is very close to what gcc generates.
.text
.globl main
.type main, @function
main:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl $2, -4(%rbp)
leaq -4(%rbp), %rax
movl %eax, %edi
movb $0, %al
call fcvt2
movl %eax, -4(%rbp)
leaq .LC0(%rip), %rdi
movl -4(%rbp), %esi
movb $0, %al
call printf
leave
ret
.globl fcvt2
.type fcvt2, @function
fcvt2:
pushq %rbp
movq %rsp, %rbp
subq $32, %rsp
movq %rdi, -8(%rbp)
leaq .LC1(%rip), %rdi
movq -8(%rbp), %rax
movl (%rax), %esi
movb $0, %al
call printf
movq -8(%rbp), %rax
movl (%rax), %edi
movl %edi, %eax
leave
ret
.section .rodata
.LC1:
.string "It should be : %d\f"
.LC0:
.string "%d\n"
And the C program is:
int fcvt2(int *ip) {
int i;
printf("It should be : %d\f", *ip);
return *ip;
}
void main() {
int i;
i = 2;
i = fcvt2(&i);
printf("%d\n",i);
return;
}
gdb output at fault point:
rax 0xffffdd4c 4294958412
rbx 0x0 0
rcx 0x7ffffff7 2147483639
rdx 0x7ffff7dd3780 140737351858048
rsi 0x7fffffffdd48 140737488346440
rdi 0xffffdd4c 4294958412
rbp 0x7fffffffdd30 0x7fffffffdd30
rsp 0x7fffffffdd00 0x7fffffffdd00
r8 0x0 0
r9 0x9 9
r10 0x7ffff7dd1b78 140737351850872
r11 0x246 582
r12 0x400430 4195376
r13 0x7fffffffde30 140737488346672
r14 0x0 0
r15 0x0 0
rip 0x40059c 0x40059c <fcvt2+20>
eflags 0x10206 [ PF IF RF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
movl %eax, %edi in the caller truncates the pointer arg to fcvt2. You actually segfault on the movl (%rax), %esi inside fcvt2, not on the instruction before it as you claimed: your own register dump shows rax already holding the truncated 0xffffdd4c. (Time for a refresher on your GDB skills?)
leaq -4(%rbp), %rax generated the address correctly in %rax, but then your compiler forgot that it was a 64-bit pointer to a 32-bit value. (Ideally you'd want leaq -4(%rbp), %rdi, straight into the arg register.)
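For reference, a minimal sketch of what the fixed call sequence could look like (hypothetical instruction choice; your register allocator may do it differently):
leaq -4(%rbp), %rdi  # pass the full 64-bit address of i directly in the first arg register
movb $0, %al         # only needed if the callee were variadic
call fcvt2
movl %eax, -4(%rbp)  # the 32-bit int result comes back in %eax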
Off topic: if you don't need to preserve the upper bytes of RAX, movb $0, %al is less efficient than xorl %eax, %eax. You're doing this for the x86-64 SysV variadic convention, and you're right that only %al has to say how many XMM register args there are, not the whole %eax; still, zeroing the full EAX is the most efficient way to zero AL. You don't need to do it at all for non-variadic functions, but your compiler is obviously still in the just-get-it-working phase, and doing it unconditionally isn't a correctness problem: you never need to pass anything else in RAX, and function calls are always assumed to clobber RAX.
(Also related: partial registers on Haswell/Skylake have false dependencies: AL isn't renamed separately from RAX anymore.)
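As a sketch, the cheap way to satisfy that convention before a variadic call (here with zero XMM register args, as in your printf calls):
xorl %eax, %eax  # %al = 0 XMM register args; zeroing all of %rax avoids any partial-register penalty
call printf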
I practiced assembler a long time ago, and I would like to understand a simple program (I generate the assembler code from C) that adds two vectors (actually two arrays) and stores the result in a third one (an output array). My goal after that is to study vectorization. For this I use gcc 4.9 under Debian Wheezy on an i7 processor.
Here is the C code snippet (non-vectorized version):
#include <stdio.h>
#define SIZE 10000
void test(double *a, double *b, double *c)
{
int i;
for (i = 0; i < SIZE; i++)
{
c[i] = a[i] + b[i];
}
}
int main()
{
int i;
double tab1[SIZE];
double tab2[SIZE];
double tab3[SIZE];
for (i = 0; i < SIZE; i++)
{
tab1[i] = i;
tab2[i] = i;
tab3[i] = 0;
}
test(tab1, tab2, tab3);
for (i = 0; i < SIZE; i++)
printf(" tab3[%d] = %f\n", i, tab3[i]);
return 0;
}
I generate the assembly in AT&T syntax with:
gcc -std=c99 -c main_no_vectorized.c -O3 -S -o main_no_vectorized.s
Here is the assembly code:
.file "main_no_vectorized.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB0:
.text
.LHOTB0:
.p2align 4,,15
.globl test
.type test, @function
test:
.LFB3:
.cfi_startproc
leaq 16(%rdx), %rax
leaq 16(%rsi), %rcx
cmpq %rax, %rsi
setae %r8b
cmpq %rcx, %rdx
setae %cl
orb %cl, %r8b
je .L7
cmpq %rax, %rdi
leaq 16(%rdi), %rax
setae %cl
cmpq %rax, %rdx
setae %al
orb %al, %cl
je .L7
testb $8, %dil
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
je .L8
movsd (%rdi), %xmm0
movl $9998, %ebp
movl $4999, %r9d
movl $9999, %r12d
movl $1, %r8d
movl $1, %ebx
addsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
.L3:
salq $3, %r8
xorl %eax, %eax
xorl %ecx, %ecx
leaq (%rdi,%r8), %r11
leaq (%rsi,%r8), %r10
addq %rdx, %r8
.p2align 4,,10
.p2align 3
.L4:
movupd (%r10,%rax), %xmm0
addl $1, %ecx
addpd (%r11,%rax), %xmm0
movups %xmm0, (%r8,%rax)
addq $16, %rax
cmpl %r9d, %ecx
jb .L4
cmpl %ebp, %r12d
leal (%rbx,%rbp), %eax
je .L1
cltq
movsd (%rdi,%rax,8), %xmm0
addsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
.L1:
popq %rbx
.cfi_remember_state
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L8:
.cfi_restore_state
movl $10000, %ebp
movl $5000, %r9d
movl $10000, %r12d
xorl %r8d, %r8d
xorl %ebx, %ebx
jmp .L3
.L7:
.cfi_def_cfa_offset 8
.cfi_restore 3
.cfi_restore 6
.cfi_restore 12
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L2:
movsd (%rdi,%rax), %xmm0
addsd (%rsi,%rax), %xmm0
movsd %xmm0, (%rdx,%rax)
addq $8, %rax
cmpq $80000, %rax
jne .L2
rep ret
.cfi_endproc
.LFE3:
.size test, .-test
.section .text.unlikely
.LCOLDE0:
.text
.LHOTE0:
.section .rodata.str1.1,"aMS",#progbits,1
.LC3:
.string " tab3[%d] = %f\n"
.section .text.unlikely
.LCOLDB4:
.section .text.startup,"ax",#progbits
.LHOTB4:
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB4:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
xorl %eax, %eax
subq $240016, %rsp
.cfi_def_cfa_offset 240032
movdqa .LC2(%rip), %xmm3
leaq 32(%rsp), %rcx
leaq 80032(%rsp), %rdx
movdqa .LC1(%rip), %xmm1
.p2align 4,,10
.p2align 3
.L21:
pshufd $238, %xmm1, %xmm0
cvtdq2pd %xmm1, %xmm2
paddd %xmm3, %xmm1
movaps %xmm2, 16(%rsp,%rax)
cvtdq2pd %xmm0, %xmm0
movaps %xmm2, 80016(%rsp,%rax)
movaps %xmm0, (%rcx,%rax)
movaps %xmm0, (%rdx,%rax)
addq $32, %rax
cmpq $80000, %rax
jne .L21
leaq 160016(%rsp), %rdi
movl $80000, %edx
xorl %esi, %esi
call memset
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L22:
movapd 16(%rsp,%rax), %xmm0
addpd 80016(%rsp,%rax), %xmm0
movaps %xmm0, 160016(%rsp,%rax)
addq $16, %rax
cmpq $80000, %rax
jne .L22
xorl %ebx, %ebx
.p2align 4,,10
.p2align 3
.L23:
movsd 160016(%rsp,%rbx,8), %xmm4
movl %ebx, %esi
movl $.LC3, %edi
movl $1, %eax
addq $1, %rbx
movapd %xmm4, %xmm0
movsd %xmm4, 8(%rsp)
call printf
cmpq $10000, %rbx
jne .L23
addq $240016, %rsp
.cfi_def_cfa_offset 16
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE4:
.size main, .-main
.section .text.unlikely
.LCOLDE4:
.section .text.startup
.LHOTE4:
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC1:
.long 0
.long 1
.long 2
.long 3
.align 16
.LC2:
.long 4
.long 4
.long 4
.long 4
.ident "GCC: (Debian 4.9.1-16) 4.9.1"
.section .note.GNU-stack,"",#progbits
Could you explain the main steps of the assembly code above in relation to the C code, in particular the test function, the initialization loop in main, the parameter passing (i.e., where are the push and pop instructions for the stack?), and the actual addition of the a and b arrays?
What do the .L2, .L3, ... labels correspond to? Is there any relation to the L2 and L3 caches?
Sorry for these basic questions, but I am just beginning with Intel x86-64 assembler.
Thanks for your precious help.
The generated assembly code is quite complicated. It first checks whether the arrays a, b, and c overlap in a way that would cause the optimized loop to produce wrong results. For example, if you did this:
test(tab1, tab2, &tab1[1]);
then the overlap would be detected and cause the code to jump to .L7 (the straightforward implementation). By the way, the L stands for label, and the label numbers are generated by the compiler with no particular meaning; they have nothing to do with the L2/L3 caches. .L1, .L2, .L3, etc. are just places for the code to branch to. The overlap checks start at .LFB3 and end at the last je .L7.
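The first of those checks can be read like this (a commented restatement of the lines above, nothing new):
leaq 16(%rdx), %rax  # %rax = c + 16, one vector width past the start of c
leaq 16(%rsi), %rcx  # %rcx = b + 16
cmpq %rax, %rsi
setae %r8b           # b starts at or after c + 16
cmpq %rcx, %rdx
setae %cl            # c starts at or after b + 16
orb %cl, %r8b        # either one means b and c are far enough apart
je .L7               # neither holds: possible overlap, use the simple loop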
If no overlap is detected, an optimized loop is used. This optimized loop adds two doubles at a time instead of just one. The first thing it does is find out whether array a is aligned to a 16-byte boundary (the testb $8, %dil instruction). If it is, it jumps to .L8 to load one set of constants (e.g. r9 = 5000). If the array is not aligned, it falls through, loads a different set of constants (e.g. r9 = 4999), and also handles the first element. This is because the unaligned case needs to do 4999 iterations two at a time and handle the first and last unaligned elements separately outside the loop, whereas the aligned case just does 5000 iterations.
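The alignment test itself is just a bit test on the low address bits; a minimal sketch (the label name here is hypothetical):
testb $8, %dil   # low byte of %rdi: is bit 3 of the address of a[0] set?
je .Laligned     # (a % 16) == 0: 16-byte aligned, nothing to peel
# otherwise a is only 8-byte aligned: peel off a[0] with a scalar add,
# so the remaining elements start on a 16-byte boundary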
Either way, the code reaches .L3 next. The code at .L3 and .L4 is the optimized loop, which does the adds two at a time using the addpd instruction (the non-optimized loop at .L7 uses addsd to do one add at a time). After the .L4 loop finishes, the code checks whether it needs to handle the last element (for the unaligned case). Then it returns with the ret instruction.
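The core of that optimized loop is the .L4 body; a trimmed, commented sketch:
movupd (%r10,%rax), %xmm0  # load b[i] and b[i+1] (movupd tolerates an unaligned address)
addpd (%r11,%rax), %xmm0   # add a[i] and a[i+1] in one instruction; this memory operand
                           # must be 16-byte aligned, which is what the peeling guaranteed
movups %xmm0, (%r8,%rax)   # store c[i] and c[i+1]
addq $16, %rax             # advance by two doubles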
By the way, it helps to know that when test is called, a is in rdi, b is in rsi, and c is in rdx. That is the x86-64 System V calling convention: the first arguments travel in registers, which is why no arguments are pushed on the stack. If you don't understand x86 assembly too well, concentrate on the code starting at .L7. That is the non-optimized version, and you should be able to figure it out given that your three arguments are in rdi, rsi, and rdx.
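Putting that together, a minimal non-optimized test could look like this (a hypothetical standalone sketch, essentially the .L7/.L2 code with the argument registers annotated):
.globl test_simple           # hypothetical name for this sketch
.type test_simple, @function
test_simple:
xorl %eax, %eax              # %rax = byte offset i*8, starting at 0
.Lbody:
movsd (%rdi,%rax), %xmm0     # %xmm0 = a[i] (first arg a arrived in %rdi)
addsd (%rsi,%rax), %xmm0     # %xmm0 += b[i] (second arg b in %rsi)
movsd %xmm0, (%rdx,%rax)     # c[i] = %xmm0 (third arg c in %rdx)
addq $8, %rax                # next double
cmpq $80000, %rax            # 10000 doubles * 8 bytes each
jne .Lbody
ret                          # nothing was pushed, so nothing to pop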
The .L2 and such are labels; they refer to the position of the next instruction. They are pretty much exactly like labels in C, if you've ever used goto: their main use is as the target of a jump or branch, to say where the jump goes.
For example, the .L2 label is the start of the body of your for (i = 0; i < SIZE; i++) loop in test(); it counts by 8 bytes (the size of a double) up to 8*10000. The last instruction in the loop, jne .L2, jumps back to .L2 if the preceding comparison was not equal.
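The correspondence with C really is that direct; a minimal sketch (hypothetical label and counter):
.Lagain:             # C:  again:
addq $8, %rax        #         i += 8;
cmpq $80000, %rax    #         if (i != 80000)
jne .Lagain          #             goto again;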
You may find this reference (PDF) on x64 helpful.
I have code working with linked lists. It uses tail calls, but unfortunately GCC does not optimise them.
Here is the C code of the function that recursively calculates the length of a linked list:
size_t ll_length(const ll_t* list) {
return ll_length_rec(list, 0);
}
size_t ll_length_rec(const ll_t* list, size_t size_so_far)
{
if (list) {
return ll_length_rec(list->next, size_so_far + 1);
} else {
return size_so_far;
}
}
and here is the assembler code:
.globl _ll_length_rec
_ll_length_rec:
LFB8:
.loc 1 47 0
pushq %rbp
LCFI6:
movq %rsp, %rbp
LCFI7:
subq $32, %rsp
LCFI8:
movq %rdi, -8(%rbp)
movq %rsi, -16(%rbp)
.loc 1 48 0
cmpq $0, -8(%rbp)
je L8
.loc 1 49 0
movq -16(%rbp), %rsi
incq %rsi
movq -8(%rbp), %rax
movq 8(%rax), %rdi
call _ll_length_rec # < THIS SHOULD BE OPTIMIZED
movq %rax, -24(%rbp)
jmp L10
If GCC optimised the tail call, there would be no call in the asm. I compile it with:
gcc -S -fnested-functions -foptimize-sibling-calls \
-03 -g -Wall -o llist llist.c
and GCC version is:
i686-apple-darwin10-gcc-4.2.1 (GCC) 4.2.1 (Apple Inc. build 5666) (dot 3)
If I add -O3 to your compilation line, it does not generate the offending call, while without it I get the unoptimised call. I don't know all the gcc options by heart, but is -03 (with a zero) a typo for -O3 (with a capital O) or intentional? With -O3 I get the following: the recursion has become a plain loop (the jmp/jne around LBB1_1 and LBB1_3), with no call left:
Ltmp2:
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
jmp LBB1_1          # jump straight into the loop test: the recursion is now a loop
.align 4, 0x90
LBB1_3:
addq $2, %rsi       # two more nodes counted this iteration
Ltmp3:
movq (%rax), %rdi   # list = list->next->next
Ltmp4:
LBB1_1:
Ltmp5:
testq %rdi, %rdi    # end of list?
je LBB1_5
Ltmp6:
movq (%rdi), %rax   # rax = list->next
testq %rax, %rax
jne LBB1_3          # at least two nodes remain: keep looping
incq %rsi           # exactly one node left: count it and fall through
LBB1_5:
movq %rsi, %rax     # return size_so_far
Ltmp7:
Ltmp8:
popq %rbp
ret
Most likely because neither of your functions is declared static, which means the symbols must be visible to the linker in case any other compilation unit needs them at link time. Try compiling with the -fwhole-program flag and see what happens.
It probably depends on the GCC version and the specific build. This is what I get from GCC 3.4.4 on Windows at -O2 and up:
.globl _ll_length_rec
.def _ll_length_rec; .scl 2; .type 32; .endef
_ll_length_rec:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %edx   # edx = list (32-bit cdecl: args arrive on the stack)
movl 12(%ebp), %eax  # eax = size_so_far
jmp L3               # the tail recursion has become a loop
.p2align 4,,7
L6:
movl (%edx), %edx    # list = list->next
incl %eax            # size_so_far++
L3:
testl %edx, %edx     # end of list?
jne L6
popl %ebp
ret