Hi I'm new to the gcc's built-in atomic functions. And I'm using the test-and-set one. You may find the reference here
Here's the question:
I've done this code:
/* Minimal boolean emulation so the snippet builds as plain C. */
#define bool int
#define true 1
#define false 0
/* Calls GCC's __sync_lock_test_and_set builtin once so that the code
   generated for it can be inspected with `gcc -S`. */
int main() {
bool lock = true;
/* Writes `true` into lock; val receives the value lock held before the write. */
bool val = __sync_lock_test_and_set(&lock, true);
return 0;
}
What I intend to do is to check the assembly instruction of __sync_lock_test_and_set. I've used:
gcc -S [filename].c
And the result is:
# gcc -S output for test_and_set.c (32-bit x86, AT&T syntax).
# main() stores 1 in `lock`, exchanges 1 with it, and keeps the old value in `val`.
        .file   "test_and_set.c"
        .text
        .globl  main
        .type   main, @function
main:
.LFB0:
        .cfi_startproc
        pushl   %ebp
        .cfi_def_cfa_offset 8
        .cfi_offset 5, -8
        movl    %esp, %ebp
        .cfi_def_cfa_register 5
        subl    $16, %esp               # room for the two locals
        movl    $1, -8(%ebp)            # lock = true
        movl    $1, %eax                # value to store: true
        xchgl   -8(%ebp), %eax          # the test-and-set: eax = old lock, lock = 1
        movl    %eax, -4(%ebp)          # val = previous value of lock
        movl    $0, %eax                # return 0
        leave
        .cfi_restore 5
        .cfi_def_cfa 4, 4
        ret
        .cfi_endproc
.LFE0:
        .size   main, .-main
        .ident  "GCC: (GNU) 4.8.1"
However, I can't find where the test_and_set instruction is...
As you can see, I'm using gcc-4.8.1, and the environment is MAC OSX 10.10(I'm sure that this gcc is not what Apple provides. I've compiled this by myself)
Thanks!
movl $1, -8(%ebp) # lock = true
movl $1, %eax # true argument
xchgl -8(%ebp), %eax # the test-and-set
It is an atomic exchange, which returns the previous value (that's the test part) and writes 1 into the variable (the set part). This is used to implement mutexes. After the operation the lock will be held by somebody - either the original owner or your code that has just acquired it. That's why it's safe to write a value of 1. The original value is returned, so you can distinguish between those two events. If the original value was 0 then you got the lock and can proceed, otherwise you need to wait because somebody else has it.
Related
Let's say I have the (x << n) | (x >> (-n & 63)) expression.
There is nothing conditional in it.
So, to my understanding, it will be executed in constant time.
Indeed, when I compile the following code using gcc -O3 -S:
#include <stdint.h>
// Rotate the 64-bit value x left by n bit positions (caller guarantees n < 64).
// The right-shift count is (-n mod 64), so n == 0 never produces a shift by 64.
uint64_t rotl64(uint64_t x, uint8_t n) {
    const unsigned left = n;
    const unsigned right = (0u - left) & 63u;
    const uint64_t high_part = x << left;
    const uint64_t wrapped_part = x >> right;
    return high_part | wrapped_part;
}
I get, on linux/amd64, the following output (which executes in constant time):
# gcc -O3 -S output for rotl64() on linux/amd64 (AT&T syntax).
# The whole expression becomes a single rotate; there are no branches.
        .file   "test.c"
        .text
        .p2align 4
        .globl  rotl64
        .type   rotl64, @function
rotl64:
.LFB0:
        .cfi_startproc
        movq    %rdi, %rax              # rax = x (first argument)
        movl    %esi, %ecx              # cl = n (rotate count)
        rolq    %cl, %rax               # rax = x rotated left by cl
        ret
        .cfi_endproc
.LFE0:
        .size   rotl64, .-rotl64
        .ident  "GCC: (Alpine 9.3.0) 9.3.0"
        .section        .note.GNU-stack,"",@progbits
However, on linux/386 I get an output that contains conditional jumps:
# gcc -O3 -S output for rotl64() on linux/386 (AT&T syntax).
# The 64-bit value lives in a register pair, so each 64-bit shift is built from
# a double-precision shift plus a fix-up that is skipped with a conditional jump.
        .file   "test.c"
        .text
        .p2align 4
        .globl  rotl64
        .type   rotl64, @function
rotl64:
.LFB0:
        .cfi_startproc
        pushl   %edi
        .cfi_def_cfa_offset 8
        .cfi_offset 7, -8
        pushl   %esi
        .cfi_def_cfa_offset 12
        .cfi_offset 6, -12
        movl    12(%esp), %eax          # eax = low half of x
        movl    16(%esp), %edx          # edx = high half of x
        movzbl  20(%esp), %ecx          # ecx = n, zero-extended
        movl    %eax, %esi
        movl    %edx, %edi              # edi:esi = copy of x for the left shift
        shldl   %esi, %edi              # high half of (x << n); count is implicitly %cl
        sall    %cl, %esi               # low half of (x << n)
        testb   $32, %cl                # is bit 5 of the count set?
        je      .L4                     # no: the halves are already correct
        movl    %esi, %edi              # yes: low half becomes the high half
        xorl    %esi, %esi              #      and the low half is zero
.L4:
        negl    %ecx
        andl    $63, %ecx               # ecx = -n & 63
        shrdl   %edx, %eax              # low half of (x >> ecx); count is implicitly %cl
        shrl    %cl, %edx               # high half of (x >> ecx)
        testb   $32, %cl                # is bit 5 of the count set?
        je      .L5                     # no: the halves are already correct
        movl    %edx, %eax              # yes: high half becomes the low half
        xorl    %edx, %edx              #      and the high half is zero
.L5:
        orl     %esi, %eax              # low  half of the result
        orl     %edi, %edx              # high half of the result
        popl    %esi
        .cfi_restore 6
        .cfi_def_cfa_offset 8
        popl    %edi
        .cfi_restore 7
        .cfi_def_cfa_offset 4
        ret
        .cfi_endproc
.LFE0:
        .size   rotl64, .-rotl64
        .ident  "GCC: (Alpine 9.3.0) 9.3.0"
        .section        .note.GNU-stack,"",@progbits
From what I understand, here the 64 bits operations have to be emulated, hence the need of conditional jumps.
Does GCC provide a builtin function that indicates if an expression will be compiled with no jumps?
If it isn't the case, how can I know if an expression will be executed in constant time?
Is this a problem for timing sensitive applications like security?
Does GCC provide a builtin function that indicates if an expression will be compiled with no jumps?
No.
If it isn't the case, how can I know if an expression will be executed in constant time?
By looking at the generated assembly code.
Is this a problem for timing sensitive applications like security?
Yes. That's why in these cases don't trust the compilers (and porters/package builders changing compiler settings) and rather implement it in assembly.
There are some constant time functions in general libc's, like in OpenBSD and FreeBSD. Like timingsafe_bcmp and timingsafe_memcmp, which are written in pure C, but their authors trust their packagers not to be like Debian or Ubuntu, who are assumed to break it.
Many other such functions are in the various security libraries themselves, but even then you can safely assume that they are broken. For sure in OpenSSL and libsodium in many cases.
No, such a function does not exist.
And unless you are writing the compiler (you're not) you should not really care about the actual machine code being generated. The compiler is free to optimize that code anyway it sees fit (as long as it is correct) depending on the options you pass in. And with -O3 you should get the fastest code, even with jumps.
If there were a function like you suggested, your code would be tied to a single version of a single compiler with a particular set of optimization options. In other words: bye bye portability.
I am trying to understand how to embed assembly language in C (using gcc on x86_64 architecture). I wrote this program to increment the value of a single variable. But I am getting a garbage value as output. Any ideas why?
#include <stdio.h>
/* Tries to increment x with GCC extended inline assembly and prints the result. */
int main(void) {
int x;
x = 4;
/* %0 is the write-only output operand ("=r") bound to x; x is also passed as a
   separate input operand with the constraint "r0". The template only ever
   touches %0.
   NOTE(review): nothing here forces the input and the output into the same
   register, so incl may operate on a register that never received x. */
asm("incl %0": "=r"(x): "r0"(x));
printf("%d", x);
return 0;
}
Thanks
Update The program is giving expected result on gcc 4.8.3 but not on gcc 4.6.3. I am pasting the assembly output of the non-working code:
# gcc 4.6.3 output for abc.c (x86-64, AT&T syntax): the non-working build.
# The inline-asm input is loaded into %eax, but the incl operates on %edx,
# which was never loaded with x, so the printed value is garbage.
        .file   "abc.c"
        .section        .rodata
.LC0:
        .string "%d"
        .text
        .globl  main
        .type   main, @function
main:
.LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        pushq   %rbx
        subq    $24, %rsp
        movl    $4, -20(%rbp)           # x = 4
        movl    -20(%rbp), %eax         # input operand: x loaded into eax
        incl    %edx                    # asm body: increments edx, not eax
        movl    %edx, %ebx
        .cfi_offset 3, -24
        movl    %ebx, -20(%rbp)         # x = output operand (taken from edx)
        movl    $.LC0, %eax             # address of the "%d" format string
        movl    -20(%rbp), %edx
        movl    %edx, %esi              # second printf argument: x
        movq    %rax, %rdi              # first printf argument: format string
        movl    $0, %eax                # no vector registers used by this variadic call
        call    printf
        movl    $0, %eax                # return 0
        addq    $24, %rsp
        popq    %rbx
        popq    %rbp
        .cfi_def_cfa 7, 8
        ret
        .cfi_endproc
.LFE0:
        .size   main, .-main
        .ident  "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
        .section        .note.GNU-stack,"",@progbits
You don't need to say x twice; once is sufficient:
asm("incl %0": "+r"(x));
The +r says that the value will be input and output.
Your way, with separate inputs and output registers, requires that you take the input from %1, add one, and write the output to %0, but you can't do that with incl.
The reason it works on some compilers is because GCC is free to allocate both %0 and %1 to the same register, and appears to have done so in those cases, but it does not have to. Incidentally, if you want to prevent GCC allocating an input and output to the same register (say, if you want to initialize the output before using the input to calculate a final output), you need to use the & modifier.
The documentation for the modifiers is here.
I am trying to create a small assembly program to create a folder. I looked up the system call for creating a directory on this page. It says that it is identified by 27h. How would I go about implementing the mkdir somename in assembly?
I am aware that the program should move 27 into eax but I am unsure where to go next. I have googled quite a bit and no one seems to have posted anything about this online.
This is my current code (I don't know in which register to put filename and so on):
section .data
section .text
global _start
mov eax, 27
mov ????????
....
int 80h
Thanks
One way of finding out, is using GCC to translate the following C code:
#include <stdio.h>
#include <sys/stat.h>
/* Creates the directory "testdir" with mode 0777 through the libc mkdir()
   wrapper; returns -1 if mkdir() reports failure and 0 otherwise. */
int main()
{
if (mkdir("testdir", 0777) != 0)
{
return -1;
}
return 0;
}
to assembly, with: gcc mkdir.c -S
# gcc -S output for mkdir.c (32-bit x86, AT&T syntax).
# main() passes the two mkdir() arguments on the stack and maps a non-zero
# result to a return value of -1.
        .file   "mkdir.c"
        .section        .rodata
.LC0:
        .string "testdir"
        .text
        .globl  main
        .type   main, @function
main:
.LFB0:
        .cfi_startproc
        pushl   %ebp
        .cfi_def_cfa_offset 8
        .cfi_offset 5, -8
        movl    %esp, %ebp
        .cfi_def_cfa_register 5
        andl    $-16, %esp              # round esp down to a multiple of 16
        subl    $16, %esp               # room for the outgoing arguments
        movl    $511, 4(%esp)           # second argument: mode 0777 (= 511)
        movl    $.LC0, (%esp)           # first argument: address of "testdir"
        call    mkdir                   # interesting call
        testl   %eax, %eax
        setne   %al                     # al = (mkdir(...) != 0)
        testb   %al, %al
        je      .L2                     # result was 0: take the success path
        movl    $-1, %eax               # return -1
        jmp     .L3
.L2:
        movl    $0, %eax                # return 0
.L3:
        leave
        .cfi_restore 5
        .cfi_def_cfa 4, 4
        ret
        .cfi_endproc
.LFE0:
        .size   main, .-main
        .ident  "GCC: (GNU) 4.5.1 20100924 (Red Hat 4.5.1-4)"
        .section        .note.GNU-stack,"",@progbits
Anyway, ProgrammingGroundUp page 272 lists important syscalls, including mkdir:
%eax Name %ebx %ecx %edx Notes
------------------------------------------------------------------
39 mkdir NULL terminated Permission Creates the given
directory name directory. Assumes all
directories leading up
to it already exist.
You could also do like the Assembly Howto is suggesting. But indeed, calling mkdir from Libc is more portable. You need to look into asm/unistd.h to get the syscall number.
C code:
#include <stdio.h>
main() {
int i;
for (i = 0; i < 10; i++) {
printf("%s\n", "hello");
}
}
ASM:
# gcc -S output for simple_loop.c (32-bit x86, AT&T syntax), carrying the
# asker's annotations. Lines ending in "?" are the ones being asked about.
        .file   "simple_loop.c"
        .section        .rodata
.LC0:
        .string "hello"
        .text
        .globl  main
        .type   main, @function
main:
.LFB0:
        .cfi_startproc
        pushl   %ebp                    # push ebp onto stack
        .cfi_def_cfa_offset 8
        .cfi_offset 5, -8
        movl    %esp, %ebp              # setup base pointer or stack ?
        .cfi_def_cfa_register 5
        andl    $-16, %esp              # ?
        subl    $32, %esp               # ?
        movl    $0, 28(%esp)            # i = 0
        jmp     .L2
.L3:
        movl    $.LC0, (%esp)           # point stack pointer to "hello" ?
        call    puts                    # print "hello"
        addl    $1, 28(%esp)            # i++
.L2:
        cmpl    $9, 28(%esp)            # if i <= 9 (i.e. i < 10)
        jle     .L3                     # goto .L3
        leave
        .cfi_restore 5
        .cfi_def_cfa 4, 4
        ret
So I am trying to improve my understanding of x86 assembly code. For the above code, I marked off what I believe I understand. As for the lines marked with a question mark, could someone shed some light? Also, if any of my comments are off, please let me know.
andl $-16, %esp # ?
subl $32, %esp # ?
This reserves some space on the stack. First, the andl instruction rounds the %esp register down to the next lowest multiple of 16 bytes (exercise: find out what the binary value of -16 is to see why). Then, the subl instruction moves the stack pointer down a bit further (32 bytes), reserving some more space (which it will use next). I suspect this rounding is done so that access through the %esp register is slightly more efficient (but you'd have to inspect your processor data sheets to figure out why).
movl $.LC0, (%esp) # point stack pointer to "hello" ?
This places the address of the string "hello" onto the stack (this instruction doesn't change the value of the %esp register itself). Apparently your compiler considers it more efficient to move data onto the stack directly, rather than to use the push instruction.
The return value of a function is usually stored on the stack or in a register. But for a large structure, it has to be on the stack. How much copying has to happen in a real compiler for this code? Or is it optimized away?
For example:
struct Data {
unsigned values[256];
};
// Builds a Data object in a local variable and returns it by value.
// The initialisation itself is elided in this example.
Data createData()
{
Data data;
// initialize data values...
return data;
}
(Assuming the function cannot be inlined..)
None; no copies are done.
The address of the caller's Data return value is actually passed as a hidden argument to the function, and the createData function simply writes into the caller's stack frame.
This is known as the named return value optimisation. Also see the c++ faq on this topic.
commercial-grade C++ compilers implement return-by-value in a way that lets them eliminate the overhead, at least in simple cases
...
When yourCode() calls rbv(), the compiler secretly passes a pointer to the location where rbv() is supposed to construct the "returned" object.
You can demonstrate that this has been done by adding a destructor with a printf to your struct. The destructor should only be called once if this return-by-value optimisation is in operation, otherwise twice.
Also you can check the assembly to see that this happens:
// Same example as before, with one concrete store added so that the write
// to the returned object is visible in the generated assembly.
Data createData()
{
Data data;
// initialize data values...
data.values[5] = 6; // element at index 5, i.e. byte offset 20 with 4-byte unsigned
return data;
}
here's the assembly:
# Compiler output for createData() (32-bit x86, AT&T syntax).
# The object is written through a pointer passed by the caller, not copied.
__Z10createDatav:
LFB2:
pushl %ebp
LCFI0:
movl %esp, %ebp
LCFI1:
subl $1032, %esp # stack space reserved for the local, although it is not used below
LCFI2:
movl 8(%ebp), %eax # eax = first stack argument: hidden pointer to the caller's Data
movl $6, 20(%eax) # store 6 at offset 20 of the caller's object (values[5])
leave
ret $4 # return and pop the 4-byte hidden pointer argument
LFE2:
Curiously, it allocated enough space on the stack for the data item (subl $1032, %esp), but note that it takes the first argument on the stack, 8(%ebp), as the base address of the object, and then stores 6 at offset 20 from that address, i.e. into values[5]. Since we didn't specify any arguments to createData, this is curious until you realise this is the secret hidden pointer to the parent's version of Data.
But for a large structure, it has to be on the ~~heap~~ stack.
Indeed so! A large structure declared as a local variable is allocated on the stack. Glad to have that cleared up.
As for avoiding copying, as others have noted:
Most calling conventions deal with "function returning struct" by passing an additional parameter that points the location in the caller's stack frame in which the struct should be placed. This is definitely a matter for the calling convention and not the language.
With this calling convention, it becomes possible for even a relatively simple compiler to notice when a code path is definitely going to return a struct, and for it to fix assignments to that struct's members so that they go directly into the caller's frame and don't have to be copied. The key is for the compiler to notice that all terminating code paths through the function return the same struct variable. If that's the case, the compiler can safely use the space in the caller's frame, eliminating the need for a copy at the point of return.
There are many examples given, but basically
This question does not have any definite answer. It will depend on the compiler.
C does not specify how large structs are returned from a function.
Here's some tests for one particular compiler, gcc 4.1.2 on x86 RHEL 5.4
gcc trivial case, no copying
[00:05:21 1 ~] $ gcc -O2 -S -c t.c
[00:05:23 1 ~] $ cat t.s
# gcc 4.1.2 -O2 output for the trivial createData() (32-bit x86, AT&T syntax).
# The single store goes straight through the pointer passed by the caller.
        .file   "t.c"
        .text
        .p2align 4,,15
        .globl  createData
        .type   createData, @function
createData:
        pushl   %ebp
        movl    %esp, %ebp
        movl    8(%ebp), %eax           # eax = hidden pointer to the caller's struct
        movl    $1, 24(%eax)            # store 1 at offset 24 of the caller's struct
        popl    %ebp
        ret     $4                      # return and pop the hidden pointer argument
        .size   createData, .-createData
        .ident  "GCC: (GNU) 4.1.2 20080704 (Red Hat 4.1.2-46)"
        .section        .note.GNU-stack,"",@progbits
gcc more realistic case , allocate on stack, memcpy to caller
#include <stdlib.h>
struct Data {
unsigned values[256];
};
/* Fills a local struct Data with 256 values from rand() and returns it by value. */
struct Data createData()
{
struct Data data;
int i;
for(i = 0; i < 256 ; i++)
data.values[i] = rand();
return data;
}
[00:06:08 1 ~] $ gcc -O2 -S -c t.c
[00:06:10 1 ~] $ cat t.s
# gcc 4.1.2 -O2 output for the rand()-filling createData() (32-bit x86, AT&T).
# The struct is built in a local buffer and then memcpy'd to the caller.
        .file   "t.c"
        .text
        .p2align 4,,15
        .globl  createData
        .type   createData, @function
createData:
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx
        movl    $1, %ebx                # ebx = loop counter, runs 1..256
        subl    $1036, %esp
        movl    8(%ebp), %edi           # edi = hidden pointer to the caller's struct
        leal    -1036(%ebp), %esi       # esi = address of the local buffer
        .p2align 4,,7
.L2:
        call    rand
        movl    %eax, -4(%esi,%ebx,4)   # local.values[ebx - 1] = rand()
        addl    $1, %ebx
        cmpl    $257, %ebx
        jne     .L2                     # loop until all 256 elements are written
        movl    %esi, 4(%esp)           # memcpy source: the local buffer
        movl    %edi, (%esp)            # memcpy destination: the caller's struct
        movl    $1024, 8(%esp)          # memcpy length: 256 * 4 bytes
        call    memcpy
        addl    $1036, %esp
        movl    %edi, %eax              # return the hidden pointer
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret     $4                      # return and pop the hidden pointer argument
        .size   createData, .-createData
        .ident  "GCC: (GNU) 4.1.2 20080704 (Red Hat 4.1.2-46)"
        .section        .note.GNU-stack,"",@progbits
gcc 4.4.2 has grown a lot, and does not copy for the above non-trivial case.
# Listing presented as gcc 4.4.2 output for the rand()-filling createData().
# NOTE(review): this text still builds the struct in a local buffer and calls
# memcpy, and its .ident line reads 4.1.2; it appears to be a copy of the
# gcc 4.1.2 listing rather than copy-free 4.4.2 output. Confirm and replace.
        .file   "t.c"
        .text
        .p2align 4,,15
        .globl  createData
        .type   createData, @function
createData:
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx
        movl    $1, %ebx                # ebx = loop counter, runs 1..256
        subl    $1036, %esp
        movl    8(%ebp), %edi           # edi = hidden pointer to the caller's struct
        leal    -1036(%ebp), %esi       # esi = address of the local buffer
        .p2align 4,,7
.L2:
        call    rand
        movl    %eax, -4(%esi,%ebx,4)   # local.values[ebx - 1] = rand()
        addl    $1, %ebx
        cmpl    $257, %ebx
        jne     .L2                     # loop until all 256 elements are written
        movl    %esi, 4(%esp)           # memcpy source: the local buffer
        movl    %edi, (%esp)            # memcpy destination: the caller's struct
        movl    $1024, 8(%esp)          # memcpy length: 256 * 4 bytes
        call    memcpy
        addl    $1036, %esp
        movl    %edi, %eax              # return the hidden pointer
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret     $4                      # return and pop the hidden pointer argument
        .size   createData, .-createData
        .ident  "GCC: (GNU) 4.1.2 20080704 (Red Hat 4.1.2-46)"
        .section        .note.GNU-stack,"",@progbits
In addition, VS2008 (compiled the above as C) will reserve struct Data on the stack of createData() and do a rep movsd loop to copy it back to the caller in Debug mode, in Release mode it will move the return value of rand() (%eax) directly back to the caller
typedef struct {
unsigned value[256];
} Data;
/* Returns a Data by value: a local is filled in by calculate() and then
   returned. calculate() is the routine the pseudocode translation of this
   example refers to; it is not defined in this snippet. */
Data createData(void) {
    Data r;
    calculate(&r);   /* fill the local through its address */
    return r;
}
Data d = createData();
msvc(6,8,9) and gcc mingw(3.4.5,4.4.0) will generate code like the following pseudocode
/* Pseudocode for what the compiler effectively generates: the caller passes
   the address of its own Data, and the callee fills it in place. */
void createData(Data* r) {
    calculate(r);   /* r already is the caller's pointer; pass it through unchanged */
}
Data d;
createData(&d);
gcc on linux will issue a memcpy() to copy the struct back on the stack of the caller. If the function has internal linkage, more optimizations become available though.