Here is the explanation of what it means, from 6.7.6.3/7 of the C standard:
If the keyword static also appears within the [ and ] of the array
type derivation, then for each call to the function, the value of the
corresponding actual argument shall provide access to the first
element of an array with at least as many elements as specified by the
size expression.
It is not quite clear what it means. I ran the following example:
main.c
#include "func.h"
int main(void){
char test[4] = "123";
printf("%c\n", test_func(2, test));
}
And 2 different implementations of the test_func:
static version
func.h
char test_func(size_t idx, const char[const static 4]);
func.c
char test_func(size_t idx, const char arr[const static 4]){
return arr[idx];
}
non-static version
func.h
char test_func(size_t idx, const char[const 4]);
func.c
char test_func(size_t idx, const char arr[const 4]){
return arr[idx];
}
I checked the assembly code compiled with gcc 7.4.0 -O3 of the function in both of the cases and it turned out to be completely identical:
Disassembly of the functions
(gdb) disas main
sub rsp,0x18
mov edi,0x2
lea rsi,[rsp+0x4]
mov DWORD PTR [rsp+0x4],0x333231
mov rax,QWORD PTR fs:0x28
mov QWORD PTR [rsp+0x8],rax
xor eax,eax
call 0x740 <test_func>
[...]
(gdb) disas test_func
movzx eax,BYTE PTR [rsi+rdi*1]
ret
Can you give an example where the static keyword gives some benefits (or any differences at all) comparing to non-static counterpart?
Here is an example where static actually makes a difference:
/* Returns a[0] * a[1], or 0 when a[0] is zero.
   As written, a[1] is only read when a[0] is non-zero; the parameter is
   declared as a plain `unsigned a[2]`, i.e. without `static`. */
unsigned foo(unsigned a[2])
{
return a[0] ? a[0] * a[1] : 0;
}
clang (for x86-64, with -O3) compiles this to
foo:
mov eax, dword ptr [rdi]
test eax, eax
je .LBB0_1
imul eax, dword ptr [rdi + 4]
ret
.LBB0_1:
xor eax, eax
ret
But after replacing the function parameter with unsigned a[static 2], the result is simply
foo:
mov eax, dword ptr [rdi + 4]
imul eax, dword ptr [rdi]
ret
The conditional branch is not necessary because a[0] * a[1] evaluates to the correct result whether a[0] is zero or not. But without the static keyword, the compiler cannot assume that a[1] can be accessed, and thus has to check a[0].
Currently only clang does this optimization; ICC and gcc produce the same code in both cases.
This isn't used much by compilers in my experience, but one use is that the compiler can assume that the (array decayed into pointer) parameter is not NULL.
Given this function, both gcc and clang (x86) produce identical machine code at -O3:
int func (int a[2])
{
if(a)
return 1;
return 0;
}
Disassembly:
func:
xor eax, eax
test rdi, rdi
setne al
ret
When changing the parameter to int a[static 2], gcc gives the same output as before, but clang does a better job:
func:
mov eax, 1
ret
Clang realizes that a can never be NULL, so it can skip the check.
Related
Why does using extern inline __attribute__((gnu_inline)) over static inline affect GCC 8.3 code generation so much?
The example code is based on glibc bsearch code (build with -O3):
#include <stddef.h>
/*
 * Binary search for __key among __nmemb elements of __size bytes each,
 * starting at __base.
 *
 * __compar is called as __compar(__key, element); a negative result moves the
 * search to the lower half, a positive result to the upper half, and zero
 * means a match.  The halving only finds the key if the elements are ordered
 * consistently with __compar.
 *
 * Returns a pointer to a matching element, or NULL if none is found
 * (including when __nmemb is 0, since the loop is never entered).
 */
extern inline __attribute__((gnu_inline))
void *bsearch (const void *__key, const void *__base, size_t __nmemb, size_t __size,
int (*__compar)(const void *, const void *))
{
size_t __l, __u, __idx;
const void *__p;
int __comparison;
/* Search window is the half-open index range [__l, __u). */
__l = 0;
__u = __nmemb;
while (__l < __u) {
/* Probe the middle element of the current window. */
__idx = (__l + __u) / 2;
/* Byte arithmetic via const char * to step __idx elements of __size bytes. */
__p = (void *) (((const char *) __base) + (__idx * __size));
__comparison = (*__compar) (__key, __p);
if (__comparison < 0)
/* Key sorts before the probe: discard the probe and everything above it. */
__u = __idx;
else if (__comparison > 0)
/* Key sorts after the probe: discard the probe and everything below it. */
__l = __idx + 1;
else
/* Match: the cast drops the const qualifier to fit the return type. */
return (void *) __p;
}
return NULL;
}
/* Three-way comparison of the two ints pointed to by a and b:
   returns 1 if *a > *b, -1 if *a < *b, and 0 if they are equal. */
static int comp_int(const void *a, const void *b)
{
/* Both arguments are read as const int. */
int l = *(const int *) a, r = *(const int *) b;
if (l > r) return 1;
else if (l < r) return -1;
else return 0;
}
/* Looks up key among the num ints at data by calling bsearch with comp_int
   and an element size of sizeof(int); returns whatever bsearch returns
   (a pointer to a matching element, or NULL). */
int *bsearch_int(int key, const int *data, size_t num)
{
return bsearch(&key, data, num, sizeof(int), &comp_int);
}
The code generated for the bsearch_int function is:
bsearch_int:
test rdx, rdx
je .L6
xor r8d, r8d
.L5:
lea rcx, [rdx+r8]
shr rcx
lea rax, [rsi+rcx*4]
cmp DWORD PTR [rax], edi
jl .L3
jg .L10
ret
.L10:
mov rdx, rcx
.L4:
cmp rdx, r8
ja .L5
.L6:
xor eax, eax
ret
.L3:
lea r8, [rcx+1]
jmp .L4
If I use static inline over extern inline __attribute__((gnu_inline)) I get much larger code:
bsearch_int:
xor r8d, r8d
test rdx, rdx
je .L11
.L2:
lea rcx, [r8+rdx]
shr rcx
lea rax, [rsi+rcx*4]
cmp edi, DWORD PTR [rax]
jg .L7
jl .L17
.L1:
ret
.L17:
cmp r8, rcx
jnb .L11
lea rdx, [r8+rcx]
shr rdx
lea rax, [rsi+rdx*4]
cmp edi, DWORD PTR [rax]
jg .L12
jge .L1
cmp r8, rdx
jnb .L11
.L6:
lea rcx, [r8+rdx]
shr rcx
lea rax, [rsi+rcx*4]
cmp DWORD PTR [rax], edi
jl .L7
jle .L1
mov rdx, rcx
cmp r8, rdx
jb .L6
.L11:
xor eax, eax
.L18:
ret
.L12:
mov rax, rcx
mov rcx, rdx
mov rdx, rax
.L7:
lea r8, [rcx+1]
cmp r8, rdx
jb .L2
xor eax, eax
jmp .L18
What makes GCC generate so much shorter code in the first case?
Notes:
Clang does not seem to be affected by this.
It only compiles because you do not use any optimisations and inlining is not active. Try with -O1 for example and your code will not compile at all.
The code is different because when you use static the compiler does not have to care about the calling conventions, as the function will not be visible to other compilation units.
The answer below was based on revision 2 of the question, whereas revision 3 changed, based on this answer, the meaning of the question, after which much of the answer below can seem a bit out of context. Leaving this answer as it was written, based on edition 2.
From 6.31.1 Common Function Attributes of GCC's manual [emphasis mine]:
gnu_inline
This attribute should be used with a function that is also declared
with the inline keyword. It directs GCC to treat the function as
if it were defined in gnu90 mode even when compiling in C99 or gnu99
mode.
...
And, from Section 6.42 An Inline Function is As Fast As a Macro [emphasis mine]:
When a function is both inline and static, if all calls to the
function are integrated into the caller, and the function's address is
never used, then the function's own assembler code is never
referenced. In this case, GCC does not actually output assembler
code for the function, unless you specify the option
-fkeep-inline-functions.
...
The remainder of this section is specific to GNU C90 inlining.
When an inline function is not static, then the compiler must
assume that there may be calls from other source files; since a global
symbol can be defined only once in any program, the function must not
be defined in the other source files, so the calls therein cannot be
integrated. Therefore, a non-static inline function is always
compiled on its own in the usual fashion.
If you specify both inline and extern in the function
definition, then the definition is used only for inlining. In no
case is the function compiled on its own, not even if you refer to its
address explicitly. Such an address becomes an external reference, as
if you had only declared the function, and had not defined it.
...
The key here is that the gnu_inline attribute will only have an effect on the following two cases, where GNU C90 inlining will apply:
using both extern and inline, and
only using inline.
As expected, we see a large difference in the generated assembly between these two.
When using static and inline, however, the GNU C90 inlining rules do not apply (or rather, do not specifically cover this case), which means the gnu_inline attribute will not matter.
Indeed, these two signatures result in the same assembly:
static inline __attribute__((gnu_inline))
void *bsearch ...
static inline
void *bsearch ...
As extern inline and static inline are using two different inlining approaches (GNU C90 inlining strategy and more modern inlining strategies, respectively) it can be expected that the generated assembly may differ slightly between these two. Nonetheless, both these yield substantially less assembly output than when using only inline (in which case, as cited above, the function is always compiled on its own).
this is my first question, because I couldn't find anything related to this topic.
Recently, while making a class for my C game engine project I've found something interesting:
struct Stack *S1 = new(Stack);
struct Stack *S2 = new(Stack);
S1->bPush(S1, 1, 2); //at this point
bPush is a function pointer in the structure.
So I wondered what operator -> does in that case, and I've discovered:
mov r8b,2 ; a char, written to a low point of register r8
mov dl,1 ; also a char, but to d this time
mov rcx,qword ptr [S1] ; this is the 1st parameter of function
mov rax,qword ptr [S1] ; !Why cannot I use this one?
call qword ptr [rax+1A0h] ; pointer call
so I assume -> writes an object pointer to rcx, and I'd like to use it in functions (methods they shall be). So the question is, how can I do something alike
push rcx
// do other call vars
pop rcx
mov qword ptr [this], rcx
before it starts writing other variables of the function. Something with preprocessor?
It looks like you'd have an easier time (and get asm that's the same or more efficient) if you wrote in C++ so you could use language built-in support for virtual functions, and for running constructors on initialization. Not to mention not having to manually run destructors. You wouldn't need your struct Class hack.
I'd like to implicitly pass *this pointer, because as shown in second asm part it does the same thing twice, yes, it is what I'm looking for, bPush is a part of a struct and it cannot be called from outside, but I have to pass the pointer S1, which it already has.
You get inefficient asm because you disabled optimization.
MSVC -O2 or -Ox doesn't reload the static pointer twice. It does waste a mov instruction copying between registers, but if you want better asm use a better compiler (like gcc or clang).
The oldest MSVC on the Godbolt compiler explorer is CL19.0 from MSVC 2015, which compiles this source
struct Stack {
int stuff[4];
void (*bPush)(struct Stack*, unsigned char value, unsigned char length);
};
struct Stack *const S1 = new(Stack);
int foo(){
S1->bPush(S1, 1, 2);
//S1->bPush(S1, 1, 2);
return 0; // prevent tailcall optimization
}
into this asm (Godbolt)
# MSVC 2015 -O2
int foo(void) PROC ; foo, COMDAT
$LN4:
sub rsp, 40 ; 00000028H
mov rax, QWORD PTR Stack * __ptr64 __ptr64 S1
mov r8b, 2
mov dl, 1
mov rcx, rax ;; copy RAX to the arg-passing register
call QWORD PTR [rax+16]
xor eax, eax
add rsp, 40 ; 00000028H
ret 0
int foo(void) ENDP ; foo
(I compiled in C++ mode so I could write S1 = new(Stack) without having to copy your github code, and write it at global scope with a non-constant initializer.)
Clang7.0 -O3 loads into RCX straight away:
# clang -O3
foo():
sub rsp, 40
mov rcx, qword ptr [rip + S1]
mov dl, 1
mov r8b, 2
call qword ptr [rcx + 16] # uses the arg-passing register
xor eax, eax
add rsp, 40
ret
Strangely, clang only decides to use low-byte registers when targeting the Windows ABI with __attribute__((ms_abi)). It uses mov esi, 1 to avoid false dependencies when targeting its default Linux calling convention, not mov sil, 1.
(Or if you are using optimization, then it's because even older MSVC is even worse. In that case you probably can't do anything in the C source to fix it, although you might try using a struct Stack *p = S1 local variable to hand-hold the compiler into loading it into a register once and reusing it from there.)
If I understand clang assumes that the stack segment for x86 is flat (has 0 base). E.g., when compiling using the following command line:
clang -cc1 -S -mllvm --x86-asm-syntax=intel -o - -triple i986-unknown-unknown -mrelocation-model static xxx.c
xxx.c:
void f()
{
int a = 5;
int *ap = &a;
int b = *ap;
}
the following assembly is produced:
f:
sub ESP, 12
lea EAX, DWORD PTR [ESP + 8]
mov DWORD PTR [ESP + 8], 5
mov DWORD PTR [ESP + 4], EAX
mov EAX, DWORD PTR [ESP + 4]
mov EAX, DWORD PTR [EAX]
mov DWORD PTR [ESP], EAX
add ESP, 12
ret
This may only be correct if the stack is flat, because EAX contains the offset from SS base.
Is it possible to compile the C code for SS with an arbitrary base?
When executing in protected mode, segment registers do not contain offsets you can use. Rather, each segment is independent of the others. As far as I know, no operating system in use today uses an ABI where pointers contain segment information and as you can take the address of any local variable (placed on the stack) this would be necessary. I don't think clang (or gcc for that matter) can compile code for such a model.
I'm compiling this code with -O3 -x c -std=c99 -fno-builtin -nostdlib -ffreestanding
/* User-supplied byte fill: stores val into count consecutive bytes starting
   at dest, one byte per loop iteration, and returns dest.
   A count of 0 writes nothing.  NOTE(review): count is a signed int and a
   negative value is not guarded against by the `while (count--)` test. */
unsigned char *memset(unsigned char *dest, unsigned char val, int count)
{
/* Walk a separate cursor so the original dest can be returned unchanged. */
unsigned char* p = dest;
while (count--)
*p++ = val;
return dest;
}
#include <stdio.h>
int main()
{
unsigned char c[20];
memset(c, 'a', 19);
c[19] = '\0';
printf((const char*) c);
}
and using godbolt to examine what memset gcc is calling in the assembly output.
memset:
test edx, edx
je .L6
sub edx, 1
sub rsp, 8
movzx esi, sil
add rdx, 1
call memset
add rsp, 8
ret
.L6:
mov rax, rdi
ret
main:
sub rsp, 40
movabs rax, 7016996765293437281
mov QWORD PTR [rsp], rax
mov QWORD PTR [rsp+8], rax
mov eax, 24929
mov WORD PTR [rsp+16], ax
mov rdi, rsp
xor eax, eax
mov BYTE PTR [rsp+18], 97
mov BYTE PTR [rsp+19], 0
call printf
add rsp, 40
ret
With the flags I used I'm attempting to eliminate all possibility of it calling a built-in memset and judging from the colorization godbolt uses, it looks like gcc is doing a recursive call at *p++ = val;. So is it doing recursion or calling builtin memset?
As others have indicated, the setting of the array c elements has been inlined. As a result, the memset() you implemented is not even getting called. This is a result of the use of the -O3 compiler option. The compiler is being very aggressive in its optimizations. Furthermore, there is no recursion on the execution path.
However, that does not entirely answer your question. The memset() shown in the disassembled output is indeed NOT the built in version and it is not even being executed.
Incidentally, you do not need to apply the -fno-builtin flag as the -ffreestanding flag automatically implies it. Also, if you enable garbage collection, I am sure you will find that the memset() routine in the disassembled output will vanish.
I am trying to get a sense of how I should use const in C code. At first I didn't really bother using it, but then I saw quite a few examples of const being used throughout. Should I make an effort and go back and religiously make suitable variables const? Or will I just be wasting my time?
I suppose it makes it easier to read which variables that are expected to change, especially in function calls, both for humans and the compiler. Am I missing any other important points?
const is typed, #define macros are not.
const is scoped by C block, #define applies to a file (or more strictly, a compilation unit).
const is most useful with parameter passing. If you see const used on a prototype with pointers, you know it is safe to pass your array or struct because the function will not alter it. No const and it can.
Look at the definition for such as strcpy() and you will see what I mean. Apply "const-ness" to function prototypes at the outset. Retro-fitting const is not so much difficult as "a lot of work" (but OK if you get paid by the hour).
Also consider:
const char *s = "Hello World";
char *s = "Hello World";
which is correct, and why?
How do I best use the const keyword in C?
Use const when you want to make it "read-only". It's that simple :)
Using const is not only a good practice but improves the readability and comprehensibility of the code as well as helps prevent some common errors. Definitely do use const where appropriate.
Apart from producing a compiler error when attempting to modify the constant and passing the constant as a non-const parameter, therefore acting as a compiler guard, it also enables the compiler to perform certain optimisations knowing that the value will not change and therefore it can cache the value and not have to read it fresh from memory, because it won't have changed, and it allows it to be immediately substituted in the code.
C const
const and register are basically the opposite of volatile and using volatile will override the const optimisations at file and block scope and the register optimisations at block-scope. const register and register will produce identical outputs because const does nothing on C at block-scope on gcc C -O0, and is redundant on -O1 and onwards, so only the register optimisations apply at -O0, and are redundant from -O1 onwards.
#include<stdio.h>
int main() {
const int i = 1;
printf("%d", i);
}
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 1
mov eax, DWORD PTR [rbp-4] //load from stack isn't eliminated for block-scope consts on gcc C unlike on gcc C++ and clang C, even though value will be the same
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
leave
ret
In this instance, with -O0, const, volatile and auto all produce the same code, with only register differing c.f.
#include<stdio.h>
/* Baseline for the file-scope comparison: a plain (non-const) int.
   At -O0 its value is loaded from memory, which is the
   `mov eax, DWORD PTR i[rip]` in the listing that follows; the const
   variant is shown separately afterwards. */
int i = 1;
int main() {
printf("%d", i);
}
i:
.long 1
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
mov eax, DWORD PTR i[rip] //load from memory
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
pop rbp
ret
with const int i = 1; instead:
i:
.long 1
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
mov eax, 1 //saves load from memory, now immediate
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
pop rbp
ret
C++ const
#include <iostream>
int main() {
int i = 1;
std::cout << i;
}
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 1 //stores on stack
mov eax, DWORD PTR [rbp-4] //loads the value stored on the stack
mov esi, eax
mov edi, OFFSET FLAT:_ZSt4cout
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
mov eax, 0
leave
ret
#include <iostream>
int main() {
const int i = 1;
std::cout << i;
}
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 1 //stores it on the stack
mov esi, 1 //but saves a load from memory here, unlike on C
//'register' would skip this store on the stack altogether
mov edi, OFFSET FLAT:_ZSt4cout
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
mov eax, 0
leave
ret
#include <iostream>
int i = 1;
int main() {
std::cout << i;
}
i:
.long 1
main:
push rbp
mov rbp, rsp
mov eax, DWORD PTR i[rip] //load from memory
mov esi, eax
mov edi, OFFSET FLAT:_ZSt4cout
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
mov eax, 0
pop rbp
ret
#include <iostream>
const int i = 1;
int main() {
std::cout << i;
}
main:
push rbp
mov rbp, rsp
mov esi, 1 //eliminated load from memory, now immediate
mov edi, OFFSET FLAT:_ZSt4cout
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
mov eax, 0
pop rbp
ret
C++ has the extra restriction of producing a compiler error if a const is not initialised (both at file-scope and block-scope). const also has internal linkage as a default on C++. volatile still overrides const and register but const register combines both optimisations on C++.
Even though all the above code is compiled using the default implicit -O0, when compiled with -Ofast, const surprisingly still isn't redundant on C or C++ on clang or gcc for file-scoped consts. The load from memory isn't optimised out unless const is used, even if the file-scope variable isn't modified in the code. https://godbolt.org/z/PhDdxk.