Where the const reduce performance rather than optimizing it?

Where the const reduce performance rather than optimizing it? - c

Please take a look at following code snippet
#define HF_ND_SZ sizeof(struct huffman_node)
#define TSIZE_MAX 256
/*
 * Reduced excerpt from the question; elided code is marked with "...".
 * Works on the function-local static array huffman_node_list2 and returns
 * the address of its element [size - 1].
 * NOTE(review): `table` and `j` are not used in the visible lines; they
 * presumably belong to the elided code.
 * NOTE(review): the indices k + i stay inside the TSIZE_MAX * 3 array only
 * while i < TSIZE_MAX, i.e. size <= TSIZE_MAX + 1; no check is visible here.
 */
struct huffman_node * build_decomp_huffman_tree(uint64_t *table, int size) {
static struct huffman_node huffman_node_list2[TSIZE_MAX * 3];
int i = 0, j = 0;
int k = TSIZE_MAX * 2; // this is the case point 1
//...//
/* Runs size - 1 times; the body is skipped entirely when size <= 1. */
for (i = 0; i < size - 1; i++) {
/* Whole-struct copy: element i + 1 is duplicated into element k + i. */
huffman_node_list2[k + i] = huffman_node_list2[i + 1]; // point 2
/* Element TSIZE_MAX + i gets a .right pointer to the copy made above. */
huffman_node_list2[TSIZE_MAX + i].right = &huffman_node_list2[k+ i];
// ... //
}
return &huffman_node_list2[size - 1];
}
For simplicity, I reduced the code and marked the locations I want to highlight; please do not think too deeply about the algorithm and structure.
What I want to know is: if we define point 1 as const int k = TSIZE_MAX * 2;, does any optimization happen at point 2 or 3, where assignment happens to contiguous data (an array), as in huffman_node_list2[k + i] = huffman_node_list2[i + 1]; ?
(Please bear with me and correct my assumption if it is wrong. I thought that when we declare a const in local or global scope, it is created as an immutable memory allocation. If we use that immutable memory in a math operation such as [k + i] at point 2 or 3 inside a loop, then at run time the program has to load the immutable memory on every iteration of the loop and store the result in a temporary memory location. What happens if that immutable memory is a large chunk? I hope you can grasp my idea. Am I correct?)

const can be slower if the compiler puts it in the read-only .text section, far enough away from the code that accessing it causes a cache miss.
This can happen with global consts, or when the compiler hoists a constant out of a function rather than building it with instructions (a fairly common optimization for structs or arrays). This can reduce code size if multiple functions use the same constant, but it also increases the distance from the code and thus the likelihood of a miss.
Since you aren't using any aggregate types, there should be no difference with a decent optimizing compiler.
There is a good article on how different data gets laid out here

Using Visual C, I compiled both versions of your code: with const int k and without const. The /FA flag produces a listing of the machine code in an .asm file that is readable by (some) humans. No optimization flags were used.
The result is : there's no optimization, no difference. The machine code produced is strictly the same :
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.00.24231.0
TITLE opt_const.c
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC _main
_BSS SEGMENT
?huffman_node_list2#?1??main##9#9 DB 01fd4H DUP (?) ; `main'::`2'::huffman_node_list2
_BSS ENDS
; Function compile flags: /Odtp
; File c:\joël\tests\opt_const.c
_TEXT SEGMENT
_j$ = -16 ; size = 4
_size$ = -12 ; size = 4
_k$ = -8 ; size = 4
_i$ = -4 ; size = 4
_argc$ = 8 ; size = 4
_argv$ = 12 ; size = 4
_main PROC
; 10 : {
push ebp
mov ebp, esp
sub esp, 16 ; 00000010H
push esi
push edi
; 11 : static struct huffman_node huffman_node_list2[TSIZE_MAX * 3];
; 12 : int i = 0, j = 0, size = 17;
mov DWORD PTR _i$[ebp], 0
mov DWORD PTR _j$[ebp], 0
mov DWORD PTR _size$[ebp], 17 ; 00000011H
; 13 : int k = TSIZE_MAX * 2; // this is the case point 1
mov DWORD PTR _k$[ebp], 194 ; 000000c2H
; 14 : //...//
; 15 : for (i = 0; i < size - 1; i++) {
mov DWORD PTR _i$[ebp], 0
jmp SHORT $LN4#main
$LN2#main:
mov eax, DWORD PTR _i$[ebp]
add eax, 1
mov DWORD PTR _i$[ebp], eax
$LN4#main:
mov ecx, DWORD PTR _size$[ebp]
sub ecx, 1
cmp DWORD PTR _i$[ebp], ecx
jge SHORT $LN3#main
; 16 : huffman_node_list2[k + i] = huffman_node_list2[i + 1]; // point 2
mov edx, DWORD PTR _i$[ebp]
add edx, 1
imul esi, edx, 28
add esi, OFFSET ?huffman_node_list2#?1??main##9#9
mov eax, DWORD PTR _k$[ebp]
add eax, DWORD PTR _i$[ebp]
imul edi, eax, 28
add edi, OFFSET ?huffman_node_list2#?1??main##9#9
mov ecx, 7
rep movsd
; 17 : huffman_node_list2[TSIZE_MAX + i].right = &huffman_node_list2[k+ i];
mov ecx, DWORD PTR _k$[ebp]
add ecx, DWORD PTR _i$[ebp]
imul edx, ecx, 28
add edx, OFFSET ?huffman_node_list2#?1??main##9#9
mov eax, DWORD PTR _i$[ebp]
add eax, 97 ; 00000061H
imul ecx, eax, 28
mov DWORD PTR ?huffman_node_list2#?1??main##9#9[ecx], edx
; 18 : // ... //
; 19 : }
jmp SHORT $LN2#main
$LN3#main:
; 20 : return 0;
xor eax, eax
; 21 : }
pop edi
pop esi
mov esp, ebp
pop ebp
ret 0
_main ENDP
_TEXT ENDS
END
EDIT : I did the same test with gcc, -O3 optimization flags.
And... same result: the generated assembler code is again strictly the same with and without the const keyword.
.file "opt_const.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB0:
.section .text.startup,"ax",#progbits
.LHOTB0:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB23:
.cfi_startproc
movl $huffman_node_list2.2488+16384, %eax
.p2align 4,,10
.p2align 3
.L2:
movq -16352(%rax), %rdx
movq %rax, -8192(%rax)
addq $32, %rax
movq %rdx, -32(%rax)
movq -16376(%rax), %rdx
movq %rdx, -24(%rax)
movq -16368(%rax), %rdx
movq %rdx, -16(%rax)
movq -16360(%rax), %rdx
movq %rdx, -8(%rax)
cmpq $huffman_node_list2.2488+17088, %rax
jne .L2
xorl %eax, %eax
ret
.cfi_endproc
.LFE23:
.size main, .-main
.section .text.unlikely
.LCOLDE0:
.section .text.startup
.LHOTE0:
.local huffman_node_list2.2488
.comm huffman_node_list2.2488,24576,32
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.9) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits

const doesn't necessarily create a memory location at all, unless you take its address. They can just disappear into the instruction stream as immediate-mode constants, or be added into addresses at compile or link time.
For example, huffman_node_list2[k + i] = huffman_node_list2[i + 1] is almost certainly compiled as huffman_node_list2[TSIZE_MAX * 2 + i] = huffman_node_list2[i + 1], where not only is TSIZE_MAX * 2 evaluated at compile time but huffman_node_list2+TSIZE_MAX*2 is evaluated at link time.

Related

Efficient access to function-local constant data in PIC code, without going through the GOT

In code such as…
extern const long long *tget(void);
static const signed long long o[2] = { -1, 1 };
long long
tmogrify(long long t)
{
long long l;
const signed long long *op;
long long u = t;
const long long *lp = tget();
loop:
l = *lp;
op = o;
if (l < 0) {
l = -l;
++op;
}
[…]
… GCC, when producing position-independent code, insists on using GOT-relative accesses to the o array, even though that’s not necessary. The generated assembly (relevant parts) is…
.section ".rodata"
.align 8
.type o , #object
.size o , 16
o:
.long -1
.long -1
.long 0
.long 1
.section ".text"
.align 4
.LGETPC0:
retl
add %o7, %l7, %l7
.align 4
.globl tmogrify
.type tmogrify , #function
.proc 05
tmogrify:
!#PROLOGUE# 0
save %sp, -104, %sp
sethi %hi(_GLOBAL_OFFSET_TABLE_-4), %l7
call .LGETPC0
add %l7, %lo(_GLOBAL_OFFSET_TABLE_+4), %l7
call tget, 0
nop
mov %i0, %o2
mov %i1, %o3
.L2:
sethi %hi(o), %g2
ldd [%o0], %o4
or %g2, %lo(o), %g1
cmp %o4, 0
bge .L3
ld [%l7+%g1], %g2 //← here, %l7-relative
subcc %g0, %o5, %o5
add %g2, 8, %g2
subx %g0, %o4, %o4
.L3:
… (SPARC) or…
.section .rodata
.align 8
.type o , #object
.size o , 16
o:
.quad -1
.quad 1
.text
.globl tmogrify
.type tmogrify , #function
tmogrify:
[…]
call .L6
.L6:
pop ebx
add ebx, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_+(.-.L6)
mov esi, DWORD PTR 8[ebp]
mov edi, DWORD PTR 12[ebp]
call tget#PLT
mov DWORD PTR -20[ebp], eax
.L2:
mov eax, DWORD PTR -20[ebp]
mov ecx, DWORD PTR 4[eax]
mov edx, DWORD PTR [eax]
test ecx, ecx
lea eax, o#GOTOFF[ebx] //← here
mov DWORD PTR -16[ebp], eax
jns .L3
neg edx
adc ecx, 0
add eax, 8
neg ecx
mov DWORD PTR -16[ebp], eax
.L3:
… (i386).
How can I get GCC to optimise that to not require going through the GOT without leaving standard C territory (no UB/IB)?
As assembly programmer I’d put it into .text, either just before the function or within that call to get the self offset (EIP/%pc), but this is obviously not possible here. GCC’s __attribute__((__section__("text"))) is also of no help (this can be #ifdef’d and would have been acceptable).
As this code may use multiple 64-bit variables (here it does, but in reality the long long may also be a 32-bit type; this is just for experimenting with structuring code so that codegen DTRT), dropping GOT-relative access would free a register, which is especially worth it on i386, but would also simplify the calculations for sparc, I think.
For the sake of completeness, here’s the entire test module (with nop markers so that the two places I experiment with changing are clearly delineated in the -S output):
extern long long tmogrify(long long);
extern const long long *tget(void);
static const signed long long o[2] = { -1, 1 };
/*
 * Test function from the question. Walks the long long values starting at
 * the pointer returned by tget() and, for each accepted entry, adds o[0]
 * (-1) or o[1] (1) to a running copy of t.
 *
 * An entry is accepted while its magnitude l is non-zero and t >= l; the
 * first entry that fails this test ends the walk. Returns the adjusted copy.
 * The asm volatile("nop") statements emit nop instructions that only serve
 * as markers in the generated assembly.
 */
long long
tmogrify(long long t)
{
long long l;
/* unsigned long ofs;*/
const signed long long *op;
long long u = t;
const long long *lp = tget();
loop:
asm volatile("nop");
l = *lp;
/* Start at o[0] (-1); a negative entry switches to o[1] (1) below. */
op = o;
if (l < 0) {
/* Work with the magnitude of the entry from here on. */
l = -l;
++op;
}
asm volatile("nop");
/* Continue only for a non-zero magnitude that does not exceed t. */
if (l && (t >= l)) {
asm volatile("nop");
u += *op; /*o[ofs];*/
asm volatile("nop");
/* Advance to the next entry and re-run the block from `loop`. */
++lp;
goto loop;
}
return (u);
}

Intel-x86, confused about the allocation of bytes in the esp register

I've been a bit stuck on this question. Given the following C code:
#include <stdio.h>
#define BUF_SIZE 13
/*
 * Fills a local array of BUF_SIZE (13) ints with the value 5 and returns
 * the loop counter, which equals BUF_SIZE once the loop has finished.
 */
int foo(){
int i;
/* 13 ints; with a 4-byte int (as in the listing that follows) this is 52 bytes. */
int B[BUF_SIZE];
for(i = 0; i < BUF_SIZE; i++)
B[i] = 5;
return i;
}
/* Calls foo() once, discards its return value, and exits with status 0. */
int main(){
foo();
return 0;
}
The following Intel-x86 assembly is generated:
1. .file "code.c"
2. .intel_syntax noprefix
3. .text
4. .globl foo
5. .type foo, #function
6. foo:
7. push ebp
8. mov ebp, esp
9. sub esp, 64
10. mov DWORD PTR [ebp-4], 0
11. jmp .L2
12. .L3:
13. mov eax, DWORD PTR [ebp-4]
14. mov DWORD PTR [ebp-56+eax*4], 5
15. add DWORD PTR [ebp-4], 1
16. .L2:
17. cmp DWORD PTR [ebp-4], 12
18. jle .L3
19. mov eax, DWORD PTR [ebp-4]
20. leave
21. ret
22. .size foo, .-foo
23. .globl main
24. .type main, #function
25. main:
26. push ebp
27. mov ebp, esp
28. call foo
29. mov eax, 0
30. pop ebp
31. ret
32. .size main, .-main
33. .ident "GCC: (Debian 6.3.0-18+deb9u1) 6.3.0 20170516"
34. .section .note.GNU-stack,"",#progbits
I'm a bit stuck trying to determine the meaning of line 9 in the assembly. My understanding is that we subtract from the stack register in order to allocate space on the stack for local variables. I know, then, that 52 bytes are being subtracted for the array B, and another 4 bytes for i. But I'm wondering where the other 8 bytes come from? Are those the return values of foo and main? Any help would be appreciated.

The number of bytes subtracted from esp is rounded up to maintain stack alignment. Imagine you only subtracted 57 or so: any function you call would then need to realign the stack pointer before it could store a 4-byte integer. Everyone is saved that hassle if everyone keeps the stack aligned.

GCC vs CLANG pointer to char* optimization

I have this code:
mainP.c:
/*
 * Scans the string v[0] and returns 1 if an 'a' is found at index 1 or
 * later, otherwise 0. The parameter c is unused.
 */
int main(int c, char **v){
char *s = v[0];
/*
 * The condition tests the current character and then advances s, so the
 * body always looks at the character AFTER the one just tested; v[0][0] is
 * never compared against 'a'.
 */
while (*s++ != 0) {
/* The second comparison is redundant: a char equal to 'a' is never 'b'. */
if ((*s == 'a') && (*s != 'b')) {
return 1;
}
}
return 0;
}
which I compile with clang and gcc generating the assembly code to compare the optimizations:
clang-3.9 -S -masm=intel -O3 mainP.c
gcc -S -masm=intel -O3 mainP.c
The compiler version are:
clang version 3.9.1-9 (tags/RELEASE_391/rc2)
Target: x86_64-pc-linux-gnu
gcc (Debian 6.3.0-18) 6.3.0 20170516
The 2 resulting assembly codes are:
gcc assembly code:
main:
.LFB0:
.cfi_startproc
mov rax, QWORD PTR [rsi]
jmp .L2
.p2align 4,,10
.p2align 3
.L4:
cmp BYTE PTR [rax], 97
je .L5
.L2:
add rax, 1
cmp BYTE PTR -1[rax], 0
jne .L4
xor eax, eax
ret
.L5:
mov eax, 1
ret
clang assembly code:
main: # #main
.cfi_startproc
# BB#0:
mov rcx, qword ptr [rsi]
mov dl, byte ptr [rcx]
inc rcx
.p2align 4, 0x90
.LBB0_1: # =>This Inner Loop Header: Depth=1
xor eax, eax
test dl, dl
je .LBB0_3
# BB#2: # in Loop: Header=BB0_1 Depth=1
movzx edx, byte ptr [rcx]
inc rcx
mov eax, 1
cmp dl, 97
jne .LBB0_1
.LBB0_3:
ret
I notice this: in the gcc assembly code, *s is accessed twice in the loop, while *s is accessed only once in the clang assembly code.
is there an explanation for the difference?
Then, after changing the C code a bit (adding a local char variable), I get about the same assembly code with GCC:
/*
 * Variant that keeps the current character in the local `ch`. Returns 1 if
 * an 'a' is found before the terminating 0 of v[0], otherwise 0. The
 * parameter c is unused.
 * Unlike a loop that advances s before the comparison, this version also
 * tests v[0][0].
 */
int main(int c, char **v){
char *s = v[0];
char ch;
ch = *s;
while (ch != 0) {
/* The second comparison is redundant: a char equal to 'a' is never 'b'. */
if ((ch == 'a') && (ch != 'b')) {
return 1;
}
/*
 * Post-increment: reads the character s currently points at, then
 * advances. On the first pass this re-reads v[0][0].
 */
ch = *s++;
}
return 0;
}
Resulting assembly code with GCC:
main:
.LFB0:
.cfi_startproc
mov rax, QWORD PTR [rsi]
movzx edx, BYTE PTR [rax]
test dl, dl
je .L6
add rax, 1
cmp dl, 97
jne .L5
jmp .L8
.p2align 4,,10
.p2align 3
.L4:
cmp dl, 97
je .L8
.L5:
add rax, 1
movzx edx, BYTE PTR -1[rax]
test dl, dl
jne .L4
.L6:
xor eax, eax
ret
.L8:
mov eax, 1
ret

The explanation for the difference: compiler optimizations are hard to do, and different compilers will optimize your code in different ways.
Because the expression is relatively simple, we can assume that *s is the same value in both places and only one load is really needed.
But figuring out that optimization is tricky, because the compiler has to absolutely "know" that "whatever s points to" can't be changed between the first reference and the second.
In your example, clang is better at figuring that out than gcc is.

Does while(1){} do the same as while(1);

I just want to be sure that this C code:
while(flag==true)
{
}
foo();
does the same as this:
while(flag==true);
foo();

; alone is a null statement in C.
In your case, {} or ; are syntactically needed, but they do the same: nothing
Related: Use of null statement in C

In addition to the other answers: It's the same thing.
But I prefer this:
while (condition)
{
}
foo();
over this:
while (condition);
foo();
because if you forget the semicolon after the while, your code will compile fine but it won't do what you expect:
while(condition) // ; forgotten here
foo();
will actually be equivalent of:
while(condition)
{
foo();
}

Yes, having an empty loop body is equivalent to just while(<some condition>);

Yes. A ; following a control structure (e.g., while, for, etc.) that can be followed with a block is treated as if it was followed by an empty block.

Yes, because putting a semicolon after the while statement indicates an empty body; when the condition becomes false, execution goes to the statement immediately after the loop.

Yes, they are same.
You can generate the assembly of the code and see for yourself that they produce the same assembly (using gcc filename.c -S -masm=intel -o outputfilename).
#include<stdio.h>
int foo(void);
/*
 * Reads an integer into `flag`, spins in an empty loop while flag == 1,
 * then calls foo(). There is no explicit return statement.
 */
int main(){
int flag;
/* NOTE(review): scanf's result is not checked; flag is only set on a successful read. */
scanf("%d" , &flag);
/* Empty-bodied loop: nothing inside it changes flag. */
while(flag==1);
foo();
}
/* Returns the square of the local constant 2, i.e. 4. */
int foo(void){
int x = 2;
return x*x;
}
.LC0:
.ascii "%d\0"
.text
.globl main
.def main; .scl 2; .type 32; .endef
.seh_proc main
main:
push rbp
.seh_pushreg rbp
mov rbp, rsp
.seh_setframe rbp, 0
sub rsp, 48
.seh_stackalloc 48
.seh_endprologue
call __main
lea rax, -4[rbp]
mov rdx, rax
lea rcx, .LC0[rip]
call scanf
nop
.L2:
mov eax, DWORD PTR -4[rbp]
cmp eax, 1
je .L2
call foo
mov eax, 0
add rsp, 48
pop rbp
ret
.seh_endproc
.globl foo
.def foo; .scl 2; .type 32; .endef
.seh_proc foo
foo:
push rbp
.seh_pushreg rbp
mov rbp, rsp
.seh_setframe rbp, 0
sub rsp, 16
.seh_stackalloc 16
.seh_endprologue
mov DWORD PTR -4[rbp], 2
mov eax, DWORD PTR -4[rbp]
imul eax, DWORD PTR -4[rbp]
add rsp, 16
pop rbp
ret
.seh_endproc
.ident "GCC: (x86_64-posix-seh-rev1, Built by MinGW-W64 project) 6.3.0"
.def scanf; .scl 2; .type 32; .endef
And when I changed while(flag == 1); to while(flag==1){}, the assembly code generated is:
.LC0:
.ascii "%d\0"
.text
.globl main
.def main; .scl 2; .type 32; .endef
.seh_proc main
main:
push rbp
.seh_pushreg rbp
mov rbp, rsp
.seh_setframe rbp, 0
sub rsp, 48
.seh_stackalloc 48
.seh_endprologue
call __main
lea rax, -4[rbp]
mov rdx, rax
lea rcx, .LC0[rip]
call scanf
nop
.L2:
mov eax, DWORD PTR -4[rbp]
cmp eax, 1
je .L2
call foo
mov eax, 0
add rsp, 48
pop rbp
ret
.seh_endproc
.globl foo
.def foo; .scl 2; .type 32; .endef
.seh_proc foo
foo:
push rbp
.seh_pushreg rbp
mov rbp, rsp
.seh_setframe rbp, 0
sub rsp, 16
.seh_stackalloc 16
.seh_endprologue
mov DWORD PTR -4[rbp], 2
mov eax, DWORD PTR -4[rbp]
imul eax, DWORD PTR -4[rbp]
add rsp, 16
pop rbp
ret
.seh_endproc
.ident "GCC: (x86_64-posix-seh-rev1, Built by MinGW-W64 project) 6.3.0"
.def scanf; .scl 2; .type 32; .endef
You can see that the relevant portion is same in both cases.
//Below Portion is same in both cases.
.L2:
mov eax, DWORD PTR -4[rbp]
cmp eax, 1
je .L2
call foo
mov eax, 0
add rsp, 48
pop rbp
ret
.seh_endproc
.globl foo
.def foo; .scl 2; .type 32; .endef
.seh_proc foo

Understanding these assembly instructions [closed]

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 7 years ago.
Improve this question
I'm learning assembly by writing C programs and viewing the assembly output. I've included the C program at the bottom of the page to make it easier. I'm struggling to understand one line of assembly:
cdqe
movzx eax, BYTE PTR [rbp-32+rax] <--- what is this doing?
movsx eax, al
So I think cdqe extends eax into rax (64 bits). It's clear that the character I want to print fits into the al register, but I don't understand what is happening deep down with rbp-32+rax. Can someone explain it to me?
.file "string_manip.c"
.intel_syntax noprefix
.section .rodata
.LC0:
.string "Hello"
.string ""
.zero 3
.text
.globl main
.type main, #function
main:
.LFB0:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
mov rbp, rsp
.cfi_def_cfa_register 6
sub rsp, 48
mov rax, QWORD PTR fs:40
mov QWORD PTR [rbp-8], rax
xor eax, eax
mov DWORD PTR [rbp-36], 0
mov eax, DWORD PTR .LC0[rip]
mov DWORD PTR [rbp-32], eax
movzx eax, WORD PTR .LC0[rip+4]
mov WORD PTR [rbp-28], ax
movzx eax, BYTE PTR .LC0[rip+6]
mov BYTE PTR [rbp-26], al
mov WORD PTR [rbp-25], 0
mov BYTE PTR [rbp-23], 0
mov DWORD PTR [rbp-36], 0
jmp .L2
.L3:
mov eax, DWORD PTR [rbp-36]
cdqe
movzx eax, BYTE PTR [rbp-32+rax] <--- what is this doing?
movsx eax, al
mov edi, eax
call putchar
add DWORD PTR [rbp-36], 1
.L2:
cmp DWORD PTR [rbp-36], 5
jle .L3
mov edi, 10
call putchar
mov eax, 0
mov rdx, QWORD PTR [rbp-8]
xor rdx, QWORD PTR fs:40
je .L5
call __stack_chk_fail
.L5:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4"
.section .note.GNU-stack,"",#progbits
#include <string.h>
#include <stdio.h>
/*
 * Prints the first six bytes of a local 10-byte array one character at a
 * time, followed by a newline, and returns 0.
 */
int main()
{
int i = 0;
/* 'H','e','l','l','o', the explicit '\0', then zero fill up to 10 bytes. */
char array[10] = "Hello\0";
/* i runs 0..5: the five letters plus one 0 byte are passed to printf. */
for(i=0; i<6; i++)
printf("%c", array[i]);
printf("\n");
return 0;
}

It's just calculating the address of one of the characters.
Presumably your string starts at rbp-32 and then the instruction does the C equivalent of ch = string[rax].
I guess this is unoptimized code, so the compiler does a few extra sign extend and zero extend that are not really needed.

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

Where the const reduce performance rather than optimizing it? - c

Related

Efficient access to function-local constant data in PIC code, without going through the GOT

Intel-x86, confused about the allocation of bytes in the esp register

GCC vs CLANG pointer to char* optimization

Does while(1){} do the same as while(1);

Understanding these assembly instructions [closed]

Categories

Resources