compiler over-optimization causing data run time and debugging inconsistency

compiler over-optimization causing data run time and debugging inconsistency - c

I have the following code:
struct cre_eqEntry *
cre_eventGet(struct cre_eqObj *eq_obj)
{
struct cre_eqEntry *eqe = cre_queueTailNode(&eq_obj->q);
Memcpy(&tmpEqo, eq_obj, sizeof(struct cre_eqObj));
volatile u32 ddd = 0;
ddd = ((struct cre_eqEntry *)(eq_obj->q.dma_mem.virtaddr + 4 * eq_obj->q.tail))->evt;
CPUMemFenceReadWrite();
if (!ddd) {
tmp = eq_obj->q.tail;
assert(0);
return NULL;
}
}
It is a piece of kernel code. When I ran it, it fails at assert(0). So apparently ddd should be 0. But when I used GDB to debug the core dump and printed out '((struct cre_eqEntry *)(eq_obj->q.dma_mem.virtaddr + 4 * eq_obj->q.tail))->evt', surprisingly, the value is not 0.
So I start suspecting it is the problem of compiler over-optimization. Here's the disassembly code:
00000000000047ec <cre_eventGet>:
47ec: 55 push %rbp
47ed: 48 89 fe mov %rdi,%rsi
47f0: ba 80 00 00 00 mov $0x80,%edx
47f5: 53 push %rbx
47f6: 48 89 fb mov %rdi,%rbx
47f9: 48 83 ec 18 sub $0x18,%rsp
47fd: 0f b7 6f 24 movzwl 0x24(%rdi),%ebp
4801: 0f b7 47 28 movzwl 0x28(%rdi),%eax
4805: 0f af e8 imul %eax,%ebp
4808: 48 63 ed movslq %ebp,%rbp
480b: 48 03 6f 18 add 0x18(%rdi),%rbp
480f: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi # 4816 <cre_eventGet+0x2a>
4816: e8 00 00 00 00 callq 481b <cre_eventGet+0x2f>
481b: 0f b7 43 28 movzwl 0x28(%rbx),%eax
481f: 48 8b 53 18 mov 0x18(%rbx),%rdx
4823: c7 44 24 0c 00 00 00 movl $0x0,0xc(%rsp)
482a: 00
482b: c1 e0 02 shl $0x2,%eax
482e: 48 98 cltq
4830: 8b 04 02 mov (%rdx,%rax,1),%eax
4833: 89 44 24 0c mov %eax,0xc(%rsp)
4837: 0f ae f0 mfence
483a: 8b 44 24 0c mov 0xc(%rsp),%eax
483e: 85 c0 test %eax,%eax
4840: 74 14 je 4856 <cre_eventGet+0x6a>
As far as I can see, the assembly code does the same thing as the C code.
So now I ran out of ideas what is causing the problem of inconsistency of 'ddd'.
Please kindly give me some hints!

ddd = ((struct cre_eqEntry *)(eq_obj->q.dma_mem.virtaddr + 4 * eq_obj->q.tail))->evt;
Simplify your code. Perform address/boundary checks/validation. Your problem is likely that you are de-referencing some random, uninitialized, address within your process/thread's address space.

ddd = ((struct cre_eqEntry *)(eq_obj->q.dma_mem.virtaddr + 4 * eq_obj->q.tail))->evt; probably violates the strict aliasing rule (can't say 100% for sure without seeing the whole code).
If using gcc/clang, compile with -fno-strict-aliasing unless you want to rewrite your code to comply with the standard.
To do the latter, memcpy((u32 *)&ddd, &(struct cre_eqEntry *)(eq_obj->q.dma_mem.virtaddr + 4 * eq_obj->q.tail)->evt, sizeof ddd); but I guess your codebase may have similar violations in many places, so as a first step, using the compiler flag would be a way to see if this really is the problem.
The magic number 4 is suspicious too, review your code to check if this really is the correct offset and also check that it is not out of bounds of allocated memory.

Related

Why is this code acting different with a single printf? ucontext.h

When I compile my code below it prints
I am running :)
forever(Until I send KeyboardInterrupt signal to the program),
but when I uncomment // printf("done:%d\n", done);, recompile and run it, it will print only two times, prints done: 1 and then returns.
I'm new to ucontext.h and I'm very confused about how this code is working and
why a single printf is changing whole behavior of the code, if you replace printf with done++; it would do the same but if you replace it with done = 2; it does not affect anything and works as we had the printf commented at first place.
Can anyone explain:
Why is this code acting like this and what's the logic behind it?
Sorry for my bad English,
Thanks a lot.
#include <ucontext.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
int main()
{
register int done = 0;
ucontext_t one;
ucontext_t two;
getcontext(&one);
printf("I am running :)\n");
sleep(1);
if (!done)
{
done = 1;
swapcontext(&two, &one);
}
// printf("done:%d\n", done);
return 0;
}

This is a compiler optimization "problem". When the "printf()" is commented, the compiler deduces that "done" will not be used after the "if (!done)", so it does not set it to 1 as it is not worth. But when the "printf()" is present, "done" is used after "if (!done)", so the compiler sets it.
Assembly code with the "printf()":
$ gcc ctx.c -o ctx -g
$ objdump -S ctx
[...]
int main(void)
{
11e9: f3 0f 1e fa endbr64
11ed: 55 push %rbp
11ee: 48 89 e5 mov %rsp,%rbp
11f1: 48 81 ec b0 07 00 00 sub $0x7b0,%rsp
11f8: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
11ff: 00 00
1201: 48 89 45 f8 mov %rax,-0x8(%rbp)
1205: 31 c0 xor %eax,%eax
register int done = 0;
1207: c7 85 5c f8 ff ff 00 movl $0x0,-0x7a4(%rbp) <------- done set to 0
120e: 00 00 00
ucontext_t one;
ucontext_t two;
getcontext(&one);
1211: 48 8d 85 60 f8 ff ff lea -0x7a0(%rbp),%rax
1218: 48 89 c7 mov %rax,%rdi
121b: e8 c0 fe ff ff callq 10e0 <getcontext#plt>
1220: f3 0f 1e fa endbr64
printf("I am running :)\n");
1224: 48 8d 3d d9 0d 00 00 lea 0xdd9(%rip),%rdi # 2004 <_IO_stdin_used+0x4>
122b: e8 70 fe ff ff callq 10a0 <puts#plt>
sleep(1);
1230: bf 01 00 00 00 mov $0x1,%edi
1235: e8 b6 fe ff ff callq 10f0 <sleep#plt>
if (!done)
123a: 83 bd 5c f8 ff ff 00 cmpl $0x0,-0x7a4(%rbp)
1241: 75 27 jne 126a <main+0x81>
{
done = 1;
1243: c7 85 5c f8 ff ff 01 movl $0x1,-0x7a4(%rbp) <----- done set to 1
124a: 00 00 00
swapcontext(&two, &one);
124d: 48 8d 95 60 f8 ff ff lea -0x7a0(%rbp),%rdx
1254: 48 8d 85 30 fc ff ff lea -0x3d0(%rbp),%rax
125b: 48 89 d6 mov %rdx,%rsi
125e: 48 89 c7 mov %rax,%rdi
1261: e8 6a fe ff ff callq 10d0 <swapcontext#plt>
1266: f3 0f 1e fa endbr64
}
printf("done:%d\n", done);
126a: 8b b5 5c f8 ff ff mov -0x7a4(%rbp),%esi
1270: 48 8d 3d 9d 0d 00 00 lea 0xd9d(%rip),%rdi # 2014 <_IO_stdin_used+0x14>
1277: b8 00 00 00 00 mov $0x0,%eax
127c: e8 3f fe ff ff callq 10c0 <printf#plt>
return 0;
Assembly code without the "printf()":
$ gcc ctx.c -o ctx -g
$ objdump -S ctx
[...]
int main(void)
{
11c9: f3 0f 1e fa endbr64
11cd: 55 push %rbp
11ce: 48 89 e5 mov %rsp,%rbp
11d1: 48 81 ec b0 07 00 00 sub $0x7b0,%rsp
11d8: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
11df: 00 00
11e1: 48 89 45 f8 mov %rax,-0x8(%rbp)
11e5: 31 c0 xor %eax,%eax
register int done = 0;
11e7: c7 85 5c f8 ff ff 00 movl $0x0,-0x7a4(%rbp) <------ done set to 0
11ee: 00 00 00
ucontext_t one;
ucontext_t two;
getcontext(&one);
11f1: 48 8d 85 60 f8 ff ff lea -0x7a0(%rbp),%rax
11f8: 48 89 c7 mov %rax,%rdi
11fb: e8 c0 fe ff ff callq 10c0 <getcontext#plt>
1200: f3 0f 1e fa endbr64
printf("I am running :)\n");
1204: 48 8d 3d f9 0d 00 00 lea 0xdf9(%rip),%rdi # 2004 <_IO_stdin_used+0x4>
120b: e8 80 fe ff ff callq 1090 <puts#plt>
sleep(1);
1210: bf 01 00 00 00 mov $0x1,%edi
1215: e8 b6 fe ff ff callq 10d0 <sleep#plt>
if (!done)
121a: 83 bd 5c f8 ff ff 00 cmpl $0x0,-0x7a4(%rbp)
1221: 75 1d jne 1240 <main+0x77>
{
done = 1; <------------- done is no set here (it is optimized by the compiler)
swapcontext(&two, &one);
1223: 48 8d 95 60 f8 ff ff lea -0x7a0(%rbp),%rdx
122a: 48 8d 85 30 fc ff ff lea -0x3d0(%rbp),%rax
1231: 48 89 d6 mov %rdx,%rsi
1234: 48 89 c7 mov %rax,%rdi
1237: e8 74 fe ff ff callq 10b0 <swapcontext#plt>
123c: f3 0f 1e fa endbr64
}
//printf("done:%d\n", done);
return 0;
1240: b8 00 00 00 00 mov $0x0,%eax
}
1245: 48 8b 4d f8 mov -0x8(%rbp),%rcx
1249: 64 48 33 0c 25 28 00 xor %fs:0x28,%rcx
1250: 00 00
1252: 74 05 je 1259 <main+0x90>
1254: e8 47 fe ff ff callq 10a0 <__stack_chk_fail#plt>
1259: c9 leaveq
125a: c3 retq
125b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
To disable the optimization on "done", add the "volatile" keyword in its definition:
volatile register int done = 0;
This makes the program work in both cases.

(There is some overlap with Rachid K's answer as it was posted while I was writing this.)
I am guessing you are declaring done as register in hopes that it will actually be put in a register, so that its value will be saved and restored by the context switch. But the compiler is never obliged to honor this; most modern compilers ignore register declarations completely and make their own decisions about register usage. And in particular, gcc without optimizations will nearly always put local variables in memory on the stack.
As such, in your test case, the value of done is not restored by the context switch. So when getcontext returns for the second time, done has the same value as when swapcontext was called.
When the printf is present, as Rachid also points out, the done = 1 is actually stored before the swapcontext, so on the second return of getcontext, done has the value 1, the if block is skipped, and the program prints done:1 and exits.
However, when the printf is absent, the compiler notices that the value of done is never used after its assignment (since it assumes swapcontext is a normal function and doesn't know that it will actually return somewhere else), so it optimizes out the dead store (yes, even though optimizations are off). Thus we have done == 0 when getcontext returns the second time, and you get an infinite loop. This is maybe what you were expecting if you thought done would be placed in a register, but if so, you got the "right" behavior for the wrong reason.
If you enable optimizations, you'll see something else again: the compiler notices that done can't be affected by the call to getcontext (again assuming it's a normal function call) and therefore it is guaranteed to be 0 at the if. So the test need not be done at all, because it will always be true. The swapcontext is then executed unconditionally, and as for done, it's optimized completely out of existence, because it no longer has any effect on the code. You'll again see an infinite loop.
Because of this issue, you really can't make any safe assumptions about local variables that have been modified in between the getcontext and swapcontext. When getcontext returns for the second time, you might or might not see the changes. There are further issues if the compiler chose to reorder some of your code around the function call (which it knows no reason not to do, since again it thinks these are ordinary function calls that can't see your local variables).
The only way to get any certainty is to declare a variable volatile. Then you can be sure that intermediate changes will be seen, and the compiler will not assume that getcontext can't change it. The value seen at the second return of getcontext will be the same as at the call to swapcontext. If you write volatile int done = 0; you ought to see just two "I am running" messages, regardless of other code or optimization settings.

How to stop icc from eliminating function called from inline assembly

Background
I'm making an app that needs to run several tasks concurrently. I can't use threads and such because the app should work without any OS (i.e. straight from the bootsector). Using x86 tasks looks like an overkill (both logically and performance-wise). Thus, I decided to implement a task-switching utility myself. I would save processor state, make a call to the task code and then restore the previous state. So I have to make the call from inline assembly.
Problem
Here's some example code:
#include <stdio.h>
void func() {
printf("Hello, world!\n");
}
void (*funcptr)();
int main() {
funcptr = func;
asm(
"call *%0;"
:
:"r"(funcptr)
);
return 0;
}
It compiles perfectly under icc with no options, gcc and clang and yields "Hello, world!" when run. However, if I compile it with icc main.c -ipo, it segfaults.
I disassembled the code that was generated by icc main.c and got the following:
0000000000401220 <main>:
401220: 55 push %rbp
401221: 48 89 e5 mov %rsp,%rbp
401224: 48 83 e4 80 and $0xffffffffffffff80,%rsp
401228: 48 81 ec 80 00 00 00 sub $0x80,%rsp
40122f: bf 03 00 00 00 mov $0x3,%edi
401234: 33 f6 xor %esi,%esi
401236: e8 45 00 00 00 callq 401280 <__intel_new_feature_proc_init>
40123b: 0f ae 1c 24 stmxcsr (%rsp)
40123f: 48 c7 05 f6 78 00 00 movq $0x401270,0x78f6(%rip) # 408b40 <funcptr>
401246: 70 12 40 00
40124a: b8 70 12 40 00 mov $0x401270,%eax
40124f: 81 0c 24 40 80 00 00 orl $0x8040,(%rsp)
401256: 0f ae 14 24 ldmxcsr (%rsp)
40125a: ff d0 callq *%rax
40125c: 33 c0 xor %eax,%eax
40125e: 48 89 ec mov %rbp,%rsp
401261: 5d pop %rbp
401262: c3 retq
401263: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
401268: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
40126f: 00
0000000000401270 <func>:
401270: bf 04 40 40 00 mov $0x404004,%edi
401275: e9 e6 fd ff ff jmpq 401060 <puts#plt>
40127a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
On the other hand, icc main.c -ipo yields:
0000000000401210 <main>:
401210: 55 push %rbp
401211: 48 89 e5 mov %rsp,%rbp
401214: 48 83 e4 80 and $0xffffffffffffff80,%rsp
401218: 48 81 ec 80 00 00 00 sub $0x80,%rsp
40121f: bf 03 00 00 00 mov $0x3,%edi
401224: 33 f6 xor %esi,%esi
401226: e8 25 00 00 00 callq 401250 <__intel_new_feature_proc_init>
40122b: 0f ae 1c 24 stmxcsr (%rsp)
40122f: 81 0c 24 40 80 00 00 orl $0x8040,(%rsp)
401236: 48 8b 05 cb 2d 00 00 mov 0x2dcb(%rip),%rax # 404008 <funcptr_2.dp.0>
40123d: 0f ae 14 24 ldmxcsr (%rsp)
401241: ff d0 callq *%rax
401243: 33 c0 xor %eax,%eax
401245: 48 89 ec mov %rbp,%rsp
401248: 5d pop %rbp
401249: c3 retq
40124a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
So, while -ipo didn't remove funcptr variable (see address 401236), it did remove assignment. I guess that icc noticed that func is not called from C code so it can be safely removed, so funcptr is allowed to contain garbage. However, it didn't notice that I'm calling func indirectly via assembly.
What I tried
Replacing "r"(funcptr) with "r"(func) works but I can't hardcode a specific function (see background).
Calling funcptr and/or func before and/or after inline assembly block don't help because icc just inlines printf("Hello, world!\n");.
I can't get rid of inline assembly because I have to do low-level register, flags and stack manipulation before and after call.
Making funcptr volatile yields the following warning but still segfaults:
a value of type "void (*)()" cannot be assigned to an entity of type "volatile void (*)()"
Adding volatile to almost every other word doesn't help either.
Moving func and/or funcptr to other source files and then linking them together doesn't help.
Moving inline assembly to a separate function doesn't work.
Am I doing something wrong or is it an icc bug? If the former, how do I fix the code? If the latter, is there any workaround and should I report the bug?
$ icc --version
icc (ICC) 19.1.0.166 20191121
Copyright (C) 1985-2019 Intel Corporation. All rights reserved.

Understanding array declaration in C

I'm trying to understand how the C Standard explains that the declaration can cause an error. Consider the following pretty simple code:
int main()
{
char test[1024 * 1024 * 1024];
test[0] = 0;
return 0;
}
Demo
This segfaluts. But the following code does not:
int main()
{
char test[1024 * 1024 * 1024];
return 0;
}
Demo
But when I compiled it on my machine the latest one segfaulted too. The main function looks as
00000000000008c6 <main>:
8c6: 55 push %rbp
8c7: 48 89 e5 mov %rsp,%rbp
8ca: 48 81 ec 20 00 00 40 sub $0x40000020,%rsp
8d1: 89 bd ec ff ff bf mov %edi,-0x40000014(%rbp) // <---HERE
8d7: 48 89 b5 e0 ff ff bf mov %rsi,-0x40000020(%rbp)
8de: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
8e5: 00 00
8e7: 48 89 45 f8 mov %rax,-0x8(%rbp)
8eb: 31 c0 xor %eax,%eax
8ed: b8 00 00 00 00 mov $0x0,%eax
8f2: 48 8b 55 f8 mov -0x8(%rbp),%rdx
8f6: 64 48 33 14 25 28 00 xor %fs:0x28,%rdx
8fd: 00 00
8ff: 74 05 je 906 <main+0x40>
901: e8 1a fe ff ff callq 720 <__stack_chk_fail#plt>
906: c9 leaveq
907: c3 retq
908: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
90f: 00
As far as I understood the segfault occurred when trying to mov %edi,-0x40000014(%rbp).
I tried to find the exaplanation in the N1570, Section 6.7.9 Initialization, but it does not seem to be the relevant one.
So how does the Standard explains this behavior?

The result is implementation-dependent
I can think of several reasons of why the behaviour should differ
compiler seeing that variable isn't used, no possible side-effect, and optimizing it away (even without optimization levels)
stack resizing on request. Since there are no writes to this variable yet, why resizing the stack now?
compilers don't have to use the stack for auto memory. Compiler can allocate memory using malloc, and free it on exit. Using heap would allow to allocate 1Gb without issues
stack size set at 1Gb :)

Create a Data Hazard in a C Program

I am working on a problem where I am attempting to create different scenarios in different C programs such as
Data Hazard
Branch Evaluation
Procedure Call
This is in an attempt at learning pipelining and the different hazards that come up.
So I am writing simple C programs and disassembling to assembly language to see if a hazard gets created. But I cannot figure out how to create these hazards. Do yall have any idea how I could do this? Here is some of the simple code I have written.
I compile using.
gcc -g -c programName.c -o programName.o
gcc programName.o -o programName
objdump -d programName.o > programName.asm
Code:
#include <stdio.h>
int main()
{
int i = 0;
int size = 5;
int num[5] = {1,2,3,4,5};
int sum=0;
int average = 0;
for(i = 0; i < size; i++)
{
sum += num[i];
}
average=sum/size;
return 0;
}
...and here is the assembly for that.
average.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <main>:
0: 55 push %rbp
1: 48 89 e5 mov %rsp,%rbp
4: c7 45 f0 00 00 00 00 movl $0x0,0xfffffffffffffff0(%rbp)
b: c7 45 f4 05 00 00 00 movl $0x5,0xfffffffffffffff4(%rbp)
12: c7 45 d0 01 00 00 00 movl $0x1,0xffffffffffffffd0(%rbp)
19: c7 45 d4 02 00 00 00 movl $0x2,0xffffffffffffffd4(%rbp)
20: c7 45 d8 03 00 00 00 movl $0x3,0xffffffffffffffd8(%rbp)
27: c7 45 dc 04 00 00 00 movl $0x4,0xffffffffffffffdc(%rbp)
2e: c7 45 e0 05 00 00 00 movl $0x5,0xffffffffffffffe0(%rbp)
35: c7 45 f8 00 00 00 00 movl $0x0,0xfffffffffffffff8(%rbp)
3c: c7 45 fc 00 00 00 00 movl $0x0,0xfffffffffffffffc(%rbp)
43: c7 45 f0 00 00 00 00 movl $0x0,0xfffffffffffffff0(%rbp)
4a: eb 10 jmp 5c <main+0x5c>
4c: 8b 45 f0 mov 0xfffffffffffffff0(%rbp),%eax
4f: 48 98 cltq
51: 8b 44 85 d0 mov 0xffffffffffffffd0(%rbp,%rax,4),%eax
55: 01 45 f8 add %eax,0xfffffffffffffff8(%rbp)
58: 83 45 f0 01 addl $0x1,0xfffffffffffffff0(%rbp)
5c: 8b 45 f0 mov 0xfffffffffffffff0(%rbp),%eax
5f: 3b 45 f4 cmp 0xfffffffffffffff4(%rbp),%eax
62: 7c e8 jl 4c <main+0x4c>
64: 8b 55 f8 mov 0xfffffffffffffff8(%rbp),%edx
67: 89 d0 mov %edx,%eax
69: c1 fa 1f sar $0x1f,%edx
6c: f7 7d f4 idivl 0xfffffffffffffff4(%rbp)
6f: 89 45 fc mov %eax,0xfffffffffffffffc(%rbp)
72: b8 00 00 00 00 mov $0x0,%eax
77: c9 leaveq
78: c3 retq
Would appreciate any insight or help. Thanks!

Since this is homework, I'm not going to give you a straight answer, but some food for thought to push you in the right direction.
x86 is a terrible ISA to be using to try and comprehend pipelining. A single x86 instruction can hide two or three side-effects, making it difficult to tease out how a given instruction would perform in even the simplest of pipelines. Are you sure you're not provided a RISC ISA to use for this problem?
Put your loop/hazard code into a function and preferably randomize the creation of the array. Make the array much longer. A good compiler will basically figure out the answer otherwise and remove most of the code you wrote! For reasons I don't understand it's putting your variables in memory.
A good compiler will also do things such as loop unrolling in attempt to hide data hazards and get better code scheduling. Learn how to defeat that (or if you can, give the flag the compiler telling it to NOT do those things if messing around with the compiler is allowed).
The keyword "volatile" can be very helpful in telling the compiler to not optimize around/away certain variables (it tells the compiler this value can change at any moment, so don't be clever and optimize code with it and also don't keep the variable inside the register file).
A data hazard means the pipeline will stall waiting on data. Normally instructions get bypassed just in time, so no stalling occurs. Think about which types of instructions may not be able to be bypassed and could cause a stall on a data hazard. This is dependent on the pipeline, so code that stalls for a specific processor may not stall for another. Modern out-of-order Intel processors are excellent at avoiding these stalls and compilers are great at re-scheduling code so they won't occur even on an in-order core.

Is GCC's option -O2 breaking this small program or do I have undefined behavior [duplicate]

This question already has answers here:
Decrementing a pointer out of bounds; incrementing it into bounds [duplicate]
(3 answers)
Why is out-of-bounds pointer arithmetic undefined behaviour?
(7 answers)
Closed 8 years ago.
I found this problem in a very large application, have made an SSCCE from it. I don't know whether the code has undefined behavior or -O2 breaks it.
When compiling it with gcc a.c -o a.exe -O2 -Wall -Wextra -Werror it prints 5.
But it prints 25 when compiling without -O2 (eg -O1) or uncommenting one of the 2 commented lines (prevent inlining).
#include <stdio.h>
#include <stdlib.h>
// __attribute__((noinline))
int f(int* todos, int input) {
int* cur = todos-1; // fixes the ++ at the beginning of the loop
int result = input;
while(1) {
cur++;
int ch = *cur;
// printf("(%i)\n", ch);
switch(ch) {
case 0:;
goto end;
case 1:;
result = result*result;
break;
}
}
end:
return result;
}
int main() {
int todos[] = { 1, 0}; // 1:square, 0:end
int input = 5;
int result = f(todos, input);
printf("=%i\n", result);
printf("end\n");
return 0;
}
Is GCC's option -O2 breaking this small program or do I have undefined behavior somewhere?

int* cur = todos-1;
invokes undefined behavior. todos - 1 is an invalid pointer address.
Emphasis mine:
(C99, 6.5.6p8) "If both the pointer operand and the result point to elements of the same array object, or one past the last element of the array object, the evaluation shall not produce an overflow; otherwise, the behavior is undefined."

In supplement to #ouah's answer, this explains what the compiler is doing.
Generated assembler for reference:
400450: 48 83 ec 18 sub $0x18,%rsp
400454: be 05 00 00 00 mov $0x5,%esi
400459: 48 8d 44 24 fc lea -0x4(%rsp),%rax
40045e: c7 44 24 04 00 00 00 movl $0x0,0x4(%rsp)
400465: 00
400466: 48 83 c0 04 add $0x4,%rax
40046a: 8b 10 mov (%rax),%edx
However if I add a printf in main():
400450: 48 83 ec 18 sub $0x18,%rsp
400454: bf 84 06 40 00 mov $0x400684,%edi
400459: 31 c0 xor %eax,%eax
40045b: 48 89 e6 mov %rsp,%rsi
40045e: c7 04 24 01 00 00 00 movl $0x1,(%rsp)
400465: c7 44 24 04 00 00 00 movl $0x0,0x4(%rsp)
40046c: 00
40046d: e8 ae ff ff ff callq 400420 <printf#plt>
400472: 48 8d 44 24 fc lea -0x4(%rsp),%rax
400477: be 05 00 00 00 mov $0x5,%esi
40047c: 48 83 c0 04 add $0x4,%rax
400480: 8b 10 mov (%rax),%edx
Specifically (in the printf version), these two instructions populate the todo array
40045e: c7 04 24 01 00 00 00 movl $0x1,(%rsp)
400465: c7 44 24 04 00 00 00 movl $0x0,0x4(%rsp)
This is conspicuously missing from the non-printf version, which for some reason only assigns the second element:
40045e: c7 44 24 04 00 00 00 movl $0x0,0x4(%rsp)

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight