Incrementing struct members - C

Say I have a struct defined as follows
struct my_struct
{
int num;
};
....
Here I have a pointer to my_struct and I want to do an increment on num
void foo(struct my_struct* my_ptr)
{
// increment num
// method #1
my_ptr->num++;
// method #2
++(my_ptr->num);
// method #3
my_ptr->++num;
}
Do these 3 ways of incrementing num do the same thing?
While we're at it, is it true that pre-increment is more efficient than post-increment?
Thanks!

The first two will have the same effect (when they're statements on a line of their own like that), but the third isn't valid C code (you can't put the ++ there).
As for efficiency, there is no difference. The difference you may have heard people talking about is when, in C++, you increment a non-pointer data type, such as an iterator. In some cases, pre-increment can be faster there.
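To see why in concrete terms: post-increment must hand back the value from before the increment, which for anything bigger than a machine word means making a copy. Here is a sketch of the two shapes in plain C (the Date type and function names are invented for illustration; with a plain int the compiler erases any difference):
typedef struct { int day, month, year; } Date;

/* post-increment shape: must copy the old value so it can be returned */
Date date_postinc(Date *d)
{
    Date old = *d;   /* the extra copy */
    d->day++;
    return old;
}

/* pre-increment shape: no copy, just modify and hand back the new state */
Date *date_preinc(Date *d)
{
    d->day++;
    return d;
}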
You can see the generated code using GCC Explorer.
void foo(struct my_struct* my_ptr)
{
my_ptr->num++;
}
void bar(struct my_struct* my_ptr)
{
++(my_ptr->num);
}
Output:
foo(my_struct*): # #foo(my_struct*)
incl (%rdi)
ret
bar(my_struct*): # #bar(my_struct*)
incl (%rdi)
ret
As you can see, there's no difference whatsoever.
The only possible difference between the first two is when you use them in expressions:
my_ptr->num = 0;
int x = my_ptr->num++; // x = 0
my_ptr->num = 0;
int y = ++my_ptr->num; // y = 1
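If you want to verify that yourself, here is a minimal complete program wrapping the snippet above:
#include <stdio.h>

struct my_struct { int num; };

int main(void)
{
    struct my_struct s = {0};
    struct my_struct *my_ptr = &s;

    my_ptr->num = 0;
    int x = my_ptr->num++;  /* x = 0, num = 1 */
    my_ptr->num = 0;
    int y = ++my_ptr->num;  /* y = 1, num = 1 */

    printf("x = %d, y = %d\n", x, y);  /* prints x = 0, y = 1 */
    return 0;
}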

If your only intention is to increment the value of num, then the 1st and 2nd methods will yield the same result for the caller.
However, if you change your code to the following, you can see a difference in the code gcc generates (at the assembly level):
#include <stdio.h>
struct my_struct
{
int num;
};
void foo(struct my_struct* my_ptr)
{
printf("\nPost Increment: %d", my_ptr->num++);
}
int main()
{
struct my_struct a;
a.num = 10;
foo(&a);
}
Now compile it using: gcc -masm=intel -S structTest.c -o structTest.s
This asks gcc to generate the assembly code:
Open structTest.s in a text editor.
foo:
.LFB0:
push rbp
mov rbp, rsp
sub rsp, 16
**mov QWORD PTR [rbp-8], rdi**
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
mov edx, eax
**lea ecx, [rax+1]**
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], ecx
mov eax, OFFSET FLAT:.LC0
mov esi, edx
mov rdi, rax
mov eax, 0
call printf
leave
ret
.cfi_endproc
main:
.LFB1:
push rbp
mov rbp, rsp
sub rsp, 16
**mov DWORD PTR [rbp-16], 10
lea rax, [rbp-16]
mov rdi, rax
call foo**
leave
ret
.cfi_endproc
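For reference, the pre-increment variant discussed next differs by a single line in foo (the format string text is my assumption; the struct and main are unchanged):
void foo(struct my_struct* my_ptr)
{
    printf("\nPre Increment: %d", ++my_ptr->num);  /* "Pre Increment" label presumed */
}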
With that change to pre-increment, the following code is generated:
foo:
.LFB0:
.cfi_startproc
push rbp
mov rbp, rsp
sub rsp, 16
**mov QWORD PTR [rbp-8], rdi**
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
**lea edx, [rax+1]**
mov rax, QWORD PTR [rbp-8]
**mov DWORD PTR [rax], edx**
mov rax, QWORD PTR [rbp-8]
**mov edx, DWORD PTR [rax]**
mov eax, OFFSET FLAT:.LC0
mov esi, edx
mov rdi, rax
mov eax, 0
call printf
leave
ret
.cfi_endproc
So you can see that in the second case, the compiler increments num, stores it back, and then re-reads the stored value to pass it to printf().
In terms of performance, I would expect the post-increment to be slightly more efficient here, since the memory locations are touched fewer times. Note that this holds only for this unoptimized output; with optimization enabled, both forms generate identical code, as shown in the other answer.
The important lines have been marked between ** in the above code.

Related

Pass a pointer instead of return new variable in C and Go?

Why is it the convention in C and Go to pass a pointer to a variable and change it, rather than return a new variable with the value?
In C:
#include <stdio.h>
int getValueUsingReturn() {
int value = 42;
return value;
}
void getValueUsingPointer(int* value ) {
*value = 42;
}
int main(void) {
int valueUsingReturn = getValueUsingReturn();
printf("%d\n", valueUsingReturn);
int valueUsingPointer;
getValueUsingPointer(&valueUsingPointer);
printf("%d\n", valueUsingPointer);
return 0;
}
In Go:
package main
import "fmt"
func getValueUsingReturn() int {
value := 42
return value
}
func getValueUsingPointer(value *int) {
*value = 42
}
func main() {
valueUsingReturn := getValueUsingReturn()
fmt.Printf("%d\n", valueUsingReturn)
var valueUsingPointer int
getValueUsingPointer(&valueUsingPointer)
fmt.Printf("%d\n", valueUsingPointer)
}
Are there any performance benefits or restrictions in doing one or the other?
First off, I don't know enough about Go to give a judgement on it, but the answer will apply in the case of C.
If you're just working on primitive types like ints, then I'd say there is no performance difference between the two techniques.
When structs come into play, there is a very slight advantage to modifying a variable via pointer (based purely on what you're doing in your code):
#include <stdio.h>
struct Person {
int age;
const char *name;
const char *address;
const char *occupation;
};
struct Person getReturnedPerson() {
struct Person thePerson = {26, "Chad", "123 Someplace St.", "Software Engineer"};
return thePerson;
}
void changeExistingPerson(struct Person *thePerson) {
thePerson->age = 26;
thePerson->name = "Chad";
thePerson->address = "123 Someplace St.";
thePerson->occupation = "Software Engineer";
}
int main(void) {
struct Person someGuy = getReturnedPerson();
struct Person theSameDude;
changeExistingPerson(&theSameDude);
return 0;
}
GCC x86-64 11.2
With No Optimizations
Returning a struct variable through the function's return value is slower because the variable has to be "built" by assigning the desired values, after which the variable is copied into the return value.
When you're modifying a variable by pointer indirection, there is nothing to do except write the desired values to the memory addresses (based off the pointer you passed in)
.LC0:
.string "Chad"
.LC1:
.string "123 Someplace St."
.LC2:
.string "Software Engineer"
getReturnedPerson:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-40], rdi
mov DWORD PTR [rbp-32], 26
mov QWORD PTR [rbp-24], OFFSET FLAT:.LC0
mov QWORD PTR [rbp-16], OFFSET FLAT:.LC1
mov QWORD PTR [rbp-8], OFFSET FLAT:.LC2
mov rcx, QWORD PTR [rbp-40]
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rbp-24]
mov QWORD PTR [rcx], rax
mov QWORD PTR [rcx+8], rdx
mov rax, QWORD PTR [rbp-16]
mov rdx, QWORD PTR [rbp-8]
mov QWORD PTR [rcx+16], rax
mov QWORD PTR [rcx+24], rdx
mov rax, QWORD PTR [rbp-40]
pop rbp
ret
changeExistingPerson:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-8], rdi
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], 26
mov rax, QWORD PTR [rbp-8]
mov QWORD PTR [rax+8], OFFSET FLAT:.LC0
mov rax, QWORD PTR [rbp-8]
mov QWORD PTR [rax+16], OFFSET FLAT:.LC1
mov rax, QWORD PTR [rbp-8]
mov QWORD PTR [rax+24], OFFSET FLAT:.LC2
nop
pop rbp
ret
main:
push rbp
mov rbp, rsp
sub rsp, 64
lea rax, [rbp-32]
mov rdi, rax
mov eax, 0
call getReturnedPerson
lea rax, [rbp-64]
mov rdi, rax
call changeExistingPerson
mov eax, 0
leave
ret
With Slight Optimization
However, most compilers today can figure out what you're trying to do here, and will equalize the performance between the two techniques.
If you want to be absolutely stingy, passing pointers is still slightly faster by a few clock cycles at best.
In returning a variable from the function, you still have to at least set the address of the return value.
mov rax, rdi
But in passing the pointer, not even this is done.
But other than that, the two techniques have no performance difference.
.LC0:
.string "Chad"
.LC1:
.string "123 Someplace St."
.LC2:
.string "Software Engineer"
getReturnedPerson:
mov rax, rdi
mov DWORD PTR [rdi], 26
mov QWORD PTR [rdi+8], OFFSET FLAT:.LC0
mov QWORD PTR [rdi+16], OFFSET FLAT:.LC1
mov QWORD PTR [rdi+24], OFFSET FLAT:.LC2
ret
changeExistingPerson:
mov DWORD PTR [rdi], 26
mov QWORD PTR [rdi+8], OFFSET FLAT:.LC0
mov QWORD PTR [rdi+16], OFFSET FLAT:.LC1
mov QWORD PTR [rdi+24], OFFSET FLAT:.LC2
ret
main:
mov eax, 0
ret
I think the short answer to your question (at least for C; I am not familiar with Go internals) is that C functions pass parameters by value and generally also return by value, so data objects must be copied, and people worried about the performance of all that copying. For large objects, or objects that are complex in depth (containing pointers to other data), it is often more efficient, or simply more logical, for the value being passed to be a pointer, so the function can operate on the data without copying it.
That being said, modern compilers are pretty smart about figuring out such things, for example whether the parameter data will fit in registers, or how to efficiently copy returned structures.
The bottom line for modern C code: do what seems best for your application, or what is clearest to you. Avoid premature optimization if it detracts from readability, at least in the beginning.
Also Compiler Explorer (https://godbolt.org/) is your friend if you want to examine the effect of different styles, especially in light of optimization.
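For example, a local equivalent of what Compiler Explorer shows (the file name person.c is hypothetical, and -O1 is only a guess at the "slight optimization" level used above):
gcc -O0 -S -masm=intel person.c -o person_O0.s   # unoptimized, like the first listings
gcc -O1 -S -masm=intel person.c -o person_O1.s   # slight optimization, like the second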

How can I store abc(x, y), where abc is a function returning a pointer, into an array as per the following code sample?

Function 1.
It is a function that returns a pointer.
char *abc(unsigned int a, unsigned int b)
{
//do something here ...
}
Function 2.
Function 2 leverages function 1.
I am trying to store abc(x, y) into an array; however, I am getting this error: error: assignment to expression with array type.
fun2()
{
unsigned int x, y;
x= 5, y=6;
char *array1;
char array2;
for(i=0; i<3; i++)
{
array2[i] = abc(x, y);
}
}
You can't store the invocation of a function in C, since that would defeat many existing popular optimizations involving register parameter passing. Normally, parameters are assigned their argument values immediately before execution flow is transferred to the call site; compilers may choose registers to hold those values, but those registers are volatile, so if we were to delay the actual call they could be overwritten in the meantime, possibly even by another call to some function which also has its arguments passed in registers. A solution, which I've personally implemented, is to have a helper simulate the call for you, re-assigning the proper registers and putting any further arguments on the stack; in that case you store the argument values in flat memory. But this must be written in assembly, exclusively for this purpose and specific to your target architecture. On the other hand, if your architecture does not use any such optimizations, it could be quite a bit easier, but hand-written assembly would still be required.
In any case, this is not a feature standard C (or even pre-standard C, as far as I know) has ever had.
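If all you actually need is to keep what the three calls return, there is no problem to solve; a minimal sketch (note the array must hold char pointers, since abc returns char *):
char *results[3];                 /* hypothetical array of returned pointers */
for (int i = 0; i < 3; i++)
    results[i] = abc(x, y);       /* stores the returned pointer, not the call itself */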
As for simulating the call itself, here, for example, is an implementation for x86-64 I wrote some time ago (for the MSVC masm assembler):
PUBLIC makeuniquecall
.data
makeuniquecall_jmp_table dq zero_zero, one_zero, two_zero, three_zero ; ordinary
makeuniquecall_jmp_table_one dq zero_one, one_one, two_one, three_one ; single precision
makeuniquecall_jmp_table_two dq zero_two, one_two, two_two, three_two ; double precision
.code
makeuniquecall PROC
;rcx - function pointer
;rdx - raw argument data
;r8 - a byte array specifying each register parameter if it's float and the last qword is the size of the rest
push r12
push r13
push r14
mov r12, rcx
mov r13, rdx
mov r14, r8
; first store the stack vars
mov rax, [r14 + 4] ; retrieve size of stack
sub rsp, rax
mov rdi, rsp
xor rdx, rdx
mov r8, 8
div r8
mov rcx, rax
mov rsi, r13
;add rsi, 32
rep movs qword ptr [rdi], qword ptr [rsi]
xor r10,r10
cycle:
mov rax, r14
add rax, r10
movzx rax, byte ptr [rax]
test rax, rax
jnz jmp_one
lea rax, makeuniquecall_jmp_table
jmp qword ptr[rax + r10 * 8]
jmp_one:
cmp rax, 1
jnz jmp_two
lea rax, makeuniquecall_jmp_table_one
jmp qword ptr[rax + r10 * 8]
jmp_two:
lea rax, makeuniquecall_jmp_table_two
jmp qword ptr[rax + r10 * 8]
zero_zero::
mov rcx, qword ptr[r13+r10*8]
jmp continue
one_zero::
mov rdx, qword ptr[r13+r10*8]
jmp continue
two_zero::
mov r8, qword ptr[r13+r10*8]
jmp continue
three_zero::
mov r9, qword ptr[r13+r10*8]
jmp continue
zero_one::
movss xmm0, dword ptr[r13+r10*8]
jmp continue
one_one::
movss xmm1, dword ptr[r13+r10*8]
jmp continue
two_one::
movss xmm2, dword ptr[r13+r10*8]
jmp continue
three_one::
movss xmm3, dword ptr[r13+r10*8]
jmp continue
zero_two::
movsd xmm0, qword ptr[r13+r10*8]
jmp continue
one_two::
movsd xmm1, qword ptr[r13+r10*8]
jmp continue
two_two::
movsd xmm2, qword ptr[r13+r10*8]
jmp continue
three_two::
movsd xmm3, qword ptr[r13+r10*8]
continue:
inc r10
cmp r10, 4
jb cycle
mov r14, [r14 + 4] ; retrieve size of stack
call r12
add rsp, r14
pop r14
pop r13
pop r12
ret
makeuniquecall ENDP
END
And your code will look something like this:
#include <stdio.h>
#include <string.h>
char* abc(unsigned int a, unsigned int b)
{
printf("a - %d, b - %d\n", a, b);
return "return abc str\n";
}
extern void makeuniquecall();
int main()
{
unsigned int x, y;
x = 5, y = 6;
#pragma pack(push, 4)
struct {
struct { char maskargs[4]; unsigned long long szargs; } invok;
char *(*pfunc)();
unsigned long long args[2], shadow[2];
} array2[3];
#pragma pack(pop)
for (int i = 0; i < 3; i++)
{
memset(array2[i].invok.maskargs, 0, sizeof array2[i].invok.maskargs); // standard - no floats passed
array2[i].invok.szargs = 8 * 4; //consider shadow space
array2[i].pfunc = abc;
array2[i].args[0] = x;
array2[i].args[1] = y;
}
//now do the calls
for (int i = 0; i < 3; i++)
printf("%s\n", ((char *(*)())makeuniquecall)(array2[i].pfunc, array2[i].args, &array2[i].invok));
}
You probably won't need all that for your specific case; you can get away with simply storing each argument and calling the function directly, i.e. (plus this method won't be x86-64 specific):
//now do the calls
for (int i = 0; i < 3; i++)
printf("%s\n", array2[i].pfunc(array2[i].args[0], array2[i].args[1]));
But my implementation gives you the flexibility to store a different number of arguments for each call.
Note: consider this guide for running the above examples on MSVC (since it requires adding an .asm file for the assembly code).
I love such noob questions, since they make you think about why such a feature doesn't actually exist in the language.

Understanding pointers in assembler from machine's view

Here is a basic program I wrote on the Godbolt compiler explorer, and it's as simple as:
#include<stdio.h>
void main()
{
int a = 10;
int *p = &a;
printf("%d", *p);
}
The result I get after compilation:
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-12], 10
lea rax, [rbp-12]
mov QWORD PTR [rbp-8], rax
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
nop
leave
ret
Question: pushing rbp, making the stack frame by reserving a 16-byte block, how a value is moved from a register to a stack location and vice versa, and how the job of LEA is to figure out the address: I got this part.
Problem:
lea rax, [rbp-12]
mov QWORD PTR [rbp-8], rax
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
LEA: getting the address of rbp-12 into rax,
then storing that value (the address of rbp-12) into rbp-8,
but the next line again says: move into rax the value at rbp-8. This seems redundant. Then the value at the address in rax is moved into eax. I don't understand the amount of work here. Why couldn't I have done
lea rax, [rbp-12]
mov QWORD PTR [rbp-8], rax
mov eax, QWORD PTR [rbp-8]
and be done with it? Because in the original, rbp-12's address is stored into rax, then rax is stored to rbp-8; then rbp-8 is loaded back into rax, and then the value at the address in rax is loaded into eax. Couldn't we have just copied rbp-8 directly into eax? I guess not. But my question is why?
I know there is dereferencing with pointers, and I understand how LEA grabs the address of rbp-12, but in the parts after that I completely lost track of when it went from grabbing addresses to grabbing values from addresses. And after that, I didn't understand any of the asm lines.
You're seeing very un-optimized code. Here's my line-by-line interpretation:
.LC0:
.string "%d" ; Format string for printf
main:
push rbp ; Save original base pointer
mov rbp, rsp ; Set base pointer to beginning of stack frame
sub rsp, 16 ; Allocate space for stack frame
mov DWORD PTR [rbp-12], 10 ; Initialize variable 'a'
lea rax, [rbp-12] ; Load effective address of 'a'
mov QWORD PTR [rbp-8], rax ; Store address of 'a' in 'p'
mov rax, QWORD PTR [rbp-8] ; Load 'p' into rax (even though it's already there - heh!)
mov eax, DWORD PTR [rax] ; Load 32-bit value of '*p' into eax
mov esi, eax ; Load value to print into esi
mov edi, OFFSET FLAT:.LC0 ; Load format string address into edi
mov eax, 0 ; Zero out eax (varargs convention: al holds the number of vector registers used; printf gets none here)
call printf ; Make the printf call
nop ; No-op (not sure why)
leave ; Remove the stack frame
ret ; Return
Compilers, when not optimizing, generate code like this as they parse the code you gave them: each statement is translated on its own, with variables written back to their stack slots and re-read from them. That's why p is stored to [rbp-8] and immediately loaded back, and why the dereference takes two loads (one to fetch p, one to fetch *p). Note also that your proposed mov eax, QWORD PTR [rbp-8] would load the pointer itself, not the int it points to (and the operand sizes don't even match). It's doing a lot of unnecessary stuff, but it is quick to generate and makes using a debugger easier.
Compare this with the optimized code (-O2):
.LC0:
.string "%d" ; Format string for printf
main:
mov esi, 10 ; Don't need those variables -- just a 10 to pass to printf!
mov edi, OFFSET FLAT:.LC0 ; Load format string address into edi
xor eax, eax ; Zeroing a register by xor-ing it with itself is smaller and at least as fast as loading an immediate 0
jmp printf ; Just jmp to printf -- it will handle the return
The optimizer found that the variables weren't necessary, so no stack frame is created. Nothing is left but the printf call! And that's done as a jmp since nothing else need be done here when the printf is complete.

assembly output + questions about stacks

I was studying one of my courses when I ran into a specific exercise that I cannot seem to solve... It is pretty basic, because I am VERY new to assembly. So let's begin.
I have a C function
unsigned int func(int *ptr, unsigned int j) {
unsigned int res = j;
int i = ptr[j+1];
for(; i<8; ++i) {
res >>= 1;
}
return res;
}
I translated it with gcc to assembly
.file "func.c"
.intel_syntax noprefix
.text
.globl func
.type func, #function
func:
.LFB0:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
mov rbp, rsp
.cfi_def_cfa_register 6
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov eax, DWORD PTR [rbp-28]
mov DWORD PTR [rbp-8], eax
mov eax, DWORD PTR [rbp-28]
add eax, 1
mov eax, eax
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov eax, DWORD PTR [rax]
mov DWORD PTR [rbp-4], eax
jmp .L2
.L3:
shr DWORD PTR [rbp-8]
add DWORD PTR [rbp-4], 1
.L2:
cmp DWORD PTR [rbp-4], 7
jle .L3
mov eax, DWORD PTR [rbp-8]
pop rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size func, .-func
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4"
.section .note.GNU-stack,"",#progbits
The question is as follows: what is the instruction that places j (the variable in the C function) on top of the stack?
I sincerely cannot find out, please enlighten me XD.
The variable j is the second parameter of func; it is passed in the register esi in the x86-64 System V ABI calling convention. This instruction, mov DWORD PTR [rbp-28], esi, puts j onto the stack.
You can see it very clearly by writing a simple function that calls "func" and compiling it with -O0 (or with -O2 and marking it as noinline, or only providing a prototype so there's nothing for the compiler to inline).
unsigned int func(int *ptr, unsigned int j) {
unsigned int res = j;
int i = ptr[j+1];
for(; i<8; ++i) {
res >>= 1;
}
return res;
}
int main()
{
int a = 1;
int array[10];
func (array, a);
return 0;
}
Using the Godbolt compiler explorer, we can easily get gcc -O0 -fverbose-asm assembly output.
Please focus on the following instructions:
# in main:
...
mov DWORD PTR [rbp-4], 1
mov edx, DWORD PTR [rbp-4]
...
mov esi, edx
...
func(int*, unsigned int):
...
mov DWORD PTR [rbp-28], esi # j, j
...
j, j is a comment added by gcc -fverbose-asm telling you that the source and destination operands of that instruction are both the C variable j.
The full assembly instructions:
func(int*, unsigned int):
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov eax, DWORD PTR [rbp-28]
mov DWORD PTR [rbp-4], eax
mov eax, DWORD PTR [rbp-28]
add eax, 1
mov eax, eax
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov eax, DWORD PTR [rax]
mov DWORD PTR [rbp-8], eax
jmp .L2
.L3:
shr DWORD PTR [rbp-4]
add DWORD PTR [rbp-8], 1
.L2:
cmp DWORD PTR [rbp-8], 7
jle .L3
mov eax, DWORD PTR [rbp-4]
pop rbp
ret
main:
push rbp
mov rbp, rsp
sub rsp, 48
mov DWORD PTR [rbp-4], 1
mov edx, DWORD PTR [rbp-4]
lea rax, [rbp-48]
mov esi, edx
mov rdi, rax
call func(int*, unsigned int)
mov eax, 0
leave
ret
Taking into account these instructions
mov eax, DWORD PTR [rbp-28]
add eax, 1
it seems that j is stored at address rbp-28, while ptr is stored at address rbp-24.
These are the instructions that store those values on the stack:
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
It seems the arguments are passed to the function using registers rdi and esi.
Compilers can optimize their function calls and use registers instead of the stack to pass small arguments to functions. Within the function, they can then use the stack to temporarily store the arguments that arrived in registers.
Just a suggestion for further exploration on your own: use gcc -O0 -g2 f.c -Wa,-adhln. It will turn off optimizations and generate assembly code intermixed with the source, which might give you a better idea of what it does.
As an alternative, you can use objdump -Sd f.o on the output '.o' file or the executable. Just make sure you add debugging info and turn off optimizations at compilation.

Is it faster in C to use a written jump table or switch statement?

So, I am trying to see if there is any difference between using a jump table of function pointers and a switch statement for performing many small, one-command operations like these.
This is the code-to-assembly link I made.
Here is my actual code as well
enum code {
ADD,
SUB,
MUL,
DIV,
REM
};
typedef struct {
int val;
} Value;
typedef struct {
enum code ins;
int operand;
} Op;
void run(Value* arg, Op* func)
{
switch(func->ins)
{
case ADD: arg->val += func->operand; break;
case SUB: arg->val -= func->operand; break;
case MUL: arg->val *= func->operand; break;
case DIV: arg->val /= func->operand; break;
case REM: arg->val %= func->operand; break;
}
}
My question is: based on the generated assembly in that link (or the code), would there be any difference between the switch statement and making a bunch of small functions for the operations in the cases, putting pointers to those functions in an array, and calling them indexed by the same enum?
Using gcc x86_64 7.1
void add(Value* arg, Op* func)
{
arg->val += func->operand;
}
static void (*jmptable[])(Value*, Op*) = {
&add
};
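A fuller version of that idea, expanded so it compiles (the do_* helper names are invented; it reuses the enum code, Value, and Op definitions above):
static void do_sub(Value* arg, Op* func) { arg->val -= func->operand; }
static void do_mul(Value* arg, Op* func) { arg->val *= func->operand; }
static void do_div(Value* arg, Op* func) { arg->val /= func->operand; }
static void do_rem(Value* arg, Op* func) { arg->val %= func->operand; }

/* indexed by enum code (ADD..REM), reusing add from above */
static void (*table[])(Value*, Op*) = { add, do_sub, do_mul, do_div, do_rem };

void run_table(Value* arg, Op* func)
{
    table[func->ins](arg, func);  /* one indirect call replaces the switch */
}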
Assembly code paste:
run(Value*, Op*):
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-8], rdi
mov QWORD PTR [rbp-16], rsi
mov rax, QWORD PTR [rbp-16]
mov eax, DWORD PTR [rax]
cmp eax, 4
ja .L9
mov eax, eax
mov rax, QWORD PTR .L4[0+rax*8]
jmp rax
.L4:
.quad .L3
.quad .L5
.quad .L6
.quad .L7
.quad .L8
.L3:
mov rax, QWORD PTR [rbp-8]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-16]
mov eax, DWORD PTR [rax+4]
add edx, eax
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
jmp .L2
.L5:
mov rax, QWORD PTR [rbp-8]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-16]
mov eax, DWORD PTR [rax+4]
sub edx, eax
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
jmp .L2
.L6:
mov rax, QWORD PTR [rbp-8]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-16]
mov eax, DWORD PTR [rax+4]
imul edx, eax
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
jmp .L2
.L7:
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
mov rdx, QWORD PTR [rbp-16]
mov esi, DWORD PTR [rdx+4]
cdq
idiv esi
mov edx, eax
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
jmp .L2
.L8:
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
mov rdx, QWORD PTR [rbp-16]
mov ecx, DWORD PTR [rdx+4]
cdq
idiv ecx
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
nop
.L2:
.L9:
nop
pop rbp
ret
A catchall answer to all these questions: you should measure.
Practically though, I'm betting on the switch version. Function calls have overhead (and they can hardly be inlined in this context), which you could eliminate with labels as values, a common compiler extension*; but you should really try all your options and measure, if the performance of this piece of code matters greatly to you.
Otherwise, use whatever's most convenient to you.
*a switch is likely to generate a jump table equivalent to what you could compose from labels as values, but the compiler can choose between different implementations depending on the particular case values and their number
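For reference, a minimal sketch of the labels-as-values version the footnote alludes to (this is the GCC/Clang computed-goto extension, not standard C; the run_goto name and labels are invented, and it reuses the question's types):
void run_goto(Value* arg, Op* func)
{
    /* GCC extension: &&label takes the address of a label */
    static void *dispatch[] = { &&op_add, &&op_sub, &&op_mul, &&op_div, &&op_rem };
    goto *dispatch[func->ins];
op_add: arg->val += func->operand; return;
op_sub: arg->val -= func->operand; return;
op_mul: arg->val *= func->operand; return;
op_div: arg->val /= func->operand; return;
op_rem: arg->val %= func->operand; return;
}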
Can you spot the difference? Trust the compiler (it will do such micro-optimizations much better than you), and do not forget break statements. Care about the algorithm, not about such small details.
https://godbolt.org/g/sPxse2
Looks like, due to branch prediction and bounds checking, using the switch labels as jump points may be up to 20% faster on older systems (newer systems have better branch prediction). Basically, this relies on a compiler extension. You still have the switch, but the switch doesn't fall through to a single dispatcher. Instead, each case has its own dispatcher that jumps directly into the next case. A number of popular VMs do this.
See here for more info and examples: https://www.cipht.net/2017/10/03/are-jump-tables-always-fastest.html
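A rough sketch of that "dispatcher per case" pattern, assuming a VM-style loop over an array of Ops (the run_many name, prog/nops parameters, and labels are invented; it again relies on the computed-goto extension):
void run_many(Value* arg, Op* prog, int nops)
{
    static void *dispatch[] = { &&i_add, &&i_sub, &&i_mul, &&i_div, &&i_rem };
    int pc = 0;
    if (nops <= 0) return;
    goto *dispatch[prog[0].ins];
    /* each handler dispatches the next instruction itself, so every indirect
       jump gets its own branch-prediction slot instead of sharing one */
i_add: arg->val += prog[pc].operand; if (++pc == nops) return; goto *dispatch[prog[pc].ins];
i_sub: arg->val -= prog[pc].operand; if (++pc == nops) return; goto *dispatch[prog[pc].ins];
i_mul: arg->val *= prog[pc].operand; if (++pc == nops) return; goto *dispatch[prog[pc].ins];
i_div: arg->val /= prog[pc].operand; if (++pc == nops) return; goto *dispatch[prog[pc].ins];
i_rem: arg->val %= prog[pc].operand; if (++pc == nops) return; goto *dispatch[prog[pc].ins];
}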
