All is in the title, I have an int** as my function argument and I want to display the int with assembly, so what i've done is:
global bar
section .text:
bar:
mov rdx, [rdi]
mov rdi, [rdx]
mov rdx, [rdi]
mov rdi, [rdx]
call rsi
ret
My C code:
#include <stdio.h>
void bar(int **i, void (*f)()); //assembly function prototype
void foo(int i)
{
printf("%d\n", i);
}
int main(void)
{
int i = 3;
int *ptr = &i;
bar(&ptr, &foo);
return (0);
}
It segfaults and says "invalid address" at foo with lldb, so I think it's because i'm not dereferencing the right way, so I'm stuck because I need to do this for a larger function. Thanks for the help.
Let me trace the assembly code:
global bar
section .text:
bar:
mov rdx, [rdi] // rdi = &ptr, rdx = *&ptr = ptr
mov rdi, [rdx] // rdx = ptr, rdi = *ptr = i
mov rdx, [rdi] // rdi = i, rdx = *i = (invalid)
mov rdi, [rdx]
call rsi
ret
This suggests actually you have to pass int****, not int**, as the first argument because you are doing dereference 4 times.
It will be like this:
#include <stdio.h>
void bar(int ****i, void (*f)()); //assembly function prototype
void foo(int i)
{
printf("%d\n", i);
}
int main(void)
{
int i = 3;
int *ptr = &i;
int **pptr = &ptr;
int ***ppptr = &pptr;
bar(&ppptr, &foo);
return (0);
}
Also stack pointer should be 16-byte aligned on function call in x86-64, so the assembly function bar should be (for example):
global bar
section .text:
bar:
mov rdx, [rdi]
mov rdi, [rdx]
mov rdx, [rdi]
mov rdi, [rdx]
sub rsp, 8 // adjust stack pointer
call rsi
add rsp, 8 // restore stack pointer
ret
The alignment is done before call and 8 byte (return address) is pushed by call, so another 8 byte should be subtracted from the function pointer to retain 16-byte alignment.
If you want to use int** as the first argument, do dereferences (memory accesses) only 2 times.
global bar
section .text:
bar:
mov rdx, [rdi]
mov rdi, [rdx]
sub rsp, 8
call rsi
add rsp, 8
ret
Another thing you may want to do is
Create stack frame
Store the argument on the stack memory for later use
global bar
section .text:
bar:
push rbp // create stack frame
mov rbp, rsp
sub rsp, 16 // create region for local variables (note 16-byte alignment)
mov [rbp-8], rdi // save the argument to the memory
mov rdx, [rdi]
mov rdi, [rdx]
mov rdi, [rbp-8] // restore the argument from the memory
mov rdx, [rdi]
mov rdi, [rdx]
call rsi
leave // destruct stack frame
ret
I think you might want this:
bar:
mov rax, [rdi]
mov edi, [rax]
call rsi
ret
which would match the void bar(int **i, void (*f)()) prototype
Related
I'm trying to get the address of the end of a function, so I use __builtin_return_address but it returns a value higher than the next declared funciton, here main:
uintptr_t foo( void )
{
return (uintptr_t)__builtin_extract_return_addr(__builtin_return_address(0));
}
int main( int argc, char **argv )
{
printf("foo:\t%zu\nmain:\t%zu\n", foo(), (uintptr_t)main);
return (0);
}
output:
foo: 94718524985719
main: 94718524985687
But how is it possible ? How could main start address be lower than foo return address ?
__builtin_extract_return_addr(__builtin_return_address(0)) gives the address at which execution continues when the current function returns. In your snippet, this will be somewhere in main since the call to foo is in main.
32 bytes after the start of main makes sense.
When I compiled your code[1], it produced the following assembly and a difference of 26:
foo:
push rbp
mov rbp, rsp
mov rax, QWORD PTR [rbp+8]
pop rbp
ret
.LC0:
.string "foo:\t%lu\nmain:\t%lu\n"
main:
push rbp --- <--- Address of this
mov rbp, rsp |
push rbx |
sub rsp, 24 | 26 bytes
mov DWORD PTR [rbp-20], edi |
mov QWORD PTR [rbp-32], rsi |
mov ebx, OFFSET FLAT:main |
call foo ---
mov rdx, rbx <--- Address of this
mov rsi, rax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
mov rbx, QWORD PTR [rbp-8]
leave
ret
I corrected the format pattern.
#include <inttypes.h>
printf("foo:\t%" PRIuPTR "\nmain:\t%" PRIuPTR "\n", foo(), (uintptr_t)main);
Why is it convention in C and Go to pass a pointer to a variable and change it rather return a new variable with the value?
In C:
#include <stdio.h>
int getValueUsingReturn() {
int value = 42;
return value;
}
void getValueUsingPointer(int* value ) {
*value = 42;
}
int main(void) {
int valueUsingReturn = getValueUsingReturn();
printf("%d\n", valueUsingReturn);
int valueUsingPointer;
getValueUsingPointer(&valueUsingPointer);
printf("%d\n", valueUsingPointer);
return 0;
}
In Go:
package main
import "fmt"
func getValueUsingReturn() int {
value := 42
return value
}
func getValueUsingPointer(value *int) {
*value = 42
}
func main() {
valueUsingReturn := getValueUsingReturn()
fmt.Printf("%d\n", valueUsingReturn)
var valueUsingPointer int
getValueUsingPointer(&valueUsingPointer)
fmt.Printf("%d\n", valueUsingPointer)
}
It there any performance benefits or restrictions in doing one or the other?
First off, I don't know enough about Go to give a judgement on it, but the answer will apply in the case of C.
If you're just working on primitive types like ints, then I'd say there is no performance difference between the two techniques.
When structs come into play, there is a very slight advantage of modifying a variable via pointer (based purely on what you're doing in your code)
#include <stdio.h>
struct Person {
int age;
const char *name;
const char *address;
const char *occupation;
};
struct Person getReturnedPerson() {
struct Person thePerson = {26, "Chad", "123 Someplace St.", "Software Engineer"};
return thePerson;
}
void changeExistingPerson(struct Person *thePerson) {
thePerson->age = 26;
thePerson->name = "Chad";
thePerson->address = "123 Someplace St.";
thePerson->occupation = "Software Engineer";
}
int main(void) {
struct Person someGuy = getReturnedPerson();
struct Person theSameDude;
changeExistingPerson(&theSameDude);
return 0;
}
GCC x86-64 11.2
With No Optimizations
Returning a struct variable through the function's return is slower because the variable has to be "built" by assigning the desired values, after which, the variable is copied to the return value.
When you're modifying a variable by pointer indirection, there is nothing to do except write the desired values to the memory addresses (based off the pointer you passed in)
.LC0:
.string "Chad"
.LC1:
.string "123 Someplace St."
.LC2:
.string "Software Engineer"
getReturnedPerson:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-40], rdi
mov DWORD PTR [rbp-32], 26
mov QWORD PTR [rbp-24], OFFSET FLAT:.LC0
mov QWORD PTR [rbp-16], OFFSET FLAT:.LC1
mov QWORD PTR [rbp-8], OFFSET FLAT:.LC2
mov rcx, QWORD PTR [rbp-40]
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rbp-24]
mov QWORD PTR [rcx], rax
mov QWORD PTR [rcx+8], rdx
mov rax, QWORD PTR [rbp-16]
mov rdx, QWORD PTR [rbp-8]
mov QWORD PTR [rcx+16], rax
mov QWORD PTR [rcx+24], rdx
mov rax, QWORD PTR [rbp-40]
pop rbp
ret
changeExistingPerson:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-8], rdi
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], 26
mov rax, QWORD PTR [rbp-8]
mov QWORD PTR [rax+8], OFFSET FLAT:.LC0
mov rax, QWORD PTR [rbp-8]
mov QWORD PTR [rax+16], OFFSET FLAT:.LC1
mov rax, QWORD PTR [rbp-8]
mov QWORD PTR [rax+24], OFFSET FLAT:.LC2
nop
pop rbp
ret
main:
push rbp
mov rbp, rsp
sub rsp, 64
lea rax, [rbp-32]
mov rdi, rax
mov eax, 0
call getReturnedPerson
lea rax, [rbp-64]
mov rdi, rax
call changeExistingPerson
mov eax, 0
leave
ret
With Slight Optimization
However, most compilers today can figure out what you're trying to do here, and will equalize the performance between the two techniques.
If you want to be absolutely stingy, passing pointers is still slightly faster by a few clock cycles at best.
In returning a variable from the function, you still have to at least set the address of the return value.
mov rax, rdi
But in passing the pointer, not even this is done.
But other than that, the two techniques have no performance difference.
.LC0:
.string "Chad"
.LC1:
.string "123 Someplace St."
.LC2:
.string "Software Engineer"
getReturnedPerson:
mov rax, rdi
mov DWORD PTR [rdi], 26
mov QWORD PTR [rdi+8], OFFSET FLAT:.LC0
mov QWORD PTR [rdi+16], OFFSET FLAT:.LC1
mov QWORD PTR [rdi+24], OFFSET FLAT:.LC2
ret
changeExistingPerson:
mov DWORD PTR [rdi], 26
mov QWORD PTR [rdi+8], OFFSET FLAT:.LC0
mov QWORD PTR [rdi+16], OFFSET FLAT:.LC1
mov QWORD PTR [rdi+24], OFFSET FLAT:.LC2
ret
main:
mov eax, 0
ret
I think the short answer to you question (At least for C, I am not familiar with GO internals) is that C functions are pass by value and generally also return by value so data objects must be copied and people worried about the performance of all the copying. For large objects or objects that are complex in their depth (containing pointers to other stuff) it is often just more efficient or logical for the value being copied to be a pointer so the function can "operate" on the data without needing to copy it.
That being said, modern compilers are pretty smart about figuring out stuff like whether the parameter data will fit in registers or efficiently copying returned structures.
Bottom line is for modern C code do what seems best for your application or what is clearest to you. Avoid premature optimization if it detracts from readability at least in the beginning.
Also Compiler Explorer (https://godbolt.org/) is your friend if you want to examine the effect of different styles, especially in light of optimization.
Function 1.
It is a pointer function.
char *abc(unsigned int a, unsigned int b)
{
//do something here ...
}
Function 2
Leveraged the function 1 into function 2.
I am trying to store the abc function into an array, however I am getting the error as : error: assignment to expression with array type.
fun2()
{
unsigned int x, y;
x= 5, y=6;
char *array1;
char array2;
for(i=0; i<3; i++)
{
array2[i] = abc(x, y);
}
}
You can't store the invocation of a function in C since it would defeat many existing popular optimizations involving register parameters passing - see because normally parameters are assigned their argument values immediately before the execution flow is transferred to the calling site - compilers may choose to use the registers to store those values but as it stands those registers are volatile and so if we were to delay the actual call they would be overwritten at said later time - possibly even by another call to some function which also have its arguments passed as registers. A solution - which I've personally implemented - is to have a function simulate the call for you by re-assigning to the proper registers and any further arguments - to the stack. In this case you store the argument values in a flat memory. But this must be done in assembly exclusively for this purpose and specific to your target architecture. On the other hand if your architecture is not using any such optimizations - it could be quite easier but still hand written assembly would be required.
In any case this is not a feature the standard (or even pre standard as far as I know) C has implemented anytime.
For example this is an implementation for x86-64 I've wrote some time ago (for MSVC masm assembler):
PUBLIC makeuniquecall
.data
makeuniquecall_jmp_table dq zero_zero, one_zero, two_zero, three_zero ; ordinary
makeuniquecall_jmp_table_one dq zero_one, one_one, two_one, three_one ; single precision
makeuniquecall_jmp_table_two dq zero_two, one_two, two_two, three_two ; double precision
.code
makeuniquecall PROC
;rcx - function pointer
;rdx - raw argument data
;r8 - a byte array specifying each register parameter if it's float and the last qword is the size of the rest
push r12
push r13
push r14
mov r12, rcx
mov r13, rdx
mov r14, r8
; first store the stack vars
mov rax, [r14 + 4] ; retrieve size of stack
sub rsp, rax
mov rdi, rsp
xor rdx, rdx
mov r8, 8
div r8
mov rcx, rax
mov rsi, r13
;add rsi, 32
rep movs qword ptr [rdi], qword ptr [rsi]
xor r10,r10
cycle:
mov rax, r14
add rax, r10
movzx rax, byte ptr [rax]
test rax, rax
jnz jmp_one
lea rax, makeuniquecall_jmp_table
jmp qword ptr[rax + r10 * 8]
jmp_one:
cmp rax, 1
jnz jmp_two
lea rax, makeuniquecall_jmp_table_one
jmp qword ptr[rax + r10 * 8]
jmp_two:
lea rax, makeuniquecall_jmp_table_two
jmp qword ptr[rax + r10 * 8]
zero_zero::
mov rcx, qword ptr[r13+r10*8]
jmp continue
one_zero::
mov rdx, qword ptr[r13+r10*8]
jmp continue
two_zero::
mov r8, qword ptr[r13+r10*8]
jmp continue
three_zero::
mov r9, qword ptr[r13+r10*8]
jmp continue
zero_one::
movss xmm0, dword ptr[r13+r10*8]
jmp continue
one_one::
movss xmm1, dword ptr[r13+r10*8]
jmp continue
two_one::
movss xmm2, dword ptr[r13+r10*8]
jmp continue
three_one::
movss xmm3, dword ptr[r13+r10*8]
jmp continue
zero_two::
movsd xmm0, qword ptr[r13+r10*8]
jmp continue
one_two::
movsd xmm1, qword ptr[r13+r10*8]
jmp continue
two_two::
movsd xmm2, qword ptr[r13+r10*8]
jmp continue
three_two::
movsd xmm3, qword ptr[r13+r10*8]
continue:
inc r10
cmp r10, 4
jb cycle
mov r14, [r14 + 4] ; retrieve size of stack
call r12
add rsp, r14
pop r14
pop r13
pop r12
ret
makeuniquecall ENDP
END
And your code will look something like this:
#include <stdio.h>
char* abc(unsigned int a, unsigned int b)
{
printf("a - %d, b - %d\n", a, b);
return "return abc str\n";
}
extern makeuniquecall();
main()
{
unsigned int x, y;
x = 5, y = 6;
#pragma pack(4)
struct {
struct { char maskargs[4]; unsigned long long szargs; } invok;
char *(*pfunc)();
unsigned long long args[2], shadow[2];
} array2[3];
#pragma pack(pop)
for (int i = 0; i < 3; i++)
{
memset(array2[i].invok.maskargs, 0, sizeof array2[i].invok.maskargs); // standard - no floats passed
array2[i].invok.szargs = 8 * 4; //consider shadow space
array2[i].pfunc = abc;
array2[i].args[0] = x;
array2[i].args[1] = y;
}
//now do the calls
for (int i = 0; i < 3; i++)
printf("%s\n", ((char *(*)())makeuniquecall)(array2[i].pfunc, array2[i].args, &array2[i].invok));
}
You'll probably not need that for your specific case you will get away with simply storing each argument and calling the function directly - i.e. (plus this method won't be x86-64 specific):
//now do the calls
for (int i = 0; i < 3; i++)
printf("%s\n", array2[i].pfunc(array2[i].args[0], array2[i].args[1]));
But mine implementation gives you the flexibility to store different amount of arguments for each call.
Note consider this guide for running above examples on msvc (since it requires to add asm file for the assembly code).
I love such noob questions since they make you think about why x-y feature doesn't actually exist in the language.
Look at this code. I return an address of the compound literal here.
#include <stdio.h>
#define FOO(bar) ((bar)->a + (bar)->b)
struct bar {
int a;
int b;
};
static struct bar * to_bar(int a, int b);
int main(void)
{
int baz = FOO((struct bar *) {to_bar(1, 2)});
printf("%d\n", baz);
return 0;
}
static struct bar *
to_bar(int a, int b)
{
return &(struct bar) {a, b};
}
Output:
3
ISO/IEC 9899 says:
If the compound literal occurs outside the body of a function, the
object has static storage duration; otherwise, it has automatic
storage duration associated with the enclosing block.
I. e., in the to_bar function the unnamed object, created by the compound literal has automatic storage duration. Thereby, it will be destroyed outside scope of to_bar. It seems, this code produces undefined behaviour (based on the standard). Is it so?
You are right. In your example, you immediately retrieved the fields after returning from to_bar, so you didn't have time to corrupt the stack frame of the deceased to_bar function. But here's another example:
struct bar {
int a;
int b;
};
static struct bar * to_bar(int a, int b);
int main(void)
{
struct bar * corrupted_bar = to_bar(1, 2);
printf("this print will corrupt\n");
int baz = corrupted_bar->a + corrupted_bar->b;
printf("baz = %d\n", baz);
return 0;
}
static struct bar *
to_bar(int a, int b)
{
return &(struct bar) {a, b};
}
which when executed
this print will corrupt
baz = -59543507
If you look at the assembly
.LC0:
.string "this print will corrupt"
.LC1:
.string "baz = %d\n"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov esi, 2
mov edi, 1
call to_bar ; call to_bar
mov QWORD PTR [rbp-8], rax ; save address returned to a local pointer
mov edi, OFFSET FLAT:.LC0 ; argument into puts()
call puts ; call puts(), which creates its own local variables that corrupts the bar struct
mov rax, QWORD PTR [rbp-8]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax+4]
add eax, edx
mov DWORD PTR [rbp-12], eax
mov eax, DWORD PTR [rbp-12]
mov esi, eax
mov edi, OFFSET FLAT:.LC1
mov eax, 0
call printf
mov eax, 0
leave
ret
to_bar:
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
mov DWORD PTR [rbp-24], esi
mov eax, DWORD PTR [rbp-20]
mov DWORD PTR [rbp-8], eax ; field 'a' gets stored, notice dest addr rbp-8 is in the stack frame of this function (local variable)
mov eax, DWORD PTR [rbp-24]
mov DWORD PTR [rbp-4], eax ; field 'b' gets stored, same as above
lea rax, [rbp-8]
pop rbp
ret
Say I have a struct defined as follows
struct my_struct
{
int num;
};
....
Here I have a pointer to my_struct and I want to do an increment on num
void foo(struct my_struct* my_ptr)
{
// increment num
// method #1
my_ptr->num++;
// method #2
++(my_ptr->num);
// method #3
my_ptr->++num;
}
Do these 3 ways of incrementing num do the same thing?
While we're at it, is it true that pre-increment is more efficient than post-increment?
Thanks!
First two will have the same effect (when on a line on their own like that), but the third method isn't valid C code (you can't put the ++ there).
As for efficiency, there is no difference. The difference you may have heard people talking about is when, in C++, you increment a non-pointer data type, such as an iterator. In some cases, pre-increment can be faster there.
You can see the generated code using GCC Explorer.
void foo(struct my_struct* my_ptr)
{
my_ptr->num++;
}
void bar(struct my_struct* my_ptr)
{
++(my_ptr->num);
}
Output:
foo(my_struct*): # #foo(my_struct*)
incl (%rdi)
ret
bar(my_struct*): # #bar(my_struct*)
incl (%rdi)
ret
As you can see, there's no difference whatsoever.
The only possible difference between the first two is when you use them in expressions:
my_ptr->num = 0;
int x = my_ptr->num++; // x = 0
my_ptr->num = 0;
int y = ++my_ptr->num; // y = 1
If your only intention is to increment the value of num then the 1st and 2nd method will yield same intented result to the callee method.
However, if you change your code to the following, you can see the difference between the code generated by gcc (assembly level code):
struct my_struct
{
int num;
};
void foo(struct my_struct* my_ptr)
{
printf("\nPost Increment: %d", my_ptr->num++);
}
int main()
{
struct my_struct a;
a.num = 10;
foo(&a);
}
Now compile it using: gcc -masm=intel -S structTest.c -o structTest.s
This asks gcc to generate the assembly code:
Open structTest.s in a text editor.
foo:
.LFB0:
push rbp
mov rbp, rsp
sub rsp, 16
**mov QWORD PTR [rbp-8], rdi**
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
mov edx, eax
**lea ecx, [rax+1]**
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], ecx
mov eax, OFFSET FLAT:.LC0
mov esi, edx
mov rdi, rax
mov eax, 0
call printf
leave
ret
.cfi_endproc
main:
.LFB1:
push rbp
mov rbp, rsp
sub rsp, 16
**mov DWORD PTR [rbp-16], 10
lea rax, [rbp-16]
mov rdi, rax
call foo**
leave
ret
.cfi_endproc
And when you change the operation to pre-increment, the follwoing code is generated:
foo:
.LFB0:
.cfi_startproc
push rbp
mov rbp, rsp
sub rsp, 16
**mov QWORD PTR [rbp-8], rdi**
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
**lea edx, [rax+1]**
mov rax, QWORD PTR [rbp-8]
**mov DWORD PTR [rax], edx**
mov rax, QWORD PTR [rbp-8]
**mov edx, DWORD PTR [rax]**
mov eax, OFFSET FLAT:.LC0
mov esi, edx
mov rdi, rax
mov eax, 0
call printf
leave
ret
.cfi_endproc
So, you would see that in the second case, the compiler increments the num value and passes on this num value to printf().
In terms of performance, I would expect the post-increment to be more efficient since the memory locations are touched a fewer number of times.
The important lines have been marked between ** in the above code.