How to push variable in sdcc inline assembly? - c

I have this code in ThreadCreate():
int tmpPSW = newThID << 3;
__asm
PUSH A
PUSH _tmpPSW
__endasm;
This results in:
?ASlink-Warning-Undefined Global '_tmpPSW' referenced by module 'cooperative'
I don't get why. tmpPSW is clearly defined, but sdcc complains. What am I doing wrong here? Is there other way to push C variable in sdcc inline assembly?
Also this is probably relevant. The .asm file generated contains allocation info:
;------------------------------------------------------------
;Allocation info for local variables in function 'ThreadCreate'
;------------------------------------------------------------
;fp Allocated to registers
;newThMask Allocated to registers r6 r7
;newThID Allocated to registers r5
;startSP Allocated to registers r3 r4
;tmp Allocated to registers
;tmpPSW Allocated to registers
;------------------------------------------------------------
Does this mean I run out of registers? If so how should I mitigate this?
EDIT:
Source of ThreadCreate():
// ThreadID is typedef'ed as char
ThreadID ThreadCreate(FunctionPtr fp) {
if (activeTh == 0b1111)
return -1;
// i.e. get rightmost bit 0 in bitmask
// https://stackoverflow.com/a/42747608/6306190
int newThMask = ~activeTh & (activeTh + 1);
activeTh |= newThMask;
ThreadID newThID = 0;
while (newThMask >>= 1) { newThID++; }
int startSP = (newThID ^ (1UL << 2)) << 4;
int tmp = SP;
SP = startSP;
int tmpPSW = newThID << 3;
__asm
PUSH DPL ;; push _fp (argument passed in as DPTR in SDCC)
PUSH DPH ;; push _fp
MOV A, #0
PUSH A ;; ACC
PUSH A ;; B
PUSH A ;; DPL
PUSH A ;; DPH
PUSH _tmpPSW ;; PSW
__endasm;
savedSP[newThID] = SP;
SP = tmp;
return newThID;
}
Generated assembly of ThreadCreate():
;------------------------------------------------------------
;Allocation info for local variables in function 'ThreadCreate'
;------------------------------------------------------------
;fp Allocated to registers
;newThMask Allocated to registers r6 r7
;newThID Allocated to registers r5
;startSP Allocated to registers r3 r4
;tmp Allocated to registers
;tmpPSW Allocated to registers
;------------------------------------------------------------
; cooperative.c:104: ThreadID ThreadCreate(FunctionPtr fp) {
; -----------------------------------------
; function ThreadCreate
; -----------------------------------------
_ThreadCreate:
; cooperative.c:110: if (activeTh == 0b1111)
mov a,#0x0f
cjne a,_activeTh,00121$
clr a
cjne a,(_activeTh + 1),00121$
sjmp 00122$
00121$:
sjmp 00102$
00122$:
; cooperative.c:111: return -1;
mov dpl,#0xff
ret
00102$:
; cooperative.c:119: int newThMask = ~activeTh & (activeTh + 1);
mov a,_activeTh
cpl a
mov r6,a
mov a,(_activeTh + 1)
cpl a
mov r7,a
mov a,#0x01
add a,_activeTh
mov r4,a
clr a
addc a,(_activeTh + 1)
mov r5,a
mov a,r4
anl ar6,a
mov a,r5
anl ar7,a
; cooperative.c:157: activeTh |= newThMask;
mov a,r6
orl _activeTh,a
mov a,r7
orl (_activeTh + 1),a
; cooperative.c:160: while (newThMask >>= 1) { newThID++; }
mov r5,#0x00
00103$:
mov ar3,r6
mov a,r7
mov c,acc.7
rrc a
xch a,r3
rrc a
xch a,r3
mov r4,a
mov ar6,r3
mov ar7,r4
mov a,r3
orl a,r4
jz 00105$
inc r5
sjmp 00103$
00105$:
; cooperative.c:161: int startSP = (newThID ^ (1UL << 2)) << 4;
mov ar3,r5
mov r4,#0x00
mov r6,#0x00
xrl ar3,#0x04
mov a,r4
swap a
anl a,#0xf0
xch a,r3
swap a
xch a,r3
xrl a,r3
xch a,r3
anl a,#0xf0
xch a,r3
xrl a,r3
mov r4,a
; cooperative.c:163: int tmp = SP;
mov r7,_SP
; cooperative.c:164: SP = startSP;
mov _SP,r3
; cooperative.c:176: __endasm;
PUSH DPL ;; push _fp (argument passed in as DPTR in 390)
PUSH DPH ;; push _fp
MOV A, #0
PUSH A ;; ACC
PUSH A ;; B
PUSH A ;; DPL
PUSH A ;; DPH
PUSH _tmpPSW ;; PSW
; cooperative.c:178: savedSP[newThID] = SP;
mov a,r5
add a,r5
add a,#_savedSP
mov r1,a
mov r4,_SP
mov r6,#0x00
mov #r1,ar4
inc r1
mov #r1,ar6
; cooperative.c:179: SP = tmp;
mov _SP,r7
; cooperative.c:180: return newThID;
mov dpl,r5
; cooperative.c:181: }
ret

The compiler optimized your variable away, because it was never used from the view of the compiler. Compare the generated assembly with the source to see this.
You might try other options. Because I don't have SDCC installed, I can just suggest:
Make the variable volatile.
Make the variable static, since one flavor of PUSH works with an address in the internal RAM.
Because inline assembly is heavily depending on the surrounding code and the compiler, you could also use newThID in the assembly part, and do the shift there.
Note 1: The generated assembly demonstrates how slow machine code gets if you use int without thinking. Limit your variables to the smallest appropriate data type.
Note 2: Don't make the variable global. static does what you want without exposing the variable globally: Place it in RAM so it can be accessed.

Related

Segmentation fault when trying to fprintf after executing machine code with jit

devs! Could you help me? The project's goal is to translate byte code from a fictional architecture, generating an array of real machine code and make it run with jit, but I get a segmentation fault when I try to save a certain part of the output on a file. Part of the code responsible for this:
uint32_t length = sysconf(4096);
void * memory = mmap(0 , length , PROT_NONE , MAP_PRIVATE | MAP_ANONYMOUS , -1 , 0);
//{machine array receives the translated machine code here...}
mprotect ( memory , length , PROT_WRITE ) ;
// copying the machine code array to the memory
memcpy ( memory , ( void *) ( machine ) , sizeof ( machine ) ) ;
mprotect ( memory , length , PROT_EXEC ) ;
uint32_t length = sysconf(4096);
const uint32_t (* jit ) (int32_t*, uint8_t*) = ( uint32_t (*) (int32_t*, uint8_t*) ) ( memory );
// running the machine code to produce de outputs
// &R is the array of registers to store the output and &mem contains the original byte
// code to receive inputs from a instruction that changes the original code
(*jit)((int *)&R, (unsigned char *)&mem);
munmap(memory,length);
// printf/fprintf that causes the segmentation fault if we try to print n and ic[n]
// n = 0; - does not work to print the correct starting value for n
// fflush(stdout); - works to print the correct starting value for n
for(n = 0; n < 16; n++) {
// fprintf(output,"%02x:\n",n);
// fprintf(output,":%d\n",ic[n]);
fprintf(output,"%02x:%d\n",n,ic[n]);
// printf("%02x:%d\n",n,ic[n]);
// fflush(stdout);
}
for (k = 0; k < 16; k++) {
fprintf(output,"R[%d]=0x%08x\n",k,R[k]);
}
The original byte code translated to instructions on this pseudo-assembly code. On this code, the R's represent and array of registers that is passed to the real assembly code R0 is %rdi, R1 = %rdi+0x4,..., R15 = %rdi+0x3C.
Some of those pseudo-instructions translate to one or more actual assembly instructions, and [Rn] represents the memory location which contains the byte code for the original architecture. So when it access [Rn], it uses the current value for the register as the position to get the next 4 bytes (an instruction on the fantasy architecture is 4 bytes long).
mov R0, 0x006C
mov R1, 0x0001
mov R2, [R0]
cmp R15, R2
je 0x0030
mov R14, R2
jg 0x0000
jl 0x0000
add R13, R14
and R12, R13
or R11, R12
xor R10, R11
shl R10, 0x00
shr R10, 0x00
sub R2, R1
mov [R0], R2
jmp 0xFFC8
mov R1, 0x0004
add R0, R1
mov R2, [R0]
add R0, R1
mov R4, [R0]
add R0, R1
mov R8, [R0]
add R0, R1
mov R12, [R0]
jmp 0x0004
mov R3, R12
or R6, R13
add R10, R8
sub R15, R14
For the original architecture instructions (00 to 0F) and 16 registers (R[0] to R[15], the output should follow the model:
original instruction opcode: number of times executed
array of registers: value stored. Something like this:
00:2
01:1
...
0e:1
0f:1
R[0]=0x0000006c
R[1]=0x00000001
...
R[13]=0x03885533
R[14]=0x03885533
R[15]=0x00000000
The problem is that I keep getting a segmentation fault when I try to save the opcode: number of executions. If I try to print only the "opcode:" and register:value pairs, there's no segmentation fault, but instead of printing the first opcode value as "0:", it prints "6C:" which is the R[0] and the r12 (asm register) according to the gdb:
I have tried to insert the push rbp, mov rbp, rsp before the assembly code and the pop rbp, ret after the assembly, but nothing works. Any ideas that could help? Any more infos that I could provide?
Thanks for the help and have a good day.

How can I store abc(x, y) which is a pointer function into an array as per the following code sample?

Function 1.
It is a pointer function.
char *abc(unsigned int a, unsigned int b)
{
//do something here ...
}
Function 2
Leveraged the function 1 into function 2.
I am trying to store the abc function into an array, however I am getting the error as : error: assignment to expression with array type.
fun2()
{
unsigned int x, y;
x= 5, y=6;
char *array1;
char array2;
for(i=0; i<3; i++)
{
array2[i] = abc(x, y);
}
}
You can't store the invocation of a function in C since it would defeat many existing popular optimizations involving register parameters passing - see because normally parameters are assigned their argument values immediately before the execution flow is transferred to the calling site - compilers may choose to use the registers to store those values but as it stands those registers are volatile and so if we were to delay the actual call they would be overwritten at said later time - possibly even by another call to some function which also have its arguments passed as registers. A solution - which I've personally implemented - is to have a function simulate the call for you by re-assigning to the proper registers and any further arguments - to the stack. In this case you store the argument values in a flat memory. But this must be done in assembly exclusively for this purpose and specific to your target architecture. On the other hand if your architecture is not using any such optimizations - it could be quite easier but still hand written assembly would be required.
In any case this is not a feature the standard (or even pre standard as far as I know) C has implemented anytime.
For example this is an implementation for x86-64 I've wrote some time ago (for MSVC masm assembler):
PUBLIC makeuniquecall
.data
makeuniquecall_jmp_table dq zero_zero, one_zero, two_zero, three_zero ; ordinary
makeuniquecall_jmp_table_one dq zero_one, one_one, two_one, three_one ; single precision
makeuniquecall_jmp_table_two dq zero_two, one_two, two_two, three_two ; double precision
.code
makeuniquecall PROC
;rcx - function pointer
;rdx - raw argument data
;r8 - a byte array specifying each register parameter if it's float and the last qword is the size of the rest
push r12
push r13
push r14
mov r12, rcx
mov r13, rdx
mov r14, r8
; first store the stack vars
mov rax, [r14 + 4] ; retrieve size of stack
sub rsp, rax
mov rdi, rsp
xor rdx, rdx
mov r8, 8
div r8
mov rcx, rax
mov rsi, r13
;add rsi, 32
rep movs qword ptr [rdi], qword ptr [rsi]
xor r10,r10
cycle:
mov rax, r14
add rax, r10
movzx rax, byte ptr [rax]
test rax, rax
jnz jmp_one
lea rax, makeuniquecall_jmp_table
jmp qword ptr[rax + r10 * 8]
jmp_one:
cmp rax, 1
jnz jmp_two
lea rax, makeuniquecall_jmp_table_one
jmp qword ptr[rax + r10 * 8]
jmp_two:
lea rax, makeuniquecall_jmp_table_two
jmp qword ptr[rax + r10 * 8]
zero_zero::
mov rcx, qword ptr[r13+r10*8]
jmp continue
one_zero::
mov rdx, qword ptr[r13+r10*8]
jmp continue
two_zero::
mov r8, qword ptr[r13+r10*8]
jmp continue
three_zero::
mov r9, qword ptr[r13+r10*8]
jmp continue
zero_one::
movss xmm0, dword ptr[r13+r10*8]
jmp continue
one_one::
movss xmm1, dword ptr[r13+r10*8]
jmp continue
two_one::
movss xmm2, dword ptr[r13+r10*8]
jmp continue
three_one::
movss xmm3, dword ptr[r13+r10*8]
jmp continue
zero_two::
movsd xmm0, qword ptr[r13+r10*8]
jmp continue
one_two::
movsd xmm1, qword ptr[r13+r10*8]
jmp continue
two_two::
movsd xmm2, qword ptr[r13+r10*8]
jmp continue
three_two::
movsd xmm3, qword ptr[r13+r10*8]
continue:
inc r10
cmp r10, 4
jb cycle
mov r14, [r14 + 4] ; retrieve size of stack
call r12
add rsp, r14
pop r14
pop r13
pop r12
ret
makeuniquecall ENDP
END
And your code will look something like this:
#include <stdio.h>
char* abc(unsigned int a, unsigned int b)
{
printf("a - %d, b - %d\n", a, b);
return "return abc str\n";
}
extern makeuniquecall();
main()
{
unsigned int x, y;
x = 5, y = 6;
#pragma pack(4)
struct {
struct { char maskargs[4]; unsigned long long szargs; } invok;
char *(*pfunc)();
unsigned long long args[2], shadow[2];
} array2[3];
#pragma pack(pop)
for (int i = 0; i < 3; i++)
{
memset(array2[i].invok.maskargs, 0, sizeof array2[i].invok.maskargs); // standard - no floats passed
array2[i].invok.szargs = 8 * 4; //consider shadow space
array2[i].pfunc = abc;
array2[i].args[0] = x;
array2[i].args[1] = y;
}
//now do the calls
for (int i = 0; i < 3; i++)
printf("%s\n", ((char *(*)())makeuniquecall)(array2[i].pfunc, array2[i].args, &array2[i].invok));
}
You'll probably not need that for your specific case you will get away with simply storing each argument and calling the function directly - i.e. (plus this method won't be x86-64 specific):
//now do the calls
for (int i = 0; i < 3; i++)
printf("%s\n", array2[i].pfunc(array2[i].args[0], array2[i].args[1]));
But mine implementation gives you the flexibility to store different amount of arguments for each call.
Note consider this guide for running above examples on msvc (since it requires to add asm file for the assembly code).
I love such noob questions since they make you think about why x-y feature doesn't actually exist in the language.

In Assembly how would I get the value from a call function?

This my C code for my MSP430. I am trying to write str = word_start(str); in assembly but I am not sure if this is correct.
char** tokenize(char* str){
int totalWords = count_words(str);
printf("%d\n", totalWords);
char **array;
array = (char **)malloc(sizeof(char*) * (++totalWords));
//filling the array with individual words
int diff = 0;
int i;
for(i = 0; i < totalWords-1; i++){
str = word_start(str);
// find difference in length
diff = word_terminator(str) - str;
// add new allocated string to array
array[i] = copy_str(str, diff);
// update pointer p to next word
str = word_terminator(str);
}
array[i] = '\0';
return array;
}
This is my assembly code, I want to convert the above method from C to MSP430 assembly.
tokenize:
mov r12, 0(r1) ; put str in str
call #count_words
mov r12 2(r1) ;
mov 2(r1), r12 ; get totalWords
add #1, r12 totalword
add r12, r12
call #malloc
mov r12 4(r1)
mov #0 6(r1) ; int i = 0;
mov #0 8(r1) ; int diff = 0;
top: cmp 0(r1), 6(r1)
JL end
mov 0(r1), r12
call #word_start ; calling word start
mov r12, 0(r1) ; getting the value returned
mov 0(r1), r12
call #word_terminator
mov r12, 8(r1)
mov 0(r1), r12
mov 8(r1), r13
sub r12, r13
call #copy_str
mov r12, 10(r1) ; what we get from cpoy_str put in r12
mov 6(r1), r12 ; put I in r12
add r12, r12 ; add r12
add 4(r1), r12 ; add 4(r1) to whats in r12
mov 10(r1), #r12 ; put what we r12 is in 10(r1)
mov 0(r1), r12
call #word_terminator
move r12, 0(r1)
mov 0(r1), r12
add #1, 6(r1) ; increment i
end:
mov 6(r1), r12
add r12, r12
add 4(r1), r12
mov #0, #r12,
add #12, r1
ret
When I call word_start in assembly, is the way I am passing the value str and getting the value returned done correctly? If not can you show me how you're supposed to pass in variables in assembly to a function and get the returned value from that function?

Conversion from 32 bit to 8-bit values and vice-versa in assembly giving segmentation fault

This is probably my final hurdle in learning x86 assembly language.
The following subroutine is giving me a segmentation fault:
;=================================================================
; RemCharCodeFromAToB - removes all chars between a and e from str
; arguments:
; str - string to be processed
; a - start
; e - end
; return value:
; n/a
;-------------------------------------------------------------------
RemCharCodeFromAToB:
; standard entry sequence
push ebp ; save the previous value of ebp for the benefi$
mov ebp, esp ; copy esp -> ebp so that ebp can be used as a $
; accessing arguments
; [ebp + 0] = old ebp stack frame
; [ebp + 4] = return address
mov edx, [ebp + 8] ; string address
while_loop_rcc:
mov cl, [edx] ; obtain the address of the 1st character of the string
cmp cl, 0 ; check the null value
je while_loop_exit_rcc ; exit if the null-character is reached
mov al, cl ; save cl
mov cl, [ebp + 16] ; end-char
push cx ; push end-char
mov cl, [ebp + 12] ; start-char
push cx ; push start-char
push ax; ; push ch
call IsBetweenAandB
add esp, 12
cmp eax, 0 ; if(ch is not between 'a' and 'e')
je inner_loop_exit_rcc
mov eax, edx ; copy the current address
inner_loop_rcc:
mov cl, [eax+1]
cmp cl, 0
je inner_loop_exit_rcc
mov [eax], cl
inc eax
jmp inner_loop_rcc
inner_loop_exit_rcc:
inc edx ; increment the address
jmp while_loop_rcc ; start the loop again
while_loop_exit_rcc:
; standard exit sequence
mov esp, ebp ; restore esp with ebp
pop ebp ; remove ebp from stack
ret ; return the value of temporary variable
;===================================================================
I am suspecting that there is something wrong with data conversions from 32-bit to 8-bit registers and vice-versa. My concept regarding this is not clear yet.
Or, is there something wrong in the following part
mov al, cl ; save cl
mov cl, [ebp + 16] ; end-char
push cx ; push end-char
mov cl, [ebp + 12] ; start-char
push cx ; push start-char
push ax; ; push ch
call IsBetweenAandB
add esp, 12
?
Full asm code is here.
C++ code is here.
Makefile is here.
cx and ax are 16-bit registers, so your push cx ; push cx; push ax are pushing 16-bit values on the stack, a total of 6 bytes. But IsBetweenAandB is apparently expecting 32-bit values, and you add 12 to esp at the end (instead of 6). So you probably wanted push ecx etc.
Also, you probably want to zero out eax and ecx before using them. As it stands, they probably contain garbage initially, and you only load useful data into the low 8 bits al and cl. Thus when IsBetweenAandB tries to compare the full 32-bit values, you are going to get false results. Or else you want to rewrite IsBetweenAandB to only compare the low bytes that you care about.

Understanding the C function call prolog with __cdecl on windows

Compiling this simple function with MSVC2008, in Debug mode:
int __cdecl sum(int a, int b)
{
return a + b;
}
I get the following disassembly listing:
int __cdecl sum(int a, int b)
{
004113B0 push ebp
004113B1 mov ebp,esp
004113B3 sub esp,0C0h
004113B9 push ebx
004113BA push esi
004113BB push edi
004113BC lea edi,[ebp-0C0h]
004113C2 mov ecx,30h
004113C7 mov eax,0CCCCCCCCh
004113CC rep stos dword ptr es:[edi]
return a + b;
004113CE mov eax,dword ptr [a]
004113D1 add eax,dword ptr [b]
}
004113D4 pop edi
004113D5 pop esi
004113D6 pop ebx
004113D7 mov esp,ebp
004113D9 pop ebp
004113DA ret
There are some parts of the prolog I don't understand:
004113BC lea edi,[ebp-0C0h]
004113C2 mov ecx,30h
004113C7 mov eax,0CCCCCCCCh
004113CC rep stos dword ptr es:[edi]
Why is this required?
EDIT:
After removing the /RTC compiler option, as was suggested, most of this code indeed went away. What remained is:
int __cdecl sum(int a, int b)
{
00411270 push ebp
00411271 mov ebp,esp
00411273 sub esp,40h
00411276 push ebx
00411277 push esi
00411278 push edi
return a + b;
00411279 mov eax,dword ptr [a]
0041127C add eax,dword ptr [b]
}
Now, why is the: sub esp, 40h needed? It's as if place is being allocated for local variables, though there aren't any. Why is the compiler doing this? Is there another flag involved?
This code is emitted due to the /RTC compile option. It initializes all local variables in your function to a bit pattern that is highly likely to generate an access violation or to cause unusual output values. That helps you find out when you forgot to initialize a variable.
The extra space in the stack frame you see allocated is there to support the Edit + Continue feature. This space will be used when you edit the function while debugging and add more local variables. Change the /ZI option to /Zi to disable it.
and in any case of buffer overflow (if you would overwrite local variables) you will end up in a field of "int 3" opcodes:
int 3 ; 0xCC
int 3 ; 0xCC
int 3 ; 0xCC
int 3 ; 0xCC
int 3 ; 0xCC
int 3 ; 0xCC
...
that can be catched by the debugger, so you can fix your code

Resources