It seems that on Arduino, using loops results in more clock cycles. The default loop adds about 4 cycles per iteration, any other loop (while, for etc.) adds 2. Thus, not using any loop at all, and instead explicitly typing in the repeats, results in fewer cycles. Why is that?
Here are some examples of what I mean:
Example 1
void loop() {
for (int i=0;i<4;i++) {
PORTB |= 0b00010000;
PORTB &= 0b11101111;
}
}
screenshot of measurement
Example 2
void loop() {
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
}
screenshot of measurement
Example 3
void loop() {
while (1) {
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
}
}
screenshot of measurement
The simple story is:
In this case, example 2 and 3 are the same (and usually the compiler optimize the code and remove useless instructions). Those examples have only 2 instructions:
PORTB |= 0b00010000;
PORTB &= 0b11101111;
Conclusion for those example the code only change the port value
For example 1 you have 5 instructions (1 once per for and 4 for each iteration):
int i = 0 (once per for)
i < 4
PORTB |= 0b00010000;
PORTB &= 0b11101111;
i++
Conclusion for this example the code change the port value and handle the for index
In order to understand why you should learn what a compiler does and how a processor works. Here is the assembly code for your examples
Example 1
PORTB:
.zero 4
loop():
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], 0
.L3:
cmp DWORD PTR [rbp-4], 3
jg .L4
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
add DWORD PTR [rbp-4], 1
jmp .L3
.L4:
nop
pop rbp
ret
Example 2
PORTB:
.zero 4
loop():
push rbp
mov rbp, rsp
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
nop
pop rbp
ret
Example 3
PORTB:
.zero 4
loop():
push rbp
mov rbp, rsp
.L2:
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
jmp .L2
Related
I am writing an x86_64 kernel for an exam and it seems to reboot every time it runs an STI instruction, which, as defined in my code, is on every boot. I have set up GDT, IDT and ICWs, and masked all IRQs except IRQ1, which is for my keyboard input. The kernel runs perfectly without STI, except for keyboard input.
Here is my bootloader:
main.asm
global start
extern long_mode_start
bits 32
section .text
start:
mov esp, stack_top
call check_multiboot
call check_cpuid
call check_long_mode
call setup_page_tables
call enable_paging
lgdt [gdt64.pointer]
jmp gdt64.code_segment:long_mode_start
hlt
check_multiboot:
cmp eax, 0x36d76289
jne .no_multiboot
ret
.no_multiboot:
mov al, "M"
jmp error
check_cpuid:
pushfd
pop eax
mov ecx, eax
xor eax, 1 << 21
push eax
popfd
pushfd
pop eax
push ecx
popfd
cmp eax, ecx
je .no_cpuid
ret
.no_cpuid:
mov al, "C"
jmp error
check_long_mode:
mov eax, 0x80000000
cpuid
cmp eax, 0x80000001
jb .no_long_mode
mov eax, 0x80000001
cpuid
test edx, 1 << 29
jz .no_long_mode
ret
.no_long_mode:
mov al, "L"
jmp error
setup_page_tables:
mov eax, page_table_l3
or eax, 0b11 ; present, writable
mov [page_table_l4], eax
mov eax, page_table_l2
or eax, 0b11 ; present, writable
mov [page_table_l3], eax
mov ecx, 0 ; counter
.loop:
mov eax, 0x200000 ; 2MiB
mul ecx
or eax, 0b10000011 ; present, writable, huge page
mov [page_table_l2 + ecx * 8], eax
inc ecx ; increment counter
cmp ecx, 512 ; checks if the whole table is mapped
jne .loop ; if not, continue
ret
enable_paging:
; pass page table location to cpu
mov eax, page_table_l4
mov cr3, eax
; enable PAE
mov eax, cr4
or eax, 1 << 5
mov cr4, eax
; enable long mode
mov ecx, 0xC0000080
rdmsr
or eax, 1 << 8
wrmsr
; enable paging
mov eax, cr0
or eax, 1 << 31
mov cr0, eax
ret
error:
; print "ERR: X" where X is the error code
mov dword [0xb8000], 0x4f524f45
mov dword [0xb8004], 0x4f3a4f52
mov dword [0xb8008], 0x4f204f20
mov byte [0xb800a], al
hlt
section .bss
align 4096
page_table_l4:
resb 4096
page_table_l3:
resb 4096
page_table_l2:
resb 4096
stack_bottom:
resb 4096 * 4
stack_top:
section .rodata
gdt64:
dq 0 ; zero entry
.code_segment: equ $ - gdt64
dq (1 << 43) | (1 << 44) | (1 << 47) | (1 << 53) ; code segment
.pointer:
dw $ - gdt64 - 1 ; length
dq gdt64 ; address
main64.asm
global long_mode_start
global load_gdt
global load_idt
global keyboard_handler
global ioport_in
global ioport_out
global enable_interrupts
extern main
extern handle_keyboard_interrupt
section .text
bits 64
long_mode_start:
; load null into all data segment registers
mov ax, 0
mov ss, ax
mov ds, ax
mov es, ax
mov fs, ax
mov gs, ax
call main
hlt
bits 32
load_idt:
mov edx, [esp + 4]
lidt [edx]
ret
keyboard_handler:
pushad
cld
call handle_keyboard_interrupt
popad
iretd
ioport_in:
mov edx, [esp + 4]
in al, dx
ret
ioport_out:
mov edx, [esp + 4]
mov eax, [esp + 8]
out dx, al
ret
bits 16
enable_interrupts:
sti
ret
And here is my kernel:
main.c
#include "io/print.h"
#include "io/input.h"
void print_prompt(){
print_str("> ");
}
void kernel_main() {
print_clear();
print_set_color(PRINT_COLOR_YELLOW, PRINT_COLOR_BLACK);
print_str("Welcome to vgOS v0.1!!");
print_newline();
print_newline();
print_prompt();
}
int main(){
kernel_main();
init_idt();
enable_interrupts();
init_kb();
print_str("here");
print_newline();
while(1);
return 0;
}
input.h
#pragma once
#include <stdint.h>
#define IDT_SIZE 256
#define KERNEL_CODE_SEGMENT_OFFSET 0x8
#define IDT_INTERRUPT_GATE_64BIT 0x0e
#define PIC1_COMMAND_PORT 0x20
#define PIC1_DATA_PORT 0x21
#define PIC2_COMMAND_PORT 0xA0
#define PIC2_DATA_PORT 0xA1
#define KEYBOARD_DATA_PORT 0x60
#define KEYBOARD_STATUS_PORT 0x64
extern void load_gdt();
extern void load_idt(unsigned int idt_address);
extern void keyboard_handler();
extern char ioport_in(unsigned short port);
extern void ioport_out(unsigned short port, unsigned char data);
extern void enable_interrupts();
struct IDTPointer{
uint16_t limit;
unsigned long long base;
} __attribute__((packed));
struct IDTEntry{
uint16_t offset_1; // Offset bits 0-15
uint16_t selector; // Code segment selector
uint8_t ist; // Interrupt Stack Table offset
uint32_t zero;
uint8_t type_attr; // Gate, type, dpl and p fields
uint16_t offset_2; // Offset bits 16-31
uint32_t offset_3; // Offset bits 32-63
} __attribute__((packed));
void init_idt();
void init_kb();
input.c
#include "input.h"
#include "print.h"
// Declare IDT
struct IDTEntry IDT[IDT_SIZE];
void init_idt(){
// Set IDT keyboard entry
uint64_t offset = (uint64_t)keyboard_handler;
IDT[0x21].offset_1 = offset & 0x000000000000FFFF;
IDT[0x21].selector = KERNEL_CODE_SEGMENT_OFFSET;
IDT[0x21].ist = 0xE; // Set gate type to 'Interrupt'
IDT[0x21].zero = 0; // 0 for testing purposes
IDT[0x21].type_attr = IDT_INTERRUPT_GATE_64BIT;
IDT[0x21].offset_2 = (offset & 0x00000000FFFF0000) >> 16;
IDT[0x21].offset_3 = (offset & 0xFFFFFFFF00000000) >> 32;
// Setup ICWs
// ICW1
ioport_out(PIC1_COMMAND_PORT, 0x11);
ioport_out(PIC2_COMMAND_PORT, 0x11);
// ICW2
ioport_out(PIC1_DATA_PORT, 0x20);
ioport_out(PIC2_DATA_PORT, 0x28);
// ICW3
ioport_out(PIC1_DATA_PORT, 0x4);
ioport_out(PIC2_DATA_PORT, 0x2);
// ICW4
ioport_out(PIC1_DATA_PORT, 0x01);
ioport_out(PIC2_DATA_PORT, 0x01);
// Mask all interrupts
ioport_out(PIC1_DATA_PORT, 0xff);
ioport_out(PIC2_DATA_PORT, 0xff);
// Load IDT data structure
struct IDTPointer idt_ptr;
idt_ptr.limit = (sizeof(struct IDTEntry) * IDT_SIZE) - 1;
idt_ptr.base = (unsigned long long)(&IDT);
load_idt(&idt_ptr);
}
void init_kb(){
// 0xFD = 1111 1101 - Unmask IRQ1
ioport_out(PIC1_DATA_PORT, 0xFD);
}
void handle_keyboard_interrupt(){
ioport_out(PIC1_COMMAND_PORT, 0x20);
unsigned char status = ioport_in(KEYBOARD_STATUS_PORT);
if(status & 0x1){
char keycode = ioport_in(KEYBOARD_DATA_PORT);
if(keycode < 0) return;
print_char(keycode);
}
}
I want to have a register containing 4 bytes of address and 4 bytes of data. For that, I thought about building it in an array of structures (containing address and data as members) or in a matrix. Here a sample code to test what I want to achieve:
#include <stdio.h>
#include <stdint.h>
void reg_init();
#define REG_SIZE 1
typedef struct Reg{
uint8_t addr[4];
uint8_t data[4];
} reg;
static reg reg_struct[REG_SIZE];
static uint8_t reg_matrix[REG_SIZE][8];
int main()
{
int index=-1;
reg_init();
for(int i=0; i<REG_SIZE; i++)
{
uint8_t addr_to_check[4]={0x12,0x34,0x56,0x78};
// FOR STRUCT
for(int j=0; j<4; j++)
{
if(addr_to_check[j]!=reg_struct[i].addr[j]) break;
if(j==3) index = i;
}
//FOR MATRIX
for(int j=0; j<4; j++)
{
if(addr_to_check[j]!=reg_matrix[i][j]) break;
if(j==3) index = i;
}
}
if (index<0) printf("Address not found\n");
else printf("Address at index: %i",index);
return 0;
}
void reg_init()
{
// Register init for testing
reg_struct[0].addr[0] = 0x12;
reg_struct[0].addr[1] = 0x34;
reg_struct[0].addr[2] = 0x56;
reg_struct[0].addr[3] = 0x78;
reg_struct[0].data[0] = 0x01;
reg_struct[0].data[1] = 0x02;
reg_struct[0].data[2] = 0x03;
reg_struct[0].data[3] = 0x04;
reg_matrix[0][0] = 0x12;
reg_matrix[0][1] = 0x34;
reg_matrix[0][2] = 0x56;
reg_matrix[0][3] = 0x78;
reg_matrix[0][4] = 0x01;
reg_matrix[0][5] = 0x02;
reg_matrix[0][6] = 0x03;
reg_matrix[0][7] = 0x04;
}
The example shows just a unit size register, but the size will be much higher (up to 8 bytes). Overall, I'm interested in optimization in terms of performance. Does it really care to use one or another, or will the compiler build the same machine code?
Below is the assembly of the above code created using visual studio 2019.
Look at record line number ; Line 50 and ; Line 51 it looks like compiler has created same assembly code for both matrix and structure.
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.20.27508.1
TITLE D:\main.c
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC ___local_stdio_printf_options
PUBLIC __vfprintf_l
PUBLIC _printf
PUBLIC _reg_init
PUBLIC _main
EXTRN ___acrt_iob_func:PROC
EXTRN ___stdio_common_vfprintf:PROC
_DATA SEGMENT
COMM ?_OptionsStorage#?1??__local_stdio_printf_options##9#9:QWORD ; `__local_stdio_printf_options'::`2'::_OptionsStorage
_DATA ENDS
_BSS SEGMENT
_reg_struct DQ 01H DUP (?)
_reg_matrix DB 08H DUP (?)
_BSS ENDS
_DATA SEGMENT
$SG8132 DB 'Address not found', 0aH, 00H
ORG $+1
$SG8133 DB 'Address at index: %i', 00H
_DATA ENDS
; Function compile flags: /Odtp
_TEXT SEGMENT
_index$ = -20 ; size = 4
_addr_to_check$1 = -16 ; size = 4
_j$2 = -12 ; size = 4
_j$3 = -8 ; size = 4
_i$4 = -4 ; size = 4
_main PROC
; File D:\main.c
; Line 16
push ebp
mov ebp, esp
sub esp, 20 ; 00000014H
; Line 17
mov DWORD PTR _index$[ebp], -1
; Line 18
call _reg_init
; Line 19
mov DWORD PTR _i$4[ebp], 0
jmp SHORT $LN4#main
$LN2#main:
mov eax, DWORD PTR _i$4[ebp]
add eax, 1
mov DWORD PTR _i$4[ebp], eax
$LN4#main:
cmp DWORD PTR _i$4[ebp], 1
jge $LN3#main
; Line 21
mov BYTE PTR _addr_to_check$1[ebp], 18 ; 00000012H
mov BYTE PTR _addr_to_check$1[ebp+1], 52 ; 00000034H
mov BYTE PTR _addr_to_check$1[ebp+2], 86 ; 00000056H
mov BYTE PTR _addr_to_check$1[ebp+3], 120 ; 00000078H
; Line 23
mov DWORD PTR _j$3[ebp], 0
jmp SHORT $LN7#main
$LN5#main:
mov ecx, DWORD PTR _j$3[ebp]
add ecx, 1
mov DWORD PTR _j$3[ebp], ecx
$LN7#main:
cmp DWORD PTR _j$3[ebp], 4
jge SHORT $LN6#main
; Line 25
mov edx, DWORD PTR _j$3[ebp]
movzx eax, BYTE PTR _addr_to_check$1[ebp+edx]
mov ecx, DWORD PTR _j$3[ebp]
mov edx, DWORD PTR _i$4[ebp]
movzx ecx, BYTE PTR _reg_struct[ecx+edx*8]
cmp eax, ecx
je SHORT $LN11#main
jmp SHORT $LN6#main
$LN11#main:
; Line 26
cmp DWORD PTR _j$3[ebp], 3
jne SHORT $LN12#main
mov edx, DWORD PTR _i$4[ebp]
mov DWORD PTR _index$[ebp], edx
$LN12#main:
; Line 27
jmp SHORT $LN5#main
$LN6#main:
; Line 30
mov DWORD PTR _j$2[ebp], 0
jmp SHORT $LN10#main
$LN8#main:
mov eax, DWORD PTR _j$2[ebp]
add eax, 1
mov DWORD PTR _j$2[ebp], eax
$LN10#main:
cmp DWORD PTR _j$2[ebp], 4
jge SHORT $LN9#main
; Line 32
mov ecx, DWORD PTR _j$2[ebp]
movzx edx, BYTE PTR _addr_to_check$1[ebp+ecx]
mov eax, DWORD PTR _j$2[ebp]
mov ecx, DWORD PTR _i$4[ebp]
movzx eax, BYTE PTR _reg_matrix[eax+ecx*8]
cmp edx, eax
je SHORT $LN13#main
jmp SHORT $LN9#main
$LN13#main:
; Line 33
cmp DWORD PTR _j$2[ebp], 3
jne SHORT $LN14#main
mov ecx, DWORD PTR _i$4[ebp]
mov DWORD PTR _index$[ebp], ecx
$LN14#main:
; Line 34
jmp SHORT $LN8#main
$LN9#main:
; Line 35
jmp $LN2#main
$LN3#main:
; Line 36
cmp DWORD PTR _index$[ebp], 0
jge SHORT $LN15#main
push OFFSET $SG8132
call _printf
add esp, 4
jmp SHORT $LN16#main
$LN15#main:
; Line 37
mov edx, DWORD PTR _index$[ebp]
push edx
push OFFSET $SG8133
call _printf
add esp, 8
$LN16#main:
; Line 38
xor eax, eax
; Line 39
mov esp, ebp
pop ebp
ret 0
_main ENDP
_TEXT ENDS
; Function compile flags: /Odtp
_TEXT SEGMENT
_reg_init PROC
; File D:\main.c
; Line 41
push ebp
mov ebp, esp
; Line 43
mov eax, 8
imul ecx, eax, 0
mov edx, 1
imul eax, edx, 0
mov BYTE PTR _reg_struct[ecx+eax], 18 ; 00000012H
; Line 44
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
shl eax, 0
mov BYTE PTR _reg_struct[edx+eax], 52 ; 00000034H
; Line 45
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
shl eax, 1
mov BYTE PTR _reg_struct[edx+eax], 86 ; 00000056H
; Line 46
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
imul ecx, eax, 3
mov BYTE PTR _reg_struct[edx+ecx], 120 ; 00000078H
; Line 47
mov edx, 8
imul eax, edx, 0
mov ecx, 1
imul edx, ecx, 0
mov BYTE PTR _reg_struct[eax+edx+4], 1
; Line 48
mov eax, 8
imul ecx, eax, 0
mov edx, 1
shl edx, 0
mov BYTE PTR _reg_struct[ecx+edx+4], 2
; Line 49
mov eax, 8
imul ecx, eax, 0
mov edx, 1
shl edx, 1
mov BYTE PTR _reg_struct[ecx+edx+4], 3
; Line 50
mov eax, 8
imul ecx, eax, 0
mov edx, 1
imul eax, edx, 3
mov BYTE PTR _reg_struct[ecx+eax+4], 4
; Line 51
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
imul ecx, eax, 0
mov BYTE PTR _reg_matrix[edx+ecx], 18 ; 00000012H
; Line 52
mov edx, 8
imul eax, edx, 0
mov ecx, 1
shl ecx, 0
mov BYTE PTR _reg_matrix[eax+ecx], 52 ; 00000034H
; Line 53
mov edx, 8
imul eax, edx, 0
mov ecx, 1
shl ecx, 1
mov BYTE PTR _reg_matrix[eax+ecx], 86 ; 00000056H
; Line 54
mov edx, 8
imul eax, edx, 0
mov ecx, 1
imul edx, ecx, 3
mov BYTE PTR _reg_matrix[eax+edx], 120 ; 00000078H
; Line 55
mov eax, 8
imul ecx, eax, 0
mov edx, 1
shl edx, 2
mov BYTE PTR _reg_matrix[ecx+edx], 1
; Line 56
mov eax, 8
imul ecx, eax, 0
mov edx, 1
imul eax, edx, 5
mov BYTE PTR _reg_matrix[ecx+eax], 2
; Line 57
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
imul ecx, eax, 6
mov BYTE PTR _reg_matrix[edx+ecx], 3
; Line 58
mov edx, 8
imul eax, edx, 0
mov ecx, 1
imul edx, ecx, 7
mov BYTE PTR _reg_matrix[eax+edx], 4
; Line 59
pop ebp
ret 0
_reg_init ENDP
_TEXT ENDS
; Function compile flags: /Odtp
; COMDAT _printf
_TEXT SEGMENT
__Result$ = -8 ; size = 4
__ArgList$ = -4 ; size = 4
__Format$ = 8 ; size = 4
_printf PROC ; COMDAT
; File C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\ucrt\stdio.h
; Line 954
push ebp
mov ebp, esp
sub esp, 8
; Line 957
lea eax, DWORD PTR __Format$[ebp+4]
mov DWORD PTR __ArgList$[ebp], eax
; Line 958
mov ecx, DWORD PTR __ArgList$[ebp]
push ecx
push 0
mov edx, DWORD PTR __Format$[ebp]
push edx
push 1
call ___acrt_iob_func
add esp, 4
push eax
call __vfprintf_l
add esp, 16 ; 00000010H
mov DWORD PTR __Result$[ebp], eax
; Line 959
mov DWORD PTR __ArgList$[ebp], 0
; Line 960
mov eax, DWORD PTR __Result$[ebp]
; Line 961
mov esp, ebp
pop ebp
ret 0
_printf ENDP
_TEXT ENDS
; Function compile flags: /Odtp
; COMDAT __vfprintf_l
_TEXT SEGMENT
__Stream$ = 8 ; size = 4
__Format$ = 12 ; size = 4
__Locale$ = 16 ; size = 4
__ArgList$ = 20 ; size = 4
__vfprintf_l PROC ; COMDAT
; File C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\ucrt\stdio.h
; Line 642
push ebp
mov ebp, esp
; Line 643
mov eax, DWORD PTR __ArgList$[ebp]
push eax
mov ecx, DWORD PTR __Locale$[ebp]
push ecx
mov edx, DWORD PTR __Format$[ebp]
push edx
mov eax, DWORD PTR __Stream$[ebp]
push eax
call ___local_stdio_printf_options
mov ecx, DWORD PTR [eax+4]
push ecx
mov edx, DWORD PTR [eax]
push edx
call ___stdio_common_vfprintf
add esp, 24 ; 00000018H
; Line 644
pop ebp
ret 0
__vfprintf_l ENDP
_TEXT ENDS
; Function compile flags: /Odtp
; COMDAT ___local_stdio_printf_options
_TEXT SEGMENT
___local_stdio_printf_options PROC ; COMDAT
; File C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\ucrt\corecrt_stdio_config.h
; Line 86
push ebp
mov ebp, esp
; Line 88
mov eax, OFFSET ?_OptionsStorage#?1??__local_stdio_printf_options##9#9 ; `__local_stdio_printf_options'::`2'::_OptionsStorage
; Line 89
pop ebp
ret 0
___local_stdio_printf_options ENDP
_TEXT ENDS
END
I'm trying to convert 32bit float to 64bit double in asm on x86 architecture. The conversion is done by function written in asm and then I want to call it from C. I have no idea what I'm doing wrong, but memory pointed by dst seem to stay untouched and after printf program crashes. I want to do it without any floating-point intructions. Here's the code:
.686
.model flat
public _conv
.data
mantissa_mask dd 00000000011111111111111111111111b
exponent_mask dd 01111111100000000000000000000000b
.code
_conv PROC
pusha
mov ebp, esp
mov esi, dword ptr [ebp+8] ; src
mov edi, dword ptr [ebp+12]; dst
mov dword ptr [edi], 0
mov dword ptr [edi+4], 0
mov eax, dword ptr [esi]
and eax, dword ptr mantissa_mask
mov dword ptr [edi], eax
xor edx, edx ; zero edx
mov ecx, 1
shl ecx, 29 ;ecx == 2^29
mul ecx ;so it's like `shl edx:eax, 29`
mov dword ptr [edi], eax
mov dword ptr [edi+4], edx
mov eax, dword ptr [esi]
and eax, dword ptr exponent_mask
shr eax, 23 ;put exponent on lowest bits
sub eax, 127 ;exponent in float is coded enlarged by 127
add eax, 1023 ;in double it's enlarged by 1023
shl eax, 20 ;exponent in double starts on 20bit of 2nd byte
or dword ptr [edi], eax
;sign bit:
bt dword ptr [esi], 31
jc set_sign_bit
btr dword ptr [edi+4], 31
jmp endthis
set_sign_bit:
bts dword ptr [edi+4], 31
endthis:
popa
ret
_conv ENDP
END
And the C code:
void conv(float * src, double * dst);
int main()
{
float src = 4.5f;
double dst = 0.;
conv(&src, &dst);
printf("%f\n", dst);
return 0;
}
Your primary problem is accessing the arguments. Since you did pusha the arguments are not at [ebp+8] and [ebp+12], rather at [ebp+36] and [ebp+40]. A debugger would have shown you this right away. Even with those changes your code is still broken though.
Ok, finally it works. Very helpful was Jester's advice about args access. Stupid thing, but hard to notice. Here's final code:
.686
.model flat
public _conv
.data
mantissa_mask dd 00000000011111111111111111111111b
exponent_mask dd 01111111100000000000000000000000b
.code
_conv PROC
pusha
mov ebp, esp
;+36 and +40 since pusha
mov esi, dword ptr [ebp+36]; src
mov edi, dword ptr [ebp+40]; dst
mov dword ptr [edi], 0
mov dword ptr [edi+4], 0
;mentissa:
mov eax, dword ptr [esi]
and eax, dword ptr mantissa_mask
mov dword ptr [edi], eax
xor edx, edx ; zero edx
mov ecx, 1
shl ecx, 29 ;ecx == 2^29
mul ecx ;so it's like `shl edx:eax, 29`
mov dword ptr [edi], eax
mov dword ptr [edi+4], edx
;exponent:
mov eax, dword ptr [esi]
and eax, dword ptr exponent_mask
shr eax, 23 ;put exponent on lowest bits
sub eax, 127 ;exponent in float is coded enlarged by 127
add eax, 1023 ;in double it's enlarged by 1023
shl eax, 20 ;exponent in double starts on 20bit of 2nd byte
or dword ptr [edi+4], eax
;sign bit:
bt dword ptr [esi], 31
jc set_sign_bit
btr dword ptr [edi+4], 31
jmp endthis
set_sign_bit:
bts dword ptr [edi+4], 31
endthis:
popa
ret
_conv ENDP
END
I wrote this classic function : (in 32-bit mode)
void ex(size_t a, size_t b)
{
size_t c;
c = a;
a = b;
b = c;
}
I call it inside the main as follows :
size_t a = 4;
size_t b = 5;
ex(a,b);
What I was expecting from the assembly code generated when entering the function is something like this :
1-Push the values of b and a in the stack : (which was done)
mov eax,dword ptr [b]
push eax
mov ecx,dword ptr [a]
push ecx
2-Use the values of a and b in the stack :
push ebp
mov ebp, esp
sub esp, 4
c = a;
mov eax, dword ptr [ebp+8]
mov dword ptr [ebp-4], eax
and so on for the other variables.
However, this is what I find when debugging :
push ebp
mov ebp,esp
sub esp,0CCh // normal since it's in debug with ZI option
push ebx
push esi
push edi
lea edi,[ebp-0CCh]
mov ecx,33h
mov eax,0CCCCCCCCh
rep stos dword ptr es:[edi]
size_t c;
c = a;
mov eax,dword ptr [a]
mov dword ptr [c],eax
Why is it using the variable a directly instead of calling the value stored in the stack? I don't understand...
The debugger doesn't show the instruction using ebp to access a. The same syntax is permitted when you write inline assembly. Otherwise the reason that dword ptr still appears.
It is easy to get it your preferred way, right click > untick "Show Symbol Names".
Using the assembly output option (right click on file name, properties, ...), I get what you expect from debug assembly output. This could depend on which version of VS you use. For this example, I used VS2005. I have VS2015 on a different system, but didn't try it yet.
_c$ = -8 ; size = 4
_a$ = 8 ; size = 4
_b$ = 12 ; size = 4
_ex PROC ; COMDAT
push ebp
mov ebp, esp
sub esp, 204 ; 000000ccH
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-204]
mov ecx, 51 ; 00000033H
mov eax, -858993460 ; ccccccccH
rep stosd ;fill with 0cccccccch
mov eax, DWORD PTR _a$[ebp]
mov DWORD PTR _c$[ebp], eax
mov eax, DWORD PTR _b$[ebp]
mov DWORD PTR _a$[ebp], eax
mov eax, DWORD PTR _c$[ebp]
mov DWORD PTR _b$[ebp], eax
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
_ex ENDP
Note this doesn't work, you need to use pointers for the swap to work.
void ex(size_t *pa, size_t *pb)
{
size_t c;
c = *pa;
*pa = *pb;
*pb = c;
}
which gets translated into:
_c$ = -8 ; size = 4
_pa$ = 8 ; size = 4
_pb$ = 12 ; size = 4
_ex PROC ; COMDAT
push ebp
mov ebp, esp
sub esp, 204 ; 000000ccH
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-204]
mov ecx, 51 ; 00000033H
mov eax, -858993460 ; ccccccccH
rep stosd
mov eax, DWORD PTR _pa$[ebp]
mov ecx, DWORD PTR [eax]
mov DWORD PTR _c$[ebp], ecx
mov eax, DWORD PTR _pa$[ebp]
mov ecx, DWORD PTR _pb$[ebp]
mov edx, DWORD PTR [ecx]
mov DWORD PTR [eax], edx
mov eax, DWORD PTR _pb$[ebp]
mov ecx, DWORD PTR _c$[ebp]
mov DWORD PTR [eax], ecx
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
_ex ENDP
I have a school project about compilers and how it differs in assembly code between Intel x86 and ARMv7, but i'm stuck trying to comprehend the assembly for the Intel x86 architecture.
The source code is:
int main()
{
int a=5,b=2;
int result;
result = a % b;
printf("Result of 5 modulo 2 is %i\n", result);
}
Assembly output (gcc masm=Intel)
main:
/*
Intel32-x86 Arhchitecture
Little endian
ebp register -- base pointer
esp register -- stack pointer
*/
push ebp ; ebp register put on stack
mov ebp, esp ; Move data from ebp to esp
and esp, -16 ; Logical AND ??
sub esp, 32 ; Subtraction ??
mov DWORD PTR [esp+20], 5
;5 as 32 bits
;00000101-00000000-00000000-00000000
mov DWORD PTR [esp+24], 2
;2 as 32 bits
;00000010-00000000-00000000-00000000
mov eax, DWORD PTR [esp+20]
mov edx, eax
sar edx, 31
;Shift Arithmetically right - edx med 31.
;00000101-00000000-00000000-00000000 BEFORE
;00000000-00000000-00000000-00000000 AFTER
idiv DWORD PTR [esp+24]
;Signed divide - IDIV r/m32 - EDX:EAX register
;Dividing EDX:EAX on value of esp+24, and save the remainder in edx.
;EDX:EAX 00000000-00000000-00000000-00000000-00000101-00000000-00000000-00000000
mov DWORD PTR [esp+28], edx
mov eax, OFFSET FLAT:.LC0
mov edx, DWORD PTR [esp+28]
mov DWORD PTR [esp+4], edx
mov DWORD PTR [esp], eax
call printf
leave
ret
and esp, -16 ; Logical AND
sub esp, 32 ; Subtraction
What is the purpose of those two instructions?
The purpose is mentioned in the comments:
and esp,-16 ;round esp down to 16 byte boundary
sub esp,32 ;allocate 32 bytes of space for local variables
In case you didn't catch this part about sign extending the dividend:
mov eax, DWORD PTR [esp+20] ; eax = dividend
mov edx, eax ; edx = dividend
sar edx, 31 ; edx = 0 or -1 (the sign extension)