Kernel falls into a boot loop at STI instruction - c

I am writing an x86_64 kernel for an exam and it seems to reboot every time it runs an STI instruction, which, as defined in my code, is on every boot. I have set up GDT, IDT and ICWs, and masked all IRQs except IRQ1, which is for my keyboard input. The kernel runs perfectly without STI, except for keyboard input.
Here is my bootloader:
main.asm
global start
extern long_mode_start
bits 32
section .text
start:
mov esp, stack_top
call check_multiboot
call check_cpuid
call check_long_mode
call setup_page_tables
call enable_paging
lgdt [gdt64.pointer]
jmp gdt64.code_segment:long_mode_start
hlt
check_multiboot:
cmp eax, 0x36d76289
jne .no_multiboot
ret
.no_multiboot:
mov al, "M"
jmp error
check_cpuid:
pushfd
pop eax
mov ecx, eax
xor eax, 1 << 21
push eax
popfd
pushfd
pop eax
push ecx
popfd
cmp eax, ecx
je .no_cpuid
ret
.no_cpuid:
mov al, "C"
jmp error
check_long_mode:
mov eax, 0x80000000
cpuid
cmp eax, 0x80000001
jb .no_long_mode
mov eax, 0x80000001
cpuid
test edx, 1 << 29
jz .no_long_mode
ret
.no_long_mode:
mov al, "L"
jmp error
setup_page_tables:
mov eax, page_table_l3
or eax, 0b11 ; present, writable
mov [page_table_l4], eax
mov eax, page_table_l2
or eax, 0b11 ; present, writable
mov [page_table_l3], eax
mov ecx, 0 ; counter
.loop:
mov eax, 0x200000 ; 2MiB
mul ecx
or eax, 0b10000011 ; present, writable, huge page
mov [page_table_l2 + ecx * 8], eax
inc ecx ; increment counter
cmp ecx, 512 ; checks if the whole table is mapped
jne .loop ; if not, continue
ret
enable_paging:
; pass page table location to cpu
mov eax, page_table_l4
mov cr3, eax
; enable PAE
mov eax, cr4
or eax, 1 << 5
mov cr4, eax
; enable long mode
mov ecx, 0xC0000080
rdmsr
or eax, 1 << 8
wrmsr
; enable paging
mov eax, cr0
or eax, 1 << 31
mov cr0, eax
ret
error:
; print "ERR: X" where X is the error code
mov dword [0xb8000], 0x4f524f45
mov dword [0xb8004], 0x4f3a4f52
mov dword [0xb8008], 0x4f204f20
mov byte [0xb800a], al
hlt
section .bss
align 4096
page_table_l4:
resb 4096
page_table_l3:
resb 4096
page_table_l2:
resb 4096
stack_bottom:
resb 4096 * 4
stack_top:
section .rodata
gdt64:
dq 0 ; zero entry
.code_segment: equ $ - gdt64
dq (1 << 43) | (1 << 44) | (1 << 47) | (1 << 53) ; code segment
.pointer:
dw $ - gdt64 - 1 ; length
dq gdt64 ; address
main64.asm
global long_mode_start
global load_gdt
global load_idt
global keyboard_handler
global ioport_in
global ioport_out
global enable_interrupts
extern main
extern handle_keyboard_interrupt
section .text
bits 64
long_mode_start:
; load null into all data segment registers
mov ax, 0
mov ss, ax
mov ds, ax
mov es, ax
mov fs, ax
mov gs, ax
call main
hlt
bits 32
load_idt:
mov edx, [esp + 4]
lidt [edx]
ret
keyboard_handler:
pushad
cld
call handle_keyboard_interrupt
popad
iretd
ioport_in:
mov edx, [esp + 4]
in al, dx
ret
ioport_out:
mov edx, [esp + 4]
mov eax, [esp + 8]
out dx, al
ret
bits 16
enable_interrupts:
sti
ret
And here is my kernel:
main.c
#include "io/print.h"
#include "io/input.h"
void print_prompt(){
print_str("> ");
}
void kernel_main() {
print_clear();
print_set_color(PRINT_COLOR_YELLOW, PRINT_COLOR_BLACK);
print_str("Welcome to vgOS v0.1!!");
print_newline();
print_newline();
print_prompt();
}
int main(){
kernel_main();
init_idt();
enable_interrupts();
init_kb();
print_str("here");
print_newline();
while(1);
return 0;
}
input.h
#pragma once
#include <stdint.h>
#define IDT_SIZE 256
#define KERNEL_CODE_SEGMENT_OFFSET 0x8
#define IDT_INTERRUPT_GATE_64BIT 0x0e
#define PIC1_COMMAND_PORT 0x20
#define PIC1_DATA_PORT 0x21
#define PIC2_COMMAND_PORT 0xA0
#define PIC2_DATA_PORT 0xA1
#define KEYBOARD_DATA_PORT 0x60
#define KEYBOARD_STATUS_PORT 0x64
extern void load_gdt();
extern void load_idt(unsigned int idt_address);
extern void keyboard_handler();
extern char ioport_in(unsigned short port);
extern void ioport_out(unsigned short port, unsigned char data);
extern void enable_interrupts();
struct IDTPointer{
uint16_t limit;
unsigned long long base;
} __attribute__((packed));
struct IDTEntry{
uint16_t offset_1; // Offset bits 0-15
uint16_t selector; // Code segment selector
uint8_t ist; // Interrupt Stack Table offset
uint32_t zero;
uint8_t type_attr; // Gate, type, dpl and p fields
uint16_t offset_2; // Offset bits 16-31
uint32_t offset_3; // Offset bits 32-63
} __attribute__((packed));
void init_idt();
void init_kb();
input.c
#include "input.h"
#include "print.h"
// Declare IDT
struct IDTEntry IDT[IDT_SIZE];
void init_idt(){
// Set IDT keyboard entry
uint64_t offset = (uint64_t)keyboard_handler;
IDT[0x21].offset_1 = offset & 0x000000000000FFFF;
IDT[0x21].selector = KERNEL_CODE_SEGMENT_OFFSET;
IDT[0x21].ist = 0xE; // Set gate type to 'Interrupt'
IDT[0x21].zero = 0; // 0 for testing purposes
IDT[0x21].type_attr = IDT_INTERRUPT_GATE_64BIT;
IDT[0x21].offset_2 = (offset & 0x00000000FFFF0000) >> 16;
IDT[0x21].offset_3 = (offset & 0xFFFFFFFF00000000) >> 32;
// Setup ICWs
// ICW1
ioport_out(PIC1_COMMAND_PORT, 0x11);
ioport_out(PIC2_COMMAND_PORT, 0x11);
// ICW2
ioport_out(PIC1_DATA_PORT, 0x20);
ioport_out(PIC2_DATA_PORT, 0x28);
// ICW3
ioport_out(PIC1_DATA_PORT, 0x4);
ioport_out(PIC2_DATA_PORT, 0x2);
// ICW4
ioport_out(PIC1_DATA_PORT, 0x01);
ioport_out(PIC2_DATA_PORT, 0x01);
// Mask all interrupts
ioport_out(PIC1_DATA_PORT, 0xff);
ioport_out(PIC2_DATA_PORT, 0xff);
// Load IDT data structure
struct IDTPointer idt_ptr;
idt_ptr.limit = (sizeof(struct IDTEntry) * IDT_SIZE) - 1;
idt_ptr.base = (unsigned long long)(&IDT);
load_idt(&idt_ptr);
}
void init_kb(){
// 0xFD = 1111 1101 - Unmask IRQ1
ioport_out(PIC1_DATA_PORT, 0xFD);
}
void handle_keyboard_interrupt(){
ioport_out(PIC1_COMMAND_PORT, 0x20);
unsigned char status = ioport_in(KEYBOARD_STATUS_PORT);
if(status & 0x1){
char keycode = ioport_in(KEYBOARD_DATA_PORT);
if(keycode < 0) return;
print_char(keycode);
}
}

Related

returning Assembly Function In C Code x86 Compilation on 64 bit linux

C code
#include <stdio.h>
int fibonacci(int);
int main()
{
int x = fibonacci(3);
printf("Fibonacci is : %d",x);
return 0;
}
Assembly
section .text
global fibonacci
fibonacci:
push ebp;
mov ebp, esp;
; initialize
mov dword [prev], 0x00000000;
mov dword [cur], 0x00000001;
mov byte [it], 0x01;
mov eax, dword [ebp + 8]; // n = 3
mov byte [n], al;
getfib:
xor edx,edx;
mov dl, byte [n];
cmp byte [it] , dl;
jg loopend;
mov eax,dword [prev];
add eax, dword [cur];
mov ebx, dword [cur];
mov dword [prev], ebx;
mov dword [cur] , eax;
inc byte [it];
jmp getfib;
loopend:
mov eax, dword [cur];
pop ebp;
ret;
section .bss
it resb 1
prev resd 1
cur resd 1
n resb 1
I was trying to run this assembly function in C code and on debugging , i saw that value in variable x in C code is right but there is some error coming when i use the printf function
Need Help on it
Command to compile:
nasm -f elf32 asmcode.asm -o a.o
gcc -ggdb -no-pie -m32 a.o ccode.c -o a.out
Click Below Pictures if they seem blurred
Below is debug before printf execute
Below is after printf execute
Your code does not preserve the ebx register which is a callee-preserved register. The main function apparently tries to do some rip-relative addressing to obtain the address of the format string for printf using ebx as a base register. This fails because your code overwrote ebx.
To fix this issue, make sure to save all callee-saved registers before you use them and then restore their value on return. For example, you can do
fibonacci:
push ebp
mov ebp, esp
push ebx ; <---
...
pop ebx ; <---
pop ebp
ret

C kernel variable data is a sequence of random bytes in long mode x86

I am trying to write a simple OS just for fun and somewhat practice. I've worked in the real mode before but I decided to move on and try playing with protected mode to have an opportunity to use C rather than plain assembly. I have copied bootloader code that seemed to work and it seems to me that basically all it does is just goes into long mode and of course starts the kernel. So it works fine until you declare a variable in the code, because if you do, QEMU will output what I believe is a chunk of memory translated to ASCII.
So the bootloader code itself:
[org 0x7c00]
KERNEL_ADDRESS equ 0x100000
cli
lgdt [gdt_descriptor]
;Switch to PM
mov eax, cr0
or eax, 0x1
mov cr0, eax
jmp 0x8:init_pm
[bits 32]
init_pm :
mov ax, 0x10
mov ds, ax
mov ss, ax
mov es, ax
mov fs, ax
mov gs, ax
call build_page_tables
;Enable PAE
mov eax, cr4
or eax, 1 << 5
mov cr4, eax
;# Optional : Enable global-page mechanism by setting CR0.PGE bit to 1
mov eax, cr4
or eax, 1 << 7
mov cr4, eax
;Load CR3 with PML4 base address
;NB: in some examples online, the address is not offseted as it seems to
;be in the proc datasheet (if you were wondering about this strange thing).
mov eax, 0x1000
mov cr3, eax
;Set LME bit in EFER register (address 0xC0000080)
mov ecx, 0xC0000080 ;operand of 'rdmsr' and 'wrmsr'
rdmsr ;read before pr ne pas écraser le contenu
or eax, 1 << 8 ;eax : operand de wrmsr
wrmsr
;Enable paging by setting CR0.PG bit to 1
mov eax, cr0
or eax, (1 << 31)
mov cr0, eax
;Load 64-bit GDT
lgdt [gdt64_descriptor]
;Jump to code segment in 64-bit GDT
jmp 0x8:init_lm
[bits 64]
init_lm:
mov ax, 0x10
mov fs, ax ;other segments are ignored
mov gs, ax
mov rbp, 0x90000 ;set up stack
mov rsp, rbp
;Load kernel from disk
xor ebx, ebx ;upper 2 bytes above bh in ebx is for cylinder = 0x0
mov bl, 0x2 ;read from 2nd sectors
mov bh, 0x0 ;head
mov ch, 2 ;read 2 sectors
mov rdi, KERNEL_ADDRESS
call ata_chs_read
jmp KERNEL_ADDRESS
jmp $
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;[bits 16]
;; http://wiki.osdev.org/ATA_in_x86_RealMode_%28BIOS%29
;load_loader:
;;!! il faut rester sur le meme segment, ie <0x10000 (=2**16)
;mov bx, LOADER_OFFSET
;mov dh, 1 ;load 1 sector (max allowed by BIOS is 128)
;mov dl, 0x80 ;drive number
;mov ah, 0x02 ;read function
;mov al, dh
;mov ch, 0x00 ;cylinder
;mov dh, 0x00 ;head
;; !! Sector is 1-based, and not 0-based
;mov cl, 0x02 ;1st sector to read
;int 0x13
;ret
[bits 32]
build_page_tables:
;PML4 starts at 0x1000
;il faut laisser la place pour tte la page PML4/PDP/PD ie. 0x1000
;PML4 # 0x1000
mov eax, 0x2000 ;PDP base address
or eax, 0b11 ;P and R/W bits
mov ebx, 0x1000 ;MPL4 base address
mov [ebx], eax
;PDP # 0x2000; maps 64Go
mov eax, 0x3000 ;PD base address
mov ebx, 0x2000 ;PDP physical address
mov ecx, 64 ;64 PDP
build_PDP:
or eax, 0b11
mov [ebx], eax
add ebx, 0x8
add eax, 0x1000 ;next PD page base address
loop build_PDP
;PD # 0x3000 (ends at 0x4000, fits below 0x7c00)
; 1 entry maps a 2MB page, the 1st starts at 0x0
mov eax, 0x0 ;1st page physical base address
mov ebx, 0x3000 ;PD physical base address
mov ecx, 512
build_PD:
or eax, 0b10000011 ;P + R/W + PS (bit for 2MB page)
mov [ebx], eax
add ebx, 0x8
add eax, 0x200000 ;next 2MB physical page
loop build_PD
;(tables end at 0x4000 => fits before Bios boot sector at 0x7c00)
ret
;=============================================================================
; ATA read sectors (CHS mode)
; Max head index is 15, giving 16 possible heads
; Max cylinder index can be a very large number (up to 65535)
; Sector is usually always 1-63, sector 0 reserved, max 255 sectors/track
; If using 63 sectors/track, max disk size = 31.5GB
; If using 255 sectors/track, max disk size = 127.5GB
; See OSDev forum links in bottom of [http://wiki.osdev.org/ATA]
;
; #param EBX The CHS values; 2 bytes, 1 byte (BH), 1 byte (BL) accordingly
; #param CH The number of sectors to read
; #param RDI The address of buffer to put data obtained from disk
;
; #return None
;=============================================================================
[bits 64]
ata_chs_read: pushfq
push rax
push rbx
push rcx
push rdx
push rdi
mov rdx,1f6h ;port to send drive & head numbers
mov al,bh ;head index in BH
and al,00001111b ;head is only 4 bits long
or al,10100000b ;default 1010b in high nibble
out dx,al
mov rdx,1f2h ;Sector count port
mov al,ch ;Read CH sectors
out dx,al
mov rdx,1f3h ;Sector number port
mov al,bl ;BL is sector index
out dx,al
mov rdx,1f4h ;Cylinder low port
mov eax,ebx ;byte 2 in ebx, just above BH
mov cl,16
shr eax,cl ;shift down to AL
out dx,al
mov rdx,1f5h ;Cylinder high port
mov eax,ebx ;byte 3 in ebx, just above byte 2
mov cl,24
shr eax,cl ;shift down to AL
out dx,al
mov rdx,1f7h ;Command port
mov al,20h ;Read with retry.
out dx,al
.still_going: in al,dx
test al,8 ;the sector buffer requires servicing.
jz .still_going ;until the sector buffer is ready.
mov rax,512/2 ;to read 256 words = 1 sector
xor bx,bx
mov bl,ch ;read CH sectors
mul bx
mov rcx,rax ;RCX is counter for INSW
mov rdx,1f0h ;Data port, in and out
rep insw ;in to [RDI]
pop rdi
pop rdx
pop rcx
pop rbx
pop rax
popfq
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
[bits 16]
GDT:
;null :
dd 0x0
dd 0x0
;code :
dw 0xffff ;Limit
dw 0x0 ;Base
db 0x0 ;Base
db 10011010b ;1st flag, Type flag
db 11001111b ;2nd flag, Limit
db 0x0 ;Base
;data :
dw 0xffff
dw 0x0
db 0x0
db 10010010b
db 11001111b
db 0x0
gdt_descriptor :
dw $ - GDT - 1 ;16-bit size
dd GDT ;32-bit start address
[bits 32]
;see manual 2, §4.8: most fields are ignored in long mode
GDT64:
;null;
dq 0x0
;code
dd 0x0
db 0x0
db 0b10011000
db 0b00100000
db 0x0
;data
dd 0x0
db 0x0
db 0b10010000
db 0b00000000
db 0x0
gdt64_descriptor :
dw $ - GDT64 - 1 ;16-bit size
dd GDT64 ;32-bit start address
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
[bits 16]
times 510 -($-$$) db 0
dw 0xaa55
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
I've played around with the code a little bit and simplified it to this state:
void putc(uchar_t __char, uint8_t pos) {
uint16_t* vidmemory = (uint16_t*) 0xB8000;
vidmemory[pos] = (uint16_t)__char | (uint16_t)0x0F << 8;
}
const char* hw = "Hello, World!";
void kmain() {
for (uint8_t i = 0; i < 14; ++i) {
putc(hw[i], i);
}
for (;;);
return;
}
I had the same problem with the real mode after loading up the kernel, because I forgot to set the Data Segment, but as far as I am concerned segments are almost obsolete in the long mode.
Update
After debugging I, as was expected, found out that values of hw were some random bytes from memory. It looks like this:
(gdb) x/14bc hw
0x0 <putc>: 83 &apos;S&apos; -1 &apos;\377&apos; 0 &apos;\000&apos; -16 &apos;\360&apos; 83 &apos;S&apos; -1 &apos;\377&apos; 0 &apos;\000&apos; -16 &apos;\360&apos;
0x8 <putc+8>: -61 &apos;\303&apos; -30 &apos;\342&apos; 0 &apos;\000&apos; -16 &apos;\360&apos; 83 &apos;S&apos; -1 &apos;\377&apos;
I build my image with this script, which also uses "loader.s", but essentially all it does is just calls the kernel itself:
loader.s:
[bits 64]
extern kmain
global _start
_start:
call kmain
jmp $
And the build.sh:
#!/bin/bash
nasm -f bin bootload.s -o boot.bin
nasm -f elf64 loader.s -o loader.o
cc -m64 -masm=intel -c kernel.c -ffreestanding -Wall -Wextra -g -O2
ld -Ttext 0x100000 -o kernel.elf loader.o kernel.o -e kmain
objcopy -R .note -R .comment -S -O binary kernel.elf kernel.bin
dd if=/dev/zero of=image.bin bs=512 count=2880
dd if=boot.bin of=image.bin conv=notrunc
dd if=kernel.bin of=image.bin conv=notrunc bs=512 seek=1
There are probably other problems with my code, but for now I believe it's the matter of the data segment, which I haven't set.
Screenshot of QEMU

Arduino loops add clock cycles, why?

It seems that on Arduino, using loops results in more clock cycles. The default loop adds about 4 cycles per iteration, any other loop (while, for etc.) adds 2. Thus, not using any loop at all, and instead explicitly typing in the repeats, results in fewer cycles. Why is that?
Here are some examples of what I mean:
Example 1
void loop() {
for (int i=0;i<4;i++) {
PORTB |= 0b00010000;
PORTB &= 0b11101111;
}
}
screenshot of measurement
Example 2
void loop() {
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
}
screenshot of measurement
Example 3
void loop() {
while (1) {
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
PORTB |= 0b00010000;
PORTB &= 0b11101111;
}
}
screenshot of measurement
The simple story is:
In this case, example 2 and 3 are the same (and usually the compiler optimize the code and remove useless instructions). Those examples have only 2 instructions:
PORTB |= 0b00010000;
PORTB &= 0b11101111;
Conclusion for those example the code only change the port value
For example 1 you have 5 instructions (1 once per for and 4 for each iteration):
int i = 0 (once per for)
i < 4
PORTB |= 0b00010000;
PORTB &= 0b11101111;
i++
Conclusion for this example the code change the port value and handle the for index
In order to understand why you should learn what a compiler does and how a processor works. Here is the assembly code for your examples
Example 1
PORTB:
.zero 4
loop():
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], 0
.L3:
cmp DWORD PTR [rbp-4], 3
jg .L4
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
add DWORD PTR [rbp-4], 1
jmp .L3
.L4:
nop
pop rbp
ret
Example 2
PORTB:
.zero 4
loop():
push rbp
mov rbp, rsp
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
nop
pop rbp
ret
Example 3
PORTB:
.zero 4
loop():
push rbp
mov rbp, rsp
.L2:
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
or eax, 16
mov DWORD PTR PORTB[rip], eax
mov eax, DWORD PTR PORTB[rip]
and eax, 239
mov DWORD PTR PORTB[rip], eax
jmp .L2

Struct or matrix for a register?

I want to have a register containing 4 bytes of address and 4 bytes of data. For that, I thought about building it in an array of structures (containing address and data as members) or in a matrix. Here a sample code to test what I want to achieve:
#include <stdio.h>
#include <stdint.h>
void reg_init();
#define REG_SIZE 1
typedef struct Reg{
uint8_t addr[4];
uint8_t data[4];
} reg;
static reg reg_struct[REG_SIZE];
static uint8_t reg_matrix[REG_SIZE][8];
int main()
{
int index=-1;
reg_init();
for(int i=0; i<REG_SIZE; i++)
{
uint8_t addr_to_check[4]={0x12,0x34,0x56,0x78};
// FOR STRUCT
for(int j=0; j<4; j++)
{
if(addr_to_check[j]!=reg_struct[i].addr[j]) break;
if(j==3) index = i;
}
//FOR MATRIX
for(int j=0; j<4; j++)
{
if(addr_to_check[j]!=reg_matrix[i][j]) break;
if(j==3) index = i;
}
}
if (index<0) printf("Address not found\n");
else printf("Address at index: %i",index);
return 0;
}
void reg_init()
{
// Register init for testing
reg_struct[0].addr[0] = 0x12;
reg_struct[0].addr[1] = 0x34;
reg_struct[0].addr[2] = 0x56;
reg_struct[0].addr[3] = 0x78;
reg_struct[0].data[0] = 0x01;
reg_struct[0].data[1] = 0x02;
reg_struct[0].data[2] = 0x03;
reg_struct[0].data[3] = 0x04;
reg_matrix[0][0] = 0x12;
reg_matrix[0][1] = 0x34;
reg_matrix[0][2] = 0x56;
reg_matrix[0][3] = 0x78;
reg_matrix[0][4] = 0x01;
reg_matrix[0][5] = 0x02;
reg_matrix[0][6] = 0x03;
reg_matrix[0][7] = 0x04;
}
The example shows just a unit size register, but the size will be much higher (up to 8 bytes). Overall, I'm interested in optimization in terms of performance. Does it really care to use one or another, or will the compiler build the same machine code?
Below is the assembly of the above code created using visual studio 2019.
Look at record line number ; Line 50 and ; Line 51 it looks like compiler has created same assembly code for both matrix and structure.
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.20.27508.1
TITLE D:\main.c
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC ___local_stdio_printf_options
PUBLIC __vfprintf_l
PUBLIC _printf
PUBLIC _reg_init
PUBLIC _main
EXTRN ___acrt_iob_func:PROC
EXTRN ___stdio_common_vfprintf:PROC
_DATA SEGMENT
COMM ?_OptionsStorage#?1??__local_stdio_printf_options##9#9:QWORD ; `__local_stdio_printf_options'::`2'::_OptionsStorage
_DATA ENDS
_BSS SEGMENT
_reg_struct DQ 01H DUP (?)
_reg_matrix DB 08H DUP (?)
_BSS ENDS
_DATA SEGMENT
$SG8132 DB 'Address not found', 0aH, 00H
ORG $+1
$SG8133 DB 'Address at index: %i', 00H
_DATA ENDS
; Function compile flags: /Odtp
_TEXT SEGMENT
_index$ = -20 ; size = 4
_addr_to_check$1 = -16 ; size = 4
_j$2 = -12 ; size = 4
_j$3 = -8 ; size = 4
_i$4 = -4 ; size = 4
_main PROC
; File D:\main.c
; Line 16
push ebp
mov ebp, esp
sub esp, 20 ; 00000014H
; Line 17
mov DWORD PTR _index$[ebp], -1
; Line 18
call _reg_init
; Line 19
mov DWORD PTR _i$4[ebp], 0
jmp SHORT $LN4#main
$LN2#main:
mov eax, DWORD PTR _i$4[ebp]
add eax, 1
mov DWORD PTR _i$4[ebp], eax
$LN4#main:
cmp DWORD PTR _i$4[ebp], 1
jge $LN3#main
; Line 21
mov BYTE PTR _addr_to_check$1[ebp], 18 ; 00000012H
mov BYTE PTR _addr_to_check$1[ebp+1], 52 ; 00000034H
mov BYTE PTR _addr_to_check$1[ebp+2], 86 ; 00000056H
mov BYTE PTR _addr_to_check$1[ebp+3], 120 ; 00000078H
; Line 23
mov DWORD PTR _j$3[ebp], 0
jmp SHORT $LN7#main
$LN5#main:
mov ecx, DWORD PTR _j$3[ebp]
add ecx, 1
mov DWORD PTR _j$3[ebp], ecx
$LN7#main:
cmp DWORD PTR _j$3[ebp], 4
jge SHORT $LN6#main
; Line 25
mov edx, DWORD PTR _j$3[ebp]
movzx eax, BYTE PTR _addr_to_check$1[ebp+edx]
mov ecx, DWORD PTR _j$3[ebp]
mov edx, DWORD PTR _i$4[ebp]
movzx ecx, BYTE PTR _reg_struct[ecx+edx*8]
cmp eax, ecx
je SHORT $LN11#main
jmp SHORT $LN6#main
$LN11#main:
; Line 26
cmp DWORD PTR _j$3[ebp], 3
jne SHORT $LN12#main
mov edx, DWORD PTR _i$4[ebp]
mov DWORD PTR _index$[ebp], edx
$LN12#main:
; Line 27
jmp SHORT $LN5#main
$LN6#main:
; Line 30
mov DWORD PTR _j$2[ebp], 0
jmp SHORT $LN10#main
$LN8#main:
mov eax, DWORD PTR _j$2[ebp]
add eax, 1
mov DWORD PTR _j$2[ebp], eax
$LN10#main:
cmp DWORD PTR _j$2[ebp], 4
jge SHORT $LN9#main
; Line 32
mov ecx, DWORD PTR _j$2[ebp]
movzx edx, BYTE PTR _addr_to_check$1[ebp+ecx]
mov eax, DWORD PTR _j$2[ebp]
mov ecx, DWORD PTR _i$4[ebp]
movzx eax, BYTE PTR _reg_matrix[eax+ecx*8]
cmp edx, eax
je SHORT $LN13#main
jmp SHORT $LN9#main
$LN13#main:
; Line 33
cmp DWORD PTR _j$2[ebp], 3
jne SHORT $LN14#main
mov ecx, DWORD PTR _i$4[ebp]
mov DWORD PTR _index$[ebp], ecx
$LN14#main:
; Line 34
jmp SHORT $LN8#main
$LN9#main:
; Line 35
jmp $LN2#main
$LN3#main:
; Line 36
cmp DWORD PTR _index$[ebp], 0
jge SHORT $LN15#main
push OFFSET $SG8132
call _printf
add esp, 4
jmp SHORT $LN16#main
$LN15#main:
; Line 37
mov edx, DWORD PTR _index$[ebp]
push edx
push OFFSET $SG8133
call _printf
add esp, 8
$LN16#main:
; Line 38
xor eax, eax
; Line 39
mov esp, ebp
pop ebp
ret 0
_main ENDP
_TEXT ENDS
; Function compile flags: /Odtp
_TEXT SEGMENT
_reg_init PROC
; File D:\main.c
; Line 41
push ebp
mov ebp, esp
; Line 43
mov eax, 8
imul ecx, eax, 0
mov edx, 1
imul eax, edx, 0
mov BYTE PTR _reg_struct[ecx+eax], 18 ; 00000012H
; Line 44
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
shl eax, 0
mov BYTE PTR _reg_struct[edx+eax], 52 ; 00000034H
; Line 45
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
shl eax, 1
mov BYTE PTR _reg_struct[edx+eax], 86 ; 00000056H
; Line 46
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
imul ecx, eax, 3
mov BYTE PTR _reg_struct[edx+ecx], 120 ; 00000078H
; Line 47
mov edx, 8
imul eax, edx, 0
mov ecx, 1
imul edx, ecx, 0
mov BYTE PTR _reg_struct[eax+edx+4], 1
; Line 48
mov eax, 8
imul ecx, eax, 0
mov edx, 1
shl edx, 0
mov BYTE PTR _reg_struct[ecx+edx+4], 2
; Line 49
mov eax, 8
imul ecx, eax, 0
mov edx, 1
shl edx, 1
mov BYTE PTR _reg_struct[ecx+edx+4], 3
; Line 50
mov eax, 8
imul ecx, eax, 0
mov edx, 1
imul eax, edx, 3
mov BYTE PTR _reg_struct[ecx+eax+4], 4
; Line 51
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
imul ecx, eax, 0
mov BYTE PTR _reg_matrix[edx+ecx], 18 ; 00000012H
; Line 52
mov edx, 8
imul eax, edx, 0
mov ecx, 1
shl ecx, 0
mov BYTE PTR _reg_matrix[eax+ecx], 52 ; 00000034H
; Line 53
mov edx, 8
imul eax, edx, 0
mov ecx, 1
shl ecx, 1
mov BYTE PTR _reg_matrix[eax+ecx], 86 ; 00000056H
; Line 54
mov edx, 8
imul eax, edx, 0
mov ecx, 1
imul edx, ecx, 3
mov BYTE PTR _reg_matrix[eax+edx], 120 ; 00000078H
; Line 55
mov eax, 8
imul ecx, eax, 0
mov edx, 1
shl edx, 2
mov BYTE PTR _reg_matrix[ecx+edx], 1
; Line 56
mov eax, 8
imul ecx, eax, 0
mov edx, 1
imul eax, edx, 5
mov BYTE PTR _reg_matrix[ecx+eax], 2
; Line 57
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
imul ecx, eax, 6
mov BYTE PTR _reg_matrix[edx+ecx], 3
; Line 58
mov edx, 8
imul eax, edx, 0
mov ecx, 1
imul edx, ecx, 7
mov BYTE PTR _reg_matrix[eax+edx], 4
; Line 59
pop ebp
ret 0
_reg_init ENDP
_TEXT ENDS
; Function compile flags: /Odtp
; COMDAT _printf
_TEXT SEGMENT
__Result$ = -8 ; size = 4
__ArgList$ = -4 ; size = 4
__Format$ = 8 ; size = 4
_printf PROC ; COMDAT
; File C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\ucrt\stdio.h
; Line 954
push ebp
mov ebp, esp
sub esp, 8
; Line 957
lea eax, DWORD PTR __Format$[ebp+4]
mov DWORD PTR __ArgList$[ebp], eax
; Line 958
mov ecx, DWORD PTR __ArgList$[ebp]
push ecx
push 0
mov edx, DWORD PTR __Format$[ebp]
push edx
push 1
call ___acrt_iob_func
add esp, 4
push eax
call __vfprintf_l
add esp, 16 ; 00000010H
mov DWORD PTR __Result$[ebp], eax
; Line 959
mov DWORD PTR __ArgList$[ebp], 0
; Line 960
mov eax, DWORD PTR __Result$[ebp]
; Line 961
mov esp, ebp
pop ebp
ret 0
_printf ENDP
_TEXT ENDS
; Function compile flags: /Odtp
; COMDAT __vfprintf_l
_TEXT SEGMENT
__Stream$ = 8 ; size = 4
__Format$ = 12 ; size = 4
__Locale$ = 16 ; size = 4
__ArgList$ = 20 ; size = 4
__vfprintf_l PROC ; COMDAT
; File C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\ucrt\stdio.h
; Line 642
push ebp
mov ebp, esp
; Line 643
mov eax, DWORD PTR __ArgList$[ebp]
push eax
mov ecx, DWORD PTR __Locale$[ebp]
push ecx
mov edx, DWORD PTR __Format$[ebp]
push edx
mov eax, DWORD PTR __Stream$[ebp]
push eax
call ___local_stdio_printf_options
mov ecx, DWORD PTR [eax+4]
push ecx
mov edx, DWORD PTR [eax]
push edx
call ___stdio_common_vfprintf
add esp, 24 ; 00000018H
; Line 644
pop ebp
ret 0
__vfprintf_l ENDP
_TEXT ENDS
; Function compile flags: /Odtp
; COMDAT ___local_stdio_printf_options
_TEXT SEGMENT
___local_stdio_printf_options PROC ; COMDAT
; File C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\ucrt\corecrt_stdio_config.h
; Line 86
push ebp
mov ebp, esp
; Line 88
mov eax, OFFSET ?_OptionsStorage#?1??__local_stdio_printf_options##9#9 ; `__local_stdio_printf_options'::`2'::_OptionsStorage
; Line 89
pop ebp
ret 0
___local_stdio_printf_options ENDP
_TEXT ENDS
END

Understanding Intel x86 assembly output for a modulo calculation

I have a school project about compilers and how it differs in assembly code between Intel x86 and ARMv7, but i'm stuck trying to comprehend the assembly for the Intel x86 architecture.
The source code is:
int main()
{
int a=5,b=2;
int result;
result = a % b;
printf("Result of 5 modulo 2 is %i\n", result);
}
Assembly output (gcc masm=Intel)
main:
/*
Intel32-x86 Arhchitecture
Little endian
ebp register -- base pointer
esp register -- stack pointer
*/
push ebp ; ebp register put on stack
mov ebp, esp ; Move data from ebp to esp
and esp, -16 ; Logical AND ??
sub esp, 32 ; Subtraction ??
mov DWORD PTR [esp+20], 5
;5 as 32 bits
;00000101-00000000-00000000-00000000
mov DWORD PTR [esp+24], 2
;2 as 32 bits
;00000010-00000000-00000000-00000000
mov eax, DWORD PTR [esp+20]
mov edx, eax
sar edx, 31
;Shift Arithmetically right - edx med 31.
;00000101-00000000-00000000-00000000 BEFORE
;00000000-00000000-00000000-00000000 AFTER
idiv DWORD PTR [esp+24]
;Signed divide - IDIV r/m32 - EDX:EAX register
;Dividing EDX:EAX on value of esp+24, and save the remainder in edx.
;EDX:EAX 00000000-00000000-00000000-00000000-00000101-00000000-00000000-00000000
mov DWORD PTR [esp+28], edx
mov eax, OFFSET FLAT:.LC0
mov edx, DWORD PTR [esp+28]
mov DWORD PTR [esp+4], edx
mov DWORD PTR [esp], eax
call printf
leave
ret
and esp, -16 ; Logical AND
sub esp, 32 ; Subtraction
What is the purpose of those two instructions?
The purpose is mentioned in the comments:
and esp,-16 ;round esp down to 16 byte boundary
sub esp,32 ;allocate 32 bytes of space for local variables
In case you didn't catch this part about sign extending the dividend:
mov eax, DWORD PTR [esp+20] ; eax = dividend
mov edx, eax ; edx = dividend
sar edx, 31 ; edx = 0 or -1 (the sign extension)

Resources