Related
I am writing an x86_64 kernel for an exam, and it seems to reboot every time it executes an STI instruction, which my code runs on every boot. I have set up the GDT, IDT and ICWs, and masked all IRQs except IRQ1, which is for my keyboard input. The kernel runs perfectly without STI, except that keyboard input does not work.
Here is my bootloader:
main.asm
global start
extern long_mode_start
bits 32
section .text
start:
mov esp, stack_top
call check_multiboot
call check_cpuid
call check_long_mode
call setup_page_tables
call enable_paging
lgdt [gdt64.pointer]
jmp gdt64.code_segment:long_mode_start
hlt
check_multiboot:
cmp eax, 0x36d76289
jne .no_multiboot
ret
.no_multiboot:
mov al, "M"
jmp error
check_cpuid:
pushfd
pop eax
mov ecx, eax
xor eax, 1 << 21
push eax
popfd
pushfd
pop eax
push ecx
popfd
cmp eax, ecx
je .no_cpuid
ret
.no_cpuid:
mov al, "C"
jmp error
check_long_mode:
mov eax, 0x80000000
cpuid
cmp eax, 0x80000001
jb .no_long_mode
mov eax, 0x80000001
cpuid
test edx, 1 << 29
jz .no_long_mode
ret
.no_long_mode:
mov al, "L"
jmp error
setup_page_tables:
mov eax, page_table_l3
or eax, 0b11 ; present, writable
mov [page_table_l4], eax
mov eax, page_table_l2
or eax, 0b11 ; present, writable
mov [page_table_l3], eax
mov ecx, 0 ; counter
.loop:
mov eax, 0x200000 ; 2MiB
mul ecx
or eax, 0b10000011 ; present, writable, huge page
mov [page_table_l2 + ecx * 8], eax
inc ecx ; increment counter
cmp ecx, 512 ; checks if the whole table is mapped
jne .loop ; if not, continue
ret
enable_paging:
; pass page table location to cpu
mov eax, page_table_l4
mov cr3, eax
; enable PAE
mov eax, cr4
or eax, 1 << 5
mov cr4, eax
; enable long mode
mov ecx, 0xC0000080
rdmsr
or eax, 1 << 8
wrmsr
; enable paging
mov eax, cr0
or eax, 1 << 31
mov cr0, eax
ret
error:
; print "ERR: X" where X is the error code
mov dword [0xb8000], 0x4f524f45
mov dword [0xb8004], 0x4f3a4f52
mov dword [0xb8008], 0x4f204f20
mov byte [0xb800a], al
hlt
section .bss
align 4096
page_table_l4:
resb 4096
page_table_l3:
resb 4096
page_table_l2:
resb 4096
stack_bottom:
resb 4096 * 4
stack_top:
section .rodata
gdt64:
dq 0 ; zero entry
.code_segment: equ $ - gdt64
dq (1 << 43) | (1 << 44) | (1 << 47) | (1 << 53) ; code segment
.pointer:
dw $ - gdt64 - 1 ; length
dq gdt64 ; address
main64.asm
global long_mode_start
global load_gdt
global load_idt
global keyboard_handler
global ioport_in
global ioport_out
global enable_interrupts
extern main
extern handle_keyboard_interrupt
section .text
bits 64
long_mode_start:
; load null into all data segment registers
mov ax, 0
mov ss, ax
mov ds, ax
mov es, ax
mov fs, ax
mov gs, ax
call main
hlt
bits 32
load_idt:
mov edx, [esp + 4]
lidt [edx]
ret
keyboard_handler:
pushad
cld
call handle_keyboard_interrupt
popad
iretd
ioport_in:
mov edx, [esp + 4]
in al, dx
ret
ioport_out:
mov edx, [esp + 4]
mov eax, [esp + 8]
out dx, al
ret
bits 16
enable_interrupts:
sti
ret
And here is my kernel:
main.c
#include "io/print.h"
#include "io/input.h"
void print_prompt(){
print_str("> ");
}
void kernel_main() {
print_clear();
print_set_color(PRINT_COLOR_YELLOW, PRINT_COLOR_BLACK);
print_str("Welcome to vgOS v0.1!!");
print_newline();
print_newline();
print_prompt();
}
int main(){
kernel_main();
init_idt();
enable_interrupts();
init_kb();
print_str("here");
print_newline();
while(1);
return 0;
}
input.h
#pragma once
#include <stdint.h>
#define IDT_SIZE 256
#define KERNEL_CODE_SEGMENT_OFFSET 0x8
#define IDT_INTERRUPT_GATE_64BIT 0x0e
#define PIC1_COMMAND_PORT 0x20
#define PIC1_DATA_PORT 0x21
#define PIC2_COMMAND_PORT 0xA0
#define PIC2_DATA_PORT 0xA1
#define KEYBOARD_DATA_PORT 0x60
#define KEYBOARD_STATUS_PORT 0x64
extern void load_gdt();
extern void load_idt(unsigned int idt_address);
extern void keyboard_handler();
extern char ioport_in(unsigned short port);
extern void ioport_out(unsigned short port, unsigned char data);
extern void enable_interrupts();
struct IDTPointer{
uint16_t limit;
unsigned long long base;
} __attribute__((packed));
struct IDTEntry{
uint16_t offset_1; // Offset bits 0-15
uint16_t selector; // Code segment selector
uint8_t ist; // Interrupt Stack Table offset
uint32_t zero;
uint8_t type_attr; // Gate, type, dpl and p fields
uint16_t offset_2; // Offset bits 16-31
uint32_t offset_3; // Offset bits 32-63
} __attribute__((packed));
void init_idt();
void init_kb();
input.c
#include "input.h"
#include "print.h"
// Declare IDT
struct IDTEntry IDT[IDT_SIZE];
void init_idt(){
// Set IDT keyboard entry
uint64_t offset = (uint64_t)keyboard_handler;
IDT[0x21].offset_1 = offset & 0x000000000000FFFF;
IDT[0x21].selector = KERNEL_CODE_SEGMENT_OFFSET;
IDT[0x21].ist = 0xE; // Set gate type to 'Interrupt'
IDT[0x21].zero = 0; // 0 for testing purposes
IDT[0x21].type_attr = IDT_INTERRUPT_GATE_64BIT;
IDT[0x21].offset_2 = (offset & 0x00000000FFFF0000) >> 16;
IDT[0x21].offset_3 = (offset & 0xFFFFFFFF00000000) >> 32;
// Setup ICWs
// ICW1
ioport_out(PIC1_COMMAND_PORT, 0x11);
ioport_out(PIC2_COMMAND_PORT, 0x11);
// ICW2
ioport_out(PIC1_DATA_PORT, 0x20);
ioport_out(PIC2_DATA_PORT, 0x28);
// ICW3
ioport_out(PIC1_DATA_PORT, 0x4);
ioport_out(PIC2_DATA_PORT, 0x2);
// ICW4
ioport_out(PIC1_DATA_PORT, 0x01);
ioport_out(PIC2_DATA_PORT, 0x01);
// Mask all interrupts
ioport_out(PIC1_DATA_PORT, 0xff);
ioport_out(PIC2_DATA_PORT, 0xff);
// Load IDT data structure
struct IDTPointer idt_ptr;
idt_ptr.limit = (sizeof(struct IDTEntry) * IDT_SIZE) - 1;
idt_ptr.base = (unsigned long long)(&IDT);
load_idt(&idt_ptr);
}
void init_kb(){
// 0xFD = 1111 1101 - Unmask IRQ1
ioport_out(PIC1_DATA_PORT, 0xFD);
}
void handle_keyboard_interrupt(){
ioport_out(PIC1_COMMAND_PORT, 0x20);
unsigned char status = ioport_in(KEYBOARD_STATUS_PORT);
if(status & 0x1){
char keycode = ioport_in(KEYBOARD_DATA_PORT);
if(keycode < 0) return;
print_char(keycode);
}
}
I need to do some operations with 48-bit variables, so I had two options:
Create my own structure with 48-bit variables, or
Use unsigned long long (64 bits).
As the operations will not overflow 48 bits, I considered 64-bit variables overkill, so I created a base structure
#ifdef __GNUC__
#define PACK( __Declaration__ ) __Declaration__ __attribute__((__packed__))
#endif
#ifdef _MSC_VER
#define PACK( __Declaration__ ) __pragma( pack(push, 1) ) __Declaration__ __pragma( pack(pop))
#endif
PACK(struct uint48 {
unsigned long long v : 48;
});
and wrote some code to compare the speed of the operations
#include <stdio.h>
#include <time.h>
#ifdef __GNUC__
#define PACK( __Declaration__ ) __Declaration__ __attribute__((__packed__))
#endif
#ifdef _MSC_VER
#define PACK( __Declaration__ ) __pragma( pack(push, 1) ) __Declaration__ __pragma( pack(pop))
#endif
PACK(struct uint48 {
unsigned long long v : 48;
});
void TestProductLong();
void TestProductLong02();
void TestProductPackedStruct();
void TestProductPackedStruct02();
clock_t start, end;
double cpu_time_used;
int cycleNumber = 100000;
int main(void)
{
TestProductLong();
TestProductLong02();
TestProductPackedStruct();
TestProductPackedStruct02();
return 0;
}
void TestProductLong() {
start = clock();
for (int i = 0; i < cycleNumber;i++) {
unsigned long long varlong01 = 155782;
unsigned long long varlong02 = 15519994;
unsigned long long product01 = varlong01 * varlong02;
unsigned long long varlong03 = 155782;
unsigned long long varlong04 = 15519994;
unsigned long long product02 = varlong03 * varlong04;
unsigned long long addition = product01 + product02;
}
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("TestProductLong() took %f seconds to execute \n", cpu_time_used);
}
void TestProductLong02() {
start = clock();
unsigned long long varlong01;
unsigned long long varlong02;
unsigned long long product01;
unsigned long long varlong03;
unsigned long long varlong04;
unsigned long long product02;
unsigned long long addition;
for (int i = 0; i < cycleNumber;i++) {
varlong01 = 155782;
varlong02 = 15519994;
product01 = varlong01 * varlong02;
varlong03 = 155782;
varlong04 = 15519994;
product02 = varlong03 * varlong04;
addition = product01 + product02;
}
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("TestProductLong02() took %f seconds to execute \n", cpu_time_used);
}
void TestProductPackedStruct() {
start = clock();
for (int i = 0; i < cycleNumber; i++) {
struct uint48 x01;
struct uint48 x02;
struct uint48 x03;
x01.v = 155782;
x02.v = 15519994;
x03.v = x01.v * x02.v;
struct uint48 x04;
struct uint48 x05;
struct uint48 x06;
x04.v = 155782;
x05.v = 15519994;
x06.v = x04.v * x05.v;
struct uint48 x07;
x07.v = x03.v + x06.v;
}
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("TestProductPackedStruct() took %f seconds to execute \n", cpu_time_used);
}
void TestProductPackedStruct02() {
start = clock();
struct uint48 x01;
struct uint48 x02;
struct uint48 x03;
struct uint48 x04;
struct uint48 x05;
struct uint48 x06;
struct uint48 x07;
for (int i = 0; i < cycleNumber; i++) {
x01.v = 155782;
x02.v = 15519994;
x03.v = x01.v * x02.v;
x04.v = 155782;
x05.v = 15519994;
x06.v = x04.v * x05.v;
x07.v = x03.v + x06.v;
}
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("TestProductPackedStruct02() took %f seconds to execute \n", cpu_time_used);
}
But I got the following results
TestProductLong() took 0.000188 seconds to execute
TestProductLong02() took 0.000198 seconds to execute
TestProductPackedStruct() took 0.001231 seconds to execute
TestProductPackedStruct02() took 0.001231 seconds to execute
So the operations using unsigned long long took less time than the ones using the packed structure.
Why is that?
Would it be better, then, to use unsigned long long instead?
Is there a better way to pack structures?
As I'm currently unrolling loops, using the right data structure could impact the performance of my application significantly.
Thank you.
Although you know that the operations on the 48-bit values will not overflow, a compiler cannot know this! Further, with the vast majority of compilers and platforms, your uint48 structure will actually be implemented as a 64-bit data type, of which only the low 48 bits will ever be used.
So, after any arithmetic (or other) operations on the .v data, the 'unused' 16 bits of the (internal) 64-bit representation will need to be cleared, to ensure that any future accesses to that data give the true, 48-bit-only value.
Thus, using the clang-cl compiler in Visual Studio 2019, the following (rather trivial) function using the native uint64_t type:
extern uint64_t add64(uint64_t a, uint64_t b) {
return a + b;
}
generates the expected, highly efficient assembly code:
lea rax, [rcx + rdx]
ret
However, using (an equivalent of) your 48-bit packed structure:
#pragma pack(push, 1)
typedef struct uint48 {
unsigned long long v : 48;
} uint48_t;
#pragma pack(pop)
extern uint48_t add48(uint48_t a, uint48_t b) {
uint48_t c;
c.v = a.v + b.v;
return c;
}
requires additional assembly code to ensure that any overflow into the 'unused' bits is discarded:
add rcx, rdx
movabs rax, 281474976710655 # This is 0x0000FFFFFFFFFFFF - clearing top 16 bits!
and rax, rcx
ret
Note that the MSVC compiler generates very similar code.
Thus, you should expect that using native uint64_t variables will generate more efficient code than your 'space-saving' structure.
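If you do want to keep 48-bit semantics, one mitigation (my own sketch, not part of the original answer) is to do the intermediate arithmetic in plain uint64_t and truncate only once, when the final result is stored:
#include <stdint.h>

#define MASK48 0x0000FFFFFFFFFFFFULL   /* same constant the compiler ANDs with */

/* Hypothetical helper: truncate to 48 bits only at the final store, so
   intermediate results stay in full 64-bit registers. */
static inline uint64_t to48(uint64_t x) {
    return x & MASK48;
}

uint64_t sum_of_products48(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
    return to48(a * b + c * d);   /* one AND here instead of one per bit-field assignment */
}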
Your test procedure is wrong. Why?
Packing a single-member struct actually does nothing.
You compiled with -O0, and measuring execution speed with no optimizations does not make any sense. If you compile it with optimizations, your code will be wiped out entirely :) https://godbolt.org/z/9ibP_8
Once you rework the code so it can be optimized without being removed (since you do not use the values, they have to be global or at least static, and you need to add a compiler memory barrier (clobber)):
https://godbolt.org/z/BL9uJE
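To illustrate that point with a sketch (my own example, assuming GCC/Clang inline-asm syntax, not the exact code behind the godbolt link): make the results static so they are not discarded, and add a compiler memory barrier inside the loop so the optimizer cannot remove the work.
static unsigned long long product01, product02, addition;

void TestProductLongOpt(void) {
    for (int i = 0; i < 100000; i++) {
        unsigned long long a = 155782, b = 15519994;
        product01 = a * b;
        product02 = a * b;
        addition  = product01 + product02;
        __asm__ volatile ("" ::: "memory");   /* compiler memory barrier (clobber) */
    }
}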
The difference comes from trimming the results to 48 bits.
If you pack the struct (which is not necessary here) you force the compiler to access the variables byte by byte, because only bytes are always aligned: https://godbolt.org/z/2iV7vq
You can also use a mixed approach, which is not portable as it relies on endianness and the bit-field implementation: https://godbolt.org/z/J3-it_
so the code will compile to:
unsigned long long:
mov QWORD PTR varlong01[rip], 155782
mov QWORD PTR varlong02[rip], 15519994
mov QWORD PTR product01[rip], rdx
mov QWORD PTR varlong03[rip], 155782
mov QWORD PTR varlong04[rip], 15519994
mov QWORD PTR product02[rip], rdx
mov QWORD PTR addition[rip], rcx
not packed struct
mov rdx, QWORD PTR x01[rip]
and rdx, rax
or rdx, 155782
mov QWORD PTR x01[rip], rdx
mov rdx, QWORD PTR x02[rip]
and rdx, rax
or rdx, 15519994
mov QWORD PTR x02[rip], rdx
mov rdx, QWORD PTR x03[rip]
and rdx, rax
or rdx, rsi
mov QWORD PTR x03[rip], rdx
mov rdx, QWORD PTR x04[rip]
and rdx, rax
or rdx, 155782
mov QWORD PTR x04[rip], rdx
mov rdx, QWORD PTR x05[rip]
and rdx, rax
or rdx, 15519994
mov QWORD PTR x05[rip], rdx
mov rdx, QWORD PTR x06[rip]
and rdx, rax
or rdx, rsi
mov QWORD PTR x06[rip], rdx
mov rdx, QWORD PTR x07[rip]
and rdx, rax
or rdx, rdi
mov QWORD PTR x07[rip], rdx
packed struct
mov BYTE PTR x01[rip], -122
mov BYTE PTR x01[rip+1], 96
mov BYTE PTR x01[rip+2], 2
mov BYTE PTR x01[rip+3], 0
mov BYTE PTR x01[rip+4], 0
mov BYTE PTR x01[rip+5], 0
mov BYTE PTR x02[rip], -6
mov BYTE PTR x02[rip+1], -48
mov BYTE PTR x02[rip+2], -20
mov BYTE PTR x02[rip+3], 0
mov BYTE PTR x02[rip+4], 0
mov BYTE PTR x02[rip+5], 0
mov BYTE PTR x03[rip], -36
mov BYTE PTR x03[rip+1], 34
mov BYTE PTR x03[rip+2], 71
mov BYTE PTR x03[rip+3], -20
mov BYTE PTR x03[rip+4], 50
mov BYTE PTR x03[rip+5], 2
mov BYTE PTR x04[rip], -122
mov BYTE PTR x04[rip+1], 96
mov BYTE PTR x04[rip+2], 2
mov BYTE PTR x04[rip+3], 0
mov BYTE PTR x04[rip+4], 0
mov BYTE PTR x04[rip+5], 0
mov BYTE PTR x05[rip], -6
mov BYTE PTR x05[rip+1], -48
mov BYTE PTR x05[rip+2], -20
mov BYTE PTR x05[rip+3], 0
mov BYTE PTR x05[rip+4], 0
mov BYTE PTR x05[rip+5], 0
mov BYTE PTR x06[rip], -36
mov BYTE PTR x06[rip+1], 34
mov BYTE PTR x06[rip+2], 71
mov BYTE PTR x06[rip+3], -20
mov BYTE PTR x06[rip+4], 50
mov BYTE PTR x06[rip+5], 2
mov BYTE PTR x07[rip], -72
mov BYTE PTR x07[rip+1], 69
mov BYTE PTR x07[rip+2], -114
mov BYTE PTR x07[rip+3], -40
mov BYTE PTR x07[rip+4], 101
mov BYTE PTR x07[rip+5], 4
I am trying to develop a simple interrupt-driven OS in C for x86. I have just installed an IDT and set up some ISRs (which seem to be working fine; I tested by printing 6/0, which triggers the divide-by-zero exception), and I am now at the stage where I am setting up some IRQs. The problem that seems to be occurring here is... well... nothing happens. Let me go step by step through everything I have done so far, so maybe we can identify the problem.
Step 1 - Set up and load the IDT
I know that each entry in the IDT is composed of some offset bits, a code segment selector, a zero entry, a type attribute, and some more offset bits. Because of this, my typedef for idtdesc_t looks like this
typedef struct {
uint16_t low;
uint16_t sel;
uint8_t null;
uint8_t tattr;
uint16_t high;
} __attribute__((packed)) idtdesc_t;
I know that the IDT is loaded using the lidt instruction, so I need to make a pointer type that is loadable by that instruction. With that in mind, here is the typedef I use for the IDT pointer
typedef struct {
uint16_t lim;
uint32_t base;
} __attribute__((packed)) idtptr_t;
This is all still in my idt.h header file. Finally, to be manipulated by the code itself, I made these two variables
idtptr_t idtr;
idtdesc_t idt[256];
And the functions to set the IDT gates and to clear the IDT look like this
uint8_t idt_install()
{
idtr.base = &idt;
idtr.lim = (sizeof (idtdesc_t) * 256) - 1;
for(size_t i = 0; i<256; i++) {
idt[i].low = 0;
idt[i].high = 0;
idt[i].null = 0;
idt[i].sel = 0;
idt[i].tattr = 0;
}
idt_load();
return 0;
}
void idt_setentry(uint8_t num, uint32_t base, uint16_t sel, uint8_t tattr)
{
idt[num].low = (base & 0xFFFF);
idt[num].high = (base >> 16) & 0xFFFF;
idt[num].null = 0;
idt[num].sel = sel;
idt[num].tattr = tattr;
}
Just for reference, my idt_load function is defined in a file called idt.asm like this
extern idtr
global idt_load
idt_load:
lidt[idtr]
ret
Now that this is done, we are ready to move on to step 2.
Step 2 - Write some ISR stubs and an ISR handler
Now, I need to write an ISR handler and some ISR stubs. I do it like this in a file called isr.asm
global isr0
...
global isr30
global isr31
extern handler
isr0:
cli
push byte 0
push byte 0
jmp isr_stub
...
isr30:
cli
push byte 0
push byte 30
jmp isr_stub
isr31:
cli
push byte 0
push byte 31
jmp isr_stub
isr_stub:
pusha
push ds
push es
push fs
push gs
mov ax, 0x10
mov ds, ax
mov es, ax
mov fs, ax
mov gs, ax
mov eax, esp
push eax
mov eax, handler
call eax
pop eax
pop gs
pop fs
pop es
pop ds
popa
add esp, 8
iret
Some of these ISRs push an error code, so I have pushed an error code for some of the ones that are not shown. I have a handler function in my C file, as one is used in the Assembly function. It takes a pointer to a regs_t typed variable called r, which represents the stack frame. The definition for regs_t looks like this
typedef struct {
uint32_t gs, fs, es, ds;
uint32_t edi, esi, ebp, esp, ebx, edx, ecx, eax;
uint32_t int_no, err_code;
uint32_t eip, cs, eflags, useresp, ss;
} regs_t;
So with all that said, here is my handler function
void handler(regs_t *r)
{
if(r->int_no < 32) {
terminal_puterr(emsgs[r->int_no], r->int_no);
for(;;);
}
}
And of course, I need to map the ISRs into the IDT (most of it taken out for compactness) as well.
uint8_t isr_install()
{
idt_setentry(0, (unsigned) isr0, 0x08, 0x8E);
idt_setentry(1, (unsigned) isr1, 0x08, 0x8E);
...
idt_setentry(30, (unsigned) isr30, 0x08, 0x8E);
idt_setentry(31, (unsigned) isr31, 0x08, 0x8E);
return 0;
}
Once I switched to GNU make, the handler caught the right exceptions when they were triggered in kmain(). Now the problematic part begins: once I load my IRQs in a very similar way, the CPU just triple faults. The only handler I have is the timer handler. Here is my code, starting with irq.asm
global irq0
...
global irq15
extern handler_irq
irq0:
cli
push byte 0
push byte 32
jmp irq_stub
...
irq15:
cli
push byte 0
push byte 47
jmp irq_stub
irq_stub:
pusha
push ds
push es
push fs
push gs
mov ax, 0x10
mov ds, ax
mov es, ax
mov fs, ax
mov gs, ax
mov eax, esp
mov eax, handler_irq
call eax
pop eax
pop gs
pop fs
pop es
pop ds
popa
add esp, 8
iret
And my irq.c, which contains my handlers and the remapping code
#include <irq.h>
void *irq_routines[16] = {
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
};
void irq_install_handler(int irq, void (*handler)(regs_t *r))
{
irq_routines[irq] = handler;
}
void irq_remove_handler(int irq)
{
irq_routines[irq] = 0;
}
void irq_remap(void)
{
outb(0x20, 0x11);
outb(0xA0, 0x11);
outb(0x21, 0x20);
outb(0xA1, 0x28);
outb(0x21, 0x04);
outb(0xA1, 0x02);
outb(0x21, 0x01);
outb(0xA1, 0x01);
outb(0x21, 0x0);
outb(0xA1, 0x0);
}
void handler_irq(regs_t *r)
{
void (*handler) (regs_t *r);
handler = irq_routines[r->int_no - 32];
if(handler) {
handler(r);
}
if(r->int_no >= 40) {
outb(0xA0, 0x20);
}
outb(0x20, 0x20);
}
void irq_install()
{
irq_remap();
idt_setentry(32, (unsigned)irq0, 0x08, 0x8E);
...
idt_setentry(47, (unsigned)irq15, 0x08, 0x8E);
}
And finally, I mapped a timer handler to IRQ 0 (for now, the code is from James Molloy's Kernel Development Tutorials)
#include <timer.h>
uint32_t tick = 0;
static void timer_callback(regs_t *r)
{
tick++;
terminal_puts("Tick: ");
terminal_putint(tick);
terminal_puts("\n");
}
void timer_install(uint32_t frequency)
{
// Firstly, register our timer callback.
irq_install_handler(0, &timer_callback);
// The value we send to the PIT is the value to divide it's input clock
// (1193180 Hz) by, to get our required frequency. Important to note is
// that the divisor must be small enough to fit into 16-bits.
uint32_t divisor = 1193180 / frequency;
// Send the command byte.
outb(0x43, 0x36);
// Divisor has to be sent byte-wise, so split here into upper/lower bytes.
uint8_t l = (uint8_t)(divisor & 0xFF);
uint8_t h = (uint8_t)( (divisor>>8) & 0xFF );
// Send the frequency divisor.
outb(0x40, l);
outb(0x40, h);
}
But when I try to install this handler in kmain.c, it results in a triple fault and not even my ISR handlers work anymore. Here is the code in kmain.c
#include <kernel.h>
void kmain(void)
{
interrupt_disable();
terminal_init(LGREEN, BLACK);
idt_install();
irq_install();
interrupt_enable();
terminal_puts("Hello, refined world!\n");
terminal_puts("This is a kernel\n");
terminal_puts("Working or not?\n");
timer_install(50);
while(1) {asm volatile ("cli"); asm volatile ("hlt");};
}
I have been struggling with this for days. What is the problem, and how do I fix it?
P.S.: some people suggested getting rid of the "cli" at the end, but it behaved the same and still resulted in a triple fault.
I am developing a kernel in C and wrote something to print to the screen through video memory. I expected that the first byte of each cell in video memory would be the character to print and the second byte would set the color. But my program does something different, and yet it works!! It is very unexpected and unusual.
My kernel code -
#define VIDEO_MEM 0xb8000
void write_string( int colour, const unsigned char *string );
void main()
{
unsigned char *vid = (unsigned char*) VIDEO_MEM;
int i=0;
for (i = 0; i < 2000; i++)
{
*vid = ' ';
*(vid+2) = 0x1f;
vid += 2;
}
write_string(0x1f,"The Kernel has been loaded successfully!!");
}
void write_string( int colour, const unsigned char *string ) {
unsigned char *vid = (unsigned char*) VIDEO_MEM;
while(*string != 0)
{
*(vid) = *string;
*(vid+2) = colour;
++string;
vid+=2;
}
}
It writes the character to *vid and the color to *(vid+2) and then increments vid by 2. The next character should then overwrite the color at *(vid+2), so the color should be lost, yet it still works.
Also, the color should really be at *(vid+1).
When I use *(vid+1) instead of *(vid+2) to print the string, the screen shows down-arrow characters (ASCII code 0x1f, which I wanted to be the color) replacing the entire string.
Why does the code behave so unusually??
Can anyone help?
EDIT
I have edited my code and now it prints the string. But another problem arose: I added support for printing on a particular line number, but now it shifts the string backwards by one character.
void write_string( int colour, const unsigned char *string, int pos ) {
unsigned char *vid = (unsigned char*) VIDEO_MEM;
vid+=pos*160;
while(*string != 0)
{
*vid = colour;
*(vid+1) = *string;
++string;
vid+=2;
}
}
So, if I tell it to print on line 10, it prints the first character in the last cell of line 9 and then continues.
I also have a character-printing function that just prints curly braces (}) instead of the given character, and that too one character before the given position (like the error in the write_string function). Also, it doesn't change the character's background color given as an argument.
void putChar(char character, short col, short row, char attr) {
unsigned char* vid_mem = (unsigned char *) VIDEO_MEM;
int offset = (row*80 + col)*2;
vid_mem += offset;
if(!attr) {
attr = 0x0f;
}
*vid_mem = (attr<<8)+character;
}
EDIT 2
My Boot Loader:
[org 0x7c00]
KERNEL equ 0x1000
mov [BOOT_DRIVE],dl
mov bp,0x9000
mov sp,bp
mov bx, msgReal
call print_string
call load_kernel
call switch_to_pm
jmp $
%include 'boot/bios.ASM'
%include 'boot/gdt.ASM'
%include 'boot/protected_mode.ASM'
%include 'boot/print32.ASM'
[bits 16]
load_kernel:
mov bx,msgKernel
call print_string
mov bx, KERNEL
mov dh, 15
mov dl, [BOOT_DRIVE]
call disk_load
ret
[bits 32]
BEGIN_PM:
mov ebx, msgProt
call print_string32
call KERNEL
jmp $
BOOT_DRIVE db 0
msgReal db "Booted in 16-bit mode",0
msgProt db "Successfully switched to 32-bit mode",0
msgKernel db "Loading the kernel onto memory",0
times 510-($-$$) db 0
dw 0xaa55
bios.ASM -
;BIOS Functions
[bits 16]
print_string:
pusha
mov cx,bx
mov ah,0x0e
printStringStart:
mov al,[bx]
cmp al,0
je done
int 0x10
inc bx
jmp printStringStart
done:
popa
ret
print_word:
pusha
mov ax,0x0000
mov cl,0x10
mov al,bh
div cl
call printDig
mov al,bh
and al,0x0f
call printDig
mov ax,0x0000
mov al,bl
div cl
call printDig
mov al,bl
and al,0x0f
call printDig
popa
ret
printDig:
cmp al,0x9
jg alpha
add al,'0'
mov ah,0x0e
int 0x10
jmp pDigDone
alpha:
sub al,0xa
add al,'A'
mov ah,0x0e
int 0x10
pDigDone:
ret
hex_prefix: db '0x',0
disk_load:
push dx
mov ah,0x02
mov al,dh
mov ch,0x00
mov dh,0x00
mov cl,0x02
int 0x13
jc disk_error
pop dx
cmp dh,al
jne disk_error
ret
disk_error:
mov ah,0x0e
mov al,'X'
int 0x10
mov bx,errMsg
call print_string
jmp $
errMsg:
db "Disk Read Error....."
times 80-20 db " "
db 0
gdt.ASM -
gdt_start:
gdt_null:
dd 0x0
dd 0x0
gdt_code:
dw 0xffff
dw 0x0
db 0x0
db 10011010b
db 11001111b
db 0x0
gdt_data:
dw 0xffff
dw 0x0
db 0x0
db 10010010b
db 11001111b
db 0x0
gdt_end:
gdt_descriptor:
dw gdt_end - gdt_start - 1
dd gdt_start
CODE_SEG equ gdt_code - gdt_start
DATA_SEG equ gdt_data - gdt_start
protected_mode.ASM -
[bits 16]
switch_to_pm:
cli
lgdt [gdt_descriptor]
mov eax, cr0
or eax, 0x1
mov cr0, eax
jmp CODE_SEG:init_pm
[bits 32]
init_pm:
mov ax, DATA_SEG
mov ds, ax
mov ss, ax
mov es, ax
mov fs, ax
mov gs, ax
mov ebp,0x90000
mov esp,0x90000
call BEGIN_PM
print32.ASM -
[bits 32]
VIDEO_MEM equ 0xb8000
DEF_COLOR equ 0x0f
print_string32:
pusha
mov edx,VIDEO_MEM
print_string32_loop:
mov al, [ebx]
mov ah, DEF_COLOR
cmp al,0
je print_string32_end
mov [edx],ax
inc ebx
add edx,2
jmp print_string32_loop
print_string32_end:
popa
ret
I also add a kernel_start.asm file just before the kernel when linking, to call the main function -
[bits 32]
[extern main]
call main
jmp $
And here's my make file -
C_SOURCES = $(wildcard drivers/*.c kernel/*.c)
HEADERS = $(wildcard kernel/*.h drivers/*.h)
OBJ = ${C_SOURCES:.c=.o}
all: os-image
os-image: boot/boot_sector.bin kernel.bin
cat $^ > $@
kernel.bin: kernel/kernel_start.o ${OBJ}
ld -o $@ -Ttext 0x1000 $^ --oformat binary
%.o : %.c
gcc -std=c99 -Wall -pedantic -ffreestanding -c $< -o $@
%.o : %.asm
nasm $< -f elf64 -o $@
%.bin : %.asm
nasm $< -f bin -o $@
clean:
rm -fr kernel/*.o
rm -fr drivers/*.o
rm -fr boot/*.bin
rm -fr os-image *.bin *.o
With the changes suggested in the other answer and the comments, your problem doesn't seem to be reproducible for me. The following code works for me. I've tried to maintain how you coded it just so it makes sense to you:
#define VIDEO_MEM 0xb8000
void write_string( unsigned char colour, const char *string );
void write_string_line( unsigned char colour, const char *string, int pos );
void putChar(char character, short col, short row, unsigned char attr);
/* Place this at top of file as first code in kernel.o */
__asm__ ("call main\r\n" \
"cli\r\n" \
"hlt\r\n"
);
void main()
{
volatile unsigned char *vid = (unsigned char*) VIDEO_MEM;
int i=0;
for (i = 0; i < 2000; i++)
{
*vid = ' ';
*(vid+1) = 0x1f;
vid += 2;
}
write_string(0x1f,"The Kernel has been loaded successfully!!");
write_string_line(0x1f,"Testing Here!!",1);
putChar('Z',3,3,0xf3);
}
void write_string( unsigned char colour, const char *string ) {
volatile unsigned char *vid = (unsigned char*) VIDEO_MEM;
while(*string != 0)
{
*(vid) = *string;
*(vid+1) = colour;
++string;
vid+=2;
}
}
void write_string_line( unsigned char colour, const char *string, int pos ) {
volatile unsigned char *vid = (unsigned char*) VIDEO_MEM;
vid+=pos*160;
while(*string != 0)
{
*vid = *string;
*(vid+1) = colour;
++string;
vid+=2;
}
}
void putChar(char character, short col, short row, unsigned char attr) {
volatile unsigned char* vid_mem = (unsigned char *) VIDEO_MEM;
int offset = (row*80 + col)*2;
vid_mem += offset;
if(!attr) {
attr = 0x0f;
}
*(unsigned short int *)vid_mem = (attr<<8)+character;
/* This would do the same as line above
*vid_mem = character;
*(vid_mem+1) = attr;
*/
}
I've added the __asm__ at the beginning to make sure that code is the first to appear in the generated object file. It likely works without it. I've modified all your *vid pointers to be volatile. Since video is memory-mapped I/O, you don't want the compiler to potentially remove screen writes when it optimizes. Your code will likely work without volatile, but it is proper to add it here to avoid potential problems.
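As a tiny illustration of why volatile matters here (my own sketch, not code from the question): without it the optimizer may treat video memory like ordinary RAM and merge or drop stores.
unsigned char *vid = (unsigned char *) 0xb8000;   /* non-volatile pointer */
*vid = 'A';   /* the compiler may discard this store...                   */
*vid = 'B';   /* ...and keep only the last one, since it cannot see the   */
              /* screen side effect; a volatile pointer forbids that.     */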
When run in BOCHS, this code produces the expected screen output.
If you use the code provided here and it doesn't work, that would suggest the issue is likely in the code in your bootloader that reads the disk, enables A20, sets up the GDT, enters protected mode, and then calls into your C code. It is also possible that problems could occur depending on how you compile and link your kernel.
Likely Cause of Undefined Behavior
After all the code and the make file were made available in EDIT 2, it became clear that one significant problem was that most of the code was compiled and linked into 64-bit objects and executables. That code won't work in 32-bit protected mode.
In the make file, make these adjustments:
When compiling with GCC you need to add the -m32 option
When assembling with the GNU Assembler (as) targeting 32-bit objects you need to use --32
When linking with LD you need to add the -melf_i386 option
When assembling with NASM targeting 32-bit objects you need to change -f elf64 to -f elf32
A preferable option to using a 64-bit compiler and tool chain from the host environment is to create a cross compiler toolchain for i686 or i386.
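For illustration, the relevant rules from the question's makefile would end up looking roughly like this (a sketch of the flag changes above, still using the 64-bit host toolchain rather than a cross compiler; recipe lines must start with a tab):
kernel.bin: kernel/kernel_start.o ${OBJ}
	ld -melf_i386 -o $@ -Ttext 0x1000 $^ --oformat binary
%.o : %.c
	gcc -m32 -std=c99 -Wall -pedantic -ffreestanding -c $< -o $@
%.o : %.asm
	nasm $< -f elf32 -o $@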
This should work.
Each VGA cell is 2 bytes long: the first byte stores the character while the second byte stores the color.
Also make sure you mark the pointer volatile, to avoid any unexpected changes (or optimizations) made by the compiler on that local field.
void write_string( int colour, const unsigned char *string )
{
volatile unsigned char *vid = (unsigned char*) VIDEO_MEM;
while( *string != 0 )
{
*vid++ = *string++;
*vid++ = colour;
}
}
You use *(vid) for the character and *(vid+1) for its color.
I'm developing software on an 8051 processor. A frequent job is to split a 16-bit address into its high and low bytes. I want to see how many ways there are to achieve it. The ways I have come up with so far are: (say ptr is a 16-bit pointer, and int is a 16-bit int) [note that rn and arn are registers]
bitwise operation
ADDH = (unsigned int) ptr >> 8;
ADDL = (unsigned int) ptr & 0x00FF;
SDCC gives the following assembly code
; t.c:32: ADDH = (unsigned int) ptr >> 8;
mov ar6,r3
mov ar7,r4
mov _main_ADDH_1_1,r7
; t.c:33: ADDL = (unsigned int) ptr & 0x00FF;
mov _main_ADDL_1_1,r6
Keil C51 gives me:
; SOURCE LINE # 32
0045 AA00 R MOV R2,ptr+01H
0047 A900 R MOV R1,ptr+02H
0049 AE02 MOV R6,AR2
004B EE MOV A,R6
004C F500 R MOV ADDH,A
; SOURCE LINE # 33
004E AF01 MOV R7,AR1
0050 EF MOV A,R7
0051 F500 R MOV ADDL,A
which has a lot of useless code IMHO.
pointer trick
ADDH = ((unsigned char *)&ptr)[0];
ADDL = ((unsigned char *)&ptr)[1];
SDCC gives me:
; t.c:37: ADDH = ((unsigned char *)&ptr)[0];
mov _main_ADDH_1_1,_main_ptr_1_1
; t.c:38: ADDL = ((unsigned char *)&ptr)[1];
mov _main_ADDL_1_1,(_main_ptr_1_1 + 0x0001)
Keil C51 gives me:
; SOURCE LINE # 37
006A 850000 R MOV ADDH,ptr
; SOURCE LINE # 38
006D 850000 R MOV ADDL,ptr+01H
which is the same as the SDCC version.
Andrey's mathematical approach
ADDH = ptr / 256;
ADDL = ptr % 256;
SDCC gives:
; t.c:42: ADDH = (unsigned int)ptr / 256;
mov ar5,r3
mov ar6,r4
mov ar7,r6
mov _main_ADDH_1_1,r7
; t.c:43: ADDL = (unsigned int)ptr % 256;
mov _main_ADDL_1_1,r5
I've no idea why SDCC uses the r7 register...
Keil C51 gives me:
; SOURCE LINE # 42
0079 AE00 R MOV R6,ptr
007B AF00 R MOV R7,ptr+01H
007D AA06 MOV R2,AR6
007F EA MOV A,R2
0080 F500 R MOV ADDH,A
; SOURCE LINE # 43
0082 8F00 R MOV ADDL,R7
I've no idea why Keil uses the R2 register either...
semaj's union approach
typedef union
{
unsigned short u16;
unsigned char u8[2];
} U16_U8;
U16_U8 ptr;
// Do something to set the variable ptr
ptr.u16 = ?;
ADDH = ptr.u8[0];
ADDL = ptr.u8[1];
SDCC gives me
; t.c:26: ADDH = uptr.u8[0];
mov _main_ADDH_1_1,_main_uptr_1_1
; t.c:27: ADDL = uptr.u8[1];
mov _main_ADDL_1_1,(_main_uptr_1_1 + 0x0001)
Keil C51 gives me:
; SOURCE LINE # 26
0028 850000 R MOV ADDH,uptr
; SOURCE LINE # 27
002B 850000 R MOV ADDL,uptr+01H
which is very similar to the pointer trick. However, this approach requires two more bytes of memory to store the union.
Does anyone have any other bright ideas? ;)
And can anyone tell me which way is more efficient?
In case anyone is interested, here is the test case:
typedef union
{
unsigned short u16;
unsigned char u8[2];
} U16_U8;
// call a function on the ADDs to avoid optimization
void swap(unsigned char *a, unsigned char *b)
{
unsigned char tm;
tm = *a;
*a = *b;
*b = tm;
}
main (void)
{
char c[] = "hello world.";
unsigned char xdata *ptr = (unsigned char xdata *)c;
unsigned char ADDH, ADDL;
unsigned char i = 0;
U16_U8 uptr;
uptr.u16 = (unsigned short)ptr;
for ( ; i < 4 ; i++, uptr.u16++){
ADDH = uptr.u8[0];
ADDL = uptr.u8[1];
swap(&ADDH, &ADDL);
}
for ( ; i < 4 ; i++, ptr++){
ADDH = (unsigned int) ptr >> 8;
ADDL = (unsigned int) ptr & 0x00FF;
swap(&ADDH, &ADDL);
}
for ( ; i < 4 ; i++, ptr++){
ADDH = ((unsigned char *)&ptr)[0];
ADDL = ((unsigned char *)&ptr)[1];
swap(&ADDH, &ADDL);
}
for ( ; i < 4 ; i++, ptr++){
ADDH = (unsigned int)ptr / 256;
ADDL = (unsigned int)ptr % 256;
swap(&ADDH, &ADDL);
}
}
The most efficient way is completely dependent on the compiler. You definitely have to figure out how to get an assembly listing from your compiler for an 8051 project.
One method you might try that is similar to those already mentioned is a union:
typedef union
{
unsigned short u16;
unsigned char u8[2];
} U16_U8;
U16_U8 ptr;
// Do something to set the variable ptr
ptr.u16 = ?;
ADDH = ptr.u8[0];
ADDL = ptr.u8[1];
Another not so bright way to split the address:
ADDH = ptr / 256;
ADDL = ptr % 256;
The most efficient is the first one, since it is done in a single instruction.
NO! I lied to you, sorry. I forgot that the 8051 instruction set has only 1-bit shift instructions. The second should be faster, but the compiler may generate stupid code, so beware and check the assembly code.
I just create two defines (as follows).
It seems more straightforward, and less error-prone.
#define HI(x) ((x) >> 8)
#define LO(x) ((x) & 0xFF)
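Usage would then look something like this (assuming ptr is first cast to an unsigned 16-bit value, as in the other approaches):
ADDH = HI((unsigned int)ptr);
ADDL = LO((unsigned int)ptr);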