I have a problem. I am making a simple OS kernel by following this tutorial: http://wiki.osdev.org/Bare_Bones#Linking_the_Kernel
However, when I try to link the files boot.o and kernel.o, the GCC toolchain returns this error:
boot.o: In function `start':
boot.asm:(.text+0x6): undefined reference to `kernel_main'
collect2.exe: error: ld returned 1 exit status.
sources of files:
boot.asm
; Declare constants used for creating a multiboot header.
MBALIGN equ 1<<0 ; align loaded modules on page boundaries
MEMINFO equ 1<<1 ; provide memory map
FLAGS equ MBALIGN | MEMINFO ; this is the Multiboot 'flag' field
MAGIC equ 0x1BADB002 ; 'magic number' lets bootloader find the header
CHECKSUM equ -(MAGIC + FLAGS) ; checksum of above, to prove we are multiboot
; Declare a header as in the Multiboot Standard. We put this into a special
; section so we can force the header to be in the start of the final program.
; You don't need to understand all these details as it is just magic values that
; is documented in the multiboot standard. The bootloader will search for this
; magic sequence and recognize us as a multiboot kernel.
section .multiboot
align 4
dd MAGIC
dd FLAGS
dd CHECKSUM
; Currently the stack pointer register (esp) points at anything and using it may
; cause massive harm. Instead, we'll provide our own stack. We will allocate
; room for a small temporary stack by creating a symbol at the bottom of it,
; then allocating 16384 bytes for it, and finally creating a symbol at the top.
; Temporary kernel stack: 16384 zero-filled bytes. stack_bottom labels the
; lowest address and stack_top the address just past the end, which is the
; value _start loads into esp (the stack grows downwards).
; The i386 System V ABI expects esp to be 16-byte aligned at a call and
; compiler-generated C code may rely on it, so align to 16 rather than 4.
; 16384 is a multiple of 16, so stack_top is aligned as well.
section .bootstrap_stack
align 16
stack_bottom:
times 16384 db 0
stack_top:
; The linker script specifies _start as the entry point to the kernel and the
; bootloader will jump to this position once the kernel has been loaded. It
; doesn't make sense to return from this function as the bootloader is gone.
section .text
global _start
_start:
; Welcome to kernel mode! We now have sufficient code for the bootloader to
; load and run our operating system. It doesn't do anything interesting yet.
; Perhaps we would like to call printf("Hello, World\n"). You should now
; realize one of the profound truths about kernel mode: There is nothing
; there unless you provide it yourself. There is no printf function. There
; is no <stdio.h> header. If you want a function, you will have to code it
; yourself. And that is one of the best things about kernel development:
; you get to make the entire system yourself. You have absolute and complete
; power over the machine, there are no security restrictions, no safe
; guards, no debugging mechanisms, there is nothing but what you build.
; By now, you are perhaps tired of assembly language. You realize some
; things simply cannot be done in C, such as making the multiboot header in
; the right section and setting up the stack. However, you would like to
; write the operating system in a higher level language, such as C or C++.
; To that end, the next task is preparing the processor for execution of
; such code. C doesn't expect much at this point and we only need to set up
; a stack. Note that the processor is not fully initialized yet and stuff
; such as floating point instructions are not available yet.
; To set up a stack, we simply set the esp register to point to the top of
; our stack (as it grows downwards).
mov esp, stack_top
; We are now ready to actually execute C code. We cannot embed that in an
; assembly file, so we'll create a kernel.c file in a moment. In that file,
; we'll create a C entry point called kernel_main and call it here.
extern kernel_main
call kernel_main
; In case the function returns, we'll want to put the computer into an
; infinite loop. To do that, we use the clear interrupt ('cli') instruction
; to disable interrupts, the halt instruction ('hlt') to stop the CPU until
; the next interrupt arrives, and jumping to the halt instruction if it ever
; continues execution, just to be safe.
cli
.hang:
hlt
jmp .hang
kernel.c
#if !defined(__cplusplus)
#include <stdbool.h> /* C doesn't have booleans by default. */
#endif
#include <stddef.h>
#include <stdint.h>
/* Check if the compiler thinks if we are targeting the wrong operating system. */
#if defined(__linux__)
#error "You are not using a cross-compiler, you will most certainly run into trouble"
#endif
/* This tutorial will only work for the 32-bit ix86 targets. */
#if !defined(__i386__)
#error "This tutorial needs to be compiled with a ix86-elf compiler"
#endif
/* Hardware text mode color constants. */
enum vga_color
{
COLOR_BLACK = 0,
COLOR_BLUE = 1,
COLOR_GREEN = 2,
COLOR_CYAN = 3,
COLOR_RED = 4,
COLOR_MAGENTA = 5,
COLOR_BROWN = 6,
COLOR_LIGHT_GREY = 7,
COLOR_DARK_GREY = 8,
COLOR_LIGHT_BLUE = 9,
COLOR_LIGHT_GREEN = 10,
COLOR_LIGHT_CYAN = 11,
COLOR_LIGHT_RED = 12,
COLOR_LIGHT_MAGENTA = 13,
COLOR_LIGHT_BROWN = 14,
COLOR_WHITE = 15,
};
uint8_t make_color(enum vga_color fg, enum vga_color bg)
{
return fg | bg << 4;
}
/*
 * Build a 16-bit VGA text-mode cell: character code in the low byte,
 * colour attribute in the high byte.
 *
 * c is converted through unsigned char first. Where plain char is signed,
 * a character code >= 0x80 would otherwise sign-extend to 0xFFxx and
 * overwrite the colour byte with all ones.
 */
uint16_t make_vgaentry(char c, uint8_t color)
{
	uint16_t c16 = (unsigned char) c;
	uint16_t color16 = color;
	return c16 | color16 << 8;
}
/* Count the bytes in str that precede the terminating NUL. */
size_t strlen(const char* str)
{
	const char* cursor = str;
	for ( ; *cursor != 0; cursor++ )
	{
		/* nothing: the loop header does the walking */
	}
	return (size_t) (cursor - str);
}
static const size_t VGA_WIDTH = 80;
static const size_t VGA_HEIGHT = 25;
size_t terminal_row;
size_t terminal_column;
uint8_t terminal_color;
uint16_t* terminal_buffer;
void terminal_initialize()
{
terminal_row = 0;
terminal_column = 0;
terminal_color = make_color(COLOR_LIGHT_GREY, COLOR_BLACK);
terminal_buffer = (uint16_t*) 0xB8000;
for ( size_t y = 0; y < VGA_HEIGHT; y++ )
{
for ( size_t x = 0; x < VGA_WIDTH; x++ )
{
const size_t index = y * VGA_WIDTH + x;
terminal_buffer[index] = make_vgaentry(' ', terminal_color);
}
}
}
void terminal_setcolor(uint8_t color)
{
terminal_color = color;
}
void terminal_putentryat(char c, uint8_t color, size_t x, size_t y)
{
const size_t index = y * VGA_WIDTH + x;
terminal_buffer[index] = make_vgaentry(c, color);
}
void terminal_putchar(char c)
{
terminal_putentryat(c, terminal_color, terminal_column, terminal_row);
if ( ++terminal_column == VGA_WIDTH )
{
terminal_column = 0;
if ( ++terminal_row == VGA_HEIGHT )
{
terminal_row = 0;
}
}
}
void terminal_writestring(const char* data)
{
size_t datalen = strlen(data);
for ( size_t i = 0; i < datalen; i++ )
terminal_putchar(data[i]);
}
void kernel_main()
{
terminal_initialize();
/* Since there is no support for newlines in terminal_putchar yet, \n will
produce some VGA specific character instead. This is normal. */
terminal_writestring("Hello\n");
}
It looks like you’re using GCC on Microsoft® Windows® (for example, with Cygwin), judging from the collect2.exe reference. This means the native object format you appear to be using prepends an underscore to C identifiers to keep them separate from assembly identifiers. Most object formats do this, but the ELF format widespread under modern Unix does not.
If you change your call to _kernel_main, the link error will likely go away.
But please note this line, quoted from your question:
#error "This tutorial needs to be compiled with a ix86-elf compiler"
You’re violating a basic tenet of the tutorial you’re using. I suggest you get a GNU/Linux or BSD VM for i386 (32-bit), and run the tutorial within that.
Related
This is a follow-up from the following question Custom Instruction crashing with SIGNAL 4 (Illegal Instruction): RISC-V (32) GNU-Toolchain with QEMU (apologies if I have missed any etiquette points in advance or formatted this in an unsavoury way, as I am still new to posting here). I am using the latest version of the riscv-gnu-toolchain, so I am just wondering what caveats and differences this process will be for this platform.
I was following namely two guides to add custom RISC-V instructions to QEMU - namely https://www.ashling.com/wp-content/uploads/QEMU_CUSTOM_INST_WP.pdf and https://chowdera.com/2021/04/20210430120004272q.html.
After editing qemu/target/riscv/insn32.decode with a free opcode (following the Ashling tutorial) and implementing the translator function in insn_trans/trans_rvi.c.inc, QEMU hangs when the instruction is invoked through an .insn directive (I know the directive itself works). Changing the opcodes of existing instructions to test whether the disassembly changes indicated to me that QEMU was not registering the changes I made and that I didn't rebuild/recompile QEMU correctly. I simply ran make clean, reconfigured and ran make again in the QEMU directory to rebuild it. Is this the correct way to rebuild QEMU in this toolchain, or is there something I missed?
The code to call
#include <stdio.h>
/*
 * Emit the custom R-type instruction through the assembler's .insn
 * directive (opcode 0x33, funct3 7, funct7 0x20) with a and b as the two
 * source register operands, and return the value the instruction leaves
 * in the destination register.
 */
static int test_ins(int a, int b) {
int result;
/* %0 is the output register (result); %1 and %2 are the inputs a and b. */
asm volatile(".insn r 0x33, 7, 0x20, %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
return result;
}
int main() {
int a, b, result;
a = 2;
b = 4;
result = test_ins(a,b);
printf("%d\n", result);
}
Instruction in insn32.decode is as follows:
OPCODE = "0110011", FUNCT3 = "111" and FUNCT7 = "0100000".
Implementation of aforementioned instruction is as follows:
/*
 * Translate the custom "bitcnt" instruction:
 *
 *     rd = popcount(rs1) + popcount(rs2)
 *
 * The result is the same as the earlier open-coded version, which counted
 * bits with two shift-and-add loops built from TCG labels and backward
 * branches. TCG provides a population-count op, so the generated code is
 * now straight-line: no labels, no branches, and therefore no need for
 * "local" temporaries that survive across a branch.
 */
static bool trans_bitcnt(DisasContext *ctx, arg_bitcnt *a)
{
    TCGv source1 = tcg_temp_new();
    TCGv source2 = tcg_temp_new();

    gen_get_gpr(source1, a->rs1);
    gen_get_gpr(source2, a->rs2);

    /* Count the set bits of each operand in place, then add the counts. */
    tcg_gen_ctpop_tl(source1, source1);
    tcg_gen_ctpop_tl(source2, source2);
    tcg_gen_add_tl(source1, source1, source2);

    /* Update the destination register with the bits total. */
    gen_set_gpr(a->rd, source1);

    tcg_temp_free(source1);
    tcg_temp_free(source2);
    return true;
}
I am developing an OS and want to get the EDID from the monitor. I found some asm code (https://wiki.osdev.org/EDID) that returns the EDID in the ES:DI registers:
mov ax, 0x4f15
mov bl, 0x01
xor cx, cx
xor dx, dx
int 0x10
;AL = 0x4F if function supported
;AH = status (0 is success, 1 is fail)
;ES:DI contains the EDID
How can I get AL, AH, and ES:DI values in C File?
Actually I am developing an 64 bit UEFI OS
;-----------------------------------------------------------------------
; LoadGDT
; In:    rdi = address of the descriptor-table operand handed to lgdt
;              (presumably the first C argument under the SysV AMD64
;              convention -- confirm against the C prototype)
; Out:   GDTR loaded; ds/es/fs/gs/ss = 0x10; cs = 0x08
; Clobb: rax, rdi
; Statement order matters: the return address is popped and re-pushed
; underneath the new cs value so that the far return consumes both.
;-----------------------------------------------------------------------
LoadGDT:
lgdt [rdi]
; Reload every data-segment register with selector 0x10
; (presumably the data descriptor of the new table -- verify against the GDT layout).
mov ax, 0x10
mov ds, ax
mov es, ax
mov fs, ax
mov gs, ax
mov ss, ax
; cs cannot be written with mov. Take the return address off the stack
; (assuming the routine was entered by a near call), push selector 0x08
; followed by that address, and let retfq pop both into rip and cs.
pop rdi
mov rax, 0x08
push rax
push rdi
retfq
GLOBAL LoadGDT
I am able to run these above asm code and get it in c using Global Functions in C,
That page on osdev.org contains code intended to be run when the CPU is in 16-bit real mode.
You can tell not only from the registers involved but also from the fact that int 10h is used.
This is a well-known BIOS interrupt service that is written in 16-bit real-mode code.
If you target UEFI, then your bootloader is actually an UEFI application, which is a PE32(+) image.
If the CPU is 64-bit capable, the firmware will switch into long mode (64-bit mode) and load your bootloader.
Otherwise, it will switch into protected mode (32-bit mode).
In any case, real mode is never used in UEFI.
You can call 16-bit code from protected/long mode with the use of a 16-bit code segment in the GDT/LDT but you cannot call real-mode code (i.e. code written to work with the real-mode segmentation) because segmentation works completely different between the modes.
Plus, in real mode the interrupts are dispatched through the IVT and not the IDT, you would need to get the original entry-point for interrupt 10h.
UEFI protocol EFI_EDID_DISCOVERED_PROTOCOL
Luckily, UEFI has a replacement for most basic services offered by the legacy BIOS interface.
In this case, you can use the EFI_EDID_DISCOVERED_PROTOCOL and eventually apply any override from the platform firmware with the use of EFI_EDID_OVERRIDE_PROTOCOL.
The EFI_EDID_DISCOVERED_PROTOCOL is straightforward to use, it's just a (Size, Data) pair.
typedef struct _EFI_EDID_DISCOVERED_PROTOCOL {
UINT32 SizeOfEdid;
UINT8 *Edid;
} EFI_EDID_DISCOVERED_PROTOCOL;
(from gnu-efi)
The format of the buffer Edid can be found in the VESA specification or even on Wikipedia.
As an example, I wrote a simple UEFI application with gnu-efi and x86_64-w64-mingw32 (a version of GCC and tools that target PEs).
I avoided using efilib.h in order to use gnu-efi just for the definitions of the structures related to UEFI.
The code sucks: it assumes at most 10 handles support the EDID protocol, and I wrote only a partial structure for the EDID data (because I got bored).
But this should be enough to get the idea.
NOTE That my VM didn't return any EDID information, so the code is not completely tested!
#include <efi.h>
//You are better off using this lib
//#include <efilib.h>
EFI_GUID gEfiEdidDiscoveredProtocolGuid = EFI_EDID_DISCOVERED_PROTOCOL_GUID;
EFI_SYSTEM_TABLE* gST = NULL;
/*
 * Leading fields of the 128-byte EDID base block, used as an overlay on
 * the raw buffer exposed by EFI_EDID_DISCOVERED_PROTOCOL.
 *
 * Bytes 20..23 of the block are four independent one-byte fields (video
 * input parameters, horizontal size, vertical size, gamma). Declaring
 * InputParams as UINT32 swallowed all four and moved HSize/VSize/Gamma
 * to offsets 24..26, which hold unrelated data.
 *
 * Every field below is byte-sized or naturally aligned at its offset, so
 * no packing pragma is needed as long as the struct ends here.
 */
typedef struct _EDID14 {
    UINT8 Signature[8];       /*  0..7  fixed header pattern */
    UINT16 ManufacturerID;    /*  8..9  three packed 5-bit letters, stored big-endian */
    UINT16 ManufacturerCode;  /* 10..11 */
    UINT32 Serial;            /* 12..15 */
    UINT8 Week;               /* 16 */
    UINT8 Year;               /* 17 */
    UINT8 Major;              /* 18 */
    UINT8 Minor;              /* 19 */
    UINT8 InputParams;        /* 20     video input parameters bitmap */
    UINT8 HSize;              /* 21     horizontal screen size, in cm */
    UINT8 VSize;              /* 22     vertical screen size, in cm */
    UINT8 Gamma;              /* 23 */
    //...Omitted...
} EDID14_RAW;
VOID Print(CHAR16* string)
{
gST->ConOut->OutputString(gST->ConOut, string);
}
VOID PrintHex(UINT64 number)
{
CHAR16* digits = L"0123456789abcdef";
CHAR16 buffer[2] = {0, 0};
for (INTN i = 64-4; i >= 0; i-=4)
{
buffer[0] = digits[(number >> i) & 0xf];
Print(buffer);
}
}
/*
 * Print number in decimal on the console.
 *
 * Digits are produced least-significant first and stored backwards into
 * buffer, which has room for the 20 digits of the largest UINT64 plus
 * the terminating NUL in buffer[20].
 */
VOID PrintDec(UINT64 number)
{
    CHAR16 buffer[21] = {0};
    UINTN i = 20;   /* index of the most recently stored digit */

    /* The former guard `i >= 0` was always true for an unsigned index,
       and a 20-digit value made the index wrap below zero. Pre-decrement
       keeps i within 0..19; the loop ends when number reaches zero,
       which takes at most 20 iterations. */
    do
    {
        buffer[--i] = L'0' + (number % 10);
        number = number / 10;
    }
    while (number);

    Print(buffer + i);
}
#define MANUFACTURER_DECODE_LETTER(x) ( L'A' + ( (x) & 0x1f ) - 1 )
/*
 * UEFI entry point: enumerate the handles that expose
 * EFI_EDID_DISCOVERED_PROTOCOL and, for each one that carries EDID data,
 * print the decoded manufacturer ID and the HSize/VSize fields.
 *
 * Returns the status of the first failing boot-service call, otherwise
 * EFI_SUCCESS. The handle buffer holds at most 10 handles.
 */
EFI_STATUS efi_main(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE* SystemTable)
{
    EFI_STATUS Status = EFI_SUCCESS;
    EFI_HANDLE EDIDHandles[10];
    UINTN Size = sizeof(EFI_HANDLE) * 10;
    EFI_EDID_DISCOVERED_PROTOCOL* EDID;

    gST = SystemTable;

    /* On input Size is the buffer capacity in bytes; on success it is the
       number of bytes actually filled with handles. */
    if ( EFI_ERROR( (Status = SystemTable->BootServices->LocateHandle(ByProtocol, &gEfiEdidDiscoveredProtocolGuid, NULL, &Size, EDIDHandles)) ) )
    {
        Print(L"Failed to get EDID handles: "); PrintHex(Status); Print(L"\r\n");
        return Status;
    }

    /* Unsigned index: Size/sizeof(EFI_HANDLE) is unsigned as well. */
    for (UINTN i = 0; i < Size/sizeof(EFI_HANDLE); i++)
    {
        /* Store the result: previously it was discarded, so the failure
           path printed and returned the stale EFI_SUCCESS. */
        Status = SystemTable->BootServices->OpenProtocol(
            EDIDHandles[i], &gEfiEdidDiscoveredProtocolGuid, (VOID**)&EDID, ImageHandle, NULL, EFI_OPEN_PROTOCOL_GET_PROTOCOL);
        if (EFI_ERROR(Status))
        {
            Print(L"Failed to get EDID info for handle "); PrintDec(i); Print(L": "); PrintHex(Status); Print(L"\r\n");
            return Status;
        }

        if (EDID->SizeOfEdid == 0 || EDID->Edid == NULL)
        {
            Print(L"No EDID data for handle "); PrintDec(i); Print(L"\r\n");
            continue;
        }

        /*
            THIS CODE IS NOT TESTED!
            ! ! ! D O N O T U S E ! ! !
        */
        EDID14_RAW* EdidData = (EDID14_RAW*)EDID->Edid;

        /* The manufacturer ID is stored big-endian in the EDID block, so
           assemble it from its two bytes instead of reading the field as
           a native (little-endian) UINT16. */
        const UINT8* IdBytes = (const UINT8*)&EdidData->ManufacturerID;
        UINT16 ManufacturerID = (UINT16)((IdBytes[0] << 8) | IdBytes[1]);

        CHAR16 Manufacturer[4] = {0};
        Manufacturer[0] = MANUFACTURER_DECODE_LETTER(ManufacturerID >> 10);
        Manufacturer[1] = MANUFACTURER_DECODE_LETTER(ManufacturerID >> 5);
        Manufacturer[2] = MANUFACTURER_DECODE_LETTER(ManufacturerID);

        Print(L"Manufacturer ID: "); Print(Manufacturer); Print(L"\r\n");
        /* NOTE(review): in the EDID base block the H/V size bytes are the
           physical screen size in centimetres, not a pixel resolution --
           confirm what this line is meant to report. */
        Print(L"Resolution: "); PrintDec(EdidData->HSize); Print(L"X"); PrintDec(EdidData->VSize); Print(L"\r\n");
    }

    return Status;
}
ACPI
If you don't want to use these UEFI protocols you can use ACPI. Each display output device has a _DDC method that is documented in the ACPI specification and can be used to return the EDID data (either as a buffer of 128 or 256 bytes).
This method is conceptually simple but in practice it requires writing a full-blown ACPI parser (including the AML VM) which is a lot of work.
However, ACPI is necessary for modern OSes and so you can use it, later on, to get the EDID data without having to worry about UEFI protocols.
My problem explained:
On my microcontroller (Atmel AT90CAN128) i have about 2500 bytes of RAM left.
In those 2500 bytes i need to store 5 times 100 data sets (size could change in the future). The data sets have a predefined but varying length between 1 and 9 bytes. The total bytes that the pure data sets occupy is about 2000 bytes. I now need to be able to access the data sets in an array like fashion by passing a uint8 to a function and get a pointer to the data set in return.
But i only have about 500 bytes left, so an array with pointers to each data set (calculated at start of run time) is simply not possible.
My attempt:
i use one big uint8 array[2000] (in RAM) and the length of the data sets is stored in flash as const uint8[] = {1, 5, 9, ...};.
The position of the data set in the big array is the accumulated length of the sets before it. So i would have to iterate through the length array and add the values up and then use it as an offset to the pointer of the big data array.
At runtime this gives me bad performance. The position of the data sets within the big array IS KNOWN at compile time, I just dont know how to put this information into an array that the compiler can store into flash.
As the amount of data sets could change, i need a solution that automatically calculates the positions.
Goal:
something like that
uint8 index = 57;
uint8 *pointer_to_data = pointer_array[57];
Is this even possible, given that the compiler is a one-pass compiler?
(I am using Codevision, not avr gcc)
My solution
The pure C solution/answer is technically the right answer for my question but it just seems overly complicated (from my perspective). The idea with the build script seemed better but codevision is not very practical in that way.
So i ended up with a bit of a mix.
I wrote a javascript that writes the C code/definition of the variables for me. The raw-definitions are easy to edit and i just copy paste the whole thing into a html text file and open it in a browser and copy paste the content back into my C file.
In the beginning i was missing a crucial element and that is the position of the 'flash' keyword in the definition. The following is a simplified output of my javascript that compiles just the way i like it.
flash uint8 len[150] = {4, 4, 0, 2, ...};
uint8 data1[241] = {0}; //accumulated from above
uint8 * flash pointers_1[150] = {data1 +0, data1 +4, data1 +0, data1 +8, ...};
The ugly part (lots of manual labor without script) is adding up the length for each pointer as the compiler will only compile if the pointer is increased by a constant and not a value stored in a constant array.
The raw definitions that are fed to the javascript then look like this
var strings = [
"len[0] = 4;",
"len[1] = 4;",
"len[3] = 2;",
...
Within the javascript it is an array of strings, this way i could copy my old definitions into it and just add some quotes. I only need to define the ones that i want to use, index 2 is not defined and the script uses length 0 for it but does include it. The macro would have needed an entry with 0 i guess, which is bad for overview in my case.
It is not a one click solution but it is very readable and tidy which makes up for the copy-paste.
One common method of packing variable-length data sets to a single continuous array is using one element to describe the length of the next data sequence, followed by that many data items, with a zero length terminating the array.
In other words, if you have data "strings" 1, 2 3, 4 5 6, and 7 8 9 10, you can pack them into an array of 1+1+1+2+1+3+1+4+1 = 15 bytes as 1 1 2 2 3 3 4 5 6 4 7 8 9 10 0.
The functions to access said sequences are quite simple, too. In OP's case, each data item is an uint8:
uint8 dataset[] = { ..., 0 };
To loop over each set, you use two variables: one for the offset of current set, and another for the length:
uint16 offset = 0;
while (1) {
const uint8 length = dataset[offset];
if (!length) {
offset = 0;
break;
} else
++offset;
/* You have 'length' uint8's at dataset+offset. */
/* Skip to next set. */
offset += length;
}
To find a specific dataset, you do need to find it using a loop. For example:
uint8 *find_dataset(const uint16 index)
{
uint16 offset = 0;
uint16 count = 0;
while (1) {
const uint8 length = dataset[offset];
if (length == 0)
return NULL;
else
if (count == index)
return dataset + offset;
offset += 1 + length;
count++;
}
}
The above function will return a pointer to the length item of the index'th set (0 referring to the first set, 1 to the second set, and so on), or NULL if there is no such set.
It is not difficult to write functions to remove, append, prepend, and insert new sets. (When prepending and inserting, you do need to copy the rest of the elements in the dataset array forward (to higher indexes), by 1+length elements, first; this means that you cannot access the array in an interrupt context or from a second core, while the array is being modified.)
If the data is immutable (for example, generated whenever a new firmware is uploaded to the microcontroller), and you have sufficient flash/rom available, you can use a separate array for each set, an array of pointers to each set, and an array of sizes of each set:
static const uint8 dataset_0[] PROGMEM = { 1 };
static const uint8 dataset_1[] PROGMEM = { 2, 3 };
static const uint8 dataset_2[] PROGMEM = { 4, 5, 6 };
static const uint8 dataset_3[] PROGMEM = { 7, 8, 9, 10 };
#define DATASETS 4
/* Table of pointers to the individual sets, stored in flash.
   The second const makes the table itself read-only (the first only
   covers the bytes pointed to); avr-gcc rejects PROGMEM on an object
   that is not const. */
static const uint8 *const dataset_ptr[DATASETS] PROGMEM = {
    dataset_0,
    dataset_1,
    dataset_2,
    dataset_3,
};
static const uint8 dataset_len[DATASETS] PROGMEM = {
sizeof dataset_0,
sizeof dataset_1,
sizeof dataset_2,
sizeof dataset_3,
};
When this data is generated at firmware compile time, it is common to put this into a separate header file, and simply include it from the main firmware .c source file (or, if the firmware is very complicated, from the specific .c source file that accesses the data sets). If the above is dataset.h, then the source file typically contains say
#include "dataset.h"
/* Length of set `index`, or 0 when index is out of range.
   dataset_len is declared PROGMEM, so it is fetched with pgm_read_byte();
   a plain array access on AVR reads the data address space instead of
   flash. */
const uint8 dataset_length(const uint16 index)
{
    return (index < DATASETS) ? pgm_read_byte(&dataset_len[index]) : 0;
}

/* Flash address of set `index`, or NULL when index is out of range.
   The pointer table is itself in flash, hence pgm_read_word(). The
   returned pointer refers to program memory (the _P suffix): read
   through it with pgm_read_byte(), do not dereference it directly. */
const uint8 *dataset_pointer_P(const uint16 index)
{
    return (index < DATASETS) ? (const uint8 *)pgm_read_word(&dataset_ptr[index]) : NULL;
}
i.e., it includes the dataset, and then defines the functions that access the data. (Note that I deliberately made the data itself static, so it is only visible in the current compilation unit; but dataset_length() and dataset_pointer_P(), the safe accessor functions, are accessible from other compilation units (C source files), too.)
When the build is controlled via a Makefile, this is trivial. Let's say the generated header file is dataset.h, and you have a shell script, say generate-dataset.sh, that generates the contents for that header. Then, the Makefile recipe is simply
# Regenerate dataset.h whenever the generator script changes.
# $@ expands to the target (dataset.h) and $^ to the prerequisites (the
# script). The leading @ stops make from echoing the cleanup command.
dataset.h: generate-dataset.sh
	@$(RM) $@
	$(SHELL) -c "$^ > $@"
with the recipes for the compilation of the C source files that need it, containing it as a prerequisite:
main.o: main.c dataset.h
$(CC) $(CFLAGS) -c main.c
Do note that the indentation in Makefiles always uses Tabs, but this forum does not reproduce them in code snippets. (You can always run sed -e 's|^ *|\t|g' -i Makefile to fix copy-pasted Makefiles, though.)
OP mentioned that they are using Codevision, that does not use Makefiles (but a menu-driven configuration system). If Codevision does not provide a pre-build hook (to run an executable or script before compiling the source files), then OP can write a script or program run on the host machine, perhaps named pre-build, that regenerates all generated header files, and run it by hand before every build.
In the hybrid case, where you know the length of each data set at compile time, and it is immutable (constant), but the sets themselves vary at run time, you need to use a helper script to generate a rather large C header (or source) file. (It will have 1500 lines or more, and nobody should have to maintain that by hand.)
The idea is that you first declare each data set, but do not initialize them. This makes the C compiler reserve RAM for each:
static uint8 dataset_0_0[3];
static uint8 dataset_0_1[2];
static uint8 dataset_0_2[9];
static uint8 dataset_0_3[4];
/* : : */
static uint8 dataset_0_97[1];
static uint8 dataset_0_98[5];
static uint8 dataset_0_99[7];
static uint8 dataset_1_0[6];
static uint8 dataset_1_1[8];
/* : : */
static uint8 dataset_1_98[2];
static uint8 dataset_1_99[3];
static uint8 dataset_2_0[5];
/* : : : */
static uint8 dataset_4_99[9];
Next, declare an array that specifies the length of each set. Make this constant and PROGMEM, since it is immutable and goes into flash/rom:
static const uint8 dataset_len[5][100] PROGMEM = {
sizeof dataset_0_0, sizeof dataset_0_1, sizeof dataset_0_2,
/* ... */
sizeof dataset_4_97, sizeof dataset_4_98, sizeof dataset_4_99
};
Instead of the sizeof statements, you can also have your script output the lengths of each set as a decimal value.
Finally, create an array of pointers to the datasets. This array itself will be immutable (const and PROGMEM), but the targets, the datasets defined first above, are mutable:
static uint8 *const dataset_ptr[5][100] PROGMEM = {
dataset_0_0, dataset_0_1, dataset_0_2, dataset_0_3,
/* ... */
dataset_4_96, dataset_4_97, dataset_4_98, dataset_4_99
};
On AT90CAN128, the flash memory is at addresses 0x0 .. 0x1FFFF (131072 bytes total). Internal SRAM is at addresses 0x0100 .. 0x10FF (4096 bytes total). Like other AVRs, it uses Harvard architecture, where code resides in a separate address space -- in Flash. It has separate instructions for reading bytes from flash (LPM, ELPM).
Because a 16-bit pointer can only reach half the flash, it is rather important that the dataset_len and dataset_ptr arrays are "near", in the lower 64k. Your compiler should take care of this, though.
To generate correct code for accessing the arrays from flash (progmem), at least AVR-GCC needs some helper code:
#include <avr/pgmspace.h>
uint8 subset_len(const uint8 group, const uint8 set)
{
return pgm_read_byte_near(&(dataset_len[group][set]));
}
uint8 *subset_ptr(const uint8 group, const uint8 set)
{
return (uint8 *)pgm_read_word_near(&(dataset_ptr[group][set]));
}
The assembly code, annotated with the cycle counts, avr-gcc-4.9.2 generates for at90can128 from above, is
subset_len:
ldi r25, 0 ; 1 cycle
movw r30, r24 ; 1 cycle
lsl r30 ; 1 cycle
rol r31 ; 1 cycle
add r30, r24 ; 1 cycle
adc r31, r25 ; 1 cycle
add r30, r22 ; 1 cycle
adc r31, __zero_reg__ ; 1 cycle
subi r30, lo8(-(dataset_len)) ; 1 cycle
sbci r31, hi8(-(dataset_len)) ; 1 cycle
lpm r24, Z ; 3 cycles
ret
subset_ptr:
ldi r25, 0 ; 1 cycle
movw r30, r24 ; 1 cycle
lsl r30 ; 1 cycle
rol r31 ; 1 cycle
add r30, r24 ; 1 cycle
adc r31, r25 ; 1 cycle
add r30, r22 ; 1 cycle
adc r31, __zero_reg__ ; 1 cycle
lsl r30 ; 1 cycle
rol r31 ; 1 cycle
subi r30, lo8(-(dataset_ptr)) ; 1 cycle
sbci r31, hi8(-(dataset_ptr)) ; 1 cycle
lpm r24, Z+ ; 3 cycles
lpm r25, Z ; 3 cycles
ret
Of course, declaring subset_len and subset_ptr as static inline would indicate to the compiler you want them inlined, which increases the code size a bit, but might shave off a couple of cycles per invocation.
Note that I have verified the above (except using unsigned char instead of uint8) for at90can128 using avr-gcc 4.9.2.
First, you should put the predefined length array in flash using PROGMEM, if you haven't already.
You could write a script, using the predefined length array as input, to generate a .c (or cpp) file that contains the PROGMEM array definition. Here is an example in python:
# Generate PointerArray.c from the length table in DataLengthArray.c.
#
# Assume the array that defines the data length is in a file named
# DataLengthArray.c and the array is of the format
# const uint16 dataLengthArray[] PROGMEM = {
# 2, 4, 5, 1, 2,
# 4 ... };
#
# Each emitted entry is `array + <sum of all preceding lengths>`.
START_OF_ARRAY = "const uint16 dataLengthArray[] PROGMEM = {"

# Read the input first, so a missing or malformed input file does not
# leave a truncated PointerArray.c behind.
with open("DataLengthArray.c") as f:
    fc = f.read().replace('\n', '')

start = fc.find(START_OF_ARRAY)
if start < 0:
    raise SystemExit("DataLengthArray.c: array definition not found")
body = fc[start + len(START_OF_ARRAY):]

end = body.find("}")
if end < 0:
    raise SystemExit("DataLengthArray.c: closing brace of the array not found")
body = body[:end]

# Skip empty fields so that a trailing comma before '}' is accepted.
lengths = [int(field) for field in body.split(",") if field.strip()]

# The with-block closes (and flushes) the output file even on error.
with open('PointerArray.c', 'w') as outFile:
    outFile.write("extern uint8 array[2000];\n")
    outFile.write("uint8* pointer_array[] PROGMEM = {\n")
    position = 0  # running offset; avoids shadowing the builtin sum()
    for length in lengths:
        outFile.write("array + {}, ".format(position))
        position += length
    outFile.write("};")
Which would output PointerArray.c:
extern uint8 array[2000];
uint8* pointer_array[] = {
array + 0, array + 2, array + 6, array + 11, array + 12, array + 14, };
You could run the script as a Pre-build event, if your IDE supports it. Otherwise you will have to remember to run the script every time you update the offsets.
You mention that the data set lengths are pre-defined, but not how they are defined - so I'm going to make the assumption of how the lengths are written into code is up for grabs..
If you define your flash array in terms of offsets instead of lengths, you should immediately get a run-time benefit.
With lengths in flash, I expect you have something like this:
const uint8_t lengths[] = {1, 5, 9, ...};
uint8_t get_data_set_length(uint16_t index)
{
return lengths[index];
}
/* Return the address of data set `index` inside array[].
   The offset of a set is the sum of the lengths of all sets before it,
   so the loop adds lengths[i]. Adding lengths[index] on every pass gave
   index * lengths[index], which is only right by coincidence. */
uint8_t * get_data_set_pointer(uint16_t index)
{
    uint16_t offset = 0;
    uint16_t i;
    for ( i = 0; i < index; ++i )
    {
        offset += lengths[i];
    }
    return &(array[offset]);
}
With offsets in flash, the const array goes from uint8_t to uint16_t, which doubles the flash usage, plus one additional element at the end to speed up calculating the length of the last element.
const uint16_t offsets[] = {0, 1, 6, 15, ..., /* last offset + last length */ };
uint8_t get_data_set_length(uint16_t index)
{
return offsets[index+1] - offsets[index];
}
uint8_t * get_data_set_pointer(uint16_t index)
{
uint16_t offset = offsets[index];
return &(array[offset]);
}
If you can't afford that extra flash memory, you could also combine the two by storing the lengths for all elements and the offsets for only a fraction of the indices, e.g. every 16th element in the example below, trading off run-time cost against flash memory cost.
uint8_t get_data_set_length(uint16_t index)
{
return lengths[index];
}
/* Return the address of data set `index` inside array[], using one stored
   offset per group of 16 sets plus the per-set lengths.
   Start from the stored offset of the group that contains index, then
   add the lengths of the sets between the group start and index. The
   loop must add lengths[i]; it previously added lengths[index] each
   time. */
uint8_t * get_data_set_pointer(uint16_t index)
{
    uint16_t i;
    uint16_t offset = offsets[index / 16];
    /* index & 0xFFF0u is the first index of the group (index rounded
       down to a multiple of 16). */
    for ( i = index & 0xFFF0u; i < index; ++i )
    {
        offset += lengths[i];
    }
    return &(array[offset]);
}
To simplify the encoding, you can consider using x-macros, e.g.
#define DATA_SET_X_MACRO(data_set_expansion) \
data_set_expansion( A, 1 ) \
data_set_expansion( B, 5 ) \
data_set_expansion( C, 9 )
uint8_t array[2000];
#define count_struct(tag,len) uint8_t tag;
#define offset_struct(tag,len) uint8_t tag[len];
#define offset_array(tag,len) (uint16_t)(offsetof(data_set_offset_struct,tag)),
#define length_array(tag,len) len,
#define pointer_array(tag,len) (&(array[offsetof(data_set_offset_struct,tag)])),
typedef struct
{
DATA_SET_X_MACRO(count_struct)
} data_set_count_struct;
typedef struct
{
DATA_SET_X_MACRO(offset_struct)
} data_set_offset_struct;
const uint16_t offsets[] =
{
DATA_SET_X_MACRO(offset_array)
};
const uint16_t lengths[] =
{
DATA_SET_X_MACRO(length_array)
};
uint8_t * const pointers[] =
{
DATA_SET_X_MACRO(pointer_array)
};
The preprocessor turns that into:
/* One byte per data set: sizeof gives the number of sets. */
typedef struct
{
    uint8_t A;
    uint8_t B;
    uint8_t C;
} data_set_count_struct;
/* One member per data set, sized to that set: offsetof gives each set's
   offset and sizeof the total. It is defined exactly once; a second
   identical typedef would be a redefinition error. */
typedef struct
{
    uint8_t A[1];
    uint8_t B[5];
    uint8_t C[9];
} data_set_offset_struct;
const uint16_t offsets[] = { 0,1,6, };
const uint16_t lengths[] = { 1,5,9, };
uint8_t * const pointers[] =
{
    array+0,
    array+1,
    array+6,
};
This just shows an example of what the x-macro can expand to. A short main() can show these in action:
/* Print what the X-macro generated: the number of sets, their total
   size, and the length, offset and address of every set. */
int main()
{
    size_t i;
    printf("There are %d individual data sets\n", (int)sizeof(data_set_count_struct) );
    printf("The total size of the data sets is %d\n", (int)sizeof(data_set_offset_struct) );
    /* A pointer is not a valid argument for %x (undefined behaviour, and
       truncated wherever pointers are wider than int). Convert through
       uintptr_t and print as unsigned long long, which keeps the same
       bare-hex text. */
    printf("The data array base address is %llx\n", (unsigned long long)(uintptr_t)array );
    /* size_t index: sizeof yields an unsigned value. */
    for ( i = 0; i < sizeof(data_set_count_struct); ++i )
    {
        printf( "elem %d: %d bytes at offset %d, or address %llx\n", (int)i, (int)lengths[i], (int)offsets[i], (unsigned long long)(uintptr_t)pointers[i]);
    }
    return 0;
}
With sample output
There are 3 individual data sets
The total size of the data sets is 15
The data array base address is 601060
elem 0: 1 bytes at offset 0, or address 601060
elem 1: 5 bytes at offset 1, or address 601061
elem 2: 9 bytes at offset 6, or address 601066
The above require you to give a 'tag' - a valid C identifier for each data set, but if you have 500 of these, pairing each length with a descriptor is probably not a bad thing. With that amount of data, I would also recommend using an include file for the x-macro, rather than a #define, in particular if the data set definitions can be exported somewhere else.
The benefit of this approach is that you have the data sets defined in one place, and everything is generated from this one definition. If you re-order the definition, or add to it, the arrays will be generated at compile-time. It is also purely using the compiler toolchain, in particular the pre-processor, but there's no need for writing external scripts or hooking in pre-build scripts.
You said that you want to store the address of each data set but it seems like it would be much simpler if you store the offset of each data set. Storing the offsets instead of the addresses means that you don't need to know the address of big array at compile time.
Right now you have an array of constants containing the length of each data set.
const uint8_t data_set_lengths[] = { 1, 5, 9...};
Just change that to be an array of constants containing the offset of each data set in the big array.
/* NOTE(review): uint8_t entries can only represent offsets up to 255; if the combined length of the data sets exceeds that, a wider element type is presumably needed - confirm. */
const uint8_t data_set_offsets[] = { 0, 1, 6, 15, ...};
You should be able to calculate these offsets at design time given that you already know the lengths. You said yourself, just accumulate the lengths to get the offsets.
With the offsets precalculated the code won't have the bad performance of accumulating at run time. And you can find the address of any data set at run time simply by adding the data set's offset to the address of the big array. And the address of big array doesn't need to be settled until link time.
I wrote some code for initializing the IDT, which stores 32-bit addresses in two non-adjacent 16-bit halves. The IDT can be stored anywhere, and you tell the CPU where by running the LIDT instruction.
This is the code for initializing the table:
void idt_init(void) {
/* Unfortunately, we can't write this as loops. The first option,
* initializing the IDT with the addresses, here looping over it, and
* reinitializing the descriptors didn't work because assigning a
* a uintptr_t (from (uintptr_t) handler_func) to a descr (a.k.a.
* uint64_t), according to the compiler, "isn't computable at load
* time."
* The second option, storing the addresses as a local array, simply is
* inefficient (took 0.020ms more when profiling with the "time" command
* line program!).
* The third option, storing the addresses as a static local array,
* consumes too much space (the array will probably never be used again
* during the whole kernel runtime).
* But IF my argument against the third option will be invalidated in
* the future, THEN it's the best option I think. */
/* Initialize descriptors of exception handlers. */
idt[EX_DE_VEC] = idt_trap(ex_de);
idt[EX_DB_VEC] = idt_trap(ex_db);
idt[EX_NMI_VEC] = idt_trap(ex_nmi);
idt[EX_BP_VEC] = idt_trap(ex_bp);
idt[EX_OF_VEC] = idt_trap(ex_of);
idt[EX_BR_VEC] = idt_trap(ex_br);
idt[EX_UD_VEC] = idt_trap(ex_ud);
idt[EX_NM_VEC] = idt_trap(ex_nm);
idt[EX_DF_VEC] = idt_trap(ex_df);
idt[9] = idt_trap(ex_res); /* unused Coprocessor Segment Overrun */
idt[EX_TS_VEC] = idt_trap(ex_ts);
idt[EX_NP_VEC] = idt_trap(ex_np);
idt[EX_SS_VEC] = idt_trap(ex_ss);
idt[EX_GP_VEC] = idt_trap(ex_gp);
idt[EX_PF_VEC] = idt_trap(ex_pf);
idt[15] = idt_trap(ex_res);
idt[EX_MF_VEC] = idt_trap(ex_mf);
idt[EX_AC_VEC] = idt_trap(ex_ac);
idt[EX_MC_VEC] = idt_trap(ex_mc);
idt[EX_XM_VEC] = idt_trap(ex_xm);
idt[EX_VE_VEC] = idt_trap(ex_ve);
/* Initialize descriptors of reserved exceptions.
* Thankfully we compile with -std=c11, so declarations within
* for-loops are possible! */
for (size_t i = 21; i < 32; ++i)
idt[i] = idt_trap(ex_res);
/* Initialize descriptors of hardware interrupt handlers (ISRs). */
idt[INT_8253_VEC] = idt_int(int_8253);
idt[INT_8042_VEC] = idt_int(int_8042);
idt[INT_CASC_VEC] = idt_int(int_casc);
idt[INT_SERIAL2_VEC] = idt_int(int_serial2);
idt[INT_SERIAL1_VEC] = idt_int(int_serial1);
idt[INT_PARALL2_VEC] = idt_int(int_parall2);
idt[INT_FLOPPY_VEC] = idt_int(int_floppy);
idt[INT_PARALL1_VEC] = idt_int(int_parall1);
idt[INT_RTC_VEC] = idt_int(int_rtc);
idt[INT_ACPI_VEC] = idt_int(int_acpi);
idt[INT_OPEN2_VEC] = idt_int(int_open2);
idt[INT_OPEN1_VEC] = idt_int(int_open1);
idt[INT_MOUSE_VEC] = idt_int(int_mouse);
idt[INT_FPU_VEC] = idt_int(int_fpu);
idt[INT_PRIM_ATA_VEC] = idt_int(int_prim_ata);
idt[INT_SEC_ATA_VEC] = idt_int(int_sec_ata);
for (size_t i = 0x30; i < IDT_SIZE; ++i)
idt[i] = idt_trap(ex_res);
}
The macros idt_trap and idt_int are defined as follows:
/*
 * Build a 32-bit-mode IDT gate descriptor as one 64-bit value:
 *   bits  0..15  handler offset, low half
 *   bits 16..31  code segment selector (KERN_CODE)
 *   bits 40..43  gate type
 *   bits 45..46  privilege level
 *   bit  47      present flag
 *   bits 48..63  handler offset, high half
 *
 * The high half of the offset already occupies bits 16..31 of the address,
 * so it must be shifted by 0x20 to land in bits 48..63; a shift of 0x30
 * pushes it past bit 63 and the upper offset is lost. The selector field is
 * 16 bits wide, so KERN_CODE is masked with 0xffff. The whole expansion is
 * parenthesized so the macro can be used inside larger expressions.
 *
 * idt_int(off)  - descriptor with gate type 0x0e and privilege level 0.
 * idt_trap(off) - descriptor with gate type 0x0f and privilege level 0.
 */
#define idt_entry(off, type, priv) \
	( ((descr) (uintptr_t) (off) & 0xffff) \
	| ((descr) (KERN_CODE & 0xffff) << 0x10) \
	| ((descr) ((type) & 0x0f) << 0x28) \
	| ((descr) ((priv) & 0x03) << 0x2d) \
	| ((descr) 0x800000000000) \
	| ((descr) ((uintptr_t) (off) & 0xffff0000) << 0x20) )
#define idt_int(off) idt_entry(off, 0x0e, 0x00)
#define idt_trap(off) idt_entry(off, 0x0f, 0x00)
idt is an array of uint64_t, so these macros are implicitly cast to that type. uintptr_t is the type guaranteed to be capable of holding pointer values as integers and on 32-bit systems usually 32 bits wide. (A 64-bit IDT has 16-byte entries; this code is for 32-bit).
I get the warning that the initializer element is not constant due to the address modification in play.
It is absolutely sure that the address is known at linking time.
Is there anything I can do to make this work? Making the idt array automatic would work but this would require the whole kernel to run in the context of one function and this would be some bad hassle, I think.
I could make this work by some additional work at runtime (as Linux 0.01 also does) but it just annoys me that something technically feasible at linking time is actually infeasible.
Related: Solution needed for building a static IDT and GDT at assemble/compile/link time - a linker script for ld can shift and mask to break up link-time-constant addresses. No earlier step has the final addresses, and relocation entries are limited in what they can represent in a .o.
The main problem is that function addresses are link-time constants, not strictly compile time constants. The compiler can't just get 32b binary integers and stick that into the data segment in two separate pieces. Instead, it has to use the object file format to indicate to the linker where it should fill in the final value (+ offset) of which symbol when linking is done. The common cases are as an immediate operand to an instruction, a displacement in an effective address, or a value in the data section. (But in all those cases it's still just filling in 32-bit absolute address so all 3 use the same ELF relocation type. There's a different relocation for relative displacements for jump / call offsets.)
It would have been possible for ELF to have been designed to store a symbol reference to be substituted at link time with a complex function of an address (or at least high / low halves like on MIPS for lui $t0, %hi(symbol) / ori $t0, $t0, %lo(symbol) to build address constants from two 16-bit immediates). But in fact the only function allowed is addition/subtraction, for use in things like mov eax, [ext_symbol + 16].
It is of course possible for your OS kernel binary to have a static IDT with fully resolved addresses at build time, so all you need to do at runtime is execute a single lidt instruction. However, the standard
build toolchain is an obstacle. You probably can't achieve this without post-processing your executable.
e.g. you could write it this way, to produce a table with the full padding in the final binary, so the data can be shuffled in-place:
#include <stdint.h>
#define PACKED __attribute__((packed))
// Note, this is the 32-bit format. 64-bit is larger
// One IDT entry viewed through two packed layouts of the same bytes:
// .rt is the layout the table must have at run time, .ct is the layout the
// static initializers produce. The two differ only in which 16-bit field
// sits after the low offset half and which sits at the end of the entry.
typedef union idt_entry {
// we will postprocess the linker output to have this format
// (or convert at runtime)
struct PACKED runtime { // from OSdev wiki
uint16_t offset_1; // offset bits 0..15
uint16_t selector; // a code segment selector in GDT or LDT
uint8_t zero; // unused, set to 0
uint8_t type_attr; // gate type and attribute bits
uint16_t offset_2; // offset bits 16..31
} rt;
// linker output will be in this format
struct PACKED compiletime {
void *ptr; // offset bits 0..31 (assumes a 4-byte pointer, i.e. a 32-bit build)
uint8_t zero; // same position as rt.zero
uint8_t type_attr; // same position as rt.type_attr
uint16_t selector; // to be swapped with the high16 of ptr
} ct;
} idt_entry;
// #define idt_ct_entry(off, type, priv) { .ptr = off, .type_attr = type, .selector = priv }
// Initializer for one idt_entry in the compile-time (.ct) layout: the handler
// address goes into .ptr whole, and type_attr is fixed at 0x0f.
// NOTE(review): .selector is initialized to 0x00; a real table presumably needs
// the kernel's code segment selector here - confirm before use.
#define idt_ct_trap(off) { .ct = { .ptr = off, .type_attr = 0x0f, .selector = 0x00 } }
// generate an entry in compile-time format
extern void ex_de(); // these are the raw interrupt handlers, written in ASM
extern void ex_db(); // they have to save/restore *all* registers, and end with iret, rather than the usual C ABI.
// it might be easier to use asm macros to create this static data,
// just so it can be in the same file and you don't need cross-file prototypes / declarations
// (but all the same limitations about link-time constants apply)
// Static IDT image. Every initializer uses the compile-time (.ct) layout, so
// the entries still need the 16-bit swap done by idt_convert_to_runtime()
// before they are in the runtime (.rt) layout.
static idt_entry idt[] = {
idt_ct_trap(ex_de),
idt_ct_trap(ex_db),
// ...
};
// having this static probably takes less space than instructions to write it on the fly
// but not much more. It would be easy to make a lidt function that took a struct pointer.
// Packed operand handed to lidt by load_static_idt(): a 16-bit length field
// followed directly by the address of the idt array.
static const struct PACKED idt_ptr {
uint16_t len; // encoded as bytes - 1, so 0xffff means 65536
void *ptr; // address of the first entry of idt[]
} idt_ptr = { sizeof(idt) - 1, idt };
/****** functions *********/
// Execute lidt on the static idt_ptr descriptor. (Candidate for inlining.)
void load_static_idt(void) {
    // idt_ptr is passed as an "m" (memory) operand so the compiler chooses the
    // addressing mode itself: that permits a RIP-relative addressing mode in
    // 64bit mode and works whether or not -masm=intel is in effect.
    asm volatile ("lidt %0" : /* no outputs */ : "m" (idt_ptr));
}
// Convert every entry of idt[] from the compile-time layout to the runtime
// layout by exchanging the two 16-bit fields rt.selector (bytes 2..3) and
// rt.offset_2 (bytes 6..7).
// Do this once at run-time
// **OR** run this to pre-process the binary, after link time, as part of your build
void idt_convert_to_runtime(void) {
#ifdef DEBUG
    // Swapping a second time would undo the conversion, so a repeated call is
    // ignored. (The earlier placeholder statement `error;` named an undeclared
    // identifier and stopped DEBUG builds from compiling.)
    static char already_done = 0;
    if (already_done)
        return;
    already_done = 1;
#endif

    const int count = sizeof idt / sizeof idt[0];
    for (int i=0 ; i<count ; i++) {
        uint16_t tmp1 = idt[i].rt.selector;
        uint16_t tmp2 = idt[i].rt.offset_2;
        idt[i].rt.offset_2 = tmp1;
        idt[i].rt.selector = tmp2;
        // or do this swap in fewer insns with SSE or MMX pshufw, but using vector instructions before setting up the IDT may be insane.
    }
}
This does compile. See a diff of the -m32 and -m64 asm output on the Godbolt compiler explorer. Look at the layout in the data section (note that .value is a synonym for .short, and is 16 bits.) (But note that the IDT table format is different for 64-bit mode.)
I think I have the size calculation correct (bytes - 1), as documented in http://wiki.osdev.org/Interrupt_Descriptor_Table. Minimum value 100h bytes long (encoded as 0xff). See also https://en.wikibooks.org/wiki/X86_Assembly/Global_Descriptor_Table. (lgdt size/pointer works the same way, although the table itself has a different format.)
The other option, instead of having the IDT static in the data section, is to have it in the bss section, with the data stored as immediate constants in a function that will initialize it (or in an array read by that function).
Either way, that function (and its data) can be in a .init section whose memory you re-use after it's done. (Linux does this to reclaim memory from code and data that's only needed once, at startup.) This would give you the optimal tradeoff of small binary size (since 32b addresses are smaller than 64b IDT entries), and no runtime memory wasted on code to set up the IDT. A small loop that runs once at startup is negligible CPU time. (The version on Godbolt fully unrolls because I only have 2 entries, and it embeds the address into each instruction as a 32-bit immediate, even with -Os. With a large enough table (just copy/paste to duplicate a line) you get a compact loop even at -O3. The threshold is lower for -Os.)
Without memory-reuse haxx, probably a tight loop to rewrite 64b entries in place is the way to go. Doing it at build time would be even better, but then you'd need a custom tool to run the transformation on the kernel binary.
Having the data stored in immediates sounds good in theory, but the code for each entry would probably total more than 64b, because it couldn't loop. The code to split an address into two would have to be fully unrolled (or placed in a function and called). Even if you had a loop to store all the same-for-multiple-entries stuff, each pointer would need a mov r32, imm32 to get the address in a register, then mov word [idt+i + 0], ax / shr eax, 16 / mov word [idt+i + 6], ax. That's a lot of machine-code bytes.
One way would be to use an intermediate jump table that is located at a fixed address. You could initialize the idt with addresses of the locations in this table (which will be compile time constant). The locations in the jump table will contain jump instructions to the actual isr routines.
The dispatch to an isr will be indirect as follows:
trap -> jump to intermediate address in the idt -> jump to isr
One way to create a jump table at a fixed address is as follows.
Step 1: Put jump table in a section
// this is a jump table at a fixed address
// The declaration assigns jump() to the ".si.idt" section so the linker script
// can pin it to a known address.
void jump(void) __attribute__((section(".si.idt")));
// Each asm statement emits one jump to an ISR; the statements form the table
// slots in order.
// NOTE(review): JUMP_ADDR's arithmetic (+ 4, * 5) presumes 4 bytes of code
// precede the first jmp and that every jmp occupies 5 bytes - this depends on
// compiler and flags; confirm against a disassembly.
void jump(void) {
asm("jmp isr0"); // can also be asm("call ...") depending on need
asm("jmp isr1");
asm("jmp isr2");
}
Step 2: Instruct the linker to locate the section at a fixed address
/* Collect every input section named .si.idt into the output section .so.idt,
 * placed at address 0x600000. */
SECTIONS
{
.so.idt 0x600000 :
{
*(.si.idt)
}
}
Put this in the linker script right after the .text section. This will make sure that the executable code in the table will go into an executable memory region.
You can instruct the linker to use your script as follows using the --script option in the Makefile.
LDFLAGS += -Wl,--script=my_script.lds
The following macro gives you the address of the location which contains the jump (or call) instruction to the corresponding isr.
// initialize the idt at compile time with const values
// you can find a cleaner way to generate offsets
// Address of jump-table slot `off`: base 0x600000, plus 4 bytes skipped at the
// start of jump(), plus 5 bytes for every preceding slot.
// `off` is parenthesized so that expression arguments such as `n + 1` are
// scaled as a whole.
#define JUMP_ADDR(off) ((char*)0x600000 + 4 + ((off) * 5))
You will then initialize the idt as follows using modified macros.
// your real idt will be initialized as follows
// idt_entry packs one 32-bit-mode gate descriptor into a 64-bit value:
//   bits  0..15  low half of addr      bits 16..31  selector (KERN_CODE)
//   bits 40..43  gate type             bits 45..46  privilege level
//   bit  47      present flag          bits 48..63  high half of addr
// The high half of addr already sits in bits 16..31, so it is shifted by 0x20
// to reach bits 48..63 (a shift of 0x30 would push it past bit 63 and drop
// it). The selector field is 16 bits wide, hence the 0xffff mask.
// idt_int / idt_trap take a jump-table slot index and build the descriptor
// for that slot's fixed address (gate type 0x0e / 0x0f, privilege level 0).
#define idt_entry(addr, type, priv) \
( \
((descr) (uintptr_t) (addr) & 0xffff) | \
((descr) (KERN_CODE & 0xffff) << 0x10) | \
((descr) ((type) & 0x0f) << 0x28) | \
((descr) ((priv) & 0x03) << 0x2d) | \
((descr) 0x1 << 0x2F) | \
((descr) ((uintptr_t) (addr) & 0xffff0000) << 0x20) \
)
#define idt_int(off) idt_entry(JUMP_ADDR(off), 0x0e, 0x00)
#define idt_trap(off) idt_entry(JUMP_ADDR(off), 0x0f, 0x00)
descr idt[] =
{
...
idt_trap(ex_de),
...
idt_int(int_casc),
...
};
A working demo is below; it shows dispatch to an ISR with a non-fixed address from an instruction at a fixed address.
#include <stdio.h>
// Stand-in ISRs for the demo: each one just writes its own name to stdout.
void isr0(void)
{
    fputs("==== isr0\n", stdout);
}

void isr1(void)
{
    fputs("==== isr1\n", stdout);
}

void isr2(void)
{
    fputs("==== isr2\n", stdout);
}
// this is a jump table at a fixed address
// The declaration assigns jump() to the ".si.idt" section; the table only ends
// up at a fixed address if the linker script places that section there.
void jump(void) __attribute__((section(".si.idt")));
// One jmp per ISR, in slot order.
// NOTE(review): JUMP_ADDR presumes 4 bytes precede the first jmp and each jmp
// is 5 bytes long - compiler/flag dependent; confirm against a disassembly.
void jump(void) {
asm("jmp isr0"); // can be asm("call ...")
asm("jmp isr1");
asm("jmp isr2");
}
// initialize the idt at compile time with const values
// you can find a cleaner way to generate offsets
// Address of jump-table slot `off`: base 0x600000, plus 4 bytes skipped at the
// start of jump(), plus 5 bytes for every preceding slot.
// `off` is parenthesized so that expression arguments such as `n + 1` are
// scaled as a whole.
#define JUMP_ADDR(off) ((char*)0x600000 + 4 + ((off) * 5))
// dummy idt for demo
// see below for the real idt
// Each element is the fixed address of one slot of the jump table, so the
// whole initializer is made of compile-time constants.
char* idt[] =
{
JUMP_ADDR(0),
JUMP_ADDR(1),
JUMP_ADDR(2),
};
// Demo entry point: the number of command-line arguments selects the slot
// (no arguments -> slot 0, one argument -> slot 1, ...). Prints the slot's
// address and then jumps to it; control does not come back here through a
// normal return.
// Returns 1 without jumping when the selected slot does not exist.
int main(int argc, char* argv[]) {
    const int slot_count = (int)(sizeof idt / sizeof idt[0]);
    const int trap = argc - 1;
    char* addr;

    (void)argv;

    // idt[] only has slot_count entries; indexing past them would fetch a
    // garbage jump target.
    if (trap < 0 || trap >= slot_count) {
        fprintf(stderr, "trap %d is out of range (0..%d)\n", trap, slot_count - 1);
        return 1;
    }

    addr = idt[trap];
    printf("==== idt[%d]=%p\n", trap, (void*)addr);
    asm("jmp *%0\n" : :"m"(addr));
}
I've made a driver for Windows, compiled it and tried to start it via SC manager, but I get the system error from the SC manager API:
ERROR_PROC_NOT_FOUND The specified procedure could not be found.
Is there a way to get more information about why exactly the driver fails to start?
WinDbg or something? If I comment out all code in my DriverEntry routine, the driver starts.
The only thing I'm calling is a procedure in another source module (in my own project, though). I can comment out all external dependencies and I still get the same error.
Edit:
I've also tried different DDKs, i.e. 2003 DDK and Vista WDK (but not Win7 WDK)
Edit2:
Here is my driver source code file driver.cpp:
#ifdef __cplusplus
extern "C" {
#endif
#include <ntddk.h>
#include <ntstrsafe.h>
#ifdef __cplusplus
}; // extern "C"
#endif
#include "../distorm/src/distorm.h"
// Unload callback that DriverEntry stores in DriverObject->DriverUnload.
// The body is empty: no cleanup is performed.
void DriverUnload(IN PDRIVER_OBJECT DriverObject)
{
}
#define MAX_INSTRUCTIONS 20
#ifdef __cplusplus
extern "C" {
#endif
// Driver entry point.
// Registers DriverUnload, looks up the address of KeBugCheck with
// MmGetSystemRoutineAddress and disassembles the first `len` bytes found
// there with diStorm, sending one line per decoded instruction to the
// debugger via DbgPrint.
//
// DriverObject - driver object whose DriverUnload field is filled in.
// RegistryPath - not used.
// Always returns STATUS_SUCCESS.
NTSTATUS DriverEntry(IN PDRIVER_OBJECT DriverObject, IN PUNICODE_STRING RegistryPath)
{
    UNICODE_STRING pFcnName;
    // Holds the result of the decoding.
    _DecodeResult res;
    // Decoded instruction information.
    _DecodedInst decodedInstructions[MAX_INSTRUCTIONS];
    // next is used for instruction's offset synchronization.
    // decodedInstructionsCount holds the count of filled instructions' array by the decoder.
    unsigned int decodedInstructionsCount = 0, i, next;
    // Decoding mode is fixed at 32 bits.
    _DecodeType dt = Decode32Bits;
    // Virtual address reported for the first decoded byte; set below.
    _OffsetType offset = 0;
    // Buffer to disassemble.
    char *buf;
    int len = 100;

    UNREFERENCED_PARAMETER(RegistryPath);

    // Register unload routine
    DriverObject->DriverUnload = DriverUnload;
    DbgPrint("diStorm Loaded!\n");

    // Get address of KeBugCheck
    RtlInitUnicodeString(&pFcnName, L"KeBugCheck");
    buf = (char *)MmGetSystemRoutineAddress(&pFcnName);
    // Go through ULONG_PTR so the address is neither sign-extended nor cut
    // down to 32 bits on its way into _OffsetType.
    offset = (_OffsetType)(ULONG_PTR)buf;
    // %p is the pointer conversion; %x expects an unsigned int and drops the
    // upper half of a 64-bit pointer.
    DbgPrint("Resolving KeBugCheck # 0x%p\n", buf);

    // Decode the buffer at given offset (virtual address).
    while (1) {
        res = distorm_decode(offset, (const unsigned char*)buf, len, dt, decodedInstructions, MAX_INSTRUCTIONS, &decodedInstructionsCount);
        if (res == DECRES_INPUTERR) {
            DbgPrint("NULL Buffer?!\n");
            break;
        }

        for (i = 0; i < decodedInstructionsCount; i++) {
            // Note that we print the offset as a 64 bits variable!!!
            // It might be that you'll have to change it to %08X...
            DbgPrint("%08I64x (%02d) %s %s %s\n", decodedInstructions[i].offset, decodedInstructions[i].size,
                     (char*)decodedInstructions[i].instructionHex.p,
                     (char*)decodedInstructions[i].mnemonic.p,
                     (char*)decodedInstructions[i].operands.p);
        }

        if (res == DECRES_SUCCESS || decodedInstructionsCount == 0) {
            break; // All instructions were decoded.
        }

        // Synchronize: the output array filled up, so continue right after
        // the last instruction that was decoded.
        next = (unsigned int)(decodedInstructions[decodedInstructionsCount-1].offset - offset);
        next += decodedInstructions[decodedInstructionsCount-1].size;

        // Advance ptr and recalc offset.
        buf += next;
        len -= next;
        offset += next;
    }

    DbgPrint("Done!\n");
    return STATUS_SUCCESS;
}
#ifdef __cplusplus
}; // extern "C"
#endif
My directory structure is like this:
base_dir\driver\driver.cpp
\distorm\src\all_the_c_files
\distorm\distorm.h
\distorm\config.h
My SOURCES file:
# $Id$
TARGETNAME=driver
TARGETPATH=obj
TARGETTYPE=DRIVER
# Additional defines for the C/C++ preprocessor
C_DEFINES=$(C_DEFINES) -DSUPPORT_64BIT_OFFSET
SOURCES=driver.cpp \
distorm_dummy.c \
drvversion.rc
INCLUDES=..\distorm\src;
# NOTE(review): ntdll.lib is the import library for the user-mode ntdll.dll; a
# kernel-mode driver presumably cannot resolve imports from it at load time,
# which might explain ERROR_PROC_NOT_FOUND - confirm by inspecting the built
# driver's import table.
TARGETLIBS=$(DDK_LIB_PATH)\ntdll.lib \
$(DDK_LIB_PATH)\ntstrsafe.lib
You can download diStorm from here: http://ragestorm.net/distorm/dl.php?id=8
distorm_dummy is the same as the dummy.c from the diStorm lib.
Enable "Show loader snaps" using gflags -- in the debug output, you should find information about which import the loader is not able to resolve.
Not surprisingly, you have all the information you need to solve this on your own.
ERROR_PROC_NOT_FOUND The specified procedure could not be found.
This, combined with your dependency Walker output, pretty much points to a broken Import Table
Why is your IT broken? I'm not sure, could be a problem with your build/linker settings, since rather obviously, HAL.DLL is right there in %windir%\system32.
Reasons for a broken load order are many and you'll have to track them down yourself.
Have you tried running Dependency Walker on the compiled .sys and see if there is actually some missing function imports?
Build it with the 6000 WDK/DDK (because with the "actual" Build 7600... it links against wdfldr.sys, but under Windows Vista and XP Systems this sys file is not available).
I don't know where you can download it officially but i did use a torrent...
You can add deferred breakpoints in WinDbg.
If you specify a breakpoint, while the driver is not loaded (or with bu), it will be triggered, when the driver does get loaded and enters the function.
The command for specifying breakpoints is:
bp <module_name>!<function_name>
e.g. :
bp my_driver!DriverEntry