I have a USB device that outputs data of size of one byte, and I want to pass these bytes to FPGA component that exists on AXI bridge, FPGA and CPU are on the same chip... it's SoC FPGA Altera Cyclone V. CPU is ARM Cortex-A9. Kernel version 3.7.0.
There is a software that reads from the USB device and writes to a dump file... it works just fine. I tried to use mmap() to map the FPGA address to the virtual space and write to it from the userspace. When doing so... after say a minute, the kernel seem to crash.
I wrote a driver for my FPGA component and I passed the driver path to that software as a file, so that it writes to it, and eventually to my FPGA component, but the same result... kernel crashes again after a random time.
I also wrote a simple program that reads bytes from a local file and pass it to FPGA... this works fine either ways (using mmap() or driver module), the file passes through to the FPGA with no problems at all no matter how big is the file.
So the problem is when passing from USB device to FPGA, either using mmap() or a driver module.
Here is a sample crash message:
Internal error: Oops - undefined instruction: 0 [#1] SMP ARM
Modules linked in: ipv6
CPU: 1 Not tainted (3.7.0 #106)
PC is at scheduler_ipi+0x8/0x4c
LR is at handle_IPI+0x10c/0x19c
pc : [<800521a0>] lr : [<800140d4>] psr: 80000193
sp : bf87ff58 ip : 8056acc8 fp : 00000000
r10: 00000000 r9 : 413fc090 r8 : 00000001
r7 : 00000000 r6 : bf87e000 r5 : 80535018 r4 : 8053eec0
r3 : 8056ac80 r2 : bf87ff58 r1 : 00000482 r0 : 00000481
Flags: Nzcv IRQs off FIQs on Mode SVC_32 ISA ARM Segment kernel
Control: 10c5387d Table: 3f0c404a DAC: 00000015
Process swapper/1 (pid: 0, stack limit = 0xbf87e240)
Stack: (0xbf87ff58 to 0xbf880000)
ff40: 00000000 800140d4
ff60: fffec10c 8053e418 bf87ff90 fffec100 8000f6e0 8000851c 8000f708 8000f70c
ff80: 60000013 ffffffff bf87ffc4 8000e180 00000000 00000000 00000001 00000000
ffa0: bf87e000 80565688 803ddfb0 80541fc8 8000f6e0 413fc090 00000000 00000000
ffc0: 8053e9b8 bf87ffd8 8000f708 8000f70c 60000013 ffffffff 00000020 8000f894
ffe0: 3f86c06a 00000015 10c0387d 805658d8 0000406a 003d1ee8 31ca2085 5c1021c3
Code: eaffffad 80564700 e92d4800 e1a0200d (4c4c9b50)
---[ end trace 9e492cde975c41f9 ]---
Other crash messages start like:
Unable to handle kernel paging request at virtual address 2a7a4390
Internal error: Oops - bad syscall: ebcffb [#1] SMP ARM
pgd = bf318000
[2a7a4390] *pgd=00000000
And:
Internal error: Oops - undefined instruction: 0 [#2] SMP ARM
Modules linked in: ipv6
CPU: 1 Tainted: G D (3.7.0 #106)
Here is the full crash messages.
I noticed that all the crash messages I get intersect with the PC and LR locations, but actually I don't have previous experience with Linux kernel. I found similar error messages online but none of the proposed solutions worked for me.
Source Code:
This is function is called whenever a new buffer of bytes arrives from USB:
static void rtlsdr_callback(unsigned char *buf, uint32_t len, void *ctx)
{
if (ctx) {
if (do_exit)
return;
if ((bytes_to_read > 0) && (bytes_to_read < len)) {
len = bytes_to_read;
do_exit = 1;
rtlsdr_cancel_async(dev);
}
/* if (fwrite(buf, 1, len, (FILE*)ctx) != len) {
fprintf(stderr, "Short write, samples lost, exiting!\n");
rtlsdr_cancel_async(dev);
}
*/
if (fm_receiver_addr == NULL)
{
virtual_base = mmap(NULL, HPS2FPGA_SPAN, PROT_WRITE, MAP_PRIVATE, fd, HPS2FPGA_BASE);
if (virtual_base == MAP_FAILED)
{
perror("mmap");
close(fd);
exit(1);
}
fm_receiver_addr = (unsigned char*)(virtual_base + FM_DEMOD_OFFSET);
}
int i, j;
for (i = 0; i < len; i++)
{
*fm_receiver_addr = buf[i];
for (j = 0; j < 150; j++);
}
if (bytes_to_read > 0)
bytes_to_read -= len;
}
}
You see I commented fwrite() function (it's used by the original code to write to files) and replaced it with my code that writes to my FPGA component: *fm_receiver_addr = buf[i];. Before that I check the address to see if it's valid and obtain another address if it's not.
For the other way, the driver module, I wrote this code:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/uaccess.h>
#include <linux/ioport.h>
#include <linux/io.h>
#define HPS2FPGA_BASE 0xC0000000
#define HPS2FPGA_SPAN PAGE_SIZE
void* fm_demod_addr;
int i;
// Get a driver entry in Sysfs
static struct device_driver fm_demod_driver =
{
.name = "fm-demodulator", // Name of the driver
.bus = &platform_bus_type, // Which bus does the device exist
};
// Function that is used when we read from the file in /sys, but we won't use it
ssize_t fm_demod_read(struct device_driver* drv, char* buf)
{ return 0; }
// Function that is called when we write to the file in /sys
ssize_t fm_demod_write_sample(struct device_driver* drv, const char* buf, size_t count)
{
if (buf == NULL)
{
pr_err("Error! String must not be NULL!\n");
return -EINVAL;
}
for (i = 0; i < count; i++)
{
iowrite8(buf[i], fm_demod_addr);
}
return count;
}
// Set our module's pointers and set permissions mode
static DRIVER_ATTR(fm_demod, S_IWUSR, fm_demod_read, fm_demod_write_sample);
// Set module information
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Siraj Muhammad <sirajmuhammad#outlook.com>");
MODULE_DESCRIPTION("Driver for FPGA component 'FM Demodulator'");
static int __init fm_demod_init(void)
{
int ret;
struct resource* res;
// Register driver in kernel
ret = driver_register(&fm_demod_driver);
if (ret < 0)
return ret;
// Create file system in /sys
ret = driver_create_file(&fm_demod_driver, &driver_attr_fm_demod);
if (ret < 0)
{
driver_unregister(&fm_demod_driver);
return ret;
}
// Request exclusive access to the memory region we want to write to
res = request_mem_region(HPS2FPGA_BASE, HPS2FPGA_SPAN, "fm-demodulator");
if (res == NULL)
{
driver_remove_file(&fm_demod_driver, &driver_attr_fm_demod);
driver_unregister(&fm_demod_driver);
return -EBUSY;
}
// Map the address into virtual memory
fm_demod_addr = ioremap(HPS2FPGA_BASE, HPS2FPGA_SPAN);
if (fm_demod_addr == NULL)
{
driver_remove_file(&fm_demod_driver, &driver_attr_fm_demod);
driver_unregister(&fm_demod_driver);
release_mem_region(HPS2FPGA_BASE, HPS2FPGA_SPAN);
return -EFAULT;
}
return 0;
}
static void __exit fm_demod_exit(void)
{
// Remove file system from /sys
driver_remove_file(&fm_demod_driver, &driver_attr_fm_demod);
// Unregister the driver
driver_unregister(&fm_demod_driver);
// Release requested memory
release_mem_region(HPS2FPGA_BASE, HPS2FPGA_SPAN);
// Un-map address
iounmap(fm_demod_addr);
}
module_init(fm_demod_init);
module_exit(fm_demod_exit);
And I revert the userspace code to its original state, and pass the driver path: /sys/bus/platform/drivers/fm-demodulator/fm_demod to the userspace app to write to it.
Any thought about it?
Internal error: Oops - undefined instruction: 0 [#1] SMP ARM
PC is at scheduler_ipi+0x8/0x4c
LR is at handle_IPI+0x10c/0x19c
pc : [<800521a0>] lr : [<800140d4>] psr: 80000193
[snip]
Code: eaffffad 80564700 e92d4800 e1a0200d (4c4c9b50)
---[ end trace 9e492cde975c41f9 ]---
No one can probably absolutely know the answer. Note: undefined instruction!
The PC is at scheduler_ipi+0x8/0x4c, this is hardcore ARM-Linux scheduling; an inter-processor interrupt. You can disassemble the 'Code:' part to help,
0: eaffffad b 0xfffffebc
4: 80564700 subshi r4, r6, r0, lsl #14
8: e92d4800 push {fp, lr}
c: e1a0200d mov r2, sp
10: 4c4c9b50 mcrrmi 11, 5, r9, ip, cr0
The crash is at the instruction mcrrmi and this appears to be non-sense. If you disassemble sched/core.o you will see the instruction sequence, but I bet that the '4c4c9b50' value is corrupt. Ie, this is not the code the compiler generated.
So the problem is when passing from USB device to FPGA, either using mmap() or a driver module.
I will use a zen move and think a little. The USB device use DMA? Your FPGA is probably also some how in control of the ARM/AXI bus. I would at least consider the possibility that the FPGA is corrupting a bus cycle and perhaps flipping address bits and causing a phycial write to kernel code space. This can happen when you use an innocent by-stander like a DMA peripheral. The ARM CPU will use cache and burst everything.
Things to check,
The code address in (brackets) is reported as the compiler produced. If not, hardware has probably corrupted things. It is hard for Linux code to do this as the kernel code pages are typically R/O.
You should also produce disassembler for any code and see what register is in effect. For instance, the (4c4c9b50) code can be found with,
printf '\x50\x9b\x4c\x4c' > arm.bin
objdump -marm -b binary -D arm.bin
You can just objdump vmlinux to find the assembler for the scheduler_ipi routine and then determine what a pointer might be. For instance, if this_rq() was in R9 and R9 is bogus, then you have a clue.
If the code is corrupt, you need a bus analyzer and/or some routine to monitor the location and report whenever it changes to try and locate the source of corruption.
Related
hi i am using an APM32F003 with Keil uVision compiler.
is a little known microcontroller but compatible with STM32.
I would like to write functions in RAM for different purposes.
I don't want to use the linker attribute to assign the function in ram,
but I want to copy a written one in flash and transfer it in RAM in run-time.
below the code I am trying to write but for now it is not working.
I think it's not possible in this way right?
static volatile uint8_t m_buffer_ram[100];
void flash_function()
{
/* Example */
LED2_ON();
}
void flash_function_end()
{
}
void call_function_in_ram()
{
uint32_t size = (uint32_t) flash_function_end - (uint32_t) flash_function;
/* clone function in RAM */
for (uint32_t i = 0; i < size; i++)
m_buffer_ram[i] = (((uint8_t*)&flash_function)[i]);
__disable_irq();
/* cast buffer to function pointer */
void(*func_ptr)(void) = (void(*)(void)) (&m_buffer_ram);
/* call function in ram */
func_ptr();
__enable_irq();
}
Eugene asked if your function is relocatable. This is very important. I have had issues in the past wherein I copied a function from flash to RAM, and the compiler used an absolute address in the "flash" based function. Therefore the code which was running in RAM jumped back into the flash. This is just one example of what might go wrong with moving code which is not relocatable.
If you have a debugger that can disassemble and also step through the compiled code for you, that would be ideal.
Note also "the busybee" pointed out that code which is adjacent in source code does is not guaranteed to be adjacent in the compiled binary, so your method of finding the size of the code is not reliable.
You can look in the map file to determine the size of the function.
I agree with the comment that you would be better off learning to have the linker do the work for you.
None of what I am saying here is new; I am just reinforcing the comments made above.
CODE
static volatile uint8_t m_buffer_ram[200];
static uint32_t m_function_size;
void flash_function(void)
{
LED2_ON();
}
void flash_function_end(void)
{
}
void test(void)
{
m_function_size = (uint32_t) flash_function_end - (uint32_t) flash_function;
/* clone function in RAM */
for (uint16_t i = 0; i < m_function_size; i++)
m_buffer_ram[i] = (((uint8_t*)&flash_function)[i]);
__disable_irq();
/* cast buffer to function pointer, +1 Thumb Code */
void(*func_ptr)(void) = (void(*)(void)) (&m_buffer_ram[1]);
/* call function in ram */
func_ptr();
__enable_irq();
}
MAP
Image Symbol Table
Symbol Name Value Ov Type Size Object(Section)
Local Symbols
.....
m_function_size 0x20000024 Data 4 test.o(.data)
m_buffer_ram 0x200001f0 Data 200 test.o(.bss)
Global Symbols
.....
flash_function 0x00000399 Thumb Code 12 test.o(i.flash_function)
flash_function_end 0x000003a9 Thumb Code 2 test.o(i.flash_function_end)
Memory Map of the image
Exec Addr Load Addr Size Type Attr Idx E Section Name Object
.....
0x00000398 0x00000398 0x00000010 Code RO 355 i.flash_function test.o
0x000003a8 0x000003a8 0x00000002 Code RO 356 i.flash_function_end test.o
DISASSEMBLE
.....
30: m_function_size = (uint32_t) flash_function_end - (uint32_t) flash_function;
31:
0x00000462 480D LDR r0,[pc,#52] ; #0x00000498
0x00000464 4A0D LDR r2,[pc,#52] ; #0x0000049C
0x00000466 4C0E LDR r4,[pc,#56] ; #0x000004A0
0x00000468 1A81 SUBS r1,r0,r2
0x0000046A 6021 STR r1,[r4,#0x00]
32: for (uint16_t i = 0; i < m_function_size; i++)
0x0000046C 2000 MOVS r0,#0x00
33: m_buffer_ram[i] = (((uint8_t*)&flash_function)[i]);
34:
0x0000046E 4B0D LDR r3,[pc,#52] ; #0x000004A4
0x00000470 2900 CMP r1,#0x00
0x00000472 D905 BLS 0x00000480
33: m_buffer_ram[i] = (((uint8_t*)&flash_function)[i]);
0x00000474 5C15 LDRB r5,[r2,r0]
0x00000476 541D STRB r5,[r3,r0]
32: for (uint16_t i = 0; i < m_function_size; i++)
0x00000478 1C40 ADDS r0,r0,#1
0x0000047A B280 UXTH r0,r0
32: for (uint16_t i = 0; i < m_function_size; i++)
0x0000047C 4288 CMP r0,r1
0x0000047E D3F9 BCC 0x00000474
35: __disable_irq();
36:
0x00000480 B672 CPSID I
37: void(*func_ptr)(void) = (void(*)(void)) (&m_buffer_ram[1]);
0x00000482 1C5B ADDS r3,r3,#1
38: func_ptr();
39:
0x00000484 4798 BLX r3
40: __enable_irq();
41:
0x00000486 B662 CPSIE I
I report all the information that I was able to recover.
I added a shift for the Thumb Code; the calculation of the function size coincides with the MAP file
my doubt is that in debug the pointer cannot jump to a point of the RAM .. for this reason I activate a led to see if (flashing code and run) this turns on without debugging.
as reported below, the read values coincide
(0x000003a8)flash_function_end - (0x00000398)flash_function = 0x10
(0x20000024)m_function_size = 0x10
func_ptr = 0x200001f1;
I'm trying to get a small piece of hello-world MIPS program running in Gem 5 simulator. The program was compiled with gcc 4.9.2 and glibc 2.19 (built by crosstool-ng) and runs well in qemu, but it crashed with a page fault (trying to access address 0) in gem5.
Code is rather simple:
#include <stdio.h>
int main()
{
printf("hello, world\n");
return 0;
}
file ./test result:
./test: ELF 32-bit LSB executable, MIPS, MIPS-I version 1, statically
linked, for GNU/Linux 3.15.4, not stripped
After some debugging with gdb, I figured out that the page fault is triggered by _dl_setup_stack_chk_guard function in glibc. It accepts a void pointer called _dl_random passed by __libc_start_main function, which happens to be NULL. However, as far as I know, these functions never dereference the pointer, but instructions were generated to load values from the memory _dl_random pointer points to. Some code pieces might help understanding:
in function __libc_start_main (macro THREAD_SET_STACK_GUARD is not set):
/* Initialize the thread library at least a bit since the libgcc
functions are using thread functions if these are available and
we need to setup errno. */
__pthread_initialize_minimal ();
/* Set up the stack checker's canary. */
uintptr_t stack_chk_guard = _dl_setup_stack_chk_guard (_dl_random);
# ifdef THREAD_SET_STACK_GUARD
THREAD_SET_STACK_GUARD (stack_chk_guard);
# else
__stack_chk_guard = stack_chk_guard;
# endif
in function _dl_setup_stack_chk_guard (always inlined):
static inline uintptr_t __attribute__ ((always_inline))
_dl_setup_stack_chk_guard (void *dl_random)
{
union
{
uintptr_t num;
unsigned char bytes[sizeof (uintptr_t)];
} ret = { 0 };
if (dl_random == NULL)
{
ret.bytes[sizeof (ret) - 1] = 255;
ret.bytes[sizeof (ret) - 2] = '\n';
}
else
{
memcpy (ret.bytes, dl_random, sizeof (ret));
#if BYTE_ORDER == LITTLE_ENDIAN
ret.num &= ~(uintptr_t) 0xff;
#elif BYTE_ORDER == BIG_ENDIAN
ret.num &= ~((uintptr_t) 0xff << (8 * (sizeof (ret) - 1)));
#else
# error "BYTE_ORDER unknown"
#endif
}
return ret.num;
}
disassembly code:
0x00400ea4 <+228>: jal 0x4014b4 <__pthread_initialize_minimal>
0x00400ea8 <+232>: nop
0x00400eac <+236>: lui v0,0x4a
0x00400eb0 <+240>: lw v0,6232(v0)
0x00400eb4 <+244>: li a0,-256
0x00400eb8 <+248>: lwl v1,3(v0)
0x00400ebc <+252>: lwr v1,0(v0)
0x00400ec0 <+256>: addiu v0,v0,4
0x00400ec4 <+260>: and v1,v1,a0
0x00400ec8 <+264>: lui a0,0x4a
0x00400ecc <+268>: sw v1,6228(a0)
0x4a1858 (0x4a0000 + 6232) is the address of _dl_random
0x4a1854 (0x4a0000 + 6228) is the address of __stack_chk_guard
Page fault occurs at 0x00400eb8. I don't quite get it how instruction 0x00400eb8 and 0x00400ebc are generated. Could someone shed some light on it please? Thanks.
Here is how I find the root of this problem and my suggestion for solution.
I think it helpful to dive into the Glibc source code to see what really happens. Starting from _dl_random or __libc_start_main are both OK.
As the value of _dl_random is unexpectedly NULL, we need to find how this variable initialize and where it is assigned. With the help of code analysing tools, we can find _dl_random in Glibc is only assigned with meaningful value in function _dl_aux_init, and this function is called by __libc_start_min.
_dl_aux_init iterates on its parameter -- auxvec -- and acts corresponding to auxvec[i].at_type. AT_RANDOM is the case for the assignment of _dl_random. So the problem is that there isn't an AT_RANDOM element to make _dl_random assigned.
As the program runs well in user mode qemu, the root of this problem resides in system environment provider, say, gem5, which has the responsibility to construct auxvec. Having that keyword, we can find that the auxv is constructed in gem5/src/arch/<arch-name>/process.cc.
The current auxv for MIPS is constructed as below:
// Set the system page size
auxv.push_back(auxv_t(M5_AT_PAGESZ, MipsISA::PageBytes));
// Set the frequency at which time() increments
auxv.push_back(auxv_t(M5_AT_CLKTCK, 100));
// For statically linked executables, this is the virtual
// address of the program header tables if they appear in the
// executable image.
auxv.push_back(auxv_t(M5_AT_PHDR, elfObject->programHeaderTable()));
DPRINTF(Loader, "auxv at PHDR %08p\n", elfObject->programHeaderTable());
// This is the size of a program header entry from the elf file.
auxv.push_back(auxv_t(M5_AT_PHENT, elfObject->programHeaderSize()));
// This is the number of program headers from the original elf file.
auxv.push_back(auxv_t(M5_AT_PHNUM, elfObject->programHeaderCount()));
//The entry point to the program
auxv.push_back(auxv_t(M5_AT_ENTRY, objFile->entryPoint()));
//Different user and group IDs
auxv.push_back(auxv_t(M5_AT_UID, uid()));
auxv.push_back(auxv_t(M5_AT_EUID, euid()));
auxv.push_back(auxv_t(M5_AT_GID, gid()));
auxv.push_back(auxv_t(M5_AT_EGID, egid()));
Now we know what to do. We just need to provide an accessible address value to _dl_random tagged by MT_AT_RANDOM. Gem5's ARM arch implements this already (code). Maybe we can take it as an example.
A lot of related questions <How is x86 instruction cache synchronized? > mention x86 should properly handle i-cache synchronization in self modifying code. I wrote the following piece of code which toggles a function call on and off from different threads interleaved with its execution. I am using compare and swap operation as an additional guard so that the modification is atomic. But I am getting intermittent crashes (SIGSEGV, SIGILL) and analyzing the core dump makes me suspicious if the processor is trying to execute partially updated instructions. The code and the analysis given below. May be I am missing something here. Let me know if that's the case.
toggle.c
#include <stdio.h>
#include <inttypes.h>
#include <time.h>
#include <pthread.h>
#include <sys/mman.h>
#include <errno.h>
#include <unistd.h>
int active = 1; // Whether the function is toggled on or off
uint8_t* funcAddr = 0; // Address where function call happens which we need to toggle on/off
uint64_t activeSequence = 0; // Byte sequence for toggling on the function CALL
uint64_t deactiveSequence = 0; // NOP byte sequence for toggling off the function CALL
inline int modify_page_permissions(uint8_t* addr) {
long page_size = sysconf(_SC_PAGESIZE);
int code = mprotect((void*)(addr - (((uint64_t)addr)%page_size)), page_size,
PROT_READ | PROT_WRITE | PROT_EXEC);
if (code) {
fprintf(stderr, "mprotect was not successfull! code %d\n", code);
fprintf(stderr, "errno value is : %d\n", errno);
return 0;
}
// If the 8 bytes we need to modify straddles a page boundary make the next page writable too
if (page_size - ((uint64_t)addr)%page_size < 8) {
code = mprotect((void*)(addr-((uint64_t)addr)%page_size+ page_size) , page_size,
PROT_READ | PROT_WRITE | PROT_EXEC);
if (code) {
fprintf(stderr, "mprotect was not successfull! code %d\n", code);
fprintf(stderr, "errno value is : %d\n", errno);
return 0;;
}
}
return 1;
}
void* add_call(void* param) {
struct timespec ts;
ts.tv_sec = 0;
ts.tv_nsec = 50000;
while (1) {
if (!active) {
if (activeSequence != 0) {
int status = modify_page_permissions(funcAddr);
if (!status) {
return 0;
}
uint8_t* start_addr = funcAddr - 8;
fprintf(stderr, "Activating foo..\n");
uint64_t res = __sync_val_compare_and_swap((uint64_t*) start_addr,
*((uint64_t*)start_addr), activeSequence);
active = 1;
} else {
fprintf(stderr, "Active sequence not initialized..\n");
}
}
nanosleep(&ts, NULL);
}
}
int remove_call(uint8_t* addr) {
if (active) {
// Remove gets called first before add so we initialize active and deactive state byte sequences during the first call the remove
if (deactiveSequence == 0) {
uint64_t sequence = *((uint64_t*)(addr-8));
uint64_t mask = 0x0000000000FFFFFF;
uint64_t deactive = (uint64_t) (sequence & mask);
mask = 0x9090909090000000; // We NOP 5 bytes of CALL instruction and leave rest of the 3 bytes as it is
activeSequence = sequence;
deactiveSequence = deactive | mask;
funcAddr = addr;
}
int status = modify_page_permissions(addr);
if (!status) {
return -1;
}
uint8_t* start_addr = addr - 8;
fprintf(stderr, "Deactivating foo..\n");
uint64_t res = __sync_val_compare_and_swap((uint64_t*)start_addr,
*((uint64_t*)start_addr), deactiveSequence);
active = 0;
// fprintf(stderr, "Result : %p\n", res);
}
}
int counter = 0;
void foo(int i) {
// Use the return address to determine where we need to patch foo CALL instruction (5 bytes)
uint64_t* addr = (uint64_t*)__builtin_extract_return_addr(__builtin_return_address(0));
fprintf(stderr, "Foo counter : %d\n", counter++);
remove_call((uint8_t*)addr);
}
// This thread periodically checks if the method is inactive and if so reactivates it
void spawn_add_call_thread() {
pthread_t tid;
pthread_create(&tid, NULL, add_call, (void*)NULL);
}
int main() {
spawn_add_call_thread();
int i=0;
for (i=0; i<1000000; i++) {
// fprintf(stderr, "i : %d..\n", i);
foo(i);
}
fprintf(stderr, "Final count : %d..\n\n\n", counter);
}
Core dump analysis
Program terminated with signal 4, Illegal instruction.
#0 0x0000000000400a28 in main () at toggle.c:123
(gdb) info frame
Stack level 0, frame at 0x7fff7c8ee360:
rip = 0x400a28 in main (toggle.c:123); saved rip 0x310521ed5d
source language c.
Arglist at 0x7fff7c8ee350, args:
Locals at 0x7fff7c8ee350, Previous frame's sp is 0x7fff7c8ee360
Saved registers:
rbp at 0x7fff7c8ee350, rip at 0x7fff7c8ee358
(gdb) disas /r 0x400a28,+30
Dump of assembler code from 0x400a28 to 0x400a46:
=> 0x0000000000400a28 <main+64>: ff (bad)
0x0000000000400a29 <main+65>: ff (bad)
0x0000000000400a2a <main+66>: ff eb ljmpq *<internal disassembler error>
0x0000000000400a2c <main+68>: e7 48 out %eax,$0x48
(gdb) disas /r main
Dump of assembler code for function main:
0x00000000004009e8 <+0>: 55 push %rbp
...
0x0000000000400a24 <+60>: 89 c7 mov %eax,%edi
0x0000000000400a26 <+62>: e8 11 ff ff ff callq 0x40093c <foo>
0x0000000000400a2b <+67>: eb e7 jmp 0x400a14 <main+44>
So as can be seen the instruction pointer seems to positioned within an address inside the CALL instruction and processor is apparently trying to execute that misaligned instruction causing an illegal instruction fault.
I think your problem is that you replaced a 5-byte CALL instruction with 5 1-byte NOPs. Consider what happens when your thread has executed 3 of the NOPs, and then your master thread decides to swap the CALL instruction back in. Your thread's PC will be three bytes in the middle of the CALL instruction and will therefore execute an unexpected and likely illegal instruction.
What you need to do is swap the 5-byte CALL instruction with a 5-byte NOP. You just need to find a multibyte instruction that does nothing (such as or'ing a register against itself) and if you need some extra bytes, prepend some prefix bytes such as a gs override prefix and an address-size override prefix (both of which will do nothing). By using a 5-byte NOP, your thread will be guaranteed to either be at the CALL instruction or past the CALL instruction, but never inside of it.
On 80x86 most calls use a relative displacement, not an absolute address. Essentially its "call the code at here + < displacement >" and not "call the code at < address >".
For 64-bit code, the displacement may be 8 bits or 32-bits. It's never 64-bits.
For example, for a 2-byte "call with 8-bit displacement" instruction, you'd be trashing 6 bytes before the call instruction, the call opcode itself, and the instruction's operand (the displacement).
For another example, for a 5-byte "call with 32-bit displacement" instruction, you'd be trashing 3 bytes before the call instruction, the call opcode itself, and the instruction's operand (the displacement).
However...
These aren't the only way to call. For example, you can call using a function pointer, where the address of the code being called is not in the instruction at all (but may be in a register or be a variable in memory). There's also an optimisation called "tail call optimisation" where a call followed by a ret is replaced with a jmp (likely with some additional stack diddling for passing parameters, cleaning up the caller's local variables, etc).
Essentially; your code is severely broken, you can't cover all the possible corner cases, you shouldn't be doing this to begin with, and you probably should be using a function pointer instead of self modifying code (which would be faster and easier and portable too).
I'm trying to erase a NOR Flash memory with Linux MTD driver in C...
I'm confused about the return status from the ioctl(MEMUNLOCK) call which returns an error even if ioctl(MEMERASE) is successful after it.
The following code displays the warning message but works (i.e. the Flash block has been erased):
int erase_MTD_Pages(int fd, size_t size, off_t offset)
{
mtd_info_t mtd_info;
erase_info_t ei;
ioctl(fd, MEMGETINFO, &mtd_info);
ei.length = mtd_info.erasesize;
for(ei.start = offset; ei.start < (offset+size); ei.start += mtd_info.erasesize) {
if(ioctl(fd, MEMUNLOCK, &ei) < 0)
{
// logPrintf(FAILURE, "[Flash] Can not unlock MTD (MEMUNLOCK, errno=%d)!\n", errno);
// return RETURN_FILE_ERROR;
logPrintf(WARNING, "[Flash] Can not unlock MTD (MEMUNLOCK, errno=%d)!\n", errno);
}
if(ioctl(fd, MEMERASE, &ei) < 0)
{
logPrintf(FAILURE, "[Flash] Can not erase MTD (MEMERASE, errno=%d)!\n", errno);
return RETURN_FILE_ERROR;
}
}
return RETURN_SUCCESS;
}
When I look some C codes on the net, the return status from MEMUNLOCK is not always checked (e.g. from mtc.c):
ioctl(fd, MEMUNLOCK, &mtdEraseInfo);
if(ioctl(fd, MEMERASE, &mtdEraseInfo)) {
fprintf(stderr, "Could not erase MTD device: %s\n", mtd);
close(fd);
exit(1);
}
flash_unlock also returns an error:
root $ cat /proc/mtd
dev: size erasesize name
mtd0: 00020000 00020000 "X-Loader-NOR"
mtd1: 000a0000 00020000 "U-Boot-NOR"
mtd2: 00040000 00020000 "Boot Env-NOR"
mtd3: 00400000 00020000 "Kernel-NOR"
mtd4: 03b00000 00020000 "File System-NOR"
root $ mtd_debug info /dev/mtd3
mtd.type = MTD_NORFLASH
mtd.flags = MTD_CAP_NORFLASH
mtd.size = 4194304 (4M)
mtd.erasesize = 131072 (128K)
mtd.writesize = 1
mtd.oobsize = 0
regions = 0
root $ flash_unlock /dev/mtd3
Could not unlock MTD device: /dev/mtd3
Am I missing something? Is it normal to get an error from MEMUNLOCK with some configurations?
Notes / Environment:
The read-only flag (MTD_WRITEABLE) in not set on the mtd3 partition (only on mtd0 and mtd1).
flash_lock also returns the same error.
TI AM3505 (ARM Cortex A8, OMAP34).
Linux 2.6.37.
Flash NOR Spansion S29GL512S12DHIV1.
Kernel log:
mtdoops: mtd device (mtddev=name/number) must be supplied
physmap platform flash device: 08000000 at 08000000
physmap-flash.0: Found 1 x16 devices at 0x0 in 16-bit bank. Manufacturer ID 0x000001 Chip ID 0x002301
Amd/Fujitsu Extended Query Table at 0x0040
Amd/Fujitsu Extended Query version 1.5.
Silicon revision: 14
Address sensitive unlock: Required
Erase Suspend: Read/write
Block protection: 1 sectors per group
Temporary block unprotect: Not supported
Block protect/unprotect scheme: 8
Number of simultaneous operations: 0
Burst mode: Not supported
Page mode: 12 word page
Vpp Supply Minimum Program/Erase Voltage: 0.0 V
Vpp Supply Maximum Program/Erase Voltage: 0.0 V
Top/Bottom Boot Block: Uniform, Top WP
number of CFI chips: 1
RedBoot partition parsing not available
Using physmap partition information
Creating 5 MTD partitions on "physmap-flash.0":
0x000000000000-0x000000020000 : "X-Loader-NOR"
0x000000020000-0x0000000c0000 : "U-Boot-NOR"
0x0000000c0000-0x000000100000 : "Boot Env-NOR"
0x000000100000-0x000000500000 : "Kernel-NOR"
0x000000500000-0x000004000000 : "File System-NOR"
For a flash chip that I worked on (drivers/mtd/devices/m25p80.c), I found that UNLOCK was not implemented. The driver's ioctl(UNLOCK) returned -EOPNOTSUPP=95. And code inspection showed mtd_unlock return status being dropped on the floor, as you have found.
These imply assumptions in the m25p80 driver that flash will just never be locked, and in the mtd drivers that it's OK for the device driver to omit UNLOCK. On the board I worked on, flash was being locked by u-boot after every write, so erase and reprogram from linux didn't work at all. I looked at u-boot driver and device datasheet, got some code to implement m25p80_lock and m25p80_unlock, it was not too difficult after I knew what was up. I did not upstream it.
It does seem like a defect for chip drivers to not implement these.
By the way Mousstix, very nice job providing full information in this question.
On newer Kernels (tested on 4.1.18) there is an device-tree option named "use-advanced-sector-protection;" When this is set, I was able to erase/write to protected flash-regions.
It is also documented in the Kernel: Documentation/devicetree/bindings/mtd/mtd-physmap.txt
I'm trying to write some self modifying code in C and ARM. I previously asked a similar question about MIPS and am now trying to port over the project to ARM.
My system := Raspbian on raspberry pi, ARMv6, GCC
There are a few things I am unsure of:
Does ARM require a D-cache write-back/I-cache invalidate (cache flush)? If so, how can we do this?
Also I tried an example
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
int inc(int x){ //increments x
uint16_t *ret = malloc(2 * sizeof(uint16_t));
*(ret + 0) = 0x3001; //add r0 1 := r0 += 1
*(ret + 1) = 0x4770; //bx lr := jump back to inc()
int(*f)(int) = (int (*)(int)) ret;
return (*f)(x);
}
int main(){
printf("%d",inc(6)); //expect '7' to be printed
exit(0);}
but I keep getting a segmentation fault. I'm using the aapcs calling convention, which I've been given to understand is the default for all ARM
I'd be much obliged if someone pointed me in the right direction
Bonus question (meaning, it doesn't really have to be answered, but would be cool to know) - I "come from a MIPS background", how the heck do ARM programmers do without a 0 register? (as in, a register hardcoded to the value 0)
Read Caches and Self-Modifying Code on blogs.arm.com. Article includes an example as well which does what you are describing.
To answer your question from article
... the ARM architecture is often considered to be a Modified Harvard Architecture. ...
The typical drawback of a pure Harvard architecture is that instruction memory is not directly accessible from the same address space as data memory, though this restriction does not apply to ARM. On ARM, you can write instructions into memory, but because the D-cache and I-cache are not coherent, the newly-written instructions might be masked by the existing contents of the I-cache, causing the processor to execute old (or possibly invalid) instructions.
See __clear_cache for how to invalidate cache(s).
I hope you are also aware of ARM/Thumb instruction sets, if you are planning to push your instructions into memory.
Ok, so this works on my raspberry Pi.
#include <stdio.h>
#include <sys/mman.h>
#include <stdint.h>
#include <stdlib.h>
int inc(int x){ //increments x
uint32_t *ret = mmap(NULL,
2 * sizeof(uint32_t), // Space for 16 instructions. (More than enough.)
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,0);
if (ret == MAP_FAILED) {
printf("Could not mmap a memory buffer with the proper permissions.\n");
return -1;
}
*(ret + 0) = 0xE2800001; //add r0 r0 #1 := r0 += 1
*(ret + 1) = 0xE12FFF1E; //bx lr := jump back to inc()
__clear_cache((char*) ret, (char*) (ret+2));
int(*f)(int) = (int (*)(int)) ret;
return (*f)(x);
}
int main(){
printf("%d\n",inc(6)); //expect '7' to be printed
exit(0);}
There are a couple of problems.
You don't flush your D-Cache and I-Cache, so most times the I-Cache will fetch stale data from L2. Under linux there is a libc/sys-call which does that for you. Either use __clear_cache(begin, end) or _builtin_clear_cache(begin, end).
You output Thumb-Code, but you don't take care of how your code gets called. The easiest way to fix that would be to use some asm-code to do the actual blx call and OR the address with 1, as this bit sets the mode the processor runs in. As you're malloc address will always be aligned to a word boundary, making you call thumb-code in arm-mode.