STM32 - Pointers and sum

STM32 - Pointers and sum - c

I'm learning how to program STM32 Nucleo F446RE board using registers.
To know the position of a register, I take from datasheets the boundary address and the offset.
However, I cannot calculate the sum of them. I show an exmaple:
volatile uint32_t *GPIOA = 0x0; // Initialization of the boundary adress
GPIOA = (uint32_t*)0x40020000; // Boundary adress from datasheet
volatile uint32_t *GPIOA_ODR = 0x0; // Initialization of GPIOA_ODR register
GPIOA_ODR = GPIOA + (uint32_t*)0x14; // Calculation of GPIOA_ODR as the sum of the boundary adress and the offset (i.e. 0x14.
Line 5 gives me an error, do you know how to calculate it correctly?
Thank you very much in advance.

It is wrong. If you want to use this extremely inconvenient way:
#define GPIOA 0x4002000
#define ODR_OFFSET 0x14
#define GPIO_ODR (*(volatile uint32_t *)(GPIOA + ODR_OFFSET))
why #define not the pointer? It is just more compiler friendly and saves one memory read.
https://godbolt.org/z/LdLLVN
#define GPIOA 0x4002000
#define ODR_OFFSET 0x14
#define GPIO_ODR (*(volatile uint32_t *)(GPIOA + ODR_OFFSET))
volatile uint32_t *pGPIO_ODR = (volatile uint32_t *)(GPIOA + ODR_OFFSET);
void foo(uint32_t x)
{
GPIO_ODR = x;
}
void bar(uint32_t x)
{
*pGPIO_ODR = x;
}
and resulting code
foo:
ldr r3, .L3
str r0, [r3, #20]
bx lr
.L3:
.word 67117056
bar:
ldr r3, .L6
ldr r3, [r3]
str r0, [r3]
bx lr
.L6:
.word .LANCHOR0
pGPIO_ODR:
.word 67117076

The cast should be outside the constant value, in other words, you are adding GPIOA address + 14 to generate a new address. So the cast must be outside them:
GPIOA_ODR = (uint32_t*)(GPIOA + 0x14);

I tried but nothing. If I insert the GPIOA_ODR = (uint32_t*)(0x40020000 + 0x14); it works, instead if I insert GPIOA_ODR = (uint32_t*)(GPIOA + 0x14); it doesn't work.
Some other ideas?
Thank you very much for the answer. The complete code I'm using is the following:
int main(int argc, char* argv[])
{
/** RCC **/
/* RCC */
volatile uint32_t *RCC = 0x0;
RCC = (uint32_t*)0x40023800;
/* RCC_AHB1ENR */
volatile uint32_t *RCC_AHB1ENR = 0x0;
RCC_AHB1ENR = (uint32_t*)(0x40023800 + 0x30);
*RCC_AHB1ENR |= 0x1;
/** GPIOA **/
/* GPIOA */
volatile uint32_t *GPIOA = 0x0;
GPIOA = (uint32_t*)0x40020000;
/* GPIOA_MODER */
volatile uint32_t *GPIOA_MODER = 0x0;
GPIOA_MODER = (uint32_t*)(0x40020000 + 0x00);
*GPIOA_MODER |= 1 << 16;
*GPIOA_MODER &= ~(0 << 17);
/* GPIOA_ODR */
volatile uint32_t *GPIOA_ODR = 0x0;
GPIOA_ODR = (uint32_t*)(GPIOA + 0x14);
*GPIOA_ODR |= 1 << 8;
}
This code doens't work correctly because of the line GPIOA_ODR = (uint32_t*)(GPIOA + 0x14);. If I insert GPIOA_ODR = (uint32_t*)(0x40020000 + 0x14) it works correctly.

Related

Embedded Software: Exception occurs when calling function inside a task

I'm a beginner to embedded software. I try to build my simple real time operating system kernel using C code with the ARM Cortex-M4F Based MCU Tiva C LaunchPad and run in the IAR Embedded Workbench IDE.
The system can support 3 tasks, task A blinks the red LED, task B blinks the blue LED and task C blinks the green LED. The tasks are scheduled in a round-robin way. It uses SysTick to trigger PendSV once per second for context switching.
The following code works fine to blink the LEDs as expected.
#include "include/tm4c_cmsis.h"
#include <intrinsics.h>
#define SYS_CLOCK_HZ 16000000U
#define LED_RED (1U << 1)
#define LED_BLUE (1U << 2)
#define LED_GREEN (1U << 3)
#define MAX_TASK_NUM 3
#define MAX_TASK_SIZE 0x40
int OSStack[MAX_TASK_NUM][MAX_TASK_SIZE] __attribute__ ((aligned (4)));
void task_A();
void task_B();
void task_C();
/* Task Control Block (TCB) */
typedef struct {
int *sp; /* stack pointer */
int status; // 0: does not exists, 1: created, 2: running
} OSTask;
OSTask OSTask_List[MAX_TASK_NUM];
int OS_curr; /* index of the current task */
int OS_next;
int OS_tn; // total task number
int *sp_curr;
int *sp_next;
void OSInit(){
// configure GPIOF for LED blinking
SYSCTL->RCGC2 |= (1U << 5);
GPIOF->DIR |= (1<<3)|(1<<2)|(1<<1);
GPIOF->DEN |= (1<<3)|(1<<2)|(1<<1);
SysTick->LOAD = SYS_CLOCK_HZ - 1;
SysTick->VAL = 0;
SysTick->CTRL = (1U << 2) | (1U << 1) | 1;
OS_tn = 0;
}
void OSCreateTask(void* taskH){
int n=OS_tn;
OS_tn++;
int* p = (int *) OSStack;
OSTask_List[n].sp = p + ((n+1) * MAX_TASK_SIZE);
// init the stack for each task
*(--OSTask_List[n].sp) = (1U << 24); /* xPSR */
*(--OSTask_List[n].sp) = (uint32_t)taskH; /* PC */
*(--OSTask_List[n].sp) = 0x0000000EU + n*16; /* LR */
*(--OSTask_List[n].sp) = 0x0000000CU + n*16; /* R12 */
*(--OSTask_List[n].sp) = 0x00000003U + n*16; /* R3 */
*(--OSTask_List[n].sp) = 0x00000002U + n*16; /* R2 */
*(--OSTask_List[n].sp) = 0x00000001U + n*16; /* R1 */
*(--OSTask_List[n].sp) = 0x00000000U + n*16; /* R0 */
*(--OSTask_List[n].sp) = 0x0000000BU + n*16; /* R11 */
*(--OSTask_List[n].sp) = 0x0000000AU + n*16; /* R10 */
*(--OSTask_List[n].sp) = 0x00000009U + n*16; /* R9 */
*(--OSTask_List[n].sp) = 0x00000008U + n*16; /* R8 */
*(--OSTask_List[n].sp) = 0x00000007U + n*16; /* R7 */
*(--OSTask_List[n].sp) = 0x00000006U + n*16; /* R6 */
*(--OSTask_List[n].sp) = 0x00000005U + n*16; /* R5 */
*(--OSTask_List[n].sp) = 0x00000004U + n*16; /* R4 */
}
// cannot create tasks out of order
void OSSchd(){
if (OS_curr == OS_tn-1)
OS_next = 0;
else
OS_next = OS_curr + 1;
sp_curr = OSTask_List[OS_curr].sp;
sp_next = OSTask_List[OS_next].sp;
*(volatile uint32_t *)0xE000ED04 = (1U << 28);
}
void PendSV_Handler(void) {
asm("PUSH {r4-r11}");
asm("LDR r3, =sp_curr");
asm("STR sp, [r3,#0x00]");
asm("LDR r3, =sp_next");
asm("LDR sp, [r3,#0x00]");
OS_curr = OS_next;
asm("POP {r4-r11}");
}
void SysTick_Handler(void) {
GPIOF->DATA = 0;
OSSchd();
}
void lightRed(void){
GPIOF->DATA_Bits[LED_RED] ^= LED_RED;
}
int main() {
OSInit();
OSCreateTask((void *)task_A);
OSCreateTask((void *)task_B);
OSCreateTask((void *)task_C);
__enable_interrupt();
while (1) {
}
}
void task_A() {
while (1) {
//lightRed();
GPIOF->DATA_Bits[LED_RED] ^= LED_RED;
}
}
void task_B() {
while (1) {
GPIOF->DATA_Bits[LED_BLUE] ^= LED_BLUE;
}
}
void task_C() {
while (1) {
GPIOF->DATA_Bits[LED_GREEN] ^= LED_GREEN;
}
}
However, when I try to change the code inside task_A by calling the function lightRed() as follow:
void lightRed(void){
GPIOF->DATA_Bits[LED_RED] ^= LED_RED;
}
...
void task_A() {
while (1) {
lightRed();
}
}
The three LEDs only blink for 2 cycles and no further response. I stop executing the code and the debugger shows the following problems:
: HardFault exception.
: The processor has escalated a configurable-priority exception to HardFault.
: An integrity check error has occurred on EXC_RETURN (CFSR.INVPC).
: Exception occured at PC = 0x7, LR = 0x1000000
: See the call stack for more information.
: The stack pointer for stack 'CSTACK' (currently 0x200000E0) is outside the stack range (0x20000330 to 0x20000B30)
Also, the call stack is as follow:
-> [__iar_zero_init3 + 0x39]
<Exception frame>
[__vector_table + 0x7]
How can I solve this problem?

This is, fittingly enough, definitely a case of stack overflow. The debug message you're getting is somewhat bogus; the valid stack range given (0x20000330 to 0x20000B30) is probably the configured range for the main stack, which you're not using (all of your tasks are using stacks from your OSStack array).
You've allocated 64 bytes (0x40) for each task's stack. The stack frame you've defined for a task that's suspended is already 64 bytes in size, and that's before it's done anything. If a task is running, and has used any stack space for anything whatsoever, then when it is suspended it will use an additional 64 bytes on top of whatever it is using at the time. This pretty much guarantees you a stack overflow on any nontrivial task, and in this case as soon as you introduce the function call into task_A() you'll be forcing it to use the stack.
The immediate fix is simple: just increase the value of the MAX_TASK_SIZE constant.
Incidentally, the Cortex-M4 has dual stack capability, so you can configure thread-mode code to use a different stack from handler-mode code. This can simplify analysis of stack usage and reduce the required size of task stacks, because interrupt service routines will not use the task stacks for their local storage. Your context switch can remain almost the same, but must read and write PSP to obtain and modify the task stack pointers rather than just using sp.

Using Cortex-M4 hardware support to compute CRC32

I'm working on code to compute CRC32 using the hardware CRC support that's built into the ARM Cortex-M4 processor. For reference, there's an application note that describes the hardware here:
http://www.st.com/st-web-ui/static/active/en/resource/technical/document/application_note/DM00068118.pdf
Basically, you write 32-bits of data at a time to a memory-mapped register (CRC_DR), and then you read the resulting CRC back from the same address. However, the CRC this produces is quite different than the standard result that the software CRC32 libraries produce. I finally found someone who had written code that manipulates the Cortex result to produce the "standard" result:
http://www.cnblogs.com/shangdawei/p/4603948.html
My code (shown below and adapted from the above solution) now produces the "standard" result, but I suspect there are more calls to function ReverseBits than are actually necessary. I'm hoping someone can tell me if it can be simplified.
Thanks!
Dan
#define RCC_BASE 0x40023800
#define RCC_AHB1ENR *((uint32_t *) (RCC_BASE + 0x30))
#define CRC_BASE 0x40023000
#define CRC_DR *((volatile uint32_t *) (CRC_BASE + 0x00))
#define CRC_IDR *((volatile uint32_t *) (CRC_BASE + 0x04))
#define CRC_CR *((volatile uint32_t *) (CRC_BASE + 0x08))
uint32_t ARMcrc32(void *data, uint32_t bytes)
{
uint32_t *p32 = data ;
uint32_t crc, crc_reg ;
RCC_AHB1ENR |= 1 << 12 ; // Enable CRC clock
CRC_CR |= 0x00000001 ; // Reset the CRC calculator
while (bytes >= 4)
{
CRC_DR = ReverseBits(*p32++) ;
bytes -= 4 ;
}
crc_reg = CRC_DR ;
crc = ReverseBits(crc_reg) ;
if (bytes > 0)
{
uint32_t bits = 8 * bytes ;
uint32_t xtra = 32 - bits ;
uint32_t mask = (1 << bits) - 1 ;
CRC_DR = crc_reg ;
CRC_DR = ReverseBits((*p32 & mask) ^ crc) >> xtra ;
crc = (crc >> bits) ^ ReverseBits(CRC_DR);
}
return ~crc ;
}

How jump to the ADDRESS in Keil on lpc4078?

I have bootloader, when good jobed in lpc2xxx. But, when I copy this to lpc4078, bootloader not jump to main programm.
I tried:
1) Use #define USER_FLASH_START 0x8000
__asm void boot_jump(uint32_t address)
{
LDR SP, [R0]
LDR PC, [R0, #4]
}
main()
{
bool t = true;
while(t)
{
...WORKING CODE...
uart << "Hello, World!!!";
}
uart << "END";
__disable_irq();
__set_CONTROL(0);
__set_MSP(stack_adr);
SCB->VTOR = (USER_FLASH_START & 0x1FFFFF80);
boot_jump(USER_FLASH_START);
}
2) Use #define USER_FLASH_START 0x8000
void JumpToAppAt(unsigned int * vtbp)
{
__disable_irq();
__set_MSP(vtbp[0]); // load SP
((void (*)(void)) vtbp[1])(); // go...
}
main()
{
bool t = true;
while(t)
{
...WORKING CODE...
uart << "Hello, World!!!";
}
uart << "END";
__disable_irq();
__set_CONTROL(0);
__set_MSP(stack_adr);
SCB->VTOR = (USER_FLASH_START & 0x1FFFFF80);
JumpToAppAt((unsigned int *) USER_FLASH_START);
}
Main programm not start(.
I thought what not work MAIN_PROGRAMM and tried:
3) Use #define USER_FLASH_START 0x0, but nothing has changed.

LPC2xxx are based on ARM7 while LPC4078 is based on Cortex-M4. Regarding the reset vector, I see that you have adjusted to using the 2nd entry of the vector table. Please make sure that the addresses stored in the vector table have their LSB set to 1, i.e. bit[0] = 1, because this bit indicates Thumb instructions, and the Cortex-M4 processor only supports Thumb instructions. (reference)

C Kernel - Runs fine in Qemu but not in VM

I am developing a kernel from scratch in C. I am having a problem with the keyboard. In Qemu, when I press a key on my keyboard, the keypress is handled normally. When I run it in VirtualBox or on an actual computer, it works fine until I press a key. In virtualbox, when it crashes, it gives me the error code VINF_EM_TRIPLE_FAULT. Here is my (important parts) of the code:
Assembly:
bits 32
section .text
;multiboot spec
align 4
dd 0x1BADB002 ;magic
dd 0x00 ;flags
dd - (0x1BADB002 + 0x00)
global start
global keyboard_handler
global read_port
global write_port
global load_idt
extern main ;this is defined in the c file
extern keyboard_handler_main
read_port:
mov edx, [esp + 4]
;al is the lower 8 bits of eax
in al, dx ;dx is the lower 16 bits of edx
ret
write_port:
mov edx, [esp + 4]
mov al, [esp + 4 + 4]
out dx, al
ret
load_idt:
mov edx, [esp + 4]
lidt [edx]
sti ;turn on interrupts
ret
keyboard_handler:
call keyboard_handler_main
iretd
start:
cli ;block interrupts
mov esp, stack_space
call main
hlt ;halt the CPU
section .bss
resb 8192; 8KB for stack
stack_space:
C (The important parts, I omitted parts that don't involve the keyboard):
struct IDT_entry{
unsigned short int offset_lowerbits;
unsigned short int selector;
unsigned char zero;
unsigned char type_attr;
unsigned short int offset_higherbits;
};
struct IDT_entry IDT[IDT_SIZE];
void idt_init(void)
{
unsigned long keyboard_address;
unsigned long idt_address;
unsigned long idt_ptr[2];
/* populate IDT entry of keyboard's interrupt */
keyboard_address = (unsigned long)keyboard_handler;
IDT[0x21].offset_lowerbits = keyboard_address & 0xffff;
IDT[0x21].selector = KERNEL_CODE_SEGMENT_OFFSET;
IDT[0x21].zero = 0;
IDT[0x21].type_attr = INTERRUPT_GATE;
IDT[0x21].offset_higherbits = (keyboard_address & 0xffff0000) >> 16;
/* Ports
* PIC1 PIC2
*Command 0x20 0xA0
*Data 0x21 0xA1
*/
/* ICW1 - begin initialization */
write_port(0x20 , 0x11);
write_port(0xA0 , 0x11);
/* ICW2 - remap offset address of IDT */
/*
* In x86 protected mode, we have to remap the PICs beyond 0x20 because
* Intel have designated the first 32 interrupts as "reserved" for cpu exceptions
*/
write_port(0x21 , 0x20);
write_port(0xA1 , 0x28);
/* ICW3 - setup cascading */
write_port(0x21 , 0x00);
write_port(0xA1 , 0x00);
/* ICW4 - environment info */
write_port(0x21 , 0x01);
write_port(0xA1 , 0x01);
/* Initialization finished */
/* mask interrupts */
write_port(0x21 , 0xff);
write_port(0xA1 , 0xff);
/* fill the IDT descriptor */
idt_address = (unsigned long)IDT ;
idt_ptr[0] = (sizeof (struct IDT_entry) * IDT_SIZE) + ((idt_address & 0xffff) << 16);
idt_ptr[1] = idt_address >> 16 ;
load_idt(idt_ptr);
}
void kb_init(void)
{
/* 0xFD is 11111101 - enables only IRQ1 (keyboard)*/
write_port(0x21 , 0xFD);
}
void keyboard_handler_main(void) {
unsigned char status;
char keycode;
/* write EOI */
write_port(0x20, 0x20);
status = read_port(KEYBOARD_STATUS_PORT);
/* Lowest bit of status will be set if buffer is not empty */
if (status & 0x01) {
keycode = read_port(KEYBOARD_DATA_PORT);
if(keycode < 0){
return;
}
char c = keyboard_map[(unsigned char) keycode]; //keyboard_map converts key codes to ASCII
(prints the char)
}
}
}
void main(void)
{
idt_init();
kb_init();
while(1);
print("Welcome to Code OS");
}
And then of course they are compiled with a linker.
So why is this working in qemu but not an actual computer? (On an actual computer it works fine until I press any key.)

How can I multiply 64 bit operands and get 128 bit result portably?

For x64 I can use this:
{
uint64_t hi, lo;
// hi,lo = 64bit x 64bit multiply of c[0] and b[0]
__asm__("mulq %3\n\t"
: "=d" (hi),
"=a" (lo)
: "%a" (c[0]),
"rm" (b[0])
: "cc" );
a[0] += hi;
a[1] += lo;
}
But I'd like to perform the same calculation portably. For instance to work on x86.

As I understand the question, you want a portable pure C implementation of 64 bit multiplication, with output to a 128 bit value, stored in two 64 bit values. In which case this article purports to have what you need. That code is written for C++. It doesn't take much to turn it into C code:
void mult64to128(uint64_t op1, uint64_t op2, uint64_t *hi, uint64_t *lo)
{
uint64_t u1 = (op1 & 0xffffffff);
uint64_t v1 = (op2 & 0xffffffff);
uint64_t t = (u1 * v1);
uint64_t w3 = (t & 0xffffffff);
uint64_t k = (t >> 32);
op1 >>= 32;
t = (op1 * v1) + k;
k = (t & 0xffffffff);
uint64_t w1 = (t >> 32);
op2 >>= 32;
t = (u1 * op2) + k;
k = (t >> 32);
*hi = (op1 * op2) + w1 + k;
*lo = (t << 32) + w3;
}

Since you have gcc as a tag, note that you can just use gcc's 128-bit integer type:
typedef unsigned __int128 uint128_t;
// ...
uint64_t x, y;
// ...
uint128_t result = (uint128_t)x * y;
uint64_t lo = result;
uint64_t hi = result >> 64;

The accepted solution isn't really the best solution, in my opinion.
It is confusing to read.
It has some funky carry handling.
It doesn't take advantage of the fact that 64-bit arithmetic may be available.
It displeases ARMv6, the God of Absolutely Ridiculous Multiplies. Whoever uses UMAAL shall not lag but have eternal 64-bit to 128-bit multiplies in 4 instructions.
Joking aside, it is much better to optimize for ARMv6 than any other platform because it will have the most benefit. x86 needs a complicated routine and it would be a dead end optimization.
The best way I have found (and used in xxHash3) is this, which takes advantage of multiple implementations using macros:
It is a tiny bit slower than mult64to128 on x86 (by 1-2 instructions), but a lot faster on ARMv6.
#include <stdint.h>
#ifdef _MSC_VER
# include <intrin.h>
#endif
/* Prevents a partial vectorization from GCC. */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
static uint64_t multiply64to128(uint64_t lhs, uint64_t rhs, uint64_t *high)
{
/*
* GCC and Clang usually provide __uint128_t on 64-bit targets,
* although Clang also defines it on WASM despite having to use
* builtins for most purposes - including multiplication.
*/
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
__uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
*high = (uint64_t)(product >> 64);
return (uint64_t)(product & 0xFFFFFFFFFFFFFFFF);
/* Use the _umul128 intrinsic on MSVC x64 to hint for mulq. */
#elif defined(_MSC_VER) && defined(_M_IX64)
# pragma intrinsic(_umul128)
/* This intentionally has the same signature. */
return _umul128(lhs, rhs, high);
#else
/*
* Fast yet simple grade school multiply that avoids
* 64-bit carries with the properties of multiplying by 11
* and takes advantage of UMAAL on ARMv6 to only need 4
* calculations.
*/
/* First calculate all of the cross products. */
uint64_t lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF);
uint64_t hi_lo = (lhs >> 32) * (rhs & 0xFFFFFFFF);
uint64_t lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32);
uint64_t hi_hi = (lhs >> 32) * (rhs >> 32);
/* Now add the products together. These will never overflow. */
uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
uint64_t upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
*high = upper;
return (cross << 32) | (lo_lo & 0xFFFFFFFF);
#endif /* portable */
}
On ARMv6, you can't get much better than this, at least on Clang:
multiply64to128:
push {r4, r5, r11, lr}
umull r12, r5, r2, r0
umull r2, r4, r2, r1
umaal r2, r5, r3, r0
umaal r4, r5, r3, r1
ldr r0, [sp, #16]
mov r1, r2
strd r4, r5, [r0]
mov r0, r12
pop {r4, r5, r11, pc}
The accepted solution generates a bunch of adds and adc, as well as an extra umull in Clang due to an instcombine bug.
I further explain the portable method in the link I posted.