Mov value on floating point register to general register - c

So I am using the instruction "fmov x0,d3" to move the value from d3 to x0, but for some reason the value on x0 remains unchanged.
Can someone, please, tell me how to properly move a value from a floating point register to a general register?
This is my code:
// Loads the immediate 6 into x0 (the return register).
// NOTE(review): this instruction sits before the Fadd_test_64 label, so it only
// runs if execution falls into the label from here; the text says the intent was 0.
mov x0,#6 //x0 = 6
// Adds 7.0 and 2.0 in double precision and compares the sum against 14.0.
Fadd_test_64:
fmov d1, #7.0 //d1 = 7.0 (first operand)
fmov d2, #2.0 //d2 = 2.0 (second operand)
// NOTE(review): 7.0 + 2.0 is 9.0, so comparing against 14.0 always takes the error path.
fmov d4, #14.0 //d4 = value the sum is compared against
fadd d3,d1,d2 //d3=d1+d2
fcmp d3, d4 //Compares d3 to d4
b.ne Fadd_error_64 //if they are different, go to Fadd_error_64
b Fadd_end_64 //if not, go to Fadd_end_64
Fadd_error_64:
// NOTE(review): fmov between a d register and an x register copies the raw 64-bit
// pattern, it does not convert to an integer (fcvtzs does that). The pattern of 9.0
// is 0x4022000000000000, so a caller that keeps only the low 16 bits would see 0.
fmov x0,d3 //x0 = bit pattern of d3
Fadd_end_64:
ret
//Functions declaration
void mathTest_64bits(void);
UINT16 FAddition_64();
// Program entry point: runs the 64-bit floating-point addition test, then returns 0.
int main(void)
{
mathTest_64bits();
return 0;
}
// C-side driver for the assembly test: calls FAddition_64() and prints the outcome.
// A zero result is reported as success; any other value is printed as the failure code.
void mathTest_64bits(void)
{
    const int status = FAddition_64();

    if (status == 0)
    {
        Print(L"64-bits Floating Point Addition test: Success ");
    }
    else
    {
        Print(L"64-bits Floating Point Addition test: Failed: %d", status);
    }

    // Terminate the report line in both cases.
    Print(L"\n");
}

Related

ARMCC 5 optimization of strtol and strtod

I have a board based on STM32L4 MCU (Ultra Low Power Cortex-M4) for GNSS tracking purposes. I don't use RTOS, so I use a custom scheduler. Compiler and environment is KEIL uVision 5 (compiler 5.05 and 5.06, behavior doesn't change)
The MCU speaks with GNSS module via plain UART and the protocol is a mix of NMEA and AT. GNSS position is given as plain text that must be converted to a pair of float/double coordinates.
To get the double/float value from text, I use strtod (or strtof).
Note that string operations are made in a separate buffer, different from the UART RX one.
The typical string for a latitude on the UART is
4256.45783
which means 42° 56.45783'
to get absolute position in degrees, I use the following formula
42 + 56.45783 / 60
When there is no optimization the code works fine and the position is converted right. When I turn on level 1 optimization (or higher), if I use standard C library I can convert the integer part (42 in the example) and when it comes to convert 56.45783, I get only 56 (so the integer part of minutes until the dot).
If I get rid of standard library and I use a custom strtod function downloaded from ANSI C source library I simply get 0 with ERANGE error.
In other parts of the code I use strtol, which has a strange behavior when L1 optimization is turned ON: when the first digit is 9 and conversion base is 10 it simply skips that 9 going on with the other digits.
So if in the buffer I have 92, I will get just 2 parsed. To get rid of this I simply prepended a sign + to the number and the result is always OK (as far as I can tell). This WA doesn't work with strtod.
Note that I tried to use static, volatile and on-stack variables, behavior doesn't change.
EDIT: I simplified the code in order to get where it goes wrong, as per comments hereafter
C code is like this:
/* Reduced test case: parses the fixed minutes string "56.45783", converts it to
 * degrees (minutes / 60 + 42) and stores the result as a float in struc->Axis.
 * The str argument is not used by this reduced version.
 * Nothing is stored when struc is NULL. */
void GnssStringToLatLonDegMin(const char* str, LatLong_t* struc)
{
    /* The conversion is performed even when struc is NULL, as before. */
    const double minutes = strtod("56.45783", NULL);

    if (struc == NULL)
    {
        return;
    }

    struc->Axis = (float)((minutes / 60.0) + 42.0);
}
Level 0 optimization:
559: void GnssStringToLatLonDegMin(const char* str, LatLong_t* struc)
0x08011FEE BDF8 POP {r3-r7,pc}
560: {
0x08011FF0 B570 PUSH {r4-r6,lr}
0x08011FF2 4605 MOV r5,r0
0x08011FF4 ED2D8B06 VPUSH.64 {d8-d10}
0x08011FF8 460C MOV r4,r1
561: double dbl = 0.0;
0x08011FFA ED9F0BF8 VLDR d0,[pc,#0x3E0]
0x08011FFE EEB08A40 VMOV.F32 s16,s0
0x08012002 EEF08A60 VMOV.F32 s17,s1
562: dbl = strtod("56.45783",NULL);
0x08012006 2100 MOVS r1,#0x00
0x08012008 A0F6 ADR r0,{pc}+4 ; #0x080123E4
0x0801200A F7FDFED1 BL.W __hardfp_strtod (0x0800FDB0)
0x0801200E EEB08A40 VMOV.F32 s16,s0
0x08012012 EEF08A60 VMOV.F32 s17,s1
563: if(struc != NULL)
564: {
0x08012016 B1A4 CBZ r4,0x08012042
565: struc->Axis = (float)((dbl / 60.0) + 42.0);
566: }
0x08012018 ED9F0BF5 VLDR d0,[pc,#0x3D4]
0x0801201C EC510B18 VMOV r0,r1,d8
0x08012020 EC532B10 VMOV r2,r3,d0
0x08012024 F7FEF880 BL.W __aeabi_ddiv (0x08010128)
0x08012028 EC410B1A VMOV d10,r0,r1
0x0801202C ED9F0BF2 VLDR d0,[pc,#0x3C8]
0x08012030 EC532B10 VMOV r2,r3,d0
0x08012034 F7FDFFBC BL.W __aeabi_dadd (0x0800FFB0)
0x08012038 EC410B19 VMOV d9,r0,r1
0x0801203C F7FDFF86 BL.W __aeabi_d2f (0x0800FF4C)
0x08012040 6020 STR r0,[r4,#0x00]
567: }
LEVEL 1 optimization
557: void GnssStringToLatLonDegMin(const char* str, LatLong_t* struc)
0x08011FEE BDF8 POP {r3-r7,pc}
558: {
559: double dbl = 0.0;
0x08011FF0 B510 PUSH {r4,lr}
0x08011FF2 460C MOV r4,r1
560: dbl = strtod("56.45783",NULL);
0x08011FF4 2100 MOVS r1,#0x00
0x08011FF6 A0F7 ADR r0,{pc}+2 ; #0x080123D4
0x08011FF8 F7FDFEDA BL.W __hardfp_strtod (0x0800FDB0)
561: if(struc != NULL)
562: {
0x08011FFC 2C00 CMP r4,#0x00
0x08011FFE D010 BEQ 0x08012022
563: struc->Axis = (float)((dbl / 60.0) + 42.0);
564: }
0x08012000 ED9F1BF7 VLDR d1,[pc,#0x3DC]
0x08012004 EC510B10 VMOV r0,r1,d0
0x08012008 EC532B11 VMOV r2,r3,d1
0x0801200C F7FEF88C BL.W __aeabi_ddiv (0x08010128)
0x08012010 ED9F1BF5 VLDR d1,[pc,#0x3D4]
0x08012014 EC532B11 VMOV r2,r3,d1
0x08012018 F7FDFFCA BL.W __aeabi_dadd (0x0800FFB0)
0x0801201C F7FDFF96 BL.W __aeabi_d2f (0x0800FF4C)
0x08012020 6020 STR r0,[r4,#0x00]
565: }
I looked at the disassembly of __hardfp_strtod and __strtod_int called by these functions and, as they are incorporated as binaries, they don't change with respect of optimization level.
Due to optimization, strtod didn't work.
Thanks to #old_timer, I had to make my own strtod function, which works even with optimization level set at level 2.
/*
 * Minimal decimal string to double conversion.
 *
 * Accepts an optional leading '+' or '-', a run of decimal digits, and an
 * optional '.' followed by more digits ("4256.45783", "92", ".5", "-1.25").
 * Parsing stops at the first character that does not fit this pattern.
 * No exponent, whitespace skipping or locale handling is performed.
 *
 * At most MAX_FRACTION_DIGITS digits after the point are used; further
 * digits are ignored.
 *
 * Returns the parsed value, or 0.0 when str is NULL or holds no digits.
 */
double simple_strtod(const char* str)
{
    enum { MAX_FRACTION_DIGITS = 8 };
    double whole = 0.0;
    double fraction = 0.0;
    double scale = 1.0;
    double value;
    int negative = 0;
    int used = 0;

    if (str == NULL)
    {
        return 0.0;
    }

    if (*str == '+' || *str == '-')
    {
        negative = (*str == '-');
        str++;
    }

    /* Integer part: accumulate left to right, no pow() and no backwards scan,
     * so input without a '.' or with a leading '.' is handled safely. */
    while (*str >= '0' && *str <= '9')
    {
        whole = whole * 10.0 + (double)(*str - '0');
        str++;
    }

    if (*str == '.')
    {
        str++;
        /* Fraction is gathered as an integer and divided once at the end,
         * which keeps every intermediate value exactly representable. */
        while (*str >= '0' && *str <= '9' && used < MAX_FRACTION_DIGITS)
        {
            fraction = fraction * 10.0 + (double)(*str - '0');
            scale *= 10.0;
            str++;
            used++;
        }
    }

    value = whole + fraction / scale;
    return negative ? -value : value;
}
It can be further optimized by not calling 'pow' and use something more clever, but just like this it works perfectly.

Divide and store quotient and remainder in different arrays

The standard div() function returns its result as a div_t struct, for example:
/* div example */
#include <stdio.h> /* printf */
#include <stdlib.h> /* div, div_t */
int main ()
{
    /* div() yields the quotient and the remainder of 38 / 5 in one struct. */
    const div_t parts = div(38, 5);

    printf ("38 div 5 => %d, remainder %d.\n", parts.quot, parts.rem);
    return 0;
}
My case is a bit different; I have this
#define NUM_ELTS 21433
int main ()
{
    /* One quotient slot and one remainder slot per element. */
    unsigned int quotients[NUM_ELTS];
    unsigned int remainders[NUM_ELTS];
    int i;

    /* Each call receives the addresses of the matching slots in both arrays.
     * The names now match the declarations above (the earlier spelling
     * referred to arrays that were never declared). */
    for(i=0;i<NUM_ELTS;i++) {
        divide_single_instruction(&quotients[i],&remainders[i]);
    }
    return 0;
}
I know that the assembly-language division instruction produces both results in a single instruction, so I need to do the same here to save CPU cycles: basically, move the quotient from EAX and the remainder from EDX into the memory locations where my arrays are stored. How can this be done without including asm {} or SSE intrinsics in my C code? It has to be portable.
Since you're writing to the arrays in-place (replacing numerator and denominator with quotient and remainder) you should store the results to temporary variables before writing to the arrays.
/* In-place element-wise division: for each of the first n elements,
 * num[k] becomes the quotient and den[k] the remainder of num[k] / den[k].
 * Both inputs are read before either is overwritten. */
void foo (unsigned *num, unsigned *den, int n) {
    int k = 0;

    while (k < n) {
        const unsigned dividend = num[k];
        const unsigned divisor = den[k];

        num[k] = dividend / divisor;
        den[k] = dividend % divisor;
        ++k;
    }
}
produces this main loop assembly
.L5:
movl (%rdi,%rcx,4), %eax
xorl %edx, %edx
divl (%rsi,%rcx,4)
movl %eax, (%rdi,%rcx,4)
movl %edx, (%rsi,%rcx,4)
addq $1, %rcx
cmpl %ecx, %r8d
jg .L5
There are some more complicated cases where it helps to save the quotient and remainder when they are first used. For example in testing for primes by trial division you often see a loop like this
for (p = 3; p <= n/p; p += 2)
if (!(n % p)) return 0;
It turns out that GCC does not use the remainder from the first division and therefore it does the division instruction twice which is unnecessary. To fix this you can save the remainder when the first division is done like this:
for (p = 3, q=n/p, r=n%p; p <= q; p += 2, q = n/p, r=n%p)
if (!r) return 0;
This speeds up the result by a factor of two.
So in general GCC does a good job particularly if you save the quotient and remainder when they are first calculated.
The general rule here is to trust your compiler to do something fast. You can always disassemble the code and check that the compiler is doing something sane. It's important to realise that a good compiler knows a lot about the machine, often more than you or me.
Also let's assume you have a good reason for needing to "count cycles".
For your example code I agree that the x86 "idiv" instruction is the obvious choice. Let's see what my compiler (MS visual C 2013) will do if I just write out the most naive code I can
/* Quotient/remainder pair produced by divrem(). */
struct divresult {
    int quot;
    int rem;
};

/* Returns num / den and num % den together in one struct. */
struct divresult divrem(int num, int den)
{
    struct divresult out;

    out.quot = num / den;
    out.rem = num % den;
    return out;
}
// Calls divrem() with the compile-time constants 5 and 2 and prints both results.
int main()
{
struct divresult res = divrem(5, 2);
printf("%d, %d", res.quot, res.rem);
}
And the compiler gives us:
struct divresult res = divrem(5, 2);
printf("%d, %d", res.quot, res.rem);
01121000 push 1
01121002 push 2
01121004 push 1123018h
01121009 call dword ptr ds:[1122090h] ;;; this is printf()
Wow, I was outsmarted by the compiler. Visual C knows how division works so it just precalculated the result and inserted constants. It didn't even bother to include my function in the final code. We have to read in the integers from console to force it to actually do the calculation:
// Same experiment, but the operands are read at run time so the compiler
// cannot precompute the division.
int main()
{
int num, den;
// NOTE(review): scanf's return value is not checked, so num and den stay
// uninitialized if parsing fails; den == 0 would also divide by zero.
scanf("%d, %d", &num, &den);
struct divresult res = divrem(num, den);
printf("%d, %d", res.quot, res.rem);
}
Now we get:
struct divresult res = divrem(num, den);
01071023 mov eax,dword ptr [num]
01071026 cdq
01071027 idiv eax,dword ptr [den]
printf("%d, %d", res.quot, res.rem);
0107102A push edx
0107102B push eax
0107102C push 1073020h
01071031 call dword ptr ds:[1072090h] ;;; printf()
So you see, the compiler (or this compiler at least) already does what you want, or something even more clever.
From this we learn to trust the compiler and only second-guess it when we know it isn't doing a good enough job already.

Strange behavior of asm block in c code

I'm trying to create a little example of 'How to use asm block in C code'.
In my example, i'm trying to increment a value of variable which I created in my C code.
This is my code:
// Demonstration from the question: prints i and ptr1, runs an _asm block that is
// meant to increment i, then prints both again.
int main()
{
unsigned int i = 0;
unsigned int *ptr1;
// Get the address of the variable i.
ptr1 = &i;
// Print i and the address held in ptr1 before the asm block runs.
// NOTE(review): ptr1 is printed with %d; %p is the matching specifier for a pointer.
printf_s("Value before '_asm' block:");
printf_s("\ni = %d (Address = ptr1: %d)\n\n", i, ptr1);
_asm {
// Loads 16 bits of the variable ptr1 itself (the stored address), not the value
// of i: the operand names ptr1, it does not dereference it.
mov bx, word ptr [ptr1]
// Increments that copied piece of the address; i is not touched.
inc bx
// Writes the incremented bits back into ptr1, which is why the printed address
// changes by one while i stays 0.
mov word ptr [ptr1], bx
}
// Print i and the address held in ptr1 after the asm block.
printf_s("Value after '_asm' block:");
printf_s("\ni = %d (Address = ptr1: %d)\n\n", i, ptr1);
// Force the console to stay open.
getchar();
return 0;
}
This is the result of the code in the console:
Values before '_asm' block:
i = 0 (Address = ptr1: 1441144)
Values after '_asm' block:
i = 0 (Address = ptr1: 1441145)
This is very weird. I only want to update the value of the 'i' variable, but it doesn't work.
In addition, the pointer 'ptr1' now points to the next memory block...
Why is this happening ? And how should I solve this problem?
EDIT:
Thanks to the comments below, I solved the problem.
The main change is in this line:
// Increment the value of i.
inc bx
Due to the fact that we want to increment the VALUE of the variable 'i', we should use brackets.
In addition, the bx register should be changed now to 'ebx', that is a 32-bit register.
Because of using the 'ebx' register, the expression 'word ptr' should be replaced with 'dword ptr'.
The code of the asm block, after the editings:
_asm {
// Load the address stored in ptr1 (the address of i) into ebx.
mov ebx, dword ptr [ptr1]
// Increment the 32-bit value that ebx points to, i.e. i itself.
// The operand size is spelled out because a bare [ebx] does not say how many bytes to update.
inc dword ptr [ebx]
// ptr1 is left alone: the increment already happened in memory, and ebx still
// holds the same address that was read from ptr1.
}

Last stretch of rounding function in ASM

What I essentially have to do is make what is in Main work.
I'm on my last stretch of this assignment (which will likely take just as long as it did for me to get here) I'm having trouble figuring out how to pass the roundingMode that is passed to roundD and using it in ASM.
Also, there is a block of just comments, as far as I can tell, that's all I have left to do. does that sound right?
#include <stdio.h>
#include <stdlib.h>
#define PRECISION 3
#define RND_CTL_BIT_SHIFT 10
// floating point rounding modes: IA-32 Manual, Vol. 1, p. 4-20
typedef enum {
ROUND_NEAREST_EVEN = 0 << RND_CTL_BIT_SHIFT,
ROUND_MINUS_INF = 1 << RND_CTL_BIT_SHIFT,
ROUND_PLUS_INF = 2 << RND_CTL_BIT_SHIFT,
ROUND_TOWARD_ZERO = 3 << RND_CTL_BIT_SHIFT
} RoundingMode;
double roundD(double n, RoundingMode roundingMode)
{
// do not change anything above this comment
    // Rounds n to an integral value using the x87 FPU under the requested
    // rounding mode, then restores the caller's FPU control word.
    //   n            - value to round
    //   roundingMode - one of the RoundingMode constants; each is already
    //                  shifted into the rounding-control field of the control word
    // Returns the rounded value as a double.
    // NOTE(review): rounding to an integer (frndint) is assumed to be the intended
    // "rounding calculation"; confirm against the assignment text.

    // The rounding-control field is the two bits starting at RND_CTL_BIT_SHIFT.
    const unsigned short rcMask = 3 << RND_CTL_BIT_SHIFT;
    unsigned short oldCW = 0;   // control word on entry, restored before returning
    unsigned short newCW = 0;   // oldCW with only the rounding field replaced
    double result = n;

    // __asm__ is used instead of asm so the code also builds in strict ISO modes.
    __asm__ volatile ("fstcw %[cw]" : [cw] "=m" (oldCW));

    // Keep every other control bit (precision, exception masks) as it was.
    newCW = (unsigned short)((oldCW & ~rcMask) | ((unsigned)roundingMode & rcMask));

    // "+t" asks the compiler to place result on top of the x87 stack before the
    // asm and to read it back afterwards, so no manual load/store or push/pop of
    // general registers is needed.
    __asm__ volatile (
        "fldcw %[newCW] \n\t"   // switch to the requested rounding mode
        "frndint \n\t"          // round st(0) to an integer in that mode
        "fldcw %[oldCW] \n\t"   // restore the original control word
        : "+t" (result)
        : [newCW] "m" (newCW), [oldCW] "m" (oldCW));

    return result;
// do not change anything below this comment, except for printing out your name
}
// Test driver: takes the value to round from the first command-line argument
// (0.0 when none is given) and prints roundD's result for each rounding mode,
// using PRECISION digits after the decimal point.
int main(int argc, char **argv)
{
double n = 0.0;
// NOTE(review): atof gives no indication of a malformed argument.
if (argc > 1)
n = atof(argv[1]);
printf("roundD even %.*f = %.*f\n",
PRECISION, n, PRECISION, roundD(n, ROUND_NEAREST_EVEN));
printf("roundD down %.*f = %.*f\n",
PRECISION, n, PRECISION, roundD(n, ROUND_MINUS_INF));
printf("roundD up %.*f = %.*f\n",
PRECISION, n, PRECISION, roundD(n, ROUND_PLUS_INF));
printf("roundD zero %.*f = %.*f\n",
PRECISION, n, PRECISION, roundD(n, ROUND_TOWARD_ZERO));
return 0;
}
While C might like to pretend that enum is not just an integer, it is just an integer. If you can't use roundingMode directly in the assembly, create an integer local variable and set it equal to the roundingMode parameter.
I'm just offering this as a suggestion to you. I've never used inline assembly before and I've never used x86 assembly before, but if all you need to do is reference the parameter, what I said above should work.

Conversion short to int and sum with NEON

I want to convert the next function to NEON:
/* Scalar reference: dot product of two 4-element byte vectors, returned as int. */
int dot4_c(unsigned char v0[4], unsigned char v1[4]){
    int sum = 0;
    int lane;

    for (lane = 0; lane < 4; ++lane) {
        sum += v0[lane] * v1[lane];
    }
    return sum;
}
I think I am almost there, but there is an error somewhere because it does not give the right result:
// Attempted NEON version of dot4_c, from the question (reported as not working).
// NOTE(review): the function is declared to return int but has no return
// statement, and the asm statement has no output operand, so the value left in
// d0 never reaches the caller.
// NOTE(review): the asm writes d0 and d2-d5 but lists no clobbers.
int dot4_neon_hfp(unsigned char v0[4], unsigned char v1[4])
{
asm volatile (
// NOTE(review): each load names two d registers, which looks like far more than
// the 4 bytes the parameters are declared to hold - confirm the access is in bounds.
"vld1.16 {d2, d3}, [%0] \n\t" //d2, d3 = data loaded from v0
"vld1.16 {d4, d5}, [%1] \n\t" //d4, d5 = data loaded from v1
// NOTE(review): the inputs are 8-bit values, but the load and conversion steps
// treat them as 16-bit elements; verify that vcvt.32.u16 is an accepted form at all.
"vcvt.32.u16 d2, d2 \n\t" //conversion
"vcvt.32.u16 d3, d3 \n\t"
"vcvt.32.u16 d4, d4 \n\t"
"vcvt.32.u16 d5, d5 \n\t"
"vmul.32 d0, d2, d4 \n\t" //d0= d2*d4
"vmla.32 d0, d3, d5 \n\t" //d0 = d0 + d3*d5
"vpadd.32 d0, d0 \n\t" //d0 = d[0] + d[1]
:: "r"(v0), "r"(v1) :
);
}
How can I get this working?
As mentioned, you must load at least 8 bytes at a time with NEON. As long as the load doesn't go past the end of your buffer, you can ignore the extra bytes. Here is how to do it with intrinsics:
uint8x8_t v0_vec, v1_vec;
uint16x8_t vproduct;
uint32x2_t vsum32;
v0_vec = vld1_u8(v0); // loads 8 bytes; only the first 4 are used, the rest must merely be safe to read
v1_vec = vld1_u8(v1);
// the product of two 8-bit values can need 16 bits, so use the widening multiply
vproduct = vmull_u8(v0_vec, v1_vec);
vsum32 = vpaddl_u16(vget_low_u16(vproduct)); // pairwise add lower half (first 4 u16's) into two u32 lanes
// uint32x2_t is a vector type without a .val member, so the lanes are read with vget_lane_u32
return (int)(vget_lane_u32(vsum32, 0) + vget_lane_u32(vsum32, 1));
If you absolutely can't load 8 bytes from your source pointers, you can manually load a 32-bit value into a NEON register (the 4 bytes) and then cast it to the proper intrinsic type.

Resources