Wrong FFT result when using rfft function on STM32F4 - arm

I'm trying to use rfft function in STM32F407G and I am getting wrong results when I compare it with an online fft calculator . I give a sample input as {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}. My results using the online calculator are as follows -
1) 528 + 0j
2) -16 + 162.450726j
3) -16 + 80.437432j
4) -16 + 52.744931j
5) -16 + 38.627417j
6) -16 + 29.933895j
7) -16 + 23.945692j
8) -16 + 19.496056j
9) -16 + 16j
10) -16 + 13.130861j
11) -16 + 10.690858j
12) -16 + 8.552178j
13) -16 + 6.627417j
14) -16 + 4.853547j
15) -16 + 3.182598j
16) -16 + 1.575862j
17) -16 + 0j
18) -16 - 1.575862j
19) -16 - 3.182598j
20) -16 - 4.853547j
21) -16 - 6.627417j
22) -16 - 8.552178j
23) -16 - 10.690858j
24) -16 - 13.130861j
25) -16 - 16j
26) -16 - 19.496056j
27) -16 - 23.945692j
28) -16 - 29.933895j
29) -16 - 38.627417j
30) -16 - 52.744931j
31) -16 - 80.437432j
32) -16 - 162.450726j
But the results using my code is completely different. I get the following result -
ffttestop[0] float32_t 528
ffttestop[1] float32_t -16
ffttestop[2] float32_t -19.6766739
ffttestop[3] float32_t 125.12075
ffttestop[4] float32_t -22.83638
ffttestop[5] float32_t 46.0686264
ffttestop[6] float32_t -24.1503487
ffttestop[7] float32_t 25.8768311
ffttestop[8] float32_t -25.7224121
ffttestop[9] float32_t 15.1554413
ffttestop[10] float32_t -28.2019958
ffttestop[11] float32_t 7.10556698
ffttestop[12] float32_t -32.8137932
ffttestop[13] float32_t -1.21792603
ffttestop[14] float32_t -43.8981056
ffttestop[15] float32_t -14.4978828
ffttestop[16] float32_t -96.4374313
ffttestop[17] float32_t -64.4374313
ffttestop[18] float32_t 26.5798321
ffttestop[19] float32_t 48.0752258
ffttestop[20] float32_t 6.12293482
ffttestop[21] float32_t 25.4729309
ffttestop[22] float32_t -0.353181839
ffttestop[23] float32_t 16.9155769
ffttestop[24] float32_t -3.09499454
ffttestop[25] float32_t 11.9728451
ffttestop[26] float32_t -3.78621292
ffttestop[27] float32_t 8.55855846
ffttestop[28] float32_t -1.21792793
ffttestop[29] float32_t 6.12293386
ffttestop[30] float32_t 29.4866867
ffttestop[31] float32_t 6.05591106
Here is my code -
#include "stm32f4xx.h"
#include "arm_math.h"
#include "arm_const_structs.h"
#include "core_cm4.h"
#define TEST_LENGTH_SAMPLES 32
/* -------------------------------------------------------------------
* Input and Output buffer Declarations for FFT
* ------------------------------------------------------------------- */
float32_t ffttestip[TEST_LENGTH_SAMPLES]=
{
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
};
static float32_t ffttestop[TEST_LENGTH_SAMPLES];
/* ------------------------------------------------------------------
* Global variables
* ------------------------------------------------------------------- */
uint16_t fftSize = 32;
uint8_t ifftFlag = 0;
uint32_t doBitReverse = 0;
int main(void)
{
//arm_cfft_instance_f32 * S;
//arm_cfft_f32(&arm_cfft_sR_f32_len16, ffttestip, ifftFlag, doBitReverse);
arm_rfft_fast_instance_f32 S;
arm_rfft_fast_init_f32 (&S,fftSize);
arm_rfft_fast_f32(&S,ffttestip,ffttestop,ifftFlag);
//arm_cmplx_mag_f32(ffttestop,ffttestip,fftSize);
while(1);
return 0;
}
/** \endlink */
What is the problem? I am unable to figure it out so it would be great if someone could help.

I think your bitReverse configuration is doing the opposite you would like.
try uint32_t doBitReverse = 1;
so the Bitreverse are activated (which means is not reversing the bits)
It seems confuse I know.

Related

Extraction of directory and page bits off a linear address

As many sources say that using ((1 << x) -1) << y isolates x bits
starting from y to y + x, and a linear address is built like this: 0:11 is the offset, 12:21 is the page and 22:31is the directory.
However, when I use the bit mask above and try to extract the directory for example, it doesn't seem to work at all.
/* unsigned address */
#define MASK_DIR ((1u << 9) - 1) << 22
int main(void)
{
unsigned int dir;
uint32_t addr = 0x12345678;
dir = MASK_DIR & addr;
printf("%#x", dir); // 0x12000000
}
Extracting the offset works well when using (1u << 11) - 1 with the same process as above.
As #Gerhardh corrected my false calculation in which I thought 0-11 is 11 bits even though it's 12. With that said I was able to isolate all the required bits from the 32-bit address, with the following macros
#define ADDR_PAGE(addr) ((((1u << 10) - 1) << 12) & addr)
#define ADDR_DIR(addr) ((((1u << 10) - 1) << 22) & addr)
#define ADDR_OFF(addr) (((1u << 12) - 1) & addr)
Output
0x678 - 0x345000 - 0x12000000 (offset, page, dir)

Convert Q18.2 to float

I'm trying to read the pressure value from MPL3115.
At chapter 14.3 the ds says:
The pressure data is stored as a 20-bit unsigned integer with a fractional part. The
OUT_P_MSB (01h), OUT_P_CSB (02h) and bits 7 to 6 of the OUT_P_LSB (03h)
registers contain the integer part in Pascals. Bits 5 to 4 of OUT_P_LSB contain the
fractional component. This value is representative as a Q18.2 fixed point format where
there are 18 integer bits and two fractional bits.
Here my (testing) code for Microchip XMEGA (GCC):
#define MPL3115_ADDRESS 0x60
#define MPL3115_REG_PRESSURE_DATA 0x01
bool _ReadRegister(uint8_t address, uint8_t *readBuf, uint8_t readCount)
{
TWI_MasterWriteRead(&twiMaster, MPL3115_ADDRESS, &address, 1, readCount);
while (twiMaster.status != TWIM_STATUS_READY);
memcpy(readBuf, (void *) twiMaster.readData, readCount);
return true;
}
bool MPL3115_ReadPressure(float *value) {
uint8_t raw[3];
_ReadRegister(MPL3115_REG_PRESSURE_DATA, raw, 3);
uint32_t data = (((uint32_t) raw[0]) << 16) | (((uint32_t) raw[1]) << 8) | raw[2];
uint32_t q18n2 = data >> 4;
*value = (double) q18n2 / 4;
return true;
}
I'm pretty sure the i2c line is working because I can successfully read the temperature from the same chip.
I configured it in barometer mode and 128 oversampling (CTRL_REG1 = 0x38).
In debug mode I read the following values:
raw0 = 0x18
raw1 = 0x25
raw2 = 0x70
data = 1582448
to obtain the pressure in Pascal I have to right shift data of 4 bits:
q18n2 = 98908
now to convert the Q18.2 to float I should multiply for 2^-n or divide for 4:
value = 24727 Pa
This should be in Pascal, so divide for 100 and get mBar = 247.27 mBar... it's unlikely I have such a pressure here! By the way now should be around 1008 mBar.
Are there any mistakes in my thoughts?
you must to right shift data of 6 bits and then add fractional part (2 bits * 0.25).
*value = (raw0 << 10) | (raw1 << 2) | (raw2 >> 6);
*value += 0.25 * ((raw2 >> 4) & 0x03);

uint64_t setting a range of bits to 1 [duplicate]

This question already has answers here:
bit shifting with unsigned long type produces wrong results
(4 answers)
Closed 5 years ago.
I am trying to create a method to change a range of bits to all 1 using a high and a low and a source. The code works from 0 to 30, then it outputs incorrect numbers. The correct result for setBits(0, 31, 0) should be ffffffff instead of 0.
What is causing my code to reset to zero?
setBits(0,0,0): 1
setBits(0,1,0): 3
setBits(0,2,0): 7
setBits(0,3,0): f
setBits(0,4,0): 1f
setBits(0,5,0): 3f
setBits(0,6,0): 7f
setBits(0,7,0): ff
setBits(0,8,0): 1ff
setBits(0,9,0): 3ff
setBits(0,10,0): 7ff
setBits(0,11,0): fff
setBits(0,12,0): 1fff
setBits(0,13,0): 3fff
setBits(0,14,0): 7fff
setBits(0,15,0): ffff
setBits(0,16,0): 1ffff
setBits(0,17,0): 3ffff
setBits(0,18,0): 7ffff
setBits(0,19,0): fffff
setBits(0,20,0): 1fffff
setBits(0,21,0): 3fffff
setBits(0,22,0): 7fffff
setBits(0,23,0): ffffff
setBits(0,24,0): 1ffffff
setBits(0,25,0): 3ffffff
setBits(0,26,0): 7ffffff
setBits(0,27,0): fffffff
setBits(0,28,0): 1fffffff
setBits(0,29,0): 3fffffff
setBits(0,30,0): 7fffffff
setBits(0,31,0): 0
uint64_t setBits(unsigned low, unsigned high, uint64_t source)
{
assert(high < 64 && (low <= high));
uint64_t mask;
mask = ((1 << (high-low + 1))-1) << low;
uint64_t extracted = mask | source;
return extracted;
}
You need to make the initial bit into the type unsigned long long (or uint64_t) so that it doesn't overflow when bitshifted.
mask = ((1ULL << (high - low + 1)) - 1) << low;
^^^
For number 1 of int type, it'll overflow when leftshifted for 32 bits:
((1 << (high-low + 1))-1) // Where (high-low + 1) == 31 - 0 + 1 == 32
^
00000000 00000000 00000000 00000001 = 1
v <-- Left shift for 32 bits --------<
(1) 00000000 00000000 00000000 00000000 = 0
But that would work for a 64-bit integer type. So change it to 1ULL and the problem is gone.
unsigned is unsigned int, so a 32 bits value, as well as constant 1 which is a signed int, so when you're shifting 1 << (high-low + 1) you're doing it on 32 bits integers.
Use ull to transform all your constants to unsigned 64 bits int during the shifts.
mask = ((1ull << (high-low + 1ull))-1ull) << low

Using Cortex-M4 hardware support to compute CRC32

I'm working on code to compute CRC32 using the hardware CRC support that's built into the ARM Cortex-M4 processor. For reference, there's an application note that describes the hardware here:
http://www.st.com/st-web-ui/static/active/en/resource/technical/document/application_note/DM00068118.pdf
Basically, you write 32-bits of data at a time to a memory-mapped register (CRC_DR), and then you read the resulting CRC back from the same address. However, the CRC this produces is quite different than the standard result that the software CRC32 libraries produce. I finally found someone who had written code that manipulates the Cortex result to produce the "standard" result:
http://www.cnblogs.com/shangdawei/p/4603948.html
My code (shown below and adapted from the above solution) now produces the "standard" result, but I suspect there are more calls to function ReverseBits than are actually necessary. I'm hoping someone can tell me if it can be simplified.
Thanks!
Dan
#define RCC_BASE 0x40023800
#define RCC_AHB1ENR *((uint32_t *) (RCC_BASE + 0x30))
#define CRC_BASE 0x40023000
#define CRC_DR *((volatile uint32_t *) (CRC_BASE + 0x00))
#define CRC_IDR *((volatile uint32_t *) (CRC_BASE + 0x04))
#define CRC_CR *((volatile uint32_t *) (CRC_BASE + 0x08))
uint32_t ARMcrc32(void *data, uint32_t bytes)
{
uint32_t *p32 = data ;
uint32_t crc, crc_reg ;
RCC_AHB1ENR |= 1 << 12 ; // Enable CRC clock
CRC_CR |= 0x00000001 ; // Reset the CRC calculator
while (bytes >= 4)
{
CRC_DR = ReverseBits(*p32++) ;
bytes -= 4 ;
}
crc_reg = CRC_DR ;
crc = ReverseBits(crc_reg) ;
if (bytes > 0)
{
uint32_t bits = 8 * bytes ;
uint32_t xtra = 32 - bits ;
uint32_t mask = (1 << bits) - 1 ;
CRC_DR = crc_reg ;
CRC_DR = ReverseBits((*p32 & mask) ^ crc) >> xtra ;
crc = (crc >> bits) ^ ReverseBits(CRC_DR);
}
return ~crc ;
}

How can I multiply 64 bit operands and get 128 bit result portably?

For x64 I can use this:
{
uint64_t hi, lo;
// hi,lo = 64bit x 64bit multiply of c[0] and b[0]
__asm__("mulq %3\n\t"
: "=d" (hi),
"=a" (lo)
: "%a" (c[0]),
"rm" (b[0])
: "cc" );
a[0] += hi;
a[1] += lo;
}
But I'd like to perform the same calculation portably. For instance to work on x86.
As I understand the question, you want a portable pure C implementation of 64 bit multiplication, with output to a 128 bit value, stored in two 64 bit values. In which case this article purports to have what you need. That code is written for C++. It doesn't take much to turn it into C code:
void mult64to128(uint64_t op1, uint64_t op2, uint64_t *hi, uint64_t *lo)
{
uint64_t u1 = (op1 & 0xffffffff);
uint64_t v1 = (op2 & 0xffffffff);
uint64_t t = (u1 * v1);
uint64_t w3 = (t & 0xffffffff);
uint64_t k = (t >> 32);
op1 >>= 32;
t = (op1 * v1) + k;
k = (t & 0xffffffff);
uint64_t w1 = (t >> 32);
op2 >>= 32;
t = (u1 * op2) + k;
k = (t >> 32);
*hi = (op1 * op2) + w1 + k;
*lo = (t << 32) + w3;
}
Since you have gcc as a tag, note that you can just use gcc's 128-bit integer type:
typedef unsigned __int128 uint128_t;
// ...
uint64_t x, y;
// ...
uint128_t result = (uint128_t)x * y;
uint64_t lo = result;
uint64_t hi = result >> 64;
The accepted solution isn't really the best solution, in my opinion.
It is confusing to read.
It has some funky carry handling.
It doesn't take advantage of the fact that 64-bit arithmetic may be available.
It displeases ARMv6, the God of Absolutely Ridiculous Multiplies. Whoever uses UMAAL shall not lag but have eternal 64-bit to 128-bit multiplies in 4 instructions.
Joking aside, it is much better to optimize for ARMv6 than any other platform because it will have the most benefit. x86 needs a complicated routine and it would be a dead end optimization.
The best way I have found (and used in xxHash3) is this, which takes advantage of multiple implementations using macros:
It is a tiny bit slower than mult64to128 on x86 (by 1-2 instructions), but a lot faster on ARMv6.
#include <stdint.h>
#ifdef _MSC_VER
# include <intrin.h>
#endif
/* Prevents a partial vectorization from GCC. */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
static uint64_t multiply64to128(uint64_t lhs, uint64_t rhs, uint64_t *high)
{
/*
* GCC and Clang usually provide __uint128_t on 64-bit targets,
* although Clang also defines it on WASM despite having to use
* builtins for most purposes - including multiplication.
*/
#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
__uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
*high = (uint64_t)(product >> 64);
return (uint64_t)(product & 0xFFFFFFFFFFFFFFFF);
/* Use the _umul128 intrinsic on MSVC x64 to hint for mulq. */
#elif defined(_MSC_VER) && defined(_M_IX64)
# pragma intrinsic(_umul128)
/* This intentionally has the same signature. */
return _umul128(lhs, rhs, high);
#else
/*
* Fast yet simple grade school multiply that avoids
* 64-bit carries with the properties of multiplying by 11
* and takes advantage of UMAAL on ARMv6 to only need 4
* calculations.
*/
/* First calculate all of the cross products. */
uint64_t lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF);
uint64_t hi_lo = (lhs >> 32) * (rhs & 0xFFFFFFFF);
uint64_t lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32);
uint64_t hi_hi = (lhs >> 32) * (rhs >> 32);
/* Now add the products together. These will never overflow. */
uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
uint64_t upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
*high = upper;
return (cross << 32) | (lo_lo & 0xFFFFFFFF);
#endif /* portable */
}
On ARMv6, you can't get much better than this, at least on Clang:
multiply64to128:
push {r4, r5, r11, lr}
umull r12, r5, r2, r0
umull r2, r4, r2, r1
umaal r2, r5, r3, r0
umaal r4, r5, r3, r1
ldr r0, [sp, #16]
mov r1, r2
strd r4, r5, [r0]
mov r0, r12
pop {r4, r5, r11, pc}
The accepted solution generates a bunch of adds and adc, as well as an extra umull in Clang due to an instcombine bug.
I further explain the portable method in the link I posted.

Resources