Related
I have a system where I have to implement to second part. The issue is that I received a float where the bytes are mixed. In the example below, the input is 1E9 (f_orig) and what is receive is this 2.034699e+26 (f_recv). I did the function ABCD_to_CDAB_float but I find it ugly. Is there a better way to to write ABCD_to_CDAB_float(without two temporary variables will already be nice) ?
#include <stdio.h>
#include <float.h>
#include <string.h>
#include <stdint.h>
/* Change float byte order */
float ABCD_to_CDAB_float(float infloat) {
float outfloat;
uint8_t tmp[4];
memcpy(&tmp, &infloat, 4);
uint8_t tmp2[] = {tmp[2], tmp[3], tmp[0], tmp[1]};
memcpy(&outfloat, &tmp2, 4);
return outfloat;
}
int main() {
float f_orig = 1E9; // This is the value sent
printf("f_orig\t%e\n", f_orig);
char str_orig[4];
memcpy(&str_orig, &f_orig, 4);
printf("str_orig\t%x %x %x %x\n", str_orig[0], str_orig[1], str_orig[2], str_orig[3]);
float f_built; // same as f_orig
char str_built[] = {0x28, 0x6B, 0x6E, 0x4E};
memcpy(&f_built, &str_built, 4);
printf("f_built\t%e\n", f_built); // it prints "1E9"
float f_recv; // received float
char str_recv[] = {0x6E, 0x4E, 0x28, 0x6B};
memcpy(&f_recv, &str_recv, 4);
printf("f_recv\t%e\n", f_recv); // converesion was wrong somewhere
char str6[] = {str_recv[2], str_recv[3], str_recv[0], str_recv[1]};
float f_res;
memcpy(&f_res, &str6, 4);
printf("f_res\t%e\n", f_res); // result is fine
printf("%e\n",ABCD_to_CDAB_float(f_recv)); // result is right
return 0;
}
Firstly, please note that removing variables does not automatically imply that it will use less memory. It's quite likely that the compiler will remove unnecessary variables, but it might also add variables if it wants to.
If you absolutely want to get rid of them, you can do this:
float ABCD_to_CDAB_float(float infloat) {
float outfloat;
char *tmp1 = (char*) &infloat;
char *tmp2 = (char*) &outfloat;
tmp2[0] = tmp1[2];
tmp2[1] = tmp1[3];
tmp2[2] = tmp1[0];
tmp2[3] = tmp1[1];
return outfloat;
}
But to be honest, I don't see the point. Also, I'd advice against things like this, because it's very easy to get it wrong and cause very hard traced bugs. I guess you also could do something like this: (No, you cannot. See edit below.)
float ABCD_to_CDAB_float(float infloat) {
char *tmp1 = (char*) &infloat;
char tmp2[] = {tmp[2], tmp[3], tmp[0], tmp[1]};
return *(float*) &tmp2; // This is not ok! It's violating the
// strict aliasing rule
}
But again. Avoid these magic tricks if you don't really need them. In some cases, it might impact performance a little bit, but it does not make the code easier to read.
And to be perfectly honest, I'm not 100% sure that this does not violate some of those really strange rules in C. It is a possibility that this leads to undefined behavior. So don't trust me completely on this one. When dealing with pointer casting like this, it's easy to violate the strict aliasing rule
EDIT:
The second example DOES violate the strict aliasing rule, which proves my point that his might be tricky. Thanks to Andrew Henle for pointing it out.
If you want to have absolutely no extra variables, not even pointers, take a look at Eric Postpischil's answer but don't use stuff like that if anyone else is supposed to read it.
To do it with no extra variables:
float ABCD_to_CDAB_float(float x)
{
return (union { uint8_t u[4]; float f; }) {{
((uint8_t *) &x)[2], ((uint8_t *) &x)[3],
((uint8_t *) &x)[0], ((uint8_t *) &x)[1] }} .f;
}
To remove some distracting parentheses:
return (union { uint8_t u[4]; float f; }) {{
2[(uint8_t *) &x], 3[(uint8_t *) &x],
0[(uint8_t *) &x], 1[(uint8_t *) &x] }} .f;
If you use gcc family compiler I would:
#define SWAP(a,b,type) do{type c = a; (a) = b; (b) = c;} while(0)
float ABCD_to_CDAB_float(float x)
{
union
{
float f;
uint16_t u16[2];
}z = {.f = x};
z.u16[0] = __builtin_bswap16(z.u16[0]);
z.u16[1] = __builtin_bswap16(z.u16[1]);
SWAP(z.u16[0], z.u16[1], uint16_t);
return z.f;
}
float ABCD_to_DCBA_float(float x)
{
uint32_t u32;
memcpy(&u32,&x, 4);
u32 = __builtin_bswap32(u32);
memcpy(&x, &u32, 4);
return x;
}
and it will generate the most efficient code:
ABCD_to_CDAB_float:
movd eax, xmm0
mov ecx, eax
shr eax, 16
mov edx, eax
rol cx, 8
rol dx, 8
sal ecx, 16
movzx eax, dx
or eax, ecx
movd xmm0, eax
ret
ABCD_to_DCBA_float:
movd eax, xmm0
bswap eax
movd xmm0, eax
ret
You can do this with unions:
union f2l {
float f;
uint32_t l;
};
float ABCD_to_CDAB(float input) {
f2l t1;
t1.f = input;
uint16_t cd = t1.l >> 16;
uint16_t ab = t1.l & 0x0000FFFF;
t1.l = (ab << 16) | cd;
return t1.f;
}
the 2 variables cd and ab are for sake of clarity and can be removed.
GCC is giving me a hard time generating optimal assembly for following source code:
memset(X, 0, 16);
for (int i= 0; i < 16; ++i) {
X[0] ^= table[i][Y[i]].asQWord;
}
X being an uint64_t[2] array, and
Y being an unsigned char[16] array, and
table being a double dimensional array of union qword_t:
union qword_t {
uint8_t asBytes[8];
uint64_t asQWord;
};
const union qword_t table[16][256] = /* ... */;
With options -m64 -Ofast -mno-sse it does unroll the loop, and each xor with assignment results in 3 instructions (thus overall number of instructions issued is 3 * 16 = 48):
movzx r9d, byte ptr [Y + i] ; extracting byte
xor rax, qword ptr [table + r9*8 + SHIFT] ; xoring, SHIFT = i * 0x800
mov qword ptr [X], rax ; storing result
Now, my understanding is that resulting X value could be accumulated in rax register throughout all 16 xors, and then it could be stored at [X] address, which could be achieved with these two instructions for each xor with assignment:
movzx r9d, byte ptr [Y + i] ; extracting byte
xor rax, qword ptr [table + r9*8 + SHIFT] ; xoring, SHIFT = i * 0x800
and single storing:
mov qword ptr [X], rax ; storing result
(In this case overall number of instructions is 2 * 16 + 1 = 33)
Why does GCC generate these redundant mov instructions? What can I do to avoid this?
P.S. C99, GCC 5.3.0, Intel Core i5 Sandy Bridge
Redundant stores are usually down to aliasing; in this case gcc would be unable to prove to its satisfaction that the store to X[0] does not affect table. It makes a big difference how the variables are passed to the routine; if they are globals or members of the same larger struct then proving non-aliasing is easier.
Example:
void f1(uint64_t X[2]) {
memset(X, 0, 16);
for (int i= 0; i < 16; ++i) {
X[0] ^= table[i][Y[i]].asQWord;
}
}
uint64_t X[2];
void f2() {
memset(X, 0, 16);
for (int i= 0; i < 16; ++i) {
X[0] ^= table[i][Y[i]].asQWord;
}
}
Here the store to X[0] is sunk out of the loop in f2 but not in f1, because only in f2 can gcc prove that X does not alias members of table.
Your workaround/fix could be to adjust how the parameters are passed, to use the restrict specifier, or to manually sink the store yourself.
To avoid this, you could use this instead:
uint64_t v = 0;
for (int i= 0; i < 16; ++i) {
v ^= table[i][Y[i]].asQWord;
}
X[0] = v;
X[1] = 0;
You can easily notice the generated instructions are sub-optimal in your case, however for different reasons gcc may not be able to determine that. (And in this case, gcc cannot determine that table will never access the same memory-region as X, as ecatmur explains more elaborately.)
A lot of related questions <How is x86 instruction cache synchronized? > mention x86 should properly handle i-cache synchronization in self modifying code. I wrote the following piece of code which toggles a function call on and off from different threads interleaved with its execution. I am using compare and swap operation as an additional guard so that the modification is atomic. But I am getting intermittent crashes (SIGSEGV, SIGILL) and analyzing the core dump makes me suspicious if the processor is trying to execute partially updated instructions. The code and the analysis given below. May be I am missing something here. Let me know if that's the case.
toggle.c
#include <stdio.h>
#include <inttypes.h>
#include <time.h>
#include <pthread.h>
#include <sys/mman.h>
#include <errno.h>
#include <unistd.h>
int active = 1; // Whether the function is toggled on or off
uint8_t* funcAddr = 0; // Address where function call happens which we need to toggle on/off
uint64_t activeSequence = 0; // Byte sequence for toggling on the function CALL
uint64_t deactiveSequence = 0; // NOP byte sequence for toggling off the function CALL
inline int modify_page_permissions(uint8_t* addr) {
long page_size = sysconf(_SC_PAGESIZE);
int code = mprotect((void*)(addr - (((uint64_t)addr)%page_size)), page_size,
PROT_READ | PROT_WRITE | PROT_EXEC);
if (code) {
fprintf(stderr, "mprotect was not successfull! code %d\n", code);
fprintf(stderr, "errno value is : %d\n", errno);
return 0;
}
// If the 8 bytes we need to modify straddles a page boundary make the next page writable too
if (page_size - ((uint64_t)addr)%page_size < 8) {
code = mprotect((void*)(addr-((uint64_t)addr)%page_size+ page_size) , page_size,
PROT_READ | PROT_WRITE | PROT_EXEC);
if (code) {
fprintf(stderr, "mprotect was not successfull! code %d\n", code);
fprintf(stderr, "errno value is : %d\n", errno);
return 0;;
}
}
return 1;
}
void* add_call(void* param) {
struct timespec ts;
ts.tv_sec = 0;
ts.tv_nsec = 50000;
while (1) {
if (!active) {
if (activeSequence != 0) {
int status = modify_page_permissions(funcAddr);
if (!status) {
return 0;
}
uint8_t* start_addr = funcAddr - 8;
fprintf(stderr, "Activating foo..\n");
uint64_t res = __sync_val_compare_and_swap((uint64_t*) start_addr,
*((uint64_t*)start_addr), activeSequence);
active = 1;
} else {
fprintf(stderr, "Active sequence not initialized..\n");
}
}
nanosleep(&ts, NULL);
}
}
int remove_call(uint8_t* addr) {
if (active) {
// Remove gets called first before add so we initialize active and deactive state byte sequences during the first call the remove
if (deactiveSequence == 0) {
uint64_t sequence = *((uint64_t*)(addr-8));
uint64_t mask = 0x0000000000FFFFFF;
uint64_t deactive = (uint64_t) (sequence & mask);
mask = 0x9090909090000000; // We NOP 5 bytes of CALL instruction and leave rest of the 3 bytes as it is
activeSequence = sequence;
deactiveSequence = deactive | mask;
funcAddr = addr;
}
int status = modify_page_permissions(addr);
if (!status) {
return -1;
}
uint8_t* start_addr = addr - 8;
fprintf(stderr, "Deactivating foo..\n");
uint64_t res = __sync_val_compare_and_swap((uint64_t*)start_addr,
*((uint64_t*)start_addr), deactiveSequence);
active = 0;
// fprintf(stderr, "Result : %p\n", res);
}
}
int counter = 0;
void foo(int i) {
// Use the return address to determine where we need to patch foo CALL instruction (5 bytes)
uint64_t* addr = (uint64_t*)__builtin_extract_return_addr(__builtin_return_address(0));
fprintf(stderr, "Foo counter : %d\n", counter++);
remove_call((uint8_t*)addr);
}
// This thread periodically checks if the method is inactive and if so reactivates it
void spawn_add_call_thread() {
pthread_t tid;
pthread_create(&tid, NULL, add_call, (void*)NULL);
}
int main() {
spawn_add_call_thread();
int i=0;
for (i=0; i<1000000; i++) {
// fprintf(stderr, "i : %d..\n", i);
foo(i);
}
fprintf(stderr, "Final count : %d..\n\n\n", counter);
}
Core dump analysis
Program terminated with signal 4, Illegal instruction.
#0 0x0000000000400a28 in main () at toggle.c:123
(gdb) info frame
Stack level 0, frame at 0x7fff7c8ee360:
rip = 0x400a28 in main (toggle.c:123); saved rip 0x310521ed5d
source language c.
Arglist at 0x7fff7c8ee350, args:
Locals at 0x7fff7c8ee350, Previous frame's sp is 0x7fff7c8ee360
Saved registers:
rbp at 0x7fff7c8ee350, rip at 0x7fff7c8ee358
(gdb) disas /r 0x400a28,+30
Dump of assembler code from 0x400a28 to 0x400a46:
=> 0x0000000000400a28 <main+64>: ff (bad)
0x0000000000400a29 <main+65>: ff (bad)
0x0000000000400a2a <main+66>: ff eb ljmpq *<internal disassembler error>
0x0000000000400a2c <main+68>: e7 48 out %eax,$0x48
(gdb) disas /r main
Dump of assembler code for function main:
0x00000000004009e8 <+0>: 55 push %rbp
...
0x0000000000400a24 <+60>: 89 c7 mov %eax,%edi
0x0000000000400a26 <+62>: e8 11 ff ff ff callq 0x40093c <foo>
0x0000000000400a2b <+67>: eb e7 jmp 0x400a14 <main+44>
So as can be seen the instruction pointer seems to positioned within an address inside the CALL instruction and processor is apparently trying to execute that misaligned instruction causing an illegal instruction fault.
I think your problem is that you replaced a 5-byte CALL instruction with 5 1-byte NOPs. Consider what happens when your thread has executed 3 of the NOPs, and then your master thread decides to swap the CALL instruction back in. Your thread's PC will be three bytes in the middle of the CALL instruction and will therefore execute an unexpected and likely illegal instruction.
What you need to do is swap the 5-byte CALL instruction with a 5-byte NOP. You just need to find a multibyte instruction that does nothing (such as or'ing a register against itself) and if you need some extra bytes, prepend some prefix bytes such as a gs override prefix and an address-size override prefix (both of which will do nothing). By using a 5-byte NOP, your thread will be guaranteed to either be at the CALL instruction or past the CALL instruction, but never inside of it.
On 80x86 most calls use a relative displacement, not an absolute address. Essentially its "call the code at here + < displacement >" and not "call the code at < address >".
For 64-bit code, the displacement may be 8 bits or 32-bits. It's never 64-bits.
For example, for a 2-byte "call with 8-bit displacement" instruction, you'd be trashing 6 bytes before the call instruction, the call opcode itself, and the instruction's operand (the displacement).
For another example, for a 5-byte "call with 32-bit displacement" instruction, you'd be trashing 3 bytes before the call instruction, the call opcode itself, and the instruction's operand (the displacement).
However...
These aren't the only way to call. For example, you can call using a function pointer, where the address of the code being called is not in the instruction at all (but may be in a register or be a variable in memory). There's also an optimisation called "tail call optimisation" where a call followed by a ret is replaced with a jmp (likely with some additional stack diddling for passing parameters, cleaning up the caller's local variables, etc).
Essentially; your code is severely broken, you can't cover all the possible corner cases, you shouldn't be doing this to begin with, and you probably should be using a function pointer instead of self modifying code (which would be faster and easier and portable too).
I want to pretend that an array in C is an area of memory in a microprocessor, so I can compile some code on a PC. I've written a small program to try to get the syntax correct, but the program won't run, it either crashes or won't compile when I change the way I access the variable - it's late and I can't see why. What is wrong with this please?
// original code in microprocessor header that I need to change if I compile on the host
// BASE is simply a hex value that is later used as an address or a hex value
#define BASE (0x0000)
// used later in header like this (cannot change the way this is done)
#define OFFSET 0x0001
#define PERIPHERAL (BASE + OFFSET)
// also used like (also cannot change):
uint32_t var = PERIPHERAL | HEXMASK;
// here is how I intend to replace the uC specific code
// replace the BASE DEFINE with the next 2 lines of code:
// instead of writing to memory location, write to array of bytes instead, so declare it:
uint8_t BASE_memory[4] = {0, 0, 0, 0};
// define BASE as hex value that can be used as drop-in replacement in either of the 2 uses shown above
#define BASE ((uint32_t)(BASE_memory))
// now test usage
// access contents of BASE_memory[0]
printf("contents of BASE_memory[0] == %02x\n", *((uint32_t *)(BASE)));
// now I want to access PERIPHERAL, the second element of the array, i.e. BASE_memory[1]
printf("contents of BASE_memory[1] == %02x\n", *((uint32_t *)(PERIPHERAL)));
I think you are on a 64-bit system.
#include <stdint.h>
uint8_t BASE_memory[4] = {1, 2, 3, 4};
int func1()
{
return *(uint32_t *) (uint32_t) BASE_memory;
}
int func2()
{
return *(uint32_t *) (uintptr_t) BASE_memory;
}
Here's the assembly output for func1:
leaq _BASE_memory(%rip), %rax
movl %eax, %eax
movl (%rax), %eax
Here's the assembly for func2:
movl _BASE_memory(%rip), %eax
You can see that if you cast the address to uint32_t, then there's an extra step where the high bits are set to zero. The address is then wrong, and you get a segmentation fault. That's why you use uintptr_t or intptr_t instead of uint32_t.
I have to split a number into its digits in order to display it on an LCD. Right now I use the following method:
pos = 7;
do
{
LCD_Display(pos, val % 10);
val /= 10;
pos--;
} while (pos >= 0 && val);
The problem with this method is that division and modulo operations are extremely slow on an MSP430 microcontroller. Is there any alternative to this method, something that either does not involve division or that reduces the number of operations?
A note: I can't use any library functions, such as itoa. The libraries are big and the functions themselves are rather resource hungry (both in terms of number of cycles, and RAM usage).
You could do subtractions in a loop with predefined base 10 values.
My C is a bit rusty, but something like this:
int num[] = { 10000000,1000000,100000,10000,1000,100,10,1 };
for (pos = 0; pos < 8; pos++) {
int cnt = 0;
while (val >= num[pos]) {
cnt++;
val -= num[pos];
}
LCD_Display(pos, cnt);
}
Yes, there's another way, originally invented (at least AFAIK) by Terje Mathiesen. Instead of dividing by 10, you (sort of) multiply by the reciprocal. The trick, of course, is that in integers you can't represent the reciprocal directly. To make up for that, you work with scaled integers. If we had floating point, we could extract digits with something like:
input = 123
first digit = integer(10 * (fraction(input * .1))
second digit = integer(100 * (fraction(input * .01))
...and so on for as many digits as needed. To do this with integers, we basically just scale those by 232 (and round each up, since we'll use truncating math). In C, the algorithm looks like this:
#include <stdio.h>
// here are our scaled factors
static const unsigned long long factors[] = {
3435973837, // ceil((0.1 * 2**32)<<3)
2748779070, // ceil((0.01 * 2**32)<<6)
2199023256, // etc.
3518437209,
2814749768,
2251799814,
3602879702,
2882303762,
2305843010
};
static const char shifts[] = {
3, // the shift value used for each factor above
6,
9,
13,
16,
19,
23,
26,
29
};
int main() {
unsigned input = 13754;
for (int i=8; i!=-1; i--) {
unsigned long long inter = input * factors[i];
inter >>= shifts[i];
inter &= (unsigned)-1;
inter *= 10;
inter >>= 32;
printf("%u", inter);
}
return 0;
}
The operations in the loop will map directly to instructions on most 32-bit processors. Your typical multiply instruction will take 2 32-bit inputs, and produce a 64-bit result, which is exactly what we need here. It'll typically be quite a bit faster than a division instruction as well. In a typical case, some of the operations will (or at least with some care, can) disappear in assembly language. For example, where I've done the inter &= (unsigned)-1;, in assembly language you'll normally be able to just use the lower 32-bit register where the result was stored, and just ignore whatever holds the upper 32 bits. Likewise, the inter >>= 32; just means we use the value in the upper 32-bit register, and ignore the lower 32-bit register.
For example, in x86 assembly language, this comes out something like:
mov ebx, 9 ; maximum digits we can deal with.
mov esi, offset output_buffer
next_digit:
mov eax, input
mul factors[ebx*4]
mov cl, shifts[ebx]
shrd eax, edx, cl
mov edx, 10 ; overwrite edx => inter &= (unsigned)-1
mul edx
add dl, '0'
mov [esi], dl ; effectively shift right 32 bits by ignoring 32 LSBs in eax
inc esi
dec ebx
jnz next_digit
mov [esi], bl ; zero terminate the string
For the moment, I've cheated a tiny bit, and written the code assuming an extra item at the beginning of each table (factors and shifts). This isn't strictly necessary, but simplifies the code at the cost of wasting 8 bytes of data. It's pretty easy to do away with that too, but I haven't bothered for the moment.
In any case, doing away with the division makes this a fair amount faster on quite a few low- to mid-range processors that lack dedicated division hardware.
Another way is using double dabble. This is a way to convert binary to BCD with only additions and bit shifts so it's very appropriate for microcontrollers. After splitting to BCDs you can easily print out each number
I would use a temporary string, like:
char buffer[8];
itoa(yourValue, buffer, 10);
int pos;
for(pos=0; pos<8; ++pos)
LCD_Display(pos, buffer[pos]); /* maybe you'll need a cast here */
edit: since you can't use library's itoa, then I think your solution is already the best, providing you compile with max optimization turned on.
You may take a look at this: Most optimized way to calculate modulus in C
This is my attempt at a complete solution. Credit should go to Guffa for providing the general idea. This should work for 32bit integers, signed or otherwise and 0.
#include <stdlib.h>
#include <stdio.h>
#define MAX_WIDTH (10)
static unsigned int uiPosition[] = {
1u,
10u,
100u,
1000u,
10000u,
100000u,
1000000u,
10000000u,
100000000u,
1000000000u,
};
void uitostr(unsigned int uiSource, char* cTarget)
{
int i, c=0;
for( i=0; i!=MAX_WIDTH; ++i )
{
cTarget[i] = 0;
}
if( uiSource == 0 )
{
cTarget[0] = '0';
cTarget[1] = '\0';
return;
}
for( i=MAX_WIDTH -1; i>=0; --i )
{
while( uiSource >= uiPosition[i] )
{
cTarget[c] += 1;
uiSource -= uiPosition[i];
}
if( c != 0 || cTarget[c] != 0 )
{
cTarget[c] += 0x30;
c++;
}
}
cTarget[c] = '\0';
}
void itostr(int iSource, char* cTarget)
{
if( iSource < 0 )
{
cTarget[0] = '-';
uitostr((unsigned int)(iSource * -1), cTarget + 1);
}
else
{
uitostr((unsigned int)iSource, cTarget);
}
}
int main()
{
char szStr[MAX_WIDTH +1] = { 0 };
// signed integer
printf("Signed integer\n");
printf("int: %d\n", 100);
itostr(100, szStr);
printf("str: %s\n", szStr);
printf("int: %d\n", -1);
itostr(-1, szStr);
printf("str: %s\n", szStr);
printf("int: %d\n", 1000000000);
itostr(1000000000, szStr);
printf("str: %s\n", szStr);
printf("int: %d\n", 0);
itostr(0, szStr);
printf("str: %s\n", szStr);
return 0;
}