I have a system where I have to implement to second part. The issue is that I received a float where the bytes are mixed. In the example below, the input is 1E9 (f_orig) and what is receive is this 2.034699e+26 (f_recv). I did the function ABCD_to_CDAB_float but I find it ugly. Is there a better way to to write ABCD_to_CDAB_float(without two temporary variables will already be nice) ?
#include <stdio.h>
#include <float.h>
#include <string.h>
#include <stdint.h>
/* Change float byte order */
float ABCD_to_CDAB_float(float infloat) {
float outfloat;
uint8_t tmp[4];
memcpy(&tmp, &infloat, 4);
uint8_t tmp2[] = {tmp[2], tmp[3], tmp[0], tmp[1]};
memcpy(&outfloat, &tmp2, 4);
return outfloat;
}
int main() {
float f_orig = 1E9; // This is the value sent
printf("f_orig\t%e\n", f_orig);
char str_orig[4];
memcpy(&str_orig, &f_orig, 4);
printf("str_orig\t%x %x %x %x\n", str_orig[0], str_orig[1], str_orig[2], str_orig[3]);
float f_built; // same as f_orig
char str_built[] = {0x28, 0x6B, 0x6E, 0x4E};
memcpy(&f_built, &str_built, 4);
printf("f_built\t%e\n", f_built); // it prints "1E9"
float f_recv; // received float
char str_recv[] = {0x6E, 0x4E, 0x28, 0x6B};
memcpy(&f_recv, &str_recv, 4);
printf("f_recv\t%e\n", f_recv); // converesion was wrong somewhere
char str6[] = {str_recv[2], str_recv[3], str_recv[0], str_recv[1]};
float f_res;
memcpy(&f_res, &str6, 4);
printf("f_res\t%e\n", f_res); // result is fine
printf("%e\n",ABCD_to_CDAB_float(f_recv)); // result is right
return 0;
}
Firstly, please note that removing variables does not automatically imply that it will use less memory. It's quite likely that the compiler will remove unnecessary variables, but it might also add variables if it wants to.
If you absolutely want to get rid of them, you can do this:
float ABCD_to_CDAB_float(float infloat) {
float outfloat;
char *tmp1 = (char*) &infloat;
char *tmp2 = (char*) &outfloat;
tmp2[0] = tmp1[2];
tmp2[1] = tmp1[3];
tmp2[2] = tmp1[0];
tmp2[3] = tmp1[1];
return outfloat;
}
But to be honest, I don't see the point. Also, I'd advice against things like this, because it's very easy to get it wrong and cause very hard traced bugs. I guess you also could do something like this: (No, you cannot. See edit below.)
float ABCD_to_CDAB_float(float infloat) {
char *tmp1 = (char*) &infloat;
char tmp2[] = {tmp[2], tmp[3], tmp[0], tmp[1]};
return *(float*) &tmp2; // This is not ok! It's violating the
// strict aliasing rule
}
But again. Avoid these magic tricks if you don't really need them. In some cases, it might impact performance a little bit, but it does not make the code easier to read.
And to be perfectly honest, I'm not 100% sure that this does not violate some of those really strange rules in C. It is a possibility that this leads to undefined behavior. So don't trust me completely on this one. When dealing with pointer casting like this, it's easy to violate the strict aliasing rule
EDIT:
The second example DOES violate the strict aliasing rule, which proves my point that his might be tricky. Thanks to Andrew Henle for pointing it out.
If you want to have absolutely no extra variables, not even pointers, take a look at Eric Postpischil's answer but don't use stuff like that if anyone else is supposed to read it.
To do it with no extra variables:
float ABCD_to_CDAB_float(float x)
{
return (union { uint8_t u[4]; float f; }) {{
((uint8_t *) &x)[2], ((uint8_t *) &x)[3],
((uint8_t *) &x)[0], ((uint8_t *) &x)[1] }} .f;
}
To remove some distracting parentheses:
return (union { uint8_t u[4]; float f; }) {{
2[(uint8_t *) &x], 3[(uint8_t *) &x],
0[(uint8_t *) &x], 1[(uint8_t *) &x] }} .f;
If you use gcc family compiler I would:
#define SWAP(a,b,type) do{type c = a; (a) = b; (b) = c;} while(0)
float ABCD_to_CDAB_float(float x)
{
union
{
float f;
uint16_t u16[2];
}z = {.f = x};
z.u16[0] = __builtin_bswap16(z.u16[0]);
z.u16[1] = __builtin_bswap16(z.u16[1]);
SWAP(z.u16[0], z.u16[1], uint16_t);
return z.f;
}
float ABCD_to_DCBA_float(float x)
{
uint32_t u32;
memcpy(&u32,&x, 4);
u32 = __builtin_bswap32(u32);
memcpy(&x, &u32, 4);
return x;
}
and it will generate the most efficient code:
ABCD_to_CDAB_float:
movd eax, xmm0
mov ecx, eax
shr eax, 16
mov edx, eax
rol cx, 8
rol dx, 8
sal ecx, 16
movzx eax, dx
or eax, ecx
movd xmm0, eax
ret
ABCD_to_DCBA_float:
movd eax, xmm0
bswap eax
movd xmm0, eax
ret
You can do this with unions:
union f2l {
float f;
uint32_t l;
};
float ABCD_to_CDAB(float input) {
f2l t1;
t1.f = input;
uint16_t cd = t1.l >> 16;
uint16_t ab = t1.l & 0x0000FFFF;
t1.l = (ab << 16) | cd;
return t1.f;
}
the 2 variables cd and ab are for sake of clarity and can be removed.
Related
I'm attempting to write a small x86-64 JIT, and I'm a little over my head in a few places.
I'm trying to JIT a simple function that assigns the value of a float into the xmm0 register and then returns it, but I am unsure of how I should go about encoding the arguments to the movsd call.
Any help would be greatly appreciated.
/* main.c */
#include <stdio.h>
#include <sys/mman.h>
#define xmm(n) (n)
typedef double(*fn)();
fn jit(){
char* memory = mmap(NULL,
4096,
PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_PRIVATE|MAP_ANONYMOUS,
-1, 0);
int i=0;
float myfloat = 3.1f;
memory[i++] = 0x48; /* REX.W */
memory[i++] = 0xf2; /*******************/
memory[i++] = 0x0f; /* MOVSD xmm0, m64 */
memory[i++] = 0x10; /*******************/
memory[i++] = 0x47 | xmm(0) << 3; /* Not 100% sure this is correct */
memory[i++] = 0; /* what goes here to load myfloat into xmm0? */
memory[i++] = 0xc3; /* RET */
return (fn) memory;
}
int main(){
fn f = jit();
printf("result: %f\n", (*f)());
return 0;
}
SSE instructions generally don't support immediates except for some rare instructions with a one-byte immediate to control their operation. Thus you need to:
store myfloat to some nearby memory area
generate a memory operand the references this area
Both steps are easy. For the first step, I'd simply use the beginning of memory and let the code start right afterwards. Note that in this case, you need to make sure to return a pointer to the beginning of the function, not the beginning of memory. Other solutions are possible. Just make sure that myfloat is stored within ±2 GiB from the code.
To generate the operand, revisit the Intel manuals. The addressing mode you want is a 32 bit RIP-relative operand. This is generated with mod = 0, r/m = 5. The displacement is a signed 32 bit number that is added to the value of RIP right at the end of the instruction (this is where the +4 comes from as have to factor in the lenth of the displacement).
Thus we have something like:
memory[i++] = 0xf2; /*******************/
memory[i++] = 0x0f; /* MOVSD xmm0, m64 */
memory[i++] = 0x10; /*******************/
memory[i++] = 0005 | xmm(0) << 3; /* mod = 0, r/m = 5: [rip + disp32] */
*(int *)(memory + i) = memory + i + 4 - addr_of_myfloat;
i += 4;
memory[i++] = 0xc3; /* RET */
Note that the REX prefix is not needed here.
How to get an integer's sign and store it in a char? One way is:
int n = -5
char c;
if(n<0)
c = '-';
else
c = '+';
Or:
char c = n < 0 ? '-' : '+';
But is there a way to do it without conditionals?
There's the most efficient and portable way, but it doesn't win any beauty awards.
We can assume that the MSB of a signed integer is always set if it is negative. This is a 100% portable assumption even when taking exotic signedness formats in account (one's complement, signed magnitude). Therefore the fastest way is to simply mask out the MSB from the integer.
The MSB of any integer is found at location CHAR_BIT * sizeof(n) - 1;. On a typical 32 bit mainstream system, this would for example be 8 * 4 - 1 = 31.
So we can write a function like this:
_Bool is_signed (int n)
{
const unsigned int sign_bit_n = CHAR_BIT * sizeof(n) - 1;
return (_Bool) ((unsigned int)n >> sign_bit_n);
}
On x86-64 gcc 9.1 (-O3), this results in very efficient code:
is_signed:
mov eax, edi
shr eax, 31
ret
The advantage of this method is also that, unlike code such as x < 0, it won't risk getting translated into "branch if negative" instructions when ported.
Complete example:
#include <limits.h>
#include <stdio.h>
_Bool is_signed (int n)
{
const unsigned int sign_bit_n = CHAR_BIT * sizeof(n) - 1;
return (_Bool) ((unsigned int)n >> sign_bit_n);
}
int main (void)
{
int n = -1;
const char SIGNS[] = {' ', '-'};
char sign = SIGNS[is_signed(n)];
putchar(sign);
}
Disassembly (x86-64 gcc 9.1 (-O3)):
is_signed:
mov eax, edi
shr eax, 31
ret
main:
sub rsp, 8
mov rsi, QWORD PTR stdout[rip]
mov edi, 45
call _IO_putc
xor eax, eax
add rsp, 8
ret
This creates branchless code with gcc/clang on x86-64:
void storeneg(int X, char *C)
{
*C='+';
*C += (X<0)*('-'-'+');
}
https://gcc.godbolt.org/z/yua1go
char c = 43 + signbit(n) * 2 ;
char 43 is '+'
char 45 is '-'
signbit(NEGATIVE INTEGER) is true, converted to 1
int signbit(int) is included in cmath in C++ and math.h in C
GCC is giving me a hard time generating optimal assembly for following source code:
memset(X, 0, 16);
for (int i= 0; i < 16; ++i) {
X[0] ^= table[i][Y[i]].asQWord;
}
X being an uint64_t[2] array, and
Y being an unsigned char[16] array, and
table being a double dimensional array of union qword_t:
union qword_t {
uint8_t asBytes[8];
uint64_t asQWord;
};
const union qword_t table[16][256] = /* ... */;
With options -m64 -Ofast -mno-sse it does unroll the loop, and each xor with assignment results in 3 instructions (thus overall number of instructions issued is 3 * 16 = 48):
movzx r9d, byte ptr [Y + i] ; extracting byte
xor rax, qword ptr [table + r9*8 + SHIFT] ; xoring, SHIFT = i * 0x800
mov qword ptr [X], rax ; storing result
Now, my understanding is that resulting X value could be accumulated in rax register throughout all 16 xors, and then it could be stored at [X] address, which could be achieved with these two instructions for each xor with assignment:
movzx r9d, byte ptr [Y + i] ; extracting byte
xor rax, qword ptr [table + r9*8 + SHIFT] ; xoring, SHIFT = i * 0x800
and single storing:
mov qword ptr [X], rax ; storing result
(In this case overall number of instructions is 2 * 16 + 1 = 33)
Why does GCC generate these redundant mov instructions? What can I do to avoid this?
P.S. C99, GCC 5.3.0, Intel Core i5 Sandy Bridge
Redundant stores are usually down to aliasing; in this case gcc would be unable to prove to its satisfaction that the store to X[0] does not affect table. It makes a big difference how the variables are passed to the routine; if they are globals or members of the same larger struct then proving non-aliasing is easier.
Example:
void f1(uint64_t X[2]) {
memset(X, 0, 16);
for (int i= 0; i < 16; ++i) {
X[0] ^= table[i][Y[i]].asQWord;
}
}
uint64_t X[2];
void f2() {
memset(X, 0, 16);
for (int i= 0; i < 16; ++i) {
X[0] ^= table[i][Y[i]].asQWord;
}
}
Here the store to X[0] is sunk out of the loop in f2 but not in f1, because only in f2 can gcc prove that X does not alias members of table.
Your workaround/fix could be to adjust how the parameters are passed, to use the restrict specifier, or to manually sink the store yourself.
To avoid this, you could use this instead:
uint64_t v = 0;
for (int i= 0; i < 16; ++i) {
v ^= table[i][Y[i]].asQWord;
}
X[0] = v;
X[1] = 0;
You can easily notice the generated instructions are sub-optimal in your case, however for different reasons gcc may not be able to determine that. (And in this case, gcc cannot determine that table will never access the same memory-region as X, as ecatmur explains more elaborately.)
The standard div() function returns a div_t struct as parameter, for example:
/* div example */
#include <stdio.h> /* printf */
#include <stdlib.h> /* div, div_t */
int main ()
{
div_t divresult;
divresult = div (38,5);
printf ("38 div 5 => %d, remainder %d.\n", divresult.quot, divresult.rem);
return 0;
}
My case is a bit different; I have this
#define NUM_ELTS 21433
int main ()
{
unsigned int quotients[NUM_ELTS];
unsigned int remainders[NUM_ELTS];
int i;
for(i=0;i<NUM_ELTS;i++) {
divide_single_instruction("ient[i],&reminder[i]);
}
}
I know that the assembly language for division does everything in single instruction, so I need to do the same here to save on cpu cycles, which is bassicaly move the quotient from EAX and reminder from EDX into a memory locations where my arrays are stored. How can this be done without including the asm {} or SSE intrinsics in my C code ? It has to be portable.
Since you're writing to the arrays in-place (replacing numerator and denominator with quotient and remainder) you should store the results to temporary variables before writing to the arrays.
void foo (unsigned *num, unsigned *den, int n) {
int i;
for(i=0;i<n;i++) {
unsigned q = num[i]/den[i], r = num[i]%den[i];
num[i] = q, den[i] = r;
}
}
produces this main loop assembly
.L5:
movl (%rdi,%rcx,4), %eax
xorl %edx, %edx
divl (%rsi,%rcx,4)
movl %eax, (%rdi,%rcx,4)
movl %edx, (%rsi,%rcx,4)
addq $1, %rcx
cmpl %ecx, %r8d
jg .L5
There are some more complicated cases where it helps to save the quotient and remainder when they are first used. For example in testing for primes by trial division you often see a loop like this
for (p = 3; p <= n/p; p += 2)
if (!(n % p)) return 0;
It turns out that GCC does not use the remainder from the first division and therefore it does the division instruction twice which is unnecessary. To fix this you can save the remainder when the first division is done like this:
for (p = 3, q=n/p, r=n%p; p <= q; p += 2, q = n/p, r=n%p)
if (!r) return 0;
This speeds up the result by a factor of two.
So in general GCC does a good job particularly if you save the quotient and remainder when they are first calculated.
The general rule here is to trust your compiler to do something fast. You can always disassemble the code and check that the compiler is doing something sane. It's important to realise that a good compiler knows a lot about the machine, often more than you or me.
Also let's assume you have a good reason for needing to "count cycles".
For your example code I agree that the x86 "idiv" instruction is the obvious choice. Let's see what my compiler (MS visual C 2013) will do if I just write out the most naive code I can
struct divresult {
int quot;
int rem;
};
struct divresult divrem(int num, int den)
{
return (struct divresult) { num / den, num % den };
}
int main()
{
struct divresult res = divrem(5, 2);
printf("%d, %d", res.quot, res.rem);
}
And the compiler gives us:
struct divresult res = divrem(5, 2);
printf("%d, %d", res.quot, res.rem);
01121000 push 1
01121002 push 2
01121004 push 1123018h
01121009 call dword ptr ds:[1122090h] ;;; this is printf()
Wow, I was outsmarted by the compiler. Visual C knows how division works so it just precalculated the result and inserted constants. It didn't even bother to include my function in the final code. We have to read in the integers from console to force it to actually do the calculation:
int main()
{
int num, den;
scanf("%d, %d", &num, &den);
struct divresult res = divrem(num, den);
printf("%d, %d", res.quot, res.rem);
}
Now we get:
struct divresult res = divrem(num, den);
01071023 mov eax,dword ptr [num]
01071026 cdq
01071027 idiv eax,dword ptr [den]
printf("%d, %d", res.quot, res.rem);
0107102A push edx
0107102B push eax
0107102C push 1073020h
01071031 call dword ptr ds:[1072090h] ;;; printf()
So you see, the compiler (or this compiler at least) already does what you want, or something even more clever.
From this we learn to trust the compiler and only second-guess it when we know it isn't doing a good enough job already.
I would like my C function to efficiently compute the high 64 bits of the product of two 64 bit signed ints. I know how to do this in x86-64 assembly, with imulq and pulling the result out of %rdx. But I'm at a loss for how to write this in C at all, let alone coax the compiler to do it efficiently.
Does anyone have any suggestions for writing this in C? This is performance sensitive, so "manual methods" (like Russian Peasant, or bignum libraries) are out.
This dorky inline assembly function I wrote works and is roughly the codegen I'm after:
static long mull_hi(long inp1, long inp2) {
long output = -1;
__asm__("movq %[inp1], %%rax;"
"imulq %[inp2];"
"movq %%rdx, %[output];"
: [output] "=r" (output)
: [inp1] "r" (inp1), [inp2] "r" (inp2)
:"%rax", "%rdx");
return output;
}
If you're using a relatively recent GCC on x86_64:
int64_t mulHi(int64_t x, int64_t y) {
return (int64_t)((__int128_t)x*y >> 64);
}
At -O1 and higher, this compiles to what you want:
_mulHi:
0000000000000000 movq %rsi,%rax
0000000000000003 imulq %rdi
0000000000000006 movq %rdx,%rax
0000000000000009 ret
I believe that clang and VC++ also have support for the __int128_t type, so this should also work on those platforms, with the usual caveats about trying it yourself.
The general answer is that x * y can be broken down into (a + b) * (c + d), where a and c are the high order parts.
First, expand to ac + ad + bc + bd
Now, you multiply the terms as 32 bit numbers stored as long long (or better yet, uint64_t), and you just remember that when you multiplied a higher order number, you need to scale by 32 bits. Then you do the adds, remembering to detect carry. Keep track of the sign. Naturally, you need to do the adds in pieces.
For code implementing the above, see my other answer.
With regard to your assembly solution, don't hard-code the mov instructions! Let the compiler do it for you. Here's a modified version of your code:
static long mull_hi(long inp1, long inp2) {
long output;
__asm__("imulq %2"
: "=d" (output)
: "a" (inp1), "r" (inp2));
return output;
}
Helpful reference: Machine Constraints
Since you did a pretty good job solving your own problem with the machine code, I figured you deserved some help with the portable version. I would leave an ifdef in where you do just use the assembly if in gnu on x86.
Anyway, here is an implementation based on my general answer. I'm pretty sure this is correct, but no guarantees, I just banged this out last night. You probably should get rid of the statics positive_result[] and result_negative - those are just artefacts of my unit test.
#include <stdlib.h>
#include <stdio.h>
// stdarg.h doesn't help much here because we need to call llabs()
typedef unsigned long long uint64_t;
typedef signed long long int64_t;
#define B32 0xffffffffUL
static uint64_t positive_result[2]; // used for testing
static int result_negative; // used for testing
static void mixed(uint64_t *result, uint64_t innerTerm)
{
// the high part of innerTerm is actually the easy part
result[1] += innerTerm >> 32;
// the low order a*d might carry out of the low order result
uint64_t was = result[0];
result[0] += (innerTerm & B32) << 32;
if (result[0] < was) // carry!
++result[1];
}
static uint64_t negate(uint64_t *result)
{
uint64_t t = result[0] = ~result[0];
result[1] = ~result[1];
if (++result[0] < t)
++result[1];
return result[1];
}
uint64_t higherMul(int64_t sx, int64_t sy)
{
uint64_t x, y, result[2] = { 0 }, a, b, c, d;
x = (uint64_t)llabs(sx);
y = (uint64_t)llabs(sy);
a = x >> 32;
b = x & B32;
c = y >> 32;
d = y & B32;
// the highest and lowest order terms are easy
result[1] = a * c;
result[0] = b * d;
// now have the mixed terms ad + bc to worry about
mixed(result, a * d);
mixed(result, b * c);
// now deal with the sign
positive_result[0] = result[0];
positive_result[1] = result[1];
result_negative = sx < 0 ^ sy < 0;
return result_negative ? negate(result) : result[1];
}
Wait, you have a perfectly good, optimized assembly solution already
working for this, and you want to back it out and try to write it in
an environment that doesn't support 128 bit math? I'm not following.
As you're obviously aware, this operation is a single instruction on
x86-64. Obviously nothing you do is going to make it work any better.
If you really want portable C, you'll need to do something like
DigitalRoss's code above and hope that your optimizer figures out what
you're doing.
If you need architecture portability but are willing to limit yourself
to gcc platforms, there are __int128_t (and __uint128_t) types in the
compiler intrinsics that will do what you want.