Currently, from research and various attempts, I'm pretty sure that the only solution to this problem is to use assembly. I'm posting this question to show an existing problem, and maybe get attention from compiler developers, or get some hits from searches about similar problems.
If anything changes in the future, I will accept it as an answer.
This is a very related question for MSVC.
On x86_64 machines, div/idiv with a 32-bit operand is faster than with a 64-bit operand. When the dividend is 64-bit, the divisor is 32-bit, and you know that the quotient will fit in 32 bits, you don't have to use the 64-bit div/idiv. You can split the 64-bit dividend into two 32-bit registers, and even with that overhead, a 32-bit div on the two halves is faster than a 64-bit div on the full 64-bit register.
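For reference, a small C model (my illustration, not from the original post) of what the 32-bit form computes: div r/m32 divides the register pair EDX:EAX by its operand, putting the quotient in EAX and the remainder in EDX.

#include <stdint.h>

typedef struct { uint32_t quotient, remainder; } div32_result;

/* C model of `div src` (32-bit form): the dividend is EDX:EAX, the divisor is src. */
static div32_result div32_model(uint32_t edx, uint32_t eax, uint32_t src) {
    uint64_t dividend = ((uint64_t)edx << 32) | eax;
    div32_result r = { (uint32_t)(dividend / src), (uint32_t)(dividend % src) };
    /* real hardware raises #DE instead when dividend / src >= 1ull << 32 */
    return r;
}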
The compiler will produce a 64-bit div for this function, and that is correct, because with a 32-bit div, a hardware exception (#DE) occurs if the quotient of the division does not fit in 32 bits.
uint32_t div_c(uint64_t a, uint32_t b) {
    return a / b;
}
However, if the quotient is known to fit in 32 bits, doing a full 64-bit division is unnecessary. I used __builtin_unreachable to tell the compiler about this, but it doesn't make a difference.
uint32_t div_c_ur(uint64_t a, uint32_t b) {
    uint64_t q = a / b;
    if (q >= 1ull << 32) __builtin_unreachable();
    return q;
}
For both div_c and div_c_ur, the output from gcc is,
mov rax, rdi
mov esi, esi
xor edx, edx
div rsi
ret
clang does an interesting optimization: it checks at run time whether the upper 32 bits of the dividend are zero and uses a 32-bit div if they are, but it still uses a 64-bit div when the dividend is actually 64-bit.
mov rax, rdi
mov ecx, esi
mov rdx, rdi
shr rdx, 32
je .LBB0_1
xor edx, edx
div rcx
ret
.LBB0_1:
xor edx, edx
div ecx
ret
I had to write it directly in assembly to achieve what I wanted. I couldn't find any other way to do this.
__attribute__((naked, sysv_abi))
uint32_t div_asm(uint64_t, uint32_t) {__asm__(
    "mov eax, edi\n\t"
    "mov rdx, rdi\n\t"
    "shr rdx, 32\n\t"
    "div esi\n\t"
    "ret\n\t"
);}
Was it worth it? At least perf reports 49.47% overhead for div_c versus 24.88% for div_asm, so on my computer (Tiger Lake), div r32 is about 2 times faster than div r64.
This is the benchmark code.
#include <stdint.h>
#include <stdio.h>

__attribute__((noinline))
uint32_t div_c(uint64_t a, uint32_t b) {
    uint64_t q = a / b;
    if (q >= 1ull << 32) __builtin_unreachable();
    return q;
}

__attribute__((noinline, naked, sysv_abi))
uint32_t div_asm(uint64_t, uint32_t) {__asm__(
    "mov eax, edi\n\t"
    "mov rdx, rdi\n\t"
    "shr rdx, 32\n\t"
    "div esi\n\t"
    "ret\n\t"
);}

static uint64_t rdtscp() {
    uint32_t _;
    return __builtin_ia32_rdtscp(&_);
}

int main() {
#define n 500000000ll
    uint64_t c;

    c = rdtscp();
    for (int i = 1; i <= n; ++i) {
        volatile uint32_t _ = div_c(i + n * n, i + n);
    }
    printf("  c%15llu\n", (unsigned long long)(rdtscp() - c));

    c = rdtscp();
    for (int i = 1; i <= n; ++i) {
        volatile uint32_t _ = div_asm(i + n * n, i + n);
    }
    printf("asm%15llu\n", (unsigned long long)(rdtscp() - c));
}
Every idea in this answer is based on comments by Nate Eldredge, through which I discovered some of the power of gcc's extended inline assembly. Even though I still have to write assembly, it is possible to create a custom as-if intrinsic function.
static inline uint32_t divqd(uint64_t a, uint32_t b) {
    if (__builtin_constant_p(b)) {
        return a / b;
    }
    uint32_t lo = a;
    uint32_t hi = a >> 32;
    __asm__("div %2" : "+a" (lo), "+d" (hi) : "rm" (b));
    return lo;
}
__builtin_constant_p returns 1 if b can be evaluated at compile time. +a and +d mean the values are read from and written to the a and d registers (eax and edx). rm specifies that the input b can be either a register or a memory operand.
To see whether inlining and constant propagation are done smoothly:
uint32_t divqd_r(uint64_t a, uint32_t b) {
    return divqd(a, b);
}
divqd_r:
mov rdx, rdi
mov rax, rdi
shr rdx, 32
div esi
ret
uint32_t divqd_m(uint64_t a) {
    extern uint32_t b;
    return divqd(a, b);
}
divqd_m:
mov rdx, rdi
mov rax, rdi
shr rdx, 32
div DWORD PTR b[rip]
ret
uint32_t divqd_c(uint64_t a) {
    return divqd(a, 12345);
}
divqd_c:
movabs rdx, 6120523590596543007
mov rax, rdi
mul rdx
shr rdx, 12
mov eax, edx
ret
and the results are satisfying (https://godbolt.org/z/47PE4ovMM).
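As an aside, here is a hypothetical caller (my illustration, not part of the answer) showing how the precondition that the quotient fits in 32 bits can be guaranteed by construction:

#include <stdint.h>

/* Hypothetical helper (illustration only): scale x by num/den.
   Assuming 0 < num <= den, the quotient (x * num) / den is <= x,
   so it always fits in 32 bits and the 32-bit div is safe. */
static uint32_t scale_down(uint32_t x, uint32_t num, uint32_t den) {
    return divqd((uint64_t)x * num, den);
}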
I'm trying to efficiently implement SHLD and SHRD instructions of x86 without using inline assembly.
uint32_t shld_UB_on_0(uint32_t a, uint32_t b, uint32_t c) {
    return a << c | b >> 32 - c;
}
seems to work, but invokes undefined behaviour when c == 0, because the second shift's count becomes 32. The actual SHLD instruction with a count of 0 is well defined to do nothing. (https://www.felixcloutier.com/x86/shld)
uint32_t shld_broken_on_0(uint32_t a, uint32_t b, uint32_t c) {
    return a << c | b >> (-c & 31);
}
doesn't invoke undefined behaviour, but when c == 0, -c & 31 is also 0, so the result is a | b instead of a.
uint32_t shld_safe(uint32_t a, uint32_t b, uint32_t c) {
    if (c == 0) return a;
    return a << c | b >> 32 - c;
}
does what's intended, but gcc now emits a branch (je). clang, on the other hand, is smart enough to translate it to a single shld instruction.
Is there any way to implement it correctly and efficiently without inline assembly?
And why does gcc try so hard to avoid emitting shld? The shld_safe attempt is translated by gcc 11.2 -O3 as (Godbolt):
shld_safe:
mov eax, edi
test edx, edx
je .L1
mov ecx, 32
sub ecx, edx
shr esi, cl
mov ecx, edx
sal eax, cl
or eax, esi
.L1:
ret
while clang does,
shld_safe:
mov ecx, edx
mov eax, edi
shld eax, esi, cl
ret
As far as I have tested with gcc 9.3 (x86-64), it translates the following code to shldq and shrdq.
uint64_t shldq_x64(uint64_t low, uint64_t high, uint64_t count) {
    return (uint64_t)(((((unsigned __int128)high << 64) | (unsigned __int128)low) << (count & 63)) >> 64);
}

uint64_t shrdq_x64(uint64_t low, uint64_t high, uint64_t count) {
    return (uint64_t)((((unsigned __int128)high << 64) | (unsigned __int128)low) >> (count & 63));
}
Also, gcc -m32 -O3 translates the following code to shld and shrd. (I have not tested with gcc (i386), though.)
uint32_t shld_x86(uint32_t low, uint32_t high, uint32_t count) {
    return (uint32_t)(((((uint64_t)high << 32) | (uint64_t)low) << (count & 31)) >> 32);
}

uint32_t shrd_x86(uint32_t low, uint32_t high, uint32_t count) {
    return (uint32_t)((((uint64_t)high << 32) | (uint64_t)low) >> (count & 31));
}
(I have just read the gcc source and written the above functions, so I'm not sure they are exactly the ones you expected.)
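As a quick sanity check (my own sketch, not part of the answer), the widened-shift version can be compared against shld_safe from the question. Note the argument order: the high parameter here corresponds to the question's destination operand a.

#include <assert.h>
#include <stdint.h>

/* shld_safe and shld_x86 are the functions defined above. */
uint32_t shld_safe(uint32_t a, uint32_t b, uint32_t c);
uint32_t shld_x86(uint32_t low, uint32_t high, uint32_t count);

int main(void) {
    uint32_t a = 0x12345678u, b = 0x9ABCDEF0u;
    for (uint32_t c = 0; c < 32; ++c)
        assert(shld_safe(a, b, c) == shld_x86(b, a, c));  /* includes c == 0 */
    return 0;
}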
A developer can use the __builtin_expect builtin to help the compiler understand in which direction a branch is likely to go.
In the future, we may get a standard attribute for this purpose, but as of today at least all of clang, icc and gcc support the non-standard __builtin_expect instead.
However, icc seems to generate oddly terrible code when you use it1. That is, code that uses the builtin is strictly worse than the code without it, regardless of which direction the prediction is made.
Take for example the following toy function:
int foo(int a, int b)
{
    do {
        a *= 77;
    } while (b-- > 0);

    return a * 77;
}
Out of the three compilers, icc is the only one that compiles this to the optimal scalar loop of 3 instructions:
foo(int, int):
..B1.2: # Preds ..B1.2 ..B1.1
imul edi, edi, 77 #4.6
dec esi #5.12
jns ..B1.2 # Prob 82% #5.18
imul eax, edi, 77 #6.14
ret
Both gcc and Clang manage to miss the easy solution and use 5 instructions.
On the other hand, when you use likely or unlikely macros on the loop condition, icc goes totally braindead:
#define likely(x) __builtin_expect((x), 1)
#define unlikely(x) __builtin_expect((x), 0)

int foo(int a, int b)
{
    do {
        a *= 77;
    } while (likely(b-- > 0));

    return a * 77;
}
This loop is functionally equivalent to the previous loop (since __builtin_expect just returns its first argument), yet icc produces some awful code:
foo(int, int):
mov eax, 1 #9.12
..B1.2: # Preds ..B1.2 ..B1.1
xor edx, edx #9.12
test esi, esi #9.12
cmovg edx, eax #9.12
dec esi #9.12
imul edi, edi, 77 #8.6
test edx, edx #9.12
jne ..B1.2 # Prob 95% #9.12
imul eax, edi, 77 #11.15
ret #11.15
The function has doubled in size to 10 instructions, and (worse yet!) the critical loop has more than doubled to 7 instructions with a long critical dependency chain involving a cmov and other weird stuff.
The same is true if you use the unlikely hint and also across all icc versions (13, 14, 17) that godbolt supports. So the code generation is strictly worse, regardless of the hint, and regardless of the actual runtime behavior.
Neither gcc nor clang suffers any degradation when hints are used.
What's up with that?
1 At least in the first and subsequent examples I tried.
To me it seems to be an ICC bug. This code (available on godbolt)
int c;

do
{
    a *= 77;
    c = b--;
}
while (likely(c > 0));
which simply uses an auxiliary local variable c, produces output without the edx = !!(esi > 0) pattern:
foo(int, int):
..B1.2:
mov eax, esi
dec esi
imul edi, edi, 77
test eax, eax
jg ..B1.2
It's still not optimal (it could do without eax), though.
I don't know if the official ICC policy about __builtin_expect is full support or just compatibility support.
This question seems better suited for the Official ICC forum.
I've tried posting this topic there, but I'm not sure I've done a good job (I've been spoiled by SO).
If they answer me I'll update this answer.
EDIT
I've got an answer at the Intel Forum; they recorded this issue in their tracking system.
As of today, it seems to be a bug.
Don't let the instructions deceive you. What matters is performance.
Consider this rather crude test:
#include "stdafx.h"
#include <windows.h>
#include <iostream>
int foo(int a, int b) {
do { a *= 7; } while (b-- > 0);
return a * 7;
}
int fooA(int a, int b) {
__asm {
mov esi, b
mov edi, a
mov eax, a
B1:
imul edi, edi, 7
dec esi
jns B1
imul eax, edi, 7
}
}
int fooB(int a, int b) {
__asm {
mov esi, b
mov edi, a
mov eax, 1
B1:
xor edx, edx
test esi, esi
cmovg edx, eax
dec esi
imul edi, edi, 7
test edx, edx
jne B1
imul eax, edi, 7
}
}
int main() {
DWORD start = GetTickCount();
int j = 0;
for (int aa = -10; aa < 10; aa++) {
for (int bb = -500; bb < 15000; bb++) {
j += foo(aa, bb);
}
}
std::cout << "foo compiled (/Od)\n" << "j = " << j << "\n"
<< GetTickCount() - start << "ms\n\n";
start = GetTickCount();
j = 0;
for (int aa = -10; aa < 10; aa++) {
for (int bb = -500; bb < 15000; bb++) {
j += fooA(aa, bb);
}
}
std::cout << "optimal scalar\n" << "j = " << j << "\n"
<< GetTickCount() - start << "ms\n\n";
start = GetTickCount();
j = 0;
for (int aa = -10; aa < 10; aa++) {
for (int bb = -500; bb < 15000; bb++) {
j += fooB(aa, bb);
}
}
std::cout << "use likely \n" << "j = " << j << "\n"
<< GetTickCount() - start << "ms\n\n";
std::cin.get();
return 0;
}
produces output:
foo compiled (/Od)
j = -961623752
4422ms
optimal scalar
j = -961623752
1656ms
use likely
j = -961623752
1641ms
This is naturally entirely CPU dependent (tested here on Haswell i7), but both asm loops generally are very nearly identical in performance when tested over a range of inputs. A lot of this has to do with the selection and ordering of instructions being conducive to leveraging instruction pipelining (latency), branch prediction, and other hardware optimizations in the CPU.
The real lesson when you're optimizing is that you need to profile - it's extremely difficult to do this by inspection of the raw assembly.
Even giving a challenging test where likely(b-- > 0) isn't true over a third of the time:
for (int aa = -10000000; aa < 10000000; aa++) {
    for (int bb = -3; bb < 9; bb++) {
        j += fooX(aa, bb);
    }
}
results in:
foo compiled (/Od) : 1844ms
optimal scalar : 906ms
use likely : 1187ms
Which isn't bad. What you have to keep in mind is that the compiler will generally do its best without your interference. Using __builtin_expect and the like should really be restricted to cases where you have existing code that you have profiled and that you have specifically identified as being both hotspots and as having pipeline or prediction issues. This trivial example is an ideal case where the compiler will almost certainly do the right thing without help from you.
By including __builtin_expect you're asking the compiler to necessarily compile in a different way - a more complex way, in terms of pure number of instructions, but a more intelligent way in that it structures the assembly in a way that helps the CPU make better branch predictions. In this case of pure register play (as in this example) there's not much at stake, but if it improves prediction in a more complex loop, maybe saving you a bad misprediction, cache misses, and related collateral damage, then it's probably worth using.
I think it's pretty clear here, at least, that when the branch actually is likely then we very nearly recover the full performance of the optimal loop (which I think is impressive). In cases where the "optimal loop" is rather more complex and less trivial we can expect that the codegen would indeed improve branch prediction rates (which is what this is really all about). I think this is really a case of if you don't need it, don't use it.
On the topic of likely vs unlikely generating the same assembly, this doesn't imply that the compiler is broken - it just means that the same codegen is effective regardless of whether the branch is mostly taken or mostly not taken - as long as it is mostly something, it's good (in this case). The codegen is designed to optimise use of the instruction pipeline and to assist branch prediction, which it does. While we saw some reduction in performance with the mixed case above, pushing the loop to mostly unlikely recovers performance.
for (int aa = -10000000; aa < 10000000; aa++) {
    for (int bb = -30; bb < 1; bb++) {
        j += fooX(aa, bb);
    }
}
foo compiled (/Od) : 2453ms
optimal scalar : 1968ms
use likely : 2094ms
Borland C has pseudo-registers _AX, _BX, _FLAGS, etc. that can be used in C code to save the registers to temp variables.
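For illustration, the Borland/Turbo C usage I mean looks roughly like this (my sketch, not from the original post):

/* Borland/Turbo C: _AX, _BX, _FLAGS, ... act like variables mapped onto
   the corresponding registers. */
unsigned int saved_ax, saved_flags;

void save_regs(void)
{
    saved_ax    = _AX;     /* copy AX into a temp variable */
    saved_flags = _FLAGS;  /* copy FLAGS into a temp variable */
}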
Is there any MSVC equivalent? I tried #AX, #BX, etc., but the compiler (MSVC 1.5) gave an error ('40' unrecognized symbol).
I'm developing a 16-bit pre-boot app and can't use .
Thanks.
You don't need pseudo-registers if you only move values between registers and variables. Example:
int a = 4;
int b = 999;

__asm
{
    mov eax, a; // eax equals 4
    mov b, eax; // b equals eax
}
// b equals 4 now
Edit: to copy the flags into a variable and back into the flags register again, you can use the LAHF and SAHF instructions. Example:
int flags = 0;

__asm
{
    lahf;           // AH = low byte of FLAGS
    mov flags, eax; // the saved flags end up in bits 8..15 of 'flags'
}

flags |= (1 << (8 + 3)); // set bit 3 of the saved FLAGS byte (it lives in bits 8..15)

__asm
{
    mov eax, flags;
    sahf;           // low byte of FLAGS = AH
    // 4th bit of the flags is set
}
I'm developing software on an 8051 processor. A frequent job is to split a 16-bit address into its high and low bytes. I want to see how many ways there are to achieve it. The ways I've come up with so far are below (say ptr is a 16-bit pointer, and int is a 16-bit int). [Note: rn and arn are registers.]
bitwise operation
ADDH = (unsigned int) ptr >> 8;
ADDL = (unsigned int) ptr & 0x00FF;
SDCC gives the following assembly code
; t.c:32: ADDH = (unsigned int) ptr >> 8;
mov ar6,r3
mov ar7,r4
mov _main_ADDH_1_1,r7
; t.c:33: ADDL = (unsigned int) ptr & 0x00FF;
mov _main_ADDL_1_1,r6
Keil C51 gives me:
; SOURCE LINE # 32
0045 AA00 R MOV R2,ptr+01H
0047 A900 R MOV R1,ptr+02H
0049 AE02 MOV R6,AR2
004B EE MOV A,R6
004C F500 R MOV ADDH,A
; SOURCE LINE # 33
004E AF01 MOV R7,AR1
0050 EF MOV A,R7
0051 F500 R MOV ADDL,A
which has a lot of useless code, IMHO.
pointer trick
ADDH = ((unsigned char *)&ptr)[0];
ADDL = ((unsigned char *)&ptr)[1];
SDCC gives me:
; t.c:37: ADDH = ((unsigned char *)&ptr)[0];
mov _main_ADDH_1_1,_main_ptr_1_1
; t.c:38: ADDL = ((unsigned char *)&ptr)[1];
mov _main_ADDL_1_1,(_main_ptr_1_1 + 0x0001)
Keil C51 gives me:
; SOURCE LINE # 37
006A 850000 R MOV ADDH,ptr
; SOURCE LINE # 38
006D 850000 R MOV ADDL,ptr+01H
which is the same as the SDCC version.
Andrey's mathematical approach
ADDH = ptr / 256;
ADDL = ptr % 256;
SDCC gives:
; t.c:42: ADDH = (unsigned int)ptr / 256;
mov ar5,r3
mov ar6,r4
mov ar7,r6
mov _main_ADDH_1_1,r7
; t.c:43: ADDL = (unsigned int)ptr % 256;
mov _main_ADDL_1_1,r5
I've no idea why sdcc uses the r7 register...
Keil C51 gives me:
; SOURCE LINE # 42
0079 AE00 R MOV R6,ptr
007B AF00 R MOV R7,ptr+01H
007D AA06 MOV R2,AR6
007F EA MOV A,R2
0080 F500 R MOV ADDH,A
; SOURCE LINE # 43
0082 8F00 R MOV ADDL,R7
I've no idea why Keil uses the R2 register either...
semaj's union approach
typedef union
{
    unsigned short u16;
    unsigned char u8[2];
} U16_U8;

U16_U8 ptr;
// Do something to set the variable ptr
ptr.u16 = ?;

ADDH = ptr.u8[0];
ADDL = ptr.u8[1];
SDCC gives me
; t.c:26: ADDH = uptr.u8[0];
mov _main_ADDH_1_1,_main_uptr_1_1
; t.c:27: ADDL = uptr.u8[1];
mov _main_ADDL_1_1,(_main_uptr_1_1 + 0x0001)
Keil C51 gives me:
; SOURCE LINE # 26
0028 850000 R MOV ADDH,uptr
; SOURCE LINE # 27
002B 850000 R MOV ADDL,uptr+01H
which is very similar to the pointer trick. However, this approach requires two more bytes of memory to store the union.
Does anyone have any other bright ideas? ;)
And can anyone tell me which way is more efficient?
In case anyone is interested, here is the test case:
typedef union
{
    unsigned short u16;
    unsigned char u8[2];
} U16_U8;

// call a function on the ADDs to avoid optimization
void swap(unsigned char *a, unsigned char *b)
{
    unsigned char tm;
    tm = *a;
    *a = *b;
    *b = tm;
}

main (void)
{
    char c[] = "hello world.";
    unsigned char xdata *ptr = (unsigned char xdata *)c;
    unsigned char ADDH, ADDL;
    unsigned char i = 0;
    U16_U8 uptr;

    uptr.u16 = (unsigned short)ptr;
    for ( ; i < 4 ; i++, uptr.u16++){
        ADDH = uptr.u8[0];
        ADDL = uptr.u8[1];
        swap(&ADDH, &ADDL);
    }

    i = 0; // reset the counter so each loop runs
    for ( ; i < 4 ; i++, ptr++){
        ADDH = (unsigned int) ptr >> 8;
        ADDL = (unsigned int) ptr & 0x00FF;
        swap(&ADDH, &ADDL);
    }

    i = 0;
    for ( ; i < 4 ; i++, ptr++){
        ADDH = ((unsigned char *)&ptr)[0];
        ADDL = ((unsigned char *)&ptr)[1];
        swap(&ADDH, &ADDL);
    }

    i = 0;
    for ( ; i < 4 ; i++, ptr++){
        ADDH = (unsigned int)ptr / 256;
        ADDL = (unsigned int)ptr % 256;
        swap(&ADDH, &ADDL);
    }
}
The most efficient way is completely dependent on the compiler. You definitely have to figure out how to get an assembly listing from your compiler for an 8051 project.
One method you might try that is similar to those already mentioned is a union:
typedef union
{
    unsigned short u16;
    unsigned char u8[2];
} U16_U8;

U16_U8 ptr;
// Do something to set the variable ptr
ptr.u16 = ?;

ADDH = ptr.u8[0];
ADDL = ptr.u8[1];
Another not so bright way to split the address:
ADDH = ptr / 256;
ADDL = ptr % 256;
The most efficient is the first one, since it is done in a single instruction.
NO! Sorry, I lied to you. I forgot that the 8051 instruction set has only 1-bit shift instructions. The second one should be faster, but the compiler may generate stupid code, so beware and check the assembly output.
I just create two defines (as follows).
It seems more straightforward and less error prone.
#define HI(x) ((x) >> 8)
#define LO(x) ((x) & 0xFF)
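Usage would look something like this (illustrative values, not from the original post):

unsigned int addr = 0x1234;      /* a hypothetical 16-bit address value */
unsigned char ADDH = HI(addr);   /* 0x12 */
unsigned char ADDL = LO(addr);   /* 0x34 */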