__divdi3 used by gcc for long long division on x86 - c

When gcc sees multiplication or division of integer types that isn't supported in hardware, it generates a call to a special library function.
http://gcc.gnu.org/onlinedocs/gccint/Integer-library-routines.html#Integer-library-routines
According to the link above, long __divdi3 (long a, long b) is used for division of long. However, here http://gcc.gnu.org/onlinedocs/gcc-3.3/gccint/Library-Calls.html divdi is explained as a "call for division of one signed double-word". While the first source clearly maps the di suffix to long arguments, the second states that divdi is for a double-word and udivdi for a full-word (single, right?).
When I compile this simple example
int main(int argc, char *argv[]) {
long long t1, t2, tr;
t1 = 1;
t2 = 1;
tr = t1 / t2;
return tr;
}
with gcc -Wall -O0 -m32 -march=i386 (gcc ver. 4.7.2)
the disassembly shows me
080483cc <main>:
80483cc: 55 push %ebp
80483cd: 89 e5 mov %esp,%ebp
80483cf: 83 e4 f0 and $0xfffffff0,%esp
80483d2: 83 ec 30 sub $0x30,%esp
80483d5: c7 44 24 28 01 00 00 movl $0x1,0x28(%esp)
80483dc: 00
80483dd: c7 44 24 2c 00 00 00 movl $0x0,0x2c(%esp)
80483e4: 00
80483e5: c7 44 24 20 01 00 00 movl $0x1,0x20(%esp)
80483ec: 00
80483ed: c7 44 24 24 00 00 00 movl $0x0,0x24(%esp)
80483f4: 00
80483f5: 8b 44 24 20 mov 0x20(%esp),%eax
80483f9: 8b 54 24 24 mov 0x24(%esp),%edx
80483fd: 89 44 24 08 mov %eax,0x8(%esp)
8048401: 89 54 24 0c mov %edx,0xc(%esp)
8048405: 8b 44 24 28 mov 0x28(%esp),%eax
8048409: 8b 54 24 2c mov 0x2c(%esp),%edx
804840d: 89 04 24 mov %eax,(%esp)
8048410: 89 54 24 04 mov %edx,0x4(%esp)
8048414: e8 17 00 00 00 call 8048430 <__divdi3>
8048419: 89 44 24 18 mov %eax,0x18(%esp)
804841d: 89 54 24 1c mov %edx,0x1c(%esp)
8048421: 8b 44 24 18 mov 0x18(%esp),%eax
8048425: c9 leave
8048426: c3 ret
Note 8048414: call 8048430 <__divdi3>.
I can't use the gcc library for my project, and the project is multi-platform. I was hoping not to have to write all the __* functions for every platform (speed doesn't matter), but now I'm a bit confused.
Can somebody explain why a __divdi3 call (and not __divti3) is generated for long long int (64-bit) division?

On x86 machines, the term "word" usually refers to a 16-bit value. More generally in computer science, a word can denote a value of virtually arbitrary length; words of 10 or 12 bits are not uncommon in embedded systems.
I believe the terminology you have hit upon is used on Linux/Unix systems simply for the sake of unification at the operating-system level and has nothing to do with the target platform of your build. The same notation can be found in gdb, which uses w for a 32-bit word and hw for a 16-bit "half-word" (in the x86 sense).
Furthermore, this convention also extends naturally to the standard IEEE-754 floating-point numbers, and is summarised in the bullet points below:
s - single (precision, word): four-byte integers (int) / four-byte floats (float)
d - double (precision): eight-byte integers (long long on 32-bit targets, long on typical 64-bit ones) / eight-byte floats (double)
t - tetra: sixteen-byte integers (128-bit types such as gcc's __int128) / sixteen-byte floats (128-bit long double, where available)
This naming convention is used for all the arithmetic built-ins, like __divsi3, __divdi3, __divti3 or __mulsi3, __muldi3, __multi3... (and all the u - unsigned - variants). A complete list can be found here. So the answer to your question: __divdi3 is generated because di means a double-word (8-byte) operand, which is exactly what a 64-bit long long is; __divti3 would be used for 16-byte (128-bit) operands.
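To see the convention one step further up (an illustrative sketch, not part of the original question): on x86-64, where there is no 128-bit divide instruction, dividing gcc's __int128 type typically produces a call to __divti3, the ti (16-byte) counterpart of the __divdi3 you are seeing for 8-byte operands on a 32-bit target.
/* Illustrative only: compile on x86-64 and disassemble; the division
   below is typically lowered to a call to the libgcc helper __divti3. */
__int128 div128(__int128 a, __int128 b)
{
    return a / b;   /* no 128-bit hardware divide -> libgcc helper call */
}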
Dividing 64-bit numbers on a 32-bit machine requires a somewhat involved algorithm. However, you can still use the long-division principle you learned in school. Here's simple pseudo-code for it (have a look at this answer about big integers); a C sketch follows the pseudo-code:
result = 0;
count = 0;
remainder = numerator;
while (highest_bit_of_divisor_not_set) {
    divisor = divisor << 1;
    count++;
}
while (remainder != 0) {
    if (remainder >= divisor) {
        remainder = remainder - divisor;
        result = result | (1 << count);
    }
    if (count == 0) {
        break;
    }
    divisor = divisor >> 1;
    count--;
}

Related

Why is the Linux kernel's bitops performance slower than mine?

I'm looking for the best bitops library or function written in C, and I figured the Linux kernel would be the best in this case.
So I copied the Linux kernel's set_bit function from arch/x86/include/asm/bitops.h, compared it with mine, and saw strange results!
kernel_bitops.c
#define ADDR BITOP_ADDR(addr)
#define __ASM_FORM(x) #x
#define BITOP_ADDR(x) "m" (*(volatile long *) (x))
#define __ASM_SEL(a,b) __ASM_FORM(b)
#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, inst##q##__VA_ARGS__)
__always_inline void linux_set_bit(long nr, volatile unsigned long *addr)
{
asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
my_bitops.c
#define SETBIT(_value, _bitIndex) _value |= (1ul<<(_bitIndex))
__always_inline void mine_set_bit(long nr, volatile unsigned long *addr)
{
SETBIT(*addr,nr);
}
main.c
#define ARRAY_SIZE 10000000
static unsigned long num_array[ARRAY_SIZE];
unsigned long int num = 0x0F00000F00000000;
for (int i = 0; i < ARRAY_SIZE; i++)
num_array[i] = num;
clock_t start = clock();
for (unsigned long int i = 0 ; i < ARRAY_SIZE; i++)
for (unsigned long int j = 0; j < sizeof(unsigned long int) * 8; j++)
// linux_set_bit(j, &num_array[i]);
// mine_set_bit(j, &num_array[i]);
clock_t end = clock();
Time taken for Linux: 1375991 us
Time taken for mine: 912256 us
CPU: Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz
The assembly code generated with -O2 is:
26 [1] linux_set_bit(j, &num_array[i]);
0x4005c0 <+ 90> 48 8b 45 d0 mov -0x30(%rbp),%rax
0x4005c4 <+ 94> 48 c1 e0 03 shl $0x3,%rax
0x4005c8 <+ 98> 48 8d 90 60 10 60 00 lea 0x601060(%rax),%rdx
0x4005cf <+ 105> 48 8b 45 d8 mov -0x28(%rbp),%rax
0x4005d3 <+ 109> 48 89 d6 mov %rdx,%rsi
0x4005d6 <+ 112> 48 89 c7 mov %rax,%rdi
0x4005d9 <+ 115> e8 69 00 00 00 callq 0x400647 <linux_set_bit>
71 [1] asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
0x400653 <+ 12> 48 8b 45 f0 mov -0x10(%rbp),%rax
0x400657 <+ 16> 48 8b 55 f8 mov -0x8(%rbp),%rdx
0x40065b <+ 20> 48 0f ab 10 bts %rdx,(%rax)
19 [1] SETBIT(*addr,nr);
0x400653 <+ 12> 48 8b 45 f0 mov -0x10(%rbp),%rax
0x400657 <+ 16> 48 8b 00 mov (%rax),%rax
0x40065a <+ 19> 48 8b 55 f8 mov -0x8(%rbp),%rdx
0x40065e <+ 23> be 01 00 00 00 mov $0x1,%esi
0x400663 <+ 28> 89 d1 mov %edx,%ecx
0x400665 <+ 30> d3 e6 shl %cl,%esi
0x400667 <+ 32> 89 f2 mov %esi,%edx
0x400669 <+ 34> 89 d2 mov %edx,%edx
0x40066b <+ 36> 48 09 c2 or %rax,%rdx
0x40066e <+ 39> 48 8b 45 f0 mov -0x10(%rbp),%rax
0x400672 <+ 43> 48 89 10 mov %rdx,(%rax)
Where am I wrong? Or does Linux simply have a slow operation?
The main difference is that your code can't handle a "bit number" larger than the number of bits in an unsigned long, while Linux's version can. Because of this difference you've written a loop that works around your version's limitations, which isn't ideal when those limitations aren't there, and isn't ideal for Linux's version.
Specifically; for Linux's version, you could/should do this:
for (unsigned long int i = 0 ; i < ARRAY_SIZE * sizeof(unsigned long int) * 8; i++) {
linux_set_bit(i, num_array);
}
By removing the entire inner loop overhead, plus the calculation needed to find a pointer to an element of the array (the &num_array[i] part), it'll be significantly faster (and probably faster than yours).
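For reference, the same "not limited to the first unsigned long" behaviour can be written in portable C as well; a minimal non-atomic sketch (not the kernel's implementation) that splits the bit number into a word index and a bit offset:
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static inline void portable_set_bit(unsigned long nr, unsigned long *addr)
{
    /* Select the word that holds bit nr, then set that bit inside it. */
    addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}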
Yes, bts %reg, (mem) is slow (https://uops.info); IDK why Linux forces that form without using a lock prefix. Possibly the operation needs to be atomic wrt. interrupts on the same core, which doing it with a single instruction accomplishes.
If not, it's faster to emulate it with multiple instructions to calculate the address of the byte or dword containing the bit you want: How can memory destination BTS be significantly slower than load / BTS reg,reg / store?
(bts imm, (mem) is not bad, though, so you could use __builtin_constant_p(bitpos) and use memory-destination bts.)
As #Brendan points out, your version only works for bitpos < sizeof(unsigned long) * CHAR_BIT, i.e. within the first qword.
I don't know why exactly Linux forces a memory destination bts with a volatile pointer. Presumably there's some kind of reason other than performance. Otherwise, yes it's a missed optimization.

Why is 10/3 exact in C? [duplicate]

Take a look at this code.
10/3 returns 3.3333332538604736328125000, and when I multiply that by 3 on a calculator I get 9.99..., but if I do the same in the code I get exactly 10.00. How is that possible?
#include <stdlib.h>
#include <stdio.h>
int main() {
float v = 10.f/3.f;
float test = v*3.f;
printf("10/3 => %25.25f \n (10/3)*3 => %25.25f\n",v,test);
return 0;
}
This is the assembly code without printf, compiled using default gcc 7.2.1 parameters:
0000000000400497 <main>:
400497: 55 push rbp
400498: 48 89 e5 mov rbp,rsp
40049b: f3 0f 10 05 b1 00 00 movss xmm0,DWORD PTR [rip+0xb1] # 400554 <_IO_stdin_used+0x4>
4004a2: 00
4004a3: f3 0f 11 45 fc movss DWORD PTR [rbp-0x4],xmm0
4004a8: f3 0f 10 4d fc movss xmm1,DWORD PTR [rbp-0x4]
4004ad: f3 0f 10 05 a3 00 00 movss xmm0,DWORD PTR [rip+0xa3] # 400558 <_IO_stdin_used+0x8>
4004b4: 00
4004b5: f3 0f 59 c1 mulss xmm0,xmm1
4004b9: f3 0f 11 45 f8 movss DWORD PTR [rbp-0x8],xmm0
4004be: b8 00 00 00 00 mov eax,0x0
4004c3: 5d pop rbp
4004c4: c3 ret
4004c5: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
4004cc: 00 00 00
4004cf: 90 nop
I think the mulss result is being rounded by some CPU feature.
For reference, 10/3 in the GNU bc program returns 3.3333333333333333333333 ( *3 => 9.9999...) and in SciLab returns 3.3333333333333334813631 ( *3 => 10).
You end up getting exactly 10 as a result because the representation happens to work out that way. I get the same on my implementation for both float and double.
Let's look at an example using double:
If we print out 10./3. in hexadecimal floating point notation using %a, we get this:
0x1.aaaaaaaaaaaabp+1
This matches up with the IEEE 754 double representation 0x400aaaaaaaaaaaab.
The above number normalized is:
0x3.5555555555556
In binary:
11.0101010101010101010101010101010101010101010101011
To keep things simple, let's add three times instead of multiplying by 3:
11.0101010101010101010101010101010101010101010101011
+ 11.0101010101010101010101010101010101010101010101011
---------------------------------------------------------
110.1010101010101010101010101010101010101010101010111
+ 11.0101010101010101010101010101010101010101010101011
---------------------------------------------------------
1010.0000000000000000000000000000000000000000000000000
Which is exactly 10.
EDIT:
Looks like I managed to botch the math on the last few digits. The actual sum:
11.0101010101010101010101010101010101010101010101011
+ 11.0101010101010101010101010101010101010101010101011
---------------------------------------------------------
110.1010101010101010101010101010101010101010101010110
+ 11.0101010101010101010101010101010101010101010101011
---------------------------------------------------------
1010.0000000000000000000000000000000000000000000000001
So it's not exactly 10, but off by the least significant bit.
I noticed a similar difference when using float.
10.f/3.f printed with %a:
0x1.aaaaaap+1
Normalized:
0x3.555554
In binary:
11.0101010101010101010101
Then we add:
11.0101010101010101010101
+ 11.0101010101010101010101
------------------------------
110.1010101010101010101010
+ 11.0101010101010101010101
------------------------------
1001.1111111111111111111111
Again, off by the least significant bit.
As for how the actual result is rounded: the exact product differs from 10 by less than half an ULP of 10 (2^-22 versus 2^-21 in the float case), so the round-to-nearest step of the final multiplication gives back exactly 10.
The reason for the discrepancy between what you see in C and what you see in SciLab is that you are using single-precision floating-point values (float) in C, whereas SciLab seems to use double-precision values (double) by default.
You can see the difference here (just remove the f suffix from your numbers and put double instead of float).
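To see the precision difference directly, here is a small self-contained check (just a sketch; the exact digits assume the usual IEEE-754 binary32/binary64 formats for float and double):
#include <stdio.h>

int main(void)
{
    float  fv = 10.f / 3.f;   /* single precision, as in the question */
    double dv = 10.0 / 3.0;   /* double precision, as SciLab uses     */
    printf("float : %a -> *3 = %.25f\n", fv, fv * 3.f);
    printf("double: %a -> *3 = %.25f\n", dv, dv * 3.0);
    return 0;
}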

Type casting a macro to optimize the code

I'm working on optimizing some code. Is it a good idea to cast a macro to char to reduce memory consumption? What could be the side effects of doing this?
Example:
#define TRUE 1 //non-optimized code
sizeof(TRUE) --> 4
#define TRUE ((char) 0x01) //To optimize
sizeof(TRUE) --> 1
#define MAX 10 //non-optimized code
sizeof(MAX) --> 4
#define MAX ((char) 10) //To optimize
sizeof(MAX) --> 1
They will make virtually no difference in memory consumption.
These macros provide values to be used in expressions, while the actual memory usage is (roughly) dictated by the type and number of variables and dynamically allocated memory. So, you may have TRUE as an int or as a char, but what actually matters is the type of variable it (or, the expression in which it appears) gets assigned to, which is not influenced by the type of the constant.
The only influence the type of these constants may have is in how the expressions they are used in are carried out - but even that effect should be almost nonexistent, given that the C standard (simplifying) implicitly promotes all the smaller types to int or unsigned int before carrying out almost any operation. [1]
So: if you want to reduce your memory consumption, don't look at your constants, but at your data structures, especially global and dynamically-allocated ones [2]! Maybe you have a huge array of double values where the precision of float would be enough, maybe you are keeping big data around longer than you need it, or you have memory leaks, or a big array of a badly-laid-out struct, or of booleans that are 4 bytes wide when they could be a bitfield - this is the kind of thing you should look at, definitely not these #defines.
Notes
[1] The idea being that integral operations are carried out at the native register size, which traditionally corresponds to int. Besides, even if this rule weren't true, the only memory effect of changing the size of integral temporaries in expressions would be, at most, a slight increase in stack usage (which is generally preallocated anyway) in case of heavy register spilling.
[2] What is allocated on the stack generally isn't problematic: as said above, it's generally preallocated, and if you were exhausting it your program would already be crashing.
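As a small illustration of the data-layout point above (the sizes shown are typical for common ABIs, not guaranteed by the standard):
#include <stdio.h>

struct flags_wide   { int a, b, c, d; };               /* typically 16 bytes */
struct flags_packed { unsigned a:1, b:1, c:1, d:1; };  /* typically 4 bytes  */

int main(void)
{
    printf("%zu %zu\n", sizeof(struct flags_wide), sizeof(struct flags_packed));
    return 0;
}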
There is no such thing as a char constant in C, which is why there are no suffixes for "short" and "char", as there are for "long" and "long long". The cast value (char)0x10 will immediately be promoted back to an int in almost any integer context, because of the integer promotions (§6.3.1.1p2).
So if c is a char and you write if (c == (char)0x10) ..., both c and (char)0x10 are promoted to int before being compared.
Of course, a given compiler might elide the conversion if it knows that it makes no difference, but that compiler would certainly also use a byte constant if possible even without the explicit cast.
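A quick way to see both the sizeof difference and the promotion on your own compiler (purely an illustration of the answer above; the macro names are made up):
#include <stdio.h>

#define TRUE_INT  1
#define TRUE_CHAR ((char)0x01)

int main(void)
{
    char c = 1;
    printf("%zu %zu\n", sizeof(TRUE_INT), sizeof(TRUE_CHAR)); /* typically 4 and 1 */
    /* Both c and TRUE_CHAR are promoted to int before the comparison,
       so the cast changes nothing about the generated code. */
    printf("%d\n", c == TRUE_CHAR);
    return 0;
}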
How much this optimizes depends on (1) where those defines are used and (2) the architecture of the processor (or microcontroller) the code runs on.
The (1) has already been addressed in other answers.
Point (2) is important because there are processors/microcontrollers that perform better with 8-bit values than with 32-bit ones. There are also, for example, 16-bit processors, and using 8-bit variables on them can decrease the memory needed but increase the run time of the program.
Below is an example and its disassembly:
#include <stdint.h>
#define _VAR_UINT8 ((uint8_t) -1)
#define _VAR_UINT16 ((uint16_t) -1)
#define _VAR_UINT32 ((uint32_t) -1)
#define _VAR_UINT64 ((uint64_t) -1)
volatile uint8_t v1b;
volatile uint16_t v2b;
volatile uint32_t v4b;
volatile uint64_t v8b;
int main(void) {
v1b = _VAR_UINT8;
v2b = _VAR_UINT8;
v2b = _VAR_UINT16;
v4b = _VAR_UINT8;
v4b = _VAR_UINT16;
v4b = _VAR_UINT32;
v8b = _VAR_UINT8;
v8b = _VAR_UINT16;
v8b = _VAR_UINT32;
v8b = _VAR_UINT64;
return 0;
}
Below is the disassembly for one specific x86-64 platform (it could be different if you compile the above code and generate the disassembly on your own processor):
00000000004004ec <main>:
4004ec: 55 push %rbp
4004ed: 48 89 e5 mov %rsp,%rbp
4004f0: c6 05 49 0b 20 00 ff movb $0xff,0x200b49(%rip) # 601040 <v1b>
4004f7: 66 c7 05 48 0b 20 00 movw $0xff,0x200b48(%rip) # 601048 <v2b>
4004fe: ff 00
400500: 66 c7 05 3f 0b 20 00 movw $0xffff,0x200b3f(%rip) # 601048 <v2b>
400507: ff ff
400509: c7 05 31 0b 20 00 ff movl $0xff,0x200b31(%rip) # 601044 <v4b>
400510: 00 00 00
400513: c7 05 27 0b 20 00 ff movl $0xffff,0x200b27(%rip) # 601044 <v4b>
40051a: ff 00 00
40051d: c7 05 1d 0b 20 00 ff movl $0xffffffff,0x200b1d(%rip) # 601044 <v4b>
400524: ff ff ff
400527: 48 c7 05 06 0b 20 00 movq $0xff,0x200b06(%rip) # 601038 <v8b>
40052e: ff 00 00 00
400532: 48 c7 05 fb 0a 20 00 movq $0xffff,0x200afb(%rip) # 601038 <v8b>
400539: ff ff 00 00
40053d: c7 05 f1 0a 20 00 ff movl $0xffffffff,0x200af1(%rip) # 601038 <v8b>
400544: ff ff ff
400547: c7 05 eb 0a 20 00 00 movl $0x0,0x200aeb(%rip) # 60103c <v8b+0x4>
40054e: 00 00 00
400551: 48 c7 05 dc 0a 20 00 movq $0xffffffffffffffff,0x200adc(%rip) # 601038 <v8b>
400558: ff ff ff ff
40055c: b8 00 00 00 00 mov $0x0,%eax
400561: 5d pop %rbp
400562: c3 retq
400563: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
40056a: 00 00 00
40056d: 0f 1f 00 nopl (%rax)
On my specific platform it uses four kinds of mov instruction: movb (7 bytes), movw (9 bytes), movl (10 bytes) and movq (11 bytes), depending on the variable type and on the value being assigned.

What is the difference between directly assigning the result of left shift operation to a variable and the left shift assignment operation in C?

In the following expression, the result of the left shift operation is assigned to the variable i.
int i;
i = 7 << 32;
printf("i = %d\n",i);
In the following expression, the left shift assignment operation is carried out.
int x = 7;
x <<= 32;
printf("x = %d\n",x);
The two expressions above gave different results. But that's not the case with the following two expressions: both of them gave the same result. So what could be the reason for the shift expressions returning different values?
int a;
a = 1 + 1;
printf("a = %d\n",a);
int b = 1;
b += 1;
printf("b = %d\n",b);
The C standard says:
The result is undefined if the right operand is negative, or greater
than or equal to the number of bits in the left expression’s type.
So, it is undefined behavior: int is normally 32 bits in size, which means that only shift counts 0 through 31 are well-defined.
I agree with Cody Gray's comments. Just for people who end up here in the future, the way to avoid this undefined behaviour is to use a 64-bit type such as unsigned long long.
unsigned long long int b = 7ULL<<32; // ULL here is important, as it tells the compiler that the value being shifted is wider than 32 bits.
unsigned long long int a = 7;
a <<=32;
The abstract operational semantics from ISO/IEC 9899 says:
6.5.7 Bitwise shift operators --- Semantics
3. [...] If the value of the right operand is negative or is greater than or equal to the width of the promoted left operand, the behavior is undefined.
In your case, disassembling and seeing what happens, we see this:
[root#arch stub]# objdump -d a.out | sed '/ <main>/,/^$/ !d'
00000000004004f6 <main>:
4004f6: 55 push %rbp
4004f7: 48 89 e5 mov %rsp,%rbp
4004fa: 48 83 ec 10 sub $0x10,%rsp
4004fe: c7 45 fc 07 00 00 00 movl $0x7,-0x4(%rbp)
400505: b8 20 00 00 00 mov $0x20,%eax
40050a: 89 c1 mov %eax,%ecx
40050c: d3 65 fc shll %cl,-0x4(%rbp) <<== HERE IS THE PROBLEM
40050f: 8b 45 fc mov -0x4(%rbp),%eax
400512: 89 c6 mov %eax,%esi
400514: bf b4 05 40 00 mov $0x4005b4,%edi
400519: b8 00 00 00 00 mov $0x0,%eax
40051e: e8 cd fe ff ff callq 4003f0 <printf#plt>
400523: b8 00 00 00 00 mov $0x0,%eax
400528: c9 leaveq
400529: c3 retq
40052a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
The generated code does indeed try to shift, but the shll %cl,-0x4(%rbp) (a 32-bit shift left) has no effect: the hardware masks the count in %cl to its low 5 bits, so a shift by 32 becomes a shift by 0.
The undefined behaviour in the C code thus surfaces at the machine level, in how the SHL instruction treats the out-of-range count.
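If the goal is to actually compute 7 * 2^32, the well-defined route is to do the shift in a 64-bit type, as the earlier answer shows. A minimal sketch that also shows what the low 32 bits end up being:
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t wide = (uint64_t)7 << 32;          /* well-defined 64-bit shift */
    printf("%llu\n", (unsigned long long)wide); /* 30064771072               */
    printf("%u\n", (unsigned int)wide);         /* truncated to 32 bits: 0   */
    return 0;
}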

What do the numeric values in disassembled C code mean?

I'm trying to understand assembly and C code.
I have the following C program, compiled to generate an object file only.
#include <stdio.h>
int main()
{
int i = 10;
int j = 22 + i;
return 0;
}
I executed the following command:
objdump -S myprogram.o
The output of the above command is:
objdump -S testelf.o
testelf.o: file format elf32-i386
Disassembly of section .text:
00000000 <main>:
#include <stdio.h>
int main()
{
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 83 ec 10 sub $0x10,%esp
int i = 10;
6: c7 45 f8 0a 00 00 00 movl $0xa,-0x8(%ebp)
int j = 22 + i;
d: 8b 45 f8 mov -0x8(%ebp),%eax
10: 83 c0 16 add $0x16,%eax
13: 89 45 fc mov %eax,-0x4(%ebp)
return 0;
16: b8 00 00 00 00 mov $0x0,%eax
}
1b: c9 leave
1c: c3 ret
What is meant by the numeric values before the mnemonics,
i.e. "83 ec 10" before the "sub" instruction or
"c7 45 f8 0a 00 00 00" before the "movl" instruction?
I'm using the following platform to compile this code:
$ lscpu
Architecture: i686
CPU op-mode(s): 32-bit
Byte Order: Little Endian
CPU(s): 1
On-line CPU(s) list: 0
Thread(s) per core: 1
Core(s) per socket: 1
Socket(s): 1
Vendor ID: GenuineIntel
Those are the x86 machine-code bytes (the instruction encodings). A detailed reference, other than the ones listed in the comments above, is available here.
For example, the c7 45 f8 0a 00 00 00 before movl $0xa,-0x8(%ebp) are the hexadecimal values of the bytes that encode that instruction. They tell the CPU to move the immediate value 10 decimal (as a 4-byte value) into the stack slot addressed 8 bytes below the stack frame base pointer. That is where the variable i from your C source code lives while your code is running. (The top of the stack is at a lower memory address than the bottom, so moving in the negative direction from the base pointer is moving towards the top of the stack.)
Breaking the encoding down: c7 is the opcode for mov r/m32, imm32, 45 is the ModRM byte selecting the addressing mode [ebp+disp8], and f8 is the 8-bit displacement -8. See the reference for more detail.
The remaining four bytes are the immediate value. Since you are using a little-endian system, the least significant byte of a number comes first, so 10 decimal, which is 0x0a in hexadecimal and 0x0000000a as a 4-byte value, is stored as 0a 00 00 00.
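As another worked example, here is "83 c0 16 add $0x16,%eax" decoded by hand against the Intel opcode tables (the field names are informal):
83    opcode: ADD r/m32, imm8 (the /0 in the ModRM reg field selects ADD)
c0    ModRM:  mod=11 (register direct), reg=000 (/0), r/m=000 (%eax)
16    imm8:   0x16 = 22 decimal, the constant from "int j = 22 + i" in the source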
