I have found that computing % manually on unsigned __int128 is significantly faster than the compiler's built-in operator. I will show how to calculate modulo 9, but the method can be used for any other modulus.
First, consider the built-in compiler operator:
uint64_t mod9_v1(unsigned __int128 n)
{
return n % 9;
}
Now consider my manual implementation:
uint64_t mod9_v2(unsigned __int128 n)
{
    uint64_t r = 0;
    r += (uint32_t)(n);
    r += (uint32_t)(n >> 32) * (uint64_t)4; // 2^32 % 9 == 4
    r += (uint32_t)(n >> 64) * (uint64_t)7; // 2^64 % 9 == 7
    r += (uint32_t)(n >> 96);               // 2^96 % 9 == 1
    return r % 9;
}
Measuring over 100,000,000 random numbers gives the following results:
mod9_v1 | 3.986052 secs
mod9_v2 | 1.814339 secs
GCC 9.3.0 with -march=native -O3 was used on AMD Ryzen Threadripper 2990WX.
Here is a link to godbolt.
Does it behave the same way on your side? (I would like to check before reporting a bug to GCC Bugzilla.)
UPDATE:
On request, here is the generated assembly:
mod9_v1:
sub rsp, 8
mov edx, 9
xor ecx, ecx
call __umodti3
add rsp, 8
ret
mod9_v2:
mov rax, rdi
shrd rax, rsi, 32
mov rdx, rsi
mov r8d, eax
shr rdx, 32
mov eax, edi
add rax, rdx
lea rax, [rax+r8*4]
mov esi, esi
lea rcx, [rax+rsi*8]
sub rcx, rsi
mov rax, rcx
movabs rdx, -2049638230412172401
mul rdx
mov rax, rdx
shr rax, 3
and rdx, -8
add rdx, rax
mov rax, rcx
sub rax, rdx
ret
The reason for this difference is clear from the assembly listings: the % operator applied to 128-bit integers is implemented via a call to a generic library function (__umodti3) that cannot take advantage of compile-time knowledge of the divisor value. It is that knowledge which lets the compiler turn division and modulo by a constant into much faster multiplications.
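For reference, the constant-divisor trick can be written in C; this is my reconstruction of the mod9_v2 tail above, with the magic constant taken from the movabs in the listing (0xE38E38E38E38E38F == ceil(2^67 / 9)), not code from the original post:

#include <stdint.h>

/* n % 9 without a divide: q = n / 9 is obtained from the high 64 bits
   of n * magic, shifted right by 3; then n % 9 = n - 9 * q.
   This matches the mul/shr/sub tail of the mod9_v2 listing. */
static uint64_t mod9_u64(uint64_t n)
{
    const uint64_t magic = 0xE38E38E38E38E38Full; /* ceil(2^67 / 9) */
    uint64_t q = (uint64_t)(((unsigned __int128)n * magic) >> 64) >> 3;
    return n - 9 * q;
}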
The timing difference is even more significant on my old MacBook Pro using clang, where mod9_v2() is about 15 times faster than mod9_v1().
Note however these remarks:
You should measure the CPU time just after the end of the for loop, not after the first printf as currently coded.
rand_u128() only produces 124 bits, assuming RAND_MAX is 0x7fffffff.
Most of the time is spent computing the random numbers.
Using your slicing approach, I extended your code to reduce the number of steps, using slices of 42, 42 and 44 bits, which further improves the timings (because 2^42 % 9 == 1):
#pragma GCC diagnostic ignored "-Wpedantic"
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <time.h>
static uint64_t mod9_v1(unsigned __int128 n) {
return n % 9;
}
static uint64_t mod9_v2(unsigned __int128 n) {
uint64_t r = 0;
r += (uint32_t)(n);
r += (uint32_t)(n >> 32) * (uint64_t)(((uint64_t)1ULL << 32) % 9);
r += (uint32_t)(n >> 64) * (uint64_t)(((unsigned __int128)1 << 64) % 9);
r += (uint32_t)(n >> 96);
return r % 9;
}
static uint64_t mod9_v3(unsigned __int128 n) {
return (((uint64_t)(n >> 0) & 0x3ffffffffff) +
((uint64_t)(n >> 42) & 0x3ffffffffff) +
((uint64_t)(n >> 84))) % 9;
}
unsigned __int128 rand_u128() {
return ((unsigned __int128)rand() << 97 ^
(unsigned __int128)rand() << 66 ^
(unsigned __int128)rand() << 35 ^
(unsigned __int128)rand() << 4 ^
(unsigned __int128)rand());
}
#define N 100000000
int main() {
srand(42);
unsigned __int128 *arr = malloc(sizeof(unsigned __int128) * N);
if (arr == NULL) {
return 1;
}
for (size_t n = 0; n < N; ++n) {
arr[n] = rand_u128();
}
#if 1
/* check that modulo 9 is calculated correctly */
for (size_t n = 0; n < N; ++n) {
uint64_t m = mod9_v1(arr[n]);
assert(m == mod9_v2(arr[n]));
assert(m == mod9_v3(arr[n]));
}
#endif
clock_t clk1 = -clock();
uint64_t sum1 = 0;
for (size_t n = 0; n < N; ++n) {
sum1 += mod9_v1(arr[n]);
}
clk1 += clock();
clock_t clk2 = -clock();
uint64_t sum2 = 0;
for (size_t n = 0; n < N; ++n) {
sum2 += mod9_v2(arr[n]);
}
clk2 += clock();
clock_t clk3 = -clock();
uint64_t sum3 = 0;
for (size_t n = 0; n < N; ++n) {
sum3 += mod9_v3(arr[n]);
}
clk3 += clock();
printf("mod9_v1: sum=%"PRIu64", elapsed time: %.3f secs\n", sum1, clk1 / (double)CLOCKS_PER_SEC);
printf("mod9_v2: sum=%"PRIu64", elapsed time: %.3f secs\n", sum2, clk2 / (double)CLOCKS_PER_SEC);
printf("mod9_v3: sum=%"PRIu64", elapsed time: %.3f secs\n", sum3, clk3 / (double)CLOCKS_PER_SEC);
free(arr);
return 0;
}
Here are the timings on my Linux server (gcc):
mod9_v1: sum=400041273, elapsed time: 7.992 secs
mod9_v2: sum=400041273, elapsed time: 1.295 secs
mod9_v3: sum=400041273, elapsed time: 1.131 secs
The same code on my MacBook (clang):
mod9_v1: sum=399978071, elapsed time: 32.900 secs
mod9_v2: sum=399978071, elapsed time: 0.204 secs
mod9_v3: sum=399978071, elapsed time: 0.185 secs
In the meantime (while waiting for Bugzilla), you could let the preprocessor do the optimization for you, e.g. by defining a macro called MOD_INT128(n,d):
#define MODCALC0(n,d) ((65536*n)%d)
#define MODCALC1(n,d) MODCALC0(MODCALC0(n,d),d)
#define MODCALC2(n,d) MODCALC1(MODCALC1(n,d),d)
#define MODCALC3(n,d) MODCALC2(MODCALC1(n,d),d)
#define MODPARAM(n,d,a,b,c) \
((uint64_t)((uint32_t)(n) ) + \
(uint64_t)((uint32_t)(n >> 32) * (uint64_t)a) + \
(uint64_t)((uint32_t)(n >> 64) * (uint64_t)b) + \
(uint64_t)((uint32_t)(n >> 96) * (uint64_t)c) ) % d
#define MOD_INT128(n,d) MODPARAM(n,d,MODCALC1(1,d),MODCALC2(1,d),MODCALC3(1,d))
Now,
uint64_t mod9_v3(unsigned __int128 n)
{
return MOD_INT128( n, 9 );
}
will generate assembly similar to that of the mod9_v2() function, and
uint64_t mod8_v3(unsigned __int128 n)
{
return MOD_INT128( n, 8 );
}
works fine with the already existing optimization (GCC 10.2.0), since modulo by a power of two compiles to a simple mask.
I'm using an MCU that doesn't support the long long type, so I can't cast to long long.
I need a function to get the high 32 bits of a 32-bit by 32-bit multiplication.
x86 ASM is like this:
__asm {
mov eax, a
mov ecx, b
mul ecx
mov eax, edx
}
Is there C code that performs the same function? I tried
UINT32 umulm(UINT32 a, UINT32 b)
{
UINT32 i, r;
r = 0;
for (i = 1; i < 32; i++)
if (a & (1 << i))
r += b >> (32 - i);
return r;
}
, but something is wrong with the function; how could I fix it?
You can do this if you use the standard sized types:
#include <stdint.h>
...
uint32_t a = 0x1234, b = 0x5678;
uint64_t result = (uint64_t)a * b;
uint32_t high = (result >> 32) & 0xffffffff;
uint32_t low = result & 0xffffffff;
<stdint.h> was introduced with C99, so if your compiler isn't too ancient, you will most likely have this feature available.
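If uint64_t is genuinely unavailable (as on the OP's MCU), the high word can be computed with 32-bit arithmetic only, by splitting the operands into 16-bit halves and propagating the carries that the OP's attempt drops. A minimal sketch; the function name is mine:

#include <stdint.h>

/* High 32 bits of a * b, using only 32-bit arithmetic. */
uint32_t umulhi32(uint32_t a, uint32_t b)
{
    uint32_t a_lo = a & 0xFFFF, a_hi = a >> 16;
    uint32_t b_lo = b & 0xFFFF, b_hi = b >> 16;

    uint32_t lo   = a_lo * b_lo;   /* contributes to bits  0..31 */
    uint32_t mid1 = a_lo * b_hi;   /* contributes to bits 16..47 */
    uint32_t mid2 = a_hi * b_lo;   /* contributes to bits 16..47 */
    uint32_t hi   = a_hi * b_hi;   /* contributes to bits 32..63 */

    uint32_t mid = mid1 + mid2;
    /* A carry out of the 32-bit mid sum is worth 2^48, i.e. 2^16
       in the high word. */
    uint32_t mid_carry = (mid < mid1) ? 0x10000u : 0;

    /* Adding the low 16 bits of mid (scaled by 2^16) into lo may
       also carry into the high word. */
    uint32_t lo_sum = lo + (mid << 16);
    uint32_t lo_carry = (lo_sum < lo) ? 1u : 0;

    return hi + (mid >> 16) + mid_carry + lo_carry;
}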
So I've written this function to count the number of bits in a long, which for my purposes includes zeros to the right of the MSB and excludes zeros to its left:
int bitCount(unsigned long bits)
{
int len = 64;
unsigned long mask = 0x8000000000000000;
while ((bits & mask) == 0 && len > 0){
mask >>= 1;
--len;
}
return len;
}
This function works fine for me as far as returning a correct answer, but is there a better (faster or otherwise) way to go about doing this?
If you want to count the number of bits in a long type, I suggest you use ULONG_MAX from the <limits.h> header file, and use the right shift operator >> to count the number of one-bits. This way you don't have to actually know the number of bits beforehand.
Something like
unsigned long value = ULONG_MAX;
unsigned count = 1;
while (value >>= 1)
++count;
This works because the right shift fills up with zeroes.
The general answer for the number of bits in any type is CHAR_BIT*sizeof(type). CHAR_BIT, defined in <limits.h> is the (implementation-defined) number of bits in a char. sizeof(type) is specified in a way that yields the number of chars used to represent the type (i.e. sizeof(char) is 1).
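For example (a trivial sketch of my own):

#include <limits.h> /* CHAR_BIT */
#include <stdio.h>

int main(void) {
    /* CHAR_BIT * sizeof yields a size_t, hence %zu */
    printf("bits in a long: %zu\n", CHAR_BIT * sizeof(long));
    return 0;
}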
The solutions the others proposed are very nice and probably the shortest to write while remaining understandable. Another straightforward approach would be something like this:
int bitCountLinear(long int n) {
int len = sizeof(long int)*8;
for (int i = 0; i < len; ++i)
if ((1UL<<i) > (unsigned long int)n)
return i;
return len;
}
The rest might get a bit extreme, but I gave it a try, so I'll share it.
I suspected that there might be arguably faster methods of doing this, e.g. using binary search (even though a length of 64 bits is extremely small). So I gave it a quick try, for you and for the fun of it.
union long_ing_family {
unsigned long int uli;
long int li;
};
int bitCountLogN(long int num) {
union long_ing_family lif;
lif.li = num;
unsigned long int n = lif.uli;
int res;
int len = sizeof(long int)*8-1;
int max = len;
int min = 0;
if (n == 0) return 0;
do {
res = (min + max) / 2;
if (n < 1UL<<res)
max = res - 1;
else if (n >= (1UL<<(res+1)))
min = res + 1;
else
return res+1;
} while (min < max);
return min+1; // or max+1
}
I then timed both to see if they have any interesting differences...
#include <stdio.h>
#define REPS 10000000
int bitCountLinear(long int n);
int bitCountLogN(long int num);
unsigned long int timestamp_start(void);
unsigned long int timestamp_stop(void);
union long_ing_family;
int main(void) {
long int n;
long int begin, end;
long int begin_Lin, end_Lin;
long int begin_Log, end_Log;
begin_Lin = 0;
end_Lin = 0;
begin_Log = 0;
end_Log = 0;
for (int i = 0; i < REPS; ++i) {
begin_Lin += timestamp_start();
bitCountLinear(i);
end_Lin += timestamp_stop();
}
printf("Linear: %lu\n", (end_Lin-begin_Lin)/REPS);
for (int i = 0; i < REPS; ++i) {
begin_Log += timestamp_start();
bitCountLogN(i);
end_Log += timestamp_stop();
}
printf("Log(n): %lu\n", (end_Log-begin_Log)/REPS);
}
unsigned long int timestamp_start(void) {
unsigned int cycles_low;
unsigned int cycles_high;
asm volatile ("CPUID\n\t"
"RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::"%rax", "%rbx", "%rcx", "%rdx");
return ((unsigned long int)cycles_high << 32) | cycles_low;
}
unsigned long int timestamp_stop(void) {
unsigned int cycles_low;
unsigned int cycles_high;
asm volatile ("RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t"
"CPUID\n\t": "=r" (cycles_high), "=r" (cycles_low)::"%rax", "%rbx", "%rcx", "%rdx");
return ((unsigned long int)cycles_high << 32) | cycles_low;
}
...and not surprisingly they didn't.
On my machine I'll get numbers like
Linear: 228
Log(n): 224
which are not meaningfully different given the amount of background noise.
Edit:
I realized that I only tested the fastest cases for the linear approach, so I changed the function inputs to
bitCountLinear(0xFFFFFFFFFFFFFFFF-i);
and
bitCountLogN(0xFFFFFFFFFFFFFFFF-i);
On my machine I'll get numbers like
Linear: 415
Log(n): 269
Which is clearly a win for the Log(n) method. I didn't expect to see a difference here.
You can count the number of 1-bits first:
int bitCount(unsigned long n)
{
unsigned long tmp;
tmp = n - ((n >> 1) & 0x7777777777777777)
- ((n >> 2) & 0x3333333333333333)
- ((n >> 3) & 0x1111111111111111);
tmp = (tmp + (tmp >> 4)) & 0x0F0F0F0F0F0F0F0F;
return 64 - tmp % 255; // tmp % 255 is the number of 1-bits
}
Take a look at the MIT HAKMEM Count.
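For reference, HAKMEM item 169 counts the 1-bits of a 32-bit word like this (classic, well-known code reproduced here for convenience; note it is a population count, so it answers a slightly different question than the OP's MSB-based length):

#include <stdint.h>

/* HAKMEM 169: sum bits within each octal digit, then pairs of digits,
   then all base-64 digits via the remainder modulo 63. */
int hakmem_popcount(uint32_t n)
{
    uint32_t tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
    return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}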
In C, why is signed int faster than unsigned int? True, I know that this has been asked and answered multiple times on this website (links below). However, most people said that there is no difference. I have written code and accidentally found a significant performance difference.
Why would the "unsigned" version of my code be slower than the "signed" version (even when testing the same number)? (I have an x86-64 Intel processor.)
Similar links
Faster comparing signed than unsigned ints
performance of unsigned vs signed integers
Compile Command: gcc -Wall -Wextra -pedantic -O3 -Wl,-O3 -g0 -ggdb0 -s -fwhole-program -funroll-loops -pthread -pipe -ffunction-sections -fdata-sections -std=c11 -o ./test ./test.c && strip --strip-all --strip-unneeded --remove-section=.note --remove-section=.comment ./test
signed int version
NOTE: There is no difference if I explicitly declare signed int on all numbers.
int isprime(int num) {
// Test if a signed int is prime
int i;
if (num % 2 == 0 || num % 3 == 0)
return 0;
else if (num % 5 == 0 || num % 7 == 0)
return 0;
else {
for (i = 11; i < num; i += 2) {
if (num % i == 0) {
if (i != num)
return 0;
else
return 1;
}
}
}
return 1;
}
unsigned int version
int isunsignedprime(unsigned int num) {
// Test if an unsigned int is prime
unsigned int i;
if (num % (unsigned int)2 == (unsigned int)0 || num % (unsigned int)3 == (unsigned int)0)
return 0;
else if (num % (unsigned int)5 == (unsigned int)0 || num % (unsigned int)7 == (unsigned int)0)
return 0;
else {
for (i = (unsigned int)11; i < num; i += (unsigned int)2) {
if (num % i == (unsigned int)0) {
if (i != num)
return 0;
else
return 1;
}
}
}
return 1;
}
Test this in a file with the below code:
int main(void) {
printf("%d\n", isprime(294967291));
printf("%d\n", isprime(294367293));
printf("%d\n", isprime(294967293));
printf("%d\n", isprime(294967241)); // slow
printf("%d\n", isprime(294967251));
printf("%d\n", isprime(294965291));
printf("%d\n", isprime(294966291));
printf("%d\n", isprime(294963293));
printf("%d\n", isprime(294927293));
printf("%d\n", isprime(294961293));
printf("%d\n", isprime(294917293));
printf("%d\n", isprime(294167293));
printf("%d\n", isprime(294267293));
printf("%d\n", isprime(294367293)); // slow
printf("%d\n", isprime(294467293));
return 0;
}
Results (time ./test):
Signed - real 0m0.949s
Unsigned - real 0m1.174s
Your question is genuinely intriguing as the unsigned version consistently produces code that is 10 to 20% slower. Yet there are multiple problems in the code:
Both functions return 0 for 2, 3, 5 and 7, which is incorrect.
The test if (i != num) return 0; else return 1; is completely useless, as the loop body only runs for i < num. Such a test would help for the small prime tests, but special-casing them is not really worthwhile.
The casts in the unsigned version are redundant.
Benchmarking code that produces textual output to the terminal is unreliable; you should use the clock() function to time CPU-intensive functions without any intervening I/O.
The algorithm for prime testing is utterly inefficient, as the loop runs num / 2 times instead of sqrt(num) times.
Let's simplify the code and run some precise benchmarks:
#include <stdio.h>
#include <time.h>
int isprime_slow(int num) {
if (num % 2 == 0)
return num == 2;
for (int i = 3; i < num; i += 2) {
if (num % i == 0)
return 0;
}
return 1;
}
int unsigned_isprime_slow(unsigned int num) {
if (num % 2 == 0)
return num == 2;
for (unsigned int i = 3; i < num; i += 2) {
if (num % i == 0)
return 0;
}
return 1;
}
int isprime_fast(int num) {
if (num % 2 == 0)
return num == 2;
for (int i = 3; i * i <= num; i += 2) {
if (num % i == 0)
return 0;
}
return 1;
}
int unsigned_isprime_fast(unsigned int num) {
if (num % 2 == 0)
return num == 2;
for (unsigned int i = 3; i * i <= num; i += 2) {
if (num % i == 0)
return 0;
}
return 1;
}
int main(void) {
int a[] = {
294967291, 0, 294367293, 0, 294967293, 0, 294967241, 1, 294967251, 0,
294965291, 0, 294966291, 0, 294963293, 0, 294927293, 1, 294961293, 0,
294917293, 0, 294167293, 0, 294267293, 0, 294367293, 0, 294467293, 0,
};
struct testcase { int (*fun)(); const char *name; int t; } test[] = {
{ isprime_slow, "isprime_slow", 0 },
{ unsigned_isprime_slow, "unsigned_isprime_slow", 0 },
{ isprime_fast, "isprime_fast", 0 },
{ unsigned_isprime_fast, "unsigned_isprime_fast", 0 },
};
for (int n = 0; n < 4; n++) {
clock_t t = clock();
for (int i = 0; i < 30; i += 2) {
if (test[n].fun(a[i]) != a[i + 1]) {
printf("%s(%d) != %d\n", test[n].name, a[i], a[i + 1]);
}
}
test[n].t = clock() - t;
}
for (int n = 0; n < 4; n++) {
printf("%21s: %4d.%03dms\n", test[n].name, test[n].t / 1000), test[n].t % 1000);
}
return 0;
}
The code compiled with clang -O2 on OS/X produces this output:
isprime_slow: 788.004ms
unsigned_isprime_slow: 965.381ms
isprime_fast: 0.065ms
unsigned_isprime_fast: 0.089ms
These timings are consistent with the OP's observed behavior on a different system, but show the dramatic improvement caused by the more efficient iteration test: 10000 times faster!
Regarding the question Why is the function slower with unsigned?, let's look at the generated code (gcc 7.2 -O2):
isprime_slow(int):
...
.L5:
movl %edi, %eax
cltd
idivl %ecx
testl %edx, %edx
je .L1
.L4:
addl $2, %ecx
cmpl %esi, %ecx
jne .L5
.L6:
movl $1, %edx
.L1:
movl %edx, %eax
ret
unsigned_isprime_slow(unsigned int):
...
.L19:
xorl %edx, %edx
movl %edi, %eax
divl %ecx
testl %edx, %edx
je .L22
.L18:
addl $2, %ecx
cmpl %esi, %ecx
jne .L19
.L20:
movl $1, %eax
ret
...
.L22:
xorl %eax, %eax
ret
The inner loops are very similar: same number of instructions, similar instructions. There are, however, some potential explanations:
cltd extends the sign of the eax register into the edx register, which may cause an instruction delay because eax is modified by the immediately preceding instruction movl %edi, %eax. Yet this would make the signed version slower than the unsigned one, not faster.
the loops' initial instructions might be misaligned for the unsigned version, but it is unlikely as changing the order in the source code has no effect on the timings.
Although the register contents are identical for the signed and unsigned division opcodes, it is possible that the idivl instruction takes fewer cycles than the divl instruction. Indeed, the signed division operates on one less bit of precision than the unsigned division, but the difference seems quite large for this small change.
I suspect more effort was put into the silicon implementation of idivl because signed divisions are more common than unsigned divisions (as measured by years of coding statistics at Intel).
As commented by rcgldr, looking at instruction tables for Intel processors, for Ivy Bridge, DIV 32-bit takes 10 micro-ops and 19 to 27 cycles, while IDIV takes 9 micro-ops and 19 to 26 cycles. The benchmark times are consistent with these timings. The extra micro-op may be due to the longer operands in DIV (64/32 bits) as opposed to IDIV (63/31 bits).
This surprising result should teach us a few lessons:
optimizing is a difficult art, be humble and procrastinate.
correctness is often broken by optimizations.
choosing a better algorithm beats optimization by a long shot.
always benchmark code, do not trust your instincts.
Because signed integer overflow is undefined, the compiler can make a lot of assumptions and optimizations on code involving signed integers. Unsigned integer overflow is defined to wrap around, so the compiler won't be able to optimize as much. See also http://blog.llvm.org/2011/05/what-every-c-programmer-should-know.html#signed_overflow and http://www.airs.com/blog/archives/120.
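A small example of the kind of simplification this permits (my illustration, easy to confirm on a compiler explorer):

/* With optimization, a compiler may fold the signed test to "return 1",
   because signed overflow is undefined, so x + 1 > x always "holds".
   The unsigned test must be kept as written, since x could be UINT_MAX
   and x + 1 would wrap to 0. */
int always_true_signed(int x)        { return x + 1 > x; }
int maybe_false_unsigned(unsigned x) { return x + 1 > x; }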
From the instruction specifications for AMD/Intel we have (for K7):
Instruction Ops Latency Throughput
DIV r32/m32 32 24 23
IDIV r32 81 41 41
IDIV m32 89 41 41
For the i7, latency and throughput are the same for IDIVL and DIVL; a slight difference exists in the µops.
This may explain the difference as -O3 assembly codes only differ by signedness (DIVL vs IDIVL) on my machine.
An alternative (community-wiki candidate) test that may or may not show a significant time difference:
#include <stdio.h>
#include <time.h>
#define J 10
#define I 5
int main(void) {
clock_t c1,c2,c3;
for (int j=0; j<J; j++) {
c1 = clock();
for (int i=0; i<I; i++) {
isprime(294967241);
isprime(294367293);
}
c2 = clock();
for (int i=0; i<I; i++) {
isunsignedprime(294967241);
isunsignedprime(294367293);
}
c3 = clock();
printf("%d %d %d\n", (int)(c2-c1), (int)(c3-c2), (int)((c3-c2) - (c2-c1)));
fflush(stdout);
}
return 0;
}
Sample output
2761 2746 -15
2777 2777 0
2761 2745 -16
2793 2808 15
2792 2730 -62
2746 2730 -16
2746 2730 -16
2776 2793 17
2823 2808 -15
2793 2823 30
In fact, in many cases unsigned is faster than signed. For example:
In dividing by a power of 2:
unsigned int x = 37;
cout << x / 4;
In checking if a number is even:
unsigned int x = 37;
cout << ((x % 2 == 0) ? "even" : "odd");
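To see why the division case is cheaper, compare what the compiler must emit for the two signednesses (a sketch; the function names are mine):

/* Signed division truncates toward zero, so the compiler must bias
   negative values before shifting; unsigned division is a bare shift. */
int      sdiv4(int x)      { return x / 4; } /* roughly: (x < 0 ? x + 3 : x) >> 2 */
unsigned udiv4(unsigned x) { return x / 4; } /* x >> 2 */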
How can I multiply and divide using only bit shifting and adding?
To multiply in terms of adding and shifting you want to decompose one of the numbers by powers of two, like so:
21 * 5 = 10101_2 * 101_2 (Initial step)
= 10101_2 * (1 * 2^2 + 0 * 2^1 + 1 * 2^0)
= 10101_2 * 2^2 + 10101_2 * 2^0
= 10101_2 << 2 + 10101_2 << 0 (Decomposed)
= 10101_2 * 4 + 10101_2 * 1
= 10101_2 * 5
= 21 * 5 (Same as initial expression)
(_2 means base 2)
As you can see, multiplication can be decomposed into adding and shifting and back again. This is also why multiplication takes longer than bit shifts or adding - it's O(n^2) rather than O(n) in the number of bits. Real computer systems (as opposed to theoretical computer systems) have a finite number of bits, so multiplication takes a constant multiple of time compared to addition and shifting. If I recall correctly, modern processors, if pipelined properly, can do multiplication just about as fast as addition, by messing with the utilization of the ALUs (arithmetic units) in the processor.
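For concreteness, here is the decomposition above as a C routine (a minimal sketch of classic shift-and-add, or "peasant", multiplication; the name is mine). It runs one iteration per bit of b:

#include <stdint.h>

uint32_t shift_add_mul(uint32_t a, uint32_t b)
{
    uint32_t result = 0;
    while (b != 0) {
        if (b & 1)        /* this power of two is present in b */
            result += a;
        a <<= 1;          /* next power of two: a * 2^k */
        b >>= 1;
    }
    return result;
}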
The answer by Andrew Toulouse can be extended to division.
The division by integer constants is considered in detail in the book "Hacker's Delight" by Henry S. Warren (ISBN 9780201914658).
The first idea for implementing division is to write the inverse value of the denominator in base two.
E.g.,
1/3 = (base-2) 0.0101 0101 0101 0101 0101 0101 0101 0101 .....
So,
a/3 = (a >> 2) + (a >> 4) + (a >> 6) + ... + (a >> 30)
for 32-bit arithmetics.
By combining the terms in an obvious manner we can reduce the number of operations:
b = (a >> 2) + (a >> 4)
b += (b >> 4)
b += (b >> 8)
b += (b >> 16)
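Since the truncated shift series slightly underestimates a/3, a final correction using the remainder is needed. Here is a sketch along the lines of the book's divu3 routine (my reconstruction, with the 3*q and 11*r products expanded into shifts and adds to stay within the question's constraints):

#include <stdint.h>

uint32_t divu3(uint32_t n)
{
    uint32_t q = (n >> 2) + (n >> 4);   /* q ~= n * 0.0101 (base 2) */
    q += q >> 4;                        /* extend the repeating pattern */
    q += q >> 8;
    q += q >> 16;
    uint32_t r = n - ((q << 1) + q);    /* r = n - 3*q, a small value */
    return q + (((r << 3) + (r << 1) + r) >> 5); /* q + 11*r/32 fixes q */
}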
There are more exciting ways to calculate division and remainders.
EDIT1:
If the OP means multiplication and division of arbitrary numbers, not the division by a constant number, then this thread might be of use: https://stackoverflow.com/a/12699549/1182653
EDIT2:
One of the fastest ways to divide by integer constants is to exploit modular arithmetic and Montgomery reduction: What's the fastest way to divide an integer by 3?
X * 2 = 1 bit shift left
X / 2 = 1 bit shift right
X * 3 = shift left 1 bit and then add X
x << k == x multiplied by 2 to the power of k
x >> k == x divided by 2 to the power of k
You can use these shifts to do any multiplication operation. For example:
x * 14 == x * 16 - x * 2 == (x << 4) - (x << 1)
x * 12 == x * 8 + x * 4 == (x << 3) + (x << 2)
To divide a number by a non-power of two, I'm not aware of any easy way, unless you want to implement some low-level logic, use other binary operations and use some form of iteration.
A left shift by 1 position is analogous to multiplying by 2. A right shift is analogous to dividing by 2.
You can add in a loop to multiply. By picking the loop variable and the addition variable correctly, you can bound performance. Once you've explored that, you should use Peasant Multiplication.
A procedure for dividing integers that uses shifts and adds can be derived in straightforward fashion from decimal longhand division as taught in elementary school. The selection of each quotient digit is simplified, as the digit is either 0 or 1: if the current remainder is greater than or equal to the divisor, the least significant bit of the partial quotient is 1.
Just as with decimal longhand division, the digits of the dividend are considered from most significant to least significant, one digit at a time. This is easily accomplished by a left shift in binary division. Also, quotient bits are gathered by left shifting the current quotient bits by one position, then appending the new quotient bit.
In a classical arrangement, these two left shifts are combined into the left shifting of one register pair. The upper half holds the current remainder; the lower half initially holds the dividend. As the dividend bits are transferred to the remainder register by the left shift, the unused least significant bits of the lower half are used to accumulate the quotient bits.
Below are x86 assembly language and C implementations of this algorithm. This particular variant of shift & add division is sometimes referred to as the "non-performing" variant, as the subtraction of the divisor from the current remainder is not performed unless the remainder is greater than or equal to the divisor (Otto Spaniol, "Computer Arithmetic: Logic and Design." Chichester: Wiley 1981, p. 144). In C, there is no notion of the carry flag used by the assembly version in the register-pair left shift. Instead, it is emulated, based on the observation that the result of an addition modulo 2^n can be smaller than either addend only if there was a carry-out.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h> /* CHAR_BIT */
#define USE_ASM 0
#if USE_ASM
uint32_t bitwise_division (uint32_t dividend, uint32_t divisor)
{
uint32_t quot;
__asm {
mov eax, [dividend];// quot = dividend
mov ecx, [divisor]; // divisor
mov edx, 32; // bits_left
mov ebx, 0; // rem
$div_loop:
add eax, eax; // (rem:quot) << 1
adc ebx, ebx; // ...
cmp ebx, ecx; // rem >= divisor ?
jb $quot_bit_is_0; // if (rem < divisor)
$quot_bit_is_1: //
sub ebx, ecx; // rem = rem - divisor
add eax, 1; // quot++
$quot_bit_is_0:
dec edx; // bits_left--
jnz $div_loop; // while (bits_left)
mov [quot], eax; // quot
}
return quot;
}
#else
uint32_t bitwise_division (uint32_t dividend, uint32_t divisor)
{
uint32_t quot, rem, t;
int bits_left = CHAR_BIT * sizeof (uint32_t);
quot = dividend;
rem = 0;
do {
// (rem:quot) << 1
t = quot;
quot = quot + quot;
rem = rem + rem + (quot < t);
if (rem >= divisor) {
rem = rem - divisor;
quot = quot + 1;
}
bits_left--;
} while (bits_left);
return quot;
}
#endif
I translated the Python code to C. The example given had a minor flaw: if the dividend took up all 32 bits, the shift would fail. I just used 64-bit variables internally to work around the problem:
int No_divide(int nDivisor, int nDividend, int *nRemainder)
{
int nQuotient = 0;
int nPos = -1;
unsigned long long ullDivisor = nDivisor;
unsigned long long ullDividend = nDividend;
while (ullDivisor < ullDividend)
{
ullDivisor <<= 1;
nPos ++;
}
ullDivisor >>= 1;
while (nPos > -1)
{
if (ullDividend >= ullDivisor)
{
nQuotient += (1 << nPos);
ullDividend -= ullDivisor;
}
ullDivisor >>= 1;
nPos -= 1;
}
*nRemainder = (int) ullDividend;
return nQuotient;
}
Take two numbers, let's say 9 and 10, and write them in binary: 1001 and 1010.
Start with a result, R, of 0.
Take one of the numbers, 1010 in this case, and call it A. Shift it right by one bit; if you shift out a one, add the first number, which we'll call B, to R.
Now shift B left by one bit and repeat until all bits have been shifted out of A.
It's easier to see what's going on if you see it written out, this is the example:
0
0000 0
10010 1
000000 0
1001000 1
------
1011010
Taken from here.
This is only for division:
int add(int a, int b) {
int partialSum, carry;
do {
partialSum = a ^ b;
carry = (a & b) << 1;
a = partialSum;
b = carry;
} while (carry != 0);
return partialSum;
}
int subtract(int a, int b) {
return add(a, add(~b, 1));
}
int division(int dividend, int divisor) {
boolean negative = false;
if ((dividend & (1 << 31)) == (1 << 31)) { // Check for signed bit
negative = !negative;
dividend = add(~dividend, 1); // Negation
}
if ((divisor & (1 << 31)) == (1 << 31)) {
negative = !negative;
divisor = add(~divisor, 1); // Negation
}
int quotient = 0;
long r;
for (int i = 30; i >= 0; i = subtract(i, 1)) {
r = (divisor << i);
// Left shift divisor until it's smaller than dividend
if (r < Integer.MAX_VALUE && r >= 0) { // Avoid cases where comparison between long and int doesn't make sense
if (r <= dividend) {
quotient |= (1 << i);
dividend = subtract(dividend, (int) r);
}
}
}
if (negative) {
quotient = add(~quotient, 1);
}
return quotient;
}
This should work for multiplication:
.data
.text
.globl main
main:
# $4 * $5 = $2
addi $4, $0, 0x9
addi $5, $0, 0x6
add $2, $0, $0 # initialize product to zero
Loop:
beq $5, $0, Exit # if multiplier is 0,terminate loop
andi $3, $5, 1 # mask out the 0th bit in multiplier
beq $3, $0, Shift # if the bit is 0, skip add
addu $2, $2, $4 # add (shifted) multiplicand to product
Shift:
sll $4, $4, 1 # shift up the multiplicand 1 bit
srl $5, $5, 1 # shift down the multiplier 1 bit
j Loop # go for next
Exit: #
EXIT:
li $v0,10
syscall
The method below implements binary division, assuming both numbers are positive. If subtraction is a concern, we can implement that as well using binary operators.
Code
-(int)binaryDivide:(int)numerator with:(int)denominator
{
if (numerator == 0 || denominator == 1) {
return numerator;
}
if (denominator == 0) {
#ifdef DEBUG
NSAssert(denominator != 0, @"denominator should be greater than 0");
#endif
return INFINITY;
}
// if (numerator <0) {
// numerator = abs(numerator);
// }
int maxBitDenom = [self getMaxBit:denominator];
int maxBitNumerator = [self getMaxBit:numerator];
int msbNumber = [self getMSB:maxBitDenom ofNumber:numerator];
int qoutient = 0;
int subResult = 0;
int remainingBits = maxBitNumerator-maxBitDenom;
if (msbNumber >= denominator) {
qoutient |=1;
subResult = msbNumber - denominator;
}
else {
subResult = msbNumber;
}
while (remainingBits > 0) {
int msbBit = (numerator & (1 << (remainingBits-1)))>0?1:0;
subResult = (subResult << 1) | msbBit;
if(subResult >= denominator) {
subResult = subResult - denominator;
qoutient= (qoutient << 1) | 1;
}
else{
qoutient = qoutient << 1;
}
remainingBits--;
}
return qoutient;
}
-(int)getMaxBit:(int)inputNumber
{
int maxBit = 0;
BOOL isMaxBitSet = NO;
for (int i=0; i<sizeof(inputNumber)*8; i++) {
if (inputNumber & (1<<i)) {
maxBit = i;
isMaxBitSet=YES;
}
}
if (isMaxBitSet) {
maxBit+=1;
}
return maxBit;
}
-(int)getMSB:(int)bits ofNumber:(int)number
{
int numbeMaxBit = [self getMaxBit:number];
return number >> (numbeMaxBit - bits);
}
For multiplication:
-(int)multiplyNumber:(int)num1 withNumber:(int)num2
{
int mulResult = 0;
int ithBit;
BOOL isNegativeSign = (num1<0 && num2>0) || (num1>0 && num2<0);
num1 = abs(num1);
num2 = abs(num2);
for (int i=0; i<sizeof(num2)*8; i++)
{
ithBit = num2 & (1<<i);
if (ithBit>0) {
mulResult += (num1 << i);
}
}
if (isNegativeSign) {
mulResult = ((~mulResult)+1);
}
return mulResult;
}
It is basically multiplying and dividing by powers of 2:
shift left: x * 2^y
shift right: x / 2^y
shl eax, 2 — with eax = 2: 2 * 2^2 = 8
shr eax, 3 — with eax = 2: 2 / 2^3 = 0 (the exact value 1/4 truncates to zero in integer arithmetic)
For anyone interested in a 16-bit x86 solution, there is a piece of code by JasonKnight (he also includes a signed multiply piece, which I haven't tested). However, that code has issues with large inputs, where the "add bx,bx" part would overflow.
The fixed version:
softwareMultiply:
; INPUT CX,BX
; OUTPUT DX:AX - 32 bits
; CLOBBERS BX,CX,DI
xor ax,ax ; cheap way to zero a reg
mov dx,ax ; 1 clock faster than xor
mov di,cx
or di,bx ; cheap way to test for zero on both regs
jz #done
mov di,ax ; DI used for reg,reg adc
#loop:
shr cx,1 ; divide by two, bottom bit moved to carry flag
jnc #skipAddToResult
add ax,bx
adc dx,di ; reg,reg is faster than reg,imm16
#skipAddToResult:
add bx,bx ; faster than shift or mul
adc di,di
or cx,cx ; fast zero check
jnz #loop
#done:
ret
Or the same in GCC inline assembly:
asm("mov $0,%%ax\n\t"
"mov $0,%%dx\n\t"
"mov %%cx,%%di\n\t"
"or %%bx,%%di\n\t"
"jz done\n\t"
"mov %%ax,%%di\n\t"
"loop:\n\t"
"shr $1,%%cx\n\t"
"jnc skipAddToResult\n\t"
"add %%bx,%%ax\n\t"
"adc %%di,%%dx\n\t"
"skipAddToResult:\n\t"
"add %%bx,%%bx\n\t"
"adc %%di,%%di\n\t"
"or %%cx,%%cx\n\t"
"jnz loop\n\t"
"done:\n\t"
: "=d" (dx), "=a" (ax)
: "b" (bx), "c" (cx)
: "ecx", "edi"
);
Try this. https://gist.github.com/swguru/5219592
import sys
# implement divide operation without using built-in divide operator
def divAndMod_slow(y, x, debug=0):
    r = 0
    while y >= x:
        r += 1
        y -= x
    return r, y

# implement divide operation without using built-in divide operator
def divAndMod(y, x, debug=0):
    ## find the highest position of positive bit of the ratio
    pos = -1
    while y >= x:
        pos += 1
        x <<= 1
    x >>= 1
    if debug: print "y=%d, x=%d, pos=%d" % (y, x, pos)

    if pos == -1:
        return 0, y

    r = 0
    while pos >= 0:
        if y >= x:
            r += (1 << pos)
            y -= x
            if debug: print "y=%d, x=%d, r=%d, pos=%d" % (y, x, r, pos)
        x >>= 1
        pos -= 1

    return r, y

if __name__ == "__main__":
    if len(sys.argv) == 3:
        y = int(sys.argv[1])
        x = int(sys.argv[2])
    else:
        y = 313271356
        x = 7

    print "=== Slow Version ...."
    res = divAndMod_slow(y, x)
    print "%d = %d * %d + %d" % (y, x, res[0], res[1])

    print "=== Fast Version ...."
    res = divAndMod(y, x, debug=1)
    print "%d = %d * %d + %d" % (y, x, res[0], res[1])