So I've written this function to count the number of bits in a long, which for my purposes includes zeros to the right of the MSB and excludes zeros to its left:
/*
 * Return the bit length of `bits`: the 1-based position of the most
 * significant set bit, or 0 when `bits` is 0.
 *
 * The width of unsigned long is not hard-coded; the value is shifted right
 * until it is exhausted, so the result is correct whether unsigned long is
 * 32 or 64 bits wide.
 */
int bitCount(unsigned long bits)
{
    int len = 0;
    while (bits != 0) {
        bits >>= 1;
        ++len;
    }
    return len;
}
This function works fine for me as far as returning a correct answer, but is there a better (faster or otherwise) way to go about doing this?
If you want to count the number of bits in a long type, I suggest you use ULONG_MAX from the <limits.h> header file, and use the right shift operator >> to count the number of one-bits. This way you don't have to actually know the number of bits beforehand.
Something like
unsigned long value = ULONG_MAX;
unsigned count = 1;
while (value >>= 1)
++count;
This works because the right shift fills up with zeroes.
The general answer for the number of bits in any type is CHAR_BIT*sizeof(type). CHAR_BIT, defined in <limits.h> is the (implementation-defined) number of bits in a char. sizeof(type) is specified in a way that yields the number of chars used to represent the type (i.e. sizeof(char) is 1).
The solutions the other guys proposed are very nice and probably the shortest to write and remain understandable. Another straight forward approach would be something like this
/*
 * Linear scan for the bit length of n (viewed as unsigned long): returns the
 * smallest position whose power of two exceeds the value, or the full width
 * (sizeof(long int)*8) when no such position exists.
 */
int bitCountLinear(long int n) {
    const unsigned long int target = (unsigned long int)n;
    const int width = sizeof(long int)*8;
    int pos = 0;

    while (pos < width && (1UL << pos) <= target)
        ++pos;
    return pos;
}
The rest might get a bit extreme but I gave it a try so I'll share it.
I suspected that there might be arguably faster methods of doing this. eg Using binary search (even though a length of 64bits is extremely small). So I gave it a quick try for you and for the fun of it.
/* Lets a long int be re-read as an unsigned long int with the same bits. */
union long_ing_family {
    unsigned long int uli;
    long int li;
};

/*
 * Binary search for the bit length of num (viewed as unsigned long).
 * Returns 0 for 0; otherwise the 1-based position of the highest set bit.
 * The search window [lo, hi] holds candidate 0-based positions of that bit.
 */
int bitCountLogN(long int num) {
    union long_ing_family pun;
    unsigned long int value;
    int lo = 0;
    int hi = sizeof(long int)*8-1;

    pun.li = num;
    value = pun.uli;
    if (value == 0)
        return 0;

    for (;;) {
        const int mid = (lo + hi) / 2;

        if (value < 1UL<<mid) {
            hi = mid - 1;
        } else if (value >= (1UL<<(mid+1))) {
            lo = mid + 1;
        } else {
            /* 2^mid <= value < 2^(mid+1): the top bit is at position mid */
            return mid+1;
        }
        if (lo >= hi)
            break;
    }
    return lo+1;
}
I then timed both to see if they have any interesting differences...
#include <stdio.h>
#define REPS 10000000
int bitCountLinear(long int n);
int bitCountLogN(long int num);
unsigned long int timestamp_start(void);
unsigned long int timestamp_stop(void);
union long_ing_family;
/*
 * Time bitCountLinear and bitCountLogN over REPS calls each and print the
 * average number of time-stamp ticks per call.
 *
 * Each iteration accumulates the elapsed ticks (stop - start) in an unsigned
 * total. Summing the raw time stamps themselves, as signed values, overflows
 * long before REPS iterations are done.
 */
int main(void) {
    unsigned long int total_Lin = 0;
    unsigned long int total_Log = 0;
    volatile int sink = 0; /* keeps the measured calls from being discarded */

    for (int i = 0; i < REPS; ++i) {
        unsigned long int begin = timestamp_start();
        sink = bitCountLinear(i);
        total_Lin += timestamp_stop() - begin;
    }
    printf("Linear: %lu\n", total_Lin/REPS);

    for (int i = 0; i < REPS; ++i) {
        unsigned long int begin = timestamp_start();
        sink = bitCountLogN(i);
        total_Log += timestamp_stop() - begin;
    }
    printf("Log(n): %lu\n", total_Log/REPS);

    (void)sink;
    return 0;
}
// Take the time stamp that opens a measured region.
// Runs CPUID followed by RDTSCP, copies EDX into cycles_high and EAX into
// cycles_low, and returns the two 32-bit halves combined into one value.
// rax, rbx, rcx and rdx are declared as clobbered by the asm statement.
// NOTE(review): CPUID is presumably used as a serializing instruction so
// earlier work cannot drift into the measured region -- confirm.
unsigned long int timestamp_start(void) {
unsigned int cycles_low;
unsigned int cycles_high;
asm volatile ("CPUID\n\t"
"RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::"%rax", "%rbx", "%rcx", "%rdx");
// high half goes to bits 32..63, low half to bits 0..31
return ((unsigned long int)cycles_high << 32) | cycles_low;
}
// Take the time stamp that closes a measured region.
// Runs RDTSCP first, saves EDX/EAX into cycles_high/cycles_low, and only then
// runs CPUID; returns the two 32-bit halves combined into one value.
// rax, rbx, rcx and rdx are declared as clobbered by the asm statement.
// NOTE(review): the trailing CPUID presumably keeps later instructions from
// being executed before the counter is read -- confirm.
unsigned long int timestamp_stop(void) {
unsigned int cycles_low;
unsigned int cycles_high;
asm volatile ("RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t"
"CPUID\n\t": "=r" (cycles_high), "=r" (cycles_low)::"%rax", "%rbx", "%rcx", "%rdx");
// high half goes to bits 32..63, low half to bits 0..31
return ((unsigned long int)cycles_high << 32) | cycles_low;
}
...and not surprisingly they didn't.
On my machine I'll get numbers like
Linear: 228
Log(n): 224
Which are not considered to be different assuming a lot of background noise.
Edit:
I realized that I only tested the fastest solutions for the Linear approach so changing the function inputs to
bitCountLinear(0xFFFFFFFFFFFFFFFF-i);
and
bitCountLogN(0xFFFFFFFFFFFFFFFF-i);
On my machine I'll get numbers like
Linear: 415
Log(n): 269
Which is clearly a win for the Log(n) method. I didn't expect to see a difference here.
You can count the number of bit 1 first:
/*
 * Returns 64 minus the number of 1 bits in n.
 *
 * The 1 bits are counted without a loop: first per 4-bit group, then per
 * byte, and finally the byte counts are summed by taking the value mod 255.
 */
int bitCount(unsigned long n)
{
    const unsigned long shr1 = (n >> 1) & 0x7777777777777777;
    const unsigned long shr2 = (n >> 2) & 0x3333333333333333;
    const unsigned long shr3 = (n >> 3) & 0x1111111111111111;
    /* every 4-bit group now holds the count of 1 bits it had */
    const unsigned long nibbles = n - shr1 - shr2 - shr3;
    /* fold neighbouring groups so every byte holds its own count */
    const unsigned long bytes = (nibbles + (nibbles >> 4)) & 0x0F0F0F0F0F0F0F0F;

    return 64 - bytes % 255; /* bytes % 255 is the number of 1 bits */
}
Take a look at the MIT HAKMEM Count.
Related
#include <stdio.h>
/*
 * Return num with the order of its bits reversed across the full width of
 * unsigned int: the lowest bit of num becomes the highest bit of the result.
 */
unsigned int reverseBits(unsigned int num)
{
    const unsigned int width = sizeof(unsigned int) * 8;
    unsigned int mirrored = 0;
    unsigned int step;

    for (step = 0; step < width; ++step)
    {
        /* make room, then append the current lowest bit of num */
        mirrored = (mirrored << 1) | (num & 1);
        num >>= 1;
    }
    return mirrored;
}
/* Read one unsigned number from standard input and print its bit reversal. */
int main()
{
    unsigned int input = 0;
    unsigned int mirrored;

    scanf("%u", &input);
    mirrored = reverseBits(input);
    printf("bit reverse of %u is %u\n", input, mirrored);
    return 0;
}
What is the time complexity of this bit reversing function, if we change the input size to uint_8/uint_16/uint64_t, the for loop runs for the size of the input * 8 times. This functions runs in a constant time for n inputs. so what is the time complexity of this function in big "O" notation?
O(n), for n bits.
For uint_8, the algorithm runs in 8 steps.
For uint_16, the algorithm runs in 16 steps.
etc.
I'm no expert, but some instruction sets might have a one-cycle bit reverse (use __asm__), so you can run in O(n) for n bytes; eight times faster. Some compilers might do this automatically if you use -O3.
I have a program that requires me to find primes up till 10**10-1 (10,000,000,000). I wrote a Sieve of Eratosthenes to do this, and it worked very well (and accurately) as high as 10**9 (1,000,000,000). I confirmed its accuracy by having it count the number of primes it found, and it matched the value of 50,847,534 on the chart found here. I used unsigned int as the storage type and it successfully found all the primes in approximately 30 seconds.
However, 10**10 requires that I use a larger storage type: long long int. Once I switched to this, the program is running significantly slower (it's been 3 hours plus and it's still working). Here is the relevant code:
// Excerpt of a bit-packed Sieve of Eratosthenes: one bit per number, eight
// numbers per byte of memField. activeBit is used below but is not declared
// anywhere in this excerpt.
typedef unsigned long long ul_long;
typedef unsigned int u_int;
ul_long max = 10000000000;
u_int blocks = 1250000000;
char memField[1250000000];
char mapBit(char place) { //convert 0->0x80, 1->0x40, 2->0x20, and so on
return 0x80 >> (place);
}
// NOTE(review): i is a u_int, so i*i is evaluated in unsigned int before it is
// compared with max; where unsigned int is 32 bits the product wraps and can
// never reach a max of 10000000000.
for (u_int i = 2; i*i < max; i++) {
if (memField[i / 8] & activeBit) { //Use correct memory block
// clear the bit of every multiple of i, starting at 2*i
for (ul_long n = 2 * i; n < max; n += i) {
char secondaryBit = mapBit(n % 8); //Determine bit position of n
u_int activeByte = n / 8; //Determine correct memory block
if (n < 8) { //Manual override memory block and bit for first block
secondaryBit = mapBit(n);
activeByte = 0;
}
memField[activeByte] &= ~(secondaryBit); //Set the flag to false
}
}
// walk activeBit from 0x80 down to 0x01, then wrap back to 0x80
activeBit = activeBit >> 1; //Check the next
if (activeBit == 0x00) activeBit = 0x80;
}
I figure that since 10**10 is 10x larger than 10**9 it should take 10 times the amount of time. Where is the flaw in this? Why did changing to long long cause such significant performance issues and how can I fix this? I recognize that the numbers get larger, so it should be somewhat slower, but only towards the end. Is there something I'm missing?
Note: I realize long int should technically be large enough but my limits.h says it isn't even though I'm compiling 64 bit. Thats why I use long long int in case anyone was wondering. Also, keep in mind, I have no computer science training, just a hobbyist.
edit: just ran it in "Release" as x86-64 with some of the debug statements suggested. I got the following output:
looks like I hit the u_int bound. I don't know why i is getting that large.
Your program has an infinite loop in for (u_int i = 2; i*i < max; i++). i is an unsigned int so i*i wraps at 32-bit and is always less than max. Make i an ul_long.
Note that you should use simpler bit pattern from 1 to 0x80 for bit 0 to 7.
Here is a complete version:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef unsigned long long ul_long;
typedef unsigned int u_int;

/*
 * Bit accessors for a byte array: bit number `bit` lives in byte bit/8 at
 * position bit%8. Both the array and the bit argument are parenthesized so
 * that arbitrary expressions can be passed.
 */
#define TESTBIT(a, bit) ((a)[(bit) / 8] & (1 << ((bit) & 7)))
#define CLEARBIT(a, bit) ((a)[(bit) / 8] &= ~(1 << ((bit) & 7)))

/*
 * Count the primes p with 2 <= p <= max (max itself included) using a
 * bit-packed Sieve of Eratosthenes, print the count and return it.
 *
 * Returns 0 (after printing a message) when the bit array cannot be
 * allocated. One bit is kept for every number 0..max, so max/8 + 1 bytes
 * are needed.
 */
ul_long count_primes(ul_long max) {
    ul_long bytes_needed = max / 8 + 1;
    size_t blocks = (size_t)bytes_needed;
    unsigned char *memField = NULL;

    /* refuse sizes that do not fit in size_t instead of truncating them */
    if (bytes_needed <= (size_t)-1)
        memField = (unsigned char *)malloc(blocks);
    if (memField == NULL) {
        printf("cannot allocate memory for %llu bytes\n",
               (unsigned long long)bytes_needed);
        return 0;
    }
    memset(memField, 255, blocks);
    CLEARBIT(memField, 0);  // 0 is not prime
    CLEARBIT(memField, 1);  // 1 is not prime
    // clear the bits after max: in the last byte only positions 0..max%8 are valid
    memField[blocks - 1] &= (unsigned char)((1u << (max % 8 + 1)) - 1);

    // i <= max / i is i*i <= max without the risk of overflowing i*i
    for (ul_long i = 2; i <= max / i; i++) {
        if (TESTBIT(memField, i)) { //Check if i is prime
            // smaller multiples of i were already cleared by smaller primes
            for (ul_long n = i * i; ; n += i) {
                CLEARBIT(memField, n); //Reset all multiples of i
                if (max - n < i)   // the next multiple would exceed max
                    break;
            }
        }
    }

    // bitCount[b] is the number of 1 bits in byte value b
    unsigned int bitCount[256];
    for (int i = 0; i < 256; i++) {
        bitCount[i] = (((i >> 0) & 1) + ((i >> 1) & 1) +
                       ((i >> 2) & 1) + ((i >> 3) & 1) +
                       ((i >> 4) & 1) + ((i >> 5) & 1) +
                       ((i >> 6) & 1) + ((i >> 7) & 1));
    }
    ul_long count = 0;
    for (size_t i = 0; i < blocks; i++) {
        count += bitCount[memField[i]];
    }
    printf("count of primes up to %llu: %llu\n", max, count);
    free(memField);
    return count;
}
/*
 * Run count_primes once for every command-line argument (parsed with
 * strtoull, base auto-detected), or once for 10000000000 when no argument
 * is given.
 */
int main(int argc, char *argv[]) {
    if (argc <= 1) {
        count_primes(10000000000);
        return 0;
    }
    for (int arg = 1; arg < argc; arg++) {
        unsigned long long limit = strtoull(argv[arg], NULL, 0);
        count_primes(limit);
    }
    return 0;
}
It completes in 10 seconds for 10^9 and 131 seconds for 10^10:
count of primes up to 1000000000: 50847534
count of primes up to 10000000000: 455052511
This is about ANSI-C (C90). This is what I know:
I can directly tell the compiler how many bits I want for a specific variable.
If I want 1 bit which can have the values zero or one.
or 2 bits for the values 0,1,2,3, and so on...;
I'm familiar with the syntax.
I have problem concerning bitfields:
I want to define a SET structure.
It can have maximum 1024 elements (it can have less, but the maximum is 1024 elements).
The domain of the set is from 1 to 1024. So an element could have any value 1-1024.
I'm trying to create a structure for a SET, and it must be efficient as possible for the memory part.
I tried:
/* A structure whose only member is a single 1-bit unsigned bit-field. */
typedef struct set
{
unsigned int var: 1;
} SET;
//now define an array of SETS
SET array_of_sets[MAX_SIZE] //didn't define MAX_SIZE, but no more than 1024 elements in each set.
I know this isn't efficient; maybe it's even not good for what I want. That's why I'm looking for help.
As noted in extensive comments, using a bit field is not the way to go. You can use just 128 bytes of storage for your set containing values 1..1024. You will need to map the value N to bit N-1 (so you have bits 0..1023 to work with). You also need to decide on the operations you need for your set. This code supports 'create', 'destroy', 'insert', 'delete' and 'in_set'. It does not support iteration over the elements in the set; that can be added if you want it.
sets.h
/* Public interface of a set of integers in the range 1..MAX_ELEMENTS. */
#ifndef SETS_H_INCLUDED
#define SETS_H_INCLUDED
/* Opaque set type; its layout is defined in sets.c. */
typedef struct Set Set;
/* Largest value that can be stored; valid values are 1..MAX_ELEMENTS. */
enum { MAX_ELEMENTS = 1024 };
/* Allocate an empty set; returns a null pointer if allocation fails. */
extern Set *create(void);
/* Release a set obtained from create(). */
extern void destroy(Set *set);
/* Add value (1..MAX_ELEMENTS) to the set. */
extern void insert(Set *set, int value);
/* Remove value (1..MAX_ELEMENTS) from the set. */
extern void delete(Set *set, int value);
/* Return 1 if value (1..MAX_ELEMENTS) is in the set, 0 otherwise. */
extern int in_set(Set *set, int value);
#endif /* SETS_H_INCLUDED */
sets.c
#include "sets.h"
#include <assert.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
/* Storage word of the bit set. */
typedef unsigned long Bits;
/* Convert a constant to the Bits type so that shifts happen at full width. */
#define BITS_C(n) ((Bits)(n))
/* Number of Bits words needed to hold MAX_ELEMENTS one-bit flags. */
enum { ARRAY_SIZE = MAX_ELEMENTS / (sizeof(Bits) * CHAR_BIT) };
/* The set itself: one bit per possible value, value N stored as bit N-1. */
struct Set
{
Bits set[ARRAY_SIZE];
};
Set *create(void)
{
Set *set = malloc(sizeof(*set));
if (set != 0)
memset(set, 0, sizeof(*set));
return set;
}
/* Release the storage of a set; the pointer is passed straight to free(). */
void destroy(Set *set)
{
free(set);
}
/*
 * Add value to the set by setting its bit.
 * value must be in 1..MAX_ELEMENTS (checked with assert); value N is stored
 * as bit N-1. All declarations come before the first statement so the
 * function is valid C90.
 */
void insert(Set *set, int value)
{
    int index;   /* which Bits word holds the bit */
    int bitnum;  /* position of the bit inside that word */
    Bits mask;

    assert(value >= 1 && value <= MAX_ELEMENTS);
    value--; /* 0..1023 */
    index = value / (sizeof(Bits) * CHAR_BIT);
    bitnum = value % (sizeof(Bits) * CHAR_BIT);
    mask = BITS_C(1) << bitnum;
    /* printf("I: %d (%d:%d:0x%.2lX)\n", value+1, index, bitnum, mask); */
    set->set[index] |= mask;
}
/*
 * Remove value from the set by clearing its bit.
 * value must be in 1..MAX_ELEMENTS (checked with assert); value N is stored
 * as bit N-1. All declarations come before the first statement so the
 * function is valid C90.
 */
void delete(Set *set, int value)
{
    int index;   /* which Bits word holds the bit */
    int bitnum;  /* position of the bit inside that word */
    Bits mask;

    assert(value >= 1 && value <= MAX_ELEMENTS);
    value--; /* 0..1023 */
    index = value / (sizeof(Bits) * CHAR_BIT);
    bitnum = value % (sizeof(Bits) * CHAR_BIT);
    mask = BITS_C(1) << bitnum;
    /* printf("D: %d (%d:%d:0x%.2lX)\n", value+1, index, bitnum, mask); */
    set->set[index] &= ~mask;
}
/* C90 does not support <stdbool.h> */
/*
 * Test membership: returns 1 when value is in the set, 0 otherwise.
 * value must be in 1..MAX_ELEMENTS (checked with assert); value N is stored
 * as bit N-1. All declarations come before the first statement so the
 * function is valid C90.
 */
int in_set(Set *set, int value)
{
    int index;   /* which Bits word holds the bit */
    int bitnum;  /* position of the bit inside that word */
    Bits mask;

    assert(value >= 1 && value <= MAX_ELEMENTS);
    value--; /* 0..1023 */
    index = value / (sizeof(Bits) * CHAR_BIT);
    bitnum = value % (sizeof(Bits) * CHAR_BIT);
    mask = BITS_C(1) << bitnum;
    /* printf("T: %d (%d:%d:0x%.2lX) = %d\n", value+1, index, bitnum, mask,
    (set->set[index] & mask) != 0); */
    return (set->set[index] & mask) != 0;
}
#include <stdio.h>
enum { NUMBERS_PER_LINE = 15 };
/*
 * Demonstration: insert every 4th value starting at 1, delete every 6th
 * value starting at 3, then print the members, NUMBERS_PER_LINE per line.
 */
int main(void)
{
    Set *set = create();
    int value;
    int on_line = 0;   /* members printed on the current output line */

    if (set == 0)
        return 0;

    for (value = 1; value <= MAX_ELEMENTS; value += 4)
        insert(set, value);
    for (value = 3; value <= MAX_ELEMENTS; value += 6)
        delete(set, value);

    for (value = 1; value <= MAX_ELEMENTS; value++)
    {
        if (!in_set(set, value))
            continue;
        printf(" %4d", value);
        if (++on_line % NUMBERS_PER_LINE == 0)
        {
            putchar('\n');
            on_line = 0;
        }
    }
    /* finish a partially filled last line */
    if (on_line % NUMBERS_PER_LINE != 0)
        putchar('\n');
    destroy(set);
    return 0;
}
The functions should really be given a systematic prefix, such as set_. The BITS_C macro is based on the INT64_C macro (and the other related macros) defined in <stdint.h> in C99 and later, which is also not a part of C90.
As per my previous comments, here is an example of how you can pack eight 1-bit elements into one char physical element.
I have only implemented the function to get the value of a 1-bit element, I leave the function to set it to you (it's easy to do).
Note: you can easily change the type of the array element (unsigned char) and experiment with types which can hold more bits (e.g unsigned int) and test if they perform better in terms of speed.
You can also modify the code to make it handle elements bigger than one bit.
#include <stdio.h>
#include <limits.h>
/*
 * Read one 1-bit element from a bit-packed array.
 * Element `index` lives in byte index / CHAR_BIT at bit position
 * index % CHAR_BIT (bit 0 is the least significant). Returns 1 or 0.
 */
unsigned int get_el(unsigned char* array, unsigned int index)
{
    const unsigned int width = sizeof(unsigned char)*CHAR_BIT;
    const unsigned int cell = array[index / width];

    return (cell >> (index % width)) & 1u;
}
#define MAX_SIZE 10
unsigned char bitarray[MAX_SIZE];
/* Set the two lowest bits of byte 1 and print elements 7 through 10. */
int main()
{
    unsigned int e7, e8, e9, e10;

    bitarray[1] = 3; // 00000011
    e7 = get_el(bitarray, 7);
    e8 = get_el(bitarray, 8);
    e9 = get_el(bitarray, 9);
    e10 = get_el(bitarray,10);
    printf("array[7]=%u, array[8]=%u, array[9]=%u, array[10]=%u\n",
           e7, e8, e9, e10);
    return 0;
}
outputs
array[7]=0, array[8]=1, array[9]=1, array[10]=0
typedef struct set
{
unsigned short var:10; // uint var:1 will be padded to 32 bits
} SET; // ushort var:10 (which is max<=1024) padded to 16 bits
As was commented by #Jonathan Leffler use array(unsigned short[])
and define bitmasks
#define bitZer 0x00 //(unsigned)(0 == 0)? true:true;
#define bitOne 0x10 // so from (both inclusive)0-1023 = 1024
... // added for clarification
#define bitTen 0x0A
to look into the bits of each element.
http://www.catb.org/esr/structure-packing/ detailed
To store a value from 0 to 1023 (or from 1 to 1024, which is essentially the same and only involves adding/subtracting 1) you need a minimum of 10 bits.
This means that for 32-bit (unsigned) integers, you can pack 3 values into 30 bits, which gives 2 bits of useless padding.
Example:
#define ELEMENTS 100
/* Three 10-bit values per 32-bit word; the top 2 bits of each word are unused. */
uint32_t myArray[ (ELEMENTS + 2) / 3 ];

/*
 * Store value (1..1024) as element n (0..ELEMENTS-1).
 * The value is kept as value-1 in a 10-bit slot. Calls with n or value out
 * of range are ignored, so neighbouring slots can never be overwritten.
 */
void setValue(int n, int value) {
    const uint32_t mask = (1u << 10) - 1;
    uint32_t temp;
    unsigned shift;

    if(n < 0 || n >= ELEMENTS) return;
    if(value < 1 || value > 1024) return;
    shift = (unsigned)(n % 3) * 10;     // bit offset of the slot inside its word
    temp = myArray[n / 3];
    // Convert "1 to 1024" into "0 to 1023" and replace only this slot
    temp = (temp & ~(mask << shift)) | ((uint32_t)(value - 1) << shift);
    myArray[n / 3] = temp;
}

/*
 * Return element n (0..ELEMENTS-1) as a value in 1..1024, or 0 when n is
 * out of range. A slot that was never written reads back as 1.
 */
int getValue(int n) {
    const uint32_t mask = (1u << 10) - 1;
    uint32_t temp;

    if(n < 0 || n >= ELEMENTS) return 0;
    temp = myArray[n / 3];
    temp >>= (unsigned)(n % 3) * 10;
    return (int)(temp & mask) + 1;      // keep the low 10 bits, not their complement
}
You can do this with bitfields instead, but the code to get/set individual values will end up using branches (e.g. switch( n%3 )) which will be slower in practice.
Removing those 2 bits of padding will cost a little more complexity and a little more overhead. For example:
#define ELEMENTS 100
/* Values are packed back to back, 10 bits each, with no padding between them. */
uint32_t myArray[ (ELEMENTS*10 + 31) / 32 ];

/*
 * Return element n (0..ELEMENTS-1) as a value in 1..1024, or 0 when n is
 * out of range. Element n occupies bits n*10 .. n*10+9 of the array and may
 * straddle two neighbouring 32-bit words.
 */
int getValue(int n) {
    enum { WORDS = (ELEMENTS*10 + 31) / 32 };
    const uint64_t mask = (1u << 10) - 1;
    uint64_t temp;
    unsigned word;
    unsigned shift;

    if(n < 0 || n >= ELEMENTS) return 0;
    word = (unsigned)n * 10 / 32;
    shift = (unsigned)n * 10 % 32;
    temp = myArray[word];
    // only fetch the following word when it exists; an element that starts in
    // the last word never extends beyond it
    if(word + 1 < WORDS)
        temp |= (uint64_t)myArray[word + 1] << 32;
    temp >>= shift;
    return (int)(temp & mask) + 1;      // keep the low 10 bits, not their complement
}
This can't be done with bitfields. This is the most space efficient way to store an array of values that range from 1 to 1024.
If you are storing an "array of booleans" or setting flags, it can be useful. For instance, you can initialize or compare up to 64 values at a time.
These macros will work for unsigned char, short, int, long long ... but simplifies significantly if you just pick a type (so you can use a safer static inline function)
#define getbit(x,n) x[n/(sizeof(*x)*8)] & (typeof(*x))1 << (n&((sizeof(*x)*8)-1))
#define setbit(x,n) x[n/(sizeof(*x)*8)] |= (typeof(*x))1 << (n&((sizeof(*x)*8)-1))
#define flpbit(x,n) x[n/(sizeof(*x)*8)] ^= (typeof(*x))1 << (n&((sizeof(*x)*8)-1))
#define clrbit(x,n) x[n/(sizeof(*x)*8)] &= ~( (typeof(*x))1 << (n&((sizeof(*x)*8)-1)) )
to initialize a large array of booleans all you need to do is: char cbits[]={0,0xF,0,0xFF};
or for all zeroes char cbits[4]={0};
or an int example: int ibits[]={0xF0F0F0F0,~0};
//1111000011110000111100001111000011111111111111111111111111111111
If you will only be accessing 1 type of array, it may be better to make the macros into proper functions like:
/*
 * Return bit n of the byte array x, left in place: the result is 0 when the
 * bit is clear and the bit's own mask value (1, 2, 4, ...) when it is set.
 */
static inline unsigned char getbit(unsigned char *x, unsigned n){
    const unsigned byte_index = n >> 3;   /* n / 8 */
    const int bit_mask = 1 << (n & 7);    /* n % 8 selects the bit */

    return x[byte_index] & bit_mask;
}
//etc... similar for other types and functions from macros above
You can also compare multiple flags at a time by '|'ing the flags together and using '&'ed masks; however, it does get a bit more complex when you exceed the native types
For your particular instance you can initialize to all zeroes by:
unsigned char flags[128]={0};
or all 1's by:
/* 1024 flags = 16 words of 64 bits; every word is listed so all bits start as 1 */
uint64_t flags[16] = {~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0};
You can even use enums to name your flags
enum{
WHITE, //0
RED, //1
BLUE, //2
GREEN, //3
...
BLACK //1023
}
if (getbit(flags,WHITE) && getbit(flags,RED) && getbit(flags,BLUE))
printf("red, white and blue\n");
1) The proper solution for this question is to use Bit Array
The question provided the solution with Bit Fields with Struct. There are two typical ways to save memory space for bits related problem, another is to use Bit Array. For this specific case in the question, the better way is to use Bit Array (demoed as follows).
If it is the case like purely independent bit flags here, go
for the Bit Array
If there is a group of related bits, such as an IP address or a Control Word definition, then it's better to combine them in a struct, that is, to use Bit Fields with Struct
2) Sample code just for demo Bit Array
#include<limits.h>
#define BITS_OF_INT (sizeof(int)*CHAR_BIT)

/*
 * Bit-array helpers over an array of int. Bit k lives in A[k / BITS_OF_INT]
 * at position k % BITS_OF_INT. All shifting and masking is done on unsigned
 * values: shifting a signed 1 into the sign bit (position BITS_OF_INT-1) is
 * undefined behaviour.
 */
void SetBit(int A[], int k)
{
    //Set the bit at the k-th position
    const unsigned mask = 1u << (k%BITS_OF_INT);
    A[k/BITS_OF_INT] = (int)((unsigned)A[k/BITS_OF_INT] | mask);
}
void ClearBit(int A[], int k)
{
    //RESET the bit at the k-th position
    const unsigned mask = 1u << (k%BITS_OF_INT);
    A[k/BITS_OF_INT] = (int)((unsigned)A[k/BITS_OF_INT] & ~mask);
}
int TestBit(int A[], int k)
{
    // Return TRUE if bit set
    const unsigned mask = 1u << (k%BITS_OF_INT);
    return (((unsigned)A[k/BITS_OF_INT] & mask) != 0);
}
#define MAX_SIZE 1024
/*
 * Demonstration of the bit-array helpers: clear a MAX_SIZE-bit array, set one
 * bit, test it and clear it again.
 */
int main()
{
    int A[MAX_SIZE/BITS_OF_INT];
    int i;
    int pos = 100; // position

    for (i = 0; i < MAX_SIZE/BITS_OF_INT; i++)
        A[i] = 0;
    SetBit(A, pos);
    if (TestBit(A, pos)) {
        // do something
        // (the closing brace sits on its own line so the comment cannot swallow it)
    }
    ClearBit(A, pos);
    return 0;
}
3) Furthermore, a worthwhile discussing point from this question is,
How to choose a proper solution between "Bit Array" and "Bit fields with struct"?
Here are some references about this topic.
When to use bit-fields in C?
Readable and Maintainable Bitfields in C
I completed some bit manipulation exercises out of a textbook recently and have grasped onto some of the core ideas behind manipulating bits firmly. My main concern with making this post is for optimizations to my current code. I get the hunch that there are some functions that I could approach better. Do you have any recommendations for the following code?
#include <stdio.h>
#include "funcs.h"
// number of bits in an unsigned int, found by shifting an all-ones value
// right until nothing is left
unsigned int int_size(){
    unsigned int probe = ~00u;
    int bits = 0;
    while(probe > 0){
        probe >>= 1;
        bits++;
    }
    return bits;
}
// read the bit at index n, where index 0 is the most significant bit
// returns 0 or 1
unsigned int bit_get(unsigned int data, unsigned int n){
    const unsigned int shift = int_size() - n - 1;
    return (data >> shift) & 1;
}
// set a bit at a specific nth index and return the modified value
// index starts with 0 on the most significant bit
// the mask is built from an unsigned 1: for n == 0 the shift reaches the top
// bit, which is undefined behaviour for a signed 1
unsigned int bit_set(unsigned int data, unsigned int n){
    return data | (1u << (int_size() - n - 1));
}
// gets the bit width of the data: the 1-based position of its highest set bit
// a value of 0 reports a width of 1, matching the original convention
// the value is shifted right until it is exhausted, so no shift ever reaches
// the full width of an int (which would be undefined behaviour)
unsigned int bit_width(unsigned int data){
    unsigned int width = 0;
    while(data != 0){
        data >>= 1;
        width++;
    }
    return width == 0 ? 1 : width;
}
// print data as zero-padded hexadecimal, then bit by bit starting with the
// most significant bit, followed by a newline
void print_data(unsigned int data){
    unsigned int pos;
    printf("%016X = ",data);
    for(pos = 0; pos < int_size(); pos++){
        const unsigned int bit = bit_get(data,pos);
        printf("%X",bit);
    }
    putchar('\n');
}
// search for pattern in source (where pattern is n wide)
// slides an n-bit window over source from the most significant end; on the
// first window equal to pattern it returns i - bit_width(source), otherwise
// -1 (converted to unsigned int by the return type)
// NOTE(review): the shift (right - i) runs from right down to 1, so the
// window at shift 0 (the lowest n bits) is never compared -- confirm intent
// NOTE(review): the meaning of subtracting bit_width(source) from i is not
// evident from this code -- confirm with the exercise statement
unsigned int bitpat_search(unsigned int source, unsigned int pattern,
unsigned int n){
int right = int_size() - n;
unsigned int mask = 0;
// build a mask with the n lowest bits set
for(int i = 0; i < n; i++)
mask |= 1 << i;
for(int i = 0; i < right; i++)
// isolate the window, move it down to bit 0 and compare with pattern
if(((source & (mask << (right - i))) >> (right - i) ^ pattern) == 0)
return i - bit_width(source);
return -1;
}
// extract {count} bits from data starting at {start}
// start counts from the most significant bit; the selected bits are returned
// shifted down to bit 0
// returns -1 (converted to unsigned int) when start or count is negative,
// when either is >= int_size(), or when bit_width(data) differs from count
// NOTE(review): rejecting every data whose bit_width is not exactly count
// looks unintended for an extraction routine -- confirm
unsigned int bitpat_get(unsigned int data, int start, int count){
if(start < 0 || count < 0 || int_size() <= start || int_size() <= count || bit_width(data) != count)
return -1;
// mask starts at 1, so bit 0 is set even when count is 0
unsigned int mask = 1;
for(int i = 0; i < count; i++)
mask |= 1 << i;
// move the mask up to the requested field, then bring the field down to bit 0
mask <<= int_size() - start - count;
return (data & mask) >> (int_size() - start - count);
}
// set {count} bits (basically width of {replace}) in {*data} starting at {start}
// start counts from the most significant bit
// does nothing when start or count is negative, when either is >= int_size(),
// or when bit_width(replace) differs from count
void bitpat_set(unsigned int *data, unsigned int replace, int start, int count){
if(start < 0 || count < 0 || int_size() <= start || int_size() <= count || bit_width(replace) != count)
return;
// mask starts at 1, so bit 0 is set even when count is 0
unsigned int mask = 1;
for(int i = 0; i < count; i++)
mask |= 1 << i;
// the field is first set to all ones, then cleared, then replace is ORed in
// at the field position
*data = ((*data | (mask << (int_size() - start - count))) & ~(mask << (int_size() - start - count))) | (replace << (int_size() - start - count));
}
because your int_size() function returns the same value each time you could save some time there:
// number of bits in an unsigned int; computed on the first call and kept in
// a static variable for all later calls
unsigned int int_size(){
    static unsigned int cached = 0;
    if (cached != 0)
        return cached;
    unsigned int probe = ~00u;
    while (probe > 0) {
        probe >>= 1;
        cached++;
    }
    return cached;
}
so it will calculate the value only once.
But replacing all calls of this function by sizeof(int)*8 would be much better.
I looked through your code and there's nothing that jumps out at me.
Overall, don't sweat the small stuff. If the code runs and works fine, no worries. If you are really concerned about performance, go ahead and run your code through a profiler.
Overall, I will say that the one thing you might be dealing with is the "paranoia" I see in your code regarding the width of an int. I generally use the fixed-length types in stdint.h and give the caller some options regarding what length of ints (i.e. uint8_t, uint16_t, uint32_t, etc.) they want to deal with.
Also, in C99, there are bitfields, which allow for each bit to be addressed into.
// number of bits in an unsigned int: the population count of an all-ones
// value. (Dividing by the bit count of a char would give the size in bytes,
// which is not what the callers of int_size() shift by.)
// __builtin_popcount is a GCC/Clang builtin.
unsigned int int_size(){
    return (unsigned int) __builtin_popcount(~0u);
}
This should be faster than looping.
Including int_size() in all the others seems like its going to kill performance unless the compiler is really good at optimizing that loop out.
You could use a uint32_t instead of an int and then you would know up front the size.
You could also use sizeof(int) to get the size in bytes of an int and multiply by 8. I haven't seen an environment that defined a byte to be other than 8 bits, but the standard does seem to allow for it in saying it is implementation defined.
I've been developing a cryptographic algorithm on the GPU and currently stuck with an algorithm to perform large integer addition. Large integers are represented in a usual way as a bunch of 32-bit words.
For example, we can use one thread to add two 32-bit words. For simplicity, let assume
that the numbers to be added are of the same length and number of threads per block == number of words. Then:
// Sketch of a big-integer addition kernel: each thread adds the word of A and
// B selected by threadIdx.x and writes the result word to C.
// The carry-propagation step is only a placeholder (the dotted line), and
// newcarry is not defined anywhere in this excerpt.
// NOTE(review): x, y and z are signed int, so (z < x) is presumably meant as
// an unsigned-overflow test -- confirm the intended word type.
__global__ void add_kernel(int *C, const int *A, const int *B) {
int x = A[threadIdx.x];
int y = B[threadIdx.x];
int z = x + y;
int carry = (z < x);
/** do carry propagation in parallel somehow ? */
............
z = z + newcarry; // update the resulting words after carry propagation
C[threadIdx.x] = z;
}
I am pretty sure that there is a way to do carry propagation via some tricky reduction procedure but could not figure it out..
I had a look at CUDA thrust extensions but big integer package seems not to be implemented yet.
Perhaps someone can give me a hint how to do that on CUDA ?
You are right, carry propagation can be done via prefix sum computation but it's a bit tricky to define the binary function for this operation and prove that it is associative (needed for parallel prefix sum). As a matter of fact, this algorithm is used (theoretically) in Carry-lookahead adder.
Suppose we have two large integers a[0..n-1] and b[0..n-1].
Then we compute (i = 0..n-1):
s[i] = a[i] + b[i];
carryin[i] = (s[i] < a[i]);
We define two functions:
generate[i] = carryin[i];
propagate[i] = (s[i] == 0xffffffff);
with quite intuitive meaning: generate[i] == 1 means that the carry is generated at
position i while propagate[i] == 1 means that the carry will be propagated from position
(i - 1) to (i + 1). Our goal is to compute the function carryout[0..n-1] used to update the resulting sum s[0..n-1]. carryout can be computed recursively as follows:
carryout[i] = generate[i] OR (propagate[i] AND carryout[i-1])
carryout[0] = 0
Here carryout[i] == 1 if carry is generated at position i OR it is generated sometimes earlier AND propagated to position i. Finally, we update the resulting sum:
s[i] = s[i] + carryout[i-1]; for i = 1..n-1
carry = carryout[n-1];
Now it is quite straightforward to prove that carryout function is indeed binary associative and hence parallel prefix sum computation applies. To implement this on CUDA, we can merge both flags 'generate' and 'propagate' in a single variable since they are mutually exclusive, i.e.:
cy[i] = (s[i] == -1u ? -1u : 0) | carryin[i];
In other words,
cy[i] = 0xffffffff if propagate[i]
cy[i] = 1 if generate[i]
cy[i] = 0 otherwise
Then, one can verify that the following formula computes prefix sum for carryout function:
cy[i] = max((int)cy[i], (int)cy[k]) & cy[i];
for all k < i. The example code below shows large addition for 2048-word integers. Here I used CUDA blocks with 512 threads:
// add & output carry flag
// emits the PTX instruction add.cc.u32: c = a + b
// the expansion already ends in ';', so it is used without a trailing ';'
#define UADDO(c, a, b) \
asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
// add with carry & output carry flag
// emits the PTX instruction addc.cc.u32: c = a + b + carry-in
// NOTE(review): chaining these macros assumes the carry flag survives from
// one asm statement to the next -- confirm against the PTX ISA documentation
#define UADDC(c, a, b) \
asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
#define WS 32
// Adds two large integers stored as arrays of 32-bit words: g_R = g_A + g_B.
// Each thread loads one uint4 (four words) from g_A and g_B at index
// threadIdx.x, adds them with a carry chain, and the per-thread carry flags
// are then combined in shared memory before the final carry is added in.
// Scratch space comes from the dynamically sized shared array `shared`.
// N_THIDS is fixed at 512 inside the kernel.
// NOTE(review): presumably must be launched with exactly 512 threads per
// block and g_A/g_B/g_R aligned for uint4 access -- confirm with the caller.
// NOTE(review): the scan steps write and read `scan`/`t` between lanes of a
// warp with no barrier in between (only volatile); this presumably relies on
// warp-synchronous execution -- confirm for the target architecture.
__global__ void bignum_add(unsigned *g_R, const unsigned *g_A,const unsigned *g_B) {
extern __shared__ unsigned shared[];
unsigned *r = shared;
const unsigned N_THIDS = 512;
// thid_in_warp is thid & (WS-1): '-' binds tighter than '&'
unsigned thid = threadIdx.x, thid_in_warp = thid & WS-1;
unsigned ofs, cf;
uint4 a = ((const uint4 *)g_A)[thid],
b = ((const uint4 *)g_B)[thid];
UADDO(a.x, a.x, b.x) // adding 128-bit chunks with carry flag
UADDC(a.y, a.y, b.y)
UADDC(a.z, a.z, b.z)
UADDC(a.w, a.w, b.w)
UADDC(cf, 0, 0) // save carry-out
// memory consumption: 49 * N_THIDS / 64
// use "alternating" data layout for each pair of warps
volatile short *scan = (volatile short *)(r + 16 + thid_in_warp +
49 * (thid / 64)) + ((thid / 32) & 1);
scan[-32] = -1; // put identity element
if(a.x == -1u && a.x == a.y && a.x == a.z && a.x == a.w)
// this indicates that carry will propagate through the number
cf = -1u;
// "Hillis-and-Steele-style" reduction
// each step combines this thread's flag with the flag stored 1, 2, 4, 8 and
// 16 threads earlier (scan is a short pointer into 32-bit slots, hence the
// doubled offsets -2 .. -32)
scan[0] = cf;
cf = max((int)cf, (int)scan[-2]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-4]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-8]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-16]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-32]) & cf;
scan[0] = cf;
int *postscan = (int *)r + 16 + 49 * (N_THIDS / 64);
if(thid_in_warp == WS - 1) // scan leading carry-outs once again
postscan[thid >> 5] = cf;
__syncthreads();
// the first N_THIDS / 32 threads repeat the same combine over the per-warp results
if(thid < N_THIDS / 32) {
volatile int *t = (volatile int *)postscan + thid;
t[-8] = -1; // load identity symbol
cf = t[0];
cf = max((int)cf, (int)t[-1]) & cf;
t[0] = cf;
cf = max((int)cf, (int)t[-2]) & cf;
t[0] = cf;
cf = max((int)cf, (int)t[-4]) & cf;
t[0] = cf;
}
__syncthreads();
cf = scan[0];
int ps = postscan[(int)((thid >> 5) - 1)]; // postscan[-1] equals to -1
scan[0] = max((int)cf, ps) & cf; // update carry flags within warps
// take the flag of the previous thread; the first lane of a warp takes the
// result of the previous warp instead
cf = scan[-2];
if(thid_in_warp == 0)
cf = ps;
// a negative value is the "propagate" marker, not an actual carry
if((int)cf < 0)
cf = 0;
UADDO(a.x, a.x, cf) // propagate carry flag if needed
UADDC(a.y, a.y, 0)
UADDC(a.z, a.z, 0)
UADDC(a.w, a.w, 0)
((uint4 *)g_R)[thid] = a;
}
Note that macros UADDO / UADDC might not be necessary anymore since CUDA 4.0 has corresponding intrinsics (however I am not entirely sure).
Also remark that, though parallel reduction is quite fast, if you need to add several large integers in a row, it might be better to use some redundant representation (which was suggested in comments above), i.e., first accumulate the results of additions in 64-bit words, and then perform one carry propagation at the very end in "one sweep".
I thought I would post my answer also, in addition to #asm, so this SO question can be a sort of repository of ideas. Similar to #asm, I detect and store the carry condition as well as the "carry-through" condition, ie. when the intermediate word result is all 1's (0xF...FFF) so that if a carry were to propagate into this word, it would "carry-through" to the next word.
I didn't use any PTX or asm in my code, so I chose to use 64-bit unsigned ints instead of 32-bit, to achieve the 2048x32bit capability, using 1024 threads.
A larger difference from #asm's code is in my parallel carry propagation scheme. I construct a bit-packed array ("carry") where each bit represents the carry condition generated from the independent intermediate 64-bit adds from each of the 1024 threads. I also construct a bit-packed array ("carry_through") where each bit represents the carry_through condition of the individual 64-bit intermediate results. For 1024 threads, this amounts to 1024/64 = 16x64 bit words of shared memory for each bit-packed array, so total shared mem usage is 64+3 32bit quantites. With these bit packed arrays, I perform the following to generate a combined propagated carry indicator:
carry = carry | (carry_through ^ ((carry & carry_through) + carry_through));
(note that carry is shifted left by one: carry[i] indicates that the result of a[i-1] + b[i-1] generated a carry)
The explanation is as follows:
the bitwise and of carry and carry_through generates the candidates where a carry will
interact with a sequence of one or more carry though conditions
adding the result of step one to carry_through generates a result which
has changed bits which represent all words that will be affected by
the propagation of the carry into the carry_through sequence
taking the exclusive-or of carry_through plus the result from step 2
shows the affected results indicated with a 1 bit
taking the bitwise or of the result from step 3 and the ordinary
carry indicators gives a combined carry condition, which is then
used to update all the intermediate results.
Note that the addition in step 2 requires another multi-word add (for big ints composed of more than 64 words). I believe this algorithm works, and it has passed the test cases I have thrown at it.
Here is my example code which implements this:
// parallel add of large integers
// requires CC 2.0 or higher
// compile with:
// nvcc -O3 -arch=sm_20 -o paradd2 paradd2.cu
#include <stdio.h>
#include <stdlib.h>
// Largest problem size supported by one thread block: each addition operand
// may consist of up to MAXSIZE 64-bit words.
#define MAXSIZE 1024 // the number of 64 bit quantities that can be added
#define LLBITS 64 // the number of bits in a long long
// Number of 64-bit words needed to hold one flag bit per operand word
// (MAXSIZE / LLBITS, rounded up).
#define BSIZE ((MAXSIZE + LLBITS -1)/LLBITS) // MAXSIZE when packed into bits
// Threads per block used for the kernel launch in main().
#define nTPB MAXSIZE
// define either GPU or GPUCOPY, not both -- for timing
// With GPU, the timing events in main() bracket only the kernel launch;
// with GPUCOPY they also bracket the host<->device copies.
#define GPU
//#define GPUCOPY
// Number of test iterations run by main(); the reported times are averaged
// over this many iterations.
#define LOOPCNT 1000
// Fetches (and clears) the most recent CUDA runtime error. If there is one,
// prints msg together with the CUDA error string and the file/line of the
// macro invocation to stderr, then terminates the process with exit code 1.
// Wrapped in do { } while (0) so it behaves as a single statement.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t status_ = cudaGetLastError(); \
        if (status_ == cudaSuccess) break; \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(status_), \
                __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)
// perform c = a + b, for unsigned integers of psize*64 bits.
// all work done in a single threadblock.
// multiple threadblocks are handling multiple separate addition problems
// least significant word is at a[0], etc.
//
// Launch layout: 1D grid, 1D blocks. Block k adds the psize words starting at
// offset k*psize, one 64-bit word per thread, so each block needs at least
// psize threads (psize <= MAXSIZE). Threads with threadIdx.x >= psize, or
// whose word index is >= size, do no arithmetic but still take part in every
// barrier.
//
// Parameters:
//   size  - total number of 64-bit words in each of a, b and c
//   psize - number of 64-bit words per addition problem
//   c     - output words (device memory)
//   a, b  - input words (device memory)
//
// Shared memory: static only (two bit-packed flag arrays plus two 32-bit
// second-level flag words).
//
// All __syncthreads() calls are placed outside the per-thread guards so that
// every thread in the block reaches every barrier, and the second-level carry
// resolution is separated by barriers instead of relying on implicit
// warp-synchronous execution.
__global__ void paradd(const unsigned size, const unsigned psize, unsigned long long *c, const unsigned long long *a, const unsigned long long *b){
  // bit i of carry_through: word i of the raw sum is all ones, so an incoming
  // carry would ripple straight through it
  __shared__ unsigned long long carry_through[BSIZE];
  // bit i of carry: word i-1 generated a carry (i.e. carry *into* word i);
  // the extra element receives the carry out of the last word of a full array
  __shared__ unsigned long long carry[BSIZE+1];
  // same two flag sets, one level up: one bit per 64-bit word of the packed arrays
  __shared__ volatile unsigned mcarry;
  __shared__ volatile unsigned mcarry_through;

  const unsigned tid = threadIdx.x;
  const unsigned idx = tid + (psize * blockIdx.x);
  // thread owns a valid word of this block's problem
  const bool active = (tid < psize) && (idx < size);
  // thread additionally owns one word of the bit-packed flag arrays
  const bool packed = active && (tid < ((psize + LLBITS-1)/LLBITS));

  // handle 64 bit unsigned add first
  unsigned long long cr1 = 0;
  unsigned long long lc = 0;
  if (active){
    cr1 = a[idx];
    lc = cr1 + b[idx];
  }

  // clear all flag storage, including the carry-out slot carry[BSIZE]
  if (tid < BSIZE){
    carry[tid] = 0;
    carry_through[tid] = 0;
  }
  if (tid == 0){
    carry[BSIZE] = 0;
    mcarry = 0;
    mcarry_through = 0;
  }
  __syncthreads();

  if (active){
    // handle carry: unsigned wrap-around means this word generated a carry,
    // recorded one bit position higher (carry into the next word)
    if (lc < cr1){
      if ((tid%LLBITS) != (LLBITS-1))
        atomicAdd(&(carry[tid/LLBITS]), (2ull<<(tid%LLBITS)));
      else atomicAdd(&(carry[(tid/LLBITS)+1]), 1ull);
    }
    // handle carry-through
    if (lc == 0xFFFFFFFFFFFFFFFFull)
      atomicAdd(&(carry_through[tid/LLBITS]), (1ull<<(tid%LLBITS)));
  }
  __syncthreads();

  // start of sub-add: multi-word add of (carry & carry_through) + carry_through
  unsigned long long cr2 = 0;
  unsigned long long cr3 = 0;
  if (packed){
    cr3 = carry_through[tid];
    cr1 = carry[tid] & cr3;
    cr2 = cr3 + cr1;
    if (cr2 < cr1) atomicAdd((unsigned *)&mcarry, (2u<<tid));
    if (cr2 == 0xFFFFFFFFFFFFFFFFull) atomicAdd((unsigned *)&mcarry_through, (1u<<tid));
  }
  __syncthreads();  // all second-level flags must be posted before thread 0 combines them

  if (packed && (tid == 0)) {
    // same propagation formula, applied to the second-level flags
    unsigned cr4 = mcarry & mcarry_through;
    cr4 += mcarry_through;
    mcarry |= (mcarry_through ^ cr4);
  }
  __syncthreads();  // combined mcarry must be visible to every packed thread

  if (packed){
    if (mcarry & (1u<<tid)) cr2++;
    // end of sub-add
    carry[tid] |= (cr2 ^ cr3);
  }
  __syncthreads();

  if (active){
    // apply the fully propagated carry-in to this thread's word
    if (carry[tid/LLBITS] & (1ull<<(tid%LLBITS))) lc++;
    c[idx] = lc;
  }
}
// Test and timing driver for paradd.
// Builds at_once independent additions of prob_size 64-bit words each, runs
// them on the GPU LOOPCNT times (three hand-built carry-propagation cases
// first, then random operands), recomputes each sum on the CPU and compares
// word by word. Returns 0 when every iteration matches and 1 on a word-size
// check failure, host allocation failure or result mismatch. CUDA errors
// abort through cudaCheckErrors.
int main() {
  unsigned long long *h_a, *h_b, *h_c, *d_a, *d_b, *d_c, *c;
  const unsigned at_once = 256; // valid range = 1 .. 65535
  const unsigned prob_size = MAXSIZE ; // valid range = 1 .. MAXSIZE
  const unsigned dsize = at_once * prob_size;
  const size_t nbytes = dsize * sizeof(unsigned long long);
  // avoid a zero divisor in the progress indicator when LOOPCNT < 10
  const unsigned tick = (LOOPCNT >= 10) ? (LOOPCNT/10) : 1;
  cudaEvent_t t_start_gpu, t_start_cpu, t_end_gpu, t_end_cpu;
  float et_gpu = 0, et_cpu = 0, tot_gpu = 0, tot_cpu = 0;
  bool mismatch = false;

  if (sizeof(unsigned long long) != (LLBITS/8)) {printf("Word Size Error\n"); return 1;}
  if ((c = (unsigned long long *)malloc(nbytes)) == 0) {printf("Malloc Fail\n"); return 1;}
  cudaHostAlloc((void **)&h_a, nbytes, cudaHostAllocDefault);
  cudaCheckErrors("cudaHostAlloc1 fail");
  cudaHostAlloc((void **)&h_b, nbytes, cudaHostAllocDefault);
  cudaCheckErrors("cudaHostAlloc2 fail");
  cudaHostAlloc((void **)&h_c, nbytes, cudaHostAllocDefault);
  cudaCheckErrors("cudaHostAlloc3 fail");
  cudaMalloc((void **)&d_a, nbytes);
  cudaCheckErrors("cudaMalloc1 fail");
  cudaMalloc((void **)&d_b, nbytes);
  cudaCheckErrors("cudaMalloc2 fail");
  cudaMalloc((void **)&d_c, nbytes);
  cudaCheckErrors("cudaMalloc3 fail");
  cudaMemset(d_c, 0, nbytes);
  cudaCheckErrors("cudaMemset fail");
  cudaEventCreate(&t_start_gpu);
  cudaEventCreate(&t_end_gpu);
  cudaEventCreate(&t_start_cpu);
  cudaEventCreate(&t_end_cpu);
  cudaCheckErrors("cudaEventCreate fail");

  for (unsigned loops = 0; loops < LOOPCNT; loops++){
    //create some test cases
    if (loops == 0){
      // all-ones words plus 1 in the lowest word: carry ripples through
      // every word of problem 0 up to its (zeroed) top word
      for (unsigned i = 0; i < dsize; i++){
        h_a[i] = 0xFFFFFFFFFFFFFFFFull;
        h_b[i] = 0;
      }
      h_a[prob_size-1] = 0;
      h_b[prob_size-1] = 1;
      h_b[0] = 1;
    }
    else if (loops == 1){
      // carry ripples through all words of problem 0
      for (unsigned i = 0; i < dsize; i++){
        h_a[i] = 0xFFFFFFFFFFFFFFFFull;
        h_b[i] = 0;
      }
      h_b[0] = 1;
    }
    else if (loops == 2){
      // every word except word 0 generates a carry by itself
      for (unsigned i = 0; i < dsize; i++){
        h_a[i] = 0xFFFFFFFFFFFFFFFEull;
        h_b[i] = 2;
      }
      h_b[0] = 1;
    }
    else {
      for (unsigned i = 0; i < dsize; i++){
        h_a[i] = (((unsigned long long)lrand48())<<33) + (unsigned long long)lrand48();
        h_b[i] = (((unsigned long long)lrand48())<<33) + (unsigned long long)lrand48();
      }
    }
#ifdef GPUCOPY
    cudaEventRecord(t_start_gpu, 0);
#endif
    cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy1 fail");
    cudaMemcpy(d_b, h_b, nbytes, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy2 fail");
#ifdef GPU
    cudaEventRecord(t_start_gpu, 0);
#endif
    paradd<<<at_once, nTPB>>>(dsize, prob_size, d_c, d_a, d_b);
    cudaCheckErrors("Kernel Fail");
#ifdef GPU
    cudaEventRecord(t_end_gpu, 0);
#endif
    // blocking copy: also surfaces any error raised while the kernel ran
    cudaMemcpy(h_c, d_c, nbytes, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy3 fail");
#ifdef GPUCOPY
    cudaEventRecord(t_end_gpu, 0);
#endif
    cudaEventSynchronize(t_end_gpu);
    cudaEventElapsedTime(&et_gpu, t_start_gpu, t_end_gpu);
    cudaCheckErrors("GPU timing fail");
    tot_gpu += et_gpu;

    cudaEventRecord(t_start_cpu, 0);
    //also compute result on CPU for comparison
    for (unsigned j = 0; (j < at_once) && !mismatch; j++) {
      unsigned rc = 0;  // carry into the current word
      for (unsigned n = 0; n < prob_size; n++){
        const unsigned i = (j*prob_size) + n;
        c[i] = h_a[i] + h_b[i];
        if (c[i] < h_a[i]) {
          // the raw add wrapped: carry out regardless of the carry in
          c[i] += rc;
          rc = 1;
        }
        else {
          // no wrap in the raw add; the carry out stays set only when the
          // carry in turned an all-ones word into zero
          if ((c[i] += rc) != 0) rc = 0;
        }
        if (c[i] != h_c[i]) {
          printf("Results mismatch at offset %u, GPU = 0x%llX, CPU = 0x%llX\n", i, h_c[i], c[i]);
          mismatch = true;
          break;
        }
      }
    }
    if (mismatch) break;
    cudaEventRecord(t_end_cpu, 0);
    cudaEventSynchronize(t_end_cpu);
    cudaEventElapsedTime(&et_cpu, t_start_cpu, t_end_cpu);
    cudaCheckErrors("CPU timing fail");
    tot_cpu += et_cpu;
    if ((loops%tick) == 0) printf("*\n");
  }

  if (!mismatch) {
    printf("\nResults Match!\n");
    printf("Average GPU time = %fms\n", (tot_gpu/LOOPCNT));
    printf("Average CPU time = %fms\n", (tot_cpu/LOOPCNT));
  }

  // release everything on both the success and the mismatch path
  cudaEventDestroy(t_start_gpu);
  cudaEventDestroy(t_end_gpu);
  cudaEventDestroy(t_start_cpu);
  cudaEventDestroy(t_end_cpu);
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  cudaFreeHost(h_a);
  cudaFreeHost(h_b);
  cudaFreeHost(h_c);
  free(c);
  return mismatch ? 1 : 0;
}