multi-precision multiplication in CUDA - c

I am trying to implement multi-precision multiplication in CUDA. For doing that, I have implemented a kernel which should compute multiplication of uint32_t type operand with 256-bit operand and put the result in 288-bit array. So far, I have came up with this code:
__device__ __constant__ UN_256fe B_const;
__global__ void multiply32x256Kernel(uint32_t A, UN_288bite* result){
uint8_t tid = blockIdx.x * blockDim.x + threadIdx.x;
//for managing warps
//uint8_t laineid = tid % 32;
//allocate partial products into array of uint64_t
__shared__ uint64_t partialMuls[8];
uint32_t carry, r;
if((tid < 8) && (tid != 0)){
//compute partial products
partialMuls[tid] = A * B_const.uint32[tid];
//add partial products and propagate carry
result->uint32[8] = (uint32_t)partialMuls[7];
r = (partialMuls[tid] >> 32) + ((uint32_t)partialMuls[tid - 1]);
carry = r < (partialMuls[tid] >> 32);
result->uint32[0] = (partialMuls[0] >> 32);
while(__any(carry)){
r = r + carry;
//new carry?
carry = r < carry;
}
result->uint32[tid] = r;
}
and my data-type is :
typedef struct UN_256fe{
uint32_t uint32[8];
}UN_256fe;
typedef struct UN_288bite{
uint32_t uint32[9];
}UN_288bite;
My kernel works, but it gives me wrong result. I cannot debug inside the kernel, so I would appreciate if someone let me know where the problem is or how I can debug my code inside the kernel on tegra-ubuntu with cuda-6.0.
Thanks

This answer has nothing to do with CUDA itself, but is a general C implementation.
I can't quite follow what you are doing (especially with carry) but you could try this snippet based on my own big num functions. I defined dtype to make it easier to test with smaller fields. Note that I don't specifically use a carry, but carry forward the partial product.
// little-endian
#include <stdio.h>
#include <stdint.h>
#include <limits.h>
#define dtype uint8_t // for testing
//#define dtype uint32_t // for proper ver
#define SHIFTS (sizeof(dtype)*CHAR_BIT)
#define NIBBLES (SHIFTS/4)
#define ARRLEN 8
typedef struct UN_256fe {
dtype uint[ARRLEN];
} UN_256fe;
typedef struct UN_288bite {
dtype uint[ARRLEN+1];
} UN_288bite;
void multiply(UN_288bite *product, UN_256fe *operand, dtype multiplier)
{
int i;
uint64_t partial = 0;
for (i=0; i<ARRLEN; i++) {
partial = partial + (uint64_t)multiplier * operand->uint[i];
product->uint[i] = (dtype)partial;
partial >>= SHIFTS; // carry
}
product->uint[i] = (dtype)partial;
}
int main(void)
{
int i;
dtype multiplier = 0xAA;
UN_256fe operand = { 1, 2, 3, 4, 5, 6, 7, 8};
UN_288bite product;
multiply(&product, &operand, multiplier);
for(i=ARRLEN-1; i>=0; i--)
printf("%0*X", NIBBLES, operand.uint[i]);
printf("\n * %0*X = \n", NIBBLES, multiplier);
for(i=ARRLEN; i>=0; i--)
printf("%0*X", NIBBLES, product.uint[i]);
printf("\n");
return 0;
}
Program output for uint8_t
0807060504030201
* AA =
0554A9FF54A9FF54AA

Related

How to do 1024-bit operations using arrays of uint64_t

I am trying to find a way to compute values that are of type uint1024_t (unsigned 1024-bit integer), by defining the 5 basic operations: plus, minus, times, divide, modulus.
The way that I can do that is by creating a structure that will have the following prototype:
typedef struct {
uint64_t chunk[16];
} uint1024_t;
Now since it is complicated to wrap my head around such operations with uint64_t as block size, I have first written some code for manipulating uint8_t. Here is what I came up with:
#define UINT8_HI(x) (x >> 4)
#define UINT8_LO(x) (((1 << 4) - 1) & x)
void uint8_add(uint8_t a, uint8_t b, uint8_t *res, int i) {
uint8_t s0, s1, s2;
uint8_t x = UINT8_LO(a) + UINT8_LO(b);
s0 = UINT8_LO(x);
x = UINT8_HI(a) + UINT8_HI(b) + UINT8_HI(x);
s1 = UINT8_LO(x);
s2 = UINT8_HI(x);
uint8_t result = s0 + (s1 << 4);
uint8_t carry = s2;
res[1 + i] = result;
res[0 + i] = carry;
}
void uint8_multiply(uint8_t a, uint8_t b, uint8_t *res, int i) {
uint8_t s0, s1, s2, s3;
uint8_t x = UINT8_LO(a) * UINT8_LO(b);
s0 = UINT8_LO(x);
x = UINT8_HI(a) * UINT8_LO(b) + UINT8_HI(x);
s1 = UINT8_LO(x);
s2 = UINT8_HI(x);
x = s1 + UINT8_LO(a) * UINT8_HI(b);
s1 = UINT8_LO(x);
x = s2 + UINT8_HI(a) * UINT8_HI(b) + UINT8_HI(x);
s2 = UINT8_LO(x);
s3 = UINT8_HI(x);
uint8_t result = s1 << 4 | s0;
uint8_t carry = s3 << 4 | s2;
res[1 + i] = result;
res[0 + i] = carry;
}
And it seems to work just fine, however I am unable to define the same operations for division, subtraction and modulus...
Furthermore I just can't seem to see how to implement the same principal to my custom uint1024_t structure even though it is pretty much identical with a few lines of code more to manage overflows.
I would really appreciate some help in implementing the 5 basic operations for my structure.
EDIT:
I have answered below with my implementation for resolving this problem.
find a way to compute ... the 5 basic operations: plus, minus, times, divide, modulus.
If uint1024_t used uint32_t, it would be easier.
I would recommend 1) half the width of the widest type uintmax_t, or 2) unsigned, whichever is smaller. E.g. 32-bit.
(Also consider something other than uintN_t to avoid collisions with future versions of C.)
typedef struct {
uint32_t chunk[1024/32];
} u1024;
Example of some untested code to give OP an idea of how using uint32_t simplifies the task.
void u1024_mult(u1024 *product, const u1024 *a, const u1024 *b) {
memset(product, 0, sizeof product[0]);
unsigned n = sizeof product->chunk / sizeof product->chunk[0];
for (unsigned ai = 0; ai < n; ai++) {
uint64_t acc = 0;
uint32_t m = a->chunk[ai];
for (unsigned bi = 0; ai + bi < n; bi++) {
acc += (uint64_t) m * b->chunk[bi] + product->chunk[ai + bi];
product->chunk[ai + bi] = (uint32_t) acc;
acc >>= 32;
}
}
}
+, - are quite similar to the above.
/, % could be combined into one routine that computes the quotient and remainder together.
It is not that hard to post those functions here as it really is the same as grade school math, but instead of base 10, base 232. I am against posting it though as it is fun exercise to do oneself.
I hope the * sample code above inspires rather than answers.
There are some problems with your implementation for uint8_t arrays:
you did not parenthesize the macro arguments in the expansion. This is very error prone as it may cause unexpected operator precedence problems if the arguments are expressions. You should write:
#define UINT8_HI(x) ((x) >> 4)
#define UINT8_LO(x) (((1 << 4) - 1) & (x))
storing the array elements with the most significant part first is counter intuitive. Multi-precision arithmetics usually represents the large values as arrays with the least significant part first.
for a small type such as uint8_t, there is no need to split it into halves as larger types are available. Furthermore, you must propagate the carry from the previous addition. Here is a much simpler implementation for the addition:
void uint8_add(uint8_t a, uint8_t b, uint8_t *res, int i) {
uint16_t result = a + b + res[i + 0]; // add previous carry
res[i + 0] = (uint8_t)result;
res[i + 1] = (uint8_t)(result >> 8); // assuming res has at least i+1 elements and is initialized to 0
}
for the multiplication, you must add the result of multiplying each part of each number to the appropriately chosen parts of the result number, propagating the carry to the higher parts.
Division is more difficult to implement efficiently. I recommend you study an open source multi-precision package such as QuickJS' libbf.c.
To transpose this to arrays of uint64_t, you can use unsigned 128-bit integer types if available on your platform (64-bit compilers gcc, clang and vsc all support such types).
Here is a simple implementation for the addition and multiplication:
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#define NB_CHUNK 16
typedef __uint128_t uint128_t;
typedef struct {
uint64_t chunk[NB_CHUNK];
} uint1024_t;
void uint0124_add(uint1024_t *dest, const uint1024_t *a, const uint1024_t *b) {
uint128_t result = 0;
for (size_t i = 0; i < NB_CHUNK; i++) {
result += (uint128_t)a->chunk[i] + b->chunk[i];
dest->chunk[i] = (uint64_t)result;
result >>= CHAR_BIT * sizeof(uint64_t);
}
}
void uint0124_multiply(uint1024_t *dest, const uint1024_t *a, const uint1024_t *b) {
for (size_t i = 0; i < NB_CHUNK; i++)
dest->chunk[i] = 0;
for (size_t i = 0; i < NB_CHUNK; i++) {
uint128_t result = 0;
for (size_t j = 0, k = i; k < NB_CHUNK; j++, k++) {
result += (uint128_t)a->chunk[i] * b->chunk[j] + dest->chunk[k];
dest->chunk[k] = (uint64_t)result;
result >>= CHAR_BIT * sizeof(uint64_t);
}
}
}
If 128-bit integers are not available, your 1024-bit type could be implemented as an array of 32-bit integers. Here is a flexible implementation with selectable types for the array elements and the intermediary result:
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#if 1 // if platform has 128 bit integers
typedef uint64_t type1;
typedef __uint128_t type2;
#else
typedef uint32_t type1;
typedef uint64_t type2;
#endif
#define TYPE1_BITS (CHAR_BIT * sizeof(type1))
#define NB_CHUNK (1024 / TYPE1_BITS)
typedef struct uint1024_t {
type1 chunk[NB_CHUNK];
} uint1024_t;
void uint0124_add(uint1024_t *dest, const uint1024_t *a, const uint1024_t *b) {
type2 result = 0;
for (size_t i = 0; i < NB_CHUNK; i++) {
result += (type2)a->chunk[i] + b->chunk[i];
dest->chunk[i] = (type1)result;
result >>= TYPE1_BITS;
}
}
void uint0124_multiply(uint1024_t *dest, const uint1024_t *a, const uint1024_t *b) {
for (size_t i = 0; i < NB_CHUNK; i++)
dest->chunk[i] = 0;
for (size_t i = 0; i < NB_CHUNK; i++) {
type2 result = 0;
for (size_t j = 0, k = i; k < NB_CHUNK; j++, k++) {
result += (type2)a->chunk[i] * b->chunk[j] + dest->chunk[k];
dest->chunk[k] = (type1)result;
result >>= TYPE1_BITS;
}
}
}

Bitshift on structures

I'm unsure if this is possible due to structure padding and alignment but, assuming you take care of that by aligning your structures to 4/8 bytes, is it possible to bit shift on a structure as if it was a single variable?
What I'd like to do is take a string (max 8 bytes) and shift it into the high order bits of a 64-bit variable.
Like if I do this:
#include <stdint.h>
#include <string.h>
void shiftstr(uint64_t* t,char* c,size_t len){
memcpy(t, c, len);
//now *t==0x000000617369616b
*t<<=(sizeof(uint64_t)-len)*8;
//now *t==0x617369616b000000
}
int main(){
uint64_t k = 0;
char n[] = "kaisa";
shiftstr(&k, n,strlen(n));
return 0;
}
This works just fine, but what if I had, instead of a uint64_t, two uint32_t, either as individual variables or a structure.
#include <stdint.h>
#include <string.h>
struct U64{
uint32_t x;
uint32_t y;
};
void shiftstrstruct(struct U64* t, char* c, size_t len){
memcpy(t, c, len);
/*
At this point I think
x == 0x7369616b
y == 0x00000061
But I could be wrong
*/
//but how can I perform the bit shift?
//Where
//x==0x0000006b
//y==0x61697361
}
int main(){
char n[] = "kaisa";
struct U64 m = {0};
shiftstrstruct(&m, n, strlen(n));
return 0;
}
Up to the memcpy part, it should be the same as if I were performing it on a single variable. I believe the values of x and y are correct in such situations. But, if that's the case that means the values need to be shifted away from x towards y.
I know I can cast but what if I wanted to deal with a 16 byte string that needed to be shifted into two 64 bit variables, or even larger?
Is shifting structures like this possible? Is there a better alternative?
Is shifting structures like this possible?
No, not really. Even if the x and y members are in adjacent memory locations, bit-shift operations on either are performed as integer operations on the individual variables. So, you can't shift bits "out of" one and "into" the other: bits that "fall off" during the shift will be lost.
Is there a better alternative?
You would have to implement such a multi-component bit-shift yourself – making copies of the bits that would otherwise be lost and somehow masking those back into the result, after shifting other bits internally to each 'component' variable. Exactly how to do this would largely depend on the use case.
Here's one possible implementation of a right-shift function for a structure comprising two uint64_t members (I have not added any error-checking for the count, and I assume that uint64_t is exactly 64 bits wide):
#include <stdio.h>
#include <stdint.h>
typedef struct {
uint64_t hi;
uint64_t lo;
} ui128;
void Rshift(ui128* data, int count)
{
uint64_t mask = (1uLL << count) - 1; // Set low "count" bits to 1
uint64_t save = data->hi & mask; // Save bits that fall off hi
data->hi >>= count; // Shift the hi component
data->lo >>= count; // Shift the lo component
data->lo |= save << (64 - count); // Mask in the bits from hi
return;
}
int main()
{
ui128 test = { 0xF001F002F003F004, 0xF005F006F007F008 };
printf("%016llx%016llx\n", test.hi, test.lo);
Rshift(&test, 16);
printf("%016llx%016llx\n", test.hi, test.lo);
return 0;
}
A similar logic could be used for a left-shift function, but you would then need to save the relevant upper (most significant) bits from the lo member and mask them into the shifted hi value:
void Lshift(ui128* data, int count)
{
uint64_t mask = ((1uLL << count) - 1) << (64 - count);
uint64_t save = data->lo & mask;
data->hi <<= count;
data->lo <<= count;
data->hi |= save >> (64 - count);
return;
}
union is your friend, this is what you want:
#include <stdint.h>
#include <stdio.h>
typedef union _shift_u64{
struct _u64{
uint32_t x;
uint32_t y;
} __attribute__((__packed__)) U64;
uint64_t x_and_y;
} SHIFT_U64;
int main(int argc, char* argv[]){
SHIFT_U64 test;
test.U64.x = 4;
test.U64.y = 8;
printf("test.U64.x=%d, test.U64.y=%d, test.x_and_y=%ld\n", test.U64.x, test.U64.y, test.x_and_y);
test.x_and_y<<=1;
printf("test.U64.x=%d, test.U64.y=%d, test.x_and_y=%ld\n", test.U64.x, test.U64.y, test.x_and_y);
test.x_and_y>>=1;
printf("test.U64.x=%d, test.U64.y=%d, test.x_and_y=%ld\n", test.U64.x, test.U64.y, test.x_and_y);
return 0;
}
EDIT: This simple program illustrates how to do it the other way, but you have to check for the carry over bit and shift overflow and shift underflow by yourself. union doesn't care about the data, you just have to make sure that the data makes sense. After compiling, redirect the output of the program to a file or hex-editor and read the errorlevel of the program.
Linux example: ./a.out > a.out.bin; echo "errorlevel=$?"; xxd a.out.bin
#include <stdio.h>
typedef union _shift_it{
struct _data{
unsigned long x : 64;
unsigned long y : 64;
} __attribute__((__packed__)) DATA;
unsigned char x_and_y[16];
} __attribute__((__packed__)) SHIFT_IT;
int main(int argc, char* argv[]){
SHIFT_IT test;
int errorlevel = 0;
//bitmask for shift operation
static const unsigned long LEFT_SHIFTMASK64 = 0x8000000000000000;
static const unsigned long RIGHT_SHIFTMASK64 = 0x0000000000000001;
//test data
test.DATA.x = 0x2468246824682468; //high bits
test.DATA.y = 0x1357135713571357; //low bits
//binary output to stdout
for(int i=0; i<16; i++) putchar(test.x_and_y[i]);
//left shift
if(test.DATA.x & LEFT_SHIFTMASK64) errorlevel += 1;
test.DATA.x <<= 1;
if(test.DATA.y & LEFT_SHIFTMASK64) errorlevel += 2;
test.DATA.y <<= 1;
//binary output to stdout
for(int i=0; i<16; i++) putchar(test.x_and_y[i]);
//right shift
if(test.DATA.y & RIGHT_SHIFTMASK64) errorlevel += 4;
test.DATA.y >>= 1;
if(test.DATA.x & RIGHT_SHIFTMASK64) errorlevel += 8;
test.DATA.x >>= 1;
//binary output to stdout
for(int i=0; i<16; i++) putchar(test.x_and_y[i]);
//right shift
if(test.DATA.y & RIGHT_SHIFTMASK64) errorlevel += 16;
test.DATA.y >>= 1;
if(test.DATA.x & RIGHT_SHIFTMASK64) errorlevel += 32;
test.DATA.x >>= 1;
//binary output to stdout
for(int i=0; i<16; i++) putchar(test.x_and_y[i]);
//left shift
if(test.DATA.x & LEFT_SHIFTMASK64) errorlevel += 64;
test.DATA.x <<= 1;
if(test.DATA.y & LEFT_SHIFTMASK64) errorlevel += 128;
test.DATA.y <<= 1;
//binary output to stdout
for(int i=0; i<16; i++) putchar(test.x_and_y[i]);
return errorlevel;
}

How can I use Bit-Fields to save memory?

This is about ANSI-C (C90). This is what I know:
I can directly tell the compiler how many bits I want for a specific variable.
If I want 1 bit which can have the values zero or one.
or 2 bits for the values 0,1,2,3, and so on...;
I'm familiar with the syntax.
I have problem concerning bitfields:
I want to define a SET structure.
It can have maximum 1024 elements (it can have less, but the maximum is 1024 elements).
The domain of the set is from 1 to 1024. So an element could have any value 1-1024.
I'm trying to create a structure for a SET, and it must be efficient as possible for the memory part.
I tried:
typedef struct set
{
unsigned int var: 1;
} SET;
//now define an array of SETS
SET array_of_sets[MAX_SIZE] //didn't define MAX_SIZE, but no more than 1024 elements in each set.
I know this isn't efficient; maybe it's even not good for what I want. That's why I'm looking for help.
As noted in extensive comments, using a bit field is not the way to go. You can use just 128 bytes of storage for your set containing values 1..1024. You will need to map the value N to bit N-1 (so you have bits 0..1023 to work with). You also need to decide on the operations you need for your set. This code supports 'create', 'destroy', 'insert', 'delete' and 'in_set'. It does not support iteration over the elements in the set; that can be added if you want it.
sets.h
#ifndef SETS_H_INCLUDED
#define SETS_H_INCLUDED
typedef struct Set Set;
enum { MAX_ELEMENTS = 1024 };
extern Set *create(void);
extern void destroy(Set *set);
extern void insert(Set *set, int value);
extern void delete(Set *set, int value);
extern int in_set(Set *set, int value);
#endif /* SETS_H_INCLUDED */
sets.c
#include "sets.h"
#include <assert.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
typedef unsigned long Bits;
#define BITS_C(n) ((Bits)(n))
enum { ARRAY_SIZE = MAX_ELEMENTS / (sizeof(Bits) * CHAR_BIT) };
struct Set
{
Bits set[ARRAY_SIZE];
};
Set *create(void)
{
Set *set = malloc(sizeof(*set));
if (set != 0)
memset(set, 0, sizeof(*set));
return set;
}
void destroy(Set *set)
{
free(set);
}
void insert(Set *set, int value)
{
assert(value >= 1 && value <= MAX_ELEMENTS);
value--; /* 0..1023 */
int index = value / (sizeof(Bits) * CHAR_BIT);
int bitnum = value % (sizeof(Bits) * CHAR_BIT);
Bits mask = BITS_C(1) << bitnum;
/* printf("I: %d (%d:%d:0x%.2lX)\n", value+1, index, bitnum, mask); */
set->set[index] |= mask;
}
void delete(Set *set, int value)
{
assert(value >= 1 && value <= MAX_ELEMENTS);
value--; /* 0..1023 */
int index = value / (sizeof(Bits) * CHAR_BIT);
int bitnum = value % (sizeof(Bits) * CHAR_BIT);
Bits mask = BITS_C(1) << bitnum;
/* printf("D: %d (%d:%d:0x%.2lX)\n", value+1, index, bitnum, mask); */
set->set[index] &= ~mask;
}
/* C90 does not support <stdbool.h> */
int in_set(Set *set, int value)
{
assert(value >= 1 && value <= MAX_ELEMENTS);
value--; /* 0..1023 */
int index = value / (sizeof(Bits) * CHAR_BIT);
int bitnum = value % (sizeof(Bits) * CHAR_BIT);
Bits mask = BITS_C(1) << bitnum;
/* printf("T: %d (%d:%d:0x%.2lX) = %d\n", value+1, index, bitnum, mask,
(set->set[index] & mask) != 0); */
return (set->set[index] & mask) != 0;
}
#include <stdio.h>
enum { NUMBERS_PER_LINE = 15 };
int main(void)
{
Set *set = create();
if (set != 0)
{
int i;
int n = 0;
for (i = 1; i <= MAX_ELEMENTS; i += 4)
insert(set, i);
for (i = 3; i <= MAX_ELEMENTS; i += 6)
delete(set, i);
for (i = 1; i <= MAX_ELEMENTS; i++)
{
if (in_set(set, i))
{
printf(" %4d", i);
if (++n % NUMBERS_PER_LINE == 0)
{
putchar('\n');
n = 0;
}
}
}
if (n % NUMBERS_PER_LINE != 0)
putchar('\n');
destroy(set);
}
return 0;
}
The functions should really be given a systematic prefix, such as set_. The BITS_C macro is based on the INT64_C macro (and the other related macros) defined in <stdint.h> in C99 and later, which is also not a part of C90.
As per my previous comments, here is an example of how you can pack eight 1-bit elements into one char physical element.
I have only implemented the function to get the value of a 1-bit element, I leave the function to set it to you (it's easy to do).
Note: you can easily change the type of the array element (unsigned char) and experiment with types which can hold more bits (e.g unsigned int) and test if they perform better in terms of speed.
You can also modify the code to make it handle elements bigger than one bit.
#include <stdio.h>
#include <limits.h>
unsigned int get_el(unsigned char* array, unsigned int index)
{
unsigned int bits_per_arr_el = sizeof(unsigned char)*CHAR_BIT;
unsigned int arr_index = index / bits_per_arr_el;
unsigned int bit_offset = index % bits_per_arr_el;
unsigned int bitmask = 1 << bit_offset;
unsigned int retval;
// printf("index=%u\n", index);
// printf("bits_per_arr_el=%u\n", bits_per_arr_el);
// printf("arr_index=%u\n", arr_index);
// printf("bit_offset=%u\n", bit_offset);
retval = array[arr_index] & bitmask ? 1 : 0; // can be simpler if only True/False is needed
return(retval);
}
#define MAX_SIZE 10
unsigned char bitarray[MAX_SIZE];
int main()
{
bitarray[1] = 3; // 00000011
printf("array[7]=%u, array[8]=%u, array[9]=%u, array[10]=%u\n",
get_el(bitarray, 7),
get_el(bitarray, 8),
get_el(bitarray, 9),
get_el(bitarray,10));
return 0;
}
outputs
array[7]=0, array[8]=1, array[9]=1, array[10]=0
typedef struct set
{
unsigned short var:10; // uint var:1 will be padded to 32 bits
} SET; // ushort var:10 (which is max<=1024) padded to 16 bits
As was commented by #Jonathan Leffler use array(unsigned short[])
and define bitmasks
#define bitZer 0x00 //(unsigned)(0 == 0)? true:true;
#define bitOne 0x10 // so from (both inclusive)0-1023 = 1024
... // added for clarification
#define bitTen 0x0A
to look into the bits of each element.
http://www.catb.org/esr/structure-packing/ detailed
To store a value from 0 to 1023 (or from 1 to 1024, which is essentially the same and only involves adding/subtracting 1) you need a minimum of 10 bits.
This means that for 32-bit (unsigned) integers, you can pack 3 values into 30 bits, which gives 2 bits of useless padding.
Example:
%define ELEMENTS 100
uint32_t myArray[ (ELEMENTS + 2) / 3 ];
void setValue(int n, int value) {
uint32_t temp;
uint32_t mask = (1 << 10) - 1;
if(n >= ELEMENTS) return;
value--; // Convert "1 to 1024" into "0 to 1023"
temp = myArray[n / 3];
mask = mask << (n % 3)*10;
temp = (temp & ~mask) | (value << (n % 3)*10);
myArray[n / 3] = temp;
}
int getValue(int n) {
uint32_t temp;
uint32_t mask = (1 << 10) - 1;
if(n >= ELEMENTS) return 0;
temp = myArray[n / 3];
temp >>= (n % 3)*10;
return (temp & ~mask) + 1;
}
You can do this with bitfields instead, but the code to get/set individual values will end up using branches (e.g. switch( n%3 )) which will be slower in practice.
Removing those 2 bits of padding will cost a little more complexity and a little more overhead. For example:
%define ELEMENTS 100
uint32_t myArray[ (ELEMENTS*10 + 31) / 32 ];
int getValue(int n) {
uint64_t temp;
uint64_t mask = (1 << 10) - 1;
if(n >= ELEMENTS) return 0;
temp = myArray[n*10/32 + 1];
temp = (temp << 32) | myArray[n*10/32];
temp >>= (n*10 % 32);
return (temp & ~mask) + 1;
}
This can't be done with bitfields. This is the most space efficient way to store an array of values that range from 1 to 1024.
If you are storing an "array of booleans" or setting flags, it can be useful. For instance, you can initialize or compare up to 64 values at a time.
These macros will work for unsigned char, short, int, long long ... but simplifies significantly if you just pick a type (so you can use a safer static inline function)
#define getbit(x,n) x[n/(sizeof(*x)*8)] & (typeof(*x))1 << (n&((sizeof(*x)*8)-1))
#define setbit(x,n) x[n/(sizeof(*x)*8)] |= (typeof(*x))1 << (n&((sizeof(*x)*8)-1))
#define flpbit(x,n) x[n/(sizeof(*x)*8)] ^= (typeof(*x))1 << (n&((sizeof(*x)*8)-1))
#define clrbit(x,n) x[n/(sizeof(*x)*8)] &= ~( (typeof(*x))1 << (n&((sizeof(*x)*8)-1)) )
to initialize a large array of booleans all you need to do is: char cbits[]={0,0xF,0,0xFF};
or for all zeroes char cbits[4]={0};
or an int example: int ibits[]={0xF0F0F0F0,~0};
//1111000011110000111100001111000011111111111111111111111111111111
If you will only be accessing 1 type of array, it may be better to make the macros into proper functions like:
static inline unsigned char getbit(unsigned char *x, unsigned n){
return x[n>>3] & 1 << (n&7);
}
//etc... similar for other types and functions from macros above
You can also compare multiple flags at a time by '|'ing the flags together and using '&'ed masks; however, it does get a bit more complex when you exceed the native types
For your particular instance you can initialize to all zeroes by:
unsigned char flags[128]={0};
or all 1's by:
uint64_t flags[128] = {~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0,~0};
You can even use enums to name your flags
enum{
WHITE, //0
RED, //1
BLUE, //2
GREEN, //3
...
BLACK //1023
}
if (getbit(flags,WHITE) && getbit(flags,RED) && getbit(flags,BLUE))
printf("red, white and blue\n");
1) The proper solution for this question is to use Bit Array
The question provided the solution with Bit Fields with Struct. There are two typical ways to save memory space for bits related problem, another is to use Bit Array. For this specific case in the question, the better way is to use Bit Array (demoed as follows).
If it is the case like purely independent bit flags here, go
for the Bit Array
If there is a group of relevant bits , such as the IP address or Control Word definition, then it's better to combine them with a struct, that is to use Bit Fields with Sturct
2) Sample code just for demo Bit Array
#include<limits.h>
#define BITS_OF_INT (sizeof(int)*CHAR_BIT)
void SetBit(int A[], int k)
{
//Set the bit at the k-th position
A[k/BITS_OF_INT] |= 1 <<(k%BITS_OF_INT);
}
void ClearBit(int A[], int k)
{
//RESET the bit at the k-th position
A[k/BITS_OF_INT] &= ~(1 <<(k%BITS_OF_INT)) ;
}
int TestBit(int A[], int k)
{
// Return TRUE if bit set
return ((A[k/BITS_OF_INT] & (1 <<(k%BITS_OF_INT)))!= 0) ;
}
#define MAX_SIZE 1024
int main()
{
int A[MAX_SIZE/BITS_OF_INT];
int i;
int pos = 100; // position
for (i = 0; i < MAX_SIZE/BITS_OF_INT; i++)
A[i] = 0;
SetBit(A, pos);
if (TestBit(A, pos)){//do something}
ClearBit(A, pos);
}
3) Furthermore, a worthwhile discussing point from this question is,
How to choose a proper solution between "Bit Array" and "Bit fields with struct"?
Here are some references about this topic.
When to use bit-fields in C?
Readable and Maintainable Bitfields in C

Going below zero in unsigned integer operations

I want to deduce a list of 16-bit unsigned integers from another list of 16-bit unsigned integers.
For example, given the list:
10000, 12349, 32333, 3342
and I know the first integer of the other list is 0, now I want to deduce the rest. The mapping is to subtract 10000 from them, I got
0, 2349, 22333, 58878
where 58878 = (3342-10000+65536) modulo 65536 as the result of a wrapping.
The pseudocode is something like:
void deduce(u_int16_t list1[100], u_int16_t *list2[100], u_int16_t first)
{
int diff = first - list1[0];
for (i = 0; i < 100; i++)
(*list2)[i] = (list1[i] + diff + 65536) % 65536;
}
but we know that there is no minus number in unsigned integers.
so how to do the mapping(or deduction)?
thanks!
unsigned integers variables can be subtracted more than they contain - if I understand correctly the question.
u_int16_t u = 10;
u -= 20; // => u = u - 20;
printf("%x, %u\n", u, u); // => fff6, 65526
The difference is
when displayed, u does not show a negative value - ie the MSb (most significant bit, ie bit 15) is interpreted (here) as 215, the next as 214 etc...
when extended (eg to 32 bits) the MBb is not propagated from bit 16 to bit 31 (as they would be if signed) - they're 0
when right shifted the value MSb is always 0 (would be the same as previous MSb if signed, e.g 1 for a negative value)
So your mapping will keep working with u_int16_t (and you don't need the % modulo 65536 if you work with that type everywhere since anyway the values are on 16 bits - the modulo is implicit).
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
void deduce(uint16_t list1[], uint16_t list2[], size_t size){
int32_t i, first = list1[0];
for(i=0;i<size;++i){
// list2[i]= list1[i] - first;
int32_t wk = list1[i];
wk -= first;
if(wk<0)
wk += 65536;
list2[i] = wk;
}
}
int main(void){
uint16_t list1[100] = {
10000,
12349,
32333,
3342
};
uint16_t list2[100];
int i;
deduce(list1, list2, 4);
for(i = 0; i<4; ++i)
printf("%5" PRIu16 "\n", list2[i]);
return 0;
}
I'm not quite understand your question, but if what you want is to subtract every element of the first list by the different between the first element of both list. This code should work.
void deduce(uint16_t list1[], uint16_t list2[], int size)
{
uint16_t diff = list1[0] - list2[0];
int i;
for (i=0; i<size; i++)
list2[i] = list1[i] - diff;
}
You don't need to pass list2 as u_int16_t* list2[] because you actually can edit the content of the array with u_int16_t list2[]. Only use u_int16_t* list2[] if you want to do dynamic memory allocation in this function.

large integer addition with CUDA

I've been developing a cryptographic algorithm on the GPU and currently stuck with an algorithm to perform large integer addition. Large integers are represented in a usual way as a bunch of 32-bit words.
For example, we can use one thread to add two 32-bit words. For simplicity, let assume
that the numbers to be added are of the same length and number of threads per block == number of words. Then:
__global__ void add_kernel(int *C, const int *A, const int *B) {
int x = A[threadIdx.x];
int y = B[threadIdx.x];
int z = x + y;
int carry = (z < x);
/** do carry propagation in parallel somehow ? */
............
z = z + newcarry; // update the resulting words after carry propagation
C[threadIdx.x] = z;
}
I am pretty sure that there is a way to do carry propagation via some tricky reduction procedure but could not figure it out..
I had a look at CUDA thrust extensions but big integer package seems not to be implemented yet.
Perhaps someone can give me a hint how to do that on CUDA ?
You are right, carry propagation can be done via prefix sum computation but it's a bit tricky to define the binary function for this operation and prove that it is associative (needed for parallel prefix sum). As a matter of fact, this algorithm is used (theoretically) in Carry-lookahead adder.
Suppose we have two large integers a[0..n-1] and b[0..n-1].
Then we compute (i = 0..n-1):
s[i] = a[i] + b[i]l;
carryin[i] = (s[i] < a[i]);
We define two functions:
generate[i] = carryin[i];
propagate[i] = (s[i] == 0xffffffff);
with quite intuitive meaning: generate[i] == 1 means that the carry is generated at
position i while propagate[i] == 1 means that the carry will be propagated from position
(i - 1) to (i + 1). Our goal is to compute the function carryout[0..n-1] used to update the resulting sum s[0..n-1]. carryout can be computed recursively as follows:
carryout[i] = generate[i] OR (propagate[i] AND carryout[i-1])
carryout[0] = 0
Here carryout[i] == 1 if carry is generated at position i OR it is generated sometimes earlier AND propagated to position i. Finally, we update the resulting sum:
s[i] = s[i] + carryout[i-1]; for i = 1..n-1
carry = carryout[n-1];
Now it is quite straightforward to prove that carryout function is indeed binary associative and hence parallel prefix sum computation applies. To implement this on CUDA, we can merge both flags 'generate' and 'propagate' in a single variable since they are mutually exclusive, i.e.:
cy[i] = (s[i] == -1u ? -1u : 0) | carryin[i];
In other words,
cy[i] = 0xffffffff if propagate[i]
cy[i] = 1 if generate[i]
cy[u] = 0 otherwise
Then, one can verify that the following formula computes prefix sum for carryout function:
cy[i] = max((int)cy[i], (int)cy[k]) & cy[i];
for all k < i. The example code below shows large addition for 2048-word integers. Here I used CUDA blocks with 512 threads:
// add & output carry flag
#define UADDO(c, a, b) \
asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
// add with carry & output carry flag
#define UADDC(c, a, b) \
asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
#define WS 32
__global__ void bignum_add(unsigned *g_R, const unsigned *g_A,const unsigned *g_B) {
extern __shared__ unsigned shared[];
unsigned *r = shared;
const unsigned N_THIDS = 512;
unsigned thid = threadIdx.x, thid_in_warp = thid & WS-1;
unsigned ofs, cf;
uint4 a = ((const uint4 *)g_A)[thid],
b = ((const uint4 *)g_B)[thid];
UADDO(a.x, a.x, b.x) // adding 128-bit chunks with carry flag
UADDC(a.y, a.y, b.y)
UADDC(a.z, a.z, b.z)
UADDC(a.w, a.w, b.w)
UADDC(cf, 0, 0) // save carry-out
// memory consumption: 49 * N_THIDS / 64
// use "alternating" data layout for each pair of warps
volatile short *scan = (volatile short *)(r + 16 + thid_in_warp +
49 * (thid / 64)) + ((thid / 32) & 1);
scan[-32] = -1; // put identity element
if(a.x == -1u && a.x == a.y && a.x == a.z && a.x == a.w)
// this indicates that carry will propagate through the number
cf = -1u;
// "Hillis-and-Steele-style" reduction
scan[0] = cf;
cf = max((int)cf, (int)scan[-2]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-4]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-8]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-16]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-32]) & cf;
scan[0] = cf;
int *postscan = (int *)r + 16 + 49 * (N_THIDS / 64);
if(thid_in_warp == WS - 1) // scan leading carry-outs once again
postscan[thid >> 5] = cf;
__syncthreads();
if(thid < N_THIDS / 32) {
volatile int *t = (volatile int *)postscan + thid;
t[-8] = -1; // load identity symbol
cf = t[0];
cf = max((int)cf, (int)t[-1]) & cf;
t[0] = cf;
cf = max((int)cf, (int)t[-2]) & cf;
t[0] = cf;
cf = max((int)cf, (int)t[-4]) & cf;
t[0] = cf;
}
__syncthreads();
cf = scan[0];
int ps = postscan[(int)((thid >> 5) - 1)]; // postscan[-1] equals to -1
scan[0] = max((int)cf, ps) & cf; // update carry flags within warps
cf = scan[-2];
if(thid_in_warp == 0)
cf = ps;
if((int)cf < 0)
cf = 0;
UADDO(a.x, a.x, cf) // propagate carry flag if needed
UADDC(a.y, a.y, 0)
UADDC(a.z, a.z, 0)
UADDC(a.w, a.w, 0)
((uint4 *)g_R)[thid] = a;
}
Note that macros UADDO / UADDC might not be necessary anymore since CUDA 4.0 has corresponding intrinsics (however I am not entirely sure).
Also remark that, though parallel reduction is quite fast, if you need to add several large integers in a row, it might be better to use some redundant representation (which was suggested in comments above), i.e., first accumulate the results of additions in 64-bit words, and then perform one carry propagation at the very end in "one sweep".
I thought I would post my answer also, in addition to #asm, so this SO question can be a sort of repository of ideas. Similar to #asm, I detect and store the carry condition as well as the "carry-through" condition, ie. when the intermediate word result is all 1's (0xF...FFF) so that if a carry were to propagate into this word, it would "carry-through" to the next word.
I didn't use any PTX or asm in my code, so I chose to use 64-bit unsigned ints instead of 32-bit, to achieve the 2048x32bit capability, using 1024 threads.
A larger difference from #asm's code is in my parallel carry propagation scheme. I construct a bit-packed array ("carry") where each bit represents the carry condition generated from the independent intermediate 64-bit adds from each of the 1024 threads. I also construct a bit-packed array ("carry_through") where each bit represents the carry_through condition of the individual 64-bit intermediate results. For 1024 threads, this amounts to 1024/64 = 16x64 bit words of shared memory for each bit-packed array, so total shared mem usage is 64+3 32bit quantites. With these bit packed arrays, I perform the following to generate a combined propagated carry indicator:
carry = carry | (carry_through ^ ((carry & carry_through) + carry_through);
(note that carry is shifted left by one: carry[i] indicates that the result of a[i-1] + b[i-1] generated a carry)
The explanation is as follows:
the bitwise and of carry and carry_through generates the candidates where a carry will
interact with a sequence of one or more carry though conditions
adding the result of step one to carry_through generates a result which
has changed bits which represent all words that will be affected by
the propagation of the carry into the carry_through sequence
taking the exclusive-or of carry_through plus the result from step 2
shows the affected results indicated with a 1 bit
taking the bitwise or of the result from step 3 and the ordinary
carry indicators gives a combined carry condition, which is then
used to update all the intermediate results.
Note that the addition in step 2 requires another multi-word add (for big ints composed of more than 64 words). I believe this algorithm works, and it has passed the test cases I have thrown at it.
Here is my example code which implements this:
// parallel add of large integers
// requires CC 2.0 or higher
// compile with:
// nvcc -O3 -arch=sm_20 -o paradd2 paradd2.cu
#include <stdio.h>
#include <stdlib.h>
#define MAXSIZE 1024 // the number of 64 bit quantities that can be added
#define LLBITS 64 // the number of bits in a long long
#define BSIZE ((MAXSIZE + LLBITS -1)/LLBITS) // MAXSIZE when packed into bits
#define nTPB MAXSIZE
// define either GPU or GPUCOPY, not both -- for timing
#define GPU
//#define GPUCOPY
#define LOOPCNT 1000
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// perform c = a + b, for unsigned integers of psize*64 bits.
// all work done in a single threadblock.
// multiple threadblocks are handling multiple separate addition problems
// least significant word is at a[0], etc.
__global__ void paradd(const unsigned size, const unsigned psize, unsigned long long *c, const unsigned long long *a, const unsigned long long *b){
__shared__ unsigned long long carry_through[BSIZE];
__shared__ unsigned long long carry[BSIZE+1];
__shared__ volatile unsigned mcarry;
__shared__ volatile unsigned mcarry_through;
unsigned idx = threadIdx.x + (psize * blockIdx.x);
if ((threadIdx.x < psize) && (idx < size)){
// handle 64 bit unsigned add first
unsigned long long cr1 = a[idx];
unsigned long long lc = cr1 + b[idx];
// handle carry
if (threadIdx.x < BSIZE){
carry[threadIdx.x] = 0;
carry_through[threadIdx.x] = 0;
}
if (threadIdx.x == 0){
mcarry = 0;
mcarry_through = 0;
}
__syncthreads();
if (lc < cr1){
if ((threadIdx.x%LLBITS) != (LLBITS-1))
atomicAdd(&(carry[threadIdx.x/LLBITS]), (2ull<<(threadIdx.x%LLBITS)));
else atomicAdd(&(carry[(threadIdx.x/LLBITS)+1]), 1);
}
// handle carry-through
if (lc == 0xFFFFFFFFFFFFFFFFull)
atomicAdd(&(carry_through[threadIdx.x/LLBITS]), (1ull<<(threadIdx.x%LLBITS)));
__syncthreads();
if (threadIdx.x < ((psize + LLBITS-1)/LLBITS)){
// only 1 warp executing within this if statement
unsigned long long cr3 = carry_through[threadIdx.x];
cr1 = carry[threadIdx.x] & cr3;
// start of sub-add
unsigned long long cr2 = cr3 + cr1;
if (cr2 < cr1) atomicAdd((unsigned *)&mcarry, (2u<<(threadIdx.x)));
if (cr2 == 0xFFFFFFFFFFFFFFFFull) atomicAdd((unsigned *)&mcarry_through, (1u<<threadIdx.x));
if (threadIdx.x == 0) {
unsigned cr4 = mcarry & mcarry_through;
cr4 += mcarry_through;
mcarry |= (mcarry_through ^ cr4);
}
if (mcarry & (1u<<threadIdx.x)) cr2++;
// end of sub-add
carry[threadIdx.x] |= (cr2 ^ cr3);
}
__syncthreads();
if (carry[threadIdx.x/LLBITS] & (1ull<<(threadIdx.x%LLBITS))) lc++;
c[idx] = lc;
}
}
int main() {
unsigned long long *h_a, *h_b, *h_c, *d_a, *d_b, *d_c, *c;
unsigned at_once = 256; // valid range = 1 .. 65535
unsigned prob_size = MAXSIZE ; // valid range = 1 .. MAXSIZE
unsigned dsize = at_once * prob_size;
cudaEvent_t t_start_gpu, t_start_cpu, t_end_gpu, t_end_cpu;
float et_gpu, et_cpu, tot_gpu, tot_cpu;
tot_gpu = 0;
tot_cpu = 0;
if (sizeof(unsigned long long) != (LLBITS/8)) {printf("Word Size Error\n"); return 1;}
if ((c = (unsigned long long *)malloc(dsize * sizeof(unsigned long long))) == 0) {printf("Malloc Fail\n"); return 1;}
cudaHostAlloc((void **)&h_a, dsize * sizeof(unsigned long long), cudaHostAllocDefault);
cudaCheckErrors("cudaHostAlloc1 fail");
cudaHostAlloc((void **)&h_b, dsize * sizeof(unsigned long long), cudaHostAllocDefault);
cudaCheckErrors("cudaHostAlloc2 fail");
cudaHostAlloc((void **)&h_c, dsize * sizeof(unsigned long long), cudaHostAllocDefault);
cudaCheckErrors("cudaHostAlloc3 fail");
cudaMalloc((void **)&d_a, dsize * sizeof(unsigned long long));
cudaCheckErrors("cudaMalloc1 fail");
cudaMalloc((void **)&d_b, dsize * sizeof(unsigned long long));
cudaCheckErrors("cudaMalloc2 fail");
cudaMalloc((void **)&d_c, dsize * sizeof(unsigned long long));
cudaCheckErrors("cudaMalloc3 fail");
cudaMemset(d_c, 0, dsize*sizeof(unsigned long long));
cudaEventCreate(&t_start_gpu);
cudaEventCreate(&t_end_gpu);
cudaEventCreate(&t_start_cpu);
cudaEventCreate(&t_end_cpu);
for (unsigned loops = 0; loops <LOOPCNT; loops++){
//create some test cases
if (loops == 0){
for (int j=0; j<at_once; j++)
for (int k=0; k<prob_size; k++){
int i= (j*prob_size) + k;
h_a[i] = 0xFFFFFFFFFFFFFFFFull;
h_b[i] = 0;
}
h_a[prob_size-1] = 0;
h_b[prob_size-1] = 1;
h_b[0] = 1;
}
else if (loops == 1){
for (int i=0; i<dsize; i++){
h_a[i] = 0xFFFFFFFFFFFFFFFFull;
h_b[i] = 0;
}
h_b[0] = 1;
}
else if (loops == 2){
for (int i=0; i<dsize; i++){
h_a[i] = 0xFFFFFFFFFFFFFFFEull;
h_b[i] = 2;
}
h_b[0] = 1;
}
else {
for (int i = 0; i<dsize; i++){
h_a[i] = (((unsigned long long)lrand48())<<33) + (unsigned long long)lrand48();
h_b[i] = (((unsigned long long)lrand48())<<33) + (unsigned long long)lrand48();
}
}
#ifdef GPUCOPY
cudaEventRecord(t_start_gpu, 0);
#endif
cudaMemcpy(d_a, h_a, dsize*sizeof(unsigned long long), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
cudaMemcpy(d_b, h_b, dsize*sizeof(unsigned long long), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy2 fail");
#ifdef GPU
cudaEventRecord(t_start_gpu, 0);
#endif
paradd<<<at_once, nTPB>>>(dsize, prob_size, d_c, d_a, d_b);
cudaCheckErrors("Kernel Fail");
#ifdef GPU
cudaEventRecord(t_end_gpu, 0);
#endif
cudaMemcpy(h_c, d_c, dsize*sizeof(unsigned long long), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy3 fail");
#ifdef GPUCOPY
cudaEventRecord(t_end_gpu, 0);
#endif
cudaEventSynchronize(t_end_gpu);
cudaEventElapsedTime(&et_gpu, t_start_gpu, t_end_gpu);
tot_gpu += et_gpu;
cudaEventRecord(t_start_cpu, 0);
//also compute result on CPU for comparison
for (int j=0; j<at_once; j++) {
unsigned rc=0;
for (int n=0; n<prob_size; n++){
unsigned i = (j*prob_size) + n;
c[i] = h_a[i] + h_b[i];
if (c[i] < h_a[i]) {
c[i] += rc;
rc=1;}
else {
if ((c[i] += rc) != 0) rc=0;
}
if (c[i] != h_c[i]) {printf("Results mismatch at offset %d, GPU = 0x%lX, CPU = 0x%lX\n", i, h_c[i], c[i]); return 1;}
}
}
cudaEventRecord(t_end_cpu, 0);
cudaEventSynchronize(t_end_cpu);
cudaEventElapsedTime(&et_cpu, t_start_cpu, t_end_cpu);
tot_cpu += et_cpu;
if ((loops%(LOOPCNT/10)) == 0) printf("*\n");
}
printf("\nResults Match!\n");
printf("Average GPU time = %fms\n", (tot_gpu/LOOPCNT));
printf("Average CPU time = %fms\n", (tot_cpu/LOOPCNT));
return 0;
}

Resources