Search an ordered array in a CUDA kernel - arrays

I'm writing a CUDA kernel and each thread has to complete the following task: suppose I have an ordered array a of n unsigned integers (the first one is always 0) stored in shared memory, each thread has to find the array index i such that a[i] ≤ threadIdx.x and a[i + 1] > threadIdx.x.
A naive solution could be:
for (i = 0; i < n - 1; i++)
if (a[i + 1] > threadIdx.x) break;
but I suppose this is not the optimal way to do it... can anyone suggest anything better?

Like Robert, I was thinking that a binary search has got to be faster that a naïve loop -- the upper bound of operation count for a binary search is O(log(n)), compared to O(N) for the loop.
My extremely simple implementation:
#include <iostream>
#include <climits>
#include <assert.h>
__device__ __host__
int midpoint(int a, int b)
{
return a + (b-a)/2;
}
__device__ __host__
int eval(int A[], int i, int val, int imin, int imax)
{
int low = (A[i] <= val);
int high = (A[i+1] > val);
if (low && high) {
return 0;
} else if (low) {
return -1;
} else {
return 1;
}
}
__device__ __host__
int binary_search(int A[], int val, int imin, int imax)
{
while (imax >= imin) {
int imid = midpoint(imin, imax);
int e = eval(A, imid, val, imin, imax);
if(e == 0) {
return imid;
} else if (e < 0) {
imin = imid;
} else {
imax = imid;
}
}
return -1;
}
__device__ __host__
int linear_search(int A[], int val, int imin, int imax)
{
int res = -1;
for(int i=imin; i<(imax-1); i++) {
if (A[i+1] > val) {
res = i;
break;
}
}
return res;
}
template<int version>
__global__
void search(int * source, int * result, int Nin, int Nout)
{
extern __shared__ int buff[];
int tid = threadIdx.x + blockIdx.x*blockDim.x;
int val = INT_MAX;
if (tid < Nin) val = source[threadIdx.x];
buff[threadIdx.x] = val;
__syncthreads();
int res;
switch(version) {
case 0:
res = binary_search(buff, threadIdx.x, 0, blockDim.x);
break;
case 1:
res = linear_search(buff, threadIdx.x, 0, blockDim.x);
break;
}
if (tid < Nout) result[tid] = res;
}
int main(void)
{
const int inputLength = 128000;
const int isize = inputLength * sizeof(int);
const int outputLength = 256;
const int osize = outputLength * sizeof(int);
int * hostInput = new int[inputLength];
int * hostOutput = new int[outputLength];
int * deviceInput;
int * deviceOutput;
for(int i=0; i<inputLength; i++) {
hostInput[i] = -200 + 5*i;
}
cudaMalloc((void**)&deviceInput, isize);
cudaMalloc((void**)&deviceOutput, osize);
cudaMemcpy(deviceInput, hostInput, isize, cudaMemcpyHostToDevice);
dim3 DimBlock(256, 1, 1);
dim3 DimGrid(1, 1, 1);
DimGrid.x = (outputLength / DimBlock.x) +
((outputLength % DimBlock.x > 0) ? 1 : 0);
size_t shmsz = DimBlock.x * sizeof(int);
for(int i=0; i<5; i++) {
search<1><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
inputLength, outputLength);
}
for(int i=0; i<5; i++) {
search<0><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
inputLength, outputLength);
}
cudaMemcpy(hostOutput, deviceOutput, osize, cudaMemcpyDeviceToHost);
for(int i=0; i<outputLength; i++) {
int idx = hostOutput[i];
int tidx = i % DimBlock.x;
assert( (hostInput[idx] <= tidx) && (tidx < hostInput[idx+1]) );
}
cudaDeviceReset();
return 0;
}
gave about a five times speed up compared to the loop:
>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
Time(%) Time Calls Avg Min Max Name
60.11 157.85us 1 157.85us 157.85us 157.85us [CUDA memcpy HtoD]
32.58 85.55us 5 17.11us 16.63us 19.04us void search<int=1>(int*, int*, int, int)
6.52 17.13us 5 3.42us 3.35us 3.73us void search<int=0>(int*, int*, int, int)
0.79 2.08us 1 2.08us 2.08us 2.08us [CUDA memcpy DtoH]
I'm sure that someoneclever could do a lot better than that. But perhaps this gives you at least a few ideas.

can anyone suggest anything better?
A brute force approach would be to have each thread do a binary search (on threadIdx.x + 1).
// sets idx to the index of the first element in a that is
// equal to or larger than key
__device__ void bsearch_range(const int *a, const int key, const unsigned len_a, unsigned *idx){
unsigned lower = 0;
unsigned upper = len_a;
unsigned midpt;
while (lower < upper){
midpt = (lower + upper)>>1;
if (a[midpt] < key) lower = midpt +1;
else upper = midpt;
}
*idx = lower;
return;
}
__global__ void find_my_idx(const int *a, const unsigned len_a, int *my_idx){
unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
unsigned sp_a;
int val = idx+1;
bsearch_range(a, val, len_a, &sp_a);
my_idx[idx] = ((val-1) < a[sp_a]) ? sp_a:-1;
}
This is coded in browser, not tested. It's hacked from a piece of working code, however. If you have trouble making it work, I can revisit it. I don't recommend this approach on a device without caches (cc 1.x device).
This is actually searching on the full unique 1D thread index (blockDim.x * blockIdx.x + threadIdx.x + 1) You can change val to be anything you like.
You could also add an appropriate thread check, if the number of threads you intend to launch is greater than the length of your my_idx result vector.
I imagine there is a more clever approach that may use something akin to prefix sums.

This is the best algorithm so far. It's called: LPW Indexed Search
__global__ void find_position_lpw(int *a, int n)
{
int idx = threadIdx.x;
__shared__ int aux[ MAX_THREADS_PER_BLOCK /*1024*/ ];
aux[idx] = 0;
if (idx < n)
atomicAdd( &aux[a[idx]], 1); // atomics in case there are duplicates
__syncthreads();
int tmp;
for (int j = 1; j <= MAX_THREADS_PER_BLOCK / 2; j <<= 1)
{
if( idx >= j ) tmp = aux[idx - j];
__syncthreads();
if( idx >= j ) aux[idx] += tmp;
__syncthreads();
}
// result in "i"
int i = aux[idx] - 1;
// use "i" here...
// ...
}

Related

Why does the point crash when free it?

In the following code, the result is ok, but the code will be crash when executing finish, and increase one error: Heap corruption detected, the free list is damaged at 0x600000008f50
int *mergeSort(int *a,int count) {
int leftCount = count / 2;
int rightCount = count - leftCount;
int *leftData = getData(a, 0, leftCount);
int *rightData = getData(a, leftCount, count);
int *sortedLeftData = mergeSort(leftData, leftCount);
int *sortedRightData = mergeSort(rightData, rightCount);
int *resultData = mergeData(sortedLeftData, sortedRightData, leftCount,
rightCount);
return resultData;
}
int *getData(int *a,int from, int to) {
if (from > to) { return nil; }
int *res = malloc(to - from + 1);
for (int index = from; index < to; index ++) {
int value = a[index];
res[index-from] = value;
}
return res;
}
int *mergeData(int *a, int *b, int acount, int bcount) {
int *result = malloc(acount + bcount);
int aindex,bindex,rindex;
aindex = bindex = rindex = 0;
while (aindex < acount | bindex < bcount) {
int value,avalue = INT_MAX,bvalue = INT_MAX;
if (aindex < acount) { avalue = a[aindex]; }
if (bindex < bcount) { bvalue = b[bindex]; }
// get value from a point.
if (avalue <= bvalue) {
value = avalue;
aindex ++;
}else {
// get value from b point.
value = bvalue;
bindex ++;
}
result[rindex] = value;
rindex ++;
}
return result;
}
I don't understand why does crash when free the point, any answer will helpfull, thanks.
All of your allocations are too small, and thus you are overflowing your buffers.
The malloc function allocates the requested number of bytes. You need to multiply the number of elements you require by sizeof(int) if your elements are int type. e.g.
int *result = malloc((acount + bcount) * sizeof(int));
Other potential problems I spotted while reading your code are:
Using the bitwise-or operator instead of logical-or:
while (aindex < acount | bindex < bcount)
// ^ should be ||
You never free your temporary buffers, thus your program will blow out memory by leaking like crazy. You must free leftData, rightData, sortedLeftData and sortedRightData in the mergeSort function after you are finished with them.
Note that merge sort actually does not require so much allocation. Doing so will have a huge impact on performance. An efficient implementation only requires a single additional buffer for scratch operations, which can be allocated at the beginning.
I did implementation merge sort use single buffer, as the following code:
void mergeSort(int *a, int count) {
int *tempBuffer = malloc(count * sizeof(int));
mergeSortWithBuffer(a, 0, 0, count - 1,tempBuffer);
free(tempBuffer);
}
void mergeSortWithBuffer(int *a, int leftStart, int rightStart, int end, int *tempBuffer) {
int leftCount = rightStart - leftStart;
int rightCount = end - rightStart + 1;
if (leftCount + rightCount <= 1) { return; }
if (leftCount != 0) {
// left dichotomy
int lls = leftStart;
int lrs = leftStart + leftCount/2;
int lnd = rightStart - 1;
mergeSortWithBuffer(a, lls, lrs, lnd,tempBuffer);
}
if (rightCount != 0) {
// right dichotomy
int rls = rightStart;
int rrs = rightStart + rightCount/2;
int rnd = end;
mergeSortWithBuffer(a, rls, rrs, rnd,tempBuffer);
}
mergeData(a, leftStart, rightStart, end, tempBuffer);
}
void mergeData(int *a, int leftStart, int rightStart, int end,int *tempBuffer) {
int leftCount = rightStart - leftStart;
int rightCount = end - rightStart + 1;
int lindex,rindex;
lindex = rindex = 0;
while (lindex < leftCount || rindex < rightCount) {
int lv = INT_MAX,rv = INT_MAX;
if (lindex < leftCount) { lv = a[leftStart + lindex]; }
if (rindex < rightCount) { rv = a[rightStart + rindex]; }
if (lv <= rv) {
tempBuffer[leftStart + lindex + rindex] = lv;
lindex ++;
}else {
tempBuffer[leftStart + lindex + rindex] = rv;
rindex ++;
}
}
for (int index = 0; index < end - leftStart + 1; index ++) {
a[leftStart + index] = tempBuffer[leftStart + index];
}
}
I thought the mergeData function can replace data in the point a each other without the temp buffer, but the logic is too complex and the efficient is not fast, so i add the temp buffer in this function.
Would you have better suggestions if you have willing?

How to check to make sure there are no repeats when generating 6 random numbers?

I can get the random numbers into an array but I can't figure out how to check to make sure that they aren't repeating. I print out the code but there are no numbers in the array (prints out nothing).
//puts random numbers into an array
i = 0, j = 0;
srand(time(NULL));
for (i = 0; i < arrSize; i++)
{
randArr[i] = randNums(1,50);
}
i = 0;
for(i = 0; i < arrSize; i++)
{
printf("%d ", randArr[i]);
}
printf("\n\n");
//checks to make sure there are no duplicates
i = 0, j = 0, k = 0, temp = 0;
for (i = 0; i < arrSize; i++)
{
for (j = 1; j <= arrSize;)
{
if (randArr[j] == randArr[i])
{
for (k = j; k <= arrSize; k++)
{
temp = randNums(1,50);
randArr[k + 1] = temp;
}
arrSize--;
}
else
j++;
}
}
//generates random numbers between the inputed max and min
int randNums(int min, int max)
{
int result = 0, low = 0, high = 0;
if (min < max)
{
low = min;
high = max + 1;
}
else
{
low = max + 1;
high = min;
}
result = (rand() % (high - low)) + low;
return (result);
}
Beware! There are many different solutions to this problem and they all have one or another downside. If I was to quickly implement it, I would go for something like this (without too much C-magic going on):
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SIZE (30)
#define RAND_MIN (1)
#define RAND_MAX (50)
static int randNums(int min, int max) {
// ...
}
int main(void) {
(void) srand(time(NULL));
int arr[SIZE];
int used = 0;
while (used < SIZE) {
int num = randNums(RAND_MIN, RAND_MAX);
bool exists = false;
for (int i = 0; i < used; ++i) {
if (arr[i] == num)
exists = true;
}
if (exists == false)
arr[used++] = num;
}
for (int i = 0; i < SIZE; ++i)
(void) printf("%d\n", arr[i]);
return EXIT_SUCCESS;
}
I hope it helps a bit :)
Like this answer, you can do rejection sampling, but the uniform distribution of a fixed number of samples is perfect for a very simple hash set. (Though the asymptotic runtime might be irrelevant for n=6.)
#include <stdlib.h> /* (s)rand */
#include <stdio.h> /* printf */
#include <time.h> /* clock */
#include <assert.h> /* assert */
/* Double-pointers are confusing. */
struct Reference { int *ref; };
/* Simple fixed hash set. */
static struct Reference bins[256];
static int nums[6];
static const size_t no_bins = sizeof bins / sizeof *bins,
no_nums = sizeof nums / sizeof *nums;
static size_t count_num;
/* Uniformly distributed numbers are great for hashing, but possibly clump
together under linear probing. */
static size_t hash(const int n) { return ((size_t)n * 21) % no_bins; }
/* Linear probing. */
static struct Reference *probe(const int n) {
size_t bin_index;
struct Reference *bin;
assert(sizeof bins > sizeof nums);
for(bin_index = hash(n); bin = bins + bin_index,
bin->ref && *bin->ref != n; bin_index = (bin_index + 1) % no_bins);
return bin;
}
/* Return whether it's a new value. */
static int put_in_set(const int n) {
struct Reference *bin = probe(n);
int *num;
assert(count_num < no_nums);
if(bin->ref) return 0; /* Already in hash. */
num = nums + count_num++;
*num = n;
bin->ref = num;
return 1;
}
/* http://c-faq.com/lib/randrange.html */
static int rand_range(const unsigned n) {
unsigned int x = (RAND_MAX + 1u) / n;
unsigned int y = x * n;
unsigned int r;
assert(n > 0);
do {
r = rand();
} while(r >= y);
return r / x;
}
/* Generates random numbers between the inputed max and min without
repetition; [min, max] inclusive. */
static int unique_uniform(const int min, const int max) {
int n;
assert(min <= max && (size_t)(max - min) >= count_num);
do { n = rand_range(max - min + 1) + min; } while(!put_in_set(n));
return n;
}
int main(void) {
int n = 6;
srand((int)clock()), rand(); /* My computer always picks the same first? */
while(n--) { printf("%d\n", unique_uniform(1, 50)); }
return EXIT_SUCCESS;
}
However, if the numbers are densely packed, (eg, unique_uniform(1, 6),) it's going to reject a lot of numbers. Another solution is to take a Poisson distributed numbers as a running sum, (recurrence T(n+1)=T(n)+\mu_{n+1},) where the expected value is the range of numbers divided by the total samples, then take a random permutation.

CUDA reduction to find the maximum of an array

I am doing the Udacity course on parallel programming (homework 3) and can not figure out why I can't get the maximum in the array using parallel reduction (Udacity forums yet to provide solution). I am pretty certain that I have set up the arrays properly and that the algorithm is correct. I suspect that I have a problem with memory management (accessing out of bounds, incorrect array sizes, copying to and from). Please help! I am running this in the Udacity environment, not locally. Below is the code that I am currently using. For some reason when I change the fmaxf's to fminf's it does find the minimum.
#include "reference_calc.cpp"
#include "utils.h"
#include "math.h"
#include <stdio.h>
#include <cmath>
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
extern __shared__ float temp[];
if (myId < size) {
temp[tid] = d_logLum[myId];
}
else {
temp[tid] = d_logLum[tid];
}
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
if (myId < size) {
temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]);
} else {
temp[tid] = d_logLum[tid];
}
}
__syncthreads();
}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
__global__ void reduce_max_kernel2(float *d_out, float *d_in) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
for (unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
if (tid < s) {
d_in[myId] = fmaxf(d_in[myId + s], d_in[myId]);
}
__syncthreads();
}
if (tid == 0) {
d_out[0] = d_in[0];
}
}
void your_histogram_and_prefixsum(const float* const d_logLuminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
//TODO
/*Here are the steps you need to implement
1) find the minimum and maximum value in the input logLuminance channel
store in min_logLum and max_logLum
2) subtract them to find the range
3) generate a histogram of all the values in the logLuminance channel using
the formula: bin = (lum[i] - lumMin) / lumRange * numBins
4) Perform an exclusive scan (prefix sum) on the histogram to get
the cumulative distribution of luminance values (this should go in the
incoming d_cdf pointer which already has been allocated for you) */
//int size = 1 << 18;
int points = numRows * numCols;
int logPoints = ceil(log(points)/log(2));
int sizePow = logPoints;
int size = pow(2, sizePow);
int numThreads = 1024;
int numBlocks = size / numThreads;
float *d_out;
float *d_max_out;
checkCudaErrors(cudaMalloc((void **) &d_out, numBlocks * sizeof(float)));
checkCudaErrors(cudaMalloc((void **) &d_max_out, sizeof(float)));
cudaDeviceSynchronize();
reduce_max_kernel<<<numBlocks, numThreads, sizeof(float)*numThreads>>>(d_out, d_logLuminance, points);
cudaDeviceSynchronize();
reduce_max_kernel2<<<1, numBlocks>>>(d_max_out, d_out);
float h_out_max;
checkCudaErrors(cudaMemcpy(&h_out_max, d_max_out, sizeof(float), cudaMemcpyDeviceToHost));
printf("%f\n", h_out_max);
checkCudaErrors(cudaFree(d_max_out));
checkCudaErrors(cudaFree(d_out));
}
You are trying to reproduce the reduce2 reduction kernel of the CUDA SDK reduction sample. Robert Crovella has already spot two mistakes that you have made in your code. Besides them, I think you are also mistakenly initializing the shared memory.
Below, please find a complete working example constructed around your attempt. I have left the wrong instructions of your approach.
#include <thrust\device_vector.h>
#define BLOCKSIZE 256
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
int tid = threadIdx.x; // Local thread index
int myId = blockIdx.x * blockDim.x + threadIdx.x; // Global thread index
extern __shared__ float temp[];
// --- Loading data to shared memory. All the threads contribute to loading the data to shared memory.
temp[tid] = (myId < size) ? d_logLum[myId] : -FLT_MAX;
// --- Your solution
// if (myId < size) { temp[tid] = d_logLum[myId]; } else { temp[tid] = d_logLum[tid]; }
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s=blockDim.x/2; s>0; s>>=1)
{
if (tid < s) { temp[tid] = fmaxf(temp[tid], temp[tid + s]); }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Your solution
//for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
// if (tid < s) { if (myId < size) { temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]); } else { temp[tid] = d_logLum[tid]; } }
// __syncthreads();
//}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 10;
thrust::device_vector<float> d_vec(N,3.f); d_vec[4] = 4.f;
int NumThreads = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
int NumBlocks = (N + NumThreads - 1) / NumThreads;
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(int) : NumThreads * sizeof(int);
// --- reduce2
thrust::device_vector<float> d_vec_block(NumBlocks);
reduce_max_kernel<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec_block.data()), thrust::raw_pointer_cast(d_vec.data()), N);
// --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
thrust::host_vector<float> h_vec_block(d_vec_block);
float result_reduce0 = -FLT_MAX;
for (int i=0; i<NumBlocks; i++) result_reduce0 = fmax(h_vec_block[i], result_reduce0);
printf("Result = %f\n",result_reduce0);
}

Simple CUDA kernel with Bizarre Result?

I am using a CUDA kernel object in MATLAB in order to fill a 2D array with all '55's. The result is very strange. The 2D array only fills up to a certain point as shown below. After row 1025, the array is all zeros. Any idea what could be going wrong?
As I mentioned in the comment above, you are mistakenly offsetting the matrix rows. The code below is a full working example proving this point.
#include<thrust\device_vector.h>
__global__ void myKern(double* masterForces, int r_max, int iterations) {
int threadsPerBlock = blockDim.x * blockDim.y;
int blockId = blockIdx.x + (blockIdx.y * gridDim.x);
int threadId = threadIdx.x + (threadIdx.y * blockDim.x);
int globalIdx = (blockId * threadsPerBlock) + threadId;
//for (int i=0; i<iterations; i++) masterForces[globalIdx * r_max + i] = 55;
for (int i=0; i<iterations; i++) masterForces[globalIdx * iterations + i] = 55;
}
void main() {
int ThreadBlockSize = 32;
int GridSize = 32;
int reps = 1024;
int iterations = 2000;
thrust::device_vector<double> gpuF_M(reps*iterations, 0);
myKern<<<GridSize,ThreadBlockSize>>>(thrust::raw_pointer_cast(gpuF_M.data()),reps,iterations);
int numerrors = 0;
for (int i=0; i<reps*iterations; i++) {
double test = gpuF_M[i];
if (test != 55) { printf("Error %i %f\n",i,test); numerrors++; }
}
printf("Finished!\n");
printf("The number of errors is = %i\n",numerrors);
getchar();
}

How to Optimize CUDA Sieve of Eratosthenes [closed]

This question is unlikely to help any future visitors; it is only relevant to a small geographic area, a specific moment in time, or an extraordinarily narrow situation that is not generally applicable to the worldwide audience of the internet. For help making this question more broadly applicable, visit the help center.
Closed 9 years ago.
I'm new to CUDA. To get my hands dirty, I tried writing a Sieve of Eratosthenes (for finding all the primes up to some number n).
There are a number of things I had to do to get it to work that it seems shouldn't have been necessary. I'm curious whether anyone knows of a more natural (and still CUDA-optimized) approach.
To take the entries marked as prime in the isPrime array, I had to do two separate kernel calls. The first counts the number of primes in each threadblock and assigns to each entry i the number of primes in that block less than i. Then I have to make a second call to add in the number of primes in all the previous blocks in order to get the final index.
But it's even worse than that, because to avoid heaps of concurrent reads, I had to store the number of primes in the block in a separate array at each of THREADS_PER_BLOCK indices effectively doubling the required memory for the algorithm. It seems like there should be a way to have all the threads read the same value for each block rather than have to copy it so many times.
Despite all this, there's still the problem of concurrent reads in the clearMultiples method. Especially for small primes like 2 and 3, every thread has to read the value in. Isn't there any way to deal with this?
Could anyone look at my code and tell me if there's anything obvious I could do that would be simpler or more efficient?
Is there anything I'm doing that's particularly inefficient (besides printing out all the primes at the end of course)?
Is it necessary to call synchronize after every kernel call?
Do I need to synchronize after memcpy's as well?
Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work?
Thank you
#include <stdio.h>
#include <cuda.h>
#include <assert.h>
#include <math.h>
#define MAX_BLOCKS 256
#define THREADS_PER_BLOCK 256 //Must be a power of 2
#define BLOCK_SPACE 2 * THREADS_PER_BLOCK
__global__ void initialize(int* isPrime, int n) {
int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
int step = gridDim.x * THREADS_PER_BLOCK;
int i;
for (i = idx; i <= 1; i += step) {
isPrime[i] = 0;
}
for (; i < n; i += step) {
isPrime[i] = 1;
}
}
__global__ void clearMultiples(int* isPrime, int* primeList, int startInd,
int endInd, int n) {
int yidx = blockIdx.y * blockDim.y + threadIdx.y;
int xidx = blockIdx.x * blockDim.x + threadIdx.x;
int ystep = gridDim.y * blockDim.y;
int xstep = gridDim.x * blockDim.x;
for (int pnum = startInd + yidx; pnum < endInd; pnum += ystep) {
int p = primeList[pnum];
int pstart = p * (p + xidx);
int pstep = p * xstep;
for (int i = pstart; i < n; i += pstep) {
isPrime[i] = 0;
}
}
}
__device__ void makeCounts(int* isPrime, int* addend, int start, int stop) {
__shared__ int tmpCounts[BLOCK_SPACE];
__shared__ int dumbCounts[BLOCK_SPACE];
int idx = threadIdx.x;
tmpCounts[idx] = ((start + idx) < stop) ? isPrime[start + idx] : 0;
__syncthreads();
int numEntries = THREADS_PER_BLOCK;
int cstart = 0;
while (numEntries > 1) {
int prevStart = cstart;
cstart += numEntries;
numEntries /= 2;
if (idx < numEntries) {
int i1 = idx * 2 + prevStart;
tmpCounts[idx + cstart] = tmpCounts[i1] + tmpCounts[i1 + 1];
}
__syncthreads();
}
if (idx == 0) {
dumbCounts[cstart] = tmpCounts[cstart];
tmpCounts[cstart] = 0;
}
while (cstart > 0) {
int prevStart = cstart;
cstart -= numEntries * 2;
if (idx < numEntries) {
int v1 = tmpCounts[idx + prevStart];
int i1 = idx * 2 + cstart;
tmpCounts[i1 + 1] = tmpCounts[i1] + v1;
tmpCounts[i1] = v1;
dumbCounts[i1] = dumbCounts[i1 + 1] = dumbCounts[idx + prevStart];
}
numEntries *= 2;
__syncthreads();
}
if (start + idx < stop) {
isPrime[start + idx] = tmpCounts[idx];
addend[start + idx] = dumbCounts[idx];
}
}
__global__ void createCounts(int* isPrime, int* addend, int lb, int ub) {
int step = gridDim.x * THREADS_PER_BLOCK;
for (int i = lb + blockIdx.x * THREADS_PER_BLOCK; i < ub; i += step) {
int start = i;
int stop = min(i + step, ub);
makeCounts(isPrime, addend, start, stop);
}
}
__global__ void sumCounts(int* isPrime, int* addend, int lb, int ub,
int* totalsum) {
int idx = blockIdx.x;
int s = 0;
for (int i = lb + idx; i < ub; i += THREADS_PER_BLOCK) {
isPrime[i] += s;
s += addend[i];
}
if (idx == 0) {
*totalsum = s;
}
}
__global__ void condensePrimes(int* isPrime, int* primeList, int lb, int ub,
int primeStartInd, int primeCount) {
int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
int step = gridDim.x * THREADS_PER_BLOCK;
for (int i = lb + idx; i < ub; i += step) {
int term = isPrime[i];
int nextTerm = i + 1 == ub ? primeCount : isPrime[i + 1];
if (term < nextTerm) {
primeList[primeStartInd + term] = i;
}
}
}
int main(void) {
printf("Enter upper bound:\n");
int n;
scanf("%d", &n);
int *isPrime, *addend, *numPrimes, *primeList;
cudaError_t t = cudaMalloc((void**) &isPrime, n * sizeof(int));
assert(t == cudaSuccess);
t = cudaMalloc(&addend, n * sizeof(int));
assert(t == cudaSuccess);
t = cudaMalloc(&numPrimes, sizeof(int));
assert(t == cudaSuccess);
int primeBound = 2 * n / log(n);
t = cudaMalloc(&primeList, primeBound * sizeof(int));
assert(t == cudaSuccess);
int numBlocks = min(MAX_BLOCKS,
(n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
initialize<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, n);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
int bound = (int) ceil(sqrt(n));
int lb;
int ub = 2;
int primeStartInd = 0;
int primeEndInd = 0;
while (ub < n) {
if (primeEndInd > primeStartInd) {
int lowprime;
t = cudaMemcpy(&lowprime, primeList + primeStartInd, sizeof(int),
cudaMemcpyDeviceToHost);
assert(t == cudaSuccess);
int numcols = n / lowprime;
int numrows = primeEndInd - primeStartInd;
int threadx = min(numcols, THREADS_PER_BLOCK);
int thready = min(numrows, THREADS_PER_BLOCK / threadx);
int blockx = min(numcols / threadx, MAX_BLOCKS);
int blocky = min(numrows / thready, MAX_BLOCKS / blockx);
dim3 gridsize(blockx, blocky);
dim3 blocksize(threadx, thready);
clearMultiples<<<gridsize, blocksize>>>(isPrime, primeList,
primeStartInd, primeEndInd, n);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
}
lb = ub;
ub *= 2;
if (lb >= bound) {
ub = n;
}
numBlocks = min(MAX_BLOCKS,
(ub - lb + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
createCounts<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, addend, lb, ub);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
sumCounts<<<THREADS_PER_BLOCK, 1>>>(isPrime, addend, lb, ub, numPrimes);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
int primeCount;
t = cudaMemcpy(&primeCount, numPrimes, sizeof(int),
cudaMemcpyDeviceToHost);
assert(t == cudaSuccess);
assert(primeCount > 0);
primeStartInd = primeEndInd;
primeEndInd += primeCount;
condensePrimes<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, primeList, lb,
ub, primeStartInd, primeCount);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
}
int finalprimes[primeEndInd];
t = cudaMemcpy(finalprimes, primeList, primeEndInd * sizeof(int),
cudaMemcpyDeviceToHost);
assert(t == cudaSuccess);
t = cudaFree(isPrime);
assert(t == cudaSuccess);
t = cudaFree(addend);
assert(t == cudaSuccess);
t = cudaFree(numPrimes);
assert(t == cudaSuccess);
t = cudaFree(primeList);
assert(t == cudaSuccess);
for (int i = 0; i < primeEndInd; i++) {
if (i % 16 == 0)
printf("\n");
else
printf(" ");
printf("%4d", finalprimes[i]);
}
printf("\n");
return 0;
}
Answering some of your questions.
Fix your error checking as defined in the comments.
define what you mean by "concurrent reads". You're concerned about this but I'm not sure what you mean by it.
Is it necessary to call synchronize after every kernel call?
No, it isn't. If your code is not working correctly, synchronizing after every kernel call then doing proper error checking will tell you if any kernels are not launching correctly. Synchronization is generally not needed for relatively simple single-stream programs like this one. The cuda calls that need to synchronize like cudaMemcpy will do this automatically for you.
Do I need to synchronize after memcpy's as well?
No, cudaMemcpy is synchronous in nature (it will force all cuda calls in the same stream to complete before it begins, and it will not return control to the host thread until the copy is complete.) If you don't want the blocking characteristic (not returning control to the host thread until complete) then you can use the cudaMemcpyAsync version of the call. You would use streams to get around the behavior of forcing all previous cuda calls to complete.
Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work?
Please define what you mean by "it doesn't work". I compiled your code with THREADS_PER_BLOCK of 512 and 256, and for an upper bound of 1000 it gave the same output in each case.

Resources