I am using a CUDA kernel object in MATLAB to fill a 2D array with all '55's. The result is very strange: the 2D array only fills up to a certain point, as shown below. After row 1025, the array is all zeros. Any idea what could be going wrong?
As I mentioned in the comment above, you are mistakenly offsetting the matrix rows: you stride by r_max (the number of rows) instead of iterations (the row length), so the writes land on the wrong elements and the tail of the array is never touched. The code below is a full working example proving this point.
#include <thrust/device_vector.h>
#include <cstdio>
__global__ void myKern(double* masterForces, int r_max, int iterations) {
int threadsPerBlock = blockDim.x * blockDim.y;
int blockId = blockIdx.x + (blockIdx.y * gridDim.x);
int threadId = threadIdx.x + (threadIdx.y * blockDim.x);
int globalIdx = (blockId * threadsPerBlock) + threadId;
//for (int i=0; i<iterations; i++) masterForces[globalIdx * r_max + i] = 55; // wrong: strides by the row count
for (int i=0; i<iterations; i++) masterForces[globalIdx * iterations + i] = 55; // right: strides by the row length
}
int main() {
int ThreadBlockSize = 32;
int GridSize = 32;
int reps = 1024;
int iterations = 2000;
thrust::device_vector<double> gpuF_M(reps*iterations, 0);
myKern<<<GridSize,ThreadBlockSize>>>(thrust::raw_pointer_cast(gpuF_M.data()),reps,iterations);
int numerrors = 0;
for (int i=0; i<reps*iterations; i++) {
double test = gpuF_M[i];
if (test != 55) { printf("Error %i %f\n",i,test); numerrors++; }
}
printf("Finished!\n");
printf("The number of errors is = %i\n",numerrors);
getchar();
return 0;
}
I am trying to adapt a sequential function written for CPU to an OpenCL kernel for GPU.
The function is the well known im2col used in many deep learning applications.
I have found some code in the OpenCV repository implementing this im2col function written in OpenCL, but the one that I have to adapt uses a batch, which confuses me and seems to be a bit different.
What should I change on the OpenCL kernel to make it work the same on GPU as it does on the CPU function?
CPU code
int fn_im2col_cpu(int I, int WI, int HI, int B, int KW, int KH, int WO, int HO, int PW, int PH, int SW, int SH, type *in_ptr, type *out_ptr) {
PROFILING_HEADER_EXTERN(im2col);
PROFILING_DEVICE(im2col, DEV_CPU);
int i; // scrolls input channels
int w; // scrolls channel columns (width)
int h; // scrolls channel rows (height)
int kw; // scrolls filter columns (width)
int kh; // scrolls filter rows (height)
// we sweep all output pixels, and for each pixel we compute the associated input pixel
#pragma omp parallel for private (kh, kw, h, w)
for (i = 0; i < I; i++) {
size_t out_addr = ((size_t)B * (size_t)WO * (size_t)HO * (size_t)KW * (size_t)KH * (size_t)i);
size_t in_addr1 = (size_t)i * (size_t)B * (size_t)WI * (size_t)HI;
for (kh = 0; kh < KH; kh++) {
for (kw = 0; kw < KW; kw++) {
for (h = 0; h < HO; h++) {
int hi = h * SH - PH + kh;
size_t in_addr2 = in_addr1 + ((size_t)hi * (size_t)B * (size_t)WI);
for (w = 0; w < WO; w++) {
int wi = w * SW - PW + kw;
int force_padding = (wi < 0) || (wi >= WI) || (hi < 0) || (hi >= HI);
if (force_padding) {
bzero(&out_ptr[out_addr], B*sizeof(type));
} else {
int in_addr = in_addr2 + (wi * B);
memcpy(&out_ptr[out_addr], &in_ptr[in_addr], B*sizeof(type));
}
out_addr+=B;
}
}
}
}
}
return 1;
}
OpenCL kernel from https://github.com/opencv/opencv/blob/master/modules/dnn/src/opencl/im2col.cl
__kernel void im2col(__global const float *im_src, int im_src_offset,
int channels, int height_inp, int width_inp,
int kernel_h, int kernel_w, int pad_h, int pad_w,
int stride_h, int stride_w,
int height_out, int width_out,
__global float *im_col, int im_col_offset
)
{
int index = get_global_id(0);
if (index >= height_out * width_out * channels)
return;
int j_out = index % width_out;
int i_out = (index / width_out) % height_out;
int c_inp = (index / width_out) / height_out;
int c_out = c_inp * kernel_h * kernel_w;
int i_inp = i_out * stride_h - pad_h;
int j_inp = j_out * stride_w - pad_w;
im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset;
im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset;
for (int ki = 0; ki < kernel_h; ++ki)
for (int kj = 0; kj < kernel_w; ++kj) {
int i = i_inp + ki;
int j = j_inp + kj;
*im_col = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ?
im_src[ki * width_inp + kj] : 0;
im_col += height_out * width_out;
}
}
Your C version folds the batch into the lowest (innermost) dimension. The OpenCL version doesn't use a batch at all.
You need to pass in the batch size B and turn this single-element copy into a block copy (or just loop over the batch size):
for (int b = 0; b < B; b++) im_col[b] = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ? im_src[(ki * width_inp + kj) * B + b] : 0;
to emulate the memcpy(..., B*sizeof(type)).
And then just stride B times more:
im_col += height_out * width_out * B;
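Putting this together, a sketch of the full batched kernel might look like the following (untested; the B parameter and the *B scaling of the two base offsets are my additions, assuming the same channel-major, batch-innermost layout as your CPU code):
__kernel void im2col_batched(__global const float *im_src, int im_src_offset,
                             int channels, int height_inp, int width_inp,
                             int kernel_h, int kernel_w, int pad_h, int pad_w,
                             int stride_h, int stride_w,
                             int height_out, int width_out, int B,
                             __global float *im_col, int im_col_offset)
{
    int index = get_global_id(0);
    if (index >= height_out * width_out * channels)
        return;
    int j_out = index % width_out;
    int i_out = (index / width_out) % height_out;
    int c_inp = (index / width_out) / height_out;
    int c_out = c_inp * kernel_h * kernel_w;
    int i_inp = i_out * stride_h - pad_h;
    int j_inp = j_out * stride_w - pad_w;
    // base offsets are scaled by B because the batch is the innermost dimension
    im_src += ((c_inp * height_inp + i_inp) * width_inp + j_inp) * B + im_src_offset;
    im_col += ((c_out * height_out + i_out) * width_out + j_out) * B + im_col_offset;
    for (int ki = 0; ki < kernel_h; ++ki)
        for (int kj = 0; kj < kernel_w; ++kj) {
            int i = i_inp + ki;
            int j = j_inp + kj;
            int inside = (i >= 0 && j >= 0 && i < height_inp && j < width_inp);
            // the batch loop emulates the CPU memcpy/bzero of B contiguous elements
            for (int b = 0; b < B; ++b)
                im_col[b] = inside ? im_src[(ki * width_inp + kj) * B + b] : 0;
            im_col += height_out * width_out * B; // stride B times more
        }
}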
I'm making a program which dynamically creates a 2D array, but it's showing the error I mentioned in the title. I'm using Visual Studio 2015.
// last.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <stdio.h>
#include <time.h>
#include "stdlib.h"
double selectionSort(int * number, int number_count);
void print2d(int ** array, int rows, int cols);
void twodarray();
void main(int argc, char* argv[])
{
int num_count = 10000;
int num[10000];
for (int i = 0; i < num_count; i++)
{
num[i] = rand();
}
double sortTime = selectionSort(num, num_count);
printf("Total Runtime is: %.0f milliseconds. \n", sortTime * 1000);
twodarray();
getchar();
}
double selectionSort(int * number, int number_count)
{
clock_t start, end;
double duration;
int min;
start = clock();
for (int i = 0; i < number_count - 1; i++)
{
min = i;
for (int j = i + 1; j < number_count; j++)
{
if (number[min] > number[j])
{
min = j;
}
}
if (min != i)
{
int temp = number[min];
number[min] = number[i];
number[i] = temp;
}
}
end = clock();
return duration = (double)(end - start) / CLOCKS_PER_SEC;
}
void print2d(int ** array, int rows, int cols)
{
int i, j;
for (i = 0; i < rows; i++)
{
for (j = 0, j < cols; j++;)
{
printf("%10d ", array[i][j]);
}
puts("");
}
}
void twodarray()
{
int **twod;
int rows = 10;
twod = malloc(rows * sizeof(int));
int i,cols = 10;
for (i = 0; i < rows; i++)
{
twod[i] = malloc(cols*sizeof(int));
print2d(twod, rows, cols);
}
for (i = 0; rows; i++)
{
free(twod[i]);
free(twod);
}
}
In C++ you need a cast when assigning a void * pointer to another pointer type. But in C++ you shouldn't use malloc() at all; instead use
int **twod = new int *[rows];
If you didn't mean to write a C++ program, rename the file: change the extension from .cpp to .c.
Your allocation is wrong too, as pointed out by @KeineLust here.
This is wrong:
int **twod;
int rows = 10;
twod = malloc(rows * sizeof(int));
You need to reserve space for rows pointers to int, not for rows ints; change it to
twod = malloc(rows * sizeof(int *));
And here:
for (j = 0, j < cols; j++;)
          ^              ^
Use a semicolon instead of a comma and also remove the last semicolon.
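That is, the loop should read:
for (j = 0; j < cols; j++)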
Another problem:
for (i = 0; rows; i++)
{
free(twod[i]);
free(twod); /* Don't free twod in the loop, one malloc -> one free */
}
And as pointed out by Nicat and Iharob, it seems that you are mixing C and C++, use the proper extension (.c)
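Putting the fixes together, a sketch of a corrected twodarray in plain C might look like this (untested; I have also moved the print2d call out of the allocation loop so it doesn't touch rows that haven't been allocated yet -- note the ints themselves are still uninitialized, as in your original):
void twodarray(void)
{
    int rows = 10, cols = 10;
    int i;
    /* space for 'rows' pointers to int, not 'rows' ints */
    int **twod = malloc(rows * sizeof(int *));
    for (i = 0; i < rows; i++)
        twod[i] = malloc(cols * sizeof(int));
    print2d(twod, rows, cols);
    for (i = 0; i < rows; i++)
        free(twod[i]); /* one malloc -> one free */
    free(twod);        /* free the pointer array once, outside the loop */
}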
I want to create a random int array in CUDA, and I need to check for duplicates within array indices 0-9, 10-19, ... and repair them.
Any idea how to make that efficient? I really don't want to check each element against every other one.
Here is my code:
__global__ void generateP(int *d_p, unsigned long seed)
{
int i = X * blockIdx.x + threadIdx.x * X;
int buffer[X];
curandState state;
curand_init(seed, i, 0, &state);
for (int j = 0; j < X; j++)
{
float random = HB + (curand_uniform(&state) * (LB - HB));
buffer[j] = (int)truncf(random);
}
// TODO unique check and repair duplicity
for (int k = 0; k < X; k++)
{
d_p[i] = buffer[k];
i++;
}
}
Does CUDA have some kind of Contains function? Thanks for the help.
You really are asking the wrong question here. You should be looking for a way of randomly ordering a list of unique values, rather than attempting to fill a list with unique random numbers by repeatedly searching for and replacing duplicates until the list is unique. The latter is terribly inefficient and a poor fit for a data-parallel execution model like CUDA.
There are simple, robust algorithms for randomly shuffling a list of values that require at most N calls to a random generator to shuffle a list of N values. The Fisher-Yates shuffle is almost universally used for this.
I'm not going to comment much on this code except to say that it illustrates one approach to doing this, using one thread per list. It isn't intended to be performant, just a teaching example to get you started. I think it does close to what you are asking for (based more on your previous attempt at this question than on this one). I recommend you study it as a lead-in to writing your own implementation that does whatever it is you are trying to do.
#include <ctime>
#include <iostream>
#include <curand_kernel.h>
struct source
{
int baseval;
__device__ source(int _b) : baseval(_b) {};
__device__ int operator()(int v) { return baseval + v; };
};
__device__ int urandint(int minval, int maxval, curandState_t& state)
{
float rval = curand_uniform(&state);
rval *= (float(maxval) - float(minval) + 0.99999999f);
rval += float(minval);
return (int)truncf(rval);
}
template<int X>
__global__ void kernel(int* out, int N, unsigned long long seed)
{
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
if (tidx < N) {
curandState_t state;
curand_init(seed, tidx, 0, &state);
int seq[X];
source vals(tidx * X);
// Fisher-Yates shuffle, straight from Wikipedia
#pragma unroll
for(int i=0; i<X; ++i) {
int j = urandint(0, i, state);
if (j != i)
seq[i] = seq[j];
seq[j] = vals(i);
}
// Copy local shuffled sequence to output array
int* dest = &out[X * tidx];
memcpy(dest, &seq[0], X * sizeof(int));
}
}
int main(void)
{
const int X = 10;
const int nsets = 200;
int* d_result;
size_t sz = size_t(nsets) * sizeof(int) * size_t(X);
cudaMalloc((void **)&d_result, sz);
int tpb = 32;
int nblocks = (nsets/tpb) + ((nsets%tpb !=0) ? 1 : 0);
kernel<X><<<nblocks, tpb>>>(d_result, nsets, std::time(0));
int h_result[nsets][X];
cudaMemcpy(&h_result[0][0], d_result, sz, cudaMemcpyDeviceToHost);
for(int i=0; i<nsets; ++i) {
std::cout << i << " : ";
for(int j=0; j<X; ++j) {
std::cout << h_result[i][j] << ",";
}
std::cout << std::endl;
}
cudaDeviceReset();
return 0;
}
I'm writing a CUDA kernel and each thread has to complete the following task: suppose I have an ordered array a of n unsigned integers (the first one is always 0) stored in shared memory; each thread has to find the array index i such that a[i] ≤ threadIdx.x and a[i + 1] > threadIdx.x.
A naive solution could be:
for (i = 0; i < n - 1; i++)
if (a[i + 1] > threadIdx.x) break;
but I suppose this is not the optimal way to do it... can anyone suggest anything better?
Like Robert, I was thinking that a binary search has got to be faster than a naïve loop -- the upper bound of the operation count for a binary search is O(log n), compared to O(n) for the loop.
My extremely simple implementation:
#include <iostream>
#include <climits>
#include <assert.h>
__device__ __host__
int midpoint(int a, int b)
{
return a + (b-a)/2;
}
__device__ __host__
int eval(int A[], int i, int val, int imin, int imax)
{
int low = (A[i] <= val);
int high = (A[i+1] > val);
if (low && high) {
return 0;
} else if (low) {
return -1;
} else {
return 1;
}
}
__device__ __host__
int binary_search(int A[], int val, int imin, int imax)
{
while (imax >= imin) {
int imid = midpoint(imin, imax);
int e = eval(A, imid, val, imin, imax);
if(e == 0) {
return imid;
} else if (e < 0) {
imin = imid + 1; // e < 0 means the answer lies strictly above imid, so exclude it
} else {
imax = imid;
}
}
return -1;
}
__device__ __host__
int linear_search(int A[], int val, int imin, int imax)
{
int res = -1;
for(int i=imin; i<(imax-1); i++) {
if (A[i+1] > val) {
res = i;
break;
}
}
return res;
}
template<int version>
__global__
void search(int * source, int * result, int Nin, int Nout)
{
extern __shared__ int buff[];
int tid = threadIdx.x + blockIdx.x*blockDim.x;
int val = INT_MAX;
if (tid < Nin) val = source[threadIdx.x];
buff[threadIdx.x] = val;
__syncthreads();
int res;
switch(version) {
case 0:
res = binary_search(buff, threadIdx.x, 0, blockDim.x);
break;
case 1:
res = linear_search(buff, threadIdx.x, 0, blockDim.x);
break;
}
if (tid < Nout) result[tid] = res;
}
int main(void)
{
const int inputLength = 128000;
const int isize = inputLength * sizeof(int);
const int outputLength = 256;
const int osize = outputLength * sizeof(int);
int * hostInput = new int[inputLength];
int * hostOutput = new int[outputLength];
int * deviceInput;
int * deviceOutput;
for(int i=0; i<inputLength; i++) {
hostInput[i] = -200 + 5*i;
}
cudaMalloc((void**)&deviceInput, isize);
cudaMalloc((void**)&deviceOutput, osize);
cudaMemcpy(deviceInput, hostInput, isize, cudaMemcpyHostToDevice);
dim3 DimBlock(256, 1, 1);
dim3 DimGrid(1, 1, 1);
DimGrid.x = (outputLength / DimBlock.x) +
((outputLength % DimBlock.x > 0) ? 1 : 0);
size_t shmsz = DimBlock.x * sizeof(int);
for(int i=0; i<5; i++) {
search<1><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
inputLength, outputLength);
}
for(int i=0; i<5; i++) {
search<0><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
inputLength, outputLength);
}
cudaMemcpy(hostOutput, deviceOutput, osize, cudaMemcpyDeviceToHost);
for(int i=0; i<outputLength; i++) {
int idx = hostOutput[i];
int tidx = i % DimBlock.x;
assert( (hostInput[idx] <= tidx) && (tidx < hostInput[idx+1]) );
}
cudaDeviceReset();
return 0;
}
gave about a five times speed up compared to the loop:
>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
Time(%) Time Calls Avg Min Max Name
60.11 157.85us 1 157.85us 157.85us 157.85us [CUDA memcpy HtoD]
32.58 85.55us 5 17.11us 16.63us 19.04us void search<int=1>(int*, int*, int, int)
6.52 17.13us 5 3.42us 3.35us 3.73us void search<int=0>(int*, int*, int, int)
0.79 2.08us 1 2.08us 2.08us 2.08us [CUDA memcpy DtoH]
I'm sure that someone cleverer could do a lot better than that. But perhaps this gives you at least a few ideas.
can anyone suggest anything better?
A brute force approach would be to have each thread do a binary search (on threadIdx.x + 1).
// sets idx to the index of the first element in a that is
// equal to or larger than key
__device__ void bsearch_range(const int *a, const int key, const unsigned len_a, unsigned *idx){
unsigned lower = 0;
unsigned upper = len_a;
unsigned midpt;
while (lower < upper){
midpt = (lower + upper)>>1;
if (a[midpt] < key) lower = midpt +1;
else upper = midpt;
}
*idx = lower;
return;
}
__global__ void find_my_idx(const int *a, const unsigned len_a, int *my_idx){
unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
unsigned sp_a;
int val = idx+1;
bsearch_range(a, val, len_a, &sp_a);
my_idx[idx] = ((val-1) < a[sp_a]) ? sp_a:-1;
}
This is coded in browser, not tested. It's hacked from a piece of working code, however. If you have trouble making it work, I can revisit it. I don't recommend this approach on a device without caches (cc 1.x device).
This is actually searching on the full unique 1D thread index (blockDim.x * blockIdx.x + threadIdx.x + 1). You can change val to be anything you like.
You could also add an appropriate thread check, if the number of threads you intend to launch is greater than the length of your my_idx result vector.
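For example, with a hypothetical parameter len_out giving the length of the my_idx result vector, the first lines of find_my_idx would become:
unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
if (idx >= len_out) return; // extra threads in the last block do nothing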
I imagine there is a more clever approach that may use something akin to prefix sums.
This is the best algorithm so far. It's called LPW Indexed Search:
__global__ void find_position_lpw(int *a, int n)
{
int idx = threadIdx.x;
// shared histogram: aux[v] counts how many threads hold the value v
__shared__ int aux[ MAX_THREADS_PER_BLOCK /*1024*/ ];
aux[idx] = 0;
__syncthreads(); // make sure aux is fully zeroed before anyone increments it
if (idx < n)
atomicAdd( &aux[a[idx]], 1); // atomics in case there are duplicates
__syncthreads();
// Hillis-Steele inclusive prefix sum over the histogram
int tmp;
for (int j = 1; j <= MAX_THREADS_PER_BLOCK / 2; j <<= 1)
{
if( idx >= j ) tmp = aux[idx - j];
__syncthreads();
if( idx >= j ) aux[idx] += tmp;
__syncthreads();
}
// result in "i"
int i = aux[idx] - 1;
// use "i" here...
// ...
}
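As written, the kernel assumes a single block: it indexes with threadIdx.x only, scans one block-wide shared array, and requires the values in a to lie in [0, MAX_THREADS_PER_BLOCK). A hypothetical launch for n values (n ≤ 1024) would be:
find_position_lpw<<<1, MAX_THREADS_PER_BLOCK>>>(d_a, n);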
I'm new to CUDA. To get my hands dirty, I tried writing a Sieve of Eratosthenes (for finding all the primes up to some number n).
There are a number of things I had to do to get it to work that seem like they shouldn't have been necessary. I'm curious whether anyone knows of a more natural (and still CUDA-optimized) approach.
To compact the entries marked as prime in the isPrime array into a list, I had to do two separate kernel calls. The first counts the number of primes in each threadblock and assigns to each entry i the number of primes in that block less than i. Then I have to make a second call to add in the number of primes in all the previous blocks in order to get the final index.
But it's even worse than that, because to avoid heaps of concurrent reads, I had to store the number of primes in the block in a separate array at each of THREADS_PER_BLOCK indices, effectively doubling the required memory for the algorithm. It seems like there should be a way to have all the threads read the same value for each block rather than having to copy it so many times.
Despite all this, there's still the problem of concurrent reads in the clearMultiples method. Especially for small primes like 2 and 3, every thread has to read the value in. Isn't there any way to deal with this?
Could anyone look at my code and tell me if there's anything obvious I could do that would be simpler or more efficient?
Is there anything I'm doing that's particularly inefficient (besides printing out all the primes at the end of course)?
Is it necessary to call synchronize after every kernel call?
Do I need to synchronize after memcpy's as well?
Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work?
Thank you
#include <stdio.h>
#include <cuda.h>
#include <assert.h>
#include <math.h>
#define MAX_BLOCKS 256
#define THREADS_PER_BLOCK 256 //Must be a power of 2
#define BLOCK_SPACE 2 * THREADS_PER_BLOCK
__global__ void initialize(int* isPrime, int n) {
int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
int step = gridDim.x * THREADS_PER_BLOCK;
int i;
for (i = idx; i <= 1; i += step) {
isPrime[i] = 0;
}
for (; i < n; i += step) {
isPrime[i] = 1;
}
}
__global__ void clearMultiples(int* isPrime, int* primeList, int startInd,
int endInd, int n) {
int yidx = blockIdx.y * blockDim.y + threadIdx.y;
int xidx = blockIdx.x * blockDim.x + threadIdx.x;
int ystep = gridDim.y * blockDim.y;
int xstep = gridDim.x * blockDim.x;
for (int pnum = startInd + yidx; pnum < endInd; pnum += ystep) {
int p = primeList[pnum];
int pstart = p * (p + xidx);
int pstep = p * xstep;
for (int i = pstart; i < n; i += pstep) {
isPrime[i] = 0;
}
}
}
__device__ void makeCounts(int* isPrime, int* addend, int start, int stop) {
__shared__ int tmpCounts[BLOCK_SPACE];
__shared__ int dumbCounts[BLOCK_SPACE];
int idx = threadIdx.x;
tmpCounts[idx] = ((start + idx) < stop) ? isPrime[start + idx] : 0;
__syncthreads();
int numEntries = THREADS_PER_BLOCK;
int cstart = 0;
while (numEntries > 1) {
int prevStart = cstart;
cstart += numEntries;
numEntries /= 2;
if (idx < numEntries) {
int i1 = idx * 2 + prevStart;
tmpCounts[idx + cstart] = tmpCounts[i1] + tmpCounts[i1 + 1];
}
__syncthreads();
}
if (idx == 0) {
dumbCounts[cstart] = tmpCounts[cstart];
tmpCounts[cstart] = 0;
}
while (cstart > 0) {
int prevStart = cstart;
cstart -= numEntries * 2;
if (idx < numEntries) {
int v1 = tmpCounts[idx + prevStart];
int i1 = idx * 2 + cstart;
tmpCounts[i1 + 1] = tmpCounts[i1] + v1;
tmpCounts[i1] = v1;
dumbCounts[i1] = dumbCounts[i1 + 1] = dumbCounts[idx + prevStart];
}
numEntries *= 2;
__syncthreads();
}
if (start + idx < stop) {
isPrime[start + idx] = tmpCounts[idx];
addend[start + idx] = dumbCounts[idx];
}
}
__global__ void createCounts(int* isPrime, int* addend, int lb, int ub) {
int step = gridDim.x * THREADS_PER_BLOCK;
for (int i = lb + blockIdx.x * THREADS_PER_BLOCK; i < ub; i += step) {
int start = i;
int stop = min(i + step, ub);
makeCounts(isPrime, addend, start, stop);
}
}
__global__ void sumCounts(int* isPrime, int* addend, int lb, int ub,
int* totalsum) {
int idx = blockIdx.x;
int s = 0;
for (int i = lb + idx; i < ub; i += THREADS_PER_BLOCK) {
isPrime[i] += s;
s += addend[i];
}
if (idx == 0) {
*totalsum = s;
}
}
__global__ void condensePrimes(int* isPrime, int* primeList, int lb, int ub,
int primeStartInd, int primeCount) {
int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
int step = gridDim.x * THREADS_PER_BLOCK;
for (int i = lb + idx; i < ub; i += step) {
int term = isPrime[i];
int nextTerm = i + 1 == ub ? primeCount : isPrime[i + 1];
if (term < nextTerm) {
primeList[primeStartInd + term] = i;
}
}
}
int main(void) {
printf("Enter upper bound:\n");
int n;
scanf("%d", &n);
int *isPrime, *addend, *numPrimes, *primeList;
cudaError_t t = cudaMalloc((void**) &isPrime, n * sizeof(int));
assert(t == cudaSuccess);
t = cudaMalloc(&addend, n * sizeof(int));
assert(t == cudaSuccess);
t = cudaMalloc(&numPrimes, sizeof(int));
assert(t == cudaSuccess);
int primeBound = 2 * n / log(n);
t = cudaMalloc(&primeList, primeBound * sizeof(int));
assert(t == cudaSuccess);
int numBlocks = min(MAX_BLOCKS,
(n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
initialize<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, n);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
int bound = (int) ceil(sqrt(n));
int lb;
int ub = 2;
int primeStartInd = 0;
int primeEndInd = 0;
while (ub < n) {
if (primeEndInd > primeStartInd) {
int lowprime;
t = cudaMemcpy(&lowprime, primeList + primeStartInd, sizeof(int),
cudaMemcpyDeviceToHost);
assert(t == cudaSuccess);
int numcols = n / lowprime;
int numrows = primeEndInd - primeStartInd;
int threadx = min(numcols, THREADS_PER_BLOCK);
int thready = min(numrows, THREADS_PER_BLOCK / threadx);
int blockx = min(numcols / threadx, MAX_BLOCKS);
int blocky = min(numrows / thready, MAX_BLOCKS / blockx);
dim3 gridsize(blockx, blocky);
dim3 blocksize(threadx, thready);
clearMultiples<<<gridsize, blocksize>>>(isPrime, primeList,
primeStartInd, primeEndInd, n);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
}
lb = ub;
ub *= 2;
if (lb >= bound) {
ub = n;
}
numBlocks = min(MAX_BLOCKS,
(ub - lb + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
createCounts<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, addend, lb, ub);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
sumCounts<<<THREADS_PER_BLOCK, 1>>>(isPrime, addend, lb, ub, numPrimes);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
int primeCount;
t = cudaMemcpy(&primeCount, numPrimes, sizeof(int),
cudaMemcpyDeviceToHost);
assert(t == cudaSuccess);
assert(primeCount > 0);
primeStartInd = primeEndInd;
primeEndInd += primeCount;
condensePrimes<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, primeList, lb,
ub, primeStartInd, primeCount);
t = cudaDeviceSynchronize();
assert(t == cudaSuccess);
}
int finalprimes[primeEndInd];
t = cudaMemcpy(finalprimes, primeList, primeEndInd * sizeof(int),
cudaMemcpyDeviceToHost);
assert(t == cudaSuccess);
t = cudaFree(isPrime);
assert(t == cudaSuccess);
t = cudaFree(addend);
assert(t == cudaSuccess);
t = cudaFree(numPrimes);
assert(t == cudaSuccess);
t = cudaFree(primeList);
assert(t == cudaSuccess);
for (int i = 0; i < primeEndInd; i++) {
if (i % 16 == 0)
printf("\n");
else
printf(" ");
printf("%4d", finalprimes[i]);
}
printf("\n");
return 0;
}
Answering some of your questions.
Fix your error checking as defined in the comments.
Define what you mean by "concurrent reads". You're concerned about this, but I'm not sure what you mean by it.
Is it necessary to call synchronize after every kernel call?
No, it isn't. If your code is not working correctly, synchronizing after every kernel call and then doing proper error checking will tell you whether any kernel failed to launch correctly. Synchronization is generally not needed for relatively simple single-stream programs like this one. The CUDA calls that need to synchronize, like cudaMemcpy, will do this automatically for you.
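In code, that debugging pattern is just (a sketch, using the initialize launch from your program):
initialize<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, n);
cudaError_t err = cudaGetLastError(); // catches launch-configuration errors
assert(err == cudaSuccess);
err = cudaDeviceSynchronize();        // catches errors raised while the kernel ran
assert(err == cudaSuccess);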
Do I need to synchronize after memcpy's as well?
No, cudaMemcpy is synchronous in nature (it will force all CUDA calls in the same stream to complete before it begins, and it will not return control to the host thread until the copy is complete). If you don't want the blocking behavior (not returning control to the host thread until the copy is complete), you can use the cudaMemcpyAsync version of the call. You would use streams to get around the behavior of forcing all previous CUDA calls to complete.
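For illustration, a minimal sketch of the non-blocking variant (the names d_data, h_data, and sz are mine; h_data must be pinned host memory, e.g. from cudaMallocHost, for the copy to be truly asynchronous):
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMemcpyAsync(d_data, h_data, sz, cudaMemcpyHostToDevice, stream); // returns to the host immediately
initialize<<<numBlocks, THREADS_PER_BLOCK, 0, stream>>>(d_data, n);  // ordered after the copy within the stream
cudaStreamSynchronize(stream); // block the host only when the result is actually needed
cudaStreamDestroy(stream);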
Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work?
Please define what you mean by "it doesn't work". I compiled your code with THREADS_PER_BLOCK set to 512 and to 256, and for an upper bound of 1000 it gave the same output in both cases.