OpenCL kernel implementing im2col with batch

I am trying to adapt a sequential function written for the CPU to an OpenCL kernel for the GPU.
The function is the well-known im2col used in many deep learning applications.
I found code in the OpenCV repository implementing this im2col function in OpenCL, but the version I have to adapt uses a batch dimension, which confuses me and seems to work a bit differently.
What should I change in the OpenCL kernel so that it produces the same result on the GPU as the CPU function does?
CPU code
int fn_im2col_cpu(int I, int WI, int HI, int B, int KW, int KH, int WO, int HO, int PW, int PH, int SW, int SH, type *in_ptr, type *out_ptr) {
  PROFILING_HEADER_EXTERN(im2col);
  PROFILING_DEVICE(im2col, DEV_CPU);
  int i; // scrolls input channels
  int w; // scrolls channel columns (width)
  int h; // scrolls channel rows (height)
  int kw; // scrolls filter columns (width)
  int kh; // scrolls filter rows (height)
  // we sweep all output pixels, and for each pixel we compute the associated input pixel
  #pragma omp parallel for private (kh, kw, h, w)
  for (i = 0; i < I; i++) {
    size_t out_addr = ((size_t)B * (size_t)WO * (size_t)HO * (size_t)KW * (size_t)KH * (size_t)i);
    size_t in_addr1 = (size_t)i * (size_t)B * (size_t)WI * (size_t)HI;
    for (kh = 0; kh < KH; kh++) {
      for (kw = 0; kw < KW; kw++) {
        for (h = 0; h < HO; h++) {
          int hi = h * SH - PH + kh;
          size_t in_addr2 = in_addr1 + ((size_t)hi * (size_t)B * (size_t)WI);
          for (w = 0; w < WO; w++) {
            int wi = w * SW - PW + kw;
            int force_padding = (wi < 0) || (wi >= WI) || (hi < 0) || (hi >= HI);
            if (force_padding) {
              bzero(&out_ptr[out_addr], B * sizeof(type));
            } else {
              size_t in_addr = in_addr2 + ((size_t)wi * (size_t)B);
              memcpy(&out_ptr[out_addr], &in_ptr[in_addr], B * sizeof(type));
            }
            out_addr += B;
          }
        }
      }
    }
  }
  return 1;
}
OpenCL kernel from https://github.com/opencv/opencv/blob/master/modules/dnn/src/opencl/im2col.cl
__kernel void im2col(__global const float *im_src, int im_src_offset,
                     int channels, int height_inp, int width_inp,
                     int kernel_h, int kernel_w, int pad_h, int pad_w,
                     int stride_h, int stride_w,
                     int height_out, int width_out,
                     __global float *im_col, int im_col_offset)
{
  int index = get_global_id(0);
  if (index >= height_out * width_out * channels)
    return;
  int j_out = index % width_out;
  int i_out = (index / width_out) % height_out;
  int c_inp = (index / width_out) / height_out;
  int c_out = c_inp * kernel_h * kernel_w;
  int i_inp = i_out * stride_h - pad_h;
  int j_inp = j_out * stride_w - pad_w;
  im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset;
  im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset;
  for (int ki = 0; ki < kernel_h; ++ki)
    for (int kj = 0; kj < kernel_w; ++kj) {
      int i = i_inp + ki;
      int j = j_inp + kj;
      *im_col = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ?
                im_src[ki * width_inp + kj] : 0;
      im_col += height_out * width_out;
    }
}

Your C version folds the batch into the innermost dimension: the input is laid out as [channel][row][column][batch] and the output as [channel][kh][kw][row][column][batch]. The OpenCL version isn't using a batch at all.
You need to pass in the batch size B, scale every address computation by B (including the two initial pointer offsets), and change the single-element copy into a block copy (or just a loop) over the batch:
for (int b = 0; b < B; b++)
    im_col[b] = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ? im_src[(ki * width_inp + kj) * B + b] : 0;
to emulate the memcpy(..., B*sizeof(type)). And then stride B times further:
im_col += height_out * width_out * B;
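Putting this together, here is a minimal sketch of a batched kernel (untested; it assumes float data, keeps the OpenCV kernel's parameter names, drops the offset arguments for brevity, and assumes the same channel-major, batch-innermost layout as the CPU function):
__kernel void im2col_batch(__global const float *in_ptr, __global float *out_ptr,
                           int B, int channels, int height_inp, int width_inp,
                           int kernel_h, int kernel_w, int pad_h, int pad_w,
                           int stride_h, int stride_w,
                           int height_out, int width_out)
{
  int index = get_global_id(0);              // one work-item per output pixel per channel
  if (index >= height_out * width_out * channels)
    return;
  int j_out = index % width_out;
  int i_out = (index / width_out) % height_out;
  int c_inp = (index / width_out) / height_out;
  int c_out = c_inp * kernel_h * kernel_w;
  int i_inp = i_out * stride_h - pad_h;
  int j_inp = j_out * stride_w - pad_w;
  // every offset is scaled by B because the batch is the innermost dimension
  in_ptr  += ((c_inp * height_inp + i_inp) * width_inp + j_inp) * B;
  out_ptr += ((c_out * height_out + i_out) * width_out + j_out) * B;
  for (int ki = 0; ki < kernel_h; ++ki)
    for (int kj = 0; kj < kernel_w; ++kj) {
      int i = i_inp + ki;
      int j = j_inp + kj;
      int inside = (i >= 0 && j >= 0 && i < height_inp && j < width_inp);
      for (int b = 0; b < B; ++b)            // emulates the memcpy/bzero of B elements
        out_ptr[b] = inside ? in_ptr[(ki * width_inp + kj) * B + b] : 0.0f;
      out_ptr += height_out * width_out * B; // next (ki, kj) plane of the output
    }
}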

Related

qsort in C is an unexpected bottleneck in my program

I am working on some code to align astronomical images. I am writing it in C99. For some reason, my algorithm for detecting the stars in an image was running much slower than expected. I basically need to ignore all pixels which are below the 99th percentile (because stars are just small bright points). To calculate the 99th percentile I applied a qsort to a copy of the pixels in the image. When I profiled the code, it said that it was spending 75% of the time executing the compare_quantum function used by qsort. The entire detection of stars takes about 3 seconds.
I had originally written the code in C++ and the same algorithm took about 0.2 seconds. I am guessing that the reason is that, unlike std::sort in C++, C's qsort can't inline the call to the compare function.
I could write my own sort function, but I was just wondering if anyone had any other ideas to make this go faster. I have calls to qsort elsewhere in the code and I am thinking maybe I need to get rid of all of them.
I am using gcc 5.2. The first qsort in stars_map is the bottleneck. Also, quantum_t is just a typedef for uint16_t.
int compare_quantum(const void* a, const void* b)
{
    return (*(quantum_t*)a > *(quantum_t*)b) - (*(quantum_t*)a < *(quantum_t*)b);
}
void star_register(quantum_t* grey, double* rowint, double* colint, double* lum, size_t row, size_t col, size_t w, size_t h)
{
    size_t gi = row * w + col;
    if(!(row >= 0 && col >= 0 && row < h && col < w && grey[gi]))
        return;
    *rowint += grey[gi] * row;
    *colint += grey[gi] * col;
    *lum += grey[gi];
    grey[gi] = 0;
    for(int dr = -1; dr <= 1; dr++)
    {
        for(int dc = -1; dc <= 1; dc++)
        {
            if(dc == 0 && dr == 0)
                continue;
            star_register(grey, rowint, colint, lum, row + dr, col + dc, w, h);
        }
    }
}
stars_t* stars_map(image_t* img, float detection_percentile)
{
    assert(img);
    quantum_t* grey = NULL;
    quantum_t* sorted = NULL;
    star_t* stars = NULL;
    size_t nstars = 0;
    size_t stars_alloc = 0;
    grey = malloc(sizeof(quantum_t) * img->w * img->h);
    if(grey == NULL) goto fail;
    sorted = malloc(sizeof(quantum_t) * img->w * img->h);
    if(sorted == NULL) goto fail;
    for(size_t i = 0; i < img->w * img->h; i++)
        sorted[i] = grey[i] = ((uint32_t)img->px[i].red + (uint32_t)img->px[i].green + (uint32_t)img->px[i].blue) / 3;
    //this qsort is the issue
    qsort(sorted, img->w * img->h, sizeof(quantum_t), compare_quantum);
    quantum_t cut = sorted[(size_t)(img->w * img->h * detection_percentile)];
    free(sorted);
    sorted = NULL;
    for(size_t i = 0; i < img->w * img->h; i++)
        grey[i] = clampq((int32_t)grey[i] - cut);
    for(size_t i = 0; i < img->h; i++)
    {
        for(size_t j = 0; j < img->w; j++)
        {
            if(grey[i * img->w + j])
            {
                if(nstars == stars_alloc)
                {
                    stars = realloc(stars, (stars_alloc += 500) * sizeof(star_t));
                    if(!stars) goto fail;
                }
                double rowint = 0.0;
                double colint = 0.0;
                double lum = 0.0;
                star_register(grey, &rowint, &colint, &lum, i, j, img->w, img->h);
                stars[nstars++] = (star_t){.x = colint / lum, .y = rowint / lum, .lum = lum};
            }
        }
    }
    free(grey);
    qsort(stars, nstars, sizeof(star_t), star_compare);
    stars_t* result = malloc(sizeof(stars_t) + nstars * sizeof(star_t));
    if(result == NULL) goto fail;
    result->npairs = nstars;
    memcpy(result->stars, stars, sizeof(star_t) * nstars);
    free(stars);
    return result;
fail:
    if(grey) free(grey);
    if(sorted) free(sorted);
    if(stars) free(stars);
    return NULL;
}
I originally thought that the recursive call to star_register would be the performance hit, but it barely matters in the profile.
The issue was that I had forgotten that I was using std::nth_element, not std::sort, in the C++ version. That is why the C code seemed slow by comparison. I wrote a quickselect and now the entire program runs at about the same speed.
/* swap is not shown in the post; presumably a typed swap macro along these lines: */
#define swap(T, a, b) do { T _tmp = (a); (a) = (b); (b) = _tmp; } while (0)

quantum_t quantum_qselect(quantum_t *v, size_t len, size_t k)
{
    size_t i, st;
    for(st = i = 0; i < len - 1; i++)   /* partition around the last element */
    {
        if(v[i] > v[len - 1])
            continue;
        swap(quantum_t, v[i], v[st]);
        st++;
    }
    swap(quantum_t, v[len - 1], v[st]);
    return k == st ? v[st] : st > k ? quantum_qselect(v, st, k) : quantum_qselect(v + st, len - st, k - st);
}
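For reference, this is how it might slot into stars_map in place of the qsort percentile step (a sketch, not from the original post):
/* quantum_qselect only partially orders `sorted`, but it guarantees that the
   k-th smallest element ends up at index k, which is all the percentile cut needs */
size_t k = (size_t)(img->w * img->h * detection_percentile);
quantum_t cut = quantum_qselect(sorted, img->w * img->h, k);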

Simple CUDA kernel with Bizarre Result?

I am using a CUDA kernel object in MATLAB in order to fill a 2D array with all 55s. The result is very strange: the 2D array only fills up to a certain point, and after row 1025 the array is all zeros. Any idea what could be going wrong?
As I mentioned in the comment above, you are mistakenly offsetting the matrix rows: each thread's row must be strided by the row length (iterations), not by the number of rows (r_max). With the wrong stride the writes overlap and the tail of the array is never touched, which is exactly the block of zeros you see. The code below is a full working example proving this point.
#include <thrust/device_vector.h>
#include <cstdio>
__global__ void myKern(double* masterForces, int r_max, int iterations) {
    int threadsPerBlock = blockDim.x * blockDim.y;
    int blockId = blockIdx.x + (blockIdx.y * gridDim.x);
    int threadId = threadIdx.x + (threadIdx.y * blockDim.x);
    int globalIdx = (blockId * threadsPerBlock) + threadId;
    //for (int i=0; i<iterations; i++) masterForces[globalIdx * r_max + i] = 55;  // wrong row stride
    for (int i=0; i<iterations; i++) masterForces[globalIdx * iterations + i] = 55;
}
int main() {
    int ThreadBlockSize = 32;
    int GridSize = 32;
    int reps = 1024;
    int iterations = 2000;
    thrust::device_vector<double> gpuF_M(reps*iterations, 0);
    myKern<<<GridSize,ThreadBlockSize>>>(thrust::raw_pointer_cast(gpuF_M.data()), reps, iterations);
    int numerrors = 0;
    for (int i=0; i<reps*iterations; i++) {
        double test = gpuF_M[i];
        if (test != 55) { printf("Error %i %f\n", i, test); numerrors++; }
    }
    printf("Finished!\n");
    printf("The number of errors is = %i\n", numerrors);
    getchar();
    return 0;
}

Search an ordered array in a CUDA kernel

I'm writing a CUDA kernel and each thread has to complete the following task: suppose I have an ordered array a of n unsigned integers (the first one is always 0) stored in shared memory, each thread has to find the array index i such that a[i] ≤ threadIdx.x and a[i + 1] > threadIdx.x.
A naive solution could be:
for (i = 0; i < n - 1; i++)
    if (a[i + 1] > threadIdx.x) break;
but I suppose this is not the optimal way to do it... can anyone suggest anything better?
Like Robert, I was thinking that a binary search has got to be faster than a naïve loop -- the operation count for a binary search has an upper bound of O(log n), compared to O(n) for the loop.
My extremely simple implementation:
#include <iostream>
#include <climits>
#include <assert.h>

__device__ __host__
int midpoint(int a, int b)
{
    return a + (b-a)/2;
}

__device__ __host__
int eval(int A[], int i, int val, int imin, int imax)
{
    int low = (A[i] <= val);
    int high = (A[i+1] > val);
    if (low && high) {
        return 0;
    } else if (low) {
        return -1;
    } else {
        return 1;
    }
}

__device__ __host__
int binary_search(int A[], int val, int imin, int imax)
{
    while (imax >= imin) {
        int imid = midpoint(imin, imax);
        int e = eval(A, imid, val, imin, imax);
        if(e == 0) {
            return imid;
        } else if (e < 0) {
            imin = imid;
        } else {
            imax = imid;
        }
    }
    return -1;
}

__device__ __host__
int linear_search(int A[], int val, int imin, int imax)
{
    int res = -1;
    for(int i=imin; i<(imax-1); i++) {
        if (A[i+1] > val) {
            res = i;
            break;
        }
    }
    return res;
}

template<int version>
__global__
void search(int * source, int * result, int Nin, int Nout)
{
    extern __shared__ int buff[];
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    int val = INT_MAX;
    if (tid < Nin) val = source[threadIdx.x];
    buff[threadIdx.x] = val;
    __syncthreads();
    int res;
    switch(version) {
        case 0:
            res = binary_search(buff, threadIdx.x, 0, blockDim.x);
            break;
        case 1:
            res = linear_search(buff, threadIdx.x, 0, blockDim.x);
            break;
    }
    if (tid < Nout) result[tid] = res;
}

int main(void)
{
    const int inputLength = 128000;
    const int isize = inputLength * sizeof(int);
    const int outputLength = 256;
    const int osize = outputLength * sizeof(int);
    int * hostInput = new int[inputLength];
    int * hostOutput = new int[outputLength];
    int * deviceInput;
    int * deviceOutput;
    for(int i=0; i<inputLength; i++) {
        hostInput[i] = -200 + 5*i;
    }
    cudaMalloc((void**)&deviceInput, isize);
    cudaMalloc((void**)&deviceOutput, osize);
    cudaMemcpy(deviceInput, hostInput, isize, cudaMemcpyHostToDevice);
    dim3 DimBlock(256, 1, 1);
    dim3 DimGrid(1, 1, 1);
    DimGrid.x = (outputLength / DimBlock.x) +
                ((outputLength % DimBlock.x > 0) ? 1 : 0);
    size_t shmsz = DimBlock.x * sizeof(int);
    for(int i=0; i<5; i++) {
        search<1><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
                                                inputLength, outputLength);
    }
    for(int i=0; i<5; i++) {
        search<0><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
                                                inputLength, outputLength);
    }
    cudaMemcpy(hostOutput, deviceOutput, osize, cudaMemcpyDeviceToHost);
    for(int i=0; i<outputLength; i++) {
        int idx = hostOutput[i];
        int tidx = i % DimBlock.x;
        assert( (hostInput[idx] <= tidx) && (tidx < hostInput[idx+1]) );
    }
    cudaDeviceReset();
    return 0;
}
gave about a five-times speedup compared to the loop:
>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
Time(%) Time Calls Avg Min Max Name
60.11 157.85us 1 157.85us 157.85us 157.85us [CUDA memcpy HtoD]
32.58 85.55us 5 17.11us 16.63us 19.04us void search<int=1>(int*, int*, int, int)
6.52 17.13us 5 3.42us 3.35us 3.73us void search<int=0>(int*, int*, int, int)
0.79 2.08us 1 2.08us 2.08us 2.08us [CUDA memcpy DtoH]
I'm sure that someone clever could do a lot better than that. But perhaps this gives you at least a few ideas.
can anyone suggest anything better?
A brute force approach would be to have each thread do a binary search (on threadIdx.x + 1).
// sets idx to the index of the first element in a that is
// equal to or larger than key
__device__ void bsearch_range(const int *a, const int key, const unsigned len_a, unsigned *idx){
    unsigned lower = 0;
    unsigned upper = len_a;
    unsigned midpt;
    while (lower < upper){
        midpt = (lower + upper)>>1;
        if (a[midpt] < key) lower = midpt + 1;
        else upper = midpt;
    }
    *idx = lower;
    return;
}

__global__ void find_my_idx(const int *a, const unsigned len_a, int *my_idx){
    unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
    unsigned sp_a;
    int val = idx + 1;
    bsearch_range(a, val, len_a, &sp_a);
    my_idx[idx] = ((val - 1) < a[sp_a]) ? sp_a : -1;
}
This is coded in the browser, not tested; it's hacked from a piece of working code, however. If you have trouble making it work, I can revisit it. I don't recommend this approach on a device without caches (cc 1.x devices).
This is actually searching on the full unique 1D thread index (blockDim.x * blockIdx.x + threadIdx.x + 1). You can change val to be anything you like.
You could also add an appropriate thread check in case the number of threads you intend to launch is greater than the length of your my_idx result vector, as sketched below.
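A guarded variant of the kernel (a sketch; out_len is a hypothetical extra parameter giving the length of the result vector):
__global__ void find_my_idx_guarded(const int *a, const unsigned len_a,
                                    int *my_idx, const unsigned out_len){
    unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (idx >= out_len) return;  // surplus threads exit without touching my_idx
    unsigned sp_a;
    int val = idx + 1;
    bsearch_range(a, val, len_a, &sp_a);
    my_idx[idx] = ((val - 1) < a[sp_a]) ? sp_a : -1;
}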
I imagine there is a more clever approach that may use something akin to prefix sums.
This is the best algorithm so far. It's called LPW Indexed Search:
__global__ void find_position_lpw(int *a, int n)
{
    int idx = threadIdx.x;
    __shared__ int aux[ MAX_THREADS_PER_BLOCK /*1024*/ ];
    aux[idx] = 0;
    if (idx < n)
        atomicAdd( &aux[a[idx]], 1); // atomics in case there are duplicates
    __syncthreads();
    // inclusive prefix sum (Hillis-Steele scan) over the per-value counts
    int tmp;
    for (int j = 1; j <= MAX_THREADS_PER_BLOCK / 2; j <<= 1)
    {
        if( idx >= j ) tmp = aux[idx - j];
        __syncthreads();
        if( idx >= j ) aux[idx] += tmp;
        __syncthreads();
    }
    // result in "i"
    int i = aux[idx] - 1;
    // use "i" here...
    // ...
}

How to Optimize CUDA Sieve of Eratosthenes [closed]

I'm new to CUDA. To get my hands dirty, I tried writing a Sieve of Eratosthenes (for finding all the primes up to some number n).
There are a number of things I had to do to get it to work that, it seems, shouldn't have been necessary. I'm curious whether anyone knows of a more natural (and still CUDA-optimized) approach.
To compact the entries marked as prime in the isPrime array, I had to make two separate kernel calls. The first counts the number of primes in each thread block and assigns to each entry i the number of primes in that block with index less than i. Then I have to make a second call to add in the number of primes in all the previous blocks in order to get the final index.
But it's even worse than that, because to avoid heaps of concurrent reads, I had to store the number of primes in the block in a separate array at each of THREADS_PER_BLOCK indices, effectively doubling the required memory for the algorithm. It seems like there should be a way to have all the threads read the same value for each block rather than copying it so many times; a per-block shared-memory broadcast along the lines sketched below is what I have in mind.
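A rough sketch of that broadcast (countsPerBlock is a hypothetical array with one entry per block):
__global__ void useBlockCount(const int *countsPerBlock) {
    __shared__ int blockCount;
    if (threadIdx.x == 0)
        blockCount = countsPerBlock[blockIdx.x]; // one global read per block
    __syncthreads();                             // now every thread can read blockCount
    // ... use blockCount ...
}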
Despite all this, there's still the problem of concurrent reads in the clearMultiples method. Especially for small primes like 2 and 3, every thread has to read the value in. Isn't there any way to deal with this?
Could anyone look at my code and tell me if there's anything obvious I could do that would be simpler or more efficient?
Is there anything I'm doing that's particularly inefficient (besides printing out all the primes at the end of course)?
Is it necessary to call synchronize after every kernel call?
Do I need to synchronize after memcpy's as well?
Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work?
Thank you
#include <stdio.h>
#include <cuda.h>
#include <assert.h>
#include <math.h>
#define MAX_BLOCKS 256
#define THREADS_PER_BLOCK 256 //Must be a power of 2
#define BLOCK_SPACE 2 * THREADS_PER_BLOCK
__global__ void initialize(int* isPrime, int n) {
    int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
    int step = gridDim.x * THREADS_PER_BLOCK;
    int i;
    for (i = idx; i <= 1; i += step) {
        isPrime[i] = 0;
    }
    for (; i < n; i += step) {
        isPrime[i] = 1;
    }
}
__global__ void clearMultiples(int* isPrime, int* primeList, int startInd,
        int endInd, int n) {
    int yidx = blockIdx.y * blockDim.y + threadIdx.y;
    int xidx = blockIdx.x * blockDim.x + threadIdx.x;
    int ystep = gridDim.y * blockDim.y;
    int xstep = gridDim.x * blockDim.x;
    for (int pnum = startInd + yidx; pnum < endInd; pnum += ystep) {
        int p = primeList[pnum];
        int pstart = p * (p + xidx);
        int pstep = p * xstep;
        for (int i = pstart; i < n; i += pstep) {
            isPrime[i] = 0;
        }
    }
}
__device__ void makeCounts(int* isPrime, int* addend, int start, int stop) {
    __shared__ int tmpCounts[BLOCK_SPACE];
    __shared__ int dumbCounts[BLOCK_SPACE];
    int idx = threadIdx.x;
    tmpCounts[idx] = ((start + idx) < stop) ? isPrime[start + idx] : 0;
    __syncthreads();
    int numEntries = THREADS_PER_BLOCK;
    int cstart = 0;
    while (numEntries > 1) {
        int prevStart = cstart;
        cstart += numEntries;
        numEntries /= 2;
        if (idx < numEntries) {
            int i1 = idx * 2 + prevStart;
            tmpCounts[idx + cstart] = tmpCounts[i1] + tmpCounts[i1 + 1];
        }
        __syncthreads();
    }
    if (idx == 0) {
        dumbCounts[cstart] = tmpCounts[cstart];
        tmpCounts[cstart] = 0;
    }
    while (cstart > 0) {
        int prevStart = cstart;
        cstart -= numEntries * 2;
        if (idx < numEntries) {
            int v1 = tmpCounts[idx + prevStart];
            int i1 = idx * 2 + cstart;
            tmpCounts[i1 + 1] = tmpCounts[i1] + v1;
            tmpCounts[i1] = v1;
            dumbCounts[i1] = dumbCounts[i1 + 1] = dumbCounts[idx + prevStart];
        }
        numEntries *= 2;
        __syncthreads();
    }
    if (start + idx < stop) {
        isPrime[start + idx] = tmpCounts[idx];
        addend[start + idx] = dumbCounts[idx];
    }
}
__global__ void createCounts(int* isPrime, int* addend, int lb, int ub) {
    int step = gridDim.x * THREADS_PER_BLOCK;
    for (int i = lb + blockIdx.x * THREADS_PER_BLOCK; i < ub; i += step) {
        int start = i;
        int stop = min(i + step, ub);
        makeCounts(isPrime, addend, start, stop);
    }
}
__global__ void sumCounts(int* isPrime, int* addend, int lb, int ub,
        int* totalsum) {
    int idx = blockIdx.x;
    int s = 0;
    for (int i = lb + idx; i < ub; i += THREADS_PER_BLOCK) {
        isPrime[i] += s;
        s += addend[i];
    }
    if (idx == 0) {
        *totalsum = s;
    }
}
__global__ void condensePrimes(int* isPrime, int* primeList, int lb, int ub,
        int primeStartInd, int primeCount) {
    int idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
    int step = gridDim.x * THREADS_PER_BLOCK;
    for (int i = lb + idx; i < ub; i += step) {
        int term = isPrime[i];
        int nextTerm = i + 1 == ub ? primeCount : isPrime[i + 1];
        if (term < nextTerm) {
            primeList[primeStartInd + term] = i;
        }
    }
}
int main(void) {
    printf("Enter upper bound:\n");
    int n;
    scanf("%d", &n);
    int *isPrime, *addend, *numPrimes, *primeList;
    cudaError_t t = cudaMalloc((void**) &isPrime, n * sizeof(int));
    assert(t == cudaSuccess);
    t = cudaMalloc(&addend, n * sizeof(int));
    assert(t == cudaSuccess);
    t = cudaMalloc(&numPrimes, sizeof(int));
    assert(t == cudaSuccess);
    int primeBound = 2 * n / log(n);
    t = cudaMalloc(&primeList, primeBound * sizeof(int));
    assert(t == cudaSuccess);
    int numBlocks = min(MAX_BLOCKS,
            (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
    initialize<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, n);
    t = cudaDeviceSynchronize();
    assert(t == cudaSuccess);
    int bound = (int) ceil(sqrt(n));
    int lb;
    int ub = 2;
    int primeStartInd = 0;
    int primeEndInd = 0;
    while (ub < n) {
        if (primeEndInd > primeStartInd) {
            int lowprime;
            t = cudaMemcpy(&lowprime, primeList + primeStartInd, sizeof(int),
                    cudaMemcpyDeviceToHost);
            assert(t == cudaSuccess);
            int numcols = n / lowprime;
            int numrows = primeEndInd - primeStartInd;
            int threadx = min(numcols, THREADS_PER_BLOCK);
            int thready = min(numrows, THREADS_PER_BLOCK / threadx);
            int blockx = min(numcols / threadx, MAX_BLOCKS);
            int blocky = min(numrows / thready, MAX_BLOCKS / blockx);
            dim3 gridsize(blockx, blocky);
            dim3 blocksize(threadx, thready);
            clearMultiples<<<gridsize, blocksize>>>(isPrime, primeList,
                    primeStartInd, primeEndInd, n);
            t = cudaDeviceSynchronize();
            assert(t == cudaSuccess);
        }
        lb = ub;
        ub *= 2;
        if (lb >= bound) {
            ub = n;
        }
        numBlocks = min(MAX_BLOCKS,
                (ub - lb + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
        createCounts<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, addend, lb, ub);
        t = cudaDeviceSynchronize();
        assert(t == cudaSuccess);
        sumCounts<<<THREADS_PER_BLOCK, 1>>>(isPrime, addend, lb, ub, numPrimes);
        t = cudaDeviceSynchronize();
        assert(t == cudaSuccess);
        int primeCount;
        t = cudaMemcpy(&primeCount, numPrimes, sizeof(int),
                cudaMemcpyDeviceToHost);
        assert(t == cudaSuccess);
        assert(primeCount > 0);
        primeStartInd = primeEndInd;
        primeEndInd += primeCount;
        condensePrimes<<<numBlocks, THREADS_PER_BLOCK>>>(isPrime, primeList, lb,
                ub, primeStartInd, primeCount);
        t = cudaDeviceSynchronize();
        assert(t == cudaSuccess);
    }
    int finalprimes[primeEndInd];
    t = cudaMemcpy(finalprimes, primeList, primeEndInd * sizeof(int),
            cudaMemcpyDeviceToHost);
    assert(t == cudaSuccess);
    t = cudaFree(isPrime);
    assert(t == cudaSuccess);
    t = cudaFree(addend);
    assert(t == cudaSuccess);
    t = cudaFree(numPrimes);
    assert(t == cudaSuccess);
    t = cudaFree(primeList);
    assert(t == cudaSuccess);
    for (int i = 0; i < primeEndInd; i++) {
        if (i % 16 == 0)
            printf("\n");
        else
            printf(" ");
        printf("%4d", finalprimes[i]);
    }
    printf("\n");
    return 0;
}
Answering some of your questions:
Fix your error checking, as discussed in the comments; a typical checking macro is sketched below.
Define what you mean by "concurrent reads". You're concerned about them, but I'm not sure what you mean by the term.
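One widely used variant of such a macro (a sketch, not from the original answer; needs <stdlib.h> for exit):
// report and abort on any pending CUDA error; also use right after kernel launches
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)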
Is it necessary to call synchronize after every kernel call?
No, it isn't. If your code is not working correctly, synchronizing after every kernel call and then doing proper error checking will tell you if any kernels are not launching correctly. Synchronization is generally not needed for relatively simple single-stream programs like this one. The CUDA calls that need to synchronize, like cudaMemcpy, will do this automatically for you.
Do I need to synchronize after memcpy's as well?
No, cudaMemcpy is synchronous in nature (it will force all CUDA calls in the same stream to complete before it begins, and it will not return control to the host thread until the copy is complete). If you don't want the blocking behavior (not returning control to the host thread until complete), you can use the cudaMemcpyAsync version of the call, as sketched below. You would use streams to get around the behavior of forcing all previous CUDA calls to complete.
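A minimal sketch of the asynchronous variant (dst, src and nbytes are placeholders here; the host buffer should be pinned memory for the copy to truly overlap):
cudaStream_t stream;
cudaStreamCreate(&stream);
// the call returns to the host immediately; the copy is queued in `stream`
cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToHost, stream);
// ... independent host work, or kernels launched in other streams, can run here ...
cudaStreamSynchronize(stream);  // block until the copy has finished
cudaStreamDestroy(stream);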
Finally, how come when I set THREADS_PER_BLOCK to 512 it doesn't work?
Please define what you mean by "it doesn't work". I compiled your code with THREADS_PER_BLOCK of 512 and 256, and for an upper bound of 1000 it gave the same output in each case.

Decompression stops midway and output file is filled with zeros (black pixels)?

I am trying to apply DCT (discrete cosine transform) compression to a bmp (bitmap) file. I have a C file which I am running in Turbo C++. This is not actually compressing, but I was trying to implement the DCT and IDCT. The code is as follows:
/*
The image to be compressed is a bmp with 24 bpp and
with name "college4.bmp", of dimensions 200*160, i.e. 25*20 8*8 blocks.
O/p is college2.dat.
Format: 8-bit signed integers, starting row-wise from 0,0 to 8,8;
the coefficients' order is blue, green, red,
for block no 1, then 2, and so on.
*/
#include<stdlib.h>
#include<stdio.h>
#include<math.h>
#define WIDTH 25
#define HEIGHT 20
typedef struct {
    unsigned int type;
    unsigned long int filesize;
    unsigned int reserved1,reserved2;
    unsigned long int offset;
} BMPHEAD;
typedef struct {
    unsigned long int infosize;
    unsigned long int width,height;
    unsigned int planes,bitsperpixel;
    unsigned long int compression;
    unsigned long int sizeimage;
    long int xpelspermeter,ypelspermeter;
    unsigned long int colorused,colorimportant;
} INFOHEAD;
typedef struct {
    char rgbquad[4];
} colortable;
BMPHEAD bmphead;
INFOHEAD infohead;
FILE *bmp_fp1,*bmp_fp2;
int buf[WIDTH][8][8][3],buf1[WIDTH][8][8][3];
float pi=3.14159265,DCTcoeff[8][8][8][8];
void generatedctcoeff() {
    int y, i, j, x;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            for (x = 0; x < 8; x++) {
                for (y = 0; y < 8; y++) {
                    DCTcoeff[i][j][x][y] = cos(((2 * y + 1) * pi * j) / 16)
                                         * cos(((2 * x + 1) * i * pi) / 16);
                }
            }
        }
    }
}
void outputtofile1() {                    // Write into college2.dat.
    int i, j, x, y, blockno;              // One block at a time; buf contains pixel
    int redcoef, greencoef, bluecoef;     // data of one row of blocks.
    float gijred, gijgreen, gijblue, c, ci, cj;
    c = 1 / (sqrt(2));
    for (blockno = 0; blockno < WIDTH; blockno++) {
        for (i = 0; i < 8; i++) {
            for (j = 0; j < 8; j++) {
                gijred = 0;
                gijgreen = 0;
                gijblue = 0;
                for (x = 0; x < 8; x++) {
                    for (y = 0; y < 8; y++) {
                        gijblue = gijblue + DCTcoeff[i][j][x][y] * buf[blockno][x][y][0];
                        gijgreen = gijgreen + DCTcoeff[i][j][x][y] * buf[blockno][x][y][1];
                        gijred = gijred + DCTcoeff[i][j][x][y] * buf[blockno][x][y][2];
                    }
                }
                ci = cj = 1.0;
                if (i == 0)
                    ci = c;
                if (j == 0)
                    cj = c;
                gijblue = ci * cj * gijblue / 4;
                gijgreen = ci * cj * gijgreen / 4;
                gijred = ci * cj * gijred / 4;
                bluecoef = (int) gijblue;
                greencoef = (int) gijgreen;
                redcoef = (int) gijred;
                fprintf(bmp_fp2, "%d %d %d ", bluecoef, greencoef, redcoef);
            }
        }
    } /* end of one block processing */
}
void compressimage() {
    int rowcount,x,y;
    bmp_fp1=fopen("college4.bmp","r");
    bmp_fp2=fopen("college2.dat","w");
    printf("generating coefficients...\n");
    generatedctcoeff();
    if(bmp_fp1==NULL) {
        printf("can't open");
        return;
    }
    printf("compressing....\n");
    fread(&bmphead,1,sizeof(bmphead),bmp_fp1);
    fread(&infohead,1,sizeof(infohead),bmp_fp1);
    fseek(bmp_fp1,bmphead.offset,SEEK_SET);
    for(rowcount=0;rowcount<HEIGHT;rowcount++) {
        for(y=0;y<8;y++) {
            for(x=0;x<infohead.width;x++) {
                buf[x/8][x%8][y][0]=(int)fgetc(bmp_fp1);
                buf[x/8][x%8][y][1]=(int)fgetc(bmp_fp1);
                buf[x/8][x%8][y][2]=(int)fgetc(bmp_fp1);
            }
        }
        outputtofile1(); // output contents of buf after DCT to file
    }
    fclose(bmp_fp1);
    fclose(bmp_fp2);
}
void outputtofile2() {                          // Output buf to college3.bmp.
    int i, j, x, y, blockno;                    // buf now contains coefficients;
    float pxyred, pxygreen, pxyblue, c, ci, cj; // a temp buffer buf1 is used to
    c = 1 / (sqrt(2));                          // store one row of blocks of
    for (blockno = 0; blockno < WIDTH; blockno++) { // decoded pixel values.
        for (x = 0; x < 8; x++)
            for (y = 0; y < 8; y++) {
                pxyred = 0;
                pxygreen = 0;
                pxyblue = 0;
                for (j = 0; j < 8; j++) {
                    cj = 1.0;
                    if (j == 0)
                        cj = c;
                    for (i = 0; i < 8; i++) {
                        ci = 1.0;
                        if (i == 0)
                            ci = c;
                        pxyblue = pxyblue + ci * cj * DCTcoeff[i][j][y][x] * buf[blockno][i][j][0];
                        pxygreen = pxygreen + ci * cj * DCTcoeff[i][j][y][x] * buf[blockno][i][j][1];
                        pxyred = pxyred + ci * cj * DCTcoeff[i][j][y][x] * buf[blockno][i][j][2];
                    }
                }
                pxyblue /= 4;
                pxygreen /= 4;
                pxyred /= 4;
                buf1[blockno][y][x][0] = pxyblue;
                buf1[blockno][y][x][1] = pxygreen;
                buf1[blockno][y][x][2] = pxyred;
            }
    }
    for (y = 0; y < 8; y++) {
        for (blockno = 0; blockno < WIDTH; blockno++)
            for (x = 0; x < 8; x++) {
                fprintf(bmp_fp2, "%c%c%c", (char) buf1[blockno][x][y][0],
                        (char) buf1[blockno][x][y][1],
                        (char) buf1[blockno][x][y][2]);
            }
    }
}
void uncompressimage() {
    int blue,green,red,rowcount,colcount,i,j;
    bmp_fp1=fopen("college2.dat","r");
    bmp_fp2=fopen("college3.bmp","w");
    printf("generating coefficients...\n");
    generatedctcoeff();
    if (bmp_fp1==NULL) {
        printf("open failed");
        return;
    }
    printf("uncompressing....\n");
    bmphead.type=0x4d42;
    bmphead.filesize=30518;
    bmphead.reserved1=0;
    bmphead.reserved2=0;
    bmphead.offset=sizeof(bmphead)+sizeof(infohead);
    infohead.infosize=sizeof(infohead);
    infohead.width=200;
    infohead.height=160;
    infohead.planes=1;
    infohead.bitsperpixel=24;
    infohead.compression=0;
    infohead.sizeimage=0;
    infohead.xpelspermeter=3780;
    infohead.ypelspermeter=3780;
    infohead.colorused=0;
    infohead.colorimportant=0;
    fwrite(&bmphead,sizeof(BMPHEAD),1,bmp_fp2);
    fwrite(&infohead,sizeof(INFOHEAD),1,bmp_fp2);
    for(rowcount=0;rowcount<HEIGHT;rowcount++) {
        for(colcount=0;colcount<WIDTH;colcount++) {
            for(i=0;i<8;i++) {
                for(j=0;j<8;j++) {
                    fscanf(bmp_fp1,"%d",&blue);
                    fscanf(bmp_fp1,"%d",&green);
                    fscanf(bmp_fp1,"%d",&red);
                    buf[colcount][i][j][0]=blue;
                    buf[colcount][i][j][1]=green;
                    buf[colcount][i][j][2]=red;
                }
            }
        }
        outputtofile2();
    }
    fclose(bmp_fp1);
    fclose(bmp_fp2);
}
int main() {
    printf("opening files...\n");
    compressimage();
    printf("opening files...again\n");
    uncompressimage();
    printf("successful decompression\nenter any key\n");
    return 0;
}
Here is the image I am using as input (sorry, the site converted the bmp into png; you may convert it back to bmp to use it).
Here is the image that is generated:
The file college3.bmp that gets created is 200x160 and 93.8 kB in size. Up to about a quarter of the image the coefficients have been decoded correctly, but from there on the file is filled with black pixels. I took a screenshot of the output, as the site said it was not a valid bmp while uploading. I have been sitting on this problem since Feb 2004. If anyone can tell me where the bug is, I would be very thankful. I have analysed the output file and found an EOF right at the place where the pixels start turning black. I read some other questions on the topic and found that the conversion factors ci and cj have been used improperly; while coding I had also got confused with the indices x, y, i and j, so I hope to solve that part in a few days.
Apparently, the problem in the above code is in how you open your files. In text mode the DOS runtime translates line endings and treats a 0x1A byte as end-of-file, which is exactly the premature EOF you found in the middle of your data, so binary files must be opened in binary mode.
This is what should be in your code (note the explicitly specified open modes, binary and text):
void compressimage() {
    ...
    bmp_fp1=fopen("college4.bmp","rb");
    bmp_fp2=fopen("college2.dat","wt");
    ...
}
void uncompressimage() {
    ...
    bmp_fp1=fopen("college2.dat","rt");
    bmp_fp2=fopen("college3.bmp","wb");
    ...
}
With that and slightly altered structure definitions:
#pragma pack(push,1)
typedef struct {
    unsigned short int type;
    unsigned long int filesize;
    unsigned short int reserved1,reserved2;
    unsigned long int offset;
} BMPHEAD;
typedef struct {
    unsigned long int infosize;
    unsigned long int width,height;
    unsigned short int planes,bitsperpixel;
    unsigned long int compression;
    unsigned long int sizeimage;
    long int xpelspermeter,ypelspermeter;
    unsigned long int colorused,colorimportant;
} INFOHEAD;
typedef struct {
    char rgbquad[4];
} colortable;
#pragma pack(pop)
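If in doubt, the packed layout can be verified at run time, e.g. at the top of main() (a sketch, not from the original answer; 14 and 40 are the standard on-disk sizes of the BMP file header and info header, assuming 2-byte shorts and 4-byte longs as on the compilers mentioned below):
assert(sizeof(BMPHEAD) == 14);   /* BITMAPFILEHEADER is 14 bytes on disk */
assert(sizeof(INFOHEAD) == 40);  /* BITMAPINFOHEADER is 40 bytes on disk */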
I'm able to compile your program successfully using 3 different compilers (Turbo C++, Open Watcom, gcc) and get the desired output picture.
