I'm new in Cuda technology. I need help a CUDA find in binary (monochromatic) image only pixels, that have value white (255). Pixels are then required to sort the output array. My solution is based on critical section. However, it gives incorrect results.
//----- call kernel: -----
{
const dim3 block(16,16);
const dim3 grid(divUp(_binImg.cols, block.x), divUp(_binImg.rows, block.y));
// others allocations, declarations ...
cudaCalcWhitePixels<<<grid, block>>>(_binImg, _index, _pointsX, _pointsY);
}
__device__ int lock = 0;
__global__ void cudaCalcWhitePixels(cv::gpu::PtrStepSzb _binImg, int *_index, int *_pointsX, int *_pointsY)
{
extern int lock;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__syncthreads();
if(x < _binImg.cols && y < _binImg.rows)
{
if(_binImg.ptr(y)[x] == 255)
{
do{} while(atomicCAS(&lock, 0, 1) != 0)
//----- critical section ------
_pointsX[*_index] = x;
_pointsY[*_index] = y;
(*_index)++;
lock = 0;
//----- end CS ------
}
}
}
It seems to me that the critical section is not working properly. White pixels in the image will be represented approximately 1%.
Could you please advise me? Thank you and have a nice day :)
EDIT:
solution:
__global__ void cudaCalcWhitePixels(cv::gpu::PtrStepSzb _binImg, int *_index, int *_pointsX, int *_pointsY)
{
int myIndex = 0;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__syncthreads();
if(x < _binImg.cols && y < _binImg.rows)
{
if(_binImg.ptr(y)[x] == 255)
{
//----- critical section ------
myIndex = atomicAdd(_index, 1);
_pointsX[myIndex] = x;
_pointsY[myIndex] = y;
//----- end CS ------
}
}
}
This code from the following URL may assist you understand how to use atomicCAS() to create a critical section.
https://github.com/ArchaeaSoftware/cudahandbook/blob/master/memory/spinlockReduction.cu
class cudaSpinlock {
public:
cudaSpinlock( int *p );
void acquire();
void release();
private:
int *m_p;
};
inline __device__
cudaSpinlock::cudaSpinlock( int *p )
{
m_p = p;
}
inline __device__ void
cudaSpinlock::acquire( )
{
while ( atomicCAS( m_p, 0, 1 ) );
}
inline __device__ void
cudaSpinlock::release( )
{
atomicExch( m_p, 0 );
}
Since (*_index)++; is the only atomic operation you do in CS, you could consider use atomicAdd() instead.
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd
On the other hand, you could try to use thrust::copy_if() to simplify the coding.
Related
I am trying to allocate a 3D array u[-nx/2:nx/2-1][-nx/2:nx/2-1][-nx/2:nx/2-1]
int nx = 512;
double *** u = (double ***)malloc(nx * sizeof(double**));
for (int i = -nx/2; i < nx/2; i++) {
u[i] = (double **)malloc(nx * sizeof(double *));
for (int j = -nx/2; j < nx/2; j++) {
u[i][j] = (double *)malloc(nx * sizeof(double));
}
}
Is this a correct way to do it? If it's not, how should I change it?
No, that’s not correct. You can get it to work by placing every pointer in the middle of the dimension it represents:
int nx = 512;
double*** u = (double***)malloc(nx * sizeof(double**)) + nx/2;
for (int i = -nx/2; i < nx/2; i++) {
u[i] = (double**)malloc(nx * sizeof(double*)) + nx/2;
for (int j = -nx/2; j < nx/2; j++) {
u[i][j] = (double*)malloc(nx * sizeof(double)) + nx/2;
}
}
but that’s unusual and confusing, does a lot of separate allocations, and has to be undone for the deallocation step.
Consider one block with accessors instead:
#define NX 512
/* or just double* if nx is dynamic, and calculate the index manually */
double[NX][NX][NX]* u = malloc(sizeof(*u));
double array_get(double[NX][NX][NX] const* u, int i, int j, int k) {
return u[i + NX/2][j + NX/2][k + NX/2];
}
void array_set(double[NX][NX][NX]* u, int i, int j, int k, double value) {
u[i + NX/2][j + NX/2][k + NX/2] = value;
}
No.
Array in C is actually just plain/flat memory block, which is always 0 based and always in 1d (one demension).
Suppose you need a 3d array in arbitrary boundary,
say u[lb_1d, ub_1d][lb_2d, ub_2d][lb_3d, ub_3d],
you will need to do some mapping -- address space from 3d to 1d and vice versa --.
Sample implementation like this:
typedef struct
{
double* _arr;
int _lb_1d;
int _ub_1d;
int _lb_2d;
int _ub_2d;
int _lb_3d;
int _ub_3d;
}DoubleArr3D;
DoubleArr3D* create_3d_arr(int lb_1d, int ub_1d, int lb_2d, int ub_2d, int lb_3d, int ub_3d)
{
int array_size = (ub_1d - lb_1d +1) * (ub_2d - lb_2d +1) * (ub_3d - lb_3d +1);
DoubleArr3D * arr = (DoubleArr3D *)malloc( sizeof( DoubleArr3D) );
if (!arr)
{
return NULL;
}
arr->_lb_1d = lb_1d;
arr->_ub_1d = ub_1d;
arr->_lb_2d = lb_2d;
arr->_ub_2d = ub_2d;
arr->_lb_3d = lb_3d;
arr->_ub_3d = ub_3d;
arr->_arr = (double*) malloc(sizeof(double) * (size_t) array_size);
if (!arr)
{
free(arr);
return NULL;
}
return arr;
}
// arr[i1d, i2d, i3d] ==> arr_get_at(arr, i1d, i2d, i3d)
double arr_get_at(DoubleArr3D* arr, int i1d, int i2d, int i3d )
{
if (!arr || !arr->_arr)
{
// just demo of 'validation check'. in real code we should have meanful error report
return 0;
}
return arr->_arr [
i3d - arr->_lb_3d
+ (i2d - arr->_lb_2d ) * (arr->_ub_3d - arr->_lb_3d +1)
+ (i1d - arr->_lb_1d ) * (arr->_ub_2d - arr->_lb_2d +1) * (arr->_ub_3d - arr->_lb_3d +1)
];
}
First off, all C arrays have index values ranging from 0 to ELEMENT_COUNT-1. Always.
As you are using malloc, I am presuming that the value of nx is only defined at runtime. This rules out static array sizes and thus rules out using the cute arr[x][y][z] syntax as in:
#define NX 512
double arr[NX][NX][NX];
void foo(void)
{
...
arr[z1][y1][x1] += 2 * arr[z2][y2][x2];
...
}
That in turn means that to have the functionality of a 3D array with nx different values for each of its three dimensions dimension, you will need to allocate a linear memory area of size nx_cubed = nx * nx * nx. To calculate that value nx_cubed properly, you will need to check for integer overflows.
Also, you need to properly convert from signed int coordinate values to unsigned size_t values used in the 0 based index ranges.
if (nx < 0) {
fprintf(stderr, "negative value of nx\n");
exit(EXIT_FAILURE);
}
const size_t unx = nx;
const size_t nx_cubed = unx * unx * unx;
/* TODO: Complete check for overflows */
if (nx_cubed < unx) {
fprintf(stderr, "nx_cubed overflow\n");
exit(EXIT_FAILURE);
}
Then you can allocate a memory buffer of the appropriate size, and then check that the malloc call has actually worked.
double *buf = malloc(nx_cubed);
if (!buf) {
fprintf(stderr, "Error allocating memory for nx_cubed elements\n");
exit(EXIT_FAILURE);
}
Now there is the question of calculcating the array index from your x, y, and z values each ranging from -nx/2 to nx/2-1. I recommend writing a function for that which maps that range to the 0 to nx-1 range, and then calculates the proper linear index from the three 0-based values. Again, proper integer overflow checks should be performed.
size_t array3index(const size_t nx, const int x, const int y, const int z) {
const size_t half_nx = nx/2;
/* zero based 3D coordinates,
* this probably triggers some signedness warnings */
const size_t x0 = half_nx + x;
const size_t y0 = half_nx + y;
const size_t z0 = half_nx + z;
if ((x0 >= nx) || (y0 >= nx) || (z0 >= nx)) {
fprintf(stderr, "Signed coordinate(s) out of range: (%d, %d, %d)\n",
x, y, z);
exit(EXIT_FAILURE);
}
const size_t idx = nx * (nx * z0 + y0) + x0;
/* Assuming that we have already checked that nx*nx*nx does not
* overflow, and given that we have checked for x0, y0, z0 to be
* in the range of 0 to (nx-1), the idx calculation should not
* have overflown here. */
return idx;
}
Then you can do your accesses to the 3D array like
const i1 = array3index(nx, x1, y1, z1);
const i2 = array3index(nx, x2, y2, z2);
buf[i1] += 2*buf[i2];
Considering the amount of calculations needed inside array3index, I would examine whether it makes more sense to do the array iteration in the 0 to nx-1 domain directly, and only convert that to -nx/2 to nx/2-1 range values if you actually need that value within a calculation.
I'm currently working with CUDA programming and I'm trying to learn off of slides from a workshop I found online, which can be found here. The problem I am having is on slide 48. The following code can be found there:
__global__ void stencil_1d(int *in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
// Read input elements into shared memory
temp[lindex] = in[gindex];
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS];
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
}
....
To add a bit of context. We have an array called in which as length say N. We then have another array out which has length N+(2*RADIUS), where RADIUS has a value of 3 for this particular example. The idea is to copy array in, into array out but to place the array in in position 3 from the beginning of array out i.e out = [RADIUS][in][RADIUS], see slide for graphical representation.
The confusion comes in on the following line:
temp[lindex - RADIUS] = in[gindex - RADIUS];
If gindex is 0 then we have in[-3]. How can we read from a negative index in an array? Any help would really be appreciated.
The answer by pQB is correct. You are supposed to offset the input array pointer by RADIUS.
To show this, I'm providing below a full worked example. Hope it would be beneficial to other users.
(I would say you would need a __syncthreads() after the shared memory loads. I have added it in the below example).
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCKSIZE 32
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int N) {
__shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];
unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int lindexx = threadIdx.x + RADIUS;
// --- Read input elements into shared memory
temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
if (threadIdx.x < RADIUS) {
temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexx + offset];
}
// --- Store the result
out[gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const unsigned int N = 55 + 2 * RADIUS;
const unsigned int constant = 4;
thrust::device_vector<unsigned int> d_in(N, constant);
thrust::device_vector<unsigned int> d_out(N);
moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int i=0; i<N; i++)
printf("Element i = %i; h_out = %i\n", i, h_out[i]);
return 0;
}
You are assuming that in array points to the first position of the memory that has been allocated for this array. However, if you see slide 47, the in array has a halo (orange boxes) of three elements before and after of the data (represented as green cubes).
My assumption is (I have not done the workshop) that the input array is first initialized with an halo and then the pointer is moved in the kernel call. Something like:
stencil_1d<<<dimGrid, dimBlock>>>(in + RADIUS, out);
So, in the kernel, it's safe to do in[-3] because the pointer is not at the beginning of the array.
There are already good answers, but to focus on the actual point that caused the confusion:
In C (not only in CUDA, but in C in general), when you access an "array" by using the [ brackets ], you are actually doing pointer arithmetic.
For example, consider a pointer like this:
int* data= ... // Points to some memory
When you then write a statement like
data[3] = 42;
you are just accessing a memory location that is "three entries behind the original data pointer". So you could also have written
int* data= ... // Points to some memory
int* dataWithOffset = data+3;
dataWithOffset[0] = 42; // This will write into data[3]
and consequently,
dataWithOffset[-3] = 123; // This will write into data[0]
In fact, you can say that data[i] is the same as *(data+i), which is the same as *(i+data), which in turn is the same as i[data], but you should not use this in real programs...)
I can compile #JackOLantern's code, but there is an warning: "pointless comparison of unsigned integer with zero":
And when run, it will abort like:
I have modified the code to the following and the warning disappeared and it can get right result:
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCKSIZE 32
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, int N) {
__shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];
int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
int lindexx = threadIdx.x + RADIUS;
// --- Read input elements into shared memory
temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
if (threadIdx.x < RADIUS) {
temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexx + offset];
}
// --- Store the result
out[gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const int N = 55 + 2 * RADIUS;
const unsigned int constant = 4;
thrust::device_vector<unsigned int> d_in(N, constant);
thrust::device_vector<unsigned int> d_out(N);
moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int i=0; i<N; i++)
printf("Element i = %i; h_out = %i\n", i, h_out[i]);
return 0;
}
The result is like this:
I've got a Jacobi implementation on CUDA, but the problem is:
I assign threads at this way:
#define imin(a,b) (a < b ? a : b)
int dimBlocks, dimThreads;
dimThreads = 256;
dimBlocks = imin(32, (dimThreads + dim - 1)/dimThreads);
But if I use 32 threads it's fastest than using 256 threads or moreover...
I've got these results:
Sequential times:
9900 5.882000
9900 6.071000
Parallel times:
9900 1.341000 //using 32
9900 1.626000 //using 256
Where 9900 is matrix WIDTH... And we can see the following:
5.882 / 1.34 = 4.39
6.07 / 1.62 = 3.74
So 32 threads is more efficient than 256?
Sorry, I don't know if I should upload the code(since they are a bit long), if you request it I will do it.
EDIT:
//Based on doubletony algorithm
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Jacobi.cuh"
#include "thrust\host_vector.h"
#include "thrust\device_vector.h"
#include "thrust\extrema.h"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#define imin(a,b) (a < b ? a : b)
// name OF FUNCTION: __copy_vector
// PURPOSE:
// The function will copy a vector.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// source double* value vector to be copied
// dest double* reference vector copied
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __copy_vector(double *source, double *dest, const int dim)
{
int tIdx = blockDim.x * blockIdx.x + threadIdx.x;
while(tIdx < dim){
dest[tIdx] = source[tIdx];
tIdx += gridDim.x * blockDim.x;
}
}
// name OF FUNCTION: __Jacobi_sum
// PURPOSE:
// The function will execute matrix vector multiplication
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value A
// B double* value B
// C double* reference A*B
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __Jacobi_sum(const double *A,
const double *B,
double *resul,
const int dim)
{
int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
while(tIdx < dim){
resul[tIdx] = 0;
for(int i = 0; i < dim; i++)
if(tIdx != i)
resul[tIdx] += A[tIdx * dim + i] * B[i];
tIdx += gridDim.x * blockDim.x;
}
__syncthreads;
}
// name OF FUNCTION: __substract
// PURPOSE:
// The function will execute A-B=resul
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value A
// B double* value B
// C double* reference A-B
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __substract(const double *A,
const double *B,
double *C,
const int dim)
{
int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
while(tIdx < dim){
C[tIdx] = A[tIdx] - B[tIdx];
tIdx += gridDim.x * blockDim.x;
}
}
// name OF FUNCTION: __divide
// PURPOSE:
// The function will execute the jacobi division, that is,
// (B-sum)/A[i,i]
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value A
// B double* reference (B-sum)/A[i,i]
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __divide(const double *A, double *B, const int dim)
{
int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
while(tIdx < dim){
//if(A[tIdx * dim + tIdx] != 0)
B[tIdx] /= A[tIdx * dim + tIdx];
tIdx += blockDim.x * gridDim.x;
}
}
// name OF FUNCTION: __absolute
// PURPOSE:
// The function will calculate the absolute value for each
// number in an array
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* reference |A[i]|
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __absolute(double *A, const int dim)
{
int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
while(tIdx < dim){
if(A[tIdx] < 0)
A[tIdx] = -A[tIdx];
tIdx += blockDim.x * gridDim.x;
}
}
// name OF FUNCTION: Jacobi_Cuda
// PURPOSE:
// The function will calculate a X solution for a linear system
// using Jacobi's Method.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// Matrix_A double* value Matrix A(coefficients)
// Vector_B double* value Vector B
// Vector_X double* reference Solution
// dim int value Matrix Dimension
// e double value Error allowed
// maxIter int value Maximum iterations allowed
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
void Jacobi_Cuda(const double *Matrix_A,
const double *Vector_B,
double *Vector_X,
const int dim,
const double e,
const int maxIter,
double *t)
{
/** Host variables **/
int iter = 0; // iter counter
double err = 1; // error between X^k and X^k-1
double *tmp; // temporary for thrust norm
double *norm; // Vector norm
tmp = (double *) malloc(sizeof(double) * dim);
norm = (double *) malloc(sizeof(double));
int dimBlocks, dimThreads;
dimThreads = 64;
dimBlocks = imin(32, (dim + dimThreads - 1)/dimThreads);
/** ************** **/
/** Device variables **/
double *d_Matrix_A, *d_Vector_B, *d_Vector_X, *d_Vector_Y, *d_Vector_Resul;
cudaMalloc((void**)&d_Matrix_A, sizeof(double) * dim * dim);
cudaMalloc((void**)&d_Vector_B, sizeof(double) * dim);
cudaMalloc((void**)&d_Vector_X, sizeof(double) * dim);
cudaMalloc((void**)&d_Vector_Y, sizeof(double) * dim);
cudaMalloc((void**)&d_Vector_Resul, sizeof(double) * dim);
/** **************** **/
/** Initialize **/
cudaMemcpy(d_Matrix_A, Matrix_A, sizeof(double) * dim * dim,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Vector_B, Vector_B, sizeof(double) * dim, cudaMemcpyHostToDevice);
cudaMemcpy(d_Vector_X, Vector_X, sizeof(double) * dim, cudaMemcpyHostToDevice);
/** ********** **/
clock_t start,finish;
double totaltime;
start = clock();
/** Jacobi **/
while(err > e && iter < maxIter){
__copy_vector<<<dimBlocks, dimThreads>>>(d_Vector_X, d_Vector_Y, dim);
__Jacobi_sum<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_Y,
d_Vector_Resul, dim);
__substract<<<dimBlocks, dimThreads>>>(d_Vector_B, d_Vector_Resul,
d_Vector_X, dim);
__divide<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_X, dim);
__substract<<<dimBlocks, dimThreads>>>(d_Vector_Y, d_Vector_X,
d_Vector_Resul, dim);
__absolute<<<dimBlocks, dimThreads>>>(d_Vector_Resul, dim);
cudaMemcpy(tmp, d_Vector_Resul, sizeof(double) * dim, cudaMemcpyDeviceToHost);
double *t = thrust::max_element(tmp, tmp + dim); //vector norm
err = *t;
iter++;
}
finish = clock();
totaltime=(double)(finish-start)/CLOCKS_PER_SEC;
*t = totaltime;
cudaMemcpy(Vector_X, d_Vector_X, sizeof(double) * dim,
cudaMemcpyDeviceToHost);
if(iter == maxIter)
puts("Jacobi has reached maxIter!");
/** ****** **/
/** Free memory **/
cudaFree(d_Matrix_A);
cudaFree(d_Vector_B);
cudaFree(d_Vector_X);
cudaFree(d_Vector_Y);
cudaFree(d_Vector_Resul);
free(tmp);
free(norm);
/** *********** **/
}
It depends on your algorithm. Some algorithms are by definition non-parallelizable (calculating the Fibonacci series, for example). But here's a parallelizable Jacobi algorithm courtesy of Brown. Note that solving systems of equations CAN be solved either in serial or in parallel, it's just a matter of writing the code.
In short, it's impossible to know whether or not more threads = more speed unless you show us (or at least explain) the algorithm. As far as thread synchronization goes, CUDA is very (very) good at normalizing synchronization costs so (if your algorithm is proper), more threads should almost always yield more speed.
Fewer threads might be more efficient if the workload is small enough that the overheads of managing many threads cause the performance degradation.
...but without seeing your code it's hard to say. Personally I'm more inclined to believe it's just a bug in your code.
I am having trouble trying to figure out how to retrieve a 3D array from the GPU.
I want to allocate the memory for the 3d array in the host code, call the kernel, where the array will be populated, Then retrieve the 3D array in the host code to a return variable in the mexFunction (host code).
I have made several attempts at it, here is my latest code. The results are all '0's, where they should be '7'. Can anyone tell me where i'm going wrong? It might have something to do with the 3D parameters, i dont think i fully understand that part.
simulate3DArrays.cpp
/* Device code */
__global__ void simulate3DArrays(cudaPitchedPtr devPitchedPtr,
int width,
int height,
int depth)
{
int threadId;
threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
size_t pitch = devPitchedPtr.pitch;
for (int widthIndex = 0; widthIndex < width; widthIndex++) {
for (int heightIndex = 0; heightIndex < height; heightIndex++) {
*((double*)(((char*)devPitchedPtr.ptr + threadId * pitch * height) + heightIndex * pitch) + widthIndex) = 7.0;
}
}
}
mexFunction.cu
/* Host code */
#include <stdio.h>
#include "mex.h"
/* Kernel function */
#include "simulate3DArrays.cpp"
/* Define some constants. */
#define width 5
#define height 9
#define depth 6
void displayMemoryAvailability(mxArray **MatlabMemory);
void mexFunction(int nlhs,
mxArray *plhs[],
int nrhs,
mxArray *prhs[])
{
double *output;
mwSize ndim3 = 3;
mwSize dims3[] = {height, width, depth};
plhs[0] = mxCreateNumericArray(ndim3, dims3, mxDOUBLE_CLASS, mxREAL);
output = mxGetPr(plhs[0]);
cudaExtent extent = make_cudaExtent(width * sizeof(double), height, depth);
cudaPitchedPtr devicePointer;
cudaMalloc3D(&devicePointer, extent);
simulate3DArrays<<<1,depth>>>(devicePointer, width, height, depth);
cudaMemcpy3DParms deviceOuput = { 0 };
deviceOuput.srcPtr.ptr = devicePointer.ptr;
deviceOuput.srcPtr.pitch = devicePointer.pitch;
deviceOuput.srcPtr.xsize = width;
deviceOuput.srcPtr.ysize = height;
deviceOuput.dstPtr.ptr = output;
deviceOuput.dstPtr.pitch = devicePointer.pitch;
deviceOuput.dstPtr.xsize = width;
deviceOuput.dstPtr.ysize = height;
deviceOuput.kind = cudaMemcpyDeviceToHost;
/* copy 3d array back to 'ouput' */
cudaMemcpy3D(&deviceOuput);
return;
} /* End Mexfunction */
The basic problem appears to be that you are instructing the cudaMemcpy3D to copy zero bytes, because you have not included a non-zero extent which defines the size of the transfer to the API.
Your transfer could probably be as simple as:
cudaMemcpy3DParms deviceOuput = { 0 };
deviceOuput.srcPtr = devicePointer;
deviceOuput.dstPtr.ptr = output;
deviceOuput.extent = extent;
cudaMemcpy3D(&deviceOuput);
I can't comment on whether the MEX interface you are using is correct, but the kernel looks superficially correct and I don't see anything else obviously wrong, without going to a compiler and trying to run your code with Matlab, which I cannot.
I have a device function that is defined in a header file. The reason it is in a header file is because it is used by a global kernel, which needs to be in a header file since it is a template kernel.
When this header file is included across 2 or more .cu files, I get a LNK2005 error during linking:
FooDevice.cu.obj : error LNK2005: "int
__cdecl getCurThreadIdx(void)" (?getCurThreadIdx##YAHXZ) already defined
in Main.cu.obj
Why is this error caused? How to fix it?
Here is sample code to produces the above error:
FooDevice.h:
#ifndef FOO_DEVICE_H
#define FOO_DEVICE_H
__device__ int getCurThreadIdx()
{
return ( ( blockIdx.x * blockDim.x ) + threadIdx.x );
}
template< typename T >
__global__ void fooKernel( const T* inArr, int num, T* outArr )
{
const int threadNum = ( gridDim.x * blockDim.x );
for ( int idx = getCurThreadIdx(); idx < num; idx += threadNum )
outArr[ idx ] = inArr[ idx ];
return;
}
__global__ void fooKernel2( const int* inArr, int num, int* outArr );
#endif // FOO_DEVICE_H
FooDevice.cu:
#include "FooDevice.h"
// One other kernel that uses getCurThreadIdx()
__global__ void fooKernel2( const int* inArr, int num, int* outArr )
{
const int threadNum = ( gridDim.x * blockDim.x );
for ( int idx = getCurThreadIdx(); idx < num; idx += threadNum )
outArr[ idx ] = inArr[ idx ];
return;
}
Main.cu:
#include "FooDevice.h"
int main()
{
int num = 10;
int* dInArr = NULL;
int* dOutArr = NULL;
const int arrSize = num * sizeof( *dInArr );
cudaMalloc( &dInArr, arrSize );
cudaMalloc( &dOutArr, arrSize );
// Using template kernel
fooKernel<<< 10, 10 >>>( dInArr, num, dOutArr );
return 0;
}
Why is this error caused?
Because you have included your header in FooDevice.cu and Main.cu where it gets defined so you now have two copies of the same function and the linker detects this.
How to fix it?
If you have the following defined in foo.h
template<typename T> __device__ T foo(T x)
{
return x;
}
And two .cu files that both include foo.h and also contain a call to it, e.g.
int x = foo<int>(1);
Then you can force foo() inline:
template<typename T>
inline __device__ T foo(T x)
{
return x;
}
and call:
int x = foo<int>(1);
This will stop it from being declared multiple times.
Function templates are an exempt of
One Defintion Rule and may be more
than one definition of them in
different translation unit's. Full
function template specialization is
not a template, rather an ordinary
function, so you need to use inline
keyword not to violate ODR if you want
to put them in a header file included
into several translation unit's.
Taken from http://www.velocityreviews.com/forums/t447911-why-does-explicit-specialization-of-function-templates-cause-generation-of-code.html
See also: http://en.wikipedia.org/wiki/One_Definition_Rule
I changed your code like this:
inline __device__ int getCurThreadIdx()
{
return ( ( blockIdx.x * blockDim.x ) + threadIdx.x );
}
template< typename T >
__global__ void fooKernel( const T* inArr, int num, T* outArr )
{
const int threadNum = ( gridDim.x * blockDim.x );
for ( int idx = getCurThreadIdx(); idx < num; idx += threadNum )
outArr[ idx ] = inArr[ idx ];
return;
}
And it now compiles. Your declaration without the inline of getCurThreadIdx() was violating the one definition rule.
It should be inlined. You could try adding the inline keyword.
Maybe you could remove the unnecessary code and create a simple text example for us to see? Usually the problem lies in the details...