I've just started to learn CUDA and I wanted to fill an array (a 2D array represented as a 1D array) with random numbers. I followed other posts in order to generate the random numbers, but I don't know whether the problem is in the number generation, in copying the memory back from the device, or something else. The problem is that, even though I have tried to fill every cell of the array with the id of the thread that is attending it so I could see the results after copying into host memory, the array I receive is filled with 0 in every position after recovering the data with cudaMemcpy().
I'm programming on Visual Studio 2013 with CUDA 7.5, on an i5 2500K processor and a GTX 960 graphics card.
Here are the main function and the kernel where I try to fill the array. I'll include the cuRAND initialization too. If you need to see something else, just tell me.
__global__ void setup_cuRand(curandState * state, unsigned long seed)
{
    int id = threadIdx.x;
    curand_init(seed, id, 0, &state[id]);
}

__global__ void poblar(int * adn, curandState * state){
    curandState localState = state[threadIdx.x];
    int random = curand(&localState);
    adn[threadIdx.x] = random;
    // It doesn't matter if I use the following instruction instead, the result is still a lot of 0's
    //adn[threadIdx.x] = threadIdx.x;
}
int main()
{
    const int adnLength = NUMCROMOSOMAS * SIZECROMOSOMAS; // 256 * 128 (32,768)
    const size_t adnSize = adnLength * sizeof(int);
    int adnCPU[adnLength];
    int * adnDevice;
    cudaError_t error = cudaSetDevice(0);
    if (error != cudaSuccess)
        exit(-EXIT_FAILURE);
    curandState * randState;
    error = cudaMalloc(&randState, adnLength * sizeof(curandState));
    if (error != cudaSuccess){
        cudaFree(randState);
        exit(-EXIT_FAILURE);
    }
    // cuRAND is initialized here
    setup_cuRand <<<1, adnLength>>> (randState, unsigned(time(NULL)));
    error = cudaMalloc((void **)&adnDevice, adnSize);
    if (error == cudaErrorMemoryAllocation){ // cudaSuccess){
        cudaFree(adnDevice);
        cudaFree(randState);
        printf("\n error");
        exit(-EXIT_FAILURE);
    }
    poblar <<<1, adnLength>>> (adnDevice, randState);
    error = cudaMemcpy(adnCPU, adnDevice, adnSize, cudaMemcpyDeviceToHost);
    // After this, adnCPU[i] is 0 for every i and I cannot figure out what is wrong
    if (error == cudaSuccess){
        for (int i = 0; i < NUMCROMOSOMAS; i++){
            for (int j = 0; j < SIZECROMOSOMAS; j++){
                printf("%i,", adnCPU[(i*SIZECROMOSOMAS) + j]);
            }
            printf("\n");
        }
    }
    return 0;
}
EDIT after the answer solved it: There was one particularity beyond the answer given, which is that you need a lower number of threads (half of that quantity worked for me) in order to seed the random numbers correctly with cuRAND. For some reason, I could create the threads fine but I couldn't seed the pseudo-random number generator.
The maximum number of threads per block is 1024 on your hardware; hence, you may not launch a block of adnLength threads if adnLength is larger than 1024.
The error you are having is most probably a launch configuration error. It is returned by cudaPeekAtLastError, since it occurs before any GPU work, right after the triple-angle-bracket call. cudaMemcpy may not return it, even though it does return errors from previous asynchronous calls.
The error that may occur is cudaErrorLaunchOutOfResources.
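As a rough sketch (not your exact code; threadsPerBlock and the way the work is split are assumptions), you could spread the threads across several blocks so each block stays under the 1024-thread limit, and check the launch status explicitly:

// Sketch: split adnLength threads across several blocks.
// threadsPerBlock is an assumed value, not from the original code.
const int threadsPerBlock = 256;
const int blocks = (adnLength + threadsPerBlock - 1) / threadsPerBlock;

setup_cuRand<<<blocks, threadsPerBlock>>>(randState, unsigned(time(NULL)));

// The configuration error shows up immediately after the launch:
cudaError_t launchErr = cudaPeekAtLastError();    // e.g. cudaErrorLaunchOutOfResources
cudaError_t syncErr = cudaDeviceSynchronize();    // errors raised during execution
if (launchErr != cudaSuccess || syncErr != cudaSuccess)
    printf("launch: %s, sync: %s\n",
           cudaGetErrorString(launchErr), cudaGetErrorString(syncErr));

Inside the kernels, the index would then be threadIdx.x + blockIdx.x * blockDim.x, guarded against indices past adnLength, instead of the bare threadIdx.x.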
I'm still a beginner with CUDA and I have been trying to write a simple kernel to perform a parallel prime sieve on the GPU. Originally I had written my code in C, but I wanted to investigate the speed-up on a GPU so I rewrote it:
41.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define B 1024
#define T 256
#define N (B*T)

#define checkCudaErrors(error) {\
    if (error != cudaSuccess) {\
        printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(error));\
        exit(1);\
    }\
}\

__global__ void prime_sieve(int *primes) {
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    primes[i] = i;
    primes[0] = primes[1] = 0;
    if (i > 1 && i < N) {
        for (int j = 2; j < N/2; j++) {
            if (i*j < N) {
                primes[i*j] = 0;
            }
        }
    }
}

int main() {
    int *h_primes = (int*)malloc(N * sizeof(int));
    int *d_primes;
    checkCudaErrors(cudaMalloc((void**)&d_primes, N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(d_primes, h_primes, N*sizeof(int), cudaMemcpyHostToDevice));
    prime_sieve<<<B,T>>>(d_primes);
    checkCudaErrors(cudaMemcpy(h_primes, d_primes, N*sizeof(int), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(d_primes));
    int size = 0;
    int total = 0;
    for (int i = 2; i < N; i++) {
        if (h_primes[i]) {
            size++;
        }
        total++;
    }
    printf("\n");
    printf("Length = %d\tPrimes = %d\n", total, size);
    free(h_primes);
    return 0;
}
I run the program on Ubuntu 16.04 (4.4.0-83-generic) and I compile with nvcc 41.cu -o 41.o -arch=sm_30, using nvcc version 8.0.61. The program runs on a GeForce GTX 780 Ti, but every time it runs it produces non-deterministic results:
Length = 262142 Primes = 49477
Length = 262142 Primes = 49486
Length = 262142 Primes = 49596
Length = 262142 Primes = 49589
There were no errors reported back. At first I thought it was a race condition, but cuda-memcheck didn't report any hazards for racecheck, initcheck or synccheck, and I couldn't think of any problems with my assumptions. Could this be a synchronisation problem?
This non-deterministic behaviour only occurs when I increase the block and thread counts as seen in the code. When I tried a block size and thread size of, say, 16, there were no problems (as far as I could tell). It seems that not all threads get a chance to execute? I was planning to run this on very large array sizes (< 1 billion integers) but I am stuck at this point.
What am I doing wrong here?
There is a giant race condition.
So primes[i] > 0 means prime, while primes[i] = 0 means composite.
primes[i] = i; is the first update to primes executed by each thread. Keep this in mind.
Now let's see what happens when thread 16 executes. It sets primes[16] = 16 and then zeroes all the multiples of 16, something like the following:
primes[32] = primes[48] = .... = primes[k*16] = 0
Imagine that thread 48 gets scheduled just after thread 16 has completed its job (or when j > 3 in thread 16's loop).
Thread 48 sets primes[48] = 48. You have lost the update made by thread 16.
That is a race condition.
When coding in CUDA you should make sure that the correctness of your code does not depend on a particular scheduling of warps.
You should think of the order of execution as something non-deterministic.
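One way to remove that dependency, as a rough sketch (same names as your code, but the split into two kernels is my assumption), is to do the initialization in one kernel and only ever write 0 in the marking kernel, so no thread can overwrite another thread's 0 with a non-zero value:

// Sketch: initialization and marking are separated; the kernel boundary acts
// as a global synchronization point between the two phases.
__global__ void init_primes(int *primes) {
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N) primes[i] = (i < 2) ? 0 : i;
}

__global__ void mark_composites(int *primes) {
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i > 1 && i < N) {
        for (unsigned int j = 2; i * j < N; j++) {
            primes[i * j] = 0;   // every write is a 0, so ordering no longer matters
        }
    }
}

// Host side:
// init_primes<<<B, T>>>(d_primes);
// mark_composites<<<B, T>>>(d_primes);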
I'm aware that there are multiple questions similar to this one that have already been answered, but I've been unable to piece together anything very helpful from them, other than that I'm probably indexing something incorrectly.
I'm trying to perform a sequential addressing reduction on input vector A into output vector B.
The full code is available here http://pastebin.com/7UGadgjX, but this is the kernel:
__global__ void vectorSum(int *A, int *B, int numElements) {
    extern __shared__ int S[];
    // Each thread loads one element from global to shared memory
    int tid = threadIdx.x;
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        S[tid] = A[i];
        __syncthreads();
        // Reduce in shared memory
        for (int t = blockDim.x/2; t > 0; t>>=1) {
            if (tid < t) {
                S[tid] += S[tid + t];
            }
            __syncthreads();
        }
        if (tid == 0) B[blockIdx.x] = S[0];
    }
}
and these are the kernel launch statements:
// Launch the Vector Summation CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
vectorSum<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, numElements);
I'm getting an unspecified launch failure, which I've read is similar to a segfault. I've been following the NVIDIA reduction documentation closely and tried to keep my kernel within the bounds of numElements, but I seem to be missing something key considering how simple the code is.
Your problem is that the reduction kernel requires dynamically allocated shared memory to operate correctly, but your kernel launch doesn't specify any. The result is an out-of-bounds/illegal shared memory access, which aborts the kernel.
In CUDA runtime API syntax, the kernel launch statement has four arguments. The first two are the grid and block dimensions for the launch. The latter two are optional with zero default values, but specify the dynamically allocated shared memory size and stream.
To fix this, change the launch code as follows:
// Launch the Vector Summation CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t shmsz = (size_t)threadsPerBlock * sizeof(int);
vectorSum<<<blocksPerGrid, threadsPerBlock, shmsz>>>(d_A, d_B, numElements);
[disclaimer: code written in browser, not compiled or tested, use at own risk]
This should at least fix the most obvious problem with your code.
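One more point, as a hedged addition: after this launch, d_B holds one partial sum per block, not the final total. A minimal sketch of finishing the reduction on the host (h_B is a hypothetical host buffer; it assumes d_B was allocated with room for at least blocksPerGrid ints) could look like:

// Sketch: copy the per-block partial sums back and add them up on the CPU.
int *h_B = (int *)malloc(blocksPerGrid * sizeof(int));
cudaMemcpy(h_B, d_B, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);

int total = 0;
for (int b = 0; b < blocksPerGrid; ++b)
    total += h_B[b];
free(h_B);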
I have a problem that is parallel on two levels: I have a ton of sets of (x0, x1, y0, y1) coordinate pairs, which are turned into variables vdx, vdy, vyy and for each of these sets I'm trying to calculate the values of all "monomials" composed of them up to degree n (i.e. all possible combinations of different powers of them, like vdx^3*vdy*vyy^2 or vdx*1*vyy^4). These values are then added up over all the sets.
My strategy (and for now I'd just like to get it to work, it doesn't have to be optimized with multiple kernels or complex reductions, unless it really has to) is to have each thread deal with one set of coordinate pairs and calculate the values of all their corresponding monomials. Each block's shared memory holds all the monomial sums, and when the block is done, the first thread in the block adds the result to the global sum. Since each block's shared memory is accessed by all threads in all places, I'm using atomicAdd; same with the blocks and the global memory.
Unfortunately there still seems to be a race condition somewhere, since I get different results every time I run the kernel.
If it helps, I'm currently using degree = 3 and omitting one of the variables, which means that in the code below the innermost for loop (over evbl) doesn't do anything and just repeats 4 times. Indeed, the output of the kernel looks like this: 51502,55043.1,55043.1,51502,47868.5,47868.5,48440.5,48440.6,46284.7,46284.7,46284.7,46284.7,46034.3,46034.3,46034.3,46034.3,44972.8,44972.8,44972.8,44972.8,43607.6,43607.6,43607.6,43607.6,43011,43011,43011,43011,42747.8,42747.8,42747.8,42747.8,45937.8,45937.8,46509.9,46509.9,... and it's noticeable that there is a (rough) pattern of 4-tuples. But every time I run it the values are all very different.
Everything is in floats, but I'm on a compute capability 2.1 GPU, so that shouldn't be a problem. cuda-memcheck also reports no errors.
Can somebody with more CUDA experience give me some pointers on how to track down the race condition here?
__global__ void kernel(...) {
    extern __shared__ float s_data[];
    // just use global memory for now
    // get threadID:
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= nPairs) return;

    // ... do some calculations to get x/y...

    // calculate vdx, vdy and vyy
    float vdx = (x1 - x0)/(float)xheight;
    float vdy = (y1 - y0)/(float)xheight;
    float vyy = 0.5*(y0 + y1)/(float)xheight;

    const int offs1 = degree + 1;
    const int offs2 = offs1 * offs1;
    const int offs3 = offs2 * offs1;
    float sol = 1.0;

    // now calculate monomial results and store in shared memory
    for (int evdx = 0; evdx <= degree; evdx++) {
        for (int evdy = 0; evdy <= degree; evdy++) {
            for (int evyy = 0; evyy <= degree; evyy++) {
                for (int evbl = 0; evbl <= degree; evbl++) {
                    sol = powf(vdx, evdx) + powf(vdy, evdy) + powf(vyy, evyy);
                    atomicAdd(&(s_data[evbl + offs1*evyy + offs2*evdy +
                                       offs3*evdx]), sol/1000.0);
                }
            }
        }
    }

    // now copy shared memory to global
    __syncthreads();
    if (threadIdx.x == 0) {
        for (int i = 0; i < nMonomials; i++) {
            atomicAdd(&outmD[i], s_data[i]);
        }
    }
}
You are using shared memory but you are never initializing it.
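As a minimal sketch of what that initialization could look like (placed before the early return on idx, so every thread in the block reaches the barrier; nMonomials and s_data are the names from your code):

// Sketch: zero s_data (the extern __shared__ array) before any atomicAdd.
for (int i = threadIdx.x; i < nMonomials; i += blockDim.x)
    s_data[i] = 0.0f;
__syncthreads();   // make sure all of s_data is zeroed before accumulation starts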
This is a program for matrix multiplication on the CUDA architecture.
This code works fine when the array size is 30 x 30, but gives output that is a series of 0's when the size is greater.
I am using a standard EC2 instance for CUDA, hosted on a Linux machine. Can anybody figure out the reason?
#include <stdio.h>

#define SIZE 30

__global__ void matrix_multiply(float *input1, float *input2, float *output, int dimension){
    int input1_index = threadIdx.x / dimension * dimension;
    int input2_index = threadIdx.x % dimension;
    int i = 0;
    for (i = 0; i < dimension; i++){
        output[threadIdx.x] += input1[input1_index + i] * input2[input2_index + i * dimension];
    }
}

int main(){
    int i, j, natural_number = 1;
    float input1[SIZE][SIZE], input2[SIZE][SIZE], result[SIZE][SIZE] = {0};
    float *c_input1, *c_input2, *c_result;
    for (i = 0; i < SIZE; i++){
        for (j = 0; j < SIZE; j++){
            input1[i][j] = input2[i][j] = natural_number++;
        }
    }
    cudaMalloc((void**)&c_input1, sizeof(input1));
    cudaMalloc((void**)&c_input2, sizeof(input2));
    cudaMalloc((void**)&c_result, sizeof(result));
    cudaMemcpy(c_input1, input1, sizeof(input1), cudaMemcpyHostToDevice);
    cudaMemcpy(c_input2, input2, sizeof(input2), cudaMemcpyHostToDevice);
    cudaMemcpy(c_result, result, sizeof(result), cudaMemcpyHostToDevice);
    matrix_multiply<<<1, SIZE * SIZE>>>(c_input1, c_input2, c_result, SIZE);
    if (cudaGetLastError() != cudaSuccess){
        printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    }
    cudaMemcpy(result, c_result, sizeof(result), cudaMemcpyDeviceToHost);
    for (i = 0; i < SIZE; i++){
        for (j = 0; j < SIZE; j++){
            printf("%.2f ", result[i][j]);
        }
        printf("\n");
    }
    cudaFree(c_input1);
    cudaFree(c_input2);
    cudaFree(c_result);
    return 0;
}
You probably have a max of 1024 threads per block on your GPU. 30 x 30 = 900, so that should be OK, but e.g. 40 x 40 would result in a kernel launch failure (take-home message: always check for errors!).
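One detail about the error checking, as a side note: cudaGetLastError() resets the sticky error, so the pattern in your code of calling it twice will print "no error" even when the launch actually failed. A small sketch of the usual pattern:

// Sketch: save the error once; a second cudaGetLastError() would return cudaSuccess.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess){
    printf("%s\n", cudaGetErrorString(err));
}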
You probably want to consider organizing your data differently, e.g. SIZE blocks of SIZE threads and then call the kernel as:
matrix_multiply<<<SIZE, SIZE>>>(c_input1,c_input2,c_result,SIZE);
(Obviously you'll need to modify your array indexing within the kernel code, e.g. use the block index as the row and the thread index as the column.)
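A hedged sketch of what that reindexed kernel could look like (the row/column mapping is an assumption about the intended layout, not your original code):

// Sketch: one block per output row, one thread per output column.
__global__ void matrix_multiply(float *input1, float *input2, float *output, int dimension){
    int row = blockIdx.x;     // block index selects the row
    int col = threadIdx.x;    // thread index selects the column
    float sum = 0.0f;
    for (int i = 0; i < dimension; i++){
        sum += input1[row * dimension + i] * input2[i * dimension + col];
    }
    output[row * dimension + col] = sum;
}

This still limits the dimension to the maximum threads per block, but the limit now applies to SIZE rather than to SIZE * SIZE.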
You are invoking the kernel with a single block of SIZE * SIZE threads:
matrix_multiply<<<1, SIZE * SIZE>>>(c_input1, c_input2, c_result, SIZE);
A single block cannot hold more threads than the per-block limit, so there are not enough threads to process anything larger.
I'm having several problems regarding OpenCL (total noob), but I think that if I manage to solve this one I will be able to solve some of the others. I have the following kernel, which should store in a double array a value calculated from the data of a struct. The argument that I pass to the kernel is a struct array; it is initialised and its values are non-zero (I tested it).
When executing the kernel, though, I get a "Floating point exception". If I understand it correctly, this means that the local_density variable is zero and the division causes an error. What I don't get is why it is zero, since on the host the values are non-zero. Am I doing something wrong in the kernel?
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

typedef struct
{
    double speeds[9];
} t_speed;

__kernel void prepare(__global const t_speed* cells,
                      __global const int* obstacles,
                      __global double* results,
                      const unsigned int count)
{
    int pos = get_global_id(0);
    if (pos >= count) return;
    if (obstacles[pos] == 1) results[pos] = 0.00;
    else
    {
        double local_density = 0.00;
        for (int kk = 0; kk < 9; kk++)
            local_density += cells[pos].speeds[kk];
        results[pos] = (cells[pos].speeds[1] + cells[pos].speeds[5] +
                        cells[pos].speeds[8] - (cells[pos].speeds[3] +
                        cells[pos].speeds[6] + cells[pos].speeds[7])) /
                       local_density;
    }
}
Here is also the initialization of the variable that I pass as an argument. params->ny/nx have correct values.
cells = (t_speed*) malloc(sizeof(t_speed) * (params->ny * params->nx));
I also quote the kernel argument setup for the cells variable.
m_cells = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(t_speed) * count, NULL, NULL);
err = clEnqueueWriteBuffer(commands, m_cells, CL_TRUE, 0, sizeof(t_speed) * count, cells, 0, NULL, NULL);
err |= clSetKernelArg(av_velocity_prepare_kernel, 0, sizeof(cl_mem), &m_cells);
------------------------------------------ EDIT ------------------------------------------
OK, what is really weird is that I'm getting the same error (Floating point exception) even with the very simple kernel below. Does anyone have a clue?
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

__kernel void test(__global float* result, const unsigned int n)
{
    int i = get_global_id(0);
    if (i >= n) return;
    result[i] += 1.0f;
}
I noticed that you are declaring your buffer as CL_MEM_READ_ONLY, yet you are writing to it inside the kernel. According to the OpenCL spec, this is undefined. Try using CL_MEM_READ_WRITE instead.
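For example, the creation of a buffer the kernel writes to would need to look something like this (m_results is a hypothetical name; the size mirrors your other buffers):

/* Sketch: a buffer that the kernel writes must not be CL_MEM_READ_ONLY. */
cl_mem m_results = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                  sizeof(double) * count, NULL, NULL);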
OK, so it was a completely different thing than I thought it was. The problem was that when I was calling
clEnqueueNDRangeKernel (command_queue, kernel, work_dim, *global_work_offset,
*global_work_size, *local_work_size, num_events_in_wait_list,
*event_wait_list, *event)
the global_work_size was not divisible by local_work_size. That caused the Floating point exception.
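For reference, a small sketch of rounding the global size up so it is always a multiple of the work-group size (the kernel's if (i >= n) return; guard already handles the padding work-items; the work-group size of 64 and the variable names other than commands and count are assumptions):

/* Sketch: round global_work_size up to the next multiple of local_work_size. */
size_t local_work_size = 64;                       /* assumed work-group size */
size_t global_work_size =
    ((count + local_work_size - 1) / local_work_size) * local_work_size;

err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL,
                             &global_work_size, &local_work_size,
                             0, NULL, NULL);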