Read from cudaBindTexture2D - c

I have been trying to store a 2D array in texture memory and read from it via cudaBindTexture2D,
but the value returned is 0. I'm not sure whether this is the right use of cudaBindTexture2D and tex2D().
I wrote a fairly simple program to try it out:
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
texture<uint, cudaTextureType2D, cudaReadModeElementType> tex;
// Test kernel: fetches the texel at coordinates (0, 0) from the file-scope 2D
// texture reference `tex` and stores it through `x`.
// The fetched value is converted to int by the assignment.
__global__
void texture2DTest(int *x){
*x = tex2D(tex,0,0);
}
/*
 * Zero a table of 10 rows by 9 columns, then store 12 in its first cell.
 * The caller must supply storage for at least 10 rows of 9 ints.
 */
void initTable(int textureTable[][9]){
    int row = 0;
    while (row < 10) {
        int *cells = textureTable[row];
        int col;
        for (col = 0; col < 9; ++col) {
            cells[col] = 0;
        }
        ++row;
    }
    textureTable[0][0] = 12;
}
// Host driver for texture2DTest, as posted in the question.
// NOTE(review): the cudaMallocPitch and cudaBindTexture2D calls below are given
// the host array `textureTable`, not a separately allocated device pointer.
int main (int argc, char ** argv){
int textureTable[10][9];
int *d_x;
int x=2;
size_t pitch;
initTable(textureTable);
// Device-side int that receives the value written by the kernel.
cudaMalloc(&d_x, sizeof(int));
cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);
// NOTE(review): the first argument is the host array cast to void**, and the
// width argument is 9 (an element count) where a byte count is expected.
cudaMallocPitch( (void**)textureTable,&pitch, 9, 10);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uint>();
cudaBindTexture2D(NULL, tex, textureTable, desc, 9, 10, pitch) ;
texture2DTest<<<1,1>>>(d_x);
cudaThreadSynchronize();
// Bring the kernel's result back and print it.
cudaMemcpy(&x,d_x, sizeof(int), cudaMemcpyDeviceToHost);
printf(" \n %d \n",x);
cudaUnbindTexture(tex);
return 0;
}
Thank you.

There are quite a few issues in the provided code.
The device memory allocation using cudaMallocPitch is totally broken. You are trying to allocate device memory to a 2D array which is already allocated on the host.
Trying to do so will result in memory corruption and undefined behavior. A separate pointer variable is required for device memory allocation and memory should be copied from host to device after allocation.
The third argument of cudaMallocPitch expects width of memory in bytes; not elements.
Textures can only be bound to device memory, so cudaBindTexture2D expects a device memory pointer as input.
Fixing all of the above issues, your final main will look something like this:
/* Abort with a readable message when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Uploads a 10x9 int table into pitch-linear device memory, binds it to the
 * 2D texture reference `tex`, lets texture2DTest fetch texel (0, 0) and prints
 * the fetched value. Every CUDA call is checked; the program exits with
 * EXIT_FAILURE on the first error.
 */
int main (int argc, char ** argv)
{
    (void)argc;
    (void)argv;

    int textureTable[10][9];
    int *d_x = NULL;
    int *d_textureTable = NULL; // Device texture table
    int x = 2;
    size_t pitch = 0;
    const size_t rowBytes = 9 * sizeof(int); // cudaMallocPitch/cudaMemcpy2D take widths in bytes

    initTable(textureTable);

    checkCuda(cudaMalloc((void**)&d_x, sizeof(int)), "cudaMalloc(d_x)");
    checkCuda(cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(d_x)");

    // Allocate pitch-linear memory for the device texture table.
    checkCuda(cudaMallocPitch((void**)&d_textureTable, &pitch, rowBytes, 10), "cudaMallocPitch");

    // Use cudaMemcpy2D because the host and device row pitches may differ.
    checkCuda(cudaMemcpy2D(d_textureTable, pitch, textureTable, rowBytes, rowBytes, 10,
                           cudaMemcpyHostToDevice), "cudaMemcpy2D");

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uint>();
    // Width and height passed to cudaBindTexture2D are in texels; pitch is in bytes.
    checkCuda(cudaBindTexture2D(NULL, tex, d_textureTable, desc, 9, 10, pitch), "cudaBindTexture2D");

    texture2DTest<<<1,1>>>(d_x);
    checkCuda(cudaGetLastError(), "texture2DTest launch");
    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
    checkCuda(cudaDeviceSynchronize(), "texture2DTest execution");

    checkCuda(cudaMemcpy(&x, d_x, sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(x)");
    printf(" \n %d \n",x);

    checkCuda(cudaUnbindTexture(tex), "cudaUnbindTexture");

    // Don't forget to free the allocated memory.
    checkCuda(cudaFree(d_textureTable), "cudaFree(d_textureTable)");
    checkCuda(cudaFree(d_x), "cudaFree(d_x)");
    return 0;
}

Related

CUDA: Finding Minimum Value in 100x100 Matrix

I just learned GPU programming, and now I have a task to find the minimum value of a 100x100 matrix in parallel with CUDA. I have tried this code, but it does not show the answer; instead it shows my initial value hmin = 9999999. Can anyone give me the right code? The code is in C.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define size (100*100)
//Kernel Functions & Variable
// Kernel from the question: compares matrix entry mat[b][k] against kmin and
// overwrites kmin when the entry is smaller.
// NOTE(review): `mat` holds int pointers and `kmin` is an int pointer, so both
// the comparison and the assignment act on pointer values; the assignment only
// changes this thread's copy of the `kmin` parameter.
__global__ void FindMin(int* mat[100][100],int* kmin){
// NOTE(review): threadIdx (not blockIdx) is the term scaled by blockDim here.
int b=blockIdx.x+threadIdx.x*blockDim.x;
int k=blockIdx.y+threadIdx.y*blockDim.y;
if(mat[b][k] < kmin){
kmin = mat[b][k];
}
}
// Host driver from the question: reads a 100x100 integer matrix from
// "MatrixTubes1.txt", copies it to the GPU, launches FindMin and prints hmin.
int main(int argc, char *argv[]) {
//Declare variables
int i,j,hmaks=0,hmin=9999999,hsumin,hsumax; //Host variables
int *da[100][100],*dmin,*dmaks,*dsumin,*dsumax; // Device variables
FILE *baca; //for opening the txt file
char buf[4]; //used for fscanf
int ha[100][100],b; //matrix to be filled through "b"
//1: Read the txt file
baca=fopen("MatrixTubes1.txt","r");
if (!baca){
printf("Hey, it's not even exist"); //Check whether the file could be opened
}
// NOTE(review): execution continues past the check above even when baca is NULL.
i=0;j=0; //Matrix index initialization
if(!feof(baca)){ //only tested once, before the first read
for(i = 0; i < 100; i++){
for(j = 0; j < 100; j++){
// NOTE(review): "%s" carries no field width, so a token longer than
// 3 characters writes past the end of buf[4].
fscanf(baca,"%s",buf); //reads one whitespace-delimited token
b=atoi(buf); //parse the string into an integer
ha[i][j]=b; //save it to the matrix
}
}
}
fclose(baca);
//the whole file has been read
//time to close the file
//Step 2: allocate data on the GPU
// NOTE(review): da is a host array of int pointers; its address is what is
// handed to cudaMalloc here and da itself to cudaMemcpy below.
cudaMalloc((void **)&da, size*sizeof(int));
cudaMalloc((void **)&dmin, sizeof(int));
cudaMalloc((void **)&dmaks, sizeof(int));
cudaMalloc((void **)&dsumin, sizeof(int));
cudaMalloc((void **)&dsumax, sizeof(int));
//Step 3: Copy data to the device
cudaMemcpy(da, &ha, size*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dmin, &hmin, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dmaks, &hmaks, sizeof(int), cudaMemcpyHostToDevice);
//Step 4: Call the kernel
// Launch configuration: 100 blocks, 100 threads per block, and 1 byte of
// dynamic shared memory (the third launch parameter).
FindMin<<<100,100,1>>>(da,dmin);
//5: Copy from device to host
cudaMemcpy(&hmin, dmin, sizeof(int), cudaMemcpyDeviceToHost);
//6: Print that value
printf("Minimum Value = %i \n",hmin);
system("pause"); return 0;
}
this is my result
Minimum Value = 9999999
Press any key to continue . . .
I saw a few issues in your code.
As mentioned in the comments from MayurK, you got the indexing wrong.
Also as MayurK said, you are comparing two pointers and not the values they point to.
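Your kernel invocation FindMin<<<100,100,1>>> launches a one-dimensional grid of 100 blocks with 100 threads each; the third launch parameter is the dynamic shared-memory size in bytes, not a grid or block dimension. Because the launch is one-dimensional, blockIdx.y and threadIdx.y are always zero, so k is always 0, while b = blockIdx.x + threadIdx.x*blockDim.x runs far beyond 99 and indexes outside the matrix. To cover a 100x100 matrix, either launch with two-dimensional dim3 grid and block sizes and compute each index as blockIdx * blockDim + threadIdx, or keep the 1D launch and derive the row and column from a single flat index.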
Finally, all threads will be running in parallel, resulting in a race condition in kmin = mat[b][k] (which should be *kmin by the way). When you fixed the indexing problem, all threads in the same block will write to the location in global memory at same time. You should use atomicMin() or a parallel reduction for finding the minimum value in parallel.

cuda : Is shared memory always helpful?

When I read the programming guide, I got the feeling that shared memory will always improve the performance, but it seems not.
I have two functions:
const int Ntimes=1;
// Global-memory variant: each thread whose index b0+ti is below numElements
// applies A = A*A*10 - 2*A + 1 to its own element Ntimes times, reading and
// writing A directly, and then stores A*A into C.
__global__ void testgl(float *A, float *C, int numElements){
int ti = threadIdx.x;
int b0 = blockDim.x*blockIdx.x;
// Guard: the last block may extend past numElements.
if (b0+ti < numElements){
for(int i=0;i<Ntimes;i++){
A[b0+ti]=A[b0+ti]*A[b0+ti]*10-2*A[b0+ti]+1;
}
C[b0+ti] = A[b0+ti]*A[b0+ti];
}
}
// Shared-memory variant of the same computation: each in-range thread copies
// its element of A into the block's __shared__ buffer, iterates the update
// a = a*a*10 - 2*a + 1 there Ntimes times, and writes the square to C.
// A itself is only read.
__global__ void testsh(float *A, float *C, int numElements){
int ti = threadIdx.x;
int b0 = blockDim.x*blockIdx.x;
// One slot per thread, indexed by threadIdx.x, so blockDim.x must not exceed 1024.
__shared__ float a[1024];
if (b0+ti < numElements){
a[ti]=A[b0+ti];
}
// The barrier sits outside the bounds check so every thread of the block reaches it.
__syncthreads();
if (b0+ti < numElements){
for(int i=0;i<Ntimes;i++){
a[ti]=a[ti]*a[ti]*10-2*a[ti]+1;
}
C[b0+ti] = a[ti]*a[ti];
}
}
/*
 * Driver for the testgl / testsh comparison.
 *
 * Fills two identical host arrays with random floats in [0, 1], uploads them,
 * runs testgl on the first copy and testsh on the second, and downloads both
 * results. Every host allocation and CUDA call is checked; once one of them
 * fails the remaining work is skipped. All host and device buffers are
 * released on every path.
 *
 * Returns 0 on success and 1 if any allocation, copy or launch failed.
 */
int main(void){
    const int numElements = 500000;
    const size_t size = numElements * sizeof(float);

    // Host inputs (h_A, h_B) and host outputs (h_C, h_D).
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    float *h_D = (float *)malloc(size);
    bool ok = (h_A != NULL) && (h_B != NULL) && (h_C != NULL) && (h_D != NULL);

    // Initialize both host inputs with the same random data.
    if (ok) {
        for (int i = 0; i < numElements; i++){
            h_A[i] = rand()/(float)RAND_MAX;
            h_B[i] = h_A[i];
        }
    }

    // Device buffers; they stay NULL on failure so cudaFree below is harmless.
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    float *d_D = NULL;
    ok = ok && (cudaMalloc((void **)&d_A, size) == cudaSuccess);
    ok = ok && (cudaMalloc((void **)&d_B, size) == cudaSuccess);
    ok = ok && (cudaMalloc((void **)&d_C, size) == cudaSuccess);
    ok = ok && (cudaMalloc((void **)&d_D, size) == cudaSuccess);

    // Copy the inputs to the device.
    ok = ok && (cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) == cudaSuccess);
    ok = ok && (cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) == cudaSuccess);

    if (ok) {
        // testsh indexes a 1024-element shared array by threadIdx.x,
        // so the block size must not exceed 1024.
        const int threadsPerBlock = 1024;
        const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
        testgl<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
        testsh<<<blocksPerGrid, threadsPerBlock>>>(d_B, d_D, numElements);
        // Launches do not return a status themselves; query it explicitly.
        ok = (cudaGetLastError() == cudaSuccess);
    }

    // Copy the device results to the host (these blocking copies also surface
    // errors raised while the kernels were executing).
    ok = ok && (cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) == cudaSuccess);
    ok = ok && (cudaMemcpy(h_D, d_D, size, cudaMemcpyDeviceToHost) == cudaSuccess);

    // Free device global memory.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_D);
    // Free host memory.
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_D);
    // Reset the device and exit.
    cudaDeviceReset();
    return ok ? 0 : 1;
}
If Ntimes is set to be 1, testgl costs 49us, and testsh costs 97us.
If Ntimes is set to be 100, testgl costs 9.7ms, and testsh costs 8.9ms.
I do not know why it's more than 100 times longer.
So it seems the shared memory helps only when we want to do a lot of things in device, is that right?
The card used here is GTX680.
Thanks in advance.
shared memory will always improve the performance
That's not true; it depends on the algorithm. If the kernel's global memory accesses are perfectly coalesced and each location is accessed just once, shared memory may not help. But if you are implementing, say, a matrix multiplication, where partial sums need to be held and reused, it will be useful.
It also helps when you access the same memory location more than once in the kernel, because shared memory is on-chip and its latency is roughly 100 times lower than that of global memory.
When you analyse that the kernel is bandwidth limited then its a good place to think if there is a scope of using the shared memory and increase the performance. Its also better strategy to check the occupancy calculator to check if the usage of shared memory is going to affect the occupancy.
shared memory helps only when we want to do a lot of things in device ?
Partial Yes. Shared memory helps when we want to do a lot of things in device.
In your case in the above kernel, as you are accessing the global memory more than once in the kernel it should help. It will be helpful if you can provide the complete reproducer to analyze the code. Also it will be helpful to know the card details you are running on.

C error "variable-sized object may not be initialized" [duplicate]

This question already has answers here:
Closed 10 years ago.
Possible Duplicate:
C compile error: “Variable-sized object may not be initialized”
I've got a problem cause my compiler still gives me an error: Variable-sized object may not be initialized. What's wrong with my code?
int x, y, n, i;
printf ("give me the width of the table \n");
scanf ("%d", &x);
printf ("give me the height of the table\n");
scanf ("%d", &y);
/* plansza and plansza2 are variable-length arrays (their bounds x and y are
   only known at run time); the "= 0" initializers on them are what triggers
   "variable-sized object may not be initialized". */
int plansza [x][y] = 0;
int plansza2 [x][y] = 0;
Of course I want to fill the table with 'zeroes'.
Unfortunately the program still doesn't work. The tables are displayed with numbers like '416082' on all of their cells. My code looks like this now.:
/* Variable-length arrays cannot take an initializer, so clear them explicitly. */
int plansza [x][y];
memset(plansza, 0, sizeof plansza);
int plansza2 [x][y];
memset(plansza2, 0, sizeof plansza2);
printf("plansza: \n");
for(j=0;j<x;j++) {
    for(l=0;l<y;l++) {
        /* Index with the loop counters: plansza[x][y] lies outside the
           array and is what produced the garbage values. */
        printf("%d",plansza[j][l]);
        printf(" ");
    }
    printf("\n");
}
printf("plansza2: \n");
for(m=0;m<x;m++) {
    for(o=0;o<y;o++) {
        /* Same fix as above: use m and o, not the array bounds x and y. */
        printf("%d",plansza2[m][o]);
        printf(" ");
    }
    printf("\n");
}
Your two arrays are variable length arrays. You cannot initialize a variable length array in C.
To set all the int elements of your arrays to 0 you can use the memset function:
memset(plansza, 0, sizeof plansza);
By the way to initialize an array which is not a variable length array, the valid form to initialize all the elements to 0 is:
int array[31][14] = {{0}}; // you need the {}
If both dimensions are unknown, you'll have to use a one-dimensional array, and do the indexing yourself:
/* One flat block of x * y ints, cleared to zero. */
int *ary = (int *) malloc(x * y * sizeof(int));
memset(ary, 0, x * y * sizeof(int));
/* Element (i, j) is stored at index i * x + j, i.e. the block is treated as
   rows of x ints each, so j must stay below x and i below y. */
int elem1_2 = ary[1 * x + 2];
int elem3_4 = ary[3 * x + 4];
and so on. You better define some macros or access functions. And free the memory after use.
Alternative to the suggestion of #Chris:
You can create 2d array as an array of one-dimensional arrays. This will allow you to do array element indexing as you used to. Be aware that in this case array is allocated in heap, not in stack. Therefore, when you don't need the array anymore, you must clean the allocated memory.
Example:
#include <malloc.h>
#include <string.h>
/* Create dynamic 2-d array of size x*y. */
/*
 * Create a dynamic 2-d array with x rows of y ints each, every element set
 * to zero.
 *
 * Returns the row-pointer table, or NULL if a dimension is negative or an
 * allocation fails (in which case nothing is left allocated).
 * Release the result with remove_array (arr, x).
 */
int** create_array (int x, int y)
{
    int i;
    int** arr;

    if (x < 0 || y < 0)
        return NULL;

    arr = (int**) malloc (sizeof (int*) * (size_t) x);
    if (arr == NULL)
        return NULL;

    for (i = 0; i < x; i++)
    {
        /* calloc zero-fills the whole row of y ints; sizeof (arr[i]) would
           only be the size of a pointer, not of the row. */
        arr[i] = (int*) calloc ((size_t) y, sizeof (int));
        if (arr[i] == NULL && y != 0)
        {
            /* Undo the rows allocated so far before reporting failure. */
            while (i-- > 0)
                free (arr[i]);
            free (arr);
            return NULL;
        }
    }
    return arr;
}
/* Deallocate memory. */
/*
 * Release a 2-d array made of x separately allocated rows: each row is freed
 * in order, then the row-pointer table itself.
 */
void remove_array (int** arr, int x)
{
    int** row = arr;
    int remaining = x;

    while (remaining-- > 0)
        free (*row++);
    free (arr);
}
int main()
{
int x = 5, y = 10;
int i, j;
int** plansza = create_array (x, y); /* Array creation. */
plansza[1][1] = 42; /* Array usage. */
remove_array (plansza, x); /* Array deallocation. */
return 0;
}

Sending 2D array to Cuda Kernel

I'm having a bit of trouble understanding how to send a 2D array to Cuda. I have a program that parses a large file with a 30 data points on each line. I read about 10 rows at a time and then create a matrix for each line and items(so in my example of 10 rows with 30 data points, it would be int list[10][30]; My goal is to send this array to my kernal and have each block process a row(I have gotten this to work perfectly in normal C, but Cuda has been a bit more challenging).
Here's what I'm doing so far but no luck(note: sizeofbucket = rows, and sizeOfBucketsHoldings = items in row...I know I should win a award for odd variable names):
int list[sizeOfBuckets][sizeOfBucketsHoldings]; //this is created at the start of the file and I can confirm it's filled with the correct data
#define sizeOfBuckets 10 //number of rows collected before sending them to process_list
#define sizeOfBucketsHoldings 30
//Cuda part
//define device variables
// NOTE(review): this declares a host-side 2D array of int pointers rather than
// a single device pointer.
int *dev_current_list[sizeOfBuckets][sizeOfBucketsHoldings];
//time to malloc the 2D array on device
size_t pitch;
cudaMallocPitch((int**)&dev_current_list, (size_t *)&pitch, sizeOfBucketsHoldings * sizeof(int), sizeOfBuckets);
//copy data from host to device
// NOTE(review): the allocation above uses sizeOfBucketsHoldings * sizeof(int)
// as the row width and sizeOfBuckets as the height, while this copy passes
// sizeOfBuckets * sizeof(int) as source pitch and width and
// sizeOfBucketsHoldings * sizeof(int) as the height.
cudaMemcpy2D( dev_current_list, pitch, list, sizeOfBuckets * sizeof(int), sizeOfBuckets * sizeof(int), sizeOfBucketsHoldings * sizeof(int),cudaMemcpyHostToDevice );
process_list<<<count,1>>> (sizeOfBuckets, sizeOfBucketsHoldings, dev_current_list, pitch);
//free memory of device
cudaFree( dev_current_list );
/*
 * Walks every row of a pitch-linear 2D int array and reads each element.
 *
 * sizeOfBuckets         - number of rows
 * sizeOfBucketsHoldings - number of ints per row
 * current_list          - base pointer of the pitch-linear allocation
 * pitch                 - row stride in BYTES, so the row base is computed on
 *                         a char pointer before casting back to int
 */
__global__ void process_list(int sizeOfBuckets, int sizeOfBucketsHoldings, int *current_list, int pitch) {
    for (int r = 0; r < sizeOfBuckets; ++r) {
        // Widen before multiplying so r * pitch cannot overflow int.
        const int* row = (const int*)((const char*)current_list + (size_t)r * pitch);
        for (int c = 0; c < sizeOfBucketsHoldings; ++c) {
            int element = row[c];
            (void)element; // placeholder: per-element processing goes here
        }
    }
}
The error I'm getting is:
main.cu(266): error: argument of type "int *(*)[30]" is incompatible with parameter of type "int *"
1 error detected in the compilation of "/tmp/tmpxft_00003f32_00000000-4_main.cpp1.ii".
line 266 is the kernel call process_list<<<count,1>>> (count, countListItem, dev_current_list, pitch); I think the problem is I am trying to create my array in my function as int * but how else can I create it? In my pure C code, I use int current_list[num_of_rows][num_items_in_row] which works but I can't get the same outcome to work in Cuda.
My end goal is simple I just want to get each block to process each row(sizeOfBuckets) and then have it loop through all items in that row(sizeOfBucketHoldings). I orginally just did a normal cudamalloc and cudaMemcpy but it wasn't working so I looked around and found out about MallocPitch and 2dcopy(both of which were not in my cuda by example book) and I have been trying to study examples but they seem to be giving me the same error(I'm currently reading the CUDA_C programming guide found this idea on page22 but still no luck). Any ideas? or suggestions of where to look?
Edit:
To test this, I just want to add the value of each row together(I copied the logic from the cuda by example array addition example).
My kernel:
// Test kernel from the question's edit: meant to add up the items of row
// blockIdx.x into total[blockIdx.x].
// NOTE(review): `total` is an int pointer, so `total + ...` is pointer
// arithmetic rather than total[tid] + ...; `current_list` is a plain int*, so
// current_list[tid][c] applies a second subscript to an int, and `pitch` is
// never used.
__global__ void process_list(int sizeOfBuckets, int sizeOfBucketsHoldings, int *current_list, size_t pitch, int *total) {
//TODO: we need to flip the list as well
int tid = blockIdx.x;
for (int c = 0; c < sizeOfBucketsHoldings; ++c) {
total[tid] = total + current_list[tid][c];
}
}
Here's how I declare the total array in my main:
int *dev_total;
cudaMalloc( (void**)&dev_total, sizeOfBuckets * sizeof(int) );
You have some mistakes in your code.
When you copy the host array to the device, you should pass a one-dimensional host pointer. See the function signature.
You don't need to declare a static 2D array for the device memory: that creates a static array in host memory, which you then try to re-create as a device array. Keep in mind that the device allocation must be a one-dimensional pointer, too. See this function signature.
This example should help you with memory allocation:
/*
 * Sums one row of a pitch-linear 2D int array per block.
 *
 * Launch layout: 1-D grid with one block per row; blockIdx.x selects the row
 * and the result is written to total[blockIdx.x].
 *
 * sizeOfBucketsHoldings - number of ints per row
 * total                 - output, one int per row
 * current_list          - base pointer of the pitch-linear allocation
 * pitch                 - row stride in BYTES (as returned by cudaMallocPitch)
 */
__global__ void process_list(int sizeOfBucketsHoldings, int* total, int* current_list, int pitch)
{
    int tid = blockIdx.x;
    // The row base is loop-invariant; compute it once, on a byte pointer,
    // widening first so tid * pitch cannot overflow int.
    const int* row = (const int*)((const char*)current_list + (size_t)tid * pitch);
    // Accumulate in a register and store once instead of updating global
    // memory on every iteration.
    int sum = 0;
    for (int c = 0; c < sizeOfBucketsHoldings; ++c)
    {
        sum += row[c];
    }
    total[tid] = sum;
}
/* Print a diagnostic and return false when a CUDA runtime call failed. */
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Builds a 10x30 host table whose row i is filled with the value i, uploads it
 * to pitch-linear device memory, lets process_list sum each row (one block per
 * row) and prints "row sum" pairs. Returns 0 on success, 1 if any CUDA call
 * failed; all host and device buffers are released on every path.
 */
int main()
{
    const int sizeOfBuckets = 10;         // rows
    const int sizeOfBucketsHoldings = 30; // items per row
    const size_t width = sizeOfBucketsHoldings * sizeof(int); // row width, needs to be in bytes
    const size_t height = sizeOfBuckets;                      // number of rows
    const size_t pitch_h = width; // host rows are tightly packed; always in bytes

    int* list = new int [sizeOfBuckets * sizeOfBucketsHoldings]; // one dimensional, row-major
    for (int i = 0; i < sizeOfBuckets; i++)
        for (int j = 0; j < sizeOfBucketsHoldings; j++)
            list[i * sizeOfBucketsHoldings + j] = i;
    int* h_test = new int[sizeOfBuckets];

    int* dev_current_list = NULL;
    int* test = NULL;
    size_t pitch_d = 0;

    bool ok = cudaSucceeded(cudaMallocPitch((void**)&dev_current_list, &pitch_d, width, height), "cudaMallocPitch")
           && cudaSucceeded(cudaMalloc((void**)&test, sizeOfBuckets * sizeof(int)), "cudaMalloc")
           && cudaSucceeded(cudaMemcpy2D(dev_current_list, pitch_d, list, pitch_h, width, height,
                                         cudaMemcpyHostToDevice), "cudaMemcpy2D");
    if (ok)
    {
        // One block per row; the kernel takes the device pitch (bytes) as int.
        process_list<<<sizeOfBuckets, 1>>>(sizeOfBucketsHoldings, test, dev_current_list, (int)pitch_d);
        ok = cudaSucceeded(cudaGetLastError(), "process_list launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "process_list execution")
          && cudaSucceeded(cudaMemcpy(h_test, test, sizeOfBuckets * sizeof(int),
                                      cudaMemcpyDeviceToHost), "cudaMemcpy");
    }
    if (ok)
    {
        for (int i = 0; i < sizeOfBuckets; i++)
            printf("%d %d\n", i , h_test[i]);
    }

    // Release everything; cudaFree(NULL) is a no-op if an allocation failed.
    cudaFree(test);
    cudaFree(dev_current_list);
    delete[] h_test;
    delete[] list;
    return ok ? 0 : 1;
}
To access element (x, y) of your 2D array in the kernel, use the pattern (T*)((char*)base_addr + y * pitch_d) + x.
WARNING: the pitch is always in bytes. You need to cast your pointer to a byte pointer (char*) before adding the row offset.

C matrix, allocating doesnt zero all elements?

I'm trying to write a little matrix program. Using double pointers doesn't work, so I figure the easiest way is to have a struct that holds the number of rows, the number of columns, and a 1D array as the matrix.
But there is some error in the initiation of the matrix as i get:
weird values for the indices (0,0) and (0.1) instead of 0.
Something with this perhaps:
matrix *mtrx = malloc(sizeof(matrix));
mtrx->m = malloc(r * c * sizeof(int));
matrix.c:
#include <stdio.h>
#include <stdlib.h>
#include "Matrix.h"
/*
 * Allocate an r-by-c integer matrix.
 *
 * The element storage comes from malloc, so its contents are indeterminate
 * until each element has been written with set(). Prints "Out of memory."
 * and exits with status 1 if either allocation fails.
 * Release the result with free_matrix().
 */
matrix *alloc_matrix(int r, int c)
{
    matrix *mtrx = malloc(sizeof(matrix));
    /* Only touch mtrx->m once mtrx is known to be valid; dereferencing a NULL
       mtrx would crash before the out-of-memory branch could run. */
    if (mtrx != NULL) {
        /* Widen before multiplying so r * c cannot overflow int. */
        mtrx->m = malloc((size_t)r * (size_t)c * sizeof(int));
    }
    if (mtrx == NULL || mtrx->m == NULL) {
        printf("Out of memory.");
        exit(1);
    }
    mtrx->rows = r;
    mtrx->columns = c;
    return mtrx;
}
/*
 * Release a matrix obtained from alloc_matrix: the element array first, then
 * the header. Passing NULL is a no-op, mirroring free().
 */
void free_matrix(matrix *mtrx)
{
    if (mtrx == NULL) {
        return;
    }
    free(mtrx->m);
    free(mtrx);
}
/* Store v at row r, column c of the row-major element array (no bounds check). */
void set(matrix *mtrx, int r, int c, int v)
{
    int *cells = mtrx->m;
    int offset = r * mtrx->columns + c;

    cells[offset] = v;
}
/* Return the value at row r, column c of the row-major element array (no bounds check). */
int get(matrix *mtrx, int r, int c)
{
    const int *cells = mtrx->m;
    int offset = r * mtrx->columns + c;

    return cells[offset];
}
/*
 * Print the matrix to stdout: a leading blank line, then one text line per
 * row with every value followed by a single space.
 */
void print_matrix(matrix *mtrx)
{
    int row = 0;

    printf("\n");
    while (row < mtrx->rows) {
        int col = 0;
        while (col < mtrx->columns) {
            printf("%i ", get(mtrx, row, col));
            ++col;
        }
        printf("\n");
        ++row;
    }
}
matrix.h:
/* Include guard: lets several translation units (and nested headers) include
   this file without redefining struct matrix_. */
#ifndef MATRIX_H
#define MATRIX_H

/* Dense integer matrix stored as one row-major array of rows * columns ints. */
struct matrix_ {
    int rows;    /* number of rows */
    int columns; /* number of columns */
    int *m;      /* element storage; element (r, c) is m[r * columns + c] */
};
typedef struct matrix_ matrix;

/* Allocate an r-by-c matrix. */
matrix *alloc_matrix(int r, int c);
/* Release a matrix returned by alloc_matrix. */
void free_matrix(matrix *mtrx);
/* Store v at row r, column c. */
void set(matrix *mtrx, int r, int c, int v);
/* Return the value at row r, column c. */
int get(matrix *mtrx, int r, int c);
/* Print all rows of the matrix to stdout. */
void print_matrix(matrix *m);

#endif /* MATRIX_H */
main.c:
#include <stdio.h>
#include <stdlib.h>
#include "Matrix.h"
/*
 * Demo driver: allocates a 3x4 matrix, prints it, then prints element (0, 0)
 * before and after setting it to 0, followed by the last element.
 */
int main(void)
{
    matrix *m = alloc_matrix(3,4);
    print_matrix(m);
    printf("\nm[0][0] = %i", get(m,0,0));
    set(m,0,0,0);
    printf("\nm[0][0] = %i", (m->m)[0]);
    /* A 3x4 matrix has flat indices 0..11; index 11 is element [2][3], while
       index 12 would read past the end of the allocation. */
    printf("\nm[2][3] = %i", (m->m)[11]);
    free_matrix(m);
    return 0;
}
output:
all elements except (0,0) and (0,1) is 0.
Function malloc allocates a block of memory, returning a pointer to the beginning of the block. It doesn't set all its bits to zero.
Allocating a block of memory and initializing all its bits to zero - that's what calloc function is for.
Or you can explicitly set these bits to zero by using memset
The object allocated by malloc has an unspecified value. If needed, you have to zero-ize the object yourself (for example using memset function) or call calloc instead of malloc.
malloc() is not guaranteed to zero the memory. Use calloc() to allocate memory with zeros.
malloc does not initiate with 0's (because of performance issues... it's not always what you want)...
use calloc() instead.
malloc does not zero out allocated memory. If you want to fill the matrix with zeros on allocation, use calloc instead. In your case, replace:
mtrx->m = malloc(r * c * sizeof(int));
with
mtrx->m = calloc(r*c, sizeof(int));
Answering your follow-up question:
However is there any difference in efficiency between malloc+memset and calloc? or is calloc simplye malloc+memset?
Typically, for "small" allocations calloc is equivalent to malloc + memset. For "large" allocations (multiple pages, at least), your library may do something more clever relying on some amount of OS support. One approach would be for the OS to lazily zero fill the allocated pages as they are actually used, rather than zero filling all of them immediately upon allocation.
That is correct. The C specification does not say that memory obtained this way is initialized to anything. You just get a piece of memory that holds whatever values were there before.
You can easily initialize it to zero, though: memset(mtrx->m, 0, sizeof(int) * r * c);

Resources