Negative array indexing in shared memory based 1d stencil CUDA implementation

Negative array indexing in shared memory based 1d stencil CUDA implementation - arrays

I'm currently working with CUDA programming and I'm trying to learn off of slides from a workshop I found online, which can be found here. The problem I am having is on slide 48. The following code can be found there:
__global__ void stencil_1d(int *in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
// Read input elements into shared memory
temp[lindex] = in[gindex];
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS];
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
}
....
To add a bit of context. We have an array called in which as length say N. We then have another array out which has length N+(2*RADIUS), where RADIUS has a value of 3 for this particular example. The idea is to copy array in, into array out but to place the array in in position 3 from the beginning of array out i.e out = [RADIUS][in][RADIUS], see slide for graphical representation.
The confusion comes in on the following line:
temp[lindex - RADIUS] = in[gindex - RADIUS];
If gindex is 0 then we have in[-3]. How can we read from a negative index in an array? Any help would really be appreciated.

The answer by pQB is correct. You are supposed to offset the input array pointer by RADIUS.
To show this, I'm providing below a full worked example. Hope it would be beneficial to other users.
(I would say you would need a __syncthreads() after the shared memory loads. I have added it in the below example).
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCKSIZE 32
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int N) {
__shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];
unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int lindexx = threadIdx.x + RADIUS;
// --- Read input elements into shared memory
temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
if (threadIdx.x < RADIUS) {
temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexx + offset];
}
// --- Store the result
out[gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const unsigned int N = 55 + 2 * RADIUS;
const unsigned int constant = 4;
thrust::device_vector<unsigned int> d_in(N, constant);
thrust::device_vector<unsigned int> d_out(N);
moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int i=0; i<N; i++)
printf("Element i = %i; h_out = %i\n", i, h_out[i]);
return 0;
}

You are assuming that in array points to the first position of the memory that has been allocated for this array. However, if you see slide 47, the in array has a halo (orange boxes) of three elements before and after of the data (represented as green cubes).
My assumption is (I have not done the workshop) that the input array is first initialized with an halo and then the pointer is moved in the kernel call. Something like:
stencil_1d<<<dimGrid, dimBlock>>>(in + RADIUS, out);
So, in the kernel, it's safe to do in[-3] because the pointer is not at the beginning of the array.

There are already good answers, but to focus on the actual point that caused the confusion:
In C (not only in CUDA, but in C in general), when you access an "array" by using the [ brackets ], you are actually doing pointer arithmetic.
For example, consider a pointer like this:
int* data= ... // Points to some memory
When you then write a statement like
data[3] = 42;
you are just accessing a memory location that is "three entries behind the original data pointer". So you could also have written
int* data= ... // Points to some memory
int* dataWithOffset = data+3;
dataWithOffset[0] = 42; // This will write into data[3]
and consequently,
dataWithOffset[-3] = 123; // This will write into data[0]
In fact, you can say that data[i] is the same as *(data+i), which is the same as *(i+data), which in turn is the same as i[data], but you should not use this in real programs...)

I can compile #JackOLantern's code, but there is an warning: "pointless comparison of unsigned integer with zero":
And when run, it will abort like:
I have modified the code to the following and the warning disappeared and it can get right result:
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCKSIZE 32
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, int N) {
__shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];
int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
int lindexx = threadIdx.x + RADIUS;
// --- Read input elements into shared memory
temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
if (threadIdx.x < RADIUS) {
temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexx + offset];
}
// --- Store the result
out[gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const int N = 55 + 2 * RADIUS;
const unsigned int constant = 4;
thrust::device_vector<unsigned int> d_in(N, constant);
thrust::device_vector<unsigned int> d_out(N);
moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int i=0; i<N; i++)
printf("Element i = %i; h_out = %i\n", i, h_out[i]);
return 0;
}
The result is like this:

Related

How to allocate array starting negative index

I am trying to allocate a 3D array u[-nx/2:nx/2-1][-nx/2:nx/2-1][-nx/2:nx/2-1]
int nx = 512;
double *** u = (double ***)malloc(nx * sizeof(double**));
for (int i = -nx/2; i < nx/2; i++) {
u[i] = (double **)malloc(nx * sizeof(double *));
for (int j = -nx/2; j < nx/2; j++) {
u[i][j] = (double *)malloc(nx * sizeof(double));
}
}
Is this a correct way to do it? If it's not, how should I change it?

No, that’s not correct. You can get it to work by placing every pointer in the middle of the dimension it represents:
int nx = 512;
double*** u = (double***)malloc(nx * sizeof(double**)) + nx/2;
for (int i = -nx/2; i < nx/2; i++) {
u[i] = (double**)malloc(nx * sizeof(double*)) + nx/2;
for (int j = -nx/2; j < nx/2; j++) {
u[i][j] = (double*)malloc(nx * sizeof(double)) + nx/2;
}
}
but that’s unusual and confusing, does a lot of separate allocations, and has to be undone for the deallocation step.
Consider one block with accessors instead:
#define NX 512
/* or just double* if nx is dynamic, and calculate the index manually */
double[NX][NX][NX]* u = malloc(sizeof(*u));
double array_get(double[NX][NX][NX] const* u, int i, int j, int k) {
return u[i + NX/2][j + NX/2][k + NX/2];
}
void array_set(double[NX][NX][NX]* u, int i, int j, int k, double value) {
u[i + NX/2][j + NX/2][k + NX/2] = value;
}

No.
Array in C is actually just plain/flat memory block, which is always 0 based and always in 1d (one demension).
Suppose you need a 3d array in arbitrary boundary,
say u[lb_1d, ub_1d][lb_2d, ub_2d][lb_3d, ub_3d],
you will need to do some mapping -- address space from 3d to 1d and vice versa --.
Sample implementation like this:
typedef struct
{
double* _arr;
int _lb_1d;
int _ub_1d;
int _lb_2d;
int _ub_2d;
int _lb_3d;
int _ub_3d;
}DoubleArr3D;
DoubleArr3D* create_3d_arr(int lb_1d, int ub_1d, int lb_2d, int ub_2d, int lb_3d, int ub_3d)
{
int array_size = (ub_1d - lb_1d +1) * (ub_2d - lb_2d +1) * (ub_3d - lb_3d +1);
DoubleArr3D * arr = (DoubleArr3D *)malloc( sizeof( DoubleArr3D) );
if (!arr)
{
return NULL;
}
arr->_lb_1d = lb_1d;
arr->_ub_1d = ub_1d;
arr->_lb_2d = lb_2d;
arr->_ub_2d = ub_2d;
arr->_lb_3d = lb_3d;
arr->_ub_3d = ub_3d;
arr->_arr = (double*) malloc(sizeof(double) * (size_t) array_size);
if (!arr)
{
free(arr);
return NULL;
}
return arr;
}
// arr[i1d, i2d, i3d] ==> arr_get_at(arr, i1d, i2d, i3d)
double arr_get_at(DoubleArr3D* arr, int i1d, int i2d, int i3d )
{
if (!arr || !arr->_arr)
{
// just demo of 'validation check'. in real code we should have meanful error report
return 0;
}
return arr->_arr [
i3d - arr->_lb_3d
+ (i2d - arr->_lb_2d ) * (arr->_ub_3d - arr->_lb_3d +1)
+ (i1d - arr->_lb_1d ) * (arr->_ub_2d - arr->_lb_2d +1) * (arr->_ub_3d - arr->_lb_3d +1)
];
}

First off, all C arrays have index values ranging from 0 to ELEMENT_COUNT-1. Always.
As you are using malloc, I am presuming that the value of nx is only defined at runtime. This rules out static array sizes and thus rules out using the cute arr[x][y][z] syntax as in:
#define NX 512
double arr[NX][NX][NX];
void foo(void)
{
...
arr[z1][y1][x1] += 2 * arr[z2][y2][x2];
...
}
That in turn means that to have the functionality of a 3D array with nx different values for each of its three dimensions dimension, you will need to allocate a linear memory area of size nx_cubed = nx * nx * nx. To calculate that value nx_cubed properly, you will need to check for integer overflows.
Also, you need to properly convert from signed int coordinate values to unsigned size_t values used in the 0 based index ranges.
if (nx < 0) {
fprintf(stderr, "negative value of nx\n");
exit(EXIT_FAILURE);
}
const size_t unx = nx;
const size_t nx_cubed = unx * unx * unx;
/* TODO: Complete check for overflows */
if (nx_cubed < unx) {
fprintf(stderr, "nx_cubed overflow\n");
exit(EXIT_FAILURE);
}
Then you can allocate a memory buffer of the appropriate size, and then check that the malloc call has actually worked.
double *buf = malloc(nx_cubed);
if (!buf) {
fprintf(stderr, "Error allocating memory for nx_cubed elements\n");
exit(EXIT_FAILURE);
}
Now there is the question of calculcating the array index from your x, y, and z values each ranging from -nx/2 to nx/2-1. I recommend writing a function for that which maps that range to the 0 to nx-1 range, and then calculates the proper linear index from the three 0-based values. Again, proper integer overflow checks should be performed.
size_t array3index(const size_t nx, const int x, const int y, const int z) {
const size_t half_nx = nx/2;
/* zero based 3D coordinates,
* this probably triggers some signedness warnings */
const size_t x0 = half_nx + x;
const size_t y0 = half_nx + y;
const size_t z0 = half_nx + z;
if ((x0 >= nx) || (y0 >= nx) || (z0 >= nx)) {
fprintf(stderr, "Signed coordinate(s) out of range: (%d, %d, %d)\n",
x, y, z);
exit(EXIT_FAILURE);
}
const size_t idx = nx * (nx * z0 + y0) + x0;
/* Assuming that we have already checked that nx*nx*nx does not
* overflow, and given that we have checked for x0, y0, z0 to be
* in the range of 0 to (nx-1), the idx calculation should not
* have overflown here. */
return idx;
}
Then you can do your accesses to the 3D array like
const i1 = array3index(nx, x1, y1, z1);
const i2 = array3index(nx, x2, y2, z2);
buf[i1] += 2*buf[i2];
Considering the amount of calculations needed inside array3index, I would examine whether it makes more sense to do the array iteration in the 0 to nx-1 domain directly, and only convert that to -nx/2 to nx/2-1 range values if you actually need that value within a calculation.

How to write a generic function to sort a string array in C?

I'm trying to write a generic function to sort different types of data. My code is:
#include<stdio.h>
#define GENERIC_SORT(TYPE) \
TYPE ##_SORT(TYPE a[],int n) \
{ \
int i,j; \
TYPE aux; \
for(i=1;i<n;i++) \
for(j=n-1;j>=i;j--) \
if(a[j]<a[j-1]) \
{ \
aux=a[j]; \
a[j]=a[j-1]; \
a[j-1]=aux; \
} \
}
GENERIC_SORT(int)
GENERIC_SORT(float)
GENERIC_SORT(double)
GENERIC_SORT(char)
int main(void)
{
int i,a[]={3,7,5,4,6,1};
int_SORT(a,6);
for(i=0;i<6;i++)
printf("%d ",a[i]);
return 0;
}
I'm preparing for an exam,and in the courses there's an example with GENERIC_MAX, which finds the maximum between 2 values. And I'm supposed to make the sort just like this...
It works fine on int, float, double and char. But how can I use it to sort a string array (char a[][100] or char *a[])?

A prime example of a generic sort is the C runtime library qsort(). One of its most versatile attributes is that it makes use of a "comparison function" which is passed as a parameter.
Why not adopt that tactic? While it is true that most comparison functions are trivial, in the case of accessing objects it is invaluable for interpreting what is inside the object.

What you need to do is generate the C equivalent of C++'s template functions using generics. This is usually a combination of function pointers and re-casting void* data to achieve the desired result. The qsort() function does just this. Included below is a code listing and sample run from a similar answer of mine from some time back that shows you how to use a simple Bubble Sort implementation for multiple data types.
To extend this to any data type, you just need to:
Create your own int compareDataType(void* a, void* b) function
Update the sizeOfElement and compareFcn parameters passed to the BubbleSort() function.
Your approach might work for primitive data types that already have defined comparison operations, but it won't work for abstract data types likes structs, etc.
Code Listing
/*******************************************************************************
* Preprocessor Directives
******************************************************************************/
#include <stdio.h> // printf
#include <stdlib.h> // calloc
#include <string.h> // memcpy
#include <time.h> // random seed initialization
#define ELEMENT_AT(arr, i, w) (((char*)arr) + ((i)*(w)))
#define BUF_SIZE (20)
/*******************************************************************************
* Function Prototypes
******************************************************************************/
typedef struct cricket_s {
char pname[BUF_SIZE];
char tname[BUF_SIZE];
int avg;
} cricket_t;
/*******************************************************************************
* Function Prototypes
******************************************************************************/
/* #functionName: bubbleSort
* #brief: Performs a bubble sort on an input array, using a user-
* provided function pointer for comparing data types so that
* the function can be as generic as possible.
* #param: arr: The array to search.
* #param: compareFcn: The comparison function to use.
* #param: sizeOfElement: The size of a single element in arr
* #param: numElements: The number of elements in arr
*/
void* bubbleSort(void* arr, int (*compareFcn)(void*, void*), size_t sizeOfElement, size_t numElements);
void rand_str(char *dest, size_t length);
int compareCricketAvg(void *a, void *b);
int compareCricketPname(void *a, void *b);
/*******************************************************************************
* Function Definitions
******************************************************************************/
/*----------------------------------------------------------------------------*/
void* bubbleSort(void* arr, int (*compareFcn)(void*, void*), size_t sizeOfElement, size_t numElements) {
if (!arr || !compareFcn || !numElements || !sizeOfElement) {
return NULL;
}
int i, j;
void* tempBuf;
/* Create a swap buffer */
if ((tempBuf = calloc(1, sizeOfElement)) == NULL) {
return NULL;
}
/* Sort the list via bubble sort (stable) */
for (i=0; i<(numElements-1); i++) {
for (j=0; j<(numElements - i -1); j++) {
if (compareFcn(ELEMENT_AT(arr, j, sizeOfElement), ELEMENT_AT(arr, j+1, sizeOfElement)) == (-1)) {
memcpy(tempBuf, ELEMENT_AT(arr, j, sizeOfElement), sizeOfElement);
memcpy(ELEMENT_AT(arr, j, sizeOfElement), ELEMENT_AT(arr, j+1, sizeOfElement), sizeOfElement);
memcpy(ELEMENT_AT(arr, j+1, sizeOfElement), tempBuf, sizeOfElement);
}
}
}
/* Clean up and exit */
free(tempBuf);
return arr;
}
/*******************************************************************************
* Comparson function s.
* Returns (-1) if a<b, +1 if a>b, 0 if a==b
*/
/*----------------------------------------------------------------------------*/
int compareCricketAvg(void *a, void *b) {
if (!a || !b) {
/* Treat bad input as equality */
return 0;
}
int ret;
if (((cricket_t*)a)->avg < ((cricket_t*)b)->avg) {
ret = (-1);
} else if (((cricket_t*)a)->avg > ((cricket_t*)b)->avg) {
ret = 1;
} else
ret = 0;
return ret;
}
/*----------------------------------------------------------------------------*/
int compareCricketPname(void *a, void *b) {
if (!a || !b) {
/* Treat bad input as equality */
return 0;
}
int ret;
char *s1, *s2;
s1 = ((cricket_t*)a)->pname;
s2 = ((cricket_t*)b)->pname;
ret = strncmp(s1, s2, BUF_SIZE);
if (ret > 0) {
ret = 1;
} else if (ret < 0) {
ret = (-1);
} else {
ret = 0;
}
return ret;
}
/*----------------------------------------------------------------------------*/
void rand_str(char *dest, size_t length) {
char charset[] = "0123456789"
"abcdefghijklmnopqrstuvwxyz"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
while (length-- > 0) {
size_t index = (double) rand() / RAND_MAX * (sizeof charset - 1);
*dest++ = charset[index];
}
*dest = '\0';
}
/*******************************************************************************
* Main Entry Point
******************************************************************************/
/*----------------------------------------------------------------------------*/
int main(void) {
srand(time(NULL)); // init random seed
int numPlayers = 10;
int i;
/* Dynamically allocate memory for a few teams */
cricket_t* team;
if ((team = calloc(numPlayers, sizeof(cricket_t))) == NULL) {
printf("Memory error\n");
return (-1);
}
/* Populate struct values */
for (i=0; i<numPlayers; i++) {
team[i].avg = rand() % 1000;
rand_str(team[i].pname, BUF_SIZE);
printf("Team %d - Pname:%s - Average:%d\n", i, team[i].pname, team[i].avg);
}
printf("\n");
/* Sort the list according to AVG value */
bubbleSort((void*)team, compareCricketAvg, sizeof(cricket_t), numPlayers);
/* Print sorted team */
for (i=0; i<numPlayers; i++) {
printf("Team %d - Pname:%s - Average:%d\n", i, team[i].pname, team[i].avg);
}
printf("\n");
/* Sort again, now by pname */
bubbleSort((void*)team, compareCricketPname, sizeof(cricket_t), numPlayers);
/* Print sorted team */
for (i=0; i<numPlayers; i++) {
printf("Team %d - Pname:%s - Average:%d\n", i, team[i].pname, team[i].avg);
}
printf("\n");
free(team);
return 0;
}
Sample Run
Team 0 - Pname:YY7plBOnjIi7YQTKjgqB - Average:605
Team 1 - Pname:sKGbl8pIAjHzq6U2UimD - Average:439
Team 2 - Pname:tBrmmKDNmvf6crrlQaWa - Average:226
Team 3 - Pname:vBXqESI0vju7KRuvvhS1 - Average:117
Team 4 - Pname:YdYqzPBv0s0Bqqgi9hNs - Average:209
Team 5 - Pname:VdDpJ8GB9dAnb0W1Bs14 - Average:633
Team 6 - Pname:DuUTM3bAvXvJAVsJB3TP - Average:212
Team 7 - Pname:h1Fd2hF3l8GQ2AD6LdBI - Average:237
Team 8 - Pname:kjEN3gRX5ve6ar8r7cMg - Average:467
Team 9 - Pname:Djtgpet1XdmhSal81iew - Average:473
Team 0 - Pname:VdDpJ8GB9dAnb0W1Bs14 - Average:633
Team 1 - Pname:YY7plBOnjIi7YQTKjgqB - Average:605
Team 2 - Pname:Djtgpet1XdmhSal81iew - Average:473
Team 3 - Pname:kjEN3gRX5ve6ar8r7cMg - Average:467
Team 4 - Pname:sKGbl8pIAjHzq6U2UimD - Average:439
Team 5 - Pname:h1Fd2hF3l8GQ2AD6LdBI - Average:237
Team 6 - Pname:tBrmmKDNmvf6crrlQaWa - Average:226
Team 7 - Pname:DuUTM3bAvXvJAVsJB3TP - Average:212
Team 8 - Pname:YdYqzPBv0s0Bqqgi9hNs - Average:209
Team 9 - Pname:vBXqESI0vju7KRuvvhS1 - Average:117
Team 0 - Pname:vBXqESI0vju7KRuvvhS1 - Average:117
Team 1 - Pname:tBrmmKDNmvf6crrlQaWa - Average:226
Team 2 - Pname:sKGbl8pIAjHzq6U2UimD - Average:439
Team 3 - Pname:kjEN3gRX5ve6ar8r7cMg - Average:467
Team 4 - Pname:h1Fd2hF3l8GQ2AD6LdBI - Average:237
Team 5 - Pname:YdYqzPBv0s0Bqqgi9hNs - Average:209
Team 6 - Pname:YY7plBOnjIi7YQTKjgqB - Average:605
Team 7 - Pname:VdDpJ8GB9dAnb0W1Bs14 - Average:633
Team 8 - Pname:DuUTM3bAvXvJAVsJB3TP - Average:212
Team 9 - Pname:Djtgpet1XdmhSal81iew - Average:473

You can circumvent the problem of non-symbol types like char * or struct point by defining new types with typedef. A better approach may be to pass the name of the new function to the macro as additional parameter.
The problem of the comparison can be solved by passing the comparison criterion, as in the callback function of qsort that others have pointed you to. Because the function is not really called, but substituted at compile time, it can be a macro.
Here's your macro extended:
#define GENERIC_SORT(NAME, TYPE, LT) \
void NAME(TYPE a[], int n) \
{ \
int i, j; \
\
for (i = 1; i < n; i++) { \
for (j = n - 1; j >= i; j--) { \
if (LT(a[j], a[j - 1])) { \
TYPE aux = a[j]; \
a[j] = a[j - 1]; \
a[j - 1] = aux; \
} \
} \
} \
}
Your integer sort is then:
#define LESS(a, b) ((a) < (b))
GENERIC_SORT(int_sort, int, LESS);
int main(void)
{
int array[] = {
6, 3, 9, 2, 7, 10, 5, 1
};
int n = sizeof(array) / sizeof(*array);
int i;
int_sort(array, n);
for (i = 0; i < n; i++) {
printf("%d\n", array[i]);
}
return 0;
}
Sort strings with a comparison function:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "gen.h" /* your macro */
int str_less(const char *a, const char *b)
{
return strcmp(a, b) < 0;
}
GENERIC_SORT(str_sort, const char *, str_less);
int main(void)
{
const char *array[] = {
"apricot", "orange", "banana", "apple", "papaya", "kiwi"
};
int n = sizeof(array) / sizeof(*array);
int i;
str_sort(array, n);
for (i = 0; i < n; i++) {
printf("%s\n", array[i]);
}
return 0;
}

If you already have the GENERIC_MAX code (as described in a comment), you can use it instead of comparing and swapping. To do it, define GENERIC_MIN (should be easy - very similar to generic max). Then:
#define GENERIC_SORT(TYPE) \
TYPE ##_SORT(TYPE a[],int n) \
{ \
int i,j; \
TYPE aux_min, aux_max; \
for(i=1;i<n;i++) \
for(j=n-1;j>=i;j--) \
{ \
aux_min=GENERIC_MIN(a[j], a[j-1]); \
aux_max=GENERIC_MAX(a[j], a[j-1]); \
a[j-1]=aux_min; \
a[j]=aux_max; \
} \
}

Sparse matrix addition in CUDA

I'm considering using CUDA C for a particular problem involving sparse matrix addition.
The docs seem to discuss only operations between a sparse and a dense object.
This leads me to think either: sparse-sparse addition is so trivial it may just be a case of using '+' or similar; or sparse-sparse addition is not implemented. Which is correct, and where can I find the docs?

CUSPARSE has some routines that can operate on two operands that are both sparse matrices, for addition and multiplication.
You can do sparse matrix - sparse matrix addition with CUSPARSE using the cusparse<t>csrgeam function:
This function performs following matrix-matrix operation
C=α∗A+β∗B
where A, B, and C are m×n sparse matrices (defined in CSR storage format ...
Although dense matrix addition is fairly trivial (could be about 3 lines of code, whether in serial or parallel), I personally would not put sparse addition of two CSR matrices at the same level of triviality, especially if the goal is to perform it in parallel. You could try writing your own routine; I wouldn't.

Sparse-sparse addition is surprisingly tricky unless the matrices are the same sparsity pattern. (If they are, just add the elements of the data vectors and call it a day). You'll probably note that even calling the csrgeam method takes a couple of steps - one to calculate the size of the resulting matrix, and then another to do the operation. The reason is that the resulting matrix contains the union of the two nonzero patterns.
If this wasn't tricky enough, let's talk the parallel case, which you're obviously interested in since you're talking about CUDA. If you're in the CSR format, you could parallelize by rows (something like 1 CUDA thread per matrix row as a first pass). You would want to do a first pass, possibly single-threaded to compute the row pointers and column indices, and then a parallel pass to actually run the computation.

Following Robert Crovella's answer, here is a fully worked example on how summing up two sparse matrices in CUDA:
#include <stdio.h>
#include <assert.h>
#include <cusparse.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if (CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n", __FILE__, __LINE__, \
_cusparseGetErrorEnum(err)); \
assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main() {
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
// --- Initialize matrix descriptors
cusparseMatDescr_t descrA, descrB, descrC;
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseCreateMatDescr(&descrB));
cusparseSafeCall(cusparseCreateMatDescr(&descrC));
const int M = 5; // --- Number of rows
const int N = 6; // --- Number of columns
const int nnz1 = 10; // --- Number of non-zero blocks for matrix A
const int nnz2 = 8; // --- Number of non-zero blocks for matrix A
// --- Host vectors defining the first block-sparse matrix
float *h_csrValA = (float *)malloc(nnz1 * sizeof(float));
int *h_csrRowPtrA = (int *)malloc((M + 1) * sizeof(int));
int *h_csrColIndA = (int *)malloc(nnz1 * sizeof(int));
// --- Host vectors defining the second block-sparse matrix
float *h_csrValB = (float *)malloc(nnz1 * sizeof(float));
int *h_csrRowPtrB = (int *)malloc((M + 1) * sizeof(int));
int *h_csrColIndB = (int *)malloc(nnz1 * sizeof(int));
h_csrValA[0] = 1.f;
h_csrValA[1] = 7.f;
h_csrValA[2] = 1.f;
h_csrValA[3] = 3.f;
h_csrValA[4] = -1.f;
h_csrValA[5] = 10.f;
h_csrValA[6] = 1.f;
h_csrValA[7] = -4.f;
h_csrValA[8] = 1.f;
h_csrValA[9] = 3.f;
h_csrRowPtrA[0] = 0;
h_csrRowPtrA[1] = 3;
h_csrRowPtrA[2] = 5;
h_csrRowPtrA[3] = 6;
h_csrRowPtrA[4] = 8;
h_csrRowPtrA[5] = 10;
h_csrColIndA[0] = 0;
h_csrColIndA[1] = 3;
h_csrColIndA[2] = 5;
h_csrColIndA[3] = 2;
h_csrColIndA[4] = 4;
h_csrColIndA[5] = 1;
h_csrColIndA[6] = 0;
h_csrColIndA[7] = 3;
h_csrColIndA[8] = 3;
h_csrColIndA[9] = 5;
h_csrValB[0] = 3.f;
h_csrValB[1] = 1.f;
h_csrValB[2] = -1.f;
h_csrValB[3] = 1.f;
h_csrValB[4] = -4.f;
h_csrValB[5] = -3.f;
h_csrValB[6] = -2.f;
h_csrValB[7] = 10.f;
h_csrRowPtrB[0] = 0;
h_csrRowPtrB[1] = 2;
h_csrRowPtrB[2] = 4;
h_csrRowPtrB[3] = 5;
h_csrRowPtrB[4] = 7;
h_csrRowPtrB[5] = 8;
h_csrColIndB[0] = 0;
h_csrColIndB[1] = 4;
h_csrColIndB[2] = 0;
h_csrColIndB[3] = 1;
h_csrColIndB[4] = 3;
h_csrColIndB[5] = 0;
h_csrColIndB[6] = 1;
h_csrColIndB[7] = 3;
// --- Device vectors defining the block-sparse matrices
float *d_csrValA; gpuErrchk(cudaMalloc(&d_csrValA, nnz1 * sizeof(float)));
int *d_csrRowPtrA; gpuErrchk(cudaMalloc(&d_csrRowPtrA, (M + 1) * sizeof(int)));
int *d_csrColIndA; gpuErrchk(cudaMalloc(&d_csrColIndA, nnz1 * sizeof(int)));
float *d_csrValB; gpuErrchk(cudaMalloc(&d_csrValB, nnz2 * sizeof(float)));
int *d_csrRowPtrB; gpuErrchk(cudaMalloc(&d_csrRowPtrB, (M + 1) * sizeof(int)));
int *d_csrColIndB; gpuErrchk(cudaMalloc(&d_csrColIndB, nnz2 * sizeof(int)));
gpuErrchk(cudaMemcpy(d_csrValA, h_csrValA, nnz1 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndA, h_csrColIndA, nnz1 * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrValB, h_csrValB, nnz2 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrB, h_csrRowPtrB, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndB, h_csrColIndB, nnz2 * sizeof(int), cudaMemcpyHostToDevice));
// --- Summing the two matrices
int baseC, nnz3;
// --- nnzTotalDevHostPtr points to host memory
int *nnzTotalDevHostPtr = &nnz3;
cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
int *d_csrRowPtrC; gpuErrchk(cudaMalloc(&d_csrRowPtrC, (M + 1) * sizeof(int)));
cusparseSafeCall(cusparseXcsrgeamNnz(handle, M, N, descrA, nnz1, d_csrRowPtrA, d_csrColIndA, descrB, nnz2, d_csrRowPtrB, d_csrColIndB, descrC, d_csrRowPtrC, nnzTotalDevHostPtr));
if (NULL != nnzTotalDevHostPtr) {
nnz3 = *nnzTotalDevHostPtr;
}
else{
gpuErrchk(cudaMemcpy(&nnz3, d_csrRowPtrC + M, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(&baseC, d_csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost));
nnz3 -= baseC;
}
int *d_csrColIndC; gpuErrchk(cudaMalloc(&d_csrColIndC, nnz3 * sizeof(int)));
float *d_csrValC; gpuErrchk(cudaMalloc(&d_csrValC, nnz3 * sizeof(float)));
float alpha = 1.f, beta = 1.f;
cusparseSafeCall(cusparseScsrgeam(handle, M, N, &alpha, descrA, nnz1, d_csrValA, d_csrRowPtrA, d_csrColIndA, &beta, descrB, nnz2, d_csrValB, d_csrRowPtrB, d_csrColIndB, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC));
// --- Transforming csr to dense format
float *d_C; gpuErrchk(cudaMalloc(&d_C, M * N * sizeof(float)));
cusparseSafeCall(cusparseScsr2dense(handle, M, N, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC, d_C, M));
float *h_C = (float *)malloc(M * N * sizeof(float));
gpuErrchk(cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
// --- m is row index, n column index
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
printf("%f ", h_C[m + n * M]);
}
printf("\n");
}
return 0;
}

Unspecified launch failure - parallel scan in CUDA

I am using GeForce GT 520 (compute capablility v2.1) to run a program that performs the scan operation on an array of int elements. Here's the code:
/*
This is an implementation of the parallel scan algorithm.
Only a single block of threads is used. Maximum array size = 2048
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define errorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s, file: %s line: %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void blelloch_scan(int* d_in, int* d_out, int n)
{
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = d_in[2*thid]; // load input into shared memory
temp[2*thid+1] = d_in[2*thid+1];
// build sum in place up the tree
for (int d = n>>1; d > 0; d >>= 1)
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
// clear the last element
if (thid == 0)
temp[n - 1] = 0;
__syncthreads();
// traverse down tree & build scan
for (int d = 1; d < n; d *= 2)
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
d_out[2*thid] = temp[2*thid]; // write results to device memory
d_out[2*thid+1] = temp[2*thid+1];
}
int main(int argc, char **argv)
{
int ARRAY_SIZE;
if(argc != 2)
{
printf("Input Syntax: ./a.out <number-of-elements>\nProgram terminated.\n");
exit (1);
}
else
ARRAY_SIZE = (int) atoi(*(argv+1));
int *h_in, *h_out, *d_in, *d_out, i;
h_in = (int *) malloc(sizeof(int) * ARRAY_SIZE);
h_out = (int *) malloc(sizeof(int) * ARRAY_SIZE);
cudaSetDevice(0);
cudaDeviceProp devProps;
if (cudaGetDeviceProperties(&devProps, 0) == 0)
{
printf("Using device %d:\n", 0);
printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
devProps.name, (int)devProps.totalGlobalMem,
(int)devProps.major, (int)devProps.minor,
(int)devProps.clockRate);
}
for(i = 0; i < ARRAY_SIZE; i++)
{
h_in[i] = i;
}
errorCheck(cudaMalloc((void **) &d_in, sizeof(int) * ARRAY_SIZE));
errorCheck(cudaMalloc((void **) &d_out, sizeof(int) * ARRAY_SIZE));
errorCheck(cudaMemcpy(d_in, h_in, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice));
blelloch_scan <<<1, ARRAY_SIZE / 2, sizeof(int) * ARRAY_SIZE>>> (d_in, d_out, ARRAY_SIZE);
cudaDeviceSynchronize();
errorCheck(cudaGetLastError());
errorCheck(cudaMemcpy(h_out, d_out, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
printf("Results:\n");
for(i = 0; i < ARRAY_SIZE; i++)
{
printf("h_in[%d] = %d, h_out[%d] = %d\n", i, h_in[i], i, h_out[i]);
}
return 0;
}
On compiling using nvcc -arch=sm_21 parallel-scan.cu -o parallel-scan, I get an error:
GPUassert: unspecified launch failure, file: parallel-scan-single-block.cu line: 106
Line 106 is the line after kernel launch when we check for errors using errorCheck.
This is what I am planning to implement:
From the kernel, it can be seen that if a block has 1000 threads, it can operate on 2000 elements. Therefore, blockSize = ARRAY_SIZE / 2.
And, shared memory = sizeof(int) * ARRAY_SIZE
Everything is loaded into shared mem. Then, up sweep is done, with last element being set to 0. Finally, down sweep is done to give an exclusive scan of the elements.
I have used this file as the reference to write this code. I do not understand what's the mistake in my code. Any help would be greatly appreciated.

You are launching the kernel like so
blelloch_scan <<<1, ARRAY_SIZE / 2, sizeof(int) * ARRAY_SIZE>>>
meaning that witihin then kernel 0 < thid < int(ARRAY_SIZE/2).
However, your kernel requires a minimum of (2 * int(ARRAY_SIZE/2)) + 1 words of available shared memory to work correctly, otherwise this:
temp[2*thid+1] = d_in[2*thid+1];
will produce an out-of-bounds shared memory access.
If my integer mathematical skillz are not too rusty, this should mean that the code will be safe if ARRAY_SIZE is odd, because ARRAY_SIZE == (2 * int(ARRAY_SIZE/2)) + 1 for any odd integer. However, if ARRAY_SIZE is even, then ARRAY_SIZE < (2 * int(ARRAY_SIZE/2)) + 1 and you have a problem.
It might be that shared memory page size granularity saves you for some even values of ARRAY_SIZE which should theoretically fail, because the hardware will always round up the dynamic shared memory allocation to the next page size larger than the request size. But there should be a number of even values of ARRAY_SIZE for which this fails.
I can't comment on whether the rest of the kernel is correct or not, but using a shared memory size of sizeof(int) * size_t(1 + ARRAY_SIZE) should make this particular problem go away.

C pthread Segmentation fault

so I was trying to make a GPGPU emulator with c & pthreads but ran into a rather strange problem which I have no idea why its occurring. The code is as below:
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <assert.h>
// simplifies malloc
#define MALLOC(a) (a *)malloc(sizeof(a))
// Index of x/y coordinate
#define x (0)
#define y (1)
// Defines size of a block
#define BLOCK_DIM_X (3)
#define BLOCK_DIM_Y (2)
// Defines size of the grid, i.e., how many blocks
#define GRID_DIM_X (5)
#define GRID_DIM_Y (7)
// Defines the number of threads in the grid
#define GRID_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y * GRID_DIM_X * GRID_DIM_Y)
// execution environment for the kernel
typedef struct exec_env {
int threadIdx[2]; // thread location
int blockIdx[2];
int blockDim[2];
int gridDim[2];
float *A,*B; // parameters for the thread
float *C;
} exec_env;
// kernel
void *kernel(void *arg)
{
exec_env *env = (exec_env *) arg;
// compute number of threads in a block
int sz = env->blockDim[x] * env->blockDim[y];
// compute the index of the first thread in the block
int k = sz * (env->blockIdx[y]*env->gridDim[x] + env->blockIdx[x]);
// compute the index of a thread inside a block
k = k + env->threadIdx[y]*env->blockDim[x] + env->threadIdx[x];
// check whether it is in range
assert(k >= 0 && k < GRID_SIZE && "Wrong index computation");
// print coordinates in block and grid and computed index
/*printf("tx:%d ty:%d bx:%d by:%d idx:%d\n",env->threadIdx[x],
env->threadIdx[y],
env->blockIdx[x],
env->blockIdx[y], k);
*/
// retrieve two operands
float *A = &env->A[k];
float *B = &env->B[k];
printf("%f %f \n",*A, *B);
// retrieve pointer to result
float *C = &env->C[k];
// do actual computation here !!!
// For assignment replace the following line with
// the code to do matrix addition and multiplication.
*C = *A + *B;
// free execution environment (not needed anymore)
free(env);
return NULL;
}
// main function
int main(int argc, char **argv)
{
float A[GRID_SIZE] = {-1};
float B[GRID_SIZE] = {-1};
float C[GRID_SIZE] = {-1};
pthread_t threads[GRID_SIZE];
int i=0, bx, by, tx, ty;
//Error location
/*for (i = 0; i < GRID_SIZE;i++){
A[i] = i;
B[i] = i+1;
printf("%f %f\n ", A[i], B[i]);
}*/
// Step 1: create execution environment for threads and create thread
for (bx=0;bx<GRID_DIM_X;bx++) {
for (by=0;by<GRID_DIM_Y;by++) {
for (tx=0;tx<BLOCK_DIM_X;tx++) {
for (ty=0;ty<BLOCK_DIM_Y;ty++) {
exec_env *e = MALLOC(exec_env);
assert(e != NULL && "memory exhausted");
e->threadIdx[x]=tx;
e->threadIdx[y]=ty;
e->blockIdx[x]=bx;
e->blockIdx[y]=by;
e->blockDim[x]=BLOCK_DIM_X;
e->blockDim[y]=BLOCK_DIM_Y;
e->gridDim[x]=GRID_DIM_X;
e->gridDim[y]=GRID_DIM_Y;
// set parameters
e->A = A;
e->B = B;
e->C = C;
// create thread
pthread_create(&threads[i++],NULL,kernel,(void *)e);
}
}
}
}
// Step 2: wait for completion of all threads
for (i=0;i<GRID_SIZE;i++) {
pthread_join(threads[i], NULL);
}
// Step 3: print result
for (i=0;i<GRID_SIZE;i++) {
printf("%f ",C[i]);
}
printf("\n");
return 0;
}
Ok this code here runs fine, but as soon as I uncomment the "Error Location" (for loop which assigns A[i] = i and B[i] = i + 1, I get snapped by a segmentation fault in unix, and by these random 0s within C in cygwin. I must admit my fundamentals in C is pretty poor, so it may be highly likely that I missed something. If someone can give an idea on what's going wrong it'd be greatly appreciated. Thanks.

It works when you comment that because i is still 0 when the 4 nested loops start.
You have this:
for (i = 0; i < GRID_SIZE;i++){
A[i] = i;
B[i] = i+1;
printf("%f %f\n ", A[i], B[i]);
}
/* What value is `i` now ? */
And then
pthread_create(&threads[i++],NULL,kernel,(void *)e);
^
So pthread_create will try to access some interesting indexes indeed.

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

Negative array indexing in shared memory based 1d stencil CUDA implementation - arrays

Related

How to allocate array starting negative index

How to write a generic function to sort a string array in C?

Sparse matrix addition in CUDA

Unspecified launch failure - parallel scan in CUDA

C pthread Segmentation fault

Categories

Resources