Does "more threads" mean more speed? - c

I've got a Jacobi implementation in CUDA, but the problem is:
I assign threads this way:
#define imin(a,b) (a < b ? a : b)
int dimBlocks, dimThreads;
dimThreads = 256;
dimBlocks = imin(32, (dimThreads + dim - 1)/dimThreads);
But if I use 32 threads it's faster than using 256 threads or more...
I've got these results:
Sequential times:
9900 5.882000
9900 6.071000
Parallel times:
9900 1.341000 //using 32
9900 1.626000 //using 256
Where 9900 is the matrix WIDTH... And we can see the following:
5.882 / 1.34 = 4.39
6.07 / 1.62 = 3.74
So 32 threads is more efficient than 256?
Sorry, I don't know if I should upload the code (it's a bit long); if you request it, I will.
EDIT:
//Based on doubletony algorithm
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Jacobi.cuh"
#include "thrust\host_vector.h"
#include "thrust\device_vector.h"
#include "thrust\extrema.h"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#define imin(a,b) (a < b ? a : b)
// name OF FUNCTION: __copy_vector
// PURPOSE:
// The function will copy a vector.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// source double* value vector to be copied
// dest double* reference vector copied
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __copy_vector(double *source, double *dest, const int dim)
{
int tIdx = blockDim.x * blockIdx.x + threadIdx.x;
while(tIdx < dim){
dest[tIdx] = source[tIdx];
tIdx += gridDim.x * blockDim.x;
}
}
// name OF FUNCTION: __Jacobi_sum
// PURPOSE:
// The function will execute matrix vector multiplication
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value A
// B double* value B
// C double* reference A*B
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __Jacobi_sum(const double *A,
const double *B,
double *resul,
const int dim)
{
int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
while(tIdx < dim){
resul[tIdx] = 0;
for(int i = 0; i < dim; i++)
if(tIdx != i)
resul[tIdx] += A[tIdx * dim + i] * B[i];
tIdx += gridDim.x * blockDim.x;
}
__syncthreads();
}
// name OF FUNCTION: __substract
// PURPOSE:
// The function will execute A-B=resul
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value A
// B double* value B
// C double* reference A-B
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __substract(const double *A,
const double *B,
double *C,
const int dim)
{
int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
while(tIdx < dim){
C[tIdx] = A[tIdx] - B[tIdx];
tIdx += gridDim.x * blockDim.x;
}
}
// name OF FUNCTION: __divide
// PURPOSE:
// The function will execute the jacobi division, that is,
// (B-sum)/A[i,i]
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value A
// B double* reference (B-sum)/A[i,i]
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __divide(const double *A, double *B, const int dim)
{
int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
while(tIdx < dim){
//if(A[tIdx * dim + tIdx] != 0)
B[tIdx] /= A[tIdx * dim + tIdx];
tIdx += blockDim.x * gridDim.x;
}
}
// name OF FUNCTION: __absolute
// PURPOSE:
// The function will calculate the absolute value for each
// number in an array
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* reference |A[i]|
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __absolute(double *A, const int dim)
{
int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
while(tIdx < dim){
if(A[tIdx] < 0)
A[tIdx] = -A[tIdx];
tIdx += blockDim.x * gridDim.x;
}
}
// name OF FUNCTION: Jacobi_Cuda
// PURPOSE:
// The function will calculate a X solution for a linear system
// using Jacobi's Method.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// Matrix_A double* value Matrix A(coefficients)
// Vector_B double* value Vector B
// Vector_X double* reference Solution
// dim int value Matrix Dimension
// e double value Error allowed
// maxIter int value Maximum iterations allowed
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
void Jacobi_Cuda(const double *Matrix_A,
const double *Vector_B,
double *Vector_X,
const int dim,
const double e,
const int maxIter,
double *t)
{
/** Host variables **/
int iter = 0; // iter counter
double err = 1; // error between X^k and X^k-1
double *tmp; // temporary for thrust norm
double *norm; // Vector norm
tmp = (double *) malloc(sizeof(double) * dim);
norm = (double *) malloc(sizeof(double));
int dimBlocks, dimThreads;
dimThreads = 64;
dimBlocks = imin(32, (dim + dimThreads - 1)/dimThreads);
/** ************** **/
/** Device variables **/
double *d_Matrix_A, *d_Vector_B, *d_Vector_X, *d_Vector_Y, *d_Vector_Resul;
cudaMalloc((void**)&d_Matrix_A, sizeof(double) * dim * dim);
cudaMalloc((void**)&d_Vector_B, sizeof(double) * dim);
cudaMalloc((void**)&d_Vector_X, sizeof(double) * dim);
cudaMalloc((void**)&d_Vector_Y, sizeof(double) * dim);
cudaMalloc((void**)&d_Vector_Resul, sizeof(double) * dim);
/** **************** **/
/** Initialize **/
cudaMemcpy(d_Matrix_A, Matrix_A, sizeof(double) * dim * dim,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Vector_B, Vector_B, sizeof(double) * dim, cudaMemcpyHostToDevice);
cudaMemcpy(d_Vector_X, Vector_X, sizeof(double) * dim, cudaMemcpyHostToDevice);
/** ********** **/
clock_t start,finish;
double totaltime;
start = clock();
/** Jacobi **/
while(err > e && iter < maxIter){
__copy_vector<<<dimBlocks, dimThreads>>>(d_Vector_X, d_Vector_Y, dim);
__Jacobi_sum<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_Y,
d_Vector_Resul, dim);
__substract<<<dimBlocks, dimThreads>>>(d_Vector_B, d_Vector_Resul,
d_Vector_X, dim);
__divide<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_X, dim);
__substract<<<dimBlocks, dimThreads>>>(d_Vector_Y, d_Vector_X,
d_Vector_Resul, dim);
__absolute<<<dimBlocks, dimThreads>>>(d_Vector_Resul, dim);
cudaMemcpy(tmp, d_Vector_Resul, sizeof(double) * dim, cudaMemcpyDeviceToHost);
double *t = thrust::max_element(tmp, tmp + dim); //vector norm
err = *t;
iter++;
}
finish = clock();
totaltime=(double)(finish-start)/CLOCKS_PER_SEC;
*t = totaltime;
cudaMemcpy(Vector_X, d_Vector_X, sizeof(double) * dim,
cudaMemcpyDeviceToHost);
if(iter == maxIter)
puts("Jacobi has reached maxIter!");
/** ****** **/
/** Free memory **/
cudaFree(d_Matrix_A);
cudaFree(d_Vector_B);
cudaFree(d_Vector_X);
cudaFree(d_Vector_Y);
cudaFree(d_Vector_Resul);
free(tmp);
free(norm);
/** *********** **/
}

It depends on your algorithm. Some algorithms are by definition non-parallelizable (calculating the Fibonacci series, for example). But here's a parallelizable Jacobi algorithm courtesy of Brown. Note that systems of equations can be solved either in serial or in parallel; it's just a matter of writing the code.
In short, it's impossible to know whether or not more threads = more speed unless you show us (or at least explain) the algorithm. As far as thread synchronization goes, CUDA is very (very) good at normalizing synchronization costs so (if your algorithm is proper), more threads should almost always yield more speed.

Fewer threads might be more efficient if the workload is small enough that the overhead of managing many threads causes the performance degradation.
...but without seeing your code it's hard to say. Personally I'm more inclined to believe it's just a bug in your code.
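For what it's worth, in the posted Jacobi_Cuda the block count is capped at 32 (imin(32, ...)), so each launch uses at most 32 * dimThreads threads and the grid-stride loops cover the remaining elements. Below is a hedged sketch of an uncapped launch for comparison, using the names from the posted code; whether it actually helps depends on the kernels and the GPU:
int dimThreads = 256;                                  // for example
int dimBlocks = (dim + dimThreads - 1) / dimThreads;   // enough blocks so every element gets its own thread
// each kernel's grid-stride while loop then runs at most once per thread
__copy_vector<<<dimBlocks, dimThreads>>>(d_Vector_X, d_Vector_Y, dim);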

Related

How to allocate array starting negative index

I am trying to allocate a 3D array u[-nx/2:nx/2-1][-nx/2:nx/2-1][-nx/2:nx/2-1]
int nx = 512;
double *** u = (double ***)malloc(nx * sizeof(double**));
for (int i = -nx/2; i < nx/2; i++) {
u[i] = (double **)malloc(nx * sizeof(double *));
for (int j = -nx/2; j < nx/2; j++) {
u[i][j] = (double *)malloc(nx * sizeof(double));
}
}
Is this a correct way to do it? If it's not, how should I change it?
No, that’s not correct. You can get it to work by placing every pointer in the middle of the dimension it represents:
int nx = 512;
double*** u = (double***)malloc(nx * sizeof(double**)) + nx/2;
for (int i = -nx/2; i < nx/2; i++) {
u[i] = (double**)malloc(nx * sizeof(double*)) + nx/2;
for (int j = -nx/2; j < nx/2; j++) {
u[i][j] = (double*)malloc(nx * sizeof(double)) + nx/2;
}
}
but that’s unusual and confusing, does a lot of separate allocations, and has to be undone for the deallocation step.
Consider one block with accessors instead:
#define NX 512
/* or just double* if nx is dynamic, and calculate the index manually */
double (*u)[NX][NX] = malloc(NX * sizeof *u);   /* one contiguous NX*NX*NX block */
double array_get(double (*u)[NX][NX], int i, int j, int k) {
    return u[i + NX/2][j + NX/2][k + NX/2];
}
void array_set(double (*u)[NX][NX], int i, int j, int k, double value) {
    u[i + NX/2][j + NX/2][k + NX/2] = value;
}
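A short usage sketch (assuming u has been allocated as above inside some function; the values are made up, just to show the call pattern):
array_set(u, -NX/2, 0, NX/2 - 1, 3.14);        /* signed indices in [-NX/2, NX/2-1] */
double v = array_get(u, -NX/2, 0, NX/2 - 1);   /* v == 3.14 */
free(u);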
No.
An array in C is actually just a plain, flat memory block, which is always 0-based and always one-dimensional.
Suppose you need a 3D array with arbitrary bounds,
say u[lb_1d, ub_1d][lb_2d, ub_2d][lb_3d, ub_3d];
you will need to do some mapping of the address space from 3D to 1D and vice versa.
A sample implementation looks like this:
typedef struct
{
double* _arr;
int _lb_1d;
int _ub_1d;
int _lb_2d;
int _ub_2d;
int _lb_3d;
int _ub_3d;
}DoubleArr3D;
DoubleArr3D* create_3d_arr(int lb_1d, int ub_1d, int lb_2d, int ub_2d, int lb_3d, int ub_3d)
{
int array_size = (ub_1d - lb_1d +1) * (ub_2d - lb_2d +1) * (ub_3d - lb_3d +1);
DoubleArr3D * arr = (DoubleArr3D *)malloc( sizeof( DoubleArr3D) );
if (!arr)
{
return NULL;
}
arr->_lb_1d = lb_1d;
arr->_ub_1d = ub_1d;
arr->_lb_2d = lb_2d;
arr->_ub_2d = ub_2d;
arr->_lb_3d = lb_3d;
arr->_ub_3d = ub_3d;
arr->_arr = (double*) malloc(sizeof(double) * (size_t) array_size);
if (!arr->_arr)
{
free(arr);
return NULL;
}
return arr;
}
// arr[i1d, i2d, i3d] ==> arr_get_at(arr, i1d, i2d, i3d)
double arr_get_at(DoubleArr3D* arr, int i1d, int i2d, int i3d )
{
if (!arr || !arr->_arr)
{
// just a demo of a validation check; real code should have meaningful error reporting
return 0;
}
return arr->_arr [
i3d - arr->_lb_3d
+ (i2d - arr->_lb_2d ) * (arr->_ub_3d - arr->_lb_3d +1)
+ (i1d - arr->_lb_1d ) * (arr->_ub_2d - arr->_lb_2d +1) * (arr->_ub_3d - arr->_lb_3d +1)
];
}
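A short usage sketch (the bounds are made up, just to show the call pattern; a setter mirroring arr_get_at would be needed for writes):
DoubleArr3D* arr = create_3d_arr(-256, 255, -256, 255, -256, 255);
if (arr) {
    /* no setter was shown above; writes would go through a function mirroring arr_get_at */
    double v = arr_get_at(arr, -256, 0, 255); /* indeterminate until the element is written */
    free(arr->_arr);
    free(arr);
}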
First off, all C arrays have index values ranging from 0 to ELEMENT_COUNT-1. Always.
As you are using malloc, I am presuming that the value of nx is only defined at runtime. This rules out static array sizes and thus rules out using the cute arr[x][y][z] syntax as in:
#define NX 512
double arr[NX][NX][NX];
void foo(void)
{
...
arr[z1][y1][x1] += 2 * arr[z2][y2][x2];
...
}
That in turn means that to have the functionality of a 3D array with nx different values for each of its three dimensions, you will need to allocate a linear memory area of size nx_cubed = nx * nx * nx. To calculate that value nx_cubed properly, you will need to check for integer overflows.
Also, you need to properly convert from signed int coordinate values to unsigned size_t values used in the 0 based index ranges.
if (nx < 0) {
fprintf(stderr, "negative value of nx\n");
exit(EXIT_FAILURE);
}
const size_t unx = nx;
const size_t nx_cubed = unx * unx * unx;
/* TODO: Complete check for overflows */
if (nx_cubed < unx) {
fprintf(stderr, "nx_cubed overflow\n");
exit(EXIT_FAILURE);
}
Then you can allocate a memory buffer of the appropriate size, and then check that the malloc call has actually worked.
double *buf = malloc(nx_cubed * sizeof(double));
if (!buf) {
fprintf(stderr, "Error allocating memory for nx_cubed elements\n");
exit(EXIT_FAILURE);
}
Now there is the question of calculating the array index from your x, y, and z values, each ranging from -nx/2 to nx/2-1. I recommend writing a function for that which maps that range to the 0 to nx-1 range, and then calculates the proper linear index from the three 0-based values. Again, proper integer overflow checks should be performed.
size_t array3index(const size_t nx, const int x, const int y, const int z) {
const size_t half_nx = nx/2;
/* zero based 3D coordinates,
* this probably triggers some signedness warnings */
const size_t x0 = half_nx + x;
const size_t y0 = half_nx + y;
const size_t z0 = half_nx + z;
if ((x0 >= nx) || (y0 >= nx) || (z0 >= nx)) {
fprintf(stderr, "Signed coordinate(s) out of range: (%d, %d, %d)\n",
x, y, z);
exit(EXIT_FAILURE);
}
const size_t idx = nx * (nx * z0 + y0) + x0;
/* Assuming that we have already checked that nx*nx*nx does not
* overflow, and given that we have checked for x0, y0, z0 to be
* in the range of 0 to (nx-1), the idx calculation should not
* have overflown here. */
return idx;
}
Then you can do your accesses to the 3D array like
const size_t i1 = array3index(nx, x1, y1, z1);
const size_t i2 = array3index(nx, x2, y2, z2);
buf[i1] += 2*buf[i2];
Considering the amount of calculations needed inside array3index, I would examine whether it makes more sense to do the array iteration in the 0 to nx-1 domain directly, and only convert that to -nx/2 to nx/2-1 range values if you actually need that value within a calculation.
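For example, a sketch of iterating directly in the 0-based domain; compute_value() is a hypothetical stand-in for whatever produces an element:
for (size_t z0 = 0; z0 < unx; z0++)
    for (size_t y0 = 0; y0 < unx; y0++)
        for (size_t x0 = 0; x0 < unx; x0++) {
            const int x = (int)x0 - nx/2;                        /* signed coordinate, only if a formula needs it */
            buf[unx * (unx * z0 + y0) + x0] = compute_value(x);  /* hypothetical */
        }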

Sparse matrix addition in CUDA

I'm considering using CUDA C for a particular problem involving sparse matrix addition.
The docs seem to discuss only operations between a sparse and a dense object.
This leads me to think either: sparse-sparse addition is so trivial it may just be a case of using '+' or similar; or sparse-sparse addition is not implemented. Which is correct, and where can I find the docs?
CUSPARSE has some routines that can operate on two operands that are both sparse matrices, for addition and multiplication.
You can do sparse matrix - sparse matrix addition with CUSPARSE using the cusparse<t>csrgeam function:
This function performs the following matrix-matrix operation
C=α∗A+β∗B
where A, B, and C are m×n sparse matrices (defined in CSR storage format ...
Although dense matrix addition is fairly trivial (could be about 3 lines of code, whether in serial or parallel), I personally would not put sparse addition of two CSR matrices at the same level of triviality, especially if the goal is to perform it in parallel. You could try writing your own routine; I wouldn't.
Sparse-sparse addition is surprisingly tricky unless the matrices are the same sparsity pattern. (If they are, just add the elements of the data vectors and call it a day). You'll probably note that even calling the csrgeam method takes a couple of steps - one to calculate the size of the resulting matrix, and then another to do the operation. The reason is that the resulting matrix contains the union of the two nonzero patterns.
If this wasn't tricky enough, let's talk the parallel case, which you're obviously interested in since you're talking about CUDA. If you're in the CSR format, you could parallelize by rows (something like 1 CUDA thread per matrix row as a first pass). You would want to do a first pass, possibly single-threaded to compute the row pointers and column indices, and then a parallel pass to actually run the computation.
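To make the easy case above concrete, here is a minimal CUDA sketch assuming A and B share exactly the same CSR sparsity pattern; the general case with differing patterns still needs the csrgeam-style two-pass approach:
// C reuses A's rowPtr/colInd; only the value arrays are added elementwise
__global__ void csr_add_same_pattern(const float *valA, const float *valB,
                                     float *valC, int nnz)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nnz)
        valC[i] = valA[i] + valB[i];
}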
Following Robert Crovella's answer, here is a fully worked example of how to sum two sparse matrices in CUDA:
#include <stdio.h>
#include <assert.h>
#include <cusparse.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if (CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n", __FILE__, __LINE__, \
_cusparseGetErrorEnum(err)); \
assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main() {
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
// --- Initialize matrix descriptors
cusparseMatDescr_t descrA, descrB, descrC;
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseCreateMatDescr(&descrB));
cusparseSafeCall(cusparseCreateMatDescr(&descrC));
const int M = 5; // --- Number of rows
const int N = 6; // --- Number of columns
const int nnz1 = 10; // --- Number of non-zero elements in matrix A
const int nnz2 = 8; // --- Number of non-zero elements in matrix B
// --- Host vectors defining the first block-sparse matrix
float *h_csrValA = (float *)malloc(nnz1 * sizeof(float));
int *h_csrRowPtrA = (int *)malloc((M + 1) * sizeof(int));
int *h_csrColIndA = (int *)malloc(nnz1 * sizeof(int));
// --- Host vectors defining the second block-sparse matrix
float *h_csrValB = (float *)malloc(nnz2 * sizeof(float));
int *h_csrRowPtrB = (int *)malloc((M + 1) * sizeof(int));
int *h_csrColIndB = (int *)malloc(nnz2 * sizeof(int));
h_csrValA[0] = 1.f;
h_csrValA[1] = 7.f;
h_csrValA[2] = 1.f;
h_csrValA[3] = 3.f;
h_csrValA[4] = -1.f;
h_csrValA[5] = 10.f;
h_csrValA[6] = 1.f;
h_csrValA[7] = -4.f;
h_csrValA[8] = 1.f;
h_csrValA[9] = 3.f;
h_csrRowPtrA[0] = 0;
h_csrRowPtrA[1] = 3;
h_csrRowPtrA[2] = 5;
h_csrRowPtrA[3] = 6;
h_csrRowPtrA[4] = 8;
h_csrRowPtrA[5] = 10;
h_csrColIndA[0] = 0;
h_csrColIndA[1] = 3;
h_csrColIndA[2] = 5;
h_csrColIndA[3] = 2;
h_csrColIndA[4] = 4;
h_csrColIndA[5] = 1;
h_csrColIndA[6] = 0;
h_csrColIndA[7] = 3;
h_csrColIndA[8] = 3;
h_csrColIndA[9] = 5;
h_csrValB[0] = 3.f;
h_csrValB[1] = 1.f;
h_csrValB[2] = -1.f;
h_csrValB[3] = 1.f;
h_csrValB[4] = -4.f;
h_csrValB[5] = -3.f;
h_csrValB[6] = -2.f;
h_csrValB[7] = 10.f;
h_csrRowPtrB[0] = 0;
h_csrRowPtrB[1] = 2;
h_csrRowPtrB[2] = 4;
h_csrRowPtrB[3] = 5;
h_csrRowPtrB[4] = 7;
h_csrRowPtrB[5] = 8;
h_csrColIndB[0] = 0;
h_csrColIndB[1] = 4;
h_csrColIndB[2] = 0;
h_csrColIndB[3] = 1;
h_csrColIndB[4] = 3;
h_csrColIndB[5] = 0;
h_csrColIndB[6] = 1;
h_csrColIndB[7] = 3;
// --- Device vectors defining the block-sparse matrices
float *d_csrValA; gpuErrchk(cudaMalloc(&d_csrValA, nnz1 * sizeof(float)));
int *d_csrRowPtrA; gpuErrchk(cudaMalloc(&d_csrRowPtrA, (M + 1) * sizeof(int)));
int *d_csrColIndA; gpuErrchk(cudaMalloc(&d_csrColIndA, nnz1 * sizeof(int)));
float *d_csrValB; gpuErrchk(cudaMalloc(&d_csrValB, nnz2 * sizeof(float)));
int *d_csrRowPtrB; gpuErrchk(cudaMalloc(&d_csrRowPtrB, (M + 1) * sizeof(int)));
int *d_csrColIndB; gpuErrchk(cudaMalloc(&d_csrColIndB, nnz2 * sizeof(int)));
gpuErrchk(cudaMemcpy(d_csrValA, h_csrValA, nnz1 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndA, h_csrColIndA, nnz1 * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrValB, h_csrValB, nnz2 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrB, h_csrRowPtrB, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndB, h_csrColIndB, nnz2 * sizeof(int), cudaMemcpyHostToDevice));
// --- Summing the two matrices
int baseC, nnz3;
// --- nnzTotalDevHostPtr points to host memory
int *nnzTotalDevHostPtr = &nnz3;
cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
int *d_csrRowPtrC; gpuErrchk(cudaMalloc(&d_csrRowPtrC, (M + 1) * sizeof(int)));
cusparseSafeCall(cusparseXcsrgeamNnz(handle, M, N, descrA, nnz1, d_csrRowPtrA, d_csrColIndA, descrB, nnz2, d_csrRowPtrB, d_csrColIndB, descrC, d_csrRowPtrC, nnzTotalDevHostPtr));
if (NULL != nnzTotalDevHostPtr) {
nnz3 = *nnzTotalDevHostPtr;
}
else{
gpuErrchk(cudaMemcpy(&nnz3, d_csrRowPtrC + M, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(&baseC, d_csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost));
nnz3 -= baseC;
}
int *d_csrColIndC; gpuErrchk(cudaMalloc(&d_csrColIndC, nnz3 * sizeof(int)));
float *d_csrValC; gpuErrchk(cudaMalloc(&d_csrValC, nnz3 * sizeof(float)));
float alpha = 1.f, beta = 1.f;
cusparseSafeCall(cusparseScsrgeam(handle, M, N, &alpha, descrA, nnz1, d_csrValA, d_csrRowPtrA, d_csrColIndA, &beta, descrB, nnz2, d_csrValB, d_csrRowPtrB, d_csrColIndB, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC));
// --- Transforming csr to dense format
float *d_C; gpuErrchk(cudaMalloc(&d_C, M * N * sizeof(float)));
cusparseSafeCall(cusparseScsr2dense(handle, M, N, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC, d_C, M));
float *h_C = (float *)malloc(M * N * sizeof(float));
gpuErrchk(cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
// --- m is row index, n column index
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
printf("%f ", h_C[m + n * M]);
}
printf("\n");
}
return 0;
}

Negative array indexing in shared memory based 1d stencil CUDA implementation

I'm currently working with CUDA programming and I'm trying to learn off of slides from a workshop I found online, which can be found here. The problem I am having is on slide 48. The following code can be found there:
__global__ void stencil_1d(int *in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
// Read input elements into shared memory
temp[lindex] = in[gindex];
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS];
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
}
....
To add a bit of context: we have an array called in, which has length, say, N. We then have another array out, which has length N + (2*RADIUS), where RADIUS has a value of 3 for this particular example. The idea is to copy array in into array out, but to place in at offset RADIUS from the beginning of out, i.e. out = [RADIUS][in][RADIUS]; see the slide for a graphical representation.
The confusion comes in on the following line:
temp[lindex - RADIUS] = in[gindex - RADIUS];
If gindex is 0 then we have in[-3]. How can we read from a negative index in an array? Any help would really be appreciated.
The answer by pQB is correct. You are supposed to offset the input array pointer by RADIUS.
To show this, I'm providing below a full worked example. Hope it would be beneficial to other users.
(I would say you would need a __syncthreads() after the shared memory loads. I have added it in the below example).
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCKSIZE 32
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int N) {
__shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];
unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int lindexx = threadIdx.x + RADIUS;
// --- Read input elements into shared memory
temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
if (threadIdx.x < RADIUS) {
temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexx + offset];
}
// --- Store the result
out[gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const unsigned int N = 55 + 2 * RADIUS;
const unsigned int constant = 4;
thrust::device_vector<unsigned int> d_in(N, constant);
thrust::device_vector<unsigned int> d_out(N);
moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int i=0; i<N; i++)
printf("Element i = %i; h_out = %i\n", i, h_out[i]);
return 0;
}
You are assuming that the in array points to the first position of the memory that has been allocated for it. However, if you look at slide 47, the in array has a halo (orange boxes) of three elements before and after the data (represented as green cubes).
My assumption (I have not done the workshop) is that the input array is first allocated with a halo and then the pointer is offset in the kernel call. Something like:
stencil_1d<<<dimGrid, dimBlock>>>(in + RADIUS, out);
So, inside the kernel, it's safe to do in[-3] because the pointer is not at the beginning of the allocation.
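A hedged sketch of what that host-side setup might look like (the names and sizes are assumptions, not the workshop's actual code):
int *d_in, *d_out;
cudaMalloc(&d_in, (N + 2 * RADIUS) * sizeof(int)); // halo of RADIUS cells at each end
cudaMalloc(&d_out, N * sizeof(int));
// ... fill d_in so the real data lives at d_in[RADIUS .. RADIUS + N - 1] ...
stencil_1d<<<dimGrid, dimBlock>>>(d_in + RADIUS, d_out); // kernel may now safely read in[gindex - RADIUS]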
There are already good answers, but to focus on the actual point that caused the confusion:
In C (not only in CUDA, but in C in general), when you access an "array" by using the [ brackets ], you are actually doing pointer arithmetic.
For example, consider a pointer like this:
int* data= ... // Points to some memory
When you then write a statement like
data[3] = 42;
you are just accessing a memory location that is three entries past the original data pointer. So you could also have written
int* data= ... // Points to some memory
int* dataWithOffset = data+3;
dataWithOffset[0] = 42; // This will write into data[3]
and consequently,
dataWithOffset[-3] = 123; // This will write into data[0]
(In fact, you can say that data[i] is the same as *(data+i), which is the same as *(i+data), which in turn is the same as i[data], but you should not use this in real programs...)
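Putting those pieces together, a tiny self-contained C example:
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    int *data = calloc(8, sizeof *data);   /* data[0] .. data[7], zero-initialized */
    if (!data) return 1;
    int *dataWithOffset = data + 3;        /* valid indices now run from -3 to 4 */
    dataWithOffset[0]  = 42;               /* writes data[3] */
    dataWithOffset[-3] = 123;              /* writes data[0] */
    printf("%d %d\n", data[0], data[3]);   /* prints: 123 42 */
    free(data);
    return 0;
}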
I can compile @JackOLantern's code, but there is a warning: "pointless comparison of unsigned integer with zero", and when run it aborts.
I have modified the code as follows; the warning disappears and it produces the right result:
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCKSIZE 32
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, int N) {
__shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];
int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
int lindexx = threadIdx.x + RADIUS;
// --- Read input elements into shared memory
temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
if (threadIdx.x < RADIUS) {
temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexx + offset];
}
// --- Store the result
out[gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const int N = 55 + 2 * RADIUS;
const unsigned int constant = 4;
thrust::device_vector<unsigned int> d_in(N, constant);
thrust::device_vector<unsigned int> d_out(N);
moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int i=0; i<N; i++)
printf("Element i = %i; h_out = %i\n", i, h_out[i]);
return 0;
}

error C2102: '&' requires l-value

The code line: gsl_blas_daxpy(-a,&gsl_matrix_column(D, q).vector,y);
causes the error
error C2102: '&' requires l-value
Now the problem is that I have no control over the GSL functions, so I don't know how to work around this (removing the "&" didn't work).
Afterwards I get
error C2198: 'gsl_blas_daxpy' : too few arguments for call
I'm using Visual Studio 2010.
GSL_EXPORT int gsl_blas_daxpy (double alpha,
const gsl_vector * X,
gsl_vector * Y);
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_blas.h>
#define M (10) // Number of columns in dictionary
#define N ((int)(M/2)) // Number of rows in dictionary
int K = 0.07*M; //Number of non-zero elements in signal - the sparsity
int P=1; //number of signals
double epsilon = 1.0e-7; // Residual error
int numOfIterations = N; /* Max num of iterations - same as num of elements in signal */
double sign(double x){return (x>=0) - (x<0);} // Sign function
int main(int argc, char** argv)
{
int n, m, k, iter, q;
double normi, normf, tmp , norm=sqrt(N), htime;
gsl_matrix *D; // A random dictionary used for encoding the sparse signal NxM
gsl_vector *x; // Sparse info signal (encoder input) MxP
gsl_vector *z; // Evaluated Sparse info signal (decoder output) MxP
gsl_vector *r; // Residual error vector MxP
gsl_vector *y; // Sparse representation of signal (encoder output) NxP
gsl_vector_view v;
clock_t start; //for measuring performance
printf("\nDictionary is:NxM=%dx%d,and the signal sparsity is K=%d", N, M, K);
srand(time(NULL)); //Initialize srand
start =clock(); //Initialize clock
/* Initiallize D as a Bernoulli random dictionary */
D = gsl_matrix_alloc (N, M);
for(m=0; m<M; m++)
{
for(n=0; n<N; n++)
{
tmp=sign(2.0*rand()/(double)RAND_MAX-1.0)/norm;
gsl_matrix_set (D, n, m, tmp); //D[n,m]=tmp
}
}
/* Create a random K-sparse info signal */
x = gsl_vector_alloc(M);
for(k=0; k<K; k++)
{
gsl_vector_set(x, rand()%M, 2.0*rand()/(float)RAND_MAX - 1.0); //put random values at k random positions
}
/* Allocate memory for solution (evaluated signal) */
z = gsl_vector_calloc(M);
/* Allocate memory for residual vector */
r = gsl_vector_calloc(M);
/* Allocate memory for the encoded signal vector (its representation) */
y = gsl_vector_alloc(N);
htime=((double)clock()-start)/CLOCKS_PER_SEC;
printf("\nTime data allocation: %f", htime);
/* Encoding the signal (x to y) */
start = clock();
gsl_blas_dgemv(CblasNoTrans, 1, D, x, 0, y); // y = Dx
htime=((double)clock()-start)/CLOCKS_PER_SEC;
printf("\nTime for encoding: %f", htime);
/* Decoding the signal */
start = clock();
normi = gsl_blas_dnrm2(y); // ||y|| (L2 norm)
epsilon = sqrt(epsilon * normi);
normf = normi;
iter = 0;
/*iterate till the computational error is small enough*/
while(normf > epsilon && iter < numOfIterations)
{
gsl_blas_dgemv(CblasTrans, 1, D, y, 0, r); // r=D'*y
q = gsl_blas_idamax(r); //index of max element in residual vector
tmp = gsl_vector_get(r, q); //the max element in r
gsl_vector_set(z, q, gsl_vector_get(z, q)+tmp); // z[q]=z[q]+ tmp
v=gsl_matrix_column(D, q); // choose the dictionary's atom (column) with the index of the largest element in r
gsl_blas_daxpy(-tmp,&v.vector,y); // y = y-tmp*v
normf = gsl_blas_dnrm2(y); // ||y|| (L2 norm)
iter++;
}
htime = ((double)clock()-start)/CLOCKS_PER_SEC;
printf("\nTime for decoding: %f", htime);
tmp = 100.0*(normf*normf)/(normi*normi); // the error at end of algorithm
printf("\nComputation residual error: %f",tmp);
/* Check the solution (evaluated signal) against the original signal */
printf("\nSolution (first column),Reference (second column):");
getchar(); // wait for pressing a key
for(m=0; m<M; m++)
{
printf("\n%.3f\t%.3f", gsl_vector_get(x, m),gsl_vector_get(z, m));
}
normi = gsl_blas_dnrm2(x);
gsl_blas_daxpy(-1.0, x, z); // z = z-x
normf = gsl_blas_dnrm2(z); // ||z|| (L2 norm)
tmp = 100.0*(normf*normf)/(normi*normi); //final error
printf("\nSolution residual error: %f\n",tmp);
/* Memory clean up and shutdown*/
gsl_vector_free(y); gsl_vector_free(r);
gsl_vector_free(z); gsl_vector_free(x);
gsl_matrix_free(D);
getchar();
return EXIT_SUCCESS;
}
gsl_matrix_column(D, q).vector is an R-value. You can't take its address. You need an L-value, so assign it to a named variable first, then pass the address of that variable to the function.
If you make a more permanent home for the return value of gsl_matrix_column, (this particular) problem will go away.
Here is some simplified code that illustrates how one might capture a return value in an addressable slot:
struct _foo {
int i;
};
struct _foo bar () {
struct _foo result = { 5 };
return result;
}
/* won't compile; 'lvalue required as unary & operand' */
void qux () {
int *j = &bar().i;
}
/* compiles OK */
void qal () {
struct _foo result = bar();
int* j = &result.i;
}
gsl_vector_view c=gsl_matrix_column(D, q);
gsl_blas_daxpy(-a,&c.vector,y);
I think introducing a temporary variable lets you pass a pointer to it to the function.
EDIT: Well, trying to understand the problem, I wanted to know what the functions expect:
int gsl_blas_daxpy (double alpha, const gsl_vector * x, gsl_vector * y)
and
gsl_vector_view gsl_matrix_column (gsl_matrix * m, size_t j)
with some explanation:
A vector view can be passed to any subroutine which takes a vector
argument just as a directly allocated vector would be, using
&view.vector.
and an example:
for (j = 0; j < 10; j++)
{
gsl_vector_view column = gsl_matrix_column (m, j);
double d;
d = gsl_blas_dnrm2 (&column.vector);
printf ("matrix column %d, norm = %g\n", j, d);
}
Now we have another problem:
Here is another answer:
Are you aware that int K = 0.07*M is K = 0 (0.7 truncates to 0)?
#define M (10) // Number of columns in dictionary */
int K = 0.07*M; //Number of non-zero elements in signal - the sparsity
gsl_vector_alloc does not initialize the vector x; x will contain garbage values, not 0. Did you mean x = gsl_vector_calloc(M), with a c? That sets x to 0.
/* Create a random K-sparse info signal */
x = gsl_vector_alloc(M);
for(k=0; k<K; k++) // K=0, so the loop gets skipped and x is not modified
{
gsl_vector_set(x, rand()%M, 2.0*rand()/(float)RAND_MAX - 1.0); //put random values at k random positions
}
(And even then you will have at most K random values, possibly fewer, since rand()%M can hit the same position more than once.)
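A minimal sketch of one possible fix, assuming at least one nonzero element is intended:
int K = (int)(0.07 * M + 0.5); /* 0.07*10 + 0.5 = 1.2, truncated to 1 instead of 0 */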

How do I parallelize this triple loop in an efficient way?

I'm trying to parallelize a function which takes as input three arrays (x, y, and prb) and one scalar, and outputs three arrays (P1, Pt1, and Px).
The original C code is here (the outlier and E are inconsequential):
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define max(A, B) ((A) > (B) ? (A) : (B))
#define min(A, B) ((A) < (B) ? (A) : (B))
void cpd_comp(
double* x,
double* y,
double* prb,
double* sigma2,
double* outlier,
double* P1,
double* Pt1,
double* Px,
double* E,
int N,
int M,
int D
)
{
int n, m, d;
double ksig, diff, razn, outlier_tmp, sp;
double *P, *temp_x;
P = (double*) calloc(M, sizeof(double));
temp_x = (double*) calloc(D, sizeof(double));
ksig = -2.0 * *sigma2;
for (n=0; n < N; n++) {
sp=0;
for (m=0; m < M; m++) {
razn=0;
for (d=0; d < D; d++) {
diff=*(x+n+d*N)-*(y+m+d*M); diff=diff*diff;
razn+=diff;
}
*(P+m)=exp(razn/ksig) ;
sp+=*(P+m);
}
*(Pt1+n)=*(prb+n);
for (d=0; d < D; d++) {
*(temp_x+d)=*(x+n+d*N)/ sp;
}
for (m=0; m < M; m++) {
*(P1+m)+=((*(P+m)/ sp) **(prb+n));
for (d=0; d < D; d++) {
*(Px+m+d*M)+= (*(temp_x+d)**(P+m)**(prb+n));
}
}
*E += -log(sp);
}
*E +=D*N*log(*sigma2)/2;
free((void*)P);
free((void*)temp_x);
return;
}
Here is my attempt at parallelizing it:
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
/*headers*/
void cpd_comp(
float * x, //Points to register [N*D]
float * y, //Points to be registered [M*D]
float * prb, //Vector of probabilities [N]
float * sigma2, //Square of sigma
float ** P1, //P1, output, [M]
float ** Pt1, //Pt1, output, [N]
float ** Px, //Px, output, [M*3]
int N, //Number of points, i.e. rows, in x
int M //Number of points, i.e. rows, in y
);
__global__ void d_computeP(
float * P,
float * P1,
float * Px,
float * ProbabilityMatrix,
float * x,
float * y,
float * prb,
float ksig,
const int N,
const int M);
__global__ void d_sumP(
float * sp,
float * P1timessp,
float * Pxtimessp,
float * P1,
float * Px,
const int N,
const int M);
/*implementations*/
void cpd_comp(
float * x, //Points to register [N*D]
float * y, //Points to be registered [M*D]
float * prb, //Vector of probabilities [N]
float * sigma2, //Scalar
float ** P1, //P1, output, [M]
float ** Pt1, //Pt1, output, [N]
float ** Px, //Px, output, [M*3]
int N, //Number of points, i.e. rows, in x
int M //Number of points, i.e. rows, in y
){
//X is generatedPointPos
//Y is points
float
*P,
*P1timessp,
*Pxtimessp,
ksig = -2.0 * (*sigma2),
*h_sumofP = new float[N], //sum of P, on host
*d_sumofP; //sum of P, on device
cudaMalloc((void**)&P, sizeof(float)*M*N);
cudaMalloc((void**)&P1timessp,sizeof(float)*M*N);
cudaMalloc((void**)&Pxtimessp,sizeof(float)*M*N*3);
cudaMalloc((void**)&d_sumofP, sizeof(float)*N);
cudaMalloc((void**)P1, sizeof(float)*M);
cudaMalloc((void**)Px, sizeof(float)*M*3);
cudaMalloc((void**)Pt1, sizeof(float)*N);
d_computeP<<<dim3(N,M/1024+1),M>1024?1024:M>>>(P,P1timessp,Pxtimessp,NULL,x,y,prb,ksig,N,M);
for(int n=0; n<N; n++){
thrust::device_ptr<float>dev_ptr(P);
h_sumofP[n] = thrust::reduce(dev_ptr+M*n,dev_ptr+M*(n+1),0.0f,thrust::plus<float>());
}
cudaMemcpy(d_sumofP,h_sumofP,sizeof(float)*N,cudaMemcpyHostToDevice);
d_sumP<<<M/1024+1,M>1024?1024:M>>>(d_sumofP,P1timessp,Pxtimessp,*P1,*Px,N,M);
cudaMemcpy(*Pt1,prb,sizeof(float)*N,cudaMemcpyDeviceToDevice);
cudaFree(P);
cudaFree(P1timessp);
cudaFree(Pxtimessp);
cudaFree(d_sumofP);
delete[]h_sumofP;
}
/*kernels*/
__global__ void d_computeP(
float * P,
float * P1,
float * Px,
float * ProbabilityMatrix,
float * x,
float * y,
float * prb,
float ksig,
const int N,
const int M){
//thread configuration: <<<dim3(N,M/1024+1),1024>>>
int m = threadIdx.x+blockIdx.y*blockDim.x;
int n = blockIdx.x;
if(m>=M || n>=N) return;
float
x1 = x[3*n],
x2 = x[3*n+1],
x3 = x[3*n+2],
diff1 = x1 - y[3*m],
diff2 = x2 - y[3*m+1],
diff3 = x3 - y[3*m+2],
razn = diff1*diff1+diff2*diff2+diff3*diff3,
Pm = __expf(razn/ksig), //fast exponentiation
prbn = prb[n];
P[M*n+m] = Pm;
__syncthreads();
P1[N*m+n] = Pm*prbn;
Px[3*(N*m+n)+0] = x1*Pm*prbn;
Px[3*(N*m+n)+1] = x2*Pm*prbn;
Px[3*(N*m+n)+2] = x3*Pm*prbn;
}
__global__ void d_sumP(
float * sp,
float * P1timessp,
float * Pxtimessp,
float * P1,
float * Px,
const int N,
const int M){
//computes P1 and Px
//thread configuration: <<<M/1024+1,1024>>>
int m = threadIdx.x+blockIdx.x*blockDim.x;
if(m>=M) return;
float
P1m = 0,
Pxm1 = 0,
Pxm2 = 0,
Pxm3 = 0;
for(int n=0; n<N; n++){
float spn = 1/sp[n];
P1m += P1timessp[N*m+n]*spn;
Pxm1 += Pxtimessp[3*(N*m+n)+0]*spn;
Pxm2 += Pxtimessp[3*(N*m+n)+1]*spn;
Pxm3 += Pxtimessp[3*(N*m+n)+2]*spn;
}
P1[m] = P1m;
Px[3*m+0] = Pxm1;
Px[3*m+1] = Pxm2;
Px[3*m+2] = Pxm3;
}
However, to my horror, it runs much, much slower than the original version. How do I make it run faster? Please explain things thoroughly since I am very new to CUDA and parallel programming and have no experience in algorithms.
Do note that the C version has column-major ordering and the CUDA version has row-major. I have done several tests to make sure that the result is correct. It's just extremely slow and takes up a LOT of memory.
Any help is greatly appreciated!
EDIT: More information: N and M are on the order of a few thousand (say, 300-3000) and D is always 3. The CUDA version expects arrays to be device memory, except for variables prefixed with h_.
Before trying any CUDA-specific optimizations, profile your code to see where time is being spent.
Try and arrange your array reads/writes so that each CUDA thread uses a strided access pattern. For example, currently you have
int m = threadIdx.x+blockIdx.y*blockDim.x;
int n = blockIdx.x;
if(m>=M || n>=N) return;
diff1 = x1 - y[3*m],
diff2 = x2 - y[3*m+1],
diff3 = x3 - y[3*m+2],
So thread 1 will read from y[0],y[1],y[2] etc. Instead, rearrange your data so that thread 1 reads from y[0],y[M],y[2*M] and thread 2 reads from y[1],y[M+1],y[2*M+1] etc. You should follow this access pattern for other arrays.
Also, you may want to consider whether you can avoid the use of __syncthreads(). I don't quite follow why it's necessary in this algorithm; it might be worth removing it to see if it improves performance (even if it produces incorrect results).
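To illustrate the suggested access pattern, here is a hedged sketch of just the distance part of d_computeP, assuming x and y are stored column-major (x[d*N + n], y[d*M + m]) as in the original C code, so consecutive threads (consecutive m) read consecutive addresses:
__global__ void d_computeP_coalesced(float *P, const float *x, const float *y,
                                     float ksig, const int N, const int M)
{
    int m = threadIdx.x + blockIdx.y * blockDim.x;
    int n = blockIdx.x;
    if (m >= M || n >= N) return;
    float diff1 = x[n]         - y[m];         // d = 0
    float diff2 = x[N + n]     - y[M + m];     // d = 1
    float diff3 = x[2 * N + n] - y[2 * M + m]; // d = 2
    float razn  = diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
    P[M * n + m] = __expf(razn / ksig);        // consecutive m -> consecutive addresses
}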
The key to good CUDA performance is almost always to make as near to optimal memory access as possible. Your memory access pattern looks very similar to matrix multiplication. I would start with a good CUDA matrix multiplication implementation, being sure to understand why it's implemented the way it is, and then modify that to suit your needs.
