Cuda Matrix multiplication program can't pass, very strange error code - c

I am doing a matrix multiplication using cuda. I think I am about to success, but some very strange error stops me, I can't find out where the code goes wrong. Below is the code sample:
#include <stdio.h>
#include <cuda.h>
#define BLOCK_SIZE 16;
__global__ void matmulKernel(float* mat_in1,float* mat_in2, float* mat_out,int mat_dim);
int main() {
float *h_M, *h_N, *h_P, *d_M, *d_N, *d_P;
int i,width=10;
int size=width*width*sizeof(float);
dim3 block_dim(BLOCK_SIZE,BLOCK_SIZE,1);
int grid_size=width/BLOCK_SIZE;
if(width%BLOCK_SIZE) grid_size++;
dim3 grid_dim (grid_size,grid_size,1);
h_M=(float*)malloc(size);
h_N=(float*)malloc(size);
h_P=(float*)malloc(size);
cudaMalloc((void**)&d_M,size);
cudaMalloc((void**)&d_N,size);
cudaMalloc((void**)&d_P,size);
if(h_M==0||h_N==0||h_P==0||d_M==0||d_N==0||d_P==0) {
printf("memory locate fail!\n");
}
for(i=0;i<width*width;i++) {
h_M[i]=1.2*i;
h_N[i]=1.4*i;
}
cudaMemcpy(d_M,h_M,size,cudaMemcpyHostToDevice);
cudaMemcpy(d_N,h_N,size,cudaMemcpyHostToDevice);
matmulKernel<<<grid_dim,block_dim>>>(d_M,d_N,d_P,width);
cudaMemcpy(h_P,d_P,size,cudaMemcpyDeviceToHost);
printf("firt row of the results matrix P:\n");
for(i=0;i<width;i++) {
printf("%f, %f",h_P[i]);
}
printf("\n");
return 0;
}
__global__ void matmulKernel(float* mat1,float* mat2, float* matP,int dim) {
int thread_x,thread_y,i;
thread_x=blockIdx.x*blockDim.x+threadIdx.x;
thread_y=blockIdx.y*blockDim.y+threadIdx.y;
if(thread_x<dim&&thread_y<dim) {
float P_value=0.;
for(i=0;i<dim;i++) {
P_value+=mat1[thread_y*dim+i]*mat2[i*dim+thread_x];
}
matP[thread_y*dim+thread_x]=P_value;
}
}
Using nvcc compile, the error is :
matmul.cu(11): error: expected a ")"
matmul.cu(11): error: expected an expression
matmul.cu(11): error: expected an expression
matmul.cu(13): error: expected a ")"
I cannot see why the compiler report this error, anyone please tell me where I am wrong.

You have a stray semicolon.
Change:
#define BLOCK_SIZE 16;
to:
#define BLOCK_SIZE 16

Related

warning: implicit declaration of function ‘colcheck’ [-Wimplicit-function-declaration]

I am new to C programming which's why I am confused with its syntax.
Question: A Sudoku puzzle uses a 9 × 9 grid in which each column and row, as well as
each of the nine 3 × 3 subgrids, must contain all of the digits 1 ⋅ ⋅ ⋅ 9. Figure
4.26 presents an example of a valid Sudoku puzzle. This project consists of
designing a multithreaded application that determines whether the solution
to a Sudoku puzzle is valid.
There are several different ways of multithreading this application. The one
suggested strategy is to create threads that check the following criteria:
• A thread to check that each column contains the digits 1 through 9
• A thread to check that each row contains the digits 1 through 9
#include <stdlib.h>
int i,j,m,b,k;
void main()
{
int a[9][9]={1,2,3,4,5,6,7,8,9,
4,5,6,7,8,9,1,2,3,
7,8,9,1,2,3,4,5,6,
2,3,4,5,6,7,8,9,1,
5,6,7,8,9,1,2,3,4,
8,9,1,2,3,4,5,6,7,
3,4,5,6,7,8,9,1,2,
6,7,8,9,1,2,3,4,5,
9,1,2,3,4,5,6,7,8};
if(rowcheck(a)==1 && colcheck(a)==1 & cubecheck(a)==1)
{
printf("Success");
}
else{
printf("Failed");
}
}
int rowcheck(int a[9][9])
{
int c[10]={0};
for(i=0;i<9;i++)
{
for(j=0;j<9;j++)
{
c[a[i][j]]++;
}
for(k=1;k<=9;k++)
if(c[k]!=1)
{
printf("The value %d came %d times in %d row \n",k,c[k],i+1);
return 0;
}
for(k=1;k<=9;k++)
c[k]=0;
}
return 1;
}
int colcheck(int a[9][9])
{
int c[10]={0};
for(i=0;i<9;i++)
{
for(j=0;j<9;j++)
{
c[a[i][j]]++;
}
for(k=1;k<=9;k++)
if(c[k]!=1)
{
printf("The value %d came %d times in %d column \n",k,c[k],i+1);
return 0;
}
for(k=1;k<=9;k++)
c[k]=0;
}
return 1;
}
int cubecheck(int a[9][9])
{
int c[10]={0},count=0;
for(m=0;m<9;m+=3)
{
for(b=0;b<9;b+=3)
{
for(i=m;i<m+3;i++)
{
for(j=b;j<b+3;j++)
{
c[a[i][j]]++;
}
}
count++;
for(k=1;k<=9;k++)
if(c[k]!=1)
{
printf("The value %d came %d times in %d box\n",k,c[k],count);
return 0;
}
for(k=1;k<=9;k++)
c[k]=0;
}
}
return 1;
}```
I am getting this error plz help.
```proj1.c: In function ‘main’:
proj1.c:18:8: warning: implicit declaration of function ‘rowcheck’ [-Wimplicit-function-declaration]
if(rowcheck(a)==1 && colcheck(a)==1 & cubecheck(a)==1)
^~~~~~~~
proj1.c:18:26: warning: implicit declaration of function ‘colcheck’ [-Wimplicit-function-declaration]
if(rowcheck(a)==1 && colcheck(a)==1 & cubecheck(a)==1)
^~~~~~~~
proj1.c:18:43: warning: implicit declaration of function ‘cubecheck’ [-Wimplicit-function-declaration]
if(rowcheck(a)==1 && colcheck(a)==1 & cubecheck(a)==1)
^~~~~~~~~
proj1.c: At top level:
proj1.c:136:1: error: expected identifier or ‘(’ before ‘}’ token
}```
^
You need to give declarations for your functions before main() method since you define them afterwards:. A forward function declaration just tells the compiler that, somewhere in your code, a function with the same name will be defined, in this case after its usage in main().
Usually, in C/C++ you define function prototypes in header files, then include them in your main source code, so the functions can safely be defined after they are called.
#include <stdio.h>
#include <stdlib.h>
// add forward declarations for your functions here
int rowcheck(int [][9]);
int colcheck(int [][9]);
int cubecheck(int [][9]);
int main() {
// your code
return 0:
}
// method definitions afterwards
int rowcheck(int a[9][9]) {
// your definition here
}
int colcheck(int a[9][9]) {
// your definition here
}
int cubecheck(int a[9][9]) {
// your definition here
}
Also, define your main method as int main() rather than void main().

need help to find segmentation fault error

can someone help me to find what cause segmentation fault in my program, I used the gdb but I cannot find which line that cause the error.
Array* Merge(Array *arr1, Array *arr2)
{
int i,j,k;
i=j=k=0;
Array *arr3 = (Array*)malloc(sizeof(Array));
arr3->size = arr1->size + arr2->size;
arr3->length = arr1->length + arr2->length;
while(i<arr1->length && j<arr2->length)
{
if(arr1->A[i] < arr2->A[j])
arr3->A[k++]=arr1->A[i++];
else
arr3->A[k++]=arr2->A[j++];
}
for(;i<arr1->length;i++)
arr3->A[k++]=arr1->A[i];
for(;j<arr2->length;j++)
arr3->A[k++]=arr2->A[j];
return arr3;
}
#include <stdio.h>
#include <stdlib.h>
#include "array1.h"
int main()
{
Array arr,arr1,*arr2;
arr.size=10;
arr.length=5;
arr1.size=10;
arr1.length=5;
arr.A=(int*)malloc(arr.size*sizeof(int));
arr1.A=(int*)malloc(arr.size*sizeof(int));
printf("\n enter elements of arr\n");
for (int i=0;i<arr.length;i++)
scanf("%d",&arr.A[i]);
/**********************************************/
printf("\n enter elements of arr1\n");
for (int i=0;i<arr1.length;i++)
scanf("%d",&arr1.A[i]);
arr2=Merge(&arr, &arr1);
display(*arr2);
return 0;
}
here is the result of the gdb
enter image description here
In Merge you don't allocate any space for arr3's data (probably a member called A looking at the other code)
It would help to see the rest of the code (especially the declaration of Array) but I suspect arr3->A is an uninitialised pointer.

"Invalid Operands to Binary" error when trying to calculate square of elements in array

this code is supposed to read a text file that contains integers, which then finds the squares of these numbers after putting them into an array.
After this its supposed to print the results onto a new text file "result.txt", but I keep getting the error "invalid operands to binary*(have 'int*' and 'int*')" on this line: square[x] = square[x] * square[x];
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
int main(){
int square[100][2];
int x;
int i;
FILE* input=fopen("in.csv","r");
FILE * f=fopen("result.txt","wb");
system("cls");
for(x=0;x<100;x++)
{
fscanf(input,"%d",&square[x][0]);
}
fclose(input);
for(x=0;x<100;x++)
{
square[x] = square[x] * square[x]; //this line produces the error
}
for(i=0;i<100;i++)
{
fprintf(f,"%d -> %d || ",square[i][0],square[i][1]);
}
fclose(f);
getchar();
}
I am using Eclipse IDE and MinGW-w64
I have tried finding solutions online but am stuck, any help or replies would be appreciated, thanks!
square[x] yields an array with two elements. You should reference the [0] and [1] elements like you do in the rest of your code.
square[x][1] = square[x][0] * square[x][0];

CaptureStackBackTrace inconsistencies using FramesToSkip

On windows you can capturing the stack trace using CaptureStackBackTrace as
void* frames[USHRT_MAX];
USHORT framesCount = CaptureStackBackTrace(0, USHRT_MAX, frames, NULL);
However, capturing it by smaller chunks in a loop to avoid allocating a USHRT_MAX buffer doesn't provide the same result.
This code
#include <Windows.h>
#include <assert.h>
#include <stdio.h>
__declspec(noinline) void CheckStack(void)
{
printf("Checking stack...\n");
void* entireStack[USHRT_MAX];
USHORT frameCount = CaptureStackBackTrace(0, USHRT_MAX, entireStack, NULL);
printf("Stack size is: %u\n", frameCount);
ULONG frameOffset = 1;
for (;;)
{
void* chunk[64];
USHORT framesFound = CaptureStackBackTrace(frameOffset, 64, chunk, NULL);
if (framesFound)
{
if (memcmp(entireStack + frameOffset, chunk, sizeof(chunk)) != 0)
{
printf("Incorrect content\n");
}
frameOffset += (ULONG)framesFound;
}
else
{
break;
}
}
if (frameCount != frameOffset)
{
printf("Incorrect count (%u != %u)\n", frameCount, frameOffset);
}
printf("Done\n");
}
__declspec(noinline) void Test(int i)
{
if (i != 500)
Test(++i);
else
CheckStack();
}
int main()
{
Test(0);
}
produces the following output
Checking stack...
Stack size is: 507
Incorrect count (507 != 257)
Done
when building as cl /Od main.c /link /OUT:main.exe.
Am I using the FramesToSkip parameter incorrectly or why are the counts not equal?
If you are using Windows Server 2003 and Windows XP,
The sum of the FramesToSkip and FramesToCapture parameters must be
less than 63.
That's in document.
Else, as #RbMm says, In the API source code, there is the following logic:
if(FramesToSkip>0xfe)
{
return 0; //There are too many stack structures skipped, returning directly to 0.
}
However, this is not metioned on msdn both in the CaptureStackBackTrace and RtlCaptureStackBackTrace.
I am not going to post the source code here, but prove it in debugging:
1.Create a sample:
#include <Windows.h>
#include <assert.h>
#include <stdio.h>
__declspec(noinline) void CheckStack(void)
{
void* entireStack[USHRT_MAX];
USHORT frameCount = CaptureStackBackTrace(255, USHRT_MAX, entireStack, NULL);
}
__declspec(noinline) void Test(int i)
{
if (i != 500)
Test(++i);
else
CheckStack();
}
int main()
{
Test(0);
}
2. Step into CaptureStackBackTrace in Disassembly:
You can see that dword ptr[ebp+8](the first parameter of CaptureStackBackTrace pushed in stack) will be compared with 0feh(254). If true, return 0.

CUDA local array initalization modifies program output

I have a program which (for now) calculates values of two functions in random points on GPU , sends these values back to host, and then visualizes them. This is what I get, some nice semi-random points:
Now, if I modify my kernel code, and add the local array initalization code at the very end,
__global__ void optymalize(curandState * state, float* testPoints)
{
int ind=blockDim.x*blockIdx.x+threadIdx.x;
int step=blockDim.x*gridDim.x;
for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
{
float* x=generateX(state);
testPoints[i]=ZDT_f1(x);
testPoints[i+1]=ZDT_f2(x);
}
//works fine with 'new'
//float* test_array=new float[2];
float test_array[2]={1.0f,2.0f};
}
I get something like this everytime:
Does anyone know the cause of this behavior? All the drawn points are computed BEFORE test_array is initialized, yet they are affected by it. It doesn't happen when I initialize test_array before the 'for' loop.
Host/device code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand_kernel.h"
#include "device_functions.h"
#include <random>
#include <iostream>
#include <time.h>
#include <fstream>
using namespace std;
#define XSIZE 5
#define TEST_POINTS 100
#define NOF 2
#define BLOCK_COUNT 64
#define THR_COUNT 128
#define POINTS_PER_THREAD (NOF*TEST_POINTS+THR_COUNT*BLOCK_COUNT-1)/(THR_COUNT*BLOCK_COUNT)
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=false)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__device__ float g(float* x)
{
float tmp=1;
for(int i=1;i<XSIZE;i++)
tmp*=x[i];
return 1+9*(tmp/(XSIZE-1));
}
__device__ float ZDT_f1(float* x)
{
return x[0];
}
__device__ float ZDT_f2(float* x)
{
float gp=g(x);
return gp*(1-sqrtf(x[0]/gp));
}
__device__ bool oneDominatesTwo(float* x1, float* x2)
{
for(int i=0;i<XSIZE;i++)
if(x1[i]>=x2[i])
return false;
return true;
}
__device__ float* generateX(curandState* globalState)
{
int ind = threadIdx.x;
float x[XSIZE];
for(int i=0;i<XSIZE;i++)
x[i]=curand_uniform(&globalState[ind]);
return x;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = blockDim.x*blockIdx.x+threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void optymalize(curandState * state, float* testPoints)
{
int ind=blockDim.x*blockIdx.x+threadIdx.x;
int step=blockDim.x*gridDim.x;
for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
{
float* x=generateX(state);
testPoints[i]=ZDT_f1(x);
testPoints[i+1]=ZDT_f2(x);
}
__syncthreads();
//float* test_array=new float[2];
//test_array[0]=1.0f;
//test_array[1]=1.0f;
float test_array[2]={1.0f,1.0f};
}
void saveResultToFile(float* result)
{
ofstream resultFile;
resultFile.open ("result.txt");
for(unsigned int i=0;i<NOF*TEST_POINTS;i+=NOF)
{
resultFile << result[i] << " "<<result[i+1]<<"\n";
}
resultFile.close();
}
int main()
{
float* dev_fPoints;
float* fPoints=new float[NOF*TEST_POINTS];
gpuErrchk(cudaMalloc((void**)&dev_fPoints, NOF * TEST_POINTS * sizeof(float)));
curandState* devStates;
gpuErrchk(cudaMalloc(&devStates,THR_COUNT*sizeof(curandState)));
cudaEvent_t start;
gpuErrchk(cudaEventCreate(&start));
cudaEvent_t stop;
gpuErrchk(cudaEventCreate(&stop));
gpuErrchk(cudaThreadSetLimit(cudaLimitMallocHeapSize, 128*1024*1024));
gpuErrchk(cudaEventRecord(start, NULL));
setup_kernel<<<BLOCK_COUNT, THR_COUNT>>>(devStates,unsigned(time(NULL)));
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaGetLastError());
optymalize<<<BLOCK_COUNT,THR_COUNT>>>(devStates, dev_fPoints);
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaMemcpy(fPoints, dev_fPoints, NOF * TEST_POINTS * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaEventRecord(stop, NULL));
gpuErrchk(cudaEventSynchronize(stop));
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop);
cout<<"Kernel execution time: "<<msecTotal<< "ms"<<endl;
saveResultToFile(fPoints);
system("start pythonw plot_data.py result.txt");
cudaFree(dev_fPoints);
cudaFree(devStates);
system("pause");
return 0;
}
Plot script code:
import matplotlib.pyplot as plt;
import sys;
if len(sys.argv)<2:
print("Usage: python PlotScript <filename>");
sys.exit(0);
path=sys.argv[1];
x=[]
y=[]
with open(path,"r") as f:
for line in f:
vals=line.strip().split(" ");
x.append(vals[0]);
y.append(vals[1]);
plt.plot(x,y,'ro')
plt.show();
The basic problem was in code you originally didn't show in your question, specifically this:
__device__ float* generateX(curandState* globalState)
{
int ind = threadIdx.x;
float x[XSIZE];
for(int i=0;i<XSIZE;i++)
x[i]=curand_uniform(&globalState[ind]);
return x;
}
Returning an address or reference to a local scope variable from a function results in undefined behaviour. It is only valid to use x by reference or value within generateX while it is in scope. There should be no surprise that adding or moving other local scope variables around within the kernel changes the kernel behaviour.
Fix this function so it populates an array passed by reference, rather than returning the address of a local scope array. And pay attention to compiler warnings - there will have been one for this which should have immediately set off alarm bells that there was something wrong.

Resources