I'm currently developing C code with MPI for matrix multiplication. Functions such as mult and multadd are already implemented in another file and work well.
My file pblas.c compiles, but it crashes at runtime.
I run the project on a university server that has MPI installed.
Where am I wrong in my pblas code?
/**********************************************************************
This file is just a pattern for pblas parallel multiplication
There are comments beginning with TO ADD that tell what must be done
where they are placed. Thus, just add the correct lines of code and
everything will work fine !
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "commfct.h"
#include "toolsfct.h"
void usage() {
fprintf(stderr,"usage : pblas bloc_size\n\t bloc_size : gives the size of blocs owned by each processor.\n");
exit(1);
}
int main(int argc, char **argv) {
int me,nbProc;
int ligMe,colMe;
int blockSize;
int i,j;
double t;
if (argc != 2) {
usage();
}
blockSize = atoi(argv[1]);
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
MPI_Comm_size(MPI_COMM_WORLD, &nbProc);
int P = (int)sqrt(nbProc); // P = the number of rows of proc.
int Q = P; // Q = the number of columns of proc.
if ((P*Q) != nbProc) {
if (me == 0) {
fprintf(stderr,"!!! CRITICAL ERROR : number of processors must be 4, 9, 16, ...\nAborting\n");
}
exit(1);
}
createGridComm(me,P,Q);
ligMe = me / Q;
colMe = me % Q;
// allocate memory for matrices
double *A,*Btmp, *B,*C,*CC;
A = (double *)malloc(blockSize*blockSize*sizeof(double));
B = (double *)malloc(blockSize*blockSize*sizeof(double));
Btmp = (double *)malloc(blockSize*blockSize*sizeof(double));
C = (double *)malloc(blockSize*blockSize*sizeof(double));
CC = (double *)malloc(blockSize*blockSize*sizeof(double));
/* fill blocks with pseudo values
NOTE : these values should not be changed so that
the check below is valid
*/
for(i=0;i<blockSize*blockSize;i++) {
A[i] = 2.0+(double)me;
B[i] = 1.0+(double)colMe;
C[i] = (double)me / 10.0;
}
/* CAUTION : in the following, A,B C are supposed to be stored
column after column, with each column of size blockSize.
Thus A(0,0) and A(1,0) are contiguous in memory, but
A(0,0) and A(0,1) are separated by blockSize cells.
*/
t = dclock(CLOCK_S);
MPI_Status status;
//main Loop
for(i=0;i<P;i++) {
/*************************************
Steps 1 and 2: transpose column i (at step i) of the B blocks and store it in Btmp.
**************************************/
if(colMe==i){
if(ligMe==colMe) {
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,ligMe,commCol);
multadd(A,B,C,blockSize);
}
else {
int dest = colMe * Q + ligMe;
MPI_Send(B,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE, MPI_COMM_WORLD);
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,dest%Q,commCol);
mult(A,Btmp,CC,blockSize);
}
}
else {
int dest = colMe*Q + ligMe;
if(dest%Q == i) {
MPI_Recv(Btmp,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE,MPI_COMM_WORLD,&status);
// Broadcast on the column
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,colMe,commCol);
multadd(A,Btmp,C,blockSize);
}
else {
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
mult(A,Btmp,CC,blockSize);
}
}
if(colMe == i)
MPI_Reduce(MPI_IN_PLACE, C, blockSize*blockSize, MPI_DOUBLE, MPI_SUM, colMe, commLine);
else
MPI_Reduce(CC,NULL,blockSize*blockSize,MPI_DOUBLE,MPI_SUM,i,commLine);
}
t = dclock(CLOCK_S) -t;
printf("timing for %d : %f sec\n",me,t);
// checking for result correctness
int correct = 1;
double sum = 0.0;
for(i=0;i<P;i++) {
sum += 2.0+(ligMe*Q)+(double)i;
}
for(i=0;i<blockSize;i++) {
for(j=0;j<blockSize;j++) {
if (C[i+j*blockSize] != ((double)me/10.0 + sum*blockSize*(colMe+1.0))) {
correct = 0;
}
}
}
if (correct != 1) {
printf("multiplication result is not correct\n");
}
// free memory
free(A);
free(B);
free(C);
free(CC);
releaseGridComm();
MPI_Finalize();
return 0;
}
The trouble may come from the ranks used in MPI_Send(), MPI_Recv() or MPI_Bcast(). For instance, in MPI_Send() the destination rank must call a matching MPI_Recv(), and in MPI_Recv() the source rank must have called MPI_Send(). Otherwise you get a deadlock: the code keeps running, but it waits forever for messages.
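To make the pairing explicit, here is a minimal sketch using the names from your code (ranks 0 and 1 are only an example):
/* every MPI_Send needs a matching MPI_Recv on the destination rank */
if (me == 0) {
    MPI_Send(B, blockSize*blockSize, MPI_DOUBLE, 1 /* dest */, TAG_TRANSPOSE, MPI_COMM_WORLD);
} else if (me == 1) {
    MPI_Status status;
    MPI_Recv(Btmp, blockSize*blockSize, MPI_DOUBLE, 0 /* source */, TAG_TRANSPOSE, MPI_COMM_WORLD, &status);
}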
From what you gave us, I guess you have two matrices, A and B.
A0 | A1
...........
A2 | A3
To compute the first column of C, you need :
A0xB0 | A1xB2
...................
A2xB0 | A3xB2
I changed your code so that:
for each column i
    the i-th column of B is transposed into the i-th line of Btmp
    the i-th line of Btmp is broadcast to Btmp within each column.
Since the issue was about MPI, I am replying with MPI code that looks very close to yours; it works, but it does nothing except the communication operations...
/**********************************************************************
This file is just a pattern for pblas parallel multiplication
There are comments beginning with TO ADD that tell what must be done
where they are placed. Thus, just add the correct lines of code and
everything will work fine !
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "mpi.h"
//#include "commfct.h"
//#include "toolsfct.h"
#define TAG_TRANSPOSE 42
void usage() {
fprintf(stderr,"usage : pblas bloc_size\n\t bloc_size : gives the size of blocs owned by each processor.\n");
exit(1);
}
int main(int argc, char **argv) {
int me,nbProc;
int ligMe,colMe;
int blockSize;
int i,j;
double t;
if (argc != 2) {
usage();
}
blockSize = atoi(argv[1]);
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
MPI_Comm_size(MPI_COMM_WORLD, &nbProc);
int P = (int)sqrt(nbProc); // P = the number of rows of proc.
int Q = P; // Q = the number of columns of proc.
if ((P*Q) != nbProc) {
if (me == 0) {
fprintf(stderr,"!!! CRITICAL ERROR : number of processors must be 4, 9, 16, ...\nAborting\n");
}
exit(1);
}
//createGridComm(me,P,Q);
colMe = me / Q;
ligMe = me % Q;
MPI_Comm commCol, commLine;
//comes from http://static.msi.umn.edu/tutorial/scicomp/general/MPI/communicator.html
/* Split comm into row and column comms */
MPI_Comm_split(MPI_COMM_WORLD, ligMe, colMe, &commLine);
/* color by row, rank by column */
MPI_Comm_split(MPI_COMM_WORLD, colMe, ligMe, &commCol);
/* color by column, rank by row */
printf("[%d]:My coordinates are i j (%d,%d)\n",me,ligMe,colMe);
// allocate memory for matrices
double *A,*Btmp, *B,*C,*CC;
A = (double *)malloc(blockSize*blockSize*sizeof(double));
B = (double *)malloc(blockSize*blockSize*sizeof(double));
Btmp = (double *)malloc(blockSize*blockSize*sizeof(double));
C = (double *)malloc(blockSize*blockSize*sizeof(double));
CC = (double *)malloc(blockSize*blockSize*sizeof(double));
/* fill blocks with pseudo values
NOTE : these values should not be changed so that
the check below is valid
*/
for(i=0;i<blockSize*blockSize;i++) {
A[i] = 2.0+(double)me;
B[i] = 1.0+(double)colMe;
C[i] = (double)me / 10.0;
}
/* CAUTION : in the following, A,B C are supposed to be stored
column after column, with each column of size blockSize.
Thus A(0,0) and A(1,0) are contiguous in memory, but
A(0,0) and A(0,1) are separated by blockSize cells.
*/
// t = dclock(CLOCK_S);
MPI_Status status;
//main Loop
for(i=0;i<Q;i++) {
/*************************************
Steps 1 and 2: transpose column i (at step i) of the B blocks and store it in Btmp.
**************************************/
if(colMe==i){
if(ligMe==colMe) {
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// multadd(A,B,C,blockSize);
}
else {
int dest = ligMe * Q + i;//transpose !
MPI_Send(B,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE, MPI_COMM_WORLD);
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// mult(A,Btmp,CC,blockSize);
}
}
else {
int from = i*Q + colMe;// transpose !
if(ligMe == i) {
MPI_Recv(Btmp,blockSize*blockSize,MPI_DOUBLE,from,TAG_TRANSPOSE,MPI_COMM_WORLD,&status);
// Broadcast on the column
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// multadd(A,Btmp,C,blockSize);
}
else {
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// mult(A,Btmp,CC,blockSize);
}
}
if(colMe == i)
MPI_Reduce(MPI_IN_PLACE, C, blockSize*blockSize, MPI_DOUBLE, MPI_SUM, colMe, commLine);
else
MPI_Reduce(CC,NULL,blockSize*blockSize,MPI_DOUBLE,MPI_SUM,i,commLine);
}
//t = dclock(CLOCK_S) -t;
printf("timing for %d : %f sec\n",me,t);
// checking for result correctness
int correct = 1;
double sum = 0.0;
for(i=0;i<P;i++) {
sum += 2.0+(ligMe*Q)+(double)i;
}
for(i=0;i<blockSize;i++) {
for(j=0;j<blockSize;j++) {
if (C[i+j*blockSize] <0.99999*((double)me/10.0 + sum*blockSize*(colMe+1.0)) || C[i+j*blockSize] >1.00001*((double)me/10.0 + sum*blockSize*(colMe+1.0)) ) {
correct = 0;
}
}
}
if (correct != 1) {
printf("multiplication result is not correct\n");
}
// free memory
free(A);
free(B);
free(C);
free(CC);
//releaseGridComm();
MPI_Finalize();
return 0;
}
Watch out for createGridComm(me,P,Q); I had to find something equivalent at http://static.msi.umn.edu/tutorial/scicomp/general/MPI/communicator.html
I also changed the test at the end of your code. Testing double-precision numbers for exact equality is too strict; a tolerance-based comparison is better for floating-point numbers!
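For example, the 0.99999/1.00001 factors in the check above are just a relative tolerance; a small helper (the name nearly_equal is mine, this is only a sketch) makes that explicit:
#include <math.h>

/* returns 1 when a and b agree to within the given relative tolerance */
static int nearly_equal(double a, double b, double rel_tol)
{
    return fabs(a - b) <= rel_tol * fmax(fabs(a), fabs(b));
}

/* e.g. : if (!nearly_equal(C[i+j*blockSize], (double)me/10.0 + sum*blockSize*(colMe+1.0), 1e-5)) correct = 0; */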
I hope this will help !
Bye,
Francis
Related
I'm comparing two phase-shifted wave signals with the same frequency; my first task is to measure the phase-shift angle between the two signals in C.
Actually I'm from the IC field and don't have much idea about programming languages, but I'm currently learning C for this.
I wrote code for the correlation coefficient to find the time delay, but it shows an error (see the attached screenshot).
#include <stdio.h>
#include <stdbool.h>
#include <math.h>
int main()
{
bool True = 1; //for sqr wave conversion
bool False = 0 ;
int n[10],m[10] ; // n is an array of 10 integers (as of now assuming some value)
int i1, i2,x,y ;
int r,xx[10],xy[10],yy[10],nr=0,dr_1=0,dr_2=0,dr_3=0,dr=0;
int sum_y=0,sum_yy=0,sum_xy=0,sum_x=0,sum_xx=0;
int i, h=10;
/* initialize elements1 of array n to 0 */
for (i1=0;i1<10;i1++)
{
n[i1] = i1+1; // set an element at location i to 10
}
/* initialize elements2 of array n to 0 */
for (i2=0; i2<10; i2++)
{
m[i2] = i2+5;
}
//output of each array element
for (x=0 ;x<10; x++)
{
if (x%2 == 0) //sqr wave coversion
{
True = 1 ;
}
else
{
False = 0 ;
}
printf("element1 [%d] = %d\n" , x, n[x]);
}
for (y=0 ; y<10; y++)
{
if (y%2 == 0) //sqr wave
{
False = 0;
}
else
{
True = 1;
}
printf("element2 [%d] = %d\n" , y, m[y]);
}
for(i=0; i<h; i++)
{
xx[i]= x[i]*x[i];
yy[i]= y[i]*y[i];
}
for(i=0;i<h;i++)
{
sum_x+=x[i];
sum_y+=y[i];
sum_xx+= xx[i];
sum_yy+=yy[i];
sum_xy+= x[i]*y[i];
}
hr=(h*sum_xy)-(sum_x*sum_y);
int sum_x2=sum_x*sum_x;
int sum_y2=sum_y*sum_y;
dr_1=(h*sum_xx)-sum_x2;
dr_2=(h*sum_yy)-sum_y2;
dr_3=dr_1*dr_2;
dr=sqrt(dr_3);
r=(hr/dr);
printf("Total Numbers:%d\nCorrelation Coefficient:%.2f",h,r);
return 0;
}
The code is still incomplete for finding the phase angle, but as of now I'm facing this error. [added snapshot]
PS: I'd already done this task on a hardware platform.
Sorry, my bad: here I replaced x and y with n and m, which are the arrays.
I sorted out this error :)
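For reference, the corrected accumulation loops look roughly like this (names follow my original post; hr, dr and r are declared as double here so the %.2f format is valid):
/* index the arrays n and m, not the scalar loop counters x and y */
for (i = 0; i < h; i++) {
    xx[i] = n[i] * n[i];
    yy[i] = m[i] * m[i];
}
for (i = 0; i < h; i++) {
    sum_x  += n[i];
    sum_y  += m[i];
    sum_xx += xx[i];
    sum_yy += yy[i];
    sum_xy += n[i] * m[i];
}
double hr = (double)(h * sum_xy) - (double)sum_x * sum_y;
double dr = sqrt(((double)(h * sum_xx) - (double)sum_x * sum_x) *
                 ((double)(h * sum_yy) - (double)sum_y * sum_y));
double r  = hr / dr;
printf("Total Numbers: %d\nCorrelation Coefficient: %.2f\n", h, r);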
My code is a parallel implementation that calculates the nth digit of pi. When the kernel finishes and I try to copy the memory back to the host, I get a "the launch timed out and was terminated" error.
I used this code for error checking after each cudaMalloc, cudaMemcpy, and kernel launch:
const char *error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error);
These calls were saying everything was fine until the first cudaMemcpy call after returning from the kernel. The error happens at the line "cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);" in main. Any help is appreciated.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define mul_mod(a,b,m) fmod( (double) a * (double) b, m)
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the inverse of x mod y */
__device__ int inv_mod(int x,int y) {
int q,u,v,a,c,t;
u=x;
v=y;
c=1;
a=0;
do {
q=v/u;
t=c;
c=a-q*c;
a=t;
t=u;
u=v-q*u;
v=t;
} while (u!=0);
a=a%y;
if (a<0) a=y+a;
return a;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the inverse of u mod v, if v is odd */
__device__ int inv_mod2(int u,int v) {
int u1,u3,v1,v3,t1,t3;
u1=1;
u3=u;
v1=v;
v3=v;
if ((u&1)!=0) {
t1=0;
t3=-v;
goto Y4;
} else {
t1=1;
t3=u;
}
do {
do {
if ((t1&1)==0) {
t1=t1>>1;
t3=t3>>1;
} else {
t1=(t1+v)>>1;
t3=t3>>1;
}
Y4:;
} while ((t3&1)==0);
if (t3>=0) {
u1=t1;
u3=t3;
} else {
v1=v-t1;
v3=-t3;
}
t1=u1-v1;
t3=u3-v3;
if (t1<0) {
t1=t1+v;
}
} while (t3 != 0);
return u1;
}
/* return (a^b) mod m */
__device__ int pow_mod(int a,int b,int m)
{
int r,aa;
r=1;
aa=a;
while (1) {
if (b&1) r=mul_mod(r,aa,m);
b=b>>1;
if (b == 0) break;
aa=mul_mod(aa,aa,m);
}
return r;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return true if n is prime */
int is_prime(int n)
{
int r,i;
if ((n % 2) == 0) return 0;
r=(int)(sqrtf(n));
for(i=3;i<=r;i+=2) if ((n % i) == 0) return 0;
return 1;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
/* return the prime number immediatly after n */
int next_prime(int n)
{
do {
n++;
} while (!is_prime(n));
return n;
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
#define DIVN(t,a,v,vinc,kq,kqinc) \
{ \
kq+=kqinc; \
if (kq >= a) { \
do { kq-=a; } while (kq>=a); \
if (kq == 0) { \
do { \
t=t/a; \
v+=vinc; \
} while ((t % a) == 0); \
} \
} \
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
__global__ void digi_calc(int *s, int *av, int *primes, int N, int n, int nthreads){
int a,vmax,num,den,k,kq1,kq2,kq3,kq4,t,v,i,t1, h;
unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
// GIANT LOOP
for (h = 0; h<1; h++){
if(tid >= nthreads) continue; /* threads beyond the last prime do nothing */
a = primes[tid];
vmax=(int)(logf(3*N)/logf(a));
if (a==2) {
vmax=vmax+(N-n);
if (vmax<=0) continue;
}
av[tid]=1;
for(i=0;i<vmax;i++) av[tid]*= a;
s[tid]=0;
den=1;
kq1=0;
kq2=-1;
kq3=-3;
kq4=-2;
if (a==2) {
num=1;
v=-n;
} else {
num=pow_mod(2,n,av[tid]);
v=0;
}
for(k=1;k<=N;k++) {
t=2*k;
DIVN(t,a,v,-1,kq1,2);
num=mul_mod(num,t,av[tid]);
t=2*k-1;
DIVN(t,a,v,-1,kq2,2);
num=mul_mod(num,t,av[tid]);
t=3*(3*k-1);
DIVN(t,a,v,1,kq3,9);
den=mul_mod(den,t,av[tid]);
t=(3*k-2);
DIVN(t,a,v,1,kq4,3);
if (a!=2) t=t*2; else v++;
den=mul_mod(den,t,av[tid]);
if (v > 0) {
if (a!=2) t=inv_mod2(den,av[tid]);
else t=inv_mod(den,av[tid]);
t=mul_mod(t,num,av[tid]);
for(i=v;i<vmax;i++) t=mul_mod(t,a,av[tid]);
t1=(25*k-3);
t=mul_mod(t,t1,av[tid]);
s[tid]+=t;
if (s[tid]>=av[tid]) s[tid]-=av[tid]; /* keep the partial sum reduced modulo av[tid] */
}
}
t=pow_mod(5,n-1,av[tid]);
s[tid]=mul_mod(s[tid],t,av[tid]);
}
__syncthreads();
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
int main(int argc,char *argv[])
{
int N,n,i,totalp, h;
double sum;
const char *error;
int *sdev, *avdev, *shost, *avhost, *adev, *ahost;
argc = 2;
argv[1] = "2";
if (argc<2 || (n=atoi(argv[1])) <= 0) {
printf("This program computes the n'th decimal digit of pi\n"
"usage: pi n , where n is the digit you want\n"
);
exit(1);
}
sum = 0;
N=(int)((n+20)*logf(10)/logf(13.5));
totalp=(N/logf(N))+10;
ahost = (int *)calloc(totalp, sizeof(int));
i = 0;
ahost[0]=2;
for(i=1; ahost[i-1]<=(3*N); ahost[i+1]=next_prime(ahost[i])){
i++;
}
// allocate host memory
size_t size = i*sizeof(int);
shost = (int *)malloc(size);
avhost = (int *)malloc(size);
//allocate memory on device
cudaMalloc((void **) &sdev, size);
cudaMalloc((void **) &avdev, size);
cudaMalloc((void **) &adev, size);
cudaMemcpy(adev, ahost, size, cudaMemcpyHostToDevice);
if (i >= 512){
h = 512;
}
else h = i;
dim3 dimGrid(((i+512)/512),1,1);
dim3 dimBlock(h,1,1);
// launch kernel
digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i);
//copy memory back to host
cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);
cudaMemcpy(shost, sdev, size, cudaMemcpyDeviceToHost);
// end malloc's, memcpy's, kernel calls
for(h = 0; h < i; h++){ /* i terms were computed */
sum=fmod(sum+(double) shost[h]/ (double) avhost[h],1.0);
}
printf("Decimal digits of pi at position %d: %09d\n",n,(int)(sum*1e9));
//free memory
cudaFree(sdev);
cudaFree(avdev);
cudaFree(adev);
free(shost);
free(avhost);
free(ahost);
return 0;
}
This is exactly the same problem you asked about in this question. The kernel is getting terminated early by the driver because it is taking too long to finish. If you read the documentation for any of these runtime API functions you will see the following note:
Note:
Note that this function may also return error codes from previous,
asynchronous launches.
All that is happening is that the first API call after the kernel launch is returning the error incurred while the kernel was running - in this case the cudaMemcpy call. The way you can confirm this for yourself is to do something like this directly after the kernel launch:
// launch kernel
digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i);
const char *error = cudaGetErrorString(cudaPeekAtLastError());
printf("%s\n", error);
error = cudaGetErrorString(cudaThreadSynchronize());
printf("%s\n", error);
The cudaPeekAtLastError() call will show you if there are any errors in the kernel launch, and the error code returned by the cudaThreadSynchronize() call will show whether any errors were generated while the kernel was executing.
The solution is exactly as outlined in the previous question: probably the simplest way is to redesign the code so it is "re-entrant", splitting the work over several kernel launches, with each launch safely under the display driver's watchdog timer limit.
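For example, something along these lines (a sketch only: the chunk size is made up, and the offset pointers are my way of letting the unmodified kernel index its slice from 0):
const int chunk = 4096;                  /* hypothetical chunk size, tune for your GPU */
for (int first = 0; first < i; first += chunk) {
    int thisCount = (i - first < chunk) ? (i - first) : chunk;
    dim3 block(256);
    dim3 grid((thisCount + block.x - 1) / block.x);
    /* each launch works only on primes [first, first+thisCount), so it stays short */
    digi_calc <<<grid, block>>> (sdev + first, avdev + first, adev + first, N, n, thisCount);
    cudaDeviceSynchronize();             /* wait here so no single launch hits the watchdog */
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
}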
CUDA kernel launches are asynchronous, so you can queue a loop of kernel calls and the launches themselves appear to take almost no time. Then, when you call cudaMemcpy, all of the queued work actually has to finish, and that is when the timeout shows up. The way to go is to call cudaThreadSynchronize (cudaDeviceSynchronize in current CUDA releases) between iterations.
So remember: if a kernel launch seems to take only nanoseconds, it doesn't mean the kernel is that fast - its writes to global memory are only guaranteed to have completed once cudaMemcpy or a synchronize call has returned.
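That is also why naive host-side timing around a launch measures almost nothing. A generic sketch of timing a kernel with CUDA events (the event variables are mine; digi_calc and its arguments are just reused from the code above):
cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
digi_calc <<<dimGrid, dimBlock>>> (sdev, avdev, adev, N, n, i);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);              /* block until the kernel has really finished */
cudaEventElapsedTime(&ms, start, stop);
printf("kernel took %.3f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);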
I am getting an error when running the program below, which I parallelized using MPI_Scatter and MPI_Reduce. When a value does not belong to any bin, the program exits, and this now happens every time: whenever I run the program, it exits with the error "Data does not belong to the bin". I am not sure whether MPI_Scatter or MPI_Reduce is in the wrong place. Please help me figure out what is wrong with this C code. I am only including the main routine here.
int main(int argc, char* argv[]) {
int bin_count = 5;
int i, bin;
float min_meas = 1.00;
float max_meas = sizeof(float);
float* bin_maxes;
int* bin_counts;
int* bin_result;
int data_count = 100;
float* data;
int size = 1;
int rank = 0;
MPI_Init(&argc, &argv);
// note that argc and argv are passed by address
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
MPI_Comm_size(MPI_COMM_WORLD,&size);
/* Allocate arrays needed */
bin_maxes = malloc(bin_count*sizeof(float));
bin_counts = malloc(bin_count*sizeof(int));
bin_result = malloc(bin_count*sizeof(int));
data = malloc(data_count*sizeof(float));
int count = (int)data_count/size;
if (rank == 0){
/* Generate the random data */
Gen_data(min_meas, max_meas, data, data_count);
}
MPI_Scatter(data,count,MPI_FLOAT,data,data_count,MPI_FLOAT,0,MPI_COMM_WORLD);
/* Create bins for storing counts */
Gen_bins(min_meas, max_meas, bin_maxes, bin_counts, bin_count);
/* Count number of values in each bin */
for (i = 0; i < count; i++) {
bin = Which_bin(data[i], bin_maxes, bin_count, min_meas); //decide data is belong to which bin
bin_counts[bin]++; //increments bin
}
MPI_Reduce(bin_counts,bin_result,bin_count,MPI_INT,MPI_SUM,0,MPI_COMM_WORLD);
if(rank == 0) {
/* Print the histogram */
Print_histo(bin_maxes, bin_result, bin_count, min_meas); // prints the result
}
free(data);
free(bin_maxes);
free(bin_counts);
free(bin_result);
MPI_Finalize();
return 0;
}
I am working on an application which divides a string into pieces and assigns each piece to a block. Within each block the text is scanned character by character, and a shared array of int, D, is updated by different threads in parallel based on the character read. At the end of each iteration the last element of D is checked, and if it satisfies the condition, a global int array m is set to 1 at the position corresponding to the text. This code was executed on an NVIDIA GeForce Fermi 550, and it runs even slower than the CPU version. I have just included the kernel here:
__global__ void match(uint32_t* BB_d,const char* text_d,int n, int m,int k,int J,int lc,int start_addr,int tBlockSize,int overlap ,int* matched){
__shared__ int D[MAX_THREADS+2];
__shared__ char Text_S[MAX_PATTERN_SIZE];
__shared__ int DNew[MAX_THREADS+2];
__shared__ int BB_S[4][MAX_THREADS];
int w=threadIdx.x+1;
for(int i=0;i<4;i++)
{
BB_S[i][threadIdx.x]= BB_d[i*J+threadIdx.x];
}
{
D[threadIdx.x] = 0;
{
D[w] = (1<<(k+1)) -1;
for(int i = 0; i < lc - 1; i++)
{
D[w] = (D[w] << k+2) + (1<<(k+1)) -1;
}
}
D[J+1] = (1<<((k+2)*lc)) - 1;
}
int startblock=(blockIdx.x == 0?start_addr:(start_addr+(blockIdx.x * (tBlockSize-overlap))));
int size= (((startblock + tBlockSize) > n )? ((n- (startblock))):( tBlockSize));
int copyBlock=(size/J)+ ((size%J)==0?0:1);
if((threadIdx.x * copyBlock) <= size)
memcpy(Text_S+(threadIdx.x*copyBlock),text_d+(startblock+threadIdx.x*copyBlock),(((((threadIdx.x*copyBlock))+copyBlock) > size)?(size-(threadIdx.x*copyBlock)):copyBlock));
memcpy(DNew, D, (J+2)*sizeof(int));
__syncthreads();
uint32_t initial = D[1];
uint32_t x;
uint32_t mask = 1;
for(int i = 0; i < lc - 1; i++)mask = (mask<<(k+2)) + 1;
for(int i = 0; i < size;i++)
{
{
x = ((D[w] >> (k+2)) | (D[w - 1] << ((k + 2)* (lc - 1))) | (BB_S[(((int)Text_S[i])/2)%4][w-1])) & ((1 << (k + 2)* lc) - 1);
DNew[w] = ((D[w]<<1) | mask)
& (((D[w] << k+3) | mask|((D[w +1] >>((k+2)*(lc - 1)))<<1)))
& (((x + mask) ^ x) >> 1)
& initial;
}
__syncthreads();
memcpy(D, DNew, (J+2)*sizeof(int));
if(!(D[J] & 1<<(k + (k + 2)*(lc*J -m + k ))))
{
matched[startblock+i] = 1;
D[J] |= ((1<<(k + 1 + (k + 2)*(lc*J -m + k ))) - 1);
}
}
}
I am not very familiar with CUDA, so I don't quite understand issues such as shared memory bank conflicts. Could that be the bottleneck here?
As asked, this is the code where I launch the kernels:
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#define uint32_t unsigned int
#define MAX_THREADS 512
#define MAX_PATTERN_SIZE 1024
#define MAX_BLOCKS 8
#define MAX_STREAMS 16
#define TEXT_MAX_LENGTH 1000000000
void calculateBBArray(uint32_t** BB,const char* pattern_h,int m,int k , int lc , int J){};
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,
cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
char* getTextString() {
FILE *input, *output;
char c;
char * inputbuffer=(char *)malloc(sizeof(char)*TEXT_MAX_LENGTH);
int numchars = 0, index = 0;
input = fopen("sequence.fasta", "r");
c = fgetc(input);
while(c != EOF)
{
inputbuffer[numchars] = c;
numchars++;
c = fgetc(input);
}
fclose(input);
inputbuffer[numchars] = '\0';
return inputbuffer;
}
int main(void) {
const char pattern_h[] = "TACACGAGGAGAGGAGAAGAACAACGCGACAGCAGCAGACTTTTTTTTTTTTACAC";
char * text_h=getTextString(); //reading text from file, supported upto 200MB currently
int k = 13;
int i;
int count=0;
char *pattern_d, *text_d; // pointers to device memory
char* text_new_d;
int* matched_d;
int* matched_new_d;
uint32_t* BB_d;
uint32_t* BB_new_d;
int* matched_h = (int*)malloc(sizeof(int)* strlen(text_h));
cudaMalloc((void **) &pattern_d, sizeof(char)*strlen(pattern_h)+1);
cudaMalloc((void **) &text_d, sizeof(char)*strlen(text_h)+1);
cudaMalloc((void **) &matched_d, sizeof(int)*strlen(text_h));
cudaMemcpy(pattern_d, pattern_h, sizeof(char)*strlen(pattern_h)+1, cudaMemcpyHostToDevice);
cudaMemcpy(text_d, text_h, sizeof(char)*strlen(text_h)+1, cudaMemcpyHostToDevice);
cudaMemset(matched_d, 0,sizeof(int)*strlen(text_h));
int m = strlen(pattern_h);
int n = strlen(text_h);
uint32_t* BB_h[4];
unsigned int maxLc = ((((m-k)*(k+2)) > (31))?(31/(k+2)):(m-k));
unsigned int lc=2; // Determines the number of threads per block
// can be varied upto maxLc for tuning performance
if(lc>maxLc)
{
exit(0);
}
unsigned int noWordorNfa =((m-k)/lc) + (((m-k)%lc) == 0?0:1);
cudaMalloc((void **) &BB_d, sizeof(int)*noWordorNfa*4);
if(noWordorNfa >= MAX_THREADS)
{
printf("Error: max threads\n");
exit(0);
}
calculateBBArray(BB_h,pattern_h,m,k,lc,noWordorNfa); // not included this function
for(i=0;i<4;i++)
{
cudaMemcpy(BB_d+ i*noWordorNfa, BB_h[i], sizeof(int)*noWordorNfa, cudaMemcpyHostToDevice);
}
int overlap=m;
int textBlockSize=(((m+k+1)>n)?n:(m+k+1));
cudaStream_t stream[MAX_STREAMS];
for(i=0;i<MAX_STREAMS;i++) {
cudaStreamCreate( &stream[i] );
}
int start_addr=0,index=0,maxNoBlocks=0;
if(textBlockSize>n)
{
maxNoBlocks=1;
}
else
{
maxNoBlocks=((1 + ((n-textBlockSize)/(textBlockSize-overlap)) + (((n-textBlockSize)%(textBlockSize-overlap)) == 0?0:1)));
}
int kernelBlocks = ((maxNoBlocks > MAX_BLOCKS)?MAX_BLOCKS:maxNoBlocks);
int blocksRemaining =maxNoBlocks;
printf(" maxNoBlocks %d kernel Blocks %d \n",maxNoBlocks,kernelBlocks);
while(blocksRemaining >0)
{
kernelBlocks = ((blocksRemaining > MAX_BLOCKS)?MAX_BLOCKS:blocksRemaining);
printf(" Calling %d Blocks with starting Address %d , textBlockSize %d \n",kernelBlocks,start_addr,textBlockSize);
match<<<kernelBlocks,noWordorNfa,0,stream[(index++)%MAX_STREAMS]>>>(BB_d,text_d,n,m,k,noWordorNfa,lc,start_addr,textBlockSize,overlap,matched_d);
start_addr+=kernelBlocks*(textBlockSize-overlap);;
blocksRemaining -= kernelBlocks;
}
cudaMemcpy(matched_h, matched_d, sizeof(int)*strlen(text_h), cudaMemcpyDeviceToHost);
checkCUDAError("Matched Function");
for(i=0;i<MAX_STREAMS;i++)
cudaStreamSynchronize( stream[i] );
// do stuff with matched
// ....
// ....
free(matched_h);cudaFree(pattern_d);cudaFree(text_d);cudaFree(matched_d);
return 0;
}
The number of threads launched per block depends on the length of pattern_h (it can be at most maxLc above). I expect it to be around 30 in this case. Shouldn't that be enough to see a good amount of concurrency? As for blocks, I see no point in launching more than MAX_BLOCKS (8 in the code above) at a time, since the hardware can schedule only 8 simultaneously.
NOTE: I don't have GUI access.
With all the shared memory you're using, you could be running into bank conflicts if consecutive threads are not reading from consecutive addresses in the shared arrays ... that could cause serialization of the memory accesses, which in turn will kill the parallel performance of your algorithm.
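As a generic illustration (this is a made-up demo kernel, not taken from your code): on a device with 32 four-byte banks, a warp loading consecutive ints touches 32 different banks, while a stride-32 load maps every lane to the same bank and gets serialized.
__global__ void bank_demo(int *out)
{
    __shared__ int tile[32 * 32];
    for (int j = 0; j < 32; j++)            /* every thread fills one row of the tile */
        tile[threadIdx.x * 32 + j] = threadIdx.x + j;
    __syncthreads();
    int fast = tile[threadIdx.x];           /* stride-1 load: 32 distinct banks, no conflict */
    int slow = tile[threadIdx.x * 32];      /* stride-32 load: every lane hits bank 0 */
    out[threadIdx.x] = fast + slow;
}
/* launch as, e.g., bank_demo<<<1, 32>>>(out_d); */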
I briefly looked at your code, but it looks like you are sending data back and forth to the GPU, creating a bottleneck on the bus. Did you try profiling it?
I found that I was copying the whole array DNew to D in each thread, rather than having each thread copy only the element it was supposed to update, D[w]. This caused the threads to execute serially, although I don't know whether it should be called a shared memory bank conflict. Now it gives an 8-9x speedup for large enough patterns (= more threads). This is much less than what I expected. I will try to increase the number of blocks as suggested. I don't know how to increase the number of threads, though.
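Roughly, the change was the following, inside the per-character loop of the kernel above (just the relevant fragment, using the D, DNew, w and J names from the question):
/* before: every thread copied the whole array on every iteration */
/*   memcpy(D, DNew, (J+2)*sizeof(int)); */
/* after: each thread writes back only the cell it owns, then all threads synchronize */
D[w] = DNew[w];
__syncthreads();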
I'm working on a large-scale project in which I'm designing a sparse matrix-vector application, but I'm still working to understand the code. I'm beginning by building the foundation of the application, but I've run into a segmentation fault when executing the program. I've tracked the problem to this loop within the MatrixRead function and am enclosing the code below. I added some test messages, and the program appears to execute all the loops but returns the segmentation fault at the end. Of course, this is all just speculation. Any help would be awesome. Thanks!
while (ret != EOF && row <= mat->rows)
{
if (row != curr_row) // Won't execute for first iteration
{
/* store this row */
MatrixSetRow(mat, curr_row, len, ind, val);
/* check if the previous row is zero */
i = 1;
while(row != curr_row + i)
{
mat->lens[curr_row+i-1] = 0;
mat->inds[curr_row+i-1] = 0;
mat->vals[curr_row+i-1] = 0;
i++;
}
curr_row = row;
/* reset row pointer */
len = 0;
}
ind[len] = col;
val[len] = value;
len++;
ret = fscanf(file, "%lf %lf %lf", &r1, &c1, &value);
col = (int) (c1);
row = (int) (r1);
}
/* Store the final row */
if (ret == EOF || row > mat->rows)
MatrixSetRow(mat, mat->rows, len, ind, val);
Here's the code for the MatrixSetRow function:
/*--------------------------------------------------------------------------
* MatrixSetRow - Set a row in a matrix. Only local rows can be set.
* Once a row has been set, it should not be set again, or else the
* memory used by the existing row will not be recovered until
* the matrix is destroyed. "row" is in global coordinate numbering.
*--------------------------------------------------------------------------*/
void MatrixSetRow(Matrix *mat, int row, int len, int *ind, double *val)
{
row -= 1;
mat->lens[row] = len;
mat->inds[row] = (int *) MemAlloc(mat->mem, len*sizeof(int));
mat->vals[row] = (double *) MemAlloc(mat->mem, len*sizeof(double));
if (ind != NULL)
memcpy(mat->inds[row], ind, len*sizeof(int));
if (val != NULL)
memcpy(mat->vals[row], val, len*sizeof(double));
}
I'm also including the code for the Matrix.h file that went with it, where the members of Matrix are defined:
#include <stdio.h>
#include "Common.h"
#include "Mem.h"
#ifndef _MATRIX_H
#define _MATRIX_H
typedef struct
{
int rows;
int columns;
Mem *mem;
int *lens;
int **inds;
double **vals;
}
Matrix;