CUDA BFS Giant Graph (seg.fault)

CUDA BFS Giant Graph (seg.fault) - c

I'm making a test on a BFS algorithm on CUDA ( wich I know that has some syncronization problems, but it's part of my work to test it anyway ) but i'm having problems on using (or creating?) 1M+ size graphs.
Here's the code I use to create them:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define GRAPHSIZE 1000000
struct Node
{
int begin; // comeco da sub-string de vizinhos
int num; // tamanho da sub-string de vizinhos
};
int getSize()
{
int size,arcs;
printf("Size of the graph: \nNodes:\n>>");
scanf ("%d", &size);
return size;
}
void createEdges(int graphSize, int* Edges)
{
int j,value, aux, rndIdx;
int edgesSize = 2*GRAPHSIZE;
srand(time(NULL));
printf ("\nGS : %d\n", graphSize);
j = 1;
for (int i=0; i < edgesSize; i++) //first it creates an ordered array of edges
{
if (j < GRAPHSIZE)
{
Edges [i] = j;
j++;
}
else
{
j=1;
Edges [i] = j;
j++;
}
}
for (int i=0; i < edgesSize; i++) //now, it randomly swaps the edges array
{
rndIdx = rand()%graphSize;
aux = Edges[rndIdx];
Edges[rndIdx] = Edges [i];
Edges [i] = aux;
}
}
int main ()
{
int size,graphAtts[2];
int edgesSize = 2*GRAPHSIZE;
int Edges[edgesSize];
struct Node node[GRAPHSIZE];
FILE *file;
printf("____________________________\nRandom graph generator in compact format, optmized for CUDA algorithms by Ianuarivs Severvs.\nFor details about this format read the README. \n");
//size = getSize(graphAtts);
//printf ("%d,%d",size,arcs);
createEdges(GRAPHSIZE,Edges); // or size?
/*
for (int i = 0; i < edgesSize ; i ++)
{
printf ("-- %d --", Edges[i]);
}
*/
printf("\nEdges:\n");
for (int i=0; i < edgesSize; i++)
printf("%d,",Edges[i]);
for (int i=0,j=0 ; i < GRAPHSIZE; i++,j+=2) // now, completes the graph
{
node[i].begin=j;
node[i].num=2;
printf ("\n node %d : begin = %d, num = 2",i,j);
}
printf("\n");
//writes file:
file = fopen ("graph1M.g","wb");
fwrite (&Edges, edgesSize * sizeof(int),1,file);
fwrite (&node, GRAPHSIZE * sizeof(struct Node),1,file);
fclose(file);
for (int i = 0; i < edgesSize ; i ++)
{
printf ("-- %d --", Edges[i]);
}
for (int i = 0; i < GRAPHSIZE ; i ++)
{
printf ("* %d *", i);
}
}
And here's my BFS code (on CUDA):
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cutil.h>
#define GRAPHSIZE 1000000
struct Node
{
int begin; // begining of the substring
int num; // size of the sub-string
};
__global__ void BFS (Node *Va, int *Ea, bool *Fa, bool *Xa, int *Ca, bool *parada) // memory races on both Xa and Ca
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid > GRAPHSIZE)
*parada=true;
if (Fa[tid] == true && Xa[tid] == false)
{
Fa[tid] = false;
Xa[tid] = true;
//__syncthreads(); // this solves the memrace problem as long as the threads are all on the same block
for (int i = Va[tid].begin; i < (Va[tid].begin + Va[tid].num); i++) // Va begin is where it's edges' subarray begins, Va is it's number of elements
{
int nid = Ea[i];
if (Xa[nid] == false)
{
Ca[nid] = Ca[tid] + 1;
Fa[nid] = true;
*parada = true;
}
}
}
}
// The BFS frontier corresponds to all the nodes being processed at the current level.
int main()
{
// for the time couting:
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
FILE * file;
printf("\nLoading graph file...\n");
struct Node node[GRAPHSIZE];
int edgesSize = 2*GRAPHSIZE;
int edges[edgesSize];
file = fopen ("graph1M.g","rb");
printf("abriu");
fread (&edges, edgesSize * sizeof(int),1,file);
fread (&node, GRAPHSIZE * sizeof(struct Node),1,file);
fclose(file);
//For file read test propouses only:
/*
for (int i = 0; i < edgesSize ; i ++)
{
printf ("-- %d --", edges[i]);
}
for (int i = 0; i < GRAPHSIZE ; i ++)
{
printf ("* %d *", i);
}
*/
bool frontier[GRAPHSIZE]={false};
bool visited[GRAPHSIZE]={false};
int custo[GRAPHSIZE]={0};
int source=0;
frontier[source]=true;
Node* Va;
cudaMalloc((void**)&Va,sizeof(Node)*GRAPHSIZE);
cudaMemcpy(Va,node,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice);
int* Ea;
cudaMalloc((void**)&Ea,sizeof(Node)*GRAPHSIZE);
cudaMemcpy(Ea,edges,sizeof(Node)*GRAPHSIZE,cudaMemcpyHostToDevice);
bool* Fa;
cudaMalloc((void**)&Fa,sizeof(bool)*GRAPHSIZE);
cudaMemcpy(Fa,frontier,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice);
bool* Xa;
cudaMalloc((void**)&Xa,sizeof(bool)*GRAPHSIZE);
cudaMemcpy(Xa,visited,sizeof(bool)*GRAPHSIZE,cudaMemcpyHostToDevice);
int* Ca;
cudaMalloc((void**)&Ca,sizeof(int)*GRAPHSIZE);
cudaMemcpy(Ca,custo,sizeof(int)*GRAPHSIZE,cudaMemcpyHostToDevice);
dim3 grid(100,100,1); //blocks per grid
dim3 threads(100,1,1); // threads per block
bool para;
bool* parada;
cudaMalloc((void**)&parada,sizeof(bool));
printf("_____________________________________________\n");
int count=0;
cudaEventRecord(start, 0);
do{
count ++;
para=false;
cudaMemcpy(parada,&para,sizeof(bool),cudaMemcpyHostToDevice);
BFS <<<grid,threads,0>>>(Va,Ea,Fa,Xa,Ca,parada);
CUT_CHECK_ERROR("kernel1 execution failed");
cudaMemcpy(&para,parada,sizeof(bool),cudaMemcpyDeviceToHost);
}while(para);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
//printf("\nFinal:\n");
cudaMemcpy(custo,Ca,sizeof(int)*GRAPHSIZE,cudaMemcpyDeviceToHost);
/*
printf("\n_____________________________________________\n");
for(int i=0;i<GRAPHSIZE;i++)
printf("%d ",custo[i]);
printf("\n");
printf("_____________________________________________\n");
*/
cudaEventElapsedTime(&time, start, stop);
printf ("\nTime for the kernel: %lf s \n", time/1000);
printf ("Number of kernel calls : %d \n", count);
file = fopen ("graph125MPar","w");
for(int i=0;i<GRAPHSIZE;i++)
fprintf(file,"%d ",custo[i]);
fprintf(file,"\n");
fclose(file);
}
I'm having a segmantation fault while trying to run it for 1M+ graphs (please note that I used the changed the stack size of the sistem with the command ' ulimit -s 16384 ' on linux)
Can someone help?

Don't use statically allocated host arrays for the graph, use dynamic memory allocation instead. Your ulimit command is setting the stacksize to 16384 kb, but you require something like 5*sizeof(int) + 2*sizeof(bool) per graph entry which is probably 22 bytes per entry. It is pretty easy to see where you will run out of stack space with 1 million entries.

Related

Parallel BFS OpenMP C

I'm trying to develop the code to make a parallel version of the BFS (topDown) algorithm on a graph. This is the code I have developed so far:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<stdbool.h>
#include<time.h>
#include<omp.h>
#define MAX 10000
int n;
int v;
long dim = (long) MAX*MAX;
long* adj;
void create_graph();
void BF_Traversal();
void BFS(int v);
int main()
{
create_graph();
BFS(v);
return 0;
}
bool isEmpty(int* array){
for (int i=0;i<n;i++){
if (array[i] != -1)
return false;
}
return true;
}
void topDownStep(int* frontier, int* next, int* parents){
int index = 0;
#pragma omp parallel for num_threads(4)
for (int i=0;i<n;i++){
if (frontier[i]!=-1){
for (int j=0;j<n;j++){
long offset= frontier[i]*MAX + j;
if (adj[offset]==1 && j!=v){
if (parents[j] ==-1){
parents[j]=frontier[i];
next[index] = j;
index++;
}
}
}
}
}
}
void BFS(int startVertex){
int* frontier = malloc(n * sizeof(int));
int* next = malloc(n * sizeof(int));
int* parents = malloc(n * sizeof(int));
memset( frontier, -1, n*sizeof(int) );
memset( next, -1, n*sizeof(int) );
memset( parents, -1, n*sizeof(int) );
frontier[0] = startVertex;
while (!isEmpty(frontier))
{
topDownStep(frontier,next,parents);
printf("Visited frontier:\n");
for (int i=0;i<n;i++){
if(frontier[i]!=-1)
printf("%d\t",frontier[i]);
}
printf("\n");
memcpy(frontier,next,n*sizeof(int));
memset( next, -1, n*sizeof(int) );
}
}
void create_graph(){
adj = malloc(dim* sizeof(long));
int count,max_edge,origin,destin;
srand(time(NULL));
// FILE *in_file = fopen("graph.txt", "r");
// fscanf(in_file, "%d %d", &n , &v);
// printf("%d - %d",n,v);
// max_edge = n*(n-1);
// while ( fscanf(in_file, "%d %d", &origin,&destin) != EOF ) {
// if((origin == -1) && (destin == -1))
// break;
// if(origin>=n || destin>=n || origin<0 || destin<0)
// printf("Invalid edge!\n");
// else
// adj[origin][destin] = 1;
// adj[destin][origin] = 1;
// }
n = MAX;
v= 0;
max_edge = n*(n-1);
for(int i =0;i<n;i++){
for(int j=0;j<2;j++){
int val = rand()%MAX;
if(i!=val){
long offset = i*MAX+val;
adj[offset]=1;
}
}
}
}
I'm not sure if this approach is good or not for parallelizing such algorithm (see topDownStep).
The code seems to work correctly as the frontiers are visited in the right order, but I have a feeling that the code is not optimized.
Also I am afraid that race conditions may occur even if from the tests I have done it seems that this does not happen.
I look forward to your advice and I always thank you for your availability.

How can I find why my merge sorting algorithm crash when sorting an array of 1 million element?

I'm a French student and trying to calculate the execution time of the Merge Sort algorithm for different size of array.
I also want to write the different execution time in a .csv file. But when my program tries to sort an array with 1 million elements the process returns -1073741571 (0xC00000FD) in Code::Blocks. So if you could point me to a way to find a solution I would be very grateful!
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
void genTab(int *tab, int n) {
int i;
for (i = 0; i < n; i++) {
tab[i] = rand() % 100;
}
}
void fusion(int *tab, int deb, int mid, int fin) {
int i = deb;
int j = mid + 1;
int k = deb;
int temp[fin + 1];
while ((i <= mid) && (j <= fin)) {
if (tab[i] <= tab[j]) {
temp[k] = tab[i];
i++;
} else {
temp[k] = tab[j];
j++;
}
k++;
}
while (i <= mid) {
temp[k] = tab[i];
i++;
k++;
}
while (j <= fin) {
temp[k] = tab[j];
k++;
j++;
}
for (i = deb; i <= fin; i++) {
tab[i] = temp[i];
}
}
void triFusion(int *tab, int i, int j) {
if (i < j) {
triFusion(tab, i, (int)((i + j) / 2));
triFusion(tab, (int)((i + j) / 2 + 1), j);
fusion(tab, i, (int)((i + j) / 2), j);
}
}
void reset(int *tab1, int *tab2, int n) {
for (int i = 0; i < n; i++) {
tab2[i] = tab1[i];
}
}
int main() {
srand(time(NULL));
clock_t start, end;
int nbrTest[15] = {
1000, 5000, 10000, 50000, 80000, 100000, 120000, 140000,
150000, 180000, 200000, 250000, 300000, 450000, 1000000
};
FILE *fp;
char *tpsExecution = "exeTime.csv";
fp = fopen(tpsExecution, "w");
fprintf(fp, "Array Size; Merge Time");
for (int i = 0; i < 15; i++) {
int n = nbrTest[i];
printf("Calculating time for an array of %d \n", n);
int *tab = malloc(sizeof(int) * n);
genTab(tab, n);
int *copie = malloc(sizeof(int) * n);
reset(tab, copie, n);
start = clock();
triFusion(tab, 0, n - 1);
end = clock();
float tpsFusion = (float)(end - start) / CLOCKS_PER_SEC;
reset(tab, copie, n);
printf("writing in the file\n");
fprintf(fp, "\n%d;%f", n, tpsFusion);
free(tab);
free(copie);
}
fclose(fp);
return 0;
}

int temp[fin+1]; may exceed the space limit for the stack. You should allocate it with malloc instead, and free it with free.
If you want to exclude malloc and free from the timed code, the allocation could be performed outside the timed code and passed in as work space.

(Note: posted after the answer from #Eric Postpischil).
The function
void fusion(int * tab, int deb, int mid, int fin)
Has the line
int temp[fin+1];
and the value of fin comes through another function from the number of elements n to be sorted
triFusion(tab, 0, n-1);
and as an automatic variable, breaks the stack when n is large.
I suggest replacing the line with
int *temp = malloc((fin+1) * sizeof *temp);
if(temp == NULL) {
puts("malloc");
exit(1);
}
// ...
free(temp);

fusion() is always allocating the full size of the array for temp, even when only a small fraction of temp is being used. You could change this to:
int k = 0;
...
int temp[fin+1-deb];
...
tab[i]=temp[i-deb];
still this will exceed stack space if n is large. So as suggested in the other answers:
int k = 0;
...
int *temp = malloc((fin+1-deb)*sizeof(int));
...
tab[i]=temp[i-deb];
...
free(temp)
or better still, do a one time allocation of a second array in main or in a "helper" function, the include a pointer to the second array in the merge sort functions.

How to get the correct order of execution of pthreads

I was doing Histogram using pthreads and after long struggle on it.. finally it said:
Segmentation Fault (Core Dumped)
unfortunately I had this line
p=(struct1 *)malloc(sizeof(struct1));
after getting the values to the struct variables from command line.. So that was cleared off.. Thanks for #DNT for letting me know that..
Now when I try to execute the following program.. It sometimes displays the output and sometimes it is going out to the which_bin function and prints the following
output type 1(which is not the correct output):
Data = 0.000000 doesn't belong to a bin!
Quitting
output type 2(almost the correct output of histo with time taken by threads):
10.000-28.000:
28.000-46.000:
46.000-64.000:
64.000-82.000:
82.000-100.000: XXXXXXXXXX
The code to be timed took 0.000415 seconds
My ques is why the same prog when ran shows different outputs.. I am confused of what it is exactly looking for..
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include "timer.h"
void Usage(char prog_name[]);
void Gen_data(void *p);
void Gen_bins(void *p);
int Which_bin(void *p);
void Print_histo(void *p);
void func(void *p);
struct test
{
int bin_count, i, bin;
float min_meas, max_meas;
float* bin_maxes;
int* bin_counts;
int data_count;
float* data;
};
typedef struct test struct1;
int main(int argc, char* argv[])
{
double start, finish, elapsed;
GET_TIME(start);
struct1 *p;
pthread_t th1, th2, th3;
p=(struct1 *)malloc(sizeof(struct1));
if (argc != 5)
Usage(argv[0]);
p->bin_count = strtol(argv[1], NULL, 10);
p->min_meas = strtof(argv[2], NULL);
p->max_meas = strtof(argv[3], NULL);
p->data_count = strtol(argv[4], NULL, 10);
p->bin_maxes = malloc(p->bin_count*sizeof(float));
p->bin_counts = malloc(p->bin_count*sizeof(int));
p->data = malloc(p->data_count*sizeof(float));
pthread_create(&th1,NULL,(void*) Gen_data,(void*) p);
pthread_create(&th2,NULL,(void*) Gen_bins,(void*) p);
pthread_create(&th3,NULL,(void*) func,(void*) p);
printf("Hi\n");
pthread_join(th1,NULL);
pthread_join(th2,NULL);
pthread_join(th3,NULL);
Print_histo(p);
free(p->data);
free(p->bin_maxes);
free(p->bin_counts);
GET_TIME(finish);
elapsed = finish - start;
printf("The code to be timed took %f seconds\n", elapsed);
return 0;
} /* main */
void func(void *p)
{
int i;
struct1 *args;
args=(struct1*)p;
for (i = 0; i < args->data_count; i++)
{
args->bin = Which_bin(args);
args->bin_counts[args->bin]++;
}
# ifdef DEBUG
printf("bin_counts = ");
for (i = 0; i < args->bin_count; i++)
printf("%d ", args->bin_counts[i]);
printf("\n");
# endif
}
/*---------------------------------------------------------------------
* Function: Usage
* Purpose: Print a message showing how to run program and quit
* In arg: prog_name: the name of the program from the command line
*/
void Usage(char prog_name[] /* in */)
{
fprintf(stderr, "usage: %s ", prog_name);
fprintf(stderr, "<bin_count> <min_meas> <max_meas> <data_count>\n");
exit(0);
} /* Usage */
void Gen_data(void *p)
{
struct1 *args;
args=(struct1*)p;
int i;
srandom(0);
for (i = 0; i < args->data_count; i++)
args->data[i] = args->min_meas + (args->max_meas - args->min_meas)*random()/((double) RAND_MAX);
#ifdef DEBUG
printf("data = ");
for (i = 0; i < args->data_count; i++)
printf("%4.3f ", args->data[i]);
printf("\n");
#endif
} /* Gen_data */
void Gen_bins(void* p)
{
struct1 *args;
args=(struct1*)p;
float bin_width;
int i;
bin_width = (args->max_meas - args->min_meas)/args->bin_count;
for (i = 0; i < args->bin_count; i++)
{
args->bin_maxes[i] = args->min_meas + (i+1)*bin_width;
args->bin_counts[i] = 0;
}
# ifdef DEBUG
printf("bin_maxes = ");
for (i = 0; i < args->bin_count; i++)
printf("%4.3f ", args->bin_maxes[i]);
printf("\n");
# endif
}
int Which_bin(void* p)
{
struct1 *args;
args=(struct1*)p;
int bottom = 0, top = args->bin_count-1;
int mid;
float bin_max, bin_min;
while (bottom <= top)
{
mid = (bottom + top)/2;
bin_max = args->bin_maxes[mid];
bin_min = (mid == 0) ? args->min_meas: args->bin_maxes[mid-1];
if (*(args->data) >= bin_max)
bottom = mid+1;
else if (*(args->data) < bin_min)
top = mid-1;
else
return mid;
}
fprintf(stderr, "Data = %f doesn't belong to a bin!\n", args->data);
fprintf(stderr, "Quitting\n");
exit(-1);
}
void Print_histo(void *p)
{
struct1 *args;
args=(struct1*)p;
int i, j;
float bin_max, bin_min;
for (i = 0; i < args->bin_count; i++)
{
bin_max = args->bin_maxes[i];
bin_min = (i == 0) ? args->min_meas: args->bin_maxes[i-1];
printf("%.3f-%.3f:\t", bin_min, bin_max);
for (j = 0; j < args->bin_counts[i]; j++)
printf("X");
printf("\n");
}
}
/* Print_histo */ #include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include "timer.h"
void Usage(char prog_name[]);
void Gen_data(void *p);
void Gen_bins(void *p);
int Which_bin(void *p);
void Print_histo(void *p);
void func(void *p);
pthread_mutex_t lock;
struct test
{
int bin_count, i, bin;
float min_meas, max_meas;
float* bin_maxes;
int* bin_counts;
int data_count;
float* data;
};
typedef struct test struct1;
int main(int argc, char* argv[])
{
if (pthread_mutex_init(&lock, NULL) != 0)
{
printf("\n mutex init failed\n");
return 1;
}
double start, finish, elapsed;
GET_TIME(start);
struct1 *p;
pthread_t th1, th2, th3;
p=(struct1 *)malloc(sizeof(struct1));
if (argc != 5)
Usage(argv[0]);
p->bin_count = strtol(argv[1], NULL, 10);
p->min_meas = strtof(argv[2], NULL);
p->max_meas = strtof(argv[3], NULL);
p->data_count = strtol(argv[4], NULL, 10);
p->bin_maxes = malloc(p->bin_count*sizeof(float));
p->bin_counts = malloc(p->bin_count*sizeof(int));
p->data = malloc(p->data_count*sizeof(float));
pthread_create(&th1,NULL,(void*) Gen_data,(void*) p);
pthread_create(&th2,NULL,(void*) Gen_bins,(void*) p);
pthread_create(&th3,NULL,(void*) func,(void*) p);
printf("Hi\n");
pthread_join(th1,NULL);
pthread_join(th2,NULL);
pthread_join(th3,NULL);
Print_histo(p);
free(p->data);
free(p->bin_maxes);
free(p->bin_counts);
GET_TIME(finish);
elapsed = finish - start;
printf("The code to be timed took %f seconds\n", elapsed);
return 0;
} /* main */
void func(void *p)
{
pthread_mutex_lock(&lock);
printf("th3 from Gen_func\n");
int i;
struct1 *args;
args=(struct1*)p;
for (i = 0; i < args->data_count; i++)
{
args->bin = Which_bin(args);
args->bin_counts[args->bin]++;
}
# ifdef DEBUG
printf("bin_counts = ");
for (i = 0; i < args->bin_count; i++)
printf("%d ", args->bin_counts[i]);
printf("\n");
# endif
pthread_mutex_unlock(&lock);
}
/*---------------------------------------------------------------------
* Function: Usage
* Purpose: Print a message showing how to run program and quit
* In arg: prog_name: the name of the program from the command line
*/
void Usage(char prog_name[] /* in */)
{
fprintf(stderr, "usage: %s ", prog_name);
fprintf(stderr, "<bin_count> <min_meas> <max_meas> <data_count>\n");
exit(0);
} /* Usage */
void Gen_data(void *p)
{
pthread_mutex_lock(&lock);
printf("th1 from Gen_data\n");
struct1 *args;
args=(struct1*)p;
int i;
srandom(0);
for (i = 0; i < args->data_count; i++)
args->data[i] = args->min_meas + (args->max_meas - args->min_meas)*random()/((double) RAND_MAX);
#ifdef DEBUG
printf("data = ");
for (i = 0; i < args->data_count; i++)
printf("%4.3f ", args->data[i]);
printf("\n");
#endif
pthread_mutex_unlock(&lock);
} /* Gen_data */
void Gen_bins(void* p)
{
pthread_mutex_lock(&lock);
printf("th2 from Gen_bins\n");
struct1 *args;
args=(struct1*)p;
float bin_width;
int i;
bin_width = (args->max_meas - args->min_meas)/args->bin_count;
for (i = 0; i < args->bin_count; i++)
{
args->bin_maxes[i] = args->min_meas + (i+1)*bin_width;
args->bin_counts[i] = 0;
}
# ifdef DEBUG
printf("bin_maxes = ");
for (i = 0; i < args->bin_count; i++)
printf("%4.3f ", args->bin_maxes[i]);
printf("\n");
# endif
pthread_mutex_unlock(&lock);
}
int Which_bin(void* p)
{
struct1 *args;
args=(struct1*)p;
int bottom = 0, top = args->bin_count-1;
int mid;
float bin_max, bin_min;
while (bottom <= top)
{
mid = (bottom + top)/2;
bin_max = args->bin_maxes[mid];
bin_min = (mid == 0) ? args->min_meas: args->bin_maxes[mid-1];
if (*(args->data) >= bin_max)
bottom = mid+1;
else if (*(args->data) < bin_min)
top = mid-1;
else
return mid;
}
fprintf(stderr, "Data = %f doesn't belong to a bin!\n", args->data);
fprintf(stderr, "Quitting\n");
exit(-1);
}
void Print_histo(void *p)
{
struct1 *args;
args=(struct1*)p;
int i, j;
float bin_max, bin_min;
for (i = 0; i < args->bin_count; i++)
{
bin_max = args->bin_maxes[i];
bin_min = (i == 0) ? args->min_meas: args->bin_maxes[i-1];
printf("%.3f-%.3f:\t", bin_min, bin_max);
for (j = 0; j < args->bin_counts[i]; j++)
printf("X");
printf("\n");
}
}
/* Print_histo */
I have added the lines to see if all the threads are accessing its functions.. I observed this..
output 1:
Hi
th1 from Gen_data
th3 from Gen_func
Data = 0.000000 doesn't belong to a bin!
Quitting
In the output 1, I can see that th2 is not executed and program ended displaying error..
output 2:
th1 from Gen_data
Hi
th2 from Gen_bins
th3 from Gen_func
10.000-28.000:
28.000-46.000:
46.000-64.000:
64.000-82.000:
82.000-100.000: XXXXXXXXXX
The code to be timed took 0.000348 seconds
In output 2, all the threads are executed and so is the output..
I am confused that why the thread th2 is not being executed and how can I make sure that all threads runs in correct order..
I would like to know if the program is logically wrong? if it is wrong logically in that case why is it showing the histogram output at times.. Thanks!

Order of thread execution is not guaranteed. On a modern, multi-core processor, the threads may even execute concurrently. There is no guarantee that the Gen_bins thread completes before the func thread. Since your threads access and manipulate the same data structures, the results are unpredictable as you have noticed.
While I don't think threads are necessary for this application, make the following change to ensure the threads execute in the order listed. Change:
pthread_create(&th1,NULL,(void*) Gen_data,(void*) p);
pthread_create(&th2,NULL,(void*) Gen_bins,(void*) p);
pthread_create(&th3,NULL,(void*) func,(void*) p);
pthread_join(th1,NULL);
pthread_join(th2,NULL);
pthread_join(th3,NULL);
to:
pthread_create(&th1,NULL,(void*) Gen_data,(void*) p);
pthread_join(th1,NULL);
pthread_create(&th2,NULL,(void*) Gen_bins,(void*) p);
pthread_join(th2,NULL);
pthread_create(&th3,NULL,(void*) func,(void*) p);
pthread_join(th3,NULL);
This ensures that each thread executes and completes before the next starts. Again, since the threads aren't executing concurrently, threading isn't necessary for this program and just adds complexity.

Issues with pointers when passing a struct to a thread on Win32 API

The user provides command line arguments that are used to compute the number of partitions, and the number of threads, where each thread does a minimum linear search of a specific partition of the large array. Each minimum value found by a thread is stored inside a small global array. The main function then does a minimum linear search of the small array, and also a minimum search of the large array and confirms that the minimum found in both small and large array are equal. The problem that I am encountering is that the minimums inside the small global array are sometimes garbage, and sometimes matches the minimum found in the large array. I have tried to figure out the problem but I don't seem to find it. Your help will be really appreciated. I am coding in C, using Dev-C++ on win32 API. The code is bellow:
#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#define RAND_DIVISOR 800
int number_items = 0;
int size = 1;
int partits = 1;
int P = 0;
int N = 0;
int Index = 0;
int index_global = 0;
int min;
#define NUM_THREADS 65536 //or 2^16
typedef struct thread_data
{
int thread_id;
int a;
int b;
int * copy_array;
int * glob_array;
int nbr_items;
int subarraysize;
} s_param, *p_s_param;
int compare (const void *a, const void *b)
{
return( *(int*)a - *(int*)b);
}
DWORD WINAPI CompMin( LPVOID lpParam )
{
int i, tmp;
int SubArSize,nbrItems,thrid;
p_s_param param2;
param2 = (p_s_param)lpParam;
min = param2->copy_array[Index];
min = param2->copy_array[param2->a];
param2->glob_array[index_global] = min;
Index++;
index_global++;
}
int main(int argc, char *argv[])
{
int sub_array_size;
p_s_param pDataArray[NUM_THREADS];
DWORD dwThreadIdArray[NUM_THREADS];
HANDLE hThreadArray[NUM_THREADS];
HANDLE myhandle;
//pthread_t thID, thread;
p_s_param param[NUM_THREADS];
int rNum, rc = 0, i, j, large_min;
double time1, time2, time3, time4;
//get initial timestamp in micro seconds
struct timeval tv;
gettimeofday( &tv, NULL );
time1 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
printf( "Start timestamp: %f\n", time1 );
if(argc < 2 )
{
printf("Need %d arguments, only %d provided\n", 2, argc);
printf("The program will exit now!\n");
return 1;
}
P = atoi(argv[1]); /* will be used to define size of large array */
N = atoi(argv[2]); /* will be used to define number of threads */
if(N>P)
{
printf(" Argument 1 should be greater than argument 2\n");
printf("The program will exit now!\n");
return 1;
}
/*compute the size of the array*/
for (i=1; i<=P; i++)
size = size * 2;
/*Create a dynamic array of size size*/
int *array = (int*) malloc(size*sizeof(int));
srand(time(NULL));
for (i=0; i<size; i++)
{
rNum = rand() / RAND_DIVISOR;
array[i] = rNum;
}
/*compute the number of partitions*/
for (i = 1; i<=N; i++)
partits = partits * 2;
/*numbers of elements per sub array*/
sub_array_size = size/partits;
/*Global array*/
int *Globalarray = (int*) malloc(partits*sizeof(int));
for (i=0; i<partits; i++)
{
/*Allocate memory for thread data*/
param[i] = (p_s_param) HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(s_param));
if( param[i] == NULL)
{
ExitProcess(2);
}
param[i]->a=i;
param[i]->copy_array=array;
param[i]->glob_array = Globalarray;
hThreadArray[i] = CreateThread(NULL, 0, CompMin, param[i], 0, &dwThreadIdArray[i]);
if(hThreadArray[i] == NULL)
{
puts("Error, cannot create Thread!");
puts(strerror(errno));
ExitProcess(3);
}
//printf("Number of partitions: %d\n",partits );
} WaitForMultipleObjects(NUM_THREADS,hThreadArray, TRUE, INFINITE);
/*find mimimum value from Global array returned by threads*/
min = Globalarray[0];
for(i = 0; i< partits; i++)
{
printf("Index: %d, value into small array: %d\n",i, Globalarray[i] );
if(Globalarray[i] < min)
min = Globalarray[i];
}
gettimeofday( &tv, NULL );
time2 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
gettimeofday( &tv, NULL );
time3 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*sorting the large array in ascending order and find minimum value*/
//qsort(array,size, sizeof(int), compare);
large_min = array[0];
for(i = 0; i< partits; i++)
{
printf("value into large array: %d\n",array[i] );
if(array[i] < large_min)
large_min = array[i];
}
//large_min = array[0];
gettimeofday( &tv, NULL );
time4 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*display result*/
printf("Min from small array : %d\n", min);
printf("Min from large array : %d\n", large_min);
if(min == large_min)
printf("Same minimum found in small and large array! : %d\n", large_min);
else
{
printf("error!, the min from small %d array is different from large array %d!\n", min, array[0]);
return 1;
}
printf("length of time recorded to search min in small array: %f\n", time2-time1);
printf("length of time recorded to search min in large array: %f\n", time4-time3);
free((void*) Globalarray);
free((void*) array);
exit (0);
}

I just added a sleep(3) after the wait, and it fixed the problem.

Your CompMin() function is not thread-safe. It is accessing global variables that are shared and modified by multiple threads at the same time, so they are going to step over each other's data as they run in parallel to each other. You need to make your work data self-contained so each thread is only operating on the data it is given to work on, and get rid of your shared globals altogether. You designed your thread_data struct to allow partitioning the array data, but you are not actually utilizing that functionality, so each thread is not searching its individual partition of data, and not storing its search result in its individual section of the global array.
You are also passing the wrong number of thread handles to WaitForMultipleObjects(), so it will fail to wait, which you are not checking for, and then you move on to process the array data before they are actually ready to be processed.
Your are also searching the arrays incorrectly after the threads have finished running, so you are not going to end up with the correct results.
Try something more like this instead:
#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#define RAND_DIVISOR 800
typedef struct thread_data
{
DWORD thread_id;
int *items;
int nbr_items;
int min;
} s_thread_data, *p_s_thread_data;
/*
int compare (const void *a, const void *b)
{
return( *(int*)a - *(int*)b);
}
*/
DWORD WINAPI CompMin( LPVOID lpParam )
{
p_s_thread_data data = (p_s_thread_data) lpParam;
int i;
data->min = data->items[0];
for(i = 1; i < data->nbr_items; i++)
{
if(data->items[i] < data->min)
data->min = data->items[i];
}
return 0;
}
int main(int argc, char *argv[])
{
int size = 1;
int partits = 1;
int sub_array_size;
int i;
if(argc < 2 )
{
printf("Need %d arguments, only %d provided\n", 2, argc);
printf("The program will exit now!\n");
return 1;
}
int P = atoi(argv[1]); /* will be used to define size of large array */
if(P < 1)
{
printf(" Argument 1 should be greater than zero\n");
printf("The program will exit now!\n");
return 1;
}
int N = atoi(argv[2]); /* will be used to define number of threads */
if(N < 1)
{
printf(" Argument 2 should be greater than zero\n");
printf("The program will exit now!\n");
return 1;
}
/*compute the size of the large array*/
for (i=1; i<=P; i++)
size = size * 2;
/*Allocate memory for large array*/
int *array = (int*) malloc(size*sizeof(int));
if(array == NULL)
return 2;
srand(time(NULL));
/*Fill the large array with random data*/
for (i=0; i<size; i++)
array[i] = rand() / RAND_DIVISOR;
/*compute the number of partitions*/
for (i = 1; i<=N; i++)
partits = partits * 2;
//printf("Number of partitions: %d\n", partits );
/*numbers of elements per partition*/
sub_array_size = size/partits;
/*Allocate memory for thread data*/
p_s_thread_data ThreadDataArray = (p_s_thread_data) malloc(partits*sizeof(s_thread_data));
if(ThreadDataArray == NULL)
return 2;
memset(ThreadDataArray, 0, partits*sizeof(s_thread_data));
/*Allocate memory for thread handles array*/
HANDLE *hThreadArray = (HANDLE*) malloc(partits*sizeof(HANDLE));
if(hThreadArray == NULL)
return 2;
memset(hThreadArray, 0, partits*sizeof(HANDLE));
double time1, time2, time3, time4;
//get initial timestamp in micro seconds
struct timeval tv;
gettimeofday( &tv, NULL );
time1 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
printf( "Start timestamp: %f\n", time1 );
for (i=0; i<partits; i++)
{
ThreadDataArray[i].items = &array[i*sub_array_size];
ThreadDataArray[1].nbr_items = sub_array_size;
hThreadArray[i] = CreateThread(NULL, 0, CompMin, &param[i], 0, &(param[i].thread_id));
if(hThreadArray[i] == NULL)
{
printf("Error, cannot create Thread! %s\n", strerror(errno));
return 3;
}
}
/*Wait for threads to finish*/
i = 0;
int nbr_handles = partits;
while (nbr_handles >= MAXIMUM_WAIT_OBJECTS)
{
if (WaitForMultipleObjects(MAXIMUM_WAIT_OBJECTS, &hThreadArray[i], TRUE, INFINITE) != WAIT_OBJECT_0)
return 4;
i = i + MAXIMUM_WAIT_OBJECTS;
nbr_handles = nbr_handles - MAXIMUM_WAIT_OBJECTS;
}
if (nbr_handles > 0)
{
if (WaitForMultipleObjects(nbr_handles, &hThreadArray[i], TRUE, INFINITE) != WAIT_OBJECT_0)
return 4;
}
/*find minimum value from thread results*/
int min = ThreadDataArray[0].min;
for(i = 0; i < partits; i++)
{
printf("Index: %d, value into small array: %d\n",i, ThreadDataArray[i].min );
if(ThreadDataArray[i].min < min)
min = ThreadDataArray[i].min;
}
gettimeofday( &tv, NULL );
time2 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
gettimeofday( &tv, NULL );
time3 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*sorting the large array in ascending order and find minimum value*/
//qsort(array,size, sizeof(int), compare);
int large_min = array[0];
for(i = 0; i < size; i++)
{
printf("value into large array: %d\n", array[i] );
if(array[i] < large_min)
large_min = array[i];
}
gettimeofday( &tv, NULL );
time4 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*display result*/
printf("Min from small array : %d\n", min);
printf("Min from large array : %d\n", large_min);
if(min == large_min)
printf("Same minimum found in small and large array! : %d\n", large_min);
else
{
printf("error!, the min from small array (%d) is different from large array (%d)!\n", min, large_min);
return 1;
}
printf("length of time recorded to search min in small array: %f\n", time2-time1);
printf("length of time recorded to search min in large array: %f\n", time4-time3);
free(array);
free(ThreadDataArray);
free(hThreadArray);
return 0;
}

Pthread_join() Causing segment default error

What the following code trying to accomplish is just to compute the Matrix Multiplication of A and B to get matrix C. It uses nXn threads to compute each entry of C independently. So the code works on Cygwin, but not on linux. I keep getting segment default with the Pthread_join calls.
#define _REENTRANT // Make sure the library functions are MT (muti-thread) safe
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#define BUFFER_SIZE 512
// Declare a structure data type that will be used to pass arguments to the worker threads
typedef struct args_for_thread_t{
int *rowA;
int rowIdx;
int *columnB;
int columnIdx;
int **matrixC;
} ARGS_FOR_THREAD;
/* Global Variables */
int numRows,numColumns;
/*Function Prototype*/
void *computeC(void *this_arg);
void printMatrix(int** matrix,int numRows,int numColumns);
int main(void){
const char filename[] = "input_data.txt";
FILE *file = fopen(filename,"r");
char *delims = " ";
int **matrixA,**matrixB,**matrixC;
int flagB = 0; //Indicate wether the program should process matrixB
int i,j;
if (file){
char line[BUFFER_SIZE];
int rowIdx = 0;
while (fgets(line,sizeof(line), file)){
char substr[BUFFER_SIZE], *result;
//fputs(line,stdout);
result = strtok(line, delims);
int columnIdx = 0;
//Once we reach a line break, we start the processing of matrix B
if (!strcmp(line,"\n")){
flagB = 1;
rowIdx = 0; //Reset the rowIdx
continue; //Skip the new line, and start to read data into matrix B
}
while (result != NULL){
if (!strcmp(result,"ROWS")){ //To retrieve the number of rows
result = strtok(NULL,delims);
numRows = atoi(result);
matrixA = (int **) malloc(numRows*sizeof(int*));
matrixB = (int **) malloc(numRows*sizeof(int*));
matrixC = (int **) malloc(numRows*sizeof(int*));
rowIdx = -1;
result = strtok(NULL,delims);
}
else if (!strcmp(result,"COLUMNS")){//To retrieve the number of Columns
result = strtok(NULL,delims);
numColumns = atoi(result);
for (i=0;i<numRows;i++){ //Malloc the columns
matrixA[i] = (int *) malloc(numColumns*sizeof(int));
matrixB[i] = (int *) malloc(numColumns*sizeof(int));
matrixC[i] = (int *) malloc(numColumns*sizeof(int));
}
rowIdx = -1;
result = strtok(NULL,delims);
}
else if (!flagB){ //Processing Matrix A
matrixA[rowIdx][columnIdx] = atoi(result);
columnIdx++;
result = strtok(NULL,delims);
}
else if (flagB){ //Processing Matrix B
matrixB[rowIdx][columnIdx] = atoi(result);
columnIdx++;
result = strtok(NULL,delims);
}
}
rowIdx++;
}
}
else{
printf("No Such File exists!\n");
}
//At this point, matrix A and matrix B are both ready for computation. We will start to compute the product of the two matrices
int num_threads = numRows*numColumns; //The toal number of worker threads
pthread_t *worker_thread = (pthread_t *) malloc(sizeof(pthread_t)*num_threads);
ARGS_FOR_THREAD *args_for_thread;
for(i = 0; i < numRows; i++){
for(j = 0; j < numColumns; j++){
args_for_thread = (ARGS_FOR_THREAD *)malloc(sizeof(ARGS_FOR_THREAD)); // Allocate memory for the structure that will be used to pack the arguments
args_for_thread-> rowA = matrixA[i];
//We need to allocate the corresponding column in B for multiplication
int k;
args_for_thread->columnB =(int *) malloc(sizeof(int)*numRows);
for (k=0;k<numRows;k++){
args_for_thread-> columnB[k] = matrixB[k][j];
}
//rowIdx and columnIdx gives the corresponding entry for matrix C
args_for_thread-> rowIdx = i;
args_for_thread-> columnIdx = j;
args_for_thread-> matrixC = matrixC;
if((pthread_create(&worker_thread[i], NULL, computeC, (void *)args_for_thread)) != 0){
printf("Cannot create thread \n");
exit(0);
}
}
}
// Wait for all the worker threads to finish
for(i = 0; i < num_threads; i++)
pthread_join(worker_thread[i], NULL);
//Print out the Final Matrix C
printMatrix(matrixC,numRows,numColumns);
//Clean up pointers
for(i = 0; i < numRows; i++){
free(matrixA[i]);
free(matrixB[i]);
free(matrixC[i]);
}
free(matrixA);
free(matrixB);
free(matrixC);
}
void printMatrix(int** matrix,int numRows, int numColumns){
int i,j;
for (i=0;i<numRows;i++){
for (j=0;j<numColumns;j++){
printf("%d ",matrix[i][j]);
if (j==numColumns-1){
printf("\n");
}
}
}
}
/* Function that will be executed by all the worker threads. It will compute the i,j entry for column C */
void *computeC(void *this_arg){
ARGS_FOR_THREAD *arg = (ARGS_FOR_THREAD *) this_arg;
int rowIdx = arg->rowIdx;
int columnIdx = arg->columnIdx;
int *rowA = arg->rowA;
int *columnB = arg->columnB;
int **matrixC = arg->matrixC;
int i;
int sum = 0;
for (i=0;i<numRows;i++){ //Compute entry for matrix C. Since A,B are nxn square matrix, we can use either numRows or numColumns as the size
sum += rowA[i]*columnB[i];
}
matrixC[rowIdx][columnIdx] = sum;
free((void *) arg); // Free up the structure
pthread_exit(NULL);
}
What is the issue here? Thank you.

Here:
pthread_create(&worker_thread[i] ...
You create i * j threads, yet you only provide worker_threads[i] hence your program keeps using the same pthread_t variables. It later fails when you try to join the threads with undefined pthread_t values.
Replace by:
pthread_create(&worker_thread[i*numColumns+j] ...

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

CUDA BFS Giant Graph (seg.fault) - c

Related

Parallel BFS OpenMP C

How can I find why my merge sorting algorithm crash when sorting an array of 1 million element?

How to get the correct order of execution of pthreads

Issues with pointers when passing a struct to a thread on Win32 API

Pthread_join() Causing segment default error

Categories

Resources