I am writing a multi threaded c program to multiply two matrices and find the row norm using pthreads and Blas. I thought I had it working when I set the dimension of the matrices to 4 and the number of threads to use to 2. I then changed the number of threads, and it no longer works. It does not compute the wrong answers, but gets stuck when I try to join the threads
void *matrix_norm(void *arg){
mat_norm_t *thread_mat_norm_data = arg;
int n = thread_mat_norm_data->n;
int i, j;
double norm = 0.;
for(i=0;i<thread_mat_norm_data->sub_n;i++){
double row_sum = 0.;
for(j=0;j<n;j++){
row_sum += *(thread_mat_norm_data->z+i*n+j);
}
if(row_sum>norm){
norm = row_sum;
}
}
pthread_mutex_lock(thread_mat_norm_data->mutex);
if (norm > *(thread_mat_norm_data->global_norm)){
*(thread_mat_norm_data->global_norm)=norm;
}
pthread_mutex_unlock(thread_mat_norm_data->mutex);
pthread_exit(NULL);
}
int main() {
pthread_t *working_thread;
mat_mult_t *thread_mat_mult_data;
mat_norm_t *thread_mat_norm_data;
pthread_mutex_t *mutex;
double *x, *y, *z, norm;
int i, rows_per_thread;
int n = 8;
int num_of_thrds = 4;// Works when this is 2, not when 4
if(n<=num_of_thrds && num_of_thrds < MAXTHRDS){
printf("Matrix dim must be greater than num of thrds\nand num of thrds less than 124.\n");
return (-1);
}
x = malloc(n*n*sizeof(double));
y = malloc(n*n*sizeof(double));
z = malloc(n*n*sizeof(double));
initMat(n, x);
initMat(n, y);
working_thread = malloc(num_of_thrds * sizeof(pthread_t));
thread_mat_mult_data = malloc(num_of_thrds * sizeof(mat_mult_t));
rows_per_thread = n/num_of_thrds;
for(i=0;i<num_of_thrds;i++){
thread_mat_mult_data[i].x = x + i * rows_per_thread * n;
thread_mat_mult_data[i].y = y;
thread_mat_mult_data[i].z = z + i * rows_per_thread * n;
thread_mat_mult_data[i].n = n;
thread_mat_mult_data[i].sub_n =
(i == num_of_thrds-1) ? n-(num_of_thrds-1)*rows_per_thread : rows_per_thread;
pthread_create(&working_thread[i], NULL, matrix_mult, (void *)&thread_mat_mult_data[i]);
}
for(i=0;i<num_of_thrds;i++){
pthread_join(working_thread[i], NULL);
}
free(working_thread);
working_thread = malloc(num_of_thrds * sizeof(pthread_t));
thread_mat_norm_data = malloc(num_of_thrds * sizeof(mat_norm_t));
mutex = malloc(sizeof(pthread_mutex_t));
for(i=0;i<num_of_thrds;i++){
thread_mat_norm_data[i].z = z + i * rows_per_thread * n;
thread_mat_norm_data[i].n = n;
thread_mat_norm_data[i].global_norm = &norm;
thread_mat_norm_data[i].sub_n =
(i == num_of_thrds-1) ? n-(num_of_thrds-1)*rows_per_thread : rows_per_thread;
thread_mat_norm_data[i].mutex = mutex;
pthread_create(&working_thread[i], NULL, matrix_norm, (void *)&thread_mat_norm_data[i]);
}
//Stuck running here
for(i=0;i<num_of_thrds;i++){
pthread_join(working_thread[i], NULL);
}
printMat(n, z , "z");
printf("\nRow Sum Norm = %f\n", norm);
free(x);
free(y);
free(z);
free(working_thread);
free(thread_mat_mult_data);
free(thread_mat_norm_data);
pthread_mutex_destroy(mutex);
free(mutex);
return(0);
}
I unsure why it works under certain circumstances and not others, any explanation would be great!
Forgot to initialize the mutex with pthread_mutex_init(mutex, NULL); I am still unsure why it would work with out this for two threads but not more than this?
Related
I am creating 2 programs to test the differences in run time of serial matrix multiply vs that of parallel matrix multiply. The parallel code that I have written is actually running slower than serial code, and running the program with additional cores enabled provides no speedup at all... using more cores actually seems to slow down the parallel program.
What is going on here? This is my parallel code: to use this pass in matrix size and thread number (see my useage below)
#include <stdio.h>
#include <stdlib.h> // rand(), srand()
#include <unistd.h>
#include <time.h>
#include <pthread.h>
// Time struct + prototypes
struct timespec time1, time2, diffTime;
struct timespec timespecDifference(struct timespec start, struct timespec end); // For timing
double** reserveMatrix(int nRows, int nCols);
void printMat(double** mat1, int rows, int cols);
void* matMult(void* arg);
// Argstruct
typedef struct {
double** result;
int tid;
int size;
int s;
int e;
} argStr;
// global variables for use by all threads
int size; // Size of a row and column.
int numThreads; // Number of pThreads to do work
double** mat1;
double** mat2;
double** mat3;
// Main function
int main(int argc, char *argv[]) {
size = atoi(argv[1]);
numThreads = atoi(argv[2]);
mat1 = reserveMatrix(size, size);
mat2 = reserveMatrix(size, size);
mat3 = reserveMatrix(size, size);
if (size == 0) {
//printf("Matrix cannot be size 0\n");
return -1;
}
//Start timer
clock_gettime(CLOCK_MONOTONIC, &time1);
// *********** Begin main operation *********** //
// //
// declare necessary local variables
pthread_t theThreads[numThreads];
argStr data[numThreads]; // Create numThreads # of argStr objects
for (int i = 0; i < numThreads; i++) {
data[i].result = reserveMatrix(size, size);
data[i].tid = i; // Self-assigned threadID
data[i].size = size; // Size of a block
data[i].s = size * i / numThreads;
data[i].e = size * (i + 1) / numThreads - 1;
//printf("I handle operations from %d to %d\n", data[i].s, data[i].e);
}
// Start the threads
for (int i = 0; i < numThreads; i++) {
pthread_create(&theThreads[i], NULL, matMult, (void*) (&data[i]));
}
// await all threads being done.
for (int i = 0; i < numThreads; i++) {
pthread_join(theThreads[i], NULL);
}
// rejoin received data
//printMat(data[1].result, size, size);
// //
// *********** End main operation *********** //
// Stop timer and find time taken
clock_gettime(CLOCK_MONOTONIC, &time2);
diffTime = timespecDifference(time1, time2);
double cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
//Print Time
printf("Pthread Matrix Multiply, %d, %d, %lf\n", size, numThreads, cpuTimeUsed);
}
// Struct Timer
struct timespec timespecDifference(struct timespec start, struct timespec end)
{
struct timespec temp;
if ((end.tv_nsec - start.tv_nsec) < 0) {
temp.tv_sec = end.tv_sec - start.tv_sec - 1;
temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
}
else {
temp.tv_sec = end.tv_sec - start.tv_sec;
temp.tv_nsec = end.tv_nsec - start.tv_nsec;
}
return temp;
}
// Reserve matrix function
double** reserveMatrix(int nRows, int nCols) {
double** matrix1 = (double**)malloc(nRows * sizeof(double*));
matrix1[0] = (double*)malloc(nRows * nCols * sizeof(double));
// Assign row pointers to "segment" out the data
for (int r = 1; r < nRows; ++r) {
matrix1[r] = &(matrix1[0][r * nCols]);
}
// Give values to the array
for(int i = 0; i < nRows * nCols; i++) {
matrix1[0][i] = i;
}
return matrix1;
}
// Print matrix function
void printMat(double** mat1, int rows, int cols) {
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
printf("%f, ", mat1[i][j]);
}
printf("\n");
}
printf("End of array print\n");
}
void* matMult(void* arg) {
//printf("Begin an operation\n");
argStr* args = (argStr*)arg;
double** result = args->result;
int tid = args->tid;
int size = args->size; // Size of the matrix
long s = args->s; // Start
long e = args->e; // End
// Print message to confirm data is getting stored
//printf("Hello from operation %d! \n", tid);
//printf("I am working from number %ld to %ld\n", s, e);
for(int r = s; r <= e; r++) { // May need to declare out of loop
for(int c = 0; c < size; c++) {
result[r][c] = 0.0;
for(int i = 0; i < size; i++) {
result[r][c] += mat1[r][i] * mat2[i][c];
}
}
}
// Print multipled matrix values
//printMat(mat3, size, size);
return NULL;
}
This is my serial code: To use this pass in the same sized row and column (see my useage below)
#include <stdio.h>
#include <stdlib.h> // rand(), srand()
#include <unistd.h>
#include <time.h>
// Matrix multiply code
// **** Time struct **** //
struct timespec time1, time2, diffTime;
// Prototypes
struct timespec timespecDifference(struct timespec start, struct timespec end); // For timing
double** matrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols);
double** transMatrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols);
double** reserveMatrix(int nRows, int nCols);
double matrixProduct(double** mat1, double** mat2, int nRows, int nCols);
void printMat(double** mat1, int rows, int cols);
// Begin main
int main(int argc, char *argv[])
{
int rows = atoi(argv[1]);
int cols = atoi(argv[2]);
// Declare the ARRAYS and populate them
double** arr1 = reserveMatrix(rows, cols);
double** arr2 = reserveMatrix(rows, cols);
double** arr3 = reserveMatrix(rows, cols);
double** arr4 = reserveMatrix(rows, cols);
double prod1 = matrixProduct(arr1, arr2, rows, cols);
//Start Clock
clock_gettime(CLOCK_MONOTONIC, &time1);
arr3 = matrixMultiply(arr1, arr2, arr3, rows, cols);
// Stop timer and find time taken
clock_gettime(CLOCK_MONOTONIC, &time2);
diffTime = timespecDifference(time1, time2);
double cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
//Print Time
printf("Matrix Multiply, %d, %lf\n", rows, cpuTimeUsed);
// Print input matrix values. Used to test that matrix multiply works - it does
// Perform a transposition of matrix 2
for (int r = 0; r < rows; ++r) {
for (int c = r + 1; c < cols; ++c) {
double val = arr2[r][c];
arr2[r][c] = arr2[c][r];
arr2[c][r] = val;
}
}
// Run matrix multiply again on the newly transposed data.
//Start Clock
clock_gettime(CLOCK_MONOTONIC, &time1);
arr4 = transMatrixMultiply(arr1, arr2, arr4, rows, cols);
// Stop timer and find time taken
clock_gettime(CLOCK_MONOTONIC, &time2);
diffTime = timespecDifference(time1, time2);
cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
//Print Time
printf("Trans Matrix Multiply, %d, %lf\n", rows, cpuTimeUsed);
//double prod2 = matrixProduct(arr3, arr4, rows, cols);
//printf("The matrix product of m3 and m4 is: %f\n", prod2);
//printMat(mat3, rows, cols);
return 0;
}
// Struct Timer
struct timespec timespecDifference(struct timespec start, struct timespec end)
{
struct timespec temp;
if ((end.tv_nsec - start.tv_nsec) < 0) {
temp.tv_sec = end.tv_sec - start.tv_sec - 1;
temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
}
else {
temp.tv_sec = end.tv_sec - start.tv_sec;
temp.tv_nsec = end.tv_nsec - start.tv_nsec;
}
return temp;
}
// standard matrix multiply
double** matrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols) {
for (int r = 0; r < nRows; ++r) {
for (int c = 0; c < nCols; ++c) {
result[r][c] = 0.0;
for (int i = 0; i < nRows; ++i) {
result[r][c] += matrix1[r][i] * matrix2[i][c];
}
}
}
return result;
}
// Transpose matrix multiply
double** transMatrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols) {
for (int c = 0; c < nCols; ++c) {
for (int r = 0; r < nRows; ++r) {
result[c][r] = 0.0;
for (int i = 0; i < nCols; ++i) {
result[c][r] += matrix1[c][i] * matrix2[r][i];
}
}
}
return result;
}
// Reserve data function. Reserves and populates array data
double** reserveMatrix(int nRows, int nCols) {
double** matrix1 = (double**)malloc(nRows * sizeof(double*));
matrix1[0] = (double*)malloc(nRows * nCols * sizeof(double));
// Assign row pointers to "segment" out the data
for (int r = 1; r < nRows; ++r) {
matrix1[r] = &(matrix1[0][r * nCols]);
}
// Give values to the array
for(int i = 0; i < nRows * nCols; i++) {
matrix1[0][i] = i;
}
return matrix1;
}
// Check that matrix1 and matrix2 are the same
double matrixProduct(double** mat1, double** mat2, int nRows, int nCols) {
double sum = 0.0;
for(int i = 0; i < nRows * nCols; i++) {
sum += (mat1[0][i] - mat2[0][i]) * (mat1[0][i] - mat2[0][i]);
//printf("matrix product pos: %i, sum: %f\n", i, sum);
}
return sum;
}
// Print matrix function
void printMat(double** mat1, int rows, int cols) {
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
printf("%f, ", mat1[i][j]);
}
printf("\n");
}
printf("End of array print\n");
}
Here is the linux output of me compiling and running this code. At matrix size 1200 x 1200 the run time differences are not that pronounced, but the serial code ends up being significantly faster than the parallel at sizes above 1500 x 1500:
MYPC:~/Projects/matrixMultiply/phase3$ gcc matrixMult.c -o MM
MYPC:~/Projects/matrixMultiply/phase3$ gcc pMatMult.c -lpthread -o PMM
MYPC:~/Projects/matrixMultiply/phase3$ ./MM 1200 1200
Matrix Multiply, 1200, 25.487388
Trans Matrix Multiply, 1200, 16.452777
MYPC:~/Projects/matrixMultiply/phase3$ ./PMM 1200 2
Pthread Matrix Multiply, 1200, 2, 22.495115
MYPC:~/Projects/matrixMultiply/phase3$ ./PMM 1200 4
Pthread Matrix Multiply, 1200, 4, 22.181686
The sections in bold contain the meaningful output. It reads
name of the process
matrix size
number of threads spawned (in pThread program only)
run time
Any help would be appreciated. I will be instantly replying to questions for the next 2 hours.
The solution was to terminate extra processes that were running on my ubuntu machine. The code worked perfectly fine as a few users pointed out. Killing all other processes on the machine, then running my parallel code provided the expected speedups.
I am not sure of the precise technical reason this is going on other than the machine wasn't prioritizing my program when it had others running, resulting in slower times.
ex = 1 + x + x2/2! + x3/3! + x4/4! + x5/5! +...
I have converted a Taylor series of ex (above) into a program of OpenMp.
All the codes are written below.
When I run the code through Oracle Ubuntu it works.
It is giving me e^0=1,e^1=2.718,e^2=7.389056
But when I run it on Ubuntu (not virtually), then it doesn't work right.
It is giving me e^0=nan,e^1=0.40..,e^2=4.780.
And output is totally random as in its not exact as I mentioned above.
I need help.
#include <math.h>
#include <pthread.h>
#include <stdlib.h>
long double x, fact[150], pwr[150], s[1];
int i, term;
void *Power(void *temp) {
int k;
for (k = 0; k < 150; k++) {
pwr[k] = pow(x, k);
//printf("%.2Lf\n", pwr[k]);
}
return pwr;
}
void *Fact(void *temp) {
long double f;
int j;
fact[0] = 1.0;
for (term = 1; term < 150; term++) {
f = 1.0;
for (j = term; j > 0; j--)
f = f * j;
fact[term] = f;
//printf("%.2Lf\n", fact[term]);
}
return fact;
}
void *Exp(void *temp) {
int t;
s[0] = 0;
for (t = 0; t < 150; t++)
s[0] = s[0] + (pwr[t] / fact[t]);
return s;
}
int main(void) {
pthread_t thread1, thread2, thread3;
long double **sum;
printf("Exponential [PROMPT] Enter the value of x (between 0 to 100) (for calculating exp(x)):");
scanf("%Lf", &x);
printf("\nExponential [INFO] Threads creating.....\n");
pthread_create(&thread1, NULL, Power, NULL); //calling power function
pthread_create(&thread2, NULL, Fact, NULL); //calling factorial function
printf("Exponential [INFO] Threads created\n");
pthread_join(thread1, NULL);
pthread_join(thread2, NULL);
printf("Exponential [INFO] Master thread and terminated threads are joining\n");
printf("Exponential [INFO] Result collected in Master thread\n");
pthread_create(&thread3, NULL, Exp, NULL);
pthread_join(thread3, sum);
printf("\neXPONENTIAL [INFO] Value of exp(%.2Lf) is : %Lf\n\n", x, s[0]);
exit(1);
}
The above code is originally for ex using threads which works.
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
int main(void) {
long double x, f, fact[150], pwr[150], s[1];
int i, term, k, j, t;
long double sum;
printf("Exponential [PROMPT] Enter the value of x (between 0 to 100) (for calculating exp(x)):");
scanf("%Lf", &x);
#pragma omp parallel num_threads(10)
#pragma omp for
for (k = 0; k < 150; k++) {
for (int h = 0; h <= k; h++) {
if (h == 0)
x = 1;
else
pwr[k] = pow(x, k);
}
#pragma omp for
for (term = 1; term < 150; term++) {
f = 1.0;
for (j = term; j > 0; j--)
f = f * j;
fact[term] = f;
}
#pragma omp for
for (t = 0; t < 150; t++)
s[0] = s[0] + (pwr[t] / fact[t]);
printf("\neXPONENTIAL [INFO] Value of exp(%.2Lf) is : %Lf\n\n", x, s[0]);
exit(1);
}
And the code above is a conversion of the previous code to an OpenMP.
for (t = 0; t < 150; t++)
s[0] = s[0] + (pwr[t] / fact[t]);
This code, when parallelized, will overwrite the same variable concurrently with partial calculation results. This can only work when the threads are coordinated somehow. Fortunately, openmp has a dedicated directive reduce for calculating sums, so you can fix this easily.
In the pthread version of the code, one thread does this calculation, so no problem there.
am brand new to C. I have a little program which is intended to solve find the dot product of a large 2d matrix with itself using pthread. Now when the function assigned to the pthread is called and the struct passed as a variable is accessed, the program breaks and stop working. I don't really know what am doing wrong. Here is the code:
This is the main function.
int main()
{
int rc;
int threadcount = 1;
char filename[100];
//patch to enable console printing in eclipse
setvbuf(stdout, NULL, _IONBF, 0);
do {
prompt_for_fileName(filename);
if (filename[0] == 'Q' || filename[0] == 'q') {
puts("Program ended");
return 0;
}
//read thread count
read_threadcount(&threadcount);
//initialize matrices
matrix_def matrix = initialize_matrix(filename);
//get the dimension of sub-matrices
int dfRow = (int) floor(matrix.NROWS / threadcount);
pthread_t threads[threadcount];
pthread_arg pthreadargs[threadcount];
for (int i = 0; i < threadcount; i++) {
int startRow = i * dfRow;
int endRow =
((i + 1) == threadcount) ?
matrix.NROWS : (startRow + dfRow) - 1; //we're subtracting one because its zero based.
//create a structure that we'll passed to the array.
pthread_arg arg = { matrix.NROWS, matrix.NCOLS, startRow, endRow,
0.0, NULL, NULL };
arg.data = matrix.data;
arg.result_set = create_result_memory(matrix.NCOLS);
fprintf(stderr, "before %p\n", arg.result_set);
//push arg into array.
pthreadargs[i] = arg;
rc = pthread_create(&threads[i], NULL, compute_dot_product,
(void *) &arg);
if (rc) {
printf("ERROR; return code from pthread_create() is %d\n", rc);
exit(-1);
}
}
/* Last thing that main() should do */
pthread_exit(NULL);
puts("Completed processing.");
double totalTime = 0.0;
for (int z = 0; z < threadcount; z++) {
pthread_arg ar = pthreadargs[z];
printf("Thread %d took %g to process %d rows and %d columns.\n", z,
ar.execution_time, ar.endz - ar.start, ar.col);
totalTime += ar.execution_time;
}
printf(
"It took the total time of %g, to compute the dot product of the matrices.\n",
totalTime);
//free memory
free(matrix.data);
for (int k = 0; k < threadcount; k++) {
free(pthreadargs[k].data);
free(pthreadargs[k].result_set);
}
} while (filename[0] != 'Q' || filename[0] != 'q');
}
This is the function being called by the pthread
void * compute_dot_product(void * inputArgs) {
double startTime, endTime;
pthread_arg * args = inputArgs;
/*Compute the dimension of the result matrix*/
int col, row, start, endz;
col = args->col;
start = args->start;
endz = args->endz;
row = endz - start;
fprintf(stderr, "after %p\n", args->result_set);
//create a pointer to the two array
double **arr1 = args->data;
double **arr2 = arr1;
//begin the computation
int x;
startTime = seconds();
//calculate the dot product the two matrices.
for (x = 0; x < col; x++) {
double colProduct = 0.0;
for (int y = start; y < endz; y++) {
colProduct += arr1[y][x] * arr2[y][x];
}
//The code breaks here.
args->result_set[x] = colProduct;
}
endTime = seconds();
double diff = endTime - startTime;
args->execution_time = diff;
return (void *) 4;
}
This is my struct definitions
typedef struct
{
int NROWS; /*for m rows*/
int NCOLS; /*for n columns*/
double ** data;
} matrix_def;
typedef struct
{
double execution_time;
matrix_def matrix;
} compute_result;
typedef struct{
int row;
int col;
int start;
int endz;
double execution_time;
double **data;
double *result_set;
} pthread_arg;
Memory allocation of the 2D matrix.
/*dynamically allocate array based on the read size*/
matrix.data = (double **) malloc(sizeof(double *) * M);
if(matrix.data != NULL){
int x;
for(x = 0; x < M; x++){
matrix.data[x] = (double) malloc(sizeof(double) * N);
}
}else{
fprintf(stderr, "Unable to allocate memory\n");
exit(1);
}
Initialize Matrix function
matrix_def initialize_matrix(char *argv)
{
int ret_code;
MM_typecode matcode;
FILE *f;
int M, N, nz;
int i;
matrix_def matrix;
if((f = fopen(argv, "r")) == NULL)
{
fprintf(stderr, "Reading file: '%s' failed", argv);
exit(1);
}
/*Read matrix banner*/
if(mm_read_banner(f, &matcode) != 0)
{
printf("Could not process Matrix Market banner. \n");
exit(1);
}
/*Check if the current matrix is supported.*/
if(mm_is_complex(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode))
{
printf("Sorry, this application does not support ");
printf("Market Matrix type: [%s]\n", mm_typecode_to_str(matcode));
exit(1);
}
/*find out size of the sparse matrix...*/
if((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) != 0)
exit(1);
/*Assign m, n sizes.*/
matrix.NROWS = M;
matrix.NCOLS = N;
/*dynamically allocate array based on the read size*/
matrix.data = (double **) malloc(sizeof(double *) * M);
if(matrix.data != NULL){
int x;
for(x = 0; x < M; x++){
matrix.data[x] = (double *) malloc(sizeof(double) * N);
}
}else{
fprintf(stderr, "Unable to allocate memory\n");
exit(1);
}
/*Iterate through the created memory location and fill it with zeros*/
int a, b;
for(a = 0; a < M; a++){
for(b = 0; b < N; b++){
matrix.data[a][b] = 0;
}
}
/*Read the matrix*/
for(i = 0; i < nz; i++)
{
int I = 0, J = 0;
double val = 0;
fscanf(f, "%d %d %lg\n", &I, &J, &val);
//since the matrix market file starts off at
//1,1 we have to subtract 1 from the index
//to account for the array which starts off at
// 0,0
matrix.data[--I][--J] = val;
}
if(f != stdin)
fclose(f);
return matrix;
}
Any help will be appreciated, as am not very sure with the formula. Thanks
arg goes out of scope before the pthread is executed. Change your call to
rc = pthread_create(&threads[i], NULL, compute_dot_product, (void *) &pthreadargs[i]);
You will also need pthread_join before you exit, just in case the threads have not finished.
Edit
1) Replace your pthread_exit with
int rv;
for (i = 0; i < threadcount; ++i)
pthread_join(thread[i], &rv);
You would normally call pthread_exit inside a thread (like compute_dot_product) as an abnormal exit. This is a possible reason for your program breaking.
2) On your exit, I don't know how you have allocated your memory but this is a potential area where your code might be broken. If you have allocated your memory as
matrix.data = malloc(sizeof(double*) * matrix.NROWS);
matrix.data[0] = malloc(sizeof(double) * matrix.NROWS * matrix.NCOLS);
for (i = 1; i < matrix.NROWS; ++i)
matrix.data[i] = matrix.data[i - 1] + matrix.NCOLS;
Then you should free as
free(matrix.data[0]);
free(matrix.data);
If you have allocated each row individually, then free all the rows before freeing matrix.data.
3) Since matrix.data has been freed, pthreadargs[k].data should not be freed as it is pointing to the same area of memory that has already been freed.
The arg object defined here:
pthread_arg arg = { matrix.NROWS, matrix.NCOLS, startRow, endRow,
0.0, NULL, NULL };
goes out of scope while thread is still running. You need to prevent this from happening somehow, for example by allocating it on the heap instead.
I am working on a multi-threaded numerical integration program using the trapezoidal rule.
I have a struct which contains six items:
typedef struct trapezoidalIntegrationThread{
float a;
float b;
int n;
float h;
double res;
float elTime;
}threadParams;
a is the left end point, b is the right end point, n is the number of trapezoids, h is the height, res is the result calculated within compute_with_pthread, and finally, elTime is the elapsed time for compute_with_pthread for benchmarking.
Here is my code in main:
int n = NUM_TRAPEZOIDS;
float a = LEFT_ENDPOINT;
float b = RIGHT_ENDPOINT;
pthread_t masterThread;
pthread_t slaveThread[NUM_THREADs];
threadParams *trapThread;
for(i = 0; i < NUM_THREADs; i++) {
trapThread = (threadParams *) malloc(sizeof(threadParams));
trapThread->a = a;
trapThread->b = b;
trapThread->n = n;
trapThread->h = (b - a) / (float) n;
if (pthread_create(&slaveThread[i], NULL, compute_using_pthreads, (void *) trapThread) != 0) {
printf("Looks like something went wrong..\n");
return -1;
}
}
for(i = 0; i < NUM_THREADs; i++) {
pthread_join(slaveThread[i], NULL);
}
pthread_exit((void *) masterThread);
I am basically creating the number of threads defined in NUM_THREADS (let's assume this value is 4). I am allocating how much memory the struct needs, and setting the pre-defined values of:
#define LEFT_ENDPOINT 5
#define RIGHT_ENDPOINT 1000
#define NUM_TRAPEZOIDS 100000000
#define NUM_THREADs 8 /* Number of threads to run. */
Next, I create my pthreads, and call the compute_using_pthreads function:
void *compute_using_pthreads(void *inputs)
{
double integral;
int k;
threadParams *args = (threadParams *) inputs;
unsigned long p_micros = 0;
float p_millis = 0.0;
clock_t p_start, p_end;
float a = args->a;
float b = args->b;
int n = args->n;
float h = args->h;
p_start = clock();
integral = (f(a) + f(b))/2.0;
for (k = 1; k <= n-1; k++) {
integral += f(a+k*h);
}
integral = integral*h;
p_end = clock();
p_micros = p_end - p_start;
p_millis = p_micros / 1000;
args->res = integral;
args->elTime = p_millis;
}
I ran this program and compared it against a non-multithreaded function:
double compute_gold(float a, float b, int n, float h)
{
double integral;
int k;
integral = (f(a) + f(b))/2.0;
for (k = 1; k <= n-1; k++) {
integral += f(a+k*h);
}
integral = integral*h;
return integral;
}
So here are the results:
Run-time of compute_gold:
~3000 ms
Run_time of compute_with_pthread:
Using 1 thread: ~3000 ms
Using 2 threads: ~6000 ms
Using 4 thrads: ~12000 ms
....
So for some reason, the more threads I added, the execution took n-threads more time to execute. I can't for the life of me figure out why this is happening, as I am quite new to C programming =/
I have the following Pthreads code about calculating and creating a picture of the Mandelbrot set. My code in C works just fine and it prints the resulting picture nicely. The point is that using the below code, I am able to compile the code and execute it. Afterwards, if I try to view the resulting .ppm file in Gimp, it simply cannot open it. I guess I'm doing something wrong in my code. If someone could help me I would be glad.
// mandpthread.c
// to compile: gcc mandpthread.c -o mandpthread -lm -lrt -lpthread
// usage: ./mandpthread <no_of_iterations> <no_of_threads> > output.ppm
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <assert.h>
#include <pthread.h>
typedef struct {
int r, g, b;
} rgb;
int NITERATIONS, NTHREADS;
rgb **m;
void color(rgb **m, int x, int y, int red, int green, int blue)
{
m[y][x].r = red;
m[y][x].g = green;
m[y][x].b = blue;
}
void mandelbrot(int tid)
{
int w = 600, h = 400, x, y;
// each iteration, it calculates: newz = oldz*oldz + p,
// where p is the current pixel, and oldz stars at the origin
double pr, pi; // real and imaginary part of the pixel p
double newRe, newIm, oldRe, oldIm; // real and imaginary parts of new and old z
double zoom = 1, moveX = -0.5, moveY = 0; // you can change these to zoom and change position
int start = tid * NITERATIONS/NTHREADS;
int end = (tid+1) * (NITERATIONS/NTHREADS) - 1;
//loop through every pixel
for(y = 0; y < h; y++) {
for(x = 0; x < w; x++) {
// calculate the initial real and imaginary part of z,
// based on the pixel location and zoom and position values
pr = 1.5 * (x - w / 2) / (0.5 * zoom * w) + moveX;
pi = (y - h / 2) / (0.5 * zoom * h) + moveY;
newRe = newIm = oldRe = oldIm = 0; //these should start at 0,0
// i will represent the number of iterations
int i;
// start the iteration process
for(i = start; i <= end; i++) {
// remember value of previous iteration
oldRe = newRe;
oldIm = newIm;
// the actual iteration, the real and imaginary part are calculated
newRe = oldRe * oldRe - oldIm * oldIm + pr;
newIm = 2 * oldRe * oldIm + pi;
// if the point is outside the circle with radius 2: stop
if((newRe * newRe + newIm * newIm) > 4) break;
}
if(i == NITERATIONS)
color(m, x, y, 0, 0, 0); // black
else
{
// normalized iteration count method for proper coloring
double z = sqrt(newRe * newRe + newIm * newIm);
int brightness = 256. * log2(1.75 + i - log2(log2(z))) / log2((double)NITERATIONS);
color(m, x, y, brightness, brightness, 255);
}
}
}
}
// worker function which will be passed to pthread_create function
void *worker(void *arg)
{
int tid = (int)arg;
mandelbrot(tid);
}
int main(int argc, char *argv[])
{
pthread_t* threads;
int i, j, rc;
if(argc != 3)
{
printf("Usage: %s <no_of_iterations> <no_of_threads> > output.ppm\n", argv[0]);
exit(1);
}
NITERATIONS = atoi(argv[1]);
NTHREADS = atoi(argv[2]);
threads = (pthread_t*)malloc(NTHREADS * sizeof(pthread_t));
m = malloc(400 * sizeof(rgb *));
for(i = 0; i < 400; i++)
m[i] = malloc(600 * sizeof(rgb));
// declaring the needed variables for calculating the running time
struct timespec begin, end;
double time_spent;
// starting the run time
clock_gettime(CLOCK_MONOTONIC, &begin);
printf("P6\n# AUTHOR: ET\n");
printf("%d %d\n255\n",600,400);
for(i = 0; i < NTHREADS; i++) {
rc = pthread_create(&threads[i], NULL, worker, (void *)i);
assert(rc == 0); // checking whether thread creating was successfull
}
for(i = 0; i < NTHREADS; i++) {
rc = pthread_join(threads[i], NULL);
assert(rc == 0); // checking whether thread join was successfull
}
// printing to file
for(i = 0; i < 400; i++) {
for(j = 0; j < 600; j++) {
fputc((char)m[i][j].r, stdout);
fputc((char)m[i][j].g, stdout);
fputc((char)m[i][j].b, stdout);
}
}
// ending the run time
clock_gettime(CLOCK_MONOTONIC, &end);
// calculating time spent during the calculation and printing it
time_spent = end.tv_sec - begin.tv_sec;
time_spent += (end.tv_nsec - begin.tv_nsec) / 1000000000.0;
fprintf(stderr, "Elapsed time: %.2lf seconds.\n", time_spent);
for(i = 0; i < 400; i++)
free(m[i]);
free(m);
free(threads);
return 0;
}
The newest version of your code works for me with 100 iterations and 1 thread.
Doing two threads fails, because the ppm file has 2 headers one from each thread.
If I delete one of the headers, the image loads but the colours are off and there's a glitch in the image.