Pi calculator with mutex synchronization - C

Finishing up an assignment here. Got the code to work and calculate pi, except for random values where I receive the following errors:
./piesync 10 3
pi computed with 10 terms in 3 threads is 3.14183961892940200045
*** Error in `./piesync': free(): invalid next size (fast): 0x0000000001ca3010 ***
./piesync 100 5
*** Error in `./piesync': double free or corruption (out): 0x0000000000ee5040 ***
I know it might be something with the array or the mutex, but I can't figure out what.
Code:
//Pini Vaknine
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

//global variables
int N, T;
double gpie = 3.0;
pthread_mutex_t mutex;

//pie function
void* pie_runner(void* arg)
{
    long j = (long)arg;
    long lower = (N/T)*(j-1)+1;
    long upper = ((N/T)*(j));
    double myPartialSum = 0;
    //printf("lower=%lu upper=%lu\n", lower, upper);

    for(long i = lower; i <= upper; i++)
    {
        if(i % 2 == 0){
            myPartialSum -= 4.0/((2*i)*(2*i+1)*(2*i+2));
            //printf("vsum %lu = %f\n", j, vsum[j]);
        }
        else{
            myPartialSum += 4.0/((2*i)*(2*i+1)*(2*i+2));
            //printf("vsum %lu = %f\n", j, vsum[j]);
        }
    }

    pthread_mutex_lock(&mutex);
    gpie = gpie + myPartialSum;
    pthread_mutex_unlock(&mutex);
    pthread_exit(0);
    //return NULL;
}

int main(int argc, char **argv)
{
    if(argc != 3) {
        printf("Error: Must send it 2 parameters, you sent %d\n", argc-1);
        exit(1);
    }
    N = atoi(argv[1]);
    T = atoi(argv[2]);
    if(N <= T) {
        printf("Error: Number of terms must be greater than number of threads.\n");
        exit(1);
    }

    //launch threads
    pthread_attr_t attr;
    pthread_t *tids = (pthread_t *) calloc(T, sizeof(pthread_t));
    if(tids == NULL) {
        fprintf(stderr, "Memory allocation problem\n");
        exit(1);
    }
    pthread_mutex_init(&mutex, NULL);
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    for(long i = 1; i <= T; i++)
    {
        int r = pthread_create(&tids[i], &attr, pie_runner, (void*)i);
        if(r < 0) {
            printf("ERROR: pthread_create() returned %d\n", r);
            exit(2);
        }
    }

    //wait for threads...
    for(int k = 1; k <= T; k++)
    {
        pthread_join(tids[k], NULL);
    }

    printf("pi computed with %d terms in %d threads is %.20f\n", N, T, gpie);
    pthread_mutex_destroy(&mutex);
    pthread_attr_destroy(&attr);
    free(tids);
    return 0;
}

You are indexing out of the range of an array. You have allocated an array of T elements here:

pthread_t *tids = (pthread_t *) calloc(T, sizeof(pthread_t));

but you index it incorrectly here:

for(int k = 1; k<=T; k++)
{
    pthread_join(tids[k], NULL);
}

and in other places too. In C you index an array from 0, so the loop should be

for(int k = 0; k < T; k++)

You also have a division by zero in

if(i % 2 == 0){
    myPartialSum -= 4.0/((2*i)*(2*i+1)*(2*i+2));

for j = 0: once the indices are 0-based, the first thread's range includes i = 0, where the denominator (2*i)*(2*i+1)*(2*i+2) is zero. Guarding against this

for(long i = lower; i <= upper; i++)
{
    if(i % 2 == 0){
        if ( ((2*i)*(2*i+1)*(2*i+2)) == 0)
            myPartialSum = 0.0;
        else
            myPartialSum -= 4.0/((2*i)*(2*i+1)*(2*i+2));
    }
    else{
        myPartialSum += 4.0/((2*i)*(2*i+1)*(2*i+2));
    }
}

and changing the indices, the program works out of the box.
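
For reference, here is a minimal sketch of an alternative fix that avoids both problems at once: create and join with 0-based indices so every access to tids stays in bounds, while still passing 1-based ids so pie_runner's range math never reaches i = 0. This is a sketch of the idea, not the poster's exact code, and it assumes N is divisible by T, as the original range arithmetic already does.

// Alternative sketch (assumes N % T == 0, as the original range math does)
for (long i = 0; i < T; i++)
{
    // pass i+1 so pie_runner still sees j = 1..T and lower stays >= 1
    int r = pthread_create(&tids[i], &attr, pie_runner, (void*)(i + 1));
    if (r != 0) {   // pthread_create returns an error number, not a negative value
        fprintf(stderr, "ERROR: pthread_create() returned %d\n", r);
        exit(2);
    }
}
for (int k = 0; k < T; k++)
{
    pthread_join(tids[k], NULL);
}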


Why is my subroutine-variable overwritten by the return value of a previous subroutine?

I have a nested for loop that calls subroutines and stores the return values in a two-dimensional array. The problem is that the variable length changes its value to the return value of the previous function call, which is not even from the same subroutine in each step: on step 0 it takes the return value avg_time from the call of inc_serial in the outer loop, but on step 1 it takes the return value from the last call of inc_omp in the inner loop:
for (length = minlength; length <= maxlength; length = length * 10) {
    int threads = 0;
    inc_serial(length, nSamples, pravg_time);
    for (int j = 0; threads < maxthreads; j++) {
        threads = pow(2, j);
        serial[i][j] = avg_time;
        printf("omp[%d][%d] = inc_omp(%f, %ld, %d)\n",
               i, j, length, nSamples, threads);
        omp[i][j] = inc_omp(length, nSamples, threads);
        printf("omp[%d][%d] = %f\n", i, j, omp[i][j]);
    }
    i++;
    threads = 1;
}
The variable length should only change in the outer loop, not within steps of the inner loop. How can I investigate what causes this change?
I'm happy to share the full code, where all the declarations are visible:
/*****************************************************************************
DESCRIPTION:
    This program increments every element of the array by two,
    using a serial approach, OpenMP, and MPI.
    It extracts the average execution time for different numbers of threads,
    and stores the results, which can be plotted.
    We do this in order to compare the performance of each routine.
Compile:
    * $mpicc mpi_scalability.c -o mpi_scalability -fopenmp -lm -Ofast
Run:
    * $export OMP_NUM_THREADS=<nthreads>
    * $mpirun -np <nprocs> ./mpi_scalability
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>
#include <omp.h>

int
main(int argc, char *argv[])
{
    int j = 0;
    int len_per_process = 0;
    int remainder = 0;
    int mylen_per_process = 0;
    int size = 0;
    int rank = 0;
    int *recvcounts, *displs;
    double *a, *a_per_process;
    double start_comp = 0;
    double start_comm = 0;
    double end_comp = 0;
    double end_comm = 0;
    double maxtime_comp = 0;
    double maxtime_comm = 0;
    double inc_serial(long, long, double *);
    double inc_omp(long, long, int);
    int i = 0;
    long nSamples = 1000;
    long length = 1.0;
    double *serial[4];
    double *omp[4];
    double *mpi[4]; // assuming that we compare 4 different array lengths, could be done dynamically with a pointer to a pointer
    int maxthreads = 0;
    int testnumber = 0;
    long minlength = 1;
    long maxlength = 1;
    int cycles = 0;
    long longlength = 0;
    double avg_time = 0;
    double *pravg_time = &avg_time;

    /* Get maximum number of threads to use in parallelisation */
    if (argc != 2) {
        printf("Wrong number of arguments!!! \n");
        printf("usage: %s <maxthreads>\n", argv[0]);
        return -1;
    }
    maxthreads = atoi(argv[1]);
    /* Get the number of cycles to run the program */
    minlength = pow(10, 6);
    maxlength = pow(10, 7);
    cycles = (int) log10(maxlength / minlength) + 1; // number of cycles to run the program
    /* Take the log base 2 of maxthreads */
    testnumber = (int) (log2(maxthreads) + 1); // We run the test for 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ... threads
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    /* Whole array allocation in master process */
    if (rank == 0) {
        a = (double *) malloc(longlength * sizeof(double));
        /* Allocate memory for results */
        for (i = 0; i < cycles; i++) {
            serial[i] = (double *) malloc(testnumber * sizeof(double));
            omp[i] = (double *) malloc(testnumber * sizeof(double));
            mpi[i] = (double *) malloc(testnumber * sizeof(double));
        }
        i = 0;
    }
    recvcounts = (int *) malloc(size * sizeof(int));
    displs = (int *) malloc(size * sizeof(int));
    for (int tmp_rank = 0; tmp_rank <= rank; tmp_rank++) {
        if (rank == 0) {
            /* Run the routines */
            for (length = minlength; length <= maxlength; length = length * 10) {
                int threads = 0;
                inc_serial(length, nSamples, pravg_time);
                for (int j = 0; threads < maxthreads; j++) {
                    threads = pow(2, j);
                    serial[i][j] = avg_time;
                    printf("omp[%d][%d] = inc_omp(%f, %ld, %d)\n", i, j, length, nSamples, threads);
                    omp[i][j] = inc_omp(length, nSamples, threads);
                    printf("omp[%d][%d] = %f\n", i, j, omp[i][j]);
                }
                i++;
                threads = 1;
            }
        }
    }
    /* Data distribution to processes */
    len_per_process = longlength / size;
    remainder = longlength % size;
    mylen_per_process = (rank < remainder) ? (len_per_process + 1) : (len_per_process);
    MPI_Allgather(&mylen_per_process, 1, MPI_INT, recvcounts, 1, MPI_INT, MPI_COMM_WORLD);
    displs[0] = 0;
    for (i = 1; i < size; i++) {
        displs[i] = displs[i - 1] + recvcounts[i - 1];
    }
    /* Sub-Arrays Allocation and Initialisation at each process */
    a_per_process = (double *) malloc(mylen_per_process * sizeof(double));
    for (i = 0; i < mylen_per_process; i++) {
        a_per_process[i] = 0.0;
    }
    /* Increment elements by 2 */
    start_comp = omp_get_wtime();
    for (i = 0; i < nSamples; i++) {
        for (j = 0; j < mylen_per_process; j++) {
            a_per_process[j] = a_per_process[j] + 2.0;
        }
    }
    end_comp = omp_get_wtime() - start_comp;
    start_comm = omp_get_wtime();
    MPI_Gatherv(a_per_process, mylen_per_process, MPI_DOUBLE, a, recvcounts, displs, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    end_comm = omp_get_wtime() - start_comm;
    /* Check correctness */
    for (i = 0; i < longlength; i++) {
        if (rank == 0) {
            if (a[i] != 2.0 * nSamples) {
                printf("Not equal at %d\n", i);
                break;
            }
        }
    }
    MPI_Reduce(&end_comp, &maxtime_comp, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&end_comm, &maxtime_comm, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0) {
        printf("Size=%ld, Number of processes=%d\n", longlength, size);
        printf("Computation Time= %lf sec, Communication Time= %lf sec\n", maxtime_comp / nSamples, maxtime_comm);
    }
    free(a_per_process);
    free(recvcounts);
    free(displs);
    if (rank == 0) {
        free(a);
    }
    MPI_Finalize();
    return 0;
}

double
inc_serial(long length, long nSamples, double *pravg_time)
{
    int i = 0;
    int j = 0;
    double *a;
    double start = 0;
    double end = 0;

    /* Array Allocation and Initialisation */
    a = (double *) malloc(length * sizeof(double));
    for (i = 0; i < length; i++) {
        a[i] = 0.0;
    }
    /* Increment elements by 2 */
    start = omp_get_wtime();
    for (i = 0; i < nSamples; i++) {
        for (j = 0; j < length; j++) {
            a[j] = a[j] + 2.0;
        }
    }
    end = omp_get_wtime();
    /* Check correctness */
    for (i = 0; i < length; i++) {
        if (a[i] != 2.0 * nSamples) {
            printf("Not equal at %d\n", i);
            break;
        }
    }
    *pravg_time = (end - start) / nSamples;
    free(a);
    return 0;
}

double
inc_omp(long length, long nSamples, int nthreads)
{
    int i, j;
    double *a;
    double start, end;
    double avg_time = 0.0;

    #pragma omp parallel
    #pragma omp master
    /* Array Allocation and Initialisation */
    a = (double *) malloc(length * sizeof(double));
    for (i = 0; i < length; i++) {
        a[i] = 0.0;
    }
    /* Increment elements by 2 */
    start = omp_get_wtime();
    for (i = 0; i < nSamples; i++) {
        #pragma omp parallel for private(j) shared(a) num_threads(nthreads)
        for (j = 0; j < length; j++) {
            a[j] = a[j] + 2.0;
        }
    }
    end = omp_get_wtime();
    /* Check correctness */
    for (i = 0; i < length; i++) {
        if (a[i] != 2.0 * nSamples) {
            printf("Not equal at %d\n", i);
            break;
        }
    }
    /* Get the average execution time as a double */
    avg_time = (end - start) / nSamples;
    free(a);
    return avg_time;
}
Instead of using a return value for inc_serial, I made it return 0 and used a pointer instead, but the first call of inc_omp still uses the result of inc_serial as its input argument for length, and its own return value on consecutive calls. I would expect it to use the actual value of length from the outer for loop.
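
For clarity, the out-parameter pattern being described boils down to the following minimal, self-contained sketch; the names here are illustrative, not from the program above.

#include <stdio.h>

// Minimal illustration of returning a result through a pointer argument,
// as inc_serial does above (illustrative names, not the poster's code).
double measure(long length, double *pravg_time)
{
    *pravg_time = (double)length * 0.001;   // write the result through the pointer
    return 0;                               // the return value is intentionally unused
}

int main(void)
{
    double avg_time = 0;
    measure(1000, &avg_time);               // caller reads the result from avg_time
    printf("avg_time = %f\n", avg_time);
    return 0;
}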

How do I fix this segmentation fault?

I am trying to run a program I have written for multiplying two square NxN matrices. However, I am getting a segmentation fault. I have working code for the program without threading, but I have been unsuccessful in adapting my code for multiple threads.
I am attempting to run the code on a Raspberry Pi 4. The debugger states that the following line is where I receive the signal SIGSEGV:
args.A[i][j] = rand() % 100;
I have tried putting printf statements around the sections of code where I allocate memory, but they were never run, so I am assuming the seg fault happens before any of that code actually runs. I did some research online about solving seg faults, which is when I tried using the debugger, but I do not understand why it has a problem with setting the matrix elements, especially since my previous unthreaded program has the same line of code and runs without any errors.
Feedback would be greatly appreciated.
The following is my code:
/* Program must be passed exactly one integer that satisfies the following condition:
 * N % n = 0, where N is the square matrices' dimensions and n is the number of threads.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>

#define N 2000

typedef struct __myarg_t
{
    FILE *Aptr, *Bptr, *Cptr;   // Files containing the matrices
    int **A, **B, **C, **T;     // Matrices A, B, resultant and transpose of B
    int rows;                   // Number of rows each thread computes
    int cur;                    // Current thread number
} myarg_t;

void *mythread(void *arg)
{
    myarg_t *m = (myarg_t *) arg;
    int start = m->cur++ * m->rows;
    int end = start + m->rows;

    // Matrix Multiplication for rows start:(end - 1)
    for (int i = start; i < end; i++)
    {
        for (int j = start; j < end; j++)
        {
            int num = 0;
            for (int k = 0; k < N; k++)
            {
                num += m->A[i][k] * m->T[j][k];
            }
            m->C[i][j] = num;
        }
    }
    return NULL;
}

int main(int argc, char *argv[])
{
    if (argc != 2)
    {
        fprintf(stderr, "usage: main-first <#ofthreads>\n");
        exit(1);
    }

    pthread_t *thread;
    clock_t tic, toc;
    myarg_t args;
    int rc, n;

    args.cur = 0;
    args.rows = N/n;
    n = atoi(argv[1]);
    args.Aptr = fopen("A_multi.txt", "w");
    args.Bptr = fopen("B_multi.txt", "w");
    args.Cptr = fopen("C_multi.txt", "w");
    args.A = (int**)malloc(N * sizeof(int*));
    args.B = (int**)malloc(N * sizeof(int*));
    args.C = (int**)malloc(N * sizeof(int*));
    args.T = (int**)malloc(N * sizeof(int*));
    thread = (pthread_t *)malloc(n * sizeof(pthread_t));

    // Dynamically allocate memory for 2D Array
    for (int i = 0; i < N; i++)
    {
        args.A[i] = (int*)malloc(N * sizeof(int*));
        args.B[i] = (int*)malloc(N * sizeof(int*));
        args.C[i] = (int*)malloc(N * sizeof(int*));
        args.T[i] = (int*)malloc(N * sizeof(int*));
    }

    // Assign values to the elements of the Matrices
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; i++)
        {
            args.A[i][j] = rand() % 100;
            args.B[i][j] = rand() % 100;
            args.T[j][i] = args.B[i][j];
        }
    }

    tic = clock();
    // Create threads
    for (int i = 0; i < n; i++)
    {
        rc = pthread_create(&thread[i], NULL, mythread, &args);
        if (rc != 0)
        {
            printf("pthread_create failed with thread %d.\n", i);
            exit(1);
        }
    }
    // Wait for threads to complete
    for (int i = 0; i < n; i++)
    {
        rc = pthread_join(thread[i], NULL);
        if (rc != 0)
        {
            printf("pthread_join failed with thread %d.\n", i);
            exit(1);
        }
    }
    toc = clock();
    printf("Elapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);

    // Write matrices to their output files
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            fprintf(args.Aptr, "%d ", args.A[i][j]);
            fprintf(args.Bptr, "%d ", args.B[i][j]);
            fprintf(args.Cptr, "%d ", args.C[i][j]);
        }
        fprintf(args.Aptr, "\n");
        fprintf(args.Bptr, "\n");
        fprintf(args.Cptr, "\n");
    }

    // Deallocate memory
    for (int i = 0; i < N; i++)
    {
        free(args.A[i]);
        free(args.B[i]);
        free(args.C[i]);
        free(args.T[i]);
    }
    free(args.A);
    free(args.B);
    free(args.C);
    free(args.T);
    fclose(args.Aptr);
    fclose(args.Bptr);
    fclose(args.Cptr);
    return 0;
}
Change:

int rc, n;
...
args.rows = N/n;
n = atoi(argv[1]);

to:

int rc;
...
int n = atoi(argv[1]);
if(!n) {
    // atoi() returns 0 for "0" or on error, so handle that case here
}
args.rows = N/n;

As written, n is read before it is ever assigned, so args.rows = N/n divides by an indeterminate value. Also, the second loop after the "Assign values" comment increments the wrong variable: it should be j++, not i++. As it stands, j never advances, i grows without bound, and args.A[i][j] soon indexes past the N rows allocated for A, B and T. That is what causes your segfault.
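For reference, the initialisation loop with the increment fixed would read (a sketch of the fix just described):

// Assign values to the elements of the matrices
for (int i = 0; i < N; i++)
{
    for (int j = 0; j < N; j++)   // was i++, which let i run far past N
    {
        args.A[i][j] = rand() % 100;
        args.B[i][j] = rand() % 100;
        args.T[j][i] = args.B[i][j];
    }
}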

Why is the execution time of a parallel problem better for an odd number of threads?

I implemented in parallel an algorithm which finds and prints all the circular prime numbers in the interval [2, X]. I measured the execution time of the problem while varying the number of threads from 1 to 16. Can someone explain why I get worse execution times when the number of threads is even? (Note: I actually get worse times for an odd number of threads if the main thread is also taken into account.)
The processor of the machine I ran the program on is an Intel® Core™ i7-8550U, with 4 cores and hyperthreading.
This is the function the threads execute (I performed load balancing):
void *printIfCircularPrime(void *threadId)
{
    int tid = (int)threadId;
    printf("Hi, I am thread %d\n", threadId);
    for (long i = 2 + tid; i <= X; i += NR_THREADS)
    {
        if (isCircularPrime(i))
        {
            printf("%ld\n", i);
        }
    }
    pthread_exit(NULL);
}
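
To make the load-balancing scheme concrete, here is a small stand-alone demo (not part of the original post) that prints which numbers each thread would test, using small values NR_THREADS = 4 and X = 20. Note that with an even thread count, every number a given thread tests has the same parity:

#include <stdio.h>

// Stand-alone demo of the cyclic distribution used in printIfCircularPrime
// (illustrative only, with small values: 4 threads, X = 20).
int main(void)
{
    const int nr_threads = 4;
    const long x = 20;
    for (int tid = 0; tid < nr_threads; tid++) {
        printf("thread %d tests:", tid);
        for (long i = 2 + tid; i <= x; i += nr_threads)
            printf(" %ld", i);   // every value shares the parity of 2 + tid
        printf("\n");
    }
    return 0;
}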
My execution time measurements were shown in a chart (not reproduced here).
Full code:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <math.h>
#include <time.h>

#define NR_THREADS 16
#define X 120000000

int isPrime(long n)
{
    if (n <= 1)
    {
        return 0;
    }
    if (n == 2)
    {
        return 1;
    }
    if (n % 2 == 0)
    {
        return 0;
    }
    for (int d = 3; d <= floor(sqrt(n)); d += 2)
    {
        if (n % d == 0)
        {
            return 0;
        }
    }
    return 1;
}

int countDigits(long n)
{
    int digits = 0;
    while (n)
    {
        n /= 10;
        digits++;
    }
    return digits;
}

long cyclicallyPermute(long n, int numberOfDigits)
{
    int lastDigit = n % 10;
    return pow(10, numberOfDigits - 1) * lastDigit + n / 10;
}

int isCircularPrime(long n)
{
    int numberOfDigits = countDigits(n);
    for (int i = 0; i < numberOfDigits; i++)
    {
        if (!isPrime(n))
        {
            return 0;
        }
        n = cyclicallyPermute(n, numberOfDigits);
    }
    return 1;
}

void *printIfCircularPrime(void *threadId)
{
    int tid = (int)threadId;
    printf("Hi, I am thread %d\n", threadId);
    for (long i = 2 + tid; i <= X; i += NR_THREADS)
    {
        if (isCircularPrime(i))
        {
            printf("%ld\n", i);
        }
    }
    pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
    printf("Number of threads: %d\n", NR_THREADS);
    pthread_t threads[NR_THREADS];
    struct timespec start, stop;

    clock_gettime(CLOCK_REALTIME, &start);
    for (int i = 0; i < NR_THREADS; i++)
    {
        pthread_create(&threads[i], NULL, printIfCircularPrime, (void *)i);
    }
    for (int i = 0; i < NR_THREADS; i++)
    {
        pthread_join(threads[i], NULL);
    }
    clock_gettime(CLOCK_REALTIME, &stop);
    printf("\nExecution time: %ld seconds\n", stop.tv_sec - start.tv_sec);
    pthread_exit(NULL);
}

Error getting shared memory when creating more than 6 child processes

I am trying to understand how fork in C works. The problem I am trying to solve is: given upper, find f(1) + f(2) + ... + f(upper).
I wanted to use multi-process programming: fork a child process for each term and have each child process calculate one f(x).
So f(1), f(2), ..., f(upper) are each calculated by a child process.
The parent process should then calculate f(1) + ... + f(upper).
Here is my code:
#include <sys/types.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/shm.h>
#include <sys/ipc.h>

int upper = 0;
int n = 0;

int main(int argc, char *argv[]){
    pid_t pid;

    if(argc != 2){
        printf("Input one argument");
        return -1;
    }
    upper = atoi(argv[1]);

    int segment_id;
    int *s;
    pid_t *pids;

    pids = (pid_t *) malloc(sizeof(int) * upper);
    s = (int *) malloc(sizeof(int) * upper);

    key_t key = 4141;
    if((segment_id = shmget(key, upper * sizeof(int), IPC_CREAT | 0667)) < 0)
        perror("shmget: failure");
    if((s = shmat(segment_id, NULL, 0)) == (char *) -1){
        perror("shmat : failure");
        exit(1);
    }
    for(int i = 1; i <= upper; i++){
        pid = fork();
        if(pid == 0) {
            n = i;
            break;
        }
        pids[i] = pid;
    }
    if(pid > 0){
        wait(1 * upper);
        int totalSum;
        for(int i = 0; i < upper; i++){
            totalSum += s[i];
        }
        printf("Total sum = %d", totalSum);
    } else {
        sleep(2);
        int sum = 0;
        for(int i = 0; i <= n; i++){
            sum += i;
        }
        s[n - 1] = sum;
        printf("n => %d : sum %d\n", n, sum);
    }
}
However, whenever I try to run this program with an argument greater than 6, I get an "Invalid argument" error.
You are writing outside of the bounds of pids:

pids = (pid_t *) malloc(sizeof(int) * upper);
...
for(int i = 1; i <= upper; i++){
    pid = fork();
    if(pid == 0) {
        n = i;
        break;
    }
    pids[i] = pid; /* Here */
}

pids has valid indices 0 through upper - 1, but this loop writes to pids[1] through pids[upper], so the last iteration writes past the end of the allocation. Change the loop to

for(int i = 1; i < upper; i++){
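
If you want to keep all upper children, a sketch of an alternative (not from the answer above) is to store the pids 0-based instead:

for (int i = 1; i <= upper; i++) {
    pid = fork();
    if (pid == 0) {
        n = i;              // child: remember which term to compute
        break;
    }
    pids[i - 1] = pid;      // 0-based store keeps pids[0..upper-1] in bounds
}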

Can't find the error - C

I have to write two threads. Each one prints 5 even/odd numbers at a time, from 1 to 100, like this (odd is "impair" in French, even is "pair"):
even 2,4,6,8,10
odd 1,3,5,7,9
even 12,14,16,18,20
odd 13,15,17,19,21
etc...
I wrote this code:
#include <stdio.h>
#include <semaphore.h>
#include <pthread.h>

#define maxi 100

pthread_mutex_t mutex;
sem_t p;
sem_t imp;
int tour = 0;

void *pair(void *arg);
void *impair(void *arg);

int main() {
    pthread_t tidp, tidimp;

    pthread_mutex_init(&mutex, NULL);
    sem_init(&p, 0, 1);
    sem_init(&imp, 0, 1);
    pthread_create(&tidp, NULL, pair, (void *)2);
    pthread_create(&tidimp, NULL, impair, (void *)1);
    pthread_join(tidp, NULL);
    pthread_join(tidp, NULL);
    sem_destroy(&imp);
    sem_destroy(&p);
    pthread_mutex_destroy(&mutex);
    return 0;
}

void *pair(void *arg) {
    int i = (int)arg;
    int j, l;

    // sleep(5);
    pthread_mutex_lock(&mutex);
    if (!tour) {
        tour = 1;
        pthread_mutex_unlock(&mutex);
        sem_wait(&imp);
    } else {
        pthread_mutex_unlock(&mutex);
    }
    for (l = 0; l < maxi; l += 10) {
        sem_wait(&p);
        printf(" Pair ");
        pthread_mutex_lock(&mutex);
        for (j = 0; j < 10; j += 2) {
            printf(" %4d \t", j + i);
        }
        pthread_mutex_unlock(&mutex);
        printf("\n");
        sem_post(&imp);
        i += 10;
    }
    pthread_exit(NULL);
}

void *impair(void *arg) {
    int i = (int)arg;
    int j, l;

    pthread_mutex_lock(&mutex);
    if (!tour) {
        tour = 1;
        pthread_mutex_unlock(&mutex);
        sem_wait(&p);
    } else {
        pthread_mutex_unlock(&mutex);
    }
    for (l = 0; l < maxi; l += 10) {
        sem_wait(&imp);
        printf("Impair ");
        pthread_mutex_lock(&mutex);
        for (j = 0; j < 10; j += 2) {
            printf(" %4d \t", j + i);
        }
        pthread_mutex_unlock(&mutex);
        printf("\n");
        sem_post(&p);
        i += 10;
    }
    pthread_exit(NULL);
}
What I don't understand is that when I run the code, sometimes it starts with odd and sometimes with even. More specifically, when it starts with odd everything goes normally and I get all the numbers from 1 to 100, but when it starts with even I sometimes only get to 91, sometimes 93, sometimes 97.
Can anyone tell me what is wrong?
You're not waiting for both threads to exit:
pthread_join(tidp, NULL);
pthread_join(tidp, NULL);
One of those should be tidimp.
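That is, the joins should read:

pthread_join(tidp, NULL);
pthread_join(tidimp, NULL);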
