Parallel computing for prefix sum using openMP (c code)

Parallel computing for prefix sum using openMP (c code) - c

Have some problems with assigning parallel algorithm to prefix sum issue. I am using openMP for parallel implementation. I have the code in c as below.
Result showing:
seqsum[6] = 28 != parallelsum[6] = 34
Please advise. Thanks.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "omp.h"
#include <string.h>
#define N 10 //33554432 // 2 ^ 25
#define NUM_THREADS 4
void computeparallelprefix(int *iplist, int *_pprefixsum, unsigned long size)
{
int nthr, *z, *x = _pprefixsum;
int i, j, tid, work, lo, hi;
#pragma omp parallel shared(nthr,x,z) private(i,j,tid,work,lo,hi)
{
int prev_sum;
memcpy((void *)x, (void *)iplist, sizeof(int)*size);
// Assume nthr = 2^k
#pragma omp single
{
nthr = omp_get_num_threads();
z = malloc(sizeof(int)*nthr);
}
tid = omp_get_thread_num();
work = size /nthr + (i = tid < size%nthr ? 1 : 0);
lo = (size/nthr)*tid + (i==1 ? tid : size%nthr);
hi = lo + work;
if (hi > size)
hi = size;
// local prefix sum over x
for(i=lo+1; i<hi; i++)
x[i] += x[i-1];
// local prefix sum for tid
z[tid] = x[hi-1];
#pragma omp barrier
// global prefix sum over z
for(j=1; j<nthr; j=2*j) {
if (tid >= j)
z[tid] = z[tid] + z[tid-j];
#pragma omp barrier
}
// Update local prefix sum x
prev_sum = z[tid] - x[hi-1];
for(i=lo;i<hi;i++)
x[i] += prev_sum;
}
free(z);
}
void initlist(int *iplist, unsigned long size)
{
int i;
for ( i = 0; i < size; i++)
iplist[i] = i+1;
// iplist[i] = rand() % 13;
}
void printlist(int *list, unsigned long size)
{
int i;
for(i = 0; i < size; i++) {
printf("%d ", list[i]);
}
printf("\n");
}
void computeseqprefixsum(int *iplist, int *seqprefixsum, unsigned long size)
{
int i;
seqprefixsum[0] = iplist[0];
for(i = 1; i < size; i++) {
seqprefixsum[i] = seqprefixsum[i-1] + iplist[i];
}
}
void checkresults(int *seqsum, int *parallelsum, unsigned long size)
{
int i;
for(i = 0; i < size; i++)
{
if(seqsum[i] != parallelsum[i]) {
printf("seqsum[%d] = %d != parallelsum[%d] = %d\n", i, seqsum[i], i,
parallelsum[i]);
exit(1);
}
}
}
int main(int argc, char *argv[])
{
// seed the rand generator
srand(time(NULL));
double seqstart, seqend, parstart, parend, seqtime, partime;
// initialize list
int *iplist, *seqprefixsum, *pprefixsum ;
iplist = (int*) malloc(sizeof(int) * N);
seqprefixsum = (int*) malloc(sizeof(int) * N);
pprefixsum = (int*) malloc(sizeof(int) * N);
if(iplist == NULL || seqprefixsum == NULL || pprefixsum == NULL) {
printf("memory cannot be allocated\n");
exit(1);
}
initlist(iplist, N);
seqstart = omp_get_wtime();
computeseqprefixsum(iplist, seqprefixsum, N);
seqend = omp_get_wtime();
seqtime = seqend - seqstart;
omp_set_num_threads(NUM_THREADS);
parstart = omp_get_wtime();
computeparallelprefix(iplist, pprefixsum, N);
parend= omp_get_wtime();
partime = parend - parstart;
checkresults(seqprefixsum, pprefixsum, N);
printf("Seq Time : %f, Par Time : %f, Speedup : %f\n", seqtime, partime,
seqtime/partime);
free(iplist); free(seqprefixsum); free(pprefixsum);
return 0;
}

You have the right idea for the prefix sum with your code.
I'm not sure exactly why you don't get the correct result but I cleaned up your code and my version gets the correct result. See the following question for more details parallel-cumulative-prefix-sums-in-openmp-communicating-values-between-thread
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "omp.h"
#define N 11 //33554432 // 2 ^ 25
void computeparallelprefix(int *iplist, int *_pprefixsum, unsigned long size)
{
int nthr, *z, *x = _pprefixsum;
#pragma omp parallel
{
int i;
#pragma omp single
{
nthr = omp_get_num_threads();
z = malloc(sizeof(int)*nthr+1);
z[0] = 0;
}
int tid = omp_get_thread_num();
int sum = 0;
#pragma omp for schedule(static)
for(i=0; i<size; i++) {
sum += iplist[i];
x[i] = sum;
}
z[tid+1] = sum;
#pragma omp barrier
int offset = 0;
for(i=0; i<(tid+1); i++) {
offset += z[i];
}
#pragma omp for schedule(static)
for(i=0; i<size; i++) {
x[i] += offset;
}
}
free(z);
}
int main(void ) {
int *iplist, *pprefixsum ;
iplist = (int*) malloc(sizeof(int) * N);
pprefixsum = (int*) malloc(sizeof(int) * N);
for(int i=0; i<N; i++) iplist[i] = i+1;
for(int i=0; i<N; i++) printf("%d ", iplist[i]); printf("\n");
computeparallelprefix(iplist, pprefixsum, N);
for(int i=0; i<N; i++) printf("%d ", pprefixsum[i]); printf("\n");
for(int i=0; i<N; i++) printf("%d ", (i+1)*(i+2)/2); printf("\n");
return 0;
}

Related

Trying to allocate and set to 0 a matrix dynamically in C, but I can't get the set to 0 part to work

I have this matrix initialization function I need: I can generate it without any problems but can't initialize all its values to 0, neither with calloc or via looping on the matrix elements.
The function is as follows:
int initMTX(int r, int c, int ***MTX) {
int **locMTX = malloc(r * sizeof(int *)), i, j;
for (i = 0; i < r; i++) {
locMTX[i] = (int *)malloc(c * sizeof(int));
}
for (i = 0; i < r; i++)
for (j = 0; j < c; j++)
locMTX[i][j] = 0;
*MTX = locMTX;
return 1;
}
If needed for context, the entire code is this:
#include <stdio.h>
#include <stdlib.h>
int initMTX(int r, int c, int ***MTX);
void printMTX(int r, int c, int **MTX);
void freeMTX(int r, int **MTX);
int main(void) {
int NBits = 4, NNumbers = 2 ^ NBits, **MTXResults;
initMTX(NNumbers, NBits, &MTXResults);
// code
printMTX(NNumbers, NBits, MTXResults);
freeMTX(NNumbers, MTXResults);
}
int initMTX(int r, int c, int ***MTX) {
int **locMTX = malloc(r * sizeof(int *)), i, j;
for (i = 0; i < r; i++) {
locMTX[i] = (int *)malloc(c * sizeof(int));
}
for (i = 0; i < r; i++)
for (j = 0; j < c; j++)
locMTX[i][j] = 0;
*MTX = locMTX;
return 1;
}
void printMTX(int r, int c, int **MTX) {
int i, j;
for (i = 0; i < r; i++) {
for (j = 0; j < c; j++)
printf("%d ", MTX[i][r]);
printf("\n");
}
}
void freeMTX(int r, int **MTX) {
for (int i = 0; i < r; i++)
free(MTX[i]);
free(MTX);
}
I thought I grasped such a thing well by now, but it turns out it is not the case, so I've been stuck for a while.
If possible I would appreciate also knowing where the problem is when using calloc too.

This code fixes the bugs in the code in the question (most notably the typo MTX[i][r] becomes MTX[i][j]), and implements freeMTX(). It also includes the code to recover from a memory allocation failure while creating the matrix.
#include <stdio.h>
#include <stdlib.h>
int initMTX(int r, int c, int ***MTX);
void printMTX(int r, int c, int **MTX);
void freeMTX(int r, int **MTX);
int main(void)
{
int NBits = 4;
int NNumbers = 16;
int **MTXResults;
if (initMTX(NNumbers, NBits, &MTXResults))
{
printMTX(NNumbers, NBits, MTXResults);
freeMTX(NNumbers, MTXResults);
}
else
fprintf(stderr, "failed to allocate %dx%d matrix\n", NBits, NNumbers);
}
int initMTX(int r, int c, int ***MTX)
{
int **locMTX = malloc(r * sizeof(int *));
if (locMTX == 0)
return 0;
for (int i = 0; i < r; i++)
{
locMTX[i] = (int *)malloc(c * sizeof(int));
if (locMTX[i] == 0)
{
for (int j = 0; j < i; j++)
free(locMTX[j]);
free(locMTX);
return 0;
}
}
for (int i = 0; i < r; i++)
{
for (int j = 0; j < c; j++)
locMTX[i][j] = 0;
}
*MTX = locMTX;
return 1;
}
void printMTX(int r, int c, int **MTX)
{
for (int i = 0; i < r; i++)
{
for (int j = 0; j < c; j++)
printf("%d ", MTX[i][j]);
printf("\n");
}
}
void freeMTX(int r, int **MTX)
{
for (int i = 0; i < r; i++)
free(MTX[i]);
free(MTX);
}
This code uses the simpler, two allocation scheme outlined in two comments:
#include <stdio.h>
#include <stdlib.h>
int initMTX(int r, int c, int ***MTX);
void printMTX(int r, int c, int **MTX);
void freeMTX(int **MTX);
int main(void)
{
int NBits = 4;
int NNumbers = 16;
int **MTXResults;
if (initMTX(NNumbers, NBits, &MTXResults))
{
printMTX(NNumbers, NBits, MTXResults);
freeMTX(MTXResults);
}
else
fprintf(stderr, "failed to allocate %dx%d matrix\n", NBits, NNumbers);
}
int initMTX(int r, int c, int ***MTX)
{
int **locMTX = malloc(r * sizeof(locMTX[0]));
if (locMTX == 0)
return 0;
int *data = malloc(r * c * sizeof(locMTX[0][0]));
if (data == 0)
{
free(locMTX);
return 0;
}
for (int i = 0; i < r; i++)
{
locMTX[i] = data + i * c;
for (int j = 0; j < c; j++)
locMTX[i][j] = 0;
}
*MTX = locMTX;
return 1;
}
void printMTX(int r, int c, int **MTX)
{
for (int i = 0; i < r; i++)
{
for (int j = 0; j < c; j++)
printf("%d ", MTX[i][j]);
printf("\n");
}
}
void freeMTX(int **MTX)
{
free(MTX[0]);
free(MTX);
}
The freeing code is much simpler, as is the error recovery while creating the matrix — mainly because there are only two allocations that can fail.

Reducing the max value and saving its index

int v[10] = {2,9,1,3,5,7,1,2,0,0};
int maximo = 0;
int b = 0;
int i;
#pragma omp parallel for shared(v) private(i) reduction(max:maximo)
for(i = 0; i< 10; i++){
if (v[i] > maximo)
maximo = v[i];
b = i + 100;
}
How can I get the value that b gets during the iteration when maximo gets its max value (and therefore, its value after the for loop)?

TL;DR You can use User-Defined Reduction.
First, instead of:
for(i = 0; i< 10; i++){
if (v[i] > maximo)
maximo = v[i];
b = i + 100;
}
you meant this:
for(i = 0; i< 10; i++){
if (v[i] > maximo){
maximo = v[i];
b = i + 100;
}
}
OpenMP has in-build reduction functions that consider a single target value, however in your case you want to reduce taking into account 2 values the max and the array index. After OpenMP 4.0 one can create its own reduction functions (i.e., User-Defined Reduction).
First, create a struct to store the two relevant values:
struct MyMax {
int max;
int index;
};
then we need to teach the OpenMP implementation how to reduce it:
#pragma omp declare reduction(maximo : struct MyMax : omp_out = omp_in.max > omp_out.max ? omp_in : omp_out)
we set our parallel region accordingly:
#pragma omp parallel for reduction(maximo:myMaxStruct)
for(int i = 0; i< 10; i++){
if (v[i] > myMaxStruct.max){
myMaxStruct.max = v[i];
myMaxStruct.index = i + 100;
}
}
Side Note You do not really need private(i), because with the #pragma omp parallel for the index variable of the for loop will be implicitly private anyway.
All put together:
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
struct MyMax {
int max;
int index;
};
int main(void)
{
#pragma omp declare reduction(maximo : struct MyMax : omp_out = omp_in.max > omp_out.max ? omp_in : omp_out)
struct MyMax myMaxStruct;
myMaxStruct.max = 0;
myMaxStruct.index = 0;
int v[10] = {2,9,1,3,5,7,1,2,0,0};
#pragma omp parallel for reduction(maximo:myMaxStruct)
for(int i = 0; i< 10; i++){
if (v[i] > myMaxStruct.max){
myMaxStruct.max = v[i];
myMaxStruct.index = i + 100;
}
}
printf("Max %d : Index %d\n", myMaxStruct.max, myMaxStruct.index);
}
OUTPUT:
Max 9 : Index 101
(Index is 101 because you have b = i + 100)

I've coded this but not compiled or tested it:
int v[10] = { 2, 9, 1, 3, 5, 7, 1, 2, 0, 0 };
int maximo = 0;
int b = 0;
int i;
int nt = omp_get_num_threads();
int bv[nt] = { 0 };
#pragma omp parallel for shared(v) shared(bv) private(i) reduction(max:maximo)
for (i = 0; i < 10; i++) {
if (v[i] > maximo) {
maximo = v[i];
bv[omp_get_thread_num()] = i + 100;
}
}
for (i = 0; i < nt; ++i)
printf("bv[%d] = %d\n",i,bv[i]);
Beware that "Returns the number of threads in the current team. In a sequential section of the program omp_get_num_threads returns 1"
Okay, I've recoded it [and built/run it] and it does produce one non-zero bv output:
#include <stdio.h>
#include <omp.h>
int
main(void)
{
int v[10] = { 2, 9, 1, 3, 5, 7, 1, 2, 0, 0 };
int i;
int nt;
int maximo = 0;
int index = 0;
int bv[32] = { 0 };
int max[32] = { 0 };
#pragma omp parallel shared(v, bv)
{
nt = omp_get_num_threads();
int thread_id = omp_get_thread_num();
#pragma omp for private(i)
for (i = 0; i < 10; i++) {
if (v[i] > max[thread_id]) {
max[thread_id] = v[i];
bv[thread_id] = i + 100;
}
}
}
// Reducing sequentially
for (i = 0; i < nt; ++i){
if(max[i] > maximo){
maximo = max[i];
index = bv[i];
}
}
printf("Max %d at index %d\n", maximo, index);
return 0;
}
Here is the program output:
Max 9 at index 101

Matrix determinant with system calls

So, this is my program that calculates matrix determinant using system calls, not good at all, but, the trouble is that when i put in a number bigger than 8 for dimension of matrix, it crashes somehow and i can't figure why it keeps happening. Please, give me some ideas.
The task was to calculate determinant using multithreading. Maybe, the problem is that I exceed max threads? valgrind says that
Use --max-threads=INT to specify a larger number of threads
and rerun valgrind
valgrind: the 'impossible' happened:
Max number of threads is too low
compile it with gcc -g -pthread
#include <stdlib.h>
#include <pthread.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
pthread_mutex_t mutex;
typedef struct {
int **matrix;
int size;
} T_MS;
void* determinant(void *npt) {
T_MS* tmp = (T_MS*) npt;
int i,j;
double det = 0;
pthread_t *array = malloc(sizeof(pthread_t) * tmp->size);
T_MS *mtarr = malloc(sizeof(T_MS) * tmp->size);
if (tmp->size == 1) {
det = tmp->matrix[0][0];
} else if (tmp->size == 2) {
det = tmp->matrix[0][0] * tmp->matrix[1][1] - tmp->matrix[0][1] * tmp->matrix[1][0];
} else {
for (i = 0; i < tmp->size; ++i) {
mtarr[i].matrix = (int **)malloc(sizeof(int *) * tmp->size);
mtarr[i].size = tmp->size - 1;
for (j = 0; j < tmp->size - 1; ++j) {
if (j < i)
mtarr[i].matrix[j] = tmp->matrix[j];
else
mtarr[i].matrix[j] = tmp->matrix[j + 1];
}
pthread_create(&array[i], NULL, determinant, mtarr + i);
}
for (i = 0; i < tmp->size; ++i) {
void *res;
for (j = 0; j < tmp->size - 1; ++j) {
}
pthread_join(array[i], &res);
double x = *(double *)&res;
det += (-1 + 2 * !(i % 2)) * x * tmp->matrix[i][tmp->size - 1];
double answer = *(double*)&det;
free(mtarr[i].matrix);
}
}
free(mtarr);
free(array);
void* ans = *(void **)&det;
return ans;
}
int main(int argc, char const *argv[]) {
srand(time(NULL));
int **matrix;
int n = 0;
int a;
pthread_t tid;
pthread_attr_t attr;
pthread_attr_init(&attr);
printf("Insert the demention of matrix:\n");
scanf("%d", &n);
matrix = (int**)malloc(n * sizeof(int*));
for (int i=0; i<n; ++i)
matrix[i] = (int*)malloc(n * sizeof(int));
printf("Insert matrix:\n");
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
matrix[i][j]=rand()%15;
//matrix[i][j] = i;
}
}
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
printf("%d ", matrix[i][j]);
}
printf("\n");
}
T_MS* npt = (T_MS*)malloc(sizeof(T_MS));
npt->matrix = matrix;
npt->size = n;
void *det;
pthread_mutex_init(&mutex, NULL);
pthread_create(&tid, NULL, determinant, npt);
pthread_join(tid, &det);
double answer = *(double*)&det;
printf("Det is: %f\n", answer);
for (int i = 0; i < n; ++i)
free(matrix[i]);
free(matrix);
free(npt);
return 0;
} ```

-Wpointer-to-int-cast and -Wint-conversion warning in C p thread program

I have created a program to matrix multiplication using C in pthread. But when i compile my program i get some warning and note. I tried my best to solve this. I was unable to do this. I mentioned my warning and note below.
warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] int s = (int)threadid;
warning: passing argument 1 of ‘multiply’ makes pointer from integer without a cast [-Wint-conversion] multiply(i);
note: expected ‘void *’ but argument is of type ‘int’
void* multiply(void *threadid)
warning and note image
code:
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#define SIZE 1024
int num_thrd;
int A[SIZE][SIZE], B[SIZE][SIZE], C[SIZE][SIZE];
int randnumgen(){
return rand()%10 +1 ;
}
void init_matrix(int m[SIZE][SIZE])
{
int i, j, val = 0;
for (i = 0; i < SIZE; i++)
{
for (j = 0; j < SIZE; j++)
{
m[i][j] = randnumgen();
}
}
}
void print_matrix(int m[SIZE][SIZE])
{
int i, j;
for (i = 0; i < SIZE; i++) {
printf("\n\t| ");
for (j = 0; j < SIZE; j++)
printf("%2d ", m[i][j]);
printf("|");
}
}
// thread function
void* multiply(void *threadid)
{
int s = (int)threadid;
int from = (s * SIZE)/num_thrd;
int to = ((s+1) * SIZE)/num_thrd;
int i,j,k;
clock_t begin = clock();
for (i = from; i < to; i++)
{
for (j = 0; j < SIZE; j++)
{
C[i][j] = 0;
for ( k = 0; k < SIZE; k++)
C[i][j] += A[i][k]*B[k][j];
}
}
return 0;
}
int main(int argc, char* argv[])
{
pthread_t* thread;
int i;
double runtime=0;
num_thrd = atoi(argv[1]);
init_matrix(A);
init_matrix(B);
thread = (pthread_t*) malloc(num_thrd*sizeof(pthread_t));
clock_t begin = clock();
for (i = 0; i < num_thrd; i++){
multiply(i);
}
clock_t end = clock();
runtime = (double)(end - begin) / CLOCKS_PER_SEC;
printf("%f\n",runtime);
free(thread);
return 0;
}

How can i sum of array's elements with threads in C?

I work on this but i did not find solution of my problem.
This is my code. This code give an error on pthread_create and pthread_join lines. I tried everything to fix this problem but i cannot do this.
#include <stdio.h>
#include <pthread.h>
#define array_size 1000
#define no_threads 10
float a[array_size];
int global_index = 0;
int sum = 0;
pthread_t mutex_t ,mutex1;
void *slave(void *ignored)
{
int local_index, partial_sum = 0;
do {
pthread_t mutex_t ,lock(mutex1);
local_index = global_index;
global_index++;
pthread_t mutex ,unlock(mutex1);
if (local_index < array_size)
partial_sum += *(a + local_index);
}
while
(local_index < array_size);
pthread_t mutex , lock(mutex1);
sum += partial_sum;
pthread_t mutex_t , unlock(mutex1);
return 0;
}
main()
{
int i;
pthread_t thread_x[10];
pthread_mutex_init(&mutex1, NULL);
for (i = 0; i < array_size; i++)
a[i] = i+1;
for (i = 0; i < no_threads ; i++)
pthread_create(&thread_x[i] , NULL, slave,NULL);
for (i = 0; i < no_threads; i++)
pthread_join(&thread_x[i] , NULL);
printf("The sum of 1 to %d is %d\n", array_size, sum);
}

I have updated your code(pt1.c) I think you are looking for something like this.
#include <stdio.h>
#include <pthread.h>
#define array_size 1000
#define no_threads 10
float a[array_size];
int global_index = 0;
int sum = 0;
pthread_mutex_t mutex1;
void *slave(void *ignored)
{
int local_index, partial_sum = 0;
do {
pthread_mutex_lock(&mutex1);
local_index = global_index;
global_index++;
pthread_mutex_unlock(&mutex1);
if (local_index < array_size)
partial_sum += *(a + local_index);
}
while(local_index < array_size);
pthread_mutex_lock(&mutex1);
sum += partial_sum;
pthread_mutex_unlock(&mutex1);
return 0;
}
main()
{
int i;
pthread_t thread_x[10];
pthread_mutex_init(&mutex1, NULL);
for (i = 0; i < array_size; i++)
a[i] = i+1;
for (i = 0; i < no_threads ; i++)
pthread_create(&thread_x[i] , NULL, slave,NULL);
for (i = 0; i < no_threads; i++)
pthread_join(thread_x[i] , NULL);
printf("The sum of 1 to %d is %d\n", array_size, sum);
}
You should compile this code as
gcc -pthread pt1.c
output :
The sum of 1 to 1000 is 500500