I perform a computation on a subset of the parallel processes, but when I join the results in the master process with MPI_Gatherv(a_per_process, mylen_per_process, MPI_LONG_DOUBLE, a, recvcounts, displs, MPI_LONG_DOUBLE, 0, MPI_COMM_WORLD), I get a segmentation fault.
/*****************************************************************************
* DESCRIPTION:
* This program increments every element of the array by two.
* It records the average execution time for different numbers of threads,
* so that we can compare the performance of each routine.
* Compile:
* $mpicc mpic.c -o mpic -fopenmp -lm -Ofast
* Run:
* $mpirun -np <maxthreads> ./mpic
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <omp.h> /* needed for omp_get_wtime() */
int main(int argc, char *argv[])
{
int j = 0;
long long int len_per_process = 0;
long long int remainder = 0;
long long int mylen_per_process = 0;
int size = 0;
int rank = 0;
int *recvcounts, *displs;
long double *a, *a_per_process;
double start_comp = 0;
double start_comm = 0;
double end_comp = 0;
double end_comm = 0;
double maxtime_comp = 0;
double maxtime_comm = 0;
int i = 0;
long nSamples = 10;
long long int length = 1.0;
int maxthreads = 0;
int testnumber = 0;
long long int minlength = 1;
long long int maxlength = 1;
int cycles = 0;
long longlength = 0;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
/*Whole array allocation in master process*/
if (rank == 0)
{
a = (long double *)malloc(length * sizeof(long double));
}
for (length = minlength; length <= maxlength; length = length * 10)
{
for (i = 1; i <= size; i = i * 2)
{
/*Data distribution to processes*/
len_per_process = length / i;
remainder = length % i;
mylen_per_process = (rank < remainder) ? (len_per_process + 1) : (len_per_process);
recvcounts = (int *)malloc(size * sizeof(int));
displs = (int *)malloc(size * sizeof(int));
MPI_Allgather(&mylen_per_process, 1, MPI_INT, recvcounts, 1, MPI_INT, MPI_COMM_WORLD);
displs[0] = 0;
for (j = 1; j < size; j++)
{
displs[j] = displs[j - 1] + recvcounts[j - 1];
}
/*Sub-Arrays Allocation and Initialisation at each process*/
a_per_process = (long double *)malloc(mylen_per_process * sizeof(long double));
for (j = 0; j < mylen_per_process; j++)
{
a_per_process[j] = 0.0;
}
if (rank <= i)
{
/*Increment elements by 2*/
start_comp = omp_get_wtime();
for (j = 0; j < nSamples; j++)
{
for (int k = 0; k < mylen_per_process; k++)
{
a_per_process[k] = a_per_process[k] + 2.0;
}
}
end_comp = omp_get_wtime() - start_comp;
start_comm = omp_get_wtime();
end_comm = omp_get_wtime() - start_comm;
}
// The following line causes a segfault:
MPI_Gatherv(a_per_process, mylen_per_process, MPI_LONG_DOUBLE, a, recvcounts, displs, MPI_LONG_DOUBLE, 0, MPI_COMM_WORLD);
// Get the maximum computation and communication time
MPI_Reduce(&end_comp, &maxtime_comp, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Reduce(&end_comm, &maxtime_comm, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
free(a_per_process);
free(recvcounts);
free(displs);
}
}
if (rank == 0)
{
free(a);
}
MPI_Finalize();
return 0;
}
I tried both double and long double for the variables a and a_per_process, i.e. MPI_DOUBLE and MPI_LONG_DOUBLE in the MPI_Gatherv call. The code runs when I comment this line out, i.e. it neither aborts nor segfaults.
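For reference, here is the bare-bones gather pattern I am trying to reproduce, as a standalone toy that is independent of the program above (the per-rank counts here are made up):
/* Toy example: every rank contributes mycount long doubles, rank 0 gathers them. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int mycount = rank + 1;                          /* made-up per-rank count */
    long double *sendbuf = malloc(mycount * sizeof(long double));
    for (int k = 0; k < mycount; k++)
        sendbuf[k] = rank;
    int *recvcounts = NULL, *displs = NULL;
    long double *recvbuf = NULL;
    if (rank == 0)
    {
        recvcounts = malloc(size * sizeof(int));
        displs = malloc(size * sizeof(int));
    }
    /* the counts are plain ints, gathered only on the root */
    MPI_Gather(&mycount, 1, MPI_INT, recvcounts, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == 0)
    {
        displs[0] = 0;
        for (int k = 1; k < size; k++)
            displs[k] = displs[k - 1] + recvcounts[k - 1];
        int total = displs[size - 1] + recvcounts[size - 1];
        recvbuf = malloc(total * sizeof(long double));
        printf("root expects %d elements in total\n", total);
    }
    MPI_Gatherv(sendbuf, mycount, MPI_LONG_DOUBLE,
                recvbuf, recvcounts, displs, MPI_LONG_DOUBLE, 0, MPI_COMM_WORLD);
    free(sendbuf);
    if (rank == 0)
    {
        free(recvbuf);
        free(recvcounts);
        free(displs);
    }
    MPI_Finalize();
    return 0;
}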
I tried to implement a parallel Radix Sort using MPI. My idea was to have each process compute the vector of counts locally and then aggregate the results on the root process, which does the sorting. With this solution I observed a maximum speed-up of only about 1.5 compared to the sequential version.
I was wondering whether such a speed-up is normal, or whether a different MPI approach could be adopted to improve this result. I have seen suggestions about sorting the subarrays on each process, but I don't quite understand how to merge these sorted blocks into the final sorted array efficiently.
Sequential:
/**
* @brief This function finds the maximum element of an array.
* @param array array.
* @param n array size.
*/
int getMax(int* array, int n) {
int max = array[0];
for (int i = 1; i < n; i++)
if (array[i] > max)
max = array[i];
return max;
}
/**
* @brief Counting sort of the array of size 'size' by the given digit place.
* @param array array.
* @param size array size.
* @param digit the current digit place (1, 10, 100, ...).
*/
void countingSort(int* array, int size, int digit) {
int* output= (int*) malloc(sizeof(int)*(size + 1));
int count[10]={0};
for (int i = 0; i < size; i++)
count[(array[i] / digit) % 10]++;
for (int i = 1; i < 10; i++)
count[i] += count[i - 1];
for (int i = size - 1; i >= 0; i--) {
output[count[(array[i] / digit) % 10] - 1] = array[i];
count[(array[i] / digit) % 10]--;
}
for (int i = 0; i < size; i++)
array[i] = output[i];
free(output);
}
/**
* @brief The function that takes the max and starts the sorting process.
* @param array array.
* @param size array size.
*/
void radixsort(int* array, int size) {
int max = getMax(array, size);
for (int digit = 1; max / digit > 0; digit *= 10)
countingSort(array, size, digit);
}
MPI version:
/**
* @brief This function finds the maximum element of an array.
* @param arr array.
* @param n array size.
*/
int getMax(int* arr, int n) {
int max = arr[0];
for (int i = 1; i < n; i++)
if (arr[i] > max)
max = arr[i];
return max;
}
/**
* @brief Counting sort of the array of size n by the given digit place, distributed over the processes.
* @param array array.
* @param rec_buf sub-array of each process.
* @param n array size.
* @param digit the current digit place (1, 10, 100, ...).
* @param num_process number of processes.
* @param rank rank of the current process.
* @param dim dimension of rec_buf.
*/
void countingSort(int* array, int* rec_buf, int n, int digit, int num_process, int rank, int dim) {
// Compute the local count on each process
int i, local_count[10] = {0};
for (i = 0; i < dim; i++) {
local_count[(rec_buf[i] / digit) % 10]++;
}
// Reduce all the sub counts to root process
if (rank == 0) {
int count[10] = {0};
MPI_Reduce(local_count, count, 10, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
for (i = 1; i < 10; i++) {
count[i] += count[i - 1];
}
int* temp_array = (int*) malloc(sizeof(int) * n);
for (i = n - 1; i >= 0; i--) {
temp_array[count[(array[i] / digit) % 10] - 1] = array[i];
count[(array[i] / digit) % 10]--;
}
memcpy(array, temp_array, sizeof(int) * n);
free(temp_array);
} else {
MPI_Reduce(local_count, 0, 10, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
}
}
/**
* @brief The function that splits the array into subarrays and starts the sorting process.
* @param array array.
* @param n array size.
* @param num_process number of processes.
* @param rank rank of the current process.
*/
void radix_sort(int* array, int n, int num_process, int rank) {
int rem = n%num_process; // elements remaining after division among processes
int dim, displacement;
if ( rank < rem) {
dim = n/num_process+1;
displacement = rank * dim;
}
else {
dim = n/num_process;
displacement = rank * dim + rem;
}
int* rec_buf= (int*) malloc(sizeof(int)*dim) ;
int* sendcounts = NULL;
int* displs = NULL;
if (rank == 0) {
sendcounts = malloc(sizeof(int)*num_process);
displs = malloc(sizeof(int)*num_process);
}
MPI_Gather(&dim,1,MPI_INT, sendcounts, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Gather(&displacement, 1, MPI_INT, displs, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Scatterv(array, sendcounts, displs, MPI_INT, rec_buf, dim, MPI_INT, 0, MPI_COMM_WORLD);
if (rank==0) {
free(sendcounts);
free(displs);
}
int local_max = getMax(rec_buf,dim);
int global_max;
MPI_Allreduce(&local_max,&global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
for (int digit = 1; global_max / digit > 0; digit *= 10) {
countingSort(array, rec_buf, n, digit, num_process, rank, dim);
}
free(rec_buf);
}
I also noticed that with the same approach of computing the vector of counts locally, but using OpenMP, the speed-up reaches a maximum of about 4.5. Could this difference be due to the different architectures used in the two cases (shared memory for OpenMP versus distributed memory for MPI)?
For completeness I also report the solution with OpenMP:
/**
* @brief This function finds the maximum element of an array.
* @param n array size.
* @param arr[n] array.
*/
unsigned getMax(int n, unsigned arr[n]) {
unsigned mx = arr[0];
#pragma omp parallel for reduction(max:mx)
for (int i = 1; i < n; i++)
if (arr[i] > mx)
mx = arr[i];
return mx;
}
/* Source: https://gist.github.com/wanghc78/2c2b403299cab172e74c62f4397a6997
Copyright (c) 2014, Haichuan Wang All rights reserved. */
/**
* @brief The main function that sorts arr[] of size n using radix sort.
* @param n array size.
* @param arr[n] array.
* @param threads number of threads.
*/
unsigned * radixsort(int n, unsigned arr[n], int threads) {
if (threads == 0)
threads+=1;
unsigned m = getMax(n, arr);
unsigned exp;
unsigned *output = malloc(n*sizeof(unsigned));
for (exp = 1; m / exp > 0; exp *= 10) {
int count[10] = {0}, local_count[10] = { 0 };
#pragma omp parallel firstprivate(local_count) num_threads(threads)
{
#pragma omp for schedule(static) nowait
for (int i = 0; i < n; i++)
local_count[(arr[i] / exp) % 10]++;
#pragma omp critical
for(int i = 0; i < 10; i++)
count[i] += local_count[i];
#pragma omp barrier
#pragma omp single
for (int i = 1; i < 10; i++)
count[i] += count[i - 1];
int tid = omp_get_thread_num();
for(int cur_t = threads - 1; cur_t >= 0; cur_t--) {
if(cur_t == tid) {
for(int i = 0; i < 10; i++) {
count[i] -= local_count[i];
local_count[i] = count[i];
}
}
else {
#pragma omp barrier
}
}
#pragma omp for schedule(static)
for(int i = 0; i < n; i++)
output[local_count[(arr[i] / exp) % 10]++] = arr[i];
}
unsigned* tmp = arr;
arr = output;
output = tmp;
}
free(output);
return arr;
}
/**
* @brief This function initializes all the data structures needed in the program.
* @param N array size.
* @param threads number of threads.
*/
unsigned* init_structure(int N, int threads) {
unsigned * data_vector = malloc(N*sizeof(unsigned));
#pragma omp parallel for shared(data_vector) num_threads(threads)
for (int i=0; i<N; i++)
data_vector[i]=N-i;
return data_vector;
}
Thanks everyone for the answers.
I implemented a parallel merge sort in this code using a tree-structured scheme, but it doesn't sort the array!
Could you take a look at it and tell me what is wrong?
For communication among the processors I used plain MPI_Send() and MPI_Recv().
I used the numbers 0, 1 and 2 as tags (the fifth argument of MPI_Recv()).
For 8 processors the tree-structured scheme gives the array to the processor with rank 0, which splits the array in half, gives the right half to processor 4 and keeps the left half.
Processor 4 then splits its array in half, gives the right half to processor 6 and keeps the left half.
In the end, with this scheme, all the processors work on the problem and none of them is idle,
since at the leaves of the tree every processor has a piece of the array on which it runs the sequential merge_sort_inc().
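To make the scheme concrete, here is a small standalone sketch, separate from the program below and only an illustration with the process count hardcoded to 8, that prints the height, parent and right child each rank gets (the bit tricks are the same ones used in parallelMerge()):
/* Illustration only: print the communication tree described above for 8 ranks. */
#include <stdio.h>
int main(void)
{
    int numberOfproc = 8;                      /* assumed process count */
    for (int rank = 0; rank < numberOfproc; rank++)
    {
        /* a rank's height is the number of trailing zero bits; the root gets log2(p) */
        int height = 0;
        if (rank == 0)
        {
            while ((1 << height) < numberOfproc)
                height++;
        }
        else
        {
            while (((rank >> height) & 1) == 0)
                height++;
        }
        int parent = rank & ~(1 << height);
        int rightChild = (height > 0) ? (rank | (1 << (height - 1))) : -1;
        printf("rank %d: height %d, parent %d, right child %d\n",
               rank, height, parent, rightChild);
    }
    return 0;
}
For 8 processors this prints rank 0 with right child 4, rank 4 with right child 6, and so on, which is exactly the splitting described above. The actual program is below.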
#include<stdio.h>
#include<mpi.h>
#include<stdlib.h>
#include<time.h>
#include<math.h>
/* print_array() prints the elements of the array */
void print_array(int arr[], int size)
{
for (int i = 0; i < size; i++)
printf("%d ", arr[i]);
printf("\n");
}
/* copyarray() copies the elements of a[] between indexes start_a and end_a
into the dynamic array *b of size size_b */
void copyarray(int a[] ,int start_a , int end_a, int* b, int size_b)
{
int i = 0;
for (i = 0; i < size_b;i++)
{
b[i] = a[start_a];
start_a++;
if (start_a == end_a)
break;
}
}
/* merge() is the merge step of the sequential merge sort algorithm */
void merge(int Arr[], int left, int mid, int right)
{
int n_l = (mid - left + 1);
int n_r = (right - mid);
int* Arr_l = (int*)calloc(n_l, sizeof(int));
int* Arr_r = (int*)calloc(n_r, sizeof(int));
if (Arr_l == NULL)
return;
if (Arr_r == NULL)
return;
for (int i = 0;i < n_l;i++)
Arr_l[i] = Arr[left + i];
for (int j = 0;j < n_r;j++)
Arr_r[j] = Arr[mid + 1 + j];
int i = 0, j = 0, k = left;
while (i < n_l && j < n_r)
{
if (Arr_l[i] <= Arr_r[j])
{
Arr[k] = Arr_l[i];
i++;
k++;
}
else
{
Arr[k] = Arr_r[j];
j++;
k++;
}
}
while (i < n_l)
{
Arr[k] = Arr_l[i];
i++;
k++;
}
while (j < n_r)
{
Arr[k] = Arr_r[j];
j++;
k++;
}
free(Arr_l);
free(Arr_r);
}
/* merge_sort_inc() is the sequential merge sort in increasing order */
void merge_sort_inc(int Arr[], int left, int right)
{
int mid = (int)(left + (right - left) / 2);
if (left < right)
{
merge_sort_inc(Arr, left, mid);
merge_sort_inc(Arr, mid + 1, right - 1);
merge(Arr, left, mid, right);
}
}
/* parallelMerge() first builds the tree-structured communication between the processors. At the leaves of the tree,
where there is no further splitting, each processor runs the sequential merge sort. */
void parallelMerge(int* array, int size, int height)
{
int parent;
int rank;
int numberOfproc;
int next;
int rightChild;
//MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &numberOfproc);
parent = rank & ~(1 << height);
next = height - 1;
rightChild = rank | (1 << (height - 1));
if (height > 0)
if (rightChild >= numberOfproc)
parallelMerge(array, size, next);
else
{
int left_size = (int)(size / 2);
int right_size = size - left_size;
int* leftArray = (int*)calloc(left_size, sizeof(int));
int * rightArray = (int*)calloc(right_size,sizeof(int));
if (leftArray == NULL)
return;
if (rightArray == NULL)
return;
int massage[2];
int i, j , k;
MPI_Status status;
copyarray(array, 0, left_size, leftArray, left_size);
copyarray(array, size - left_size, size, rightArray,right_size);
massage[0] = next;
massage[1] = right_size;
MPI_Send(massage, 2, MPI_INT, rightChild,0, MPI_COMM_WORLD);
MPI_Send(rightArray, right_size, MPI_INT, rightChild, 1, MPI_COMM_WORLD);
parallelMerge(leftArray, left_size, next);
MPI_Recv(rightArray, right_size, MPI_INT, rightChild, 2, MPI_COMM_WORLD, &status);
i = j = k = 0;
while (i < left_size && j < right_size)
{
if (leftArray[i] < rightArray[j])
{
array[k] = leftArray[i]; i++, k++;
}
else
{
array[k] = rightArray[j]; j++, k++;
}
}
while (i<left_size)
{
array[k] = leftArray[i];
k++;
i++;
}
while (j<right_size)
{
array[k] = rightArray[j];
k++;
j++;
}
}
else
{
merge_sort_inc(array, 0 ,size);
if (parent != rank)
MPI_Send(array, size, MPI_INT, parent, 2, MPI_COMM_WORLD);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////
int main()
{
/*building an array with the help of Random function*/
time_t t;
srand((unsigned)time(&t));
int Arr[100];
int arrSize = sizeof(Arr) / sizeof(int);
for (int i = 0; i < arrSize; i++)
Arr[i] = rand() / 100;
printf("the unsorted array is : \n ");
print_array(Arr, arrSize);
/*starting the parallel sorting*/
int rank;
int comm_size;
MPI_Init(NULL,NULL);
MPI_Comm_rank(MPI_COMM_WORLD , &rank);
MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
double start = MPI_Wtime();//capture time
if (rank == 0)
{
int roothight = 0;
int nodeCount = 1;
while (nodeCount < comm_size)
nodeCount++;
roothight = (int)log(nodeCount);
int* newarray = (int*)calloc(arrSize, sizeof(int));
if (newarray == NULL)
return 1;
copyarray(Arr, 0, arrSize - 1, newarray, arrSize );
parallelMerge(newarray, arrSize, roothight);
double midle = MPI_Wtime();
}
else
{
int massage[2];
int height;
int size_array;
MPI_Status status;
MPI_Recv(massage, 2, MPI_INT, MPI_ANY_SOURCE,0, MPI_COMM_WORLD, &status);
height = massage[0];
size_array = massage[1];
int* newarray = (int*)calloc(size_array, sizeof(int));
if (newarray == NULL)
return 1;
MPI_Recv(newarray, size_array, MPI_INT, MPI_ANY_SOURCE,1, MPI_COMM_WORLD, &status);
parallelMerge(newarray, size_array, height);
}
double end = MPI_Wtime();
MPI_Finalize();
printf("\n the sorted array is : \n");
print_array(Arr, arrSize);
printf("\n the sorting takes %lf time ", (end - start));
return 0;
}
I'm testing a hybrid approach by parallelizing the friendly-numbers program (from CAPBenchmark) with MPI and OpenMP.
My cluster has 8 machines and each machine has a 4 core processor.
The code:
/*
* Copyright(C) 2014 Pedro H. Penna <pedrohenriquepenna@gmail.com>
*
* friendly-numbers.c - Friendly numbers kernel.
*/
#include <global.h>
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <util.h>
#include "fn.h"
/*
* Computes the Greatest Common Divisor of two numbers.
*/
static int gcd(int a, int b)
{
int c;
/* Compute greatest common divisor. */
while (a != 0)
{
c = a;
a = b%a;
b = c;
}
return (b);
}
/*
* Sum of divisors.
*/
static int sumdiv(int n)
{
int sum; /* Sum of divisors. */
int factor; /* Working factor. */
sum = 1 + n;
/* Compute sum of divisors. */
for (factor = 2; factor < n; factor++)
{
/* Divisor found. */
if ((n%factor) == 0)
sum += factor;
}
return (sum);
}
/*
* Computes friendly numbers.
*/
int friendly_numbers(int start, int end)
{
int n; /* Divisor. */
int *num; /* Numerator. */
int *den; /* Denominator. */
int *totalnum;
int *totalden;
int rcv_friends;
int range; /* Range of numbers. */
int i, j; /* Loop indexes. */
int nfriends; /* Number of friendly numbers. */
int slice;
range = end - start + 1;
slice = range / nthreads;
if (rank == 0) {
num = smalloc(sizeof(int)*range);
den = smalloc(sizeof(int)*range);
totalnum = smalloc(sizeof(int)*range);
totalden = smalloc(sizeof(int)*range);
} else {
num = smalloc(sizeof(int) * slice);
den = smalloc(sizeof(int) * slice);
totalnum = smalloc(sizeof(int)*range);
totalden = smalloc(sizeof(int)*range);
}
j = 0;
omp_set_dynamic(0);
omp_set_num_threads(4);
#pragma omp parallel for private(i, j, n) default(shared)
for (i = start + rank * slice; i < start + (rank + 1) * slice; i++) {
j = i - (start + rank * slice);
num[j] = sumdiv(i);
den[j] = i;
n = gcd(num[j], den[j]);
num[j] /= n;
den[j] /= n;
}
if (rank != 0) {
MPI_Send(num, slice, MPI_INT, 0, 0, MPI_COMM_WORLD);
MPI_Send(den, slice, MPI_INT, 0, 1, MPI_COMM_WORLD);
} else {
for (i = 1; i < nthreads; i++) {
MPI_Recv(num + (i * (slice)), slice, MPI_INT, i, 0, MPI_COMM_WORLD, 0);
MPI_Recv(den + (i * (slice)), slice, MPI_INT, i, 1, MPI_COMM_WORLD, 0);
}
}
if (rank == 0) {
for (i = 1; i < nthreads; i++) {
MPI_Send(num, range, MPI_INT, i, 2, MPI_COMM_WORLD);
MPI_Send(den, range, MPI_INT, i, 3, MPI_COMM_WORLD);
}
} else {
MPI_Recv(totalnum, range, MPI_INT, 0, 2, MPI_COMM_WORLD,0);
MPI_Recv(totalden, range, MPI_INT, 0, 3, MPI_COMM_WORLD,0);
}
/* Check friendly numbers. */
nfriends = 0;
if (rank == 0) {
omp_set_dynamic(0);
omp_set_num_threads(4);
#pragma omp parallel for private(i, j) default(shared) reduction(+:nfriends)
for (i = rank; i < range; i += nthreads) {
for (j = 0; j < i; j++) {
/* Friends. */
if ((num[i] == num[j]) && (den[i] == den[j]))
nfriends++;
}
}
} else {
omp_set_dynamic(0);
omp_set_num_threads(4);
#pragma omp parallel for private(i, j) default(shared) reduction(+:nfriends)
for (i = rank; i < range; i += nthreads) {
for (j = 0; j < i; j++) {
/* Friends. */
if ((totalnum[i] == totalnum[j]) && (totalden[i] == totalden[j]))
nfriends++;
}
}
}
if (rank == 0) {
for (i = 1; i < nthreads; i++) {
MPI_Recv(&rcv_friends, 1, MPI_INT, i, 4, MPI_COMM_WORLD, 0);
nfriends += rcv_friends;
}
} else {
MPI_Send(&nfriends, 1, MPI_INT, 0, 4, MPI_COMM_WORLD);
}
free(num);
free(den);
return (nfriends);
}
During the executions I observed the following behavior:
When I run mpirun with 4 or 8 hosts, each host uses 4 threads for processing, as expected.
However, when running with only 2 hosts, only 1 thread is used on each machine.
What could cause this behavior? Is there any way to "force" the use of the 4 threads in the 2-host case?
I assume you are using Open MPI.
The default binding policy is to bind to socket or to NUMA domain (depending on your version). I assume your nodes are single socket, which means one MPI task is bound to 4 cores, and then the OpenMP runtime will likely start 4 OpenMP threads.
A special case is when you start only 2 MPI tasks. In this case, the binding policy is to bind to core, which means each MPI task is bound to a single core, and hence the OpenMP runtime starts only one OpenMP thread.
In order to achieve the desired behavior, you can run
mpirun --bind-to numa -np 2 ...
If that fails, you can fall back to
mpirun --bind-to socket -np 2 ...
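If you want to double-check what the runtime actually did, Open MPI can also print the binding of every task (the executable name below is just a placeholder):
mpirun --bind-to numa --report-bindings -np 2 ./your_app
Each reported line should then show one task bound to a full NUMA domain (4 cores here) rather than to a single core.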
I'm trying to write Conway's Game of Life; the pattern is the rabbits pattern. I used the 2D Cartesian method to build the process grid, and the communication between processes is done with MPI_Sendrecv. But this code doesn't work: it just hangs there without any response when I run it. It has taken me a long time to find the problem, but I have made no progress. Could you please help me figure it out? I'll be so glad for that!
#include <stdio.h>
#include "mpi.h"
#include <math.h>
#include <stdlib.h>
#define array 20
#define arrayhalf (array/2)
int main(int argc, char *argv[])
{
int ndims = 2, ierr;
int p, my_rank, my_cart_rank;
MPI_Comm comm2d;
MPI_Datatype newtype;
int dims[ndims], coord[ndims];
int wrap_around[ndims];
int reorder, nrows, ncols;
int x[arrayhalf+2][arrayhalf+2], x2[arrayhalf+2][arrayhalf+2], x_rev[array+4][array+4];
int left, right, down, top;
MPI_Status status;
int tag_up = 20, tag_down =21, tag_left = 22, tag_right = 23;
double start, stop;
/*** start up initial MPI environment ***/
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &p);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
/* hardcode 2 processes in each dimension */
nrows = ncols = (int) sqrt(p);
dims[0] = dims[1] = 2;
/* create cartesian topology for processes */
MPI_Dims_create(p, ndims, dims);
/*if (my_rank == 0)
printf("PW[%d]/[%d%]: PEdims = [%d x %d] \n", my_rank, p, dims[0], dims[1]);*/
/* create cartesian mapping, and check it is created either correct or wrong */
wrap_around[0] = wrap_around[1] = 1; /*set periodicity to be true */
reorder = 0;
ierr = 0;
ierr = MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, wrap_around, reorder, &comm2d);
if (ierr != 0)
printf("ERROR[%d] creating CART\n", ierr);
MPI_Type_vector( arrayhalf, 1, arrayhalf+2, MPI_INT, &newtype);
MPI_Type_commit( &newtype );
/* get the neighbour processes, which is useful for the halo exchange */
int SHIFT_ROW = 0;
int SHIFT_COL = 1;
int DISP = 1;
/*** load pattern ***/
/* initialize the data array */
int i, j ;
for (i = 0; i < arrayhalf + 2 ; i++)
for (j = 0; j < arrayhalf + 2; j++)
{
x[i][j] = 0;
x2[i][j] = 0;
}
if (my_rank == 0)
{
int r,c;
r = arrayhalf / 2;
c = arrayhalf / 2;
/* rabbits pattern
1 1 1 1
1 1 1 1
1
*/
x[r][c] = 1;
x[r][c+4] = 1;
x[r][c+5] = 1;
x[r][c+6] = 1;
x[r+1][c] = 1;
x[r+1][c+1] = 1;
x[r+1][c+2] = 1;
x[r+1][c+5] = 1;
x[r+2][c+1] = 1;
}
/*** calculate the next generation ***/
int row, col;
int steps;
steps = atoi(argv[1]); /* get the generation number from command line */
start = MPI_Wtime();
int soc;
int destination;
for (i = 1; i <= steps; i++)
{
/*** halo exchange of the boundary elements ***/
int * send_buffer = (int*) malloc((arrayhalf)*sizeof(int));
int * recv_buffer = (int*) malloc((arrayhalf)*sizeof(int));
/*int * send_buffer = (int *) calloc(arrayhalf,sizeof(int));
int * recv_buffer = (int *) calloc(arrayhalf,sizeof(int));
*/
/* to up */
MPI_Cart_shift(comm2d, 1, 1, &soc,&destination);
MPI_Sendrecv( &x[1][1], arrayhalf, MPI_INT, destination, tag_up,& x[arrayhalf + 1][1], arrayhalf, MPI_INT, soc, tag_up, comm2d, &status );
/* to down */
MPI_Cart_shift(comm2d, 1, 1, &destination,&soc);
MPI_Sendrecv( &x[arrayhalf][1], arrayhalf, MPI_INT, destination, tag_down,& x[0][1], arrayhalf, MPI_INT, soc, tag_down, comm2d, &status);
/* to left */
MPI_Cart_shift(comm2d, 0, 1, &destination,&soc);
MPI_Sendrecv( &x[1][1], 1,newtype, destination, tag_left,& x[1][arrayhalf+1], 1, newtype, soc, tag_left, comm2d, &status );
/*for (j=0;j<arrayhalf;j++) {
send_buffer[j]=x[j+1][1];
}
MPI_Sendrecv( send_buffer, arrayhalf,MPI_INT, destination, tag_left,recv_buffer, arrayhalf, MPI_INT, soc, tag_left, comm2d, &status );
for (j=0;j<arrayhalf;j++) {
x[j+1][arrayhalf+1]=recv_buffer[j];
}
*/
/* to right */
MPI_Cart_shift(comm2d, 0, 1, &soc,&destination);
MPI_Sendrecv( &x[1][arrayhalf], 1, newtype, destination, tag_right, &x[1][0], 1, newtype, soc, tag_right, comm2d, &status );
/*for (j=0;j<arrayhalf;j++) {
send_buffer[j]=x[j+1][arrayhalf];
}
MPI_Sendrecv( send_buffer, arrayhalf,MPI_INT, destination, tag_right,recv_buffer, arrayhalf, MPI_INT, soc, tag_right, comm2d, &status );
for (j=0;j<arrayhalf;j++) {
x[j+1][1]=recv_buffer[j];
}
*/
/*** sum the neighbour values and get the next generation ***/
for (row = 1; row < arrayhalf; row++)
{
for (col = 1; col < arrayhalf; col++)
{
int neighbor;
neighbor = x[row - 1][col - 1] + x[row - 1][col] + x[row - 1][col + 1] + x[row][col - 1] +
x[row][col + 1] +
x[row + 1][col - 1] + x[row + 1][col] + x[row + 1][col + 1];
if (neighbor == 3)
{
x2[row][col] = 1;
}
else if (x[row][col] == 1 && neighbor == 2)
{
x2[row][col] = 1;
}
else
{
x2[row][col] = 0;
}
}
}
/* used to be swap */
for (row = 1; row < arrayhalf; row++)
{
for (col = 1; col < arrayhalf; col++)
{
x[row][col] = x2[row][col];
}
}
free(send_buffer);
free(recv_buffer);
}
/*** print the final generation ***/
int population = 0;
int* A;
int process_num = dims[0]*dims[1];
int row_indx;
int col_indx;
int k;
if(my_rank == 0)
{
A = (int*) malloc((arrayhalf+2)*(arrayhalf+2)*sizeof(int));
for (k= 1; k< process_num; k++)
{
MPI_Recv(A,(arrayhalf+2)*(arrayhalf+2), MPI_INT,k, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
for (i = 0; i<arrayhalf+2; i++)
{
for (j = 0; j<arrayhalf+2; j++)
{
row_indx = (k%dims[1])*(arrayhalf+2)+i;
col_indx = (k/dims[0]*(arrayhalf+2))+j;
x_rev[row_indx][col_indx] = A[i*(arrayhalf+2)+j];
}
}
}
for (i = 0; i<arrayhalf+2; i++)
{
for (j = 0; j<arrayhalf+2; j++)
{
x_rev[i][j] = x[i][j];
}
}
for (row = 0; row < array+4; row++) {
for (col = 0; col < array+4; col++)
{
printf("%2d",x_rev[row][col]);
if(x_rev[row][col]==1)
{
population = population + 1;
}
}
printf("\n");
}
stop = MPI_Wtime();
printf("Running Time: %f\n ",stop-start);
printf("Population: %d\n",population);
printf("Generation: %d\n",steps);
}
else{
A = (int*) malloc((array+4)*(array+4)*sizeof(int));
for (i=0; i< arrayhalf +2; i++)
{
for(j = 0; j<arrayhalf+2; j++)
{
A[i*(arrayhalf+2)+j] = x[i][j];
}
}
MPI_Send(A,(arrayhalf+2)*(arrayhalf+2),MPI_INT,0,0,MPI_COMM_WORLD);
}
MPI_Comm_free( &comm2d );
MPI_Type_free( &newtype );
free(A);
MPI_Finalize();
}
I think I found the error.
It is in line 176.
Rank 0 is trying to receive a message from rank 0, but rank 0 is not sending a message to itself. You should start the loop from 1, not from 0.
I am trying to compute a Fourier transform with the planner fftw_mpi_plan_dft_r2c_2d of FFTW 3.3. Unfortunately, I cannot make it work. The result is correct if N0 is equal to the number of processes (nb_proc) but is wrong when N0 != nb_proc.
An example showing my problem:
#include <stdio.h>
#include <complex.h>
#include <fftw3-mpi.h>
int main(int argc, char **argv)
{
/* if N0 (= ny) is equal to nb_proc, the results are OK */
/* if N0 is not equal to nb_proc => bug */
const ptrdiff_t N0 = 4, N1 = 4;
int coef_norm = N0*N1;
fftw_plan plan_forward;
double *carrayX;
fftw_complex *carrayK;
ptrdiff_t n_alloc_local, i, j;
ptrdiff_t nX0loc, iX0loc_start, nK0loc, nK1loc;
/* X and K denote physical and Fourier spaces. */
int rank, nb_proc, irank;
MPI_Init(&argc, &argv);
fftw_mpi_init();
/*DETERMINE RANK OF THIS PROCESSOR*/
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
/*DETERMINE TOTAL NUMBER OF PROCESSORS*/
MPI_Comm_size(MPI_COMM_WORLD, &nb_proc);
if (rank==0) printf("program test_fftw3_2Dmpi_simple\n");
printf("I'm rank (processor number) %i of size %i\n", rank, nb_proc);
n_alloc_local = fftw_mpi_local_size_2d(N0, N1/2+1, MPI_COMM_WORLD,
&nX0loc, &iX0loc_start);
carrayX = fftw_alloc_real(2 * n_alloc_local);
carrayK = fftw_alloc_complex(n_alloc_local);
/* create plan for out-of-place r2c DFT */
plan_forward = fftw_mpi_plan_dft_r2c_2d(N0, N1,
carrayX, carrayK,
MPI_COMM_WORLD,
FFTW_MEASURE);
nK0loc = nX0loc;
nK1loc = N1/2+1;
/* initialize carrayX to a constant */
for (i = 0; i < nX0loc; ++i) for (j = 0; j < N1; ++j)
carrayX[i*N1 + j] = 1.;
/* compute forward transform and normalize */
fftw_execute(plan_forward);
for (i = 0; i < nK0loc; ++i) for (j = 0; j < nK1loc; ++j)
carrayK[i*nK1loc + j] = carrayK[i*nK1loc + j]/coef_norm;
/* print carrayK, there should be only one 1 in the first case for rank=0 */
for (irank = 0; irank<nb_proc; irank++)
{
MPI_Barrier(MPI_COMM_WORLD);
if (rank == irank)
{
for (i = 0; i < nK0loc; ++i) for (j = 0; j < nK1loc; ++j)
{
printf("rank = %i, carrayK[%ti*nK1loc + %ti] = (%6.4f, %6.4f)\n",
rank, i, j,
creal(carrayK[i*nK1loc + j]),
cimag(carrayK[i*nK1loc + j]));
}
printf("\n");
}
}
MPI_Barrier(MPI_COMM_WORLD);
fftw_destroy_plan(plan_forward);
MPI_Finalize();
}
There is something wrong in this example but I don't understand what.
For this case (N0 = 4, N1 = 4), the results are correct with
mpirun -np 4 ./test_fftw3_2Dmpi_simple
but not with
mpirun -np 2 ./test_fftw3_2Dmpi_simple
PS: same thing with the flag FFTW_MPI_TRANSPOSED_OUT.
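In case it helps, here is a tiny separate check (same toy sizes N0 = 4, N1 = 4) that only prints how fftw_mpi_local_size_2d splits the rows over the processes:
/* Minimal check: how does fftw_mpi_local_size_2d distribute the N0 rows? */
#include <stdio.h>
#include <fftw3-mpi.h>
int main(int argc, char **argv)
{
    const ptrdiff_t N0 = 4, N1 = 4;
    ptrdiff_t n_alloc_local, nX0loc, iX0loc_start;
    int rank;
    MPI_Init(&argc, &argv);
    fftw_mpi_init();
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    n_alloc_local = fftw_mpi_local_size_2d(N0, N1/2 + 1, MPI_COMM_WORLD,
                                           &nX0loc, &iX0loc_start);
    printf("rank %d: nX0loc = %td, iX0loc_start = %td, n_alloc_local = %td\n",
           rank, nX0loc, iX0loc_start, n_alloc_local);
    fftw_mpi_cleanup();
    MPI_Finalize();
    return 0;
}
With mpirun -np 2 each rank should report two local rows (nX0loc = 2), while with -np 4 each rank gets one row.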