I'm trying to find the max of randomly generated numbers. Any thoughts on this...
I am using MPI_Scatter to split the randomly generated numbers into equal chunks across the processes, and MPI_Reduce to get the MAX from each process.
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <mpi.h>
#define atmost 1000
int find(int* partial_max, int from, int to){
int i, max;
printf("%d----%d\n", from, to);
max = partial_max[from];
for (i = from + 1; i <= to; i++)
if (partial_max[i] > max)
max = partial_max[i];
return max;
}
int main(){
int i, j,n, comm_sz, biggest, b, my_rank, q,result;
//1. Declare array of size 1000
int a[atmost];
//2. generate random integer of 0 to 999
srand((unsigned)time(NULL));
n = rand() % atmost;
//n = 10;
for (i = 0; i <= n; i++){
a[i] = rand() % atmost;
printf("My Numbers: %d\n", a[i]);
//a[i] = i;
}
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
//j is the size we will split each segment into
j = (n / (comm_sz-1));
int partial_max[j];
int receive_vector[j];
//Send random numbers equally to each process
MPI_Scatter(a, j, MPI_INT, receive_vector,
j, MPI_INT, 0, MPI_COMM_WORLD);
int localmax;
localmax = -1;
for (i = 0; i <= comm_sz-1; i++)
if (receive_vector[i] > localmax)
localmax = receive_vector[i];
// Get Max from each process
//MPI_Reduce(receive_vector, partial_max, j, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Reduce(&localmax, &result, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
if (my_rank == 0)
{
/*
biggest = -1;
for (i = 0; i < comm_sz - 1; i++){
if (i == comm_sz - 2)
b = find(partial_max, i * j, n - 1);
else
b = find(partial_max, i * j, (i + 1) * j - 1);
if (b > biggest)
biggest = b;
}*/
printf("-------------------\n");
printf("The biggest is: %d\n", result);
printf("The n is: %d\n", n);
}
MPI_Finalize();
return 0;
}
You have a few bugs there:
You select (a different value of) n in each process. It is better to
select it within rank 0 and broadcast it (MPI_Bcast) to the rest of the processes.
When calculating j you divide by comm_sz-1 instead of comm_sz.
You assume n is divisible by comm_sz and that each process receives exactly the same amount of numbers to process.
You loop with i going up to comm_sz-1 instead of going up to j.
This is what I could find in a quick look.
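For reference, here is a minimal sketch of the corrected distribution. It keeps the names from the question and, purely for brevity, assumes n is divisible by comm_sz (otherwise the leftover elements need separate handling):
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#define atmost 1000
int main(void){
    int a[atmost];
    int n = 0, i, comm_sz, my_rank, localmax = -1, result;
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    /* rank 0 picks n and fills the array, then shares n with everyone */
    if (my_rank == 0){
        srand((unsigned)time(NULL));
        n = rand() % atmost;
        for (i = 0; i < n; i++)
            a[i] = rand() % atmost;
    }
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
    int j = n / comm_sz;                 /* chunk size: divide by comm_sz, not comm_sz-1 */
    int *receive_vector = malloc(sizeof(int) * j);
    MPI_Scatter(a, j, MPI_INT, receive_vector, j, MPI_INT, 0, MPI_COMM_WORLD);
    for (i = 0; i < j; i++)              /* loop over the local chunk of size j, not comm_sz */
        if (receive_vector[i] > localmax)
            localmax = receive_vector[i];
    MPI_Reduce(&localmax, &result, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
    if (my_rank == 0)
        printf("The biggest is: %d (n = %d)\n", result, n);
    free(receive_vector);
    MPI_Finalize();
    return 0;
}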
Related
I'm testing a hybrid approach by parallelizing the friendly-numbers program (from CAPBenchmark) with MPI and OpenMP.
My cluster has 8 machines and each machine has a 4 core processor.
The code:
/*
* Copyright(C) 2014 Pedro H. Penna <pedrohenriquepenna#gmail.com>
*
* friendly-numbers.c - Friendly numbers kernel.
*/
#include <global.h>
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <util.h>
#include "fn.h"
/*
* Computes the Greatest Common Divisor of two numbers.
*/
static int gcd(int a, int b)
{
int c;
/* Compute greatest common divisor. */
while (a != 0)
{
c = a;
a = b%a;
b = c;
}
return (b);
}
/*
* Sum of divisors.
*/
static int sumdiv(int n)
{
int sum; /* Sum of divisors. */
int factor; /* Working factor. */
sum = 1 + n;
/* Compute sum of divisors. */
for (factor = 2; factor < n; factor++)
{
/* Divisor found. */
if ((n%factor) == 0)
sum += factor;
}
return (sum);
}
/*
* Computes friendly numbers.
*/
int friendly_numbers(int start, int end)
{
int n; /* Divisor. */
int *num; /* Numerator. */
int *den; /* Denominator. */
int *totalnum;
int *totalden;
int rcv_friends;
int range; /* Range of numbers. */
int i, j; /* Loop indexes. */
int nfriends; /* Number of friendly numbers. */
int slice;
range = end - start + 1;
slice = range / nthreads;
if (rank == 0) {
num = smalloc(sizeof(int)*range);
den = smalloc(sizeof(int)*range);
totalnum = smalloc(sizeof(int)*range);
totalden = smalloc(sizeof(int)*range);
} else {
num = smalloc(sizeof(int) * slice);
den = smalloc(sizeof(int) * slice);
totalnum = smalloc(sizeof(int)*range);
totalden = smalloc(sizeof(int)*range);
}
j = 0;
omp_set_dynamic(0);
omp_set_num_threads(4);
#pragma omp parallel for private(i, j, n) default(shared)
for (i = start + rank * slice; i < start + (rank + 1) * slice; i++) {
j = i - (start + rank * slice);
num[j] = sumdiv(i);
den[j] = i;
n = gcd(num[j], den[j]);
num[j] /= n;
den[j] /= n;
}
if (rank != 0) {
MPI_Send(num, slice, MPI_INT, 0, 0, MPI_COMM_WORLD);
MPI_Send(den, slice, MPI_INT, 0, 1, MPI_COMM_WORLD);
} else {
for (i = 1; i < nthreads; i++) {
MPI_Recv(num + (i * (slice)), slice, MPI_INT, i, 0, MPI_COMM_WORLD, 0);
MPI_Recv(den + (i * (slice)), slice, MPI_INT, i, 1, MPI_COMM_WORLD, 0);
}
}
if (rank == 0) {
for (i = 1; i < nthreads; i++) {
MPI_Send(num, range, MPI_INT, i, 2, MPI_COMM_WORLD);
MPI_Send(den, range, MPI_INT, i, 3, MPI_COMM_WORLD);
}
} else {
MPI_Recv(totalnum, range, MPI_INT, 0, 2, MPI_COMM_WORLD,0);
MPI_Recv(totalden, range, MPI_INT, 0, 3, MPI_COMM_WORLD,0);
}
/* Check friendly numbers. */
nfriends = 0;
if (rank == 0) {
omp_set_dynamic(0);
omp_set_num_threads(4);
#pragma omp parallel for private(i, j) default(shared) reduction(+:nfriends)
for (i = rank; i < range; i += nthreads) {
for (j = 0; j < i; j++) {
/* Friends. */
if ((num[i] == num[j]) && (den[i] == den[j]))
nfriends++;
}
}
} else {
omp_set_dynamic(0);
omp_set_num_threads(4);
#pragma omp parallel for private(i, j) default(shared) reduction(+:nfriends)
for (i = rank; i < range; i += nthreads) {
for (j = 0; j < i; j++) {
/* Friends. */
if ((totalnum[i] == totalnum[j]) && (totalden[i] == totalden[j]))
nfriends++;
}
}
}
if (rank == 0) {
for (i = 1; i < nthreads; i++) {
MPI_Recv(&rcv_friends, 1, MPI_INT, i, 4, MPI_COMM_WORLD, 0);
nfriends += rcv_friends;
}
} else {
MPI_Send(&nfriends, 1, MPI_INT, 0, 4, MPI_COMM_WORLD);
}
free(num);
free(den);
return (nfriends);
}
During the executions I observed the following behavior:
When I run mpirun with 4 or 8 hosts, each host uses 4 threads for processing, as expected.
However, when running with only 2 hosts, only 1 thread is used on each machine.
What could cause this behavior? Is there any alternative to "force" the use of the 4 threads in the 2-host case?
I assume you are using Open MPI.
The default binding policy is to bind to socket or NUMA domain (depending on your version). I assume your nodes are single socket, which means one MPI task is bound to 4 cores, and then the OpenMP runtime will likely start 4 OpenMP threads.
A special case is when you start only 2 MPI tasks. In this case, the binding policy is to bind to core, which means each MPI task is bound to only one core, and hence the OpenMP runtime starts only one OpenMP thread.
In order to achieve the desired behavior, you can
mpirun --bind-to numa -np 2 ...
If that fails, you can fall back to
mpirun --bind-to socket -np 2 ...
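To double-check what each rank actually gets, a tiny hybrid probe like the one below (a hypothetical helper, not part of the benchmark) can be launched with the same mpirun options; on a 4-core node each rank should then report 4 threads:
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
int main(int argc, char **argv)
{
    int rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    #pragma omp parallel
    {
        /* one thread reports after the whole team has been created */
        #pragma omp single
        printf("rank %d runs with %d OpenMP thread(s)\n", rank, omp_get_num_threads());
    }
    MPI_Finalize();
    return 0;
}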
I am trying to split up a program that calculates whether or not a number is prime. So, I spread a range of numbers out over the processes and find the primes in that range. This works: each process takes its range and finds the correct primes. The problem comes when I want to Gather all of the arrays from each process into one.
Either it prints the proper number of array values but with extra 0's, so the output looks like 2,3,5,7,0,11,13,0,0,0,17,19..., or I get an assertion-failed error saying the memcpy argument memory ranges overlap.
Here is the relevant code in my main -
thesePrimes = (int*)malloc(sizeof(int) * maxNumberOfTotalPrimes);
//findPrimes returns k which is the number of primes found and
//puts all of the prime numbers from this process into thesePrimes.
//n is the highest number to check if prime (ie n = 100 check for primes
//less than 100)
k = findPrimes(thesePrimes, start, end, n);
//Reduce k to get the total number of primes within the input
MPI_Reduce(&k, &numberOfPrimes, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
//Allocate just enough space to hold all of the primes based on the reduced
//number of primes
allPrimes = (int*)malloc(sizeof(int) * numberOfPrimes);
//Gather each process's thesePrimes into allPrimes using k as the buffer
//size since k is the number of primes for the process, just send k numbers
MPI_Gather(thesePrimes, k, MPI_INT, allPrimes, k, MPI_INT, 0, MPI_COMM_WORLD);
if(myRank == 0) {
printf("Attempting to print...\n");
for(i = 0; i < numberOfPrimes; i++)
printf("allPrimes[%d]=%d\n", i, allPrimes[i]);
printf("There are %d prime numbers in the range 0 to %d\n", numberOfPrimes, n);
}
Here is my function to find the number of primes -
int findPrimes(int primes[], int start, int end, int n){
//k is used to count the number of primes
int i, j, maxJ, k = 0;
int isPrime = 1;
printf("Finding primes from %d to %d\n", start, end);
if(end > n) end = n;
if(start == 0) start = 2;
for(i = start; i <= end; i++) {
maxJ = sqrt(i);
for(j = 2; j <= maxJ; j++) {
if(i%j == 0) {
isPrime = 0;
break;
}
}
printf("Checking if %d is prime...\n", i);
if(isPrime) {
primes[k++] = i;
printf("%d is a prime number.\n", primes[k-1]);
}
else isPrime = 1;
// printf("Prime check complete.\n");
}
printf("k = %d\n", k);
return k;
}
You need to MPI_Gather() the number of prime numbers on each rank, and then you will be able to MPI_Gatherv() the prime numbers.
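A sketch of that two-step pattern, reusing the names from the question (thesePrimes, k, allPrimes, numberOfPrimes, myRank; worldSize is assumed to hold the result of MPI_Comm_size). It replaces the single MPI_Gather call, and the MPI_Reduce of k becomes unnecessary because rank 0 can sum the gathered counts itself:
/* step 1: rank 0 learns how many primes each rank found */
int *counts = NULL, *displs = NULL;
if (myRank == 0)
    counts = (int*)malloc(sizeof(int) * worldSize);
MPI_Gather(&k, 1, MPI_INT, counts, 1, MPI_INT, 0, MPI_COMM_WORLD);
/* step 2: rank 0 builds displacements and gathers the variable-sized chunks */
if (myRank == 0) {
    displs = (int*)malloc(sizeof(int) * worldSize);
    displs[0] = 0;
    numberOfPrimes = counts[0];
    for (i = 1; i < worldSize; i++) {
        displs[i] = displs[i-1] + counts[i-1];
        numberOfPrimes += counts[i];
    }
    allPrimes = (int*)malloc(sizeof(int) * numberOfPrimes);
}
MPI_Gatherv(thesePrimes, k, MPI_INT,
            allPrimes, counts, displs, MPI_INT, 0, MPI_COMM_WORLD);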
I have Mac OS X Yosemite 10.10.1 (14B25).
I have some problems with compiling the code. Here it is:
#include <stdio.h>
#include <mpi.h>
#define n 3
#define repeats 1
double abs(double item)
{
return (item > 0) ? item : -item;
}
int swap_raws (double **a, int p, int q)
{
if (p >= 0 && p < n && q >= 0 && q < n)
{
if (p == q)
return 0;
for (int i = 0; i < n; i++)
{
double temp = a[p][i];
a[p][i] = a[q][i];
a[q][i] = temp;
}
return 0;
}
else
return -1;
}
double f_column (int rank, int size, double *least)
{
double t1, t2, tbeg, tend, each_least = 1, least0;
int map[n];
double **a = malloc (sizeof (*a) * n);
int i, j, k;
for (i = 0; i < n; i++)
a[i] = malloc (sizeof (*a[i]) * n);
if (rank == 0)
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
a[i][j] = 1.0 / (i + j + 1);
MPI_Bcast (a, n * n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
for (i = 0; i < n; i++)
map[i] = i % size;
MPI_Barrier (MPI_COMM_WORLD);
t1 = MPI_Wtime ();
for (k = 0; k < n - 1; k++)
{
double max = abs (a[k][k]);
int column = k;
for (j = k + 1; j < n; j++)
{
double absv = abs (a[k][j]);
if (absv > max)
{
max = absv;
column = j;
}
}
if (map[k] == rank && column != k && swap_raws (a, k, column))
{
printf("ERROR SWAPPING %d and %d columns\n", k, column);
return -1;
}
MPI_Bcast (&a[k], n, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
MPI_Bcast (&a[column], n, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
if (map[k] == rank)
for (i = k + 1; i < n; i++)
a[k][i] /= a[k][k];
MPI_Barrier (MPI_COMM_WORLD);
MPI_Bcast (&a[k][k+1], n - k - 1, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
for (i = k + 1; i < n; i++)
if (map[i] == rank)
for (j = k + 1; j < n; j++)
a[j][i] -= a[j][k] * a[i][j];
}
t2 = MPI_Wtime ();
for (i = 0; i < n; i++)
if (map[i] == rank)
for (j = 0; j < n; j++)
{
double absv = abs (a[i][j]);
if (each_least > absv)
each_least = absv;
//printf ("a[%d][%d] = %lg\n", j, i, a[i][j]);
}
MPI_Reduce (&each_least, &least0, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
MPI_Reduce (&t1, &tbeg, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
MPI_Reduce (&t2, &tend, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
for (i = 0; i < n; i++)
free (a[i]);
free (a);
if (rank == 0)
{
*least = least0;
return (tend - tbeg);
}
}
int main (int argc, char *argv[])
{
int rank, size;
double min, max, aver, least;
if (n == 0)
return 0;
MPI_Init (&argc, &argv);
MPI_Comm_rank (MPI_COMM_WORLD, &rank);
MPI_Comm_size (MPI_COMM_WORLD, &size);
// It works!
//double try = f_column_non_parallel (rank, size, &least);
double try = f_column (rank, size, &least);
aver = max = min = try;
for (int i = 1; i < repeats; i++)
{
//double try = f_column_non_parallel (rank, size, &least);
double try = f_column (rank, size, &least);
if (try < min)
min = try;
else if (try > max)
max = try;
aver += try;
}
aver /= repeats;
MPI_Finalize ();
if (rank == 0)
printf("N: %d\nMIN: %f\nMAX: %f\nAVER: %f\nLEAST: %lg\n", size, min, max, aver, least);
return 0;
}
I have the Hilbert matrix: a[i][j] = 1 / (i + j + 1) for i, j from 0 to n-1.
This code should find the LU decomposition using MPI, in order to do it in parallel.
The first process initialises the array and then broadcasts it to the other processes.
Then I find the maximum in the row and swap those columns. Then I would like to broadcast that data to every process, i.e. using MPI_Barrier (MPI_COMM_WORLD); but it says:
So, I don't know what has happened or how I can fix the problem. A non-parallel version of the same program runs fine, but this one doesn't.
If you find the solution, the example should work like this (I calculated it by hand; you can check it too). The matrix (here j and i run vertically and horizontally respectively, which is not the most convenient layout, but bear with it):
1 1/2 1/3 1 1/2 1/3 1 1/2 1/3 |1 1/2 1/3 |
1/2 1/3 1/4 -> 1/2 1/12 1/12 -> 1/2 1/12 1 -> |1/2 1/12 1/12 | <- answer
1/3 1/4 1/5 1/3 1/12 4/45 1/3 1/12 1/180 |1/3 1 1/180|
So the source matrix is:
|1 0 0| |1 1/2 1/3 | |1 1/2 1/3|
A = |1/2 1 0| * |0 1/12 1/12 | = |1/2 1/3 1/4|
|1/3 1 1| |0 0 1/180| |1/3 1/4 1/5|
Can you help me find the mistake I made? Thank you in advance :)
Your program has a bug in the following part of the code:
double **a = malloc (sizeof (*a) * n);
[...snip...]
MPI_Bcast (a, n * n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
You are allocating 'n' pointers in "a", not an 'n * n' array of doubles. So when you do an 'n * n'-sized MPI_Bcast of "a", you are asking MPI to transfer from garbage memory locations that are not allocated. This is what causes MPI to segfault.
You can change "a" to simply "double *" instead of "double **" and allocate 'n * n' doubles in there to fix this issue.
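A minimal sketch of that contiguous layout, keeping the names from the question (every a[i][j] access elsewhere then becomes a[i*n + j]):
double *a = malloc(sizeof(*a) * n * n);   /* one contiguous block of n*n doubles */
if (rank == 0)
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++)
            a[i*n + j] = 1.0 / (i + j + 1);
MPI_Bcast(a, n * n, MPI_DOUBLE, 0, MPI_COMM_WORLD);   /* now broadcasts allocated data */
/* a single row k can still be broadcast on its own:
   MPI_Bcast(&a[k*n], n, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
   and the cleanup becomes a single free(a); */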
What grieves me the most is that f_column() is supposed to return a double, but the return value is undefined when rank != 0.
This comment caught my attention:
// It works!
//double try = f_column_non_parallel (rank, size, &least);
double try = f_column (rank, size, &least);
It suggests that the previous version of f_column() was working, and that you ran into troubles when attempting to parallelize it (I'm guessing that's what you're doing now).
How this could lead to a segfault is not immediately apparent to me though. I'd expect a floating point exception.
A couple of other points:
I'm not too comfortable with your memory allocation code (I'd probably use calloc() instead of malloc(), and sizeof() on explicit data types, etc...); it just freaks me out to see things like a[i] = malloc(sizeof (*a[i]) * n);, but it's just a matter of style, really.
You appear to have proper bound checking (indices over a are always positive and < n).
Oh, and you're redefining abs(), which is probably not a good idea: the standard library already declares abs() (for int) in <stdlib.h>, and fabs() from <math.h> already does for doubles what your version does.
Try to compile your code in debug mode and run it with gdb; also run it through valgrind if you can (Mac OS X should be supported by now).
You should probably take a closer look at your compiler warnings ;-)
I have a strange error when using MPI_Send. I get it when trying to send a portion of a bi-dimensional array (matrix): "MPI_matrixMultiplication.c:68:99: error: expected expression before ',' token".
The specific line is the one where I try to send a portion of the matrix: MPI_Send(&a[beginPosition][0], ... );
(As you can see, I have commented out the other send and receive calls related to the matrix.)
/////////////////////////////////////////////////////////
// multiplication of 2 matrices, parallelized using MPI //
/////////////////////////////////////////////////////////
#include <stdio.h>
#include <mpi.h>
// must use #define here, and not simply int blahblahblah, because "c" doesnt like ints for array dimension :(
#define matrixARowSize 3 // size of the row for matrix A
#define matrixAColumnSize 3 // size of the column for matrix A
#define matrixBRowSize 3 // size of the row for matrix B
#define matrixBColumnSize 3 // size of the column for matrix B
// tags used for sending/receiving data:
#define LOWER_BOUND 1 // first line to be processed
#define UPPER_BOUND 2 // last line to be processed
#define DATA // data to be processed
int a[matrixARowSize][matrixAColumnSize]; // matrix a
int b[matrixBRowSize][matrixBColumnSize]; // matrix b
int c[matrixARowSize][matrixBColumnSize]; // matrix c
int main()
{
int currentProcess; // current process
int worldSize; // world size
int i, j, k; // iterators
int rowsComputedPerProcess; // how many rows of the first matrix should be computed in each process
int numberOfSlaveProcesses; // the number of slave processes
int processesUsed; //how many processes of the available ones are actually used
MPI_Init(NULL, NULL); // MPI_Init()
MPI_Comm_size(MPI_COMM_WORLD, &worldSize); // get the world size
MPI_Comm_rank(MPI_COMM_WORLD, &currentProcess); // get current process
numberOfSlaveProcesses = worldSize - 1; // 0 is the master, rest are slaves
rowsComputedPerProcess = worldSize > matrixARowSize ? 1 : (matrixARowSize/numberOfSlaveProcesses);
processesUsed = worldSize > matrixARowSize ? matrixARowSize : numberOfSlaveProcesses;
/*
* in the first process (the father);
* initialize the 2 matrices, then start splitting the data to the slave processes
*/
if (!currentProcess) // in father process
{
printf("rows per process: %d\n", rowsComputedPerProcess);
printf("nr of processes used: %d\n", processesUsed);
// init matrix A
for(i = 0; i < matrixARowSize; ++i)
for(j = 0; j < matrixAColumnSize; ++j){
a[i][j] = i + j + 1;
// printf("%d\n", a[i][j]);
// printf("%d\n", *(a[i] + j));
}
// init matrix B
for(i = 0; i < matrixBRowSize; ++i)
for(j = 0; j < matrixBColumnSize; ++j)
b[i][j] = i + j + 1;
// start sending data to the slaves for them to work >:)
int beginPosition; // auxiliary values used for sending the offsets to slaves
int endPosition;
for(i = 1; i < processesUsed; ++i) // the last process is dealt with separately
{
beginPosition = (i - 1)*rowsComputedPerProcess;
endPosition = i*rowsComputedPerProcess;
MPI_Send(&beginPosition, 1, MPI_INT, i, LOWER_BOUND, MPI_COMM_WORLD);
MPI_Send(&endPosition, 1, MPI_INT, i, UPPER_BOUND, MPI_COMM_WORLD);
MPI_Send(&a[beginPosition][0], ((endPosition - beginPosition)*matrixARowSize), MPI_INT, i, DATA, MPI_COMM_WORLD);
// MPI_Send(a[beginPosition], (endPosition - beginPosition)*matrixARowSize, MPI_INT, i, DATA, MPI_COMM_WORLD);
// for(j = beginPosition; j < endPosition; ++j)
// for (k = 0; k < matrixAColumnSize; ++k)
// {
// printf("%d ", *(a[j] + k));
// }
// printf("\n");
// printf("beg: %d, end: %d\n", beginPosition, endPosition);
// printf(" data #%d\n", (endPosition - beginPosition)*matrixARowSize);
}
// deal with last process
beginPosition = (i - 1)*rowsComputedPerProcess;
endPosition = matrixARowSize;
MPI_Send(&beginPosition, 1, MPI_INT, i, LOWER_BOUND, MPI_COMM_WORLD);
MPI_Send(&endPosition, 1, MPI_INT, i, UPPER_BOUND, MPI_COMM_WORLD);
// MPI_Send(a[beginPosition], (endPosition - beginPosition)*matrixARowSize, MPI_INT, i, DATA, MPI_COMM_WORLD);
// printf("beg: %d, end: %d\n", beginPosition, endPosition);
// printf(" data #%d\n", (endPosition - beginPosition)*matrixARowSize);
}
else { // if this is a slave (rank > 0)
int beginPosition; // auxiliary values used for sending the offsets to slaves
int endPosition;
MPI_Recv(&beginPosition, 1, MPI_INT, 0, LOWER_BOUND, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Recv(&endPosition, 1, MPI_INT, 0, UPPER_BOUND, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// MPI_Recv(a[beginPosition], (endPosition - beginPosition)*matrixARowSize, 0, DATA, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
for(i = beginPosition; i < endPosition; ++i) {
for (j = 0; j < matrixAColumnSize; ++j)
printf("(# %d, i=%d, j=%d: %d ", currentProcess, i, j, a[i][j]);
// printf("\n");
}
}
MPI_Finalize();
return 0; // bye-bye
}
Your DATA constant is empty.
#define DATA // data to be processed
So you're trying to do :
MPI_Send(&a[beginPosition][0], ((endPosition - beginPosition)*matrixARowSize), MPI_INT, i, , MPI_COMM_WORLD);
Which logically generates an expected expression before ',' token error.
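Giving the macro an actual value fixes it; the number itself is arbitrary (3 below is just a hypothetical choice), it only has to differ from the other tags so the matching receive can find it:
#define DATA 3   /* data to be processed */
MPI_Send(&a[beginPosition][0],
         (endPosition - beginPosition) * matrixAColumnSize,   /* rows sent * ints per row */
         MPI_INT, i, DATA, MPI_COMM_WORLD);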
I am trying to compute a Fourier transform with the planner fftw_mpi_plan_dft_r2c_2d of FFTW 3.3. Unfortunately, I cannot make it work. The result is correct if N0 is equal to the number of processes (nb_proc) but is wrong when N0 != nb_proc.
An example showing my problem:
#include <stdio.h>
#include <complex.h>
#include <fftw3-mpi.h>
int main(int argc, char **argv)
{
/* if N0 (=ny) is equal to nb_proc, result are OK */
/* if N0 is not equal to nb_proc => bug */
const ptrdiff_t N0 = 4, N1 = 4;
int coef_norm = N0*N1;
fftw_plan plan_forward;
double *carrayX;
fftw_complex *carrayK;
ptrdiff_t n_alloc_local, i, j;
ptrdiff_t nX0loc, iX0loc_start, nK0loc, nK1loc;
/* X and K denote physical and Fourier spaces. */
int rank, nb_proc, irank;
MPI_Init(&argc, &argv);
fftw_mpi_init();
/*DETERMINE RANK OF THIS PROCESSOR*/
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
/*DETERMINE TOTAL NUMBER OF PROCESSORS*/
MPI_Comm_size(MPI_COMM_WORLD, &nb_proc);
if (rank==0) printf("program test_fftw3_2Dmpi_simple\n");
printf("I'm rank (processor number) %i of size %i\n", rank, nb_proc);
n_alloc_local = fftw_mpi_local_size_2d(N0, N1/2+1, MPI_COMM_WORLD,
&nX0loc, &iX0loc_start);
carrayX = fftw_alloc_real(2 * n_alloc_local);
carrayK = fftw_alloc_complex(n_alloc_local);
/* create plan for out-of-place r2c DFT */
plan_forward = fftw_mpi_plan_dft_r2c_2d(N0, N1,
carrayX, carrayK,
MPI_COMM_WORLD,
FFTW_MEASURE);
nK0loc = nX0loc;
nK1loc = N1/2+1;
/* initialize carrayX to a constant */
for (i = 0; i < nX0loc; ++i) for (j = 0; j < N1; ++j)
carrayX[i*N1 + j] = 1.;
/* compute forward transform and normalize */
fftw_execute(plan_forward);
for (i = 0; i < nK0loc; ++i) for (j = 0; j < nK1loc; ++j)
carrayK[i*nK1loc + j] = carrayK[i*nK1loc + j]/coef_norm;
/* print carrayK, there should be only one 1 in the first case for rank=0 */
for (irank = 0; irank<nb_proc; irank++)
{
MPI_Barrier(MPI_COMM_WORLD);
if (rank == irank)
{
for (i = 0; i < nK0loc; ++i) for (j = 0; j < nK1loc; ++j)
{
printf("rank = %i, carrayK[%ti*nK1loc + %ti] = (%6.4f, %6.4f)\n",
rank, i, j,
creal(carrayK[i*nK1loc + j]),
cimag(carrayK[i*nK1loc + j]));
}
printf("\n");
}
}
MPI_Barrier(MPI_COMM_WORLD);
fftw_destroy_plan(plan_forward);
MPI_Finalize();
}
There is something wrong in this example but I don't understand what.
For this case (N0 = 4, N1 = 4), the results are correct with
mpirun -np 4 ./test_fftw3_2Dmpi_simple
but not with
mpirun -np 2 ./test_fftw3_2Dmpi_simple
PS: same thing with the flag FFTW_MPI_TRANSPOSED_OUT.
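One thing worth checking (a guess based on the layout documented in the FFTW manual, not a confirmed answer): in the MPI r2c interface the last dimension of the real array is stored padded to 2*(N1/2+1) elements even for out-of-place transforms, so the row stride of carrayX is not N1. When N0 == nb_proc each process holds a single row and the stride never comes into play, which would explain why only that case looks correct. Under that assumption the initialisation would index as:
for (i = 0; i < nX0loc; ++i) for (j = 0; j < N1; ++j)
    carrayX[i*(2*(N1/2+1)) + j] = 1.;   /* row stride is the padded length, not N1 */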