I first initialize a 4x4 matrix and then try to send its first 2x2 block to the slave process using MPI in C. However, the slave process only receives the first row of the block; the second row is filled with random values from memory. I couldn't find what is missing. The code of the program is below:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define SIZE 4
// Demo: rank `root` describes the top-left 2x2 block of a 4x4 int table with an
// MPI subarray datatype and sends it to rank 1. Every non-root rank posts a
// receive from `root` and prints the top-left 2x2 window of its own 4x4 buffer.
int main(int argc, char** argv)
{
int rank, nproc;
const int root = 0;
const int tag = 3;
int** table;
int* datas;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
// One contiguous SIZE*SIZE buffer; table[i] is a row pointer into it, so
// &table[0][0] addresses the whole matrix as a flat array.
datas = malloc(SIZE * SIZE * sizeof(int));
table = malloc(SIZE * sizeof(int*));
for (int i = 0; i < SIZE; i++)
table[i] = &(datas[i * SIZE]);
// Zero the table, then set a few recognisable non-zero entries.
for (int i = 0; i < SIZE; i++)
for (int k = 0; k < SIZE; k++)
table[i][k] = 0;
table[0][1] = 1;
table[0][2] = 2;
table[1][0] = 3;
table[2][3] = 2;
table[3][1] = 3;
table[3][2] = 4;
if (rank == root){
// Sender side: newtype selects the 2x2 block starting at (0, 0) inside a
// 4x4 row-major int array; one element of that type is sent to rank 1.
MPI_Datatype newtype;
int sizes[2] = { 4, 4 }; // size of table
int subsizes[2] = { 2, 2 }; // size of sub-region
int starts[2] = { 0, 0 };
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &newtype);
MPI_Type_commit(&newtype);
MPI_Send(&(table[0][0]), 1, newtype, 1, tag, MPI_COMM_WORLD);
}
else{
// Receiver side: a second contiguous 4x4 buffer with row pointers.
// local_datas comes from malloc and is not initialised.
int* local_datas = malloc(SIZE * SIZE * sizeof(int));
int** local = malloc(SIZE * sizeof(int*));
for (int i = 0; i < SIZE; i++)
local[i] = &(local_datas[i * SIZE]);
// This receives 4 plain MPI_INTs, stored back-to-back starting at
// local[0][0], i.e. into local[0][0..3]. local[1][0] and local[1][1] are
// never written, so the second printed row shows uninitialised memory.
// Only rank 1 is sent a message above; any other non-root rank has no
// matching send in this program.
// NOTE(review): MPI_Recv takes a single status; the MPI standard's ignore
// constant for that is MPI_STATUS_IGNORE (MPI_STATUSES_IGNORE is for the
// array-of-statuses calls) -- confirm.
MPI_Recv(&(local[0][0]), 4, MPI_INT, root, tag, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
for (int i = 0; i < 2; i++){
for (int k = 0; k < 2; k++)
printf("%3d ", local[i][k]);
printf("\n");
}
}
// No free() or MPI_Type_free() calls appear before finalisation.
MPI_Finalize();
return 0;
}
You have instructed the receive operation to put four integer values consecutively in memory and therefore the 2x2 block is converted to a 1x4 row upon receive (since local is 4x4). The second row of local contains random values since the memory is never initialised.
You should either make use of MPI_Type_create_subarray in both the sender and the receiver in order to place the received data in a 2x2 block or redefine local to be a 2x2 matrix instead of 4x4.
Related
The MPI documentation states that the address of the receive buffer (recvbuf) is significant only at root, meaning that the memory need not be allocated in the other processes. This is confirmed by this question.
// Prototype as quoted from the MPI documentation; recvbuf is the argument
// described there as significant only at the root rank.
int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
MPI_Op op, int root, MPI_Comm comm)
At first I thought that recvbuf did not even have to exist: that the memory for recvbuf itself did not have to be allocated (eg by dynamical allocation). Unfortunately (it took me a lot of time to understand my mistake!), it seems that even if the memory that it points to is not valid, the pointer itself has to exist.
See below for the code I have in mind, with a version that gives a segfault, and one that does not.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
// Reduces an n1 x n2 table of longs row by row onto rank 0 with MPI_SUM.
// The table is stored as an array of n1 separately allocated rows. Compiling
// with -DOLD selects the variant that crashes; the default variant allocates
// the row-pointer array on every rank.
int main(int argc, char **argv) {
// MPI initialization
int world_rank, world_size;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int n1 = 3, n2 = 10; // Sizes of the 2d arrays
// Local contribution: n1 row pointers, each row a separate calloc of n2 longs.
long **observables = (long **) malloc(n1 * sizeof(long *));
for (int k = 0 ; k < n1 ; ++k) {
observables[k] = (long *) calloc(n2, sizeof(long));
for (long i = 0 ; i < n2 ; ++i) {
observables[k][i] = k * i * world_rank; // Whatever
}
}
long **obs_sum; // This will hold the sum on process 0
#ifdef OLD // Version that gives a segfault
// obs_sum is assigned on rank 0 only; on every other rank the pointer variable
// itself stays uninitialised, so the obs_sum[k] expression in the reduce loop
// below dereferences an uninitialised pointer.
// Note: n2 (not n1) row pointers and rows are allocated; the loop below only
// uses the first n1 of them.
if (world_rank == 0) {
obs_sum = (long **) malloc(n2 * sizeof(long *));
for (int k = 0 ; k < n2 ; ++k) {
obs_sum[k] = (long *) calloc(n2, sizeof(long));
}
}
#else // Correct version
// We define all the pointers in all the processes.
// The row buffers themselves are still allocated on rank 0 only; on other
// ranks obs_sum[k] holds whatever malloc returned in that slot and is passed
// to MPI_Reduce without being a valid buffer.
obs_sum = (long **) malloc(n2 * sizeof(long *));
if (world_rank == 0) {
for (int k = 0 ; k < n2 ; ++k) {
obs_sum[k] = (long *) calloc(n2, sizeof(long));
}
}
#endif
// One collective per row: n2 longs from observables[k] are summed into
// obs_sum[k] on rank 0.
for (int k = 0 ; k < n1 ; ++k) {
// This is the line that results in a segfault if OLD is defined
MPI_Reduce(observables[k], obs_sum[k], n2, MPI_LONG, MPI_SUM, 0,
MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
// You may free memory here
return 0;
}
Am I interpreting this correctly? What is the rationale behind this behavior?
The problem is not MPI, but the fact that you are passing obs_sum[k], but you haven't defined/allocated it at all.
for (int k = 0 ; k < n1 ; ++k) {
// This is the line that results in a segfault if OLD is defined
// obs_sum[k] is an ordinary C argument expression: it is evaluated on every
// rank before MPI_Reduce is entered, whether or not MPI later uses the value.
MPI_Reduce(observables[k], obs_sum[k], n2, MPI_LONG, MPI_SUM, 0,
MPI_COMM_WORLD);
}
Even if MPI_Reduce() is not getting its value, the generated code will get obs_sum (undefined and not allocated), add k to it and try to read this pointer (segfault) to be passed to MPI_Reduce().
For example the allocation of the rows should be sufficient for it to work:
#else // Correct version
// We define all the pointers in all the processes.
obs_sum = (long **) malloc(n2 * sizeof(long *));
// try commenting out the following lines
// NOTE(review): with the row allocation removed on rank 0 as well, the root
// would pass an uninitialised obs_sum[k] to MPI_Reduce as its receive buffer.
// Presumably this experiment is only meant to show that evaluating obs_sum[k]
// no longer faults once the pointer array exists -- confirm before relying on it.
// if (world_rank == 0) {
// for (int k = 0 ; k < n2 ; ++k) {
// obs_sum[k] = (long *) calloc(n2, sizeof(long));
// }
// }
#endif
I would allocate a 2D array as a flat array - I really hate this array-of-arrays representation. Wouldn't this be better?
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
/*
 * Sums an n1 x n2 table of longs element-wise across all ranks onto rank 0.
 *
 * The table is stored as one flat, row-major array (element (k, i) lives at
 * index k * n2 + i), so a single MPI_Reduce over n1 * n2 elements replaces
 * the per-row loop. The receive buffer exists only on rank 0; every other
 * rank passes NULL, which is fine because recvbuf is significant only at
 * the root.
 *
 * Returns 0 on success; aborts the MPI job if an allocation fails.
 */
int main(int argc, char **argv) {
    (void) argc;
    (void) argv;

    // MPI initialization
    int world_rank, world_size;
    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    int n1 = 3, n2 = 10; // Sizes of the 2d arrays
    size_t count = (size_t) n1 * (size_t) n2;

    // Casts on malloc are kept so the snippet also builds as C++.
    long *observables = (long *) malloc(count * sizeof(long));
    if (observables == NULL) {
        fprintf(stderr, "rank %d: out of memory\n", world_rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        return EXIT_FAILURE;
    }
    for (int k = 0 ; k < n1 ; ++k) {
        for (long i = 0 ; i < n2 ; ++i) {
            observables[k*n2+i] = k * i * world_rank; // Whatever
        }
    }

    // NULL rather than nullptr: nullptr is not a C keyword before C23.
    long *obs_sum = NULL; // This will hold the sum on process 0
    if (world_rank == 0) {
        obs_sum = (long *) malloc(count * sizeof(long));
        if (obs_sum == NULL) {
            fprintf(stderr, "rank %d: out of memory\n", world_rank);
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
            return EXIT_FAILURE;
        }
    }

    MPI_Reduce(observables, obs_sum, n1*n2, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    // free(NULL) is a no-op, so this is safe on the non-root ranks too.
    free(obs_sum);
    free(observables);

    MPI_Finalize();
    return 0;
}
What is the correct way to handle a sendcount = 0 when using MPI_Gatherv (or any other function that requires a sendcount) when setting up the displs argument?
I have data that needs to be received by all processors, but all processors might not have any data to send themselves. As an MWE, I tried (on just two processors):
#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>
// Two-step all-gather of variable-length int data:
//   1. every task shares its element count n, so rbuf[i] is task i's count;
//   2. counts are prefix-summed into displacements and the payloads gathered
//      into master_data on every task.
// Only task 1 has data to send (n = 3); all other tasks keep n = 0.
int main(void)
{
int ntasks;
int thistask;
int n = 0;
int i;
int totcounts = 0;
// data is assigned only inside the thistask == 1 branch below; on every other
// task it remains an uninitialised pointer.
int *data;
int *rbuf;
int *rcnts;
int *displs;
int *master_data;
int *master_displs;
// Set up mpi
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &ntasks);
MPI_Comm_rank(MPI_COMM_WORLD, &thistask);
// Allocate memory for arrays needed by allgatherv
rbuf = calloc(ntasks, sizeof(int));
rcnts = calloc(ntasks, sizeof(int));
displs = calloc(ntasks, sizeof(int));
master_displs = calloc(ntasks, sizeof(int));
// Initialize the counts and displacement arrays
// (one int per task, task i's value stored at index i)
for(i = 0; i < ntasks; i++)
{
rcnts[i] = 1;
displs[i] = i;
}
// Allocate data on just one task, but not others
if(thistask == 1)
{
n = 3;
data = calloc(n, sizeof(int));
for(i = 0; i < n; i++)
{
data[i] = i;
}
}
// Get n so each other processor knows about what others are sending
MPI_Allgatherv(&n, 1, MPI_INT, rbuf, rcnts, displs, MPI_INT, MPI_COMM_WORLD);
// Now that we know how much data each processor is sending, we allocate the array
// to hold it all
for(i = 0; i < ntasks; i++)
{
totcounts += rbuf[i];
}
master_data = calloc(totcounts, sizeof(int));
// Get displs for master data
// Exclusive prefix sum of the counts: a task that sends 0 elements gets the
// same displacement as the task after it.
master_displs[0] = 0;
for(i = 1; i < ntasks; i++)
{
master_displs[i] = master_displs[i - 1] + rbuf[i - 1];
}
// Send each processor's data to all others
// NOTE(review): the send buffer is &data, i.e. the address of the pointer
// variable itself (an int **), not the array that data points to. On task 1
// the n ints are therefore read from the pointer's own storage and beyond
// rather than from the calloc'ed array -- passing data looks intended.
MPI_Allgatherv(&data, n, MPI_INT, master_data, rbuf, master_displs, MPI_INT, MPI_COMM_WORLD);
// Print it out to see if it worked
if(thistask == 0)
{
for(i = 0; i < totcounts; i++)
{
printf("master_data[%d] = %d\n", i, master_data[i]);
}
}
// Free
// (data was allocated on task 1 only, hence the guard)
if(thistask == 1)
{
free(data);
}
free(rbuf);
free(rcnts);
free(displs);
free(master_displs);
free(master_data);
MPI_Finalize();
return 0;
}
The way that I've set up master_displs works when every processor has a non-zero n (that is, they have data to send). In this case, both entries will be zero. However, the results of this program are garbage. How would I set up the master_displs array to ensure that master_data holds the correct information (in this case, just master_data[i] = i, as received from task 1)?
I know this has been answered many times before and there is a comprehensive answer here which I have read and attempted to use but I just can't get my code to work for some reason.
I have stripped my code down a bit to make it a bit easier to follow, but basically what I am trying to do is have each process initialise a sub-array and work on it, then put the whole big array back together on rank 0. MPI_Gatherv is giving me a segfault and I cannot figure out why.
Any help would be greatly appreciated.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
#define N 32
// Fills site[i][j] for 0 <= i < row, 0 <= j < col with
// (drand48()/4294967295.0 + 0.5) * 2 * M_PI.
//   site: array of `row` row pointers, each addressing at least `col` doubles
//   row, col: extent of the region to initialise
// NOTE(review): POSIX documents drand48() as returning values in [0.0, 1.0),
// so dividing by 4294967295.0 makes the random term negligible and every site
// comes out at roughly M_PI. The divisor looks like a leftover from an integer
// generator -- confirm the intended range. No seeding call is visible here.
void init_lattice(double **site, int row, int col){
int i,j;
for(i=0; i<row; i++){
for(j=0; j<col; j++){
site[i][j]=(drand48()/4294967295.0 + 0.5)*2*M_PI;
}
}
}
// Each rank initialises a block_len x block_width block of an N x N lattice,
// then the blocks are gathered into a full N x N array on rank 0 using a
// resized subarray datatype, and rank 0 prints the result.
int main(int argc, char *argv[]){
int nprocs, rank;
MPI_Init(&argc, &argv);
MPI_Comm_size (MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank (MPI_COMM_WORLD, &rank);
int dim = 2;
int grid[dim];
// Zero entries let MPI_Dims_create choose both grid dimensions.
grid[0]=0;
grid[1]=0;
// Assign the grid dimensions
MPI_Dims_create(nprocs, dim, grid);
// Printed by every rank, not just rank 0.
printf("Dim grid: length: %d, width: %d\n", grid[0], grid[1]);
// The new communicator
MPI_Comm comm_grid;
// Allow cyclic behavior
int periodic[dim];
periodic[0] = 1;
periodic[1] = 1;
// Create the communicator
// (comm_grid is not used again in this function; the gather below runs on
// MPI_COMM_WORLD.)
MPI_Cart_create(MPI_COMM_WORLD, dim, grid, periodic, 0, &comm_grid);
int block_len, block_width;
// Integer division: any remainder of N / grid[...] is dropped.
block_len = N/grid[1];
block_width = N/grid[0];
int i, j;
//Create lattice subset
// One contiguous block_len * block_width buffer with row pointers into it.
double *data = (double *) malloc (block_len * block_width * sizeof(double));
double **site = (double **) malloc (block_len * sizeof(double *));
for (i = 0; i < block_len; i++)
site[i] = & (data[i * block_width]);
//Initialise lattice
init_lattice(site, block_len, block_width);
// newtype: a block_len x block_width block at (0, 0) of an N x N double array.
// subtype: the same layout with its extent reset to N/grid[1] doubles (in
// bytes), so the receive displacements below are counted in those units.
MPI_Datatype newtype, subtype;
int sizes[dim];
sizes[0]=N;
sizes[1]=N;
int subsizes[dim];
subsizes[0] = block_len;
subsizes[1] = block_width;
int starts[dim];
starts[0] = 0;
starts[1] = 0;
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &newtype);
MPI_Type_create_resized(newtype, 0, N/grid[1]*sizeof(double), &subtype);
MPI_Type_commit(&subtype);
// Receive counts and displacements for the gather; filled on rank 0 only and
// left uninitialised elsewhere.
int sendcounts[grid[0]*grid[1]];
int displs[grid[0]*grid[1]];
if (rank == 0) {
for (i=0; i<grid[0]*grid[1]; i++) sendcounts[i] = 1;
int disp = 0;
for (i=0; i<grid[0]; i++) {
for (j=0; j<grid[1]; j++) {
// NOTE(review): the flat index uses grid[0] as the row stride while j runs
// up to grid[1]; the two only agree for a square process grid -- confirm.
displs[i*grid[0]+j] = disp;
disp += 1;
}
disp += ((N/grid[1])-1)*grid[0];
}
}
//Create global lattice
// Allocated on every rank, although only rank 0 receives into it.
double *global_data = (double *) malloc (N * N * sizeof(double));
double **global_site = (double **) malloc (N * sizeof(double *));
for (i = 0; i < N; i++)
global_site[i] = & (global_data[i * N]);
// Every rank sends N*N/(grid[0]*grid[1]) plain doubles; rank 0 receives one
// subtype element per rank at the displacements computed above.
MPI_Gatherv(&(site[0][0]), N*N/(grid[0]*grid[1]), MPI_DOUBLE, &(global_site[0][0]), sendcounts, displs, subtype, 0, MPI_COMM_WORLD);
if(rank==0){
printf("Rank: %d\n", rank);
for(i=0; i<N; i++){
for(j=0; j<N; j++){
printf("%.2lf ", global_site[i][j]);
}
printf("\n");
}
}
// NOTE(review): no MPI_Finalize(), MPI_Type_free() or free() appears before
// this return.
return 0;
}
EDIT:
Ok so I have changed my array allocations to contiguous memory and everything is working as it should now. Thanks talonmies!
The fundamental problem here is that MPI expects all allocations to be contiguous blocks of memory. Your site and global_site arrays are not, they are arrays of pointers. The MPI routines are just reading past the end of each individual row allocation and causing your segfault.
If you want to allocate an n x n array to use with the MPI then you need to replace this:
// Array-of-rows layout: N separate heap allocations, so consecutive rows are
// not guaranteed to be adjacent in memory. Allocated on rank 0 only.
double **global_site;
if(rank==0){
global_site = malloc(sizeof(double *)*(N));
for(i=0; i<N; i++)
global_site[i] = malloc(sizeof(double)*(N));
}
with something like this:
// Single contiguous block of N * N doubles; with row-major indexing, element
// (i, j) is global_site[i * N + j].
double *global_site = malloc(sizeof(double)*(N * N));
You will obviously need to adjust the rest of your code accordingly.
It seems the only reason you are actually using arrays of pointers is for the convenience of [i][j] style 2D indexing. If you use linear or pitched linear memory, you can easily make a little preprocessor macro or helper function which can give you that style of indexing into row or column major ordered storage which is still compatible with MPI.
I am trying to use MPI to distribute the work for bucket sort. When I scatter the array, I wanted each process to receive a single bucket (int array) and be able to print its content. However, my current program prints out incorrect values, which make me think I am not indexing into the memory I want. Can someone help explain how I can properly index into the array I am passing to each process or how I am doing this incorrectly?
#define MAX_VALUE 64
#define N 32
// Bucket-sort setup: rank 0 draws N random ints in [0, MAX_VALUE), files each
// into one of 16 buckets (bucket = value / 4), then the buckets are scattered
// and the per-bucket counts broadcast so each rank can print its bucket.
// Note: main is declared without a return type, and no #include lines are
// part of this excerpt.
main(int argc, char *argv[]){
MPI_Init(&argc, &argv); //initialize MPI environment
// sendArray is an array of 16 row pointers, not a flat int buffer; the rows
// are allocated separately further down, on rank 0 only.
int** sendArray = malloc(16*sizeof(int *));
int *arrayIndex = (int *) malloc(16*sizeof(int));
int *receiveArray = (int *) malloc(N*sizeof(int));
int nps, myrank;
MPI_Comm_size(MPI_COMM_WORLD, &nps);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int i;
if(myrank == 0)
{
//create an array that stores the number of values in each bucket
for( i = 0; i < 16; i++){
arrayIndex[i] = 0;
}
int bucket =0;
int temp = 0;
//creates an int array within each array index of sendArray
for( i = 0; i < 16; i++){
sendArray[i] = (int *)malloc(N * sizeof(int));
}
//Create a random int array with values ranging from 0 to MAX_VALUE
// temp is in [0, MAX_VALUE), so temp / 4 is a bucket index in [0, 16).
// arrayIndex[bucket] is the next free slot in that bucket.
for(i = 0; i < N; i++){
temp= rand() % MAX_VALUE;
bucket = temp/4;
printf("assigning %d to index [%d][%d]\n", temp, bucket, arrayIndex[bucket]);
sendArray[bucket][arrayIndex[bucket]]= temp;
arrayIndex[bucket] = arrayIndex[bucket] + 1;
}
// NOTE(review): the send buffer here is the pointer array itself, so the
// bytes transferred are pointer values rather than bucket contents. The send
// count (16) also differs from the receive count (N).
// NOTE(review): as the braces stand, these collective calls are still inside
// the myrank == 0 branch, and the final closing brace below leaves either
// that branch or main unclosed -- one `}` appears to be missing from the
// excerpt, most likely just before MPI_Scatter.
MPI_Scatter(sendArray, 16, MPI_INT, receiveArray, N, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(arrayIndex, 16, MPI_INT, 0, MPI_COMM_WORLD);
printf("bucket %d has %d values\n", myrank, arrayIndex[myrank]);
for( i = 0; i < arrayIndex[myrank]; i++){
printf("bucket %d index %d has value %d\n", myrank, i, receiveArray[i]);
}
}
What you are trying to do doesn't work because MPI always sends only the data you point to. It does not follow the pointers in your sendArray.
In your example you could just make your sendArray bigger, namely 16 * N, and put all your data into that contiguous array. That way you have a one-dimensional array, but that should not be a problem in the code you gave us because all buckets have the same length, so you can access element j of bucket i with sendArray[i * N + j].
Also, in most cases send_count should be equal to recv_count. In your case that would be N. The correct MPI call would be
// Matching send and receive counts of N ints per rank; this only delivers
// bucket contents if sendArray is a flat int buffer of 16 * N elements.
MPI_Scatter(sendArray, N, MPI_INT, receiveArray, N, MPI_INT, 0, MPI_COMM_WORLD);
I am trying to send a 2D integer array of arbitrary length from slave processes to the master but I keep getting a segmentation fault. As MPI is quite difficult to debug, I'm not certain that the issue has to do with the send/recv but if it's not that then it will have to be with the way I am allocating the arrays themselves.
I followed a previous question on here in regards to ensuring that the memory allocated to the array is contiguous but that still didn't fix the segmentation fault.
Below are some sections of my code:
Create array:
/*
 * Allocates a sizeX-by-sizeY matrix of int that can be indexed as
 * array[x][y] while its elements occupy ONE contiguous buffer, so that
 * &array[0][0] can be handed to MPI as a flat array of sizeX * sizeY ints.
 *
 * Parameters:
 *   sizeX  number of rows (first index); must be > 0
 *   sizeY  number of ints per row (second index); must be > 0
 *
 * Returns the row-pointer table, or NULL if a size is not positive, the
 * total size would overflow size_t, or an allocation fails. The element
 * values are uninitialised.
 *
 * Ownership: the caller releases the matrix with
 *   free(array[0]);   // the contiguous data buffer
 *   free(array);      // the row-pointer table
 */
int** create2DArray(int sizeX, int sizeY)
{
    if (sizeX <= 0 || sizeY <= 0)
        return NULL;

    /* Do the size arithmetic in size_t; int * int could overflow first. */
    size_t rows = (size_t) sizeX;
    size_t cols = (size_t) sizeY;
    if (cols > (size_t) -1 / sizeof(int) / rows)
        return NULL;

    int *data = malloc(rows * cols * sizeof *data);
    if (data == NULL)
        return NULL;

    int **array = malloc(rows * sizeof *array);
    if (array == NULL) {
        free(data);
        return NULL;
    }

    /* Row i starts cols * i elements into the shared buffer. */
    for (size_t i = 0; i < rows; i++)
        array[i] = &data[cols * i];

    return array;
}
Initialise arrays:
// Rank 0 sets up the X11 display and owns the full X_RESN x Y_RESN image.
// Every other rank gets a strip of xPixels rows (first index) covering
// [xStart, xFinish), where xStart = xPixels * (rank - 1).
if(rank==0)
{
display = x11setup(&win, &gc, width, height);
pixels = create2DArray(X_RESN, Y_RESN);
}
else
{
xStart = xPixels * (rank - 1);
xFinish = xStart + xPixels;
pixels = create2DArray(xPixels, Y_RESN);
}
Send:
// Sends xPixels * Y_RESN ints, read contiguously from pixels[0][0], to rank 0.
MPI_Send(&pixels[0][0], xPixels * Y_RESN, MPI_INT, 0, type, MPI_COMM_WORLD);
Recv:
// Collects one strip from each sender rank i = 1 .. processes-1 and copies it
// into rows [xStart, xFinish) of pixels (presumably executed on rank 0, the
// destination of the matching sends -- confirm against the enclosing code).
for(i = 1; i < processes; i++)
{
// Fresh xPixels x Y_RESN staging buffer per sender; no matching free appears
// in this excerpt.
int** pixelChunk = create2DArray(xPixels, Y_RESN);
MPI_Recv(&pixelChunk[0][0], xPixels * Y_RESN, MPI_INT, i, type, MPI_COMM_WORLD, &status);
int xStart = xPixels * (i - 1);
int xFinish = xStart + xPixels;
int k;
for(j = xStart; j < xFinish; j++)
{
for(k = 0; k < Y_RESN; k++)
{
// NOTE(review): the chunk row index parses as j - xPixels * i + 1. For i = 1
// and j = xStart = 0 that is 1 - xPixels, which is negative whenever
// xPixels > 1 and so indexes outside pixelChunk. A local row index of
// j - xStart, i.e. j - xPixels * (i - 1), looks intended.
pixels[j][k] = pixelChunk[j - (xPixels * i - 1)][k];
}
}
}
This line looks suspicious:
pixels[j][k] = pixelChunk[j - (xPixels * i - 1)][k];
For example, say we have np = 2, so we're left with a single chunk, then
i = 1;
xStart = 0;
j = 0;
xPixels = 600;
pixelChunk[0 - (600 * 1 - 1)][k] == pixelChunk[-599][k]
Doesn't look right, does it?
This?
pixels[j][k] = pixelChunk[j - xPixels * (i - 1)][k];
The send/recv code itself is probably all right.