MPI - C send 2D array of structures - c

I managed to write working code for sending one structure with MPI, but what I need is to send a two-dimensional array of these structures, and I'm stuck.
Here's my code for sending one struct. Can you guide me on how to modify it?
/* One pixel made of three single-byte members, laid out in r, g, b order. */
typedef struct {
    unsigned char r, g, b;
} pixel;
MPI_Datatype mpi_pixel;      /* derived datatype describing one pixel */
pixel send_pixel;            /* the structure instance that is sent */
int lengtharray[3];          /* number of elements in each block */
MPI_Aint disparray[3];       /* byte displacement of each block */
MPI_Datatype typearray[3];   /* MPI datatype of each block */
MPI_Aint startaddress, address;

/* Every block holds exactly one unsigned char. */
lengtharray[0] = lengtharray[1] = lengtharray[2] = 1;
typearray[0] = typearray[1] = typearray[2] = MPI_UNSIGNED_CHAR;

/*
 * Displacements must be measured from the start of the structure, because
 * &send_pixel is the buffer address handed to MPI_Send below.
 * MPI_Get_address replaces MPI_Address, which was removed in MPI-3.0.
 */
MPI_Get_address(&send_pixel, &startaddress);
MPI_Get_address(&send_pixel.r, &address);
disparray[0] = address - startaddress;   /* displacement of r */
MPI_Get_address(&send_pixel.g, &address);
disparray[1] = address - startaddress;   /* displacement of g */
MPI_Get_address(&send_pixel.b, &address);
disparray[2] = address - startaddress;   /* displacement of b */

/* Build the datatype (MPI_Type_create_struct replaces the removed MPI_Type_struct). */
MPI_Type_create_struct(3, lengtharray, disparray, typearray, &mpi_pixel);
/* A derived datatype has to be committed before it may be used to communicate. */
MPI_Type_commit(&mpi_pixel);

MPI_Send(&send_pixel, 1, mpi_pixel, 0, 50, MPI_COMM_WORLD);

/* Release the datatype once it is no longer needed. */
MPI_Type_free(&mpi_pixel);

There are a few different ways to send this. There are a few answers on SO that explain some of them, here, here.
If you want to continue on along the lines that you are doing, I'd create a contiguous type for a 1D array and then expand that with another contiguous type to a 2D array.
A BIG BIG WARNING I do not check for errors, you REALLY REALLY SHOULD.
I've changed your code around a bit, as I don't like using typedefs of structs. I also put the creation of the basic MPI pixel datatype into a function and added some test sending routines (of course, you could extend them to pass in the pixel(s) you want to send):
/*
 * Create a MPI datatype of a pixel.
 *
 * mpi_pixel: out parameter; receives a committed datatype that describes the
 *            r, g and b members of struct pixel_s.
 *
 * Returns MPI_SUCCESS, or the error code of the first MPI call that failed.
 */
int
mpi_pixel_init(MPI_Datatype *mpi_pixel)
{
    int count = 3;                  /* number of blocks in the struct */
    int blocks[3] = {1, 1, 1};      /* one element per block */
    MPI_Datatype types[3] = {       /* pixel internal types */
        MPI_UNSIGNED_CHAR,
        MPI_UNSIGNED_CHAR,
        MPI_UNSIGNED_CHAR
    };
    MPI_Aint dis[3] = {             /* internal displacements */
        offsetof(struct pixel_s, r),
        offsetof(struct pixel_s, g),
        offsetof(struct pixel_s, b)
    };
    int rc;

    rc = MPI_Type_create_struct(count, blocks, dis, types, mpi_pixel);
    if (rc != MPI_SUCCESS)
        return rc;

    /* The type must be committed before it can be used in communication. */
    return MPI_Type_commit(mpi_pixel);
}
Test sending a single pixel:
/*
 * Send a single pixel.
 *
 * The process whose rank is `src` fills one pixel and sends it to `dst`;
 * the process whose rank is `dst` receives it and prints its members.
 * All other ranks do nothing.  `mpixel` is the datatype of one pixel.
 *
 * Returns 0.
 */
int
send_pixel(int src, int dst, MPI_Datatype mpixel)
{
    int rank = 0;
    struct pixel_s x = {0};
    MPI_Status stat;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == src) {
        x.r = 255;
        x.g = 128;
        x.b = 128;
        /* Address the peer through the parameters, not hard-coded ranks. */
        MPI_Send(&x, 1, mpixel, dst, 1, MPI_COMM_WORLD);
    } else if (rank == dst) {
        MPI_Recv(&x, 1, mpixel, src, MPI_ANY_TAG, MPI_COMM_WORLD, &stat);
        printf("Single pixel\n");
        printf("%d:\tr: %d\tg: %d\tb: %d\n", rank, x.r, x.g, x.b);
    }

    return 0;
}
Test sending a row of pixels:
/*
 * Send a row/1D of pixels.
 *
 * The process whose rank is `src` fills ROWS pixels and sends them as ONE
 * element of `cpixel` to `dst`; the process whose rank is `dst` receives
 * and prints them.  All other ranks do nothing.
 *
 * Returns 0.
 */
int
send_1d_pixels(int src, int dst, MPI_Datatype cpixel)
{
    int i = 0;
    int rank = 0;
    struct pixel_s x[ROWS] = {0};
    MPI_Status stat;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Test sending a row of pixels */
    if (rank == src) {
        for (i = 0; i < ROWS; ++i) {
            x[i].r = i;
            x[i].g = i + 128;
            x[i].b = 255 - i;
        }
        /* Address the peer through the parameters, not hard-coded ranks. */
        MPI_Send(x, 1, cpixel, dst, TAG, MPI_COMM_WORLD);
    } else if (rank == dst) {
        MPI_Recv(x, 1, cpixel, src, MPI_ANY_TAG, MPI_COMM_WORLD, &stat);
        printf("Row of pixels\n");
        for (i = 0; i < ROWS; ++i) {
            printf("%d:\tr: %d\tg: %d\tb: %d\n", i,
                   x[i].r, x[i].g, x[i].b);
        }
    }

    return 0;
}
Test sending a 2D array of pixels:
/*
 * Send a 2D array of pixels.
 *
 * The process whose rank is `src` fills a ROWS x COLS array and sends it as
 * ONE element of `apixel` to `dst`; the process whose rank is `dst` receives
 * and prints it.  All other ranks do nothing.
 *
 * Returns 0.
 */
int
send_2d_pixels(int src, int dst, MPI_Datatype apixel)
{
    int i = 0;
    int j = 0;
    int rank = 0;
    struct pixel_s x[ROWS][COLS] = {0};
    MPI_Status stat;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Test sending a 2D array of pixels */
    if (rank == src) {
        for (i = 0; i < ROWS; ++i) {
            for (j = 0; j < COLS; ++j) {
                x[i][j].r = i;
                x[i][j].g = j;
                x[i][j].b = i * COLS + j;
            }
        }
        /* Address the peer through the parameters, not hard-coded ranks. */
        MPI_Send(x, 1, apixel, dst, TAG, MPI_COMM_WORLD);
    } else if (rank == dst) {
        MPI_Recv(x, 1, apixel, src, MPI_ANY_TAG, MPI_COMM_WORLD, &stat);
        printf("Array of pixels\n");
        for (i = 0; i < ROWS; ++i) {
            for (j = 0; j < COLS; ++j) {
                printf("(%d,%d):\tr: %d\tg: %d\tb: %d\n", i, j,
                       x[i][j].r, x[i][j].g, x[i][j].b);
            }
        }
    }

    return 0;
}
Then later on you can use it as:
* Create a 2D array of MPI pixels.
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <mpi.h>
#define TAG 1
#define COLS 10
#define ROWS 10
/* One pixel: three single-byte members in r, g, b order. */
struct pixel_s {
    unsigned char r;
    unsigned char g;
    unsigned char b;
};
int mpi_pixel_init(MPI_Datatype *);
int send_pixel(int, int, MPI_Datatype);
int send_1d_pixels(int, int, MPI_Datatype);
int send_2d_pixels(int, int, MPI_Datatype);
main(int argc, char **argv)
MPI_Datatype mpixel; /* single pixel */
MPI_Datatype cmpixel; /* row/contiguous pixels */
MPI_Datatype ampixel; /* 2D array of pixels */
MPI_Init(&argc, &argv);
/* Create an MPI pixel datatype */
/* Create a 1D array (contiguous) pixels */
MPI_Type_contiguous(ROWS, mpixel, &cmpixel);
/* Create a 2D array from a 1D array of pixels */
MPI_Type_contiguous(COLS, cmpixel, &ampixel);
/* Test sending one pixel */
send_pixel(0, 1, mpixel);
/* Test sending a row of pixels */
send_1d_pixels(0, 1, cmpixel);
/* Test sending a 2D array of pixels */
send_2d_pixels(0, 1, ampixel);
/* Free up the types and finalize MPI */


MPI Scatter Array of Matrices Struct

I have an array of type Matrix structs which the program got from user's input. I need to distribute the matrices to processes with OpenMPI. I tried using Scatter but I am quite confused about the arguments needed for the program to work (and also how to receive the data in each local arrays). Here is my current code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>
#define nil NULL
#define NMAX 100
#define DATAMAX 1000
#define DATAMIN -1000
typedef struct Matrix
int mat[NMAX][NMAX]; // Matrix cells
int row_eff; // Matrix effective row
int col_eff; // Matrix effective column
} Matrix;
void init_matrix(Matrix *m, int nrow, int ncol)
m->row_eff = nrow;
m->col_eff = ncol;
for (int i = 0; i < m->row_eff; i++)
for (int j = 0; j < m->col_eff; j++)
m->mat[i][j] = 0;
Matrix input_matrix(int nrow, int ncol)
Matrix input;
init_matrix(&input, nrow, ncol);
for (int i = 0; i < nrow; i++)
for (int j = 0; j < ncol; j++)
scanf("%d", &input.mat[i][j]);
return input;
void print_matrix(Matrix *m)
for (int i = 0; i < m->row_eff; i++)
for (int j = 0; j < m->col_eff; j++)
printf("%d ", m->mat[i][j]);
int main(int argc, char **argv)
MPI_Init(&argc, &argv);
// Get number of processes
int size;
MPI_Comm_size(MPI_COMM_WORLD, &size);
// Get process rank
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// Get matrices from user inputs
int kernel_row, kernel_col, num_targets, target_row, target_col;
// reads kernel's row and column and initalize kernel matrix from input
scanf("%d %d", &kernel_row, &kernel_col);
Matrix kernel = input_matrix(kernel_row, kernel_col);
// reads number of target matrices and their dimensions.
// initialize array of matrices and array of data ranges (int)
scanf("%d %d %d", &num_targets, &target_row, &target_col);
Matrix *arr_mat = (Matrix *)malloc(num_targets * sizeof(Matrix));
for (int i = 0; i < num_targets; i++)
arr_mat[i] = input_matrix(target_row, target_col);
// Get number of matrices per process
int num_mat_per_proc = ceil(num_targets / size);
// Init local matrices and scatter the global matrices
Matrix *local_mats = (Matrix *)malloc(num_mat_per_proc * sizeof(Matrix));
MPI_Scatter(arr_mat, sizeof(local_mats), MPI_BYTE, &local_mats, sizeof(local_mats), MPI_BYTE, 0, MPI_COMM_WORLD);
if (rank == 0)
// Range arrays -> array of convolution results
int arr_range[num_targets];
printf("From master \n");
for (int i = 0; i < 3; i++)
printf("From slave %d = \n", rank);
So here's a few doubts I have about the current implementation:
Can I accept the input just like that or should I make it so that it only happens in rank 0?
How do I implement the scatter part and possibly using Scatterv because the amount of arrays might not be divisible to the number of processes?
Can I accept the input just like that or should I make it so that it
only happens in rank 0?
No. As a best practice, you should use command-line arguments or read the input from a file.
If you want to use scanf, then call it only on rank 0. STDIN is forwarded to rank 0 (as far as I know this is not guaranteed by the standard, so it is implementation-dependent, but it should work in practice).
How do I implement the scatter part and possibly using Scatterv
because the amount of arrays might not be divisible to the number of processes?
If you need to send a different amount of data to different processes, then you should use MPI_Scatterv.
Scatter Syntax:
void* send_data,
int send_count,
MPI_Datatype send_datatype,
void* recv_data,
int recv_count,
MPI_Datatype recv_datatype,
int root,
MPI_Comm communicator)
Your usage:
MPI_Scatter(arr_mat, sizeof(local_mats), MPI_BYTE, &local_mats, sizeof(local_mats), MPI_BYTE, 0, MPI_COMM_WORLD);
Potential error points:
send_count: the size to send (as Gilles Gouaillardet pointed out in the comments). sizeof(local_mats) is only the size of a pointer; it should be num_mat_per_proc * sizeof(Matrix) instead.
recv_count: likewise, the size to receive should not be sizeof(local_mats).
Since you use the same type (MPI_BYTE) for both send and receive, send_count must equal recv_count.

Can you send an array within an array using MPI_Send and MPI_Recv?

This is the very basic function of my program, and as such is not necessarily reproducible. However, I was wondering if there is a way to send an array of arrays using MPI? Or is this something that is not possible and I should flatten my array? Any help would be greatly appreciated as I've been struggling with trying to figure this out.
int *individual_topIds;
int **cell_topIds;
cell_topIds = (int**) malloc(sizeof(int*)*25*boxes);
if(rank == 0) {
for (int i = 0; i < boxes; i++) {
individual_topIds = (int*) malloc(sizeof(int)*25);
for(int j = 0; j < cellMatrix[i].numTop; j++){
individual_topIds[j] = cellMatrix[i].aTopIds[j];
cell_topIds[i] = individual_topIds;
MPI_Send(cell_topIds, boxes*25, MPI_INT, 1, 10, MPI_COMM_WORLD);
Then in my rank == 1 section. I have tried send and receive with just boxes, and not boxes*25 as well.
for 1 -> boxes
MPI_Recv(cell_topIds, boxes*25, MPI_INT, 0, 10, MPI_COMM_WORLD, &status);
int *ptop;
ptop = (int*) malloc(sizeof(int)*25);
ptop = cell_topIds[i];
for(int j = 0; j < sizeof(&ptop)/sizeof(int); j++){
printf("%d, ", ptop[j]);
end for i -> boxes
Edit: Forgot to mention that the output of the print is a seg fault
Caught error: Segmentation fault (signal 11)
This is not a particularly well-worded question.
However, MPI will let you send arrays of arrays if you use a custom type, as below:
#include "mpi.h"
#include <stdio.h>
/* Record mixing a single char, an array of six doubles and a 7-char array. */
struct Partstruct
{
    char c;
    double d[6];
    char b[7];
};
int main(int argc, char *argv[])
struct Partstruct particle[1000];
int i, j, myrank;
MPI_Status status;
MPI_Datatype Particletype;
MPI_Datatype type[3] = { MPI_CHAR, MPI_DOUBLE, MPI_CHAR };
int blocklen[3] = { 1, 6, 7 };
MPI_Aint disp[3];
MPI_Init(&argc, &argv);
disp[0] = &particle[0].c - &particle[0];
disp[1] = &particle[0].d - &particle[0];
disp[2] = &particle[0].b - &particle[0];
MPI_Type_create_struct(3, blocklen, disp, type, &Particletype);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
if (myrank == 0)
MPI_Send(particle, 1000, Particletype, 1, 123, MPI_COMM_WORLD);
else if (myrank == 1)
MPI_Recv(particle, 1000, Particletype, 0, 123, MPI_COMM_WORLD, &status);
return 0;
Alternatively, use a flat array design (this is a good idea for performance reasons as well as easy compatibility with MPI).

MPI_Scatter and Gather - 2D array, uneven blocks

I'm using MPI and I am trying to send uneven blocks of a 2D array to different processors.
For instance, I have a non-square image of size 333x225, and I want to send blocks of different sizes to different processors.
I have seen @Jonathan Dursi's method for evenly divided arrays:
sending blocks of 2D array in C using MPI
I try to adapt it to my problem. So far I managed to send even chunks of data to two processes like this:
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include "mpi.h"
/*
 * Allocate an n-by-m array of char as ONE contiguous block plus a table of
 * n row pointers into it, so that (*array)[i][j] works and &(*array)[0][0]
 * addresses all n*m items consecutively.
 *
 * array: out parameter; receives the row-pointer table on success.
 * n, m:  number of rows and columns; both must be positive.
 *
 * Returns 0 on success, -1 on invalid sizes or allocation failure (nothing
 * stays allocated in that case).
 */
int malloc2dchar(char ***array, int n, int m) {
    if (n <= 0 || m <= 0) return -1;

    /* allocate the n*m contiguous items */
    char *p = malloc((size_t)n * (size_t)m * sizeof *p);
    if (!p) return -1;

    /* allocate the row pointers into the memory */
    (*array) = malloc((size_t)n * sizeof **array);
    if (!(*array)) {
        free(p); /* do not leak the data block when the table cannot be made */
        return -1;
    }

    /* set up the pointers into the contiguous memory */
    for (int i = 0; i < n; i++)
        (*array)[i] = &(p[(size_t)i * (size_t)m]);

    return 0;
}
/*
 * Release an array made of one contiguous data block plus a row-pointer
 * table whose first entry points at the start of that block.
 *
 * array: address of the row-pointer table; *array is set to NULL afterwards.
 *        A NULL array or NULL *array is accepted and ignored.
 *
 * Returns 0.
 */
int free2dchar(char ***array) {
    if (array == NULL || *array == NULL) return 0;

    /* free the memory - the first element of the array is at the start */
    free(&((*array)[0][0]));

    /* free the pointers into the memory */
    free(*array);
    *array = NULL;

    return 0;
}
int main(int argc, char **argv) {
char **global, **local;
const int gridsize=10; // size of grid
const int procgridsize=2; // size of process grid
int rank, size; // rank of current process and no. of processes
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank == 0) {
/* fill in the array, and print it */
malloc2dchar(&global, gridsize, gridsize);
for (int i=0; i<gridsize; i++) {
for (int j=0; j<gridsize; j++)
global[i][j] = '0'+(3*i+j)%10;
printf("Global array is:\n");
for (int i=0; i<gridsize; i++) {
for (int j=0; j<gridsize; j++)
/* create the local array which we'll process */
malloc2dchar(&local, 5, 10);
/* create a datatype to describe the subarrays of the global array */
int sizes[2] = {gridsize, gridsize}; /* global size */
int subsizes[2] = {5, 10}; /* local size */
int starts[2] = {0,0}; /* where this one starts */
MPI_Datatype type, subarrtype;
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_CHAR, &type);
MPI_Type_create_resized(type, 0, 10*sizeof(char), &subarrtype);
char *globalptr=NULL;
if (rank == 0) globalptr = &(global[0][0]);
/* scatter the array to all processors */
int sendcounts[2];
int displs[2];
if (rank == 0) {
for (int i=0; i<2; i++) sendcounts[i] = 1;
int disp = 0;
//for (int i=0; i<procgridsize; i++) {
// for (int j=0; j<procgridsize; j++) {
// displs[i*procgridsize+j] = disp;
// disp += 1;
// }
// disp += ((gridsize/procgridsize)-1)*procgridsize;
MPI_Scatterv(globalptr, sendcounts, displs, subarrtype, &(local[0][0]),
gridsize*gridsize/2, MPI_CHAR,
/* now all processors print their local data: */
for (int p=0; p<size; p++) {
if (rank == p) {
printf("Local process on rank %d is:\n", rank);
for (int i=0; i<5; i++) {
for (int j=0; j<10; j++) {
/* now each processor has its local array, and can process it */
for (int i=0; i<5; i++) {
for (int j=0; j<10; j++) {
local[i][j] = 'A' + rank;
/* it all goes back to process 0 */
MPI_Gatherv(&(local[0][0]), gridsize*gridsize/2, MPI_CHAR,
globalptr, sendcounts, displs, subarrtype,
/* don't need the local data anymore */
/* or the MPI data type */
if (rank == 0) {
printf("Processed grid:\n");
for (int i=0; i<gridsize; i++) {
for (int j=0; j<gridsize; j++) {
return 0;
So I get:
Global array is:
Local process on rank 0 is:
Local process on rank 1 is:
Processed grid:
But I want data to be like this (not even chunks):
I have tried to set tab_size depending on the process rank, but it doesn't work completely.
Here is the code:
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include "mpi.h"
/*
 * Allocate an n-by-m array of char as ONE contiguous block plus a table of
 * n row pointers into it, so that (*array)[i][j] works and &(*array)[0][0]
 * addresses all n*m items consecutively.
 *
 * array: out parameter; receives the row-pointer table on success.
 * n, m:  number of rows and columns; both must be positive.
 *
 * Returns 0 on success, -1 on invalid sizes or allocation failure (nothing
 * stays allocated in that case).
 */
int malloc2dchar(char ***array, int n, int m) {
    if (n <= 0 || m <= 0) return -1;

    /* allocate the n*m contiguous items */
    char *p = malloc((size_t)n * (size_t)m * sizeof *p);
    if (!p) return -1;

    /* allocate the row pointers into the memory */
    (*array) = malloc((size_t)n * sizeof **array);
    if (!(*array)) {
        free(p); /* do not leak the data block when the table cannot be made */
        return -1;
    }

    /* set up the pointers into the contiguous memory */
    for (int i = 0; i < n; i++)
        (*array)[i] = &(p[(size_t)i * (size_t)m]);

    return 0;
}
/*
 * Release an array made of one contiguous data block plus a row-pointer
 * table whose first entry points at the start of that block.
 *
 * array: address of the row-pointer table; *array is set to NULL afterwards.
 *        A NULL array or NULL *array is accepted and ignored.
 *
 * Returns 0.
 */
int free2dchar(char ***array) {
    if (array == NULL || *array == NULL) return 0;

    /* free the memory - the first element of the array is at the start */
    free(&((*array)[0][0]));

    /* free the pointers into the memory */
    free(*array);
    *array = NULL;

    return 0;
}
int main(int argc, char **argv) {
char **global, **local;
const int gridsize=10; // size of grid
const int procgridsize=2; // size of process grid
int rank, size; // rank of current process and no. of processes
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//if (size != procgridsize*procgridsize) {
// fprintf(stderr,"%s: Only works with np=%d for now\n", argv[0], procgridsize);
int tab_size;
if (rank == 0) {
/* fill in the array, and print it */
malloc2dchar(&global, gridsize, gridsize);
for (int i=0; i<gridsize; i++) {
for (int j=0; j<gridsize; j++)
global[i][j] = '0'+(3*i+j)%10;
printf("Global array is:\n");
for (int i=0; i<gridsize; i++) {
for (int j=0; j<gridsize; j++)
tab_size = 4;
if(rank == 1)
tab_size = 6;
/* create the local array which we'll process */
malloc2dchar(&local, tab_size, 10);
/* create a datatype to describe the subarrays of the global array */
int sizes[2] = {gridsize, gridsize}; /* global size */
int subsizes[2] = {tab_size, 10}; /* local size */
int starts[2] = {0,0}; /* where this one starts */
MPI_Datatype type, subarrtype;
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_CHAR, &type);
MPI_Type_create_resized(type, 0, 10*sizeof(char), &subarrtype);
char *globalptr=NULL;
if (rank == 0) globalptr = &(global[0][0]);
/* scatter the array to all processors */
int sendcounts[2];
int displs[2];
int tabsize;
if (rank == 0) {
for (int i=0; i<2; i++) sendcounts[i] = 1;
int disp = 0;
//for (int i=0; i<procgridsize; i++) {
// for (int j=0; j<procgridsize; j++) {
// displs[i*procgridsize+j] = disp;
// disp += 1;
// }
// disp += ((gridsize/procgridsize)-1)*procgridsize;
MPI_Scatterv(globalptr, sendcounts, displs, subarrtype, &(local[0][0]),
gridsize*gridsize/2, MPI_CHAR,
/* now all processors print their local data: */
for (int p=0; p<size; p++) {
if (rank == p) {
printf("Local process on rank %d is:\n", rank);
for (int i=0; i<tab_size; i++) {
for (int j=0; j<10; j++) {
/* now each processor has its local array, and can process it */
for (int i=0; i<tab_size; i++) {
for (int j=0; j<10; j++) {
local[i][j] = 'A' + rank;
/* it all goes back to process 0 */
MPI_Gatherv(&(local[0][0]), gridsize*gridsize/2, MPI_CHAR,
globalptr, sendcounts, displs, subarrtype,
/* don't need the local data anymore */
/* or the MPI data type */
if (rank == 0) {
printf("Processed grid:\n");
for (int i=0; i<gridsize; i++) {
for (int j=0; j<gridsize; j++) {
return 0;
And the output looks like this:
Global array is:
Local process on rank 0 is:
Local process on rank 1 is:
[blade001:3727] *** An error occurred in MPI_Gatherv
[blade001:3727] *** reported by process [2497249281,0]
[blade001:3727] *** on communicator MPI_COMM_WORLD
[blade001:3727] *** MPI_ERR_TRUNCATE: message truncated
[blade001:3727] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[blade001:3727] *** and potentially your MPI job)
Why is your code wrong
You define a datatype differently on different ranks, although it should be the same on all of them. That's not the way it is done.
How to do what you attempt correctly
A decomposition of contiguous data by complete rows, as you describe, is much simpler. There is no need for complex derived datatypes; in fact, you don't need them at all. You can use a very simple datatype representing a row. Then the only task is to set up the sizes / displacements of MPI_Scatterv correctly:
int local_rows[2] = {6, 4};
malloc2dchar(&local, local_rows[rank], gridsize);
MPI_Datatype row_type;
MPI_Type_contiguous(gridsize, MPI_CHAR, &row_type);
int displs[2];
if (rank == 0) {
displs[0] = 0;
for (int r = 1; r < 2; r++) {
displs[r] = displs[r - 1] + local_rows[r - 1];
MPI_Scatterv(globalptr, local_rows, displs, row_type, &(local[0][0]),
local_rows[rank], row_type, 0, MPI_COMM_WORLD);
MPI_Gatherv(&(local[0][0]), local_rows[rank], row_type, globalptr, local_rows,
displs, row_type, 0, MPI_COMM_WORLD);
This assumes that the intended sizes {6, 4} are known by all ranks. You can either have everyone compute it deterministically or have only the root compute that and scatter it (non-root ranks need only know their own row count).
True irregular 2D decomposition
If you truly want to split out chunks that do not consist only of whole rows, it becomes much more complicated. There is a very good answer about that already, so I won't repeat it here. Make sure to read it very carefully and follow it closely.
Due to the complexity, I would suggest to only do that if you are absolutely sure you need it.
You cannot send overlapping data with a single scatter. If you need overlap, consider exchanging the data directly between the neighbouring processes that own the range in a halo exchange.

Portability of sending values stored in char array using MPI

If I have a char array that represents, for example, integer value(s), and I used it to send these values via MPI with the appropriate MPI datatype for send and receive operations as follows:
int main(int argc, char* argv[]){
int my_rank; /* rank of process */
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
unsigned char buff[100];
if (my_rank == 0){
int n = 99;
int i;
for(i = 0; i < sizeof(n); i++){
buff[i] = (n >> (8 * i)) & 0xFF;
MPI_Send(&buff, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
MPI_Recv(&buff, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, NULL);
int n = *(int *)buff;
printf("%d\n", n);
/* shut down MPI */
return 0;
Is this portable across machines of different architecture/endiness?
I suspect the only part that is not portable is the conversion from integer value to char array:
int i;
for(i = 0; i < sizeof(n); i++){
buff[i] = (n >> (8 * i)) & 0xFF;
But, anyways if not, is there a way to make the above program fully portable with the existence of the char array to store value(s)?
The way you serialize integers into a buffer is not portable. But if you are sending an integer, why not send integers directly and just take endianness into account? Suppose you want to send 32-bit integers:
/* htonl() is declared in <arpa/inet.h>. */
int n = 99;
int sent = htonl(n); /* convert from host to network byte order */
/* Pass the converted value; the variable is named `sent`, not `send`. */
MPI_Send(&sent, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
For receiving:
int n;
int recv;
MPI_Recv(&recv, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, NULL);
n = ntonl(recv);

MPI Subarray Sending Error

I first initialize a 4x4 matrix and then try to send its first 2x2 block to the slave process using MPI in C. However, the slave process only receives the first row of the block; the second row is filled with random values from memory. I couldn't find what is missing. The code of the program is below:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define SIZE 4
int main(int argc, char** argv)
int rank, nproc;
const int root = 0;
const int tag = 3;
int** table;
int* datas;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
datas = malloc(SIZE * SIZE * sizeof(int));
table = malloc(SIZE * sizeof(int*));
for (int i = 0; i < SIZE; i++)
table[i] = &(datas[i * SIZE]);
for (int i = 0; i < SIZE; i++)
for (int k = 0; k < SIZE; k++)
table[i][k] = 0;
table[0][1] = 1;
table[0][2] = 2;
table[1][0] = 3;
table[2][3] = 2;
table[3][1] = 3;
table[3][2] = 4;
if (rank == root){
MPI_Datatype newtype;
int sizes[2] = { 4, 4 }; // size of table
int subsizes[2] = { 2, 2 }; // size of sub-region
int starts[2] = { 0, 0 };
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &newtype);
MPI_Send(&(table[0][0]), 1, newtype, 1, tag, MPI_COMM_WORLD);
int* local_datas = malloc(SIZE * SIZE * sizeof(int));
int** local = malloc(SIZE * sizeof(int*));
for (int i = 0; i < SIZE; i++)
local[i] = &(local_datas[i * SIZE]);
MPI_Recv(&(local[0][0]), 4, MPI_INT, root, tag, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
for (int i = 0; i < 2; i++){
for (int k = 0; k < 2; k++)
printf("%3d ", local[i][k]);
return 0;
You have instructed the receive operation to put four integer values consecutively in memory and therefore the 2x2 block is converted to a 1x4 row upon receive (since local is 4x4). The second row of local contains random values since the memory is never initialised.
You should either make use of MPI_Type_create_subarray in both the sender and the receiver in order to place the received data in a 2x2 block or redefine local to be a 2x2 matrix instead of 4x4.
