MPI gatherv displacement not working as expected - c

I have the following code which I compile and run with:
mpicc -std=c99 -o region region.c
mpirun -n 4 region
$mpirun -version
mpirun (Open MPI) 1.6.5
$mpicc --version
gcc (Ubuntu 4.8.2-19ubuntu1) 4.8.2
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int rank,
    size,
    dims[2],
    coords[2],
    image_size[2] = {8,8},
    local_image_size[2];

MPI_Datatype border_row_t,
             border_col_t,
             subarray_type,
             recv_type;

unsigned char *image,
              *region,
              *local_region;

void create_types() {
    int starts[2] = {0, 0};
    MPI_Type_create_subarray(2, image_size, local_image_size, starts, MPI_ORDER_C, MPI_UNSIGNED_CHAR, &subarray_type);
    MPI_Type_commit(&subarray_type);
    MPI_Type_vector(local_image_size[0], local_image_size[1], image_size[1], MPI_UNSIGNED_CHAR, &recv_type);
    MPI_Type_commit(&recv_type);
}

void distribute_image(){
    if (0 == rank) {
        MPI_Request request;
        int num_hor_segments = image_size[0] / local_image_size[0];
        int num_vert_segments = image_size[1] / local_image_size[1];
        int dest_rank = 0;
        for (int vert=0; vert<num_vert_segments; vert++) {
            for (int hor=0; hor<num_hor_segments; hor++) {
                MPI_Isend((image+(local_image_size[0]*hor)+(local_image_size[1]*image_size[1]*vert)), 1, subarray_type, dest_rank, 0, MPI_COMM_WORLD, &request);
                dest_rank++;
            }
        }
    }
    MPI_Status status;
    MPI_Recv(local_region, local_image_size[0]*local_image_size[1], MPI_UNSIGNED_CHAR, 0, 0, MPI_COMM_WORLD, &status);
}

void gather_region(){
    int counts[4] = {1,1,1,1};
    int disps[4] = {0,4,32,36};
    MPI_Gatherv(local_region, local_image_size[0]*local_image_size[1], MPI_UNSIGNED_CHAR, region, counts, disps, recv_type, 0, MPI_COMM_WORLD);
    if (0 == rank) {
        printf("Actually returned:\n");
        for (int i=0; i<image_size[0]*image_size[1]; i++) {
            printf("%d\t", *(region+i));
            if ((i+1)%image_size[0]==0) printf("\n");
        }
    }
}

void init_mpi(int argc, char** argv){
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Dims_create(size, 2, dims);
}

void load_and_allocate_images(int argc, char** argv){
    if (rank == 0) {
        image = (unsigned char*) malloc(sizeof(unsigned char*) * image_size[0] * image_size[1]);
        for (unsigned char i=0; i<image_size[0]*image_size[1]; i++) {
            image[i] = i;
            printf("%d\t", *(image+i));
            if ((i+1)%image_size[0]==0) printf("\n");
        }
        printf("\n\n");
        region = (unsigned char*)calloc(sizeof(unsigned char), image_size[0]*image_size[1]);
    }
    local_image_size[0] = image_size[0]/dims[0];
    local_image_size[1] = image_size[1]/dims[1];
    int lsize = local_image_size[0]*local_image_size[1];
    int lsize_border = (local_image_size[0] + 2)*(local_image_size[1] + 2);
    local_region = (unsigned char*)calloc(sizeof(unsigned char), lsize_border);
}

void cleanup() {
    MPI_Type_free(&subarray_type);
    MPI_Type_free(&recv_type);
}

int main(int argc, char** argv){
    init_mpi(argc, argv);
    load_and_allocate_images(argc, argv);
    create_types();
    distribute_image();
    gather_region();
    cleanup();
    MPI_Finalize();
    exit(0);
}
When I run gatherv with displacements of 0, 4, 32 and 36, I get the following:
Distributed vector:
0 1 2 3 4 5 6 7
8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55
56 57 58 59 60 61 62 63
Actually returned:
0 1 2 3 0 0 0 0
8 9 10 11 0 0 0 0
16 17 18 19 0 0 0 0
24 25 26 27 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
If I change the displacements to 0, 1, 32, 36, I get the following:
Distributed vector:
0 1 2 3 4 5 6 7
8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55
56 57 58 59 60 61 62 63
Actually returned:
0 1 2 3 0 0 0 0
8 9 10 11 0 0 0 0
16 17 18 19 0 0 0 0
24 25 26 27 4 5 6 7
0 0 0 0 12 13 14 15
0 0 0 0 20 21 22 23
0 0 0 0 28 29 30 31
0 0 0 0 0 0 0 0
Why does a displacement of 1 translate to 28 in the returned vector? This confuses me.

Displacements in MPI_GATHERV are specified in units of the extent of the datatype. The datatype created by MPI_Type_vector(local_image_size[0], local_image_size[1], image_size[1], MPI_UNSIGNED_CHAR, &recv_type); has an extent of ((local_image_size[0]-1) * image_size[1] + local_image_size[1]) * extent(MPI_UNSIGNED_CHAR). Given the following:
local_image_size[0] = 4
local_image_size[1] = 4
image_size[1] = 8
extent(MPI_UNSIGNED_CHAR) = 1 byte
this results in the extent of recv_type being (4-1) * 8 + 4, or 28 bytes. Therefore, a displacement of 1 specifies a location 28 bytes past the beginning of the receive buffer.
It is possible to "resize" a type by forcing a different "visible" extent on it with MPI_Type_create_resized. The whole procedure of properly performing 2D decomposition is well described in this answer.
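For the code in the question, a minimal sketch of that resize approach (my adaptation of gather_region, not taken from the original post):

MPI_Datatype resized_recv_type;
/* same 4x4-from-8x8 vector layout, but advertise an extent of one unsigned char so that
   MPI_Gatherv displacements count single elements of the full array */
MPI_Type_create_resized(recv_type, 0, sizeof(unsigned char), &resized_recv_type);
MPI_Type_commit(&resized_recv_type);

int counts[4] = {1, 1, 1, 1};
int disps[4]  = {0, 4, 32, 36};   /* element offset of each 4x4 block's top-left corner */
MPI_Gatherv(local_region, local_image_size[0]*local_image_size[1], MPI_UNSIGNED_CHAR,
            region, counts, disps, resized_recv_type, 0, MPI_COMM_WORLD);

With the extent forced to 1, the original displacements {0, 4, 32, 36} land on the top-left corner of each 4x4 block directly.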

Related

Strange behavior with -pg and optimization when generating hashes

I decided to make a program to find a sha3-512 hash with a certain number of zeroes at the start (like hashcash). It was working fine in my initial tests, so I decided to profile it with gprof to see if I could make it faster. After I had compiled with -pg and run it, I thought I should go buy a lotto ticket: on the very first nonce I got a hash with 8 zeroes. However, I ran it again and again got a hash starting with 8 zeroes. In fact, there were many discernible patterns in the hashes. After a few tests, I found that this only happened if I compiled with -pg and one of -O1, -O2, and -O3.
Here is the program:
#include <tomcrypt.h>
#include <string.h>
#include <time.h>
#include <stdbool.h>
#include <stdio.h>

unsigned char* randstring(size_t length) {
    srand(time(NULL));
    unsigned char* randomString = NULL;
    if (length) {
        randomString = malloc(sizeof(char) * (length));
        if (randomString) {
            for (int n = 0; n < length; n++) {
                int key = rand() % 255;
                randomString[n] = (unsigned char)key;
            }
        }
    }
    return randomString;
}

void find_nonce(int zeroes, int* nonce_ptr, unsigned char* returner) {
    unsigned char string[40];
    unsigned char* rand_string = randstring(30);
    memcpy(string, rand_string, 30);
    free(rand_string);
    //string is longer than rand_string because I need room to put the nonce in
    int nonce = 0;
    int idx;
    if (register_hash(&sha3_512_desc) == -1) {
        printf("Error registering SHA3-512.\n");
        exit(1);
    }
    idx = find_hash("sha3-512");
    if (idx == -1) {
        printf("Invalid hash name!\n");
        exit(1);
    }
    int res_bool = false;
    unsigned char res[64];
    unsigned long res_size;
    char nonce_str[11];
    int nonce_len = 0;
    while (!res_bool) {
        //Put the nonce into a string
        sprintf(nonce_str, "%d", nonce);
        //Put the nonce string into the string as an unsigned char (hash_memory takes an unsigned char)
        for (int i = 0, j = 11;; ++i, ++j) {
            if (nonce_str[i] == '\0') {
                break;
            }
            string[j] = (unsigned char)nonce_str[i];
            nonce_len++;
        }
        //Hash it
        hash_memory(idx, string, 30+nonce_len, res, &res_size);
        nonce_len = 0;
        //Check if the string has a sufficient number of zeroes at the start
        res_bool = true;
        for (int i = 0; i < zeroes; i++) {
            if ((int)res[i] != 0) {
                res_bool = false;
                break;
            }
        }
        nonce++;
    }
    *nonce_ptr = nonce;
    for (int i = 0; i < 64; i++) {
        returner[i] = res[i];
    }
}

int main(int argc, char** argv) {
    //Getting command-line arguments
    int zeroes = atoi(argv[argc - 1]);
    int nonce;
    unsigned char hash[64];
    //Timing the execution
    clock_t start, end;
    double cpu_time_used;
    start = clock();
    find_nonce(zeroes, &nonce, hash);
    end = clock();
    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
    //Printing the output to the screen
    printf("Hash was ");
    for (int i = 0; i < 64; i++) {
        printf("%d ", (int)hash[i]);
    }
    printf("\nNonce to get the hash was %d\nIt took %f seconds to calculate\n", nonce, cpu_time_used);
    return 0;
}
And here is an example output from five tests:
Hash was 0 0 0 0 0 0 0 0 6 203 85 177 228 127 0 0 192 128 164 212 252 127 0 0 129 219 85 177 228 127 0 0 0 235 105 177 228 127 0 0 144 128 164 212 252 127 0 0 2 0 0 0 0 0 0 0 48 130 164 212 252 127 0 0
Nonce to get the hash was 1
It took 0.000000 seconds to calculate
Hash was 0 0 0 0 0 0 0 0 6 203 214 123 135 127 0 0 64 216 207 126 253 127 0 0 129 219 214 123 135 127 0 0 0 235 234 123 135 127 0 0 16 216 207 126 253 127 0 0 2 0 0 0 0 0 0 0 176 217 207 126 253 127 0 0
Nonce to get the hash was 1
It took 0.000000 seconds to calculate
Hash was 0 0 0 0 0 0 0 0 6 123 219 55 192 127 0 0 144 108 17 232 252 127 0 0 129 139 219 55 192 127 0 0 0 155 239 55 192 127 0 0 96 108 17 232 252 127 0 0 2 0 0 0 0 0 0 0 0 110 17 232 252 127 0 0
Nonce to get the hash was 1
It took 0.000000 seconds to calculate
Hash was 0 0 0 0 0 0 0 0 6 107 181 157 222 127 0 0 64 183 143 12 253 127 0 0 129 123 181 157 222 127 0 0 0 139 201 157 222 127 0 0 16 183 143 12 253 127 0 0 2 0 0 0 0 0 0 0 176 184 143 12 253 127 0 0
Nonce to get the hash was 1
It took 0.000000 seconds to calculate
Hash was 0 0 0 0 0 0 0 0 6 139 121 81 110 127 0 0 32 171 61 179 254 127 0 0 129 155 121 81 110 127 0 0 0 171 141 81 110 127 0 0 240 170 61 179 254 127 0 0 2 0 0 0 0 0 0 0 144 172 61 179 254 127 0 0
Nonce to get the hash was 1
It took 0.000000 seconds to calculate
res_size is not initialized. It contains garbage, and the garbage is different depending on the compiler flags.
hash_memory, on the other hand, expects it to have the size of the output buffer. The first thing it does is to check that there is enough space provided, and if not it bails out.
So what you see are not hashes, but the initial state of your buffer.
Always test the return values!
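A minimal sketch of that fix (assuming libtomcrypt's hash_memory, whose last argument passes the buffer capacity in and the digest length out):

unsigned long res_size = sizeof(res);   /* tell hash_memory how large res is */
int err = hash_memory(idx, string, 30 + nonce_len, res, &res_size);
if (err != CRYPT_OK) {
    printf("hash_memory failed: %s\n", error_to_string(err));
    exit(1);
}
/* only now does res hold a genuine 64-byte SHA3-512 digest */

With res_size initialized and the return code checked, the runs of leading zeroes disappear and the program actually has to search for a suitable nonce.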

Issue using MPI_Sendrecv and MPI_Type_create_subarray for 2D halo exchange

I'm trying to write MPI code in C to test halo/ghost exchange across processors for a 2D matrix/grid using MPI_Sendrecv and MPI_Type_create_subarray. I found that my code works only for exchanging data in one direction (the x-direction, where the data is contiguous in memory) but does not exchange it correctly in the other direction (the y-direction). I think this is most probably due to the memory layout problem answered in previous posts by jonathan-dursi. I prefer to use MPI_Type_create_subarray, which appears to be easier to set up for a three-dimensional grid as well. Could someone help me with how to exchange the halo data correctly?
Here is my code:
#include "mpi.h"
#include <stdio.h>
#define BUFSIZE 2
#define HALO 1
int main(int argc, char *argv[])
{
int i, j, rank, numprocs, buf[BUFSIZE+2*HALO][BUFSIZE+2*HALO];
int NDIM=2;
MPI_Comm new_comm;
MPI_Status status;
MPI_Request request;
MPI_Datatype xslice, yslice;
FILE *file;
int gsize[NDIM];
gsize[0]=BUFSIZE+2*HALO;
gsize[1]=BUFSIZE+2*HALO;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
int nprocs[2]={2,2};
int periods[2]={0,0};
int procneigh[2][2];
int coords[2];
MPI_Cart_create(MPI_COMM_WORLD,NDIM,nprocs,periods,0,&new_comm);
for (int i=0;i<NDIM;++i){
MPI_Cart_shift(new_comm,i,1,&procneigh[i][0],&procneigh[i][1]);
}
MPI_Cart_coords(new_comm,rank,2,coords);
int cnt=0;
for (i=0; i<gsize[0]; i++){
for(j=0; j<gsize[1]; j++){
buf[i][j] = (rank+1) * 10+cnt;
cnt++;;
if(i<1||j<1||i>gsize[0]-2||j>gsize[1]-2) buf[i][j]=0;
}
}
int TSIZE=gsize[0]*gsize[1];
int sizes[NDIM];
int subsizes[NDIM];
int starts[NDIM];
for(int i=0;i<NDIM;++i){
sizes[i]=gsize[i];
starts[i]=0;
}
subsizes[0]=1;
subsizes[1]=BUFSIZE;
MPI_Type_create_subarray(NDIM, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &yslice);
MPI_Type_commit(&yslice);
subsizes[0]=BUFSIZE;
subsizes[1]=1;
MPI_Type_create_subarray(NDIM, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &xslice);
MPI_Type_commit(&xslice);
int flag=1;
MPI_Sendrecv(&buf[HALO][1], 1, yslice ,procneigh[0][0], flag, &buf[gsize[0]-1][1], 1, yslice, procneigh[0][1], flag, new_comm, &status);
MPI_Sendrecv(&buf[gsize[0]-2][1], 1, yslice, procneigh[0][1], flag, &buf[HALO-1][1], 1, yslice, procneigh[0][0], flag, new_comm, &status);
MPI_Sendrecv(&buf[1][HALO], 1, xslice, procneigh[1][0], flag, &buf[1][gsize[1]-1], 1, xslice, procneigh[1][1], flag, new_comm, &status);
MPI_Sendrecv(&buf[1][gsize[1]-2], 1, xslice, procneigh[1][1], flag, &buf[1][HALO-1], 1, xslice, procneigh[1][0], flag, new_comm, &status);
int tmp;
if (rank == 0) {
for (int iproc = 0; iproc < numprocs; iproc++) {
if (iproc) {
MPI_Irecv(&buf[0][0],TSIZE,MPI_INT,iproc,0,MPI_COMM_WORLD,&request);
MPI_Send(&tmp,0,MPI_INT,iproc,0,MPI_COMM_WORLD);
MPI_Wait(&request,&status);
}
printf("---BEG----\n");
for(i = 0; i < gsize[0]; i++) {
for(j = 0; j < gsize[1]; j++) {
printf("%d ",buf[i][j]);
}
printf("\n");
}
printf("---END----\n");
}
} else {
MPI_Recv(&tmp,0,MPI_INT,0,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
MPI_Rsend(&buf[0][0],TSIZE,MPI_INT,0,0,MPI_COMM_WORLD);
}
MPI_Finalize();
return 0;
}
The output is:
---BEG----
0 0 0 0
0 15 16 25
26 19 20 0
0 35 36 45
---END----
---BEG----
0 0 0 0
16 25 26 0
0 29 30 0
36 45 46 0
---END----
---BEG----
0 19 20 0
0 35 36 45
46 39 40 0
0 0 0 0
---END----
---BEG----
0 29 30 0
36 45 46 0
0 49 50 0
0 0 0 0
---END----
While the expected output should be:
---BEG----
0 0 0 0
0 15 16 25
0 19 20 29
0 35 36 45
---END----
---BEG----
0 0 0 0
16 25 26 0
20 29 30 0
36 45 46 0
---END----
---BEG----
0 19 20 29
0 35 36 45
0 39 40 49
0 0 0 0
---END----
---BEG----
20 29 30 0
36 45 46 0
40 49 50 0
0 0 0 0
---END----

Array is printing with weird question mark symbols when printing fibonacci sequences

When printing each Fibonacci sequence, the first couple of sequences print as weird symbols, or not at all if more than 8 sequences are printed.
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>

//n=amount of numbers in the series to compute, seq=array to store series
void fibonacci(int n, int* seq){
    // Complete this function
    int i;
    seq[0] = 0;
    seq[1] = 1;
    for(i = 2; i <= n; i++){
        seq[i] = seq[i-2] + seq[i-1];
    }
}

int main(){
    int n;
    //n, amount of series to compute
    scanf("%d",&n);
    //initialize array to 1, using malloc/calloc
    int *seq = malloc(1 * sizeof(*seq));
    int i;
    for(i = 1; i <= n; i++){
        //recompute the whole series
        fibonacci(i, seq);
        //print array
        int j;
        for(j = 0; j < i; j++)/* complete code */
            printf("%d ", seq[j]);
        //resize array, with realloc
        int newSize=i+1;
        int *seq = realloc(seq, newSize);
        printf("\n");
    }
    //free array
    return 0;
}
Output:
"7Y��yb�=
Um�*/E�o 1 1 2 3 5 8 13
0 1 1 2 3 5 8 13 21
0 1 1 2 3 5 8 13 21 34
0 1 1 2 3 5 8 13 21 34 55
0 1 1 2 3 5 8 13 21 34 55 89
0 1 1 2 3 5 8 13 21 34 55 89 144
0 1 1 2 3 5 8 13 21 34 55 89 144 233
0 1 1 2 3 5 8 13 21 34 55 89 144 233 377
0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610
0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987
0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987 1597
0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987 1597 2584
0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987 1597 2584 4181
0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987 1597 2584 4181 6765
0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987 1597 2584 4181 6765 10946
There are different problems in your code:
In your fibonacci() function, you iterate using i <= n, but inside the loop you assign to seq[i]. When i = n, this becomes a problem: you're accessing one cell past the end of the array.
You are getting n from user input, but then doing int *seq = malloc(1 * sizeof(*seq)). You are only allocating space for one element, not n. You should do malloc(n * sizeof(*seq)) instead.
Not really an error, but inside the first for loop in your main, you're both re-defining and re-allocating the seq array with int *seq = realloc(...). That is not needed at all. Your array is already n cells big, so there is no need to reallocate it each time. You can use it as is.
Not really an error, but there is no need to recompute the series each time. You can compute it only once and then partially print it on each row without a problem.
Also, IMPORTANT! Using int to hold numbers of the Fibonacci sequence is only good until you reach n = 47. Beyond that, the next element will overflow the maximum positive value an int can hold, turning negative and invalidating the rest of the calculations too. I would suggest you use long long unsigned int instead, which is good up to n = 94 (assuming 64 bits). Ultimately, you should check the value of n before calculating the Fibonacci sequence to avoid an overflow (a sketch of such a check follows the corrected code below).
Here's a better version of your code with those problems fixed:
void fibonacci(int n, int* seq) {
    int i;
    seq[0] = 0;
    seq[1] = 1;
    for(i = 2; i < n; i++)
        seq[i] = seq[i-2] + seq[i-1];
}

int main() {
    int *seq;
    int n, i, j;
    scanf("%d",&n);
    // Allocate enough space for n elements:
    seq = malloc(n * sizeof(*seq));
    // Compute the whole series once:
    fibonacci(n, seq);
    // Print partial series on each row:
    for(i = 1; i <= n; i++) {
        for(j = 0; j < i; j++)
            printf("%d ", seq[j]);
        printf("\n");
    }
    free(seq);
    return 0;
}
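Following up on the overflow note above, here is a sketch of the same program using unsigned long long (my own variant, assuming a 64-bit type, so it caps n at 94):

#include <stdio.h>
#include <stdlib.h>

void fibonacci(int n, unsigned long long* seq) {
    seq[0] = 0;
    if (n > 1) seq[1] = 1;
    for (int i = 2; i < n; i++)
        seq[i] = seq[i-2] + seq[i-1];
}

int main() {
    int n, i, j;
    if (scanf("%d", &n) != 1 || n < 1 || n > 94) {
        fprintf(stderr, "n must be between 1 and 94 to avoid 64-bit overflow\n");
        return 1;
    }
    unsigned long long *seq = malloc(n * sizeof(*seq));
    fibonacci(n, seq);
    for (i = 1; i <= n; i++) {
        for (j = 0; j < i; j++)
            printf("%llu ", seq[j]);
        printf("\n");
    }
    free(seq);
    return 0;
}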

Incomplete copy from struct to array

I have defined:
#define arrayLengthInStruct 50

typedef struct {
    struct {
        int _buf[arrayLengthInStruct];
        int _bufLen;
    } _context;
} _handle;
In main():
_handle handlePtr;
_handle* handle = (_handle*) &handlePtr; // data is here
int* k_src = NULL; // to be loaded to
int i = 0;

handlePtr._context._bufLen = arrayLengthInStruct;

// initialize the source
for (i = 0; i < handlePtr._context._bufLen; i++) {
    handlePtr._context._buf[i] = i+1;
    printf("%d \t", handlePtr._context._buf[i]);
}
printf("\n");

k_src = malloc(sizeof(int)*(handlePtr._context._bufLen));
printf("Amount of data to copy: %d \n", handle->_context._bufLen);

memcpy ( k_src,
         &handle->_context._buf[0],
         handle->_context._bufLen
       );

for (i = 0; i < handlePtr._context._bufLen; i++) {
    printf("%d \t", k_src[i]);
}
printf("\n");
However, the copy is incomplete. What am I missing?
Output:
/*
1 2 3 4 5 6 7 8 9 10 11 12 13
14 15 16 17 18 19 20 21 22 23 24 25 26
27 28 29 30 31 32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47 48 49 50
Amount of data to copy: 50
1 2 3 4 5 6 7 8 9 10 11 12 13
0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0
*/
The third argument to memcpy is the number of bytes to copy. You provided the number of ints. Do this instead:
memcpy ( k_src,
         &handle->_context._buf[0],
         handle->_context._bufLen * sizeof(int)
       );
You're missing the fact that memcpy copies a number of bytes rather than integers. You need to multiply your array size by sizeof(int) when using it with memcpy.
On a little-endian machine with four-byte int type, copying 50 bytes would give you what you see (50 / 4 = 12.5) though the last element 13 would depend on what was already in the destination memory.
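As a small aside (an alternative idiom, not from the answers above), deriving the element size from the destination pointer avoids hard-coding sizeof(int) and survives a later change of the element type:

/* copy _bufLen elements; sizeof *k_src supplies the element size automatically */
memcpy(k_src, handle->_context._buf, handle->_context._bufLen * sizeof *k_src);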

MPI partition matrix into blocks

I want to partition a matrix into blocks (not stripes) and then distribute these blocks using MPI_Scatter.
I came up with a solution which works, but I think it is far from "best practice". I have an 8x8 matrix, filled with numbers from 0 to 63. I then divide it into four 4x4 blocks using MPI_Type_vector and distribute them via MPI_Send, but this requires some extra computation since I have to compute offsets for each block in the big matrix.
If I use scatter, the first (top-left) block is transferred OK, but the other blocks are not (wrong offset for the start of each block).
So is it possible to transfer blocks of a matrix using MPI_Scatter, or what is the best way to do the desired decomposition?
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define SIZE 8

int main(void) {
    MPI_Init(NULL, NULL);
    int p, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    char i;
    char a[SIZE*SIZE];
    char b[(SIZE/2)*(SIZE/2)];

    MPI_Datatype columntype;
    MPI_Datatype columntype2;

    MPI_Type_vector(4, 4, SIZE, MPI_CHAR, &columntype2);
    MPI_Type_create_resized( columntype2, 0, sizeof(MPI_CHAR), &columntype );
    MPI_Type_commit(&columntype);

    if(rank == 0) {
        for( i = 0; i < SIZE*SIZE; i++) {
            a[i] = i;
        }
        for(int rec=0; rec < p; rec++) {
            int offset = (rec%2)*4 + (rec/2)*32;
            MPI_Send (a+offset, 1, columntype, rec, 0, MPI_COMM_WORLD);
        }
    }
    MPI_Recv (b, 16, MPI_CHAR, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    //MPI_Scatter(&a, 1, boki, &b, 16, MPI_CHAR , 0, MPI_COMM_WORLD);
    printf("rank= %d b= \n%d %d %d %d\n%d %d %d %d\n%d %d %d %d\n%d %d %d %d\n",
           rank, b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
           b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]);
    MPI_Finalize();
    return 0;
}
What you've got is pretty much "best practice"; it's just a bit confusing until you get used to it.
Two things, though:
First, be careful with this: sizeof(MPI_CHAR) is, I assume, 4 bytes, not 1. MPI_CHAR is an (integer) constant that describes (to the MPI library) a character. You probably want sizeof(char), or SIZE/2*sizeof(char), or anything else convenient. But the basic idea of doing a resize is right.
Second, I think you're stuck using MPI_Scatterv, because there's no easy way to make the offset between each block the same size. That is, the first block starts at a[0], the second at a[SIZE/2] (a jump of SIZE/2), and the next at a[SIZE*(SIZE/2)] (a jump of (SIZE-1)*(SIZE/2)). So you need to be able to manually generate the offsets.
The following seems to work for me (I generalized it a little bit to make it clearer when "size" means "number of rows" vs "number of columns", etc):
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define COLS 12
#define ROWS 8

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int p, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    char i;
    char a[ROWS*COLS];
    const int NPROWS=2;  /* number of rows in _decomposition_ */
    const int NPCOLS=3;  /* number of cols in _decomposition_ */
    const int BLOCKROWS = ROWS/NPROWS;  /* number of rows in _block_ */
    const int BLOCKCOLS = COLS/NPCOLS;  /* number of cols in _block_ */

    if (rank == 0) {
        for (int ii=0; ii<ROWS*COLS; ii++) {
            a[ii] = (char)ii;
        }
    }

    if (p != NPROWS*NPCOLS) {
        fprintf(stderr,"Error: number of PEs %d != %d x %d\n", p, NPROWS, NPCOLS);
        MPI_Finalize();
        exit(-1);
    }
    char b[BLOCKROWS*BLOCKCOLS];
    for (int ii=0; ii<BLOCKROWS*BLOCKCOLS; ii++) b[ii] = 0;

    MPI_Datatype blocktype;
    MPI_Datatype blocktype2;

    MPI_Type_vector(BLOCKROWS, BLOCKCOLS, COLS, MPI_CHAR, &blocktype2);
    MPI_Type_create_resized( blocktype2, 0, sizeof(char), &blocktype);
    MPI_Type_commit(&blocktype);

    int disps[NPROWS*NPCOLS];
    int counts[NPROWS*NPCOLS];
    for (int ii=0; ii<NPROWS; ii++) {
        for (int jj=0; jj<NPCOLS; jj++) {
            disps[ii*NPCOLS+jj] = ii*COLS*BLOCKROWS+jj*BLOCKCOLS;
            counts [ii*NPCOLS+jj] = 1;
        }
    }

    MPI_Scatterv(a, counts, disps, blocktype, b, BLOCKROWS*BLOCKCOLS, MPI_CHAR, 0, MPI_COMM_WORLD);

    /* each proc prints it's "b" out, in order */
    for (int proc=0; proc<p; proc++) {
        if (proc == rank) {
            printf("Rank = %d\n", rank);
            if (rank == 0) {
                printf("Global matrix: \n");
                for (int ii=0; ii<ROWS; ii++) {
                    for (int jj=0; jj<COLS; jj++) {
                        printf("%3d ",(int)a[ii*COLS+jj]);
                    }
                    printf("\n");
                }
            }
            printf("Local Matrix:\n");
            for (int ii=0; ii<BLOCKROWS; ii++) {
                for (int jj=0; jj<BLOCKCOLS; jj++) {
                    printf("%3d ",(int)b[ii*BLOCKCOLS+jj]);
                }
                printf("\n");
            }
            printf("\n");
        }
        MPI_Barrier(MPI_COMM_WORLD);
    }
    MPI_Finalize();
    return 0;
}
Running:
$ mpirun -np 6 ./matrix
Rank = 0
Global matrix:
0 1 2 3 4 5 6 7 8 9 10 11
12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31 32 33 34 35
36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59
60 61 62 63 64 65 66 67 68 69 70 71
72 73 74 75 76 77 78 79 80 81 82 83
84 85 86 87 88 89 90 91 92 93 94 95
Local Matrix:
0 1 2 3
12 13 14 15
24 25 26 27
36 37 38 39
Rank = 1
Local Matrix:
4 5 6 7
16 17 18 19
28 29 30 31
40 41 42 43
Rank = 2
Local Matrix:
8 9 10 11
20 21 22 23
32 33 34 35
44 45 46 47
Rank = 3
Local Matrix:
48 49 50 51
60 61 62 63
72 73 74 75
84 85 86 87
Rank = 4
Local Matrix:
52 53 54 55
64 65 66 67
76 77 78 79
88 89 90 91
Rank = 5
Local Matrix:
56 57 58 59
68 69 70 71
80 81 82 83
92 93 94 95
