Send over MPI_Bcast an MPI_PACKED message - c

I have a message that I want to send in broadcast by using MPI_Bcast.
I have two structs with, near the others, dynamic arrays, and because of that I decided to use MPI_Pack and MPI_Unpack.
Here below my solution.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>
#include <stddef.h>
#define DEBUG 0
typedef struct Code {
char character;
int length;
char *code;
} Code;
typedef struct CodeDictionary {
int codesNr;
Code *codes;
} CodeDictionary;
typedef struct Header {
int size; // size of the message in bytes
MPI_Datatype *type; // message type
int position; // position in the buffer
} Header;
typedef unsigned char BYTE;
int getRand(const int from, const int to)
{
int num = (rand() % (to - from + 1)) + from;
return num;
}
void buildCodeDictionaryType(MPI_Datatype *CodeDictType) {
int blockLengths[] = {1, 1};
MPI_Datatype types[] = {MPI_CHAR, MPI_INT};
MPI_Aint offsets[2];
offsets[0] = offsetof(Code, character);
offsets[1] = offsetof(Code, length);
MPI_Type_create_struct(2, blockLengths, offsets, types, CodeDictType);
MPI_Type_commit(CodeDictType);
}
BYTE* buildCodeDictionaryMsg(Header *header, CodeDictionary *dict) {
header->size = sizeof(int);
BYTE *buffer = malloc(sizeof(BYTE) * (header->size));
MPI_Pack(&dict->codesNr, 1, MPI_INT, buffer, header->size, &header->position, MPI_COMM_WORLD);
if (DEBUG == 1) {
printf("\ndict->codesNr = %d\n", dict->codesNr);
printf("header->size = %d\n", header->size);
printf("header->position = %d\n\n", header->position);
}
for (int i = 0; i < dict->codesNr; i++) {
header->size += sizeof(char) + sizeof(int) + (sizeof(char) * (dict->codes[i].length+1));
buffer = realloc(buffer, header->size);
MPI_Pack(&dict->codes[i], 1, *header->type, buffer, header->size, &header->position, MPI_COMM_WORLD);
if (DEBUG == 1) {
printf("before pack array - header->size = %d\n", header->size);
printf("before pack array - header->position = %d\n", header->position);
}
MPI_Pack(dict->codes[i].code, (dict->codes[i].length+1), MPI_CHAR, buffer, header->size, &header->position, MPI_COMM_WORLD);
if (DEBUG == 1) {
printf("after pack array - header->size = %d\n", header->size);
printf("after pack array - header->position = %d\n", header->position);
printf("\n");
}
}
return buffer;
}
void buildCodeDictionary(Header *header, CodeDictionary *dict, BYTE* buffer) {
MPI_Unpack(buffer, header->size, &header->position, &dict->codesNr, 1, MPI_INT, MPI_COMM_WORLD);
if (DEBUG == 1) {
printf("dict->codesNr = %d\n", dict->codesNr);
printf("header->size = %d\n", header->size);
printf("header->position = %d\n\n", header->position);
}
dict->codes = malloc(sizeof(Code) * dict->codesNr);
// I do it just for the first element because of test
MPI_Unpack(buffer, header->size, &header->position, &dict->codes[0], 1, *header->type, MPI_COMM_WORLD);
if (DEBUG == 1) {
printf("before unpack - header->size = %d\n", header->size);
printf("before unpack - header->position = %d\n", header->position);
}
dict->codes[0].code = malloc(sizeof(char) * (dict->codes[0].length+1));
MPI_Unpack(buffer, header->size, &header->position, &dict->codes[0].code, (dict->codes[0].length+1), MPI_CHAR, MPI_COMM_WORLD);
if (DEBUG == 1) {
printf("after unpack - header->size = %d\n", header->size);
printf("after unpack - header->position = %d\n", header->position);
}
// just for test
printf("character: %c\tlength: %d\tcode: ",
dict->codes[0].character,
dict->codes[0].length);
printf("%s\n", dict->codes[0].code); // it crashes here
// if it works do a for in order to unpack all the data
// ...
}
int main(int argc, char** argv) {
MPI_Init(NULL, NULL);
// Get the number of processes
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Get the rank of the process
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
// Create a CodeDictionary variable
CodeDictionary dict;
if (world_rank == 0)
printf("sending\n\n");
if (world_rank == 0) {
dict.codesNr = getRand(5, 9);
dict.codes = malloc(sizeof(Code) * dict.codesNr);
// create some fake values
for (int i = 0; i < dict.codesNr; i++) {
dict.codes[i].character = 'a' + i;
dict.codes[i].length = getRand(1, 9);
dict.codes[i].code = malloc(sizeof(char) * (dict.codes[i].length+1));
for (int j = 0; j < dict.codes[i].length; j++) {
int randChar = getRand('a', 'z');
dict.codes[i].code[j] = randChar + j;
}
dict.codes[i].code[dict.codes[i].length] = '\0';
if (DEBUG == 1)
printf("strlen(dict.charEncoding[%d].encoding): %d\n", i, strlen(dict.codes[i].code));
}
printf("source data\n");
for (int i = 0; i < dict.codesNr; i++) {
printf("codes[%d]:\n\t", i);
printf("character: %c\tlength: %d\tcode: ", dict.codes[i].character, dict.codes[i].length);
for (int j = 0; j < dict.codes[i].length; j++)
printf("%c", dict.codes[i].code[j]);
printf("\n");
}
}
MPI_Datatype codeDictType;
buildCodeDictionaryType(&codeDictType);
Header header = {.size = 0, .position = 0, .type = NULL};
header.type = &codeDictType;
BYTE *buffer = NULL;
if (world_rank == 0)
buffer = buildCodeDictionaryMsg(&header, &dict);
MPI_Bcast(&header.size, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (DEBUG == 1)
printf("rank %d: header.size = %d\n", world_rank, header.size);
if (world_rank != 0)
buffer = calloc(header.size, sizeof(BYTE));
MPI_Bcast(buffer, header.size, MPI_PACKED, 0, MPI_COMM_WORLD);
if (world_rank != 0) {
printf("\nreceiving\n\n");
buildCodeDictionary(&header, &dict, buffer);
// printf("received data\n");
// for (int i = 0; i < dict.codesNr; i++) {
// printf("codes[%d]:\n\t", i);
// printf("character: %c\tlength: %d\tcode: ", dict.codes[i].character, dict.codes[i].length);
// for (int j = 0; j < dict.codes[i].length; j++)
// printf("%c", dict.codes[i].code[j]);
// printf("\n");
// }
}
free(buffer);
MPI_Type_free(&codeDictType);
for (int i = 0; i < dict.codesNr; i++)
free(dict.codes[i].code);
free(dict.codes);
MPI_Finalize();
return 0;
}
For some reason this line of code doesn't work MPI_Unpack(buffer, header->size, &header->position, &dict->codes[0].code, (dict->codes[0].length+1), MPI_CHAR, MPI_COMM_WORLD); because if I try to print the received array, the run crashes with segmentation fault as error.
I don't understand why it happens, I take care of the memory by allocating the right size, \0 character included.
Do you know what is the problem?

Related

Why does this program not enter on the other ranks?

The code I am trying to do has to implement a skribbl io game. I am working with MPI, and the processes are divided between the ranks (rank 0 is the main, it assigns the drawer, rank (drawer) draws, collects the info and the other ones are the players). I have two problems with this code (the second one originates from the first one). The first problem is that although there are cases in the code for the processes to know what they need to do, the players never enter their respective if-s (if (rank != drawer)). I put printf-s before and after the if statement; the one before is called, the one after is not. The second problem is that the MPI_Gather functions from all the cases don't work as expected. I want to send a string array (char[][]), but the drawer's function just waits for data, and does not get any (probably because of the other ranked processes not being able to enter their if's).
Can anyone help me with this?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct dataa{
char fUname[18], sUname[18], tUname[18];
} Data;
short ran(int lower_limit, int upper_limit, unsigned int *seed) //random generáló
{
return (short) ((double) rand_r(seed) / (RAND_MAX + 1.0) * (upper_limit - lower_limit + 1) + lower_limit);
}
void generate(char fUname[18], char sUname[18], char tUname[18], MPI_Datatype* strct) {
int arrayOfBlocklengths[3] = {18, 18, 18};
MPI_Datatype arrayOfTypes[3] = {MPI_CHAR, MPI_CHAR, MPI_CHAR};
MPI_Aint fAddr, sAddr, tAddr;
MPI_Aint arrayOfDisplacements[3] = {0};
MPI_Get_address(fUname, &fAddr);
MPI_Get_address(sUname, &sAddr);
MPI_Get_address(tUname, &tAddr);
arrayOfDisplacements[1] = sAddr - fAddr;
arrayOfDisplacements[2] = tAddr - fAddr;
MPI_Type_create_struct(3, arrayOfBlocklengths, arrayOfDisplacements, arrayOfTypes, strct);
MPI_Type_commit(strct);
}
int main(int argc, const char* argv[]) {
if (argc != 1) {
printf("man no good i no need parameter bro\n");
exit(1);
}
int n, rank, i = 0;
//printf("%d\n", n);
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &n);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int nrOfGames, wordChooser;
unsigned int seed = getpid();
if (rank == 0) {
nrOfGames = ran(5, 15, &seed);
MPI_Bcast(&nrOfGames, 1, MPI_INT, 0, MPI_COMM_WORLD);
} else {
MPI_Bcast(&nrOfGames, 1, MPI_INT, 0, MPI_COMM_WORLD);
printf("Process #%d: nrOfGames: %d\n", rank, nrOfGames);
}
for (i = 0; i < nrOfGames; i++) {
printf("%d. iteration: ranks are: %d\n", i, rank);
/*if (i % n != rank) {
continue;
}*/
if (rank == 0) {
int drawerRank = ran(1, n - 1, &seed);
int j;
MPI_Bcast(&drawerRank, 1, MPI_INT, 0, MPI_COMM_WORLD);
printf("Main process: drawer generated, their rank is %d.\n", drawerRank);
char fileName[15] = "./threewords.sh";
FILE *f = popen(fileName, "r");
Data data;
fscanf(f, "%s %s %s", data.fUname, data.sUname, data.tUname);
printf("Main process: generated usernames are: %s %s %s\n", data.fUname, data.sUname, data.tUname);
MPI_Datatype strct;
generate(data.fUname, data.sUname, data.tUname, &strct);
printf("Main process: generated the structure\n");
MPI_Send(&data, 1, strct, drawerRank, 0, MPI_COMM_WORLD);
printf("Main process: new struct sent\n");
char badMsg[5][18] = {"rossz", "rossz", "rossz", "rossz", "rossz"};
int as = 0;
for (as = 0; as < 5; as++) {
printf("szo: %s ", badMsg[as]);
}
char guesses[n * 6][18];
MPI_Gather(badMsg, 5 * 18, MPI_CHAR, guesses, 5 * 18, MPI_CHAR, drawerRank, MPI_COMM_WORLD);
int* pointsPerPlayer = (int*) calloc (n - 1, sizeof(int));
MPI_Recv(&pointsPerPlayer, n - 1, MPI_INT, drawerRank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("Main process: Receive command sent.\n");
continue;
}
printf("\t\t\trank: %d\n", rank);
if (rank != 0) {
int drawer;
MPI_Bcast(&drawer, 1, MPI_INT, 0, MPI_COMM_WORLD);
printf("Process with rank %d got the drawer, %d.\n", rank, drawer);
if (rank == drawer) {
printf("I am the drawer, rank %d.\n", drawer);
//rajzolo eset
char wordToDraw[18];
int* pointsPerPlayer = (int*) calloc (n - 1, sizeof(int));
Data data;
MPI_Datatype strct;
generate(data.fUname, data.sUname, data.tUname, &strct);
printf("Drawer process generated the structure.\n");
Data recData;
MPI_Recv(&recData, 1, strct, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("\nDrawer process received the structure from the main process, usernames are %s %s %s\n", recData.fUname, recData.sUname, recData.tUname);
MPI_Type_free(&strct);
wordChooser = ran(1, 3, &seed);
if (wordChooser == 1) {
strcpy(wordToDraw, data.fUname);
} else if (wordChooser == 2) {
strcpy(wordToDraw, data.sUname);
} else {
strcpy(wordToDraw, data.tUname);
}
//lerajzolja, most meg varja a valaszokat
int j, k, guessed = 0;
char guessesPerThr[5][18] = {"rossz", "rossz", "rossz", "rossz", "rossz"};
char guesses[n * 6][18];
MPI_Gather(guessesPerThr, 5 * 18, MPI_CHAR, guesses, 5 * 18, MPI_CHAR, drawer, MPI_COMM_WORLD);
printf("sus\n");
j = 1;
k = 0;
while (j < n) {
if (j != 0 && j != rank) {
k = 0;
while (k < 5) {
if (!strcmp(wordToDraw, guessesPerThr[j * 5 + k])) {
guessed++;
pointsPerPlayer[j] += 5 - k;
break;
}
k++;
}
} else {
if (j == 0) {
pointsPerPlayer[j] = 0;
}
}
j++;
}
if (guessed) {
pointsPerPlayer[rank] = guessed - (n - guessed);
if (pointsPerPlayer[i] < 0) {
pointsPerPlayer[i] *= -1;
}
}
MPI_Send(&pointsPerPlayer, n - 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
continue;
}
printf("\t\t\t\t\t\t\trank:%d \t drawer: %d\n", rank, drawer);
if (rank != drawer) {
int drawer;
printf("u ok m8?\n");
MPI_Recv(&drawer, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("Process #%d: The drawer is %d.\n", rank, drawer);
FILE *g = popen("./fivewords.sh", "r");
char guessesPerThr[5][18], guesses[n * 6][18];
int j;
for (j = 0; j < 5; j++) {
fscanf(g, "%s", guessesPerThr[j]);
}
MPI_Gather(guessesPerThr, 5 * 18, MPI_CHAR, guesses, 5 * 18, MPI_CHAR, drawer, MPI_COMM_WORLD);
}
}
}
MPI_Finalize();
return 0;
}

Why MPI_Bcast is not working when cores >= 12 for a given input?

I have the current code working for cores under 12, using prints, code works until MPI_Bcast(&global_mean,...) prints before it are shown and after no:
Note: MPI custom type has been created and has been tested;
int main (int argc, char ** argv)
{
int pid, n_processors;
const int ROOT = 0;
int xsize, ysize, colmax;
pixel *src = (pixel*) malloc(sizeof(pixel) * MAX_PIXELS);
// Scatter receiver vector
pixel *receive_buffer;
int send_count, partial_sum, total_sum, global_mean, nump, remainder;
double global_time = 0.0;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &n_processors);
if(pid == ROOT){
/* Take care of the arguments */
if (argc != 3)
{
fprintf(stderr, "Usage: %s infile outfile\n", argv[0]);
exit(1);
}
/* Read file */
if(read_ppm (argv[1], &xsize, &ysize, &colmax, (char *) src) != 0)
exit(1);
if (colmax > 255)
{
fprintf(stderr, "Too large maximum color-component value\n");
exit(1);
}
printf("Has read the image, calling filter\n");
send_count = (xsize * ysize)/n_processors;
nump = xsize * ysize;
}
double start_time = MPI_Wtime();
MPI_Bcast(&send_count, 1, MPI_INT, ROOT, MPI_COMM_WORLD);
// Initialize receive_buffer
receive_buffer = (pixel*)malloc(send_count * sizeof(pixel));
// 1. Scatter src array through different proccessors
MPI_Scatter(src, send_count, mpi_pixel_type, receive_buffer, send_count, mpi_pixel_type, ROOT, MPI_COMM_WORLD);
// 2. Do partial sums
int i;
partial_sum = 0;
for(i = 0; i < send_count; i++){
partial_sum += (unsigned int)receive_buffer[i].r + (unsigned int)receive_buffer[i].g + (unsigned int)receive_buffer[i].b;
}
MPI_Reduce(&partial_sum, &total_sum, 1, MPI_INT, MPI_SUM, ROOT, MPI_COMM_WORLD);
// Calculate missing pixels
if(pid == ROOT){
remainder = nump % n_processors;
for(int i = nump - remainder; i < nump; i++)
total_sum += (unsigned int)receive_buffer[i].r + (unsigned int)receive_buffer[i].g + (unsigned int)receive_buffer[i].b;
}
// 3. Calculate mean
if(pid == ROOT)
global_mean = total_sum/nump;
MPI_Bcast(&global_mean, 1, MPI_INT, ROOT, MPI_COMM_WORLD);
// 4. Apply algorithm
MPI_Scatter(src, send_count, mpi_pixel_type, receive_buffer, send_count, mpi_pixel_type, ROOT, MPI_COMM_WORLD);
unsigned int psum;
for(i = 0; i < send_count; i++){
psum = (unsigned int)receive_buffer[i].r + (unsigned int)receive_buffer[i].g + (uint)receive_buffer[i].b;
if(global_mean > psum)
receive_buffer[i].r = receive_buffer[i].g = receive_buffer[i].b = 0;
else
receive_buffer[i].r = receive_buffer[i].g = receive_buffer[i].b = 255;
}
// 5. Gather partial results
MPI_Gather(receive_buffer, send_count, mpi_pixel_type, src, send_count, mpi_pixel_type, ROOT, MPI_COMM_WORLD);
if(pid == ROOT){
// printf("Reamainder: %d\n", remainder);
for(i = nump - remainder; i < nump; i++){
psum = (unsigned int)src[i].r + (unsigned int)src[i].g + (uint)src[i].b;
if(global_mean > psum)
src[i].r = src[i].g = src[i].b = 0;
else
src[i].r = src[i].g = src[i].b = 255;
}
}
double end_time = MPI_Wtime();
global_time += end_time - start_time;
if(pid == ROOT){
printf("Filtering took: %g secs\n", global_time) ;
/* Write result */
printf("Writing output file\n\n");
if (write_ppm(argv[2], xsize, ysize, (char *)src) != 0)
exit(1);
}
MPI_Finalize();
return 0;
}
Under core < 12 is working fine but when cores >= 12
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 56361 RUNNING AT sigma.nsc.liu.se
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
Where did I do wrong in my code? Why stops working only when certain cores is reached?
Why stops working...
Likely because your process is attempting to access memory it does not own, leading to the error: BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
Here you are creating memory for send_count pixels in receive_buffer:
receive_buffer = (pixel*)malloc(send_count * sizeof(pixel));
Then here, you are indexing one past the memory you have created:
for(i = 0; i <= send_count; i++){
^^
psum = (unsigned int)receive_buffer[i].r + (unsigned int)receive_buffer[i].g + (uint)receive_buffer[i].b;
if(global_mean > psum)
Change to
for(i = 0; i < send_count; i++){
^
The error was caused because I was computing with receive buffer after MPI_Reduction and should have used src :)
Thanks all for your suggestions

How to share a string array using open mpi

I'm new in openmpi and I don't know how to use scatter and gather to send an array of strings to all processors. I would like to divide an array and send it to each processor, but all I can divide are the characters of a single array element. Can anyone help me please?
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mpi.h"
#define MASTER 0
#define BUF_SIZE 2048
#define CHAR_SIZE 900
#define CHARS 13
#define MAX_SIZE 3500
#define NUMBER_OF_FILES 2
int main(int argc, char** argv) {
int number_of_words = 0;
int total_rows = 0;
int i, j = 0;
char **words = (char**) calloc(MAX_SIZE, sizeof (char*));
for (i = 0; i < MAX_SIZE; i++) {
words[i] = (char*) calloc(CHARS, sizeof (char));
}
char **local_words = (char**) calloc(MAX_SIZE, sizeof (char*));
for (i = 0; i < MAX_SIZE; i++) {
local_words[i] = (char*) calloc(CHARS, sizeof (char));
}
char **rec_words = (char**) calloc(MAX_SIZE, sizeof (char*));
for (i = 0; i < MAX_SIZE; i++) {
rec_words[i] = (char*) calloc(CHARS, sizeof (char));
}
char str_righe[BUF_SIZE][CHAR_SIZE];
FILE *f = NULL;
char f_title[10];
char str_nfiles[10];
char delim[10] = {10, 32, 33, 39, 44, 46, 58, 59, 63};
char *ptr;
int rank;
int size;
int message_length;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
for (i = 1; i <= NUMBER_OF_FILES; i++) {
strcpy(f_title, "f");
sprintf(str_nfiles, "%d", i);
strcat(f_title, str_nfiles);
strcat(f_title, ".txt");
f = fopen(f_title, "r");
while (fgets(str_righe[j], BUF_SIZE, f)) {
str_righe[j][strlen(str_righe[j])] = '\0';
j++;
}
fclose(f);
}
total_rows = j;
for (i = 0; i < total_rows; ++i) {
ptr = strtok(str_righe[i], delim);
while (ptr != NULL) {
strcpy(words[number_of_words], ptr);
ptr = strtok(NULL, delim);
number_of_words++;
}
}
message_length = number_of_words / size;
if (rank == MASTER) {
for (i = 0; i < number_of_words; i++)
printf("%s\n", words[i]);
}
MPI_Scatter(*words, message_length, MPI_CHAR, *local_words, message_length, MPI_CHAR, MASTER, MPI_COMM_WORLD);
printf("rank %d, fragment: \t%s\n", rank, *local_words);
MPI_Gather(*local_words, message_length, MPI_CHAR, *rec_words, message_length, MPI_CHAR, MASTER, MPI_COMM_WORLD);
if (rank == MASTER) {
printf("rank %d, gathered: \t%s\n", rank, *rec_words);
}
MPI_Finalize();
return EXIT_SUCCESS;
}
I expect the output:
iMac-di-iMac01:mpi macbook$ mpirun -n 2 main
Good
time
by
antonio
rank 0, fragment: Good time
rank 1, fragment: by antonio
rank 0, gathered: Good time by antonio
But the actual output is:
iMac-di-iMac01:mpi macbook$ mpirun -n 2 main
Good
time
by
antonio
rank 0, fragment: Go
rank 1, fragment: od
rank 0, gathered: Good
I realized that I never shared the solution to the problem. I do it now:
I created the matrix variable and I sent the one with the scatter. In this way the slaves received the words and not the characters
int *matrix = 0;
matrix = malloc(sizeof (int) * n_words);
j = 1;
for (i = 0; i <= n_words; i++) {
matrix[i] = j;
j++;
}
n_words_cpu = n_words / (size);
procRow = malloc(sizeof (int) * n_words); // received row will contain p integers
MPI_Scatter(
/* send_data = */ matrix,
/* send_count = */ n_words_cpu,
/* send_datatype = */ MPI_INT,
/* recv_data = */ procRow,
/* recv_count = */ n_words_cpu,
/* recv_datatype = */ MPI_INT,
/* root = */ MASTER,
/* MPI_commuicator = */ MPI_COMM_WORLD);

Cannon algorithm using MPI

I want to implement the Cannon Algorithm using MPI in C using cartesian communicators which are shifted using the default functions and by sending 2-dimensional blocks from the 2 matrices.
I have tried to follow a couple of tutorials found online, but I realized none were implemented the way I wanted them to, using both 2-dimensional blocks and cartesian communicators.
EDIT: I have managed to get over the error after realizing that I was using the proc_grid_size variable in a wrong way, confusing the size of the process matrix with the block size and entering into some unallocated memory area.
I am running with an input of 25 processes and 2 10*10 matrices stored in 2 different files.
I am currently trying to implement the shift operations using the MPI_Cart_Shift function. But I don't know how to send the block over to the neighbors.
This is my current implementation of this specific part, which is not working (the application just hangs):
MPI_Scatterv(globalAptr, sendcounts, displs, subarrtype, &(a[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
MPI_Scatterv(globalBptr, sendcounts, displs, subarrtype, &(b[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
int nlocal;
int npes, dims[2], periods[2];
int myrank, my2drank, mycoords[2];
int uprank, downrank, leftrank, rightrank, coords[2];
int shiftsource, shiftdest;
MPI_Status status;
MPI_Comm comm_2d;
// Get the communicator related information
MPI_Comm_size(MPI_COMM_WORLD, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
// Set up the Cartesian topology
dims[0] = dims[1] = proc_matrix_size;//sqrt(npes);
// Set the periods for wraparound connections
periods[0] = periods[1] = 1;
// Create the Cartesian topology, with rank reordering
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm_2d);
// Get the rank and coordinates with respect to the new topology
MPI_Comm_rank(comm_2d, &my2drank);
MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);
// Compute ranks of the up and left shifts
// Get line neighbors (direction = 1, displacement = 1)
MPI_Cart_shift(comm_2d, 1, 1, &leftrank, &rightrank);
// Get column neighbors (direction = 0, displacement = 1)
MPI_Cart_shift(comm_2d, 0, 1, &uprank, &downrank);
// Determine the dimension of the local matrix block
nlocal = block_size;// n / dims[0];
MPI_Cart_shift(comm_2d, 1, -mycoords[1], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(&(a[0][0]), 1, subarrtype,
shiftdest, 1, shiftsource, 1, comm_2d, &status);
MPI_Cart_shift(comm_2d, 0, -mycoords[0], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(&(b[0][0]), 1, subarrtype,
shiftdest, 1, shiftsource, 1, comm_2d, &status);
After closing the application, I discover that the root process is the only one that hangs:
F:\Facultate\AN_4\PDC\Labs\MPI\Cannon\x64\Release>mpiexec -np 25 Cannon.exe
a.txt b.txt> mpiexec aborting job...
job aborted:
[ranks] message
[0] job terminated by the user
[1-24] terminated
---- error analysis -----
[0] on DESKTOP-JB1815M
ctrl-c was hit. job aborted by the user.
---- error analysis -----
INITIAL SOLVED CODE:
int malloc2D(int ***array, int n, int m) {
int i;
/* allocate the n*m contiguous items */
int *p = (int*) calloc(n*m, sizeof(int));
if (!p) return -1;
/* allocate the row pointers into the memory */
(*array) = (int**) calloc(n, sizeof(int*));
if (!(*array)) {
free(p);
return -1;
}
/* set up the pointers into the contiguous memory */
for (i = 0; i<n; i++)
(*array)[i] = &(p[i*m]);
return 0;
}
int free2D(int ***array) {
/* free the memory - the first element of the array is at the start */
free(&((*array)[0][0]));
/* free the pointers into the memory */
free(*array);
return 0;
}
int main(int argc, char* argv[])
{
MPI_Init(&argc, &argv);
if (argc != 3) {
fprintf(stderr, "Not enough arguments passed! Make sure you pass 2 filenames.\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
// Find out rank, size
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Declare file pointers
FILE* fa = NULL;
FILE* fb = NULL;
// Declare matrix pointers
int **A = NULL;
int **B = NULL;
int **C = NULL;
// Declare matrix dimensions
int ma = 0, na = 0;
int mb = 0, nb = 0;
// Nr of processes on each line/column in process mesh
int proc_matrix_size = (int)sqrt(world_size);
// Single value for quadratic matrix size
int n = 0;
// Nr of elements on each line/column in local matrix
// of each process
int block_size = 0;
// Open files and read matrices
if (world_rank == 0)
{
fa = fopen(argv[1], "r");
fb = fopen(argv[2], "r");
// Read matrix dymensions
fscanf(fa, "%d %d\n", &ma, &na);
fscanf(fb, "%d %d\n", &mb, &nb);
// Check if matrices are quadratic
if ((ma != na) && (na != mb) && (mb != nb))
{
printf("Invalid matrices dimensions\n");
return 0;
}
n = na;
// Check if sqrt(nr_processes) divides matrix dimension
if ((n % proc_matrix_size != 0) || (world_size % proc_matrix_size != 0))
{
printf("Number of processes does not fit matrix size\n");
return 0;
}
block_size = n / proc_matrix_size;
malloc2D(&A, n, n);
malloc2D(&B, n, n);
malloc2D(&C, n, n);
// Read matrices A & B from file
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
fscanf(fa, "%d ", &A[i][j]);
fscanf(fb, "%d ", &B[i][j]);
}
fscanf(fa, "\n");
}
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&block_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
else {
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&block_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
/*
Divide matrices in blocks and send each block to the corresponding process
*/
// Declare global pointers to matrices
int *globalAptr = NULL;
int *globalBptr = NULL;
int *globalCptr = NULL;
// Declare global return pointers
int *globalA2ptr = NULL;
int *globalB2ptr = NULL;
int **A2 = NULL;
int **B2 = NULL;
// Declare local matrix pointers
int **a = NULL;
int **b = NULL;
int **c = NULL;
malloc2D(&A2, n, n);
malloc2D(&B2, n, n);
if (world_rank == 0)
{
globalAptr = &(A[0][0]);
globalBptr = &(B[0][0]);
globalA2ptr = &(A2[0][0]);
globalB2ptr = &(B2[0][0]);
globalCptr = &(C[0][0]);
}
malloc2D(&a, block_size, block_size);
malloc2D(&b, block_size, block_size);
malloc2D(&c, block_size, block_size);
// Sizes of input global matrix
int sizes[2] = { n, n };
// Sizes of each block
int subsizes[2] = { block_size, block_size };
// Begining of current block
int starts[2] = { 0,0 };
// Declare subarray type
MPI_Datatype type, subarrtype;
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
MPI_Type_create_resized(type, 0, block_size * sizeof(int), &subarrtype);
MPI_Type_commit(&subarrtype);
// Scatter the A and B to all processes
int* sendcounts = (int*)malloc(proc_matrix_size * proc_matrix_size * sizeof(int));
int* displs = (int*)malloc(proc_matrix_size * proc_matrix_size * sizeof(int));
if (world_rank == 0)
{
for (int i = 0; i < proc_matrix_size * proc_matrix_size; i++)
sendcounts[i] = 1;
int disp = 0;
for (int i = 0; i < proc_matrix_size; i++) {
for (int j = 0; j < proc_matrix_size; j++) {
displs[i * proc_matrix_size + j] = disp;
disp += 1;
}
disp += ((n / proc_matrix_size)-1) * proc_matrix_size;
}
}
MPI_Scatterv(globalAptr, sendcounts, displs, subarrtype, &(a[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
MPI_Scatterv(globalBptr, sendcounts, displs, subarrtype, &(b[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
for (int i = 0; i < block_size; i++) {
for (int j = 0; j < block_size; j++) {
a[i][j] = 10 + a[i][j];
b[i][j] = 10 + b[i][j];
}
}
// It all goes back to process 0
MPI_Gatherv(&(a[0][0]), block_size * block_size, MPI_INT,
globalA2ptr, sendcounts, displs, subarrtype,
0, MPI_COMM_WORLD);
MPI_Gatherv(&(b[0][0]), block_size * block_size, MPI_INT,
globalB2ptr, sendcounts, displs, subarrtype,
0, MPI_COMM_WORLD);
MPI_Finalize();
return 0;
}
OLD:
I would like to mention that at the moment, I am trying to send blocks over the default communicator and planning to implement the shifting operations and the cartesian communicator after managing to send the matrix blocks.
The help I need is with regard to the Scatterv function which throws the following error:
job aborted: [ranks] message
[0] fatal error Fatal error in MPI_Scatterv: Invalid count, error
stack: MPI_Scatterv(sbuf=0x0000029262048D40, scnts=0x00000292620482B0,
displs=0x0000029262048250, dtype=USER,
rbuf=0x000002926203ED30, rcount=25, MPI_INT, root=0, MPI_COMM_WORLD)
failed Negative count, value is -1912594387
[1-7] terminated
This is the code I have written until now:
#include "stdafx.h"
#include "mpi.h"
#include "stdio.h"
#include "stdlib.h"
#include <assert.h>
#include <cstdlib>
#include <math.h>
int malloc2D(int ***array, int n, int m) {
int i;
/* allocate the n*m contiguous items */
int *p = (int*) malloc(n*m * sizeof(int));
if (!p) return -1;
/* allocate the row pointers into the memory */
(*array) = (int**) malloc(n * sizeof(int*));
if (!(*array)) {
free(p);
return -1;
}
/* set up the pointers into the contiguous memory */
for (i = 0; i<n; i++)
(*array)[i] = &(p[i*m]);
return 0;
}
int free2D(int ***array) {
/* free the memory - the first element of the array is at the start */
free(&((*array)[0][0]));
/* free the pointers into the memory */
free(*array);
return 0;
}
int main(int argc, char* argv[])
{
MPI_Init(&argc, &argv);
if (argc != 3) {
fprintf(stderr, "Not enough arguments passed! Make sure you pass 2 filenames.\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
// Find out rank, size
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Declare file pointers
FILE* fa = NULL;
FILE* fb = NULL;
// Declare matrix pointers
int **A = NULL;
int **B = NULL;
int **C = NULL;
// Declare matrix dymensions
int ma = 0, na = 0;
int mb = 0, nb = 0;
// Nr of processes on each line/column in process mesh
int proc_grid_size = (int)sqrt(world_size);
// Single value for quadratic matrix size
int n = 0;
// Nr of elements on each line/column in local matrix
// of each process
int block_size = 0;
// Open files and read matrices
if (world_rank == 0)
{
fa = fopen(argv[1], "r");
fb = fopen(argv[2], "r");
// Read matrix dymensions
fscanf(fa, "%d %d\n", &ma, &na);
fscanf(fb, "%d %d\n", &mb, &nb);
// Check if matrices are quadratic
if ((ma != na) && (na != mb) && (mb != nb))
{
printf("Invalid matrices dimensions\n");
return 0;
}
n = na;
// Check if sqrt(nr_processes) divides matrix dimension
if ((n % proc_grid_size != 0) || (world_size % proc_grid_size != 0))
{
printf("Number of processes does not fit matrix size\n");
return 0;
}
block_size = n / proc_grid_size;
// Initialize matrices
A = (int**)calloc(n, sizeof(int*));
B = (int**)calloc(n, sizeof(int*));
//C = (int**)calloc(n, sizeof(int*));
for (int i = 0; i < n; i++)
{
A[i] = (int*)calloc(n, sizeof(int));
B[i] = (int*)calloc(n, sizeof(int));
//C[i] = (int*)calloc(n, sizeof(int));
}
// Read matrix A from file
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
fscanf(fa, "%d ", &A[i][j]);
printf("%d ", A[i][j]);
}
fscanf(fa, "\n");
printf("\n");
}
// Read matrix B from file
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
fscanf(fb, "%d ", &B[i][j]);
printf("%d ", B[i][j]);
}
fscanf(fb, "\n");
printf("\n");
}
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&block_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
else {
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&block_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
/*
Divide matrices in blocks and send each block to the corresponding process
*/
// Sizes of input global matrix
int sizes[2] = { n, n };
// Sizes of each block
int subsizes[2] = { block_size, block_size };
// Begining of current block
int starts[2] = { 0,0 };
// Declare subarray type
MPI_Datatype type, subarrtype;
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
MPI_Type_create_resized(type, 0, block_size * sizeof(int), &subarrtype);
MPI_Type_commit(&subarrtype);
// Declare global pointers to matrices
int *globalAptr = NULL;
int *globalBptr = NULL;
int **A2 = NULL;
int **B2 = NULL;
malloc2D(&A2, n, n);
malloc2D(&B2, n, n);
// Declare global return pointers
int *globalA2ptr = NULL;
int *globalB2ptr = NULL;
if (world_rank == 0)
{
globalAptr = &(A[0][0]);
globalBptr = &(B[0][0]);
globalA2ptr = &(A2[0][0]);
globalB2ptr = &(B2[0][0]);
}
// Declare local matrix pointers
int **a = NULL;
int **b = NULL;
malloc2D(&a, block_size, block_size);
malloc2D(&b, block_size, block_size);
// Scatter the A and B to all processes
int* sendcounts = (int*)malloc(proc_grid_size * proc_grid_size * sizeof(int));
int* displs = (int*)malloc(proc_grid_size * proc_grid_size * sizeof(int));
if (world_rank == 0)
{
for (int i = 0; i < proc_grid_size * proc_grid_size; i++)
sendcounts[i] = 1;
int disp = 0;
for (int i = 0; i < proc_grid_size; i++) {
for (int j = 0; j < proc_grid_size; j++) {
displs[i * proc_grid_size + j] = disp;
disp += 1;
}
disp += ((block_size) - 1) * proc_grid_size;
}
for (int i = 0; i < proc_grid_size * proc_grid_size; i++)
{
printf("Send cound: %d\n", sendcounts[i]);
}
}
MPI_Scatterv(globalAptr, sendcounts, displs, subarrtype, &(a[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
MPI_Scatterv(globalBptr, sendcounts, displs, subarrtype, &(b[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
// Now each processor has its local array, and can process it
for (int i = 0; i < block_size; i++) {
for (int j = 0; j < block_size; j++) {
a[i][j] = 10 + a[i][j];
b[i][j] = 10 + b[i][j];
}
}
// It all goes back to process 0
MPI_Gatherv(&(a[0][0]), block_size * block_size, MPI_INT,
globalA2ptr, sendcounts, displs, subarrtype,
0, MPI_COMM_WORLD);
MPI_Gatherv(&(b[0][0]), block_size * block_size, MPI_INT,
globalB2ptr, sendcounts, displs, subarrtype,
0, MPI_COMM_WORLD);
}
MPI_Finalize();
return 0;
}
Thank you very much!

parallel sort using mpi

I try to sort different array with mpi. Every array are allocate locally.
for example we have {1-7-4-12} {3-7-5-9} {12-15-2-16} {10-8-11-13}
and we want {1-2-3-4}{5-6-7-8}{9-10-11-12}{13-14-15-16}
So I use odd-even strategy. For 2proccess it's works in every case but when i try with more process i have new value. For my example i can have {23-2-3-4}. I think my problem is from allocate memory but i don't find where and what i do wrong...
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define MASTER 0
#define MIN(a,b) ((a)<(b)?(a):(b))
#define BLOCK_LOW(id,p,n) ((id)*(n)/(p))
#define BLOCK_HIGH(id,p,n) \
(BLOCK_LOW((id)+1,p,n)-1)
#define BLOCK_SIZE(id,p,n) \
(BLOCK_LOW((id)+1, p, n)-BLOCK_LOW(id, p , n))
#define BLOCK_OWNER(index,p,n) \
(((p)*(index+1)-1)/(n))
int nbProcess, id, n; //n = number of value
void printTabByProcess(int *T){
int i = 0;
int size = BLOCK_SIZE(id, nbProcess, n);
printf("Tab n°%d [ ", id, size);
for(i; i < size; i++){
printf(" %d ", T[i]);
}
printf(" ]\n");
}
void fusion(int *t,int deb1,int fin1,int fin2){
int *table1;
int deb2=fin1+1;
int compt1=deb1;
int compt2=deb2;
int i;
table1=(int*)malloc((fin1-deb1+1)*sizeof(int));
for(i=deb1;i<=fin1;i++) {
table1[i-deb1]=t[i];
}
for(i=deb1;i<=fin2;i++){
if(compt1==deb2)
break;
else if(compt2==(fin2+1)){
t[i]=table1[compt1-deb1];
compt1++;
}
else if(table1[compt1-deb1]<t[compt2]){
t[i]=table1[compt1-deb1];
compt1++;
}
else{
t[i]=t[compt2];
compt2++;
}
}
free(table1);
}
void tri_fusion(int*t,int deb,int fin){
if(deb!=fin){
int milieu=(fin+deb)/2;
tri_fusion(t,deb,milieu);
tri_fusion(t,milieu+1,fin);
fusion(t,deb,milieu,fin);
}
}
int* fusion2(int* t1, int* t2, int size1, int size2){
int* buffer = malloc(sizeof(int)*(size1 + size2));
int index1 = 0;
int index2 = 0;
int i = 0;
for(i; i < (size1 + size2) - 1; i++){
if(t1[index1] < t2[index2]){
buffer[i] = t1[index1];
index1++;
}else{
buffer[i] = t2[index2];
index2++;
}
}
if(index1 == size1 - 1 ){
buffer[size1 + size2 - 1] = t1[index1];
}else{
buffer[size1 + size2 - 1] = t2[index2];
}
return buffer;
}
/*
*
* OUR FUNCTION TO PARALLEL SORT
*
*/
void TD_trier(int* T){
MPI_Status status;
int size = BLOCK_SIZE(id, nbProcess, n);
int receive_size = 0;
int* receive;
int* array_tmp;
int i = 0;
tri_fusion(T, 0, size - 1);
MPI_Barrier(MPI_COMM_WORLD);
for(i; i < nbProcess; i++){
if(i%2==0){
if(id % 2 == 1){//send to left
MPI_Send(&size, 1, MPI_INT, id - 1, 1, MPI_COMM_WORLD);
MPI_Send(T, size, MPI_INT, id - 1, 1, MPI_COMM_WORLD);
MPI_Recv(T, size, MPI_INT, id - 1, 1, MPI_COMM_WORLD, &status);
}else {
MPI_Recv(&receive_size, 1, MPI_INT, id + 1, 1, MPI_COMM_WORLD, &status);
receive = malloc(sizeof(int) * size);
MPI_Recv(receive, receive_size, MPI_INT, id + 1, 1, MPI_COMM_WORLD, &status);
array_tmp = fusion2(T, receive, size, receive_size);
MPI_Send(&array_tmp[size], receive_size, MPI_INT, id + 1, 1, MPI_COMM_WORLD);
T = realloc(array_tmp, sizeof(int) * size);
}
if(id == 1){
//~ printTabByProcess(T);
}
}else if(i%2 == 1 && id < nbProcess-1){ //send to right
if(id % 2 == 1){
MPI_Send(&size, 1, MPI_INT, id + 1, 1, MPI_COMM_WORLD);
MPI_Send(T, size, MPI_INT, id + 1, 1, MPI_COMM_WORLD);
//printTabByProcess(T);
MPI_Recv(T, size, MPI_INT, id + 1, 1, MPI_COMM_WORLD, &status);
}else if(id != 0 && id%2 ==0) {
MPI_Recv(&receive_size, 1, MPI_INT, id - 1, 1, MPI_COMM_WORLD, &status);
//receive = malloc(sizeof(int) * size);
MPI_Recv(receive, receive_size, MPI_INT, id - 1, 1, MPI_COMM_WORLD, &status);
//printTabByProcess(receive);
array_tmp = fusion2(T, receive, size, receive_size);
MPI_Send(array_tmp, receive_size, MPI_INT, id - 1, 1, MPI_COMM_WORLD);
printTabByProcess(&array_tmp[2]);
T = array_tmp + size;
printTabByProcess(T);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
//printTabByProcess(T);
}
int generateRandomValue(){
return rand() % 100;
}
//init array with "random" value
int* TD_init(int n){
int i = 0;
int indiceDerniere = (id+1)*n/nbProcess -1;
int indicePremiere = id*n/nbProcess;
int* arrayLocal;
int localSize = indiceDerniere - indicePremiere +1;
arrayLocal = malloc(sizeof(int)*localSize);
//~ printf("id : %d - nbCase : %d (debut : %d, fin : %d)\n",
//~ id, localSize, indicePremiere, indiceDerniere);
for(i; i < localSize; i++){
arrayLocal[i] = generateRandomValue() - id;
}
printTabByProcess(arrayLocal);
return arrayLocal;
}
int main (int argc, char *argv[]){
//int n = 0;
int *dataLocal;
int dest;
int x;
int success;
MPI_Status status;
srand(time(NULL));
/***** Initializations *****/
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nbProcess); //numtask contient le nombre de processeur
MPI_Comm_rank(MPI_COMM_WORLD, &id); //taskid, determine le numero du processus
//~ printf ("MPI task %d has started...\n", id);
//~ tag2 = 1;
//~ tag1 = 2;
MPI_Barrier (MPI_COMM_WORLD);
/***** Master task only ******/
if (id == MASTER){
printf("Chose a number of value :");
scanf("%d",&n);
/* Send the number of cases */
for (dest=1; dest<nbProcess; dest++) {
MPI_Send(&n, 1, MPI_INT, dest, 1, MPI_COMM_WORLD); //send number of value
}
} /* end of master section */
/***** Non-master tasks only *****/
if (id > MASTER) {
/* Receive the number of cases */
MPI_Recv(&n, 1, MPI_INT, MASTER, 1, MPI_COMM_WORLD, &status);
}
MPI_Barrier (MPI_COMM_WORLD);
dataLocal = TD_init(n);
MPI_Barrier (MPI_COMM_WORLD);
if(id == 0){
printf("__________________________________________\n");
}
TD_trier(dataLocal);
MPI_Finalize();
}
Troubles may come from fusion2 function. index1 can become higher than size1. In fact, the MPI part works correctly. The code works once tests are performed. Here is a version that is not optimal but...
int* fusion2(int* t1, int* t2, int size1, int size2){
int* buffer = malloc(sizeof(int)*(size1 + size2));
int index1 = 0;
int index2 = 0;
int i = 0;
for(i; i < (size1 + size2) ; i++){
if(index1==size1){
buffer[i] = t2[index2];
index2++;
}else{
if(index2==size2){
buffer[i] = t1[index1];
index1++;
}else{
if(t1[index1] < t2[index2]){
buffer[i] = t1[index1];
index1++;
}else{
buffer[i] = t2[index2];
index2++;
}
}
}
}
return buffer;
}
Watch for memory management.
Ex : did you free T before doing ?
T = realloc(array_tmp, sizeof(int) * size);
Did you free "receive" ? did you free "array_tmp" in the second part ?
I fear memory leakages exist... It might be better to avoid allocation in fusion2, and even in the loops. Allocate array_tmp and receive at start, with "enougth" space, might be safer (faster ?).
Bye,
Francis
More : qsort (in stdlib) may go faster for local sorting.

Resources