The task is a 2D matrix multiplication. N is the data size and P is number of processors. dn029 is my remote host.
I tested this code for multiple number of Ps and I either got a code 139 or 11 error.
The error message I get :
BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 147347 RUNNING AT dn029
= EXIT CODE: 139
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<mpi.h>
int P;
int N = 1024;
/*Single Row, Single Column Matrix Multiplication Function*/
float row_col_multi(float* row, float* col){
int i0;
float c0;
for(i0 = 0; i0 < N ; i0++)
c0 += row[i0]*col[i0];
return c0;
}
int main(int argc, char *argv[]){
MPI_Init(&argc, &argv);
int i, j, k, rank, size;
double start, end, total;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Request request[2];
P = size;
float A_row [N];
float B_col [N];
float matrix_C[N][N];
float matrix_A[N][N];
float matrix_BT[N][N];
if(rank == 0){
double wall_time;
for(i = 0; i < N; i++)
for (j = 0; j < N; j++)
matrix_A[i][j] = -1+2*((float)rand())/RAND_MAX;
for(i = 0; i < N; i++)
for (j = 0; j < N; j++)
matrix_BT[i][j] = -1+2*((float)rand())/RAND_MAX;
}
start = MPI_Wtime();
if(rank == 0)
printf("Root processor %d: Scatterring is started for diagonal elements...\n", rank);
for(i = 0; i < N/P ; i++){
MPI_Iscatter(matrix_A[rank + P*i], N, MPI_FLOAT, A_row, N, MPI_FLOAT, 0, MPI_COMM_WORLD, &request[0]);
MPI_Iscatter(matrix_BT[rank + P*i], N, MPI_FLOAT, B_col, N, MPI_FLOAT, 0, MPI_COMM_WORLD, &request[1]);
MPI_Waitall(2,request, MPI_STATUSES_IGNORE);
matrix_C[rank + P*i][rank + P*i] = row_col_multi(A_row, B_col);
}
for(i = 1 ; i < N ; i++){
if(rank < i){
for(k = 0; k < N/P ; k++){
MPI_Iscatter(matrix_A[rank+i + P*k], N, MPI_FLOAT, A_row, N, MPI_FLOAT, 0, MPI_COMM_WORLD, &request[0]);
MPI_Iscatter(matrix_BT[rank + P*k], N, MPI_FLOAT, B_col, N, MPI_FLOAT, 0, MPI_COMM_WORLD, &request[1]);
MPI_Waitall(2,request, MPI_STATUSES_IGNORE);
matrix_C[rank+i + P*k][rank + P*k] = row_col_multi(A_row, B_col);
}
}
}
end = MPI_Wtime();
printf("Total Time: %f\n", end - start);
MPI_Finalize();
}
Related
It is necessary to sum two vectors on separate processes in parts. I find a portion of vectors that needs to be sent out and send it using MPI_Send. Next, I summarize the parts of the vectors on the processes, I succeeded.
It is not possible to collect the resulting array again on the zero process. As far as I understand, you need to use MPI_Isend.
#include <iostream>
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
#include <string.h>
using namespace std;
void Print(int *vector, int n, int start) {
for (int i = start; i < n; i++)
printf("%4d", vector[i]);
}
void RandomFill(int *vector, int n) {
for (int i = 0; i < n; i++)
vector[i] = rand() % 100;
}
//C = A + B
int main(int argc, char* argv[])
{
int ProcNum, ProcRank;
MPI_Status status;
MPI_Request request;
int *A, *B, *C, *buf;
int nResult = 0;
int n = 5 + rand() % (25 - 5 + 1);
bool yes = true;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &ProcNum);
MPI_Comm_rank(MPI_COMM_WORLD, &ProcRank);
if (ProcRank == 0) {
int part_send = 0;
A = new int[n];
B = new int[n];
C = new int[n];
RandomFill(A, n);
RandomFill(B, n);
printf("\nProcess number: %d\t n = %d", ProcRank, n);
printf("\nVector A:\n");
Print(A, n, 0);
printf("\nVector B:\n");
Print(B, n, 0);
int rest = n % (ProcNum - 1);
int *rest_array = new int[rest];
printf("\n\nRest = %d", rest);
if (rest != 0)
{
part_send = (n - rest) / (ProcNum - 1);
nResult = ProcNum;
int j = 0;
for (int i = n - rest; i < n; i++) {
rest_array[j] = A[i] + B[i];
j++;
}
}
else
{
nResult = ProcNum - 1;
part_send = n / (ProcNum - 1);
}
for (int i = 1; i < ProcNum; i++)
{
int index = (i - 1) * part_send;
MPI_Send(&A[index], part_send, MPI_INT, i, 1, MPI_COMM_WORLD);
MPI_Send(&B[index], part_send, MPI_INT, i, 1, MPI_COMM_WORLD);
}
printf("\n\n");
buf = new int[part_send];
for (int i = 1; i < ProcNum; i++) {
MPI_Irecv(&buf, part_send, MPI_INT, MPI_ANY_SOURCE, 3, MPI_COMM_WORLD, &request);
if (MPI_Wait(&request, &status) == MPI_SUCCESS) {
for (int j = 0; j < part_send; j++)
C[(part_send * i) + j] = buf[j];
printf("Result:");
Print(C, n, 0);
}
}
}
else
{
printf("\n Process number: %d\n", ProcRank);
int nRecv = 0;
MPI_Probe(0, 1, MPI_COMM_WORLD, &status);
MPI_Get_count(&status, MPI_INT, &nRecv);
int *recvArr1 = new int[nRecv];
int *recvArr2 = new int[nRecv];
int *recvArrSum = new int[nRecv];
MPI_Recv(recvArr1, nRecv, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
MPI_Recv(recvArr2, nRecv, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
for (int i = 0; i < nRecv; i++) {
recvArrSum[i] = recvArr1[i] + recvArr2[i];
}
printf("recvArrSum:\n");
Print(recvArrSum, nRecv, 0);
MPI_Isend(&recvArrSum, nRecv, MPI_INT, 0, 3, MPI_COMM_WORLD, &request);
}
MPI_Finalize();
return 0;
}
I am trying to write an mpi program for multiplication of 2 matrix . If I give the size of the matrix lower that 800 the code works but when I give it higher I am getting segmentation fault and I am not able to figure out why . I am new to MPI so still trying to understand everything. If possible please help.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define N 1000
int main(int argc, char* argv[]) {
int rank, size;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
double a[N][N], b[N][N], c[N][N];
int i, j, k;
// Initialize the matrices with random values
if (rank == 0) {
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
a[i][j] = (double)rand() / RAND_MAX;
b[i][j] = (double)rand() / RAND_MAX;
}
}
}
// Broadcast the matrices to all ranks
MPI_Bcast(a, N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(b, N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// Each rank calculates a portion of the output matrix
int rows_per_rank = N / size;
int start_row = rows_per_rank * rank;
int end_row = start_row + rows_per_rank;
for (i = start_row; i < end_row; i++) {
for (j = 0; j < N; j++) {
c[i][j] = 0;
for (k = 0; k < N; k++) {
c[i][j] += a[i][k] * b[k][j];
}
}
}
// Gather the output matrix from all ranks
double* c_buffer = (double*) malloc(N*N*sizeof(double));
MPI_Gather(c, rows_per_rank*N, MPI_DOUBLE, c_buffer, rows_per_rank*N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// Print the output matrix
if (rank == 0) {
printf("Output matrix C:\n");
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
printf("%lf ", c_buffer[i*N + j]);
}
printf("\n");
}
}
free(c_buffer);
MPI_Finalize();
return 0;
}
this line
double a[N][N], b[N][N], c[N][N];
with N = 1000 requires 24mb of stack space. Thats almost certainly larger than whats available. Either allocate them statically (place the kw static before them) or dynamically on the heap
I have written the code for matrix multiplication that takes input randomly from 1 to 9 and creates the matrix , the code is actually in MPI, the problem is that it works only for 4x4 matrix if i try anything like 5x5 or 10x10 it starts giving random values,I always change the value of N before creating any other order matrix, i don't know if its a MPI problem or just basic C programming error that I am doing, the code i have written is
#define N 4
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include "mpi.h"
void print_results(char *prompt, int a[N][N]);
int main(int argc, char *argv[])
{
int i, j, k, rank, size, tag = 99, sum = 0;
int a[N][N];
int b[N][N];
int c[N][N];
int aa[N],cc[N];
int row,col;
int dest = 0;
int source;
double time1, time2, duration, global;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if(rank == 0){
printf("enter the number of row =");
scanf("%d",&row);
printf("enter the number of column =");
scanf("%d",&col);
srand(time(NULL));
for(i=0;i<row;i++) {
for(j=0;j<col;j++){
a[i][j] = rand() % 10;
}
}
for(i=0;i<row;i++){
for(j=0;j<col;j++){
b[i][j] = rand() % 10;
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
time1 = MPI_Wtime();
MPI_Scatter(a, N*N/size, MPI_INT, aa, N*N/size, MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(b, N*N, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
{
sum = sum + aa[j] * b[j][i];
}
cc[i] = sum;
sum = 0;
}
MPI_Gather(cc, N*N/size, MPI_INT, c, N*N/size, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
time2 = MPI_Wtime();
duration = time2 - time1;
MPI_Reduce(&duration,&global,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
if(rank == 0) {
printf("Global runtime is %f\n",global);
}
printf("Runtime at %d is %f \n", rank,duration);
MPI_Finalize();
if (rank == 0)
print_results("C = ", c);
}
void print_results(char *prompt, int a[N][N])
{
int i, j;
printf ("\n\n%s\n", prompt);
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
printf(" %d", a[i][j]);
}
printf ("\n");
}
printf ("\n\n");
}
Any solutions, please help me out with it!!
I want to convert backwards substitution sequential C code to parallel and I have an error while rank 1-size receiving data MPI_Recv(prev_x, displacements[rank], MPI_FLOAT, rank-1, tag, MPI_COMM_WORLD, &status);. The logic is a pipeline between processes.
MY code:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
#include <math.h>
int main(int argc, char* argv[]){
int i,j,N;
float **a, *b;
float *local_x, *prev_x, *total_proc_x;
int tag = 100;
//MPI variables
int rank, size;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if(argc != 2){
if(rank == 0) printf("Using : %s <matrix_size>\n", argv[0]);
return 0;
}
N = strtol(argv[1], NULL, 10);
/* Allocate space for matrices */
a = (float **) malloc ( N * sizeof ( float *) );
for ( i = 0; i < N; i++)
a[i] = ( float * ) malloc ( (i+1) * sizeof ( float ) );
b = ( float * ) malloc ( N * sizeof ( float ) );
if(rank == 0){
srand ( time ( NULL));
for (i = 0; i < N; i++) {
b[i] = (float)rand()/(RAND_MAX*2.0-1.0);
a[i][i] = 2.0+(float)rand()/(RAND_MAX*2.0-1.0);
for (j = 0; j < i; j++)
a[i][j] = (float)rand()/(RAND_MAX*2.0-1.0);
for (j=i; j<N; j++)
a[i][j] = 0.0;
}
}
//broadcast data (a,b)
MPI_Bcast(a, N*N, MPI_FLOAT, 0, MPI_COMM_WORLD);
MPI_Bcast(b, N, MPI_FLOAT, 0, MPI_COMM_WORLD);
int block_size = N/size;
int *counts = (int *) malloc(size*sizeof(int));
int *displacements = (int *) malloc(size*sizeof(int));
int start, end;
for(i=0; i<size; i++){
start = 0;
for(j=0; j<i; j++){
start += block_size;
if(size-(j+1) < N%size) start++;
}
end = start + block_size;
if(size-(i+1) < N%size) end++;
counts[i] = end - start;
displacements[i] = start;
}
local_x = (float *) malloc(counts[rank]*sizeof(float));
for(i=0; i<counts[rank]; i++){
local_x[i] = 0.0;
}
prev_x = (float *) malloc(displacements[rank]*sizeof(float));
if(rank == 0) printf("Size: %d\n", size);
printf("Rank %d, Displacement: %d, Count: %d\n", rank, displacements[rank], counts[rank]);
//calculation
float sum;
if(rank == 0){
printf("Rank %d, OK\n", rank);
for(i=0; i<counts[0]; i++){
sum = 0.0;
for(j=0; j<i; j++){
sum = sum + (local_x[j] * a[i][j]);
}
local_x[i] = (b[i] - sum) / a[i][i];
}
MPI_Send(local_x, displacements[rank+1], MPI_FLOAT, rank+1, tag, MPI_COMM_WORLD);
printf("Process %d sent data to process %d\n", rank, rank+1);
}
if(rank != 0 && rank != (size-1)){
printf("Rank %d, OK\n", rank);
MPI_Recv(prev_x, displacements[rank], MPI_FLOAT, rank-1, tag, MPI_COMM_WORLD, &status);
printf("Process %d received data from process %d", rank, rank-1);
for(i=displacements[rank]; i<(displacements[rank] + counts[rank]); i++){
sum = 0.0;
//unowned rows
for(j=0; j<displacements[rank]; j++){
sum = sum + (prev_x[j] * a[i][j]);
}
//owned rows
for(j=displacements[rank]; j<i; j++){
sum = sum + (local_x[j-displacements[rank]] * a[i][j]);
}
local_x[i] = (b[i] - sum) / a[i][i];
}
//concatenate prev and local x
total_proc_x = (float *) malloc((displacements[rank] + counts[rank])*sizeof(float));
for(i=0; i<displacements[rank]; i++){
total_proc_x[i] = prev_x[i];
}
for(i=0; i<counts[rank]; i++){
total_proc_x[i+displacements[rank]] = local_x[i];
}
//send to next process
MPI_Send(total_proc_x, displacements[rank+1], MPI_FLOAT, rank+1, tag, MPI_COMM_WORLD);
}
if(rank == (size-1)){
printf("Rank %d, OK\n", rank);
MPI_Recv(prev_x, displacements[rank], MPI_FLOAT, rank-1, tag, MPI_COMM_WORLD, &status);
printf("Process %d received data from process %d", rank, rank-1);
for(i=displacements[rank]; i<(displacements[rank] + counts[rank]); i++){
sum = 0.0;
//unowned rows
for(j=0; j<displacements[rank]; j++){
sum = sum + (prev_x[j] * a[i][j]);
}
//owned rows
for(j=displacements[rank]; j<i; j++){
sum = sum + (local_x[j-displacements[rank]] * a[i][j]);
}
local_x[i] = (b[i] - sum) / a[i][i];
}
//concatenate prev and local x
float *total_proc_x = (float *) malloc((displacements[rank] + counts[rank])*sizeof(float));
for(i=0; i<displacements[rank]; i++){
total_proc_x[i] = prev_x[i];
}
for(i=0; i<counts[rank]; i++){
total_proc_x[i+displacements[rank]] = local_x[i];
}
/* Print result */
for (i = 0; i < N; i++) {
for (j = 0; j <= i; j++)
printf ("%f \t", a[i][j]);
printf ("%f \t%f\n", total_proc_x[i], b[i]);
}
/* Check result */
for (i = 0; i < N; i++) {
sum = 0.0;
for (j = 0; j <= i; j++)
sum = sum + (total_proc_x[j]*a[i][j]);
if (fabsf(sum - b[i]) > 0.00001) {
printf("%f != %f\n", sum, b[i]);
printf("Validation Failed...\n");
}
}
}
MPI_Finalize();
return 0;
}
Output:
$ mpicc -o backsub_mpi backsub_mpi.c
$ mpiexec -n 4 ./backsub_mpi 20
Rank 1, Displacement: 5, Count: 5
Rank 1, OK
Rank 3, Displacement: 15, Count: 5
Rank 3, OK
Size: 4
Rank 0, Displacement: 0, Count: 5
Rank 0, OK
Process 0 sent data to process 1
Rank 2, Displacement: 10, Count: 5
Rank 2, OK
-----------------------------------------------------------------------------
One of the processes started by mpirun has exited with a nonzero exit
code. This typically indicates that the process finished in error.
If your process did not finish in error, be sure to include a "return
0" or "exit(0)" in your C code before exiting the application.
PID 4105 failed on node n0 (127.0.0.1) due to signal 11.
-----------------------------------------------------------------------------
mpirun failed with exit status 11
Your problem is in how you allocate the matrix
a = (float **) malloc ( N * sizeof ( float *) );
for ( i = 0; i < N; i++)
a[i] = ( float * ) malloc ( (i+1) * sizeof ( float ) );
You're trying to allocate a triangular array, but you send it
MPI_Bcast(a, N*N, MPI_FLOAT, 0, MPI_COMM_WORLD);
as a square array.
Since a is an array or pointers, the actual matrix elements are probably not in contiguous memory.
You need to allocate double *a as a long enough single array, and then do some index translation to figure out where element i,j goes into this array. Something like i*(i+1)/2+j.
So I got the code working for when running on 1 process. Although when I try to run it on more then 2 processers or more(mpirun -n 4)(mpirun -n 8)etc; half my results are coming back as zero.Im assuming because it doesn't deal with the case where the number of processors is divisible by the matrix size. Any ideas? I'm trying to initialize both matrixes from command line and perform matrix multiplication using MPI. I'm knew to this and would love any help. For example when I enter in a size of 2 and initialize matrix A to the values {1,4,6,7} and matrix B to {8,9,4,5} my result comes out to be {8,9,0,0}..
void init_Matrix(int n, int matrix[n][n])
{
for(int i = 0; i < n; i++)
{
for(int j = 0; j < n; j++)
{
scanf("%i", &matrix[i][j]);
}
}
}
void printMatrix(int n, int matrix[n][n])
{
for(int i = 0; i < n; i++)
{
for(int j = 0; j < n; j++)
{
printf("%d" , matrix[i][j]);
printf(" ");
}
printf("\n");
}
}
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
int rank, size;
MPI_Comm_size(MPI_COMM_WORLD, &size); //num p
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int n;
if(rank == 0)
{
printf("Enter in size of matrix! \x0A");
scanf("%i",&n);
}
MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD);
int A[n][n];
int B[n][n];
int C[n][n];
int aa[n/size][n];
int cc[n/size][n];
if(rank == 0)
{
init_Matrix(n,A);
init_Matrix(n,B);
}
for(int i = 0; i < n; i++)
{
for(int j = 0; j < n; j++)
{
cc[i][j] = 0;
}
}
//scatter rows of first matrix to different processes
MPI_Scatter(A, n*n/size, MPI_INT, aa, n*n/size, MPI_INT,0,MPI_COMM_WORLD);
//broadcast second matrix to all processes
MPI_Bcast(B, n*n, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
//perform vector multiplication by all processes
for(int k = 0; k < n/size; k++)
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
cc[i][j] += A[i][k] * B[k][j];
}
}
}
MPI_Gather(cc, n*n/size, MPI_INT, C, n*n/size, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
if(rank == 0){
printMatrix(n, C);
}
MPI_Finalize();
}
updated:
updated attempt using mpi scatterv and mpi gather
:
void initMatrix(int Size, int matrix[Size][Size])
{
for(int i = 0; i < Size; i++)
{
for(int j = 0; j < Size; j++)
scanf("%i", &matrix[i][j]);
}
}
void multIJK(int Size, int A[Size][Size], int B[Size][Size], int pResult[Size])
{
for(int i = 0; i < Size; i++)
{
for(int j = 0; j < Size; j++)
{
for(int k = 0; k < Size; k++)
pResult += A[i][k] * B[k][j];
}
}
}
int main(int argc, char* argv[]) {
int Size;
int RowNum;
int ProcNum;
int ProcRank;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &ProcNum);
MPI_Comm_rank(MPI_COMM_WORLD, &ProcRank);
if (ProcRank == 0) {
printf("Enter in size of matrix! \x0A");
scanf("%i", &Size);
}
int aMatrix[Size][Size];
int bMatrix[Size][Size];
MPI_Bcast(&Size, 1, MPI_INT, 0, MPI_COMM_WORLD);
int RestRows = Size;
for (int i=0; i<ProcRank; i++)
RestRows = RestRows-RestRows/(ProcNum-i);
RowNum = RestRows/(ProcNum-ProcRank);
int pResult[Size];
int pProcRows[RowNum*Size];
int pProcResult[RowNum];
if(ProcRank == 0)
{
initMatrix(Size,aMatrix);
initMatrix(Size,bMatrix);
}
RestRows=Size; // Number of rows, that haven’t been distributed yet
MPI_Bcast(bMatrix, Size, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// Alloc memory for temporary objects
// the number of elements sent to the process
int pSendInd[ProcNum];
// the index of the first data element sent to the process
int pSendNum[ProcNum];
// Define the disposition of the matrix rows for current process
RowNum = (Size/ProcNum);
pSendNum[0] = RowNum*Size;
pSendInd[0] = 0;
for (int i=1; i<ProcNum; i++) {
RestRows -= RowNum;
RowNum = RestRows/(ProcNum-i);
pSendNum[i] = RowNum*Size;
pSendInd[i] = pSendInd[i-1]+pSendNum[i-1];
}
// Scatter the rows
MPI_Scatterv(aMatrix , pSendNum, pSendInd, MPI_INT, pProcRows,
pSendNum[ProcRank], MPI_DOUBLE, 0, MPI_COMM_WORLD);
multIJK(Size,aMatrix,bMatrix,pResult);
RestRows=Size; // Number of rows, that haven’t been distributed yet
//Alloc memory for temporary objects
// Number of elements, that current process sends
int pReceiveNum[ProcNum];
/* Index of the first element from current process in result vector */
int pReceiveInd[ProcNum];
//Define the disposition of the result vector block of current processor
pReceiveInd[0] = 0;
pReceiveNum[0] = Size/ProcNum;
for (int i=1; i<ProcNum; i++) {
RestRows -= pReceiveNum[i-1];
pReceiveNum[i] = RestRows/(ProcNum-i);
pReceiveInd[i] = pReceiveInd[i-1]+pReceiveNum[i-1];
} //Gather the whole result vector on every processor
MPI_Allgatherv(pProcResult, pReceiveNum[ProcRank], MPI_INT, pResult,
pReceiveNum, pReceiveInd, MPI_DOUBLE, MPI_COMM_WORLD);
//ProcessTermination(aMatrix,bMatrix, pResult, pProcRows, pProcResult);
if(ProcRank == 0)
{
for(int i = 0; i < Size; i++)
{
printf("%i\n",pResult[i]);
}
}
MPI_Finalize();
}
You have some logic problems.
for(int i = 0; i < n; i++) <-- this should be until n/size, you are going into unallocated memory
{
for(int j = 0; j < n; j++)
{
cc[i][j] = 0;
}
}
cc[i][j] += A[i][k] * B[k][j]; <-- again, going outsize allocated memory
Replace it with
cc[k][i] += A[k][j] * B[j][i];
Hopefully these are all the problems.
You should also treat the cases where the matrix size is not divisible by the number of processors.