Why matrix multiplication with -O3 flag takes longer than -O0 in C?

Why matrix multiplication with -O3 flag takes longer than -O0 in C? - c

I was trying to multiply two matrices with different techniques to understand the usage of caches and I encountered some interesting results that I don't know what is happening.
If I multiply two matrices with row-column multiplication (A[i][j] += B[i][k] * C[k][j]), it takes about 28 sec without any optimization flags, but if I run the same code with the -O3 flag, it takes about 40 sec for the same 1024x1024 matrices.
/*
 * Multiplies matrix1 by matrix2 with the i-j-k loop order and returns the
 * product as a newly allocated matrix.
 *
 * The result is an array of ROWS row pointers into one contiguous
 * ROWS * COLS block of ints, so the caller releases it with
 * free(result[0]) followed by free(result). The time spent in the triple
 * loop is measured with clock() and printed to stdout.
 *
 * Returns NULL if either allocation fails.
 */
int** mulMatrices(int **matrix1, int **matrix2) {
    int **resultM = malloc(sizeof(int*) * ROWS);
    if (resultM == NULL) {
        return NULL;
    }
    resultM[0] = malloc(sizeof(int) * ROWS * COLS);
    if (resultM[0] == NULL) {
        free(resultM);
        return NULL;
    }
    /* Point every row at its slice of the contiguous block. */
    for(int i = 1; i < ROWS; i++) {
        resultM[i] = resultM[i - 1] + COLS;
    }
    memset(resultM[0], 0, sizeof(int) * ROWS * COLS);
    clock_t start, end;
    double cpu_time_used;
    start = clock();
    for(int i = 0; i < ROWS; i++) {
        for(int j = 0; j < COLS; j++) {
            /* The innermost index k selects the row of matrix2, so
               consecutive iterations read from different rows. */
            for(int k = 0; k < COLS; k++) {
                resultM[i][j] += matrix1[i][k] * matrix2[k][j];
            }
        }
    }
    end = clock();
    cpu_time_used = ((double)(end - start)/(CLOCKS_PER_SEC));
    printf("without row-wise mul: %f\n", cpu_time_used);
    return resultM;
}
But if I do the same thing with the matrix multiplication below, it takes about 25 sec without optimization flags and 1.5 sec with -O3, as expected, since I believe it uses the caches more efficiently.
/*
 * Multiplies matrix1 by matrix2 with the two inner loops interchanged, so
 * the innermost loop walks along one row of matrix2 and one row of the
 * result. Returns the product as a newly allocated matrix.
 *
 * The result is an array of ROWS row pointers into one contiguous
 * ROWS * COLS block of ints, so the caller releases it with
 * free(result[0]) followed by free(result). The time spent in the triple
 * loop is measured with clock() and printed to stdout.
 *
 * Returns NULL if either allocation fails.
 */
int** mulMat(int **matrix1, int **matrix2) {
    int **resultM = malloc(sizeof(int*) * ROWS);
    if (resultM == NULL) {
        return NULL;
    }
    resultM[0] = malloc(sizeof(int) * ROWS * COLS);
    if (resultM[0] == NULL) {
        free(resultM);
        return NULL;
    }
    /* Point every row at its slice of the contiguous block. */
    for(int i = 1; i < ROWS; i++) {
        resultM[i] = resultM[i - 1] + COLS;
    }
    memset(resultM[0], 0, sizeof(int) * ROWS * COLS);
    clock_t start,end;
    double cpu_time_used;
    start = clock();
    for(int i = 0; i < ROWS; i++) {
        for(int j = 0; j < COLS; j++) {
            /* matrix1[i][j] is fixed here; k moves along row j of matrix2
               and row i of the result. */
            for(int k = 0; k < COLS; k++) {
                resultM[i][k] += matrix1[i][j] * matrix2[j][k];
            }
        }
    }
    end = clock();
    cpu_time_used = ((double)(end - start)/(CLOCKS_PER_SEC));
    printf("with row-wise mul: %f\n", cpu_time_used);
    return resultM;
}
My question is: why does the normal matrix multiplication take longer with -O3 than with -O0?
Thanks.

Related

How to Speed up these functions?

I have two functions that I am working with, and I am attempting to make them run as much as 4x faster.
/*
 * Adds every element of the matrix into results, bucketed by column:
 * an element in column c is added to results[c % RESULTS_LEN].
 * results is accumulated into, not cleared, so the caller chooses the
 * starting values.
 */
void get_each_fifth(const matrix_t *matrix, long results[RESULTS_LEN]) {
    for (int row = 0; row < matrix->rows; row++) {
        for (int col = 0; col < matrix->cols; col++) {
            results[col % RESULTS_LEN] += MGET(matrix, row, col);
        }
    }
}
The function above will need to be optimized to be 4x faster. In this function, I am finding the sums of integers based on their location in the matrix. Elements in column 0, 5, 10, etc. go into the first element of the results array. Elements in column 1, 6, 11, etc. go into the second column of the array. This pattern continues for the remaining columns. To summarize, the numbers in column i go into element i % 5 of the results array.
/*
 * Returns the sum of all elements of the matrix, visiting them row by row.
 */
long get_each(const matrix_t *matrix) {
    long total = 0;
    for (int row = 0; row < matrix->rows; row++) {
        for (int col = 0; col < matrix->cols; col++) {
            total += MGET(matrix, row, col);
        }
    }
    return total;
}
This one will need to be 2x faster; it sums all of the elements in the matrix and returns the result.
MGET and MSET are defined:
/* Reads element (i, j) of *mat from the flat data array at offset i * cols + j.
   mat appears twice in the expansion, so its argument is evaluated twice. */
#define MGET(mat, i, j) ((mat)->data[((i)*((mat)->cols)) + (j)])
/* Stores x at element (i, j) of *mat using the same offset computation;
   the expression yields the assigned value. */
#define MSET(mat, i, j, x) ((mat)->data[((i)*((mat)->cols)) + (j)] = (x))
and the matrix_t struct is defined
/* Integer matrix stored in one flat array that is indexed as i * cols + j. */
typedef struct {
long rows; /* number of rows */
long cols; /* number of columns; also the stride between consecutive rows */
int *data; /* rows * cols elements, allocated separately from the struct */
} matrix_t;
and is allocated with this function:
/*
 * Initializes *matrix as a rows x cols matrix and fills it with
 * pseudo-random values in the range 0..99.
 *
 * The generator is seeded with the constant 2021 on every call, so the
 * contents are the same each time. Does nothing when matrix is NULL.
 * If the element storage cannot be allocated, the matrix is left empty
 * (rows == cols == 0, data == NULL) instead of being written through a
 * null pointer.
 */
void set_up_matrix(matrix_t *matrix, int rows, int cols) {
    if (matrix == NULL) {
        return;
    }
    matrix->rows = rows;
    matrix->cols = cols;
    matrix->data = malloc(sizeof(int) * rows * cols);
    if (matrix->data == NULL && rows > 0 && cols > 0) {
        /* Allocation failed: report an empty matrix so that loops bounded
           by rows/cols never touch the missing storage. */
        matrix->rows = 0;
        matrix->cols = 0;
        return;
    }
    srand(2021);
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            MSET(matrix, i, j, rand() % 100);
        }
    }
}
and result len is defined:
/* Length of the results array taken by get_each_fifth, and the modulus it applies to the column index. */
#define RESULTS_LEN 5
Any help would be appreciated!

I would change it to flexible array member and leave the arithmetic to the compiler. It will make it more cache friendly as well. You need to make its task easier by showing what kind of array your data represents. It will allow the compiler to optimize the loops or use vector instructions (if you use some additional command line options). You can use also compiler specific pragmas or attributes like in the example below. It will unroll the loops speeding up the execution speed.
/* Matrix header followed directly by its elements (flexible array member),
   so header and data come from a single allocation. */
typedef struct {
size_t rows; /* number of rows */
size_t cols; /* number of columns */
int data[]; /* element storage, accessed below as a rows x cols array */
} matrix_t;
/*
 * Adds every element of the matrix into results, bucketed by column:
 * an element in column j is added to results[j % RESULTS_LEN].
 * results is accumulated into, not cleared.
 *
 * The flat element storage is viewed as a 2-D array with matrix->cols
 * columns so the indexing arithmetic is left to the compiler. The view is
 * const-qualified because the matrix is only read.
 */
void get_each_fifth(const matrix_t * restrict matrix, long results[RESULTS_LEN])
{
    /* A variably modified array type needs a non-zero bound, and an empty
       matrix contributes nothing anyway. */
    if (matrix->rows == 0 || matrix->cols == 0) {
        return;
    }
    const int (*array)[matrix->cols] = (const int (*)[matrix->cols])matrix->data;
    #pragma GCC unroll 10
    for (size_t i = 0; i < matrix->rows; i++) {
        #pragma GCC unroll 10
        for (size_t j = 0; j < matrix->cols; j++) {
            size_t q = j % RESULTS_LEN;
            results[q] += array[i][j];
        }
    }
}
/*
 * Resizes (or, when matrix is NULL, creates) a matrix with room for
 * rows * cols elements and fills it with pseudo-random values in 0..99.
 *
 * Returns the possibly moved matrix. Returns NULL when the requested size
 * does not fit in size_t or when realloc fails; in both cases the block
 * passed in as matrix is left untouched and still belongs to the caller.
 * Elements are written in row-major order.
 */
matrix_t *set_up_matrix(matrix_t *matrix, size_t rows, size_t cols)
{
    const size_t elem_size = sizeof(matrix->data[0]);
    const size_t max_elems = ((size_t)-1 - sizeof(*matrix)) / elem_size;

    /* Refuse sizes whose byte count would wrap around. */
    if (cols != 0 && rows > max_elems / cols) {
        return NULL;
    }
    const size_t count = rows * cols;

    /* Keep the caller's pointer separate from the realloc result so a
       failure does not lose the original block. */
    matrix_t *resized = realloc(matrix, sizeof(*matrix) + count * elem_size);
    if (resized == NULL) {
        return NULL;
    }
    resized->rows = rows;
    resized->cols = cols;
    srand(time(NULL)); // it should be called once in main
    /* A flat walk visits the elements in the same order as a row-by-row
       2-D loop and needs no variable-length array type (which would be
       invalid for cols == 0). */
    for (size_t idx = 0; idx < count; idx++) {
        resized->data[idx] = rand() % 100;
    }
    return resized;
}

Why is my program generating random results when I nest it?

I made this parallel matrix multiplication program by nesting for loops in OpenMP. When I run the program, it displays the answer randomly (mostly), with varying indices of the resultant matrix. Here is the snippet of the code:
// NOTE(review): i, j and m are not declared inside this snippet. If they are
// declared outside the parallel regions, each is shared by default except in
// the construct whose loop variable it is - confirm against the full program.
#pragma omp parallel for
for(i=0;i<N;i++){
// Second, nested parallel region over the columns of row i.
#pragma omp parallel for
for(j=0;j<N;j++){
C[i][j]=0;
// Every iteration of this parallel loop reads and writes the same C[i][j],
// with no reduction clause or other synchronization.
#pragma omp parallel for
for(m=0;m<N;m++){
C[i][j]=A[i][m]*B[m][j]+C[i][j];
}
printf("C:i=%d j=%d %f \n",i,j,C[i][j]);
}
}

These are the symptoms of a so-called "race condition", as the commenters already stated.
The threads OpenMP uses are independent of each other but the results of the individual loops of the matrix multiplication are not, so one thread might be at a different position than the other one and suddenly you are in trouble if you depend on the order of the results.
You can only parallelize the outmost loop:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
/*
 * Allocates an n x n matrix as n separately allocated rows.
 * Elements are set to (rand() % 100) / 99. when randomize is non-zero and to
 * 0.0 otherwise. Prints a message and exits if memory runs out.
 */
static double **alloc_square(int n, int randomize)
{
    double **mat = malloc(sizeof(*mat) * n);
    if (mat == NULL) {
        fprintf(stderr, "Out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < n; i++) {
        mat[i] = malloc(sizeof(**mat) * n);
        if (mat[i] == NULL) {
            fprintf(stderr, "Out of memory\n");
            exit(EXIT_FAILURE);
        }
        for (int j = 0; j < n; j++) {
            mat[i][j] = randomize ? (double) ((rand() % 100) / 99.) : 0.0;
        }
    }
    return mat;
}

/* Releases a matrix created by alloc_square(). */
static void free_square(double **mat, int n)
{
    for (int i = 0; i < n; i++) {
        free(mat[i]);
    }
    free(mat);
}

/*
 * Multiplies two pseudo-random n x n matrices twice - once serially into C
 * and once with the outermost loop shared among OpenMP threads into D -
 * prints the wall-clock time of each run and reports every cell in which
 * the two results differ.
 *
 * Usage: prog n, with 2 <= n < 1000000.
 */
int main(int argc, char **argv)
{
    int n;
    double **A, **B, **C, **D, t;
    int i, j, k;
    struct timeval start, stop;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s a positive integer >= 2 and < 1 mio\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    n = atoi(argv[1]);
    /* Accept exactly the range the usage text promises (n == 2 is valid). */
    if (n < 2 || n >= 1000000) {
        fprintf(stderr, "Usage: %s a positive integer >= 2 and < 1 mio\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    // make it repeatable
    srand(0xdeadbeef);
    // A and B get random contents (A is filled completely before B),
    // C and D start out as zero
    A = alloc_square(n, 1);
    B = alloc_square(n, 1);
    C = alloc_square(n, 0);
    D = alloc_square(n, 0);

    // some coarse timing
    gettimeofday(&start, NULL);
    // naive matrix multiplication
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            for (k = 0; k < n; k++) {
                C[i][j] = C[i][j] + A[i][k] * B[k][j];
            }
        }
    }
    gettimeofday(&stop, NULL);
    t = ((stop.tv_sec - start.tv_sec) * 1000000u +
         stop.tv_usec - start.tv_usec) / 1.e6;
    printf("Timing for naive run = %.10g\n", t);

    gettimeofday(&start, NULL);
    // Only the outermost loop is work-shared: each thread owns whole rows of
    // D, so no two threads write the same cell. D is listed explicitly; it
    // would be shared by default anyway.
#pragma omp parallel shared(A, B, C, D) private(i, j, k)
#pragma omp for
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            for (k = 0; k < n; k++) {
                D[i][j] = D[i][j] + A[i][k] * B[k][j];
            }
        }
    }
    gettimeofday(&stop, NULL);
    t = ((stop.tv_sec - start.tv_sec) * 1000000u +
         stop.tv_usec - start.tv_usec) / 1.e6;
    printf("Timing for parallel run = %.10g\n", t);

    // check result
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            if (D[i][j] != C[i][j]) {
                printf("Cell %d,%d differs with delta(D_ij-C_ij) = %.20g\n", i, j,
                       D[i][j] - C[i][j]);
            }
        }
    }
    // clean up
    free_square(A, n);
    free_square(B, n);
    free_square(C, n);
    free_square(D, n);
    puts("All ok? Bye");
    exit(EXIT_SUCCESS);
}
(n>2000 might need some patience to get the result)
But it's not fully true. You could (but shouldn't) try to get the innermost loop with something like
// Variant for one (i, j) cell: the k loop is shared among threads, each thread
// accumulates into its own copy of sum, and the copies are added together
// when the loop ends (reduction(+:sum)).
sum = 0.0;
#pragma omp parallel for reduction(+:sum)
for (k = 0; k < n; k++) {
sum += A[i][k] * B[k][j];
}
D[i][j] = sum;
Does not seem to be faster, is even slower with small n.
With the original code and n = 2500 (only one run):
Timing for naive run = 124.466307
Timing for parallel run = 44.154538
About the same with the reduction:
Timing for naive run = 119.586365
Timing for parallel run = 43.288371
With a smaller n = 500
Timing for naive run = 0.444061
Timing for parallel run = 0.150842
It is already slower with reduction at that size:
Timing for naive run = 0.447894
Timing for parallel run = 0.245481
It might win for very large n but I lack the necessary patience.
Nevertheless, a last one with n = 4000 (OpenMP part only):
Normal:
Timing for parallel run = 174.647404
With reduction:
Timing for parallel run = 179.062463
That difference is still fully inside the error-bars.
A better way to multiply large matrices (at ca. n>100) would be the Strassen algorithm.
Oh: I just used square matrices for convenience not because they must be of that form! But if you have rectangular matrices with a large length-ratio you might try to change the way the loops run; column-first or row-first can make a significant difference here.

Initializing data with Openmp [shallow water algorithm]

First of all, my English level sucks, so sorry if something isn't well written...
I'm learning how to parallelize C code using OpenMP. The algorithm I'm trying to parallelize is the shallow water equations algorithm, and although I've gained nearly 40% more performance with a simple #pragma omp parallel for in the most critical loop, I know that my implementation is very poor and I'm not milking the cores as I should. The structure of the code is simple: a 'main' that allocates memory, initializes some matrices and arrays, and calls a function called solver that does all the work, which is where I put the #pragma omp parallel for.
I was thinking that I could boost the performance using a parallel section where the memory is allocated and initialized, so every thread has all the data, but when I run the program I don't get any boost, and since I'm a rookie with this I don't know whether my thinking was bad or my implementation was. I'd appreciate some help or a hint that could boost the performance of the algorithm. This is my homework and I don't want someone to do it for me, just a little help that can make me go forward...
I'll paste the code for better understanding:
MAIN FUNCTION (Allocations and initializations)
/*
 * Entry point of the shallow-water program as posted (the "......." lines
 * are parts the poster left out): sets the grid parameters, allocates and
 * initializes the domain Q, the coordinate arrays x and y and the flux
 * buffers, then hands everything to solver().
 */
int main(int argc, char **argv) {
    long int i, j, m, n, M, N;
    char *ptr;
    long int s;
    int flag, verbose;
    double *Q;
    double *x, *y;
    double **ffx, **nFx, **ffy, **nFy;
    double dx, dt, epsi, delta, dy, tend, tmp, stime;
    /* Default values to use: m volumes in the x-direction and n volumes in the y-direction */
    M = 1000;
    N = 1000;
    /* create file and verbose flags */
    .......
    .......
    /* Parse command line options */
    .......
    .......
    epsi = 2.0;
    delta = 0.5;
    dx = (xend - xstart) / (double) M;
    dy = (yend - ystart) / (double) N;
    dt = dx / sqrt( 9.81 * 5.0);
    tend = 0.1;
    /* Add two ghost volumes at each side of the domain */
    m = M + 2;
    n = N + 2;
    /* Allocate memory for the domain */
    /*HERE IS WHERE I PUT THE PRAGMA FOR PARALLEL INITIALIZATION AND ALLOCATIONS*/
    /* NOTE(review): there is no worksharing construct inside this region, so
       every thread of the team executes all of the statements below. The
       pointers and loop counters assigned here are declared above, outside
       the region, and are therefore shared between the threads. */
    #pragma omp parallel
    {
        Q = (double *) malloc(m * n * cell_size * sizeof(double));
        x = (double *) malloc(m * sizeof(double));
        y = (double *) malloc(n * sizeof(double));
        /* Allocate memory for fluxes */
        ffx = (double **) malloc(cell_size * sizeof(double *));
        ffy = (double **) malloc(cell_size * sizeof(double *));
        nFx = (double **) malloc(cell_size * sizeof(double *));
        nFy = (double **) malloc(cell_size * sizeof(double *));
        ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
        nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
        ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
        nFy[0] = (double *) malloc(cell_size * n * sizeof(double));
        /* Row i of each flux array is a slice of the block allocated for row 0. */
        for (i = 0; i < cell_size; i++) {
            ffx[i] = ffx[0] + i * m;
            nFx[i] = nFx[0] + i * m;
            ffy[i] = ffy[0] + i * n;
            nFy[i] = nFy[0] + i * n;
        }
        /* Coordinates start half a step before xstart / ystart and advance by dx / dy. */
        for (i = 0,tmp= -dx/2 + xstart; i < m; i++, tmp += dx)
            x[i] = tmp;
        for (i = 0,tmp= -dy/2 + ystart; i < n; i++, tmp += dy)
            y[i] = tmp;
        /* Set initial Gauss hump */
        for (i = 0; i < m; i++) {
            for (j = 0; j < n; j++) {
                Q(0, i, j) = 4.0;
                Q(1, i, j) = 0.0;
                Q(2, i, j) = 0.0;
            }
        }
        for (i = 1; i < m-1; i++) {
            for (j = 1; j < n-1; j++) {
                Q(0, i, j) = 4.0 + epsi * exp(-(pow(x[i] - xend / 4.0, 2) + pow(y[j] - yend / 4.0, 2)) /
                                              (pow(delta, 2)));
            }
        }
    }
    // Record start time
    stime = gettime();
    /*THIS IS THE FUNCTION WHERE THE 'WORK' IS DONE*/
    solver(Q, ffx, ffy, nFx, nFy, m, n, tend, dx, dy, dt);
}
SOLVER FUNCTION (Critical Section)
/*
This is the main solver routine.
*/
/*
 * Advances the solution in time: runs ceil(tend / dt) steps, and in each step
 * first copies the boundary condition into the outermost cells of Q and then
 * calls laxf_scheme_2d() to update all volumes.
 * Q is accessed through the Q(k, i, j) macro, which is defined elsewhere.
 */
void solver(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
int m, int n, double tend, double dx, double dy, double dt) {
/* Per-component factor applied to the neighbouring cell when a boundary cell is filled. */
double bc_mask[3] = {1.0, -1.0, -1.0};
double time;
int i, j, k, steps;
steps = ceil(tend / dt);
for (i = 0, time = 0.0; i < steps; i++, time += dt) {
/* Apply boundary condition */
/* The parallel loop runs over k, so at most cell_size iterations are shared among the NTHR threads. */
#pragma omp parallel for private(j) num_threads (NTHR)
for (k = 0; k < cell_size; k++)
{
for (j = 1; j < n - 1 ; j++)
{
Q(k, 0, j) = bc_mask[k] * Q(k, 1, j);
Q(k, m-1, j) = bc_mask[k] * Q(k, m-2, j);
}
}
#pragma omp parallel for private(j) num_threads (NTHR)
for (k = 0; k < cell_size; k++)
{
for (j = 0; j < m; j++)
{
Q(k, j, 0) = bc_mask[k] * Q(k, j, 1);
Q(k, j, n-1) = bc_mask[k] * Q(k, j, n-2);
}
}
/* Update all volumes with the Lax-Friedrich's scheme */
laxf_scheme_2d(Q, ffx, ffy, nFx, nFy, m, n, dx, dy, dt);
}
}
/*
This is the Lax-Friedrich's scheme for updating volumes
*/
/*
 * Performs one update of Q: first a sweep over i = 1..n-1 using the
 * x-direction buffers ffx/nFx, then a sweep over i = 1..m-1 using the
 * y-direction buffers ffy/nFy. In each iteration fx()/fy() is called
 * (presumably to fill ffx/ffy for line i - defined elsewhere, confirm),
 * nFx/nFy is computed from it and Q is then updated from nFx/nFy.
 */
void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
int m, int n, double dx, double dy, double dt) {
int i, j, k;
/* Calculate and update fluxes in the x-direction */
/* NOTE(review): ffx and nFx are indexed only by k and j, so every iteration
   of this parallel i loop writes and then reads the same buffer elements;
   threads running different i values use the same storage - verify. */
#pragma omp parallel for private(k,j) num_threads (NTHR)
for (i = 1; i < n; i++) {
fx(Q, ffx, m, n, i);
for (k = 0; k < cell_size; k++)
for (j = 1; j < m; j++)
nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) - dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
for (k = 0; k < cell_size; k++)
for (j = 1; j < m-1; j++)
Q(k, j, i) = Q(k, j, i) - dt/dx * ((nFx[k][j+1] - nFx[k][j]));
}
/* Calculate and update fluxes in the y-direction */
/* NOTE(review): same pattern as above with ffy and nFy. */
#pragma omp parallel for private(k,j) num_threads (NTHR)
for (i = 1; i < m; i++) {
fy(Q, ffy, m, n, i);
for (k = 0; k < cell_size; k++)
for (j = 1; j < n; j++)
nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) - dy/dt * (Q(k, i, j) - Q(k, i, j -1)));
for (k = 0; k < cell_size; k++)
for (j = 1; j < n-1; j++)
Q(k,i,j) = Q(k,i,j) - dt/dy * ((nFy[k][j+1] - nFy[k][j]));
}
}
As I understand there is no data dependency in the loops of the solver function and it's sub-functions, and since putting a parallel region in the allocation and data initialization did nothing, I don't know how to continue.
Thanks in advance!

There are multiple problems with your code. First of all, you have data races there, since you write to shared variables, such as Q, x, and y, by all threads. Either do the allocations outside of a parallel region or perform them by a single thread only (#pragma omp master or #pragma omp single).
Then, you don't parallelize the for loops in the initialization section. In fact, all these loops are executed by all threads over their whole ranges (again with data races and likely a lot of cache contention). You should add #pragma omp for to these loops, since they are already inside a parallel region. For nested loops, the collapse clause might be useful.
Also, be sure that there are no data races in solver() and laxf_scheme_2d() functions. Seemingly, the most time of the calculation is spend within laxf_scheme_2d(), however, this function is not at all run in parallel. Does it use OpenMP internally?

Thank you for the answers. I've seen many problems in my implementation, first of all the heaviest function where all the job is done is laxf_scheme_2d.
About the Q variable i have this #define Q(i, j, k) Q[((k) + n * ((j) + m * (i)))]
This is laxf_scheme_2d
/*
 * Revised version of the update routine: the j loop is now outside the k loop
 * in every nest, and the two sweeps carry "omp for" directives without a
 * parallel region in this function. They therefore share work among threads
 * only when the function is called from inside a parallel region.
 */
void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
int m, int n, double dx, double dy, double dt) {
/* NOTE(review): j and k are ordinary locals of this function and appear in no private clause. */
int i, j, k;
/* Calculate and update fluxes in the x-direction */
#pragma omp for
for (i = 1; i < n; i++) {
fx(Q, ffx, m, n, i);
for (j = 1; j < m; j++)
for (k = 0; k < cell_size; k++)
nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) -
dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
for (j = 1; j < m-1; j++)
for (k = 0; k < cell_size; k++)
Q(k, j, i) = Q(k, j, i) - dt/dx * ((nFx[k][j+1] - nFx[k][j]));
}
/* Calculate and update fluxes in the y-direction */
#pragma omp for
for (i = 1; i < m; i++) {
fy(Q, ffy, m, n, i);
for (j = 1; j < n; j++)
for (k = 0; k < cell_size; k++)
nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) -
dy/dt * (Q(k, i, j) - Q(k, i, j -1)));
for (j = 1; j < n-1; j++)
for (k = 0; k < cell_size; k++)
Q(k,i,j) = Q(k,i,j) - dt/dy * ((nFy[k][j+1] - nFy[k][j]));
}
}
Functions fx and fy are very simple and have no data dependencies. I can't put the #pragma omp parallel for above the first for loop because there are data races, but for now I can't see how to change this code to overcome them.
/* Declarations and allocations as posted in the follow-up; no OpenMP directive
   appears here, so each allocation is written once in program order. */
long int i, j, m, n, M, N;
char *ptr;
long int s;
int flag, verbose;
double *Q;
double *x, *y;
double **ffx, **nFx, **ffy, **nFy;
double dx, dt, epsi, delta, dy, tend, tmp, stime;
M = 1000;
N = 1000;
/* Add two ghost volumes at each side of the domain */
m = M + 2;
n = N + 2;
/* Allocate memory for the domain */
Q = (double *) malloc(m * n * cell_size * sizeof(double));
x = (double *) malloc(m * sizeof(double));
y = (double *) malloc(n * sizeof(double));
/* Allocate memory for fluxes */
/* First the cell_size row pointers of each flux array ... */
ffx = (double **) malloc(cell_size * sizeof(double *));
ffy = (double **) malloc(cell_size * sizeof(double *));
nFx = (double **) malloc(cell_size * sizeof(double *));
nFy = (double **) malloc(cell_size * sizeof(double *));
/* ... then one block per array holding all cell_size rows, stored in row 0. */
ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
nFy[0] = (double *) malloc(cell_size * n * sizeof(double));

Making an OpenMP program work with Pthreads, segfault error

I have a written a program that performs Gaussian elimination in C and returns the L2 norm of a matrix. The program is called like ./exec n k where n is the size of a n by n matrix and k is the number of threads that will be used to do the program (max 4). I allocate space for a n by n+1 matrix because having an augmented matrix is part of the gaussian elimination.
It works perfectly in OpenMP. As seen in the code below, I only have 1 parallel for. My goal now is to make that parallel for loop run concurrently using Pthreads instead of OpenMP. I made the for loop that be parallelized into a separate function and create pthreads to deal with it. My guess is that the pthreads are not each doing a different part of the loop (basically a different iteration of j), but instead the 4 Pthreads are just running the entire loops. I run the program like ./gauss 30 4 and it sometimes work and sometimes segfaults, although when it does work the L2 norm is not 0 (L2 will return 0 if program worked perfectly), so something is obviously up with the threading section. When I run it through GDB it segfaults at a loop for some reason but this same loop runs perfectly in OpenMP...can someone help me out
GDB
http://i.stack.imgur.com/V99yt.png
OpenMP Code:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <time.h>
#include <sys/time.h>
//globals
// a: matrix rows; vect, bvect, temp: work vectors allocated in main;
// scalar, ratio, sum, delta: scalar work variables.
double **a, *vect, *bvect, scalar, ratio, sum, delta, *temp;
// Loop counters and the pivot row index (ptr) are file-scope variables.
int i,j,k,ptr, z;
// z is declared a second time here.
int y,z;
int bvectcount = 0;
// Timestamps taken with gettimeofday() in main.
struct timeval start, end;
// a is matrix, b is vector, x is the solution vector, and n is the size
double L2(double **a, double *bvect, double *vect, int matrixSize) {
double sum;
double res[matrixSize];
int i, j;
for (i=0; i < matrixSize; i++) {
sum = (double) 0;
for (j=0; j < matrixSize; j++) {
sum += a[i][j] * vect[j];
}
res[i] = sum;
}
for (i=0; i < matrixSize; i++) {
res[i] -= vect[i];
}
double sum_squares = (double) 0;
for (i=0; i < matrixSize; i++) {
sum_squares += res[i] * res[i];
}
return sqrt(sum_squares);
}
/*
 * Verifies that exactly two command-line arguments (matrix size and thread
 * count) were given. Otherwise prints a usage message to stderr and exits
 * with status 1.
 *
 * Returns 0 when the argument count is correct. argv is not inspected.
 */
int checkargs(int argc, char* argv[]){
    (void)argv;
    if(argc != 3){
        fprintf(stderr, "Error: Usage is size threadNum\n" );
        exit(1);
    }
    return 0;
}
// Builds a random matrixSize x (matrixSize + 1) augmented matrix, reduces it
// with partial pivoting (the row elimination is shared among OpenMP threads),
// runs the back substitution, and prints the value returned by L2() and the
// elapsed time.
int main(int argc, char* argv[]){
//check for args
checkargs(argc, argv);
int matrixSize = atoi(argv[1]);
int threadNum = atoi(argv[2]);
// NOTE(review): chunk is not used anywhere below, and the division is
// undefined when threadNum is 0.
int chunk = matrixSize/threadNum;
//memory allocation
a = (double**)malloc(matrixSize*sizeof(double*));
// Each row is sized for matrixSize * matrixSize doubles; the loops below use
// columns 0..matrixSize of every row.
for(i = 0; i < matrixSize ; i++)
a[i] = (double*)malloc(matrixSize*sizeof(double) * matrixSize);
vect = (double*)malloc(matrixSize*sizeof(double));
bvect = (double*)malloc(matrixSize*sizeof(double));
temp = (double*)malloc(matrixSize*sizeof(double));
for(i = 0; i < matrixSize; ++i){
for(j = 0; j < matrixSize + 1; ++j){
a[i][j] = drand48();
}
}
// Save the last (augmented) column of the freshly generated matrix in bvect.
j = 0;
j += matrixSize;
for(i = 0; i < matrixSize; ++i){
bvect[i] = a[i][j];
}
//generation of scalar matrix (diagonal vector)
gettimeofday(&start, NULL);
for(i=0; i<matrixSize; i++){
scalar = a[i][i];
//initialization of p to travel throughout matrix
ptr = i;
//find largest number in column and row number of it
for(k = i+1; k < matrixSize; k++){
if(fabs(scalar) < fabs(a[k][i])){
//k is row of scalar, while
scalar = a[k][i];
ptr = k;
}
}
//swaping the elements of diagonal row and row containing largest no
for(j = 0; j <= matrixSize; j++){
temp[0] = a[i][j];
a[i][j]= a[ptr][j];
a[ptr][j] = temp[0];
}
//calculating triangular matrix
//threading needs to be done HERE
// Scale the pivot row so that a[i][i] becomes 1.
ratio = a[i][i];
for(k = 0; k < matrixSize + 1; k++){
a[i][k] = a[i][k] / ratio;
}
double temp2;
// The rows below the pivot are divided among the threads; each iteration j
// writes only row j and reads the pivot row i.
#pragma omp parallel default(none) num_threads(threadNum) shared(a,i,matrixSize,vect) private(j,z,ratio,temp2)
{
#pragma omp for schedule(static)
for(j = i + 1; j<matrixSize; j++){
temp2 = a[j][i]/a[i][i];
for(z = 0; z<matrixSize + 1; z++){
a[j][z] = a[j][z] - temp2 * a[i][z];
}
}
}
}
//backward substitution method
for(i=matrixSize-1; i >=0; i--){
for(k = i; k > 0; k--){
a[k-1][matrixSize] -= a[k-1][i] * a[i][matrixSize];
a[k-1][i] -= a[k-1][i] * a[i][i];
}
}
// The solution is read from the last column.
for(i = 0; i < matrixSize; ++i){
vect[i] = a[i][matrixSize];
}
double l2Norm;
// L2 receives the matrix as modified by the elimination above.
l2Norm = L2(a, bvect, vect, matrixSize);
printf("THIS IS L2 NORM: %f\n", l2Norm);
// The measured interval includes the L2 call and the printf above.
gettimeofday(&end, NULL);
delta = ((end.tv_sec - start.tv_sec) * 1000000u +
end.tv_usec - start.tv_usec) / 1.e6;
printf("end time: %f\n", delta);
}
Pthreads code:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <time.h>
#include <sys/time.h>
#include <pthread.h>
//globals
// a: matrix rows; vect, bvect, temp: work vectors allocated in main;
// scalar, ratio, sum, delta: scalar work variables.
double **a, *vect, *bvect, scalar, ratio, sum, delta, *temp;
// Loop counters and the pivot row index (ptr) are file-scope variables,
// so they are visible to every thread.
int i,j,k,ptr, z;
// z is declared a second time here.
int y,z;
int bvectcount = 0;
int threadcount;
// Fixed capacity of four worker thread handles.
pthread_t workerThreads[4];
// Argument record handed to parallelstuff().
typedef struct threader {
int counter; // pivot row index
int matrixl; // matrix size
} threader;
// Timestamps taken with gettimeofday() in main.
struct timeval start, end;
// Receives the thread return value from pthread_join().
void *retval;
int checkargs(int argc, char* argv[]);
// a is matrix, b is vector, x is the solution vector, and n is the size
double L2(double **a, double *bvect, double *vect, int matrixSize) {
double sum;
double res[matrixSize];
int i, j;
for (i=0; i < matrixSize; i++) {
sum = (double) 0;
for (j=0; j < matrixSize; j++) {
sum += a[i][j] * vect[j];
}
res[i] = sum;
}
for (i=0; i < matrixSize; i++) {
res[i] -= vect[i];
}
double squaresum = (double) 0;
for (i=0; i < matrixSize; i++) {
squaresum += res[i] * res[i];
}
return sqrt(squaresum);
}
/*
 * Verifies that exactly two command-line arguments (matrix size and thread
 * count) were given. Otherwise prints a usage message to stderr and exits
 * with status 1.
 *
 * Returns 0 when the argument count is correct. argv is not inspected.
 */
int checkargs(int argc, char* argv[]){
    (void)argv;
    if(argc != 3){
        fprintf(stderr, "Error: Usage is size threadNum\n" );
        exit(1);
    }
    return 0;
}
// Thread entry point: subtracts a multiple of pivot row i from every row below
// it in the global matrix a. args points to a threader; it is copied first.
// NOTE(review): the function is declared to return void * but has no return
// statement, while main passes &retval to pthread_join().
void *parallelstuff(void *args){
threader temp = *((threader *)args);
int i, matrixSize;
i = temp.counter;
matrixSize = temp.matrixl;
double temp2;
// j and z are not declared in this function, so these loops use the
// file-scope counters, which all threads share.
for(j = i + 1; j<matrixSize; j++){
temp2 = a[j][i]/a[i][i];
for(z = 0; z<matrixSize + 1; z++){
a[j][z] = a[j][z] - temp2 * a[i][z];
}
}
}
// Pthreads version of the elimination program as posted in the question:
// builds a random augmented matrix, and for each pivot row starts threadNum
// threads running parallelstuff() and joins them.
int main(int argc, char* argv[]){
//check for args
checkargs(argc, argv);
int matrixSize = atoi(argv[1]);
int threadNum = atoi(argv[2]);
//memory allocation
a = (double**)malloc(matrixSize*sizeof(double*));
// Each row is sized for matrixSize * matrixSize doubles; the loops below use
// columns 0..matrixSize of every row.
for(i = 0; i < matrixSize ; i++)
a[i] = (double*)malloc(matrixSize*sizeof(double) * matrixSize);
vect = (double*)malloc(matrixSize*sizeof(double));
bvect = (double*)malloc(matrixSize*sizeof(double));
temp = (double*)malloc(matrixSize*sizeof(double));
for(i = 0; i < matrixSize; ++i){
for(j = 0; j < matrixSize + 1; ++j){
a[i][j] = drand48();
}
}
// Save the last (augmented) column of the freshly generated matrix in bvect.
j = 0;
j += matrixSize;
for(i = 0; i < matrixSize; ++i){
bvect[i] = a[i][j];
}
//generation of scalar matrix (diagonal vector)
gettimeofday(&start, NULL);
// NOTE(review): the brace opened by this loop is only closed by the first of
// the two closing braces at the very end of the function, so everything down
// to the final printf - including the back substitution, which reuses i as its
// own counter - runs inside each iteration.
for(i=0; i<matrixSize; i++){
scalar = a[i][i];
//initialization of p to travel throughout matrix
ptr = i;
//find largest number in column and row number of it
for(k = i+1; k < matrixSize; k++){
if(fabs(scalar) < fabs(a[k][i])){
//k is row of scalar, while
scalar = a[k][i];
ptr = k;
}
}
//swaping the elements of diagonal row and row containing largest no
for(j = 0; j <= matrixSize; j++)
{
temp[0] = a[i][j];
a[i][j]= a[ptr][j];
a[ptr][j] = temp[0];
}
// Scale the pivot row so that a[i][i] becomes 1.
ratio = a[i][i];
for(k = 0; k < matrixSize + 1; k++){
a[i][k] = a[i][k] / ratio;
}
threader stuff;
stuff.counter = i;
stuff.matrixl = matrixSize;
//MAKE EACH THREAD DO SOMETHING DIFF
// parallelstuff(int i, int matrixSize, double **a){
// Every thread receives the address of the same stuff object.
// threadNum is not compared with the size of workerThreads.
for(threadcount = 0; threadcount < threadNum; threadcount++){
if(pthread_create (&workerThreads[threadcount], NULL, parallelstuff, (void *) &stuff ) != 0){
fprintf(stderr, "Error: consumer create problem\n");
exit(1);
}
}
// Join the threads in reverse order of creation.
while(threadcount != 0){
if(pthread_join (workerThreads[threadcount-1], &retval ) != 0){
fprintf(stderr, "Error: consumer create problem\n");
exit(1);
}
threadcount--;
}
//create matrix of n size
//backward substitution method
for(i=matrixSize-1; i >=0; i--){
for(k = i; k > 0; k--){
a[k-1][matrixSize] -= a[k-1][i] * a[i][matrixSize];
a[k-1][i] -= a[k-1][i] * a[i][i];
}
}
for(i = 0; i < matrixSize; ++i){
vect[i] = a[i][matrixSize];
}
double l2Norm;
l2Norm = L2(a, bvect, vect, matrixSize);
printf("THIS IS L2 NORM: %f\n", l2Norm);
gettimeofday(&end, NULL);
delta = ((end.tv_sec - start.tv_sec) * 1000000u +
end.tv_usec - start.tv_usec) / 1.e6;
printf("end time: %f\n", delta);
}
}

note that j , z should be declared as local (private) variables in each thread.
in OpenMP Code , you closed the brace of for loop in line 100 :
// Abridged excerpt: the dotted lines stand for omitted statements.
gettimeofday(&start, NULL);
for(i=0; i<matrixSize; i++){
scalar = a[i][i];
//initialization of p to travel throughout matrix
.......
......
.....
} //line 100
but in pthreads code, you closed it in line 149, so the full code:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <time.h>
#include <sys/time.h>
#include <pthread.h>
//globals
// a: matrix rows; vect, bvect, temp: work vectors allocated in main;
// scalar, ratio, sum, delta: scalar work variables.
double **a, *vect, *bvect, scalar, ratio, sum, delta, *temp;
// Loop counters and the pivot row index (ptr) used by main.
int i,j,k,ptr, z;
int y; //z?
int bvectcount = 0;
int threadcount;
// Fixed capacity of four worker thread handles.
pthread_t workerThreads[4];
// Argument record handed to parallelstuff().
typedef struct threader {
int counter; // pivot row index
int matrixl; // matrix size
} threader;
// Timestamps taken with gettimeofday() in main.
struct timeval start, end;
// Receives the thread return value from pthread_join().
void *retval;
int checkargs(int argc, char* argv[]);
// a is matrix, b is vector, x is the solution vector, and n is the size
double L2(double **a, double *bvect, double *vect, int matrixSize) {
double sum;
double res[matrixSize];
int i, j;
for (i=0; i < matrixSize; i++) {
sum = (double) 0;
for (j=0; j < matrixSize; j++) {
sum += a[i][j] * vect[j];
}
res[i] = sum;
}
for (i=0; i < matrixSize; i++) {
res[i] -= vect[i];
}
double squaresum = (double) 0;
for (i=0; i < matrixSize; i++) {
squaresum += res[i] * res[i];
}
return sqrt(squaresum);
}
/*
 * Verifies that exactly two command-line arguments (matrix size and thread
 * count) were given. Otherwise prints a usage message to stderr and exits
 * with status 1.
 *
 * Returns 0 when the argument count is correct. argv is not inspected.
 */
int checkargs(int argc, char* argv[]){
    (void)argv;
    if(argc != 3){
        fprintf(stderr, "Error: Usage is size threadNum\n" );
        exit(1);
    }
    return 0;
}
/*
 * Thread entry point: eliminates column i from every row below pivot row i
 * of the global matrix a.
 *
 * args points to a threader whose counter field is the pivot row index and
 * whose matrixl field is the matrix size; the record is copied before use.
 * For each row j in i+1 .. matrixSize-1, temp2 = a[j][i] / a[i][i] times the
 * pivot row is subtracted across all matrixSize + 1 columns. All loop state
 * is local to the call.
 *
 * NOTE(review): the row range is not divided up here, so threads started
 * with the same threader all process the same rows.
 *
 * Always returns NULL, so pthread_join() receives a defined value.
 */
void *parallelstuff(void *args){
    const threader job = *((threader *)args);
    const int i = job.counter;
    const int matrixSize = job.matrixl;
    for (int j = i + 1; j < matrixSize; j++) {
        const double temp2 = a[j][i]/a[i][i];
        for (int z = 0; z < matrixSize + 1; z++) {
            a[j][z] = a[j][z] - temp2 * a[i][z];
        }
    }
    return NULL;
}
// Pthreads version with the pivot loop closed before the back substitution:
// builds a random augmented matrix, for each pivot row starts threadNum
// threads running parallelstuff() and joins them, then runs the back
// substitution and prints the value returned by L2() and the elapsed time.
int main(int argc, char* argv[]){
//check for args
checkargs(argc, argv);
int matrixSize = atoi(argv[1]);
int threadNum = atoi(argv[2]);
//memory allocation
a = (double**)malloc(matrixSize*sizeof(double*));
// Each row is sized for matrixSize * matrixSize doubles; the loops below use
// columns 0..matrixSize of every row.
for(i = 0; i < matrixSize ; i++)
a[i] = (double*)malloc(matrixSize*sizeof(double) * matrixSize);
vect = (double*)malloc(matrixSize*sizeof(double));
bvect = (double*)malloc(matrixSize*sizeof(double));
temp = (double*)malloc(matrixSize*sizeof(double));
for(i = 0; i < matrixSize; ++i){
for(j = 0; j < matrixSize + 1; ++j){
a[i][j] = drand48();
}
}
// Save the last (augmented) column of the freshly generated matrix in bvect.
j = 0;
j += matrixSize;
for(i = 0; i < matrixSize; ++i){
bvect[i] = a[i][j];
}
//generation of scalar matrix (diagonal vector)
gettimeofday(&start, NULL);
for(i=0; i<matrixSize; i++){
scalar = a[i][i];
//initialization of p to travel throughout matrix
ptr = i;
//find largest number in column and row number of it
for(k = i+1; k < matrixSize; k++){
if(fabs(scalar) < fabs(a[k][i])){
//k is row of scalar, while
scalar = a[k][i];
ptr = k;
}
}
//swaping the elements of diagonal row and row containing largest no
for(j = 0; j <= matrixSize; j++)
{
temp[0] = a[i][j];
a[i][j]= a[ptr][j];
a[ptr][j] = temp[0];
}
// Scale the pivot row so that a[i][i] becomes 1.
ratio = a[i][i];
for(k = 0; k < matrixSize + 1; k++){
a[i][k] = a[i][k] / ratio;
}
threader stuff;
stuff.counter = i;
stuff.matrixl = matrixSize;
//printf("i=%d\n" , i);
//MAKE EACH THREAD DO SOMETHING DIFF
// parallelstuff(int i, int matrixSize, double **a){
// Every thread receives the address of the same stuff object, i.e. the same
// pivot index and size. threadNum is not compared with the size of
// workerThreads.
for(threadcount = 0; threadcount < threadNum; threadcount++){
if(pthread_create (&workerThreads[threadcount], NULL, parallelstuff, (void *) &stuff ) != 0){
fprintf(stderr, "Error: consumer create problem\n");
exit(1);
}
}
// Join the threads in reverse order of creation before moving to the next pivot.
while(threadcount != 0){
if(pthread_join (workerThreads[threadcount-1], &retval ) != 0){
fprintf(stderr, "Error: consumer create problem\n");
exit(1);
}
threadcount--;
}
}
//create matrix of n size
//backward substitution method
for(i=matrixSize-1; i >=0; i--){
for(k = i; k > 0; k--){
a[k-1][matrixSize] -= a[k-1][i] * a[i][matrixSize];
a[k-1][i] -= a[k-1][i] * a[i][i];
}
}
// The solution is read from the last column.
for(i = 0; i < matrixSize; ++i){
vect[i] = a[i][matrixSize];
}
double l2Norm;
// L2 receives the matrix as modified by the elimination above.
l2Norm = L2(a, bvect, vect, matrixSize);
printf("THIS IS L2 NORM: %f\n", l2Norm);
// The measured interval includes the L2 call and the printf above.
gettimeofday(&end, NULL);
delta = ((end.tv_sec - start.tv_sec) * 1000000u +
end.tv_usec - start.tv_usec) / 1.e6;
printf("end time: %f\n", delta);
}

The two codes as written are not equivalent. Observe the OpenMP code:
// Worksharing loop: the iterations j = i+1 .. matrixSize-1 are divided among
// the threads of the enclosing parallel region.
#pragma omp for schedule(static)
for(j = i + 1; j<matrixSize; j++){
temp2 = a[j][i]/a[i][i];
for(z = 0; z<matrixSize + 1; z++){
a[j][z] = a[j][z] - temp2 * a[i][z];
}
}
The combined parallel for construct in OpenMP is a worksharing construct, i.e. it distributes the iterations on the following loop among the threads in the team. Given the schedule(static) clause, the iteration space is split into #threads blocks and each block is assigned to a different thread.
Your Pthreads code does not share the work:
// Excerpt of the thread function: the bounds come only from the argument
// record, so the loop covers all of j = i+1 .. matrixSize-1.
i = temp.counter;
matrixSize = temp.matrixl;
...
for(j = i + 1; j<matrixSize; j++){
temp2 = a[j][i]/a[i][i];
for(z = 0; z<matrixSize + 1; z++){
a[j][z] = a[j][z] - temp2 * a[i][z];
}
}
Given that the same stuff object is passed to all threads, they all receive the same value of i and matrixSize and loop over the whole iteration space, therefore the wrong results.
What you have to do is simulate what #pragma omp for schedule(static) does, namely make each thread do only some of the matrixSize - (i+1) iterations. You should pass each thread a unique data object that contains the starting and the ending iteration:
// Per-thread work description read by parallelstuff().
typedef struct threader {
int start; // the thread's first processed row is start + 1
int end; // the thread stops before row end
int i; // pivot row index
int matrixSize; // matrix size; rows have matrixSize + 1 columns
} threader;
...
/*
 * Thread entry point for one slice of an elimination step.
 *
 * args points to a threader describing the slice. For every row j with
 * start < j < end, the function subtracts (a[j][i] / a[i][i]) times row i
 * from row j of the global matrix `a`, across matrixSize + 1 columns.
 * Note that `start` is exclusive: the first row touched is start + 1.
 *
 * Returns NULL. The value is only there so that pthread_join() receives a
 * well-defined result instead of whatever falling off the end would leave.
 */
void *parallelstuff(void *args){
    threader *temp = (threader *)args;
    int start, end, i, matrixSize;
    start = temp->start;
    end = temp->end;
    i = temp->i;
    matrixSize = temp->matrixSize;
    double temp2;
    int j, z;
    for(j = start + 1; j<end; j++){
        /* multiplier that cancels column i of row j */
        temp2 = a[j][i]/a[i][i];
        for(z = 0; z<matrixSize + 1; z++){
            a[j][z] = a[j][z] - temp2 * a[i][z];
        }
    }
    return NULL;
}
...
threader stuff[threadNum];
//MAKE EACH THREAD DO SOMETHING DIFF
// parallelstuff(int i, int matrixSize, double **a){
for(threadcount = 0; threadcount < threadNum; threadcount++){
stuff[threadcount].start = i + threadcount*(matrixSize / threadNum);
stuff[threadcount].end = i + (threadcount+1)*(matrixSize / threadNum);
stuff[threadcount].i = i;
stuff[threadcount].matrixSize = matrixSize;
if(pthread_create (&workerThreads[threadcount], NULL, parallelstuff, (void *) &stuff ) != 0){
fprintf(stderr, "Error: consumer create problem\n");
exit(1);
}
}
In theory, you could also let each thread know how many threads are out there and let it compute the iteration range itself, but that is complicated by the fact that Pthreads API lacks the equivalent of omp_get_thread_num(). There is an advanced trick that employs aligned memory allocation and the numeric thread ID encoded in the last bits of the pointer passed.

Memory management for Strassen's matrix multiplication

As part of an assignment, I am trying to find the crossover point between Strassen's matrix multiplication and the naive multiplication algorithm. However, I am unable to proceed once the matrix size reaches 256x256. Can someone please suggest an appropriate memory management technique so that I can handle larger inputs?
The code is in C as follows:
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<time.h>
void strassenMul(double* X, double* Y, double* Z, int m);
void matMul(double* A, double* B, double* C, int n);
void matAdd(double* A, double* B, double* C, int m);
void matSub(double* A, double* B, double* C, int m);
int idx = 0;
/*
 * Benchmark driver: times matMul() against strassenMul() for matrix sizes
 * 2^0 .. 2^(total-1), derives the global switch-over value idx from the
 * timings, then multiplies two user-sized random matrices with strassenMul()
 * and prints the inputs and the product.
 */
int main()
{
int N;
int count = 0;
int i, j;
clock_t start, end;
double elapsed;
int total = 15;
double tnaive[total];
double tstrassen[total];
printf("-------------------------------------------------------------------------\n\n");
for (count = 0; count < total; count++)
{
N = pow(2, count);
printf("Matrix size = %2d\t",N);
/* NOTE(review): four N x N variable-length arrays with automatic storage; at the
   largest size (N = 2^14) each one is 2^28 doubles, which will not fit on a typical stack. */
double X[N][N], Y[N][N], Z[N][N], W[N][N];
/* the generator is re-seeded on every iteration */
srand(time(NULL));
/* fill both inputs with values in [0, 1) */
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
{
X[i][j] = rand()/(RAND_MAX + 1.);
Y[i][j] = rand()/(RAND_MAX + 1.);
}
}
start = clock();
matMul((double *)X, (double *)Y, (double *)W, N);
end = clock();
/* CPU time scaled by 100, i.e. hundredths of a second */
elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
tnaive[count] = elapsed;
printf("naive = %5.4f\t\t",tnaive[count]);
start = clock();
strassenMul((double *)X, (double *)Y, (double *)Z, N);
end = clock();
elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
tstrassen[count] = elapsed;
printf("strassen = %5.4f\n",tstrassen[count]);
}
printf("-------------------------------------------------------------------\n\n\n");
/* NOTE(review): the array reads happen before the idx < 14 test, so once idx reaches 14
   this reads element 15 of the 15-element timing arrays. idx is also left as a loop index
   (an exponent), while strassenMul() compares it with the matrix dimension. */
while (tnaive[idx+1] <= tstrassen[idx+1] && idx < 14) idx++;
printf("Optimum input size to switch from normal multiplication to Strassen's is above %d\n\n", idx);
printf("Please enter the size of array as a power of 2\n");
/* NOTE(review): the scanf result and the value of N are not validated */
scanf("%d",&N);
double A[N][N], B[N][N], C[N][N];
srand(time(NULL));
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
{
A[i][j] = rand()/(RAND_MAX + 1.);
B[i][j] = rand()/(RAND_MAX + 1.);
}
}
printf("------------------- Input Matrices A and B ---------------------------\n\n");
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
printf("%5.4f ",A[i][j]);
printf("\n");
}
printf("\n");
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
printf("%5.4f ",B[i][j]);
printf("\n");
}
printf("\n------- Output matrix by Strassen's method after optimization -----------\n\n");
strassenMul((double *)A, (double *)B, (double *)C, N);
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
printf("%5.4f ",C[i][j]);
printf("\n");
}
return(0);
}
/*
 * Recursive Strassen product Z = X * Y for m x m matrices stored contiguously
 * in row-major order. Sizes at or below the global idx are delegated to
 * matMul(); a 1 x 1 product is computed directly. Otherwise the inputs are
 * split into four n x n quadrants (n = m/2), seven recursive products P1..P7
 * are formed, and the result quadrants are assembled from them.
 */
void strassenMul(double *X, double *Y, double *Z, int m)
{
if (m <= idx)
{
matMul((double *)X, (double *)Y, (double *)Z, m);
return;
}
if (m == 1)
{
*Z = *X * *Y;
return;
}
int row = 0, col = 0;
int n = m/2;
int i = 0, j = 0;
/* NOTE(review): 33 variable-length n x n arrays with automatic storage are live at every
   recursion level; this is the memory pressure the surrounding question is about. */
double x11[n][n], x12[n][n], x21[n][n], x22[n][n];
double y11[n][n], y12[n][n], y21[n][n], y22[n][n];
double P1[n][n], P2[n][n], P3[n][n], P4[n][n], P5[n][n], P6[n][n], P7[n][n];
double C11[n][n], C12[n][n], C21[n][n], C22[n][n];
double S1[n][n], S2[n][n], S3[n][n], S4[n][n], S5[n][n], S6[n][n], S7[n][n];
double S8[n][n], S9[n][n], S10[n][n], S11[n][n], S12[n][n], S13[n][n], S14[n][n];
/* copy the top half of X and Y into the 11 (left) and 12 (right) quadrants */
for (row = 0, i = 0; row < n; row++, i++)
{
for (col = 0, j = 0; col < n; col++, j++)
{
x11[i][j] = *((X+row*m)+col);
y11[i][j] = *((Y+row*m)+col);
}
for (col = n, j = 0; col < m; col++, j++)
{
x12[i][j] = *((X+row*m)+col);
y12[i][j] = *((Y+row*m)+col);
}
}
/* copy the bottom half of X and Y into the 21 (left) and 22 (right) quadrants */
for (row = n, i = 0; row < m; row++, i++)
{
for (col = 0, j = 0; col < n; col++, j++)
{
x21[i][j] = *((X+row*m)+col);
y21[i][j] = *((Y+row*m)+col);
}
for (col = n, j = 0; col < m; col++, j++)
{
x22[i][j] = *((X+row*m)+col);
y22[i][j] = *((Y+row*m)+col);
}
}
// Calculating P1 = (x11 + x22) * (y11 + y22)
matAdd((double *)x11, (double *)x22, (double *)S1, n);
matAdd((double *)y11, (double *)y22, (double *)S2, n);
strassenMul((double *)S1, (double *)S2, (double *)P1, n);
// Calculating P2 = (x21 + x22) * y11
matAdd((double *)x21, (double *)x22, (double *)S3, n);
strassenMul((double *)S3, (double *)y11, (double *)P2, n);
// Calculating P3 = x11 * (y12 - y22)
matSub((double *)y12, (double *)y22, (double *)S4, n);
strassenMul((double *)x11, (double *)S4, (double *)P3, n);
// Calculating P4 = x22 * (y21 - y11)
matSub((double *)y21, (double *)y11, (double *)S5, n);
strassenMul((double *)x22, (double *)S5, (double *)P4, n);
// Calculating P5 = (x11 + x12) * y22
matAdd((double *)x11, (double *)x12, (double *)S6, n);
strassenMul((double *)S6, (double *)y22, (double *)P5, n);
// Calculating P6 = (x21 - x11) * (y11 + y12)
matSub((double *)x21, (double *)x11, (double *)S7, n);
matAdd((double *)y11, (double *)y12, (double *)S8, n);
strassenMul((double *)S7, (double *)S8, (double *)P6, n);
// Calculating P7 = (x12 - x22) * (y21 + y22)
matSub((double *)x12, (double *)x22, (double *)S9, n);
matAdd((double *)y21, (double *)y22, (double *)S10, n);
strassenMul((double *)S9, (double *)S10, (double *)P7, n);
// Calculating C11 = P1 + P4 - P5 + P7
matAdd((double *)P1, (double *)P4, (double *)S11, n);
matSub((double *)S11, (double *)P5, (double *)S12, n);
matAdd((double *)S12, (double *)P7, (double *)C11, n);
// Calculating C12 = P3 + P5
matAdd((double *)P3, (double *)P5, (double *)C12, n);
// Calculating C21 = P2 + P4
matAdd((double *)P2, (double *)P4, (double *)C21, n);
// Calculating C22 = P1 + P3 - P2 + P6
matAdd((double *)P1, (double *)P3, (double *)S13, n);
matSub((double *)S13, (double *)P2, (double *)S14, n);
matAdd((double *)S14, (double *)P6, (double *)C22, n);
/* write C11 and C12 into the top half of Z */
for (row = 0, i = 0; row < n; row++, i++)
{
for (col = 0, j = 0; col < n; col++, j++)
*((Z+row*m)+col) = C11[i][j];
for (col = n, j = 0; col < m; col++, j++)
*((Z+row*m)+col) = C12[i][j];
}
/* write C21 and C22 into the bottom half of Z */
for (row = n, i = 0; row < m; row++, i++)
{
for (col = 0, j = 0; col < n; col++, j++)
*((Z+row*m)+col) = C21[i][j];
for (col = n, j = 0; col < m; col++, j++)
*((Z+row*m)+col) = C22[i][j];
}
}
/*
 * Plain triple-loop matrix product: C = A * B.
 * A, B and C each hold n * n doubles laid out row by row.
 */
void matMul(double *A, double *B, double *C, int n)
{
    for (int r = 0; r < n; r++) {
        for (int c = 0; c < n; c++) {
            /* dot product of row r of A with column c of B */
            double acc = 0.0;
            for (int t = 0; t < n; t++)
                acc += A[r * n + t] * B[t * n + c];
            C[r * n + c] = acc;
        }
    }
}
/*
 * Element-wise sum C = A + B of two m x m matrices stored row by row.
 */
void matAdd(double *A, double *B, double *C, int m)
{
    for (int r = 0; r < m; r++) {
        int base = r * m;   /* offset of the first element of row r */
        for (int c = 0; c < m; c++)
            C[base + c] = A[base + c] + B[base + c];
    }
}
/*
 * Element-wise difference C = A - B of two m x m matrices stored row by row.
 */
void matSub(double *A, double *B, double *C, int m)
{
    for (int r = 0; r < m; r++) {
        int base = r * m;   /* offset of the first element of row r */
        for (int c = 0; c < m; c++)
            C[base + c] = A[base + c] - B[base + c];
    }
}
Added later: If I use malloc for the memory allocation instead, the code is as follows. The problem is that it stops after the naive matrix multiplication and does not even proceed to Strassen's method for N=1; a prompt to close the program is shown.
/* Benchmark loop variant that builds each matrix as a table of N separately allocated rows. */
for (count = 0; count < total; count++)
{
N = pow(2, count);
printf("Matrix size = %2d\t",N);
//double X[N][N], Y[N][N], Z[N][N], W[N][N];
double **X, **Y, **Z, **W;
/* allocate the four tables of row pointers */
X = malloc(N * sizeof(double*));
if (X == NULL){
perror("Failed malloc() in X");
return 1;
}
Y = malloc(N * sizeof(double*));
if (Y == NULL){
perror("Failed malloc() in Y");
return 1;
}
Z = malloc(N * sizeof(double*));
if (Z == NULL){
perror("Failed malloc() in Z");
return 1;
}
W = malloc(N * sizeof(double*));
if (W == NULL){
perror("Failed malloc() in W");
return 1;
}
/* allocate each row.
   NOTE(review): the rows hold doubles but are sized with sizeof(double*);
   the early returns also leave the earlier allocations unfreed. */
for (j = 0; j < N; j++)
{
X[j] = malloc(N * sizeof(double*));
if (X[j] == NULL){
perror("Failed malloc() in X[j]");
return 1;
}
Y[j] = malloc(N * sizeof(double*));
if (Y[j] == NULL){
perror("Failed malloc() in Y[j]");
return 1;
}
Z[j] = malloc(N * sizeof(double*));
if (Z[j] == NULL){
perror("Failed malloc() in Z[j]");
return 1;
}
W[j] = malloc(N * sizeof(double*));
if (W[j] == NULL){
perror("Failed malloc() in W[j]");
return 1;
}
}
srand(time(NULL));
for (i = 0; i < N; i++)
{
for (j = 0; j < N; j++)
{
X[i][j] = rand()/(RAND_MAX + 1.);
Y[i][j] = rand()/(RAND_MAX + 1.);
}
}
start = clock();
/* NOTE(review): X, Y and W are tables of row pointers, but matMul() indexes its arguments
   as one contiguous block of N*N doubles, so the cast makes it read and write the pointer
   tables (and beyond) rather than the row data. The same applies to strassenMul() below. */
matMul((double *)X, (double *)Y, (double *)W, N);
end = clock();
elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
tnaive[count] = elapsed;
printf("naive = %5.4f\t\t",tnaive[count]);
start = clock();
strassenMul((double *)X, (double *)Y, (double *)Z, N);
end = clock();
elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
tstrassen[count] = elapsed;
/* release the rows first, then the row-pointer tables */
for (j = 0; j < N; j++)
{
free(X[j]);
free(Y[j]);
free(Z[j]);
free(W[j]);
}
free(X); free(Y); free(Z); free(W);
printf("strassen = %5.4f\n",tstrassen[count]);
}

I have re-written the answer. My previous answer which allocated memory row by row won't work, because OP has cast the 2-D arrays to 1-D arrays when passed to the functions. Here is my re-write of the code with some simplifications, such as keeping all the matrix arrays 1-dimensional.
I am unsure exactly what Strassen's method does, although the recursion halves the matrix dimensions. So I do wonder if the intention was to use row*2 and col*2 when accessing the arrays passed.
I hope the techniques are useful to you, and I even hope that it works! All the matrix arrays are now on the heap.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<time.h>
#define total 4 //15
void strassenMul(double* X, double* Y, double* Z, int m);
void matMul(double* A, double* B, double* C, int n);
void matAdd(double* A, double* B, double* C, int m);
void matSub(double* A, double* B, double* C, int m);
/* Slot indices into strassenMul()'s arr[] table, one per temporary n x n matrix
   (input quadrants, products P*, result quadrants C*, scratch sums S*).
   The final enumerator, arrs, is the number of slots. */
enum array { x11, x12, x21, x22, y11, y12, y21, y22,
P1, P2, P3, P4, P5, P6, P7, C11, C12, C21, C22,
S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, arrs };
int idx = 0;
/*
 * Benchmark driver.
 *
 * 1. For each size 2^0 .. 2^(total-1), fills two random matrices on the heap
 *    and times matMul() against strassenMul() (idx is still 0 here, so
 *    Strassen recurses all the way down to 1 x 1).
 * 2. Derives the crossover: the largest benchmarked size up to which the
 *    naive product was never slower. That SIZE (not its exponent) is stored
 *    in the global idx, because strassenMul() compares idx with the matrix
 *    dimension.
 * 3. Reads a power-of-two size from stdin, multiplies two random matrices of
 *    that size with strassenMul() and prints inputs and result.
 *
 * Returns 0 on success, 1 on allocation failure or invalid input.
 */
int main()
{
    int N;
    int count = 0;
    int i, j;
    int crossover = 0;          /* index into the timing tables */
    size_t cells, k;            /* element count of an N x N matrix */
    clock_t start, end;
    double elapsed;
    double tnaive[total];
    double tstrassen[total];
    double *X, *Y, *Z, *W, *A, *B, *C;

    printf("-------------------------------------------------------------------------\n\n");
    for (count = 0; count < total; count++)
    {
        N = 1 << count;         /* exact power of two, no floating-point rounding */
        cells = (size_t)N * N;  /* computed in size_t so large N cannot overflow int */
        printf("Matrix size = %2d\t",N);
        X = malloc(cells * sizeof *X);
        Y = malloc(cells * sizeof *Y);
        Z = malloc(cells * sizeof *Z);
        W = malloc(cells * sizeof *W);
        if (X==NULL || Y==NULL || Z==NULL || W==NULL) {
            printf("Out of memory (1)\n");
            free(W);
            free(Z);
            free(Y);
            free(X);
            return 1;
        }
        srand((unsigned)time(NULL));
        for (k = 0; k < cells; k++)
        {
            X[k] = rand()/(RAND_MAX + 1.);
            Y[k] = rand()/(RAND_MAX + 1.);
        }
        start = clock();
        matMul(X, Y, W, N);
        end = clock();
        elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
        tnaive[count] = elapsed;
        printf("naive = %5.4f\t\t",tnaive[count]);
        start = clock();
        strassenMul(X, Y, Z, N);
        end = clock();          /* stop the clock before freeing so free() is not timed */
        free(W);
        free(Z);
        free(Y);
        free(X);
        elapsed = ((double) (end - start))*100/ CLOCKS_PER_SEC;
        tstrassen[count] = elapsed;
        printf("strassen = %5.4f\n",tstrassen[count]);
    }
    printf("-------------------------------------------------------------------\n\n\n");
    /* Bounds check comes first so the timing tables are never read past their end. */
    while (crossover + 1 < total && tnaive[crossover+1] <= tstrassen[crossover+1])
        crossover++;
    idx = 1 << crossover;       /* convert the table index into a matrix dimension */
    printf("Optimum input size to switch from normal multiplication to Strassen's is above %d\n\n", idx);
    printf("Please enter the size of array as a power of 2\n");
    /* strassenMul() halves the dimension at every level, so reject anything
       that is not a positive power of two. */
    if (scanf("%d",&N) != 1 || N < 1 || (N & (N - 1)) != 0) {
        printf("Invalid size: expected a positive power of 2\n");
        return 1;
    }
    cells = (size_t)N * N;
    A = malloc(cells * sizeof *A);
    B = malloc(cells * sizeof *B);
    C = malloc(cells * sizeof *C);
    if (A==NULL || B==NULL || C==NULL) {
        printf("Out of memory (2)\n");
        free(C);
        free(B);
        free(A);
        return 1;
    }
    srand((unsigned)time(NULL));
    for (k = 0; k < cells; k++)
    {
        A[k] = rand()/(RAND_MAX + 1.);
        B[k] = rand()/(RAND_MAX + 1.);
    }
    printf("------------------- Input Matrices A and B ---------------------------\n\n");
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
            printf("%5.4f ",A[(size_t)i*N+j]);
        printf("\n");
    }
    printf("\n");
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
            printf("%5.4f ",B[(size_t)i*N+j]);
        printf("\n");
    }
    printf("\n------- Output matrix by Strassen's method after optimization -----------\n\n");
    strassenMul(A, B, C, N);
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
            printf("%5.4f ",C[(size_t)i*N+j]);
        printf("\n");
    }
    free(C);
    free(B);
    free(A);
    return(0);
}
/*
 * Recursive Strassen product Z = X * Y.
 *
 * X, Y, Z: m x m matrices stored as flat row-major arrays of m * m doubles.
 * m:       matrix dimension.
 *
 * Dimensions at or below the global idx are delegated to matMul(); a 1 x 1
 * product is computed directly. An odd dimension cannot be split into four
 * equal quadrants (the n x n scratch buffers would be overrun), so it is
 * delegated to matMul() as well. Otherwise the inputs are split into four
 * n x n quadrants (n = m/2), the seven products P1..P7 are computed
 * recursively, and the result quadrants are assembled into Z.
 *
 * All scratch matrices live on the heap and are released before returning.
 * On allocation failure the process is terminated with exit(1).
 */
void strassenMul(double *X, double *Y, double *Z, int m)
{
    int row = 0, col = 0;
    int n = m/2;
    int i = 0, j = 0;
    double *arr[arrs];              // each matrix mem ptr
    if (m <= idx)
    {
        matMul(X, Y, Z, m);
        return;
    }
    if (m == 1)
    {
        *Z = *X * *Y;
        return;
    }
    if (m % 2 != 0)
    {
        // odd size: quadrants would be unequal, use the plain product
        matMul(X, Y, Z, m);
        return;
    }
    for (i=0; i<arrs; i++) {        // memory for arrays
        arr[i] = malloc((size_t)n * n * sizeof *arr[i]);
        if (arr[i] == NULL) {
            printf("Out of memory (1)\n");
            exit (1);               // brutal
        }
    }
    // top half of X and Y -> quadrants 11 (left) and 12 (right)
    for (row = 0, i = 0; row < n; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
        {
            arr[x11][i*n+j] = X[row*m+col];
            arr[y11][i*n+j] = Y[row*m+col];
        }
        for (col = n, j = 0; col < m; col++, j++)
        {
            arr[x12][i*n+j] = X[row*m+col];
            arr[y12][i*n+j] = Y[row*m+col];
        }
    }
    // bottom half of X and Y -> quadrants 21 (left) and 22 (right)
    for (row = n, i = 0; row < m; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
        {
            arr[x21][i*n+j] = X[row*m+col];
            arr[y21][i*n+j] = Y[row*m+col];
        }
        for (col = n, j = 0; col < m; col++, j++)
        {
            arr[x22][i*n+j] = X[row*m+col];
            arr[y22][i*n+j] = Y[row*m+col];
        }
    }
    // Calculating P1 = (x11 + x22) * (y11 + y22)
    matAdd(arr[x11], arr[x22], arr[S1], n);
    matAdd(arr[y11], arr[y22], arr[S2], n);
    strassenMul(arr[S1], arr[S2], arr[P1], n);
    // Calculating P2 = (x21 + x22) * y11
    matAdd(arr[x21], arr[x22], arr[S3], n);
    strassenMul(arr[S3], arr[y11], arr[P2], n);
    // Calculating P3 = x11 * (y12 - y22)
    matSub(arr[y12], arr[y22], arr[S4], n);
    strassenMul(arr[x11], arr[S4], arr[P3], n);
    // Calculating P4 = x22 * (y21 - y11)
    matSub(arr[y21], arr[y11], arr[S5], n);
    strassenMul(arr[x22], arr[S5], arr[P4], n);
    // Calculating P5 = (x11 + x12) * y22
    matAdd(arr[x11], arr[x12], arr[S6], n);
    strassenMul(arr[S6], arr[y22], arr[P5], n);
    // Calculating P6 = (x21 - x11) * (y11 + y12)
    matSub(arr[x21], arr[x11], arr[S7], n);
    matAdd(arr[y11], arr[y12], arr[S8], n);
    strassenMul(arr[S7], arr[S8], arr[P6], n);
    // Calculating P7 = (x12 - x22) * (y21 + y22)
    matSub(arr[x12], arr[x22], arr[S9], n);
    matAdd(arr[y21], arr[y22], arr[S10], n);
    strassenMul(arr[S9], arr[S10], arr[P7], n);
    // Calculating C11 = P1 + P4 - P5 + P7
    matAdd(arr[P1], arr[P4], arr[S11], n);
    matSub(arr[S11], arr[P5], arr[S12], n);
    matAdd(arr[S12], arr[P7], arr[C11], n);
    // Calculating C12 = P3 + P5
    matAdd(arr[P3], arr[P5], arr[C12], n);
    // Calculating C21 = P2 + P4
    matAdd(arr[P2], arr[P4], arr[C21], n);
    // Calculating C22 = P1 + P3 - P2 + P6
    matAdd(arr[P1], arr[P3], arr[S13], n);
    matSub(arr[S13], arr[P2], arr[S14], n);
    matAdd(arr[S14], arr[P6], arr[C22], n);
    // C11 and C12 -> top half of Z
    for (row = 0, i = 0; row < n; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
            Z[row*m+col] = arr[C11][i*n+j];
        for (col = n, j = 0; col < m; col++, j++)
            Z[row*m+col] = arr[C12][i*n+j];
    }
    // C21 and C22 -> bottom half of Z
    for (row = n, i = 0; row < m; row++, i++)
    {
        for (col = 0, j = 0; col < n; col++, j++)
            Z[row*m+col] = arr[C21][i*n+j];
        for (col = n, j = 0; col < m; col++, j++)
            Z[row*m+col] = arr[C22][i*n+j];
    }
    for (i=0; i<arrs; i++)
        free (arr[i]);
}
/*
 * Straightforward matrix product C = A * B for n x n matrices held in flat
 * row-major arrays. Each output element is accumulated left to right over
 * the shared dimension.
 */
void matMul(double *A, double *B, double *C, int n)
{
    int r, c, t;
    for (r = 0; r < n; ++r) {
        double *lhs_row = A + r * n;    /* row r of A */
        double *out_row = C + r * n;    /* row r of C */
        for (c = 0; c < n; ++c) {
            double dot = 0.0;
            for (t = 0; t < n; ++t)
                dot += lhs_row[t] * B[t * n + c];
            out_row[c] = dot;
        }
    }
}
/*
 * C = A + B, element by element, for m x m matrices held in flat row-major arrays.
 */
void matAdd(double *A, double *B, double *C, int m)
{
    int r, c;
    for (r = 0; r < m; ++r) {
        double *lhs = A + r * m;
        double *rhs = B + r * m;
        double *out = C + r * m;
        for (c = 0; c < m; ++c)
            out[c] = lhs[c] + rhs[c];
    }
}
/*
 * C = A - B, element by element, for m x m matrices held in flat row-major arrays.
 */
void matSub(double *A, double *B, double *C, int m)
{
    int r, c;
    for (r = 0; r < m; ++r) {
        double *lhs = A + r * m;
        double *rhs = B + r * m;
        double *out = C + r * m;
        for (c = 0; c < m; ++c)
            out[c] = lhs[c] - rhs[c];
    }
}