Given a n x n matrix of ints, I have an algorithm that at each step of a for loop of range n traverses and modifies the matrix. Here is the code:
typedef int **Matrix;
void floyd_slow(Matrix dist, int n)
{
int d;
for (int k=0; k<n; k++)
{
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
if ((d=dist[k][j]+dist[i][k])<dist[i][j])
dist[i][j]=d;
}
}
for (int i=0; i<n; i++)
dist[i][i]=0;
}
The matrix is built as an array of n*n ints and for each index line i, dist[i] is the address of the row of index i [the above code is the standard way to write the Floyd-Warshall algorithm but my question is not about this algorithm by itself].
The following drawing tries to explain how the matrix is processed:
At each step of the loop of index k, the underlying matrix is traversed line by line.
Now, consider the following transformation of the previous code:
void relax(Matrix dist, int n, int* rowk, int* colk)
{
int d;
for (int i=0; i<n; i++)
for (int j=0; j<n; j++)
if ((d=rowk[j]+colk[i])<dist[i][j])
dist[i][j]=d;
}
void floyd_fast(Matrix dist, int n)
{
int i, k;
int* colk=malloc(n*sizeof(int));
if (!colk)
exit(EXIT_FAILURE);
for (k=0; k<n; k++)
{
int* rowk =dist[k];
for (i=0; i<n; i++)
colk[i]=dist[i][k];
relax(dist, n, rowk, colk);
}
free(colk);
for (i=0; i<n; i++)
dist[i][i]=0;
}
At every step, the elements of the matrix are accessed in the same order as in the previous algorithm.
The only difference is that at each step k in the exterior loop, the column of index k is copied into a temporary array, cf. the colk malloc above. It results that the element at position (i, k) is read from this array instead of being accessed directly from the matrix.
This innocuous change leads in fact to a significant speedup: you gain a factor 4 if n=1000.
I know that in C, it's faster to traverse an array in row major order but this is always the case here. So i was wondering why there is a speedup so important. Is it related to cache optimisation?
Complete code
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
typedef int **Matrix;
void floyd_slow(Matrix dist, int n)
{
int d;
for (int k=0; k<n; k++)
{
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
if ((d=dist[k][j]+dist[i][k])<dist[i][j])
dist[i][j]=d;
}
}
for (int i=0; i<n; i++)
dist[i][i]=0;
}
void relax(Matrix dist, int n, int* rowk, int* colk)
{
int d;
for (int i=0; i<n; i++)
for (int j=0; j<n; j++)
if ((d=rowk[j]+colk[i])<dist[i][j])
dist[i][j]=d;
}
void floyd_fast(Matrix dist, int n)
{
int i, k;
int* colk=malloc(n*sizeof(int));
if (!colk)
exit(EXIT_FAILURE);
for (k=0; k<n; k++)
{
int* rowk =dist[k];
for (i=0; i<n; i++)
colk[i]=dist[i][k];
relax(dist, n, rowk, colk);
}
free(colk);
for (i=0; i<n; i++)
dist[i][i]=0;
}
void print(Matrix dist, int n)
{
int i, j;
for (i=0; i<n; i++)
{
for (j=0; j<n; j++)
printf("%d ", dist[i][j]);
printf("\n");
}
}
void test_slow(Matrix dist, int n)
{
clock_t now=clock();
floyd_slow(dist, n);
// print(dist, n);
int *p=dist[0];
free(dist);
free(p);
fprintf(stderr, "Elapsed slow: %.2f s\n",
(double) (clock() - now) / CLOCKS_PER_SEC);
}
void test_fast(Matrix dist, int n)
{
clock_t now=clock();
floyd_fast(dist, n);
// print(dist, n);
int *p=dist[0];
free(dist);
free(p);
fprintf(stderr, "Elapsed fast: %.2f s\n",
(double) (clock() - now) / CLOCKS_PER_SEC);
}
int * data(int n)
{
int N=n*n;
int *t=malloc(N*sizeof(int));
if (!t)
exit(EXIT_FAILURE);
srand(time(NULL));
for (int i=0; i<N;i++)
t[i]=(1+rand())%10;
return t;
}
Matrix getMatrix(int *t, int n)
{
int N=n*n;
int *tt=malloc(N*sizeof(int));
Matrix mat=malloc(n*sizeof(int*));
if (!tt || !mat)
exit(EXIT_FAILURE);
memcpy(tt, t, N*sizeof(int));
for (int i=0; i<n;i++)
mat[i]=&tt[i*n];
return mat;
}
int main(void)
{
int n=1000;
int *t=data(n);
Matrix mat_slow=getMatrix(data(n), n);
Matrix mat_fast=getMatrix(data(n), n);
test_slow(mat_slow, n);
test_fast(mat_fast, n);
return 0;
}
Output:
Elapsed slow: 0.58 s
Elapsed fast: 0.14 s
Compilation options:
rm floyd
gcc -Wall -O3 -march=native -ffast-math -Wno-unused-result -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unused-parameter floyd.c -o floyd -lm
Related
I am trying to create a program to find the transpose of a matrix my dynamic memory allocation. However, while entering the elements of the matrix I can't input more than one element, its only taking the a[0][0] as the input and the program is ending after that.
#include <stdio.h>
#include <stdlib.h>
void createMatrix(int **, int, int);
void inputElements(int **, int, int);
void transpose(int **, int **, int, int);
void display(int **, int, int);
void main()
{
int **matrix, **trans, rows, cols;
printf("\nEnter number of rows in the matrix: ");
scanf("%d", &rows);
printf("\nEnter number of columns in the matrix: ");
scanf("%d", &cols);
createMatrix(matrix, rows, cols);
createMatrix(trans, cols, rows);
inputElements(matrix, rows, cols);
transpose(matrix, trans, rows, cols);
printf("\nMATRIX:\n");
display(matrix, rows, cols);
printf("\nTRANSPOSE OF THE MATRIX:\n");
display(trans, rows, cols);
}
void createMatrix(int **a, int r, int c) //for allocating memory for the matrix
{
int i, j;
a = (int **)malloc(sizeof(int *) * r);
for(i = 0; i < r; i++)
a[i] = (int *)malloc(sizeof(int) * c);
}
void inputElements(int **a, int r, int c) //for entering matrix elements
{
int i, j, t;
for(i = 0; i < r; i++)
{
for(j = 0; j < c; j++)
{
printf("\nEnter matrix element[%d][%d]: ", i + 1, j + 1);
fflush(stdin);
getchar();
scanf("%d", &(a[i][j]));
}
}
}
void transpose(int **a, int **t, int r, int c) //for finding out the transpose of the matrix
{
int i, j;
for (i = 0; i < c; i++)
{
for (j = 0; j < r; j++)
t[i][j] = a[j][i];
}
}
void display(int **a, int r, int c) //for displaying the matrix
{
int i, j;
for (i = 0; i < r; i++)
{
printf("\n");
for (j = 0; j < c; j++)
printf("\t%d", a[i][j]);
}
}
There are many posts about scanf in loops I've seen, but I wasn't able to connect my issue with any of them.
I am trying to type in a matrix and then print it, using functions.
The program crashes when it gets to the read_mat function. What am I getting wrong?
#include <stdio.h>
#include <stdlib.h>
int** insert_mat(int **mat, int r, int c);
int** read_mat(int** mat, int r, int c)
{
mat = (int**)calloc(r,sizeof(int*));
for (int i=0; i<r; i++)
{
mat[i]=(int*)calloc(c,sizeof(int));
}
insert_mat(mat, r, c);
return mat;
}
int** insert_mat(int **mat, int r, int c)
{
for (int i=0; i<r; i++)
{
for(int j=0; j<c; j++)
{
printf("\nmat[%d][%d] = ",i , j);
scanf("%d", &mat[i][j]);
}
}
return mat;
}
void print_mat(int **mat, int r, int c)
{
printf("\n");
for (int i=0; i<r; i++)
{
for(int j=0; j<c; j++)
{
printf("%d ",mat[i][j]);
}
printf("\n");
}
}
int main()
{
int **mat;
int r1 =2;
int c1 = 2;
read_mat(mat, r1, c1);
print_mat(mat, r1, c1);
return 0;
}
I also tried printing elements of mat in main (ex: printf("%d", mat[0][0]);) and it is also not working. So maybe the error is when I insert the elements? But I am not sure what I do wrong.
Here is the section of my code where I am running into a race condition. I'm just trying to copy the values of the matrix "matxOriginal" into the matrix "cluster0", but, when run on multiple threads using openMP, the sample values printed for "cluster0" is different from "matxOriginal".
Both matrix has been dynamically allocated and are both 698 x 9 matrix.
Also, I would like to keep the use and purpose of the "cluster0IndexCounter" variable for a different use outside of what I'm posting. So if you can, please let me know how to make this work.
double **matxGen(int row, int col)
{
int i=0, j=0;
double **m;
m=(double **) malloc(row*col*sizeof(double *));
for (i; i<row; i++)
{
m[i]=(double *) malloc(col*sizeof(double ));
for (j=0; j<col; j++)
{
m[i][j]=j+i;
}
}
return m;
}
double **emptyMatxGen(int row, int col)
{
int i=0, j=0;
double **m;
m=(double **) malloc(row*col*sizeof(double *));
for (i; i<row; i++)
{
m[i]=(double *) malloc(col*sizeof(double ));
for (j=0; j<col; j++)
{
m[i][j]=0.0;
}
}
return m;
}
int main()
{
int x, i, j, tid, row=699, col=9,
cluster0IndexCounter=0;
double **matxOriginal, **matx, **cluster0;
matxOriginal=matxGen(row, col);
matx=matxGen(row, col);
double *centerPoint0=matx[99];
cluster0=emptyMatxGen(row, col);
#pragma omp parallel for private(x, j, tid) schedule(static) reduction(+:cluster0IndexCounter)
for (x=0; x<=698; x++)
{
for (j=0; j<9; j++)
{
cluster0[cluster0IndexCounter][j]=matxOriginal[x][j];
}
cluster0IndexCounter=cluster0IndexCounter+1;
}
printf("cluster0: %f, %f, %f, %f, %f\n", cluster0[9][0], cluster0[9][1], cluster0[9][2], cluster0[9][3], cluster0[9][4]);
free(cluster0);
free(matxOriginal);
free(matx);
return 0;
}
I have the code of matrices multiplication with using openmp:
#include <stdio.h>
#include <omp.h>
#include <math.h>
#define N 1000
int main()
{
long int i, j, k;
//long int N = atoi(argv[1]);
double t1, t2;
double a[N][N],b[N][N],c[N][N];
for (i=0; i<N; i++)
for (j=0; j<N; j++)
a[i][j]=b[i][j]=log(i*j/(i*j+1.)+1) +exp(-(i+j)*(i+j+1.));
t1=omp_get_wtime();
#pragma omp parallel for shared(a, b, c) private(i, j, k)
for(i=0; i<N; i++){
for(j=0; j<N; j++){
c[i][j] = 0.0;
for(k=0; k<N; k++) c[i][j]+=a[i][k]*b[k][j];
}
}
t2=omp_get_wtime();
printf("Time=%lf\n", t2-t1);
}
Now I want to set the number of threads which I want through command line. I do that by using
atoi(argv[])
Namely
#include <stdio.h>
#include <omp.h>
#include <math.h>
#define N 1000
int main(int argc, char** argv[])
{
long int i, j, k;
//long int N = atoi(argv[1]);
double t1, t2;
double a[N][N],b[N][N],c[N][N];
for (i=0; i<N; i++)
for (j=0; j<N; j++)
a[i][j]=b[i][j]=log(i*j/(i*j+1.)+1) +exp(-(i+j)*(i+j+1.));
int t = atoi(argv[1]);
t1=omp_get_wtime();
#pragma omp parallel for shared(a, b, c) private(i, j, k) num_threads(t)
for(i=0; i<N; i++){
for(j=0; j<N; j++){
c[i][j] = 0.0;
for(k=0; k<N; k++) c[i][j]+=a[i][k]*b[k][j];
}
}
t2=omp_get_wtime();
printf("Time=%lf\n", t2-t1);
}
Everything is fine, except one crucial thing: when I try to compute the product of matrices with dimension more than (more or less) 500, I get the mistake: "segmentation fault". Could someone clarify the reason for this mistake?
I don't know anything about openmp, but you are most assuredly blowing up your stack. Default stack space will vary from system to system, but with N == 1000, you are trying to put three 2D arrays totaling 3 million doubles on the stack. Assuming a double is 8 bytes, that's 24 million bytes, or just shy of 22.9MB. There can't be many systems allowing that kind of stack space. Instead, I'd recommend trying to grab that amount of memory from the heap. Something like this:
//double a[N][N],b[N][N],c[N][N];
double **a, **b, **c;
a = malloc(sizeof(double*) * N);
b = malloc(sizeof(double*) * N);
c = malloc(sizeof(double*) * N);
for (i=0; i<N; i++)
{
a[i] = malloc(sizeof(double) * N);
b[i] = malloc(sizeof(double) * N);
c[i] = malloc(sizeof(double) * N);
}
// do your calculations
for (i=0; i<N; i++)
{
free(a[i]);
free(b[i]);
free(c[i]);
}
free(a);
free(b);
free(c);
I've verified on my machine at least, that with N == 1000 I crash right out of the gate with EXC_BAD_ACCESS when trying to place those arrays on the stack. When I dynamically allocate the memory instead as shown above, I get no seg faults.
I have an issue passing a matrix to a function in C. There is the function I want to create:
void ins (int *matrix, int row, int column);
but I noticed that in contrast to the vectors, matrix give me an error. How can I pass my matrix to a function so?
EDIT --> there is the code:
// Matrix
#include <stdio.h>
#define SIZE 100
void ins (int *matrix, int row, int column);
void print (int *matrix, int row, int column);
int main ()
{
int mat[SIZE][SIZE];
int row, col;
printf("Input rows: ");
scanf ("%d", &row);
printf("Input columns: ");
scanf ("%d", &col);
printf ("Input data: \n");
ins(mat, row, col);
printf ("You entered: ");
print(mat, row, col);
return 0;
}
void ins (int *matrix, int row, int column);
{
int i, j;
for (i = 0; i < row; i++)
{
for (j = 0; j < column; j++)
{
printf ("Row %d column %d: ", i+1, j+1);
scanf ("%d", &matrix[i][j]);
}
}
}
void print (int *matrix, int row, int column)
{
int i;
int j;
for(i=0; i<row; i++)
{
for(j=0; j<column; j++)
{
printf("%d ", matrix[i][j]);
}
printf("\n");
}
}
You need to pass a pointer with as much levels of indirection (*) as the number of dimensions of your matrix.
For example, if your matrix is 2D (e.g. 10 by 100), then:
void ins (int **matrix, int row, int column);
If you have a fixed dimension (e.g. 100), you can also do:
void ins (int (*matrix)[100], int row, int column);
or in your case:
void ins (int (*matrix)[SIZE], int row, int column);
If both your dimensions are fixed:
void ins (int matrix[10][100], int row, int column);
or in your case:
void ins (int matrix[SIZE][SIZE], int row, int column);
If you have a modern C compiler you can do the following for 2D matrices of any sizes
void ins (size_t rows, size_t columns, int matrix[rows][columns]);
Important is that the sizes come before the matrix, such that they are known, there.
Inside your function you then can access the elements easily as matrix[i][j] and the compiler is doing all the index calculations for you.
it would also possible to leave the first dimension empty, the same as (*matrix):
void ins (int matrix[][100], int row, int column);
Much better way to use malloc function and create dynamically allocated array and do whatever you want to do using 2d array:
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
void fun(int **arr,int m,int n)
{
int i, j;
for (i = 0; i < m; i++)
{
for (j = 0; j < n; j++)
{
scanf("%d", &arr[i][j]);
}
}
}
int main()
{
int i,j,m,n;
printf("enter order of matrix(m*n)");
scanf("%d*%d",&m,&n);
int **a=(int **)malloc(m*sizeof(int));
for(i=0;i<n;i++)
a[i]=(int *)malloc(n*sizeof(int));
fun(a,m,n);
for (i = 0; i < m; i++)
{
for (j = 0; j < n; j++)
{
printf("%d ", a[i][j]);
}
printf("\n");
}
return 0;
}
output:
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
void fun(int **arr,int m,int n)
{
int i, j;
for (i = 0; i < m; i++)
{
for (j = 0; j < n; j++)
{
scanf("%d", &arr[i][j]);
}
}
}
int main()
{
int i,j,m,n;
printf("enter order of matrix(m*n)");
scanf("%d*%d",&m,&n);
int **a=(int **)malloc(m*sizeof(int));
for(i=0;i<n;i++)
a[i]=(int *)malloc(n*sizeof(int));
fun(a,m,n);
for (i = 0; i < m; i++)
{
for (j = 0; j < n; j++)
{
printf("%d ", a[i][j]);
}
printf("\n");
}
return 0;
}
You can try this as well:
void inputmat(int r,int c,int arr[r * sizeof(int)][c * sizeof(int)])