Timing sum by column different arrays sizes and cache behaviour - c

I am using 2D array and 1D array to calculate the sum by row and column for array a and array s. The purpose is to calculate the sum of the 2D array a in the array s. N is the size of the array my question is why do we get significant change when we N=512 and N=1024 as shown in excel
/* sumcol_bycol: Basic implementation of column sum operation.
* Computes sums of the columns of array a into array s.
* s[c] = SUM(r=0..N-1) a[r][c]
*/
static void FN_ALIGN sumcol_bycol(int a[N][N], int s[N]) {
int r, c, sum;
// Iterate over all columns
for (c = 0; c < N; c++) {
// Compute the sum of data in column c
sum = 0;
for (r = 0; r < N; r++) {
sum += a[r][c];
}
// Return the sum in element c of array s
s[c] = sum;
}
}
/* Computing down columns with loop unrolling */
static void FN_ALIGN sumcol_bycol_u4(int a[N][N], int s[N]) {
int r, c, sum;
for (c = 0; c < N; c++) {
sum = 0;
for (r = 0; r < N-3; r+=4) {
sum += a[r][c];
sum += a[r+1][c];
sum += a[r+2][c];
sum += a[r+3][c];
}
/* The additional cases if unrolling factor does not divide N evenly */
# if N%4 >= 1
sum += a[r][c];
# endif
# if N%4 >= 2
sum += a[r+1][c];
# endif
# if N%4 >= 3
sum += a[r+2][c];
# endif
s[c] = sum;
}
}
/* Unrolling to a factor of 8 */
static void FN_ALIGN sumcol_bycol_u8(int a[N][N], int s[N]) {
int r, c, sum;
for (c = 0; c < N; c++) {
sum = 0;
for (r = 0; r < N-7; r+=8) {
sum += a[r][c];
sum += a[r+1][c];
sum += a[r+2][c];
sum += a[r+3][c];
sum += a[r+4][c];
sum += a[r+5][c];
sum += a[r+6][c];
sum += a[r+7][c];
}
# if N%8 >= 1
sum += a[r][c];
# endif
# if N%8 >= 2
sum += a[r+1][c];
# endif
# if N%8 >= 3
sum += a[r+2][c];
# endif
# if N%8 >= 4
sum += a[r+3][c];
# endif
# if N%8 >= 5
sum += a[r+4][c];
# endif
# if N%8 >= 6
sum += a[r+5][c];
# endif
# if N%8 >= 7
sum += a[r+6][c];
# endif
s[c] = sum;
}
}
/* Grouping: Compute two columns together */
static void FN_ALIGN sumcol_bycol_g2(int a[N][N], int s[N]) {
int r, c, sum0, sum1;
for (c = 0; c < N-1; c+=2) {
sum0 = sum1 = 0;
for (r = 0; r < N; r++) {
sum0 += a[r][c];
sum1 += a[r][c+1];
}
s[c] = sum0;
s[c+1] = sum1;
}
/* If there is an odd number of columns */
# if N%2 == 1
sum0 = 0;
for (r = 0; r < N; r++) {
sum0 += a[r][c];
}
s[c] = sum0;
# endif
}
/* Grouping 3 columns at once */
static void FN_ALIGN sumcol_bycol_g3(int a[N][N], int s[N]) {
int r, c, sum0, sum1, sum2;
for (c = 0; c < N-2; c+=3) {
sum0 = sum1 = sum2 = 0;
for (r = 0; r < N; r++) {
sum0 += a[r][c];
sum1 += a[r][c+1];
sum2 += a[r][c+2];
}
s[c] = sum0;
s[c+1] = sum1;
s[c+2] = sum2;
}
# if N%3 != 0
for ( ; c < N; c++) {
sum0 = 0;
for (r = 0; r < N; r++) {
sum0 += a[r][c];
}
s[c] = sum0;
}
# endif
}
/* Grouping 4 columns at once */
static void FN_ALIGN sumcol_bycol_g4(int a[N][N], int s[N]) {
int r, c, sum0, sum1, sum2, sum3;
for (c = 0; c < N-3; c+=4) {
sum0 = sum1 = sum2 = sum3 = 0;
for (r = 0; r < N; r++) {
sum0 += a[r][c];
sum1 += a[r][c+1];
sum2 += a[r][c+2];
sum3 += a[r][c+3];
}
s[c] = sum0;
s[c+1] = sum1;
s[c+2] = sum2;
s[c+3] = sum3;
}
# if N%4 != 0
for ( ; c < N; c++) {
sum0 = 0;
for (r = 0; r < N; r++) {
sum0 += a[r][c];
}
s[c] = sum0;
}
# endif
}
/* Grouping 5 columns at once */
static void FN_ALIGN sumcol_bycol_g5(int a[N][N], int s[N]) {
int r, c, sum0, sum1, sum2, sum3, sum4;
for (c = 0; c < N-4; c+=5) {
sum0 = sum1 = sum2 = sum3 = sum4 = 0;
for (r = 0; r < N; r++) {
sum0 += a[r][c];
sum1 += a[r][c+1];
sum2 += a[r][c+2];
sum3 += a[r][c+3];
sum4 += a[r][c+4];
}
s[c] = sum0;
s[c+1] = sum1;
s[c+2] = sum2;
s[c+3] = sum3;
s[c+4] = sum4;
}
# if N%5 != 0
for ( ; c < N; c++) {
sum0 = 0;
for (r = 0; r < N; r++) {
sum0 += a[r][c];
}
s[c] = sum0;
}
# endif
}
/* sumcol_byrow: Column sums computed using row-wise array access.
*/
static void FN_ALIGN sumcol_byrow(int a[N][N], int s[N]) {
int r, c, sum;
// Initialise all sums to zero.
for (c = 0; c < N; c++)
s[c] = 0;
// Iterate over all array elements, adding
// each one onto the appropriate sum.
for (r = 0; r < N; r++) {
for (c = 0; c < N; c++) {
s[c] += a[r][c];
}
}
}
/* Row-wise array access with loop unrolling */
static void FN_ALIGN sumcol_byrow_u4(int a[N][N], int s[N]) {
int r, c, sum;
for (c = 0; c < N; c++)
s[c] = 0;
for (r = 0; r < N; r++) {
for (c = 0; c < N-3; c+=4) {
s[c] += a[r][c];
s[c+1] += a[r][c+1];
s[c+2] += a[r][c+2];
s[c+3] += a[r][c+3];
}
# if N%4 >= 1
s[c] += a[r][c];
# endif
# if N%4 >= 2
s[c+1] += a[r][c+1];
# endif
# if N%4 >= 3
s[c+2] += a[r][c+2];
# endif
}
}
/* Row-wise computation using 2x2 blocks. The main loops are in 2x2 row-wise blocks
* then each block updates two sums */
static void FN_ALIGN sumcol_byrow_b2x2(int a[N][N], int s[N]) {
int r, c, sum;
for (c = 0; c < N; c++)
s[c] = 0;
for (r = 0; r < N-1; r+=2) {
for (c = 0; c < N-1; c+=2) {
s[c] += a[r][c] + a[r+1][c];
s[c+1] += a[r][c+1] + a[r+1][c+1];
}
# if N%2 != 0
s[c] += a[r][c] + a[r+1][c];
# endif
}
# if N%2 != 0
/* Process the remaining row */
for (c = 0; c < N-1; c+=2) {
s[c] += a[r][c];
s[c+1] += a[r][c+1];
}
/* Process the corner element */
s[c] += a[r][c];
# endif
}

Related

How does k work in Matrix Multiplication in C?

How does k works in the code below?
# include <stdio.h>
# define R 2
# define C 2
int main(void)
{
int a[R][C], b[R][C], mul[R][C], i, j, k;
for (i = 0; i < R; ++i)
for (j = 0; j < C; ++j)
{
scanf_s("%d", &a[i][j]);
}
for (i = 0; i < R; ++i)
for (j = 0; j < C; ++j)
{
scanf_s("%d", &b[i][j]);
}
for (i = 0; i < R; ++i)
for (j = 0; j < C; ++j)
{
mul[i][j] = 0;
for (k = 0; k < C; ++k)
{
mul[i][j] += a[i][k] * b[k][j];
}
printf("%d", mul[i][j]);
if (j == 0)
{
printf(" ");
}
else if (i == 0 && j == 1)
{
printf("\n");
}
}
return 0;
}
For matrix addition, I know that in math sum[0][0] = a[0][0] + b[0][0] and in code also be like this.
For matrix multiplication, it is mul[0][0] = a[0][0] x b[0][0] + a[0][0] x b[1][0] in math.
However in code, [ ] is not only come from i and j but also k.
scanf_s("%d", &a[i][j]); and scanf_s("%d", &b[i][j]); has shown that %d are saved in memory a[i][j] and b[i][j].
So, what did k be read in mul[i][j] = a[i][k] * b[k][j]; ?
Your misunderstanding starts here:
For matrix multiplication, it is mul[0][0] = a[0][0] x b[0][0] + a[0][0] x b[1][0] in math.
That's not how matrix multiplication works.
If you look at this code for i == 0 and j == 0 and C == 2
mul[i][j] = 0;
for (k = 0; k < C; ++k)
{
mul[i][j] += a[i][k] * b[k][j];
}
it becomes
mul[0][0] = 0;
for (k = 0; k < 2; ++k)
{
mul[0][0] += a[0][k] * b[k][0];
}
Since k will take the values 0 and 1 it results in
mul[0][0] = a[0][0] * b[0][0] + a[0][1] * b[1][0];
^ ^ ^ ^
k = 0 k = 1
As you can see this is different from what you expected. And it's the correct way to do it for multiplication in case of 2X2 matrix.
Had it been 3X3 matrix, it would be:
mul[0][0] = a[0][0] * b[0][0] + a[0][1] * b[1][0] + a[0][2] * b[2][0];
^ ^ ^ ^ ^ ^
k = 0 k = 1 k = 2
So the the loop using k as index, takes care of multiplying a row from a with a column of b.

Solve A.x = b using LU factorisation give inf values

I'm trying to solve linear systems of the form Ax = b where A is an (nxn) matrix of real numbers and b a (1xn) vector of real numbers, using the A = LU algorithm. This is my implementation:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
int LUPDecompose(double A[N][N], double Tol, int P[N])
{
int i, j, k, imax;
double maxA, ptr[N], absA;
for (i = 0; i <= N; i++)
P[i] = i; //Unit permutation matrix, P[N] initialized with N
for (i = 0; i < N; i++) {
maxA = 0.0;
imax = i;
for (k = i; k < N; k++)
if ((absA = abs(A[k][i])) > maxA) {
maxA = absA;
imax = k;
}
if (maxA < Tol) return 0; //failure, matrix is degenerate
if (imax != i) {
//pivoting P
j = P[i];
P[i] = P[imax];
P[imax] = j;
//pivoting rows of A
for (int ii = 0; ii < N; ii++)
{
ptr[ii] = A[i][ii];
A[i][ii] = A[imax][ii];
A[imax][ii] = ptr[ii];
}
//counting pivots starting from N (for determinant)
P[N]++;
}
for (j = i + 1; j < N; j++) {
A[j][i] /= A[i][i];
for (k = i + 1; k < N; k++)
A[j][k] -= A[j][i] * A[i][k];
}
}
return 1; //decomposition done
}
/* INPUT: A,P filled in LUPDecompose; b - rhs vector; N - dimension
* OUTPUT: x - solution vector of A*x=b
*/
void LUPSolve(double A[N][N], int P[N], double b[N], double x[N])
{
for (int i = 0; i < N; i++) {
x[i] = b[P[i]];
for (int k = 0; k < i; k++)
x[i] -= A[i][k] * x[k];
}
for (int i = N - 1; i >= 0; i--) {
for (int k = i + 1; k < N; k++)
x[i] -= A[i][k] * x[k];
x[i] /= A[i][i];
}
}
int main()
{
double Am[N][N] = {{0.6289, 0, 0.0128, 0.3184, 0.7151},
{0, 1, 0, 0, 0},
{0.0128, 0, 0.0021, 0.0045, 0.0380},
{0.3184, 0, 0.0045, 0.6618, 0.3371},
{0.7151, 0, 0.0380, 0.3371, 1.1381}};
double bm[N] = {1.6752, 0, 0.0574, 1.3217, 2.2283};
int Pm[N] = {0};
double X[N] = {0};
LUPDecompose( Am, 0.0001, Pm);
LUPSolve(Am, Pm, bm, X);
printf("%f %f %f %f %f",X[0],X[1],X[2],X[3],X[4]);
}
However, I am getting inf values as such.
-1.#IND00 -1.#IND00 3.166387 0.849298 0.670689
I wonder if it is a code issue or algorithm. Any help to solve this issue?
"I wonder if it is a code issue or algorithm. Any help to solve this issue?"
I believe there are code and algorithm issues. The following is your code with corrections to address only compile errors, and warnings (see in-line comments). It is not debugged beyond C syntax to achieve a clean compile, and run w/o error. (i.e. runs with no divide by zero, or inf errors.)
#define N 5 //required to be 5 by hard-coded array definitions in main()
int LUPDecompose(double A[N][N], double Tol, int P[N])
{
int i, j, k, imax, ii;//added ii here to increase scope below
double maxA, ptr[N], absA;
//for (i = 0; i <= N; i++)
for (i = 0; i < N; i++)
P[i] = i; //Unit permutation matrix, P[N] initialized with N (actually init with i)
for (i = 0; i < N; i++) {
maxA = 0.0;
imax = i;
for (k = i; k < N; k++)
if ((absA = fabs(A[k][i])) > maxA) {// using fabs, not abs to avoid conversion of double to int.
maxA = absA;
imax = k;
}
if (maxA < Tol) return 0; //failure, matrix is degenerate
if (imax != i) {
//pivoting P
j = P[i];
P[i] = P[imax];
P[imax] = j;
//pivoting rows of A
//for (int ii = 0; ii < N; ii++)
for ( ii = 0; ii < N; ii++)
{
ptr[ii] = A[i][ii];
A[i][ii] = A[imax][ii];
A[imax][ii] = ptr[ii];
}
//counting pivots starting from N (for determinant)
//P[N]++;//N will always overflow for array with only N elements
P[ii-1]++;//use index here instead
}
for (j = i + 1; j < N; j++) {
A[j][i] /= A[i][i];
for (k = i + 1; k < N; k++) {//extra brackets added for readability
A[j][k] -= A[j][i] * A[i][k];
}
}
}
return 1; //decomposition done
}
/* INPUT: A,P filled in LUPDecompose; b - rhs vector; N - dimension
* OUTPUT: x - solution vector of A*x=b
*/
void LUPSolve(double A[N][N], int P[N], double b[N], double x[N])
{
for (int i = 0; i < N; i++) {
x[i] = b[P[i]];
for (int k = 0; k < i; k++) {//extra brackets added for readability
x[i] -= A[i][k] * x[k];
}
}
for (int i = N - 1; i >= 0; i--) {
for (int k = i + 1; k < N; k++) {//additional brackets added for readability
x[i] -= A[i][k] * x[k];
}
x[i] /= A[i][i];
}
}
//int main()
int main(void)//minimum signature for main includes void
{
//Note hardcoded arrays in this code require N == 5 (#define at top)
double Am[N][N] = {{0.6289, 0, 0.0128, 0.3184, 0.7151},
{0, 1, 0, 0, 0},
{0.0128, 0, 0.0021, 0.0045, 0.0380},
{0.3184, 0, 0.0045, 0.6618, 0.3371},
{0.7151, 0, 0.0380, 0.3371, 1.1381}};
double bm[N] = {1.6752, 0, 0.0574, 1.3217, 2.2283};
int Pm[N] = {0};
double X[N] = {0};
LUPDecompose( Am, 0.0001, Pm);
LUPSolve(Am, Pm, bm, X);
printf("%f %f %f %f %f",X[0],X[1],X[2],X[3],X[4]);
return 0; //int main(void){...} requires return statement.
}
Based on this calculator, with these inputs:
the correct solution is:
-0.590174531351002
0
-19.76923076923077
1.0517711171662125
2.6772727272727272
But the actual output from code above is:
Algorithm related debugging is left for you to perform.

Why are random values shown as output to find determinant of a matrix?

I wanted to find determinant of a M*M matrix by using recursion in C.
Here is the code I have tried in Ubuntu.
// Computing determinant of a MXM matrix
#include <stdio.h>
int determinant(int M, int A[10][10]) { //Function to calculate det(A)
int i, j, k, m, n, p, q, pow = 1;
int B[10][10];//assuming M does not cross 10
if (M == 1)
return A[0][0];
else {
det = 0;
for (k = 0; k < M; k += 1) {
m = 0;
n = 0; //m,n are indices of subdeterminant of A
for (i = 0; i < M; i += 1) {
for (j = 0; j < M; j += 1) {
if (i != 0 && j != k) {
B[m][n] = A[i][j]; //finding submatrix
if (n < (k - 2))
n += 1;
else {
n = 0;
m += 1;
}
}
}
}
det += pow * (A[0][k] * determinant(M - 1, B));
pow = -1 * pow;
}
return det;
}
}
int main() {
int M, i, j; // M is order of matrix A for which determinant has to be found
printf("Enter the order of matrix: ");
scanf("%d", &M);
int A[10][10];
printf("Enter matrix A: ");
for (i = 0; i < M; i += 1) {
for (j = 0; j < M; j += 1) {
scanf("%d", &A[i][j]); //Entering elements of matrix A
}
}
printf("Given matrix A is: \n");
for (i = 0; i < M; i += 1) {
for (j = 0; j < M; j += 1) {
printf("%d ", A[i][j]);
}
printf("\n");
}
int det = determinant(M, A);
printf("The determinant of given matrix is %d\n", det);
return 0;
}
This code works fine for a matrix of order 2. But for higher orders, the output is some random number. I am unable to identify any mistake in this. Can anyone explain why the output is not as expected and how to rectify the code to get the expected output?
The inner loop that extracts the submatrix B from A seems broken.
Here is a simpler version:
for (i = 1, m = 0; i < M; i++, m++) {
for (j = 0, n = 0; j < k; j++, n++)
B[m][n] = A[i][j];
for (j = k + 1; j < M; j++, n++)
B[m][n] = A[i][j];
}

Why does my code return -nan in visual studio, but not in Linux?

My Gauss Elimination code's results are -nan in visual studio, but not in Linux.
And the Linux results are awful because at func Gauss_Eli how many I increase the variable k at for blocks the func is working... doesn't occur segment error.
What is wrong with my code?
float ** Gauss_Eli(float ** matrix, int n) {
// -----------------------------------------------------
// | |
// | Eliminate elements except (i, i) element |
// | |
// -----------------------------------------------------
// Eliminate elements at lower triangle part
for (int i = 0; i < n; i++) {
for (int j = i + 1; j < n; j++) {
for (int k = 0; k < n + 1; k++) {
float e;
e = matrix[i][k] * (matrix[j][i] / matrix[i][i]);
matrix[j][k] -= e;
}
}
}
// Eliminate elements at upper triangle part
for (int i = n - 1; i >= 0; i--) {
for (int j = i - 1; j >= 0; j--) {
for (int k = 0; k < n + 1; k++) {
float e;
e = matrix[i][k] * (matrix[j][i] / matrix[i][i]);
matrix[j][k] -= e;
}
}
}
// Make 1 elements i, i
for (int i = 0; i < n; i++)
for (int j = 0; j < n + 1; j++) matrix[i][j] /= matrix[i][i];
return matrix;
}
int main() {
float ** matrix;
int n;
printf("Matrix Size : ");
scanf("%d", &n);
// Malloc variable matrix for Matrix
matrix = (float**)malloc(sizeof(float) * n);
for (int i = 0; i < n; i++) matrix[i] = (float*)malloc(sizeof(float) * (n + 1));
printf("Input elements : \n");
for (int i = 0; i < n; i++)
for (int j = 0; j < n + 1; j++) scanf("%f", &matrix[i][j]);
matrix = Gauss_Eli(matrix, n);
printf("Output result : \n");
//Print matrix after elimination
for (int i = 0; i < n; i++) {
for (int j = 0; j < n + 1; j++) printf("%.6f ", matrix[i][j]);
printf("\n");
}
return 0;
}
1.) OP allocates memory using the wrong type. This may lead to issues of insufficient memory and all sorts of UB and explain the difference between systems as they could have differing pointer and float sizes.
float ** matrix;
// v--- wrong type
// matrix = (float**)malloc(sizeof(float) * n);
Instead allocate to the size of the referenced variable. Easier to code (and get right), review and maintain.
matrix = malloc(sizeof *matrix * n);
if (matrix == NULL) Handle_Error();
2.) Code should look for division by 0.0
//for (int k = 0; k < n + 1; k++) {
// float e;
// e = matrix[i][k] * (matrix[j][i] / matrix[i][i]);
// matrix[j][k] -= e;
//}
if (matrix[i][i] == 0.0) Handle_Error();
float m = matrix[j][i] / matrix[i][i];
for (int k = 0; k < n + 1; k++) {
matrix[j][k] -= matrix[i][k]*m;
}
3.) General problem solving tips:
Check return values of scanf("%f", &matrix[i][j]);. It is 1?
Enable all warnings.
Especially for debug, print FP using "%e" rather than "%f".
4.) Numerical analysis tip: Insure exact subtraction when i==j
if (i == j) {
for (int k = 0; k < n + 1; k++) {
matrix[j][k] = 0.0;
}
else {
if (matrix[i][i] == 0.0) Handle_Divide_by_0();
float m = matrix[j][i] / matrix[i][i];
for (int k = 0; k < n + 1; k++) {
matrix[j][k] -= matrix[i][k]*m;
}
}

Expanding the QR Decomposition of square matrices to tall matrices

I'm trying to convert the QR Decomposition from "Numerical Recipes in C" to work with skinny (tall) matrices. Does anyone know how to do this? I believe the problem lies in how the householder is multiplied with the A but I am unable to figure it out.
void qrdcmp(float **a, int n, float *c, float *d, int *sing)
{
int i,j,k;
float scale,sigma,sum,tau;
*sing=0;
for (k = 1; k < n; k++) {
scale = 0.0;
for (i = k; i <= n; i++) scale = FMAX(scale, fabs(a[i][k]));
if (scale == 0.0) { // Singular case.
*sing = 1;
c[k] = d[k] = 0.0;
} else { // Form Qk and Qk ยท A.
for (i = k; i <= n; i++) a[i][k] /= scale;
for (sum = 0.0, i = k; i <= n; i++) sum += SQR(a[i][k]);
sigma = SIGN(sqrt(sum), a[k][k]);
a[k][k] += sigma;
c[k] = sigma * a[k][k];
d[k] = -scale * sigma;
for (j = k + 1; j <= n; j++) {
for (sum = 0.0, i = k; i <= n; i++) sum += a[i][k] * a[i][j];
tau = sum / c[k];
for (i = k; i <= n; i++) a[i][j] -= tau * a[i][k];
}
}
}
d[n] = a[n][n];
if (d[n] == 0.0) *sing=1;
}

Resources