Strassen matrix multiplication implementation

Strassen matrix multiplication implementation - c

I have written the below code for Strassen matrix multiplication. I know it's big but you don't need to go through the whole thing. My problem is that during compile time, the Strassen function with parameters of a[][num],b[][num] and c[][num] doesn't have a fixed value of num. This is where i am going wrong. I need to take the input of num in the main and that is why it cannot be globally given a value. How can i fix this? My code:
#include <stdio.h>
int num;
void strassen(int a[][num], int b[][num], int c[][num], int size) {
int p1[size/2][size/2], p2[size/2][size/2], p3[size/2][size/2], p4[size/2][size/2], p5[size/2][size/2], p6[size/2][size/2], p7[size/2][size/2];
int temp1[size/2][size/2], temp2[size/2][size/2];
int q1, q2, q3, q4, q5, q6, q7, i, j;
if(size >= 2) { //give recursive calls
//p1
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp1[i][j] = a[i][j] + a[i + size / 2][j + size / 2];
}
}
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp2[i][j] = b[i][j] + b[i + size / 2][j + size / 2];
}
}
num = size / 2;
strassen(temp1, temp2, p1, size / 2);
//p2
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp1[i][j] = a[i + size / 2][j] + a[i + size / 2][j + size / 2];
}
}
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp2[i][j] = b[i][j];
}
}
num = size / 2;
strassen(temp1, temp2, p2, size / 2);
//p3
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp1[i][j] = a[i][j];
}
}
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp2[i][j] = b[i][j + size / 2] - b[i + size / 2][j + size / 2];
}
}
num = size / 2;
strassen(temp1, temp2, p3, size / 2);
//p4
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp1[i][j] = a[i + size / 2][j + size / 2];
}
}
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp2[i][j] = b[i + size / 2][j] - b[i][j];
}
}
num = size / 2;
strassen(temp1, temp2, p4, size / 2);
//p5
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp1[i][j] = a[i][j] + a[i][j + size / 2];
}
}
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp2[i][j] = b[i + size / 2][j + size / 2];
}
}
num = size / 2;
strassen(temp1, temp2, p5, size / 2);
//p6
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp1[i][j] = a[i + size / 2][j] - a[i][j];
}
}num = size / 2;
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp2[i][j] = b[i][j] + b[i][j + size / 2];
}
}
num = size / 2;
strassen(temp1, temp2, p6, size / 2);
//p7
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp1[i][j] = a[i][j + size / 2] - a[i + size / 2][j + size / 2];
}
}
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
temp2[i][j] = b[i + size / 2][j] + b[i + size / 2][j + size / 2];
}
}
num = size / 2;
strassen(temp1, temp2, p7, size / 2);
//c11
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
c[i][j] = p1[i][j] + p4[i][j] - p5[i][j] + p7[i][j];
}
}
//c12
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
c[i][j + size / 2] = p3[i][j] + p5[i][j];
}
}
//c21
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
c[i + size / 2][j] = p2[i][j] + p4[i][j];
}
}
//c22
for(i = 0; i < size / 2; i++) {
for(j = 0; j < size / 2; j++) {
c[i + size / 2][j + size / 2] = p1[i][j] + p3[i][j] - p2[i][j] + p6[i][j];
}
}
}
else if(size == 1) {
c[0][0] = a[0][0] * b[0][0];
}
}
int padding(int num) {
int original_num = num, lower_power = 0, i, actual_num = 1;
if(num == 1)
return 1;
while(num > 1) {
lower_power++;
num /= 2;
}
for(i = 0; i < lower_power; i++) {
actual_num *= 2;
}
if(actual_num == original_num)
return original_num;
else
return actual_num * 2;
}
int main() {
int i, j, temp;
printf("Enter the size of nxn matrix:\n");
scanf("%d", &num);
temp = num;
if(num <= 0)
return 0;
num = padding(num);
int a[num][num], b[num][num], c[num][num];
printf("Enter matrix a:\n"); //accept inputs for a and b from the user
for(i = 0; i < temp; i++) {
for(j = 0; j < temp; j++) {
scanf("%d", &a[i][j]);
}
for(j = temp; j < num; j++) {
a[i][j] = 0;
}
}
for(i = temp; i < num; i++)
for(j = 0; j < num; j++)
a[i][j] = 0;
printf("\nEnter matrix b:\n");
for(i = 0; i < temp; i++) {
for(j = 0; j < temp; j++) {
scanf("%d", &b[i][j]);
}
for(j = temp; j < num; j++) {
b[i][j] = 0;
}
}
for(i = temp; i < num; i++)
for(j = 0; j < num; j++)
b[i][j] = 0;
printf("Matrix a:\n"); //printing the actual matrices for strassen's multiplication
for(i = 0; i < num; i++) {
for(j = 0; j < num; j++) {
printf("%d ", a[i][j]);
}
printf("\n");
}
printf("\nMatrix b:\n");
for(i = 0; i < num; i++) {
for(j = 0; j < num; j++) {
printf("%d ", b[i][j]);
}
printf("\n");
}
strassen(a, b, c, num);
printf("\nMatrix c is:\n");
for(i = 0; i < temp; i++) {
for(j = 0; j < temp; j++) {
printf("%d ", c[i][j]);
}
printf("\n");
}
return 0;
}

You can (perhaps even should) use int ** instead, and pass num as an argument. As a simplified example, here is how you might compute the sum of a dynamically sized array:
int sum(int *in, int len) {
int out = 0;
for(int i = 0; i < len; i++)
out += in[i];
return out;
}

Related

C, matrix transposed multiplication using dynamic memory allocation

I'm trying to transpose and multiply some matrices, basically
I get 2 matrices, matrixA and matrixB the action to be performed is trace(transpose(matrixA)*matrixB).
I managed to get this working for nxn matrices but I can't get it to work with mxn where (n>m or m>n).
I've looked online for solutions but I can't implement theirs solution into mine.
I removed almost all the code to simplify reading, if you prefer the entire code I linked it here.
If you do want to run the entire code, to recreate the problem use the following commands:
zeroes matrixA 2 3
zeroes matrixB 2 3
set matrixA
1 2 3 4 5 6
set matrixB
6 5 4 3 2 1
frob matrixA matrixB
The above commands should return Sum 56 but instead I get Sum 18
int* matrixATransposed = (int*) malloc(matrixARowLenght * matrixAColLenght * sizeof(int));
for (int i = 0; i < matrixARowLenght; i++)
{
for (int j = 0; j < matrixAColLenght; j++)
{
*(matrixATransposed + i * matrixAColLenght + j) = *(matrixA + j * matrixAColLenght + i);
}
}
// Multiply
int* mulRes = (int*)malloc(matrixARowLenght * matrixAColLenght * sizeof(int));
for (int i = 0; i < matrixAColLenght; i++) {
for (int j = 0; j < matrixBColLenght; j++) {
*(mulRes + i * matrixARowLenght + j) = 0;
for (int k = 0; k < matrixARowLenght; k++)
*(mulRes + i * matrixAColLenght + j) += *(matrixATransposed + i * matrixAColLenght + k) * *(matrixB + k * matrixBColLenght + j);
}
}
// Sum the trace
int trace = 0;
for (int i = 0; i < matrixARowLenght; i++) {
for (int j = 0; j < matrixAColLenght; j++) {
if (i == j) {
trace += *(mulRes + i * matrixAColLenght + j);
}
}
}
printf_s("Sum: %d\n", trace);

Your array indices for calculating the transpose, multiplication, and the trace seem to be incorrect. I've corrected them in the following program:
#include <stdlib.h>
#include <stdio.h>
int main(int argc, char **argv) {
int matrixARowLenght = 2;
int matrixAColLenght = 3;
int matrixA[] = {1,2,3,4,5,6};
int matrixBRowLenght = 2;
int matrixBColLenght = 3;
int matrixB[] = {6,5,4,3,2,1};
// Transpose
int* matrixATransposed = (int *) malloc(matrixARowLenght * matrixAColLenght * sizeof(int));
for (int i = 0; i < matrixAColLenght; i++) {
for (int j = 0; j < matrixARowLenght; j++) {
*(matrixATransposed + i * matrixARowLenght + j) = *(matrixA + j * matrixAColLenght + i);
}
}
// Multiply
int *mulRes = (int *) malloc(matrixARowLenght * matrixAColLenght * sizeof(int));
for (int i = 0; i < matrixAColLenght; ++i) {
for (int j = 0; j < matrixAColLenght; ++j) {
*(mulRes + (i * matrixAColLenght) + j) = 0;
for (int k = 0; k < matrixARowLenght; ++k) {
*(mulRes + (i * matrixAColLenght) + j) += *(matrixATransposed + (i * matrixARowLenght) + k) * *(matrixB + (k * matrixAColLenght) + j);
}
}
}
free(matrixATransposed);
// Sum the trace
int trace = 0;
for (int i = 0; i < matrixAColLenght; i++) {
for (int j = 0; j < matrixAColLenght; j++) {
if (i == j) {
trace += *(mulRes + i * matrixAColLenght + j);
}
}
}
printf("Sum: %d\n", trace);
free(mulRes);
return 0;
}
The above program will output your expected value:
Sum: 56
** UPDATE **
As pointed by MFisherKDX, the above code will not work if the result matrix is not a square matrix. The following code fixes this issue:
#include <stdlib.h>
#include <stdio.h>
int main(int argc, char **argv) {
int matrixARowLenght = 2;
int matrixAColLenght = 3;
int matrixA[] = {1,2,3,4,5,6};
int matrixBRowLenght = 2;
int matrixBColLenght = 4;
int matrixB[] = {8,7,6,5,4,3,2,1};
// Transpose
int* matrixATransposed = (int *) malloc(matrixARowLenght * matrixAColLenght * sizeof(int));
for (int i = 0; i < matrixAColLenght; i++) {
for (int j = 0; j < matrixARowLenght; j++) {
*(matrixATransposed + i * matrixARowLenght + j) = *(matrixA + j * matrixAColLenght + i);
}
}
// Multiply
int *mulRes = (int *) malloc(matrixAColLenght * matrixBColLenght * sizeof(int));
for (int i = 0; i < matrixAColLenght; ++i) {
for (int j = 0; j < matrixBColLenght; ++j) {
*(mulRes + (i * matrixBColLenght) + j) = 0;
for (int k = 0; k < matrixARowLenght; ++k) {
*(mulRes + (i * matrixBColLenght) + j) += *(matrixATransposed + (i * matrixARowLenght) + k) * *(matrixB + (k * matrixBColLenght) + j);
}
}
}
free(matrixATransposed);
// Sum the trace
int trace = 0;
for (int i = 0; i < matrixAColLenght; i++) {
for (int j = 0; j < matrixBColLenght; j++) {
if (i == j) {
trace += *(mulRes + i * matrixBColLenght + j);
}
}
}
printf("Sum: %d\n", trace);
free(mulRes);
return 0;
}
This code will output the following as expected:
Sum: 83

Why does my code return -nan in visual studio, but not in Linux?

My Gauss Elimination code's results are -nan in visual studio, but not in Linux.
And the Linux results are awful because at func Gauss_Eli how many I increase the variable k at for blocks the func is working... doesn't occur segment error.
What is wrong with my code?
float ** Gauss_Eli(float ** matrix, int n) {
// -----------------------------------------------------
// | |
// | Eliminate elements except (i, i) element |
// | |
// -----------------------------------------------------
// Eliminate elements at lower triangle part
for (int i = 0; i < n; i++) {
for (int j = i + 1; j < n; j++) {
for (int k = 0; k < n + 1; k++) {
float e;
e = matrix[i][k] * (matrix[j][i] / matrix[i][i]);
matrix[j][k] -= e;
}
}
}
// Eliminate elements at upper triangle part
for (int i = n - 1; i >= 0; i--) {
for (int j = i - 1; j >= 0; j--) {
for (int k = 0; k < n + 1; k++) {
float e;
e = matrix[i][k] * (matrix[j][i] / matrix[i][i]);
matrix[j][k] -= e;
}
}
}
// Make 1 elements i, i
for (int i = 0; i < n; i++)
for (int j = 0; j < n + 1; j++) matrix[i][j] /= matrix[i][i];
return matrix;
}
int main() {
float ** matrix;
int n;
printf("Matrix Size : ");
scanf("%d", &n);
// Malloc variable matrix for Matrix
matrix = (float**)malloc(sizeof(float) * n);
for (int i = 0; i < n; i++) matrix[i] = (float*)malloc(sizeof(float) * (n + 1));
printf("Input elements : \n");
for (int i = 0; i < n; i++)
for (int j = 0; j < n + 1; j++) scanf("%f", &matrix[i][j]);
matrix = Gauss_Eli(matrix, n);
printf("Output result : \n");
//Print matrix after elimination
for (int i = 0; i < n; i++) {
for (int j = 0; j < n + 1; j++) printf("%.6f ", matrix[i][j]);
printf("\n");
}
return 0;
}

1.) OP allocates memory using the wrong type. This may lead to issues of insufficient memory and all sorts of UB and explain the difference between systems as they could have differing pointer and float sizes.
float ** matrix;
// v--- wrong type
// matrix = (float**)malloc(sizeof(float) * n);
Instead allocate to the size of the referenced variable. Easier to code (and get right), review and maintain.
matrix = malloc(sizeof *matrix * n);
if (matrix == NULL) Handle_Error();
2.) Code should look for division by 0.0
//for (int k = 0; k < n + 1; k++) {
// float e;
// e = matrix[i][k] * (matrix[j][i] / matrix[i][i]);
// matrix[j][k] -= e;
//}
if (matrix[i][i] == 0.0) Handle_Error();
float m = matrix[j][i] / matrix[i][i];
for (int k = 0; k < n + 1; k++) {
matrix[j][k] -= matrix[i][k]*m;
}
3.) General problem solving tips:
Check return values of scanf("%f", &matrix[i][j]);. It is 1?
Enable all warnings.
Especially for debug, print FP using "%e" rather than "%f".
4.) Numerical analysis tip: Insure exact subtraction when i==j
if (i == j) {
for (int k = 0; k < n + 1; k++) {
matrix[j][k] = 0.0;
}
else {
if (matrix[i][i] == 0.0) Handle_Divide_by_0();
float m = matrix[j][i] / matrix[i][i];
for (int k = 0; k < n + 1; k++) {
matrix[j][k] -= matrix[i][k]*m;
}
}

SSE memory access

I need to perform Gaussian Elimination using SSE and I am not sure how to access each element(32 bits) from the 128 bit registers(each storing 4 elements). This is the original code(without using SSE):
unsigned int i, j, k;
for (i = 0; i < num_elements; i ++) /* Copy the contents of the A matrix into the U matrix. */
for(j = 0; j < num_elements; j++)
U[num_elements * i + j] = A[num_elements*i + j];
for (k = 0; k < num_elements; k++){ /* Perform Gaussian elimination in place on the U matrix. */
for (j = (k + 1); j < num_elements; j++){ /* Reduce the current row. */
if (U[num_elements*k + k] == 0){
printf("Numerical instability detected. The principal diagonal element is zero. \n");
return 0;
}
/* Division step. */
U[num_elements * k + j] = (float)(U[num_elements * k + j] / U[num_elements * k + k]);
}
U[num_elements * k + k] = 1; /* Set the principal diagonal entry in U to be 1. */
for (i = (k+1); i < num_elements; i++){
for (j = (k+1); j < num_elements; j++)
/* Elimnation step. */
U[num_elements * i + j] = U[num_elements * i + j] -\
(U[num_elements * i + k] * U[num_elements * k + j]);
U[num_elements * i + k] = 0;
}
}
Okay I'm getting segmentation fault[core dumped] with this code. I'm new to SSE. Can someone help? Thanks.
int i,j,k;
__m128 a_i,b_i,c_i,d_i;
for (i = 0; i < num_rows; i++)
{
for (j = 0; j < num_rows; j += 4)
{
int index = num_rows * i + j;
__m128 v = _mm_loadu_ps(&A[index]); // load 4 x floats
_mm_storeu_ps(&U[index], v); // store 4 x floats
}
}
for (k = 0; k < num_rows; k++){
a_i= _mm_load_ss(&U[num_rows*k+k]);
for (j = (4*k + 1); j < num_rows; j+=4){
b_i= _mm_loadu_ps(&U[num_rows*k+j]);// Reduce the currentrow.
if (U[num_rows*k+k] == 0){
printf("Numerical instability detected.);
}
/* Division step. */
b_i = _mm_div_ps(b_i, a_i);
}
a_i = _mm_set_ss(1);
for (i = (k+1); i < num_rows; i++){
d_i= _mm_load_ss(&U[num_rows*i+k]);
for (j = (4*k+1); j < num_rows; j+=4){
c_i= _mm_loadu_ps(&U[num_rows*i+j]); /* Elimnation step. */
b_i= _mm_loadu_ps(&U[num_rows*k+j]);
c_i = _mm_sub_ps(c_i, _mm_mul_ss(b_i,d_i));
}
d_i= _mm_set_ss(0);
}
}

In order to get you started, your first loop should be more like this:
for (i = 0; i < num_elements; i++)
{
for (j = 0; j < num_elements; j += 4)
{
int index = num_elements * i + j;
__m128i v = _mm_loadu_ps((__m128i *)&A[index]); // load 4 x floats
_mm_storeu_ps((__m128i *)&U[index], v); // store 4 x floats
}
}
This assumes that num_elements is a multiple of 4, and that neither A nor U is correctly aligned.

I'm not seeing performance boost while using optimised memory bandwidth method

I was presented example of a loop which should be slower than the one after this:
for (i = 0; i < 1000; i++)
column_sum[i] = 0.0;
for (j = 0; j < 1000; j++)
column_sum[i] += b[j][i];
Comparing to this one:
for (i = 0; i < 1000; i++)
column_sum[i] = 0.0;
for (j = 0; j < 1000; j++)
for (i = 0; i < 1000; i++)
column_sum[i] += b[j][i];
Now, I coded a tool to test number of different index numbers, but I am not seeing much of performance advantage there after I tried this concept, and I'm afraid that my code has something to do with it...
Should be slower loop that works within my code:
for (i = 0; i < val; i++){
column_sum[i] = 0.0;
for (j = 0; j < val; j++){
int index = i * (int)val + j;
column_sum[i] += p[index];
}
}
Should be "significantly" faster code:
for (i = 0; i < val; i++) {
column_sum[i] = 0.0;
}
for (j = 0; j < val; j++) {
for (i = 0; i < val; i++) {
int index = j * (int)val + i;
column_sum[i] += p[index];
}
}
Data comparison:

I had confused the Index values in the loops: int index = j * (int)val + i;
Slower loop:
for (i = 0; i < val; i++) {
column_sum[i] = 0.0;
for (j = 0; j < val; j++){
int index = j * (int)val + i;
column_sum[i] += p[index];
}
}
Faster loop:
for (i = 0; i < val; i++) {
column_sum[i] = 0.0;
}
for (j = 0; j < val; j++) {
for (i = 0; i < val; i++) {
int index = j * (int)val + i;
column_sum[i] += p[index];
}
}

Initializing a 2d array in C

Here is my code:
int main() {
int x, y;
int *xptr, *yptr;
int array[10][10];
int j;
int k;
int z = 0;
for(j = 0; j < 10; j++) {
for(k = 0; k < 10; k++) {
array[j][k] = j * 10 + k;
}
}
xptr = &array[0][0];
for(j = 0; j < 10; j++) {
for(k = 0; k < 10; k++) {
printf("array[%d][%d] = %d \n", j, k, *(xptr + j), (xptr + k));
}
}
system("PAUSE");
}
I am trying to initialize a 2d array so that at [0][0] it equals 0 and at [9][9] it equals 99. With the way that it is now, [0][0-9] all equal 0 and then [1][0-9] all equal 1. How would I properly load this array in the fashion that I mentioned?

for(j = 0; j < 10; j++) {
for(k = 0; k < 10; k++) {
array[j][k] = j*10 + k;
}
}

I'm assuming you've actually declared everything, but didn't include it in the example. You simply want
array[j][k] = j*10 + k;

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

Strassen matrix multiplication implementation - c

You can (perhaps even should) use int ** instead, and pass num as an argument. As a simplified example, here is how you might compute the sum of a dynamically sized array: int sum(int *in, int len) { int out = 0; for(int i = 0; i < len; i++) out += in[i]; return out; }

Related

C, matrix transposed multiplication using dynamic memory allocation

Why does my code return -nan in visual studio, but not in Linux?

SSE memory access

I'm not seeing performance boost while using optimised memory bandwidth method

Initializing a 2d array in C

Categories

Resources