Related
How would you convert this recursive function into iterative?
/* Recursive definition: sum(n) = sum(n - 1) * (n - 1) + n, and sum(n) = 1 for n < 1. */
int sum(int n) {
    if (n < 1) return 1;
    return sum(n - 1) * (n - 1) + n;
}
Can I do it like this? (Refer below)
// Attempted iterative rewrite, as posted. The loop body still calls sum() itself,
// so the computation remains recursive.
int sum(int n){
if(n<1) return 1;
// n is never modified inside the loop, so the condition stays true once the loop is entered.
while(n >= 1){
// `sum` is the name of this function, not a variable; it cannot be assigned to.
sum = sum(n-1) * (n-1) + n;
}
return sum;
}
Your answer is still in a recursive form.
Observe that you have a base case, i.e. where n = 0. This is the initial value of your sum. You can then iterate over n, for each iteration applying the formula given for the sum. The iterative case, then, looks like this:
/* Iterative evaluation of the recurrence
 *     sum(0) = 1,   sum(i) = sum(i - 1) * (i - 1) + i
 * built bottom-up from the base case. Any n < 1 yields the base value 1. */
int sum_iterative (int n) {
    int acc = 1;                 /* value for n = 0 */
    int step = 1;
    while (step <= n) {
        acc = acc * (step - 1) + step;
        step++;
    }
    return acc;
}
I'm trying to make a simple console application in C which will calculate the determinant of a matrix using Gaussian elimination. After a lot of tests I found out that my program is not working because of a "core dumped" error. After 2 days of editing and undoing, I could not find the problem.
Any help is more than welcomed.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Find a pivot row for elimination step k in the n x n row-major matrix A
 * (rows and columns are numbered from 1).
 *
 * Returns k when the diagonal entry A[k][k] is non-zero; otherwise the number of the
 * first row below k whose entry in column k is non-zero; -1 when column k has no
 * non-zero entry on or below the diagonal.
 */
int recherche_pivot(int k, int n, float *A)
{
    int i;

    if (A[((k - 1) * n + k) - 1] != 0)
    {
        return k;
    }
    /* Walk down the rest of column k. The candidate row i selects the row offset
       (i - 1) * n; using (k - 1) * n + i would walk along row k instead, and the
       value returned would be a column number that callers then treat as a row. */
    for (i = k + 1; i <= n; i++)
    {
        if (A[((i - 1) * n + k) - 1] != 0)
        {
            return i;
        }
    }
    return -1;
}
/*
 * Scale row i (numbered from 1) of the augmented system [A | b] by p.
 * The n scaled coefficients are written to x[0..n-1] and the scaled right-hand
 * side to x[n], so x must have room for n + 1 floats.
 */
void fois(int n, float p, int i, float * A, float *b, float * x)
{
    const float *row = A + (i - 1) * n;
    int col = 0;

    while (col < n)
    {
        x[col] = row[col] * p;
        col++;
    }
    x[n] = b[i - 1] * p;
}
/*
 * Print the augmented system [X | b]: for each row, the n coefficients followed by
 * " | " and the right-hand side, then a blank line. Four newlines close the dump.
 */
void afficher_system(int n, float * X, float *b)
{
    int i, j;
    for (i = 1; i <= n; i++)
    {
        for (j = 1; j <= n; j++)
            printf("%f ", X[((i - 1) * n + j) - 1]);
        printf(" | %f", b[i - 1]);
        /* The literal was "nn": the backslashes of the newline escapes were missing. */
        printf("\n\n");
    }
    printf("\n\n\n\n");
}
/* Read the n * n entries of A from standard input with scanf("%f"),
   row by row (row-major storage). */
void saisirmatrice(int n, float *A)
{
    int row, col;
    for (row = 0; row < n; row++)
    {
        for (col = 0; col < n; col++)
        {
            scanf("%f", &A[row * n + col]);
        }
    }
}
/* Print every entry of the n x n row-major matrix A as "A[i][j] = value",
   one entry per line, with i and j numbered from 1. */
void affichermatrice(int n, float *A)
{
    int i, j;
    for (i = 1; i <= n; i++)
    {
        for (j = 1; j <= n; j++)
        {
            /* The format ended in "%fn": the backslash of the newline escape was missing. */
            printf("A[%d][%d] = %f\n", i, j, A[((i - 1) * n + j) - 1]);
        }
    }
}
/*
 * One Gaussian-elimination step on the augmented system [A | b]: for every row i
 * below the pivot row k (both numbered from 1), add (-A[i][k] / A[k][k]) times row k
 * to row i, so that column k becomes zero below the diagonal. The system is printed
 * after each row update.
 *
 * The caller must make sure A[k][k] is non-zero.
 */
void elemination(int n, int k, float *b, float *A)
{
    int i, j;
    float *pivot_row = A + (k - 1) * n;

    for (i = k + 1; i <= n; i++)
    {
        float *row = A + (i - 1) * n;
        float piv = -1 * (row[k - 1] / pivot_row[k - 1]);

        /* Update the row in place. A scratch buffer is not needed: allocating n floats
           and storing n + 1 values in it overruns the allocation, and the buffer was
           never released. */
        for (j = 0; j < n; j++)
        {
            row[j] = row[j] + pivot_row[j] * piv;
        }
        b[i - 1] = b[i - 1] + b[k - 1] * piv;
        afficher_system(n, A, b);
    }
}
/* Swap rows i and j (numbered from 1) of the n x n row-major matrix A, together
   with the matching entries of the right-hand side b. */
void permutter(int n, float * A, int i, int j, float * b)
{
    float *row_i = A + (i - 1) * n;
    float *row_j = A + (j - 1) * n;
    float held;
    int col;

    for (col = 0; col < n; col++)
    {
        held = row_i[col];
        row_i[col] = row_j[col];
        row_j[col] = held;
    }
    held = b[i - 1];
    b[i - 1] = b[j - 1];
    b[j - 1] = held;
}
/*
 * Read the size n and the entries of an n x n matrix, reduce the matrix with
 * Gaussian elimination (exchanging rows when the diagonal entry is zero) and print
 * the determinant: the product of the diagonal of the reduced matrix, negated once
 * per row exchange, or 0 when no pivot can be found.
 */
int main(void)
{
    float *A, *b, det;
    int i, i0, n, k;

    printf("Veuillez donner la taille de la matrice");
    if (scanf("%d", &n) != 1 || n < 1)
    {
        return 1;
    }
    A = malloc((size_t) n * (size_t) n * sizeof *A);
    /* permutter() and elemination() also update a right-hand side vector; only the
       determinant is needed here, so hand them a zero-filled one instead of
       uninitialised memory. */
    b = calloc((size_t) n, sizeof *b);
    if (A == NULL || b == NULL)
    {
        free(A);
        free(b);
        return 1;
    }
    printf("Veuillez remplir la matrice");
    saisirmatrice(n, A);

    det = 1;
    /* A single pass over the pivots. Looping again after a failed pivot search would
       repeat the same failing search forever. */
    for (k = 1; k <= n; k++)
    {
        i0 = recherche_pivot(k, n, A);
        if (i0 == -1)
        {
            /* no pivot available: the matrix is singular */
            det = 0;
            break;
        }
        if (i0 != k)
        {
            /* a row exchange flips the sign of the determinant */
            det = -det;
            permutter(n, A, k, i0, b);
        }
        elemination(n, k, b, A);
    }
    if (det != 0)
    {
        /* include the last diagonal entry as well (i <= n) */
        for (i = 1; i <= n; i++)
        {
            det = det * A[((i - 1) * n + i) - 1];
        }
    }
    printf("Le determinant est :%f", det);
    free(A);
    free(b);
    return 0;
}
There are many problems in the above code. Since arrays are zero-indexed in C, you should count the rows and columns of your matrices starting from zero, instead of counting from 1 and then attempting to convert when array-indexing. There is no need to cast the result of malloc(), and it is better to use an identifier rather than an explicit type as the argument for the sizeof operator:
A = malloc(sizeof(*A) * n * n);
You allocate space for L and R in main(), and then never use these pointers until the end of the program when they are freed. Then you allocate for L within the elemination() function; but you never free this memory, so you have a memory leak. You also allocate space for b in main(), but you don't store any values in b before passing it to the elemination() function. This is bound to cause problems.
There is no need for dynamic allocation here in the first place; I suggest using a variable length array to store the elements of the matrix. These have been available since C99, and will allow you to avoid all of the allocation issues.
There is a problem in the recherche_pivot() function, where you compare:
if(A[((k - 1) * n + i) - 1] != 0) {}
This is a problem because the array element is a floating point value which is the result of arithmetic operations; this value should not be directly compared with 0. I suggest selecting an appropriate DELTA value to represent a zero range, and instead comparing:
#define DELTA 0.000001
...
if (fabs(A[((k - 1) * n + i) - 1]) < DELTA) {}
In the permutter() function you use an array, float t[n];, to hold temporary values. But an array is unnecessary here since you don't need to save these temporary values after the swap; instead just use float t;. Further, when you interchange the values in b[], you use t[n] to store the temporary value, but this is out of bounds.
The elemination() function should probably iterate over all of the rows (excepting the kth row), rather than starting from the kth row, or it should start at the k+1th row. As it is, the kth row is used to eliminate itself. Finally, the actual algorithm that you use to perform the Gaussian elimination in main() is broken. Among other things, the call permutter(n, A, k, i0, b); swaps the kth row with the i0th row, but i0 is the pivot column of the kth row. This makes no sense.
It actually looks like you want to do more than just calculate determinants with this code, since you have b, which is the constant vector of a linear system. This is not needed for the task alluded to in the title of your question. Also, it appears that your code gives a result of 1 for any 1X1 determinant. This is incorrect; it should be the value of the single number in this case.
The Gaussian elimination method for calculating the determinant requires that you keep track of how many row-interchanges are performed, and that you keep a running product of any factors by which individual rows are multiplied. Adding a multiple of one row to another row to replace that row does not change the value of the determinant, and this is the operation used in the reduce() function below. The final result is the product of the diagonal entries in the reduced matrix, multiplied by -1 once for every row-interchange operation, divided by the product of all of the factors used to scale individual rows. In this case, there are no such factors, so the result is simply the product of the diagonal elements of the reduced matrix, with the sign correction. This is the method used by the code posted in the original question.
There were so many issues here that I just wrote a fresh program that implements this algorithm. I think that it is close, at least in spirit, to what you were trying to accomplish. I did add some input validation for the size of the matrix, checking to be sure that the user inputs a positive number, and prompting for re-entry if the input is bad. The input loop that fills the matrix would benefit from similar input validation. Also note that the input size is stored in a signed int, to allow checks for negative input, and a successful input is cast and stored in a variable of type size_t, which is an unsigned integer type guaranteed to hold any array index. This is the correct type to use when indexing arrays, and you will note that size_t is used throughout the program.
#include <stdio.h>
#include <math.h>
#include <stdbool.h>
#define DELTA 0.000001
/* Print the mx_sz x mx_sz matrix mx, one row per line. */
void show_matrix(size_t mx_sz, double mx[mx_sz][mx_sz]);
/* Swap rows r1 and r2 of mx. */
void interchange(size_t r1, size_t r2, size_t mx_sz, double mx[mx_sz][mx_sz]);
/* Add factor * row r1 to row r2, replacing row r2. */
void reduce(double factor, size_t r1, size_t r2,
size_t mx_sz, double mx[mx_sz][mx_sz]);
/* Return the column of the first entry of `row` whose magnitude is not below DELTA,
   or mx_sz when the row has no such entry. */
size_t get_pivot(size_t row, size_t mx_sz, double mx[mx_sz][mx_sz]);
/* Reduce mx in place and return its determinant. */
double find_det(size_t mx_sz, double mx[mx_sz][mx_sz]);
/*
 * Prompt for a positive matrix size (re-prompting on bad input), read the n x n
 * entries, print the matrix, compute the determinant with find_det(), then print the
 * reduced matrix and the result. Returns 1 when input ends or an element cannot be
 * parsed.
 */
int main(void)
{
    size_t n;
    int read_val, c;
    int status;

    printf("Enter size of matrix: ");
    while ((status = scanf("%d", &read_val)) != 1 || read_val < 1) {
        if (status == EOF) {
            return 1;             // no more input: stop instead of prompting forever
        }
        while ((c = getchar()) != '\n' && c != EOF) {
            continue;             // discard extra characters
        }
        if (c == EOF) {
            return 1;
        }
        printf("Enter size of matrix: ");
    }
    n = (size_t) read_val;

    double matrix[n][n];

    printf("Enter matrix elements:\n");
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
            // an unread element would leave matrix[i][j] uninitialised
            if (scanf("%lf", &matrix[i][j]) != 1) {
                fprintf(stderr, "Invalid matrix element\n");
                return 1;
            }
        }
    }

    printf("You entered:\n");
    show_matrix(n, matrix);
    putchar('\n');

    double result = find_det(n, matrix);

    show_matrix(n, matrix);
    putchar('\n');
    printf("Determinant: %f\n", result);

    return 0;
}
/* Print an n x n matrix, one row per line, each entry formatted with "%7.2f". */
void show_matrix(size_t n, double mx[n][n])
{
    for (size_t r = 0; r < n; r++) {
        const double *row = mx[r];
        size_t c = 0;
        while (c < n) {
            printf("%7.2f", row[c]);
            c++;
        }
        putchar('\n');
    }
}
/* Exchange rows r1 and r2 of the mx_sz x mx_sz matrix, entry by entry. */
void interchange(size_t r1, size_t r2, size_t mx_sz, double mx[mx_sz][mx_sz])
{
    double *first = mx[r1];
    double *second = mx[r2];

    for (size_t col = 0; col < mx_sz; col++) {
        const double held = first[col];
        first[col] = second[col];
        second[col] = held;
    }
}
/* Replace row r2 by (row r2 + factor * row r1); row r1 is left unchanged. */
void reduce(double factor, size_t r1, size_t r2,
            size_t mx_sz, double mx[mx_sz][mx_sz])
{
    const double *source = mx[r1];
    double *target = mx[r2];
    size_t col = 0;

    while (col < mx_sz) {
        target[col] += (factor * source[col]);
        col++;
    }
}
/* Return the column of the first entry in `row` whose magnitude is not below DELTA;
   return mx_sz when every entry of the row is within DELTA of zero. */
size_t get_pivot(size_t row, size_t mx_sz, double mx[mx_sz][mx_sz])
{
    for (size_t col = 0; col < mx_sz; col++) {
        if (!(fabs(mx[row][col]) < DELTA)) {
            return col;
        }
    }
    return mx_sz;
}
/*
 * Determinant of the mx_sz x mx_sz matrix mx, computed by reducing mx in place.
 *
 * Adjacent rows are compared repeatedly: rows with the same pivot column are reduced
 * (a row multiple is added, which leaves the determinant unchanged) and rows whose
 * pivots are out of order are exchanged (which flips the sign). A row without a pivot
 * makes the determinant 0. Once a full sweep changes nothing, the result is the signed
 * product of the diagonal.
 */
double find_det(size_t mx_sz, double mx[mx_sz][mx_sz])
{
    double sign = 1.0;
    bool changed;

    do {
        changed = false;
        for (size_t lower = 1; lower < mx_sz; lower++) {
            const size_t upper = lower - 1;

            // determinant is zero if there is a zero row
            const size_t lead_upper = get_pivot(upper, mx_sz, mx);
            if (lead_upper == mx_sz) {
                return 0.0;
            }
            const size_t lead_lower = get_pivot(lower, mx_sz, mx);
            if (lead_lower == mx_sz) {
                return 0.0;
            }

            if (lead_upper == lead_lower) {
                const double factor = -mx[lower][lead_upper] / mx[upper][lead_upper];
                reduce(factor, upper, lower, mx_sz, mx);
                changed = true;
            } else if (lead_lower < lead_upper) {
                interchange(upper, lower, mx_sz, mx);
                sign = -sign;
                changed = true;
            }
        }
    } while (changed);

    double det = sign;
    for (size_t d = 0; d < mx_sz; d++) {
        det *= mx[d][d];
    }
    return det;
}
Sample session:
Enter size of matrix: oops
Enter size of matrix: 0
Enter size of matrix: -1
Enter size of matrix: 3
Enter matrix elements:
0 1 3
1 2 0
0 3 4
You entered:
0.00 1.00 3.00
1.00 2.00 0.00
0.00 3.00 4.00
1.00 2.00 0.00
-0.00 -3.00 -9.00
0.00 0.00 -5.00
Determinant: 5.000000
I used an R code which implements a permutation test for the distributional comparison between two populations of functions. We have p univariate p-values.
The bottleneck is the construction of a matrix which contains all the possible CONTIGUOUS p-values.
The last row of the matrix of p-values contain all the univariate p-values.
The penultimate row contains all the bivariate p-values in this order:
p_val_c(1,2), p_val_c(2,3), ..., p_val_c(p, 1)
...
The elements of the first row are coincident and the value associated is the p-value of the global test p_val_c(1,...,p)=p_val_c(2,...,p,1)=...=pval(p,1,...,p-1).
For computational reasons, I have decided to implement this component in c and use it in R with .C.
Here the code. The unique important part is the definition of the function Build_pval_asymm_matrix.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
// Fill the (*p) x (*p) row-major matrix pval_asymm_matrix. Its last row receives the
// *p univariate p-values from pval; every other entry is a count divided by *B,
// computed from pval and from L, which holds (*B) x (*p) values (entry (k, l) at
// L[l + (*p) * k]). p and B are passed by pointer.
void Build_pval_asymm_matrix(int * p, int * B, double * pval,
double * L,
double * pval_asymm_matrix);
// qsort comparison function for doubles (ascending order); used to sort T_temp
int cmp(const void *x, const void *y);
/*
 * Timing driver: build fictitious inputs (p univariate p-values and a B x p matrix L
 * whose columns are random permutations of 1/B, 2/B, ..., 1), call
 * Build_pval_asymm_matrix() once and print the CPU time it used.
 */
int main() {
    int B = 1000; // number Conditional Monte Carlo (CMC) runs
    int p = 100;  // number univariate tests

    // The buffers live on the heap: L alone holds B * p doubles, which is too large
    // to place on the stack safely.
    double *pval = malloc((size_t) p * sizeof *pval);
    double *L = malloc((size_t) B * (size_t) p * sizeof *L);
    double *temp_array = malloc((size_t) B * sizeof *temp_array);
    double *pval_asymm_matrix = calloc((size_t) p * (size_t) p,
                                       sizeof *pval_asymm_matrix);
    if (pval == NULL || L == NULL || temp_array == NULL || pval_asymm_matrix == NULL) {
        fprintf(stderr, "Out of memory\n");
        free(pval);
        free(L);
        free(temp_array);
        free(pval_asymm_matrix);
        return 1;
    }

    // Fictitious univariate p-values, drawn in (0, 1] so that log(pval) stays finite.
    for (int i = 0; i < p; i++) {
        pval[i] = ((double) rand() + 1.0) / ((double) RAND_MAX + 1.0);
    }

    // The j-th column of L is the empirical survival function of the test statistic
    // associated to the j-th coefficient of the basis expansion: a permutation of
    // the values 1/B, 2/B, ..., 1.
    for (int i = 0; i < B; i++) {
        temp_array[i] = (double) (i + 1) / (double) B;
    }
    for (int iter_coeff = 0; iter_coeff < p; iter_coeff++) {
        // Fisher-Yates shuffle: position k is swapped with a position in [0, k].
        // Swapping with a position drawn from the whole array gives a biased shuffle.
        for (int k = B - 1; k > 0; k--) {
            int j = rand() % (k + 1);
            double t = temp_array[j];
            temp_array[j] = temp_array[k];
            temp_array[k] = t;
        }
        for (int i = 0; i < B; i++) {
            L[iter_coeff + p * i] = temp_array[i];
        }
    }

    // Construct the asymmetric matrix of p-values
    clock_t start = clock();
    Build_pval_asymm_matrix(&p, &B, pval, L, pval_asymm_matrix);
    clock_t end = clock();
    double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("TOTAL CPU time used: %f\n", cpu_time_used);

    free(pval);
    free(L);
    free(temp_array);
    free(pval_asymm_matrix);
    return 0;
}
void Build_pval_asymm_matrix(int * p, int * B, double * pval,
double * L,
double * pval_asymm_matrix) {
int nbasis = *p, iter_CMC = *B;
// Scalar output fisher combining function applied on univariate
// p-values
double T0_temp = 0;
// Vector output fisher combining function applied on a set of
//columns of L
double T_temp[iter_CMC];
memset(T_temp, 0, sizeof(T_temp));
// Counter for elements of T_temp greater than or equal to T0_temp
int count = 0;
// Indexes for columns of L
int inf = 0, sup = 0;
// The last row of matrice_pval_asymm contains the univariate p-values
for(int i = 0; i < nbasis; i++) {
pval_asymm_matrix[i + nbasis * (nbasis - 1)] = pval[i];
}
// Construct the rows from bottom to up
for (int row = nbasis - 2; row >= 0; row--) {
for (int col = 0; col <= row; col++) {
T0_temp = 0;
memset(T_temp, 0, sizeof(T_temp));
inf = col;
sup = (nbasis - row) + col - 1;
// Combining function Fisher applied on
// p-values pval[inf:sup]
for (int k = inf; k <= sup; k++) {
T0_temp += log(pval[k]);
}
T0_temp *= -2;
// Combining function Fisher applied
// on columns inf:sup of matrix L
for (int k = 0; k < iter_CMC; k++) {
for (int l = inf; l <= sup; l++) {
T_temp[k] += log(L[l + nbasis * k]);
}
T_temp[k] *= -2;
}
// Sort the vector T_temp
qsort(T_temp, iter_CMC, sizeof(double), cmp);
// Count the number of elements of T_temp less than T0_temp
int h = 0;
while (h < iter_CMC && T_temp[h] < T0_temp) {
h++;
}
// Number of elements of T_temp greater than or equal to T0_temp
count = iter_CMC - h;
pval_asymm_matrix[col + nbasis * row] = (double) count / (double)iter_CMC;
}
// auxiliary variable for columns of L inf:nbasis-1 and 1:sup
int aux_first = 0, aux_second = 0;
int num_col_needed = 0;
for (int col = row + 1; col < nbasis; col++) {
T0_temp = 0;
memset(T_temp, 0, sizeof(T_temp));
inf = col;
sup = ((nbasis - row) + col) % nbasis - 1;
// Useful indexes
num_col_needed = nbasis - inf + sup + 1;
int index_needed[num_col_needed];
memset(index_needed, -1, num_col_needed * sizeof(int));
aux_first = inf;
for (int i = 0; i < nbasis - inf; i++) {
index_needed[i] = aux_first;
aux_first++;
}
aux_second = 0;
for (int j = 0; j < sup + 1; j++) {
index_needed[j + nbasis - inf] = aux_second;
aux_second++;
}
// Combining function Fisher applied on p-values
// pval[inf:p-1] and pval[0:sup-1]1]
for (int k = 0; k < num_col_needed; k++) {
T0_temp += log(pval[index_needed[k]]);
}
T0_temp *= -2;
// Combining function Fisher applied on columns inf:p-1 and 0:sup-1
// of matrix L
for (int k = 0; k < iter_CMC; k++) {
for (int l = 0; l < num_col_needed; l++) {
T_temp[k] += log(L[index_needed[l] + nbasis * k]);
}
T_temp[k] *= -2;
}
// Sort the vector T_temp
qsort(T_temp, iter_CMC, sizeof(double), cmp);
// Count the number of elements of T_temp less than T0_temp
int h = 0;
while (h < iter_CMC && T_temp[h] < T0_temp) {
h++;
}
// Number of elements of T_temp greater than or equal to T0_temp
count = iter_CMC - h;
pval_asymm_matrix[col + nbasis * row] = (double) count / (double)iter_CMC;
} // end for over col from row + 1 to nbasis - 1
} // end for over rows of asymm p-values matrix except the last row
}
/* qsort comparator for doubles, ascending: returns -1, 0 or 1 according to whether
   *x is less than, equal to, or greater than *y. */
int cmp(const void *x, const void *y)
{
    const double lhs = *(const double *) x;
    const double rhs = *(const double *) y;
    return (lhs > rhs) - (lhs < rhs);
}
Here the times of execution in seconds measured in R:
time_original_function
user system elapsed
79.726 1.980 112.817
time_function_double_for
user system elapsed
79.013 1.666 89.411
time_c_function
user system elapsed
47.920 0.024 56.096
The first measure was obtained using an equivalent R function with duplication of the vector pval and matrix L.
What I wanted to ask is some suggestions in order to decrease the execution time with the C function for simulation purposes. The last time I used c was five years ago and consequently there is room for improvement. For instance I sort the vector T_temp with qsort in order to compute in linear time with a while the number of elements of T_temp greater than or equal to T0_temp. Maybe this task could be done in a more efficient way. Thanks in advance!!
I reduced the input size to p to 50 to avoid waiting on it (don't have such a fast machine) -- keeping p as is and reducing B to 100 has a similar effect, but profiling it showed that ~7.5 out of the ~8 seconds used to compute this was spent in the log function.
qsort doesn't even show up as a real hotspot. This test seems to headbutt the machine more in terms of micro-efficiency than anything else.
So unless your compiler has a vastly faster implementation of log than I do, my first suggestion is to find a fast log implementation if you can afford some accuracy loss (there are ones out there that can compute log over an order of magnitude faster with precision loss in the range of ~3% or so).
If you cannot have precision loss and accuracy is critical, then I'd suggest trying to memoize the values you use for log if you can and store them into a lookup table.
Update
I tried the latter approach.
// Create a memoized table of log values.
double log_cache[B * p];
for (int j=0, num=B*p; j < num; ++j)
log_cache[j] = log(L[j]);
Using malloc might be better here, as we're pushing rather large data to the stack and could risk overflows.
Then pass it into Build_pval_asymm_matrix.
Replace these:
T_temp[k] += log(L[l + nbasis * k]);
...
T_temp[k] += log(L[index_needed[l] + nbasis * k]);
With these:
T_temp[k] += log_cache[l + nbasis * k];
...
T_temp[k] += log_cache[index_needed[l] + nbasis * k];
This improved the times for me from ~8 seconds to ~5.3 seconds, but we've exchanged the computational overhead of log for memory overhead which isn't that much better (in fact, it rarely is but calling log for double-precision floats is apparently quite expensive, enough to make this exchange worthwhile). The next iteration, if you want more speed, and it is very possible, involves looking into cache efficiency.
For this kind of huge matrix stuff, focusing on memory layouts and access patterns can work wonders.
I need to implement a pretty easy in-place LU-decomposition of matrix A. I'm using Gaussian elimination and I want to test it with a 3x3 matrix. The problem is, I keep getting stack smashing error and I don't have any idea why. I don't see any problems in my code, which could do this. Do you have any idea?
The problem is probably in the Factorization block.
###My code:###
#include <stdio.h>
/*
 * In-place LU decomposition (Gaussian elimination without pivoting) of a fixed 3x3
 * test matrix. After the factorization the entries below the diagonal hold the
 * multipliers (L without its unit diagonal); the diagonal and the entries above it
 * hold U. The matrix is printed before and after.
 */
int main() {
    int n = 3; // matrix size
    // double rather than int: the multipliers A[row][k] / A[k][k] are fractions in
    // general and integer division would truncate them
    double A[3][3] = {
        {1, 4, 7},
        {2, 5, 8},
        {3, 6, 10}
    };

    printf("Matrix A:\n");
    for( int i=0; i < n; i++ ) {
        for( int j=0; j < n; j++ ) {
            printf("%g ", A[i][j]);
        }
        printf("\n");
    }

    // FACTORIZATION
    // For each pivot k, rows k+1 .. n-1 are updated. A single index such as
    // k + k + 1 leaves the 3x3 array as soon as k > 0.
    for( int k = 0; k < n - 1; k++ ) {
        if ( A[k][k] == 0 ) {
            printf("Zero pivot at k = %d, cannot factorize without pivoting\n", k);
            return 1;
        }
        for( int row = k + 1; row < n; row++ ) {
            A[row][k] = A[row][k] / A[k][k];
            for( int col = k + 1; col < n; col++ ) {
                A[row][col] = A[row][col] - A[row][k] * A[k][col];
            }
        }
        printf("k: %d\n", k);
    }

    printf("Matrix after decomp:\n");
    for( int i=0; i < n; i++ ) {
        for( int j=0; j < n; j++ ) {
            printf("%g ", A[i][j]);
        }
        // end the line after every row
        printf("\n");
    }
    return 0;
}
Your error is most likely here:
rows = k + k+1;
A[rows][k] = A[rows][k]/A[k][k];
A[rows][rows] = A[rows][rows] - A[rows][k] * A[k][rows];
This means that rows goes through the values 1, 3, 5; and is then used to access an array with only three elements. That would, indeed, overflow, as the only valid offset among those is 1.
EDIT: Looking at your Matlab code, it is doing something completely different, as rows = k + 1:n sets rows to a small vector, which it then uses to slice the matrix, something C does not support as a primitive. You would need to reimplement both that and the matrix multiplication A(rows, k) * A(k, rows) using explicit loops.
Your original Matlab code was (Matlab has 1-based indexing):
for k = 1:n - 1
rows = k + 1:n
A(rows, k) = A(rows, k) / A(k, k)
A(rows, rows) = A(rows, rows) - A(rows, k) * A(k, rows)
end
What rows = k + 1:n does is set rows to represent a range. The expression A(rows, k) is actually a reference to a vector-shaped slice of the matrix, and Matlab can divide a vector by a scalar.
On the last line, A(rows, rows) is a matrix-shaped slice, and A(rows, k) * A(k, rows) is a matrix multiplication: a column vector times a row vector, e.g. multiplying matrices of dimension (3,1) and (1,3) to get one of (3,3).
In C you can't do that using the builtin = and / operators.
The C equivalent is:
for ( int k = 0; k < n - 1; ++k )
{
// A(rows, k) = A(rows, k) / A(k, k)
for ( int row = k + 1; row < n; ++row )
A[row][k] /= A[k][k];
// A(rows, rows) = A(rows, rows) - A(rows, k) * A(k, rows)
for ( int row = k + 1; row < n; ++row )
for ( int col = k + 1; col < n; ++col )
A[row][col] -= A[row][k] * A[k][col];
}
(disclaimer: untested!)
The first part is straightforward: every value in a vector is being divided by a scalar.
However, the second line is more complicated. The Matlab code includes a matrix multiplication and a matrix subtraction ; and also the operation of extracting a sub-matrix from a matrix. If we tried to write a direct translation of that to C, it is very complicated.
We need to use two nested loops to iterate over the rows and columns to perform this operation on the square matrix.
For an assignment in a course called High Performance Computing, I am required to optimize the following code fragment:
// Baseline version to be optimized. Returns the sum of 4*(2*i+j)*(i+2*k), with
// k = 256, over all pairs 0 <= i < j <= N. The parameters a and b are never read.
int foobar(int a, int b, int N)
{
int i, j, k, x, y;
x = 0;
y = 0;
k = 256;
for (i = 0; i <= N; i++) {
for (j = i + 1; j <= N; j++) {
x = x + 4*(2*i+j)*(i+2*k);
// j starts at i + 1, so i > j does not hold here and only the else branch runs.
if (i > j){
y = y + 8*(i-j);
}else{
y = y + 8*(j-i);
}
}
}
// y is accumulated above but never used; only x is returned.
return x;
}
Using some recommendations, I managed to optimize the code (or at least I think so), such as:
Constant Propagation
Algebraic Simplification
Copy Propagation
Common Subexpression Elimination
Dead Code Elimination
Loop Invariant Removal
bitwise shifts instead of multiplication as they are less expensive.
Here's my code:
// Hand-optimized variant: with k = 256 folded in, 4*(2*i+j)*(i+2*k) equals
// (8*i + 4*j)*(i + 512), written below with shifts. a and b are never read.
int foobar(int a, int b, int N) {
int i, j, x, y, t;
x = 0;
// y is initialised here but not used anywhere else in this version.
y = 0;
for (i = 0; i <= N; i++) {
// i + 512 does not change inside the inner loop, so it is computed once per i.
t = i + 512;
for (j = i + 1; j <= N; j++) {
x = x + ((i<<3) + (j<<2))*t;
}
}
return x;
}
According to my instructor, well-optimized code should have fewer or less costly instructions at the assembly-language level, and should therefore run in less time than the original code, i.e. the calculation is made with:
execution time = instruction count * cycles per instruction
When I generate assembly code using the command: gcc -o code_opt.s -S foobar.c,
the generated code has many more lines than the original despite having made some optimizations, and run-time is lower, but not as much as in the original code. What am I doing wrong?
I am not pasting the assembly code, as both listings are very long. Instead, I call the function "foobar" from main and measure the execution time using the time command in Linux:
/* Read a, b and N from standard input and print foobar(a, b, N).
   Returns 1 without calling foobar when three integers cannot be read. */
int main () {
    int a, b, N;
    /* scanf leaves its targets untouched on a failed conversion; do not pass
       uninitialised values on */
    if (scanf ("%d %d %d", &a, &b, &N) != 3) {
        fprintf (stderr, "expected three integers: a b N\n");
        return 1;
    }
    printf ("%d\n", foobar (a, b, N));
    return 0;
}
Initially:
for (i = 0; i <= N; i++) {
for (j = i + 1; j <= N; j++) {
x = x + 4*(2*i+j)*(i+2*k);
if (i > j){
y = y + 8*(i-j);
}else{
y = y + 8*(j-i);
}
}
}
Removing y calculations:
for (i = 0; i <= N; i++) {
for (j = i + 1; j <= N; j++) {
x = x + 4*(2*i+j)*(i+2*k);
}
}
Splitting i, j, k:
for (i = 0; i <= N; i++) {
for (j = i + 1; j <= N; j++) {
x = x + 8*i*i + 16*i*k ; // multiple of 1 (no j)
x = x + (4*i + 8*k)*j ; // multiple of j
}
}
Moving them externally (and removing the loop that runs N-i times):
for (i = 0; i <= N; i++) {
x = x + (8*i*i + 16*i*k) * (N-i) ;
x = x + (4*i + 8*k) * ((N*N+N)/2 - (i*i+i)/2) ;
}
Rewritting:
for (i = 0; i <= N; i++) {
x = x + ( 8*k*(N*N+N)/2 ) ;
x = x + i * ( 16*k*N + 4*(N*N+N)/2 + 8*k*(-1/2) ) ;
x = x + i*i * ( 8*N + 16*k*(-1) + 4*(-1/2) + 8*k*(-1/2) );
x = x + i*i*i * ( 8*(-1) + 4*(-1/2) ) ;
}
Rewritting - recalculating:
for (i = 0; i <= N; i++) {
x = x + 4*k*(N*N+N) ; // multiple of 1
x = x + i * ( 16*k*N + 2*(N*N+N) - 4*k ) ; // multiple of i
x = x + i*i * ( 8*N - 20*k - 2 ) ; // multiple of i^2
x = x + i*i*i * ( -10 ) ; // multiple of i^3
}
Another move to external (and removal of the i loop):
x = x + ( 4*k*(N*N+N) ) * (N+1) ;
x = x + ( 16*k*N + 2*(N*N+N) - 4*k ) * ((N*(N+1))/2) ;
x = x + ( 8*N - 20*k - 2 ) * ((N*(N+1)*(2*N+1))/6);
x = x + (-10) * ((N*N*(N+1)*(N+1))/4) ;
Both the above loop removals use the summation formulas:
$\sum_{i=0}^{n} 1 = n + 1$
$\sum_{i=0}^{n} i = \frac{n(n+1)}{2}$
$\sum_{i=0}^{n} i^2 = \frac{n(n+1)(2n+1)}{6}$
$\sum_{i=0}^{n} i^3 = \frac{n^2 (n+1)^2}{4}$
y does not affect the final result of the code - removed:
// Same function with the y computation commented out. y never contributed to x,
// so the returned value is unchanged.
int foobar(int a, int b, int N)
{
int i, j, k, x, y;
x = 0;
//y = 0;
k = 256;
for (i = 0; i <= N; i++) {
for (j = i + 1; j <= N; j++) {
x = x + 4*(2*i+j)*(i+2*k);
//if (i > j){
// y = y + 8*(i-j);
//}else{
// y = y + 8*(j-i);
//}
}
}
return x;
}
k is simply a constant:
/* Sum of 4*(2*i+j)*(i+2*256) over all pairs 0 <= i < j <= N, with the constant
   k = 256 written out. a and b are accepted but not used. */
int foobar(int a, int b, int N)
{
    int total = 0;
    int i = 0;

    while (i <= N) {
        int j = i + 1;
        while (j <= N) {
            total = total + 4*(2*i+j)*(i+2*256);
            j++;
        }
        i++;
    }
    return total;
}
The inner expression can be transformed to: x += 8*i*i + 4096*i + 4*i*j + 2048*j. Use math to push all of them to the outer loop: x += 8*i*i*(N-i) + 4096*i*(N-i) + 2*i*(N-i)*(N+i+1) + 1024*(N-i)*(N+i+1).
You can expand the above expression, and apply sum of squares and sum of cubes formula to obtain a close form expression, which should run faster than the doubly nested loop. I leave it as an exercise to you. As a result, i and j will also be removed.
a and b should also be removed if possible - since a and b are supplied as argument but never used in your code.
Sum of squares and sum of cubes formula:
$\sum_{x=1}^{n} x^2 = \frac{n(n+1)(2n+1)}{6}$
$\sum_{x=1}^{n} x^3 = \frac{n^2 (n+1)^2}{4}$
This function is equivalent with the following formula, which contains only 4 integer multiplications, and 1 integer division:
x = N * (N + 1) * (N * (7 * N + 8187) - 2050) / 6;
To get this, I simply typed the sum calculated by your nested loops into Wolfram Alpha:
sum (sum (8*i*i+4096*i+4*i*j+2048*j), j=i+1..N), i=0..N
Here is the direct link to the solution. Think before coding. Sometimes your brain can optimize code better than any compiler.
Briefly scanning the first routine, the first thing you notice is that expressions involving "y" are completely unused and can be eliminated (as you did). This further permits eliminating the if/else (as you did).
What remains is the two for loops and the messy expression. Factoring out the pieces of that expression that do not depend on j is the next step. You removed one such expression, but (i<<3) (ie, i * 8) remains in the inner loop, and can be removed.
Pascal's answer reminded me that you can use a loop stride optimization. First move (i<<3) * t out of the inner loop (call it i1), then calculate, when initializing the loop, a value j1 that equals (i<<2) * t. On each iteration increment j1 by 4 * t (which is a pre-calculated constant). Replace your inner expression with x = x + i1 + j1;.
One suspects that there may be some way to combine the two loops into one, with a stride, but I'm not seeing it offhand.
A few other things I can see. You don't need y, so you can remove its declaration and initialisation.
Also, the values passed in for a and b aren't actually used, so you could use these as local variables instead of x and t.
Also, rather than adding i to 512 each time through you can note that t starts at 512 and increments by 1 each iteration.
/* Same sum as the shift-based version, reusing the parameters as locals: a is the
   accumulator and b tracks i + 512 by being incremented once per outer iteration.
   The values passed in for a and b are overwritten. */
int foobar(int a, int b, int N) {
    int i = 0;

    a = 0;
    b = 512;
    while (i <= N) {
        int j;
        for (j = i + 1; j <= N; j++) {
            a += ((i<<3) + (j<<2))*b;
        }
        i++;
        b++;
    }
    return a;
}
Once you get to this point you can also observe that, aside from initialising j, i and j are only used in a single mutiple each - i<<3 and j<<2. We can code this directly in the loop logic, thus:
/* Strided version: i steps through 8*i0 and j through 4*j0, so the inner term is
   simply (i + j) * b with b = i0 + 512. The values passed in for a and b are
   overwritten; a accumulates the result. */
int foobar(int a, int b, int N) {
    int i, j, iLimit, jLimit;
    a = 0;
    b = 512;
    /* multiplication rather than a shift: left-shifting a negative N is undefined */
    iLimit = N * 8;
    jLimit = N * 4;
    for (i = 0; i <= iLimit; i+=8) {
        /* j0 starts at i0 + 1, i.e. j = 4*i0 + 4 = (i >> 1) + 4. The parentheses are
           required: `i >> 1 + 4` parses as `i >> (1 + 4)`. */
        for (j = (i >> 1) + 4; j <= jLimit; j+=4) {
            a = a + (i + j)*b;
        }
        b++;
    }
    return a;
}
OK... so here is my solution, along with inline comments to explain what I did and how.
// Returns the sum of 4*(2*i+j)*(i+2*256) over all pairs 0 <= i < j <= N, using
// additions in the inner loop instead of multiplications.
int foobar(int N)
{ // We eliminate unused arguments
int x = 0, i = 0, i2 = 0, j, k, z;
// We stop the outer loop before i reaches N, since the iteration
// i == N has an empty inner loop and adds nothing. We also keep
// track of '2*i' (which is used throughout the code) in a
// second variable 'i2' which we increment by two in every
// iteration, essentially converting multiplication into addition.
while(i < N)
{
// We hoist the calculation '4 * (i+2*k)' out of the inner loop,
// since the original k is the literal constant 256 and 'i' is
// constant during the inner loop. We could convert the
// multiplication by 2 into a left shift, but hey, let's not go *crazy*!
//
// (4 * (i+2*k)) <=>
// (4 * i) + (4 * 2 * k) <=>
// (2 * i2) + (8 * k) <=>
// (2 * i2) + (8 * 256) <=>
// (2 * i2) + 2048
//
// Note that the variable 'k' is reused below to hold this hoisted
// value; it no longer holds 256.
k = (2 * i2) + 2048;
// We have now converted the expression:
// x = x + 4*(2*i+j)*(i+2*k);
//
// into the expression:
// x = x + (i2 + j) * k;
//
// Counterintuitively we now *expand* the formula into:
// x = x + (i2 * k) + (j * k);
//
// Now observe that (i2 * k) is a constant inside the inner
// loop which we can calculate only once here. Also observe
// that it is simply added into x a total of (N - i) times, so
// we take advantage of the abelian nature of addition
// to hoist it completely out of the loop
x = x + (i2 * k) * (N - i);
// Observe that inside this loop we calculate (j * k) repeatedly,
// and that j is just an increasing counter. So now instead of
// doing numerous multiplications, let's break the operation into
// two parts: a multiplication, which we hoist out of the inner
// loop, and additions which we continue performing in the inner
// loop. z starts at i * k and is j * k after each addition of k.
z = i * k;
for (j = i + 1; j <= N; j++)
{
z = z + k;
x = x + z;
}
i++;
i2 += 2;
}
return x;
}
The code, without any of the explanations boils down to this:
// Condensed listing: i2 holds 2*i, k holds 4*(i + 512), and z steps through j*k.
int foobar(int N)
{
int x = 0, i = 0, i2 = 0, j, k, z;
while(i < N)
{
k = (2 * i2) + 2048;
x = x + (i2 * k) * (N - i);
z = i * k;
for (j = i + 1; j <= N; j++)
{
z = z + k;
x = x + z;
}
i++;
i2 += 2;
}
return x;
}
I hope this helps.
/* Sum of 4*(2*i+j)*(i+512) over all pairs 0 <= i < j <= N, with both loops counting
   down. The unused arguments of the original are dropped. Returns 0 for N <= 0. */
int foobar(int N)
{
    int i, j, x = 0;

    /* i runs from N - 1 down to 0. The test has to be `i-- > 0`: a bare `i--` is
       also true for negative values, so a negative N would keep decrementing. */
    for (i = N; i-- > 0; ) {
        /* j runs from N down to i + 1 */
        for (j = N + 1; --j > i; ) {
            /* shifts instead of multiplications: ((2*i + j) * (i + 512)) * 4 */
            x += (((i << 1) + j) * (i + 512)) << 2;
        }
    }
    return x;
}