I need to calculate and compare execution time of multiplication of 2 matrices in 3 different sizes (100 * 100 , 1000 * 1000 and 10000 * 10000) in C programming language. I wrote the following simple code to do that for 1000 * 1000 and I got the execution time
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main()
{
int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;
// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);
// Storing elements of first matrix.
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c1; ++j)
{
(*a)[i][j] = rand() / RAND_MAX;
}
}
// Storing elements of second matrix.
for (i = 0; i < r2; ++i)
{
for (j = 0; j < c2; ++j)
{
(*b)[i][j] = rand() / RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c2; ++j)
{
(*result)[i][j] = 0;
}
}
clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; ++i)
for (j = 0; j < c2; ++j)
for (k = 0; k < c1; ++k)
{
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
}
clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n function took %f seconds to execute \n", time_taken);
return 0;
}
And now I want to repeat this part for two other sizes and get the result like this at the end of my program with one run:
the execution time for 100 * 100 is 1 second
the execution time for 1000 * 1000 is 2 seconds
the execution time for 10000 * 10000 is 3 seconds
What is the best solution for that? When I repeat this part for 10000 * 10000 after 1000 * 1000 I got the segmentation fault error.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main()
{
int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;
// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);
// Storing elements of first matrix.
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c1; ++j)
{
(*a)[i][j] = rand() / RAND_MAX;
}
}
// Storing elements of second matrix.
for (i = 0; i < r2; ++i)
{
for (j = 0; j < c2; ++j)
{
(*b)[i][j] = rand() / RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c2; ++j)
{
(*result)[i][j] = 0;
}
}
clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; ++i)
for (j = 0; j < c2; ++j)
for (k = 0; k < c1; ++k)
{
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
}
clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n \nfunction took %f seconds to execute \n",
time_taken);
free(a);
free(b);
free(result);
r1 = 10000, c1 = 10000, r2 = 10000, c2 = 10000;
printf("\n run second one for %d \n",r1);
// Storing elements of first matrix.
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c1; ++j)
{
(*a)[i][j] = rand() / RAND_MAX;
}
}
// Storing elements of second matrix.
for (i = 0; i < r2; ++i)
{
for (j = 0; j < c2; ++j)
{
(*b)[i][j] = rand() / RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c2; ++j)
{
(*result)[i][j] = 0;
}
}
begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; ++i)
for (j = 0; j < c2; ++j)
for (k = 0; k < c1; ++k)
{
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
}
end1 = clock();
time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n second function took %f seconds to execute \n",
time_taken);
free(a);
free(b);
free(result);
return 0;
}
A simplified version of your program:
...
int main()
{
int r1 = 1000, c1 = 1000, r2 = 1000, c2 = 1000, i, j, k;
// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);
...
free(a);
free(b);
free(result);
r1 = 10000, c1 = 10000, r2 = 10000, c2 = 10000;
for (i = 0; i < r1; ++i)
for (j = 0; j < c1; ++j)
(*a)[i][j] = rand() /RAND_MAX; // KABOOM !
...
}
A quick but crucial information about about VLA arrays. Name "variable" in "variable-length-array" means that the size is stored in a variable, not that the size is variable. This variable is hidden and can be only read via sizeof operator.
The size of array is bound to it's type, not to its value. Therefore the dimensions of VLA type (and object) cannot change, no matter if the object is dynamic or automatic.
The line:
double(*a)[r1][c1] = malloc(sizeof *a);
it interpreted as:
typedef double __hidden_type[r1][c1];
__hidden_type* a = malloc(sizeof *a);
... changes of r1 or c1 do not affect sizeof(__hidden_type)
The sizes are bound to the types when the types are defined. After that the types are immutable.
Therefore changing the r1 does not change the size of *a. You need to create a new a (or rather its type) and allocate memory for this new *a.
I suggest moving the whole test to a function that takes r1, r2, c1 and c2 as parameters. The arrays would be local to the function.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void bench(int r1, int c1, int r2, int c2) {
int i, j, k;
// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);
// Storing elements of first matrix.
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c1; ++j)
{
(*a)[i][j] = rand() /RAND_MAX;
}
}
// Storing elements of second matrix.
for (i = 0; i < r2; ++i)
{
for (j = 0; j < c2; ++j)
{
(*b)[i][j] = rand()/ RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; ++i)
{
for (j = 0; j < c2; ++j)
{
(*result)[i][j] = 0;
}
}
clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
for (i = 0; i < r1; ++i)
for (j = 0; j < c2; ++j)
for (k = 0; k < c1; ++k)
{
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
}
clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("\n \nfunction took %f seconds to execute \n", time_taken);
free(a);
free(b);
free(result);
}
int main()
{
bench(1000, 1000, 1000, 1000);
bench(2000, 2000, 2000, 2000);
}
I've reduced the size from 10000 to 2000 to get results in reasonable time.
On my machine I got:
function took 1.966788 seconds to execute
function took 37.370633 seconds to execute
Note that the function is very cache unfriendly.
for (i = 0; i < r1; ++i)
for (j = 0; j < c2; ++j)
for (k = 0; k < c1; ++k)
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
On every iteration of k you get a cache miss when accessing (*b)[k][j]. Try swapping the j and k loops:
for (i = 0; i < r1; ++i)
for (k = 0; k < c1; ++k)
for (j = 0; j < c2; ++j)
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
Now when increasing j then (*result)[i][j] and (*b)[k][j] are likely in cache.
On my machine this trivial change gave 10x speedup:
function took 0.319594 seconds to execute
function took 3.829459 seconds to execute
There are multiple problems in your code:
you free the matrices and perform a new benchmark, storing data thru invalid pointers... this has undefined behavior, in your case a segmentation fault.
the allocation code is specific for the initial matrix size, you cannot reallocate the matrices for a different size in the main() function. You should move the code to a separate function taking the matrix sizes as arguments and call this function multiple times.
the initialization values rand() / RAND_MAX are almost always zero because integer arithmetics is used for this division. You should use (*a)[i][j] = rand() / (double)RAND_MAX;
Here is a modified version (similar to tstanisl's):
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void test(int r1, int c1, int r2, int c2) {
int i, j, k;
// Dynamic allocation.
double(*a)[r1][c1] = malloc(sizeof *a);
double(*b)[r2][c2] = malloc(sizeof *b);
double(*result)[r1][c2] = malloc(sizeof *result);
// Storing elements of first matrix.
for (i = 0; i < r1; ++i) {
for (j = 0; j < c1; ++j) {
(*a)[i][j] = rand() / (double)RAND_MAX;
}
}
// Storing elements of second matrix.
for (i = 0; i < r2; ++i) {
for (j = 0; j < c2; ++j) {
(*b)[i][j] = rand() / (double)RAND_MAX;
}
}
// Initializing all elements of result matrix to 0
for (i = 0; i < r1; ++i) {
for (j = 0; j < c2; ++j) {
(*result)[i][j] = 0;
}
}
clock_t begin1 = clock();
// Multiplying matrices a and b and
// storing result in result matrix
// using cache friendly index order
for (i = 0; i < r1; ++i) {
for (k = 0; k < c1; ++k) {
for (j = 0; j < c2; ++j) {
(*result)[i][j] += (*a)[i][k] * (*b)[k][j];
}
}
}
clock_t end1 = clock();
double time_taken = (double)(end1 - begin1) / CLOCKS_PER_SEC;
printf("M(%d,%d) x M(%d,%d) took %f seconds to execute\n",
r1, c1, r2, c2, time_taken);
free(a);
free(b);
free(result);
}
int main() {
test(100, 100, 100, 100);
test(1000, 1000, 1000, 1000);
test(2000, 2000, 2000, 2000);
test(3000, 3000, 3000, 3000);
test(4000, 4000, 4000, 4000);
return 0;
}
Output:
M(100,100) x M(100,100) took 0.000347 seconds to execute
M(1000,1000) x M(1000,1000) took 0.616177 seconds to execute
M(2000,2000) x M(2000,2000) took 5.017987 seconds to execute
M(3000,3000) x M(3000,3000) took 17.703356 seconds to execute
M(4000,4000) x M(4000,4000) took 43.825951 seconds to execute
The time complexity of this simplistic implementation is O(N3), which is consistent with the above timings. Given enough RAM (2.4 GB), multiplying matrices with 10000 rows and columns would take a bit more than 10 minutes.
Achieving the multiplication of 2 10k by 10k double matrices in 3 seconds requires specialized hardware and tailor made software, well beyond the simple approach in this answer.
And now I want to repeat this part for two other sizes and get the result like this at the end of my program with one run: the execution time for 100 * 100 is 1 second the execution time for 1000 * 1000 is 2 seconds the execution time for 10000 * 10000 is 3 seconds
I simply cannot believe that you have multiplied two 10,000 x 10,000 matrices in 3 seconds. What computer are you running that experiment in? Not, with that algorithm and using only one core. Probably you are optimizing your compilation (with default flag -O2 and the whole algorithm has been evicted from the compiler output, as you don't use the matrix after the calculation, so it is innecessary to lose time in a loop for nothing) first, compile your matrix, or print after the computation one element of the result matrix, so the compiler cannot evict the calculation. But don't say your algorithm is multiplying two matrices 10,000 rows and columns in 3 sec.
Allocating a matrix of 10,000x10,000 double quantities, requires a lot of memory. It's 100,000,000 of double entries, wich gets over 800Mb in a single malloc It's very possible that your laptop can handle this once.... but don't do many of these allocations, as you will probably will be over the limits of malloc(3) (or your system).
More when you need at least two of these allocations, or even more, as you said you want to repeat the calculations.
Have you trying to scale yor problem to 100,000 by 100,000 matrices?
When you repeat, it's not warranted that there's no fragmentation in the heap maintained by malloc(), so as you are requesting one gigabyte of continuous memory per matrix, it is probable that malloc(3) runs out of memory in the last mallocs. I'd suggest you to do the three tests in different programs (separate), and don't start any other program (like the browser or your favourite desktop environment while running a matrix multiplication involving 200,000,000 numbers.
Anyway, your program probably start doing swapping if you don't fine control your execution environment, just trashing all the efficiency measurements you are trying to do.
Another thing is that you can probably could incurr in a process limit (if your administrator has established a maximum core memory limmit for your process) and you don't check the result from malloc(3) for proper allocation.
Note:
I have not been able to multiply the two 10,000 by 10,000 matrices in my machine (Pentium Core duo with 6Gb RAM) because the program starts swaping and becomes incredible slow. BTW, be careful, and don't compile your program with optimization (even -O1 makes the matrix multiplication to be eliminated completely from the code, as there's no use of the product, so the complete algorithm is eliminated from the output code by the optimization code in the compiler) I'll leave it running and update my answer if I get some result.
(edit)
$ matrix_mult
function took 9364.303443 seconds to execute
$
It took 2h 36m 6.303443s to execute in my system. This is more approximate to the complexity of the problem (as your approximation is barely linear) you need to compile it without optimization or at least print one element of the resultant matrix, if you want the product being calculated.
Below is the code I'm testing, if you want to test it (I have modified it a bit to make it more readable, to use better timestamps, and to avoid the C initialization, as it can be done on the fly (read the comments in the code):
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 10000
int main()
{
// Dynamic allocation.
#define A (*a)
#define B (*b)
#define R (*result)
double A[N][N] = malloc(sizeof *a);
double B[N][N] = malloc(sizeof *b);
double R[N][N] = malloc(sizeof *result);
printf("Initializing both matrices\n");
// Storing elements of first (and second) matrices.
for (int row = 0; row < N; ++row) {
for (int col = 0; col < N; ++col) {
A[row][col] = rand() / (double)RAND_MAX; // matrix A
B[row][col] = rand() / (double)RAND_MAX; // matrix B
}
}
// Storing elements of second matrix. (done above)
// Initializing all elements of result matrix to 0
// (not needed, see below)
printf("Starting multiplication\n");
struct timespec start_ts; // start timestamp (secs & nsecs).
int res = clock_gettime(
CLOCK_PROCESS_CPUTIME_ID,
&start_ts); // cpu time only.
if (res < 0) {
perror("clock_gettime");
exit(EXIT_FAILURE);
}
// Multiplying matrices a and b and
// storing result in result matrix
for (int row = 0; row < N; ++row) {
for (int col = 0; col < N; ++col) {
double aux = 0.0;
for (int k = 0; k < N; ++k) {
aux += A[row][k] * B[k][col];
}
// why to involve calculating the address of the affected
// cell at every inner loop iteration???
R[row][col] = aux;
}
}
struct timespec end_ts;
res = clock_gettime(
CLOCK_PROCESS_CPUTIME_ID,
&end_ts);
if (res < 0) {
perror("clock_gettime");
exit(EXIT_FAILURE);
}
bool carry = start_ts.tv_nsec > end_ts.tv_nsec;
struct timespec diff_time;
diff_time.tv_sec = end_ts.tv_sec - start_ts.tv_sec;
diff_time.tv_nsec = end_ts.tv_nsec - start_ts.tv_nsec;
if (carry) {
diff_time.tv_sec--;
diff_time.tv_nsec += 1000000000;
}
printf("\n function took %ld.%06ld seconds to execute \n",
diff_time.tv_sec, diff_time.tv_nsec / 1000);
return 0;
}
2nd edit
I have tested multiplication times on a modified version of your program (but using the same algorithm) with the program below:
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define MAX 10000
int dim[] = {
89, 144, 233, 377, 610,
987, 1597, 2584, 4181, 6765,
10000
}; /* several numbers taken from fibonacci numbers */
size_t dim_cnt = sizeof dim / sizeof dim[0];
double A[MAX][MAX];
double B[MAX][MAX];
double R[MAX][MAX];
int main()
{
for (int rep = 0; rep < dim_cnt; rep++) {
size_t N = dim[rep];
printf("It %d: Initializing both %zd x %zd matrices\n",
rep, N, N);
// Storing elements of first (and second) matrices.
for (int row = 0; row < N; ++row) {
for (int col = 0; col < N; ++col) {
A[row][col] = rand() /(double)RAND_MAX; // matrix A
B[row][col] = rand() /(double)RAND_MAX; // matrix B
}
}
// Storing elements of second matrix. (done above)
// Initializing all elements of result matrix to 0
// (not needed, see below)
printf("It %d: Starting multiplication\n", rep);
struct timespec start_ts; // start timestamp (secs & nsecs).
int res = clock_gettime(
CLOCK_PROCESS_CPUTIME_ID,
&start_ts); // cpu time only.
if (res < 0) {
perror("clock_gettime");
exit(EXIT_FAILURE);
}
// Multiplying matrices a and b and
// storing result in result matrix
for (int row = 0; row < N; ++row) {
for (int col = 0; col < N; ++col) {
double aux = 0.0;
for (int k = 0; k < N; ++k) {
aux += A[row][k] * B[k][col];
}
// why to involve calculating the address of the affected
// cell at every inner loop iteration???
R[row][col] = aux;
}
}
struct timespec end_ts;
res = clock_gettime(
CLOCK_PROCESS_CPUTIME_ID,
&end_ts);
if (res < 0) {
perror("clock_gettime");
exit(EXIT_FAILURE);
}
bool carry = start_ts.tv_nsec > end_ts.tv_nsec;
struct timespec diff_time;
diff_time.tv_sec = end_ts.tv_sec - start_ts.tv_sec;
diff_time.tv_nsec = end_ts.tv_nsec - start_ts.tv_nsec;
if (carry) {
diff_time.tv_sec--;
diff_time.tv_nsec += 1000000000;
}
printf("It %d: R[0][0] = %g\n", rep, R[0][0]);
printf("%7zd %ld.%06ld\n",
N,
diff_time.tv_sec,
diff_time.tv_nsec / 1000);
}
return 0;
}
It uses a single, statically allocated memory for the biggest of the matrices, and uses a subset of it to model the lower ones. The execution times are far more appropiate than the values you shown in your doce, and I print a result matrix cell value to force the optimizer to conserve the matrix calculation. The resulting execution you get should show values proportional to the ones shown here.
$ time matrix_mult
It 0: Initializing both 89 x 89 matrices
It 0: Starting multiplication
It 0: R[0][0] = 23.6756
89 0.005026
It 1: Initializing both 144 x 144 matrices
It 1: Starting multiplication
It 1: R[0][0] = 40.2614
144 0.019682
It 2: Initializing both 233 x 233 matrices
It 2: Starting multiplication
It 2: R[0][0] = 59.5599
233 0.095213
It 3: Initializing both 377 x 377 matrices
It 3: Starting multiplication
It 3: R[0][0] = 93.4422
377 0.392914
It 4: Initializing both 610 x 610 matrices
It 4: Starting multiplication
It 4: R[0][0] = 153.068
610 1.671904
It 5: Initializing both 987 x 987 matrices
It 5: Starting multiplication
It 5: R[0][0] = 252.981
987 8.816252
It 6: Initializing both 1597 x 1597 matrices
It 6: Starting multiplication
It 6: R[0][0] = 403.61
1597 37.807920
It 7: Initializing both 2584 x 2584 matrices
It 7: Starting multiplication
It 7: R[0][0] = 629.521
2584 157.371367
It 8: Initializing both 4181 x 4181 matrices
It 8: Starting multiplication
It 8: R[0][0] = 1036.47
4181 667.084346
It 9: Initializing both 6765 x 6765 matrices
It 9: Starting multiplication
It 9: R[0][0] = 1653.59
6765 2831.117818
It 10: Initializing both 10000 x 10000 matrices
It 10: Starting multiplication
It 10: R[0][0] = 2521.68
10000 9211.738007
real 216m46,129s
user 215m16,041s
sys 0m4,899s
$ _
In my system, the program shows the following size
$ size matrix_mult
text data bss dec hex filename
2882 528 2400000016 2400003426 0x8f0d2562 matrix_mult
$ _
with a 2.4Gb large bss segment, as corresponding to three variables of around 800Mb each.
One last point: Using VLAs and making your program to dynamically allocate things that will be used during all the program life will not help you to make it faster or slower. It's the algorithm what makes programs faster or slower. Bu I fear you have not calculated a 10,000 by 10,000 matrix product in 3s.
Related
I created a cross-correlation algorithm, and I am trying to maximize its performance by reducing the time it takes for it to run. First of all, I reduced the number of function calls within the "crossCorrelationV2" function. Second, I created several macros at the top of the program for constants. Third, I reduced the number of loops that are inside the "crossCorrelationV2" function. The code that you see is the most recent code that I have.
Are there any other methods I can use to try and reduce the processing time of my code?
Let's assume that I am only focused on the functions "crossCorrelationV2" and "createAnalyzingWave".
I would be glad for any advice, whether in general about programming or pertaining to those two specific functions; I am a beginner programmer. Thanks.
#include <stdio.h>
#include <stdlib.h>
#define ARRAYSIZE 4096
#define PULSESNUMBER 16
#define DATAFREQ 1300
// Print the contents of the array onto the console.
void printArray(double array[], int size){
int k;
for (k = 0; k < size; k++){
printf("%lf ", array[k]);
}
printf("\n");
}
// Creates analyzing square wave. This square wave has unity (1) magnitude.
// The number of high values in each period is determined by high values = (analyzingT/2) / time increment
void createAnalyzingWave(double analyzingFreq, double wave[]){
int highValues = (1 / analyzingFreq) * 0.5 / ((PULSESNUMBER * (1 / DATAFREQ) / ARRAYSIZE));
int counter = 0;
int p;
for(p = 1; p <= ARRAYSIZE; p++){
if ((counter % 2) == 0){
wave[p - 1] = 1;
} else{
wave[p - 1] = 0;
}
if (p % highValues == 0){
counter++;
}
}
}
// Creates data square wave (for testing purposes, for the real implementation actual ADC data will be used). This
// square wave has unity magnitude.
// The number of high values in each period is determined by high values = array size / (2 * number of pulses)
void createDataWave(double wave[]){
int highValues = ARRAYSIZE / (2 * PULSESNUMBER);
int counter = 0;
int p;
for(p = 0; p < ARRAYSIZE; p++){
if ((counter % 2) == 0){
wave[p] = 1;
} else{
wave[p] = 0;
}
if ((p + 1) % highValues == 0){
counter++;
}
}
}
// Finds the average of all the values inside an array
double arrayAverage(double array[], int size){
int i;
double sum = 0;
// Same thing as for(i = 0; i < arraySize; i++)
for(i = size; i--; ){
sum = array[i] + sum;
}
return sum / size;
}
// Cross-Correlation algorithm
double crossCorrelationV2(double dataWave[], double analyzingWave[]){
int bigArraySize = (2 * ARRAYSIZE) - 1;
// Expand analyzing array into array of size 2arraySize-1
int lastArrayIndex = ARRAYSIZE - 1;
int lastBigArrayIndex = 2 * ARRAYSIZE - 2; //bigArraySize - 1; //2 * arraySize - 2;
double bigAnalyzingArray[bigArraySize];
int i;
int b;
// Set first few elements of the array equal to analyzingWave
// Set remainder of big analyzing array to 0
for(i = 0; i < ARRAYSIZE; i++){
bigAnalyzingArray[i] = analyzingWave[i];
bigAnalyzingArray[i + ARRAYSIZE] = 0;
}
double maxCorrelationValue = 0;
double currentCorrelationValue;
// "Beginning" of correlation algorithm proper
for(i = 0; i < bigArraySize; i++){
currentCorrelationValue = 0;
for(b = lastBigArrayIndex; b > 0; b--){
if (b >= lastArrayIndex){
currentCorrelationValue = dataWave[b - lastBigArrayIndex / 2] * bigAnalyzingArray[b] + currentCorrelationValue;
}
bigAnalyzingArray[b] = bigAnalyzingArray[b - 1];
}
bigAnalyzingArray[0] = 0;
if (currentCorrelationValue > maxCorrelationValue){
maxCorrelationValue = currentCorrelationValue;
}
}
return maxCorrelationValue;
}
int main(){
int samplesNumber = 25;
double analyzingFreq = 1300;
double analyzingWave[ARRAYSIZE];
double dataWave[ARRAYSIZE];
createAnalyzingWave(analyzingFreq, analyzingWave);
//createDataWave(arraySize, pulsesNumber, dataWave);
double maximumCorrelationArray[samplesNumber];
int i;
for(i = 0; i < samplesNumber; i++){
createDataWave(dataWave);
maximumCorrelationArray[i] = crossCorrelationV2(dataWave, analyzingWave);
}
printf("Average of the array values: %lf\n", arrayAverage(maximumCorrelationArray, samplesNumber));
return 0;
}
The first point is that you are explicitly shifting the analizingData array, this way you are required twice as much memory and moving the items is about 50% of your time. In a test here using crossCorrelationV2 takes 4.1 seconds, with the implementation crossCorrelationV3 it runs in ~2.0 seconds.
The next thing is that you are spending time multiplying by zero on the padded array, removing that, and also removing the padding, and simplifying the indices we end with crossCorrelationV4 that makes the program to run in ~1.0 second.
// Cross-Correlation algorithm
double crossCorrelationV3(double dataWave[], double analyzingWave[]){
int bigArraySize = (2 * ARRAYSIZE) - 1;
// Expand analyzing array into array of size 2arraySize-1
int lastArrayIndex = ARRAYSIZE - 1;
int lastBigArrayIndex = 2 * ARRAYSIZE - 2; //bigArraySize - 1; //2 * arraySize - 2;
double bigAnalyzingArray[bigArraySize];
int i;
int b;
// Set first few elements of the array equal to analyzingWave
// Set remainder of big analyzing array to 0
for(i = 0; i < ARRAYSIZE; i++){
bigAnalyzingArray[i] = analyzingWave[i];
bigAnalyzingArray[i + ARRAYSIZE] = 0;
}
double maxCorrelationValue = 0;
double currentCorrelationValue;
// "Beginning" of correlation algorithm proper
for(i = 0; i < bigArraySize; i++){
currentCorrelationValue = 0;
// Instead of checking if b >= lastArrayIndex inside the loop I use it as
// a stopping condition.
for(b = lastBigArrayIndex; b >= lastArrayIndex; b--){
// instead of shifting bitAnalizing[b] = bigAnalyzingArray[b-1] every iteration
// I simply use bigAnalizingArray[b-i]
currentCorrelationValue = dataWave[b - lastBigArrayIndex / 2] * bigAnalyzingArray[b - i] + currentCorrelationValue;
}
bigAnalyzingArray[0] = 0;
if (currentCorrelationValue > maxCorrelationValue){
maxCorrelationValue = currentCorrelationValue;
}
}
return maxCorrelationValue;
}
// Cross-Correlation algorithm
double crossCorrelationV4(double dataWave[], double analyzingWave[]){
int bigArraySize = (2 * ARRAYSIZE) - 1;
// Expand analyzing array into array of size 2arraySize-1
int lastArrayIndex = ARRAYSIZE - 1;
int lastBigArrayIndex = 2 * ARRAYSIZE - 2; //bigArraySize - 1; //2 * arraySize - 2;
// I will not allocate the bigAnalizingArray here
// double bigAnalyzingArray[bigArraySize];
int i;
int b;
// I will not copy the analizingWave to bigAnalyzingArray
// for(i = 0; i < ARRAYSIZE; i++){
// bigAnalyzingArray[i] = analyzingWave[i];
// bigAnalyzingArray[i + ARRAYSIZE] = 0;
// }
double maxCorrelationValue = 0;
double currentCorrelationValue;
// Compute the correlation by symmetric paris
// the idea here is to simplify the indices of the inner loops since
// they are computed more times.
for(i = 0; i < lastArrayIndex; i++){
currentCorrelationValue = 0;
for(b = lastArrayIndex - i; b >= 0; b--){
// instead of shifting bitAnalizing[b] = bigAnalyzingArray[b-1] every iteration
// I simply use bigAnalizingArray[b-i]
currentCorrelationValue += dataWave[b] * analyzingWave[b + i];
}
if (currentCorrelationValue > maxCorrelationValue){
maxCorrelationValue = currentCorrelationValue;
}
if(i != 0){
currentCorrelationValue = 0;
// Correlate shifting to the other side
for(b = lastArrayIndex - i; b >= 0; b--){
// instead of shifting bitAnalizing[b] = bigAnalyzingArray[b-1] every iteration
// I simply use bigAnalizingArray[b-i]
currentCorrelationValue += dataWave[b + i] * analyzingWave[b];
}
if (currentCorrelationValue > maxCorrelationValue){
maxCorrelationValue = currentCorrelationValue;
}
}
}
return maxCorrelationValue;
}
If you want more optimization you can unroll some iterations of the loop and enable some compiler optimizations like vector extension.
How to normalize a matrix?
Suppose I have a 2x3 matrix:
1 2 3
4 5 6
The normalized matrix would be:
1/sqrt(pow(2,2) + pow(3,2)) 2/sqrt(pow(2,2) + pow(3,2)) 3/sqrt(pow(2,2) + pow(3,2))
4/sqrt(pow(5,2) + pow(6,2)) 5/sqrt(pow(5,2) + pow(6,2)) 6/sqrt(pow(5,2) + pow(6,2))
This is my sample code:
#include <stdio.h>
#include <conio.h>
#include <math.h>
int main(){
int rows, cols, rowCounter, colCounter, r, c;
int initial[100], inputMatrix[100][100], rowSum[100] = {0}, norm[100][100], square[100] = {0};
printf("Enter size of a matrix\n");
scanf("%d %d", &rows, &cols);
printf("Enter matrix of size %dX%d\n", rows, cols);
/* Input matrix */
for(rowCounter = 0; rowCounter < rows; rowCounter++){
for(colCounter = 0; colCounter < cols; colCounter++){
scanf("%d", &inputMatrix[rowCounter][colCounter]);
}
}
for(r = 0; r < rows; r++)
{
for(c = 1; c < cols; c++)
{
float a;
a == inputMatrix[r][c];
square[r] += pow(a, 2);
}
printf("%.2lf ", square[r]);
}
for(rowCounter = 0; rowCounter < rows; rowCounter++)
{
for(colCounter = 0; colCounter < cols; colCounter++)
{
norm[rowCounter][colCounter] == (inputMatrix[rowCounter][colCounter]) / sqrt(square[rowCounter]);
}
}
printf("\nNormalized Matrix:\n");
for(rowCounter = 0; rowCounter < rows; rowCounter++)
{
for(colCounter = 0; colCounter < cols; colCounter++)
{
printf("%.3lf ", norm[rowCounter][colCounter]);
}
printf("\n");
}
getch();
return 0;
}
Why are you using == here:
for(r = 0; r < rows; r++)
{
for(c = 1; c < cols; c++)
{
float a;
a == inputMatrix[r][c]; //look here
square[r] += pow(a, 2);
}
It should be:
for(r = 0; r < rows; r++)
{
for(c = 1; c < cols; c++)
{
float a;
a = inputMatrix[r][c];
square[r] += pow(a, 2);
}
The same here:
norm[rowCounter][colCounter] == (inputMatrix[rowCounter][colCounter]) / sqrt(square[rowCounter]);
It should be:
norm[rowCounter][colCounter] = (inputMatrix[rowCounter][colCounter]) / sqrt(square[rowCounter]);
And you should be careful here:
int initial[100], inputMatrix[100][100], rowSum[100] = {0}, norm[100][100], square[100] = {0};
Are you sure about use int for all of this declarations?
I think you should use double or float instead, at least in some of them.
There are some problems in your code, I'll try to address the most important ones.
Your norm matrix is a 2D array of int as inputMatrix, but you have to use an array of float or double to correctly store the result and to perform the right calculation. In C if both of the terms of a division are integers types an integer division (like: 3/2 = 1, not 1.5) is performed, which is not what you need.
Another mistake is to use == instead of = to perform an assignment. In C == is the 'equal to' relational operation.
EDIT
As #chux pointed out it would be wiser to choose a more accurate type for a and square[]. Using long long int will (may) prevent numeric overflow in case the elements of the matrix are too big for their square or the sum of them to be reprensented by an int.
Be aware that if you decide to use double instead there are other subtle numerical issues concernig the sum of small number (and the order in which it is performed) represented by floating point types. So, as a partial remedy, you can use long double (if it really has more precision then double in your environment) for a and square.
EDIT 2
In the question and in comment you say that the first element of each row of the matrix is supposed to be "constant in the matrix" so it doesn't take part to the sum of squares in your code and in the example you gave, but in both of them they are updated in the next loop. I'm not sure of what is going on, so I corrected my code to mimic the behavior of yours.
Here is a working corrected version of your code:
#include <stdio.h>
#include <math.h>
int main() {
int rows, cols, r, c;
// you may think about dynamical allocation here
int inputMatrix[100][100], rowSum[100] = {0};
// it's better to use a type that can manage bigger numbers to avoid numeric overflow
long long int a, square[100] = {0};
// your starting matrix can be a matrix of int but the normalized one need to
// contain floating point numbers
double norm[100][100], k;
printf("Enter size of a matrix\n");
scanf("%d %d", &rows, &cols);
printf("Enter matrix of size %dX%d\n", rows, cols);
/* Input matrix */
for ( r = 0; r < rows; r++) {
for (c = 0; c < cols; c++) {
scanf("%d", &inputMatrix[r][c]);
// ^^ if you are scanning integer numbers...
}
}
printf("\nrows: %d cols: %d elements:\n",rows,cols);
for( r = 0; r < rows; r++) {
for( c = 0; c < cols; c++) {
printf("%d ", inputMatrix[r][c]);
// ... ^^ you should print integer numbers
}
printf("\n");
}
for (r = 0; r < rows; r++) {
for (c = 1; c < cols; c++) {
// ^^^ I don't know why you skip this here
a = inputMatrix[r][c];
//^ You have to assign, not to compare!
square[r] += a * a;
// ^^^^^ no need to call pow()
}
printf("Sum of squares of row %d: %lld\n",r,square[r]);
// square contains int ^^
// It would be nice and safer if you check here if square == 0 to avoid a
// division by zero and probably detect bad input data
}
for ( r = 0; r < rows; r++ ) {
// It's far more efficient to precalculate this term, even if compilers
// could be smart enough to do it for you. You may want to store those
// values in an array of doubles instead of the (sum of) squares
k = 1.0 / sqrt(square[r]);
for( c = 0; c < cols; c++ ) {
norm[r][c] = k * inputMatrix[r][c] ;
// again, ^ assign not compare
}
}
// you can add the printf to the previous loop...
printf("\nNormalized Matrix:\n");
for( r = 0; r < rows; r++) {
for( c = 0; c < cols; c++) {
printf("%.3lf ", norm[r][c]);
// ^^^^^ norm contains double
}
printf("\n");
}
return 0;
}
I keep the input matrix of integer type, but it would be better to use double for that too. As i added a print loop for the original matrix, the final output is:
rows: 2 cols: 3 elements:
1 2 3
4 5 6
Sum of squares of row 0: 13
Sum of squares of row 1: 61
Normalized Matrix:
0.277 0.555 0.832
0.512 0.640 0.768
I used an R code which implements a permutation test for the distributional comparison between two populations of functions. We have p univariate p-values.
The bottleneck is the construction of a matrix which contains all the possible CONTIGUOS p-values.
The last row of the matrix of p-values contain all the univariate p-values.
The penultimate row contains all the bivariate p-values in this order:
p_val_c(1,2), p_val_c(2,3), ..., p_val_c(p, 1)
...
The elements of the first row are coincident and the value associated is the p-value of the global test p_val_c(1,...,p)=p_val_c(2,...,p,1)=...=pval(p,1,...,p-1).
For computational reasons, I have decided to implement this component in c and use it in R with .C.
Here the code. The unique important part is the definition of the function Build_pval_asymm_matrix.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
void Build_pval_asymm_matrix(int * p, int * B, double * pval,
double * L,
double * pval_asymm_matrix);
// Function used for the sorting of vector T_temp with qsort
int cmp(const void *x, const void *y);
int main() {
int B = 1000; // number Conditional Monte Carlo (CMC) runs
int p = 100; // number univariate tests
// Generate fictitiously data univariate p-values pval and matrix L.
// The j-th column of L is the empirical survival
// function of the statistics test associated to the j-th coefficient
// of the basis expansion. The dimension of L is B * p.
// Generate pval
double pval[p];
memset(pval, 0, sizeof(pval)); // initialize all elements to 0
for (int i = 0; i < p; i++) {
pval[i] = (double)rand() / (double)RAND_MAX;
}
// Construct L
double L[B * p];
// Inizialize to 0 the elements of L
memset(L, 0, sizeof(L));
// Array used to construct the columns of L
double temp_array[B];
memset(temp_array, 0, sizeof(temp_array));
for(int i = 0; i < B; i++) {
temp_array[i] = (double) (i + 1) / (double) B;
}
for (int iter_coeff=0; iter_coeff < p; iter_coeff++) {
// Shuffle temp_array
if (B > 1) {
for (int k = 0; k < B - 1; k++)
{
int j = rand() % B;
double t = temp_array[j];
temp_array[j] = temp_array[k];
temp_array[k] = t;
}
}
for (int i=0; i<B; i++) {
L[iter_coeff + p * i] = temp_array[i];
}
}
double pval_asymm_matrix[p * p];
memset(pval_asymm_matrix, 0, sizeof(pval_asymm_matrix));
// Construct the asymmetric matrix of p-values
clock_t start, end;
double cpu_time_used;
start = clock();
Build_pval_asymm_matrix(&p, &B, pval, L, pval_asymm_matrix);
end = clock();
cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("TOTAL CPU time used: %f\n", cpu_time_used);
return 0;
}
void Build_pval_asymm_matrix(int * p, int * B, double * pval,
double * L,
double * pval_asymm_matrix) {
int nbasis = *p, iter_CMC = *B;
// Scalar output fisher combining function applied on univariate
// p-values
double T0_temp = 0;
// Vector output fisher combining function applied on a set of
//columns of L
double T_temp[iter_CMC];
memset(T_temp, 0, sizeof(T_temp));
// Counter for elements of T_temp greater than or equal to T0_temp
int count = 0;
// Indexes for columns of L
int inf = 0, sup = 0;
// The last row of matrice_pval_asymm contains the univariate p-values
for(int i = 0; i < nbasis; i++) {
pval_asymm_matrix[i + nbasis * (nbasis - 1)] = pval[i];
}
// Construct the rows from bottom to up
for (int row = nbasis - 2; row >= 0; row--) {
for (int col = 0; col <= row; col++) {
T0_temp = 0;
memset(T_temp, 0, sizeof(T_temp));
inf = col;
sup = (nbasis - row) + col - 1;
// Combining function Fisher applied on
// p-values pval[inf:sup]
for (int k = inf; k <= sup; k++) {
T0_temp += log(pval[k]);
}
T0_temp *= -2;
// Combining function Fisher applied
// on columns inf:sup of matrix L
for (int k = 0; k < iter_CMC; k++) {
for (int l = inf; l <= sup; l++) {
T_temp[k] += log(L[l + nbasis * k]);
}
T_temp[k] *= -2;
}
// Sort the vector T_temp
qsort(T_temp, iter_CMC, sizeof(double), cmp);
// Count the number of elements of T_temp less than T0_temp
int h = 0;
while (h < iter_CMC && T_temp[h] < T0_temp) {
h++;
}
// Number of elements of T_temp greater than or equal to T0_temp
count = iter_CMC - h;
pval_asymm_matrix[col + nbasis * row] = (double) count / (double)iter_CMC;
}
// auxiliary variable for columns of L inf:nbasis-1 and 1:sup
int aux_first = 0, aux_second = 0;
int num_col_needed = 0;
for (int col = row + 1; col < nbasis; col++) {
T0_temp = 0;
memset(T_temp, 0, sizeof(T_temp));
inf = col;
sup = ((nbasis - row) + col) % nbasis - 1;
// Useful indexes
num_col_needed = nbasis - inf + sup + 1;
int index_needed[num_col_needed];
memset(index_needed, -1, num_col_needed * sizeof(int));
aux_first = inf;
for (int i = 0; i < nbasis - inf; i++) {
index_needed[i] = aux_first;
aux_first++;
}
aux_second = 0;
for (int j = 0; j < sup + 1; j++) {
index_needed[j + nbasis - inf] = aux_second;
aux_second++;
}
// Combining function Fisher applied on p-values
// pval[inf:p-1] and pval[0:sup-1]1]
for (int k = 0; k < num_col_needed; k++) {
T0_temp += log(pval[index_needed[k]]);
}
T0_temp *= -2;
// Combining function Fisher applied on columns inf:p-1 and 0:sup-1
// of matrix L
for (int k = 0; k < iter_CMC; k++) {
for (int l = 0; l < num_col_needed; l++) {
T_temp[k] += log(L[index_needed[l] + nbasis * k]);
}
T_temp[k] *= -2;
}
// Sort the vector T_temp
qsort(T_temp, iter_CMC, sizeof(double), cmp);
// Count the number of elements of T_temp less than T0_temp
int h = 0;
while (h < iter_CMC && T_temp[h] < T0_temp) {
h++;
}
// Number of elements of T_temp greater than or equal to T0_temp
count = iter_CMC - h;
pval_asymm_matrix[col + nbasis * row] = (double) count / (double)iter_CMC;
} // end for over col from row + 1 to nbasis - 1
} // end for over rows of asymm p-values matrix except the last row
}
int cmp(const void *x, const void *y)
{
double xx = *(double*)x, yy = *(double*)y;
if (xx < yy) return -1;
if (xx > yy) return 1;
return 0;
}
Here the times of execution in seconds measured in R:
time_original_function
user system elapsed
79.726 1.980 112.817
time_function_double_for
user system elapsed
79.013 1.666 89.411
time_c_function
user system elapsed
47.920 0.024 56.096
The first measure was obtained using an equivalent R function with duplication of the vector pval and matrix L.
What I wanted to ask is some suggestions in order to decrease the execution time with the C function for simulation purposes. The last time I used c was five years ago and consequently there is room for improvement. For instance I sort the vector T_temp with qsort in order to compute in linear time with a while the number of elements of T_temp greater than or equal to T0_temp. Maybe this task could be done in a more efficient way. Thanks in advance!!
I reduced the input size to p to 50 to avoid waiting on it (don't have such a fast machine) -- keeping p as is and reducing B to 100 has a similar effect, but profiling it showed that ~7.5 out of the ~8 seconds used to compute this was spent in the log function.
qsort doesn't even show up as a real hotspot. This test seems to headbutt the machine more in terms of micro-efficiency than anything else.
So unless your compiler has a vastly faster implementation of log than I do, my first suggestion is to find a fast log implementation if you can afford some accuracy loss (there are ones out there that can compute log over an order of magnitude faster with precision loss in the range of ~3% or so).
If you cannot have precision loss and accuracy is critical, then I'd suggest trying to memoize the values you use for log if you can and store them into a lookup table.
Update
I tried the latter approach.
// Create a memoized table of log values.
double log_cache[B * p];
for (int j=0, num=B*p; j < num; ++j)
log_cache[j] = log(L[j]);
Using malloc might be better here, as we're pushing rather large data to the stack and could risk overflows.
Then pass her into Build_pval_asymm_matrix.
Replace these:
T_temp[k] += log(L[l + nbasis * k]);
...
T_temp[k] += log(L[index_needed[l] + nbasis * k]);
With these:
T_temp[k] += log_cache[l + nbasis * k];
...
T_temp[k] += log_cache[index_needed[l] + nbasis * k];
This improved the times for me from ~8 seconds to ~5.3 seconds, but we've exchanged the computational overhead of log for memory overhead which isn't that much better (in fact, it rarely is but calling log for double-precision floats is apparently quite expensive, enough to make this exchange worthwhile). The next iteration, if you want more speed, and it is very possible, involves looking into cache efficiency.
For this kind of huge matrix stuff, focusing on memory layouts and access patterns can work wonders.
I have been looking for a way to swap the names between two matrices in C. I have 2 square size x size matrices. I make some operation to the one of them, I put the result in a cell in the other matrix, then I swap their names and I repeat.
Below I am giving my code
int main(void){
int const size = 1000;
int const steps = 10;
float A[size][size], B[size][size];
int i,j,k;
int t = 0;
double sum = 0;
double sum1 = 0;
int const ed = size - 1;
for(i = 0; i < size; ++i){
for(j = 0; j < size; ++j){// initialize the matrices
A[i][j] = i+j;
B[i][j] = 0;
}
}
for(i = 0; i < size; ++i){//find the sum of the values in the first matrix
for(j = 0; j < size; ++j){
sum = sum + A[i][j];
}
}
printf("The total sum of the matrix 1 is %lf \n",sum);
for(k = 0; k < steps; ++k){//for each cell of the matrix A calculate the average of the values' of the cell and its surroundings and put it in the coresponding place in the matrix B and then copy matrix B to matrix A and repeat. There are special cases for the cells who are at the edges and the last or first row/column.
for(i = 0; i < size; ++i){
for(j = 0; j < size; ++j){
if(i==0){
if(j==0)
B[i][j]=(A[0][0]+A[0][1]+A[0][ed]+A[1][0]+A[ed][0])/5.0;
else if(j==ed)
B[i][j]=(A[0][ed]+A[0][0]+A[0][ed-1]+A[1][ed]+A[ed][ed])/5.0;
else
B[i][j]=(A[0][j]+A[0][j+1]+A[0][j-1]+A[1][j]+A[ed][j])/5.0;
}else if(i==ed){
if(j==0)
B[i][j]=(A[ed][0]+A[ed][1]+A[ed][ed]+A[0][0]+A[ed-1][0])/5.0;
else if(j==ed)
B[i][j]=(A[ed][ed]+A[ed][0]+A[ed][ed-1]+A[0][ed]+A[ed-1][ed])/5.0;
else
B[i][j]=(A[ed][j]+A[ed][j+1]+A[ed][j-1]+A[0][j]+A[ed-1][j])/5.0;
}else{
if(j==0)
B[i][j]=(A[i][0]+A[i][1]+A[i][ed]+A[i+1][0]+A[i-1][0])/5.0;
else if(j==ed)
B[i][j]=(A[i][ed]+A[i][0]+A[i][ed-1]+A[i+1][ed]+A[i-1][ed])/5.0;
else
B[i][j]=(A[i][j]+A[i][j+1]+A[i][j-1]+A[i+1][j]+A[i-1][j])/5.0;
}
}
}
sum1 = 0;
for(i = 0; i < size; ++i){
for(j = 0; j < size; ++j){
sum1 = sum1 + B[i][j];
}
}
t=t+1;
for(i = 0; i < size; ++i){
for(j = 0; j < size; ++j){
A[i][j] = B[i][j];
}
}
printf("%lf \n",sum1-sum);
}
printf("The total sum of the matrix 2 is %lf \n",sum1);
printf("Number of steps completed: %i \n",t);
printf("Number of steps failed to complete: %i \n", steps-t);
return 0;
}
I have used the method of copying each time the one matrix to the other, but this is not efficient.
I have a hint that I should use pointers but I can not figure it out. Any help will be much appreciated.
You can swap the values of any two variables of the same type by assigning the value of the first to a temporary variable then assigning the value of the second to the first, then assigning the value of the temporary variable to the second:
int a = 2, b = 3, tmp;
tmp = a;
a = b;
b = tmp;
In particular, it works exactly the same when the variables are of pointer type, so
/* The matrices */
double one[3][3], another[3][3];
/* pointers to the matrices */
double (*matrix1p)[3] = one;
double (*matrix2p)[3] = another;
double (*tmp)[3];
/* ... perform matrix operations using matrix1p and matrix2p ... */
/* swap labels (pointers): */
tmp = matrix1p;
matrix1p = matrix2p;
matrix2p = tmp;
/* ... perform more matrix operations using matrix1p and matrix2p ... */
Updated to clarify:
matrix1p is initially an alias for one, and matrix2p is initially an alias for another. After the swap, matrix1p is an alias for another, whereas matrix2p is an alias for one. Of course, you can perform such a swap as many times as you want. You cannot, however, swap one and another themselves, except via an element-by-element swap.
Note that this yields improved efficiency because pointers are quite small relative to the matrices themselves. You don't have to move the elements of the matrices, but only to change which matrix each pointer refers to.
Yes, you should definitely use pointers, for example:
void swap (int*** m1, int*** m2)
{
int** temp;
temp = *m1;
*m1 = *m2;
*m2 = temp;
}
And then invoke:
int m1[5][5] = 0;
int m2[5][5] = 0;
swap (&m1, &m2);
Good day,
I need a help. We get a homework to write a programme in C which should generate and print bigger and smaller matrix made from "X" and ".". And after that find if the smaller 3x3 matrix is in the bigger one. I tried to make it by one dimensional field, but my programme finds matrix only sometimes. I am not able to find it out where is my mistake and how to fix it. I read some threads on forum, but none of it was helpfull to me. Thanks for any help.
P.S. Forgive me language mistakes, I am not a native english speaker.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Generates matrix of given dimensions */
void initMatrix(char *Matrix, int rows, int cols)
{
for(int i = 0; i < rows; i++)
{
for(int j = 0; j < cols; j++)
{
Matrix[i*cols+j]= "X.." [rand () % 3]; // 2/3 that X will be generated
}
}
}
/* Prints given matrix */
void printMatrix(char *Matrix, int rows, int cols)
{
for(int i = 0; i < rows; i++)
{
for(int j = 0; j < cols; j++)
{
printf("%c", Matrix[i * cols + j]);
}
printf("\n");
}
}
int main(void)
{
int rowM1, colM1; // Dimensions of primary (bigger) matrix
int rowM2 = 3, colM2 = 3; // Dimensions of secondary (smaller) matrix
int first, second; // Position of the begginng of matrix 2 in matrix 1
int rel_pos;
int i, j, k, l;
char *M1 = NULL; // Pointer to matrix 1
char *M2 = NULL; // Pointer to matrix 2
printf("Enter the matrix dimensions separated by a space ([rows] [columns]) : ");
if (scanf("%d %d", &rowM1, &colM1) != 2) // Bad parameters
{
printf("Wrong parameters.");
return 1; // End program
}
if (rowM1 < rowM2 || colM1 < colM2)
{
printf("Matrix 2 can not be found because is bigger than Matrix 1.");
return 1;
}
srand(time(NULL)); // Randomly generates numbers
M1 = malloc(rowM1 * colM1 * sizeof(char)); // M1 points to matrix 1
M2 = malloc(rowM2 * colM2 * sizeof(char)); // M2 points to matrix 2
initMatrix(M1, rowM1, colM1); // Initializes matrix 1
initMatrix(M2, rowM2, colM2); // Initializes matrix 2
printf("\nMatrix 1:\n");
printMatrix(M1, rowM1, colM1); // Prints matrix 1
printf("\nMatrix 2:\n");
printMatrix(M2, rowM2, colM2); // Prints matrix 2
putchar('\n');
for (i = 0; i < rowM1; i++)
{
for(j = 0; j < colM1; j++){
{
for (k = 0; k < rowM2 * colM2; k++) // checking the smaller matrix
{
if(M1[i*rowM1+j] == M2[k])
{
first = i*rowM1;
rel_pos = i+1;
}
if(j % colM2 == 0) // Matrix 2 has ended on this line, move on next one.
rel_pos += colM1 - colM2;
if(M1[rel_pos] == M2[j]) // If character are same, keep searching
rel_pos++;
else // else this is not the matrix I'm searching for
break;
}
if(k == rowM2*colM2) // if all k cykle went to the end I found the matrix
{
printf("Matrix found at [%d][%d]", first, second);
return 0;
}
}
}
if(i*colM1 > i*colM1-colM2) // matrix cannot be found
printf("Matrix not found");
break;
}
free(M1); // frees memory of matrix 1
free(M2); // frees memory of matrix 2
return 0;
}
Your inner loop for (k = 0; k < rowM2 * colM2; k++) iterates over the contents of the small matrix, and should compare each entry of the small matrix to the corresponding entry in the large matrix (as defined by the start point given by i and j).
The comparison if(M1[i*rowM1+j] == M2[k]), however, compares all entries of the small matrix with the same entry in the large matrix (the array index of M1 is independent of k).
To fix this, you need to make a fourdimensional loop
for(y0 = 0; y0 < colM1 - colM2 + 1; y0++) {
for(x0 = 0; x0 < rowM1 - rowM2 + 1; x0++) {
for(dy = 0; dy < colM2; dy++) {
for(dx = 0; dx < rowM2; dx++) {
if(M1[(y0 + dy)*rowM1 + (x0 + dx)] == M2[dy*rowM2 + dx]) {
...
}
}
}
}
}