I have a problem with my code when I was trying to shrink the allocated memory. The result showed :
creating matrix
reallocating matrix (expanding)
defining GSL matrix in new matrix
reallocating matrix (shrinking)
realloc(): invalid next size
Aborted (core dumped)
It showed that I have put invalid next size in reallocation method. In my code, I was giving less number of size (shrink the memory). The code I implemented is shown below :
#include <stdlib.h>
#include <gsl/gsl_matrix.h>
#include <time.h>
#include "test.h"
#define nSub 4
int main(int argc, char const *argv[])
{
int testSize = 2000;
double timeOut = 2;
clock_t start_time;
gsl_matrix ***c;
printf("creating matrix\n");
calloc_matrix_C(&c, nSub, nSub, testSize, testSize);
// malloc_matrix_C(&c, nSub, nSub, testSize, testSize);
start_time = clock();
while ((double)(clock() - start_time) / CLOCKS_PER_SEC < timeOut)
;
printf("reallocating matrix (expanding)\n");
realloc_matrix_C(&c[0], 5);
start_time = clock();
while ((double)(clock() - start_time) / CLOCKS_PER_SEC < timeOut)
;
printf("defining GSL matrix in new matrix\n");
c[0][5] = gsl_matrix_calloc(testSize, testSize);
start_time = clock();
while ((double)(clock() - start_time) / CLOCKS_PER_SEC < timeOut)
;
printf("reallocating matrix (shrinking)\n");
realloc_matrix_C(&c[0], nSub);
start_time = clock();
while ((double)(clock() - start_time) / CLOCKS_PER_SEC < timeOut)
;
printf("free the matrix\n");
free_matrix_C(&c, nSub, nSub);
start_time = clock();
while ((double)(clock() - start_time) / CLOCKS_PER_SEC < timeOut)
;
printf("done\n");
while (1)
;
return 0;
}
## Heading ##
void calloc_matrix_C(gsl_matrix ****x, int alloc_dim_1, int alloc_dim_2, int mat_dim_1, int mat_dim_2)
{
(*x) = calloc(alloc_dim_1, sizeof(double **));
for (int i = 0; i < alloc_dim_1; i++)
{
(*x)[i] = calloc(alloc_dim_2, sizeof(double *));
for (int j = 0; j < alloc_dim_2; j++)
{
(*x)[i][j] = gsl_matrix_calloc(mat_dim_1, mat_dim_2);
}
}
}
void realloc_matrix_C(gsl_matrix ***x, int new_dim)
{
(*x) = realloc(*x, new_dim * sizeof(double *));
}
void free_matrix_C(gsl_matrix ****x, int alloc_dim_1, int alloc_dim_2)
{
for (int i = 0; i < alloc_dim_1; i++)
{
for (int j = 0; j < alloc_dim_2; j++)
{
gsl_matrix_free((*x)[i][j]);
}
free((*x)[i]);
}
free((*x));
}
Should the new size be greater?
Related
I am creating 2 programs to test the differences in run time of serial matrix multiply vs that of parallel matrix multiply. The parallel code that I have written is actually running slower than serial code, and running the program with additional cores enabled provides no speedup at all... using more cores actually seems to slow down the parallel program.
What is going on here? This is my parallel code: to use this pass in matrix size and thread number (see my useage below)
#include <stdio.h>
#include <stdlib.h> // rand(), srand()
#include <unistd.h>
#include <time.h>
#include <pthread.h>
// Time struct + prototypes
struct timespec time1, time2, diffTime;
struct timespec timespecDifference(struct timespec start, struct timespec end); // For timing
double** reserveMatrix(int nRows, int nCols);
void printMat(double** mat1, int rows, int cols);
void* matMult(void* arg);
// Argstruct
typedef struct {
double** result;
int tid;
int size;
int s;
int e;
} argStr;
// global variables for use by all threads
int size; // Size of a row and column.
int numThreads; // Number of pThreads to do work
double** mat1;
double** mat2;
double** mat3;
// Main function
int main(int argc, char *argv[]) {
size = atoi(argv[1]);
numThreads = atoi(argv[2]);
mat1 = reserveMatrix(size, size);
mat2 = reserveMatrix(size, size);
mat3 = reserveMatrix(size, size);
if (size == 0) {
//printf("Matrix cannot be size 0\n");
return -1;
}
//Start timer
clock_gettime(CLOCK_MONOTONIC, &time1);
// *********** Begin main operation *********** //
// //
// declare necessary local variables
pthread_t theThreads[numThreads];
argStr data[numThreads]; // Create numThreads # of argStr objects
for (int i = 0; i < numThreads; i++) {
data[i].result = reserveMatrix(size, size);
data[i].tid = i; // Self-assigned threadID
data[i].size = size; // Size of a block
data[i].s = size * i / numThreads;
data[i].e = size * (i + 1) / numThreads - 1;
//printf("I handle operations from %d to %d\n", data[i].s, data[i].e);
}
// Start the threads
for (int i = 0; i < numThreads; i++) {
pthread_create(&theThreads[i], NULL, matMult, (void*) (&data[i]));
}
// await all threads being done.
for (int i = 0; i < numThreads; i++) {
pthread_join(theThreads[i], NULL);
}
// rejoin received data
//printMat(data[1].result, size, size);
// //
// *********** End main operation *********** //
// Stop timer and find time taken
clock_gettime(CLOCK_MONOTONIC, &time2);
diffTime = timespecDifference(time1, time2);
double cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
//Print Time
printf("Pthread Matrix Multiply, %d, %d, %lf\n", size, numThreads, cpuTimeUsed);
}
// Struct Timer
struct timespec timespecDifference(struct timespec start, struct timespec end)
{
struct timespec temp;
if ((end.tv_nsec - start.tv_nsec) < 0) {
temp.tv_sec = end.tv_sec - start.tv_sec - 1;
temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
}
else {
temp.tv_sec = end.tv_sec - start.tv_sec;
temp.tv_nsec = end.tv_nsec - start.tv_nsec;
}
return temp;
}
// Reserve matrix function
double** reserveMatrix(int nRows, int nCols) {
double** matrix1 = (double**)malloc(nRows * sizeof(double*));
matrix1[0] = (double*)malloc(nRows * nCols * sizeof(double));
// Assign row pointers to "segment" out the data
for (int r = 1; r < nRows; ++r) {
matrix1[r] = &(matrix1[0][r * nCols]);
}
// Give values to the array
for(int i = 0; i < nRows * nCols; i++) {
matrix1[0][i] = i;
}
return matrix1;
}
// Print matrix function
void printMat(double** mat1, int rows, int cols) {
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
printf("%f, ", mat1[i][j]);
}
printf("\n");
}
printf("End of array print\n");
}
void* matMult(void* arg) {
//printf("Begin an operation\n");
argStr* args = (argStr*)arg;
double** result = args->result;
int tid = args->tid;
int size = args->size; // Size of the matrix
long s = args->s; // Start
long e = args->e; // End
// Print message to confirm data is getting stored
//printf("Hello from operation %d! \n", tid);
//printf("I am working from number %ld to %ld\n", s, e);
for(int r = s; r <= e; r++) { // May need to declare out of loop
for(int c = 0; c < size; c++) {
result[r][c] = 0.0;
for(int i = 0; i < size; i++) {
result[r][c] += mat1[r][i] * mat2[i][c];
}
}
}
// Print multipled matrix values
//printMat(mat3, size, size);
return NULL;
}
This is my serial code: To use this pass in the same sized row and column (see my useage below)
#include <stdio.h>
#include <stdlib.h> // rand(), srand()
#include <unistd.h>
#include <time.h>
// Matrix multiply code
// **** Time struct **** //
struct timespec time1, time2, diffTime;
// Prototypes
struct timespec timespecDifference(struct timespec start, struct timespec end); // For timing
double** matrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols);
double** transMatrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols);
double** reserveMatrix(int nRows, int nCols);
double matrixProduct(double** mat1, double** mat2, int nRows, int nCols);
void printMat(double** mat1, int rows, int cols);
// Begin main
int main(int argc, char *argv[])
{
int rows = atoi(argv[1]);
int cols = atoi(argv[2]);
// Declare the ARRAYS and populate them
double** arr1 = reserveMatrix(rows, cols);
double** arr2 = reserveMatrix(rows, cols);
double** arr3 = reserveMatrix(rows, cols);
double** arr4 = reserveMatrix(rows, cols);
double prod1 = matrixProduct(arr1, arr2, rows, cols);
//Start Clock
clock_gettime(CLOCK_MONOTONIC, &time1);
arr3 = matrixMultiply(arr1, arr2, arr3, rows, cols);
// Stop timer and find time taken
clock_gettime(CLOCK_MONOTONIC, &time2);
diffTime = timespecDifference(time1, time2);
double cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
//Print Time
printf("Matrix Multiply, %d, %lf\n", rows, cpuTimeUsed);
// Print input matrix values. Used to test that matrix multiply works - it does
// Perform a transposition of matrix 2
for (int r = 0; r < rows; ++r) {
for (int c = r + 1; c < cols; ++c) {
double val = arr2[r][c];
arr2[r][c] = arr2[c][r];
arr2[c][r] = val;
}
}
// Run matrix multiply again on the newly transposed data.
//Start Clock
clock_gettime(CLOCK_MONOTONIC, &time1);
arr4 = transMatrixMultiply(arr1, arr2, arr4, rows, cols);
// Stop timer and find time taken
clock_gettime(CLOCK_MONOTONIC, &time2);
diffTime = timespecDifference(time1, time2);
cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
//Print Time
printf("Trans Matrix Multiply, %d, %lf\n", rows, cpuTimeUsed);
//double prod2 = matrixProduct(arr3, arr4, rows, cols);
//printf("The matrix product of m3 and m4 is: %f\n", prod2);
//printMat(mat3, rows, cols);
return 0;
}
// Struct Timer
struct timespec timespecDifference(struct timespec start, struct timespec end)
{
struct timespec temp;
if ((end.tv_nsec - start.tv_nsec) < 0) {
temp.tv_sec = end.tv_sec - start.tv_sec - 1;
temp.tv_nsec = 1000000000 + end.tv_nsec - start.tv_nsec;
}
else {
temp.tv_sec = end.tv_sec - start.tv_sec;
temp.tv_nsec = end.tv_nsec - start.tv_nsec;
}
return temp;
}
// standard matrix multiply
double** matrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols) {
for (int r = 0; r < nRows; ++r) {
for (int c = 0; c < nCols; ++c) {
result[r][c] = 0.0;
for (int i = 0; i < nRows; ++i) {
result[r][c] += matrix1[r][i] * matrix2[i][c];
}
}
}
return result;
}
// Transpose matrix multiply
double** transMatrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols) {
for (int c = 0; c < nCols; ++c) {
for (int r = 0; r < nRows; ++r) {
result[c][r] = 0.0;
for (int i = 0; i < nCols; ++i) {
result[c][r] += matrix1[c][i] * matrix2[r][i];
}
}
}
return result;
}
// Reserve data function. Reserves and populates array data
double** reserveMatrix(int nRows, int nCols) {
double** matrix1 = (double**)malloc(nRows * sizeof(double*));
matrix1[0] = (double*)malloc(nRows * nCols * sizeof(double));
// Assign row pointers to "segment" out the data
for (int r = 1; r < nRows; ++r) {
matrix1[r] = &(matrix1[0][r * nCols]);
}
// Give values to the array
for(int i = 0; i < nRows * nCols; i++) {
matrix1[0][i] = i;
}
return matrix1;
}
// Check that matrix1 and matrix2 are the same
double matrixProduct(double** mat1, double** mat2, int nRows, int nCols) {
double sum = 0.0;
for(int i = 0; i < nRows * nCols; i++) {
sum += (mat1[0][i] - mat2[0][i]) * (mat1[0][i] - mat2[0][i]);
//printf("matrix product pos: %i, sum: %f\n", i, sum);
}
return sum;
}
// Print matrix function
void printMat(double** mat1, int rows, int cols) {
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
printf("%f, ", mat1[i][j]);
}
printf("\n");
}
printf("End of array print\n");
}
Here is the linux output of me compiling and running this code. At matrix size 1200 x 1200 the run time differences are not that pronounced, but the serial code ends up being significantly faster than the parallel at sizes above 1500 x 1500:
MYPC:~/Projects/matrixMultiply/phase3$ gcc matrixMult.c -o MM
MYPC:~/Projects/matrixMultiply/phase3$ gcc pMatMult.c -lpthread -o PMM
MYPC:~/Projects/matrixMultiply/phase3$ ./MM 1200 1200
Matrix Multiply, 1200, 25.487388
Trans Matrix Multiply, 1200, 16.452777
MYPC:~/Projects/matrixMultiply/phase3$ ./PMM 1200 2
Pthread Matrix Multiply, 1200, 2, 22.495115
MYPC:~/Projects/matrixMultiply/phase3$ ./PMM 1200 4
Pthread Matrix Multiply, 1200, 4, 22.181686
The sections in bold contain the meaningful output. It reads
name of the process
matrix size
number of threads spawned (in pThread program only)
run time
Any help would be appreciated. I will be instantly replying to questions for the next 2 hours.
The solution was to terminate extra processes that were running on my ubuntu machine. The code worked perfectly fine as a few users pointed out. Killing all other processes on the machine, then running my parallel code provided the expected speedups.
I am not sure of the precise technical reason this is going on other than the machine wasn't prioritizing my program when it had others running, resulting in slower times.
I'm making a program which dynamically creating 2d array.but it's showing the error which I mentioned on the title. I'm using Visual Studio 2015.
// last.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <stdio.h>
#include <time.h>
#include "stdlib.h"
double selectionSort(int * number, int number_count);
void print2d(int ** array, int rows, int cols);
void twodarray();
void main(int argc, char* argv[])
{
int num_count = 10000;
int num[10000];
for (int i = 0; i < num_count; i++)
{
num[i] = rand();
}
double sortTime = selectionSort(num, num_count);
printf("Total Runtime is: %.0f milliseconds. \n", sortTime * 1000);
twodarray();
getchar();
}
double selectionSort(int * number, int number_count)
{
clock_t start, end;
double duration;
int min;
start = clock();
for (int i = 0; i < number_count - 1; i++)
{
min = i;
for (int j = i + 1; j < number_count; j++)
{
if (number[min] > number[j])
{
min = j;
}
}
if (min != i)
{
int temp = number[min];
number[min] = number[i];
number[i] = temp;
}
}
end = clock();
return duration = (double)(end - start) / CLOCKS_PER_SEC;
}
void print2d(int ** array, int rows, int cols)
{
int i, j;
for (i = 0; i < rows; i++)
{
for (j = 0, j < cols; j++;)
{
printf("%10d ", array[i][j]);
}
puts("");
}
}
void twodarray()
{
int **twod;
int rows = 10;
twod = malloc(rows * sizeof(int));
int i,cols = 10;
for (i = 0; i < rows; i++)
{
twod[i] = malloc(cols*sizeof(int));
print2d(twod, rows, cols);
}
for (i = 0; rows; i++)
{
free(twod[i]);
free(twod);
}
}
In c++ you need to cast when assigining a void * pointer to another type of pointer. But in c++ you should not use malloc(), instead use
int **twod = new int *[rows];
If you didn't mean to write a c++ program, rename the file. Change the extension from .cpp to .c.
Your allocation is wrong too, as pointed out by #KeineLust here.
This is wrong:
int **twod;
int rows = 10;
twod = malloc(rows * sizeof(int));
You need to reserve space for n pointers to int, not for n ints, change to
twod = malloc(rows * sizeof(int *));
And here:
for (j = 0, j < cols; j++;)
^ ^
Use a semicolon instead of a comma and also remove the last semicolon.
Another problem:
for (i = 0; rows; i++)
{
free(twod[i]);
free(twod); /* Don't free twod in the loop, one malloc -> one free */
}
And as pointed out by Nicat and Iharob, it seems that you are mixing C and C++, use the proper extension (.c)
I currently have a multi-threaded C program coded using Pthreads which uses 2 threads. I want to increase the no. of threads and measure speed up upon doing so. I would like to run my code in an automated manner where the no. of threads used keeps getting incremented and I want to graphically display running times of my code. I would love it if I could get a clue in on how to do so especially on how to automate the entire process and plotting it graphically. Here is my code:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#define NUM_THREADS 2
#define VECTOR_SIZE 40
struct DOTdata
{
/* data */
long X[VECTOR_SIZE];
long Y[VECTOR_SIZE];
long sum;
long compute_length;
};
struct DOTdata dotstr;
pthread_mutex_t mutex_sum;
void *calcDOT(void *);
int main(int argc, char *argv[])
{
long vec_index;
for(vec_index = 0 ; vec_index < VECTOR_SIZE ; vec_index++){
dotstr.X[vec_index] = vec_index + 1;
dotstr.Y[vec_index] = vec_index + 2;
}
dotstr.sum = 0;
dotstr.compute_length = VECTOR_SIZE/NUM_THREADS;
pthread_t call_thread[NUM_THREADS];
pthread_attr_t attr;
void *status;
pthread_mutex_init(&mutex_sum, NULL);
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
long i;
for(i = 0 ; i < NUM_THREADS ; i++){
pthread_create(&call_thread[i], &attr, calcDOT, (void *)i);
}
pthread_attr_destroy(&attr);
for (i = 0 ; i < NUM_THREADS ; i++){
pthread_join(call_thread[i], &status);
}
printf("Resultant X*Y is %ld\n", dotstr.sum);
pthread_mutex_destroy(&mutex_sum);
pthread_exit(NULL);
}
void *calcDOT(void *thread_id)
{
long vec_index;
long start_index;
long end_index;
long length;
long offset;
long sum = 0;
offset = (long)thread_id;
length = dotstr.compute_length;
start_index = offset * length;
end_index = (start_index + length) - 1;
for(vec_index = start_index ; vec_index < end_index ; vec_index++){
sum += (dotstr.X[vec_index] * dotstr.Y[vec_index]);
}
pthread_mutex_lock(&mutex_sum);
dotstr.sum += sum;
pthread_mutex_unlock(&mutex_sum);
pthread_exit((void *)thread_id);
}
I would like to increment my NUM_THREADS parameter and run it after each increment, record the execution time after each increment and plot a graph of execution time vs number of threads.
I tried a naive approach by increasing the number of threads, timing it with time.h and plotting it with gnuplot. Each iteration we double the number of threads and we print the time for an iteration. We use gnuplot to display a graph with number of threads on the x-axis and execution time on the y-axis
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define NUM_THREADS 2
#define VECTOR_SIZE 40
struct DOTdata {
/* data */
long X[VECTOR_SIZE];
long Y[VECTOR_SIZE];
long sum;
long compute_length;
};
struct DOTdata dotstr;
pthread_mutex_t mutex_sum;
void *calcDOT(void *);
int main(int argc, char *argv[]) {
double xvals[VECTOR_SIZE / NUM_THREADS];
double yvals[VECTOR_SIZE / NUM_THREADS];
int index = 0;
for (int count = NUM_THREADS; count < VECTOR_SIZE / NUM_THREADS; count = count * 2) {
clock_t begin = clock();
long vec_index;
for (vec_index = 0; vec_index < VECTOR_SIZE; vec_index++) {
dotstr.X[vec_index] = vec_index + 1;
dotstr.Y[vec_index] = vec_index + 2;
}
dotstr.sum = 0;
dotstr.compute_length = VECTOR_SIZE / count;
pthread_t call_thread[count];
pthread_attr_t attr;
void *status;
pthread_mutex_init(&mutex_sum, NULL);
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
long i;
for (i = 0; i < count; i++) {
pthread_create(&call_thread[i], &attr, calcDOT, (void *) i);
}
pthread_attr_destroy(&attr);
for (i = 0; i < count; i++) {
pthread_join(call_thread[i], &status);
}
printf("Resultant X*Y is %ld\n", dotstr.sum);
pthread_mutex_destroy(&mutex_sum);
clock_t end = clock();
double time_spent = (double) (end - begin) / CLOCKS_PER_SEC;
printf("time spent: %f NUM_THREADS: %d\n", time_spent, count);
xvals[index] = count;
yvals[index] = time_spent;
index++;
}
FILE * gnuplotPipe = popen ("gnuplot -persistent", "w");
fprintf(gnuplotPipe, "plot '-' \n");
for (int i = 0; i < VECTOR_SIZE / NUM_THREADS; i++)
{
fprintf(gnuplotPipe, "%lf %lf\n", xvals[i], yvals[i]);
}
fprintf(gnuplotPipe, "e");
pthread_exit(NULL);
}
void *calcDOT(void *thread_id) {
long vec_index;
long start_index;
long end_index;
long length;
long offset;
long sum = 0;
offset = (long) thread_id;
length = dotstr.compute_length;
start_index = offset * length;
end_index = (start_index + length) - 1;
for (vec_index = start_index; vec_index < end_index; vec_index++) {
sum += (dotstr.X[vec_index] * dotstr.Y[vec_index]);
}
pthread_mutex_lock(&mutex_sum);
dotstr.sum += sum;
pthread_mutex_unlock(&mutex_sum);
pthread_exit((void *) thread_id);
}
Output
Resultant X*Y is 20900
time spent: 0.000155 NUM_THREADS: 2
Resultant X*Y is 19860
time spent: 0.000406 NUM_THREADS: 4
Resultant X*Y is 17680
time spent: 0.000112 NUM_THREADS: 8
Resultant X*Y is 5712
time spent: 0.000587 NUM_THREADS: 16
Below is a usual way to allocate multidimensional arrays on heap, by using pointers to pointers.
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
But I decided to make a version that allocates only one chunck of memory and does element accessing by multiplication to an index value.
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[i * aai2.length2])
The second version appreas to run about 20% faster when running the test code below. What is likely to be the cause, and is this a generally applicable optimization technique? Are there some libraries that define array types of this kind for optimization purpose?
I made a huge mistake in the test code before edit. The first version ran slower because its allocation and deallocation kept place inside the for-loop while the second one did it only once before entering the loop. See the comments in the test code below. After making the two tests equal, I find that the first version can run even faster, especially after optimization. The more complex operations and various copies I put into the test code, I see the first one always run a little bit faster. It seems that the multiplication for indexing is slow in my machine? I'm not sure for the cause, though.
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
int main() {
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
ArrayArrayInt2Create(&aai2, N, N); //Mistake Here!!
for (int k = 0; k < N; k += 1) {
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
}
ArrayArrayInt2Delete(&aai2); //Should go inside the loop block..
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
return 0;
}
Yes, using arithmetic and a single base pointer is what the compiler does internally for non-dynamically allocated 2D (n-dimensional) arrays.
You gain the most performance because there's a single calculation and indexed lookup. With the 2D array shown, there are two pointer lookups and two index calculations per array access (one index calculation and lookup to get to the right array, and then the second to access the element in the right array). With a 3D array, there'd be three index calculations and three lookups.
You also allocate less memory, and need fewer memory allocations, but those are second order effects.
Also, as WhozCraig points out in a comment but I didn't mention, you get better locality of reference and potential for smarter prefetch with a single big chunk of memory compared with multiple smaller chunks (that add up to more memory than the single big chunk).
I tested this file (sim2d.c) compiled with GCC 4.9.1 on Mac OS X 10.10.2 Yosemite.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
static void *MjMalloc(size_t nbytes)
{
void *rv = malloc(nbytes);
if (rv == 0)
{
fprintf(stderr, "Memory allocation failure (%zu bytes)\n", nbytes);
exit(1);
}
return rv;
}
/* Mechanism 1 */
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
/* Mechanism 2 */
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[(i) * aai2.length2])
#define aai2At2(aai2, i, j) (aai2.array[(i) * aai2.length2 + (j)])
/* Head-to-head testing */
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
#define N_CYCLES 1000
static void one_test_cycle(void)
{
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai1: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At2(aai2, i, j) = i;
}
while ((j -= 1) >= 0) {
sum += aai2At2(aai2, i, j) - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai3: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
}
static void print_now(const char *tag)
{
time_t now = time(0);
struct tm *lt = localtime(&now);
char buffer[32];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", lt);
printf("%s: %s\n", tag, buffer);
}
int main(void)
{
print_now("Started");
for (int i = 0; i < 3; i++)
one_test_cycle();
print_now("Finished");
return 0;
}
There are two slightly different ways of accessing the aai2 data. I also separated the array size (N = 2000) from the number of cycles in a single test (N_CYCLES = 1000). The timing results I got were:
Started: 2015-04-07 07:40:41
aai1: sum = 4000000000; time = 6.80
aai2: sum = 4000000000; time = 5.99
aai3: sum = 4000000000; time = 5.98
aai1: sum = 4000000000; time = 6.75
aai2: sum = 4000000000; time = 6.02
aai3: sum = 4000000000; time = 5.99
aai1: sum = 4000000000; time = 6.72
aai2: sum = 4000000000; time = 6.01
aai3: sum = 4000000000; time = 5.99
Finished: 2015-04-07 07:41:38
I was getting similar patterns with (N_CYCLE = 2000), but it was taking twice as long to run — surprise, surprise.
I'm seeing a small but noticeable benefit (about 13% decrease) from the single allocation code, but no significant difference between the two timings for the 'aai2' tests.
Basic statistics:
# All data
# Count = 9
# Mean = 6.250000e+00
# Std Dev = 3.807230e-01
# aai1 only:
# Count = 3
# Mean = 6.756667e+00
# Std Dev = 4.041452e-02
# aai2 and aai3:
# Count = 6
# Mean = 5.996667e+00
# Std Dev = 1.505545e-02
# aai2 only:
# Count = 3
# Mean = 6.006667e+00
# Std Dev = 1.527525e-02
# aai3 only:
# Count = 3
# Mean = 5.986667e+00
# Std Dev = 5.773503e-03
Clearly, formally making sure the machine is otherwise unloaded, and running many more iterations of the test, and similar benchmarking steps might improve the data, but the single allocation aai2 mechanism performs better on this machine than the multi-allocation aai mechanism. (Tangential aside: why do people not put a suffix 1 on their first version when they have two or more versions of the code?)
Hardware: 17" Mac Book Pro, early-2011, 2.3 GHz Intel Core i7, 16 GiB 1333 MHz DDR3 RAM.
I am trying to calculate a correlation measure for 18456 genes but the compiler (Dev C) exits after increasing macros GENE or INDEX to a value between 4000 and 5000 or bigger. For example it works well with:
# define GENE 4000
# define INDEX 3000
but not with:
#define GENE 5000
#define INDEX 100
The input file is a space delimited text file with 18456 rows and 57 columns.
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <limits.h>
#define GENE 5000
#define N 57
#define INDEX 1000
int main (void) {
clock_t start, stop;
double t = 0.0;
int i, j, p, q, wp, wq;
double x;
double *S_matrix = (double *)malloc(INDEX * GENE * sizeof(double));
double sum_S, S_max;
double S[11] = {0};
double r = 0.0, xbar = 0.0, ybar = 0.0, sx = 0.0, sy = 0.0;
// read E matrix
FILE *fq;
double E[GENE][N] = {{0}};
if ((fq = fopen("E_disease.txt", "r")) == NULL )
{
printf("Error\n");
exit(EXIT_FAILURE);
}
fq = fopen("E_disease.txt","r");
printf("\n");
for (i=0;i<GENE;i++)
{
for(j=0;j<N;j++)
{
fscanf(fq,"%lf",&x);
E[i][j] = x;
}
}
printf("\n");
fclose(fq);
// calculate correlation
assert((start = clock())!=-1);
for(p=0; p < INDEX; p++)
{
for(q=0; q < GENE; q++)
{
for(i=0; i<11; i++)
{
/*compute xbar */
for(j = i; j < N; j++)
{
xbar += E[p][j];
}
xbar /= N;
/*compute ybar*/
for(j = i; j < N; j++)
{
ybar += E[q][j];
}
ybar /= N;
/* compute standard deviation of x*/
for(j = i; j < N; j++)
{
sx += (E[p][j] - xbar) * (E[p][j] - xbar);
}
sx = sqrt(sx);
/* compute standard deviation of y */
for(j = i; j < N; j++)
{
sy += (E[q][j] - ybar) * (E[q][j] - ybar);
}
sy = sqrt(sy);
/*compute r, the correlation coefficient between the two arrays */
for( j = i; j < N; j++ )
{
r += (((E[p][j] - xbar)/sx) * ((E[q][j] - ybar)/sy));
}
r /= (N);
if(r>0)
{
S[i] = r;
}
else if(r<=0)
{
S[i] = 0;
}
}
for(j=0, sum_S=0; j<11; j++)
{
sum_S += S[j];
}
for(j=0, S_max = 0; j<11; j++)
{
if(S[j] > S_max)
{
S_max = S[j];
}
}
S_matrix[p*GENE + q] = sum_S/(11*S_max);
}
}
FILE * fs;
fs = fopen ("s_matrix.txt", "w+");
for(wp=0; wp<INDEX; ++wp)
{
for(wq=0; wq<GENE; ++wq)
{
fprintf(fs, "%lf", S_matrix[wp*GENE + wq]);
fprintf(fs, "\t");
}
fprintf(fs, "\n");
printf("\n");
}
fclose(fs);
stop = clock();
t = (double) (stop-start)/CLOCKS_PER_SEC;
printf("Run time: %f\n", t);
//print results
//return (0);
getchar();
}
Let me simplify the code. When I ran the code below, a couple of times, it generally exited immediately. One time, it said that it could not find something like 0xff12345. Another time it printed out S_matrix[55] when I defined constants inside main (the rest of the code is same) like int GENE=100; but just one time. Is that mean a memory leak? It does not give an error message when I compile it but are defining matrices and assigning values to them true?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <time.h>
#include <limits.h>
#define GENE 100
#define N 57
#define INDEX 10
int main (void) {
//int GENE = 100;
//int N = 57;
//int INDEX = 10;
int i, j;
double x;
double *S_matrix = (double *)malloc(INDEX * GENE * sizeof(double));
double *E = (double*)malloc(GENE*N*sizeof(double));
// read E matrix
FILE *fq;
if ((fq = fopen("E_control.txt", "r")) == NULL )
{
printf("Error\n");
exit(EXIT_FAILURE);
}
fq = fopen("E_control.txt","r");
printf("\n");
for (i=0;i<GENE;i++)
{
for(j=0;j<N;j++)
{
fscanf(fq,"%lf",&x);
E[i*GENE+j] = x;
}
}
printf("\n");
fclose(fq);
for(i=0; i<INDEX; i++)
{
for(j=0; j<GENE; j++)
{
S_matrix[i*INDEX+j]=i*j;
}
}
printf("%f " , S_matrix[55]);
free(S_matrix);
S_matrix=NULL;
free(E);
E=NULL;
return(0);
getchar();
getchar();
}
You're attempting to reserve 2280000 bytes of stack space (actually more) in main() because of a overtly large fixed array declaration. Specifically, this line:
double E[GENE][N] = {{0}};
equates to
double E[5000][57] = {{0}};
At 8-bytes per double, thats highly likely to be blowing out your stack. Use dynamic allocation for that array instead. For example:
double (*E)[N] = malloc(5000*sizeof(*E));
And don't forget to free it when you're done.
Global fixed allocation will also work (i.e. declare it as a global outside the main() function block.
static double E[GENE][N];
int main()
{
... your code ...
}
Any method you choose has potential advantages and pitfalls, so plan accordingly.