I made a program that multiplies matrices of the same dimension using (p)threads. The program accepts command line flags -N n -M m where n is the size of the matrix arrays and m is the number of threads (computing threshold). The program compiles and runs but I get strange times for elapsed time, USR time, SYS time, and USR+SYS time. I am testing sizes n = {1000,2000,4000} with each threshold m = {1,2,4}.
I should be seeing the elapsed time reduced and a fairly constant USR+SYS time for each value of n but that is not the case. The output will fluctuate but the problem is that a higher threshold doesn't result in a reduction of elapsed time. Am I implementing threads wrong or is there an issue with my timing?
compile with: -pthread
./* -N n -M m
main
#include<stdio.h>
#include<stdlib.h>
#include<unistd.h>
#include<pthread.h>
#include<sys/time.h>
#include<sys/resource.h>
struct matrix {
double **Matrix_A;
double **Matrix_B;
int begin;
int end;
int n;
};
void *calculon(void *mtrx) {
struct matrix *f_mat = (struct matrix *)mtrx;
// transfer data
int f_begin = f_mat->begin;
int f_end = f_mat->end;
int f_n = f_mat->n;
// definition of temp matrix
double ** Matrix_C;
Matrix_C = (double**)malloc(sizeof(double)*f_n);
int f_pholder;
for(f_pholder=0; f_pholder < f_n; f_pholder++)
Matrix_C[f_pholder] = (double*)malloc(sizeof(double)*f_n);
int x, y, z;
for(x = f_begin; x < f_end; x++)
for(y = 0; y < f_n; y++)
for(z = 0; z < f_n; z++)
Matrix_C[x][y] += f_mat->Matrix_A[x][z]*f_mat->Matrix_B[z][y];
for(f_pholder = 0; f_pholder < f_n; f_pholder++)
free(Matrix_C[f_pholder]);
free(Matrix_C);
}
int main(int argc, char **argv) {
char *p;
int c, i, j, x, y, n, m, pholder, n_m, make_thread;
int m_begin = 0;
int m_end = 0;
while((c=getopt(argc, argv, "NM")) != -1) {
switch(c) {
case 'N':
n = strtol(argv[optind], &p, 10);
break;
case 'M':
m = strtol(argv[optind], &p, 10);
break;
default:
printf("\n**WARNING**\nUsage: -N n -M m");
break;
}
}
if(m > n)
printf("\n**WARNING**\nUsage: -N n -M m\n=> m > n");
else if(n%m != 0)
printf("\n**WARNING**\nUsage: -N n -M m\n=> n % m = 0");
else {
n_m = n/m;
// initialize input matrices
double ** thread_matrixA;
double ** thread_matrixB;
// allocate rows onto heap
thread_matrixA=(double**)malloc(sizeof(double)*n);
thread_matrixB=(double**)malloc(sizeof(double)*n);
// allocate columns onto heap
for(pholder = 0; pholder < n; pholder++) {
thread_matrixA[pholder]=(double*)malloc(sizeof(double)*n);
thread_matrixB[pholder]=(double*)malloc(sizeof(double)*n);
}
// populate input matrices with random numbers
for(i = 0; i < n; i++)
for(j = 0; j < n; j++)
thread_matrixA[i][j] = (double)rand()/RAND_MAX+1;
for(x = 0; x < n; x++)
for(y = 0; y < n; y++)
thread_matrixB[x][y] = (double)rand()/RAND_MAX+1;
printf("\n*** Matrix will be of size %d x %d *** \n", n, n);
printf("*** Creating matrix with %d thread(s) ***\n", m);
struct rusage r_usage;
struct timeval usage;
struct timeval time1, time2;
struct timeval cpu_time1, cpu_time2;
struct timeval sys_time1, sys_time2;
struct matrix mat;
pthread_t thread_lord[m];
// begin timing
getrusage(RUSAGE_SELF, &r_usage);
cpu_time1 = r_usage.ru_utime;
sys_time1 = r_usage.ru_stime;
gettimeofday(&time1, NULL);
for(make_thread = 0; make_thread < m; make_thread++) {
m_begin += n_m;
// assign values to struct
mat.Matrix_A = thread_matrixA;
mat.Matrix_B = thread_matrixB;
mat.n = n;
mat.begin = m_begin;
mat.end = m_end;
// create threads
pthread_create(&thread_lord[make_thread], NULL, calculon, (void *)&mat);
m_begin = (m_end + 1);
}
// wait for thread to finish before joining
for(i = 0; i < m; i++)
pthread_join(thread_lord[i], NULL);
// end timing
getrusage(RUSAGE_SELF, &r_usage);
cpu_time2 = r_usage.ru_utime;
sys_time2 = r_usage.ru_stime;
gettimeofday(&time2, NULL);
printf("\nUser time: %f seconds\n", ((cpu_time2.tv_sec * 1000000 + cpu_time2.tv_usec) - (cpu_time1.tv_sec * 1000000 + cpu_time1.tv_usec))/1e6);
printf("System time: %f seconds\n", ((sys_time2.tv_sec * 1000000 + sys_time2.tv_usec) - (sys_time1.tv_sec * 1000000 + sys_time1.tv_usec))/1e6);
printf("Wallclock time: %f seconds\n\n", ((time2.tv_sec * 1000000 + time2.tv_usec) - (time1.tv_sec * 1000000 + time1.tv_usec))/1e6);
// deallocate matrices
for(pholder = 0; pholder < n; pholder++) {
free(thread_matrixA[pholder]);
free(thread_matrixB[pholder]);
}
free(thread_matrixA);
free(thread_matrixB);
}
return 0;
}
Timing
My guess is that all those malloc()s you're using in the individual threads take a lot more time than you save by splitting the calculation among the threads. Math is fast; malloc() is slow. (To oversimplify a bit)
Sometimes weird timing behavior with threads happens when you have multiple threads trying to access a shared resource which is protected by some kind of exclusive lock. (Example, from something I did a long time ago) But I don't think that's the case here because, first of all, you don't seem to be using any locks, and second, the timing pattern that results typically has the runtime increasing by a little bit as you increase the number of threads. In this case, your runtime increases as you increase the number of threads, by a lot (specifically: it seems to be related to the thread number), so I suspect per-thread resource usage to be the culprit.
That being said, I'm having a hard time confirming my guess, so I can't be sure about this.
Related
The site codewars.com has a task "Sum of intervals".
https://www.codewars.com/kata/52b7ed099cdc285c300001cd
The bottom line is to find the sum of the intervals, taking into account the overlap.
For example:
sum_intervals((const struct interval[]){
{1,4},
{7, 10},
{3, 5}
}, 3); /* => 7 */
The sum of the numbers on the intervals {1,4}, {7,10}, {3,5} is equal to 7. It should be taken into account that the intervals {1,4} and {3,5} overlap.
I'm doing this task in C:
struct interval {
int first;
int second;
};
int sum_intervals(const struct interval* intervals, const size_t ints_size)
{
int seq_size = 0;
for (unsigned int i = 0; i < ints_size; i++)
seq_size += intervals[i].second - intervals[i].first;
int* sequence = malloc(seq_size * sizeof(int));
int iter = 0;
for (unsigned int i= 0; i < ints_size; i++) {
int k = intervals[i].second;
for (int j = intervals[i].first; j < k; j++) {
sequence[iter] = j;
iter++;
}
}
int unq_seq_size = seq_size;
qsort(sequence, seq_size, sizeof(int), compare);
for (int i = 0; i < seq_size - 1; i++)
if (sequence[i] == sequence[i + 1]) unq_seq_size--;
free(sequence);
return unq_seq_size;
}
int compare(const void* x1, const void* x2) {
return (*(int*)x1 - *(int*)x2);
}
First, I determine what size array is needed to store all the numbers in the intervals by calculating int seq_size. Then I allocate memory for the int*sequency array, after which I fill it with numbers between the boundaries of each of the intervals. Next, I sort the array, after which, to find the sum, it will be sufficient to compare neighboring elements for equality and, in case of equality, reduce the sum int unq_seq_size.
The code satisfies the tests, but is further considered a failure because "Execution Timed Out (12000 ms)". Help me optimize the code, or suggest another approach?
I calculated the execution time of the function using the following construct:
float startTime = (float) clock()/CLOCK_PER_SEC;
/* Do work */
float endTime = (float) clock()/CLOCK_PER_SEC;
float timeElapsed = endTime - startTime;
As a result, int timeElapsed is equal to 0.004000. Next, I applied this construction to individual blocks and got that all this time is spent on sorting:
float startTime = (float)clock() / CLOCKS_PER_SEC;
qsort(sequence, seq_size, sizeof(int), compare);
float endTime = (float)clock() / CLOCKS_PER_SEC;
float timeElapsed = endTime - startTime;
printf("%f",timeElapsed ); //0.004000
Also, at the end of the assignment there is a similar text:
"Random tests"
Up to 32 intervals from the range [-10^9, 10^9].
Can these 0.004000 at the interval [-10^9, 10^9] give "Execution Timed Out (12000 ms)"?
You solution is too slow effectively, as it is related to the range of data, which may be huge.
If n is the number of intervals, here is a O(n logn) solution.
Sort the intervals according to the start of them, and if equal to the end of them
Perform an iterative linear examination of the intervals as follows:
sum = 0
current_start = interval[0].first
current_end = interval[0].second
Do i = 1 to n-1
if (interval[i].first > current_end) then
sum += current_end - current_start
current_start = interval[i].first
current_end = interval[i].second
else
current_end = max (current_end, interval[i].second)
sum += current_end - current_start
For Damien'c answer:
#include <stdlib.h>
#include <stdio.h>
typedef struct interval {
int first;
int second;
}interval;
int compare(const void* x1, const void* x2);
int sum_intervals(struct interval* intervals, size_t ints_size);
int main()
{
printf("sum: %d", sum_intervals((struct interval[])
{
{1,5},{8,11}, {2,7}
}, 3));
return 0;
}
int compare(const void* x1, const void* x2) {
return ((interval*)x1)->first - ((interval*)x2)->first;
}
int sum_intervals(struct interval* intervals, size_t ints_size) {
qsort(intervals,ints_size, sizeof(interval),compare);
for (int i = 0; i < ints_size; i++) {
printf("%d", intervals[i].first);
printf(" %d\n", intervals[i].second);
}
int current_first = intervals[0].first;
int current_second = intervals[0].second;
int sum = 0;
for (int i = 1; i < ints_size-1; i++) {
if (current_second < intervals[i].first) {
sum += current_second - current_first;
current_first = intervals[i].first;
current_second = intervals[i].second;
} else current_second = max(current_second, intervals[i].second);
}
sum += current_second - current_first;
return sum;
}
Am I wrong somewhere? Because the answer is 6.
1 5
2 7
8 11
sum: 6
I'm testing performance of ?GEMM, ?TRMM, ?TRSM using MKL's automatic offload on the new Intel Xeon Phi coprocessors and am having some issues with DTRMM and DTRSM. I have code to test the performance for matrix size in steps of 1024 up to 10240 and performance seems to drop off significantly somewhere after N=M=K=8192. When I try testing exactly where by using step sizes of 2, my script was hanging. I then checked 512 step sizes, which work fine, 256 work as well, but anything under 256 just stalls. I cannot find any known issues in regards to this problem. All single precision versions work, as well as single and double precision on ?GEMM. Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdint.h>
#include <time.h>
#include "mkl.h"
#define DBG 0
int main(int argc, char **argv)
{
char transa = 'N', side = 'L', uplo = 'L', diag = 'U';
MKL_INT N, NP; // N = M, N, K, lda, ldb, ldc
double alpha = 1.0; // Scaling factors
double *A, *B; // Matrices
int matrix_bytes; // Matrix size in bytes
int matrix_elements; // Matrix size in elements
int i, j; // Counters
int msec;
clock_t start, diff;
N = atoi(argv[1]);
start = clock();
matrix_elements = N * N;
matrix_bytes = sizeof(double) * matrix_elements;
// Allocate the matrices
A = malloc(matrix_bytes);
if (A == NULL)
{
printf("Could not allocate matrix A\n");
return -1;
}
B = malloc(matrix_bytes);
if (B == NULL)
{
printf("Could not allocate matrix B\n");
return -1;
}
for (i = 0; i < matrix_elements; i++)
{
A[i] = 0.0;
B[i] = 0.0;
}
// Initialize the matrices
for (i = 0; i < N; i++)
for (j = 0; j <= i; j++)
{
A[i+N*j] = 1.0;
B[i+N*j] = 2.0;
}
// DTRMM call
dtrmm(&side, &uplo, &transa, &diag, &N, &N, &alpha, A, &N, B, &N);
diff = clock() - start;
msec = diff * 1000 / CLOCKS_PER_SEC;
printf("%f\n", (float)msec * 10e-4);
if (DBG == 1)
{
printf("\nMatrix dimension is set to %d \n\n", (int)N);
// Display the result
printf("\nResulting matrix B:\n");
if (N > 10)
{
printf("NOTE: B is too large, print only upper-left 10x10 block...\n");
NP = 10;
}
else
NP = N;
printf("\n");
for (i = 0; i < NP; i++)
{
for (j = 0; j < NP; j++)
printf("%7.3f ", B[i + j * N]);
printf("\n");
}
}
// Free the matrix memory
free(A);
free(B);
return 0;
}
Any help or insight would be greatly appreciated.
This phenomenon has been extensively discussed in other questions, and also in Intel's Software Optimization Manual and Agner Fog's notes.
Typically, you are experiencing a perfect storm of evictions in the memory hierarchy, such that suddenly (nearly) every single access misses cache and/or TLB (one can determine exactly which resource is missing by looking at the specific data access pattern or by using the PMCs; I can do the calculation later when I'm near a whiteboard, unless mystical gets to you first).
You can also search through some of my or Mystical's answers to find previous answers.
The issue was an older version of Intel's icc compiler (beta 10 update, I believe.. maybe). Gold update works like a charm.
/*
Matricefilenames:
small matrix A.bin of dimension 100 × 50
small matrix B.bin of dimension 50 × 100
large matrix A.bin of dimension 1000 × 500
large matrix B.bin of dimension 500 × 1000
An MPI program should be implemented such that it can
• accept two file names at run-time,
• let process 0 read the A and B matrices from the two data files,
• let process 0 distribute the pieces of A and B to all the other processes,
• involve all the processes to carry out the the chosen parallel algorithm
for matrix multiplication C = A * B ,
• let process 0 gather, from all the other processes, the different pieces
of C ,
• let process 0 write out the entire C matrix to a data file.
*/
int main(int argc, char *argv[]) {
printf("Oblig 2 \n");
double **matrixa;
double **matrixb;
int ma,na,my_ma,my_na;
int mb,nb,my_mb,my_nb;
int i,j,k;
int myrank,numprocs;
int konstanta,konstantb;
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
if(myrank==0) {
read_matrix_binaryformat ("small_matrix_A.bin", &matrixa, &ma, &na);
read_matrix_binaryformat ("small_matrix_B.bin", &matrixb, &mb, &nb);
}
//mpi broadcast
MPI_Bcast(&ma,1,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&mb,1,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&na,1,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&nb,1,MPI_INT,0,MPI_COMM_WORLD);
fflush(stdout);
int resta = ma % numprocs;//rest antall som har den største verdien
//int restb = mb % numprocs;
if (myrank == 0) {
printf("ma : %d",ma);
fflush(stdout);
printf("mb : %d",mb);
fflush(stdout);
}
MPI_Barrier(MPI_COMM_WORLD);
if (resta == 0) {
my_ma = ma / numprocs;
printf("null rest\n ");
fflush(stdout);
} else {
if (myrank < resta) {
my_ma = ma / numprocs + 1;//husk + 1
} else {
my_ma = ma / numprocs; //heltalls divisjon gir nedre verdien !
}
}
my_na = na;
my_nb = nb;
double **myblock = malloc(my_ma*sizeof(double*));
for(i=0;i<na;i++) {
myblock[i] = malloc(my_na*sizeof(double));
}
//send_cnt for scatterv
//________________________________________________________________________________________________________________________________________________
int* send_cnta = (int*)malloc(numprocs*sizeof(int));//array med antall elementer sendt til hver prosess array[i] = antall elementer , i er process
int tot_elemsa = my_ma*my_na;
MPI_Allgather(&tot_elemsa,1,MPI_INT,&send_cnta[0],1,MPI_INT,MPI_COMM_WORLD);//arrays i c må sendes &array[0]
//send_disp for scatterv
//__________________________________________________________________________________
int* send_dispa = (int*)malloc(numprocs*sizeof(int)); //hvorfor trenger disp
// int* send_dispb = (int*)malloc(numprocs*sizeof(int));
//disp hvor i imagechars første element til hver prosess skal til
fflush(stdout);
if(resta==0) {
send_dispa[myrank]=myrank*my_ma*my_na;
} else if(myrank<=resta) {
if(myrank<resta) {
send_dispa[myrank]=myrank*my_ma*my_na;
} else {//my_rank == rest
send_dispa[myrank]=myrank*(my_ma+1)*my_na;
konstanta=myrank*(my_ma+1)*my_na;
}
}
MPI_Bcast(&konstanta,1,MPI_INT,resta,MPI_COMM_WORLD);
if (myrank>resta){
send_dispa[myrank]=((myrank-resta)*(my_ma*my_na))+konstanta;
}
MPI_Allgather(&send_dispa[myrank],1,MPI_INT,&send_dispa[0],1,MPI_INT,MPI_COMM_WORLD);
//___________________________________________________________________________________
printf("print2: %d" , myrank);
fflush(stdout);
//recv_buffer for scatterv
double *recv_buffera=malloc((my_ma*my_na)*sizeof(double));
MPI_Scatterv(&matrixa[0], &send_cnta[0], &send_dispa[0], MPI_UNSIGNED_CHAR, &recv_buffera[0], my_ma*my_na, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);
for(i=0; i<my_ma; i++) {
for(j=0; j<my_na; j++) {
myblock[i][j]=recv_buffera[i*my_na + j];
}
}
MPI_Finalize();
return 0;
}
OLD:I get three type of errors. I can get scatterv count error, segmentationfault 11, or the processes just get stuck. It seems to be random which error I get. I run the code with 2 procs each time. When it gets stuck it gets stuck before the printf("print2: %d" , myrank);. When my friend runs the code on his own computer also with two prosesses, he does not get past by the first MPI_Bcast. Nothing is printed out when he runs it. Here is a link for the errors I get: http://justpaste.it/zs0
UPDATED PROBLEM: Now I get only a segmentation fault after " printf("print2: %d" , myrank); " before the scatterv call. EVEN if I remove all the code after the printf statement I get the segmentation fault, but only if I run the code for more than two procs.
I'm having a little difficulty tracing what you were trying to do. I think you're making the scatterv call more complicated than it needs to be though. Here's a snippet I had from a similar assignment this year. Hopefully it's a clearer example of how scatterv works.
/*********************************************************************
* Scatter A to All Processes
* - Using Scatterv for versatility.
*********************************************************************/
int *send_counts; // Send Counts
int *displacements; // Send Offsets
int chunk; // Number of Rows per Process (- Root)
int chunk_size; // Number of Doubles per Chunk
int remainder; // Number of Rows for Root Process
double * rbuffer; // Receive Buffer
// Do Some Math
chunk = m / (p - 1);
remainder = m % (p - 1);
chunk_size = chunk * n;
// Setup Send Counts
send_counts = malloc(p * sizeof(int));
send_counts[0] = remainder * n;
for (i = 1; i < p; i++)
send_counts[i] = chunk_size;
// Setup Displacements
displacements = malloc(p * sizeof(int));
displacements[0] = 0;
for (i = 1; i < p; i++)
displacements[i] = (remainder * n) + ((i - 1) * chunk_size);
// Allocate Receive Buffer
rbuffer = malloc(send_counts[my_rank] * sizeof(double));
// Scatter A Over All Processes!
MPI_Scatterv(A, // A
send_counts, // Array of counts [int]
displacements, // Array of displacements [int]
MPI_DOUBLE, // Sent Data Type
rbuffer, // Receive Buffer
send_counts[my_rank], // Receive Count - Per Process
MPI_DOUBLE, // Received Data Type
root, // Root
comm); // Comm World
MPI_Barrier(comm);
Also, this causes a segfault on my machine, no mpi... Pretty sure it's the way myblock is being allocated. You should do what #Hristo suggested in the comments. Allocate both matrices and the resultant matrix as flat arrays. That would eliminate the use of double pointers and make your life a whole lot simpler.
#include <stdio.h>
#include <stdlib.h>
void main ()
{
int na = 5;
int my_ma = 5;
int my_na = 5;
int i;
int j;
double **myblock = malloc(my_ma*sizeof(double*));
for(i=0;i<na;i++) {
myblock = malloc(my_na*sizeof(double));
}
unsigned char *recv_buffera=malloc((my_ma*my_na)*sizeof(unsigned char));
for(i=0; i<my_ma; i++) {
for(j=0; j<my_na; j++) {
myblock[i][j]=(float)recv_buffera[i*my_na + j];
}
}
}
Try allocating more like this:
// Allocate A, b, and y. Generate random A and b
double *buff=0;
if (my_rank==0)
{
int A_size = m*n, b_size = n, y_size = m;
int size = (A_size+b_size+y_size)*sizeof(double);
buff = (double*)malloc(size);
if (buff==NULL)
{
printf("Process %d failed to allocate %d bytes\n", my_rank, size);
MPI_Abort(comm,-1);
return 1;
}
// Set pointers
A = buff; b = A+m*n; y = b+n;
// Generate matrix and vector
genMatrix(m, n, A);
genVector(n, b);
}
I'm new to multithreading and try to learn it through a simple program, which adds 1 to n and return the sum. In the sequential case, the main call the sumFrom1 function twice for n = 1e5 and 2e5; in the multithreaded cases, two threads are created using pthread_create and two sums are calculated in separate thread. The multithreadting version is much slower than the sequential version (see results below). I run this on a 12-CPU platform and there are no communication between threads.
Multithreaded:
Thread 1 returns: 0
Thread 2 returns: 0
sum of 1..10000: 50005000
sum of 1..20000: 200010000
time: 156 seconds
Sequential:
sum of 1..10000: 50005000
sum of 1..20000: 200010000
time: 56 seconds
When I add -O2 in compilation, the time of multithreaded version (9s) is less than that of sequential version (11s), but not much as I expect. I can always have the -O2 flag on but I'm curious about the low speed of multithreading in the unoptimized case. Should it be slower than sequential version? If not, what can I do to make it faster?
The code:
#include <stdio.h>
#include <pthread.h>
#include <time.h>
typedef struct my_struct
{
int n;
int sum;
}my_struct_t;
void *sumFrom1(void* sit)
{
my_struct_t* local_sit = (my_struct_t*) sit;
int i;
int nsim = 500000; // Loops for consuming time
int j;
for(j = 0; j < nsim; j++)
{
local_sit->sum = 0;
for(i = 0; i <= local_sit->n; i++)
local_sit->sum += i;
}
}
int main(int argc, char *argv[])
{
pthread_t thread1;
pthread_t thread2;
my_struct_t si1;
my_struct_t si2;
int iret1;
int iret2;
time_t t1;
time_t t2;
si1.n = 10000;
si2.n = 20000;
if(argc == 2 && atoi(argv[1]) == 1) // Use "./prog 1" to test the time of multithreaded version
{
t1 = time(0);
iret1 = pthread_create(&thread1, NULL, sumFrom1, (void*)&si1);
iret2 = pthread_create(&thread2, NULL, sumFrom1, (void*)&si2);
pthread_join(thread1, NULL);
pthread_join(thread2, NULL);
t2 = time(0);
printf("Thread 1 returns: %d\n",iret1);
printf("Thread 2 returns: %d\n",iret2);
printf("sum of 1..%d: %d\n", si1.n, si1.sum);
printf("sum of 1..%d: %d\n", si2.n, si2.sum);
printf("time: %d seconds", t2 - t1);
}
else // Use "./prog" to test the time of sequential version
{
t1 = time(0);
sumFrom1((void*)&si1);
sumFrom1((void*)&si2);
t2 = time(0);
printf("sum of 1..%d: %d\n", si1.n, si1.sum);
printf("sum of 1..%d: %d\n", si2.n, si2.sum);
printf("time: %d seconds", t2 - t1);
}
return 0;
}
UPDATE1:
After a little googling on "false sharing" (Thanks, #Martin James!), I think it is the main cause. There are (at least) two ways to fix it:
The first way is inserting a buffer zone between the two structs (Thanks, #dasblinkenlight):
my_struct_t si1;
char memHolder[4096];
my_struct_t si2;
Without -O2, the time consuming decreases from ~156s to ~38s.
The second way is avoiding frequently updating sit->sum, which can be realized using a temp variable in sumFrom1 (as #Jens Gustedt replied):
for(int sum = 0, j = 0; j < nsim; j++)
{
sum = 0;
for(i = 0; i <= local_sit->n; i++)
sum += i;
}
local_sit->sum = sum;
Without -O2, the time consuming decreases from ~156s to ~35s or ~109s (It has two peaks! I don't know why.). With -O2, the time consuming stays ~8s.
By modifying your code to
typedef struct my_struct
{
size_t n;
size_t sum;
}my_struct_t;
void *sumFrom1(void* sit)
{
my_struct_t* local_sit = sit;
size_t nsim = 500000; // Loops for consuming time
size_t n = local_sit->n;
size_t sum = 0;
for(size_t j = 0; j < nsim; j++)
{
for(size_t i = 0; i <= n; i++)
sum += i;
}
local_sit->sum = sum;
return 0;
}
the phenomenon disappears. The problems you had:
using int as a datatype is completely wrong for such a test. Your
figures where such that the sum overflowed. Overflow of signed types is undefined behavior. You are lucky that it didn't eat your lunch.
having bounds and summation variables with indirection buys you
additional loads and stores, that in case of -O0 are really done as
such, with all the implications of false sharing and stuff like that.
Your code also observed other errors:
a missing include for atoi
superflouous cast to and from void*
printing of time_t as int
Please compile your code with -Wall before posting.
I can't seem to get a simple program (with lots of memory access) to achieve consistent timing in Linux. I'm using a 2.6 kernel, and the program is being run on a dual-core processor with realtime priority. I'm trying to disable cache effects by declaring the memory arrays as volatile. Below are the results and the program. What are some possible sources of the outliers?
Results:
Number of trials: 100
Range: 0.021732s to 0.085596s
Average Time: 0.058094s
Standard Deviation: 0.006944s
Extreme Outliers (2 SDs away from mean): 7
Average Time, excluding extreme outliers: 0.059273s
Program:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sched.h>
#include <sys/time.h>
#define NUM_POINTS 5000000
#define REPS 100
unsigned long long getTimestamp() {
unsigned long long usecCount;
struct timeval timeVal;
gettimeofday(&timeVal, 0);
usecCount = timeVal.tv_sec * (unsigned long long) 1000000;
usecCount += timeVal.tv_usec;
return (usecCount);
}
double convertTimestampToSecs(unsigned long long timestamp) {
return (timestamp / (double) 1000000);
}
int main(int argc, char* argv[]) {
unsigned long long start, stop;
double times[REPS];
double sum = 0;
double scale, avg, newavg, median;
double stddev = 0;
double maxval = -1.0, minval = 1000000.0;
int i, j, freq, count;
int outliers = 0;
struct sched_param sparam;
sched_getparam(getpid(), &sparam);
sparam.sched_priority = sched_get_priority_max(SCHED_FIFO);
sched_setscheduler(getpid(), SCHED_FIFO, &sparam);
volatile float* data;
volatile float* results;
data = calloc(NUM_POINTS, sizeof(float));
results = calloc(NUM_POINTS, sizeof(float));
for (i = 0; i < REPS; ++i) {
start = getTimestamp();
for (j = 0; j < NUM_POINTS; ++j) {
results[j] = data[j];
}
stop = getTimestamp();
times[i] = convertTimestampToSecs(stop-start);
}
free(data);
free(results);
for (i = 0; i < REPS; i++) {
sum += times[i];
if (times[i] > maxval)
maxval = times[i];
if (times[i] < minval)
minval = times[i];
}
avg = sum/REPS;
for (i = 0; i < REPS; i++)
stddev += (times[i] - avg)*(times[i] - avg);
stddev /= REPS;
stddev = sqrt(stddev);
for (i = 0; i < REPS; i++) {
if (times[i] > avg + 2*stddev || times[i] < avg - 2*stddev) {
sum -= times[i];
outliers++;
}
}
newavg = sum/(REPS-outliers);
printf("Number of trials: %d\n", REPS);
printf("Range: %fs to %fs\n", minval, maxval);
printf("Average Time: %fs\n", avg);
printf("Standard Deviation: %fs\n", stddev);
printf("Extreme Outliers (2 SDs away from mean): %d\n", outliers);
printf("Average Time, excluding extreme outliers: %fs\n", newavg);
return 0;
}
Make sure you have no other processes taking CPU time. Watch out in particular for screen savers and anything which regularly does GUI updates (e.g. a clock or similar). Try setting CPU affinity for your benchmark process to lock it onto one core (e.g. taskset from the command line). Make your benchmark process if not paging - typically you want to have an outer loop which runs N times and then time the last N-1 executions.