I can't seem to get a simple program (with lots of memory access) to achieve consistent timing in Linux. I'm using a 2.6 kernel, and the program is being run on a dual-core processor with realtime priority. I'm trying to disable cache effects by declaring the memory arrays as volatile. Below are the results and the program. What are some possible sources of the outliers?
Results:
Number of trials: 100
Range: 0.021732s to 0.085596s
Average Time: 0.058094s
Standard Deviation: 0.006944s
Extreme Outliers (2 SDs away from mean): 7
Average Time, excluding extreme outliers: 0.059273s
Program:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sched.h>
#include <unistd.h> /* getpid() */
#include <sys/time.h>
#define NUM_POINTS 5000000
#define REPS 100
unsigned long long getTimestamp() {
    unsigned long long usecCount;
    struct timeval timeVal;
    gettimeofday(&timeVal, 0);
    usecCount = timeVal.tv_sec * (unsigned long long) 1000000;
    usecCount += timeVal.tv_usec;
    return (usecCount);
}

double convertTimestampToSecs(unsigned long long timestamp) {
    return (timestamp / (double) 1000000);
}
int main(int argc, char* argv[]) {
    unsigned long long start, stop;
    double times[REPS];
    double sum = 0;
    double scale, avg, newavg, median;
    double stddev = 0;
    double maxval = -1.0, minval = 1000000.0;
    int i, j, freq, count;
    int outliers = 0;
    struct sched_param sparam;

    // raise to maximum real-time FIFO priority (usually needs root)
    sched_getparam(getpid(), &sparam);
    sparam.sched_priority = sched_get_priority_max(SCHED_FIFO);
    sched_setscheduler(getpid(), SCHED_FIFO, &sparam);

    volatile float* data;
    volatile float* results;
    data = calloc(NUM_POINTS, sizeof(float));
    results = calloc(NUM_POINTS, sizeof(float));

    for (i = 0; i < REPS; ++i) {
        start = getTimestamp();
        for (j = 0; j < NUM_POINTS; ++j) {
            results[j] = data[j];
        }
        stop = getTimestamp();
        times[i] = convertTimestampToSecs(stop - start);
    }

    free((void *) data);      // cast drops the volatile qualifier for free()
    free((void *) results);

    for (i = 0; i < REPS; i++) {
        sum += times[i];
        if (times[i] > maxval)
            maxval = times[i];
        if (times[i] < minval)
            minval = times[i];
    }
    avg = sum / REPS;
    for (i = 0; i < REPS; i++)
        stddev += (times[i] - avg) * (times[i] - avg);
    stddev /= REPS;
    stddev = sqrt(stddev);
    for (i = 0; i < REPS; i++) {
        if (times[i] > avg + 2 * stddev || times[i] < avg - 2 * stddev) {
            sum -= times[i];
            outliers++;
        }
    }
    newavg = sum / (REPS - outliers);

    printf("Number of trials: %d\n", REPS);
    printf("Range: %fs to %fs\n", minval, maxval);
    printf("Average Time: %fs\n", avg);
    printf("Standard Deviation: %fs\n", stddev);
    printf("Extreme Outliers (2 SDs away from mean): %d\n", outliers);
    printf("Average Time, excluding extreme outliers: %fs\n", newavg);
    return 0;
}
Make sure you have no other processes taking CPU time. Watch out in particular for screen savers and anything which regularly does GUI updates (e.g. a clock or similar). Try setting CPU affinity for your benchmark process to lock it onto one core (e.g. with taskset from the command line). Make sure your benchmark process is not paging: typically you want an outer loop which runs N times, and then you time only the last N-1 executions, so that the first pass warms the caches and faults in the pages.
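On the affinity point, the same thing can be done from inside the program with the Linux-specific sched_setaffinity(). A minimal sketch (pin_to_core is a helper name of my choosing, with error handling reduced to perror()):

#define _GNU_SOURCE   /* for cpu_set_t, CPU_ZERO/CPU_SET and sched_setaffinity() */
#include <sched.h>
#include <stdio.h>

/* Pin the calling process to a single core so the benchmark
   cannot migrate between cores mid-run. */
int pin_to_core(int core)
{
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(core, &set);
    if (sched_setaffinity(0, sizeof(set), &set) != 0) {  /* pid 0 = calling process */
        perror("sched_setaffinity");
        return -1;
    }
    return 0;
}

From the command line, taskset -c 0 ./benchmark achieves the same effect without code changes.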
Related
I have a question about multithreading.
This code is a simple example comparing a single thread against multiple threads (summing 0 to 400,000,000 with a single thread vs. four threads).
// Single
#include <pthread.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>   // clock_gettime(), struct timespec

#define NUM_THREAD 4
#define MY_NUM 100000000

void* calcThread(void* param);

double total = 0;
double sum[NUM_THREAD] = { 0, };

int main() {
    long p[NUM_THREAD] = { MY_NUM, MY_NUM * 2, MY_NUM * 3, MY_NUM * 4 };
    int i;
    long total_nstime;
    struct timespec begin, end;
    pthread_t tid[NUM_THREAD];
    pthread_attr_t attr[NUM_THREAD];

    clock_gettime(CLOCK_MONOTONIC, &begin);
    for (i = 0; i < NUM_THREAD; i++) {
        calcThread((void*)p[i]);
    }
    for (i = 0; i < NUM_THREAD; i++) {
        total += sum[i];
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    printf("total = %lf\n", total);
    total_nstime = (end.tv_sec - begin.tv_sec) * 1000000000 + (end.tv_nsec - begin.tv_nsec);
    printf("%.3fs\n", (float)total_nstime / 1000000000);
    return 0;
}

void* calcThread(void* param) {
    int i;
    long to = (long)(param);
    int from = to - MY_NUM + 1;
    int th_num = from / MY_NUM;

    for (i = from; i <= to; i++)
        sum[th_num] += i;
    return NULL;   // a void* thread function must return a value
}
I wanted to change it to use four threads, so I changed the calculation to run on multiple threads:
...
int main() {
    ...
    // create threads
    for (i = 0; i < NUM_THREAD; i++) {
        pthread_attr_init(&attr[i]);
        pthread_create(&tid[i], &attr[i], calcThread, (void *)p[i]);
    }
    // wait for all threads to finish
    for (i = 0; i < NUM_THREAD; i++) {
        pthread_join(tid[i], NULL);
    }
    for (i = 0; i < NUM_THREAD; i++) {
        total += sum[i];
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    ...
}
Result (in Ubuntu): [timing screenshot from the original post]
But it's slower than the single-threaded code. I thought multithreading was supposed to be faster.
I have no idea what's wrong with this :( Could you give me some advice? Thanks a lot!
"I know MultiThread is faster"
This isn't always the case, as generally you would be CPU bound in some way, whether that be due to core count, how it is scheduled at the OS level, and hardware level.
It is a balance how many threads is worth giving to a process, as you may run into an old Linux problem where you would be spending more time scheduling the processes than actually running them.
As this is very hardware and OS dependant, it is difficult to say exactly what the issue may be, but make sure you have the appropriate microcode for your CPU installed (generally installed by default in Ubuntu), but just in case, try:
sudo apt-get install intel-microcode
Otherwise, look at what other processes are running; it may be that a lot of other things are sharing the cores allocated to your process.
The site codewars.com has a task "Sum of intervals".
https://www.codewars.com/kata/52b7ed099cdc285c300001cd
The task is to find the total length covered by a set of intervals, taking overlap into account.
For example:
sum_intervals((const struct interval[]){
    {1, 4},
    {7, 10},
    {3, 5}
}, 3); /* => 7 */
The total length covered by the intervals {1,4}, {7,10}, {3,5} is 7; note that the intervals {1,4} and {3,5} overlap.
I'm doing this task in C:
#include <stdlib.h>   /* malloc(), free(), qsort() */

struct interval {
    int first;
    int second;
};

int compare(const void* x1, const void* x2);

int sum_intervals(const struct interval* intervals, const size_t ints_size)
{
    int seq_size = 0;
    for (unsigned int i = 0; i < ints_size; i++)
        seq_size += intervals[i].second - intervals[i].first;

    int* sequence = malloc(seq_size * sizeof(int));
    int iter = 0;
    for (unsigned int i = 0; i < ints_size; i++) {
        int k = intervals[i].second;
        for (int j = intervals[i].first; j < k; j++) {
            sequence[iter] = j;
            iter++;
        }
    }

    int unq_seq_size = seq_size;
    qsort(sequence, seq_size, sizeof(int), compare);
    for (int i = 0; i < seq_size - 1; i++)
        if (sequence[i] == sequence[i + 1]) unq_seq_size--;

    free(sequence);
    return unq_seq_size;
}

int compare(const void* x1, const void* x2) {
    return (*(int*)x1 - *(int*)x2);
}
First, I determine what size of array is needed to store all the numbers in the intervals by computing seq_size. Then I allocate memory for the int* sequence array and fill it with every number between the boundaries of each interval. Next, I sort the array; to find the total it then suffices to compare neighboring elements for equality and decrement unq_seq_size on each duplicate.
The code passes the fixed tests, but the submission then fails with "Execution Timed Out (12000 ms)". Help me optimize the code, or suggest another approach.
I calculated the execution time of the function using the following construct:
float startTime = (float) clock() / CLOCKS_PER_SEC;
/* Do work */
float endTime = (float) clock() / CLOCKS_PER_SEC;
float timeElapsed = endTime - startTime;
As a result, float timeElapsed comes out to 0.004000. Next, I applied this construction to individual blocks and found that all of this time is spent on sorting:
float startTime = (float)clock() / CLOCKS_PER_SEC;
qsort(sequence, seq_size, sizeof(int), compare);
float endTime = (float)clock() / CLOCKS_PER_SEC;
float timeElapsed = endTime - startTime;
printf("%f",timeElapsed ); //0.004000
Also, at the end of the task description there is this note:
"Random tests"
Up to 32 intervals from the range [-10^9, 10^9].
Can this 0.004000 s, once the intervals span [-10^9, 10^9], really turn into "Execution Timed Out (12000 ms)"?
Your solution is effectively too slow because its cost depends on the range of the data, which may be huge.
If n is the number of intervals, here is an O(n log n) solution:
Sort the intervals by their start, breaking ties by their end.
Then perform one linear pass over the sorted intervals, as follows (a C sketch is given after the pseudocode):
sum = 0
current_start = interval[0].first
current_end = interval[0].second
Do i = 1 to n-1
    if (interval[i].first > current_end) then
        sum += current_end - current_start
        current_start = interval[i].first
        current_end = interval[i].second
    else
        current_end = max(current_end, interval[i].second)
sum += current_end - current_start
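A direct C translation of that pseudocode might look like this (a sketch, assuming a non-empty array that may be reordered in place; sum_intervals_sketch and cmp_start are names I've chosen, not from the kata):

#include <stdlib.h>

struct interval { int first; int second; };

static int cmp_start(const void *a, const void *b)
{
    const struct interval *x = a, *y = b;
    if (x->first != y->first)
        return (x->first > y->first) - (x->first < y->first);
    return (x->second > y->second) - (x->second < y->second);
}

int sum_intervals_sketch(struct interval *iv, size_t n)
{
    qsort(iv, n, sizeof *iv, cmp_start);
    int sum = 0;
    int cur_start = iv[0].first;
    int cur_end = iv[0].second;
    for (size_t i = 1; i < n; i++) {
        if (iv[i].first > cur_end) {          /* disjoint: close the current run */
            sum += cur_end - cur_start;
            cur_start = iv[i].first;
            cur_end = iv[i].second;
        } else if (iv[i].second > cur_end) {  /* overlapping: extend the run */
            cur_end = iv[i].second;
        }
    }
    return sum + (cur_end - cur_start);       /* close the last run */
}

The comparator uses relational operators rather than subtraction, since a difference such as x->first - y->first can overflow int when the endpoints are near ±10^9.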
Following Damien's answer, here is my implementation:
#include <stdlib.h>
#include <stdio.h>

/* max() is not in standard C; define it here */
#define max(a, b) ((a) > (b) ? (a) : (b))

typedef struct interval {
    int first;
    int second;
} interval;

int compare(const void* x1, const void* x2);
int sum_intervals(struct interval* intervals, size_t ints_size);

int main()
{
    printf("sum: %d", sum_intervals((struct interval[])
    {
        {1, 5}, {8, 11}, {2, 7}
    }, 3));
    return 0;
}

int compare(const void* x1, const void* x2) {
    return ((interval*)x1)->first - ((interval*)x2)->first;
}

int sum_intervals(struct interval* intervals, size_t ints_size) {
    qsort(intervals, ints_size, sizeof(interval), compare);
    for (int i = 0; i < ints_size; i++) {
        printf("%d", intervals[i].first);
        printf(" %d\n", intervals[i].second);
    }

    int current_first = intervals[0].first;
    int current_second = intervals[0].second;
    int sum = 0;
    for (int i = 1; i < ints_size - 1; i++) {
        if (current_second < intervals[i].first) {
            sum += current_second - current_first;
            current_first = intervals[i].first;
            current_second = intervals[i].second;
        } else current_second = max(current_second, intervals[i].second);
    }
    sum += current_second - current_first;
    return sum;
}
Am I going wrong somewhere? The answer comes out as 6:
1 5
2 7
8 11
sum: 6
I made a program that multiplies matrices of the same dimension using (p)threads. The program accepts command line flags -N n -M m where n is the size of the matrix arrays and m is the number of threads (computing threshold). The program compiles and runs but I get strange times for elapsed time, USR time, SYS time, and USR+SYS time. I am testing sizes n = {1000,2000,4000} with each threshold m = {1,2,4}.
I should be seeing the elapsed time reduced and a fairly constant USR+SYS time for each value of n but that is not the case. The output will fluctuate but the problem is that a higher threshold doesn't result in a reduction of elapsed time. Am I implementing threads wrong or is there an issue with my timing?
compile with: -pthread
./* -N n -M m
main
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/time.h>
#include <sys/resource.h>

struct matrix {
    double **Matrix_A;
    double **Matrix_B;
    int begin;
    int end;
    int n;
};

void *calculon(void *mtrx) {
    struct matrix *f_mat = (struct matrix *)mtrx;
    // transfer data
    int f_begin = f_mat->begin;
    int f_end = f_mat->end;
    int f_n = f_mat->n;
    // definition of temp matrix
    double **Matrix_C;
    Matrix_C = (double**)malloc(sizeof(double)*f_n);
    int f_pholder;
    for (f_pholder = 0; f_pholder < f_n; f_pholder++)
        Matrix_C[f_pholder] = (double*)malloc(sizeof(double)*f_n);
    int x, y, z;
    for (x = f_begin; x < f_end; x++)
        for (y = 0; y < f_n; y++)
            for (z = 0; z < f_n; z++)
                Matrix_C[x][y] += f_mat->Matrix_A[x][z] * f_mat->Matrix_B[z][y];
    for (f_pholder = 0; f_pholder < f_n; f_pholder++)
        free(Matrix_C[f_pholder]);
    free(Matrix_C);
    return NULL;   // was missing: a void* thread function must return a value
}
int main(int argc, char **argv) {
    char *p;
    int c, i, j, x, y, n, m, pholder, n_m, make_thread;
    int m_begin = 0;
    int m_end = 0;
    while ((c = getopt(argc, argv, "NM")) != -1) {
        switch (c) {
        case 'N':
            n = strtol(argv[optind], &p, 10);
            break;
        case 'M':
            m = strtol(argv[optind], &p, 10);
            break;
        default:
            printf("\n**WARNING**\nUsage: -N n -M m");
            break;
        }
    }
    if (m > n)
        printf("\n**WARNING**\nUsage: -N n -M m\n=> m > n");
    else if (n % m != 0)
        printf("\n**WARNING**\nUsage: -N n -M m\n=> n % m = 0");
    else {
        n_m = n / m;
        // initialize input matrices
        double **thread_matrixA;
        double **thread_matrixB;
        // allocate rows onto heap
        thread_matrixA = (double**)malloc(sizeof(double)*n);
        thread_matrixB = (double**)malloc(sizeof(double)*n);
        // allocate columns onto heap
        for (pholder = 0; pholder < n; pholder++) {
            thread_matrixA[pholder] = (double*)malloc(sizeof(double)*n);
            thread_matrixB[pholder] = (double*)malloc(sizeof(double)*n);
        }
        // populate input matrices with random numbers
        for (i = 0; i < n; i++)
            for (j = 0; j < n; j++)
                thread_matrixA[i][j] = (double)rand()/RAND_MAX+1;
        for (x = 0; x < n; x++)
            for (y = 0; y < n; y++)
                thread_matrixB[x][y] = (double)rand()/RAND_MAX+1;
        printf("\n*** Matrix will be of size %d x %d *** \n", n, n);
        printf("*** Creating matrix with %d thread(s) ***\n", m);
        struct rusage r_usage;
        struct timeval usage;
        struct timeval time1, time2;
        struct timeval cpu_time1, cpu_time2;
        struct timeval sys_time1, sys_time2;
        struct matrix mat;
        pthread_t thread_lord[m];
        // begin timing
        getrusage(RUSAGE_SELF, &r_usage);
        cpu_time1 = r_usage.ru_utime;
        sys_time1 = r_usage.ru_stime;
        gettimeofday(&time1, NULL);
        for (make_thread = 0; make_thread < m; make_thread++) {
            m_begin += n_m;
            // assign values to struct
            mat.Matrix_A = thread_matrixA;
            mat.Matrix_B = thread_matrixB;
            mat.n = n;
            mat.begin = m_begin;
            mat.end = m_end;
            // create threads
            pthread_create(&thread_lord[make_thread], NULL, calculon, (void *)&mat);
            m_begin = (m_end + 1);
        }
        // wait for thread to finish before joining
        for (i = 0; i < m; i++)
            pthread_join(thread_lord[i], NULL);
        // end timing
        getrusage(RUSAGE_SELF, &r_usage);
        cpu_time2 = r_usage.ru_utime;
        sys_time2 = r_usage.ru_stime;
        gettimeofday(&time2, NULL);
        printf("\nUser time: %f seconds\n", ((cpu_time2.tv_sec * 1000000 + cpu_time2.tv_usec) - (cpu_time1.tv_sec * 1000000 + cpu_time1.tv_usec))/1e6);
        printf("System time: %f seconds\n", ((sys_time2.tv_sec * 1000000 + sys_time2.tv_usec) - (sys_time1.tv_sec * 1000000 + sys_time1.tv_usec))/1e6);
        printf("Wallclock time: %f seconds\n\n", ((time2.tv_sec * 1000000 + time2.tv_usec) - (time1.tv_sec * 1000000 + time1.tv_usec))/1e6);
        // deallocate matrices
        for (pholder = 0; pholder < n; pholder++) {
            free(thread_matrixA[pholder]);
            free(thread_matrixB[pholder]);
        }
        free(thread_matrixA);
        free(thread_matrixB);
    }
    return 0;
}
Timing: [output screenshot from the original post]
My guess is that all those malloc()s you're using in the individual threads take a lot more time than you save by splitting the calculation among the threads. Math is fast; malloc() is slow. (To oversimplify a bit)
Sometimes weird timing behavior with threads happens when multiple threads contend for a shared resource protected by an exclusive lock (I hit an example of this a long time ago). But I don't think that's the case here: first, you don't seem to be using any locks, and second, that pattern typically makes the runtime increase only a little as you add threads. Here your runtime increases a lot as you add threads (it seems roughly proportional to the thread count), so I suspect per-thread resource usage is the culprit.
That being said, I'm having a hard time confirming my guess, so I can't be sure about this.
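If that guess is right, hoisting all allocation out of the threads should flatten the times: allocate the output matrix once in main() before the timed region, and have each thread write only its own row range. A sketch of the idea (struct matrix_job and calculon_norealloc are hypothetical names, not from the question):

#include <stddef.h>   /* NULL */

struct matrix_job {
    double **a, **b, **c;   /* c is allocated once in main(), before timing */
    int begin, end, n;
};

void *calculon_norealloc(void *arg)
{
    struct matrix_job *job = arg;
    for (int x = job->begin; x < job->end; x++)
        for (int y = 0; y < job->n; y++) {
            double acc = 0.0;    /* accumulate locally, no malloc() in the hot path */
            for (int z = 0; z < job->n; z++)
                acc += job->a[x][z] * job->b[z][y];
            job->c[x][y] = acc;  /* rows [begin, end) are disjoint per thread */
        }
    return NULL;
}

main() would then fill an array of m such jobs, one per thread. Note that the original code passes the address of a single struct matrix to every pthread_create(), so later loop iterations overwrite begin/end while earlier threads may still be reading them; one job struct per thread avoids that as well.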
I'm trying to parallelize the dot product operation, and I measure the running time of the operation on various numbers of cores using OpenMP. I'm getting the result that if N = 1e9, then for 1 core the CPU time is 5.6 seconds, for 8 cores 6.0 seconds, and for 16 cores 10.8 seconds. Why does the computation time rise when I use more cores?
Here's my code:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <omp.h>

#define DATA_TYPE float
const int N = 1e9;

int main ()
{
    int i, nthreads, tid;
    DATA_TYPE x_par, *y, *z, cput_par;
    clock_t start, end;

    y = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*N);
    z = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*N);
    for (i = 0; i < N; i++) {
        y[i] = i * 1.0;
        z[i] = i * 2.0;
    }
    x_par = 0;

    //nthreads = omp_get_max_threads();
    nthreads = 1;
    printf("n threads = %d\n", nthreads);
    start = clock();
    omp_set_num_threads(nthreads);
    #pragma omp parallel for reduction(+:x_par)
    for (i = 0; i < N; i++)
    {
        x_par += y[i] * z[i];
    }
    end = clock();
    cput_par = ((double)(end - start) / (double)(CLOCKS_PER_SEC));
    printf("Parallel time use: %f\n", cput_par);
    printf("x_par = %f\n", x_par);
    return 0;
}
The fault was that clock() measures the total CPU time summed over all cores/threads used. To get the average CPU time per thread, that value needs to be divided by the number of threads. Another way to solve it is to measure the walltime (i.e. the difference between the actual time of day before and after the operation). If the walltime is used, the operating system might run another program in between, and that is then also included in the walltime. To illustrate this, along with a comparison against a strictly sequential case, I post this code:
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h> // gettimeofday()
#include <time.h>
#include <omp.h>

#define DATA_TYPE float
const int N = 1e9;

int main ()
{
    int i, nthreads, tid;
    DATA_TYPE x_seq, x_par, *y, *z;
    struct timeval time;
    double tstart_cpu, tend_cpu, tstart_wall, tend_wall;
    double walltime_seq, walltime_par, cputime_seq, cputime_par;

    nthreads = 8;
    printf("- - -DOT PRODUCT: OPENMP - - -\n");
    printf("Vector size : %d\n", N);
    printf("Number of threads used: %d\n", nthreads);

    // INITIALIZATION
    y = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*N);
    z = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*N);
    for (i = 0; i < N; i++) {
        y[i] = i * 1.0;
        z[i] = i * 2.0;
    }
    x_seq = 0;
    x_par = 0;

    // SEQUENTIAL CASE
    gettimeofday(&time, NULL);
    tstart_cpu = (double)clock()/CLOCKS_PER_SEC;
    tstart_wall = (double)time.tv_sec + (double)time.tv_usec * .000001;
    for (i = 0; i < N; i++) x_seq += y[i] * z[i];
    tend_cpu = (double)clock()/CLOCKS_PER_SEC;
    gettimeofday(&time, NULL);
    tend_wall = (double)time.tv_sec + (double)time.tv_usec * .000001;
    cputime_seq = tend_cpu - tstart_cpu;
    walltime_seq = tend_wall - tstart_wall;
    printf("Sequential CPU time: %f\n", cputime_seq);
    printf("Sequential Walltime: %f\n", walltime_seq);
    printf("Sequential result : %f\n", x_seq);

    // PARALLEL CASE
    gettimeofday(&time, NULL);
    tstart_cpu = (double)clock()/CLOCKS_PER_SEC;
    tstart_wall = (double)time.tv_sec + (double)time.tv_usec * .000001;
    omp_set_num_threads(nthreads);
    #pragma omp parallel for reduction(+:x_par)
    for (i = 0; i < N; i++)
    {
        x_par += y[i] * z[i];
    }
    tend_cpu = (double)clock()/CLOCKS_PER_SEC;
    gettimeofday(&time, NULL);
    tend_wall = (double)time.tv_sec + (double)time.tv_usec * .000001;
    cputime_par = tend_cpu - tstart_cpu;
    walltime_par = tend_wall - tstart_wall;
    cputime_par /= nthreads; // take the average CPU time per thread
    printf("Parallel CPU time : %f\n", cputime_par);
    printf("Parallel Walltime : %f\n", walltime_par);
    printf("Parallel result : %f\n", x_par);

    // SPEEDUP
    printf("Speedup (cputime) : %f\n", cputime_seq/cputime_par);
    printf("Speedup (walltime) : %f\n", walltime_seq/walltime_par);
    return 0;
}
And a typical run of it outputs:
- - -DOT PRODUCT: OPENMP - - -
Vector size : 1000000000
Number of threads used: 8
Sequential CPU time: 4.871956
Sequential Walltime: 4.878946
Sequential result : 38685626227668133590597632.000000
Parallel CPU time : 0.751475
Parallel Walltime : 0.757933
Parallel result : 133586303067416523805032448.000000
Speedup (cputime) : 6.483191
Speedup (walltime) : 6.437172
As you can see, the resulting dot product is not correct, but this answers the initial question.
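As an aside, OpenMP provides its own wall-clock timer, omp_get_wtime(), which sidesteps the clock()-sums-all-threads problem entirely. A minimal sketch (with N reduced to 1e8 to keep the memory footprint modest, and a double accumulator, which also avoids most of the float round-off visible in the results above):

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define N 100000000

int main(void)
{
    float *y = malloc(sizeof(float) * N);
    float *z = malloc(sizeof(float) * N);
    if (!y || !z) return 1;
    for (int i = 0; i < N; i++) {
        y[i] = i * 1.0f;
        z[i] = i * 2.0f;
    }

    double x_par = 0.0;            /* double accumulator reduces round-off */
    double t0 = omp_get_wtime();   /* wall-clock seconds */
    #pragma omp parallel for reduction(+:x_par)
    for (int i = 0; i < N; i++)
        x_par += (double)y[i] * z[i];
    double elapsed = omp_get_wtime() - t0;

    printf("dot = %e, wall time = %f s\n", x_par, elapsed);
    free(y);
    free(z);
    return 0;
}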
Given the code below:
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

void firstSequence()
{
    int columns = 999999;
    int rows = 400000;
    int **matrix;
    int j;
    int counter = 0;

    matrix = (int **)malloc(columns*sizeof(int*));
    for (j = 0; j < columns; j++)
    {
        matrix[j] = (int*)malloc(rows*sizeof(int));
    }
    for (counter = 1; counter < columns; counter++)
    {
        free(matrix[counter]);
    }
}

void secondSequence()
{
    int columns = 111;
    int rows = 600000;
    int **matrix;
    int j;

    matrix = (int **)malloc(columns*sizeof(int*));
    for (j = 0; j < columns; j++)
    {
        matrix[j] = (int*)malloc(rows*sizeof(int));
    }
}

int main()
{
    long t1;
    long t2;
    long diff;

    t1 = clock();
    firstSequence();
    t2 = clock();
    diff = (t2-t1) * 1000.0 / CLOCKS_PER_SEC;
    printf("%f", t2);

    t1 = clock();
    secondSequence();
    t2 = clock();
    diff = (t2-t1) * 1000.0 / CLOCKS_PER_SEC;
    printf("%f", diff);

    return(0);
}
I need to be able to see how long both sequence one and sequence two take to run. However, both times I get 0 as the time elapsed. From looking online I have seen that this can be an issue, but I do not know how to fix it.
You display the time incorrectly, so even if your functions take more than 0ms the call to printf() invokes undefined behaviour.
printf("%f",diff);
%f is used to display doubles. You probably want to use %ld.
If your functions really do take 0 ms to execute, then a simple way to measure one call is to call the function multiple times, enough to be measurable, and take the average of the total elapsed time.
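A minimal sketch of both fixes together (work() is a hypothetical stand-in for the sequences being timed): keep the millisecond count in a long, print it with %ld, and average over many calls:

#include <stdio.h>
#include <time.h>

#define RUNS 100

/* hypothetical stand-in for firstSequence()/secondSequence() */
static void work(void)
{
    volatile long s = 0;
    for (int i = 0; i < 1000000; i++)
        s += i;
}

int main(void)
{
    clock_t t1 = clock();
    for (int i = 0; i < RUNS; i++)
        work();
    clock_t t2 = clock();

    /* keep the milliseconds in a long and print with %ld, per the answer */
    long total_ms = (long)((t2 - t1) * 1000.0 / CLOCKS_PER_SEC);
    printf("total: %ld ms, per call: %f ms\n",
           total_ms, (double)total_ms / RUNS);
    return 0;
}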
clock() is not a suitable function for measuring the time a program used.
You should use clock_gettime() instead; see the man page for a detailed explanation of clock_gettime.
Simple usage:
struct timespec start, end;

clock_gettime(CLOCK_REALTIME, &start);
for (int i = 0; i < 10000; i++) {
    f1();
}
clock_gettime(CLOCK_REALTIME, &end);

/* tv_sec is seconds and tv_nsec is nanoseconds, so scale seconds by 1e9 */
printf("time elapsed = %f ns\n",
       (double)((end.tv_sec - start.tv_sec) * 1000000000L + (end.tv_nsec - start.tv_nsec)));
PS: when compiling on Linux, remember to link with -lrt.