AVX scalar operations are much faster

I test the following simple function
void mul(double *a, double *b) {
for (int i = 0; i<N; i++) a[i] *= b[i];
}
with very large arrays so that it is memory bandwidth bound. The test code I use is below. When I compile with -O2 it takes 1.7 seconds. When I compile with -O2 -mavx it takes only 1.0 seconds. The non-VEX-encoded scalar operations are 70% slower! Why is this?
Here is the assembly for -O2 and -O2 -mavx.
https://godbolt.org/g/w4p60f
System: i7-6700HQ @ 2.60 GHz (Skylake), 32 GB RAM, Ubuntu 16.10, GCC 6.3
Test code
//gcc -O2 -fopenmp test.c
//or
//gcc -O2 -mavx -fopenmp test.c
#include <string.h>
#include <stdio.h>
#include <x86intrin.h>
#include <omp.h>
#define N 1000000
#define R 1000
void mul(double *a, double *b) {
for (int i = 0; i<N; i++) a[i] *= b[i];
}
int main() {
double *a = (double*)_mm_malloc(sizeof *a * N, 32);
double *b = (double*)_mm_malloc(sizeof *b * N, 32);
//b must be initialized to get the correct bandwidth!!!
memset(a, 1, sizeof *a * N);
memset(b, 1, sizeof *b * N);
double dtime;
const double mem = 3*sizeof(double)*N*R/1024/1024/1024;
const double maxbw = 34.1;
dtime = -omp_get_wtime();
for(int i=0; i<R; i++) mul(a,b);
dtime += omp_get_wtime();
printf("time %.2f s, %.1f GB/s, efficency %.1f%%\n", dtime, mem/dtime, 100*mem/dtime/maxbw);
_mm_free(a), _mm_free(b);
}

The problem is caused by a dirty upper half of an AVX register after calling omp_get_wtime(). This is a problem particularly on Skylake processors, where non-VEX SSE instructions executed while the upper halves are dirty take a false dependency on the full register on every instruction, rather than paying a one-time state-transition penalty as on Haswell.
The first time I read about this problem was here. Since then other people have observed this problem: here and here.
Using gdb I found that omp_get_wtime() calls clock_gettime. I rewrote my code to use clock_gettime() and I see the same problem.
void fix_avx() { __asm__ __volatile__ ( "vzeroupper" : : : ); }
void fix_sse() { }
void (*fix)();
double get_wtime() {
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
#ifndef __AVX__
fix();
#endif
return time.tv_sec + 1E-9*time.tv_nsec;
}
void dispatch() {
fix = fix_sse;
#if defined(__INTEL_COMPILER)
if (_may_i_use_cpu_feature (_FEATURE_AVX)) fix = fix_avx;
#else
#if defined(__GNUC__) && !defined(__clang__)
__builtin_cpu_init();
#endif
if(__builtin_cpu_supports("avx")) fix = fix_avx;
#endif
}
Stepping through the code with gdb I see that the first time clock_gettime is called it calls _dl_runtime_resolve_avx(). I believe the problem is in this function, based on this comment. This function appears to be called only the first time clock_gettime is called.
With GCC the problem goes away if I use __asm__ __volatile__ ( "vzeroupper" : : : ); after the first call to clock_gettime. With Clang (using clang -O2 -fno-vectorize, since Clang vectorizes even at -O2), however, it only goes away if I use it after every call to clock_gettime.
Here is the code I used to test this (with GCC 6.3 and Clang 3.8)
#include <string.h>
#include <stdio.h>
#include <x86intrin.h>
#include <time.h>
void fix_avx() { __asm__ __volatile__ ( "vzeroupper" : : : ); }
void fix_sse() { }
void (*fix)();
double get_wtime() {
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
#ifndef __AVX__
fix();
#endif
return time.tv_sec + 1E-9*time.tv_nsec;
}
void dispatch() {
fix = fix_sse;
#if defined(__INTEL_COMPILER)
if (_may_i_use_cpu_feature (_FEATURE_AVX)) fix = fix_avx;
#else
#if defined(__GNUC__) && !defined(__clang__)
__builtin_cpu_init();
#endif
if(__builtin_cpu_supports("avx")) fix = fix_avx;
#endif
}
#define N 1000000
#define R 1000
void mul(double *a, double *b) {
for (int i = 0; i<N; i++) a[i] *= b[i];
}
int main() {
dispatch();
const double mem = 3*sizeof(double)*N*R/1024/1024/1024;
const double maxbw = 34.1;
double *a = (double*)_mm_malloc(sizeof *a * N, 32);
double *b = (double*)_mm_malloc(sizeof *b * N, 32);
//b must be initialized to get the correct bandwidth!!!
memset(a, 1, sizeof *a * N);
memset(b, 1, sizeof *b * N);
double dtime;
//dtime = get_wtime(); // call once to fix GCC
//printf("%f\n", dtime);
//fix = fix_sse;
dtime = -get_wtime();
for(int i=0; i<R; i++) mul(a,b);
dtime += get_wtime();
printf("time %.2f s, %.1f GB/s, efficency %.1f%%\n", dtime, mem/dtime, 100*mem/dtime/maxbw);
_mm_free(a), _mm_free(b);
}
If I disable lazy function call resolution with -z now (e.g. clang -O2 -fno-vectorize -z now foo.c) then Clang only needs __asm__ __volatile__ ( "vzeroupper" : : : ); after the first call to clock_gettime just like GCC.
I expected that with -z now I would only need __asm__ __volatile__ ( "vzeroupper" : : : ); once, right at the start of main(), but I still need it after the first call to clock_gettime.

Related

Why does OpenMP speed up a SINGLE-ITERATION loop?

I'm using the "read" benchmark from Why is writing to memory much slower than reading it?, and I added just two lines:
#pragma omp parallel for
for(unsigned dummy = 0; dummy < 1; ++dummy)
They should have no effect, because OpenMP should only parallelize the outer loop, but the code now consistently runs twice as fast.
Update: These lines aren't even necessary. Simply adding
omp_get_num_threads();
(implicitly declared) in the same place has the same effect.
Complete code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
unsigned long do_xor(const unsigned long* p, unsigned long n)
{
unsigned long i, x = 0;
for(i = 0; i < n; ++i)
x ^= p[i];
return x;
}
int main()
{
unsigned long n, r, i;
unsigned long *p;
clock_t c0, c1;
double elapsed;
n = 1000 * 1000 * 1000; /* GB */
r = 100; /* repeat */
p = calloc(n/sizeof(unsigned long), sizeof(unsigned long));
c0 = clock();
#pragma omp parallel for
for(unsigned dummy = 0; dummy < 1; ++dummy)
for(i = 0; i < r; ++i) {
p[0] = do_xor(p, n / sizeof(unsigned long)); /* "use" the result */
printf("%4ld/%4ld\r", i, r);
fflush(stdout);
}
c1 = clock();
elapsed = (c1 - c0) / (double)CLOCKS_PER_SEC;
printf("Bandwidth = %6.3f GB/s (Giga = 10^9)\n", (double)n * r / elapsed / 1e9);
free(p);
}
Compiled and executed with
gcc -O3 -Wall -fopenmp single_iteration.c && time taskset -c 0 ./a.out
The wall time reported by time is 3.4s vs 7.5s.
GCC 7.3.0 (Ubuntu)
The reason for the performance difference is not actually any difference in code, but in how memory is mapped. In the fast case you are reading from zero pages, i.e. all virtual addresses are mapped to a single physical page, so nothing has to be read from memory. In the slow case the buffer is not backed by such zero pages. For details see this answer from a slightly different context.
On the other hand, it is not caused by calling omp_get_num_threads or the pragma itself, but merely by linking to the OpenMP runtime library. You can confirm that by using -Wl,--no-as-needed -fopenmp. If you just specify -fopenmp but don't use it at all, the linker will omit it.
Now unfortunately I am still missing the final puzzle piece: why does linking to OpenMP change the behavior of calloc regarding zeroed pages.
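To check the zero-page explanation independently of OpenMP, here is a self-contained sketch (my own, not from the linked answer): it XOR-reads a large calloc'd buffer once untouched and once after dirtying one word per page, so the second pass has to stream from real, distinct physical pages. A 4096-byte page size is assumed.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
static double now(void) {
    struct timespec t;
    clock_gettime(CLOCK_MONOTONIC, &t);
    return t.tv_sec + 1e-9 * t.tv_nsec;
}
static unsigned long xor_all(const unsigned long *p, unsigned long n) {
    unsigned long i, x = 0;
    for (i = 0; i < n; ++i) x ^= p[i];
    return x;
}
int main(void) {
    unsigned long bytes = 1UL << 30;                      /* 1 GiB */
    unsigned long words = bytes / sizeof(unsigned long);
    unsigned long *p = calloc(words, sizeof(unsigned long));
    if (!p) return 1;
    double t0 = now();
    unsigned long x = xor_all(p, words);                  /* untouched: likely shared zero pages */
    double t1 = now();
    for (unsigned long i = 0; i < words; i += 4096 / sizeof(unsigned long))
        p[i] = 1;                                         /* dirty one word per page */
    double t2 = now();
    x ^= xor_all(p, words);                               /* now backed by distinct physical pages */
    double t3 = now();
    printf("untouched: %.2f GB/s, touched: %.2f GB/s (x=%lu)\n",
           bytes / (t1 - t0) / 1e9, bytes / (t3 - t2) / 1e9, x);
    free(p);
    return 0;
}
If the first number is far above the machine's memory bandwidth, the fast case was indeed not reading from DRAM at all.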

OpenMP slower than single threaded even though embarrassingly parallelizable [duplicate]

I have optimized my function as much as I could for sequential execution.
When I use OpenMP I see no gain in performance.
I tried my program on a machine with 1 core and on a machine with 8 cores, and the performance is the same.
With year set to 20, I have
1 core: 1 sec.
8 core: 1 sec.
With year set to 25 I have
1 core: 40 sec.
8 core: 40 sec.
1 core machine: my laptop's Intel Core 2 Duo 1.8 GHz, Ubuntu Linux
8 core machine: 3.25 GHz, Ubuntu Linux
My program enumerates all the possible paths of a binomial tree and does some work on each path. So the loop size increases exponentially and I would expect the overhead of the OpenMP threads to be negligible. In the loop, I only do a reduction of one variable. All other variables are read-only. I only use functions I wrote, and I think they are thread-safe.
I also ran Valgrind cachegrind on my program. I don't fully understand the output, but there seem to be no cache misses or false sharing.
I compile with
gcc -O3 -g3 -Wall -c -fmessage-length=0 -lm -fopenmp -ffast-math
My complete program is below. Sorry for posting a lot of code. I'm not familiar with OpenMP or C, and I couldn't reduce my code further without losing the main task.
How can I improve performance when I use OpenMP?
Are there some compiler flags or C tricks that will make the program run faster?
test.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "test.h"
int main(){
printf("starting\n");
int year=20;
int tradingdate0=1;
globalinit(year,tradingdate0);
int i;
float v=0;
long n=pow(tradingdate0+1,year);
#pragma omp parallel for reduction(+:v)
for(i=0;i<n;i++)
v+=pathvalue(i);
globaldel();
printf("finished\n");
return 0;
}
//***function on which openMP is applied
float pathvalue(long pathindex) {
float value = -ctx.firstpremium;
float personalaccount = ctx.personalaccountat0;
float account = ctx.firstpremium;
int i;
for (i = 0; i < ctx.year-1; i++) {
value *= ctx.accumulationfactor;
double index = getindex(i,pathindex);
account = account * index;
double death = fmaxf(account,ctx.guarantee[i]);
value += qx(i) * death;
if (haswithdraw(i)){
double withdraw = personalaccount*ctx.allowed;
value += px(i) * withdraw;
personalaccount = fmaxf(personalaccount-withdraw,0);
account = fmaxf(account-withdraw,0);
}
}
//last year
double index = getindex(ctx.year-1,pathindex);
account = account * index;
value+=fmaxf(account,ctx.guarantee[ctx.year-1]);
return value * ctx.discountfactor;
}
int haswithdraw(int period){
return 1;
}
float getindex(int period, long pathindex){
int ndx = (pathindex/ctx.chunksize[period])%ctx.tradingdate;
return ctx.stock[ndx];
}
float qx(int period){
return 0;
}
float px(int period){
return 1;
}
//****global
struct context ctx;
void globalinit(int year, int tradingdate0){
ctx.year = year;
ctx.tradingdate0 = tradingdate0;
ctx.firstpremium = 1;
ctx.riskfreerate = 0.06;
ctx.volatility=0.25;
ctx.personalaccountat0 = 1;
ctx.allowed = 0.07;
ctx.guaranteerate = 0.03;
ctx.alpha=1;
ctx.beta = 1;
ctx.tradingdate=tradingdate0+1;
ctx.discountfactor = exp(-ctx.riskfreerate * ctx.year);
ctx.accumulationfactor = exp(ctx.riskfreerate);
ctx.guaranteefactor = 1+ctx.guaranteerate;
ctx.upmove=exp(ctx.volatility/sqrt(ctx.tradingdate0));
ctx.downmove=1/ctx.upmove;
ctx.stock=(float*)malloc(sizeof(float)*ctx.tradingdate);
int i;
for(i=0;i<ctx.tradingdate;i++)
ctx.stock[i]=pow(ctx.upmove,ctx.tradingdate0-i)*pow(ctx.downmove,i);
ctx.chunksize=(long*)malloc(sizeof(long)*ctx.year);
for(i=0;i<year;i++)
ctx.chunksize[i]=pow(ctx.tradingdate,ctx.year-i-1);
ctx.guarantee=(float*)malloc(sizeof(float)*ctx.year);
for(i=0;i<ctx.year;i++)
ctx.guarantee[i]=ctx.beta*pow(ctx.guaranteefactor,i+1);
}
void globaldel(){
free(ctx.stock);
free(ctx.chunksize);
free(ctx.guarantee);
}
test.h
float pathvalue(long pathindex);
int haswithdraw(int period);
float getindex(int period, long pathindex);
float qx(int period);
float px(int period);
//***global
struct context{
int year;
int tradingdate0;
float firstpremium;
float riskfreerate;
float volatility;
float personalaccountat0;
float allowed;
float guaranteerate;
float alpha;
float beta;
int tradingdate;
float discountfactor;
float accumulationfactor;
float guaranteefactor;
float upmove;
float downmove;
float* stock;
long* chunksize;
float* guarantee;
};
struct context ctx;
void globalinit();
void globaldel();
EDIT: I turned all the global variables into constants. For year set to 20, the program runs twice as fast (great!). I tried to set the number of threads with, for example, OMP_NUM_THREADS=4 ./test, but it didn't give me any performance gain.
Could there be a problem with my gcc?
test.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <omp.h>
#include "test.h"
int main(){
starttimer();
printf("starting\n");
int i;
float v=0;
#pragma omp parallel for reduction(+:v)
for(i=0;i<numberofpath;i++)
v+=pathvalue(i);
printf("v:%f\nfinished\n",v);
endtimer();
return 0;
}
//function on which openMP is applied
float pathvalue(long pathindex) {
float value = -firstpremium;
float personalaccount = personalaccountat0;
float account = firstpremium;
int i;
for (i = 0; i < year-1; i++) {
value *= accumulationfactor;
double index = getindex(i,pathindex);
account = account * index;
double death = fmaxf(account,guarantee[i]);
value += death;
double withdraw = personalaccount*allowed;
value += withdraw;
personalaccount = fmaxf(personalaccount-withdraw,0);
account = fmaxf(account-withdraw,0);
}
//last year
double index = getindex(year-1,pathindex);
account = account * index;
value+=fmaxf(account,guarantee[year-1]);
return value * discountfactor;
}
float getindex(int period, long pathindex){
int ndx = (pathindex/chunksize[period])%tradingdate;
return stock[ndx];
}
//timing
clock_t begin;
void starttimer(){
begin = clock();
}
void endtimer(){
clock_t end = clock();
double elapsed = (double)(end - begin) / CLOCKS_PER_SEC;
printf("\nelapsed: %f\n",elapsed);
}
test.h
float pathvalue(long pathindex);
int haswithdraw(int period);
float getindex(int period, long pathindex);
float qx(int period);
float px(int period);
//timing
void starttimer();
void endtimer();
//***constant
const int year= 20 ;
const int tradingdate0= 1 ;
const float firstpremium= 1 ;
const float riskfreerate= 0.06 ;
const float volatility= 0.25 ;
const float personalaccountat0= 1 ;
const float allowed= 0.07 ;
const float guaranteerate= 0.03 ;
const float alpha= 1 ;
const float beta= 1 ;
const int tradingdate= 2 ;
const int numberofpath= 1048576 ;
const float discountfactor= 0.301194211912 ;
const float accumulationfactor= 1.06183654655 ;
const float guaranteefactor= 1.03 ;
const float upmove= 1.28402541669 ;
const float downmove= 0.778800783071 ;
const float stock[2]={1.2840254166877414, 0.7788007830714049};
const long chunksize[20]={524288, 262144, 131072, 65536, 32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1};
const float guarantee[20]={1.03, 1.0609, 1.092727, 1.1255088100000001, 1.1592740743, 1.1940522965290001, 1.2298738654248702, 1.2667700813876164, 1.304773183829245, 1.3439163793441222, 1.384233870724446, 1.4257608868461793, 1.4685337134515648, 1.512589724855112, 1.557967416600765, 1.6047064390987882, 1.6528476322717518, 1.7024330612399046, 1.7535060530771016, 1.8061112346694148};
Even if your program benefits from using OpenMP, you won't see it because you are measuring the wrong time.
clock() returns the total CPU time spent in all threads. If you run with four threads and each runs for 1/4 of the time, clock() will still return the same value since 4*(1/4) = 1. You should be measuring the wall-clock time instead.
Replace calls to clock() with omp_get_wtime() or gettimeofday(). They both provide high precision wall-clock timing.
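For example, the starttimer/endtimer helpers from the question could be rewritten along these lines (a sketch using omp_get_wtime; gettimeofday would work the same way):
#include <stdio.h>
#include <omp.h>
static double begin;                            /* wall-clock timestamp, not CPU time */
void starttimer(void) {
    begin = omp_get_wtime();
}
void endtimer(void) {
    double elapsed = omp_get_wtime() - begin;   /* elapsed wall-clock seconds */
    printf("\nelapsed: %f\n", elapsed);
}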
P.S. Why are there so many people around SO using clock() for timing?
It seems as if it should work. Probably you need to specify the number of threads to use. You can do so by setting the OMP_NUM_THREADS environment variable. For instance, to use 4 threads:
OMP_NUM_THREADS=4 ./test
EDIT: I just compiled the code and I observe significant speedups when changing the number of threads.
I don't see any section in which you're specifying the number of cores OpenMP will use. It's supposed to, by default, use the number of CPUs it sees, but for my purposes, I've always forced it to use as many as I specified.
Add this line before your parallel for construct:
#pragma omp parallel num_threads(num_threads)
{
// Your parallel for follows here
}
...where num_threads is an integer between 1 and the number of cores on your machine.
EDIT: Here's the makefile used to build the code. Place this in a text file named Makefile in the same directory.
test: test.c test.h
cc -o $@ $< -O3 -g3 -fmessage-length=0 -lm -fopenmp -ffast-math

I need help to parallelize this code using OpenMP

I wrote a C code that I would like to parallelize using OpenMP (I am a beginner and I have just a few days to solve this task). Let's start from the main: first of all I initialize 6 vectors (Vx, Vy, Vz, thetap, phip, theta); then there is a for loop that cycles over Nmax; inside this loop I allocate some memory for the structure I have defined at the very top of the code; the structure is called coll_CPU and it grows every cycle. Then I pick some of the values from the vectors I mentioned before and place them into the structure; at this point my structure coll_CPU is filled with Ncoll elements. During this process I use some of the functions declared outside of main (these functions are random number generators). Now comes the important part: in my serial code I use a for loop to pass every single element of the structure to a function called CollisioniCPU (this function just takes the inputs and multiplies them by 2). My goal is to parallelize this loop so that each of my CPUs contributes to this operation and speeds up the process.
Here are the codes:
main.c
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <memory.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#define pi2 6.283185307
#define pi 3.141592654
#define IMUL(a,b) __mul24(a,b)
typedef struct {
int seme;
} iniran;
typedef struct{
int jp1;
int jp2;
float kx;
float ky;
float kz;
float vAx;
float vAy;
float vAz;
float vBx;
float vBy;
float vBz;
float tetaAp;
float phiAp;
float tetaA;
float tetaBp;
float phiBp;
float tetaB;
float kAx;
float kAy;
float kAz;
float kBx;
float kBy;
float kBz;
int caso;
} stato_struct;
stato_struct *coll_CPU=0;
unsigned int timer;
#include "DSMC_kernel_float.c"
//=============================================================
float min(float *a, float*b){
if(*a<*b){
return *a;
}
else{
return *b;
}
}
//=============================================================
float max(float *a, float*b){
if(*a>*b){
return *a;
}
else{
return *b;
}
}
//=============================================================
float rf(int *idum){
static int iff=0;
static int inext, inextp, ma[55];
int mj, mk;
int i, k, ii;
float ret_val;
if (*idum<0 || iff==0) {
iff=1;
mj=161803398 - abs(*idum);
mj %= 1000000000;
ma[54]=mj;
mk=1;
for (i=1; i<=54; ++i){
ii=(i*21)%55;
ma[ii-1]=mk;
mk=mj-mk;
if (mk<0) {
mk += 1000000000;
}
mj= ma[ii-1];
}
for(k=1; k<=4; ++k) {
for(i=1; i<=55; ++i){
ma[i-1] -= ma[(i+30)%55];
if (ma[i-1]<0){
ma[i-1] += 1000000000;
}
}
}
inext=0;
inextp=31;
*idum=1;
}
++inext;
if (inext==56){
inext=1;
}
++inextp;
if (inextp==56){
inextp=1;
}
mj=ma[inext-1]-ma[inextp-1];
if (mj<0){
mj += 1000000000;
}
ma[inext-1]=mj;
ret_val=mj*1.0000000000000001e-9;
return ret_val;
}
//============================================================
int genk(float *kx, float *ky, float *kz, int *p2seme){
// float sqrtf(float), sinf(float), cosf(float);
extern float rf(int *);
static float phi;
*kx=rf(p2seme) * 2. -1.f;
*ky= sqrtf(1. - *kx * *kx);
phi=pi2*rf(p2seme);
*kz=*ky * sinf(phi);
*ky *= cosf(phi);
return 0;
}
//==============================================================
int main(void){
float msec_kernel;
int Np=10000, Nmax=512;
int id,jp,jcoll,Ncoll,jp1, jp2, ind;
int Ncoll_eff=0, Ncoll_div=0, Ncoll_dim=0; //counters used after the collision loop
float Vx[Np],Vy[Np],Vz[Np],teta[Np],tetap[Np],phip[Np];
float kx, ky, kz, Vrx, Vry, Vrz, scalprod, fk;
float kAx, kAy, kAz, kBx, kBy, kBz;
iniran iniran1;
iniran1.seme=7593;
for(jp=1;jp<=Np;jp++){
if(jp<=Np/2){
Vx[jp-1]=2.5;
Vy[jp-1]=0;
Vz[jp-1]=0;
tetap[jp-1]=0;
phip[jp-1]=0;
teta[jp-1]=0;
}
} //end of the initialization loop over jp
for (Ncoll=1;Ncoll<=Nmax;Ncoll += 10){
coll_CPU=(stato_struct*) malloc(Ncoll*sizeof(stato_struct));
jcoll=0;
while (jcoll<Ncoll){
jp1=1+floorf(Np*rf(&iniran1.seme));
jp2=1+floorf(Np*rf(&iniran1.seme));
genk(&kx,&ky,&kz,&iniran1.seme);
Vrx=Vx[jp2-1]-Vx[jp1-1];
Vry=Vy[jp2-1]-Vy[jp1-1];
Vrz=Vz[jp2-1]-Vz[jp1-1];
scalprod=Vrx*kx+Vry*ky+Vrz*kz;
if (scalprod<0) {
genk(&kAx,&kAy,&kAz,&iniran1.seme);
genk(&kBx,&kBy,&kBz,&iniran1.seme);
coll_CPU[jcoll].jp1= jp1;
coll_CPU[jcoll].jp2=jp2;
coll_CPU[jcoll].kx=kx;
coll_CPU[jcoll].ky=ky;
coll_CPU[jcoll].kz=kz;
coll_CPU[jcoll].vAx=Vx[jp1-1];
coll_CPU[jcoll].vAy=Vy[jp1-1];
coll_CPU[jcoll].vAz=Vz[jp1-1];
coll_CPU[jcoll].vBx=Vx[jp2-1];
coll_CPU[jcoll].vBy=Vy[jp2-1];
coll_CPU[jcoll].vBz=Vz[jp2-1];
coll_CPU[jcoll].tetaAp=tetap[jp1-1];
coll_CPU[jcoll].phiAp=phip[jp1-1];
coll_CPU[jcoll].tetaA=teta[jp1-1];
coll_CPU[jcoll].tetaBp=tetap[jp2-1];
coll_CPU[jcoll].phiBp=phip[jp2-1];
coll_CPU[jcoll].tetaB=teta[jp2-1];
coll_CPU[jcoll].kAx=kAx;
coll_CPU[jcoll].kAy=kAy;
coll_CPU[jcoll].kAz=kAz;
coll_CPU[jcoll].kBx=kBx;
coll_CPU[jcoll].kBy=kBy;
coll_CPU[jcoll].kBz=kBz;
coll_CPU[jcoll].caso=1;
jcoll++;
}
}
clock_t t;
t = clock();
#pragma omp parallel for private(id) //HERE IS WHERE I TRIED TO DO THE PARALLELIZATION BUT WITH NO SUCCESS. WHAT DO I HAVE TO TYPE INSTEAD???
for(id=0;id<Nmax;id++){
CollisioniCPU(coll_CPU,id);
}
t = clock() - t;
msec_kernel = ((float)t*1000)/CLOCKS_PER_SEC;
printf("Tempo esecuzione kernel:%e s\n",msec_kernel*1e-03);
for (ind=0;ind<Ncoll;ind++){
if (coll_CPU[ind].caso==4)
Ncoll_eff++;
else if (coll_CPU[ind].caso==0)
Ncoll_div++;
else
Ncoll_dim++;
}
free(coll_CPU);
}
return 0;
}
DSMC_kernel_float.c
void CollisioniCPU(stato_struct *coll_CPU, int id){
float vettA[6], vettB[6];
vettA[0]=coll_CPU[id].vAx;
vettA[1]=coll_CPU[id].vAy;
vettA[2]=coll_CPU[id].vAz;
vettA[3]=coll_CPU[id].tetaAp;
vettA[4]=coll_CPU[id].phiAp;
vettA[5]=coll_CPU[id].tetaA;
vettB[0]=coll_CPU[id].vBx;
vettB[1]=coll_CPU[id].vBy;
vettB[2]=coll_CPU[id].vBz;
vettB[3]=coll_CPU[id].tetaBp;
vettB[4]=coll_CPU[id].phiBp;
vettB[5]=coll_CPU[id].tetaB;
coll_CPU[id].vAx=2*vettA[0];
coll_CPU[id].vAy=2*vettA[1];
coll_CPU[id].vAz=2*vettA[2];
coll_CPU[id].tetaAp=2*vettA[3];
coll_CPU[id].phiAp=2*vettA[4];
coll_CPU[id].tetaA=2*vettA[5];
coll_CPU[id].vBx=2*vettB[0];
coll_CPU[id].vBy=2*vettB[1];
coll_CPU[id].vBz=2*vettB[2];
coll_CPU[id].tetaBp=2*vettB[3];
coll_CPU[id].phiBp=2*vettB[4];
coll_CPU[id].tetaB=2*vettB[5];
}
In order to compile the program I type this line in the terminal: gcc -fopenmp time_analysis.c -o time_analysis -lm, followed by export OMP_NUM_THREADS=1. However, once I run the executable I get this error message:
Error in `./time_analysis': double free or corruption (!prev): 0x00000000009602c0 ***
Aborted
What does this error mean? What have I done wrong in the main function when I tried to parallelize the for loop? And most important: what should I write instead in order to make my code run in parallel? Please help me out if you can, because I seriously have no time to study OpenMP from scratch and I need to get this job done right away.
Changing the inner loop as follows should bring you one step further.
#pragma omp parallel for private(id)
for(id=0;id<Ncoll;id++){
CollisioniCPU(coll_CPU,id);
}
Your OpenMP line seems okay, but I doubt that it will lead to significant improvements in runtime. You should optimize the surrounding code as well. Allocating the memory once outside of your loops would be a good start.
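For instance, the allocation could be hoisted out of the Ncoll loop roughly like this (a sketch against the question's main(), not a complete program; it sizes the buffer once for the largest Ncoll and reuses it):
coll_CPU = (stato_struct*) malloc(Nmax*sizeof(stato_struct));  /* allocate once, large enough for any Ncoll */
for (Ncoll=1; Ncoll<=Nmax; Ncoll += 10) {
    /* ... fill coll_CPU[0 .. Ncoll-1] exactly as before ... */
    #pragma omp parallel for private(id)
    for (id=0; id<Ncoll; id++) {
        CollisioniCPU(coll_CPU, id);
    }
    /* ... count the coll_CPU[ind].caso outcomes as before ... */
}
free(coll_CPU);  /* free once, after all iterations */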
By the way, is there any reason for this verbose coding style rather than a more compact and readable version such as this one?
void CollisioniCPU(stato_struct *coll_CPU, int id) {
stato_struct *ptr = coll_CPU + id;
ptr->vAx *= 2;
ptr->vAy *= 2;
ptr->vAz *= 2;
ptr->tetaAp *= 2;
ptr->phiAp *= 2;
ptr->tetaA *= 2;
ptr->vBx *= 2;
ptr->vBy *= 2;
ptr->vBz *= 2;
ptr->tetaBp *= 2;
ptr->phiBp *= 2;
ptr->tetaB *= 2;
}

GCC 4.8.2 vectorization avoid simple loop

Here is an excerpt from my C code. I hope I copied all the relevant parts.
#define SIN_LEN 22050
#define CALC_N 4100
#define CHUNK_LEN 22050
float __attribute__((aligned(16))) sin_array[SIN_LEN];
float __attribute__((aligned(16))) cos_array[SIN_LEN];
short __attribute__((aligned(16))) data[CHUNK_LEN];
size_t __attribute__((aligned(16))) a1[CHUNK_LEN];
float sinus_sum __attribute__((aligned(16)));
float cosinus_sum __attribute__((aligned(16)));
float __attribute__((aligned(16))) temp1[CHUNK_LEN];
float __attribute__((aligned(16))) temp2[CHUNK_LEN];
void sin_summ(long n, float* restrict sin_s, float* restrict cos_s ) {
sinus_sum = 0;
cosinus_sum = 0;
for (int i = 0; i < CHUNK_LEN; i++ )
a1[i] = (n*i) % SIN_LEN;
for (int i = 0; i < CHUNK_LEN; i++) {
temp1[i] = sin_array[a1[i]];
temp2[i] = cos_array[a1[i]];
}
for (int i = 0; i < CHUNK_LEN; i++) {
sinus_sum += data[i] * temp1[i];
cosinus_sum += data[i] * temp2[i];
}
*sin_s = sinus_sum;
*cos_s = cosinus_sum;
return;
}
The function sin_summ is called in a loop too. The GCC parameters are -O3 -march=armv7-a -mtune=cortex-a8 -mfloat-abi=hard -ffast-math -mfpu=neon -ftree-vectorize -ftree-vectorizer-verbose=1 -funsafe-math-optimizations -fstrict-aliasing -Wstrict-aliasing. The architecture is ARMv7 with NEON (BeagleBone Black).
The problem is that loops #1 and #3 are vectorized, but #2 is not. I guess it is because of the a1[i] indirect indexing. Can somebody please suggest how to make it vectorizable? Right now a single non-vectorized loop runs 40% faster than what is written here.

How can I benchmark C code easily?

Is there a simple library to benchmark the time it takes to execute a portion of C code? What I want is something like:
int main(){
benchmarkBegin(0);
//Do work
double elapsedMS = benchmarkEnd(0);
benchmarkBegin(1)
//Do some more work
double elapsedMS2 = benchmarkEnd(1);
double speedup = benchmarkSpeedup(elapsedMS, elapsedMS2); //Calculates relative speedup
}
It would also be great if the library let you do many runs, averaging them and calculating the variance in timing!
Use the function clock() defined in time.h:
startTime = (float)clock()/CLOCKS_PER_SEC;
/* Do work */
endTime = (float)clock()/CLOCKS_PER_SEC;
timeElapsed = endTime - startTime;
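If you also want the many-runs/averaging/variance part, here is a minimal sketch built on top of such a clock()-based timer (time_once and benchmark are hypothetical helper names, not an existing library; compile with -lm):
#include <stdio.h>
#include <time.h>
#include <math.h>
/* Times one call of fn() in seconds using clock(). */
static double time_once(void (*fn)(void)) {
    clock_t c0 = clock();
    fn();
    return (double)(clock() - c0) / CLOCKS_PER_SEC;
}
/* Runs fn() `runs` times and reports mean and standard deviation of the timings. */
static void benchmark(void (*fn)(void), int runs) {
    double sum = 0.0, sumsq = 0.0;
    for (int i = 0; i < runs; i++) {
        double t = time_once(fn);
        sum += t;
        sumsq += t * t;
    }
    double mean = sum / runs;
    double var = sumsq / runs - mean * mean;
    if (var < 0) var = 0;                        /* guard against rounding */
    printf("mean %.6f s, stddev %.6f s over %d runs\n", mean, sqrt(var), runs);
}
static void work(void) {                         /* placeholder workload */
    volatile double x = 0;
    for (long i = 0; i < 10000000; i++) x += i * 1e-9;
}
int main(void) {
    benchmark(work, 10);
    return 0;
}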
Basically, all you want is a high resolution timer. The elapsed time is of course just a difference in times, and the speedup is calculated by dividing the times for each task. I have included the code for a high resolution timer that should work on at least Windows and Unix.
#ifdef WIN32
#include <windows.h>
double get_time()
{
LARGE_INTEGER t, f;
QueryPerformanceCounter(&t);
QueryPerformanceFrequency(&f);
return (double)t.QuadPart/(double)f.QuadPart;
}
#else
#include <sys/time.h>
#include <sys/resource.h>
double get_time()
{
struct timeval t;
struct timezone tzp;
gettimeofday(&t, &tzp);
return t.tv_sec + t.tv_usec*1e-6;
}
#endif
Benchmark C code easily
#include <time.h>
int main(void) {
clock_t start_time = clock();
// code or function to benchmark
double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
printf("Done in %f seconds\n", elapsed_time);
}
Easy benchmark of multi-threaded C code
If you want to benchmark a multithreaded program, you first need to take a closer look at clock():
Description
The clock() function returns an approximation of processor time
used by the program.
Return value
The value returned is the CPU time used so far as a clock_t; to
get the number of seconds used, divide by CLOCKS_PER_SEC. If the
processor time used is not available or its value cannot be
represented, the function returns the value (clock_t)(-1)
Hence it is very important to divide your elapsed_time by the number of threads in order to get the execution time of your function:
#include <time.h>
#include <omp.h>
#define THREADS_NB omp_get_max_threads()
clock_t start_time = clock();
#pragma omp parallel for private(i) num_threads(THREADS_NB)
for (i = 0; i < N; i++) {
    // code or function to benchmark
}
double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
printf("Done in %f seconds\n", elapsed_time / THREADS_NB); // divide by THREADS_NB!
Example
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <omp.h>
#define N 20000
#define THREADS_NB omp_get_max_threads()
void init_arrays(double *a, double *b) {
memset(a, 0, N * sizeof(*a));
memset(b, 0, N * sizeof(*b));
for (int i = 0; i < N; i++) {
a[i] += 1.0;
b[i] += 1.0;
}
}
double func2(double i, double j) {
double res = 0.0;
while (i / j > 0.0) {
res += i / j;
i -= 0.1;
j -= 0.000003;
}
return res;
}
double single_thread(double *a, double *b) {
double res = 0;
int i, j;
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
if (i == j) continue;
res += func2(a[i], b[j]);
}
}
return res;
}
double multi_threads(double *a, double *b) {
double res = 0;
int i, j;
#pragma omp parallel for private(j) num_threads(THREADS_NB) reduction(+:res)
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
if (i == j) continue;
res += func2(a[i], b[j]);
}
}
return res;
}
int main(void) {
double *a, *b;
a = (double *)calloc(N, sizeof(double));
b = (double *)calloc(N, sizeof(double));
init_arrays(a, b);
clock_t start_time = clock();
double res = single_thread(a, b);
double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
printf("Default: Done with %f in %f sd\n", res, elapsed_time);
start_time = clock();
res = multi_threads(a, b);
elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
printf("With OMP: Done with %f in %f sd\n", res, elapsed_time / THREADS_NB);
}
Compile with:
gcc -O3 multithread_benchmark.c -fopenmp && time ./a.out
Output:
Default: Done with 2199909813.614555 in 4.909633 sd
With OMP: Done with 2199909799.377532 in 1.708831 sd
real 0m6.703s (from the time command)
In POSIX, try getrusage. The relevant argument is RUSAGE_SELF and the relevant fields are ru_utime.tv_sec and ru_utime.tv_usec.
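A minimal sketch of that approach (POSIX only; note this measures CPU time used by the process, not wall-clock time):
#include <stdio.h>
#include <sys/resource.h>
/* User CPU time consumed so far by this process, in seconds. */
static double cpu_seconds(void) {
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_utime.tv_sec + ru.ru_utime.tv_usec * 1e-6;
}
int main(void) {
    double t0 = cpu_seconds();
    /* ... work to benchmark ... */
    double t1 = cpu_seconds();
    printf("user CPU time: %f s\n", t1 - t0);
    return 0;
}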
There may be existing utilities that help with this, but I suspect most will use some kind of sampling or possibly injection. But to get specific sections of code timed, you will probably have to add calls to a timer, as you show in your example. If you are using Windows, then the high performance timer works. I answered a similar question and showed example code that will do that. There are similar methods for Linux.
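On Linux, the usual counterpart of the Windows high performance counter is clock_gettime with CLOCK_MONOTONIC; a sketch of a drop-in for the get_time() shown above:
#include <time.h>
/* Monotonic wall-clock time in seconds, suitable for interval timing. */
double get_time(void)
{
    struct timespec t;
    clock_gettime(CLOCK_MONOTONIC, &t);
    return t.tv_sec + t.tv_nsec * 1e-9;
}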

Resources