OpenMP slower than single threaded even though embarrassingly parallelizable [duplicate] - c

I have optimized my function as much as I could for sequential execution.
When I use OpenMP, I see no performance gain.
I tried my program on a machine with 1 core and on a machine with 8 cores, and the performance is the same.
With year set to 20, I have
1 core: 1 sec.
8 core: 1 sec.
With year set to 25 I have
1 core: 40 sec.
8 core: 40 sec.
1-core machine: my laptop's Intel Core 2 Duo, 1.8 GHz, Ubuntu Linux
8-core machine: 3.25 GHz, Ubuntu Linux
My program enumerates all possible paths of a binomial tree and does some work on each path. So the loop size increases exponentially, and I would expect the overhead of the OpenMP threads to be negligible. In the loop, I only do a reduction of one variable; all other variables are read-only. I only use functions I wrote, and I think they are thread-safe.
I also ran Valgrind cachegrind on my program. I don't fully understand the output, but there seem to be no cache misses or false sharing.
I compile with
gcc -O3 -g3 -Wall -c -fmessage-length=0 -lm -fopenmp -ffast-math
My complete program is below. Sorry for posting a lot of code. I'm not familiar with OpenMP or C, and I couldn't reduce my code any further without losing the main task.
How can I improve performance when I use OpenMP?
Are there any compiler flags or C tricks that will make the program run faster?
test.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include "test.h"
int main(){
printf("starting\n");
int year=20;
int tradingdate0=1;
globalinit(year,tradingdate0);
int i;
float v=0;
long n=pow(tradingdate0+1,year);
#pragma omp parallel for reduction(+:v)
for(i=0;i<n;i++)
v+=pathvalue(i);
globaldel();
printf("finished\n");
return 0;
}
//***function on which openMP is applied
float pathvalue(long pathindex) {
float value = -ctx.firstpremium;
float personalaccount = ctx.personalaccountat0;
float account = ctx.firstpremium;
int i;
for (i = 0; i < ctx.year-1; i++) {
value *= ctx.accumulationfactor;
double index = getindex(i,pathindex);
account = account * index;
double death = fmaxf(account,ctx.guarantee[i]);
value += qx(i) * death;
if (haswithdraw(i)){
double withdraw = personalaccount*ctx.allowed;
value += px(i) * withdraw;
personalaccount = fmaxf(personalaccount-withdraw,0);
account = fmaxf(account-withdraw,0);
}
}
//last year
double index = getindex(ctx.year-1,pathindex);
account = account * index;
value+=fmaxf(account,ctx.guarantee[ctx.year-1]);
return value * ctx.discountfactor;
}
int haswithdraw(int period){
return 1;
}
float getindex(int period, long pathindex){
int ndx = (pathindex/ctx.chunksize[period])%ctx.tradingdate;
return ctx.stock[ndx];
}
float qx(int period){
return 0;
}
float px(int period){
return 1;
}
//****global
struct context ctx;
void globalinit(int year, int tradingdate0){
ctx.year = year;
ctx.tradingdate0 = tradingdate0;
ctx.firstpremium = 1;
ctx.riskfreerate = 0.06;
ctx.volatility=0.25;
ctx.personalaccountat0 = 1;
ctx.allowed = 0.07;
ctx.guaranteerate = 0.03;
ctx.alpha=1;
ctx.beta = 1;
ctx.tradingdate=tradingdate0+1;
ctx.discountfactor = exp(-ctx.riskfreerate * ctx.year);
ctx.accumulationfactor = exp(ctx.riskfreerate);
ctx.guaranteefactor = 1+ctx.guaranteerate;
ctx.upmove=exp(ctx.volatility/sqrt(ctx.tradingdate0));
ctx.downmove=1/ctx.upmove;
ctx.stock=(float*)malloc(sizeof(float)*ctx.tradingdate);
int i;
for(i=0;i<ctx.tradingdate;i++)
ctx.stock[i]=pow(ctx.upmove,ctx.tradingdate0-i)*pow(ctx.downmove,i);
ctx.chunksize=(long*)malloc(sizeof(long)*ctx.year);
for(i=0;i<year;i++)
ctx.chunksize[i]=pow(ctx.tradingdate,ctx.year-i-1);
ctx.guarantee=(float*)malloc(sizeof(float)*ctx.year);
for(i=0;i<ctx.year;i++)
ctx.guarantee[i]=ctx.beta*pow(ctx.guaranteefactor,i+1);
}
void globaldel(){
free(ctx.stock);
free(ctx.chunksize);
free(ctx.guarantee);
}
test.h
float pathvalue(long pathindex);
int haswithdraw(int period);
float getindex(int period, long pathindex);
float qx(int period);
float px(int period);
//***global
struct context{
int year;
int tradingdate0;
float firstpremium;
float riskfreerate;
float volatility;
float personalaccountat0;
float allowed;
float guaranteerate;
float alpha;
float beta;
int tradingdate;
float discountfactor;
float accumulationfactor;
float guaranteefactor;
float upmove;
float downmove;
float* stock;
long* chunksize;
float* guarantee;
};
struct context ctx;
void globalinit();
void globaldel();
EDIT: I converted all the global variables to constants. With year set to 20, the program runs twice as fast (great!). I tried to set the number of threads with OMP_NUM_THREADS=4 ./test, for example, but it didn't give me any performance gain.
Could my gcc have some problem?
test.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <omp.h>
#include "test.h"
int main(){
starttimer();
printf("starting\n");
int i;
float v=0;
#pragma omp parallel for reduction(+:v)
for(i=0;i<numberofpath;i++)
v+=pathvalue(i);
printf("v:%f\nfinished\n",v);
endtimer();
return 0;
}
//function on which openMP is applied
float pathvalue(long pathindex) {
float value = -firstpremium;
float personalaccount = personalaccountat0;
float account = firstpremium;
int i;
for (i = 0; i < year-1; i++) {
value *= accumulationfactor;
double index = getindex(i,pathindex);
account = account * index;
double death = fmaxf(account,guarantee[i]);
value += death;
double withdraw = personalaccount*allowed;
value += withdraw;
personalaccount = fmaxf(personalaccount-withdraw,0);
account = fmaxf(account-withdraw,0);
}
//last year
double index = getindex(year-1,pathindex);
account = account * index;
value+=fmaxf(account,guarantee[year-1]);
return value * discountfactor;
}
float getindex(int period, long pathindex){
int ndx = (pathindex/chunksize[period])%tradingdate;
return stock[ndx];
}
//timing
clock_t begin;
void starttimer(){
begin = clock();
}
void endtimer(){
clock_t end = clock();
double elapsed = (double)(end - begin) / CLOCKS_PER_SEC;
printf("\nelapsed: %f\n",elapsed);
}
test.h
float pathvalue(long pathindex);
int haswithdraw(int period);
float getindex(int period, long pathindex);
float qx(int period);
float px(int period);
//timing
void starttimer();
void endtimer();
//***constant
const int year= 20 ;
const int tradingdate0= 1 ;
const float firstpremium= 1 ;
const float riskfreerate= 0.06 ;
const float volatility= 0.25 ;
const float personalaccountat0= 1 ;
const float allowed= 0.07 ;
const float guaranteerate= 0.03 ;
const float alpha= 1 ;
const float beta= 1 ;
const int tradingdate= 2 ;
const int numberofpath= 1048576 ;
const float discountfactor= 0.301194211912 ;
const float accumulationfactor= 1.06183654655 ;
const float guaranteefactor= 1.03 ;
const float upmove= 1.28402541669 ;
const float downmove= 0.778800783071 ;
const float stock[2]={1.2840254166877414, 0.7788007830714049};
const long chunksize[20]={524288, 262144, 131072, 65536, 32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1};
const float guarantee[20]={1.03, 1.0609, 1.092727, 1.1255088100000001, 1.1592740743, 1.1940522965290001, 1.2298738654248702, 1.2667700813876164, 1.304773183829245, 1.3439163793441222, 1.384233870724446, 1.4257608868461793, 1.4685337134515648, 1.512589724855112, 1.557967416600765, 1.6047064390987882, 1.6528476322717518, 1.7024330612399046, 1.7535060530771016, 1.8061112346694148};

Even if your program benefits from using OpenMP, you won't see it because you are measuring the wrong time.
clock() returns the total CPU time spent in all threads. If you run with four threads and each runs for 1/4 of the time, clock() will still return the same value since 4*(1/4) = 1. You should be measuring the wall-clock time instead.
Replace calls to clock() with omp_get_wtime() or gettimeofday(). They both provide high precision wall-clock timing.
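For example, here is a minimal sketch of the starttimer()/endtimer() helpers from the edited code above, rewritten around omp_get_wtime() (begin_wtime is just an illustrative name):
#include <stdio.h>
#include <omp.h>

/* Sketch: wall-clock versions of the question's starttimer()/endtimer()
   helpers, measuring elapsed real time instead of accumulated CPU time. */
static double begin_wtime;

void starttimer(void){
    begin_wtime = omp_get_wtime();   /* wall-clock seconds */
}

void endtimer(void){
    double elapsed = omp_get_wtime() - begin_wtime;
    printf("\nelapsed (wall clock): %f\n", elapsed);
}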
P.S. Why are there so many people around SO using clock() for timing?

It seems as if it should work. Probably you need to specify the number of threads to use. You can do so by setting the OMP_NUM_THREADS environment variable. For instance, to use 4 threads:
OMP_NUM_THREADS=4 ./test
EDIT: I just compiled the code and I observe significant speedups when changing the number of threads.
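If you want to double-check how many threads actually run (a quick test of my own, not part of the original answer), printing the team size from inside a parallel region works:
#include <stdio.h>
#include <omp.h>

int main(void){
    #pragma omp parallel
    {
        /* Only one thread prints, but omp_get_num_threads() reports
           the size of the whole team. */
        #pragma omp single
        printf("running with %d threads\n", omp_get_num_threads());
    }
    return 0;
}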

I don't see any section in which you're specifying the number of cores OpenMP will use. It's supposed to, by default, use the number of CPUs it sees, but for my purposes, I've always forced it to use as many as I specified.
Add this line before your parallel for construct:
#pragma omp parallel num_threads(num_threads)
{
// Your parallel for follows here
}
...where num_threads is an integer between 1 and the number of cores on your machine.
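Applied to the loop in the question, the thread count can also be attached directly to the combined construct. A sketch (4 threads is just an example; pathvalue() and numberofpath are the names from the question's code):
#include <omp.h>
#include "test.h"   /* pathvalue() and numberofpath from the question's code */

/* Sketch: the question's reduction loop with num_threads given on the
   combined parallel-for construct. */
float sum_paths(void){
    float v = 0;
    int i;
    #pragma omp parallel for reduction(+:v) num_threads(4)
    for (i = 0; i < numberofpath; i++)
        v += pathvalue(i);
    return v;
}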
EDIT: Here's the makefile used to build the code. Place this in a text file named Makefile in the same directory.
test: test.c test.h
cc -o $@ $< -O3 -g3 -fmessage-length=0 -lm -fopenmp -ffast-math

Related

Does mingw32-pthreads-w32 not work on windows properly

I am on a Windows 10 machine with an Intel(R) Core(TM) i5-8265U CPU @ 1.60 GHz, 1800 MHz, 4 core(s), 8 logical processor(s), and 8 GB RAM. I have been running this small OpenMP code to compare the performance of a normal sequential program and an OMP program.
#include<stdio.h>
#include<omp.h>
void normal(unsigned int num_steps){
double step = 1.0/(double)(num_steps);
double sum = 0.0;
double start=omp_get_wtime();
for (long i = 0; i < num_steps;i++){
double x = i * step;
sum += (4.0 / (1.0 + x * x));
}
double pi = step * sum;
double end=omp_get_wtime();
printf("Time taken : %0.9lf\n",end-start);
printf("The value of pi is : %0.9lf\n",pi);
}
void parallel(unsigned int num_steps,unsigned int thread_cnt){
double pi=0.0;
double sum[thread_cnt];
for(unsigned int i=0;i<thread_cnt;i++)
sum[i]=0.0;
omp_set_num_threads(thread_cnt);
double start=omp_get_wtime();
#pragma omp parallel
{
double x;
double sum_temp=0.0;
double step = 1.0 / (double)(num_steps);
int num_threads = omp_get_num_threads();
int thread_no = omp_get_thread_num();
if(thread_no==0){
thread_cnt = num_threads;
printf("Number of threads assigned is : %d\n",num_threads);
}
for (unsigned int i = thread_no; i < num_steps;i+=thread_cnt){
x=(i*step);
sum_temp+=(4.0/(1+x*x))*step;
}
#pragma omp critical
{
sum[thread_no]=sum_temp;
}
}
double end=omp_get_wtime();
printf("Time taken : %0.9lf\n",end-start);
for(unsigned int i=0;i<thread_cnt;i++){
pi+=sum[i];
}
printf("The value of pi is : %0.9lf\n",pi);
}
int main(){
unsigned int num_steps=1000000;
unsigned int thread_cnt=4;
scanf("%d",&thread_cnt);
normal(num_steps);
parallel(num_steps,thread_cnt);
return 0;
}
I am using MinGW's GCC compiler, and to run OpenMP programs, which require the pthread library, I downloaded the mingw32-pthreads-w32 library. So is it not working properly? I don't seem to be able to beat the normal sequential execution despite using so many threads and handling race conditions and false sharing with the critical pragma.
Reference: I have been following the OpenMP playlist on YouTube by Intel.
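For comparison, here is a minimal sketch (my illustration, not from the thread) of the same pi loop written with OpenMP's reduction clause instead of a per-thread partial-sum array and a critical section:
#include <stdio.h>
#include <omp.h>

/* Sketch: reduction-based version of the pi sum; the compiler-managed
   reduction avoids any shared writes inside the loop. */
double pi_reduction(unsigned int num_steps, unsigned int thread_cnt){
    double step = 1.0 / (double)num_steps;
    double sum = 0.0;
    omp_set_num_threads((int)thread_cnt);
    #pragma omp parallel for reduction(+:sum)
    for (long i = 0; i < (long)num_steps; i++){
        double x = i * step;
        sum += 4.0 / (1.0 + x * x);
    }
    return step * sum;
}

int main(void){
    printf("pi is approximately %0.9lf\n", pi_reduction(1000000, 4));
    return 0;
}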

Why does OpenMP speed up a SINGLE-ITERATION loop?

I'm using the "read" benchmark from Why is writing to memory much slower than reading it?, and I added just two lines:
#pragma omp parallel for
for(unsigned dummy = 0; dummy < 1; ++dummy)
They should have no effect, because OpenMP should only parallelize the outer loop, but the code now consistently runs twice as fast.
Update: These lines aren't even necessary. Simply adding
omp_get_num_threads();
(implicitly declared) in the same place has the same effect.
Complete code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
unsigned long do_xor(const unsigned long* p, unsigned long n)
{
unsigned long i, x = 0;
for(i = 0; i < n; ++i)
x ^= p[i];
return x;
}
int main()
{
unsigned long n, r, i;
unsigned long *p;
clock_t c0, c1;
double elapsed;
n = 1000 * 1000 * 1000; /* GB */
r = 100; /* repeat */
p = calloc(n/sizeof(unsigned long), sizeof(unsigned long));
c0 = clock();
#pragma omp parallel for
for(unsigned dummy = 0; dummy < 1; ++dummy)
for(i = 0; i < r; ++i) {
p[0] = do_xor(p, n / sizeof(unsigned long)); /* "use" the result */
printf("%4ld/%4ld\r", i, r);
fflush(stdout);
}
c1 = clock();
elapsed = (c1 - c0) / (double)CLOCKS_PER_SEC;
printf("Bandwidth = %6.3f GB/s (Giga = 10^9)\n", (double)n * r / elapsed / 1e9);
free(p);
}
Compiled and executed with
gcc -O3 -Wall -fopenmp single_iteration.c && time taskset -c 0 ./a.out
The wall time reported by time is 3.4s vs 7.5s.
GCC 7.3.0 (Ubuntu)
The reason for the performance difference is not actually any difference in code, but in how memory is mapped. In the fast case you are reading from zero-pages, i.e. all virtual addresses are mapped to a single physical page - so nothing has to be read from memory. In the slow case, it is not zeroed. For details see this answer from a slightly different context.
On the other hand, it is not caused by calling omp_get_num_threads or by the pragma itself, but merely by linking to the OpenMP runtime library. You can confirm that by using -Wl,--no-as-needed -fopenmp. If you just specify -fopenmp but don't use it at all, the linker will omit it.
Unfortunately, I am still missing the final puzzle piece: why does linking to OpenMP change the behavior of calloc regarding zeroed pages?
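One way to test the zero-page explanation (my own suggestion, not part of the original answer) is to touch the whole buffer right after calloc; once every page is physically backed, the difference between the two builds should disappear:
#include <stdlib.h>
#include <string.h>

/* Sketch: force the calloc'd buffer to be backed by real physical pages
   before timing, so both builds read actual memory rather than a shared
   zero page. Sizes mirror the benchmark above. */
int main(void){
    unsigned long n = 1000UL * 1000 * 1000;   /* ~1 GB, as in the question */
    unsigned long *p = calloc(n / sizeof(unsigned long), sizeof(unsigned long));
    if (p == NULL)
        return 1;
    memset(p, 1, n);                          /* write to every page once */
    /* ... run the do_xor() timing loop from the question here ... */
    free(p);
    return 0;
}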

Do While don't work inside CUDA Kernel

OK, I'm pretty new to CUDA, and I'm kind of lost, really lost.
I'm trying to calculate pi using the Monte Carlo method, and at the end I just get one add instead of 50.
I don't want to use a do-while around the kernel call, since it's too slow. My issue is that my code doesn't loop; it executes only once in the kernel.
And also, I'd like all the threads to access the same niter and pi, so that when some thread hits the counters, all the others would stop.
#define SEED 35791246
__shared__ int niter;
__shared__ double pi;
__global__ void calcularPi(){
double x;
double y;
int count;
double z;
count = 0;
niter = 0;
//keep looping
do{
niter = niter + 1;
//Generate random number
curandState state;
curand_init(SEED,(int)niter, 0, &state);
x = curand(&state);
y = curand(&state);
z = x*x+y*y;
if (z<=1) count++;
pi =(double)count/niter*4;
}while(niter < 50);
}
int main(void){
float tempoTotal;
//Start timer
clock_t t;
t = clock();
//call kernel
calcularPi<<<1,32>>>();
//wait while kernel finish
cudaDeviceSynchronize();
typeof(pi) piFinal;
cudaMemcpyFromSymbol(&piFinal, "pi", sizeof(piFinal),0, cudaMemcpyDeviceToHost);
typeof(niter) niterFinal;
cudaMemcpyFromSymbol(&niterFinal, "niter", sizeof(niterFinal),0, cudaMemcpyDeviceToHost);
//Ends timer
t = clock() - t;
tempoTotal = ((double)t)/CLOCKS_PER_SEC;
printf("Pi: %g \n", piFinal);
printf("Adds: %d \n", niterFinal);
printf("Total time: %f \n", tempoTotal);
}
There are a variety of issues with your code.
I suggest using proper CUDA error checking and running your code with cuda-memcheck to spot any runtime errors. I've omitted proper error checking in my code below for brevity of presentation, but I've run it with cuda-memcheck to verify that there are no runtime errors.
Your usage of curand() is probably not correct (it returns integers over a large range). For this code to work correctly, you want a floating-point quantity between 0 and 1. The correct call for that is curand_uniform().
Since you want all threads to work on the same values, you must prevent those threads from stepping on each other. One way to do that is to use atomic updates of the variables in question.
It should not be necessary to re-run curand_init on each iteration. Once per thread should be sufficient.
We don't use cudaMemcpy..Symbol operations on __shared__ variables. For convenience, and to preserve something that resembles your original code, I've elected to convert those to __device__ variables.
Here's a modified version of your code that has most of the above issues fixed:
$ cat t978.cu
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#define ITER_MAX 5000
#define SEED 35791246
__device__ int niter;
__device__ int count;
__global__ void calcularPi(){
double x;
double y;
double z;
int lcount;
curandState state;
curand_init(SEED,threadIdx.x, 0, &state);
//keep looping
do{
lcount = atomicAdd(&niter, 1);
//Generate random number
x = curand_uniform(&state);
y = curand_uniform(&state);
z = x*x+y*y;
if (z<=1) atomicAdd(&count, 1);
}while(lcount < ITER_MAX);
}
int main(void){
float tempoTotal;
//Start timer
clock_t t;
t = clock();
int count_final = 0;
int niter_final = 0;
cudaMemcpyToSymbol(niter, &niter_final, sizeof(int));
cudaMemcpyToSymbol(count, &count_final, sizeof(int));
//call kernel
calcularPi<<<1,32>>>();
//wait while kernel finish
cudaDeviceSynchronize();
cudaMemcpyFromSymbol(&count_final, count, sizeof(int));
cudaMemcpyFromSymbol(&niter_final, niter, sizeof(int));
//Ends timer
double pi = count_final/(double)niter_final*4;
t = clock() - t;
tempoTotal = ((double)t)/CLOCKS_PER_SEC;
printf("Pi: %g \n", pi);
printf("Adds: %d \n", niter_final);
printf("Total time: %f \n", tempoTotal);
}
$ nvcc -o t978 t978.cu -lcurand
$ cuda-memcheck ./t978
========= CUDA-MEMCHECK
Pi: 3.12083
Adds: 5032
Total time: 0.558463
========= ERROR SUMMARY: 0 errors
$
I've modified the iterations to a larger number, but you can use 50 if you want for ITER_MAX.
Note that there are many criticisms that could be levelled against this code. My aim here, since it's clearly a learning exercise, is to point out what the minimum number of changes could be to get a functional code, using the algorithm you've outlined. As just one example, you might want to change your kernel launch config (<<<1,32>>>) to other, larger numbers, in order to more fully utilize the GPU.

I need help to parallelize this code using OpenMP

I wrote a C code that I would like to parallelize using OpenMP (I am a beginner and I have just a few days to solve this task). Let's start from the main: first of all I initialize 6 vectors (Vx, Vy, Vz, thetap, phip, theta); then there is a for loop that cycles over Nmax; inside this loop I allocate some memory for the structure defined at the very top of the code; the structure is called coll_CPU and increases its size every cycle; then I pick some of the values from the vectors mentioned before and place them into the structure. At this point my structure coll_CPU is filled with Ncoll elements. During this process I use some of the functions declared outside of the main (these functions are random number generators).
Now comes the important part: in my serial code I use a for loop to pass every single element of the structure to a function called CollisioniCPU (this function just takes the inputs and multiplies them by 2). My goal is to parallelize this loop so that each of my CPUs contributes to this operation and speeds up the process.
Here are the codes:
main.c
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <memory.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#define pi2 6.283185307
#define pi 3.141592654
#define IMUL(a,b) __mul24(a,b)
typedef struct {
int seme;
} iniran;
typedef struct{
int jp1;
int jp2;
float kx;
float ky;
float kz;
float vAx;
float vAy;
float vAz;
float vBx;
float vBy;
float vBz;
float tetaAp;
float phiAp;
float tetaA;
float tetaBp;
float phiBp;
float tetaB;
float kAx;
float kAy;
float kAz;
float kBx;
float kBy;
float kBz;
int caso;
} stato_struct;
stato_struct *coll_CPU=0;
unsigned int timer;
#include "DSMC_kernel_float.c"
//=============================================================
float min(float *a, float*b){
if(*a<*b){
return *a;
}
else{
return *b;
}
}
//=============================================================
float max(float *a, float*b){
if(*a>*b){
return *a;
}
else{
return *b;
}
}
//=============================================================
float rf(int *idum){
static int iff=0;
static int inext, inextp, ma[55];
int mj, mk;
int i, k, ii;
float ret_val;
if (*idum<0 || iff==0) {
iff=1;
mj=161803398 - abs(*idum);
mj %= 1000000000;
ma[54]=mj;
mk=1;
for (i=1; i<=54; ++i){
ii=(i*21)%55;
ma[ii-1]=mk;
mk=mj-mk;
if (mk<0) {
mk += 1000000000;
}
mj= ma[ii-1];
}
for(k=1; k<=4; ++k) {
for(i=1; i<=55; ++i){
ma[i-1] -= ma[(i+30)%55];
if (ma[i-1]<0){
ma[i-1] += 1000000000;
}
}
}
inext=0;
inextp=31;
*idum=1;
}
++inext;
if (inext==56){
inext=1;
}
++inextp;
if (inextp==56){
inextp=1;
}
mj=ma[inext-1]-ma[inextp-1];
if (mj<0){
mj += 1000000000;
}
ma[inext-1]=mj;
ret_val=mj*1.0000000000000001e-9;
return ret_val;
}
//============================================================
int genk(float *kx, float *ky, float *kz, int *p2seme){
// float sqrtf(float), sinf(float), cosf(float);
extern float rf(int *);
static float phi;
*kx=rf(p2seme) * 2. -1.f;
*ky= sqrtf(1. - *kx * *kx);
phi=pi2*rf(p2seme);
*kz=*ky * sinf(phi);
*ky *= cosf(phi);
return 0;
}
//==============================================================
int main(void){
float msec_kernel;
int Np=10000, Nmax=512;
int id,jp,jcoll,Ncoll,jp1, jp2, ind;
float Vx[Np],Vy[Np],Vz[Np],teta[Np],tetap[Np],phip[Np];
float kx, ky, kz, Vrx, Vry, Vrz, scalprod, fk;
float kAx, kAy, kAz, kBx, kBy, kBz;
iniran1.seme=7593;
for(jp=1;jp<=Np;jp++){
if(jp<=Np/2){
Vx[jp-1]=2.5;
Vy[jp-1]=0;
Vz[jp-1]=0;
tetap[jp-1]=0;
phip[jp-1]=0;
teta[jp-1]=0;
}
for (Ncoll=1;Ncoll<=Nmax;Ncoll += 10){
coll_CPU=(stato_struct*) malloc(Ncoll*sizeof(stato_struct));
jcoll=0;
while (jcoll<Ncoll){
jp1=1+floorf(Np*rf(&iniran1.seme));
jp2=1+floorf(Np*rf(&iniran1.seme));
genk(&kx,&ky,&kz,&iniran1.seme);
Vrx=Vx[jp2-1]-Vx[jp1-1];
Vry=Vy[jp2-1]-Vy[jp1-1];
Vrz=Vz[jp2-1]-Vz[jp1-1];
scalprod=Vrx*kx+Vry*ky+Vrz*kz;
if (scalprod<0) {
genk(&kAx,&kAy,&kAz,&iniran1.seme);
genk(&kBx,&kBy,&kBz,&iniran1.seme);
coll_CPU[jcoll].jp1= jp1;
coll_CPU[jcoll].jp2=jp2;
coll_CPU[jcoll].kx=kx;
coll_CPU[jcoll].ky=ky;
coll_CPU[jcoll].kz=kz;
coll_CPU[jcoll].vAx=Vx[jp1-1];
coll_CPU[jcoll].vAy=Vy[jp1-1];
coll_CPU[jcoll].vAz=Vz[jp1-1];
coll_CPU[jcoll].vBx=Vx[jp2-1];
coll_CPU[jcoll].vBy=Vy[jp2-1];
coll_CPU[jcoll].vBz=Vz[jp2-1];
coll_CPU[jcoll].tetaAp=tetap[jp1-1];
coll_CPU[jcoll].phiAp=phip[jp1-1];
coll_CPU[jcoll].tetaA=teta[jp1-1];
coll_CPU[jcoll].tetaBp=tetap[jp2-1];
coll_CPU[jcoll].phiBp=phip[jp2-1];
coll_CPU[jcoll].tetaB=teta[jp2-1];
coll_CPU[jcoll].kAx=kAx;
coll_CPU[jcoll].kAy=kAy;
coll_CPU[jcoll].kAz=kAz;
coll_CPU[jcoll].kBx=kBx;
coll_CPU[jcoll].kBy=kBy;
coll_CPU[jcoll].kBz=kBz;
coll_CPU[jcoll].caso=1;
jcoll++;
}
}
clock_t t;
t = clock();
#pragma omp parallel for private(id) //HERE IS WHERE I TRIED TO DO THE PARALLELIZATION BUT WITH NO SUCCESS. WHAT DO I HAVE TO TYPE INSTEAD???
for(id=0;id<Nmax;id++){
CollisioniCPU(coll_CPU,id);
}
t = clock() - t;
msec_kernel = ((float)t*1000)/CLOCKS_PER_SEC;
printf("Tempo esecuzione kernel:%e s\n",msec_kernel*1e-03);
for (ind=0;ind<Ncoll;ind++){
if (coll_CPU[ind].caso==4)
Ncoll_eff++;
else if (coll_CPU[ind].caso==0)
Ncoll_div++;
else
Ncoll_dim++;
}
free(coll_CPU);
}
return 0;
}
DSMC_kernel_float.c
void CollisioniCPU(stato_struct *coll_CPU, int id){
float vettA[6], vettB[6];
vettA[0]=coll_CPU[id].vAx;
vettA[1]=coll_CPU[id].vAy;
vettA[2]=coll_CPU[id].vAz;
vettA[3]=coll_CPU[id].tetaAp;
vettA[4]=coll_CPU[id].phiAp;
vettA[5]=coll_CPU[id].tetaA;
vettB[0]=coll_CPU[id].vBx;
vettB[1]=coll_CPU[id].vBy;
vettB[2]=coll_CPU[id].vBz;
vettB[3]=coll_CPU[id].tetaBp;
vettB[4]=coll_CPU[id].phiBp;
vettB[5]=coll_CPU[id].tetaB;
coll_CPU[id].vAx=2*vettA[0];
coll_CPU[id].vAy=2*vettA[1];
coll_CPU[id].vAz=2*vettA[2];
coll_CPU[id].tetaAp=2*vettA[3];
coll_CPU[id].phiAp=2*vettA[4];
coll_CPU[id].tetaA=2*vettA[5];
coll_CPU[id].vBx=2*vettB[0];
coll_CPU[id].vBy=2*vettB[1];
coll_CPU[id].vBz=2*vettB[2];
coll_CPU[id].tetaBp=2*vettB[3];
coll_CPU[id].phiBp=2*vettB[4];
coll_CPU[id].tetaB=2*vettB[5];
}
In order to compile the program I type this line in the terminal: gcc -fopenmp time_analysis.c -o time_analysis -lm, followed by export OMP_NUM_THREADS=1. However, once I run the executable I get this error message:
Error in `./time_analysis': double free or corruption (!prev): 0x00000000009602c0 ***
Aborted
What does this error mean? What have I done wrong in the main function when I tried to parallelize the for loop? And most important: what should I type instead in order to make my code run in parallel? Please help me out if you can, because I seriously have no time to study OpenMP from scratch and I need to get this job done right away.
Changing the inner loop as follows should bring you one step further.
#pragma omp parallel for private(id)
for(id=0;id<Ncoll;id++){
CollisioniCPU(coll_CPU,id);
}
Your OpenMP line seems okay, but I doubt that it will lead to significant improvements in runtime. You should optimize the surrounding code as well. Allocating the memory once outside of your loops would be a good start.
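A sketch of what hoisting that allocation might look like (my illustration, reusing the declarations from the question's main and sizing the buffer for the largest Ncoll):
/* Sketch: allocate the collision buffer once for the largest Ncoll instead
   of calling malloc on every iteration of the outer loop. */
coll_CPU = malloc(Nmax * sizeof(stato_struct));
if (coll_CPU == NULL)
    return 1;                                  /* allocation failed */

for (Ncoll = 1; Ncoll <= Nmax; Ncoll += 10) {
    /* ... fill coll_CPU[0..Ncoll-1] exactly as in the question ... */

    #pragma omp parallel for private(id)
    for (id = 0; id < Ncoll; id++)
        CollisioniCPU(coll_CPU, id);

    /* ... tally the results for this Ncoll ... */
}
free(coll_CPU);                                /* freed once, at the end */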
By the way, is there any reason for this verbose coding style rather than a more compact and readable version like this one?
void CollisioniCPU(stato_struct *coll_CPU, int id) {
stato_struct *ptr = coll_CPU + id;
ptr->vAx *= 2;
ptr->vAy *= 2;
ptr->vAz *= 2;
ptr->tetaAp *= 2;
ptr->phiAp *= 2;
ptr->tetaA *= 2;
ptr->vBx *= 2;
ptr->vBy *= 2;
ptr->vBz *= 2;
ptr->tetaBp *= 2;
ptr->phiBp *= 2;
ptr->tetaB *= 2;
}

Why won't GCC auto-vectorize this loop?

I have the following C program (a simplification of my actual use case which exhibits the same behavior)
#include <stdlib.h>
#include <math.h>
int main(int argc, char ** argv) {
const float * __restrict__ const input = malloc(20000*sizeof(float));
float * __restrict__ const output = malloc(20000*sizeof(float));
unsigned int pos=0;
while(1) {
unsigned int rest=100;
for(unsigned int i=pos;i<pos+rest; i++) {
output[i] = input[i] * 0.1;
}
pos+=rest;
if(pos>10000) {
break;
}
}
}
When I compile with
-O3 -g -Wall -ftree-vectorizer-verbose=5 -msse -msse2 -msse3 -march=native -mtune=native --std=c99 -fPIC -ffast-math
I get the output
main.c:10: note: not vectorized: unhandled data-ref
where 10 is the line of the inner for loop. When I looked up why it might say this, it seemed to be saying that the pointers could be aliased, but they can't be in my code, as I have the __restrict keyword. They also suggested including the -msse flags, but they don't seem to do anything either. Any help?
It certainly seems like a bug. In the following, equivalent functions, foo() is vectorised but bar() is not, when compiling for an x86-64 target:
void foo(const float * restrict input, float * restrict output)
{
unsigned int pos;
for (pos = 0; pos < 10100; pos++)
output[pos] = input[pos] * 0.1;
}
void bar(const float * restrict input, float * restrict output)
{
unsigned int pos;
unsigned int i;
for (pos = 0; pos <= 10000; pos += 100)
for (i = 0; i < 100; i++)
output[pos + i] = input[pos + i] * 0.1;
}
Adding the -m32 flag, to compile for an x86 target instead, causes both functions to be vectorised.
It doesn't like the outer loop format, which prevents it from understanding the inner loop. I can get it to vectorize if I just fold it into a single loop:
#include <stdlib.h>
#include <math.h>
int main(int argc, char ** argv) {
const float * __restrict__ input = malloc(20000*sizeof(float));
float * __restrict__ output = malloc(20000*sizeof(float));
for(unsigned int i=0; i<=10100; i++) {
output[i] = input[i] * 0.1f;
}
}
(note that I didn't think too hard about how to properly translate the pos+rest limit into a single for loop condition, it may be wrong)
You might be able to take advantage of this by putting a simplified inner loop into a function which you call with pointers and a count. Even when it is inlined again it may work fine. This is assuming you deleted parts of your while() loop that I have just simplified away but you need to retain.
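Here is a sketch of that idea (my illustration, reusing the question's sizes; the buffers are left uninitialized, as in the question's benchmark):
#include <stdlib.h>

/* Sketch: the inner loop moved into a small helper with restrict-qualified
   parameters; the outer while() passes a pointer offset and a count. */
static void scale_block(const float *restrict in, float *restrict out,
                        unsigned int n)
{
    for (unsigned int i = 0; i < n; i++)
        out[i] = in[i] * 0.1f;
}

int main(void)
{
    float *input = malloc(20000 * sizeof(float));
    float *output = malloc(20000 * sizeof(float));
    unsigned int pos = 0;

    while (1) {
        unsigned int rest = 100;
        scale_block(input + pos, output + pos, rest);   /* simplified inner loop */
        pos += rest;
        if (pos > 10000)
            break;
    }

    free(output);
    free(input);
    return 0;
}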
try:
const float * __restrict__ input = ...;
float * __restrict__ output = ...;
experiment a bit by changing things around:
#include <stdlib.h>
#include <math.h>
int main(int argc, char ** argv) {
const float * __restrict__ input = new float[20000];
float * __restrict__ output = new float[20000];
unsigned int pos=0;
while(1) {
unsigned int rest=100;
output += pos;
input += pos;
for(unsigned int i=0;i<rest; ++i) {
output[i] = input[i] * 0.1;
}
pos+=rest;
if(pos>10000) {
break;
}
}
}
g++ -O3 -g -Wall -ftree-vectorizer-verbose=7 -msse -msse2 -msse3 -c test.cpp
test.cpp:14: note: versioning for alias required: can't determine dependence between *D.4096_24 and *D.4095_21
test.cpp:14: note: mark for run-time aliasing test between *D.4096_24 and *D.4095_21
test.cpp:14: note: Alignment of access forced using versioning.
test.cpp:14: note: Vectorizing an unaligned access.
test.cpp:14: note: vect_model_load_cost: unaligned supported by hardware.
test.cpp:14: note: vect_model_load_cost: inside_cost = 2, outside_cost = 0 .
test.cpp:14: note: vect_model_simple_cost: inside_cost = 2, outside_cost = 0 .
test.cpp:14: note: vect_model_simple_cost: inside_cost = 2, outside_cost = 1 .
test.cpp:14: note: vect_model_simple_cost: inside_cost = 1, outside_cost = 0 .
test.cpp:14: note: vect_model_store_cost: inside_cost = 1, outside_cost = 0 .
test.cpp:14: note: cost model: Adding cost of checks for loop versioning to treat misalignment.
test.cpp:14: note: cost model: Adding cost of checks for loop versioning aliasing.
test.cpp:14: note: Cost model analysis:
Vector inside of loop cost: 8
Vector outside of loop cost: 6
Scalar iteration cost: 5
Scalar outside cost: 1
prologue iterations: 0
epilogue iterations: 0
Calculated minimum iters for profitability: 2
test.cpp:14: note: Profitability threshold = 3
test.cpp:14: note: Vectorization may not be profitable.
test.cpp:14: note: create runtime check for data references *D.4096_24 and *D.4095_21
test.cpp:14: note: created 1 versioning for alias checks.
test.cpp:14: note: LOOP VECTORIZED.
test.cpp:4: note: vectorized 1 loops in function.
Compilation finished at Wed Feb 16 19:17:59
