How can this C code be parallelized using OpenMP?

This is the C code I want to speed up with OpenMP directives; the structure of the file used by the program is shown below the code.
I've tried:
#pragma omp parallel for reduction (+: hn_out, y_out) private (k,g) shared (y_out_avg, y_exp_avg)
but it doesn't work: the result is wrong and different from the serial one.
I think there is a logical error in the parallelization, i.e. this algorithm has to be parallelized in another way.
// FEEDFORWARD AND BACKPROPAGATION ALGORITHM
// WITH IMPLEMENTATION OF BATCH TECHNIQUE
// compute the error over a batch of 5 inputs and then propagate the error; useful for the parallelization.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#define INPUTN 3 // number of neurons in the input layer
#define HN 3 // number of neurons in the hidden layer
#define OUTN 1 // number of neurons in the output layer
#define DATANUM 1000 // number of training samples
#define EPOCHS 1000
#define BATCH_SIZE 20
typedef struct DataS{
double input[INPUTN];
double teach;
}DataS;
int main(){
double alpha = 0.0000001; //learning rate
double hn_out[HN];
double price_M;
double y_out = 0.0;
double error; //loss function
int k,g;
double delta_y;
double delta_w[HN][INPUTN];
double delta_b[HN];
DataS data[DATANUM];
double w[HN][INPUTN];
double v[HN];
double b[HN];
FILE *fp1;
double relative_err = 0;
double y_avg = 0.0;
double y_out_avg = 0.0;
double y_exp_avg = 0.0;
//weights initialization
for(int i=0; i<HN; i++){
v[i]= 1.0;
for(int j=0; j<INPUTN; j++)
w[i][j]= 1.0;
b[i]=0.0;
}
//get Dataset
fp1 = fopen("Dataset_3.txt", "r");
if(fp1 == NULL)
{
printf("cannot open file");
exit(1);
}
for(int i=0;i<DATANUM; i++){
fscanf(fp1, "%lf\t%lf\t%lf\t%lf", &data[i].input[0], &data[i].input[1], &data[i].input[2], &data[i].teach);
printf("%lf\t%lf\t%lf\t%lf\n", data[i].input[0], data[i].input[1], data[i].input[2], data[i].teach);
y_avg += data[i].teach/DATANUM;
}
fclose(fp1);
//START ALGORITHM
double ti = omp_get_wtime(); //initial time
for (int i = 0; i < EPOCHS; i ++) {
printf("\nepoch %d) ", i);
relative_err=0;
#pragma omp parallel for reduction (+: hn_out, y_out) private (k,g) shared (y_out_avg, y_exp_avg)
for(int j=0; j<DATANUM/BATCH_SIZE; j++){
//FEEDFORWARD
//compute hn_out[HN]
int base = j*BATCH_SIZE;
printf("Avg of data:");
for(int i_b=0; i_b<BATCH_SIZE; i_b++){
printf(" %d", base+i_b);
for(k=0; k<HN; k++){
hn_out[k]= 0.0;
}
for(k=0; k<HN; k++){
for(g=0; g<INPUTN; g++){
hn_out[k]+= w[k][g]*data[base+i_b].input[g];
}
hn_out[k]+= b[k];
}
//compute y_out[OUTN]
y_out= 0.0;
for(g=0; g<HN; g++){
y_out += hn_out[g]*v[g];
}
y_out = y_out/HN;
y_out_avg += y_out/BATCH_SIZE;
y_exp_avg += data[base+i_b].teach/BATCH_SIZE;
}
//LOSS FUNCTION
error = pow((y_out_avg-y_exp_avg),2);
printf("\nESTIM_AVG\tREAL_AVG\tRELATIVE_ERROR");
relative_err = fabs((y_out_avg-y_exp_avg)/y_avg); //relative error: (computed price - expected price) / average expected value
printf("\n%lf\t%lf\t%lf\n", y_out_avg, y_exp_avg, relative_err);
//BACKPROPAGATION
//update bias and weight
for(k=0;k<HN;k++){
for(g=0; g<INPUTN; g++){
w[k][g] = w[k][g]-2*alpha*data[j].input[g]*(y_out_avg-y_exp_avg);
v[g]= v[g]-2*alpha*(y_out_avg-y_exp_avg);
}
b[k]= b[k]-2*alpha*(y_out_avg-y_exp_avg);
//b[k]= 0;
}
y_out_avg = 0.0;
y_exp_avg = 0.0;
}
}
double tf = omp_get_wtime(); //final time
double time = tf - ti; //effective time for the execution
printf ("Elapsed time: %lf\n", time);
return 0;
}
The program uses a file "Dataset_3.txt" with 1000 rows of data; here is an example of 10 rows.
You can copy and paste these to create a file of 1000 rows, or edit the code so it runs correctly.
121.3168139 6.873759459 7 322386.5042
99.60902165 4.63043755 7 284554.0498
135.7221604 6.663354979 4 284796.0999
133.7192657 3.496973506 7 343977.1519
155.0125801 2.259712681 8 390169.2343
152.0527816 3.643403786 4 309419.1429
64.71485146 5.10618215 7 235827.262
130.6841885 5.405015338 4 280079.0986
56.36704 1.557336041 5 193401.2459
96.33489022 2.840480371 4 234694.1379
I need some help speeding up the program's execution with OpenMP.

The level at which you placed your OpenMP directive isn't the right one, as there are too many things in the j loop that are not meant to be executed in parallel.
However, you can consider parallelizing the i_b loop.
For that loop, a good starting point would be, for example:
#pragma omp parallel for reduction(+:y_out_avg,y_exp_avg) private(k,g,y_out,hn_out)
If/when you're happy with the correctness of the code and want to go further with the parallelisation, you can then consider the "BACKPROPAGATION" loops and see what could be done there...
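As an illustration only (not a verified drop-in), this is roughly how that directive could look when applied to the batch loop from the question, with the per-sample printf calls removed since output from inside a parallel region would interleave anyway; all variable names are the asker's own:
#pragma omp parallel for reduction(+:y_out_avg,y_exp_avg) private(k,g,y_out,hn_out)
for(int i_b=0; i_b<BATCH_SIZE; i_b++){
    // hn_out and y_out are per-thread scratch thanks to the private clause
    for(k=0; k<HN; k++){
        hn_out[k] = 0.0;
    }
    for(k=0; k<HN; k++){
        for(g=0; g<INPUTN; g++){
            hn_out[k] += w[k][g]*data[base+i_b].input[g];
        }
        hn_out[k] += b[k];
    }
    y_out = 0.0;
    for(g=0; g<HN; g++){
        y_out += hn_out[g]*v[g];
    }
    y_out = y_out/HN;
    y_out_avg += y_out/BATCH_SIZE;            // combined across threads by the reduction
    y_exp_avg += data[base+i_b].teach/BATCH_SIZE;
}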

Related

Why am I not getting the same estimation of PI using a parallelized (OpenMP) algorithm copied from working code?

The code below is a direct translation from a YouTube video on estimating PI using OpenMP and Monte Carlo. Even with the same inputs I'm not getting their output; in fact, I get roughly half the value.
int main() {
int num; // number of iterations
printf("Enter number of iterations you want the loop to run for: ");
scanf_s("%d", &num);
double x, y, z, pi;
long long int i;
int count = 0;
int num_thread;
printf("Enter number of threads you want to run to parallelize the process:\t");
scanf_s("%d", &num_thread);
printf("\n");
#pragma omp parallel firstprivate(x,y,z,i) shared(count) num_threads(num_thread)
{
srand((int)time(NULL) ^ omp_get_thread_num());
for (i = 0; i < num; i++) {
x = (double)rand() / (double)RAND_MAX;
y = (double)rand() / (double)RAND_MAX;
z = pow(((x * x) + (y * y)), .5);
if (z <= 1) {
count++;
}
}
} // END PRAGMA
pi = ((double)count / (double)(num * num_thread)) * 4;
printf("The value of pi obtained is %f\n", pi);
return 0;
}
I've also used a similar algorithm straight from the Oak Ridge National Laboratory's website (https://www.olcf.ornl.gov/tutorials/monte-carlo-pi/):
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
int main(int argc, char* argv[])
{
int niter = 1000000; //number of iterations per FOR loop
double x,y; //x,y value for the random coordinate
int i; //loop counter
int count=0; //Count holds all the number of how many good coordinates
double z; //Used to check if x^2+y^2<=1
double pi; //holds approx value of pi
int numthreads = 16;
#pragma omp parallel firstprivate(x, y, z, i) shared(count) num_threads(numthreads)
{
srandom((int)time(NULL) ^ omp_get_thread_num()); //Give random() a seed value
for (i=0; i<niter; ++i) //main loop
{
x = (double)random()/RAND_MAX; //gets a random x coordinate
y = (double)random()/RAND_MAX; //gets a random y coordinate
z = sqrt((x*x)+(y*y)); //Checks to see if number is inside unit circle
if (z<=1)
{
++count; //if it is, consider it a valid random point
}
}
//print the value of each thread/rank
}
pi = ((double)count/(double)(niter*numthreads))*4.0;
printf("Pi: %f\n", pi);
return 0;
}
And I have the exact same problem, so I think it isn't the code but somehow my machine.
I am running Visual Studio 2022 on Windows 11 with a 16-core i9-12900KF and 32 GB of RAM.
Edit: I forgot to mention I did alter the second algorithm to use srand() and rand() instead.
There are many errors in the code:
As pointed out by @JeromeRichard and @JohnBollinger, rand/srand/random are not thread-safe; you should use a thread-safe solution.
There is a race condition at line ++count; (different threads read and write a shared variable). You should use reduction to avoid it.
The code assumes that you use numthreads threads, but OpenMP does not guarantee that you actually got all of the threads you requested. I think that if you got PI/2 as a result, the problem is the difference between the requested and obtained number of threads. If you use #pragma omp parallel for... before the loop, you do not need any assumptions about the number of threads (i.e. in this case the equation to calculate PI does not contain the number of threads).
A minor comment is that you do not need to use the time-consuming pow function.
Putting it together, your code should be something like this:
#pragma omp parallel for reduction(+:count) num_threads(num_thread)
for (long long int i = 0; i < num; i++) {
const double x = threadsafe_random_number_between_0_1();
const double y = threadsafe_random_number_between_0_1();
const double z = x * x + y * y;
if (z <= 1) {
count++;
}
}
double pi = ((double) count / (double) num ) * 4.0;
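For completeness, one way to fill in threadsafe_random_number_between_0_1() is rand_r with a per-thread seed. This is only a sketch under the assumption that a POSIX environment is available (rand_r is not provided by MSVC), and rand_r is not a high-quality generator, so treat it as one possible implementation rather than the recommended one:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
    const long long num = 100000000;   // number of samples, arbitrary for this sketch
    long long count = 0;

    #pragma omp parallel reduction(+:count)
    {
        // one independent seed per thread, so rand_r never shares state between threads
        unsigned int seed = (unsigned int)time(NULL) ^ (unsigned int)omp_get_thread_num();

        #pragma omp for
        for (long long i = 0; i < num; i++) {
            double x = (double)rand_r(&seed) / RAND_MAX;
            double y = (double)rand_r(&seed) / RAND_MAX;
            if (x * x + y * y <= 1.0)
                count++;
        }
    }

    printf("Pi: %f\n", 4.0 * (double)count / (double)num);
    return 0;
}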
One assumption, though I may be wrong: you initialise random with the time, so it may happen that different threads use the same time, which may result in the same random numbers being generated, and then the result will be really bad because you get the same values multiple times. This is a problem with the Monte Carlo method, where two identical points will give a wrong result.

Parallel computing using multiple cores with Open-MP.

I am struggling to figure out how to parallelize this code with OpenMP; any help is appreciated. Below is the base code and a description.
In the simulation of a collection of soft particles (such as proteins in a fluid), there is a repulsive force between a pair of particles when they overlap. The goal of this assignment is to use parallel computing to accelerate the computation of these repulsive forces, using multiple cores with Open-MP.
In the force repulsion function, the particles are assumed to have unit radius. The particles are in a “simulation box” of dimensions L × L × L. The dimension L is chosen such that the volume fraction of particles is φ = 0.3. The simulation box has periodic (wrap-around) boundary conditions, which explains why we need to use the remainder function to compute the distance between two particles. If the particles overlap, i.e., the distance s between two particles is less than 2, then the repulsive force is proportional to k(2−s) where k is a force constant. The force is along the vector joining the two particles.
Write a program that tests the correctness of your code. This can be done by computing the correct forces and comparing them to the forces computed by your optimized code. Give evidence in your report that your program works correctly using your test program.
How much faster is your accelerated code compared to the provided baseline code? Include timings for different problem sizes. Be sure to include a listing of your code in your report.
Code to parallelize
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
double get_walltime() {
struct timeval tp;
gettimeofday(&tp, NULL);
return (double) (tp.tv_sec + tp.tv_usec*1e-6);
}
void force_repulsion(int np, const double *pos, double L, double krepulsion, double *forces)
{
int i, j;
double posi[4];
double rvec[4];
double s2, s, f;
// initialize forces to zero
for (i=0; i<3*np; i++)
forces[i] = 0.;
// loop over all pairs
for (i=0; i<np; i++)
{
posi[0] = pos[3*i ];
posi[1] = pos[3*i+1];
posi[2] = pos[3*i+2];
for (j=i+1; j<np; j++)
{
// compute minimum image difference
rvec[0] = remainder(posi[0] - pos[3*j ], L);
rvec[1] = remainder(posi[1] - pos[3*j+1], L);
rvec[2] = remainder(posi[2] - pos[3*j+2], L);
s2 = rvec[0]*rvec[0] + rvec[1]*rvec[1] + rvec[2]*rvec[2];
if (s2 < 4)
{
s = sqrt(s2);
rvec[0] /= s;
rvec[1] /= s;
rvec[2] /= s;
f = krepulsion*(2.-s);
forces[3*i ] += f*rvec[0];
forces[3*i+1] += f*rvec[1];
forces[3*i+2] += f*rvec[2];
forces[3*j ] += -f*rvec[0];
forces[3*j+1] += -f*rvec[1];
forces[3*j+2] += -f*rvec[2];
}
}
}
}
int main(int argc, char *argv[]) {
int i;
int np = 100; // default number of particles
double phi = 0.3; // volume fraction
double krepulsion = 125.; // force constant
double *pos; double *forces;
double L, time0 , time1;
if (argc > 1)
np = atoi(argv[1]);
L = pow(4./3.*3.1415926536*np/phi, 1./3.);
// generate random particle positions inside simulation box
forces = (double *) malloc(3*np*sizeof(double));
pos = (double *) malloc(3*np*sizeof(double));
for (i=0; i<3*np; i++)
pos[i] = rand()/(double)RAND_MAX*L;
// measure execution time of this function
time0 = get_walltime ();
force_repulsion(np, pos, L, krepulsion, forces);
time1 = get_walltime ();
printf("number of particles: %d\n", np);
printf("elapsed time: %f\n", time1-time0);
free(forces);
free(pos);
return 0;
}
Theoretically, it would be as simple as this:
void force_repulsion(int np, const double *pos, double L, double krepulsion,
double *forces)
{
// initialize forces to zero
#pragma omp parallel for
for (int i = 0; i < 3 * np; i++)
forces[i] = 0.;
// loop over all pairs
#pragma omp parallel for
for (int i = 0; i < np; i++)
{
double posi[4];
double rvec[4];
double s2, s, f;
posi[0] = pos[3 * i];
//...
Compilation:
g++ -fopenmp example.cc -o example
Note that I did not check for correctness. Make sure you don't have variables declared outside the parallel for being used as scratch inside it (that is why I moved the declarations of posi, rvec, s2, s and f into the loop in the updated code).
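One thing the simple version above does not address: with the outer i loop parallelised, two threads can update forces[3*j], forces[3*j+1], forces[3*j+2] for the same particle at the same time, which is a data race. Below is a minimal sketch of one way to make the accumulation safe, using atomic updates (per-thread force buffers would usually be faster, but take more code); the function name force_repulsion_omp is my own, and the includes and -fopenmp flag from the original code are assumed:
void force_repulsion_omp(int np, const double *pos, double L,
                         double krepulsion, double *forces)
{
    // initialize forces to zero
    #pragma omp parallel for
    for (int i = 0; i < 3 * np; i++)
        forces[i] = 0.;

    // loop over all pairs; dynamic scheduling because the inner loop shrinks with i
    #pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < np; i++)
    {
        for (int j = i + 1; j < np; j++)
        {
            // minimum image difference under periodic boundaries
            double rvec[3];
            rvec[0] = remainder(pos[3*i]   - pos[3*j],   L);
            rvec[1] = remainder(pos[3*i+1] - pos[3*j+1], L);
            rvec[2] = remainder(pos[3*i+2] - pos[3*j+2], L);
            double s2 = rvec[0]*rvec[0] + rvec[1]*rvec[1] + rvec[2]*rvec[2];
            if (s2 < 4.)
            {
                double s = sqrt(s2);
                double f = krepulsion * (2. - s) / s;  // folds the 1/s normalisation of rvec in
                for (int d = 0; d < 3; d++)
                {
                    #pragma omp atomic
                    forces[3*i + d] += f * rvec[d];
                    #pragma omp atomic
                    forces[3*j + d] -= f * rvec[d];
                }
            }
        }
    }
}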

What to heed when reading an array from multiple threads?

I'd like to get to know OpenMP a bit, because I'd like to parallelize a huge loop. After some reading (SO, common OMP mistakes, tutorials, etc.), I've taken as a first step the basically working C/MEX code given below (which yields different results for the first test case).
The first test sums up result values (functions serial, parallel);
the second takes values from an input array and writes the processed values to an output array (functions serial_a, parallel_a).
My questions are:
Why do the results of the first test differ, i.e. the results of the serial and parallel functions?
Surprisingly, the second test succeeds. My concern is how to handle memory (array locations) that may be read by multiple threads; in the example this is emulated by sin(a[i]) / cos(a[n-i]+1.0).
Are there some easy rules for determining which variables to declare as private, shared, or reduction?
In both cases int i is declared outside the pragma, yet the second test appears to yield correct results. So is that okay, or does i have to be moved into the omp parallel region, as is said here?
Any other hints on spotted mistakes?
Code
#include "mex.h"
#include <math.h>
#include <omp.h>
#include <time.h>
double serial(int x)
{
double sum=0;
int i;
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
return sum;
}
double parallel(int x)
{
double sum=0;
int i;
#pragma omp parallel num_threads(6) shared(sum) //default(none)
{
//printf(" I'm thread no. %d\n", omp_get_thread_num());
#pragma omp for private(i, x) reduction(+: sum)
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
}
return sum;
}
void serial_a(double* a, int n, double* y2)
{
int i;
for(i = 0; i<n; i++){
y2[i] = sin(a[i]) / cos(a[n-i]+1.0);
}
}
void parallel_a(double* a, int n, double* y2)
{
int i;
#pragma omp parallel num_threads(6)
{
#pragma omp for private(i)
for(i = 0; i<n; i++){
y2[i] = sin(a[i]) / cos(a[n-i]+1.0);
}
}
}
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[])
{
double sum, *y1, *y2, *a, s, p;
int x, n, *d;
/* Check for proper number of arguments. */
if(nrhs!=2) {
mexErrMsgTxt("Two inputs required.");
} else if(nlhs>2) {
mexErrMsgTxt("Too many output arguments.");
}
/* Get pointer to first input */
x = (int)mxGetScalar(prhs[0]);
/* Get pointer to second input */
a = mxGetPr(prhs[1]);
d = (int*)mxGetDimensions(prhs[1]);
n = (int)d[1]; // row vector
/* Create space for output */
plhs[0] = mxCreateDoubleMatrix(2,1, mxREAL);
plhs[1] = mxCreateDoubleMatrix(n,2, mxREAL);
/* Get pointer to output array */
y1 = mxGetPr(plhs[0]);
y2 = mxGetPr(plhs[1]);
{ /* Do the calculation */
clock_t tic = clock();
y1[0] = serial(x);
s = (double) clock()-tic;
printf("serial....: %.0f ms\n", s);
mexEvalString("drawnow");
tic = clock();
y1[1] = parallel(x);
p = (double) clock()-tic;
printf("parallel..: %.0f ms\n", p);
printf("ratio.....: %.2f \n", p/s);
mexEvalString("drawnow");
tic = clock();
serial_a(a, n, y2);
s = (double) clock()-tic;
printf("serial_a..: %.0f ms\n", s);
mexEvalString("drawnow");
tic = clock();
parallel_a(a, n, &y2[n]);
p = (double) clock()-tic;
printf("parallel_a: %.0f ms\n", p);
printf("ratio.....: %.2f \n", p/s);
}
}
Output
>> mex omp1.c
>> [a, b] = omp1(1e8, 1:1e8);
serial....: 13399 ms
parallel..: 2810 ms
ratio.....: 0.21
serial_a..: 12840 ms
parallel_a: 2740 ms
ratio.....: 0.21
>> a(1) == a(2)
ans =
0
>> all(b(:,1) == b(:,2))
ans =
1
System
MATLAB Version: 8.0.0.783 (R2012b)
Operating System: Microsoft Windows 7 Version 6.1 (Build 7601: Service Pack 1)
Microsoft Visual Studio 2005 Version 8.0.50727.867
In your function parallel you have a few mistakes. The reduction should be declared when you use parallel. Private and shared variables should also be declared when you use parallel. But when you do a reduction you should not declare the variable that is being reduced as shared. The reduction will take care of this.
To know what to declare private or shared you have to ask yourself which variables are being written to. If a variable is not being written to then normally you want it to be shared. In your case the variable x does not change, so you should declare it shared. The variable i, however, does change, so normally you should declare it private. So to fix your function you could do
#pragma omp parallel reduction(+:sum) private(i) shared(x)
{
#pragma omp for
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
}
However, OpenMP automatically makes the iterator of a parallel for region private, and variables declared outside of parallel regions are shared by default, so for your parallel function you can simply do
#pragma omp parallel for reduction(+:sum)
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
Notice that the only difference between this and your serial code is the pragma statement. OpenMP is designed so that you don't have to change your code except for pragma statements.
When it comes to arrays, as long as each iteration of a parallel for loop acts on a different array element you don't have to worry about shared and private. So you can write your parallel_a function simply as
#pragma omp parallel for
for(i = 0; i<n; i++){
y2[i] = sin(a[i]) / cos(a[n-i]+1.0);
}
and once again it is the same as your serial_a function except for the pragma statement.
But be careful with assuming iterators are private. Consider the following double loop
for(i=0; i<n; i++) {
for(j=0; j<m; j++) {
//
}
}
If you use #pragma omp parallel for with that, the i iterator will be made private but the j iterator will be shared. This is because the parallel for only applies to the outer loop over i, and since j is shared by default it is not made private. In this case you would need to explicitly declare j private like this: #pragma omp parallel for private(j).
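A minimal, self-contained sketch of the same double loop with both iterators declared in the loop headers (C99 style), which makes them private without any explicit clause; the array and bounds are made up for illustration:
#include <stdio.h>

int main(void)
{
    enum { n = 4, m = 3 };
    double y[n][m];

    // i is private because it is the parallel-for iterator;
    // j is private because it is declared inside the parallel region
    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++) {
            y[i][j] = i * m + j;
        }
    }

    printf("y[%d][%d] = %g\n", n - 1, m - 1, y[n - 1][m - 1]);
    return 0;
}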

Correct way to implement windowing

I'm trying to implement windowing in a program; for that I've written a sine with 2048 samples. I'm reading the values and trying to calculate the PSD using the "rect" window; when my window is 2048 wide, the result is accurate, otherwise the result doesn't make any sense to me.
Here is the code that I'm using:
#include <fftw3.h>
#include <math.h>
#include <stdio.h>
#include <complex.h>
int main (){
FILE* inputFile = NULL;
FILE* outputFile= NULL;
double* inputData=NULL;
double* outputData=NULL;
double* windowData=NULL;
unsigned int windowSize = 512;
int overlaping =128;
int index1 =0,index2=0, i=0;
double powVal= 0.0;
fftw_plan plan_r2hc;
// memory allocation
inputData = (double*) fftw_malloc(sizeof(double)*windowSize);
outputData= (double*) fftw_malloc(sizeof(double)*windowSize);
windowData= (double*) fftw_malloc(sizeof(double)*windowSize);
plan_r2hc = fftw_plan_r2r_1d(windowSize, inputData, windowData, FFTW_R2HC, FFTW_PATIENT);
// Opening files
inputFile = fopen("sinusD","rb");
outputFile= fopen("windowingResult","wb+");
if(inputFile==NULL ){
printf("Couldn't open either the input or the output file \n");
return -1;
}
while((i=fread(inputData,sizeof(double),windowSize,inputFile))==windowSize){
fftw_execute_r2r(plan_r2hc, inputData, windowData);
for( index1 =0; index1 < windowSize;index1++){
outputData[index1]+=windowData[index1];
printf("index %d \t %lf\n",index1,inputData[index1]);
}
if(overlaping!=0)
fseek(inputFile,(-overlaping)*sizeof(double),SEEK_CUR);
}
if( i!=0){
i = -i;
fseek(inputFile ,i*sizeof(double),SEEK_END);
fread(inputData,sizeof(double),-i,inputFile);
fftw_execute_r2r(plan_r2hc, inputData, windowData);
for( index1=0;index1< windowSize; index1++){
outputData[index1]+=windowData[index1];
}
}
powVal = outputData[0]*outputData[0];
powVal /= (windowSize*windowSize)/2;
index1 = 0;
fprintf(outputFile,"%lf ",powVal);
printf(" PSD \t %lf\n",powVal);
for (index1 =1; index1<=windowSize/2;index1++){
powVal = outputData[index1]*outputData[index1]+outputData[windowSize-index1]*outputData[windowSize- index1];
powVal/=(windowSize*windowSize)/2;
// powVal = 20*log10(fabs(powVal));
fprintf(outputFile,"%lf ",powVal);
printf(" PsD %d \t %10.5lf\n",index1,powVal);
}
fftw_free(inputData);
fftw_free(outputData);
fftw_free(windowData);
fclose(inputFile);
fclose(outputFile);
}
You need to premultiply the signal with a window function. This can be precomputed if you are calculating multiple FFTs.
For example, a Hanning window is calculated as follows:
#define WINDOW_SIZE 2048
int i;
double w[WINDOW_SIZE];
for (i=0; i<WINDOW_SIZE; i++) {
w[i] = (1.0 - cos(2.0 * M_PI * i/(WINDOW_SIZE-1))) * 0.5;
}
Before computing the Fourier transform, multiply your input data by this window as follows:
for (i=0; i<WINDOW_SIZE; i++) inputData[i] *= w[i];
Explanation
When you calculate the Fourier transform of a finite set of samples, what you actually get is the frequency spectrum of the infinite signal that you would get by repeating these samples forever. Unless you're sampling a signal whose frequency is an exact multiple of the sampling frame rate, you will get large discontinuities where the end of one sample frame runs into the start of the next. A window function flattens out the samples at the edges of the sample frame to eliminate these discontinuities.
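Putting the two pieces together, a minimal self-contained sketch (the test sine and the 2048-sample frame length are arbitrary choices for illustration, not taken from the question's data file):
#include <math.h>
#include <stdio.h>

#define WINDOW_SIZE 2048

int main(void)
{
    double w[WINDOW_SIZE], inputData[WINDOW_SIZE];

    // fill one frame with a test sine and precompute the Hanning window
    for (int i = 0; i < WINDOW_SIZE; i++) {
        inputData[i] = sin(2.0 * M_PI * 50.0 * i / WINDOW_SIZE);
        w[i] = 0.5 * (1.0 - cos(2.0 * M_PI * i / (WINDOW_SIZE - 1)));
    }

    // window the frame in place; this is the buffer that would be handed to the FFT
    for (int i = 0; i < WINDOW_SIZE; i++)
        inputData[i] *= w[i];

    printf("first sample: %f, mid-frame sample: %f\n",
           inputData[0], inputData[WINDOW_SIZE / 2]);
    return 0;
}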

DTRMM & DTRSM hang on certain matrix sizes

I'm testing the performance of ?GEMM, ?TRMM, and ?TRSM using MKL's automatic offload on the new Intel Xeon Phi coprocessors and am having some issues with DTRMM and DTRSM. I have code to test the performance for matrix sizes in steps of 1024 up to 10240, and performance seems to drop off significantly somewhere after N=M=K=8192. When I tried to pinpoint exactly where by using step sizes of 2, my script hung. I then checked step sizes of 512, which work fine; 256 works as well, but anything under 256 just stalls. I cannot find any known issues regarding this problem. All single-precision versions work, as do single and double precision on ?GEMM. Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdint.h>
#include <time.h>
#include "mkl.h"
#define DBG 0
int main(int argc, char **argv)
{
char transa = 'N', side = 'L', uplo = 'L', diag = 'U';
MKL_INT N, NP; // N = M, N, K, lda, ldb, ldc
double alpha = 1.0; // Scaling factors
double *A, *B; // Matrices
int matrix_bytes; // Matrix size in bytes
int matrix_elements; // Matrix size in elements
int i, j; // Counters
int msec;
clock_t start, diff;
N = atoi(argv[1]);
start = clock();
matrix_elements = N * N;
matrix_bytes = sizeof(double) * matrix_elements;
// Allocate the matrices
A = malloc(matrix_bytes);
if (A == NULL)
{
printf("Could not allocate matrix A\n");
return -1;
}
B = malloc(matrix_bytes);
if (B == NULL)
{
printf("Could not allocate matrix B\n");
return -1;
}
for (i = 0; i < matrix_elements; i++)
{
A[i] = 0.0;
B[i] = 0.0;
}
// Initialize the matrices
for (i = 0; i < N; i++)
for (j = 0; j <= i; j++)
{
A[i+N*j] = 1.0;
B[i+N*j] = 2.0;
}
// DTRMM call
dtrmm(&side, &uplo, &transa, &diag, &N, &N, &alpha, A, &N, B, &N);
diff = clock() - start;
msec = diff * 1000 / CLOCKS_PER_SEC;
printf("%f\n", (float)msec * 10e-4);
if (DBG == 1)
{
printf("\nMatrix dimension is set to %d \n\n", (int)N);
// Display the result
printf("\nResulting matrix B:\n");
if (N > 10)
{
printf("NOTE: B is too large, print only upper-left 10x10 block...\n");
NP = 10;
}
else
NP = N;
printf("\n");
for (i = 0; i < NP; i++)
{
for (j = 0; j < NP; j++)
printf("%7.3f ", B[i + j * N]);
printf("\n");
}
}
// Free the matrix memory
free(A);
free(B);
return 0;
}
Any help or insight would be greatly appreciated.
This phenomenon has been extensively discussed in other questions, and also in Intel's Software Optimization Manual and Agner Fog's notes.
Typically, you are experiencing a perfect storm of evictions in the memory hierarchy, such that suddenly (nearly) every single access misses cache and/or TLB (one can determine exactly which resource is missing by looking at the specific data access pattern or by using the PMCs; I can do the calculation later when I'm near a whiteboard, unless mystical gets to you first).
You can also search through some of my or Mystical's answers to find previous answers.
The issue was an older version of Intel's icc compiler (beta 10 update, I believe.. maybe). Gold update works like a charm.
