I have this part of my code, which reads a 2D array of structs, does some math on them, and then puts the results into a second 2D array:
// For every frame n, pair each entry i of frame n with the entry j that carries
// the same atom_id, then store a displacement value (frame n -> frame n+1) for
// that atom in MSD[atom_id][n].
// NOTE(review): the match below indexes timestep_array[j][m], but m is neither
// assigned in this fragment nor listed in private(); the position reads all use
// frame n+1, so m was presumably meant to be n+1 -- confirm.
// NOTE(review): n runs up to frames_read-1 while the body reads [n+1]; verify
// the array really holds one frame more than the loop bound.
#pragma omp parallel for private (n, i, j) schedule(dynamic)
for(n = 0; n < frames_read; n++){
for (i = 0; i < atoms_total; i++)
{
for(j = 0; j < atoms_total; j++)
{
if (timestep_array[i][n].atom_id == timestep_array[j][m].atom_id)
{
// calculates the vector magnitude and stores it in the created array MSD
// Per axis: (normalized position + box offset) difference between the two
// frames, scaled by the box bound of that axis, then squared.
double temp1_x = timestep_array[i][n].normalized_x_position + timestep_array[i][n].x_box;
double temp2_x = timestep_array[j][n+1].normalized_x_position + timestep_array[j][n+1].x_box;
double temp3_x = temp2_x - temp1_x;
double temp4_x = temp3_x * box_bound_x;
double temp5_x = pow(temp4_x, 2);
double temp1_y = timestep_array[i][n].normalized_y_position + timestep_array[i][n].y_box;
double temp2_y = timestep_array[j][n+1].normalized_y_position + timestep_array[j][n+1].y_box;
double temp3_y = temp2_y - temp1_y;
double temp4_y = temp3_y * box_bound_y;
double temp5_y = pow(temp4_y, 2);
double temp1_z = timestep_array[i][n].normalized_z_position + timestep_array[i][n].z_box;
double temp2_z = timestep_array[j][n+1].normalized_z_position + timestep_array[j][n+1].z_box;
double temp3_z = temp2_z - temp1_z;
double temp4_z = temp3_z * box_bound_z;
double temp5_z = pow(temp4_z, 2);
// Sum of the three squared components, then its square root (vector length).
double temp = temp5_x + temp5_y + temp5_z;
double temp2 = sqrt(temp);
// The atom id is used directly as the row index of MSD.
int atom_number = timestep_array[i][n].atom_id;
// NOTE(review): temp2 is already the vector length; sqrt(temp2) applies a
// second square root -- confirm this is intended.
MSD[atom_number][n].msd = sqrt(temp2);
MSD[atom_number][n].atom_type = timestep_array[i][n].atom_type;
MSD[atom_number][n].time_in_picoseconds = timestep_array[i][n].timestep / picoseconds;
}
}
}
}
I have tried so many combinations of the #pragma statement (including making many more of the variables private.) Nothing has resulted in the a.out file running more than one thread. What am I doing wrong?
Related
I am trying to adapt a sequential function written for the CPU into an OpenCL kernel for the GPU.
The function is the well known im2col used in many deep learning applications.
I have found some code on the OpenCV repository implementing this im2col function written in OpenCL but the one that I have to adapt uses a batch that confuses me and seems to be a bit different.
What should I change on the OpenCL kernel to make it work the same on GPU as it does on the CPU function?
CPU code
/*
 * im2col on the CPU for tensors whose innermost dimension is the batch.
 *
 * As indexed below, the input is laid out as [I][HI][WI][B] and the output as
 * [I][KH][KW][HO][WO][B]: for every input channel i, filter tap (kh, kw) and
 * output pixel (h, w), the B batch values of the matching input pixel are
 * copied to the output, or B zeros are written when the tap falls into the
 * padding area.
 *
 * I        number of input channels
 * WI, HI   input width / height
 * B        batch size (elements copied per pixel)
 * KW, KH   filter width / height
 * WO, HO   output width / height
 * PW, PH   padding in width / height
 * SW, SH   stride in width / height
 * in_ptr   input tensor
 * out_ptr  output tensor, written by this function
 *
 * Always returns 1.
 */
int fn_im2col_cpu(int I, int WI, int HI, int B, int KW, int KH, int WO, int HO, int PW, int PH, int SW, int SH, type *in_ptr, type *out_ptr) {
  PROFILING_HEADER_EXTERN(im2col);
  PROFILING_DEVICE(im2col, DEV_CPU);

  /* Bytes moved per output pixel: one value for each batch entry. */
  const size_t batch_bytes = (size_t)B * sizeof(type);

  /* Channels are independent, so they are distributed over the threads.
     All inner indices are declared inside the loop and are therefore
     private to each thread without a private() clause. */
  #pragma omp parallel for
  for (int i = 0; i < I; i++) {
    size_t out_addr = ((size_t)B * (size_t)WO * (size_t)HO * (size_t)KW * (size_t)KH * (size_t)i);
    const size_t in_addr1 = (size_t)i * (size_t)B * (size_t)WI * (size_t)HI;
    for (int kh = 0; kh < KH; kh++) {
      for (int kw = 0; kw < KW; kw++) {
        for (int h = 0; h < HO; h++) {
          const int hi = h * SH - PH + kh;
          /* When hi is negative this offset wraps around, but it is only
             used in the non-padding branch, where hi is in range. */
          const size_t in_addr2 = in_addr1 + ((size_t)hi * (size_t)B * (size_t)WI);
          for (int w = 0; w < WO; w++) {
            const int wi = w * SW - PW + kw;
            const int force_padding = (wi < 0) || (wi >= WI) || (hi < 0) || (hi >= HI);
            if (force_padding) {
              memset(&out_ptr[out_addr], 0, batch_bytes);
            } else {
              /* Keep the offset in size_t: narrowing it to int truncates
                 the address for large tensors. */
              const size_t in_addr = in_addr2 + (size_t)wi * (size_t)B;
              memcpy(&out_ptr[out_addr], &in_ptr[in_addr], batch_bytes);
            }
            out_addr += (size_t)B;
          }
        }
      }
    }
  }
  return 1;
}
OpenCL kernel from https://github.com/opencv/opencv/blob/master/modules/dnn/src/opencl/im2col.cl
/*
 * im2col kernel: one work-item per (input channel, output row, output column).
 * Each work-item writes kernel_h * kernel_w values, one per filter tap, into
 * consecutive output planes of height_out * width_out elements; taps that fall
 * outside the input image are written as 0.
 */
__kernel void im2col(__global const float *im_src, int im_src_offset,
                     int channels, int height_inp, int width_inp,
                     int kernel_h, int kernel_w, int pad_h, int pad_w,
                     int stride_h, int stride_w,
                     int height_out, int width_out,
                     __global float *im_col, int im_col_offset
                     )
{
    const int gid = get_global_id(0);
    const int plane = height_out * width_out;
    if (gid >= plane * channels)
        return;

    /* Decompose the flat id into (channel, output row, output column). */
    const int col = gid % width_out;
    const int row_and_chan = gid / width_out;
    const int row = row_and_chan % height_out;
    const int chan = row_and_chan / height_out;

    /* First output plane belonging to this input channel. */
    const int first_plane = chan * kernel_h * kernel_w;

    /* Top-left input coordinate of the filter window (may be negative). */
    const int top = row * stride_h - pad_h;
    const int left = col * stride_w - pad_w;

    __global const float *src =
        im_src + ((chan * height_inp + top) * width_inp + left + im_src_offset);
    __global float *dst =
        im_col + ((first_plane * height_out + row) * width_out + col + im_col_offset);

    for (int ki = 0; ki < kernel_h; ++ki) {
        const int y = top + ki;
        for (int kj = 0; kj < kernel_w; ++kj) {
            const int x = left + kj;
            float value = 0;
            /* src is only read when the tap lies inside the input image. */
            if (y >= 0 && x >= 0 && y < height_inp && x < width_inp)
                value = src[ki * width_inp + kj];
            *dst = value;
            dst += plane;
        }
    }
}
Your C version folds the batch into the lowest dimension. The opencl version isn't even using batch.
You need to pass in the batch size "B", and change this copy to a block copy (or just do a loop over) by the batch size:
/* Copy the B batch values of one filter tap. A pointer cannot be multiplied,
   so im_col (and im_src) must already point at batch entry 0 of the current
   position, i.e. their base offsets have to be scaled by B as well. */
for (int b = 0; b < B; b++)
    im_col[b] = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ? im_src[(ki * width_inp + kj) * B + b] : 0;
to emulate the memcpy(..., B*sizeof(type)).
And then just stride B times more:
im_col += height_out * width_out * B;
I am trying to translate the Automatic Gain Control block from Matlab's communications toolbox. Here is the documentation but the relevant parts are summarized below:
The block diagram
Where
And
This is the code I've written:
/*
 * Mean power of the ndx-th block of N samples:
 *
 *     (1/N) * sum of |input[i]|^2  for i = ndx*N .. (ndx+1)*N - 1
 *
 * N      block length, must be > 0
 * input  sample buffer holding at least (ndx+1)*N elements
 * ndx    zero-based block index
 *
 * Returns the average squared magnitude of the block.
 */
double AGC_Detector(int N, const float complex *input, int ndx)
{
    const int first = ndx * N;
    const int end = first + N;      /* exclusive: the last index used is (ndx+1)*N - 1 */
    double power_sum = 0.0;

    for (int i = first; i < end; i++) {
        /* |z|^2 = re^2 + im^2; no square root is needed for the power. */
        const double re = crealf(input[i]);
        const double im = cimagf(input[i]);
        power_sum += re * re + im * im;
    }
    return power_sum / N;
}
/*
 * Runs the gain loop over len samples: for every sample the detector output
 * is scaled by the current gain, its logarithm is compared with desired_pwr,
 * and the gain is moved by step times that error (capped at max_pwr). The
 * sample is then written to output multiplied by exp(gain).
 *
 * NOTE(review): the detector is asked for block i of avrg_len samples for
 * every i < len, so it reads input well past index len-1 -- confirm the
 * intended windowing.
 */
void use_AGC(float complex* input,
             float complex* output,
             double step,
             double desired_pwr,
             int avrg_len,
             int max_pwr,
             int len)
{
    double gain = 0;    /* applied as exp(gain); carried from sample to sample */

    for (int k = 0; k < len; k++) {
        const double detected = AGC_Detector(avrg_len, input, k);
        const double error = desired_pwr - log(detected * exp(2*gain));

        gain = gain + step*error;
        if (gain > max_pwr) {
            gain = max_pwr;
        }
        output[k] = input[k] * exp(gain);
    }
}
But I am not getting the same output I get for the same values in Matlab. What am I doing wrong?
Thank You!
Maybe it's not the only error but I've noticed, that the sum of the AGC Detector goes from ndx*N to (ndx+1)N-1 in the Matlab documentation, but in your C code it goes from
ndx*N to (ndx+1)(N-1)
I figured it out. Code below gives same output as matlab for a given set of complex data.
double AGC_Detector(double complex input, double* sum, double state[], unsigned int* idx, int N)
{
unsigned int first;
unsigned int nth;
double output;
double val = cabs(input)*cabs(input);
*sum += val;
output = (*sum)*(1.0/N);
*sum -= state[*idx-1];
if(*sum < 0.0){*sum = 0.0;}
state[*idx-1] = val;
first = *idx;
nth = first + 1;
if(nth < first){nth = 0xFFFFFFFF;}
*idx = nth;
if(*idx > N-1){*idx = 1;}
return output;
}
/*
 * Automatic gain control over len samples.
 *
 * For every sample the current gain exp(g) is applied first; the gain is then
 * updated from the detector output:
 *     g += step * (desired_pwr - (log(detector) + 2*g)),  capped at max_pwr.
 *
 * input        len input samples
 * output       len output samples, written by this function
 * step         loop step size
 * desired_pwr  target compared against the log of the detected power
 * avrg_len     detector averaging length, must be >= 2
 * max_pwr      upper limit for g
 * len          number of samples to process
 *
 * Nothing is written when len <= 0 or avrg_len < 2.
 */
void use_AGC(double complex* input, double complex* output, double step, double desired_pwr, int avrg_len, int max_pwr, int len)
{
    /* The detector history holds avrg_len-1 values and its first slot is always
       accessed, so a shorter averaging length cannot be served. */
    if (len <= 0 || avrg_len < 2) {
        return;
    }

    const int filt_len = avrg_len - 1;
    unsigned int idx = 1;           /* 1-based cursor used by the detector */
    double g = 0;
    double sum = 0;
    double filterState[filt_len];

    for (int i = 0; i < filt_len; i++) {
        filterState[i] = 0;
    }

    /* Process every input sample; avrg_len is only the detector window. */
    for (int i = 0; i < len; i++) {
        const double detected = AGC_Detector(input[i], &sum, filterState, &idx, avrg_len);

        output[i] = input[i] * exp(g);
        g += step * (desired_pwr - (log(detected) + 2.0*g));
        if (g > max_pwr) {
            g = max_pwr;
        }
    }
}
I'm having trouble parallelizing this black_scholes code fragment. I added a simple #pragma omp parallel for, but now it takes 50 times longer.
I'm sure there is a problem with shared memory, but I really don't know what it is.
/*
 * Runs the M Black-Scholes trials described by the_args: each trial draws one
 * Gaussian random number, stores the resulting value in trials[k] and adds
 * trials[k] / M to mean.
 */
black_scholes_iterate (void* the_args)
{
  black_scholes_args_t* args = (black_scholes_args_t*) the_args;

  /* Unpack the IN/OUT struct */

  /* IN (read-only) parameters */
  const int S = args->S;
  const int E = args->E;
  const int M = args->M;
  const double r = args->r;
  const double sigma = args->sigma;
  const double T = args->T;

  /* OUT (write-only) parameters */
  double* trials = args->trials;
  double mean = 0.0;

  /* Temporary variables */
  gaussrand_state_t gaussrand_state;
  void* prng_stream = NULL;
  int k;

  /* Spawn a random number generator */
  prng_stream = spawn_prng_stream (0);

  /* Initialize the Gaussian random number module for this thread */
  init_gaussrand_state (&gaussrand_state);

  /* Do the Black-Scholes iterations */
  printf("here2: %d \n",M);

  /*
   * mean is accumulated by every iteration, so it must be a reduction
   * variable; a plain shared "mean +=" is a data race.
   *
   * NOTE(review): prng_stream and gaussrand_state are still shared by all
   * threads; gaussrand1 presumably updates both, which would need one
   * stream/state per thread -- confirm against the generator's API.
   */
#pragma omp parallel for reduction(+:mean)
  for (k = 0; k < M; k++)
    {
      const double gaussian_random_number = gaussrand1 (&uniform_random_double,
                                                        prng_stream,
                                                        &gaussrand_state);
      trials[k] = black_scholes_value (S, E, r, sigma, T,
                                       gaussian_random_number);
      /*
       * We scale each term of the sum in order to avoid overflow.
       * This ensures that mean is never larger than the max
       * element of trials[0 .. M-1].
       */
      mean += trials[k] / (double) M;
    }
After further testing, I noticed that this part of the for loop takes a lot of time:
const double gaussian_random_number = gaussrand1
(&uniform_random_double,prng_stream, &gaussrand_state);
/* Draw all M Gaussian numbers sequentially first, so the generator state is
   only ever touched by one thread. */
/* NOTE(review): the allocation result is not checked and a is not freed in
   this fragment -- handle both where the rest of the function is visible. */
double *a = malloc((size_t) M * sizeof *a);
for (int k = 0; k < M; k++)
{
    const double gaussian_random_number = gaussrand1 (&uniform_random_double,
                                                      prng_stream,
                                                      &gaussrand_state);
    a[k] = gaussian_random_number;
}
/* Each iteration writes its own trials[k]; mean is the only value shared
   between iterations, so it is combined through a reduction instead of an
   unsynchronised "mean +=". */
#pragma omp parallel for reduction(+:mean)
for (int k = 0; k < M; k++)
{
    trials[k] = black_scholes_value (S, E, r, sigma, T,
                                     a[k]);
    mean += trials[k] / (double) M;
}
@Z Boson's answer was the solution: I got a significant speed-up and it helped a lot. Thank you so much!
I am working on a multi-threaded numerical integration program using the trapezoidal rule.
I have a struct which contains six items:
/* Argument and result record handed to compute_using_pthreads: the first four
   fields are inputs, res and elTime are written by the thread. */
typedef struct trapezoidalIntegrationThread{
float a; /* left end point of the interval */
float b; /* right end point of the interval */
int n; /* number of trapezoids */
float h; /* step between sample points; main sets it to (b - a) / n */
double res; /* integration result, written by the thread */
float elTime; /* elapsed time measured inside the thread (intended as milliseconds) */
}threadParams;
a is the left end point, b is the right end point, n is the number of trapezoids, h is the height, res is the result calculated within compute_with_pthread, and finally, elTime is the elapsed time for compute_with_pthread for benchmarking.
Here is my code in main:
int n = NUM_TRAPEZOIDS;
float a = LEFT_ENDPOINT;
float b = RIGHT_ENDPOINT;
pthread_t masterThread;
pthread_t slaveThread[NUM_THREADs];
threadParams *trapThread;
for(i = 0; i < NUM_THREADs; i++) {
trapThread = (threadParams *) malloc(sizeof(threadParams));
trapThread->a = a;
trapThread->b = b;
trapThread->n = n;
trapThread->h = (b - a) / (float) n;
if (pthread_create(&slaveThread[i], NULL, compute_using_pthreads, (void *) trapThread) != 0) {
printf("Looks like something went wrong..\n");
return -1;
}
}
for(i = 0; i < NUM_THREADs; i++) {
pthread_join(slaveThread[i], NULL);
}
pthread_exit((void *) masterThread);
I am basically creating the number of threads defined in NUM_THREADS (let's assume this value is 4). I am allocating how much memory the struct needs, and setting the pre-defined values of:
/* Left end of the integration interval (main assigns it to a). */
#define LEFT_ENDPOINT 5
/* Right end of the integration interval (main assigns it to b). */
#define RIGHT_ENDPOINT 1000
/* Number of trapezoids used for the integration (main assigns it to n). */
#define NUM_TRAPEZOIDS 100000000
#define NUM_THREADs 8 /* Number of threads to run. */
Next, I create my pthreads, and call the compute_using_pthreads function:
/*
 * Thread entry point: integrates f over [args->a, args->b] with the
 * trapezoidal rule, using args->n trapezoids of width args->h.
 *
 * inputs  pointer to a threadParams record; res and elTime are written to it
 *
 * Returns NULL; the result is delivered through the record.
 */
void *compute_using_pthreads(void *inputs)
{
    threadParams *args = (threadParams *) inputs;
    const float a = args->a;
    const float b = args->b;
    const int n = args->n;
    const float h = args->h;

    const clock_t p_start = clock();

    /* End points count half, interior points count fully. */
    double integral = (f(a) + f(b))/2.0;
    for (int k = 1; k <= n-1; k++) {
        integral += f(a+k*h);
    }
    integral = integral*h;

    const clock_t p_end = clock();

    args->res = integral;
    /* clock() counts ticks of CLOCKS_PER_SEC per second, so convert explicitly
       instead of assuming microseconds. It reports processor time of the
       whole program, so with several threads running at once the value also
       contains the time consumed by the other threads. */
    args->elTime = (float) ((double) (p_end - p_start) * 1000.0 / (double) CLOCKS_PER_SEC);

    /* A thread start routine must return a value. */
    return NULL;
}
I ran this program and compared it against a non-multithreaded function:
/*
 * Single-threaded reference: trapezoidal-rule integral of f over [a, b]
 * using n trapezoids of width h.
 */
double compute_gold(float a, float b, int n, float h)
{
    /* The two end points contribute with weight 1/2. */
    double acc = (f(a) + f(b))/2.0;

    /* Interior sample points a + h, a + 2h, ..., a + (n-1)h. */
    for (int step = 1; step < n; ++step) {
        acc += f(a+step*h);
    }

    return acc*h;
}
So here are the results:
Run-time of compute_gold:
~3000 ms
Run_time of compute_with_pthread:
Using 1 thread: ~3000 ms
Using 2 threads: ~6000 ms
Using 4 threads: ~12000 ms
....
So for some reason, the more threads I added, the execution took n-threads more time to execute. I can't for the life of me figure out why this is happening, as I am quite new to C programming =/
I seem to be lost with this Fourier Transform function. There's a sample program that I have but don't understand. The ggFFTworksp contains the data and fftFrameSize is simply framesize of the data. I don't understand how the function is supposed to put the FFT version of the data into the fftBuffer if there is no part in the code where fftBuffer is actually edited or manipulated. Thank you in advance!
The function call is this:
/* Work buffer for the transform: two floats per sample, matching the
   interleaved cosine/sine layout smbFft expects. */
static float gFFTworksp[2*MAX_FRAME_LENGTH];
/* NOTE(review): no value is assigned in this excerpt; it must be set (to a
   power of 2, per smbFft's contract) before the call below. */
long fftFrameSize;
/* sign = -1 selects the forward FFT; the result overwrites gFFTworksp. */
smbFft(gFFTworksp, fftFrameSize, -1);
The function in question is this:
void smbFft(float *fftBuffer, long fftFrameSize, long sign)
/*
FFT routine, (C)1996 S.M.Bernsee. Sign = -1 is FFT, 1 is iFFT (inverse)
Fills fftBuffer[0...2*fftFrameSize-1] with the Fourier transform of the
time domain data in fftBuffer[0...2*fftFrameSize-1]. The FFT array takes
and returns the cosine and sine parts in an interleaved manner, ie.
fftBuffer[0] = cosPart[0], fftBuffer[1] = sinPart[0], asf. fftFrameSize
must be a power of 2. It expects a complex input signal (see footnote 2),
ie. when working with 'common' audio signals our input signal has to be
passed as {in[0],0.,in[1],0.,in[2],0.,...} asf. In that case, the transform
of the frequencies of interest is in fftBuffer[0...fftFrameSize].
*/
{
float wr, wi, arg, *p1, *p2, temp;
float tr, ti, ur, ui, *p1r, *p1i, *p2r, *p2i;
long i, bitm, j, le, le2, k;
/* Pass 1: reorder the samples inside fftBuffer itself. i and j are float
   offsets (two floats per complex sample), so i advances in steps of 2. */
for (i = 2; i < 2*fftFrameSize-2; i += 2) {
/* Build j from the bits of i taken in the opposite order. */
for (bitm = 2, j = 0; bitm < 2*fftFrameSize; bitm <<= 1) {
if (i & bitm) j++;
j <<= 1;
}
/* Exchange samples i and j only when i < j so each pair is swapped once:
   first the value at the offset itself, then the one right after it. */
if (i < j) {
p1 = fftBuffer+i; p2 = fftBuffer+j;
temp = *p1; *(p1++) = *p2;
*(p2++) = temp; temp = *p1;
*p1 = *p2; *p2 = temp;
}
}
/* Pass 2: one iteration of k per stage, round(log2(fftFrameSize)) stages in
   total. All writes go through pointers into fftBuffer, which is how the
   caller's buffer ends up holding the transform. */
for (k = 0, le = 2; k < (long)(log(fftFrameSize)/log(2.)+.5); k++) {
/* le is the float stride between butterflies of this stage (doubled every
   stage); le2 is the float distance between the two inputs of a butterfly. */
le <<= 1;
le2 = le>>1;
/* (ur, ui) is the current rotation factor, starting at 1 + 0i. */
ur = 1.0;
ui = 0.0;
/* (wr, wi) is the rotation applied to (ur, ui) after each j; sign flips the
   direction of the rotation. */
arg = M_PI / (le2>>1);
wr = cos(arg);
wi = sign*sin(arg);
for (j = 0; j < le2; j += 2) {
/* p1 = first input of the butterfly, p2 = second input, le2 floats later. */
p1r = fftBuffer+j; p1i = p1r+1;
p2r = p1r+le2; p2i = p2r+1;
for (i = j; i < 2*fftFrameSize; i += le) {
/* t = p2 * u (complex product); then p2 = p1 - t and p1 = p1 + t. */
tr = *p2r * ur - *p2i * ui;
ti = *p2r * ui + *p2i * ur;
*p2r = *p1r - tr; *p2i = *p1i - ti;
*p1r += tr; *p1i += ti;
p1r += le; p1i += le;
p2r += le; p2i += le;
}
/* u = u * w (complex product); tr keeps the new real part until ui is done. */
tr = ur*wr - ui*wi;
ui = ur*wi + ui*wr;
ur = tr;
}
}
}
In the following line:
p1 = fftBuffer+i; p2 = fftBuffer+j;
p1 and p2 become pointers that point to the memory location of the fftBuffer array. And in these lines:
*(p2++) = temp; temp = *p1;
*p1 = *p2; *p2 = temp;
the values in these memory locations are being changed.