parallelize Mandelbrot with OpenMP - c
I have a function which calculates the Mandelbrot set, and I'm trying to parallelize it using OpenMP.
I put #pragma omp parallel for private(...) before each for loop:
/*
 * Fill im->pixels with the Mandelbrot escape-time value (iteration count
 * modulo 256) for every point of an nb_col x nb_lig grid covering
 * [x_min, x_max] x [y_min, y_max], iterating at most nb_iter times per point.
 *
 * NOTE(review): this is the version that crashes. The problems visible here:
 *  - Variables listed in an OpenMP private clause get a new, uninitialized
 *    copy in each thread. Listing im, pasx, pasy, nb_iter and the bounds as
 *    private therefore discards their values inside the loop; im->nb_lig and
 *    im->pixels then go through an indeterminate pointer, which matches the
 *    reported SIGSEGV at address 0x0.
 *  - new_zx is declared outside the parallel region and is not listed in any
 *    clause, so every thread writes the same shared variable (data race).
 *  - cx and cy are accumulated from one iteration to the next (cx += pasx,
 *    cy += pasy), so iterations are not independent; cy is additionally
 *    shared between threads, and cx is re-declared private (uninitialized)
 *    by the inner directive.
 *  - The inner directive opens a nested parallel region for every row.
 */
static void calculer (Image * im, int nb_iter, double x_min, double x_max, double y_min, double y_max) {
/* Discretization of the set: step between neighbouring grid points */
double pasx = (x_max - x_min) / im -> nb_col;
double pasy = (y_max - y_min) / im -> nb_lig;
/* Imaginary part of the current row; advanced by pasy after each row */
double cy = y_min;
/* Scratch value for the next real part; shared by all threads as written */
double new_zx;
unsigned int l,c;
// Computation
/* NOTE(review): every variable in this private list loses its value inside the loop */
#pragma omp parallel for private ( pasx, pasy, im,nb_iter,x_min,x_max,y_min, y_max)
for (l = 0; l < im->nb_lig; l++) {
/* Real part of the current column; advanced by pasx after each pixel */
double cx = x_min;
/* NOTE(review): private(cx) drops the x_min initialisation made just above */
#pragma omp parallel for private (cx)
for (c = 0; c < im->nb_col; c++) {
/* z starts at 0 and is iterated as z = z*z + (cx + i*cy) */
double zx = 0.0;
double zy = 0.0;
unsigned int n = 0;
/* Stop once |z|^2 reaches 4 or after nb_iter iterations.
   NOTE(review): n is unsigned while nb_iter is int (mixed-sign comparison). */
while ( ( zx*zx + zy*zy < 4.0 ) && ( n < nb_iter ) ) {
new_zx = zx*zx - zy*zy + cx;
zy = 2.0*zx*zy + cy;
zx = new_zx;
++n;
}
/* Row-major store of the iteration count reduced modulo 256 */
im->pixels[l*im->nb_col + c] = n%256;
cx += pasx;
}
cy += pasy;
}
}
After compiling with gcc mandelbrot.c -fopenmp -o exe and running the program, I get a segmentation fault. What could be the reason for it?
EDIT: After running it under valgrind, I get this message:
Process terminating with default action of signal 11 (SIGSEGV)
==10689== Access not within mapped region at address 0x0
==10689== at 0x40105F: calculer._omp_fn.0 (in /home/haddad/Documents/TPOpenMP/TP_OpenMP/TP_Mandelbrot/exe3)
==10689== by 0x4E39EE9: ??? (in /usr/lib/x86_64-linux-gnu/libgomp.so.1.0.0)
==10689== by 0x5047E99: start_thread (pthread_create.c:308)
==10689== by 0x535038C: clone (clone.S:112)
==10689== If you believe this happened as a result of a stack
==10689== overflow in your program's main thread (unlikely but
==10689== possible), you can try to increase the size of the
==10689== main thread stack using the --main-stacksize= flag.
==10689== The main thread stack size used in this run was 8388608.
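The crash comes from the private(...) clause: a variable listed in private gets a fresh, uninitialized copy in every thread, so inside the loop im is no longer the pointer you passed in and im->nb_lig dereferences garbage (hence the access at address 0x0). Read-only inputs such as im, pasx and nb_iter should simply stay shared; only the inner loop counter needs to be private. Computing cx and cy directly from c and l, instead of accumulating them across iterations, also makes the iterations independent of each other. I went ahead and cleaned up your code and even made an image from it.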
#include <stdlib.h>
#include <stdio.h>
struct Image {
unsigned nb_lig;
unsigned nb_col;
unsigned *pixels;
};
/*
 * Compute the Mandelbrot escape-time value of every pixel of `im`.
 *
 * The rectangle [x_min, x_max] x [y_min, y_max] is sampled on an
 * im->nb_col x im->nb_lig grid. Pixel (l, c) receives the number of
 * iterations of z = z*z + c0 (starting from z = 0) needed for |z|^2 to
 * reach 4, or 0 when that does not happen within nb_iter iterations.
 *
 * im      image whose nb_lig, nb_col and pixels are already set; pixels must
 *         hold at least nb_lig * nb_col entries and is written row-major
 * nb_iter maximum number of iterations per pixel
 * x_min, x_max, y_min, y_max  bounds of the sampled region
 *
 * Rows are distributed over OpenMP threads when compiled with OpenMP
 * enabled. Each iteration only reads the shared inputs and writes its own
 * pixels, so no variable needs a data-sharing clause.
 */
void calculer (struct Image * im, unsigned nb_iter, double x_min, double x_max, double y_min, double y_max) {
    /* Read the dimensions once so the loop bounds are plain invariants. */
    const unsigned nb_lig = im->nb_lig;
    const unsigned nb_col = im->nb_col;
    const double pasx = (x_max - x_min) / nb_col;
    const double pasy = (y_max - y_min) / nb_lig;

    #pragma omp parallel for
    for (unsigned l = 0; l < nb_lig; l++) {
        /* Derived from l directly (no running sum), so rows stay independent. */
        const double cy = y_min + l*pasy;
        /* Offset in size_t: l*nb_col computed in unsigned would wrap for
           images holding more than UINT_MAX pixels. */
        const size_t row_offset = (size_t)l * nb_col;

        for (unsigned c = 0; c < nb_col; c++) {
            const double cx = x_min + c*pasx;
            double zx = 0.0, zy = 0.0;
            unsigned n;

            for (n = 0; (zx*zx + zy*zy < 4.0) && (n < nb_iter); n++) {
                const double new_zx = zx*zx - zy*zy + cx;
                zy = 2.0*zx*zy + cy;
                zx = new_zx;
            }

            /* Points that never escaped are stored as 0. */
            im->pixels[row_offset + c] = (n == nb_iter) ? 0 : n;
        }
    }
}
/*
 * Print `im` on standard output as ASCII art, one character per pixel.
 *
 * A pixel value of 0 is shown as a space; any other value selects a
 * character from the palette by taking the value modulo the palette length.
 * A newline is written after the last pixel of each row, and one more
 * newline after the whole image.
 */
void draw_image(struct Image *im) {
    const char charset[] = ".,c8M#jawrpogOQEPGJ";
    /* Number of usable glyphs: the array size minus its terminating NUL. */
    const size_t palette_len = sizeof(charset) - 1;
    unsigned row = 0;

    while (row < im->nb_lig) {
        unsigned col = 0;

        while (col < im->nb_col) {
            unsigned value = im->pixels[row*im->nb_col + col];
            char glyph = ' ';

            if (value != 0) {
                glyph = charset[value % palette_len];
            }
            putchar(glyph);

            col++;
            /* End of the row reached: terminate the line. */
            if (col == im->nb_col) {
                puts("");
            }
        }
        row++;
    }
    puts("");
}
/*
 * Render a 40-row by 80-column view of the Mandelbrot set over
 * [-2.5, 1.5] x [-2.0, 2.0] with at most 256 iterations per pixel and print
 * it as ASCII art.
 *
 * Returns 0 on success, EXIT_FAILURE when the pixel buffer cannot be
 * allocated.
 */
int main(void) {
    struct Image im;
    const unsigned nb_iter = 256;

    im.nb_lig = 40;
    im.nb_col = 80;
    im.pixels = malloc(sizeof *im.pixels * im.nb_lig * im.nb_col);
    if (im.pixels == NULL) {
        /* Without a buffer calculer would write through a null pointer. */
        fputs("error: cannot allocate pixel buffer\n", stderr);
        return EXIT_FAILURE;
    }

    calculer(&im, nb_iter, -2.5, 1.5, -2.0, 2.0);
    draw_image(&im);

    free(im.pixels);
    im.pixels = NULL;
    return 0;
}
The output is
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccc,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccccccccccccc,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccccccccccccccccccccc,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccccccccccccccccccccccccc,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccccccccccccccccccccccccccccccc,,,
,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccccccccccccccccccccccccccccccccccc,
,,,,,,,,,,,,,,,,,,,,ccccccccccccccc88888888888cccccccccccccccccccccccccccccccccc
,,,,,,,,,,,,,,,,,,,ccccccccc8888888888888888888888888ccccccccccccccccccccccccccc
,,,,,,,,,,,,,,,,,ccccccc8888888888888888MMMMM#.w##MMM8888ccccccccccccccccccccccc
,,,,,,,,,,,,,,,,ccccc8888888888888888MMMMMM##jaoro #MMMM8888cccccccccccccccccccc
,,,,,,,,,,,,,,,ccc8888888888888888MMMMMMM###jwrG#owj##MMMM88888ccccccccccccccccc
,,,,,,,,,,,,,,cc8888888888888888MMMMMMM##jjawQ Jwj####MM888888ccccccccccccccc
,,,,,,,,,,,,,cc88888888888888MMMMMM##jawwwwrpQ OprwjjjJ#MM88888cccccccccccccc
,,,,,,,,,,,,cc8888888888888MMMM####jjagM Pa ,gQEPE#M888888ccccccccccccc
,,,,,,,,,,,,c88888888888MM#######jjjwQg# ,aj#M888888cccccccccccc
,,,,,,,,,,,c8888888MMM##agaaaaaaaaawo, Gr.#MM888888ccccccccccc
,,,,,,,,,,,888MMMMMM###japP,gOPOorro# EwjMM8888888cccccccccc
,,,,,,,,,,,8MMMMMM####jawoJ EP ga#MMM888888cccccccccc
,,,,,,,,,,,MMMMMjjjjawgOQ8 Q wj#MMM888888cccccccccc
,,,,,,,,,,, gwaj#MMM888888cccccccccc
,,,,,,,,,,,MMMMMjjjjawgOQ8 Q wj#MMM888888cccccccccc
,,,,,,,,,,,8MMMMMM####jawoJ EP ga#MMM888888cccccccccc
,,,,,,,,,,,888MMMMMM###japP,gOPOorro# EwjMM8888888cccccccccc
,,,,,,,,,,,c8888888MMM##agaaaaaaaaawo, Gr.#MM888888ccccccccccc
,,,,,,,,,,,,c88888888888MM#######jjjwQg# ,aj#M888888cccccccccccc
,,,,,,,,,,,,cc8888888888888MMMM####jjagM Pa ,gQEPE#M888888ccccccccccccc
,,,,,,,,,,,,,cc88888888888888MMMMMM##jawwwwrpQ OprwjjjJ#MM88888cccccccccccccc
,,,,,,,,,,,,,,cc8888888888888888MMMMMMM##jjawQ Jwj####MM888888ccccccccccccccc
,,,,,,,,,,,,,,,ccc8888888888888888MMMMMMM###jwrG#owj##MMMM88888ccccccccccccccccc
,,,,,,,,,,,,,,,,ccccc8888888888888888MMMMMM##jaoro #MMMM8888cccccccccccccccccccc
,,,,,,,,,,,,,,,,,ccccccc8888888888888888MMMMM#.w##MMM8888ccccccccccccccccccccccc
,,,,,,,,,,,,,,,,,,,ccccccccc8888888888888888888888888ccccccccccccccccccccccccccc
,,,,,,,,,,,,,,,,,,,,ccccccccccccccc88888888888cccccccccccccccccccccccccccccccccc
,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccccccccccccccccccccccccccccccccccc,
,,,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccccccccccccccccccccccccccccccc,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccccccccccccccccccccccccc,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccccccccccccccccccccc,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccccccccccccc,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ccccccccccccccccccccccccc,,,,,,,,,,,,,,,,,
Related
OpenCL Kernel implementing im2col with batch
I am trying to adapt a secuential function writen for CPU to an OpenCL kernel for GPU. The function is the well known im2col used in many deep learning applications. I have found some code on the OpenCV repository implementing this im2col function written in OpenCL but the one that I have to adapt uses a batch that confuses me and seems to be a bit different. What should I change on the OpenCL kernel to make it work the same on GPU as it does on the CPU function? CPU code int fn_im2col_cpu(int I, int WI, int HI, int B, int KW, int KH, int WO, int HO, int PW, int PH, int SW, int SH, type *in_ptr, type *out_ptr) { PROFILING_HEADER_EXTERN(im2col); PROFILING_DEVICE(im2col, DEV_CPU); int i; // scrolls input channels int w; // scrolls channel columns (width) int h; // scrolls channel rows (height) int kw; // scrolls filter columns (width) int kh; // scrolls filter rows (height) // we sweep all output pixels, and for each pixel we compute the associated input pixel #pragma omp parallel for private (kh, kw, h, w) for (i = 0; i < I; i++) { size_t out_addr = ((size_t)B * (size_t)WO * (size_t)HO * (size_t)KW * (size_t)KH * (size_t)i); size_t in_addr1 = (size_t)i * (size_t)B * (size_t)WI * (size_t)HI; for (kh = 0; kh < KH; kh++) { for (kw = 0; kw < KW; kw++) { for (h = 0; h < HO; h++) { int hi = h * SH - PH + kh; size_t in_addr2 = in_addr1 + ((size_t)hi * (size_t)B * (size_t)WI); for (w = 0; w < WO; w++) { int wi = w * SW - PW + kw; int force_padding = (wi < 0) || (wi >= WI) || (hi < 0) || (hi >= HI); if (force_padding) { bzero(&out_ptr[out_addr], B*sizeof(type)); } else { int in_addr = in_addr2 + (wi * B); memcpy(&out_ptr[out_addr], &in_ptr[in_addr], B*sizeof(type)); } out_addr+=B; } } } } } return 1; } OpenCL kernel from https://github.com/opencv/opencv/blob/master/modules/dnn/src/opencl/im2col.cl __kernel void im2col(__global const float *im_src, int im_src_offset, int channels, int height_inp, int width_inp, int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, 
int stride_w, int height_out, int width_out, __global float *im_col, int im_col_offset ) { int index = get_global_id(0); if (index >= height_out * width_out * channels) return; int j_out = index % width_out; int i_out = (index / width_out) % height_out; int c_inp = (index / width_out) / height_out; int c_out = c_inp * kernel_h * kernel_w; int i_inp = i_out * stride_h - pad_h; int j_inp = j_out * stride_w - pad_w; im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset; im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset; for (int ki = 0; ki < kernel_h; ++ki) for (int kj = 0; kj < kernel_w; ++kj) { int i = i_inp + ki; int j = j_inp + kj; *im_col = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ? im_src[ki * width_inp + kj] : 0; im_col += height_out * width_out; } }
Your C version folds the batch into the lowest dimension. The opencl version isn't even using batch. You need to pass in the batch size "B", and change this copy to a block copy (or just do a loop over) by the batch size: for (int b=0; b<B; b++) *(im_col*B+b) = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ? im_src[(ki * width_inp + kj)*B + b] : 0; to emulate the memcpy(..., B*sizeof(type)). And then just stride B times more: im_col += height_out * width_out * B;
openMP and Mandelbrot in C
I have to parallelize using openMP the serial version of a program in C to visualize a Mandelbrot set. I tried to do it but I obtain something really strange. #include <stdlib.h> #include <stdio.h> #include <omp.h> #include <unistd.h> #include <time.h> #include <sys/time.h> #include "pngwriter.h" #include "consts.h" unsigned long get_time() { struct timeval tp; gettimeofday(&tp, NULL); return tp.tv_sec * 1000000 + tp.tv_usec; } int main(int argc, char** argv) { png_data* pPng = png_create(IMAGE_WIDTH, IMAGE_HEIGHT); double x, y, x2, y2, cx, cy; cy = MIN_Y; double fDeltaX = (MAX_X - MIN_X) / (double)IMAGE_WIDTH; double fDeltaY = (MAX_Y - MIN_Y) / (double)IMAGE_HEIGHT; long nTotalIterationsCount = 0; unsigned long nTimeStart = get_time(); long i, j, n; n = 0; int c; #pragma omp parallel { #pragma omp for private(i, c) reduction(+ : cx, cy) for (j = 0; j < IMAGE_HEIGHT; j++) { cx = MIN_X; for (i = 0; i < IMAGE_WIDTH; i++) { x = cx; y = cy; x2 = x * x; y2 = y * y; for (n = 0; (n < MAX_ITERS) && (x2 + y2 < 4); n++) { y = 2 * x * y + cy; x = x2 - y2 + cx; x2 = x * x; y2 = y * y; } int c = ((long)n * 255) / MAX_ITERS; png_plot(pPng, i, j, c, c, c); cx += fDeltaX; nTotalIterationsCount++; } cy += fDeltaY; } } unsigned long nTimeEnd = get_time(); png_write(pPng, "mandel.png"); return 0; } I obtain this: https://usi365-my.sharepoint.com/personal/fabbrl_usi_ch/_layouts/15/guestaccess.aspx?guestaccesstoken=d83LRC8EG1Kec%2f%2f6zwCbiHkO7%2bsuGv7JyWR%2flalvPvA%3d&docid=128ed81bef8b244d680d5651ad1afea2f&rev=1 Since this is an assignment, I am not here to ask for code. Just an explanation. Thanks.
Adding more threads to program resulted in longer execution time for calculating trapezoidal integration
I am working on a multi-threaded numerical integration program using the trapezoidal rule. I have a struct which contains six items: typedef struct trapezoidalIntegrationThread{ float a; float b; int n; float h; double res; float elTime; }threadParams; a is the left end point, b is the right end point, n is the number of trapezoids, h is the height, res is the result calculated within compute_with_pthread, and finally, elTime is the elapsed time for compute_with_pthread for benchmarking. Here is my code in main: int n = NUM_TRAPEZOIDS; float a = LEFT_ENDPOINT; float b = RIGHT_ENDPOINT; pthread_t masterThread; pthread_t slaveThread[NUM_THREADs]; threadParams *trapThread; for(i = 0; i < NUM_THREADs; i++) { trapThread = (threadParams *) malloc(sizeof(threadParams)); trapThread->a = a; trapThread->b = b; trapThread->n = n; trapThread->h = (b - a) / (float) n; if (pthread_create(&slaveThread[i], NULL, compute_using_pthreads, (void *) trapThread) != 0) { printf("Looks like something went wrong..\n"); return -1; } } for(i = 0; i < NUM_THREADs; i++) { pthread_join(slaveThread[i], NULL); } pthread_exit((void *) masterThread); I am basically creating the number of threads defined in NUM_THREADS (let's assume this value is 4). I am allocating how much memory the struct needs, and setting the pre-defined values of: #define LEFT_ENDPOINT 5 #define RIGHT_ENDPOINT 1000 #define NUM_TRAPEZOIDS 100000000 #define NUM_THREADs 8 /* Number of threads to run. 
*/ Next, I create my pthreads, and call the compute_using_pthreads function: void *compute_using_pthreads(void *inputs) { double integral; int k; threadParams *args = (threadParams *) inputs; unsigned long p_micros = 0; float p_millis = 0.0; clock_t p_start, p_end; float a = args->a; float b = args->b; int n = args->n; float h = args->h; p_start = clock(); integral = (f(a) + f(b))/2.0; for (k = 1; k <= n-1; k++) { integral += f(a+k*h); } integral = integral*h; p_end = clock(); p_micros = p_end - p_start; p_millis = p_micros / 1000; args->res = integral; args->elTime = p_millis; } I ran this program and compared it against a non-multithreaded function: double compute_gold(float a, float b, int n, float h) { double integral; int k; integral = (f(a) + f(b))/2.0; for (k = 1; k <= n-1; k++) { integral += f(a+k*h); } integral = integral*h; return integral; } So here are the results: Run-time of compute_gold: ~3000 ms Run_time of compute_with_pthread: Using 1 thread: ~3000 ms Using 2 threads: ~6000 ms Using 4 thrads: ~12000 ms .... So for some reason, the more threads I added, the execution took n-threads more time to execute. I can't for the life of me figure out why this is happening, as I am quite new to C programming =/
segment fault on programming C
I am tyring to make velocity Verlet method, by using C language. I thought I made it good. However, there pops up 'Segmentation fault(core dumped)' whenever, I increase the size of the vector or array, x and y. For the size n equal and less than 1e3, it's fine, but at the point of n = 1e4, the program gets error. Please anybody help me on this. Thank you. #include <stdio.h> #include <stdlib.h> #include <math.h> double verlet(double t, double x) { double E = 0.252; double B = 0.052; double a = M_PI/2; return -sin(x) + E*cos(t) + B*cos(2*t+a); } double pverlet(double(*f)(double, double), double dt, double t, double x, double y) { return x + dt*( y + (dt/2)*f(t, x)); } double vverlet(double(*g)(double, double), double dt, double t, double x, double y) { return y + (dt/2) * g(t, x); } int main(void) { int i; double t; int n = 1e4; double ti = 0, tf = 1e5, dt = (tf-ti)/n; double *x = (double *) malloc(sizeof(double)*n); double *y = (double *) malloc(sizeof(double)*2*n); if (x == NULL) { printf("error allocating memory!\n"); return 1; } if (y == NULL) { printf("error allocating memory!\n"); return 1; } for (y[0] = 0, i = 1; i <2*n; i++) { y[i] = vverlet(verlet, dt, ti + dt*(i-1), x[i-1], y[i-1]); } for (x[0] = 0, i = 1; i < n; i++) { x[i] = pverlet(verlet, dt, ti + dt*(i-1), x[i-1], y[2*(i-1)]); } for (i = 0; i < n; i++) { t = ti + dt * i; printf("%e %e %e\n", t, x[i], y[2*i]); } return 0; free(x); free(y); }
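for (y[0] = 0, i = 1; i <2*n; i++) { y[i] = vverlet(verlet, dt, ti + dt*(i-1), x[i-1], y[i-1]); } This loop reads x[i-1] for i up to 2*n-1, but x was allocated with only n elements (valid indices 0 to n-1), so it is read out of bounds as soon as i exceeds n.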
problems with compiling openMP and math library
I'm trying to compile a program with openMP: gcc -c fopenmp -lm prog.c -o prog prog.c includes . However when I run ./prog, the error is: bash: ./prog: Permission denied I tried again without the -c flag: gcc -o prog -fopenmp -lm prog.c But this time it doesn't seem to see the functions in math library like cos, sqrt (undefined reference). #include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <time.h> #include <sys/time.h> #include <omp.h> #define REAL float #define NX (64) #ifndef M_PI #define M_PI (3.1415926535897932384626) #endif void init(REAL *buff, const int nx, const int ny, const int nz, const REAL kx, const REAL ky, const REAL kz, const REAL dx, const REAL dy, const REAL dz, const REAL kappa, const REAL time) { REAL ax, ay, az; int jz, jy, jx; ax = exp(-kappa*time*(kx*kx)); ay = exp(-kappa*time*(ky*ky)); az = exp(-kappa*time*(kz*kz)); for (jz = 0; jz < nz; jz++) { for (jy = 0; jy < ny; jy++) { for (jx = 0; jx < nx; jx++) { int j = jz*nx*ny + jy*nx + jx; REAL x = dx*((REAL)(jx + 0.5)); REAL y = dy*((REAL)(jy + 0.5)); REAL z = dz*((REAL)(jz + 0.5)); REAL f0 = (REAL)0.125 *(1.0 - ax*cos(kx*x)) *(1.0 - ay*cos(ky*y)) *(1.0 - az*cos(kz*z)); buff[j] = f0; } } } } REAL accuracy(const REAL *b1, REAL *b2, const int len) { REAL err = 0.0; int i; for (i = 0; i < len; i++) { err += (b1[i] - b2[i]) * (b1[i] - b2[i]); } return (REAL)sqrt(err/len); } typedef void (*diffusion_loop_t)(REAL *f1, REAL *f2, int nx, int ny, int nz, REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, REAL **f_ret, REAL *time_ret, int *count_ret); static void diffusion_baseline(REAL *f1, REAL *f2, int nx, int ny, int nz, REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, REAL **f_ret, REAL *time_ret, int *count_ret) { REAL time = 0.0; int count = 0; do { int z; for (z = 0; z < nz; z++) { int y; for (y = 0; y < ny; y++) { int x; for (x = 0; x < nx; x++) { int c, w, e, n, s, b, t; c = x + y * nx + z * nx * ny; w = (x == 0) ? 
c : c - 1; e = (x == nx-1) ? c : c + 1; n = (y == 0) ? c : c - nx; s = (y == ny-1) ? c : c + nx; b = (z == 0) ? c : c - nx * ny; t = (z == nz-1) ? c : c + nx * ny; f2[c] = cc * f1[c] + cw * f1[w] + ce * f1[e] + cs * f1[s] + cn * f1[n] + cb * f1[b] + ct * f1[t]; } } } REAL *t = f1; f1 = f2; f2 = t; time += dt; count++; } while (time + 0.5*dt < 0.1); *time_ret = time; *f_ret = f1; *count_ret = count; return; } static void diffusion_openmp(REAL *f1, REAL *f2, int nx, int ny, int nz, REAL ce, REAL cw, REAL cn, REAL cs, REAL ct, REAL cb, REAL cc, REAL dt, REAL **f_ret, REAL *time_ret, int *count_ret) { #pragma omp parallel { REAL time = 0.0; int count = 0; REAL *f1_t = f1; REAL *f2_t = f2; #pragma omp master printf("%d threads running\n", omp_get_num_threads()); do { int z; #pragma omp for for (z = 0; z < nz; z++) { int y; for (y = 0; y < ny; y++) { int x; for (x = 0; x < nx; x++) { int c, w, e, n, s, b, t; c = x + y * nx + z * nx * ny; w = (x == 0) ? c : c - 1; e = (x == nx-1) ? c : c + 1; n = (y == 0) ? c : c - nx; s = (y == ny-1) ? c : c + nx; b = (z == 0) ? c : c - nx * ny; t = (z == nz-1) ? 
c : c + nx * ny; f2_t[c] = cc * f1_t[c] + cw * f1_t[w] + ce * f1_t[e] + cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t]; } } } REAL *t = f1_t; f1_t = f2_t; f2_t = t; time += dt; count++; } while (time + 0.5*dt < 0.1); #pragma omp master { *f_ret = f1_t; *time_ret = time; *count_ret = count; } } return; } int main(int argc, char *argv[]) { struct timeval time_begin, time_end; int nx = NX; int ny = NX; int nz = NX; REAL *f1 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX); REAL *f2 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX); REAL time = 0.0; int count = 0; REAL l, dx, dy, dz, kx, ky, kz, kappa, dt; REAL ce, cw, cn, cs, ct, cb, cc; l = 1.0; kappa = 0.1; dx = dy = dz = l / nx; kx = ky = kz = 2.0 * M_PI; dt = 0.1*dx*dx / kappa; init(f1, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time); ce = cw = kappa*dt/(dx*dx); cn = cs = kappa*dt/(dy*dy); ct = cb = kappa*dt/(dz*dz); cc = 1.0 - (ce + cw + cn + cs + ct + cb); diffusion_loop_t diffusion_loop = diffusion_baseline; if (argc == 2) { if (strcmp(argv[1], "openmp") == 0) { diffusion_loop = diffusion_openmp; } } gettimeofday(&time_begin, NULL); diffusion_loop(f1, f2, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc, dt, &f1, &time, &count); gettimeofday(&time_end, NULL); REAL *answer = (REAL *)malloc(sizeof(REAL) * nx*ny*nz); init(answer, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time); REAL err = accuracy(f1, answer, nx*ny*nz); double elapsed_time = (time_end.tv_sec - time_begin.tv_sec) + (time_end.tv_usec - time_begin.tv_usec)*1.0e-6; REAL mflops = (nx*ny*nz)*13.0*count/elapsed_time * 1.0e-06; double thput = (nx * ny * nz) * sizeof(REAL) * 2.0 * count / elapsed_time / (1 << 30); fprintf(stderr, "elapsed time : %.3f (s)\n", elapsed_time); fprintf(stderr, "flops : %.3f (MFlops)\n", mflops); fprintf(stderr, "throughput : %.3f (GB/s)\n", thput); fprintf(stderr, "accuracy : %e\n", err); free(answer); free(f1); free(f2); return 0; }
In your first case, the -c flag only compiles to an object file, not an actual executable. Since object files don't normally have the executable bit set (because they're not directly executable), you get the Permission denied error. In your second case, it's because the order in which you specify -l libraries is important. You need to move -lm after your prog.c file, like this: gcc -o prog -fopenmp prog.c -lm