Related
I'm trying to parallelize my code, but i got errors. I need to calc a Cauchy problem (it's already done) but than i need to parallelize it using OpenMP lib.
I've tried to write some code with OpenMP, but it's not working.
I've created a struct to collect result.
struct Dots {
double par;
double x;
double y;
};
This is my target function with parameter.
int ode_func (double x, const double y[], double f[], void *params)
{
double mu = *(int *)params;
f[0] = x + 2 * y[0] / (1 + mu * mu);
return GSL_SUCCESS;
}
This is the main function. I currently didn't find a way how to create a array of arrays of struct, but this is not the main problem.
void calc_cauchy_problem(struct Dots ArrayOfDots[], double x_start, double x_end, double y_start,
int count) {
int dim = 1;
double x = x_start;
double y[1] = {y_start};
int mu = 5;
int param = 0;
gsl_odeiv2_system sys = {ode_func, NULL, dim, ¶m};
gsl_odeiv2_driver * d = gsl_odeiv2_driver_alloc_y_new (&sys,
gsl_odeiv2_step_rkf45, 1e-6, 1e-6, 0.0);
int status = 0;
#pragma omp parallel for shared(ArrayOfDots) private(sys, param, d, status)
for (int param = 1; param < mu; param++) {
gsl_odeiv2_system sys = {ode_func, NULL, dim, ¶m};
gsl_odeiv2_driver * d = gsl_odeiv2_driver_alloc_y_new (&sys,
gsl_odeiv2_step_rkf45, 1e-6, 1e-6, 0.0);
for (int i = 1; i <= count; i++)
{
double xi = x_start + i * (x_end - x_start) / count;
int status = gsl_odeiv2_driver_apply(d, &x, xi, y);
if (status != GSL_SUCCESS)
{
printf ("error, return value=%d\n", status);
break;
}
// ArrayOfDots[i].par = mu;
// ArrayOfDots[i].x = xi;
// ArrayOfDots[i].y = y[0];
}
gsl_odeiv2_driver_free (d);
}
}
The main
int main() {
double x_start = 0;
double x_end = 10;
double y_start = 0;
int count = 10;
struct Dots ArrayOfDots[count];
calc_cauchy_problem(ArrayOfDots, x_start, x_end, y_start, count);
return 0;
}
It's compiled successfully with this gcc main.c -o main -fopenmp -lgsl -std=gnu11 but when i launch it i got error
gsl: driver.c:354: ERROR: integration limits and/or step direction not consistent
Default GSL error handler invoked.
I think that the main problem with this #pragma omp parallel for shared(ArrayOfDots) private(sys, param, d, status) but i have no idea how to rewrite this in the other way.
Thanks for your responses.
UPD:
With Kaveh Vahedipour help my code partially start to work. It means that half of my for cycle start to work.
UPD UPD:
After another investigations i had the following code:
It's compile and run, but i got Process finished with exit code 4 and printf("Elapsed time = %f\n", omp_get_wtime() - start_time); don't print anything.
struct Dots {
double par;
double x;
double y;
};
int ode_func (double x, const double y[], double f[], void *params)
{
double mu = *(int *)params;
f[0] = (x + 2 * y[0]) / (1 + mu * mu);
return GSL_SUCCESS;
}
void calc_cauchy_problem(double x_start, double x_end, double y_start,
int count, int param1, int param2) {
int dim = 1;
double x = x_start;
double y[1] = {y_start};
int param = param1;
int j = 0;
int status = 0;
char filename[10];
#pragma omp parallel for private(param, status, x, y)
for (param = param1; param <= param2; param++) {
struct Dots ArrayOfDots[count];
gsl_odeiv2_system sys = {ode_func, NULL, dim, ¶m};
gsl_odeiv2_driver * d =
gsl_odeiv2_driver_alloc_y_new (&sys, gsl_odeiv2_step_rkf45, 1e-6, 1e-6, 0.0);
for (int i = 1; i <= count; i++) {
double xi = x_start + i * (x_end - x_start) / count;
int status = gsl_odeiv2_driver_apply(d, &x, xi, y);
if (status != GSL_SUCCESS)
{
printf ("error, return value=%d\n", status);
break;
}
ArrayOfDots[i].par = param;
ArrayOfDots[i].x = xi;
ArrayOfDots[i].y = y[0];
}
gsl_odeiv2_driver_free (d);
}
}
int main() {
double start_time = omp_get_wtime();
double x_start = 0;
double x_end = 10;
double y_start = 0;
const int count = 500;
int param1 = 1;
int param2 = 10;
calc_cauchy_problem(x_start, x_end, y_start, count, param1, param2);
printf("Elapsed time = %f\n", omp_get_wtime() - start_time);
return 0;
}
Add x to private loop vars: private(sys, param, d, status, x). Please get back to me, if you still experience issues.
void calc_cauchy_problem(double x_start, double x_end, double y_start,
int count, int param1, int param2) {
int dim = 1;
double x = x_start;
double y[1] = {y_start};
int param = param1;
int j = 0;
int status = 0;
char filename[10];
#pragma omp parallel for private(param, status, x, y)
for (param = param1; param <= param2; param++) {
struct Dots ArrayOfDots[count];
gsl_odeiv2_system sys = {ode_func, NULL, dim, ¶m};
gsl_odeiv2_driver * d =
gsl_odeiv2_driver_alloc_y_new (&sys, gsl_odeiv2_step_rkf45, 1e-6, 1e-6, 0.0);
for (int i = 1; i <= count; i++) {
double xi = x_start + i * (x_end - x_start) / count;
int status = gsl_odeiv2_driver_apply(d, &x, xi, y);
if (status != GSL_SUCCESS)
{
printf ("error, return value=%d\n", status);
break;
}
ArrayOfDots[i].par = param;
ArrayOfDots[i].x = xi;
ArrayOfDots[i].y = y[0];
}
//write_data_to_file(param, count, ArrayOfDots);
for (int i = 0; i < count; ++i) {
printf ("%d: %f, %f, %f\n", omp_get_thread_num(),
ArrayOfDots[i].par, ArrayOfDots[i].x, ArrayOfDots[i].y);
}
gsl_odeiv2_driver_free (d);
}
}
Seems like this version works fine. I think problem was with this struct Dots ArrayOfDots[count]; and when i try to push values to this struct.
ArrayOfDots[i].par = param;
ArrayOfDots[i].x = xi;
ArrayOfDots[i].y = y[0];
Here is the full code.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
// GSL lib includes
#include <gsl/gsl_sf_bessel.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_odeiv2.h>
int ode_func (double x, const double y[], double f[], void *params)
{
double mu = *(int *)params;
f[0] = (x + 2 * y[0]) / (1 + mu * mu);
return GSL_SUCCESS;
}
void calc_cauchy_problem(double x_start, double x_end, double y_start,
int count, int param1, int param2) {
#pragma omp parallel for
for(int param = param1; param < param2; param++) {
gsl_odeiv2_system sys = {ode_func, NULL, 1, ¶m};
gsl_odeiv2_driver * d =
gsl_odeiv2_driver_alloc_y_new (&sys, gsl_odeiv2_step_rk8pd,
1e-6, 1e-6, 0.0);
int i;
double x = x_start, x1 = x_end;
double y[1] = { y_start };
for (i = 1; i <= count; i++)
{
double xi = i * x1 / count;
int status = gsl_odeiv2_driver_apply (d, &x, xi, y);
if (status != GSL_SUCCESS)
{
printf ("error, return value=%d\n", status);
break;
}
// printf ("%d %d %.5e %.5e\n", omp_get_thread_num(), param, x, y[0]);
}
gsl_odeiv2_driver_free (d);
}
}
int main() {
double start_time = omp_get_wtime();
double x_start = 0;
double x_end = 10;
double y_start = 0;
const int count = 100000;
int param1 = 1;
int param2 = 20;
calc_cauchy_problem(x_start, x_end, y_start, count, param1, param2);
printf("Elapsed time = %f\n", omp_get_wtime() - start_time);
return 0;
}
Really thanks to Kaveh Vahedipour.
I have an assignment to code a program to calculate cos(x) through the Maclaurin approximation. However I must use a function for the cos(x) and another one to calculate the exponentials that go on the denominators inside the cos(x) function. I think most of this is right, but I'm probably missing on something and I can't figure out what.
#include<stdio.h>
#include <stdlib.h>
#include <math.h>
int fat(int);
float cosx(float);
int main()
{
float x1;
/* Original code: **x1 = x1 * 3.14159 / 180;** `transforms the value to radians` */
x1 = x1 * 3.14159 / 180; /* transforms the value to radians */
printf("Insert number:\n");
scanf("%f", &x1);
printf("Cosine of %f = %f", x1, cosx(x1));
return 0;
}
int fat(int y)
{
int n, fat = 1;
for(n = 1; n <= y; n++)
{
fat = fat * n;
}
return fat;
}
float cosx(float x)
{
int i=1, a = 2, b, c = 1, e;
float cos;
while(i < 20)
{
b = c * (pow(x,a)) / e;
cos = 1 - b;
a += 2;
e = fat(a);
c *= -1;
i++;
}
return cos;
}
If I input 0 it returns -2147483648.000000, which is clearly wrong.
First error is uninitialized variable x1, and right after that you have use:
int x1; // <<< uninitiated variable;
**x1 = x1 * 3.14159 / 180;** `transforms the value to radians
this will produce random value, you should put
int x = 0; // or some other value of your choice
In my opinion you should move x1 = x1 * 3.14159/100; after scanf("%d", x1).
Than again uninitiated value e before use.
int i=1, a = 2, b, c = 1, e;
...
b = c * (pow(x,a)) / e;
...
than you have in the line b = c * pow(x,a) where you go out of range of int variable potentially. If e = 1, x = 2 and a > 31 you are out of range for b. Another problem is pow(x,a) is rising much faster than `e. thus you get bigger and bigger values thus you are getting another overflow. And here is the code that works:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
long double fact(int);
long double cosx(double);
long double my_pow (double b, int e);
int main()
{
double x1 = 45.00;
printf("Insert number:\n");
scanf("%lf", &x1);
x1 = x1 * 3.14159 / 180; // ** `transforms the value to radians`
printf("Cosine of %f = %.10LF", x1, cosx(x1));
return 0;
}
long double fact(int y)
{
int n;
double fact = 1;
for(n = 1; n <= y; n++)
{
fact *= n;
}
return fact;
}
long double cosx(double x)
{
int a = 2, c = -1;
long i = 0, lim = 500;
long double cos = 1;
long double b = 0, e = 0;
while(i < lim) {
e = fact(a);
b = c * my_pow(x,a);
cos += b/e;
// printf ("%le %le %le\n", e, b, cos);
a += 2;
c *= -1;
i++;
}
return cos;
}
long double my_pow (double b, int e) {
long double pow = 1;
for (;e > 0; --e, pow *= b)
;
return pow;
}
I'm trying to compile a program with openMP:
gcc -c fopenmp -lm prog.c -o prog
prog.c includes . However when I run ./prog, the error is:
bash: ./prog: Permission denied
I tried again without the -c flag:
gcc -o prog -fopenmp -lm prog.c
But this time it doesn't seem to see the functions in math library like cos, sqrt (undefined reference).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <omp.h>
#define REAL float
#define NX (64)
#ifndef M_PI
#define M_PI (3.1415926535897932384626)
#endif
void init(REAL *buff, const int nx, const int ny, const int nz,
const REAL kx, const REAL ky, const REAL kz,
const REAL dx, const REAL dy, const REAL dz,
const REAL kappa, const REAL time) {
REAL ax, ay, az;
int jz, jy, jx;
ax = exp(-kappa*time*(kx*kx));
ay = exp(-kappa*time*(ky*ky));
az = exp(-kappa*time*(kz*kz));
for (jz = 0; jz < nz; jz++) {
for (jy = 0; jy < ny; jy++) {
for (jx = 0; jx < nx; jx++) {
int j = jz*nx*ny + jy*nx + jx;
REAL x = dx*((REAL)(jx + 0.5));
REAL y = dy*((REAL)(jy + 0.5));
REAL z = dz*((REAL)(jz + 0.5));
REAL f0 = (REAL)0.125
*(1.0 - ax*cos(kx*x))
*(1.0 - ay*cos(ky*y))
*(1.0 - az*cos(kz*z));
buff[j] = f0;
}
}
}
}
REAL accuracy(const REAL *b1, REAL *b2, const int len) {
REAL err = 0.0;
int i;
for (i = 0; i < len; i++) {
err += (b1[i] - b2[i]) * (b1[i] - b2[i]);
}
return (REAL)sqrt(err/len);
}
typedef void (*diffusion_loop_t)(REAL *f1, REAL *f2, int nx, int ny, int nz,
REAL ce, REAL cw, REAL cn, REAL cs, REAL ct,
REAL cb, REAL cc, REAL dt,
REAL **f_ret, REAL *time_ret, int *count_ret);
static void
diffusion_baseline(REAL *f1, REAL *f2, int nx, int ny, int nz,
REAL ce, REAL cw, REAL cn, REAL cs, REAL ct,
REAL cb, REAL cc, REAL dt,
REAL **f_ret, REAL *time_ret, int *count_ret) {
REAL time = 0.0;
int count = 0;
do {
int z;
for (z = 0; z < nz; z++) {
int y;
for (y = 0; y < ny; y++) {
int x;
for (x = 0; x < nx; x++) {
int c, w, e, n, s, b, t;
c = x + y * nx + z * nx * ny;
w = (x == 0) ? c : c - 1;
e = (x == nx-1) ? c : c + 1;
n = (y == 0) ? c : c - nx;
s = (y == ny-1) ? c : c + nx;
b = (z == 0) ? c : c - nx * ny;
t = (z == nz-1) ? c : c + nx * ny;
f2[c] = cc * f1[c] + cw * f1[w] + ce * f1[e]
+ cs * f1[s] + cn * f1[n] + cb * f1[b] + ct * f1[t];
}
}
}
REAL *t = f1;
f1 = f2;
f2 = t;
time += dt;
count++;
} while (time + 0.5*dt < 0.1);
*time_ret = time;
*f_ret = f1;
*count_ret = count;
return;
}
static void
diffusion_openmp(REAL *f1, REAL *f2, int nx, int ny, int nz,
REAL ce, REAL cw, REAL cn, REAL cs, REAL ct,
REAL cb, REAL cc, REAL dt,
REAL **f_ret, REAL *time_ret, int *count_ret) {
#pragma omp parallel
{
REAL time = 0.0;
int count = 0;
REAL *f1_t = f1;
REAL *f2_t = f2;
#pragma omp master
printf("%d threads running\n", omp_get_num_threads());
do {
int z;
#pragma omp for
for (z = 0; z < nz; z++) {
int y;
for (y = 0; y < ny; y++) {
int x;
for (x = 0; x < nx; x++) {
int c, w, e, n, s, b, t;
c = x + y * nx + z * nx * ny;
w = (x == 0) ? c : c - 1;
e = (x == nx-1) ? c : c + 1;
n = (y == 0) ? c : c - nx;
s = (y == ny-1) ? c : c + nx;
b = (z == 0) ? c : c - nx * ny;
t = (z == nz-1) ? c : c + nx * ny;
f2_t[c] = cc * f1_t[c] + cw * f1_t[w] + ce * f1_t[e]
+ cs * f1_t[s] + cn * f1_t[n] + cb * f1_t[b] + ct * f1_t[t];
}
}
}
REAL *t = f1_t;
f1_t = f2_t;
f2_t = t;
time += dt;
count++;
} while (time + 0.5*dt < 0.1);
#pragma omp master
{
*f_ret = f1_t;
*time_ret = time;
*count_ret = count;
}
}
return;
}
int main(int argc, char *argv[])
{
struct timeval time_begin, time_end;
int nx = NX;
int ny = NX;
int nz = NX;
REAL *f1 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX);
REAL *f2 = (REAL *)malloc(sizeof(REAL)*NX*NX*NX);
REAL time = 0.0;
int count = 0;
REAL l, dx, dy, dz, kx, ky, kz, kappa, dt;
REAL ce, cw, cn, cs, ct, cb, cc;
l = 1.0;
kappa = 0.1;
dx = dy = dz = l / nx;
kx = ky = kz = 2.0 * M_PI;
dt = 0.1*dx*dx / kappa;
init(f1, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time);
ce = cw = kappa*dt/(dx*dx);
cn = cs = kappa*dt/(dy*dy);
ct = cb = kappa*dt/(dz*dz);
cc = 1.0 - (ce + cw + cn + cs + ct + cb);
diffusion_loop_t diffusion_loop = diffusion_baseline;
if (argc == 2) {
if (strcmp(argv[1], "openmp") == 0) {
diffusion_loop = diffusion_openmp;
}
}
gettimeofday(&time_begin, NULL);
diffusion_loop(f1, f2, nx, ny, nz, ce, cw, cn, cs, ct, cb, cc, dt,
&f1, &time, &count);
gettimeofday(&time_end, NULL);
REAL *answer = (REAL *)malloc(sizeof(REAL) * nx*ny*nz);
init(answer, nx, ny, nz, kx, ky, kz, dx, dy, dz, kappa, time);
REAL err = accuracy(f1, answer, nx*ny*nz);
double elapsed_time = (time_end.tv_sec - time_begin.tv_sec)
+ (time_end.tv_usec - time_begin.tv_usec)*1.0e-6;
REAL mflops = (nx*ny*nz)*13.0*count/elapsed_time * 1.0e-06;
double thput = (nx * ny * nz) * sizeof(REAL) * 2.0 * count
/ elapsed_time / (1 << 30);
fprintf(stderr, "elapsed time : %.3f (s)\n", elapsed_time);
fprintf(stderr, "flops : %.3f (MFlops)\n", mflops);
fprintf(stderr, "throughput : %.3f (GB/s)\n", thput);
fprintf(stderr, "accuracy : %e\n", err);
free(answer);
free(f1);
free(f2);
return 0;
}
In your first case, the -c flag only compiles to an object file, not an actual executable. Since object files don't normally have the executable bit set (because they're not directly executable), you get the Permission denied error.
In your second case, it's because the order in which you specify -l libraries is important. You need to move -lm after your prog.c file, like this:
gcc -o prog -fopenmp prog.c -lm
I have a square rotating in the console, but I get some holes. How can I fill it correctly?
#include <stdio.h>
#include <Windows.h>
#include <math.h>
void moveTo (int x, int y)
{
COORD coord = { x, y };
SetConsoleCursorPosition(GetStdHandle(STD_OUTPUT_HANDLE), coord);
}
double round (double number)
{
return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5);
}
double deg2rad (double a)
{
double pi = 3.14159265358979323846;
return a * pi / 180.0;
}
int main ()
{
int w = 8;
int h = 8;
int cx = 20;
int cy = 10;
double a = 0;
while (1)
{
system("cls");
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
{
double xx = x - 4;
double yy = y - 4;
double fx = xx * cos(a) - yy * sin(a);
double fy = xx * sin(a) + yy * cos(a);
int ix = cx + round(fx);
int iy = cy + round(fy);
moveTo(ix, iy);
printf("X");
}
}
a += deg2rad(15.0);
Sleep(100);
}
return 0;
}
Not an actual answer
Aparently your code will always print the same number of X in the screen, even though this might not be always the case.
I think you shouldn't be recalculating the positions of each predefinedX but instead calculate the geometry of the lines around the square and fill the space between with Xs, as many as necessary until it hit the opposite border or something.
An alternative solution could be doubling the "density" of your square:
int w = 8*2;
int h = 8*2;
int cx = 20*2;
int cy = 10*2;
...
moveTo(ix/2, iy/2);
This should double print some dots, but should also fill gaps.
I've wanted to create program that generates fractals on my GPU.
First I created a working project in C, after that I tried to convert it into CUDA/C.
Unfortunately, after I did it I saw that there is a difference in results of CPU and GPU.
I spend few hours thinking what I did wrong and it's a mystery to me.
IMO: It seems that there is a difference of calculating values in while loop, therefore it ends earlier than in normal CPU function.
Question: is there any possibility that it is true? And if, what can I do to avoid that kind of computing error?
Here's my entire code:
// C libs
#include <stdint.h>
#include <stdio.h>
#include <iostream>
// Help libs
#include <windows.h>
#include <math.h>
// CUDA libs
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void calulateFractal(unsigned char *a, int N, double c_re, double c_im, int width, int height, double minX, double maxX, double minY, double maxY, double ratioX, double ratioY, int maxLevel)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < N)
{
int x = i % width;
int y = i / width;
double p_im = y * ratioY + minY;
double p_re = x * ratioX + minX;
double z_re = p_re;
double z_im = p_im;
int iteration = 0;
while ((z_re * z_re + z_im * z_im) < 4 && iteration < maxLevel)
{
double tmp_re = z_re * z_re - z_im * z_im + c_re;
double tmp_im = 2 * z_re * z_im + c_im;
z_re = tmp_re;
z_im = tmp_im;
iteration++;
}
a[i] = iteration;
}
}
void calulateFractalCPU(unsigned char *a, int i, double c_re, double c_im, int width, int height, double minX, double maxX, double minY, double maxY, double ratioX, double ratioY, int maxLevel)
{
int x = i % width;
int y = i / width;
double p_im = y * ratioY + minY;
double p_re = x * ratioX + minX;
double z_re = p_re;
double z_im = p_im;
int iteration = 0;
while ((z_re * z_re + z_im * z_im) < 4 && iteration < 99)
{
double tmp_re = z_re * z_re - z_im * z_im + c_re;
double tmp_im = 2 * z_re * z_im + c_im;
z_re = tmp_re;
z_im = tmp_im;
iteration++;
}
a[i] = iteration;
}
int saveFractalToBitmap(unsigned char **colorsArray, unsigned char *bitmap, int width, int height, char *filename)
{
// Bitmap structures to be written to file
BITMAPFILEHEADER bfh;
BITMAPINFOHEADER bih;
// Fill BITMAPFILEHEADER structure
memcpy((char *)&bfh.bfType, "BM", 2);
bfh.bfSize = sizeof(bfh) + sizeof(bih) + 3*height*width;
bfh.bfReserved1 = 0;
bfh.bfReserved2 = 0;
bfh.bfOffBits = sizeof(bfh) + sizeof(bih);
// Fill BITMAPINFOHEADER structure
bih.biSize = sizeof(bih);
bih.biWidth = width;
bih.biHeight = height;
bih.biPlanes = 1;
bih.biBitCount = 24;
bih.biCompression = BI_RGB; // uncompressed 24-bit RGB
bih.biSizeImage = 0; // can be zero for BI_RGB bitmaps
bih.biXPelsPerMeter = 3780; // 96dpi equivalent
bih.biYPelsPerMeter = 3780;
bih.biClrUsed = 0;
bih.biClrImportant = 0;
// Open bitmap file (binary mode)
FILE *f;
f = fopen(filename, "wb");
if(f == NULL)
return -1;
// Write bitmap file header
fwrite(&bfh, 1, sizeof(bfh), f);
fwrite(&bih, 1, sizeof(bih), f);
// Write bitmap pixel data starting with the
// bottom line of pixels, left hand side
for (int i = 0; i < width * height ; i++)
{
// Write pixel components in BGR order
fputc(colorsArray[bitmap[i]][2], f);
fputc(colorsArray[bitmap[i]][1], f);
fputc(colorsArray[bitmap[i]][0], f);
}
// Close bitmap file
fclose(f);
return 0;
}
int main()
{
unsigned char **colorsArray;
unsigned char *fractalLevelsCPU;
unsigned char *fractalLevelsGPU;
double minX = -1.7;
double maxX = 1.7;
double minY = -1.5;
double maxY = 1.5;
double input_re = -0.79;
double input_im = 0.1463;
int width = 10;
int height = 5;
int N = width * height;
int maxLevel = 100;
size_t levelsArraySize = N * sizeof(unsigned char);
double ratioX = (maxX - minX) / (double) width;
double ratioY = (maxY - minY) / (double) height;
bool gpu = true;
// Allocate memory
colorsArray = (unsigned char**) malloc((maxLevel+1) * sizeof(unsigned char*));
for(int i=0; i<=maxLevel; i++)
{
colorsArray[i] = (unsigned char *) malloc(3 * sizeof(unsigned char));
colorsArray[i][0] = (int) (255.0 * i / maxLevel);
colorsArray[i][1] = (int) (255.0 * i / maxLevel);
colorsArray[i][2] = (int) (255.0 * log((double) i) / log((double) maxLevel));
}
fractalLevelsCPU = (unsigned char*) malloc(levelsArraySize);
cudaMalloc((unsigned char **) &fractalLevelsGPU, levelsArraySize);
cudaMemcpy(fractalLevelsCPU, fractalLevelsGPU, levelsArraySize, cudaMemcpyHostToDevice);
if(gpu)
{
// Run GPU method
calulateFractal <<< 1, N >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);
// Copy data from GPU to CPU array
cudaMemcpy(fractalLevelsCPU, fractalLevelsGPU, levelsArraySize, cudaMemcpyDeviceToHost);
}
else
{
// Iterate every element in array and compute level of fractal
for(int i=0; i<N; i++)
{
calulateFractalCPU(fractalLevelsCPU, i, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);
}
}
// Show results
for(int i=0; i<N; i++)
{
if((i % width) == 0)
printf("\n");
printf("%d\t", fractalLevelsCPU[i]);
}
//saveFractalToBitmap(colorsArray, fractalLevelsCPU, width, height, "frac.bmp");
// Free memory
for(int i=0; i<=maxLevel; i++)
{
free(colorsArray[i]);
}
free(colorsArray);
free(fractalLevelsCPU);
cudaFree(fractalLevelsGPU);
return 0;
}
I've find solution to my problem.
First of all, number of threads per block should be a power of two number.
Also I realized that my GPU has it's limits for number of threads per block and blocks itself.
NVIDIA Utils showed me that I can use max 65536 blocks and 512 threads per block.
Solution:
int threadsPerBlock = 512;
int blocksNumber = N/threadsPerBlock + (N % threadsPerBlock == 0 ? 0:1);
if(blocksNumber > 65536)
return -1;
calulateFractal <<< blocksNumber, threadsPerBlock >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);