Different result of computing in GPU (CUDA) and CPU - c

I've wanted to create program that generates fractals on my GPU.
First I created a working project in C, after that I tried to convert it into CUDA/C.
Unfortunately, after I did it I saw that there is a difference in results of CPU and GPU.
I spend few hours thinking what I did wrong and it's a mystery to me.
IMO: It seems that there is a difference of calculating values in while loop, therefore it ends earlier than in normal CPU function.
Question: is there any possibility that it is true? And if, what can I do to avoid that kind of computing error?
Here's my entire code:
// C libs
#include <stdint.h>
#include <stdio.h>
#include <iostream>
// Help libs
#include <windows.h>
#include <math.h>
// CUDA libs
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void calulateFractal(unsigned char *a, int N, double c_re, double c_im, int width, int height, double minX, double maxX, double minY, double maxY, double ratioX, double ratioY, int maxLevel)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < N)
{
int x = i % width;
int y = i / width;
double p_im = y * ratioY + minY;
double p_re = x * ratioX + minX;
double z_re = p_re;
double z_im = p_im;
int iteration = 0;
while ((z_re * z_re + z_im * z_im) < 4 && iteration < maxLevel)
{
double tmp_re = z_re * z_re - z_im * z_im + c_re;
double tmp_im = 2 * z_re * z_im + c_im;
z_re = tmp_re;
z_im = tmp_im;
iteration++;
}
a[i] = iteration;
}
}
void calulateFractalCPU(unsigned char *a, int i, double c_re, double c_im, int width, int height, double minX, double maxX, double minY, double maxY, double ratioX, double ratioY, int maxLevel)
{
int x = i % width;
int y = i / width;
double p_im = y * ratioY + minY;
double p_re = x * ratioX + minX;
double z_re = p_re;
double z_im = p_im;
int iteration = 0;
while ((z_re * z_re + z_im * z_im) < 4 && iteration < 99)
{
double tmp_re = z_re * z_re - z_im * z_im + c_re;
double tmp_im = 2 * z_re * z_im + c_im;
z_re = tmp_re;
z_im = tmp_im;
iteration++;
}
a[i] = iteration;
}
int saveFractalToBitmap(unsigned char **colorsArray, unsigned char *bitmap, int width, int height, char *filename)
{
// Bitmap structures to be written to file
BITMAPFILEHEADER bfh;
BITMAPINFOHEADER bih;
// Fill BITMAPFILEHEADER structure
memcpy((char *)&bfh.bfType, "BM", 2);
bfh.bfSize = sizeof(bfh) + sizeof(bih) + 3*height*width;
bfh.bfReserved1 = 0;
bfh.bfReserved2 = 0;
bfh.bfOffBits = sizeof(bfh) + sizeof(bih);
// Fill BITMAPINFOHEADER structure
bih.biSize = sizeof(bih);
bih.biWidth = width;
bih.biHeight = height;
bih.biPlanes = 1;
bih.biBitCount = 24;
bih.biCompression = BI_RGB; // uncompressed 24-bit RGB
bih.biSizeImage = 0; // can be zero for BI_RGB bitmaps
bih.biXPelsPerMeter = 3780; // 96dpi equivalent
bih.biYPelsPerMeter = 3780;
bih.biClrUsed = 0;
bih.biClrImportant = 0;
// Open bitmap file (binary mode)
FILE *f;
f = fopen(filename, "wb");
if(f == NULL)
return -1;
// Write bitmap file header
fwrite(&bfh, 1, sizeof(bfh), f);
fwrite(&bih, 1, sizeof(bih), f);
// Write bitmap pixel data starting with the
// bottom line of pixels, left hand side
for (int i = 0; i < width * height ; i++)
{
// Write pixel components in BGR order
fputc(colorsArray[bitmap[i]][2], f);
fputc(colorsArray[bitmap[i]][1], f);
fputc(colorsArray[bitmap[i]][0], f);
}
// Close bitmap file
fclose(f);
return 0;
}
int main()
{
unsigned char **colorsArray;
unsigned char *fractalLevelsCPU;
unsigned char *fractalLevelsGPU;
double minX = -1.7;
double maxX = 1.7;
double minY = -1.5;
double maxY = 1.5;
double input_re = -0.79;
double input_im = 0.1463;
int width = 10;
int height = 5;
int N = width * height;
int maxLevel = 100;
size_t levelsArraySize = N * sizeof(unsigned char);
double ratioX = (maxX - minX) / (double) width;
double ratioY = (maxY - minY) / (double) height;
bool gpu = true;
// Allocate memory
colorsArray = (unsigned char**) malloc((maxLevel+1) * sizeof(unsigned char*));
for(int i=0; i<=maxLevel; i++)
{
colorsArray[i] = (unsigned char *) malloc(3 * sizeof(unsigned char));
colorsArray[i][0] = (int) (255.0 * i / maxLevel);
colorsArray[i][1] = (int) (255.0 * i / maxLevel);
colorsArray[i][2] = (int) (255.0 * log((double) i) / log((double) maxLevel));
}
fractalLevelsCPU = (unsigned char*) malloc(levelsArraySize);
cudaMalloc((unsigned char **) &fractalLevelsGPU, levelsArraySize);
cudaMemcpy(fractalLevelsCPU, fractalLevelsGPU, levelsArraySize, cudaMemcpyHostToDevice);
if(gpu)
{
// Run GPU method
calulateFractal <<< 1, N >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);
// Copy data from GPU to CPU array
cudaMemcpy(fractalLevelsCPU, fractalLevelsGPU, levelsArraySize, cudaMemcpyDeviceToHost);
}
else
{
// Iterate every element in array and compute level of fractal
for(int i=0; i<N; i++)
{
calulateFractalCPU(fractalLevelsCPU, i, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);
}
}
// Show results
for(int i=0; i<N; i++)
{
if((i % width) == 0)
printf("\n");
printf("%d\t", fractalLevelsCPU[i]);
}
//saveFractalToBitmap(colorsArray, fractalLevelsCPU, width, height, "frac.bmp");
// Free memory
for(int i=0; i<=maxLevel; i++)
{
free(colorsArray[i]);
}
free(colorsArray);
free(fractalLevelsCPU);
cudaFree(fractalLevelsGPU);
return 0;
}

I've find solution to my problem.
First of all, number of threads per block should be a power of two number.
Also I realized that my GPU has it's limits for number of threads per block and blocks itself.
NVIDIA Utils showed me that I can use max 65536 blocks and 512 threads per block.
Solution:
int threadsPerBlock = 512;
int blocksNumber = N/threadsPerBlock + (N % threadsPerBlock == 0 ? 0:1);
if(blocksNumber > 65536)
return -1;
calulateFractal <<< blocksNumber, threadsPerBlock >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);

Related

Hough transform implementation using C using 2D raw image

I'm trying to implement Hough algorithm using C programming language and 2D raw image.
I have written the code in order to get separate output image for edge detection and Hough transform. When I do edge detection only I'm getting the proper edge detected output image. But here I'm getting the output raw image of size 0KB.I'm not getting where I made mistake.
Can anybody please help me to rectify this problem and give idea for further development such as draw line based on Hough space for the below code.
Thank you in advance.
#include <stdio.h>
#include <math.h>
#define PI 3.14159265
void
hough_transform(unsigned char out[256][256], unsigned char h_out[][256],
int w, int h)
{
int i, j;
int thetaDeg;
float thetaRad;
int rho;
int distMax;
int thetamax;
distMax = sqrt((h * h) + (w * w));
thetamax = 360;
int Acc[180][180];
for (i = 0; i < w; i++) {
for (j = 0; j < h; j++) {
if (out[i][j] > 0) {
for (thetaDeg = 0; thetaDeg < thetamax; ++thetaDeg) {
thetaRad = thetaDeg * (PI / 180);
rho = i * cos(thetaRad) + j * sin(thetaRad);
// find rho value that is closest to this
if (distMax < rho) {
int min = abs(rho - distMax);
}
if (distMax <= 1) {
// Increment a value in an accumulator array
Acc[rho][thetaDeg] += 1;
}
}
}
}
}
for (rho = 0; rho < distMax; rho++) {
for (thetaDeg = 0; thetaDeg < thetamax; thetaDeg++) {
h_out[rho][thetaDeg] = Acc[rho][thetaDeg];
}
}
}
void
edge_detect(unsigned char input[256][256], unsigned char out[][256],
int width, int height)
{
int in[3][3] = { {1, 2, 1}, {0, 0, 0}, {-1, -2, -1} };
int pixel;
int i;
int j;
int x;
int y;
for (y = 1; y < height - 1; y++) {
for (x = 1; x < width - 1; x++) {
pixel = 0;
for (j = -1; j <= 1; j++) {
for (i = -1; i <= 1; i++) {
pixel += in[j + 1][i + 1] * input[y + j][x + i];
}
}
out[y][x] = (unsigned char) abs(pixel);
}
}
}
int
main()
{
FILE *fp;
FILE *fp1;
FILE *fp2;
unsigned char input[256][256];
unsigned char out[256][256];
unsigned char h_out[256][256];
int width = 256;
int height = 256;
fp = fopen("Image.raw", "rb");
fp1 = fopen("output.raw", "wb");
fp2 = fopen("h_output.raw", "wb");
fread(input, 256 * 256, 1, fp);
edge_detect(input, out, width, height);
hough_transform(out, h_out, width, height);
fwrite(out, 256 * 256, 1, fp1);
fwrite(h_out, 256 * 256, 1, fp2);
fclose(fp);
fclose(fp1);
fclose(fp2);
return (0);
}

Ray tracer not giving different light intensities based on direction

Goal: I am trying to create a ray tracer in C. I just added in a light source that should give each of my three spheres a shading effect based on where the light is. If the light is to the left of all of them, a shadow should be cased on the right.
Problem: When changing the light intensities and position of the light, all the spheres are changed uniformly. The spheres will be more or less lit equally and there is no variation of lighting on individual pixels on the sphere.
My debugging attempts: I have tried looking through the variable outputs by printing out a lot of different info and I think the source comes from my variable
diffuse_light_intensity
which does not change much (through all the iterations on the screen the value changes twice when it should be changing quite often due to the angles of the light on the surface changing quite a bit)
My Code: (my theory is the problem lies in scene_intersect() or cast_ray())
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <limits.h>
typedef struct {
float position[3];
float intensity;
} Light;
typedef struct {
float diffuse_color[3];
} Material;
typedef struct {
float center[3];
float radius;
Material material;
} Sphere;
int arrSub(const float arr1[], const float arr2[], float subArr[], int length) {
/*
Requires 3 equally sized arrays (denoted as length),
arr1 - arr2 will result in the third array subArr
*/
for (int i = 0; i < length; i++) {
subArr[i] = arr1[i] - arr2[i];
}
return 0;
}
int arrAdd(const float arr1[], const float arr2[], float addArr[], int length) {
/*
Requires 3 equally sized arrays (denoted as length),
arr1 + arr2 will result in the third array subArr
*/
for (int i = 0; i < length; i++) {
addArr[i] = arr1[i] + arr2[i];
}
return 0;
}
int arrScalarMult(const float arr1[], float scalar, float newArr[], int length) {
/*
Requires 3 equally sized arrays (denoted as length),
arr1 - arr2 will result in the third array subArr
*/
for (int i = 0; i < length; i++) {
newArr[i] = arr1[i] * scalar;
}
return 0;
}
float dotProduct(const float arr1[], const float arr2[], int length) {
/*
Returns the dot product of two equal sized arrays
(treated as vectors)
a (dot) b = a1b1 + a2b2 + ... anbn
*/
float result = 0;
for (int i = 0; i < length; i++) {
result += arr1[i] * arr2[i];
}
return result;
}
int normalize(float arr[], int len) {
//Normalize a vector (array)
float sumSqr;
float norm;
for (int i = 0; i < len; i++) {
sumSqr += arr[i] * arr[i];
}
norm = sqrt(sumSqr);
for (int i = 0; i < len; i++) {
arr[i] = arr[i] / norm;
}
return 0;
}
bool ray_intersect(const float origin[], const float dir[], float t0, Sphere s) {
/*
Ray-Sphere Intersection
Vectors:
origin (the zero vector)
dir (direction vector)
L (vector from origin to center of sphere)
Scalars:
tca
d2
thc
t0
t1
*/
float L[3] = {0,0,0}; //The zero vector
arrSub(s.center, origin, L, 3); //L is now the vector from origin to the sphere's center
float tca = dotProduct(L, dir, 3); //Projection of L onto dir
float d2 = dotProduct(L, L, 3) - tca*tca;
if (d2 > s.radius * s.radius) return false; //There is no intersection, so return false.
float thc = sqrtf((s.radius*s.radius - d2));
t0 = tca - thc;
float t1 = tca + thc;
if (t0 < 0) {
t0 = t1;
}
if (t0 < 0) return false;
return true;
}
bool scene_intersect(const float origin[], const float dir[], const Sphere s[], int len, float hit[], float N[], Material * ptr_m) {
float sphere_dist = INT_MAX;
for (size_t i=0; i < len; i++) {
float dist_i;
if (ray_intersect(origin, dir, dist_i, s[i]) && dist_i < sphere_dist) {
sphere_dist = dist_i;
float dirDist[3];
arrScalarMult(dir, dist_i, dirDist, 3);
arrAdd(origin, dirDist, hit, 3);
float hitMinusCenter[3];
arrSub(hit, s[i].center, hitMinusCenter, 3);
normalize(hitMinusCenter, 3);
N[0] = hitMinusCenter[0];
N[1] = hitMinusCenter[1];
N[2] = hitMinusCenter[2];
* ptr_m = s[i].material;
}
}
return sphere_dist<1000;
}
int cast_ray(const float origin[], const float dir[], const Sphere s[], const Light l[], int l_size, unsigned char colorArr[]) {
float point[3], N[3];
Material m;
Material * ptr_m = &m;
if (!scene_intersect(origin, dir, s, 3, point, N, ptr_m)) {
//background
colorArr[0] = 5; //red
colorArr[1] = 100; //green
colorArr[2] = 250; //blue
} else {
float diffuse_light_intensity = 0;
float light_dir[3];
for (size_t i = 0; i < l_size; i++) {
arrSub(l[i].position, point, light_dir, 3);
normalize(light_dir, 3);
diffuse_light_intensity += l[i].intensity * ((0.f >= dotProduct(light_dir, N, 3) ? (0.f) : (dotProduct(light_dir, N, 3))));
}
//light up pixel
colorArr[0] = m.diffuse_color[0] * diffuse_light_intensity;
colorArr[1] = m.diffuse_color[1] * diffuse_light_intensity;
colorArr[2] = m.diffuse_color[2] * diffuse_light_intensity;
}
return 0;
}
int render(const Sphere s[], const Light l[], int l_length) {
/*
Creates image in a new color each step.
*/
const int width = 1024;
const int height = 768;
FILE *fp = fopen("fourth.ppm", "wb"); // Write in binary mode
(void) fprintf(fp, "P6\n%d %d\n255\n", width, height);
float fov = 3.1415926535/2.; // Field of View
#pragma omp parallel for
for (size_t j = 0; j < height; j++) {
for (size_t i = 0; i < width; i++) {
float x = (2*(i+.5)/(float)width - 1)*tan(fov/2.)*width/(float)height;
float y = -(2*(j+.5)/(float)height - 1)*tan(fov/2.);
float dir[] = {x,y,-1};
normalize(dir, 3);
unsigned char color[3];
const float origin[] = {0,0,0};
cast_ray(origin, dir, s, l, l_length, color);
(void) fwrite(color, 1, 3, fp);
}
}
(void) fclose(fp);
return 0;
}
int main(void) {
Material red = {255,0,0};
Material pink = {150,10,150};
Material gold = {255, 195, 0};
//Populate with spheres
Sphere s[3];
Sphere originalS = {{-3,0,-16},2,gold};
Sphere bigS = {{-1.0, -1.5, -12}, 3, red};
Sphere anotherS = {{7,5,-18},2,pink};
s[0] = originalS;
s[1] = bigS;
s[2] = anotherS;
//Add light source
Light l[1];
Light test_light = {{-20,20,20}, 1.5};
l[0] = test_light;
render(s,l, 1);
printf("Run success!\n");
return 0;
}
If any clarification is needed on my code please let me know, I am quite new to both C and stackoverflow.
There's a fundamental error in ray_intersect where you're passing the t0 variable by value, and not as a pointer, and therefore in the scene_intersect function its value is always zero.
The other problem is that you don't initialize the sumSqr in the normalize function, resulting in that function returning NaN for each vector component.
With those two fixed I get something approximating shaded balls. The errors in that image are caused by failing to ensure that your output pixel values fall in the range [0, 255].
NB: both of these first errors are detected if you turn on full compiler error checking, warning you of uninitialised variables being used.

Cannot retrieve original image using FFT with FFTW

I'm using FFTW in my C code and I have some issue.
First, I can transform an original image to two images (mag+phase) and get back the original image with the inverse transform.
However, If I want to get a mag file centered in frequency it does not work anymore: the final image has some issues.
Here some pieces of my code. Can someone help me to find the error in my code?
EDIT: I've fixed the code to follow #francis recommandation, but my issues is always here.
enum {
TYPE_CENTERED,
TYPE_REGULAR
};
static void fft_to_spectra(fits* fit, fftw_complex *frequency_repr, double *as,
double *ps, int nbdata) {
unsigned int i;
for (i = 0; i < nbdata; i++) {
double r = creal(frequency_repr[i]);
double im = cimag(frequency_repr[i]);
as[i] = hypot(r, im);
ps[i] = atan2(im, r);
}
}
static void fft_to_freq(fits* fit, fftw_complex *frequency_repr, double *as, double *ps, int nbdata) {
unsigned int i;
for (i = 0; i < nbdata; i++) {
frequency_repr[i] = as[i] * (cos(ps[i]) + I * sin(ps[i]));
}
}
void change_symmetry(unsigned int width, unsigned int height, unsigned int i, unsigned int j, unsigned int *x,
unsigned int *y) {
if (i < width / 2 && j < height / 2) {
*x = i + width / 2;
*y = j + height / 2;
}
if (i >= width / 2 && j < height / 2) {
*x = i - width / 2;
*y = j + height / 2;
}
if (i < width / 2 && j >= height / 2) {
*x = i + width / 2;
*y = j - height / 2;
}
if (i >= width / 2 && j >= height / 2) {
*x = i - width / 2;
*y = j - height / 2;
}
}
static void centered(WORD *buf, unsigned int width,
unsigned int height) {
unsigned int i, j;
WORD *temp = malloc(width * height * sizeof(WORD));
for (j = 0; j < height; j++) {
for (i = 0; i < width; i++) {
unsigned int x = i;
unsigned int y = j;
change_symmetry(width, height, i, j, &x, &y);
temp[j * width + i] = buf[y * width + x];
}
}
memcpy(buf, temp, sizeof(WORD) * width * height);
free(temp);
}
static void normalisation_spectra(unsigned int w, unsigned int h, double *modulus, double *phase,
WORD *abuf, WORD *pbuf) {
unsigned int i;
for (i = 0; i < h * w; i++) {
pbuf[i] = round_to_WORD(((phase[i] + M_PI) * USHRT_MAX_DOUBLE / (2 * M_PI)));
abuf[i] = round_to_WORD((modulus[i] / w / h));
}
}
static void save_dft_information_in_gfit(fits *fit) {
strcpy(gfit.dft.ord, fit->dft.type);
strcpy(gfit.dft.ord, fit->dft.ord);
}
static void FFTD(fits *fit, fits *x, fits *y, int type_order, int layer) {
WORD *xbuf = x->pdata[layer];
WORD *ybuf = y->pdata[layer];
WORD *gbuf = fit->pdata[layer];
unsigned int i;
unsigned int width = fit->rx, height = fit->ry;
int nbdata = width * height;
fftw_complex *spatial_repr = fftw_malloc(sizeof(fftw_complex) * nbdata);
if (!spatial_repr) {
return;
}
fftw_complex *frequency_repr = fftw_malloc(sizeof(fftw_complex) * nbdata);
if (!frequency_repr) {
fftw_free(spatial_repr);
return;
}
/* copying image selection into the fftw data */
#ifdef _OPENMP
#pragma omp parallel for num_threads(com.max_thread) private(i) schedule(static) if(nbdata > 15000)
#endif
for (i = 0; i < nbdata; i++) {
spatial_repr[i] = (double) gbuf[i];
}
/* we run the Fourier Transform */
fftw_plan p = fftw_plan_dft_2d(height, width, spatial_repr, frequency_repr,
FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
/* we compute modulus and phase */
double *modulus = malloc(nbdata * sizeof(double));
double *phase = malloc(nbdata * sizeof(double));
fft_to_spectra(fit, frequency_repr, modulus, phase, nbdata);
//We normalize the modulus and the phase
normalisation_spectra(width, height, modulus, phase, xbuf, ybuf);
if (type_order == TYPE_CENTERED) {
strcpy(x->dft.ord, "CENTERED");
centered(xbuf, width, height);
centered(ybuf, width, height);
}
free(modulus);
free(phase);
fftw_destroy_plan(p);
fftw_free(spatial_repr);
fftw_free(frequency_repr);
}
static void FFTI(fits *fit, fits *xfit, fits *yfit, int type_order, int layer) {
WORD *xbuf = xfit->pdata[layer];
WORD *ybuf = yfit->pdata[layer];
WORD *gbuf = fit->pdata[layer];
unsigned int i;
unsigned int width = xfit->rx;
unsigned int height = xfit->ry;
int nbdata = width * height;
double *modulus = calloc(1, nbdata * sizeof(double));
double *phase = calloc(1, nbdata * sizeof(double));
if (type_order == TYPE_CENTERED) {
centered(xbuf, width, height);
centered(ybuf, width, height);
}
for (i = 0; i < height * width; i++) {
modulus[i] = (double) xbuf[i] * (width * height);
phase[i] = (double) ybuf[i] * (2 * M_PI / USHRT_MAX_DOUBLE);
phase[i] -= M_PI;
}
fftw_complex* spatial_repr = fftw_malloc(sizeof(fftw_complex) * nbdata);
if (!spatial_repr) {
return;
}
fftw_complex* frequency_repr = fftw_malloc(sizeof(fftw_complex) * nbdata);
if (!frequency_repr) {
fftw_free(spatial_repr);
return;
}
fft_to_freq(fit, frequency_repr, modulus, phase, nbdata);
fftw_plan p = fftw_plan_dft_2d(height, width, frequency_repr, spatial_repr,
FFTW_BACKWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (i = 0; i < nbdata; i++) {
double pxl = creal(spatial_repr[i]) / nbdata;
gbuf[i] = round_to_WORD(pxl);
}
free(modulus);
free(phase);
fftw_destroy_plan(p);
fftw_free(spatial_repr);
fftw_free(frequency_repr);
}
Here my images, the original one and the FFTD(centered)->FFTI result
The plan is created using the flag FFTW_MEASURE. Hence, several DFT are computed and the input array is likely overwritten. Here is the start of the description of planner flags in the documentation of FFTW:
FFTW_ESTIMATE specifies that, instead of actual measurements of different algorithms, a simple heuristic is used to pick a (probably sub-optimal) plan quickly. With this flag, the input/output arrays are not overwritten during planning.
FFTW_MEASURE tells FFTW to find an optimized plan by actually computing several FFTs and measuring their execution time. Depending on your machine, this can take some time (often a few seconds). FFTW_MEASURE is the default planning option.
Either switch to FFTW_ESTIMATE or create the plan before populating the input array:
/* we run the Fourier Transform */
fftw_plan p = fftw_plan_dft_2d(width, height, spatial_repr, frequency_repr,
FFTW_FORWARD, FFTW_MEASURE);
/* copying image selection into the fftw data */
#ifdef _OPENMP
#pragma omp parallel for num_threads(com.max_thread) private(i) schedule(static) if(nbdata > 15000)
#endif
for (i = 0; i < nbdata; i++) {
spatial_repr[i] = (double) gbuf[i];
}
If you intend to a single image, using FFTW_ESTIMATE is the way to go. On the contrary, if you consider treating multiple images, creating the plan once using FFTW_MEASURE and storing it is a good option. Then you may use New-array Execute Functions each time a FFT is to be performed:
fftw_execute_dft(p, spatial_repr, frequency_repr);
You can test the return value of malloc() or fftw_malloc() to check if the allocations went right. If not, it returns NULL. fftw_malloc() is implemented as function *X(kernel_malloc)(size_t n) in fftw-3.3.6-pl2/kernel/kalloc.c . It calls functions like memalign() or _aligned_malloc() among others. Both these two return NULL just like malloc() in case of failure. Finally, I did not spotted a critical issue regarding memory allocation of deallocation in the piece of code you provided.
The argument double nbdata in fft_to_spectra() should likely be an integer. Valgrind might have considered it as strange...
EDIT : the change_symmetry() is to be modified for odd sizes. Something like:
void change_symmetry_forward(unsigned int width, unsigned int height, unsigned int i, unsigned int j, unsigned int *x,
unsigned int *y) {
*x = i + width / 2;
if (*x>=width){
*x=*x-width;
}
*y = j + height / 2;
if(*y>=height){
*y =*y-height;
}
}
and
void change_symmetry_backward(unsigned int width, unsigned int height, unsigned int i, unsigned int j, unsigned int *x,
unsigned int *y) {
*x = i +width- width / 2;
if (*x>=width){
*x=*x-width;
}
*y = j +height- height / 2;
if(*y>=height){
*y =*y-height;
}
}

why matlab out of memory using mex

Matlab would be out of memory, if i use a mex file in a loop. I think it's caused by memory leak. mxMalloc variable is freed by mxFree, but I cannot destroy mxCreateNumericArray variable using mxDestroyArray(plhs[0]).
The mex file is from Offscreen toolbox link . The code is as following.
#include "mex.h"
#include "math.h"
#include "OffscreenGL.h"
#include "OffscreenCommon.h"
void drawPatchAndConvert(GLuint listName, GLubyte *imageBuffer, unsigned int imgHeight, unsigned int imgWidth, unsigned int zoomFactor = 1)
{
// This is a temporary bug fix for Nvidia's open program
// seems the width of the pixel has to be a multiple of 4
// for other width, we have to pad the width and remove it later
unsigned int paddedWidth = imgWidth * zoomFactor % 4;
if (paddedWidth != 0){
paddedWidth = 4 - paddedWidth + imgWidth * zoomFactor;
}else {
paddedWidth = imgWidth * zoomFactor;
}
unsigned char *paddedImgBuffer = (unsigned char *)mxMalloc(paddedWidth * imgHeight * zoomFactor * MAX_COLOR_CHANNEL * sizeof(GL_UNSIGNED_BYTE));
drawPatch(listName, paddedImgBuffer, imgHeight, imgWidth, zoomFactor);
// reorder the pixel data for the opengl to matlab conversion
unsigned int imgSize = imgHeight * imgWidth * zoomFactor * zoomFactor;
unsigned int imgSize2 = imgSize * 2;
unsigned int matlabImgIndex = 0;
unsigned int oglImageIndex = 0;
for (int j = 0; j < imgWidth * zoomFactor; j++) {
for (int i = 0; i < imgHeight * zoomFactor; i++, matlabImgIndex++) {
oglImageIndex = (j + (imgHeight*zoomFactor -1-i) * paddedWidth) * 3;
imageBuffer[matlabImgIndex] = paddedImgBuffer[oglImageIndex];
imageBuffer[matlabImgIndex + imgSize] = paddedImgBuffer[oglImageIndex + 1];
imageBuffer[matlabImgIndex + imgSize2] = paddedImgBuffer[oglImageIndex + 2];
}
}
mxFree(paddedImgBuffer);
}
static void renderColorMesh(double *FM, int fNum, double *VM, int vNum, float *ColorM, int colorNum,const mxArray *CamParamS, double *imgSizeV, double *zNearFarV, unsigned int zoomFactor,unsigned char *imgBuffer)
{
cameraSetup(CamParamS, zNearFarV[0], zNearFarV[1], (unsigned int) imgSizeV[0], (unsigned int) imgSizeV[1], zoomFactor);
#ifndef NDEBUG
mexPrintf("Start to create the display list: fNum=%d, vNum=%d, colorNum=%d\n", fNum, vNum, colorNum);
#endif
GLuint list = createDisplayListWithColor(FM, fNum, VM, vNum, ColorM, colorNum);
#ifndef NDEBUG
mexPrintf("Start to draw the patch\n");
#endif
drawPatchAndConvert(list, imgBuffer, (int) imgSizeV[0], (int) imgSizeV[1], zoomFactor);
}
void mexFunction(int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
// get the vertex array, face array, and color array
double *FM = mxGetPr(prhs[0]);
int fNum = mxGetM(prhs[0]);
double *VM = mxGetPr(prhs[1]);
int vNum = mxGetM(prhs[1]);
float *ColorM = (float *)mxGetData(prhs[2]);
int colorNum = mxGetM(prhs[2]);
// get the camera parameters
const mxArray *CamParamS = prhs[3];
double *imgSizeV = mxGetPr(prhs[4]);
double *zNearFarV = mxGetPr(prhs[5]);
double zoomFactor = mxGetScalar(prhs[6]);
OffscreenGL offscreenGL((int)(imgSizeV[0] * zoomFactor), (int) (imgSizeV[1] * zoomFactor));
int output3Size[3];
unsigned char *imgBuffer;
if (offscreenGL.RGB8Setup()) {
//mexPrintf("OpenGLCanvas setup Successful\n");
output3Size[0] = (int) (imgSizeV[0] * zoomFactor);
output3Size[1] = (int) (imgSizeV[1] * zoomFactor);
output3Size[2] = 3;
plhs[0] = mxCreateNumericArray(3, output3Size, mxUINT8_CLASS, mxREAL);
imgBuffer = (unsigned char *) mxGetData(plhs[0]);
renderColorMesh(FM, fNum, VM, vNum, ColorM, colorNum, CamParamS, imgSizeV,
zNearFarV, (unsigned int) zoomFactor, imgBuffer);
} else {
mexPrintf("OpenGLCanvas setup failed\n");
}
}
This line is suspicious:
paddedWidth = 4 - paddedWidth + imgWidth * zoomFactor;
It only pads the entire zoomed in line with a few bytes. I think you meant something like
paddedWidth = (4 - paddedWidth + imgWidth) * zoomFactor;

segmentation fault

I am trying get a mandelbrot image clearly with the sequential programming in C++, but I am getting a segmentation fault during runtime. I have no idea about the seg. fault, but my program is perfectly compiling with no errors.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int file_write(unsigned int width, unsigned int height)
{
unsigned int **color = NULL;
FILE *fractal = fopen("mandelbrot_imageSequential.ppm","w+");
if(fractal != NULL)
{
fprintf(fractal,"P6\n");
fprintf(fractal,"# %s\n", "Mandelbrot_imageSequential.ppm");
fprintf(fractal,"%d %d\n", height, width);
fprintf(fractal,"40\n");
int x = 0, y = 0;
unsigned int R = 0, G = 0, B = 0;
for(x = 0; x < width; ++x)
{
for(y = 0; y < height; ++y)
{
R = (color[y][x]*10);
G = 255-((color[y][x]*10));
B = ((color[y][x]*10)-150);
if(R == 10)
R = 11;
if(G == 10)
G = 11;
if(B == 10)
B = 11;
putc(R, fractal);
putc(G, fractal);
putc(B, fractal);
}
}
fclose(fractal);
}
return 0;
}
int method(int x, int y, int height, int width, double min_re, double max_re, double min_im, double max_im, int max_iterations)
{
double threshold = 4;
double x_factor = (max_re-min_re)/(width-1);
double y_factor = (max_im-min_im)/(height-1);
double c_im = max_im - y*y_factor;
double c_re = min_re + x*x_factor;
double Z_re = c_re, Z_im = c_im;
unsigned int col = 0;
for(unsigned n = 0; n < max_iterations; ++n)
{
double Z_re2 = Z_re*Z_re, Z_im2 = Z_im*Z_im;
if(Z_re2 + Z_im2 > threshold)
{
col = n;
break;
}
Z_im = 2 * Z_re * Z_im + c_im;
Z_re = Z_re2 - Z_im2 + c_re;
}
return col;
}
int main(int argc, char *argv[])
{
unsigned int width;
unsigned int height;
unsigned int max_iterations;
unsigned int **color = NULL;
int x,y;
double threshold;
double min_re;
double max_re;
double min_im;
double max_im;
unsigned int NUM_OF_THREADS;
if(argc != 10)
{
printf("There is an error in the input given.\n");
return 0;
}
else
{
height = atoi(argv[1]);
width = atoi(argv[2]);
max_iterations = atoi(argv[3]);
min_re = atof(argv[4]);
max_re = atof(argv[5]);
min_im = atof(argv[6]);
max_im = atof(argv[7]);
threshold = atoi(argv[8]);
NUM_OF_THREADS = atoi(argv[9]);
}
color = (unsigned int**)malloc(height*sizeof(unsigned int*));
printf("height = %d\twidth = %d\tmaximum_iterations = %d\tminimum_x-value = %.2f\tmaximum_x-value = %.2f\tminimum_y-value = %.2f\tmaximum_y-value = %.2f\tthreshold_value = %.2f\tno. of threads = %d\t\n",height,width,max_iterations,min_re,max_re,min_im,max_im,threshold,NUM_OF_THREADS);
for(x = 0; x < height; x++)
{
color[x] = (unsigned int*)malloc(width*sizeof(unsigned int));
}
time_t ts,te;
time(&ts);
method(x,y,height,width,min_re,max_re,min_im,max_im,max_iterations);
time(&te);
double diff = difftime(te,ts);
file_write(width, height);
printf("Total Time elapsed: %f\n",diff);
return 0;
}
How to correct this segmentation fault?
At least one problem is in the file_write function.
unsigned int **color = NULL;
R = (color[y][x]*10);
I assume the color should be an input parameter.
If you are on Linux machine do the following :
$ulimit -c unlimited
Then run the code. Notice a core.[pid] file is generated. fire up gdb like following
$gdb ./your_app core.[pid]
It will take you the statement where segfault occurred. issue a "backtrace" command in gdb prompt to see the call hierarchy.
Remember compiling with "-g" flag to get more verbose gdb output.
There are two major problems with your code:
You allocate memory for the color array but then use a different color inside file_write() which is initialized to NULL.
You need to pass the first color as an argument to file_write():
int main(...)
{
...
file_write(color, width, height);
printf("Total Time elapsed: %f\n",diff);
return 0;
}
And declare the other color as an argument to file_write():
int file_write(unsigned int **color, unsigned int width, unsigned int height)
{
/* unsigned int **color = NULL; // Removed */
...
You're only calling method() once and not storing anything into color. You need to call it in a loop. Something similar to:
/* Untested */
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) {
color[y][x] = method(x,y,height,width,min_re,max_re,min_im,max_im,max_iterations);
}
}
Then, of course, you should check the return values of malloc(), fopen(), fprintf(), fclose(), ... , and check that the input variables have reasonable values and so on.
I also noticed that you're passing width and height in different order to file_write() and method(). To avoid future headaches, I would change the method() function to method(x, y, width, height) so that the horizontal and vertical arguments are passed in the same order.

Resources