why matlab out of memory using mex - c

Matlab would be out of memory, if i use a mex file in a loop. I think it's caused by memory leak. mxMalloc variable is freed by mxFree, but I cannot destroy mxCreateNumericArray variable using mxDestroyArray(plhs[0]).
The mex file is from Offscreen toolbox link . The code is as following.
#include "mex.h"
#include "math.h"
#include "OffscreenGL.h"
#include "OffscreenCommon.h"
void drawPatchAndConvert(GLuint listName, GLubyte *imageBuffer, unsigned int imgHeight, unsigned int imgWidth, unsigned int zoomFactor = 1)
{
// This is a temporary bug fix for Nvidia's open program
// seems the width of the pixel has to be a multiple of 4
// for other width, we have to pad the width and remove it later
unsigned int paddedWidth = imgWidth * zoomFactor % 4;
if (paddedWidth != 0){
paddedWidth = 4 - paddedWidth + imgWidth * zoomFactor;
}else {
paddedWidth = imgWidth * zoomFactor;
}
unsigned char *paddedImgBuffer = (unsigned char *)mxMalloc(paddedWidth * imgHeight * zoomFactor * MAX_COLOR_CHANNEL * sizeof(GL_UNSIGNED_BYTE));
drawPatch(listName, paddedImgBuffer, imgHeight, imgWidth, zoomFactor);
// reorder the pixel data for the opengl to matlab conversion
unsigned int imgSize = imgHeight * imgWidth * zoomFactor * zoomFactor;
unsigned int imgSize2 = imgSize * 2;
unsigned int matlabImgIndex = 0;
unsigned int oglImageIndex = 0;
for (int j = 0; j < imgWidth * zoomFactor; j++) {
for (int i = 0; i < imgHeight * zoomFactor; i++, matlabImgIndex++) {
oglImageIndex = (j + (imgHeight*zoomFactor -1-i) * paddedWidth) * 3;
imageBuffer[matlabImgIndex] = paddedImgBuffer[oglImageIndex];
imageBuffer[matlabImgIndex + imgSize] = paddedImgBuffer[oglImageIndex + 1];
imageBuffer[matlabImgIndex + imgSize2] = paddedImgBuffer[oglImageIndex + 2];
}
}
mxFree(paddedImgBuffer);
}
static void renderColorMesh(double *FM, int fNum, double *VM, int vNum, float *ColorM, int colorNum,const mxArray *CamParamS, double *imgSizeV, double *zNearFarV, unsigned int zoomFactor,unsigned char *imgBuffer)
{
cameraSetup(CamParamS, zNearFarV[0], zNearFarV[1], (unsigned int) imgSizeV[0], (unsigned int) imgSizeV[1], zoomFactor);
#ifndef NDEBUG
mexPrintf("Start to create the display list: fNum=%d, vNum=%d, colorNum=%d\n", fNum, vNum, colorNum);
#endif
GLuint list = createDisplayListWithColor(FM, fNum, VM, vNum, ColorM, colorNum);
#ifndef NDEBUG
mexPrintf("Start to draw the patch\n");
#endif
drawPatchAndConvert(list, imgBuffer, (int) imgSizeV[0], (int) imgSizeV[1], zoomFactor);
}
void mexFunction(int nlhs, mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
// get the vertex array, face array, and color array
double *FM = mxGetPr(prhs[0]);
int fNum = mxGetM(prhs[0]);
double *VM = mxGetPr(prhs[1]);
int vNum = mxGetM(prhs[1]);
float *ColorM = (float *)mxGetData(prhs[2]);
int colorNum = mxGetM(prhs[2]);
// get the camera parameters
const mxArray *CamParamS = prhs[3];
double *imgSizeV = mxGetPr(prhs[4]);
double *zNearFarV = mxGetPr(prhs[5]);
double zoomFactor = mxGetScalar(prhs[6]);
OffscreenGL offscreenGL((int)(imgSizeV[0] * zoomFactor), (int) (imgSizeV[1] * zoomFactor));
int output3Size[3];
unsigned char *imgBuffer;
if (offscreenGL.RGB8Setup()) {
//mexPrintf("OpenGLCanvas setup Successful\n");
output3Size[0] = (int) (imgSizeV[0] * zoomFactor);
output3Size[1] = (int) (imgSizeV[1] * zoomFactor);
output3Size[2] = 3;
plhs[0] = mxCreateNumericArray(3, output3Size, mxUINT8_CLASS, mxREAL);
imgBuffer = (unsigned char *) mxGetData(plhs[0]);
renderColorMesh(FM, fNum, VM, vNum, ColorM, colorNum, CamParamS, imgSizeV,
zNearFarV, (unsigned int) zoomFactor, imgBuffer);
} else {
mexPrintf("OpenGLCanvas setup failed\n");
}
}

This line is suspicious:
paddedWidth = 4 - paddedWidth + imgWidth * zoomFactor;
It only pads the entire zoomed in line with a few bytes. I think you meant something like
paddedWidth = (4 - paddedWidth + imgWidth) * zoomFactor;

Related

Invalid initialization of dynamic array leads to read and write errors

I'm trying to malloc an array of structs.
typedef struct {
long int val;
long int time;
long int last_used;
} pair;
so in my main I have
pair **fifoVM = (pair **) malloc(sizeof(pair *) * framecount);
pair **fifop1VM = (pair **) malloc(sizeof(pair *) * framecount + 1);
pair **fifop2VM = (pair **) malloc(sizeof(pair *) * framecount + 2);
pair **LRUVM = (pair **) malloc(sizeof(pair *) * framecount);
and I initialize all of the pairs using
void init(pair **frames, int size) {
for (int i = 0; i < size; i++) {
frames[i] = (pair *) malloc(sizeof(pair));
frames[i]->val = -1;
frames[i]->last_used = TIME_VAL;
frames[i]->time = TIME_VAL++;
}
}
But by the time I try to deallocate it, I get a corruption error from Valgrind.
I initially thought that the problem was using a pair* in the array but it still didn't work with just pair. I also thought it might be the pair going out of scope when init() returns but that also inst true because it would only deallocate the variable containing the pointer.
Also for some weird reason, LRUVM is the only array to crash, even though it's the last one.
#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <time.h>
//since time.h only has millisecond resolution,
//I need to simulate time
int TIME_VAL = 0;
typedef struct {
long int val;
long int time;
long int last_used;
} pair;
//Allocate the pairs for a given array
void init(pair **frames, int size) {
//iterate through array
for (int i = 0; i < size; i++) {
//allocate memory and assign
frames[i] = (pair *) malloc(sizeof(pair));
frames[i]->val = -1;
frames[i]->last_used = TIME_VAL;
frames[i]->time = TIME_VAL++;
}
}
int main(int argc, char **argv) {
//Command line arguments
int framecount = atoi(argv[1]);
int x = atoi(argv[2]);
int NUM_ACCESSES = atoi(argv[3]);
int NUM_ITERATIONS = atoi(argv[4]);
for (int i = 0; i < NUM_ITERATIONS; i++) {
//Allocate Arrays
pair **fifoVM = (pair **) malloc(sizeof(pair *) * framecount);
pair **fifop1VM = (pair **) malloc(sizeof(pair *) * framecount + 1);
pair **fifop2VM = (pair **) malloc(sizeof(pair *) * framecount + 2);
pair **LRUVM = (pair **) malloc(sizeof(pair *) * framecount);
//initialize all of the pairs in the arrays
init(fifoVM, framecount);
init(fifop1VM, framecount + 1);
init(fifop2VM, framecount + 2);
init(LRUVM, framecount);
//deallocate arrays
freeList(fifoVM, framecount);
freeList(fifop1VM, framecount + 1);
freeList(fifop2VM, framecount + 2);
freeList(LRUVM, framecount);
}
}
void freeList(pair **vm, int framecount) {
for (int i = 0; i < framecount; i++) {
free(vm[i]);
}
free(vm);
}
Some of the allocation sizes are not computed correctly: malloc(sizeof(pair *) * framecount + 1) should be:
malloc(sizeof(pair *) * (framecount + 1))
Note that your data structure seem to have an indirection for no good reason. Why not allocate arrays of structures instead of arrays of pointers to structures allocated individually?
Here is a simpified version:
#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <time.h>
//since time.h only has millisecond resolution,
//I need to simulate time
int TIME_VAL = 0;
typedef struct {
long int val;
long int time;
long int last_used;
} pair;
//Allocate the pairs for a given array
void init(pair *frames, int size) {
for (int i = 0; i < size; i++) {
frames[i].val = -1;
frames[i].last_used = TIME_VAL;
frames[i].time = TIME_VAL++;
}
}
int main(int argc, char **argv) {
//Command line arguments
if (argc < 5) return 1;
int framecount = atoi(argv[1]);
int x = atoi(argv[2]);
int num_accesses = atoi(argv[3]);
int num_iterations = atoi(argv[4]);
for (int i = 0; i < num_iterations; i++) {
//Allocate Arrays
pair *fifoVM = calloc(sizeof(pair), framecount);
pair *fifop1VM = calloc(sizeof(pair), framecount + 1);
pair *fifop2VM = calloc(sizeof(pair), framecount + 2);
pair *LRUVM = calloc(sizeof(pair), framecount);
if (fifoVM && fifop1VM && fifop2VM && LRUVM) {
//initialize all of the pairs in the arrays
init(fifoVM, framecount);
init(fifop1VM, framecount + 1);
init(fifop2VM, framecount + 2);
init(LRUVM, framecount);
//...
}
//deallocate arrays
free(fifoVM);
free(fifop1VM);
free(fifop2VM);
free(LRUVM);
}
}

Cannot retrieve original image using FFT with FFTW

I'm using FFTW in my C code and I have some issue.
First, I can transform an original image to two images (mag+phase) and get back the original image with the inverse transform.
However, If I want to get a mag file centered in frequency it does not work anymore: the final image has some issues.
Here some pieces of my code. Can someone help me to find the error in my code?
EDIT: I've fixed the code to follow #francis recommandation, but my issues is always here.
enum {
TYPE_CENTERED,
TYPE_REGULAR
};
static void fft_to_spectra(fits* fit, fftw_complex *frequency_repr, double *as,
double *ps, int nbdata) {
unsigned int i;
for (i = 0; i < nbdata; i++) {
double r = creal(frequency_repr[i]);
double im = cimag(frequency_repr[i]);
as[i] = hypot(r, im);
ps[i] = atan2(im, r);
}
}
static void fft_to_freq(fits* fit, fftw_complex *frequency_repr, double *as, double *ps, int nbdata) {
unsigned int i;
for (i = 0; i < nbdata; i++) {
frequency_repr[i] = as[i] * (cos(ps[i]) + I * sin(ps[i]));
}
}
void change_symmetry(unsigned int width, unsigned int height, unsigned int i, unsigned int j, unsigned int *x,
unsigned int *y) {
if (i < width / 2 && j < height / 2) {
*x = i + width / 2;
*y = j + height / 2;
}
if (i >= width / 2 && j < height / 2) {
*x = i - width / 2;
*y = j + height / 2;
}
if (i < width / 2 && j >= height / 2) {
*x = i + width / 2;
*y = j - height / 2;
}
if (i >= width / 2 && j >= height / 2) {
*x = i - width / 2;
*y = j - height / 2;
}
}
static void centered(WORD *buf, unsigned int width,
unsigned int height) {
unsigned int i, j;
WORD *temp = malloc(width * height * sizeof(WORD));
for (j = 0; j < height; j++) {
for (i = 0; i < width; i++) {
unsigned int x = i;
unsigned int y = j;
change_symmetry(width, height, i, j, &x, &y);
temp[j * width + i] = buf[y * width + x];
}
}
memcpy(buf, temp, sizeof(WORD) * width * height);
free(temp);
}
static void normalisation_spectra(unsigned int w, unsigned int h, double *modulus, double *phase,
WORD *abuf, WORD *pbuf) {
unsigned int i;
for (i = 0; i < h * w; i++) {
pbuf[i] = round_to_WORD(((phase[i] + M_PI) * USHRT_MAX_DOUBLE / (2 * M_PI)));
abuf[i] = round_to_WORD((modulus[i] / w / h));
}
}
static void save_dft_information_in_gfit(fits *fit) {
strcpy(gfit.dft.ord, fit->dft.type);
strcpy(gfit.dft.ord, fit->dft.ord);
}
static void FFTD(fits *fit, fits *x, fits *y, int type_order, int layer) {
WORD *xbuf = x->pdata[layer];
WORD *ybuf = y->pdata[layer];
WORD *gbuf = fit->pdata[layer];
unsigned int i;
unsigned int width = fit->rx, height = fit->ry;
int nbdata = width * height;
fftw_complex *spatial_repr = fftw_malloc(sizeof(fftw_complex) * nbdata);
if (!spatial_repr) {
return;
}
fftw_complex *frequency_repr = fftw_malloc(sizeof(fftw_complex) * nbdata);
if (!frequency_repr) {
fftw_free(spatial_repr);
return;
}
/* copying image selection into the fftw data */
#ifdef _OPENMP
#pragma omp parallel for num_threads(com.max_thread) private(i) schedule(static) if(nbdata > 15000)
#endif
for (i = 0; i < nbdata; i++) {
spatial_repr[i] = (double) gbuf[i];
}
/* we run the Fourier Transform */
fftw_plan p = fftw_plan_dft_2d(height, width, spatial_repr, frequency_repr,
FFTW_FORWARD, FFTW_ESTIMATE);
fftw_execute(p);
/* we compute modulus and phase */
double *modulus = malloc(nbdata * sizeof(double));
double *phase = malloc(nbdata * sizeof(double));
fft_to_spectra(fit, frequency_repr, modulus, phase, nbdata);
//We normalize the modulus and the phase
normalisation_spectra(width, height, modulus, phase, xbuf, ybuf);
if (type_order == TYPE_CENTERED) {
strcpy(x->dft.ord, "CENTERED");
centered(xbuf, width, height);
centered(ybuf, width, height);
}
free(modulus);
free(phase);
fftw_destroy_plan(p);
fftw_free(spatial_repr);
fftw_free(frequency_repr);
}
static void FFTI(fits *fit, fits *xfit, fits *yfit, int type_order, int layer) {
WORD *xbuf = xfit->pdata[layer];
WORD *ybuf = yfit->pdata[layer];
WORD *gbuf = fit->pdata[layer];
unsigned int i;
unsigned int width = xfit->rx;
unsigned int height = xfit->ry;
int nbdata = width * height;
double *modulus = calloc(1, nbdata * sizeof(double));
double *phase = calloc(1, nbdata * sizeof(double));
if (type_order == TYPE_CENTERED) {
centered(xbuf, width, height);
centered(ybuf, width, height);
}
for (i = 0; i < height * width; i++) {
modulus[i] = (double) xbuf[i] * (width * height);
phase[i] = (double) ybuf[i] * (2 * M_PI / USHRT_MAX_DOUBLE);
phase[i] -= M_PI;
}
fftw_complex* spatial_repr = fftw_malloc(sizeof(fftw_complex) * nbdata);
if (!spatial_repr) {
return;
}
fftw_complex* frequency_repr = fftw_malloc(sizeof(fftw_complex) * nbdata);
if (!frequency_repr) {
fftw_free(spatial_repr);
return;
}
fft_to_freq(fit, frequency_repr, modulus, phase, nbdata);
fftw_plan p = fftw_plan_dft_2d(height, width, frequency_repr, spatial_repr,
FFTW_BACKWARD, FFTW_ESTIMATE);
fftw_execute(p);
for (i = 0; i < nbdata; i++) {
double pxl = creal(spatial_repr[i]) / nbdata;
gbuf[i] = round_to_WORD(pxl);
}
free(modulus);
free(phase);
fftw_destroy_plan(p);
fftw_free(spatial_repr);
fftw_free(frequency_repr);
}
Here my images, the original one and the FFTD(centered)->FFTI result
The plan is created using the flag FFTW_MEASURE. Hence, several DFT are computed and the input array is likely overwritten. Here is the start of the description of planner flags in the documentation of FFTW:
FFTW_ESTIMATE specifies that, instead of actual measurements of different algorithms, a simple heuristic is used to pick a (probably sub-optimal) plan quickly. With this flag, the input/output arrays are not overwritten during planning.
FFTW_MEASURE tells FFTW to find an optimized plan by actually computing several FFTs and measuring their execution time. Depending on your machine, this can take some time (often a few seconds). FFTW_MEASURE is the default planning option.
Either switch to FFTW_ESTIMATE or create the plan before populating the input array:
/* we run the Fourier Transform */
fftw_plan p = fftw_plan_dft_2d(width, height, spatial_repr, frequency_repr,
FFTW_FORWARD, FFTW_MEASURE);
/* copying image selection into the fftw data */
#ifdef _OPENMP
#pragma omp parallel for num_threads(com.max_thread) private(i) schedule(static) if(nbdata > 15000)
#endif
for (i = 0; i < nbdata; i++) {
spatial_repr[i] = (double) gbuf[i];
}
If you intend to a single image, using FFTW_ESTIMATE is the way to go. On the contrary, if you consider treating multiple images, creating the plan once using FFTW_MEASURE and storing it is a good option. Then you may use New-array Execute Functions each time a FFT is to be performed:
fftw_execute_dft(p, spatial_repr, frequency_repr);
You can test the return value of malloc() or fftw_malloc() to check if the allocations went right. If not, it returns NULL. fftw_malloc() is implemented as function *X(kernel_malloc)(size_t n) in fftw-3.3.6-pl2/kernel/kalloc.c . It calls functions like memalign() or _aligned_malloc() among others. Both these two return NULL just like malloc() in case of failure. Finally, I did not spotted a critical issue regarding memory allocation of deallocation in the piece of code you provided.
The argument double nbdata in fft_to_spectra() should likely be an integer. Valgrind might have considered it as strange...
EDIT : the change_symmetry() is to be modified for odd sizes. Something like:
void change_symmetry_forward(unsigned int width, unsigned int height, unsigned int i, unsigned int j, unsigned int *x,
unsigned int *y) {
*x = i + width / 2;
if (*x>=width){
*x=*x-width;
}
*y = j + height / 2;
if(*y>=height){
*y =*y-height;
}
}
and
void change_symmetry_backward(unsigned int width, unsigned int height, unsigned int i, unsigned int j, unsigned int *x,
unsigned int *y) {
*x = i +width- width / 2;
if (*x>=width){
*x=*x-width;
}
*y = j +height- height / 2;
if(*y>=height){
*y =*y-height;
}
}

Neural Network in C for XOR, outputs all converge to same value

While the problem in the title is a specific one (converging to same value just for sigmoid, with other activation functions costs are not reduced in general), my network is buggy in general, and after many hours spent debugging/testing I cannot figure out why, even after catching some minor mistakes. Doing the back propagation on paper for the first training epoch aligned with what the function was doing.
I hate to say that I do not know the issue with my network. I'd appreciate if someone could look over it to hint me at what part(s) of my network implementation are wrong.
What I have Tried
Different Activation Functions
Different Weight Initialization Methods (both Xavier and random)
Example (XOR)
Results after 20,000 epochs:
Inputs = 1.000000 0.000000, Target Outputs = 1.000000, Predicted Outputs = 0.028415.
Inputs = 0.000000 1.000000, Target Outputs = 1.000000, Predicted Outputs = 0.028452.
Inputs = 1.000000 1.000000, Target Outputs = 0.000000, Predicted Outputs = 0.028426.
Inputs = 0.000000 0.000000, Target Outputs = 0.000000, Predicted Outputs = 0.028441.
Note
Since I am not providing data.c (since it would make the post extensively long, as it loads data from the csv files), trust that input and targetOutput in the train method of model.c, after initInput and initTargetOutput have been applied to them, they contain the values of the input, [1 or 0, 1 or 0] and the target output, [1 or 0]. I have completely verified this myself.
GitHub
The repository containing the below files is located here. My apologies for not including it in the pre-edited post. Do note that there are many sections of code within the repository that are surrounded by #if #else macros (one for gradient checking, the others for toggling printing). Those are beyond the scope of this post (thus I have removed them in the below files).
model.c
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include "model.h"
#include "functions.h"
/**
* #param model
* #param input The head to ann input array of size <code>model.neuronsPerLayer[INPUT_LAYER]</code> that has the inputs
* of the model.
*/
void setInput(struct Model* model, double input[]) {
model->values[INPUT_LAYER] = input;
}
void propagateInputForward(struct Model* model, double input[]) {
setInput(model, input);
for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
int startLayerIndex = endLayerIndex - 1;
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
double weightedSum = 0.0;
double bias = model->biases[endLayerIndex][endNeuronIndex];
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
double weight = model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex];
double startNeuronValue = model->values[startLayerIndex][startNeuronIndex];
double weightedInfluence = weight * startNeuronValue;
weightedSum += weightedInfluence;
}
weightedSum += bias;
double activatedNeuronValue = model->getActivation(weightedSum);
model->values[endLayerIndex][endNeuronIndex] = activatedNeuronValue;
}
}
}
/**
* #param model The model which the parameter gradients will be based on.
* #param layerIndex The layer index whose weight deltas are being calculated.
* #param baseDelta The base delta, equal to change in the cost function over change in
* the weighted sum of the neuron value.
* #param weightGradients The weight gradient to fill.
* #param biasGradients The bias gradient to fill.
*/
void updateParameterGradients(struct Model *model, const double* targetOutput, double** weightGradients[],
double* biasGradients[]) {
int outputNeuronCount = model->neuronsPerLayer[OUTPUT_LAYER];
// Entry indexed by [layerIndex][neuronIndex] gives
// Δ C / Δ Z[layerIndex, neuronIndex]
double* errors[NUMBER_OF_LAYERS];
errors[OUTPUT_LAYER] = malloc(sizeof(double) * outputNeuronCount);
// Fill errors of output layers
for (int outputNeuronIndex = 0; outputNeuronIndex < outputNeuronCount; outputNeuronIndex++) {
double outputNeuronValue = model->values[OUTPUT_LAYER][outputNeuronIndex];
double targetOutputNeuronValue = targetOutput[outputNeuronIndex];
// Δ C_outputNeuronIndex / Δ A[OUTPUT_LAYER][outputNeuronIndex]
double firstErrorComponent = model->getCostDerivative(outputNeuronValue, targetOutputNeuronValue);
// Δ A[OUTPUT_LAYER][outputNeuronIndex] / Δ Z[OUTPUT_LAYER][outputNeuronIndex]
double secondErrorComponent = model->getActivationDerivative(outputNeuronValue);
// Δ C_outputNeuronIndex / Δ Z[OUTPUT_LAYER][outputNeuronIndex]
double error = firstErrorComponent * secondErrorComponent;
errors[OUTPUT_LAYER][outputNeuronIndex] = error;
}
// Fill errors of non-output layers
for (int endLayerIndex = OUTPUT_LAYER; endLayerIndex > INPUT_LAYER; endLayerIndex--) {
int startLayerIndex = endLayerIndex - 1;
int startNeuronsCount = model->neuronsPerLayer[startLayerIndex];
int endNeuronsCount = model->neuronsPerLayer[endLayerIndex];
errors[startLayerIndex] = malloc(sizeof(double) * startNeuronsCount);
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronsCount; startNeuronIndex++) {
double error = 0.0;
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronsCount; endNeuronIndex++) {
double nextError = errors[endLayerIndex][endNeuronIndex];
double nextWeight = model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex];
double activationValue = model->values[startLayerIndex][startNeuronIndex];
double activationValueDelta = model->getActivationDerivative(activationValue);
double errorInfluence = nextWeight * nextError * activationValueDelta;
error += errorInfluence;
}
errors[startLayerIndex][startNeuronIndex] = error;
}
}
// Update weights and biases of all layers based on errors
for (int endLayerIndex = OUTPUT_LAYER; endLayerIndex > INPUT_LAYER; endLayerIndex--) {
int startLayerIndex = endLayerIndex - 1;
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
double endNeuronError = errors[endLayerIndex][endNeuronIndex];
double biasGradientInfluence = endNeuronError;
biasGradients[endLayerIndex][endNeuronIndex] += biasGradientInfluence;
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
double startNeuronValue = model->values[startLayerIndex][startNeuronIndex];
double weightGradientInfluence = endNeuronError * startNeuronValue;
weightGradients[endLayerIndex][endNeuronIndex][startNeuronIndex] += weightGradientInfluence;
}
}
}
}
/**
* Updates the weight and bias values within {#code model}, given the gradients of the cost function
* with respect to the weights and biases.
*
* #param model
* #param weightGradients
* #param biasGradients
*/
void updateParameterValues(struct Model* model, double** weightGradients[], double* biasGradients[], int batchSize) {
for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
int startLayerIndex = endLayerIndex - 1;
int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
double biasDelta = biasGradients[endLayerIndex][endNeuronIndex];
biasDelta /= batchSize;
biasDelta *= model->learningRate;
// update bias
model->biases[endLayerIndex][endNeuronIndex] -= biasDelta;
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
double weightDelta = weightGradients[endLayerIndex][endNeuronIndex][startNeuronIndex];
weightDelta /= batchSize;
weightDelta *= model->learningRate;
// update weight
model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex] -= weightDelta;
}
}
}
}
static int epochIndex = 0;
void initGradients(struct Model* model, double** weightGradients[], double* biasGradients[]) {
for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
int startLayerIndex = endLayerIndex - 1;
int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
biasGradients[endLayerIndex] = malloc(sizeof(double) * endNeuronCount);
weightGradients[endLayerIndex] = malloc(sizeof(double*) * endNeuronCount);
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
biasGradients[endLayerIndex][endNeuronIndex] = 0.0;
weightGradients[endLayerIndex][endNeuronIndex] = malloc(sizeof(double) * startNeuronCount);
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++)
weightGradients[endLayerIndex][endNeuronIndex][startNeuronIndex] = 0.0;
}
}
}
/**
* Feeds the input values of the entry into the input array given.
*
* #param input
* #param entry
* #param inputColumnIndices
* #param inputColumnIndicesCount
*/
void initInput(double input[], const double entry[], const int inputColumnIndices[], int inputColumnIndicesCount) {
for (int inputColumnIndex = 0; inputColumnIndex < inputColumnIndicesCount; inputColumnIndex++) {
int inputColumn = inputColumnIndices[inputColumnIndex];
input[inputColumnIndex] = entry[inputColumn];
}
}
/**
* Feeds the target output values of entry given into the target output array given.
*
* #param targetOutput
* #param entry
* #param outputColumnIndices
* #param outputColumnIndicesCount
*/
void initTargetOutput(double targetOutput[], const double entry[], const int outputColumnIndices[], int outputColumnIndicesCount) {
for (int outputColumnIndex = 0; outputColumnIndex < outputColumnIndicesCount; outputColumnIndex++) {
int outputColumn = outputColumnIndices[outputColumnIndex];
targetOutput[outputColumnIndex] = entry[outputColumn];
}
}
void test(struct Model* model, struct Data* data, int inputColumnIndices[], int outputColumnIndices[], double** predictedOutputs, double costs[]) {
int inputNeuronCount = model->neuronsPerLayer[INPUT_LAYER];
int outputNeuronCount = model->neuronsPerLayer[OUTPUT_LAYER];
for (int entryIndex = 0; entryIndex < data->numberOfEntries; entryIndex++) {
double *entry = data->elements[entryIndex];
double input[inputNeuronCount];
double targetOutput[outputNeuronCount];
initInput(input, entry, inputColumnIndices, inputNeuronCount);
initTargetOutput(targetOutput, entry, outputColumnIndices, outputNeuronCount);
// forward propagation
propagateInputForward(model, input);
double cost = 0.0;
for (int outputIndex = 0; outputIndex < outputNeuronCount; outputIndex++) {
double value = model->values[OUTPUT_LAYER][outputIndex];
predictedOutputs[entryIndex][outputIndex] = value;
double targetValue = targetOutput[outputIndex];
cost += model->getCost(value, targetValue);
}
// Take average cost
cost /= outputNeuronCount;
costs[entryIndex] = cost;
}
}
void freeGradients(struct Model* model, double** weightGradients[], double** biasGradients) {
for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
free(biasGradients[endLayerIndex]);
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
for (int neuronIndex = 0; neuronIndex < endNeuronCount; neuronIndex++)
free(weightGradients[endLayerIndex][neuronIndex]);
}
}
/**
* Trains the model on the given data.
*
* #param model
* #param data Container for the data the model will be trained on.
* #param inputColumnIndices The indices of the columns within {#code data} that are the input columns.
* #param outputColumnIndices The indices of the columns within {#code data} that are the output columns.
*/
void train(struct Model* model, struct Data* data, int inputColumnIndices[], int outputColumnIndices[]) {
// For both weightGradients and biasGradients, index 0 is not occupied.
// [endLayerIndex][endNeuronIndex in layerIndex][startNeuronIndex in layerIndex - 1]
double** weightGradients[NUMBER_OF_LAYERS];
// [endLayerIndex][endNeuronIndex]
double* biasGradients[NUMBER_OF_LAYERS];
// Allocate the storage for the weight and bias deltas, in addition
// to initializing them all weight and bias deltas with values of 0
initGradients(model, weightGradients, biasGradients);
int inputNeuronCount = model->neuronsPerLayer[INPUT_LAYER];
int outputNeuronCount = model->neuronsPerLayer[OUTPUT_LAYER];
epochIndex++;
// Feed each input into model
for (int entryIndex = 0; entryIndex < data->numberOfEntries; entryIndex++) {
double* entry = data->elements[entryIndex];
double input[inputNeuronCount];
double targetOutput[outputNeuronCount];
// Feed values of entry into input and targetOutput given indices of input and output columns
initInput(input, entry, inputColumnIndices, inputNeuronCount);
initTargetOutput(targetOutput, entry, outputColumnIndices, outputNeuronCount);
// forward propagation
propagateInputForward(model, input);
// update weight and bias gradients based on this entry, part of the batch
updateParameterGradients(model, targetOutput, weightGradients, biasGradients);
}
updateParameterValues(model, weightGradients, biasGradients, data->numberOfEntries);
freeGradients(model, weightGradients, biasGradients);
}
/**
* Allocates the memory for the parameters (weights and biases) of the model, in addition to initializing
* them to their default values.
*
* #param model
*/
void initParameters(struct Model* model) {
// initialize weights with arbitrary
for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
int startLayerIndex = endLayerIndex - 1;
int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
model->weights[endLayerIndex] = malloc(sizeof(double*) * endNeuronCount);
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
model->weights[endLayerIndex][endNeuronIndex] = malloc(sizeof(double) * startNeuronCount);
model->biases[endLayerIndex] = malloc(sizeof(double) * endNeuronCount);
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex] = model->getInitialWeightValue(startNeuronCount, endNeuronCount);
model->biases[endLayerIndex][endNeuronIndex] = model->getInitialBiasValue(startNeuronCount, endNeuronCount);
}
}
}
}
/**
* Allocayes the memory for the values of the model.
*
* #param model
*/
void initValues(struct Model* model) {
for (int layerIndex = 0; layerIndex < NUMBER_OF_LAYERS; layerIndex++) {
int neuronsInLayer = model->neuronsPerLayer[layerIndex];
model->values[layerIndex] = malloc(sizeof(double) * neuronsInLayer);
}
}
main.c
#include <stdio.h>
#include <stdlib.h>
#include <zconf.h>
#include <time.h>
#include "model.h"
#include "functions.h"
#include "data.h"
#define EPOCH_COUNT 20000
#define NUMBER_OF_COLUMNS 3
#define TRAIN_ENTRIES_SIZE 4
#define TEST_ENTRIES_SIZE 4
int main() {
time_t currentTime;
time(&currentTime);
srand(currentTime);
struct Model model = {
.neuronsPerLayer = {2, 2, 1},
.learningRate = 0.02,
// Default values
.getActivation = applySigmoid,
.getActivationDerivative = applySigmoidDerivative,
.getCost = getCost,
.getCostDerivative = getCostDerivative,
.getInitialWeightValue = getInitialRandomWeight,
.getInitialBiasValue = getInitialBias,
};
int numberOfInputs = model.neuronsPerLayer[INPUT_LAYER];
int numberOfOutputs = model.neuronsPerLayer[OUTPUT_LAYER];
// Change working directory so data can be referenced relative to parent data folder
chdir("..");
struct Data trainData;
fill(&trainData, "data/xor/train.csv", NUMBER_OF_COLUMNS, TRAIN_ENTRIES_SIZE);
struct Data testData;
fill(&testData, "data/xor/test.csv", NUMBER_OF_COLUMNS, TEST_ENTRIES_SIZE);
int inputColumnIndices[numberOfInputs];
int outputColumnIndices[numberOfOutputs];
inputColumnIndices[0] = 0;
inputColumnIndices[1] = 1;
outputColumnIndices[0] = 2;
initValues(&model);
initParameters(&model);
for (int epochIndex = 0; epochIndex < EPOCH_COUNT; epochIndex++)
train(&model, &trainData, inputColumnIndices, outputColumnIndices);
exit(0);
}
functions.c
#include <stdlib.h>
#include "functions.h"
#include "math.h"
double applySigmoid(double weightedSum) {
double eToWSum = pow(M_E, weightedSum);
return eToWSum / (eToWSum + 1);
}
double applySigmoidDerivative(double activationValue) {
return activationValue * (1 - activationValue);
}
double applyReLU(double weightedSum) {
return weightedSum < 0 ? 0 : weightedSum;
}
double applyReLUDerivative(double activationValue) {
return activationValue == 0 ? 0 : 1;
}
double applyTanH(double weightedSum) {
return 2 * applyReLU(2 * weightedSum) - 1;
}
double applyTanHDerivative(double activationValue) {
return 1 - pow(activationValue, 2);
}
double getInitialXavierWeight(double previousLayerSize, double layerSize) {
return sqrt(2 / previousLayerSize);
}
double getInitialRandomWeight(double previousLayerSize, double layerSize) {
return ((double) rand() / RAND_MAX) * 0.01;
}
double getInitialBias(double previousLayerSize, double layerSize) {
return 0;
}
double getCost(double neuronValue, double intendedValue) {
double difference = neuronValue - intendedValue;
return 0.5 * pow(difference, 2);
}
double getCostDerivative(double neuronValue, double intendedValue) {
return neuronValue - intendedValue;
}
If any other files are required, please request.

Different result of computing in GPU (CUDA) and CPU

I've wanted to create program that generates fractals on my GPU.
First I created a working project in C, after that I tried to convert it into CUDA/C.
Unfortunately, after I did it I saw that there is a difference in results of CPU and GPU.
I spend few hours thinking what I did wrong and it's a mystery to me.
IMO: It seems that there is a difference of calculating values in while loop, therefore it ends earlier than in normal CPU function.
Question: is there any possibility that it is true? And if, what can I do to avoid that kind of computing error?
Here's my entire code:
// C libs
#include <stdint.h>
#include <stdio.h>
#include <iostream>
// Help libs
#include <windows.h>
#include <math.h>
// CUDA libs
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void calulateFractal(unsigned char *a, int N, double c_re, double c_im, int width, int height, double minX, double maxX, double minY, double maxY, double ratioX, double ratioY, int maxLevel)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < N)
{
int x = i % width;
int y = i / width;
double p_im = y * ratioY + minY;
double p_re = x * ratioX + minX;
double z_re = p_re;
double z_im = p_im;
int iteration = 0;
while ((z_re * z_re + z_im * z_im) < 4 && iteration < maxLevel)
{
double tmp_re = z_re * z_re - z_im * z_im + c_re;
double tmp_im = 2 * z_re * z_im + c_im;
z_re = tmp_re;
z_im = tmp_im;
iteration++;
}
a[i] = iteration;
}
}
void calulateFractalCPU(unsigned char *a, int i, double c_re, double c_im, int width, int height, double minX, double maxX, double minY, double maxY, double ratioX, double ratioY, int maxLevel)
{
int x = i % width;
int y = i / width;
double p_im = y * ratioY + minY;
double p_re = x * ratioX + minX;
double z_re = p_re;
double z_im = p_im;
int iteration = 0;
while ((z_re * z_re + z_im * z_im) < 4 && iteration < 99)
{
double tmp_re = z_re * z_re - z_im * z_im + c_re;
double tmp_im = 2 * z_re * z_im + c_im;
z_re = tmp_re;
z_im = tmp_im;
iteration++;
}
a[i] = iteration;
}
int saveFractalToBitmap(unsigned char **colorsArray, unsigned char *bitmap, int width, int height, char *filename)
{
// Bitmap structures to be written to file
BITMAPFILEHEADER bfh;
BITMAPINFOHEADER bih;
// Fill BITMAPFILEHEADER structure
memcpy((char *)&bfh.bfType, "BM", 2);
bfh.bfSize = sizeof(bfh) + sizeof(bih) + 3*height*width;
bfh.bfReserved1 = 0;
bfh.bfReserved2 = 0;
bfh.bfOffBits = sizeof(bfh) + sizeof(bih);
// Fill BITMAPINFOHEADER structure
bih.biSize = sizeof(bih);
bih.biWidth = width;
bih.biHeight = height;
bih.biPlanes = 1;
bih.biBitCount = 24;
bih.biCompression = BI_RGB; // uncompressed 24-bit RGB
bih.biSizeImage = 0; // can be zero for BI_RGB bitmaps
bih.biXPelsPerMeter = 3780; // 96dpi equivalent
bih.biYPelsPerMeter = 3780;
bih.biClrUsed = 0;
bih.biClrImportant = 0;
// Open bitmap file (binary mode)
FILE *f;
f = fopen(filename, "wb");
if(f == NULL)
return -1;
// Write bitmap file header
fwrite(&bfh, 1, sizeof(bfh), f);
fwrite(&bih, 1, sizeof(bih), f);
// Write bitmap pixel data starting with the
// bottom line of pixels, left hand side
for (int i = 0; i < width * height ; i++)
{
// Write pixel components in BGR order
fputc(colorsArray[bitmap[i]][2], f);
fputc(colorsArray[bitmap[i]][1], f);
fputc(colorsArray[bitmap[i]][0], f);
}
// Close bitmap file
fclose(f);
return 0;
}
int main()
{
unsigned char **colorsArray;
unsigned char *fractalLevelsCPU;
unsigned char *fractalLevelsGPU;
double minX = -1.7;
double maxX = 1.7;
double minY = -1.5;
double maxY = 1.5;
double input_re = -0.79;
double input_im = 0.1463;
int width = 10;
int height = 5;
int N = width * height;
int maxLevel = 100;
size_t levelsArraySize = N * sizeof(unsigned char);
double ratioX = (maxX - minX) / (double) width;
double ratioY = (maxY - minY) / (double) height;
bool gpu = true;
// Allocate memory
colorsArray = (unsigned char**) malloc((maxLevel+1) * sizeof(unsigned char*));
for(int i=0; i<=maxLevel; i++)
{
colorsArray[i] = (unsigned char *) malloc(3 * sizeof(unsigned char));
colorsArray[i][0] = (int) (255.0 * i / maxLevel);
colorsArray[i][1] = (int) (255.0 * i / maxLevel);
colorsArray[i][2] = (int) (255.0 * log((double) i) / log((double) maxLevel));
}
fractalLevelsCPU = (unsigned char*) malloc(levelsArraySize);
cudaMalloc((unsigned char **) &fractalLevelsGPU, levelsArraySize);
cudaMemcpy(fractalLevelsCPU, fractalLevelsGPU, levelsArraySize, cudaMemcpyHostToDevice);
if(gpu)
{
// Run GPU method
calulateFractal <<< 1, N >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);
// Copy data from GPU to CPU array
cudaMemcpy(fractalLevelsCPU, fractalLevelsGPU, levelsArraySize, cudaMemcpyDeviceToHost);
}
else
{
// Iterate every element in array and compute level of fractal
for(int i=0; i<N; i++)
{
calulateFractalCPU(fractalLevelsCPU, i, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);
}
}
// Show results
for(int i=0; i<N; i++)
{
if((i % width) == 0)
printf("\n");
printf("%d\t", fractalLevelsCPU[i]);
}
//saveFractalToBitmap(colorsArray, fractalLevelsCPU, width, height, "frac.bmp");
// Free memory
for(int i=0; i<=maxLevel; i++)
{
free(colorsArray[i]);
}
free(colorsArray);
free(fractalLevelsCPU);
cudaFree(fractalLevelsGPU);
return 0;
}
I've find solution to my problem.
First of all, number of threads per block should be a power of two number.
Also I realized that my GPU has it's limits for number of threads per block and blocks itself.
NVIDIA Utils showed me that I can use max 65536 blocks and 512 threads per block.
Solution:
int threadsPerBlock = 512;
int blocksNumber = N/threadsPerBlock + (N % threadsPerBlock == 0 ? 0:1);
if(blocksNumber > 65536)
return -1;
calulateFractal <<< blocksNumber, threadsPerBlock >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);

cast error and invalid conversion error

error: cast from 'void*' to 'unsigned int' loses precision
error: invalid conversion from 'unsigned int' to 'unsigned int**'
can u tell me how to properly cast this, i am getting error on this line:
color = (unsigned int)malloc(height*sizeof(unsigned int));
inside the main function.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
unsigned int width;
unsigned int height;
unsigned int **color = NULL;
bool file_write()
{
FILE *fractal = fopen("mandelbrot_imageSequential.ppm","w+");
if(fractal != NULL)
{
fprintf(fractal,"P6\n");
fprintf(fractal,"# %s\n", "Mandelbrot_imageSequential.ppm");
fprintf(fractal,"%d %d\n", height, width);
fprintf(fractal,"40\n");
int x = 0, y = 0;
unsigned int R = 0, G = 0, B = 0;
for(x = 0; x < width; ++x)
{
for(y = 0; y < height; ++y)
{
R = (color[y][x]*10);
G = 255-((color[y][x]*10));
B = ((color[y][x]*10)-150);
if(R == 10) R = 11;
if(G == 10) G = 11;
if(B == 10) B = 11;
putc(R, fractal);
putc(G, fractal);
putc(B, fractal);
}
}
fclose(fractal);
}
return true;
}
int method(int x, int y, double min_re, double max_re, double min_im, double max_im, int max_iterations)
{
double threshold = 4;
double x_factor = (max_re-min_re)/(width-1);
double y_factor = (max_im-min_im)/(height-1);
double c_im = max_im - y*y_factor;
double c_re = min_re + x*x_factor;
double Z_re = c_re, Z_im = c_im;
unsigned int col = 0;
for(unsigned n = 0; n < max_iterations; ++n)
{
double Z_re2 = Z_re*Z_re, Z_im2 = Z_im*Z_im;
if(Z_re2 + Z_im2 > threshold)
{
col = n;
break;
}
Z_im = 2 * Z_re * Z_im + c_im;
Z_re = Z_re2 - Z_im2 + c_re;
}
return col;
}
void method1(double min_re, double max_re, double min_im, double max_im, int max_iterations)
{
for(int x = 0; x < width; x++)
{
for(int y = 0; y < height; ++y)
{
int m1 = method(x,y,min_re,max_re,min_im,max_im,max_iterations);
if(m1)
{
color[x][y] = m1*50;
}
}
}
}
int main(int argc, char *argv[])
{
unsigned int max_iterations;
int x,y;
double threshold;
double min_re;
double max_re;
double min_im;
double max_im;
unsigned int NUM_OF_THREADS;
if(argc != 10)
{
printf("There is an error in the input given.\n");
return 0;
}
else
{
height = atoi(argv[1]);
width = atoi(argv[2]);
max_iterations = atoi(argv[3]);
min_re = atof(argv[4]);
max_re = atof(argv[5]);
min_im = atof(argv[6]);
max_im = atof(argv[7]);
threshold = atoi(argv[8]);
NUM_OF_THREADS = atoi(argv[9]);
}
color = (unsigned int)malloc(height*sizeof(unsigned int));
printf("height = %d\twidth = %d\tmaximum_iterations = %d\tminimum_x-value = %.2f\tmaximum_x-value = %.2f\tminimum_y-value = %.2f\tmaximum_y-value = %.2f\tthreshold_value = %.2f\tno. of threads = %d\t\n",height,width,max_iterations,min_re,max_re,min_im,max_im,threshold,NUM_OF_THREADS);
for(x = 0; x < height; x++)
{
color[x] = (unsigned int*)malloc(width*sizeof(unsigned int));
}
time_t ts,te;
time(&ts);
method1(min_re, max_re, min_im, max_im, max_iterations);
time(&te);
double diff = difftime(te,ts);
file_write();
printf("Total Time elapsed: %f\n",diff);
return 0;
}
Why are you casting the return value of malloc to an unsigned int?
First off, don't cast the return value of malloc in C. It is pointless and can actually hide the fact that you forgot to include . C is not C++ in this regard. A void* can be implicitly converted to any pointer type in C.
Secondly, malloc returns a pointer, and you have defined color as an unsigned int**... yet you attempt to assign an unsigned int as well as an unsigned int* to it. Obviously those are incompatible. Just drop the casts and use/declare the type properly.
color = (unsigned int**)malloc(height*sizeof(unsigned int*));
Shouldn't it be this?
You are trying to allocate array of pointers dynamically. So what you need to do is the following:
color = (unsigned int**)malloc(height*sizeof(unsigned int));
Rest of it is fine ...

Resources