I am trying to parallelize a simple mandelbrot c program, yet I get this error that has to do with not including acc routine information. Also, I am not sure whether I should be copying data in and out of the parallel section. PS I am relatively new to parallel programming, so any advice with learning it would be appreciated.
(Warning when compiled)
PGC-S-0155-Procedures called in a compute region must have acc routine information: fwrite (mandelbrot.c: 88)
PGC-S-0155-Accelerator region ignored; see -Minfo messages (mandelbrot.c: 51)
main:
51, Accelerator region ignored
88, Accelerator restriction: call to 'fwrite' with no acc routine information
PGC/x86-64 Linux 16.10-0: compilation completed with severe errors
Here is my code:
#include <stdio.h>
#include <math.h>
int main()
{
/* screen ( integer) coordinate */
int iX,iY;
const int iXmax = 800;
const int iYmax = 800;
/* world ( double) coordinate = parameter plane*/
double Cx,Cy;
const double CxMin=-2.5;
const double CxMax=1.5;
const double CyMin=-2.0;
const double CyMax=2.0;
/* */
double PixelWidth=(CxMax-CxMin)/iXmax;
double PixelHeight=(CyMax-CyMin)/iYmax;
/* color component ( R or G or B) is coded from 0 to 255 */
/* it is 24 bit color RGB file */
const int MaxColorComponentValue=255;
FILE * fp;
char *filename="new1.ppm";
char *comment="# ";/* comment should start with # */
static unsigned char color[3];
/* Z=Zx+Zy*i ; Z0 = 0 */
double Zx, Zy;
double Zx2, Zy2; /* Zx2=Zx*Zx; Zy2=Zy*Zy */
/* */
int Iteration;
const int IterationMax=200;
/* bail-out value , radius of circle ; */
const double EscapeRadius=2;
double ER2=EscapeRadius*EscapeRadius;
/*create new file,give it a name and open it in binary mode */
fp= fopen(filename,"wb"); /* b - binary mode */
/*write ASCII header to the file*/
fprintf(fp,"P6\n %s\n %d\n %d\n %d\n",comment,iXmax,iYmax,MaxColorComponentValue);
/* compute and write image data bytes to the file*/
#pragma acc parallel loop present(CyMin, iY, PixelHeight, iX, iXmax, CxMin, PixelWidth, Zx, Zy, Zx2, Zy2, Iteration, IterationMax)
for(iY=0;iY<iYmax;iY++)
{
Cy=CyMin + iY*PixelHeight;
if (fabs(Cy)< PixelHeight/2) Cy=0.0; /* Main antenna */
#pragma acc loop
for(iX=0;iX<iXmax;iX++)
{
Cx=CxMin + iX*PixelWidth;
/* initial value of orbit = critical point Z= 0 */
Zx=0.0;
Zy=0.0;
Zx2=Zx*Zx;
Zy2=Zy*Zy;
/* */
#pragma acc loop
for (Iteration=0;Iteration<IterationMax && ((Zx2+Zy2)<ER2);Iteration++)
{
Zy=2*Zx*Zy + Cy;
Zx=Zx2-Zy2 +Cx;
Zx2=Zx*Zx;
Zy2=Zy*Zy;
};
/* compute pixel color (24 bit = 3 bytes) */
if (Iteration==IterationMax)
{ /* interior of Mandelbrot set = black */
color[0]=0;
color[1]=0;
color[2]=0;
}
else
{ /* exterior of Mandelbrot set = white */
color[0]=255; /* Red*/
color[1]=255; /* Green */
color[2]=255;/* Blue */
};
/*write color to the file*/
fwrite(color,1,3,fp);
}
}
fclose(fp);
return 0;
}
Since you can't access a file from the GPU, you'll want to capture the results to arrays, copy them back to the host, and then output the results to a file.
Also, the "present" clause indicates that you've already copied the data over to the device and the program will abort if it's not there. Given the usage, I think you meant to use "private", which indicates that the variable should be private to the execution level. However scalars are private by default in OpenACC, so there's no need to manually privatize these variables. If you do manually privatize a variable, be sure to put it at the correct loop level. For example, if you privatize "Zx" on the outer loop, it will only private to that loop level. It would be shared by all the vectors of the inner loop! Again, here, it's best to just let the compiler handle privatizing the scalars, but just be mindful of where you privatize things in the few cases where you have to manually privatize variables.
Here's a corrected version of your code.
#include <stdio.h>
#include <math.h>
int main()
{
/* screen ( integer) coordinate */
int iX,iY;
const int iXmax = 800;
const int iYmax = 800;
/* world ( double) coordinate = parameter plane*/
double Cx,Cy;
const double CxMin=-2.5;
const double CxMax=1.5;
const double CyMin=-2.0;
const double CyMax=2.0;
/* */
double PixelWidth=(CxMax-CxMin)/iXmax;
double PixelHeight=(CyMax-CyMin)/iYmax;
/* color component ( R or G or B) is coded from 0 to 255 */
/* it is 24 bit color RGB file */
const int MaxColorComponentValue=255;
FILE * fp;
char *filename="new1.ppm";
char *comment="# ";/* comment should start with # */
static unsigned char color[3];
unsigned char red[iXmax][iYmax];
unsigned char blue[iXmax][iYmax];
unsigned char green[iXmax][iYmax];
/* Z=Zx+Zy*i ; Z0 = 0 */
double Zx, Zy;
double Zx2, Zy2; /* Zx2=Zx*Zx; Zy2=Zy*Zy */
/* */
int Iteration;
const int IterationMax=200;
/* bail-out value , radius of circle ; */
const double EscapeRadius=2;
double ER2=EscapeRadius*EscapeRadius;
/*create new file,give it a name and open it in binary mode */
fp= fopen(filename,"wb"); /* b - binary mode */
/*write ASCII header to the file*/
fprintf(fp,"P6\n %s\n %d\n %d\n %d\n",comment,iXmax,iYmax,MaxColorComponentValue);
/* compute and write image data bytes to the file*/
#pragma acc parallel loop copyout(red,blue,green)
for(iY=0;iY<iYmax;iY++)
{
Cy=CyMin + iY*PixelHeight;
if (fabs(Cy)< PixelHeight/2) Cy=0.0; /* Main antenna */
#pragma acc loop
for(iX=0;iX<iXmax;iX++)
{
Cx=CxMin + iX*PixelWidth;
/* initial value of orbit = critical point Z= 0 */
Zx=0.0;
Zy=0.0;
Zx2=Zx*Zx;
Zy2=Zy*Zy;
/* */
#pragma acc loop
for (Iteration=0;Iteration<IterationMax && ((Zx2+Zy2)<ER2);Iteration++)
{
Zy=2*Zx*Zy + Cy;
Zx=Zx2-Zy2 +Cx;
Zx2=Zx*Zx;
Zy2=Zy*Zy;
};
/* compute pixel color (24 bit = 3 bytes) */
if (Iteration==IterationMax)
{ /* interior of Mandelbrot set = black */
red[iX][iY]=0;
blue[iX][iY]=0;
green[iX][iY]=0;
}
else
{ /* exterior of Mandelbrot set = white */
red[iX][iY]=255;
blue[iX][iY]=255;
green[iX][iY]=255;
}; }
}
/*write color to the file*/
for(iY=0;iY<iYmax;iY++) {
for(iX=0;iX<iXmax;iX++) {
color[0] = red[iX][iY];
color[1] = blue[iX][iY];
color[2] = green[iX][iY];
fwrite(color,1,3,fp);
}
}
fclose(fp);
return 0;
}
When using OpenACC, the parallel regions are offloaded to a device, like a GPU. GPU devices normally don't have access to the entire system or the IO, and have a reduced subset of the standard library implemented.
In your case, the fwrite function call cannot be offloaded to a device, since you cannot access the disk from the accelerator.
You could do that in OpenMP, where parallel regions are executed on CPU or MIC threads, which typically have access to the entire system library.
The acc routine directive that PGC is suggesting would allow you to annotate any function to create a device version of it.
I know that it is possible to write Level-2 MATLAB S-Functionswith variable-sized signals.
Is it also somehow possible to do that in C MEX S-Functions?
My data has a different size at each time step. This requirement originates from a compressing block which gets a fixed size signal (2D) as its input. However the output signal (1D / Vector) changes its size at every mdlOutput().
The comments of the question already answered it:
Yes it is possible!
Here my example:
// Required S-Function header
#define S_FUNCTION_LEVEL 2
#define S_FUNCTION_NAME sfunc_varsignal
#include "simstruc.h"
enum {INPUT_PORT = 0, NUM_INPUT_PORTS};
enum {OUTPUT_PORT = 0, NUM_OUPUT_PORTS};
/**
* "Specify the number of inputs, outputs, states, parameters, and other
* characteristics of the C MEX S-function"
*/
static void mdlInitializeSizes(SimStruct* S)
{
boolean_T boolret;
int_T intret;
// Parameter
ssSetNumSFcnParams(S, 0);
if (ssGetNumSFcnParams(S) != ssGetSFcnParamsCount(S))
{
return; // Parameter mismatch will be reported by Simulink
}
// Input port
boolret = ssSetNumInputPorts(S, NUM_INPUT_PORTS);
if (boolret == 0)
{
return;
}
ssSetInputPortDirectFeedThrough(S, INPUT_PORT, 1);
intret = ssSetInputPortDimensionInfo(S, INPUT_PORT, DYNAMIC_DIMENSION);
if (intret == 0)
{
ssWarning(S, "Input dimensions could not be set.");
}
_ssSetInputPortNumDimensions(S, INPUT_PORT, 1);
// Output port
boolret = ssSetNumOutputPorts(S, NUM_OUPUT_PORTS);
if (boolret == 0)
{
return;
}
intret = ssSetOutputPortDimensionInfo(S, OUTPUT_PORT, DYNAMIC_DIMENSION);
if (intret == 0)
{
ssWarning(S, "Output dimensions could not be set.");
}
_ssSetOutputPortNumDimensions(S, OUTPUT_PORT, 1);
// Sample Times
ssSetNumSampleTimes(S, 1);
// Dimension Modes of the Ports
ssSetInputPortDimensionsMode(S, INPUT_PORT, INHERIT_DIMS_MODE);
ssSetOutputPortDimensionsMode(S, OUTPUT_PORT, INHERIT_DIMS_MODE);
// This is required for any kind of variable size signal:
ssSetInputPortRequiredContiguous(S, INPUT_PORT, true);
ssSetOptions(S, 0);
// Note: In the doc of ssSetOutputPortWidth it is wriiten:
// "If the width is dynamically sized, the S-function must provide
// mdlSetOutputPortDimensionInfo and mdlSetDefaultPortDimensionInfo
// methods to enable the signal dimensions to be set correctly
// during signal propagation."
// However in the example sfun_varsize_concat1D this methods are
// not present. The function _ssSetOutputPortNumDimensions() may be sufficient
// This usgae of this function is copied from the example sfcndemo_varsize
}
#if defined(MATLAB_MEX_FILE)
/**
* "Set the width of an input port that accepts 1-D (vector) signals"
*/
#define MDL_SET_INPUT_PORT_WIDTH
static void mdlSetInputPortWidth(SimStruct* S, int_T port, int_T width)
{
// Set to the sugessted width (e.g. the output width
// from the connected block)
ssSetInputPortWidth(S, port, width);
// Check if the setting was sucessful
if (ssGetInputPortWidth(S, INPUT_PORT) != DYNAMICALLY_SIZED)
{
ssSetOutputPortWidth(S, OUTPUT_PORT, width);
}
return;
}
/**
* "Set the width of an output port that outputs 1-D (vector) signals"
*/
#define MDL_SET_OUTPUT_PORT_WIDTH
static void mdlSetOutputPortWidth(SimStruct* S, int_T port, int_T width)
{
// Nothing here, but its required since the output port is set as
// dynamically sized. But its size is set in mdlSetInputPortWidth()
UNUSED_ARG(S);
UNUSED_ARG(port);
UNUSED_ARG(width);
return;
}
#endif //defined(MATLAB_MEX_FILE)
/**
* "Specify the sample rates at which this C MEX S-function operates"
*/
static void mdlInitializeSampleTimes(SimStruct* S)
{
ssSetSampleTime(S, 0, INHERITED_SAMPLE_TIME);
ssSetOffsetTime(S, 0, 0.0);
ssSetModelReferenceSampleTimeDefaultInheritance(S);
}
/**
* "Compute the signals that this block emits."
*/
static void mdlOutputs(SimStruct* S, int_T tid)
{
UNUSED_ARG(tid);
const real_T* insignal = ssGetInputPortRealSignal(S, INPUT_PORT);
auto width = static_cast<const int>(insignal[0]);
// This function does the trick:
ssSetCurrentOutputPortDimensions(S, OUTPUT_PORT, 0, width);
// newWidth should be width
int_T newWidth = ssGetCurrentOutputPortDimensions(S, OUTPUT_PORT, 0 /* dimension ID */);
real_T* outsignal = ssGetOutputPortRealSignal(S, OUTPUT_PORT);
for (int i = 0; i < newWidth; i++)
{
*outsignal++ = i+1;
}
}
/**
* "Perform any actions required at termination of the simulation"
*/
static void mdlTerminate(SimStruct* S)
{
UNUSED_ARG(S);
}
// Required S-function trailer
#ifdef MATLAB_MEX_FILE /* Is this file being compiled as a MEX-file? */
#include "simulink.c" /* MEX-file interface mechanism */
#else
#include "cg_sfun.h" /* Code generation registration function */
#endif
I've been trying to find a working floodfill algorithm. Of the many algorithms I've tried only the 'recursive line fill' one behaves exactly as it should with the major caveat that it occasionally blows the stack. :(
I have tried many non-recursive implementations I've found and they have all been exceptionally tempermental: either they leave gaps in strange places, or flood the whole area (when they should be enclosed).
Anyone has a NON-recursive floodfill working sourcecode written in C (or c++ that isn't too heavily OOP and I can disentangle easily enough)?
Just implement a stack of int pairs with an array of some fixed size (maybe the size of the image in pixels or the square root of that, for example) for the stack and track the top with an int.
Here is some C# code that implements floodfill non-recursively:
private static void Floodfill(byte[,] vals, Point q, byte SEED_COLOR, byte COLOR)
{
int h = vals.GetLength(0);
int w = vals.GetLength(1);
if (q.Y < 0 || q.Y > h - 1 || q.X < 0 || q.X > w - 1)
return;
Stack<Point> stack = new Stack<Point>();
stack.Push(q);
while (stack.Count > 0)
{
Point p = stack.Pop();
int x = p.X;
int y = p.Y;
if (y < 0 || y > h - 1 || x < 0 || x > w - 1)
continue;
byte val = vals[y, x];
if (val == SEED_COLOR)
{
vals[y, x] = COLOR;
stack.Push(new Point(x + 1, y));
stack.Push(new Point(x - 1, y));
stack.Push(new Point(x, y + 1));
stack.Push(new Point(x, y - 1));
}
}
}
Here's some C++ code that does what you want. It uses a queue, and is more efficient about insertions into the queue.
connectedRegion(const Point& source, RegionType& region, const Color target)
{
Color src_color = color_of(source, region);
if (region.count(source) == 0 || src_color == target)
return;
std::queue<Point> analyze_queue;
analyze_queue.push(source);
while (!analyze_queue.empty())
{
if (color_of(analyze_queue.front()) != src_color)
{
analyze_queue.pop();
continue;
}
Point leftmost_pt = analyze_queue.front();
leftmost_pt.col -= 1;
analyze_queue.pop();
Point rightmost_pt = leftmost_pt;
rightmost_pt.col += 2;
while (color_of(leftmost_pt, region) == src_color)
--leftmost_pt.col;
while (color_of(rightmost_pt, region) == src_color)
++rightmost_pt.col;
bool check_above = true;
bool check_below = true;
Point pt = leftmost_pt;
++pt.col;
for (; pt.col < rightmost_pt.col; ++pt.col)
{
set_color(pt, region, target);
Point pt_above = pt;
--pt_above.row;
if (check_above)
{
if (color_of(pt_above, region) == src_color)
{
analyze_queue.push(pt_above);
check_above = false;
}
}
else // !check_above
{
check_above = (color_of(pt_above, region) != src_color);
}
Point pt_below = pt;
++pt_below.row;
if (check_below)
{
if (color_of(pt_below, region) == src_color)
{
analyze_queue.push(pt_below);
check_below = false;
}
}
else // !check_below
{
check_below = (color_of(pt_below, region) != src_color);
}
} // for
} // while queue not empty
return connected;
}
A quick googling brings up the Wikipedia article on Flood Fill which includes pseudocode implementations which are not recursive. Below is some code that could help get you started, a basic queue implementation in C:
typedef struct queue_ { struct queue_ *next; } queue_t;
typedef struct ffnode_ { queue_t node; int x, y; } ffnode_t;
/* returns the new head of the queue after adding node to the queue */
queue_t* enqueue(queue_t *queue, queue_t *node) {
if (node) {
node->next = queue;
return node;
}
return NULL;
}
/* returns the head of the queue and modifies queue to be the new head */
queue_t* dequeue(queue_t **queue) {
if (queue) {
queue_t *node = (*queue);
(*queue) = node->next;
node->next = NULL;
return node;
}
return NULL;
}
ffnode_t* new_ffnode(int x, int y) {
ffnode_t *node = (ffnode_t*)malloc(sizeof(ffnode_t));
node->x = x; node->y = y;
node->node.next = NULL;
return node;
}
void flood_fill(image_t *image, int startx, int starty,
color_t target, color_t replacement) {
queue_t *head = NULL;
ffnode_t *node = NULL;
if (!is_color(image, startx, starty, target)) return;
node = new_ffnode(startx, starty);
for ( ; node != NULL; node = (ffnode_t*)dequeue(&head)) {
if (is_color(image, node->x, node->y, target)) {
ffnode_t *west = node, *east = node;
recolor(image, node->x, node->y, replacement);
/* 1. move w to the west until the color of the node to the west
no longer matches target */
...
}
}
}
Isn't there a proof somewhere that all recursive functions can be implemented as an iterative function by using local data to mimic a stack? You could probably use std::vector to create stack-like behavior of the algorithm without blowing the stack since it will use the heap.
EDIT: I noticed you are using C, so instead of std::vector, you could just implement similar behavior via realloc as you need to add more elements to your local "stack" of whatever data structure you would use.
I do not know if my answer is perfectly relevant to the question you put, but hereafter I propose my C version of the Flood-Fill algorithm, which does not use recursive calls.
1-11-2017: NEW-VERSION; SUCCESFULLY TESTED WITH TWO BITMAPS.
It uses only a queue of the offsets of the new points, it works on the window: WinnOffs-(WinDimX,WinDimY) of the double-buffer: *VBuffer (copy of the screen or image) and, optionally, it write a mask of the flood-fill's result (*ExtraVBuff).
ExtraVBuff must be filled it with 0 before the call (if you don't need a mask you may set ExtraVBuff= NULL); using it after call you can do gradient floodfill or other painting effects. NewFloodFill works with 32 Bit per Pixel and it is a C function. I've reinvented this algorithm in 1991 (I wrote his in Pascal), but now it works in C with 32 Bit per Pixel; also not uses any functions calls, does only a division after each "pop" from queue, and never overflows the queue, that, if it is sized in the right way (about 1/4 of the pixels of the image), it allows always to fill correctly any area; I show before the c-function (FFILL.C), after the test program (TEST.C):
#define IMAGE_WIDTH 1024
#define IMAGE_HEIGHT 768
#define IMAGE_SIZE IMAGE_WIDTH*IMAGE_HEIGHT
#define QUEUE_MAX IMAGE_SIZE/4
typedef int T_Queue[QUEUE_MAX];
typedef int T_Image[IMAGE_SIZE];
void NewFloodFill(int X,
int Y,
int Color,
int BuffDimX,
int WinOffS,
int WinDimX,
int WinDimY,
T_Image VBuffer,
T_Image ExtraVBuff,
T_Queue MyQueue)
/* Replaces all pixels adjacent to the first pixel and equal to this; */
/* if ExtraVBuff == NULL writes to *VBuffer (eg BUFFER of 786432 Pixel),*/
/* otherwise prepare a mask by writing on *ExtraVBuff (such BUFFER must */
/* always have the same size as *VBuffer (it must be initialized to 0)).*/
/* X,Y: Point coordinates' of origin of the flood-fill. */
/* WinOffS: Writing start offset on *VBuffer and *ExtraVBuff. */
/* BuffDimX: Width, in number of Pixel (int), of each buffer. */
/* WinDimX: Width, in number of Pixel (int), of the window. */
/* Color: New color that replace all_Pixel == origin's_point. */
/* WinDimY: Height, in number of Pixel (int), of the window. */
/* VBuffer: Pointer to the primary buffer. */
/* ExtraVBuff: Pointer to the mask buffer (can be = NULL). */
/* MyQueue: Pointer to the queue, containing the new-points' offsets*/
{
int VBuffCurrOffs=WinOffS+X+Y*BuffDimX;
int PixelIn=VBuffer[VBuffCurrOffs];
int QueuePnt=0;
int *TempAddr=((ExtraVBuff) ? ExtraVBuff : VBuffer);
int TempOffs1;
int TempX1;
int TempX2;
char FLAG;
if (0<=X && X<WinDimX && 0<=Y && Y<WinDimY) do
{
/* Fill to left the current line */
TempX2=X;
while (X>=0 && PixelIn==VBuffer[VBuffCurrOffs])
{
TempAddr[VBuffCurrOffs--]=Color;
--X;
}
TempOffs1=VBuffCurrOffs+1;
TempX1=X+1;
/* Fill to right the current line */
VBuffCurrOffs+=TempX2-X;
X=TempX2;
while (X+1<WinDimX && PixelIn==VBuffer[VBuffCurrOffs+1])
{
++X;
TempAddr[++VBuffCurrOffs]=Color;
}
TempX2=X;
/* Backward scan of the previous line; puts new points offset in Queue[] */
if (Y>0)
{
FLAG=1;
VBuffCurrOffs-=BuffDimX;
while (X-->=TempX1)
{
if (PixelIn!=VBuffer[VBuffCurrOffs] ||
ExtraVBuff && Color==ExtraVBuff[VBuffCurrOffs])
FLAG=1;
else
if (FLAG)
{
FLAG=0;
if (QueuePnt<QUEUE_MAX)
MyQueue[QueuePnt++]=VBuffCurrOffs;
}
--VBuffCurrOffs;
}
}
/* Forward scan of the next line; puts new points offset in Queue[] */
if (Y<WinDimY-1)
{
FLAG=1;
VBuffCurrOffs=TempOffs1+BuffDimX;
X=TempX1;
while (X++<=TempX2)
{
if (PixelIn!=VBuffer[VBuffCurrOffs] ||
ExtraVBuff && Color==ExtraVBuff[VBuffCurrOffs])
FLAG=1;
else
if (FLAG)
{
FLAG=0;
if (QueuePnt<QUEUE_MAX)
MyQueue[QueuePnt++]=VBuffCurrOffs;
}
++VBuffCurrOffs;
}
}
/* Gets a new point offset from Queue[] */
if (--QueuePnt>=0)
{
VBuffCurrOffs=MyQueue[QueuePnt];
TempOffs1=VBuffCurrOffs-WinOffS;
X=TempOffs1%BuffDimX;
Y=TempOffs1/BuffDimX;
}
/* Repeat the main cycle until the Queue[] is not empty */
} while (QueuePnt>=0);
}
Here there is the test program:
#include <stdio.h>
#include <malloc.h>
#include "ffill.c"
#define RED_COL 0xFFFF0000
#define WIN_LEFT 52
#define WIN_TOP 48
#define WIN_WIDTH 920
#define WIN_HEIGHT 672
#define START_LEFT 0
#define START_TOP 671
#define BMP_HEADER_SIZE 54
typedef char T_Image_Header[BMP_HEADER_SIZE];
void main(void)
{
T_Image_Header bmpheader;
T_Image *image;
T_Image *mask;
T_Queue *MyQueue;
FILE *stream;
char *filename1="ffill1.bmp";
char *filename2="ffill2.bmp";
char *filename3="ffill3.bmp";
int bwritten;
int bread;
image=malloc(sizeof(*image));
mask=malloc(sizeof(*mask));
MyQueue=malloc(sizeof(*MyQueue));
stream=fopen(filename1,"rb");
bread=fread(&bmpheader, 1, BMP_HEADER_SIZE, stream);
bread=fread((char *)image, 1, IMAGE_SIZE<<2, stream);
fclose(stream);
memset(mask,0,IMAGE_SIZE<<2);
NewFloodFill(START_LEFT,
START_TOP,
RED_COL,
IMAGE_WIDTH,
IMAGE_WIDTH*WIN_TOP+WIN_LEFT,
WIN_WIDTH,
WIN_HEIGHT,
*image,
NULL,
*MyQueue);
stream=fopen(filename2,"wb+");
bwritten=fwrite(&bmpheader, 1, BMP_HEADER_SIZE, stream);
bwritten=fwrite((char *)image, 1, IMAGE_SIZE<<2, stream);
fclose(stream);
stream=fopen(filename3,"wb+");
bwritten=fwrite(&bmpheader, 1, BMP_HEADER_SIZE, stream);
bwritten=fwrite((char *)mask, 1, IMAGE_SIZE<<2, stream);
fclose(stream);
free(MyQueue);
free(mask);
free(image);
}
I've used, for the input of the test program shown, the follow Windows uncompressed .BMP image (ffill1.bmp):
Filled, by the test program shown, as follows (ffill2.bmp):
Using "mask" instead of NULL, the output bitmap is (ffill3.bmp):
You can convert any recursive algorithm to iterative by creating an explicit stack or queue and loading work onto it/pulling it off.
All you need is to choose a nice, compact representation of the work to be done. Worst case: create a struct holding the arguments you would normally pass to the recursive version...
We noticed that our floodfill implementation on 3d volumes was consuming way much memory; so we modified the code in the following ways (there was a vast improvement):
Create a sphere of radius = 10 voxs around the starting point, and mark all the voxels within that radius as "to be visited"
If the current voxel > threshold, insert 1.
Go to the neighbors [+1, -1, 0] (also check that one doesn't revisit any voxel), if the neighbor.getVoxVal = 0 (the initialization value for the target volume), then it falls at the boundary of the sphere, insert the coordinates in a different stack. (this would be the starting point for our next sphere)
radius = radius + 10 (voxels)
So at a time, our floodfill is working on a concentric sphere and filling things up, which is a part of the entire volume, and as I said, this has reduced the memory consumption drastically, but I am still searching for an implementation/idea that would be better.
I have a non-recursive flood fill, but I won't post it because it's the solution to a homework assignment. But here's a hint: depth-first search, which is the natural algorithm, uses far more auxiliary space than a breadth-first search. Here's what I wrote at the time (suitably expurgated):
I dare not try depth-first search by simple recursion; the depth of recursion is limited only by REDACTED, and my experiments show that an PROBLEM REDACTED could nevertheless require a stack depth of over a million. So I put the stack in an auxiliary data structure. Using an explicit stack actually makes it easy to try breadth-first search as well, and it turns out that breadth-first search can use forty times less space than depth-first search.
For my data structure I used the Seq_T from Dave Hanson's C Interfaces and Implementations; changing from depth-first to breadth-first requires changing just one function call.
You can quickly convert a recursive flood fill into an ultra-performant pseudo-recursive... Don't edit the lines,just add new lines:
place the recursive function in an XY loop for added structure.
record the found neighbors to a "found neighbors array"
instead of memory, so you will start packing the 4-16-64 style tree of the recursion into an XY array. memory usage goes from 1 gygabyte to 2 megabytes.
Also use a 2D array called "filled neighbors array"... abort the function for any pixels marked as filled in the "filled neighbors array", this uses 2 instructions for every duplicate, 20 instructions for every floodfill operation, and it iteratively fills leftwards and upwards like dominoes, insanely quickly.
1024x1024 uses about 1million *20 instructions which is 0.1 seconds for a single core.
I achieve 9 million filled pixels per second on an i7 in this way, i have a video as proof, and a blog page with code and theory explanations:
www.youtube.com/watch?v=4hQ1wA4Sl4c
and here is a page where I attempted to explain how it works.
http://unity3dmc.blogspot.com/2017/02/ultimate-3d-floodfill-scanline.html?m=1
And the code.
Recursions would be the fastest if they could stay organized.
If you recurse through a grid of data (image) you can store the processing of the recursions in grid format too, because the processed steps represent pixels from the grid, rather than explode the results into a tree format.
Below is my BFS based iterative c++ method for the flood fill problem:
// M is the input matrix, with every entry(pixel) have a color
// represented with an integer value.
// (x, y) row and column of seed point respectively
// k: The new color to fill the seed and its adjacent pixels
void floodFill(vector<vector<int>> &M, int x, int y, int k) {
queue<pair<int, int>> nodeQ;
nodeQ.push({x, y});
int oldCol = M[x][y];
while(!nodeQ.empty()) {
pair<int, int> currNode = nodeQ.front();
nodeQ.pop();
if(M[currNode.first][currNode.second] == oldCol) {
M[currNode.first][currNode.second] = k;
if(currNode.first > 0) nodeQ.push({currNode.first-1, currNode.second});
if(currNode.first < (M.size()-1)) nodeQ.push({currNode.first+1, currNode.second});
if(currNode.second > 0) nodeQ.push({currNode.first, currNode.second-1});
if(currNode.second < (M[0].size()-1)) nodeQ.push({currNode.first, currNode.second+1});
}
}
}
Here is a guide for a non-recursive routine which completes 10 million pixels per second: it's called marching-floodfills, what happens if you march the previously recursive routine forwards in a X-Y loop.
write your own memory, a 2D array to record verified spaces, and another array which records the complete filled image, and read and write to them using this loop system... it averages 20 instructions per pixel. i dealt with 2 billion voxel graphs at 5 million Voxels per second using above video logic.
I found this fill by Paul Heckbert to be the simplest non-recursive C implementation:
/*
* A Seed Fill Algorithm
* by Paul Heckbert
* from "Graphics Gems", Academic Press, 1990
*
* user provides pixelread() and pixelwrite() routines
*/
/*
* fill.c : simple seed fill program
* Calls pixelread() to read pixels, pixelwrite() to write pixels.
*
* Paul Heckbert 13 Sept 1982, 28 Jan 1987
*/
typedef struct { /* window: a discrete 2-D rectangle */
int x0, y0; /* xmin and ymin */
int x1, y1; /* xmax and ymax (inclusive) */
} Window;
typedef int Pixel; /* 1-channel frame buffer assumed */
Pixel pixelread(int x, int y);
void pixelwrite(int x, int y, Pixel p);
typedef struct {short y, xl, xr, dy;} Segment;
/*
* Filled horizontal segment of scanline y for xl<=x<=xr.
* Parent segment was on line y-dy. dy=1 or -1
*/
#define MAX 10000 /* max depth of stack */
#define PUSH(Y, XL, XR, DY) /* push new segment on stack */ \
if (sp<stack+MAX && Y+(DY)>=win->y0 && Y+(DY)<=win->y1) \
{sp->y = Y; sp->xl = XL; sp->xr = XR; sp->dy = DY; sp++;}
#define POP(Y, XL, XR, DY) /* pop segment off stack */ \
{sp--; Y = sp->y+(DY = sp->dy); XL = sp->xl; XR = sp->xr;}
/*
* fill: set the pixel at (x,y) and all of its 4-connected neighbors
* with the same pixel value to the new pixel value nv.
* A 4-connected neighbor is a pixel above, below, left, or right of a pixel.
*/
void fill(x, y, win, nv)
int x, y; /* seed point */
Window *win; /* screen window */
Pixel nv; /* new pixel value */
{
int l, x1, x2, dy;
Pixel ov; /* old pixel value */
Segment stack[MAX], *sp = stack; /* stack of filled segments */
ov = pixelread(x, y); /* read pv at seed point */
if (ov==nv || x<win->x0 || x>win->x1 || y<win->y0 || y>win->y1) return;
PUSH(y, x, x, 1); /* needed in some cases */
PUSH(y+1, x, x, -1); /* seed segment (popped 1st) */
while (sp>stack) {
/* pop segment off stack and fill a neighboring scan line */
POP(y, x1, x2, dy);
/*
* segment of scan line y-dy for x1<=x<=x2 was previously filled,
* now explore adjacent pixels in scan line y
*/
for (x=x1; x>=win->x0 && pixelread(x, y)==ov; x--)
pixelwrite(x, y, nv);
if (x>=x1) goto skip;
l = x+1;
if (l<x1) PUSH(y, l, x1-1, -dy); /* leak on left? */
x = x1+1;
do {
for (; x<=win->x1 && pixelread(x, y)==ov; x++)
pixelwrite(x, y, nv);
PUSH(y, l, x-1, dy);
if (x>x2+1) PUSH(y, x2+1, x-1, -dy); /* leak on right? */
skip: for (x++; x<=x2 && pixelread(x, y)!=ov; x++);
l = x;
} while (x<=x2);
}
}
source: https://github.com/erich666/GraphicsGems/blob/master/gems/SeedFill.c