MPI_SEND and MPI_RECIEVE have no Reference by Compile - c

I am trying to compile an MPI program with mpicc. The compiler complains only that there is no reference to MPI_RECIVE and MPI_SEND, and ends the compile error. I have #include in the .c file.
Can someone tell me how I can fix this?
Here ist the Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <mpi.h>
#include "random.h"
#include "md5tool.h"
/* horizontal size of the configuration */
#define XSIZE 1024
/* "ADT" State and line of states (plus border) */
typedef char State;
typedef State Line[XSIZE + 2];
/* determine random integer between 0 and n-1 */
#define randInt(n) ((int)(nextRandomLEcuyer() * n))
/* random starting configuration */
static void initConfig(Line *buf, int lines){
int x, y;
initRandomLEcuyer(424243);
for (y = 1; y <= lines; y++) {
for (x = 1; x <= XSIZE; x++) {
buf[y][x] = randInt(100) >= 50;
}
}
}
/* annealing rule from ChoDro96 page 34
* the table is used to map the number of nonzero
* states in the neighborhood to the new state
*/
static State anneal[10] = {0, 0, 0, 0, 1, 0, 1, 1, 1, 1};
/* a: pointer to array; x,y: coordinates; result: n-th element of anneal,
where n is the number of neighbors */
#define transition(a, x, y) \
(anneal[(a)[(y)-1][(x)-1] + (a)[(y)][(x)-1] + (a)[(y)+1][(x)-1] +\
(a)[(y)-1][(x) ] + (a)[(y)][(x) ] + (a)[(y)+1][(x) ] +\
(a)[(y)-1][(x)+1] + (a)[(y)][(x)+1] + (a)[(y)+1][(x)+1]])
/* treat torus like boundary conditions */
static void boundary(Line *buf, int lines){
int x,y;
for (y = 0; y <= lines+1; y++) {
/* copy rightmost column to the buffer column 0 */
buf[y][0 ] = buf[y][XSIZE];
/* copy leftmost column to the buffer column XSIZE + 1 */
buf[y][1+1] = buf[y][1 ];
}
for (x = 0; x <= XSIZE+1; x++) {
/* copy bottommost row to buffer row 0 */
buf[0][x ] = buf[lines][x];
/* copy topmost row to buffer row lines + 1 */
buf[lines+1][x] = buf[1][x ];
}
}
/* make one simulation iteration with lines lines.
* old configuration is in from, new one is written to to.
*/
//umschreiben
/**
static void simulate(Line *from, Line *to, int lines){
boundary(from, lines);
for (y = 1; y <= lines; y++) {
for (x = 1; x <= XSIZE; x++) {
to[y][x ] = transition(from, x , y);
}
}
}
*/
/* --------------------- measurement ---------------------------------- */
int main(int argc, char** argv){
int lines, its;
int i;
Line *from, *to, *temp, *next;
char* hash;
assert(argc == 3);
lines = atoi(argv[1]);
its = atoi(argv[2]);
from = malloc((lines + 2) * sizeof(Line));
to = malloc((lines + 2) * sizeof(Line));
MPI_Init(NULL, NULL);
// Get the number of processes
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Get the rank of the process
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
if(world_rank == 0){
int y;
next = malloc((lines + 2) * sizeof(Line));
initConfig(from, lines);
for (i = 0; i < its; i++) {
boundary(from, lines);
int z;
for(z = 0; z < world_size;z++){
if(z !=world_rank ){
MPI_SEND(from,(lines + 2) * sizeof(Line),MPI_CHARACTER,z,0,MPI_COMM_WORLD);
}
}
MPI_Status status;
for(z = 0; z < world_size;z++){
if(z !=world_rank ){
MPI_RECIVE(next,(lines + 2) * sizeof(Line),MPI_CHARACTER,z,1,&status);
if(status.MPI_ERROR){
//TODO
MPI_Abort(MPI_COMM_WORLD,1);
}
for (y = 1; y <= (lines%world_size+lines/world_size); y++) {
stpcpy(to[y*z],next[y*z]);
}
}
}
temp = from;
from = to;
to = temp;
}
hash = getMD5DigestStr(from[1], sizeof(Line) * (lines));
printf("hash: %s\n", hash);
free(next);
}else{
int x,y;
MPI_Status status;
for(i = 0; i < its; i++){
MPI_RECIVE(from,(lines + 2) * sizeof(Line),MPI_CHARACTER,0,0,&status);
if(status.MPI_ERROR){
MPI_Abort(MPI_COMM_WORLD,2);
}
for (y = 1; y <= (lines%world_size+lines/world_size); y++) {
for (x = 1; x <= XSIZE; x++) {
to[y*world_rank][x ] = transition(from, x , y*world_rank);
}
}
MPI_SEND(to,(lines + 2) * sizeof(Line),MPI_CHARACTER,0,1,MPI_COMM_WORLD);
}
}
MPI_Finalize();
free(from);
free(to);
free(hash);
return 0;
}
This is a C a Sequence implementation which I wrote for the university as a homework assignment.

Are you talking about MPI_Send and MPI_Recv ?
Don't know about any MPI_SEND or MPI_RECIV function...
I think you just mispelled them.
BTW: here is a great tutorial about how to use them http://mpitutorial.com/tutorials/mpi-send-and-receive/

Related

Efficient way to find rows with same elements in a 3D matrix in C

I have a 3D matrix mat[100][100][100]. What is the efficient way to find a row with same elements that appears in mat[0][][], mat[1][][],....,mat[99][][]?
A simple approach would be comparing each row of mat[0][][] to all rows of the remaining 99 matrices, but it wouldn't be very efficient(I guess). Is there a better way to do it?
To expand on the comment by #chux, the first step is to compute a hash value for each row of each matrix. That's 10000 hash values in all. The results should be stored in an array of 10000 structs.
struct info
{
int m; // the matrix number
int row; // the row number
uint32_t hash; // the hash value for mat[m][row]
};
static struct info hashArray[10000];
After filling in all 10000 entries of the hashArray, sort the array by hash value. Then you can simply scan the array to find any duplicate hash values. When you do find duplicates, you need to confirm by comparing the row elements.
I finally found some time to write the content addressable code. It turns out to be much faster than using hash tables. But the catch is that the code is way more complex and the program takes WAY more memory. My final opinion is that unless you really need the extra speed, stick with the hash table.
Some examples of test runs are given below. The argument to the program specify the number of unique rows. The program fills the rest with randomly chosen existing rows. Then the rows are shuffled. The program looks for all duplicate rows and reports the number of duplicate rows and the time it took for both hash and content addressable tables.
bing#mint5 ~ $ cc -O2 cattest.c -o cattest
bing#mint5 ~ $ ./cattest 500
CAT Test 9500 0.0083
Hash Test 9500 0.0499
bing#mint5 ~ $ ./cattest 5000
CAT Test 5000 0.0195
Hash Test 5000 0.1477
bing#mint5 ~ $ ./cattest 9000
CAT Test 1000 0.0321
Hash Test 1000 0.1092
/* content addressable table vs hash table */
/* written by Bing H Bang */
/* I DONOT give permission to any snot-nosed students to copy my work and turn it in
as their own */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include <errno.h>
#include <string.h>
#include <sys/time.h>
#include <sys/sysinfo.h>
double etime()
{
struct timeval tv;
double dt, df;
gettimeofday(&tv, NULL);
dt = (double)(tv.tv_sec);
df = ((double)(tv.tv_usec))/1000000.0;
return(dt+df);
}
struct CAT_entry
{
unsigned fval;
unsigned short rows[10000];
unsigned short num;
unsigned short used;
struct CAT_entry *next;
} *CAT[256] = {NULL};
struct CAT_entry stmem[10000];
int stidx = 0;
unsigned dat[100][10000];
char map[10000];
unsigned hasht[10000];
#define highbit (1 << ((sizeof(unsigned)*8)-1))
unsigned
rotxor(unsigned sum, unsigned v)
{
if((sum & highbit) == 0)
return ((sum << 1) ^ v);
else
return (((sum << 1) | 1) ^ v);
}
unsigned
compute_hash(int y)
{
int x;
unsigned sum = 0;
for(x = 0; x < 100; ++x)
sum = rotxor(sum, dat[x][y]);
return sum;
}
void
mk_hasht()
{
int y;
for(y = 0; y < 10000; ++y)
hasht[y] = compute_hash(y);
}
clearmap()
{
memset((void *)map, 0, 10000);
}
comprow(int y, int yd)
{
int x;
for(x = 0; x < 100; ++x)
if(dat[x][y] != dat[x][yd])
return 0;
return 1;
}
struct CAT_entry **
srch_CAT(unsigned value)
{
struct CAT_entry **p = &(CAT[value&255]);
static struct CAT_entry *r = NULL;
while(*p != NULL)
{
if((*p)->fval == value)
break;
if((*p)->fval > value)
return &r;
else
p = &((*p)->next);
}
return p;
}
void
add_entry(int y, unsigned value)
{
struct CAT_entry **p = &(CAT[value&255]), *q;
while(*p != NULL)
{
q = *p;
if(q->fval == value)
{
q->rows[q->num] = y;
q->num++;
return;
}
if(q->fval > value)
break;
p = &(q->next);
}
q = *p;
//*p = malloc(sizeof(struct CAT_entry));
*p = &stmem[stidx++];
(*p)->next = q;
q = *p;
q->fval = value;
q->num = 0;
q->used = 0;
}
void
mk_CAT()
{
int x,y;
struct CAT_entry **p, *q;
for(y = 0; y < 10000; y++)
add_entry(y, dat[0][y]);
for(x=0; x < 256; ++x)
{
p = &(CAT[x]);
while(*p != NULL)
{
q = *p;
if(q->num == 0)
{
*p = q->next;
//free(q);
}
else
p = &(q->next);
}
}
}
void
gen_data(int npat)
{
int x, y, rnum, limit;
unsigned r;
srandom(time(NULL));
rnum = npat * 100;
for(y = 0; y < rnum; ++y)
dat[y%100][y/100] = random();
for(y = npat; y < 10000; ++y)
{
rnum = random() % npat;
for(x = 0; x < 100; ++x)
dat[x][y]=dat[x][rnum];
}
for(y = 0; y < 10000; ++y)
{
rnum = random() % 10000;
if(rnum == y)
continue;
for(x = 0; x < 100; ++x)
{
r = dat[x][y];
dat[x][y]=dat[x][rnum];
dat[x][rnum] = r;
}
}
}
int
do_CATtest()
{
int y, yd, count = 0, i;
struct CAT_entry **p, *q;
mk_CAT();
clearmap();
for(y = 0; y < 9999; ++y)
{
if(map[y] == 0)
{
map[y] = 1;
if(*(p = srch_CAT(dat[0][y])) != NULL)
{
for(q = *p, i = 0; i < q->num; ++i)
{
yd = q->rows[i];
if(map[yd] == 0)
{
if(comprow(y, yd))
{
map[yd] = 1;
++count;
q->used++;
}
}
}
if(q->num <= q->used)
*p = q->next;
}
}
}
return count;
}
int
do_hashtest()
{
unsigned h;
int x, y, yd, count = 0;
mk_hasht();
clearmap();
for(y = 0; y < 9999; ++y)
{
if(map[y] != 0)
continue;
map[y] = 1;
h = hasht[y];
for(yd = y+1; yd < 10000; ++yd)
{
if(map[yd] != 0)
continue;
if(h == hasht[yd])
if(comprow(y, yd))
{
map[yd] = 1;
++count;
}
}
}
return count;
}
main(int c, char *v[])
{
int npat = 0, count;
double t1, t2;
if(c == 2)
npat = atoi(v[1]);
if(npat <= 0 || npat >= 10000)
{
puts("input param error");
exit(1);
}
gen_data(npat);
npat = 10000 - npat;
t1 = etime();
if((count = do_CATtest()) != npat)
{
printf("CAT test error, %d matches found, not %d", count, npat);
exit(1);
}
t2 = etime();
printf("CAT Test %d %.4f\n", npat, t2-t1);
t1 = etime();
if((count = do_hashtest()) != npat)
{
printf("hash test error, %d matches found, not %d", count, npat);
exit(1);
}
t2 = etime();
printf("Hash Test %d %.4f\n", npat, t2-t1);
}
Make a content addressable table of the first values in each row. Then go through each row, take the first value and look it up on the table. If the lookup returns multiple rows, then those rows should be checked for a match. The searched rows should be remembered as to increase efficiency because the checked rows need not be checked again. You'll end up with a list of identical row groupings.

CUDA reduction to find the maximum of an array

I am doing the Udacity course on parallel programming (homework 3) and can not figure out why I can't get the maximum in the array using parallel reduction (Udacity forums yet to provide solution). I am pretty certain that I have set up the arrays properly and that the algorithm is correct. I suspect that I have a problem with memory management (accessing out of bounds, incorrect array sizes, copying to and from). Please help! I am running this in the Udacity environment, not locally. Below is the code that I am currently using. For some reason when I change the fmaxf's to fminf's it does find the minimum.
#include "reference_calc.cpp"
#include "utils.h"
#include "math.h"
#include <stdio.h>
#include <cmath>
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
extern __shared__ float temp[];
if (myId < size) {
temp[tid] = d_logLum[myId];
}
else {
temp[tid] = d_logLum[tid];
}
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
if (myId < size) {
temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]);
} else {
temp[tid] = d_logLum[tid];
}
}
__syncthreads();
}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
__global__ void reduce_max_kernel2(float *d_out, float *d_in) {
// Reduce log Lum with Max Operator
int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;
for (unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
if (tid < s) {
d_in[myId] = fmaxf(d_in[myId + s], d_in[myId]);
}
__syncthreads();
}
if (tid == 0) {
d_out[0] = d_in[0];
}
}
void your_histogram_and_prefixsum(const float* const d_logLuminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
//TODO
/*Here are the steps you need to implement
1) find the minimum and maximum value in the input logLuminance channel
store in min_logLum and max_logLum
2) subtract them to find the range
3) generate a histogram of all the values in the logLuminance channel using
the formula: bin = (lum[i] - lumMin) / lumRange * numBins
4) Perform an exclusive scan (prefix sum) on the histogram to get
the cumulative distribution of luminance values (this should go in the
incoming d_cdf pointer which already has been allocated for you) */
//int size = 1 << 18;
int points = numRows * numCols;
int logPoints = ceil(log(points)/log(2));
int sizePow = logPoints;
int size = pow(2, sizePow);
int numThreads = 1024;
int numBlocks = size / numThreads;
float *d_out;
float *d_max_out;
checkCudaErrors(cudaMalloc((void **) &d_out, numBlocks * sizeof(float)));
checkCudaErrors(cudaMalloc((void **) &d_max_out, sizeof(float)));
cudaDeviceSynchronize();
reduce_max_kernel<<<numBlocks, numThreads, sizeof(float)*numThreads>>>(d_out, d_logLuminance, points);
cudaDeviceSynchronize();
reduce_max_kernel2<<<1, numBlocks>>>(d_max_out, d_out);
float h_out_max;
checkCudaErrors(cudaMemcpy(&h_out_max, d_max_out, sizeof(float), cudaMemcpyDeviceToHost));
printf("%f\n", h_out_max);
checkCudaErrors(cudaFree(d_max_out));
checkCudaErrors(cudaFree(d_out));
}
You are trying to reproduce the reduce2 reduction kernel of the CUDA SDK reduction sample. Robert Crovella has already spot two mistakes that you have made in your code. Besides them, I think you are also mistakenly initializing the shared memory.
Below, please find a complete working example constructed around your attempt. I have left the wrong instructions of your approach.
#include <thrust\device_vector.h>
#define BLOCKSIZE 256
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {
int tid = threadIdx.x; // Local thread index
int myId = blockIdx.x * blockDim.x + threadIdx.x; // Global thread index
extern __shared__ float temp[];
// --- Loading data to shared memory. All the threads contribute to loading the data to shared memory.
temp[tid] = (myId < size) ? d_logLum[myId] : -FLT_MAX;
// --- Your solution
// if (myId < size) { temp[tid] = d_logLum[myId]; } else { temp[tid] = d_logLum[tid]; }
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s=blockDim.x/2; s>0; s>>=1)
{
if (tid < s) { temp[tid] = fmaxf(temp[tid], temp[tid + s]); }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Your solution
//for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
// if (tid < s) { if (myId < size) { temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]); } else { temp[tid] = d_logLum[tid]; } }
// __syncthreads();
//}
if (tid == 0) {
d_out[blockIdx.x] = temp[0];
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 10;
thrust::device_vector<float> d_vec(N,3.f); d_vec[4] = 4.f;
int NumThreads = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
int NumBlocks = (N + NumThreads - 1) / NumThreads;
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(int) : NumThreads * sizeof(int);
// --- reduce2
thrust::device_vector<float> d_vec_block(NumBlocks);
reduce_max_kernel<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec_block.data()), thrust::raw_pointer_cast(d_vec.data()), N);
// --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
thrust::host_vector<float> h_vec_block(d_vec_block);
float result_reduce0 = -FLT_MAX;
for (int i=0; i<NumBlocks; i++) result_reduce0 = fmax(h_vec_block[i], result_reduce0);
printf("Result = %f\n",result_reduce0);
}

MPI_AllGather not gather properly...all elements end up the same value?

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
#define h 1
#define XY0 0
#define MAX_XY 5
#define N 2 //particles per subdomain
#define BLOCKS 4
#define a 1
#define b 1
float velocityX(float x, float y);
float velocityY(float x, float y);
int malloc2dfloat(float ***array, int length);
int main (int argc, char **argv)
{
typedef struct {
float xcoord;
float ycoord;
float velx;
float vely;
} particle;
int points= (int) floor((MAX_XY - XY0)/h) + 1;
int procsize = 2;
int myid, nproc;
MPI_Datatype particletype, oldtypes[1];
MPI_Aint offset[1], extent;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
int startElementX, startElementY, endElementX, endElementY;
particle* sub_pars = (particle*)malloc(sizeof(particle)*N);
offset[0] = 0;
int blockcounts[1];
blockcounts[0] = 4;
oldtypes[0] = MPI_FLOAT;
MPI_Type_struct(1, blockcounts, offset, oldtypes, &particletype);
MPI_Type_commit(&particletype);
particle* particles = (particle*)malloc(sizeof(particle) * N * procsize*procsize);
if (nproc != procsize*procsize){
printf("Must use np=4 -- split into 4 blocks");
MPI_Abort(MPI_COMM_WORLD,1);
}
srand(time(NULL)+myid);
if (myid == 0)
{
float mins[4];
startElementX = 0;
startElementY = 0;
endElementX = (points/procsize)-1;
endElementY = (points/procsize) -1;
}
else if (myid == 1)
{
startElementX = 0;
startElementY = (points/procsize);
endElementX = (points/procsize) -1;
endElementY = points - 1;
}
else if (myid == 2)
{
startElementX = (points/procsize);
startElementY = 0;
endElementX = points - 1;
endElementY = (points/procsize) -1;
}
else
{
startElementX = (points/procsize);
startElementY = (points/procsize);
endElementX = points-1;
endElementY = points-1;
}
int i;
float localmin;
float mag;
for (i=0; i<N; i++)
{
sub_pars[i].xcoord = ((startElementX + rand()/(RAND_MAX / (endElementX-startElementX+1)+1)))*h + XY0;
printf("%f\n", sub_pars[i].xcoord);
sub_pars[i].ycoord = ((startElementY + rand()/(RAND_MAX / (endElementY-startElementY+1)+1)))*h + XY0;
sub_pars[i].velx = velocityX(sub_pars[i].xcoord, sub_pars[i].ycoord);
sub_pars[i].vely = velocityY(sub_pars[i].xcoord, sub_pars[i].ycoord);
mag = sqrt(sub_pars[i].velx*sub_pars[i].velx + sub_pars[i].vely*sub_pars[i].vely);
if (i==0 || localmin > mag) localmin = mag;
}
printf("localmin of %d is %.2f \n", myid, localmin);
MPI_Allgather(&sub_pars, 1, particletype, particles ,1, particletype, MPI_COMM_WORLD);
MPI_Finalize();
if(myid == 0)
{
int k;
for (k=0; k<N*4; k++)
{
printf("test %.2f \n", particles[i].xcoord);
}
}
return 0;
}
float velocityX(float x, float y)
{
float temp = (a+(b*(y*y-x*x))/((x*x+y*y)*(x*x+y*y)));
return temp;
}
float velocityY(float x, float y)
{
float temp = (-1*(2*b*x*y)/((x*x+y*y)*(x*x+y*y)));
return temp;
}
It just returns the same value for all the particles, but I know they are being calculate correctly within each thread, so something is wrong with my MPI_Allgather, can someone please explain how it should look?
You have made a very common mistake: the & (address-of) operator in the first argument that you pass to MPI_Allgather is unnecessary. sub_pars is already a pointer and calling MPI_Allgather with &sub_pars passes a pointer to the pointer (a location somewhere in the stack frame of the main() routine) instead of pointer to the actual data.

How to fix my Pthreads code about Mandelbrot set?

I have the following Pthreads code about calculating and creating a picture of the Mandelbrot set. My code in C works just fine and it prints the resulting picture nicely. The point is that using the below code, I am able to compile the code and execute it. Afterwards, if I try to view the resulting .ppm file in Gimp, it simply cannot open it. I guess I'm doing something wrong in my code. If someone could help me I would be glad.
// mandpthread.c
// to compile: gcc mandpthread.c -o mandpthread -lm -lrt -lpthread
// usage: ./mandpthread <no_of_iterations> <no_of_threads> > output.ppm
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <assert.h>
#include <pthread.h>
typedef struct {
int r, g, b;
} rgb;
int NITERATIONS, NTHREADS;
rgb **m;
void color(rgb **m, int x, int y, int red, int green, int blue)
{
m[y][x].r = red;
m[y][x].g = green;
m[y][x].b = blue;
}
void mandelbrot(int tid)
{
int w = 600, h = 400, x, y;
// each iteration, it calculates: newz = oldz*oldz + p,
// where p is the current pixel, and oldz stars at the origin
double pr, pi; // real and imaginary part of the pixel p
double newRe, newIm, oldRe, oldIm; // real and imaginary parts of new and old z
double zoom = 1, moveX = -0.5, moveY = 0; // you can change these to zoom and change position
int start = tid * NITERATIONS/NTHREADS;
int end = (tid+1) * (NITERATIONS/NTHREADS) - 1;
//loop through every pixel
for(y = 0; y < h; y++) {
for(x = 0; x < w; x++) {
// calculate the initial real and imaginary part of z,
// based on the pixel location and zoom and position values
pr = 1.5 * (x - w / 2) / (0.5 * zoom * w) + moveX;
pi = (y - h / 2) / (0.5 * zoom * h) + moveY;
newRe = newIm = oldRe = oldIm = 0; //these should start at 0,0
// i will represent the number of iterations
int i;
// start the iteration process
for(i = start; i <= end; i++) {
// remember value of previous iteration
oldRe = newRe;
oldIm = newIm;
// the actual iteration, the real and imaginary part are calculated
newRe = oldRe * oldRe - oldIm * oldIm + pr;
newIm = 2 * oldRe * oldIm + pi;
// if the point is outside the circle with radius 2: stop
if((newRe * newRe + newIm * newIm) > 4) break;
}
if(i == NITERATIONS)
color(m, x, y, 0, 0, 0); // black
else
{
// normalized iteration count method for proper coloring
double z = sqrt(newRe * newRe + newIm * newIm);
int brightness = 256. * log2(1.75 + i - log2(log2(z))) / log2((double)NITERATIONS);
color(m, x, y, brightness, brightness, 255);
}
}
}
}
// worker function which will be passed to pthread_create function
void *worker(void *arg)
{
int tid = (int)arg;
mandelbrot(tid);
}
int main(int argc, char *argv[])
{
pthread_t* threads;
int i, j, rc;
if(argc != 3)
{
printf("Usage: %s <no_of_iterations> <no_of_threads> > output.ppm\n", argv[0]);
exit(1);
}
NITERATIONS = atoi(argv[1]);
NTHREADS = atoi(argv[2]);
threads = (pthread_t*)malloc(NTHREADS * sizeof(pthread_t));
m = malloc(400 * sizeof(rgb *));
for(i = 0; i < 400; i++)
m[i] = malloc(600 * sizeof(rgb));
// declaring the needed variables for calculating the running time
struct timespec begin, end;
double time_spent;
// starting the run time
clock_gettime(CLOCK_MONOTONIC, &begin);
printf("P6\n# AUTHOR: ET\n");
printf("%d %d\n255\n",600,400);
for(i = 0; i < NTHREADS; i++) {
rc = pthread_create(&threads[i], NULL, worker, (void *)i);
assert(rc == 0); // checking whether thread creating was successfull
}
for(i = 0; i < NTHREADS; i++) {
rc = pthread_join(threads[i], NULL);
assert(rc == 0); // checking whether thread join was successfull
}
// printing to file
for(i = 0; i < 400; i++) {
for(j = 0; j < 600; j++) {
fputc((char)m[i][j].r, stdout);
fputc((char)m[i][j].g, stdout);
fputc((char)m[i][j].b, stdout);
}
}
// ending the run time
clock_gettime(CLOCK_MONOTONIC, &end);
// calculating time spent during the calculation and printing it
time_spent = end.tv_sec - begin.tv_sec;
time_spent += (end.tv_nsec - begin.tv_nsec) / 1000000000.0;
fprintf(stderr, "Elapsed time: %.2lf seconds.\n", time_spent);
for(i = 0; i < 400; i++)
free(m[i]);
free(m);
free(threads);
return 0;
}
The newest version of your code works for me with 100 iterations and 1 thread.
Doing two threads fails, because the ppm file has 2 headers one from each thread.
If I delete one of the headers, the image loads but the colours are off and there's a glitch in the image.

Mandelbrot message queue blocking - C

I'm having problems in using message queues in this program
it is supposed to launch a number of processes that is passed by argument to the program by the command prompt but it only launches and calculates the points for one process... the others don't get to launch ..
Please help me .
This is the program that creates the message queues and outputs the selected data into pgm format
only the first process gets to run the other processes don't
can anybody tell me why ?
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <stdlib.h>
#include <string.h>
void output_pgm(char *filename,double *buffer, int nx, int ny, double max) {
int i;
FILE *file;
file = fopen(filename,"w");
fprintf(file,"P2\n");
fprintf(file,"%d %d\n",nx,ny);
fprintf(file,"%d\n",(int)max);
for (i=0; i<nx*ny; i++) {
if (!(i%nx)) fprintf(file,"\n");
fprintf(file,"%d ",(int)buffer[i]);
}
fclose(file);
}
void main(int argc,char *argv[]) {
if(argc != 2) {
} else {
int n = atoi(argv[1]);
int i = 0;
struct msgbuf {
long mtype;
int x;
int y;
double value;
};
struct envio {
long mtype;
long type;
int ny;
double yM1;
double yM2;
};
key_t key = 123;
key_t key2 = 124;
int msgflg = IPC_CREAT | 0666;
int msqid = msgget(key,msgflg);
int msqid2 = msgget(key2,msgflg);
switch(fork()) {
case -1:
printf("Erro de fork");
break;
case 0:
printf("Oi: %d\n",n);
double *b;
int x,y,i,m;
double *ptr = b = malloc(1000*1000*sizeof(double));
printf("Chego(1)\n");
struct msgbuf a;
struct envio c;
size_t buflen = sizeof(a) - sizeof(long);
size_t len2 = sizeof(c) - sizeof(long);
printf("Chego(2)\n");
int msid = msgget(key,msgflg);
int msid2 = msgget(key2,msgflg);
printf("Chego(3)\n");
double aux = -1.0;
double multiplier = ((1.0/n) * 2);
c.mtype = 300;
int ny = (int)(1000/n);
for(i = 0; i < n; i++) {
c.type = (i+1);
c.ny = (int)(1000/n);
c.yM1 = aux;
c.yM2 = aux+multiplier;
printf("Chego aqui(2)\n");
if(msgsnd(msid2,&c,len2,0) < 0) {
perror("Erro do 1o envio\n");
}
}
for(m = 0; m < n; m++) {
printf("Entrei no ciclo(1)\n");
for(y = 0; y <ny ;y++) {
//printf("Chego(4)\n");
for(x = 0; x < 1000;x++) {
if(msgrcv(msid,&a,buflen,(long)(m+1),0) < 0) {
perror("Erro na recepcao:\n ");
}
//printf("Chego(5)\n");
b[a.y * ny + a.x] = a.value;
}
}
b = b + (1000/n)*1000;
}
output_pgm("mandel.pgm", ptr, 1000, 1000, 255);
//msgctl(msid, IPC_RMID, NULL);
//msgctl(msid2, IPC_RMID, NULL);
printf("Processo 1\n");
break;
default:
for(i = 0;i < n;i++) {
switch(fork()) {
case -1:
printf("Erro de fork");
case 0:
exit(0);
break;
default:
printf("Fui lancado\n");
execlp("/home/hyper/Documents/SO2/TP3-4/rec","rec",0);
break;
}
}
break;
}
}
};
The for loop in this program is run only once
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <stdlib.h>
#include <string.h>
double type;
struct senbuf {
long mtype;
int x;
int y;
double value;
};
int max_iterations = 256;
double compute_point(double ci, double cr) {
int iterations = 0;
double zi = 0;
double zr = 0;
while ((zr*zr + zi*zi < 4) && (iterations < max_iterations)) {
double nr, ni;
/* Z <-- Z^2 + C */
nr = zr*zr - zi*zi + cr;
ni = 2*zr*zi + ci;
zi = ni;
zr = nr;
iterations ++;
}
return iterations;
}
/* The "compute" function computes the Mandelbrot function over every
point on a grid that is "nx" points wide by "ny" points tall, where
(xmin,ymin) and (xmax,ymax) give two corners of the region the
complex plane.
*/
void compute(int msqid,int nx, int ny, double xmin, double xmax,
double ymin, double ymax,long type2 ) {
double delta_x, delta_y;
int x, y;
struct senbuf sen;
delta_x = (xmax - xmin)/nx;
delta_y = (ymax - ymin)/ny;
size_t buflen = sizeof(sen) - sizeof(long);
for (y=0; y<ny; y++) {
//printf("Ja entrei aqui");
double y_value = ymin + delta_y * y;
for (x=0; x<nx; x++) {
double x_value = xmin + delta_x * x;
sen.mtype = type;
sen.x = x;
sen.y = y;
sen.value = compute_point(x_value,y_value);
if(msgsnd(msqid,&sen,buflen,0) < 0) {
perror("Erro no envio:");
};
// printf("%f",sen.a[y*nx + x]);
//buffer[y*nx + x] = compute_point(x_value, y_value);
}
}
printf("Ja mandei %d\n",type2);
/*sen.mtype=500;
sen.test = -1;
printf("Ja to a sair\n");
msgsnd(msqid,&sen,buflen,IPC_NOWAIT);*/
}
/* Output the data contained in the buffer to a Portable Greymap format
image file. The parameter "max" should be an upper bound for the
data values in the buffer.
*/
void output_pgm(char *filename,double *buffer, int nx, int ny, double max) {
int i;
FILE *file;
file = fopen(filename,"w");
fprintf(file,"P2\n");
fprintf(file,"%d %d\n",nx,ny);
fprintf(file,"%d\n",(int)max);
for (i=0; i<nx*ny; i++) {
if (!(i%nx)) fprintf(file,"\n");
fprintf(file,"%d ",(int)buffer[i]);
}
fclose(file);
}
int main()
{
int msqid;
int msqid2;
struct recep {
long mtype;
long type;
int ny;
double yM1;
double yM2;
};
struct recep a;
size_t len = sizeof(a) - sizeof(long);
key_t key = 124;
msqid = msgget(key, 0666);
if(msgrcv(msqid, &a, len, 300, 0) < 0) {
perror("Error checking");
};
printf("Dados :\n Tipo : %d\n Ny: %d\n,yM1 : %f\n yM2: %f\n",a.type,a.ny,a.yM1,a.yM2);
type = a.type;
printf("Vou iniciar o compute");
key_t key2 = 123;
msqid2 = msgget(key2,0666);
compute(msqid2,1000,a.ny, -1.0, 1.0,a.yM1,a.yM2,a.type);
}
In your first switch statement you create one child doing whatever it is doing. The parent falls into a second switch statement in which the child immediately exits and the parent is overlaid with the "rec" executable. The parent no longer executes at point - it is the program "rec". You are never going to execute more than one pass of the loop because the code that is executing is gone at that point. If you want multiple instances of "rec" running you should be using execlp on the children not the parent.
EDIT
There are two system calls wait and waitpid that provides various options. The simpler of these is wait and should be sufficient for what you are doing. Define and increment a counter in the parent for each child you create. Then instead of just exiting the parent you wait for all the children to finish. Something as simple as this should suffice:
for (int i = 0; i < counter; i++)
{
wait(NULL);
}

Resources