Good day,
I have an openCL kernel that is using the Leibniz formula to calculate pi. Currently my issue is that the value I get back isn't pi, but instead just 4.
__kernel void calculatePi(int numIterations, __global float *outputPi,
__local float* local_result, int numWorkers)
{
__private const uint gid = get_global_id(0);
__private const uint lid = get_local_id(0);
__private const uint offset = numIterations*gid*2;
__private float sum = 0.0f;
// Have the first worker initialize local_result
if (gid == 0)
{
for (int i = 0; i < numWorkers; i++)
{
local_result[i] = 0.0f;
}
}
// Have all workers wait until this is completed
barrier(CLK_GLOBAL_MEM_FENCE);
// Have each worker calculate their portion of pi
// This is a private value
for (int i=0; i<numIterations; i++)
{
if (i % 2 == 0)
{
sum += 1 / (1 + 2*i + offset);
}
else
{
sum -= 1 / (1 + 2*i + offset);
}
}
// Have each worker move their value to the appropriate
// local_result slot so that the first worker can see it
// when reducing next
local_result[gid] = sum;
// Make sure all workers complete this task before continuing
barrier(CLK_LOCAL_MEM_FENCE);
// Have the first worker add up all of the other worker's values
// to get the final value
if (lid == 0)
{
outputPi[0] = 0;
for (int i = 0; i < numWorkers; i++)
{
outputPi[0] += local_result[i];
}
outputPi[0] *= 4;
}
}
I've steered all of my inputs to my output to verify that they are what I expect. numIterations is 16 and numWorkers is also 16.
When sum is calculated then for the first worker, I would expect the sum to be
1 - 1/3 + 1/5 - 1/7 + 1/9 - 1/11 + 1/13 - 1/15 + 1/17 - 1/19 + 1/21 - 1/23 + 1/25 - 1/27 + 1/29 - 1/31
Using this calculator for the first 16 times, I expect the result to be around 3.2 : https://scratch.mit.edu/projects/19546118/
If I modify my last bit of code to be this so that I can look at a worker's calculated value of "sum":
// Have the first worker add up all of the other worker's values
// to get the final value
if (lid == 0)
{
outputPi[0] = sum * 4;
}
Then the value returned for the first worker is 4 instead of the expected 3.2
Modifying to any other number except lid == 0, all other workers are reporting their sum as 0. So my question is why is that the calculated value? Am I doing something wrong with my sum variable? This should be a private variable and the for loop should be sequential from my understanding for each worker but numerous loops are executed in parallel based on the number of workers.
Here's a link to my github that has the kernel and main code uploaded.
https://github.com/TreverWagenhals/TreverWagenhals/tree/master/School/Heterogeneous%20Computing/Lab2
Thanks
you are performing integral divisions in your code, should be floats:
if (i % 2 == 0)
{
sum += 1. / (1 + 2*i + offset); // notice the 1.
}
else
{
sum -= 1. / (1 + 2*i + offset);
}
Good night
I'm attending to a parallel programming course. The teacher gave us an assignment that involves domain partition for stencil calculations. For this type of calculations (finite difference) the most common way to parallelize a code is to partition the domain and create some ghost zones (halos).
For better understand the creation of ghost zones in MPI I programmed this simple example that initialize some arrays with inner values = 123 and boundary values 88. At the end of all communication, all ghost values should remain 8. In one node I'm getting 123 values.
Serial (no ghosts):
123 - 123 - ... - 123 - 123
Two partitions:
123 - 123 - ... - 88 ||| 88 - ... - 123 - 123
Three partitions:
123 - 123 - ... - 88 ||| 88 - ... - 123 - 123 - 88 ||| 88 - ... - 123 - 123
Aside from this bug, the main question here is about the correct approach to create and maintain ghost zones updated. Is there a cleaner solution for this aside from my messy if(myid == .... else if( myid = ... else type of implementation ? How people usually implement this kind of parallelism ?
#include<mpi.h>
#include<stdio.h>
#include<stdlib.h>
int WhichSize(int mpiId, int numProc, int tam);
int main(int argc, char *argv[]){
int i;
int localSize;
int numProc;
int myid;
int leftProc;
int rightProc;
int * myArray;
int fullDomainSize = 16;
MPI_Request request;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numProc);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
// Lets get each partition size.
localSize = WhichSize(myid, numProc, fullDomainSize);
// Allocate arrays acording to proc number.
if(numProc == 1){
//printf("Allocating Array for serial usage\n");
myArray = (int*)malloc(localSize*sizeof(int));
} else if(numProc == 2) {
//printf("Allocating Array for 2 proc usage\n");
myArray = (int*)malloc((localSize+ 1)*sizeof(int));
} else if(numProc > 2) {
if (myid == 0 || myid == numProc - 1){
//printf("Allocating array for boundary nodes usage\n");
myArray = (int*)malloc((localSize+ 1)*sizeof(int));
} else {
//printf("Allocating array for inner nodes usage\n");
myArray = (int*)malloc((localSize+ 2)*sizeof(int));
}
}
// Now we will fill the arrays with a dummy value 123. For the
// boundaries (ghosts) we will fill than with 80. Just to differe
// ntiate.
if(numProc == 1){
//printf("----------------------------------------\n");
//printf("Filling the serial array with values... \n");
for (i = 0; i<localSize; i++){
myArray[i] = 123;
}
} else if(numProc == 2) {
////printf("------------------------------------------------\n");
//printf("Filling array for two proc usage with values... \n");
for (i = 0; i<localSize; i++){
myArray[i] = 123;
}
// ghost.
myArray[localSize+1] = 8;
} else if(numProc > 2) {
if (myid == 0 || myid == numProc - 1){
//printf("--------------------------------------------------\n");
//printf("Filling boundary node arrays usage with values... \n");
for (i = 0; i<localSize; i++){
myArray[i] = 123;
}
// ghosts.
myArray[localSize+1] = 8;
} else {
//printf("--------------------------------------------------\n");
//printf("Filling inner node arrays usage with values... \n");
for (i = 0; i<localSize; i++){
myArray[i] = 123;
}
// ghosts.
myArray[localSize+1] = 8;
myArray[0] = 8;
}
}
// Now lets comunicate the ghosts with MPI_Sendrecv().
if(numProc == 1){
//printf("Serial usage, no ghost to comunicate \n");
} else if(numProc == 2) {
if (myid == 0){
//printf("Sending ghost value from proc %d to %d\n", myid, myid + 1);
MPI_Isend(&myArray[localSize+1],
1,
MPI_INT,
1,
12345,
MPI_COMM_WORLD,
&request);
} else if (myid == 1) {
//printf("Receiving ghost value from proc %d to %d\n", myid-1, myid);
MPI_Irecv(&myArray[localSize+1],
1,
MPI_INT,
0,
12345,
MPI_COMM_WORLD,
&request);
}
} else if(numProc > 2) {
if (myid == 0){
rightProc = myid + 1;
if (myid == 0){
//printf("-------------------------------\n");
//printf("Communicating Boundary ghosts !\n");
//printf("-------------------------------\n");
//printf("Sending ghost value from proc %d to %d\n", myid, myid + 1);
MPI_Isend(&myArray[localSize+1],
1,
MPI_INT,
rightProc,
12345,
MPI_COMM_WORLD,
&request);
} else if (myid == rightProc) {
//printf("Receiving ghost value from proc %d to %d\n", myid-1, myid);
MPI_Irecv(&myArray[localSize+1],
1,
MPI_INT,
0,
12345,
MPI_COMM_WORLD,
&request);
}
} else if (myid == numProc - 1) {
leftProc = myid - 1;
if (myid == numProc - 1){
//printf("-------------------------------\n");
//printf("Communicating Boundary ghosts !\n");
//printf("-------------------------------\n");
////printf("Sending ghost value from proc %d to %d\n", myid, myid + 1);
MPI_Isend(&myArray[localSize+1],
1,
MPI_INT,
leftProc,
12345,
MPI_COMM_WORLD,
&request);
} else if (myid == leftProc) {
rightProc = myid + 1;
//printf("Receiving ghost value from proc %d to %d\n", myid-1, myid);
MPI_Irecv(&myArray[localSize+1],
1,
MPI_INT,
rightProc,
12345,
MPI_COMM_WORLD,
&request);
}
} else {
//printf("-------------------------------\n");
//printf("Communicating Inner ghosts baby\n");
//printf("-------------------------------\n");
leftProc = myid - 1;
rightProc = myid + 1;
// Communicate tail ghost.
if (myid == leftProc) {
MPI_Isend(&myArray[localSize+1],
1,
MPI_INT,
rightProc,
12345,
MPI_COMM_WORLD,
&request);
} else if (myid == rightProc){
MPI_Irecv(&myArray[localSize+1],
1,
MPI_INT,
leftProc,
12345,
MPI_COMM_WORLD,
&request);
}
// Communicate head ghost.
if (myid == leftProc) {
MPI_Isend(&myArray[0],
1,
MPI_INT,
rightProc,
12345,
MPI_COMM_WORLD,
&request);
} else if (myid == rightProc){
MPI_Irecv(&myArray[0],
1,
MPI_INT,
leftProc,
12345,
MPI_COMM_WORLD,
&request);
}
}
}
// Now I Want to see if the ghosts are in place !.
if (myid == 0){
printf("The ghost value is: %d\n", myArray[localSize + 1]);
} else if (myid == numProc - 1){
printf("The ghost value is: %d\n", myArray[0]);
} else {
printf("The head ghost is: %d\n", myArray[0]);
printf("The tail ghost is: %d\n", myArray[localSize + 1]);
}
MPI_Finalize();
exit(0);
}
int WhichSize(int mpiId, int numProc, int tam){
double resto;
int tamLocal;
tamLocal = tam / numProc;
resto = tam - tamLocal*numProc;
if (mpiId < resto) tamLocal = tamLocal + 1;
return tamLocal;
}
thank you guys !
Halos can be elegantly implemented in MPI using Cartesian virtual topologies and the send-receive operation.
First of all, having lots of rank-dependent logic in conditional operators makes the code hard to read and understand. It is way better when the code is symmetric, i.e. when all ranks execute the same code. Corner cases can be taken care of using the MPI_PROC_NULL null rank - a send to or receive from that rank results in a no-op. It is therefore enough to do:
// Compute the rank of the left neighbour
leftProc = myid - 1;
if (leftProc < 0) leftProc = MPI_PROC_NULL;
// Compute the rank of the right neighbour
rightProc = myid + 1;
if (rightProc >= numProc) rightProc = MPI_PROC_NULL;
// Halo exchange in forward direction
MPI_Sendrecv(&myArray[localSize], 1, MPI_INT, rightProc, 0, // send last element to the right
&myArray[0], 1, MPI_INT, leftProc, 0, // receive into left halo
MPI_COMM_WORLD);
// Halo exchange in reverse direction
MPI_Sendrecv(&myArray[1], 1, MPI_INT, leftProc, 0, // send first element to the left
&myArray[localSize+1], 1, MPI_INT, rightProc, 0, // receive into right halo
MPI_COMM_WORLD);
That code works for any rank, even for those at both ends - there either the source or the destination is the null rank and no actual transfer occurs in the corresponding direction. It also works with any number of MPI processes, from one to many. It requires that all ranks have halos on both sides, including those that don't really need it (the two corner ranks). One can store in those dummy halos useful things like boundary values (e.g. when solving PDEs) or simply live with the memory waste, which is usually negligible.
In your code, you use incorrectly non-blocking operations. Those are tricky and require care to be taken. MPI_Sendrecv could and should be used instead. It performs both send and receive operations at the same time and thus prevents deadlocks (as long as there is a matching receive for each send).
If the domain is periodic, then the rank computation logic becomes simply:
// Compute the rank of the left neighbour
leftProc = (myid - 1 + numProc) % numProc;
// Compute the rank of the right neighbour
rightProc = (myid + 1) % numProc;
Instead of doing the arithmetic, one could create a Cartesian virtual topology and then use MPI_Cart_shift to find the ranks of the two neighbours:
// Create a non-periodic 1-D Cartesian topology
int dims[1] = { numProc };
int periods[1] = { 0 }; // 0 - non-periodic, 1 - periodic
MPI_Comm cart_comm;
MPI_Cart_create(MPI_COMM_WORLD, 1, dims, periods, 1, &cart_comm);
// Find the two neighbours
MPI_Cart_shift(cart_comm, 0, 1, &leftProc, &rightProc);
The code for the halo exchange remains the same with the only difference that cart_comm should replace MPI_COMM_WORLD. MPI_Cart_shift automatically takes care of the corner cases and will return MPI_PROC_NULL when appropriate. The advantage of that method is that you can easily switch between non-periodic and periodic domains by simply flipping the values inside the periods[] array.
The halos have to be updates as often as necessary, which depends on the algorithm. With most iterative schemes, the update must happen at the beginning of each iteration. One could reduce the communication frequency by introducing multi-level halos and using the values in the outer levels to compute the values in the inner ones.
To conclude, your main function could be reduced to (without using a Cartesian topology):
int main(int argc, char *argv[]){
int i;
int localSize;
int numProc;
int myid;
int leftProc;
int rightProc;
int * myArray;
int fullDomainSize = 16;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numProc);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
// Compute neighbouring ranks
rightProc = myid + 1;
if (rightProc >= numProc) rightProc = MPI_PROC_NULL;
leftProc = myid - 1;
if (leftProc < 0) leftProc = MPI_PROC_NULL;
// Lets get each partition size.
localSize = WhichSize(myid, numProc, fullDomainSize);
// Allocate arrays.
myArray = (int*)malloc((localSize+ 2)*sizeof(int));
// Now we will fill the arrays with a dummy value 123. For the
// boundaries (ghosts) we will fill than with 80. Just to differe
// ntiate.
//printf("--------------------------------------------------\n");
//printf("Filling node arrays usage with values... \n");
for (i = 1; i<localSize; i++){
myArray[i] = 123;
}
// ghosts.
myArray[localSize+1] = 8;
myArray[0] = 8;
//printf("-------------------------------\n");
//printf("Communicating Boundary ghosts !\n");
//printf("-------------------------------\n");
//printf("Sending ghost value to the right\n");
MPI_Sendrecv(&myArray[localSize], 1, MPI_INT, rightProc, 12345,
&myArray[0], 1, MPI_INT, leftProc, 12345,
MPI_COMM_WORLD);
//printf("Sending ghost value to the left\n");
MPI_Sendrecv(&myArray[1], 1, MPI_INT, leftProc, 12345,
&myArray[localSize+1], 1, MPI_INT, rightProc, 12345,
MPI_COMM_WORLD);
// Now I Want to see if the ghosts are in place !.
printf("[%d] The head ghost is: %d\n", myid, myArray[0]);
printf("[%d] The tail ghost is: %d\n", myid, myArray[localSize + 1]);
MPI_Finalize();
return 0;
}
I am trying to use MPI_Reduce() in C to compute a vector called phi with npts elements. To do that, I have allocated chunks of a long vector (longvec), to each process, sum these chunks separately and then sum the partial result of each processor at the end in process 0 to obtain an estimate of each element of phi.
I am getting very silly results... Can anyone tell me what mistake I am making in the code below?
double phie[npts];
phitemp = (double*) malloc (nprocs * sizeof(double));
for (i = 0; i < npts; i++) {
phitemp[rank] = 0;
for (x = rank * 10 + 1; x <= (rank + 1) * 10; x++) {
phitemp[rank] = phitemp[rank] + longvec[x] * vector[i]; }
}
MPI_Reduce(phitemp, & (((double *) phivec)[i]), 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
}
I would like to split a range of numbers into roughly equal size in C using OpenMP. For example, if I have a range from 7 through 24 and number of threads is 8. I would like the first thread to begin from 7 and end at 9. Second thread begin from 10 and end at 12. Third thread begin from 13 and end at 14. Fourth thread begin at 15 and end at 16 and so forth... Until the last thread begins at 23 and ends at 24. The code that I wrote is as follows, but it is not getting the previously explained results. I am wondering if there is something that I missed that I can do or is there a more efficient way of doing this? Your help is greatly appreciated.
Note of predefined declaration of variables based on the example given above:
first = 7
last = 24
size = 2 (which signifies the amount of numbers per thread)
r = 2 (r signifies remainder)
nthreads = 8
myid = is the thread ID in a range of 0 to 7
if (r > 0)
{
if (myid == 0)
{
start = first + myid*size;
end = start + size;
}
else if (myid == nthreads - 1)
{
start = first + myid*size + myid;
end = last;
}
else
{
start = first + myid*size + myid;
end = start + size;
}
}
else
{
start = first + myid*size;
if (myid == nthreads - 1) end = last;
else end = start + size - 1;
}
As far as I remember, #pragma omp parallel for automatically divides work between threads in equal chunks, and is ok in most situations.
However, if you want to do this manually, here is a piece of code which does what you want:
int len = last - first + 1;
int chunk = len / nthreads;
int r = len % nthreads;
if (myid < r) {
start = first + (chunk + 1) * myid;
end = start + chunk;
} else {
start = first + (chunk + 1) * r + chunk * (myid - r);
end = start + chunk - 1;
}
If there are no additional restrictions, such distribution is indeed optimal.
// assuming half-open interval
int n = ((end-begin) + omp_get_num_threads() - 1)/omp_get_num_threads();
int first = begin + n*omp_get_thread_num();
int last = max(first + n, end);
I am trying to perform an inverse and a pseudo-inverse filtering in the frequency domain.
However I am having trouble accessing DFT coefficients and multiplying DFT matrices afterwards, since I got complex numbers and, therefore, actually two matrices...
Basically the inverse filtering performs
F = G/H,
where F is the restored image, G is the blurred image and H is the kernel that blurred the image.
The pseudo-inverse needs to access the values in H, since if the value is near 0 it should be replaced in order to avoid problems in the restoration. For this we must change the H so that:
H(u,v) = 1/H(u,v) if H(u,v) > threshold
and = 0 otherwise
I have a kernel1 (h_1), and the images imf (restored) and img (blurred). Here is the code:
// compute the DFTs of the kernel (DFT_B) and the blurred image (DBF_A)
cvDFT( dft_A, dft_A, CV_DXT_FORWARD, complexInput1->height );
cvDFT( dft_B, dft_B, CV_DXT_FORWARD, complexInput2->height );
// the first type is the inverse fitlering
if (type == 1) {
printf("...performing inverse filtering\n");
// dividing the transforms
cvDiv(dft_A, dft_B, dft_C, 1);
}
// the second type is the pseudo-inverse filtering
else {
printf("...prepare kernel for pseudo-inverse filtering\n");
// will try to access the real values in order to see if value is above a threshold
cvSplit( dft_B, image_Re1, image_Im1, 0, 0 );
// pointers to access the data into the real and imaginary matrices
uchar * dRe1 = (uchar *)image_Re1->imageData;
uchar * dIm1 = (uchar *)image_Im1->imageData;
int width = image_Re1->width;
int height = image_Re1->height;
int step = image_Re1->widthStep;
image_Re2 = cvCreateImage(cvGetSize(image_Re1), IPL_DEPTH_32F, 1);
image_Im2 = cvCreateImage(cvGetSize(image_Im2), IPL_DEPTH_32F, 1);
// pointers to access the data into the real and imaginary matrices
// it will be the resulting pseudo-inverse filter
uchar * dRe2 = (uchar *)image_Re2->imageData;
uchar * dIm2 = (uchar *)image_Im2->imageData;
printf("...building kernel for pseudo-inverse filtering\n");
for ( i = 0; i < height; i++ ) {
for ( j = 0; j < width; j++ ) {
// generate the 1/H(i,j) value
if (dRe1[i * step + j] > threshold) {
float realsq = dRe1[i * step + j]*dRe1[i * step + j];
float imagsq = dIm1[i * step + j]*dIm1[i * step + j];
dRe2[i * step + j] = dRe1[i * step + j] / (realsq + imagsq);
dIm2[i * step + j] = -1 * (dIm1[i * step + j] / (realsq + imagsq));
}
else {
dRe2[i * step + j] = 0;
dIm2[i * step + j] = 0;
}
}
}
printf("...merging final kernel\n");
cvMerge(image_Re2, image_Im2, 0, 0, dft_B);
printf("...performing pseudo-inverse filtering\n");
cvMulSpectrums(dft_A, dft_B, dft_C, 1);
}
printf("...performing IDFT\n");
cvDFT(dft_C, dft_H, CV_DXT_INV_SCALE, 1);
printf("...getting size\n");
cvGetSubRect(dft_H, &tmp3, cvRect(0, 0, img->width, img->height));
printf("......(%d, %d) - (%d, %d)\n", tmp3.cols, tmp3.rows, restored->width, restored->height);
cvSplit( &tmp3, image_Re1, image_Im1, 0, 0 );
cvNamedWindow("re", 0);
cvShowImage("re", image_Re2);
cvWaitKey(0);
printf("...copying final image\n");
// error is in the line below
cvCopy(image_Re1, imf, NULL);
I have an error on the last line: --- OpenCV Error: Assertion failed (src.depth() == dst.depth() && src.size() == dst.size()) in cvCopy, file /build/buildd/opencv-2.1.0/src/cxcore/cxcopy.cpp, line 466
I know it have to do with the size or depth but I don't know how to control. Anyway, I tried to show the image_Re1 and it is empty...
Can anyone shed some light on it?
Seems like you didn't initialize your imf picture!
cvCopy needs a initialized matrix do a:
IplImage* imf= cvCreateImage(cvGetSize(image_Re1), IPL_DEPTH_32F, 1);
first and I think it'll work.
Also, you don't free the image space in this code (cvReleaseImage(&image))