Related
So I have this program I'm working on, and the gist of it is that I need to do some operations with threads, following this scheme: the j-th thread Hj calculates a group of 100 consecutive iterations of the sum, distributing the groups cyclically among all the threads. For example, if H = 4,
thread H2 does the calculation of iterations [100..199, 500..599, 900..999, ...].
To ensure no data races occur, each thread must work on a different sum variable.
Then, after joining the threads, compare the result achieved by the threads with the one computed sequentially.
Here is the code:
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <math.h>
#include <pthread.h>
#include <sys/time.h>
#define H 4
double res[H] = {0};
//Time function
float restar_tiempo(struct timeval *inicio, struct timeval *fin) {
return (fin->tv_sec - inicio->tv_sec) + 1e-6 * (fin->tv_usec - inicio->tv_usec);
}
//Thread function
void *_hilo(void *arg) {
int a = * ((int*)arg);
double pi = 0;
double n = 100 * a;
while (n < 10000000) {
res[a] += (pow(-1, n) / pow(4, n)) * ((2 / (4 * n + 1)) + (2 / (4 * n + 2)) + (1 / (4 * n + 3)));
pi++;
n++;
if ((int) n % 100 == 0)
n += (H - 1)*100;
}
printf("Result on thread[%d]: %f\n", a, res[a]);
pthread_exit(NULL);
}
int main() {
pthread_t hilo[H];
struct timeval in, mid, fin;
gettimeofday(&in, NULL);
for (int i = 0; i < H; i++) {
int* p = malloc(sizeof (int));
*p = i;
printf("Esto es i: %d\n", i);
res[i] = 0;
if (pthread_create(&hilo[i], NULL, _hilo, p) != 0) {
perror(" Error creando hilo");
exit(EXIT_FAILURE);
}
free(p);
}
//Join
for (int i = 0; i < H; i++)
pthread_join(hilo[i], NULL);
//Partial sum
double f = 0;
for (int i = 0; i < H; i++){
printf("Resultado parcial de hilo %d: %f\n", i, res[i]);
f += res[i];
}
//Total partial sum
printf("Resultado total: %lf\n", f);
//printf("Hola/n");
gettimeofday(&mid, NULL);
//Sequential sum
double s = 0;
for (double n = 0; n < 10000000; n++)
s += (pow(-1, n) / pow(4, n)) * ((2 / (4 * n + 1)) + (2 / (4 * n + 2)) + (1 / (4 * n + 3)));
//Print sequential
printf("Resultado secuencial: %f\n", s);
gettimeofday(&fin, NULL);
//Result diff
printf("Diferencia resultados: %f\n", fabs(f - s));
//Time threads
printf("Tiempo por hilos: %f\n", restar_tiempo(&in, &mid));
//Sequential time
printf("Tiempo secuencial: %f\n", restar_tiempo(&mid, &fin));
//Time diff
printf("Diferencia tiempos: %f\n", restar_tiempo(&in, &mid) - restar_tiempo(&mid, &fin));
return 0;
}
I can compile everything without warnings, but when I execute the program, the result produced by the first thread is erratic; it changes between executions (the rest of the threads display 0 because the terms they sum are very small).
Example with some added prints inside the thread function and after doing the join:
First execution:
This is i:0
This is i:1
This is i:2
This is i:3
//Inside the thread function
Thread result[2]: 0.000000
Thread result[2]: 0.000000
Thread result[3]: 0.000000
Thread result[0]: 3.141593
//After join
Partial result of thread 0: 3.141593
Partial result of thread 1: 0.000000
Partial result of thread 2: 0.000000
Partial result of thread 3: 0.000000
Total result: 3.141593
Sequential result: 3.141593
Difference results: 0.000000
Time per threads: 0.183857
Sequential time: 0.034788
Difference times: 0.149069
Second execution:
This is i:0
This is i:1
This is i:2
This is i:3
Thread result[2]: 0.000000
Thread result[0]: 6.470162
Thread result[0]: 6.470162
Thread result[3]: 0.000000
Partial result of thread 0: 6.470162
Partial result of thread 1: 0.000000
Partial result of thread 2: 0.000000
Partial result of thread 3: 0.000000
Total result: 6.470162
Sequential result: 3.141593
Difference results: 3.328570
Time per threads: 0.189794
Sequential time: 0.374017
Difference times: -0.184223
How can I make the sum work properly?
I think it has something to do with arg in the function _hilo, or with the subsequent cast to int a.
(Excuse the mix of languages: I speak Spanish, so most of the printfs are in Spanish. Don't mind them; the block with the example results has the translation.)
Okay, I solved it, but I don't fully understand why it works like this or why it caused issues. I just deleted the free(p) statement and now it works like a charm. If someone can enlighten me on why this happens, I'll be grateful.
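The likely explanation is that main() calls free(p) right after pthread_create() returns, so a thread may read *arg only after the memory has been released and reused by the next iteration's malloc(); several threads can then see the same index and race on the same res element, which would match the duplicated thread-0 results above. Below is a minimal sketch of the usual fix, assuming nothing beyond standard pthreads: pass each thread a pointer into an array that outlives all the threads, so nothing needs to be freed before the join.
#include <pthread.h>
#include <stdio.h>
#define H 4
static void *hilo(void *arg)
{
    int a = *(int *)arg;   /* safe: the pointed-to int lives until after the join */
    printf("thread %d running\n", a);
    return NULL;
}
int main(void)
{
    pthread_t th[H];
    int ids[H];            /* one slot per thread, valid for the threads' whole lifetime */
    for (int i = 0; i < H; i++) {
        ids[i] = i;
        if (pthread_create(&th[i], NULL, hilo, &ids[i]) != 0)
            return 1;
    }
    for (int i = 0; i < H; i++)
        pthread_join(th[i], NULL);
    return 0;
}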
Running time is about the same, regardless of the number of threads. I am having trouble figuring out why. I know that the threads are running in parallel as they are supposed to, but I don't have even a good guess as to why there would be no performance improvement. (approx. 21 seconds to find all primes less than 8 million, for both single and multiple threads) What is going on here?
typedef struct prime_finder_vars {
long from;
long to;
int idx;
} PrimeFinderVars;
int is_prime(long num) {
int limit = round(sqrt(num));
for (long i = 2; i <= limit; i++) {
if (num % i == 0)
return FALSE;
}
return TRUE;
}
void *prime_finder(void *pf) {
PrimeFinderVars *pf_vars = (PrimeFinderVars *) pf;
long next_cand = pf_vars->from;
while (next_cand < pf_vars->to) {
if (is_prime(next_cand)) {
++counts[pf_vars->idx];
}
next_cand += 2;
}
return pf;
}
int main(void) {
struct timespec start;
struct timespec end;
double start_sec, end_sec, elapsed_sec;
int sum = 0;
clock_gettime(CLOCK_REALTIME, &start);
pthread_t threads[NUM_THREADS];
PrimeFinderVars vars[NUM_THREADS];
int slice_size = SEARCH_RANGE / NUM_THREADS;
for (int i = 0; i < NUM_THREADS; i++) {
vars[i].from = i * slice_size + 1;
vars[i].to = (i + 1) * slice_size;
vars[i].idx = i;
pthread_create(&threads[i], NULL, prime_finder, &vars[i]);
}
for (int i = 0; i < NUM_THREADS; i++) {
pthread_join(threads[i], NULL);
sum += counts[i];
}
clock_gettime(CLOCK_REALTIME, &end);
start_sec = start.tv_sec + start.tv_nsec / NANO_PER_SEC;
end_sec = end.tv_sec + end.tv_nsec / NANO_PER_SEC;
elapsed_sec = end_sec - start_sec;
}
This is an interesting question. Everything Mikhail Vladimirov says is true, but I decided to do some testing on my laptop to see what I got. My laptop is a modern MacBook Pro with an eight-core i9. I'm not sure whether it is hyper-threaded or not, but here are my results:
I tested with the number of threads varying between 1 and 50 and a search range of 10,000,000.
With one thread it takes nearly eleven seconds but this drops rapidly to around 1.5 seconds with 16 threads, and it doesn't get any better after that.
My conclusions are:
1. My comment on Mikhail's answer about the cost of the thread functions is wrong, at least on my platform. I see no increased overhead with more threads.
2. There's something wrong with your thread library.
I think you probably need to satisfy yourself that the threads really are running in parallel on separate cores. One explanation of your results could be that they are all competing for the same CPU.
Just for fun I decided to try profiling the program.
Each step represents another core going to 100%. I'm not sure why the part with three threads doesn't go to 300%, but you can see with four threads that it goes up to 400% straight away but comes down in steps of 100%. This is the effect of you splitting the task into equal ranges and the threads dealing with lower numbers finishing sooner.
The first 16 data points
Threads Time
1 11.893418
2 7.352520
3 5.117278
4 4.062026
5 3.511605
6 2.892274
7 2.401555
8 2.172573
9 1.910534
10 1.864023
11 1.860944
12 1.369277
13 1.628883
14 1.196646
15 1.626215
16 1.548878
The code I used to produce the test results (slightly modified from yours).
#include <stdio.h>
#include <pthread.h>
#include <math.h>
#include <stdbool.h>
#include <time.h>   /* clock_gettime(), struct timespec */
#define SEARCH_RANGE 10000000
#define NANO_PER_SEC 1000000000
typedef struct prime_finder_vars {
long from;
long to;
int* count;
} PrimeFinderVars;
int is_prime(long num) {
int limit = round(sqrt(num));
for (long i = 2; i <= limit; i++) {
if (num % i == 0)
return false;
}
return true;
}
void *prime_finder(void *pf)
{
PrimeFinderVars *pf_vars = (PrimeFinderVars *) pf;
long next_cand = pf_vars->from;
while (next_cand < pf_vars->to)
{
if (is_prime(next_cand))
{
(*pf_vars->count)++ ;
}
next_cand += 2;
}
return pf;
}
void trial(int numThreads)
{
struct timespec start;
struct timespec end;
double start_sec, end_sec, elapsed_sec;
int sum = 0;
clock_gettime(CLOCK_REALTIME, &start);
int counts[numThreads];
pthread_t threads[numThreads];
PrimeFinderVars vars[numThreads];
int slice_size = SEARCH_RANGE / numThreads;
for (int i = 0; i < numThreads; i++)
{
counts[i] = 0;
vars[i].from = i * slice_size + 1;
vars[i].to = (i + 1) * slice_size;
vars[i].count = &counts[i];
pthread_create(&threads[i], NULL, prime_finder, &vars[i]);
}
for (int i = 0; i < numThreads; i++)
{
pthread_join(threads[i], NULL);
sum += counts[i];
}
clock_gettime(CLOCK_REALTIME, &end);
start_sec = (double)start.tv_sec + (double)start.tv_nsec / NANO_PER_SEC;
end_sec = (double)end.tv_sec + (double)end.tv_nsec / NANO_PER_SEC;
elapsed_sec = end_sec - start_sec;
printf("%d\t%f\n", numThreads, elapsed_sec);
}
int main()
{
printf("Threads\tTime\n");
for (int threads = 1 ; threads <= 50 ; ++threads)
{
trial(threads);
}
}
I pursued this a bit further over the last day or two. Firstly, I was intrigued about why there seemed to be a double line of timings: after about 12 threads, a run would take either 1.5 seconds or 1 second. I theorised above that it was because of the bug Mikhail mentioned, so I plotted the actual answer given for each number of threads and found that, while the answer was usually around 664,579, it would often be around half that. Unsurprisingly, when the answer was half the real answer, it corresponded to the lower of the two timing lines.
So I fixed that bug and the double line effect disappeared. However, I was still getting more than one different answer depending on the number of threads.
The reason for this is that there are two more bugs:
1. The original algorithm fails to test the top number in each range.
2. The size of the ranges is calculated by doing an integer division of the search range by the number of threads. Unless there is no remainder, numbers at the top of the search range will not be checked.
I fixed both bugs and did a third run. This didn't affect the timings appreciably, but I got the same answer for each number of threads used.
For comparison, I wrote a Sieve of Eratosthenes and timed that too. Using it on a single thread took only 0.2 seconds, about seven times faster than the fastest multi-threaded trial-division run.
I've published a spreadsheet of the results and there's a git repo of the code.
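For reference, a minimal single-threaded Sieve of Eratosthenes along those lines might look like the sketch below (an illustration of the approach, not the published code from that repo):
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
/* Count the primes below `limit` with a simple Sieve of Eratosthenes. */
static long count_primes(long limit)
{
    bool *composite = calloc((size_t)limit, sizeof *composite);
    if (composite == NULL)
        return -1;
    /* Mark multiples of every prime p with p*p < limit (division avoids overflow). */
    for (long p = 2; p <= (limit - 1) / p; p++) {
        if (!composite[p])
            for (long m = p * p; m < limit; m += p)
                composite[m] = true;
    }
    long count = 0;
    for (long i = 2; i < limit; i++)
        if (!composite[i])
            count++;
    free(composite);
    return count;
}
int main(void)
{
    printf("%ld primes below 10000000\n", count_primes(10000000));
    return 0;
}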
Workload is not well balanced between threads. Each thread has to check about the same number of candidates, but for threads with higher indexes it takes more time to check each candidate than for threads with lower indexes.
I would rewrite the main loop like this:
for (long candidate = pf_vars->idx;
candidate < SEARCH_RANGE;
candidate += NUM_THREADS) {
if (is_prime (candidate)) {
++counts [pf_vars->idx];
}
}
NUM_THREADS has to be prime itself for this to work efficiently.
Also, I doubt your code produces correct results: if pf_vars->from is even, prime_finder will check only even candidates, which doesn't make much sense.
Also, threads run in parallel only when they run on different cores. If the number of threads is much larger than the number of cores, performance will degrade, as switching a core between several threads also takes some time.
After you've identified 2 and 3 as prime, all remaining primes take the form 6N±1, for N >= 1. To balance the workload across K threads, you should have each of the K threads stepping through its own sequence of values for N: thread T working on 1 + T + K * X, where for each thread, X sequences from 0 upwards. If you have 8 threads, it means thread 0 works on N₀ = { 1, 9, 17, … }, etc. It still means thread K-1 does more work than thread 0 because it is tackling bigger numbers, but the discrepancy is much less than when you slice the ranges horizontally. This means you need to provide each thread with a starting number, S, and the total number of threads, K, and the thread will then set a counter x to values 0, 1, ... and check for primes 6(S + xK)±1.
Now, with that as a decent basis for creating a multi-threaded prime finder, here is a program closely based on your code. It uses some code that is available in my SOQ (Stack Overflow Questions) repository on GitHub as files timer.c, timer.h, stderr.c and stderr.h in the src/libsoq sub-directory. It uses the function isprime(), renamed from IsPrime3B(), found in the file isprime.c in the src/Primes sub-directory of my SOQ repository. This program is prime-thread.c from the same src/Primes directory.
/* SO 6438-1942 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "stderr.h"
#include "timer.h"
#define NANO_PER_SEC 1.0E9
enum { NUM_THREADS = 8 };
enum { MAX_NUMBER = 10000000 };
static size_t counts[NUM_THREADS];
static const unsigned int small_primes[] =
{
5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
53, 59, 61, 67, 71, 73, 79, 83, 89, 97
};
enum { NUM_SMALL_PRIMES = sizeof(small_primes) / sizeof(small_primes[0]) };
/* IsPrime3B() from isprime.c - renamed is_prime() */
static int is_prime(unsigned number)
{
if (number <= 1)
return 0;
if (number == 2 || number == 3)
return 1;
if (number % 2 == 0 || number % 3 == 0)
return 0;
for (unsigned i = 0; i < NUM_SMALL_PRIMES; i++)
{
if (number == small_primes[i])
return 1;
if (number % small_primes[i] == 0)
return 0;
}
/* After 97, the next prime numbers are 101, 103, 107, 109 */
/*
** It would be feasible to start this loop from:
** i = (((small_primes[NUM_SMALL_PRIMES - 1] + 1) / 6) + 1) * 6
*/
for (unsigned i = 102; (i - 1) <= number / (i - 1); i += 6)
{
if (number % (i - 1) == 0 || number % (i + 1) == 0)
return 0;
}
return 1;
}
typedef struct prime_finder_vars
{
unsigned from;
unsigned to;
unsigned increment; /* Number of threads */
unsigned idx;
} PrimeFinderVars;
static void *prime_finder(void *pf)
{
PrimeFinderVars *pf_vars = (PrimeFinderVars *) pf;
printf("Thread %u: from = %u, to = %u, inc = %u\n",
pf_vars->idx, pf_vars->from, pf_vars->to, pf_vars->increment);
unsigned next = pf_vars->from;
while (next < pf_vars->to) {
unsigned six_n = 6 * next;
if (is_prime(six_n - 1))
++counts[pf_vars->idx];
if (is_prime(six_n + 1))
++counts[pf_vars->idx];
next += pf_vars->increment;
}
printf("Thread %u: done\n", pf_vars->idx);
return pf;
}
int main(int argc, char **argv)
{
err_setarg0(argv[0]);
if (argc != 1)
err_usage("");
struct timespec start;
struct timespec end;
double start_sec, end_sec, elapsed_sec;
int sum = 0;
Clock clk;
clk_init(&clk);
clk_start(&clk);
clock_gettime(CLOCK_REALTIME, &start);
pthread_t threads[NUM_THREADS];
PrimeFinderVars vars[NUM_THREADS];
int max_n = (MAX_NUMBER + 5) / 6;
for (int i = 0; i < NUM_THREADS; i++)
{
vars[i].from = i + 1;
vars[i].to = max_n;
vars[i].idx = i;
vars[i].increment = NUM_THREADS;
int rc;
if ((rc = pthread_create(&threads[i], NULL, prime_finder, &vars[i])) != 0)
err_syserr("failed to create thread %d: ", i);
}
for (int i = 0; i < NUM_THREADS; i++)
{
pthread_join(threads[i], NULL);
sum += counts[i];
}
clock_gettime(CLOCK_REALTIME, &end);
clk_stop(&clk);
start_sec = start.tv_sec + start.tv_nsec / NANO_PER_SEC;
end_sec = end.tv_sec + end.tv_nsec / NANO_PER_SEC;
elapsed_sec = end_sec - start_sec;
printf("Time 1: %.6f\n", elapsed_sec);
char buffer[32];
printf("Time 2: %s\n", clk_elapsed_us(&clk, buffer, sizeof(buffer)));
/* Because 2 and 3 are primes but are not analyzed */
size_t t_count = 2;
for (int i = 0; i < NUM_THREADS; i++)
{
t_count += counts[i];
printf("%d: %7zu primes found\n", i, counts[i]);
}
printf("Total primes found up to %d = %zu\n", MAX_NUMBER, t_count);
return 0;
}
Example output:
$ timecmd -u -- prime-thread
2020-10-16 12:15:05.101785 [PID 75174] prime-thread
Thread 0: from = 1, to = 1666667, inc = 8
Thread 7: from = 8, to = 1666667, inc = 8
Thread 2: from = 3, to = 1666667, inc = 8
Thread 3: from = 4, to = 1666667, inc = 8
Thread 5: from = 6, to = 1666667, inc = 8
Thread 4: from = 5, to = 1666667, inc = 8
Thread 6: from = 7, to = 1666667, inc = 8
Thread 1: from = 2, to = 1666667, inc = 8
Thread 0: done
Thread 6: done
Thread 4: done
Thread 7: done
Thread 3: done
Thread 5: done
Thread 2: done
Thread 1: done
Time 1: 0.231135
Time 2: 0.231135
0: 83090 primes found
1: 83176 primes found
2: 83023 primes found
3: 82996 primes found
4: 83060 primes found
5: 82995 primes found
6: 83179 primes found
7: 83058 primes found
Total primes found up to 10000000 = 664579
2020-10-16 12:15:05.341489 [PID 75174; status 0x0000] - 0.239704s
$
There are indeed 664,579 primes less than 10,000,000.
Note that the timecmd program counts the entire running time (start-up and printing) of the prime-thread, whereas the internal timing only counts the thread creation, running, and termination time. That accounts for the 8 ms timing difference. (It's a home-brew program that I use for timing commands. It's loosely similar to the system-provided time command — but significantly different too.)
Given a list of the primes up to 10,000,000, it would be feasible to calculate how many primes each thread should have found. Given that the totals are correct, it is unlikely that there's a problem there, though.
Timing
Note that the question says it took 21 seconds to count the number of primes up to 8,000,000. This code took 0.231 seconds to count the number of primes up to 10,000,000.
That suggests that the isprime() function in use is not as good as the one I used.
Indeed, the code shown is:
int is_prime(long num) {
int limit = round(sqrt(num));
for (long i = 2; i <= limit; i++) {
if (num % i == 0)
return FALSE;
}
return TRUE;
}
This does far more work than necessary. There's only one even prime number, 2. This code checks 4, 6, 8, … which are trivially non-prime. That's twice as much work as necessary. Checking for 2 and then only checking odd numbers would be a significant improvement. Checking 2, 3, and then numbers which match 6N±1 gives another improvement.
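As an illustration of the first of those improvements, a minimal odd-only variant of the question's is_prime() might look like this (a sketch, not the IsPrime3B() code used in prime-thread.c):
#include <stdbool.h>
/* Trial division that tests 2 and then only odd divisors up to sqrt(num),
 * roughly halving the work of the check-every-divisor version above. */
static bool is_prime_odd(long num)
{
    if (num < 2)
        return false;
    if (num % 2 == 0)
        return num == 2;
    for (long i = 3; i <= num / i; i += 2) {
        if (num % i == 0)
            return false;
    }
    return true;
}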
Even so, checking one third as much data would only improve things by a factor of 3. It is likely that the unbalanced workload is a bigger factor: with 8 threads (0..7), thread 7, working on the range 7,000,000..8,000,000, has a lot more computation to do than thread 0, working on the range 0..1,000,000, even though there are fewer primes for it to count.
The question doesn't show a complete MCVE (Minimal, Complete, Verifiable Example, or MRE, or whatever name SO now uses) or an SSCCE (Short, Self-Contained, Correct Example). It doesn't show how many threads are in use.
I have not parameterized prime-thread.c to take a variable number of threads and a variable range for analysis (and removed the thread debugging printing), but perhaps I should, to see how much that changes the behaviour. On my machine, it is unlikely that more threads will improve things; it may be that fewer threads would be better.
I have a program primes which prints the primes in a given range using a Sieve of Eratosthenes. It prints all 664,579 primes up to 10 million in about 0.135 seconds when the output goes to (SSD) file or /dev/null. That is significantly faster than prime-thread manages to count the primes. Quite a lot of the benefit there is from the better algorithm. This isprime() function does a lot of computation for each candidate number.
Two lessons to draw from this:
Algorithms matter.
Threads aren't a panacea for speeding things up.
The aim of my program is to calculate the electrostatic potential between an inner conductor and an outer conductor by splitting the domain into a grid and then into grid slices. Each processor gets a slice and runs the calculations on it. I send data between processors using MPI_Isend and MPI_Irecv. When testing the code I get a segmentation fault:
[physnode5:81440] *** Process received signal ***
[physnode5:81440] Signal: Segmentation fault (11)
[physnode5:81440] Signal code: Address not mapped (1)
[physnode5:81440] Failing at address: 0x58
[physnode5:81440] [ 0] /lib64/libpthread.so.0(+0xf5d0)[0x2ab8069df5d0]
[physnode5:81440] [ 1] /opt/yarcc/libraries/openmpi/2.1.0/1/default/lib/libmpi.so.20(ompi_request_default_wait+0xd)[0x2ab8066495ed]
[physnode5:81440] [ 2] /opt/yarcc/libraries/openmpi/2.1.0/1/default/lib/libmpi.so.20(MPI_Wait+0x5d)[0x2ab80667a00d]
[physnode5:81440] [ 3] ./mpi_tezt.exe[0x400ffc]
[physnode5:81440] [ 4] /lib64/libc.so.6(__libc_start_main+0xf5)[0x2ab806c0e3d5]
[physnode5:81440] [ 5] ./mpi_tezt.exe[0x4009b9]
[physnode5:81440] *** End of error message ***
when this bit of code is executed. Please note I have ssh'ed into a cluster. The file name is mpi_tezt.exe (yes, I misspelled it).
I have checked that the arrays I want to send are correctly allocated and that the send and recv are not sending or receiving data that isn't there (i.e. sending data outside the range of the array).
My code for the MPI_Isend and MPI_Irecv is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
int main(int argc, char *argv[])
{
/*MPI Specific Variables*/
int my_size, my_rank, up, down;
MPI_Request reqU, reqD, sreqU, sreqD;
MPI_Status rUstatus, rDstatus, sUstatus, sDstatus;
/*Physical Dimensions*/
double Linner = 5.0;/*mm*/
double Rinner = 1.0;/*mm*/
double phi_0 = 1000.0;/*V*/
/*Other Variables*/
int grid_size = 100;
int slice;
int x,y;
double grid_res_y = 0.2;
double grid_res_x = 0.1;
int xboundary, yboundary;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &my_size);
/*Determining neighbours*/
if (my_rank != 0) /*if statemets used to stop highest and lowest rank neighbours arent outside 0 - my_size-1 range of ranks*/
{
up = my_rank-1;
}
else
{
up = 0;
}
if(my_rank != my_size-1)
{
down = my_rank+1;
}
else
{
down = my_size-1;
}
/*cross-check: presumed my_size is a factor of gridsize else there are odd sized slices and this is not coded for*/
if (grid_size%my_size != 0)
{
printf("ERROR - number of procs = %d, this is not a factor of grid_size %d\n", my_size, grid_size);
exit(0);
}
/*Set Up Distributed Data Approach*/
slice = grid_size/my_size;
yboundary = Linner/grid_res_y; /*y grid index of inner conductor wall*/
xboundary = Rinner/grid_res_x; /*x grid and individual array index of inner conductor wall*/
double phi[slice+2][grid_size]; /*extra 2 rows to allow for halo data*/
for (y=0; y < slice+2; y++)
{
for (x=0; x < grid_size; x++)
{
phi[y][x] = 0.0;
}
}
if(my_rank == 0) /*Boundary Containing rank does 2 loops. One over part with inner conductor and one over part without inner conductor*/
{
for(y=0; y < slice+1; y++)
{
for(x=xboundary; x < grid_size; x++)
{
phi[y][x] = phi_0;
}
}
}
if (my_rank < my_size-1)
{
/*send top most strip up one node to be recieved as bottom halo*/
MPI_Isend(&phi[1][0], grid_size , MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &sreqU);
/*recv top halo from up one node*/
MPI_Irecv(&phi[slice+1][0], grid_size, MPI_DOUBLE, down, 2, MPI_COMM_WORLD, &reqU);
}
if (my_rank > 0)
{
/*recv top halo from down one node*/
MPI_Irecv(&phi[0][0], grid_size , MPI_DOUBLE, up, 2, MPI_COMM_WORLD, &reqD);
/*send bottom most strip down one node to be recieved as top halo*/
MPI_Isend(&phi[slice][0], grid_size , MPI_DOUBLE, up, 1, MPI_COMM_WORLD, &sreqD);
}
if (my_rank<my_size-1)
{
/*Wait for send to down one rank to complete*/
MPI_Wait(&sreqD, &sDstatus);
/*Wait for recieve from up one rank to complete*/
MPI_Wait(&reqD, &rDstatus);
}
if (my_rank>0)
{
/*Wait for send to up down one rank to complete*/
MPI_Wait(&sreqU, &sUstatus);
/*Wait for recieve from down one rank to complete*/
MPI_Wait(&reqU, &rUstatus);
}
MPI_Finalize();
return 0;
}
I have been testing on 2 processors (ranks 0 and 1) with the hope of extending it to more.
Any ideas where the fault may lie?
You're faulting in the first MPI_Wait (for rank 0). This is step 7 in the example code below.
Using mpirun -np 2 ./whatever:
It appears that sReqD is not being set correctly. This is set at step 5 by rank 1.
But, step 7 is being executed by rank 0, which does not set sReqD.
So, you need to adjust your if statements to match up correctly for which rank does which MPI_Wait, etc.
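For instance, one consistent pairing would be for each rank to wait only on the requests it actually started. Here is a sketch of how the two wait blocks in your code could be rewritten (the request names are the ones from your program, and this is untested on your cluster):
if (my_rank < my_size - 1)
{
    /* Wait for the requests this rank started in the my_rank < my_size-1 branch */
    MPI_Wait(&sreqU, &sUstatus);   /* send of phi[1][0] */
    MPI_Wait(&reqU, &rUstatus);    /* receive into phi[slice+1][0] */
}
if (my_rank > 0)
{
    /* Wait for the requests this rank started in the my_rank > 0 branch */
    MPI_Wait(&sreqD, &sDstatus);   /* send of phi[slice][0] */
    MPI_Wait(&reqD, &rDstatus);    /* receive into phi[0][0] */
}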
Here is your code with some debug printf statements:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
int
main(int argc, char *argv[])
{
/* MPI Specific Variables */
int my_size,
my_rank,
up,
down;
MPI_Request reqU,
reqD,
sreqU,
sreqD;
MPI_Status rUstatus,
rDstatus,
sUstatus,
sDstatus;
/* Physical Dimensions */
double Linner = 5.0; /* mm */
double Rinner = 1.0; /* mm */
double phi_0 = 1000.0;
/*V*/
/* Other Variables */
int grid_size = 100;
int slice;
int x,
y;
double grid_res_y = 0.2;
double grid_res_x = 0.1;
int xboundary,
yboundary;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &my_size);
/* Determining neighbours */
/* if statemets used to stop highest and lowest rank neighbours arent
outside 0 - my_size-1 range of ranks */
if (my_rank != 0) {
up = my_rank - 1;
}
else {
up = 0;
}
if (my_rank != my_size - 1) {
down = my_rank + 1;
}
else {
down = my_size - 1;
}
printf("my_rank=%d my_size=%d up=%d down=%d\n",my_rank,my_size,up,down);
/* cross-check: presumed my_size is a factor of gridsize else there are
odd sized slices and this is not coded for */
if (grid_size % my_size != 0) {
printf("ERROR - number of procs = %d, this is not a factor of grid_size %d\n", my_size, grid_size);
exit(0);
}
/* Set Up Distributed Data Approach */
slice = grid_size / my_size;
/* y grid index of inner conductor wall */
yboundary = Linner / grid_res_y;
/* x grid and individual array index of inner conductor wall */
xboundary = Rinner / grid_res_x;
if (my_rank == 0) {
printf("Linner=%g grid_res_y=%g yboundary=%d\n",
Linner,grid_res_y,yboundary);
printf("Rinner=%g grid_res_x=%g xboundary=%d\n",
Rinner,grid_res_x,xboundary);
printf("slice=%d grid_size=%d phi=%ld\n",
slice,grid_size,sizeof(double) * (slice + 2) * grid_size);
}
/* extra 2 rows to allow for halo data */
double phi[slice + 2][grid_size];
for (y = 0; y < slice + 2; y++) {
for (x = 0; x < grid_size; x++) {
phi[y][x] = 0.0;
}
}
/* Boundary Containing rank does 2 loops. One over part with inner
conductor and one over part without inner conductor */
if (my_rank == 0) {
for (y = 0; y < slice + 1; y++) {
for (x = xboundary; x < grid_size; x++) {
phi[y][x] = phi_0;
}
}
}
if (my_rank < my_size - 1) {
/* send top most strip up one node to be recieved as bottom halo */
printf("1: my_rank=%d MPI_Isend\n",my_rank);
MPI_Isend(&phi[1][0], grid_size, MPI_DOUBLE, down, 1, MPI_COMM_WORLD,
&sreqU);
/* recv top halo from up one node */
printf("2: my_rank=%d MPI_Irecv\n",my_rank);
MPI_Irecv(&phi[slice + 1][0], grid_size, MPI_DOUBLE, down, 2,
MPI_COMM_WORLD, &reqU);
printf("3: my_rank=%d\n",my_rank);
}
if (my_rank > 0) {
/* recv top halo from down one node */
printf("4: my_rank=%d MPI_Irecv\n",my_rank);
MPI_Irecv(&phi[0][0], grid_size, MPI_DOUBLE, up, 2, MPI_COMM_WORLD,
&reqD);
/* send bottom most strip down one node to be recieved as top halo */
printf("5: my_rank=%d MPI_Isend\n",my_rank);
MPI_Isend(&phi[slice][0], grid_size, MPI_DOUBLE, up, 1, MPI_COMM_WORLD,
&sreqD);
printf("6: my_rank=%d\n",my_rank);
}
if (my_rank < my_size - 1) {
/* Wait for send to down one rank to complete */
printf("7: my_rank=%d\n",my_rank);
MPI_Wait(&sreqD, &sDstatus);
printf("8: my_rank=%d\n",my_rank);
/* Wait for recieve from up one rank to complete */
printf("9: my_rank=%d\n",my_rank);
MPI_Wait(&reqD, &rDstatus);
printf("10: my_rank=%d\n",my_rank);
}
if (my_rank > 0) {
/* Wait for send to up down one rank to complete */
printf("11: my_rank=%d\n",my_rank);
MPI_Wait(&sreqU, &sUstatus);
printf("12: my_rank=%d\n",my_rank);
/* Wait for recieve from down one rank to complete */
printf("12: my_rank=%d\n",my_rank);
MPI_Wait(&reqU, &rUstatus);
printf("13: my_rank=%d\n",my_rank);
}
MPI_Finalize();
return 0;
}
Here is the output. Notice that step 7 prints (which is before the first MPI_Wait for rank 0), but rank 0 never gets to step 8 (the printf after that call).
my_rank=0 my_size=2 up=0 down=1
Linner=5 grid_res_y=0.2 yboundary=25
Rinner=1 grid_res_x=0.1 xboundary=10
slice=50 grid_size=100 phi=41600
1: my_rank=0 MPI_Isend
2: my_rank=0 MPI_Irecv
3: my_rank=0
7: my_rank=0
my_rank=1 my_size=2 up=0 down=1
4: my_rank=1 MPI_Irecv
5: my_rank=1 MPI_Isend
6: my_rank=1
11: my_rank=1
[manderly:230404] *** Process received signal ***
[manderly:230403] *** Process received signal ***
[manderly:230403] Signal: Segmentation fault (11)
[manderly:230403] Signal code: Address not mapped (1)
[manderly:230403] Failing at address: 0x58
[manderly:230404] Signal: Segmentation fault (11)
[manderly:230404] Signal code: Address not mapped (1)
[manderly:230404] Failing at address: 0x58
[manderly:230403] [ 0] [manderly:230404] [ 0] /lib64/libpthread.so.0(+0x121c0)/lib64/libpthread.so.0(+0x121c0)[0x7fa5478341c0]
[0x7fa0ebe951c0]
[manderly:230404] [ 1] [manderly:230403] [ 1] /usr/lib64/openmpi/lib/libmpi.so.20(ompi_request_default_wait+0x31)[0x7fa0ec0e9a81]
[manderly:230404] [ 2] /usr/lib64/openmpi/lib/libmpi.so.20(ompi_request_default_wait+0x31)[0x7fa547a88a81]
[manderly:230403] [ 2] /usr/lib64/openmpi/lib/libmpi.so.20(PMPI_Wait+0x60)[0x7fa0ec12c350]
[manderly:230404] [ 3] ./fix2[0x400f93]
[manderly:230404] [ 4] /usr/lib64/openmpi/lib/libmpi.so.20(PMPI_Wait+0x60)[0x7fa547acb350]
[manderly:230403] [ 3] ./fix2[0x400ef7]
/lib64/libc.so.6(__libc_start_main+0xea)[0x7fa0ebaedfea]
[manderly:230404] [ 5] ./fix2[0x40081a[manderly:230403] [ 4] ]
[manderly:230404] *** End of error message ***
/lib64/libc.so.6(__libc_start_main+0xea)[0x7fa54748cfea]
[manderly:230403] [ 5] ./fix2[0x40081a]
[manderly:230403] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node manderly exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
I wrote a program with 2 threads doing the same thing, but I found the throughput of each thread was lower than when I spawned only one thread. I then wrote this simple test to see whether that is my problem or whether it is because of the system.
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
/*
* Function: run_add
* -----------------------
* Do addition operation for iteration ^ 3 times
*
* returns: void
*/
void *run_add(void *ptr) {
clock_t t1, t2;
t1 = clock();
int sum = 0;
int i = 0, j = 0, k = 0;
int iteration = 1000;
long total = iteration * iteration * iteration;
for (i = 0; i < iteration; i++) {
for (j = 0; j < iteration; j++) {
for (k = 0; k < iteration; k++) {
sum++;
}
}
}
t2 = clock();
float diff = ((float)(t2 - t1) / 1000000.0F );
printf("thread id = %d\n", (int)(pthread_self()));
printf("Total addtions: %ld\n", total);
printf("Total time: %f second\n", diff);
printf("Addition per second: %f\n", total / diff);
printf("\n");
return NULL;
}
void run_test(int num_thread) {
pthread_t pth_arr[num_thread];
int i = 0;
for (i = 0; i < num_thread; i++) {
pthread_create(&pth_arr[i], NULL, run_add, NULL);
}
for (i = 0; i < num_thread; i++) {
pthread_join(pth_arr[i], NULL);
}
}
int main() {
int num_thread = 5;
int i = 0;
for (i = 1; i < num_thread; i++) {
printf("Running SUM with %d threads. \n\n", i);
run_test(i);
}
return 0;
}
The result still shows that the average speed of n threads is lower than that of a single thread. The more threads I have, the slower each one is.
Here's the result:
Running SUM with 1 threads.
thread id = 528384,
Total addtions: 1000000000,
Total time: 1.441257 second,
Addition per second: 693838784.000000
Running SUM with 2 threads.
thread id = 528384,
Total addtions: 1000000000,
Total time: 2.970870 second,
Addition per second: 336601728.000000
thread id = 1064960,
Total addtions: 1000000000,
Total time: 2.972992 second,
Addition per second: 336361504.000000
Running SUM with 3 threads.
thread id = 1064960,
Total addtions: 1000000000,
Total time: 4.434701 second,
Addition per second: 225494352.000000
thread id = 1601536,
Total addtions: 1000000000,
Total time: 4.449250 second,
Addition per second: 224756976.000000
thread id = 528384,
Total addtions: 1000000000,
Total time: 4.454826 second,
Addition per second: 224475664.000000
Running SUM with 4 threads.
thread id = 528384,
Total addtions: 1000000000,
Total time: 6.261967 second,
Addition per second: 159694224.000000
thread id = 1064960,
Total addtions: 1000000000,
Total time: 6.293107 second,
Addition per second: 158904016.000000
thread id = 2138112,
Total addtions: 1000000000,
Total time: 6.295047 second,
Addition per second: 158855056.000000
thread id = 1601536,
Total addtions: 1000000000,
Total time: 6.306261 second,
Addition per second: 158572560.000000
I have a 4-core CPU, and my system monitor shows that each time I run n threads, n CPU cores are 100% utilized. Is it true that n threads (with n <= my number of CPU cores) are supposed to run n times as fast as one thread? Why is that not the case here?
clock() measures CPU time, not "wall" time, and it measures the total time across all threads.
CPU time is the time the processor spent executing your code; wall time is real-world elapsed time (what a clock on the wall would show).
Time your program using /usr/bin/time to see what's really happening, or use a wall-time function like time(), gettimeofday() or clock_gettime().
clock_gettime() can measure CPU time for this thread, for this process, or wall time; it's probably the best way to do this type of experiment.
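For example, a minimal wall-clock timing sketch with clock_gettime() and CLOCK_MONOTONIC (the work() function here is just a stand-in for whatever you want to time):
#include <stdio.h>
#include <time.h>
/* Placeholder for whatever is being measured. */
static void work(void)
{
    volatile unsigned long sum = 0;
    for (unsigned long i = 0; i < 100000000UL; i++)
        sum++;
}
int main(void)
{
    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);   /* wall time, unaffected by other threads */
    work();
    clock_gettime(CLOCK_MONOTONIC, &end);
    double elapsed = (end.tv_sec - start.tv_sec)
                   + (end.tv_nsec - start.tv_nsec) / 1e9;
    printf("elapsed wall time: %f s\n", elapsed);
    return 0;
}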
While you have your answer regarding why the multi-threaded performance seemed worse than single-thread, there are several things you can do to clean up the logic of your program and make it work like it appears you intended it to.
First, if you were keeping track of the relative wall-time that passed and compared it with the time reported by your diff of the clock() values, you would have noticed that the reported time was approximately the actual wall-time multiplied by the number of processor cores in use. That was explained in the other answer.
For relative per-core performance timing, the use of clock() is fine. You are getting only an approximation of wall-time, but for looking at relative additions per second, it provides a clean per-core view of performance.
While you have correctly used a divisor of 1000000 for diff, time.h provides a convenient define for you: POSIX requires that CLOCKS_PER_SEC equal 1000000, independent of the actual clock resolution.
Next, you should also notice that your per-core output isn't reported until all threads have been joined, making the reporting of totals in run_add somewhat pointless. You can output thread_id, etc. from the individual threads for convenience, but the timing information should be computed back in the calling function after all threads have been joined. That will clean up the logic of run_add significantly. Further, if you want to be able to vary the number of iterations, you should consider passing that value through ptr, e.g.:
/*
* Function: run_add
* -----------------------
* Do addition operation for iteration ^ 3 times
*
* returns: void
*/
void *run_add (void *ptr)
{
int i = 0, j = 0, k = 0, iteration = *(int *)ptr;
unsigned long sum = 0;
for (i = 0; i < iteration; i++)
for (j = 0; j < iteration; j++)
for (k = 0; k < iteration; k++)
sum++;
printf (" thread id = %lu\n", (long unsigned) (pthread_self ()));
printf (" iterations = %lu\n\n", sum);
return NULL;
}
run_test is relatively unchanged; the bulk of the calculation changes are those moved from run_add to main, scaled to account for the number of cores utilized. The following is a rewrite of main allowing the user to specify the number of cores to use as the first argument (using all cores by default) and the base for your cubed number of iterations as the second argument (1000 by default):
int main (int argc, char **argv) {
int nproc = sysconf (_SC_NPROCESSORS_ONLN), /* number of core available */
num_thread = argc > 1 ? atoi (argv[1]) : nproc,
iter = argc > 2 ? atoi (argv[2]) : 1000;
unsigned long subtotal = iter * iter * iter,
total = subtotal * num_thread;
double diff = 0.0, t1 = 0.0, t2 = 0.0;
if (num_thread > nproc) num_thread = nproc;
printf ("\nrunning sum with %d threads.\n\n", num_thread);
t1 = clock ();
run_test (num_thread, &iter);
t2 = clock ();
diff = (double)((t2 - t1) / CLOCKS_PER_SEC / num_thread);
printf ("----------------\nTotal time: %lf second\n", diff);
printf ("Total addtions: %lu\n", total);
printf ("Additions per-second: %lf\n\n", total / diff);
return 0;
}
Putting all the pieces together, you could write a working example as follows. Make sure you disable optimizations to prevent your compiler from optimizing out your loops for sum, etc...
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
#include <unistd.h>
/*
* Function: run_add
* -----------------------
* Do addition operation for iteration ^ 3 times
*
* returns: void
*/
void *run_add (void *ptr)
{
int i = 0, j = 0, k = 0, iteration = *(int *)ptr;
unsigned long sum = 0;
for (i = 0; i < iteration; i++)
for (j = 0; j < iteration; j++)
for (k = 0; k < iteration; k++)
sum++;
printf (" thread id = %lu\n", (long unsigned) (pthread_self ()));
printf (" iterations = %lu\n\n", sum);
return NULL;
}
void run_test (int num_thread, int *it)
{
pthread_t pth_arr[num_thread];
int i = 0;
for (i = 0; i < num_thread; i++)
pthread_create (&pth_arr[i], NULL, run_add, it);
for (i = 0; i < num_thread; i++)
pthread_join (pth_arr[i], NULL);
}
int main (int argc, char **argv) {
int nproc = sysconf (_SC_NPROCESSORS_ONLN),
num_thread = argc > 1 ? atoi (argv[1]) : nproc,
iter = argc > 2 ? atoi (argv[2]) : 1000;
unsigned long subtotal = iter * iter * iter,
total = subtotal * num_thread;
double diff = 0.0, t1 = 0.0, t2 = 0.0;
if (num_thread > nproc) num_thread = nproc;
printf ("\nrunning sum with %d threads.\n\n", num_thread);
t1 = clock ();
run_test (num_thread, &iter);
t2 = clock ();
diff = (double)((t2 - t1) / CLOCKS_PER_SEC / num_thread);
printf ("----------------\nTotal time: %lf second\n", diff);
printf ("Total addtions: %lu\n", total);
printf ("Additions per-second: %lf\n\n", total / diff);
return 0;
}
Example Use/Output
Now you can measure the relative number of additions per-second performed based on the number of cores utilized -- and have it return a Total time that is roughly what wall-time would be. For example, measuring the additions per-second using a single core results in:
$ ./bin/pthread_one_per_core 1
running sum with 1 threads.
thread id = 140380000397056
iterations = 1000000000
----------------
Total time: 2.149662 second
Total addtions: 1000000000
Additions per-second: 465189411.172547
Approximately 465M additions per second. Using two cores should double that rate:
$ ./bin/pthread_one_per_core 2
running sum with 2 threads.
thread id = 140437156796160
iterations = 1000000000
thread id = 140437165188864
iterations = 1000000000
----------------
Total time: 2.152436 second
Total addtions: 2000000000
Additions per-second: 929179560.000957
Exactly twice the additions per second, at 929M/s. Using 4 cores:
$ ./bin/pthread_one_per_core 4
running sum with 4 threads.
thread id = 139867841853184
iterations = 1000000000
thread id = 139867858638592
iterations = 1000000000
thread id = 139867867031296
iterations = 1000000000
thread id = 139867850245888
iterations = 1000000000
----------------
Total time: 2.202021 second
Total addtions: 4000000000
Additions per-second: 1816513309.422720
Doubled again to 1.81G/s, and using 8 cores gives the expected results:
$ ./bin/pthread_one_per_core
running sum with 8 threads.
thread id = 140617712838400
iterations = 1000000000
thread id = 140617654089472
iterations = 1000000000
thread id = 140617687660288
iterations = 1000000000
thread id = 140617704445696
iterations = 1000000000
thread id = 140617662482176
iterations = 1000000000
thread id = 140617696052992
iterations = 1000000000
thread id = 140617670874880
iterations = 1000000000
thread id = 140617679267584
iterations = 1000000000
----------------
Total time: 2.250243 second
Total addtions: 8000000000
Additions per-second: 3555171004.558562
3.55G/s. Look over both of the current answers and let us know if you have any questions.
Note: there are a number of additional clean-ups and validations that could be applied, but for the purposes of your example, updating the types to sensible unsigned types prevents strange results with the thread id and the addition counts.
This program estimates Pi by throwing random "darts" (sampling points) at a circle of radius 1 inscribed inside a square board of side length 2. Using the relationship
Area of circle / Area of Square = Pi/4
we can estimate Pi using the same relationship expressed as
Darts Inside Circle / Total Darts Thrown = Pi/4
The program works fine when I specify NDARTS in a #define, but when trying to broadcast it as a long long int, read from scanf, I get the following execution error:
mpirun -np 4 ./pi_montecarlo.x
-----------------------------------------------------------------------------
One of the processes started by mpirun has exited with a nonzero exit
code. This typically indicates that the process finished in error.
If your process did not finish in error, be sure to include a "return
0" or "exit(0)" in your C code before exiting the application.
PID 10591 failed on node n0 (127.0.0.1) due to signal 11.
Why?
Is there anything wrong with my MPI_Bcast declaration?
long long int *NDARTS=0;
scanf("%Ld",NDARTS);
MPI_Bcast(NDARTS, 1, MPI_LONG_LONG_INT, 0, MPI_COMM_WORLD);
Full code:
/*
mpicc -g -Wall -lm pi_montecarlo3.c -o pi_montecarlo.x
mpirun -np 4 ./pi_montecarlo.x
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
#define MASTER 0
#define PI 3.1415926535
double pseudo_random (double a, double b) {
double r;
r = ((b-a) * ((double) rand() / (double) RAND_MAX)) +a;
return r;
}
int main(int argc, char*argv[]){
long long int *NDARTS=0;
int proc_id,
n_procs,
llimit,
ulimit,
n_circle,
i;
double pi_current,
pi_sum,
x,
y,
z,
error,
start_time,
end_time;
struct timeval stime;
llimit = -1;
ulimit = 1;
n_circle =0;
MPI_Init(&argc, &argv);
MPI_Comm_rank (MPI_COMM_WORLD, &proc_id);
MPI_Comm_size (MPI_COMM_WORLD, &n_procs);
if (proc_id == MASTER){
printf("\nMonte Carlo Method to estimate Pi \n\n");
printf("Introduce Number of Darts \n");
scanf("%Ld",NDARTS);
printf(" Number of processes: %d \n", n_procs);
printf(" Number of darts: %Ld \n", *NDARTS);
MPI_Bcast(NDARTS, 1, MPI_LONG_LONG_INT, 0, MPI_COMM_WORLD);
start_time = MPI_Wtime();
}
gettimeofday(&stime, NULL);
srand(stime.tv_usec * stime.tv_usec * stime.tv_usec * stime.tv_usec);
for (i=1; i<=*NDARTS;i++){
x = pseudo_random(llimit, ulimit);
y = pseudo_random(llimit, ulimit);
z = pow(x,2) + pow(y,2);
if (z<=1.0){
n_circle++;
}
}
pi_current = 4.0 * (double)n_circle / (double) *NDARTS;
MPI_Reduce (&pi_current, &pi_sum, 1, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
if (proc_id == MASTER) {
pi_sum = pi_sum / n_procs;
error = fabs ((pi_sum -PI) / PI) *100;
end_time = MPI_Wtime();
printf("Known value of PI : %11.10f \n", PI);
printf("Estimated Value of PI : %11.10f\n", pi_sum);
printf("Error Percentage : %10.8f\n", error);
printf("Time : %10.8f\n\n", end_time - start_time);
}
MPI_Finalize();
return 0;
}
You're not using scanf() correctly. It should be like this instead:
long long int NDARTS;
scanf("%lld",&NDARTS);
MPI_Bcast(&NDARTS, 1, MPI_LONG_LONG_INT, 0, MPI_COMM_WORLD);
In your current code, long long int *NDARTS=0; effectively initializes NDARTS as a NULL pointer. So scanf() will obviously seg-fault when it tries to write to it.
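As a usage sketch (reusing the question's proc_id and MASTER, not the full corrected program): with a plain long long int you pass its address to both scanf() and MPI_Bcast(), and it is worth checking the scanf() return value before broadcasting. Note also that MPI_Bcast() is a collective call, so every rank in the communicator needs to reach it, not just MASTER:
long long int NDARTS = 0;
if (proc_id == MASTER) {
    printf("Introduce Number of Darts\n");
    if (scanf("%lld", &NDARTS) != 1) {     /* exactly one item converted? */
        fprintf(stderr, "Failed to read the number of darts\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
}
/* Collective: called by every rank, with rank 0 (MASTER) as the root. */
MPI_Bcast(&NDARTS, 1, MPI_LONG_LONG_INT, 0, MPI_COMM_WORLD);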