I am trying to write an application that calculates the L2 norm of two arrays. I have to parallelize my calculation.
Here is the code that I have parallelized:
double time_start_openmp = omp_get_wtime();
#pragma omp parallel for
for (i = 0; i < n; i++)
{
numberOfThreads = omp_get_num_threads();
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
time_end_openmp = omp_get_wtime();
l2_norm = sqrt(l2_norm);
openmp_exec_time = time_end_openmp - time_start_openmp;
printf("OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
I compile the code as:
gcc -fopenmp -g -ggdb -Wall -lm -o test test.c
I am running this code with 1 thread and with 32 threads. The output is the exact opposite of what I expected. Here is an example output:
[hayri@hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=32
[hayri@hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 32 0.001084 0.000000000000e+00
[hayri@hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=1
[hayri@hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 1 0.000106 0.000000000000e+00
Am I reading this wrong, or is using 32 threads 10 times slower than 1 thread? So, what am I doing wrong here?
Here is my full code:
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <math.h>
#define MATSIZE 2000
static size_t totalMemUsage = 0;
size_t vectors_dot_prod(double *x, double *y, size_t n)
{
double res = 0.0;
size_t i;
for (i = 0; i < n; i++)
{
res += x[i] * y[i];
}
return res;
}
size_t vectors_dot_prod2(double *x, double *y, size_t n)
{
size_t res = 0.0;
size_t i = 0;
for (; i <= n - 4; i += 4)
{
res += (x[i] * y[i] +
x[i + 1] * y[i + 1] +
x[i + 2] * y[i + 2] +
x[i + 3] * y[i + 3]);
}
for (; i < n; i++)
{
res += x[i] * y[i];
}
return res;
}
void matrix_vector_mult(double **mat, double *vec, double *result, size_t rows, size_t cols)
{ // in matrix form: result = mat * vec;
size_t i;
for (i = 0; i < rows; i++)
{
result[i] = vectors_dot_prod2(mat[i], vec, cols);
}
}
double get_random()
{
double range = 1000;
double div = RAND_MAX / range;
double randomNumber = (rand() / div);
// printf("%d\n", randomNumber);
return randomNumber;
}
void print_2d_arr(double *arr, size_t row, size_t col)
{
size_t i, j, index;
for (i = 0; i < row; i++)
{
for (j = 0; j < col; j++)
{
index = i * col + j;
printf("%3f ", arr[index]);
}
printf("\n");
}
}
void print_1d_arr(double *arr, size_t row)
{
size_t i;
for (i = 0; i < row; i++)
{
printf("%f, ", arr[i]);
}
printf("\n");
}
size_t **fullfillArrayWithRandomNumbers(double *arr, size_t n)
{
/*
* Fulfilling the array with random numbers
* */
size_t i;
for (i = 0; i < n; i++)
{
arr[i] = get_random();
}
return 0;
}
double *allocarray1D(size_t size)
{
double *array = calloc(size, sizeof(double));
totalMemUsage = totalMemUsage + size * sizeof(double);
return array;
}
size_t ParallelRowMatrixVectorMultiply(size_t n, double *a, double *b, double *x, MPI_Comm comm)
{
size_t i, j;
size_t nlocal;
double *fb;
int npes, myrank;
MPI_Comm_size(comm, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
fb = (double *)malloc(n * sizeof(double));
nlocal = n / npes;
MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
for (i = 0; i < nlocal; i++)
{
x[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
x[i] += a[index] * fb[j];
}
}
free(fb);
return 0;
}
size_t ParallelRowMatrixVectorMultiply_WithoutAllgather(size_t n, double *a, double *b, double *x_partial, double *x, MPI_Comm comm)
{
// Process 0 sends b to everyone
MPI_Bcast(b, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
size_t i, j;
size_t nlocal;
// double *fb;
int npes, myrank;
MPI_Comm_size(comm, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
// fb = (double *)malloc(n * sizeof(double));
nlocal = n / npes;
// MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
for (i = 0; i < nlocal; i++)
{
x_partial[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
// printf("%f x %f\n", a[index], b[j]);
x_partial[i] += a[index] * b[j];
}
}
// free(b);
// Process 0 gathers x_partials to create x
MPI_Gather(x_partial, nlocal, MPI_DOUBLE, x, nlocal, MPI_DOUBLE, 0, MPI_COMM_WORLD);
return 0;
}
size_t SequentialMatrixMultiply(size_t n, double *a, double *b, double *x)
{
size_t i, j;
for (i = 0; i < n; i++)
{
x[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
// printf("%f x %f\n", a[index], b[j]);
x[i] += a[index] * b[j];
}
}
return 0;
}
int main(int argc, char *argv[])
{
// Global declarations
size_t i;
// MPI_Status status;
// Initialize the MPI environment
MPI_Init(&argc, &argv);
// Get the number of processes
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Get the rank of the process
int taskid;
MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
// Get the name of the processor
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);
if (argc != 2)
{
if (taskid == 0)
printf("Usage: %s <N>\n", argv[0]);
MPI_Finalize();
return 0;
}
srand(time(NULL) + taskid);
size_t n = atoi(argv[1]);
size_t nOverK = n / world_size;
double *a = allocarray1D(n * n);
double *b = allocarray1D(n);
double *x = allocarray1D(n);
double *x_partial = allocarray1D(nOverK);
double *xseq = allocarray1D(n);
double *a_partial = allocarray1D(n * nOverK);
if (a == NULL || b == NULL || x == NULL || xseq == NULL || x_partial == NULL)
{
if (taskid == 0)
printf("Allocation failed\n");
MPI_Finalize();
return 0;
}
// Process 0 creates A matrix.
if (taskid == 0)
{
fullfillArrayWithRandomNumbers(a, n * n);
// Process 0 produces the b
fullfillArrayWithRandomNumbers(b, n);
}
// Process 0 sends a_partial to everyone
if (!(world_size == 1 && n == 64000))
{
MPI_Scatter(a, n * nOverK, MPI_DOUBLE, a_partial, n * nOverK, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
double time_start = MPI_Wtime();
ParallelRowMatrixVectorMultiply_WithoutAllgather(n, a_partial, b, x_partial, x, MPI_COMM_WORLD);
double time_end = MPI_Wtime();
double parallel_exec_time = time_end - time_start;
double *exec_times = allocarray1D(world_size);
// Process 0 gathers x_partials to create x
MPI_Gather(&parallel_exec_time, 1, MPI_DOUBLE, exec_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// print_1d_arr(x, n);
if (taskid == 0)
{
SequentialMatrixMultiply(n, a, b, xseq);
// check difference between x and xseq using OpenMP
//print_1d_arr(exec_times, world_size);
// print_1d_arr(xseq, n);
double max_exec, min_exec, avg_exec;
min_exec = 1000;
for (i = 0; i < world_size; i++)
{
if (max_exec < exec_times[i])
{
max_exec = exec_times[i];
}
if (min_exec > exec_times[i])
{
min_exec = exec_times[i];
}
avg_exec += exec_times[i];
}
avg_exec = avg_exec / world_size;
long double time_start_openmp = omp_get_wtime();
long double time_end_openmp, openmp_exec_time, min_exec_time, max_exec_time, avg_exec_time;
max_exec_time = 0;
max_exec_time = 1000;
long double l2_norm = 0;
size_t numberOfThreads = 0;
size_t r = 0;
double *diff_vector = allocarray1D(n);
size_t nrepeat = 10000;
if (world_size == 1)
{
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
}
else
{
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
}
l2_norm = sqrt(l2_norm);
time_end_openmp = omp_get_wtime();
openmp_exec_time = time_end_openmp - time_start_openmp;
// print matrix size, number of processors, number of threads, time, time_openmp, L2 norm of difference of x and xseq (use %.12e while printing norm)
if (world_size == 1)
{
printf("OPENMP: %d %ld %Lf %.12e\n", n, numberOfThreads, openmp_exec_time, openmp_exec_time, l2_norm);
printf("NEW_OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
}
printf("MIN_AVG_MAX: %d %d %f %f %f\n", n, world_size, min_exec, max_exec, avg_exec);
printf("MPI: %d %d %f %.12Lf %.12e\n", n, world_size, max_exec, l2_norm, l2_norm);
totalMemUsage = totalMemUsage / (1024 * 1024 * 1024);
printf("TOTALMEMUSAGE: %zu\n", totalMemUsage);
//printf("process: %d %d %d %f %.12e\n", taskid, n, world_size, parallel_exec_time, l2_norm);
//printf("%d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
}
MPI_Finalize();
return 0;
}
Here is the output:
cn009
36
mpicc -fopenmp -g -ggdb -lm -o rowmv rowmv.c
OPENMP: 32000 1 0.000299 2.991110086441e-04
MIN_AVG_MAX: 32000 1 3.112523 3.112523 3.112523
MPI: 32000 1 3.112523 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 2 0.000535 5.350699648261e-04
MIN_AVG_MAX: 32000 1 3.125519 3.125519 3.125519
MPI: 32000 1 3.125519 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 4 0.000434 4.341900348663e-04
MIN_AVG_MAX: 32000 1 3.170650 3.170650 3.170650
MPI: 32000 1 3.170650 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 8 0.000454 4.542167298496e-04
MIN_AVG_MAX: 32000 1 3.168685 3.168685 3.168685
MPI: 32000 1 3.168685 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 16 0.000507 5.065393634140e-04
MIN_AVG_MAX: 32000 1 3.158761 3.158761 3.158761
MPI: 32000 1 3.158761 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 32 0.000875 8.752988651395e-04
MIN_AVG_MAX: 32000 1 3.166051 3.166051 3.166051
MPI: 32000 1 3.166051 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
In the portion of code that is being both profiled and parallelized with OpenMP:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
there is a race condition, namely the access to the variable l2_norm. Moreover, you can drop the private(i), since the index variable (i.e., i) of the parallelized loop is implicitly made private by OpenMP. The race condition can be fixed with an OpenMP reduction. Furthermore, your loop is not actually distributing the iterations among threads as you wanted, because you added the parallel clause again to that #pragma omp for. Assuming that nested parallelism is disabled, which it is by default, each of the threads created in the outer parallel region will execute "sequentially" the code within that region, namely:
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
Hence, each thread will execute all N iterations of the loop that you intended to parallelize. Consequently, this removes the parallelism and adds additional overhead (e.g., thread creation) on top of the sequential code. To fix those problems (i.e., the race condition and the "nested" parallel region), change this code to:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp for reduction(+:l2_norm)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
Now, having fixed those problems, you are still left with another problem (performance-wise), namely that the parallel loop is being performed in the context of a hybrid OpenMP + MPI parallelization, and you did not explicitly bind the OpenMP threads (within the MPI processes) to the corresponding cores. Without that explicit binding, one cannot be sure on which cores those threads will end up. Naturally, more often than not, having multiple threads running on the same logical core will increase the overall execution time of the application being parallelized.
If your application uses threads, then you probably want to ensure that you are either not bound at all (by specifying --bind-to none), or bound to multiple cores using an appropriate binding level or a specific number of processing elements per application process. You can solve this problem by either:
disabling the binding with the MPI flag --bind-to none, so that threads can be assigned to different cores;
or binding the threads explicitly. Check this SO thread on how to map threads to cores in hybrid parallelizations such as MPI + OpenMP.
By explicitly setting the number of threads per process accordingly, you can avoid having multiple threads end up on the same core, and consequently avoid threads within the same core fighting for the same resources.
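For illustration, here is one possible way to launch the hybrid version (a sketch assuming Open MPI and the rowmv binary from the question; the exact flags depend on your MPI implementation and machine):
# let the OS place the threads freely
export OMP_NUM_THREADS=16
mpirun -np 2 --bind-to none ./rowmv 32000
# or pin explicitly: one MPI process per socket, its threads bound to that socket's cores
export OMP_PLACES=cores
export OMP_PROC_BIND=close
mpirun -np 2 --map-by socket --bind-to socket ./rowmv 32000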
Advice:
IMO you should first test the performance of the OpenMP version alone, without any MPI processes. In this context, test the scalability of the code by measuring the sequential version against 2 threads, then 4, 8, and so on, gradually increasing the number of threads. Eventually, there will be a number of threads at which the code simply stops scaling. Naturally, the amount of parallel work performed by the threads has to be big enough to overcome the overhead of parallelism. Therefore, you should also test with bigger and bigger inputs.
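As a rough sketch, such a scaling test can be scripted directly from the shell (assuming the binary is called ./test and takes the problem size as its argument, as in the question):
for t in 1 2 4 8 16 32; do
    OMP_NUM_THREADS=$t ./test 100000
done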
After having profiled, tested, and improved your OpenMP version, you can then extend that shared-memory parallelization to multiple processes using MPI.
Besides the race condition in updating a shared variable as noted in @dreamcrash's answer, your code is not distributing the work properly.
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
            ~~~~~~~~
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
The parallel construct in the inner loop makes it a nested combined parallel for construct. It means that each thread in the team executing the outer parallel region spawns a brand new parallel region and distributes the i-loop over the threads in it. There is no work distribution happening in the outer parallel region, and you end up with all the threads repeating the exact same work. By default nested parallelism is disabled, so the nested parallel region runs sequentially and your code is effectively doing this:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
There is no distribution of work and all threads write to the same locations in the diff_vector[] array.
On one hand, this code in general is memory-bound, since the amount of computation per byte of data is low - modern CPUs can do many multiplications and subtractions per cycle, while fetching data from memory and writing results back there takes many cycles. Memory-bound problems don't get any faster with more threads, since the limiting factor is the memory bandwidth. This isn't that big of a problem in your case, because 32K array entries take up 256 KB of memory, which fits in most CPU caches; the L3 cache is blazing fast, although the data is still larger than the fastest L1 cache of a single CPU core. On the other hand, writing to the same memory areas from multiple threads results in true and false sharing, with the associated inter-thread cache invalidation, which usually results in the parallel code running way slower than the sequential version.
There are tools that can help you analyse the performance of your code and spot problems. As I already wrote in a comment, Intel VTune is one of them and is freely available as part of the oneAPI toolkit. Intel Inspector is another one (again, free and part of the oneAPI toolkit) and it finds problems such as data races. The two tools work very well together and I couldn't recommend them strongly enough to any aspiring parallel programmer.
There is also a minor race condition writing to numberOfThreads, but since all values written are the same, that isn't much of a logical problem. The correct version of the code in question should be:
#pragma omp parallel
{
#pragma omp master
numberOfThreads = omp_get_num_threads();
#pragma omp for reduction(+:l2_norm)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
Related
I'm learning OpenMP and I'm trying to do a simple task: A[r][c] * X[c] = B[r] (matrix vector multiplication).
The problem is that the sequential code is faster than the parallel version, and I don't know why!
My code:
#include <omp.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/types.h>
// Defined variables
#define row_matriz_A 80000
#define col_matriz_A 800
#define THREADS_NUM 4
// FUNCTION - GENERATE MATRICES
void gerarMatrizes(int r, int c, int mA[], int vX[], int vB[]){...}
// FUNCTION - SEQUENTIAL MULTIPLICATION
void multSequencial(int r, int c, int mA[], int vX[], int vB[]){
// Variables
int i, j, offset, sum;
struct timeval tv1,tv2;
double t1, t2;
// Begin Time
gettimeofday(&tv1, NULL);
t1 = (double)(tv1.tv_sec) + (double)(tv1.tv_usec)/ 1000000.00;
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
offset = i * c + j;
sum += mA[offset] * vX[j];
}
vB[i] = sum;
}
// End time
gettimeofday(&tv2, NULL);
t2 = (double)(tv2.tv_sec) + (double)(tv2.tv_usec)/ 1000000.00;
printf("\nO tempo de execucao sequencial foi: %lf segundos.\n", (t2 - t1));
return;
}
// FUNCTION - PARALLEL MULTIPLICATION WITH OpenMP
void matvecHost(int r, int c, int mA[], int vX[], int vB[]){
// Variables
int tID, i, j, offset, sum;
struct timeval tv1, tv2;
double t1, t2;
// Init vB
for(i = 0; i < r; i++) vB[i] = 0;
// BEGIN Time
gettimeofday(&tv1, NULL);
t1 = (double)(tv1.tv_sec) + (double)(tv1.tv_usec)/ 1000000.00;
omp_set_num_threads(THREADS_NUM);
#pragma omp parallel private(tID, i, j) shared(mA, vB, vX)
{
tID = omp_get_thread_num();
#pragma omp for
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
offset = i * c + j;
sum += mA[offset] * vX[j];
}
vB[i] = sum;
}
}
// End time
gettimeofday(&tv2, NULL);
t2 = (double)(tv2.tv_sec) + (double)(tv2.tv_usec)/ 1000000.00;
printf("\nO tempo de execucao OpenMP foi: %lf segundos.\n", (t2 - t1));
return;
}
// FUNCTION - MAIN
int main(int argc, char * argv[]) {
int row, col;
row = row_matriz_A;
col = col_matriz_A;
int *matrizA = (int *)calloc(row * col, sizeof(int));
int *vectorX = (int *)calloc(col * 1, sizeof(int));
int *vectorB = (int *)calloc(row * 1, sizeof(int));
gerarMatrizes(row, col, matrizA, vectorX, vectorB);
multSequencial(row, col, matrizA, vectorX, vectorB);
matvecHost(row, col, matrizA, vectorX, vectorB);
return 0;
}
Previous solutions that did not work:
Use collapse on my nested for loops
Increase the row and column sizes
Increase the number of threads (a teacher recommended using a thread count equal to the number of physical threads)
Use malloc instead of m[i][j]
EDIT - ANSWER
My parallel block was changed based on the correct answer:
#pragma omp parallel private(i, j, sum) shared(mA, vB, vX)
{
#pragma omp for
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
sum += mA[i * c + j] * vX[j];
}
vB[i] = sum;
}
}
I still have a doubt:
If I define i, j and sum inside my parallel block, will they be set as private automatically? And does this improve the speed of my code or not?
You have race conditions on sum and offset - those are shared between the threads instead of being thread-private.
This also likely explains the slowdown: On x86, the CPU will actually work hard to make sure accesses to shared variables "work". This involves flushing cache lines after every (!) write to offset and sum - so all the threads are wildly writing into the same variables, but each one has to wait until the write from the previous thread (on a different core) has arrived in the local cache again after having been flushed. And of course it will produce completely nonsensical results.
I don't know why you are declaring all your variables at the start of the function - that is prone to this kind of mistake. If you declared i, j, sum and offset (and the unused tID) in the smallest possible scopes instead, you would never have had this problem, because they would be thread-private automatically in that case.
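As a minimal sketch of that suggestion (assuming C99 or later, so the loop counters can be declared inside the for statements; this is not the poster's exact code), the parallel part could look like:
#pragma omp parallel for shared(mA, vB, vX)
for (int i = 0; i < r; i++) {
    int sum = 0;                       // declared inside the loop, so private to each thread
    for (int j = 0; j < c; j++) {
        sum += mA[i * c + j] * vX[j];  // offset computed inline, no shared temporary
    }
    vB[i] = sum;
}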
I'm trying to do some multithreaded high-performance C matrix multiplication. The code below is the program I wrote in C. It works fine when the number of cores is 12 (since my PC has 12 threads, or when I manually fix it to 12), but when I switch it to a lower value (like 10, for example) it gives me strange results. Does anyone have an idea of what the problem could be?
It was tested and works perfectly with 12 cores (or threads, call them whatever you want); with a lower number of cores it doesn't work anymore (it looks like it ends the execution almost immediately).
I tried different values, but it looks like there is an error in the code that I can't figure out.
The error shows up with large matrices, but sometimes also with small ones.
//
// Created by christian on 06/09/2019.
//
#pragma GCC optimize("O3", "unroll-loops", "omit-frame-pointer", "inline") //Optimization flags
#pragma GCC option("arch=native", "tune=native", "no-zero-upper") //Enable AVX
#pragma GCC target("avx") //Enable AVX
#include <time.h> // for clock_t, clock(), CLOCKS_PER_SEC
#include <sys/time.h>
#include <stdio.h> //AVX/SSE Extensions are included in stdio.h
#include <unistd.h>
#include <stdlib.h>
#include <pthread.h>
int ops = 0;
//define matrix size (in this case we'll use a square matrix)
#define DIM 200 //DO NOT EXCEED 10000 (modification to the stack size needed)
float matrix[DIM][DIM];
float result_matrix[DIM][DIM];
float *matrix_ptr = (float *) &matrix;
float *result_ptr = (float *) &result_matrix;
// set the number of logical cores to 1 (just in case the auto-detection doesn't work properly)
int cores = 1;
//functions prototypes
void single_multiply(int row);
void *thread_multiply(void *offset);
int detect_number_of_cores();
void fill_matrix();
int main() {
//two instructions needed for pseudo-random float numbers
srand((unsigned int) time(NULL));
//detect the number of active cores
cores = detect_number_of_cores();
//matrix filling with random float values
fill_matrix();
printf("------------- MATRIX MULTIPLICATION -------------\n");
printf("--- multi-thread (vectorization enabled) v1.0 ---\n");
// printf("\n ORIGINAL MATRIX");
// for(int c=0; c<DIM; c++){
// printf("\n");
// for(int k=0; k<DIM; k++){
// printf("%f \t", matrix[c][k]);
// }
// }
//uncomment and modify this value to force a particular number of threads (not recommended)
//cores = 4;
printf("\n Currently using %i cores", cores);
printf("\n Matrix size: %i x %i", DIM, DIM);
//time detection struct declaration
struct timeval start, end;
gettimeofday(&start, NULL);
//decisional tree for the number of threads to be used
if (cores == 0 || cores == 1 || cores > DIM) {
//passing 0 because it has to start from the first row
single_multiply(0);
//this value may not be correct if matrix size exceeds 80x80 due to thread lock problems
printf("\n Total multiply ops: %i", ops);
gettimeofday(&end, NULL);
long seconds = (end.tv_sec - start.tv_sec);
long micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
printf("\n\n Time elapsed is %d seconds and %d micros\n", seconds, micros);
return 0;
} else {
//split the matrix in more parts (as much as the number of active cores)
int rows_por_thread = DIM / cores;
printf("\n Rows por Thread: %i", rows_por_thread);
//calculate the rest of the division (if there is one obviously)
int rest = DIM % cores;
printf("\n Rest: %i \n", rest);
if (rest == 0) {
//execute just the multi-thread function n times
int times = rows_por_thread;
//create an array of thread-like objects
pthread_t threads[cores];
//create an array with the arguments for each thread
int thread_args[cores];
//launching the threads according to the available cores
int i = 0;
int error;
for (int c = 0; c < DIM; c += rows_por_thread) {
thread_args[i] = c;
i++;
}
for (int c = 0; c < cores; c++) {
error = pthread_create(&threads[c], NULL, thread_multiply, (void *) &thread_args[c]);
if (error != 0) {
printf("\n Error in thread %i creation, exiting...", c);
}
printf("created thread n %i with argument: %i \n", c, thread_args[c]);
}
printf("\n ... working ...");
for (int c = 0; c < cores; c++) {
pthread_join(threads[i], NULL);
printf("\n Waiting to join thread n: %i", c);
}
} else {
//THE PROBLEM MUST BE INSIDE THIS ELSE STATEMENT
//execute the multi-thread function n times and the single function th rest remaining times
printf("\n The number of cores is NOT a divisor of the size of the matrix. \n");
//create an array of thread-like objects
pthread_t threads[cores];
//create an array with the arguments for each thread
int thread_args[cores];
//launching the threads according to the available cores
int i = 0; //counter for the thread ID
int entrypoint_residual_rows = 0; //first unprocessed residual row
//launching the threads according to the available coreS
for (int c = 0; c < DIM; c += rows_por_thread) {
thread_args[i] = c;
i++;
}
entrypoint_residual_rows = cores * rows_por_thread;
int error;
//launch the threads
for (int c = 0; c < cores; c++) {
error = pthread_create(&threads[c], NULL, thread_multiply, (void *) &thread_args[c]);
if (error != 0) {
printf("\n Error in thread %i creation, exiting...", c);
}
printf("created thread n %i with argument: %i \n", c, thread_args[c]);
}
printf("\n ... working ...\n");
//join all the previous generated threads
for (int c = 0; c < cores; c++) {
pthread_join(threads[i], NULL);
printf("\n Waiting to join thread n: %i", c);
}
printf("\n entry-point index for the single function %i ", entrypoint_residual_rows);
single_multiply(entrypoint_residual_rows);
}
}
// printf("\n MULTIPLIED MATRIX");
// for (int c = 0; c < DIM; c++) {
// printf("\n");
// for (int k = 0; k < DIM; k++) {
// printf("%f \t", result_matrix[c][k]);
// }
// }
gettimeofday(&end, NULL);
printf("\n All threads joined correctly");
long seconds = (end.tv_sec - start.tv_sec);
long micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
printf("\n\n Time elapsed is %d seconds and %d micros\n", seconds, micros);
//this value may not be correct if matrix size exceeds 80x80 due to thread lock problems
printf("\n Total multiply ops: %i", ops);
return 0;
}
//detect number of cores of the CPU (logical cores)
int detect_number_of_cores() {
return (int) sysconf(_SC_NPROCESSORS_ONLN); // Get the number of logical CPUs.
}
//matrix filling function
void fill_matrix() {
float a = 5.0;
for (int c = 0; c < DIM; c++)
for (int d = 0; d < DIM; d++) {
matrix[c][d] = (float) rand() / (float) (RAND_MAX) * a;
}
}
//row by row multiplication algorithm (mono-thread version)
void single_multiply(int row) {
for (int i = row; i < DIM; i++) {
for (int j = 0; j < DIM; j++) {
*(result_ptr + i * DIM + j) = 0;
ops++;
for (int k = 0; k < DIM; k++) {
*(result_ptr + i * DIM + j) += *(matrix_ptr + i * DIM + k) * *(matrix_ptr + k * DIM + j);
}
}
}
}
//thread for the multiplication algorithm
void *thread_multiply(void *offset) {
//de-reference the parameter passed by the main-thread
int *row_offset = (int *) offset;
//multiplication loops
for (int i = *row_offset; i < (*row_offset + (DIM / cores)); i++) {
for (int j = 0; j < DIM; j++) {
*(result_ptr + i * DIM + j) = 0;
ops++;
for (int k = 0; k < DIM; k++) {
*(result_ptr + i * DIM + j) += *(matrix_ptr + i * DIM + k) * *(matrix_ptr + k * DIM + j);
}
}
}
return NULL;
}
this is the way the result looks (also the number of ops in the result should be equal to size x size)
------------- MATRIX MULTIPLICATION -------------
--- multi-thread (vectorization enabled) v1.0 ---
Currently using 4 cores
Matrix size: 200 x 200
Rows por Thread: 50
Rest: 0
created thread n 0 with argument: 0
created thread n 1 with argument: 50
created thread n 2 with argument: 100
created thread n 3 with argument: 150
... working ...
Waiting to join thread n: 0
Waiting to join thread n: 1
Waiting to join thread n: 2
Waiting to join thread n: 3
All threads joined correctly
Time elapsed is 0 seconds and 804 micros
Total multiply ops: 2200
Process finished with exit code 0
This pthread_join here looks extremely fishy -- observe how the loop variable is c, but you index the array on i:
for (int c = 0; c < cores; c++) {
pthread_join(threads[i], NULL);
printf("\n Waiting to join thread n: %i", c);
}
I doubt it's doing the right thing.
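For reference, a corrected version would index the array with the loop variable c (this is also what the updated code further down does):
for (int c = 0; c < cores; c++) {
    pthread_join(threads[c], NULL);
    printf("\n Waiting to join thread n: %i", c);
}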
in thread_multiply, the unadorned line:
ops++;
looks a bit suspicious. Did you not say you were running multiple instances of these threads?
As a general comment, you should look to have your functions a bit better defined; for example if you changed your single_multiply to be:
int single_multiply(int RowStart, int RowEnd) {
int ops = 0;
....
return ops;
}
then
void *thread_multiply(void *p) {
int *rows = p;
int ops;
ops = single_multiply(rows[0], rows[1]);
return (void *)ops;
}
you have:
reduced the bit of code that cares about things like 'cores' to the only bit that matters about them.
removed contention on the counter (you can collect the per-thread counts in pthread_join; see the sketch after this list)
removed the redundant, nearly identical code.
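As a rough sketch of that second point (assuming thread_multiply returns its local count cast to void * as suggested above; intptr_t comes from <stdint.h> and makes the round trip through the pointer well defined), the main thread could collect the counts at join time:
long total_ops = 0;
for (int c = 0; c < cores; c++) {
    void *ret;
    pthread_join(threads[c], &ret);
    total_ops += (long) (intptr_t) ret;  // each thread returned its own ops count
}
printf("\n Total multiply ops: %ld", total_ops);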
Thank you all. This is what it looks like now. Honestly, I would have expected better performance, but at least it looks like it's working. Does anyone have ideas for performance improvements I could make?
//
// Created by christian on 06/09/2019.
//
#pragma GCC optimize("O3", "unroll-loops", "omit-frame-pointer", "inline") //Optimization flags
#pragma GCC option("arch=native", "tune=native", "no-zero-upper") //Enable AVX
#pragma GCC target("avx") //Enable AVX
#include <time.h> // for clock_t, clock(), CLOCKS_PER_SEC
#include <sys/time.h>
#include <stdio.h> //AVX/SSE Extensions are included in stdio.h
#include <unistd.h>
#include <stdlib.h>
#include <pthread.h>
//define matrix size (in this case we'll use a square matrix)
#define DIM 4000 //DO NOT EXCEED 10000 (modification to the stack size needed)
float matrix[DIM][DIM];
float result_matrix[DIM][DIM];
float *matrix_ptr = (float *) &matrix;
float *result_ptr = (float *) &result_matrix;
// set the number of logical cores to 1 (just in case the auto-detection doesn't work properly)
int cores = 1;
//functions prototypes
void single_multiply(int rowStart, int rowEnd);
void *thread_multiply(void *offset);
int detect_number_of_cores();
void fill_matrix();
int main() {
//two instructions needed for pseudo-random float numbers
srand((unsigned int) time(NULL));
//detect the number of active cores
cores = detect_number_of_cores();
//matrix filling with random float values
fill_matrix();
printf("------------- MATRIX MULTIPLICATION -------------\n");
printf("--- multi-thread (vectorization enabled) v1.0 ---\n");
// printf("\n ORIGINAL MATRIX");
// for(int c=0; c<DIM; c++){
// printf("\n");
// for(int k=0; k<DIM; k++){
// printf("%f \t", matrix[c][k]);
// }
// }
//uncomment and modify this value to force a particular number of threads (not recommended)
//cores = 4;
printf("\n Currently using %i cores", cores);
printf("\n Matrix size: %i x %i", DIM, DIM);
//time detection struct declaration
struct timeval start, end;
gettimeofday(&start, NULL);
//decisional tree for the number of threads to be used
if (cores == 0 || cores == 1 || cores > DIM) {
//passing 0 because it has to start from the first row
single_multiply(0, DIM);
gettimeofday(&end, NULL);
long seconds = (end.tv_sec - start.tv_sec);
long micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
printf("\n\n Time elapsed is %ld seconds and %ld micros\n", seconds, micros);
return 0;
} else {
//split the matrix in more parts (as much as the number of active cores)
int rows_por_thread = DIM / cores;
printf("\n Rows por Thread: %i", rows_por_thread);
//calculate the rest of the division (if there is one obviously)
int rest = DIM % cores;
printf("\n Rest: %i \n", rest);
if (rest == 0) {
//execute just the multi-thread function n times
int times = rows_por_thread;
//create an array of thread-like objects
pthread_t threads[cores];
//create an array with the arguments for each thread
int thread_args[cores];
//launching the threads according to the available cores
int i = 0;
int error;
for (int c = 0; c < DIM; c += rows_por_thread) {
thread_args[i] = c;
i++;
}
for (int c = 0; c < cores; c++) {
error = pthread_create(&threads[c], NULL, thread_multiply, (void *) &thread_args[c]);
if (error != 0) {
printf("\n Error in thread %i creation", c);
}
printf("created thread n %i with argument: %i \n", c, thread_args[c]);
}
printf("\n ... working ...");
for (int c = 0; c < cores; c++) {
error = pthread_join(threads[c], NULL);
if (error != 0) {
printf("\n Error in thread %i join", c);
}
printf("\n Waiting to join thread n: %i", c);
}
} else {
//THE PROBLEM MUST BE INSIDE THIS ELSE STATEMENT
//execute the multi-thread function n times and the single function th rest remaining times
printf("\n The number of cores is NOT a divisor of the size of the matrix. \n");
//create an array of thread-like objects
pthread_t threads[cores];
//create an array with the arguments for each thread
int thread_args[cores];
//launching the threads according to the available cores
int i = 0; //counter for the thread ID
int entrypoint_residual_rows = 0; //first unprocessed residual row
//launching the threads according to the available coreS
for (int c = 0; c < DIM - rest; c += rows_por_thread) {
thread_args[i] = c;
i++;
}
entrypoint_residual_rows = cores * rows_por_thread;
int error;
//launch the threads
for (int c = 0; c < cores; c++) {
error = pthread_create(&threads[c], NULL, thread_multiply, (void *) &thread_args[c]);
if (error != 0) {
printf("\n Error in thread %i creation, exiting...", c);
}
printf("created thread n %i with argument: %i \n", c, thread_args[c]);
}
printf("\n ... working ...\n");
//join all the previous generated threads
for (int c = 0; c < cores; c++) {
pthread_join(threads[c], NULL);
printf("\n Waiting to join thread n: %i", c);
}
printf("\n entry-point index for the single function %i ", entrypoint_residual_rows);
single_multiply(entrypoint_residual_rows, DIM);
}
}
// printf("\n MULTIPLIED MATRIX");
// for (int c = 0; c < DIM; c++) {
// printf("\n");
// for (int k = 0; k < DIM; k++) {
// printf("%f \t", result_matrix[c][k]);
// }
// }
gettimeofday(&end, NULL);
printf("\n All threads joined correctly");
long seconds = (end.tv_sec - start.tv_sec);
long micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
printf("\n\n Time elapsed is %d seconds and %d micros\n", seconds, micros);
return 0;
}
//detect number of cores of the CPU (logical cores)
int detect_number_of_cores() {
return (int) sysconf(_SC_NPROCESSORS_ONLN); // Get the number of logical CPUs.
}
//matrix filling function
void fill_matrix() {
float a = 5.0;
for (int c = 0; c < DIM; c++)
for (int d = 0; d < DIM; d++) {
matrix[c][d] = (float) rand() / (float) (RAND_MAX) * a;
}
}
//row by row multiplication algorithm (mono-thread version)
void single_multiply(int rowStart, int rowEnd) {
for (int i = rowStart; i < rowEnd; i++) {
//printf("\n %i", i);
for (int j = 0; j < DIM; j++) {
*(result_ptr + i * DIM + j) = 0;
for (int k = 0; k < DIM; k++) {
*(result_ptr + i * DIM + j) += *(matrix_ptr + i * DIM + k) * *(matrix_ptr + k * DIM + j);
}
}
}
}
//thread for the multiplication algorithm
void *thread_multiply(void *offset) {
//de-reference the parameter passed by the main-thread
int *row_offset = (int *) offset;
printf(" Starting at line %i ending at line %i \n ", *row_offset, *row_offset + (DIM / cores));
single_multiply(*row_offset, *row_offset + (DIM / cores));
printf("\n ended at line %i", *row_offset + (DIM / cores));
return NULL;
}
I'm attempting to implement concurrent kernel launches for a very complex CUDA kernel, so I thought I'd start out with a simple example. It just launches a kernel which does a sum reduction. Simple enough. Here it is:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cuda.h>
extern __shared__ char dsmem[];
__device__ double *scratch_space;
__device__ double NDreduceSum(double *a, unsigned short length)
{
const int tid = threadIdx.x;
unsigned short k = length;
double *b;
b = scratch_space;
for (int i = tid; i < length; i+= blockDim.x)
b[i] = a[i];
__syncthreads();
do {
k = (k + 1) / 2;
if (tid < k && tid + k < length)
b[tid] += b[tid + k];
length = k;
__syncthreads();
} while (k > 1);
return b[0];
}
__device__ double reduceSum(double *a, unsigned short length)
{
const int tid = threadIdx.x;
unsigned short k = length;
do
{
k = (k + 1) / 2;
if (tid < k && tid + k < length)
a[tid] += a[tid + k];
length = k;
__syncthreads();
}
while (k > 1);
return a[0];
}
__global__ void kernel_thing(double *ad, int size)
{
double sum_1, sum_2, sum_3;
time_t begin, end, t1, t2, t3;
scratch_space = (double *) &dsmem[0];
for (int j = 0; j < 1000000; j++) {
begin = clock();
sum_1 = NDreduceSum(ad, size);
end = clock();
}
__syncthreads();
t1 = end - begin;
begin = clock();
sum_2 = 0;
if (threadIdx.x == 0) {
for (int i = 0; i < size; i++) {
sum_2 += ad[i];
}
}
__syncthreads();
end = clock();
t2 = end - begin;
__syncthreads();
begin = clock();
sum_3 = reduceSum(ad, size);
end = clock();
__syncthreads();
t3 = end - begin;
if (threadIdx.x == 0) {
printf("Sum found: %lf and %lf and %lf. In %ld and %ld and %ld ticks.\n", sum_1, sum_2, sum_3, t1, t2, t3);
}
}
int main(int argc, char **argv)
{
int i;
const int size = 512;
double *a, *ad, *b, *bd;
double sum_a, sum_b;
cudaStream_t stream_a, stream_b;
cudaError_t result;
cudaEvent_t a_start, a_stop, b_start, b_stop;
a = (double *) malloc(sizeof(double) * size);
b = (double *) malloc(sizeof(double) * size);
srand48(time(0));
for (i = 0; i < size; i++) {
a[i] = drand48();
}
for (i = 0; i < size; i++) {
b[i] = drand48();
}
sum_a = 0;
for (i = 0; i < size; i++) {
sum_a += a[i];
}
sum_b = 0;
for (i = 0; i < size; i++) {
sum_b += b[i];
}
printf("Looking for sum_a %lf\n", sum_a);
printf("Looking for sum_b %lf\n", sum_b);
cudaEventCreate(&a_start);
cudaEventCreate(&b_start);
cudaEventCreate(&a_stop);
cudaEventCreate(&b_stop);
cudaMalloc((void **) &ad, sizeof(double) * size);
cudaMalloc((void **) &bd, sizeof(double) * size);
result = cudaStreamCreate(&stream_a);
result = cudaStreamCreate(&stream_b);
result = cudaMemcpyAsync(ad, a, sizeof(double) * size, cudaMemcpyHostToDevice, stream_a);
result = cudaMemcpyAsync(bd, b, sizeof(double) * size, cudaMemcpyHostToDevice, stream_b);
cudaEventRecord(a_start);
kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
cudaEventRecord(a_stop);
cudaEventRecord(b_start);
kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
cudaEventRecord(b_stop);
result = cudaMemcpyAsync(a, ad, sizeof(double) * size, cudaMemcpyDeviceToHost, stream_a);
result = cudaMemcpyAsync(b, bd, sizeof(double) * size, cudaMemcpyDeviceToHost, stream_b);
cudaEventSynchronize(a_stop);
cudaEventSynchronize(b_stop);
float a_ms = 0;
float b_ms = 0;
cudaEventElapsedTime(&a_ms, a_start, a_stop);
cudaEventElapsedTime(&b_ms, b_start, b_stop);
printf("%lf ms for A.\n", a_ms);
printf("%lf ms for B.\n", b_ms);
result = cudaStreamDestroy(stream_a);
result = cudaStreamDestroy(stream_b);
if (result != cudaSuccess) {
printf("I should probably do this after each important operation.\n");
}
/*
printf("Matrix after:\n");
for (i = 0; i < size; i++) {
printf("%lf ", a[i]);
}
printf("\n");
*/
free(a);
free(b);
cudaFree(ad);
cudaFree(bd);
return 0;
}
Compiled like so:
CFLAGS = -arch sm_35
CC = nvcc
all: parallel
parallel: parallel.cu
$(LINK.c) $^ -o $@
clean:
rm -f *.o core parallel
I'm using a single Tesla K20X.
When I run this simple example, I get the following output:
Looking for sum_a 247.983945
Looking for sum_b 248.033749
Sum found: 247.983945 and 247.983945 and 247.983945. In 3242 and 51600 and 4792 ticks.
Sum found: 248.033749 and 248.033749 and 248.033749. In 3314 and 52000 and 4497 ticks.
4645.079102 ms for A.
4630.725098 ms for B.
Application 577759 resources: utime ~8s, stime ~2s, Rss ~82764, inblocks ~406, outblocks ~967
So, as you can see, each of the kernels gets the correct results and takes around 4.5 s, which is what I got in an earlier one-kernel version. Great! However, as you can see from the aprun output, the wall time is actually around 10 s, which is much more than the one-kernel version. So, it looks like the kernels are either not launching in parallel, or I'm not getting nearly the speed-up (2x) that I was expecting from concurrent kernel launches.
To tl;dr this question:
Am I missing anything in my code example? Are the kernels actually launching in parallel?
What kind of speed-up should I expect with a Tesla K20X? Shouldn't the kernels run exactly in parallel, completing twice the work in the same time? How many kernels can I expect to run efficiently in parallel?
Thanks for your help.
The cudaEventRecord operations in between your kernels are causing serialization.
Right now the results you are getting:
4645.079102 ms for A.
4630.725098 ms for B.
are back-to-back due to this serialization.
Instead, just time the entire kernel launch sequence:
cudaEventRecord(a_start);
kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
cudaEventRecord(a_stop);
And I think you will see an elapsed time for (a_start, a_stop) that is roughly the same as one of your previous kernels (~4600ms) indicating more or less full concurrency. I used CUDA 6 RC, copied data back to the host rather than printf from kernel, and eliminated the cudaEventRecord operations between the kernel calls, and I got an overall execution time of ~4.8s. If I didn't modify the cudaEventRecord arrangement, instead my execution time was ~8.3s
A few other notes:
I wouldn't use printf from kernel when running tests like these.
You won't get overlap of compute and cudaMemcpyAsync when the host buffer is allocated with malloc. You need to use cudaHostAlloc (see the sketch after these notes).
I would start with running and understanding the concurrent kernels cuda sample first.
You may want to review the appropriate section of the programming guide
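As a small sketch of the cudaHostAlloc note above (an assumption about how it would slot into the program, not code from the original post), the malloc/free pair for the host buffers would be replaced with pinned allocations:
double *a, *b;
cudaHostAlloc((void **) &a, sizeof(double) * size, cudaHostAllocDefault);  // page-locked host memory
cudaHostAlloc((void **) &b, sizeof(double) * size, cudaHostAllocDefault);
/* ... fill a and b, launch kernels and cudaMemcpyAsync on stream_a / stream_b as before ... */
cudaFreeHost(a);  // instead of free(a)
cudaFreeHost(b);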
I am an OpenMP beginner, and I have come across the following problem.
I have a mask array M of length N, whose elements are either 0 or 1. I hope to extract all indices i that satisfy M[i]=1 and store them into a new array T.
Can this problem be accelerated by OpenMP?
I have tried the following code, but it is not effective performance-wise.
int count = 0;
#pragma omp parallel for
for(int i = 0; i < N; ++i) {
if(M[i] == hashtag) {
int pos = 0;
#pragma omp critical (c1)
pos = count++;
T[pos] = i;
}
}
I am not 100% sure this will be much better, but you could try the following:
int count = 0;
#pragma omp parallel for
for(int i = 0; i < N; ++i) {
if(M[i]) {
#pragma omp atomic
T[count++] = i;
}
}
If the array is quite sparse, threads will be able to zip through a lot of zeros without waiting for others. But you can only update one index at a time. The problem is really that different threads are writing to the same memory block (T), which means you will be running into issues of caching: every time one thread writes to T, the cache of all the other cores is "dirty" - so when they try to modify it, a lot of shuffling goes on behind the scenes. All this is transparent to you (you don't need to write code to handle it) but it slows things down significantly - I suspect that's your real bottleneck. If your matrix is big enough to make it worth your while, you might try to do the following:
Create as many arrays T as there are threads
Let each thread update its own version of T
Combine all the T arrays into one after the loops have completed
It might be faster (because the different threads don't write to the same memory) - but since there are so few statements inside the loop, I suspect it won't be.
EDIT I created a complete test program, and found two things. First, the atomic directive doesn't work in all versions of omp, and you may well have to use T[count++] += i; for it to even compile (which is OK since T can be set to all zeros initially); more troubling, you will not get the same answer twice if you do this (the final value of count changes from one pass to the next); if you use critical, that doesn't happen.
A second observation is that the speed of the program really slows down when you increase the number of threads, which confirms what I was suspecting about shared memory. Times for 10M elements processed:
threads elapsed
1 0.09s
2 0.73s
3 1.21s
4 1.62s
5 2.34s
You can see this is true by changing how sparse the matrix M is - when I create M as a random array and test for M[i] < 0.01 * RAND_MAX (0.1% dense matrix), things run much more quickly than if I make it 10% dense - showing that the part inside the critical section is really slowing us down.
That being the case, I don't think there is a way of speeding up this task in OMP - the job of consolidating the outputs of all the threads into a single list at the end is just going to eat up any speed advantage you may have had, given how little is going on inside the inner loop. So rather than using multiple threads, I suggest you rewrite the loop as efficiently as possible - for example:
for( i = 0; i < N; i++) {
T[count] = i;
count += M[i];
}
In my quick benchmark, this was faster than the OMP solution - comparable with the threads = 1 solution. Again - this is because of the way memory is being accessed here. Note that I avoid using an if statement - this keeps the code as fast as possible. Instead, I take advantage of the fact that M[i] is always zero or one. At the end of the loop you have to discard the element T[count] because it will be invalid... the "good elements" are T[0] ... T[count-1]. An array M with 10M elements was processed by this loop in ~ 0.02 sec on my machine. Should be sufficient for most purposes?
Based on Floris's fast function, I tried to see if I could find a faster solution with OpenMP. I came up with two functions, foo_v2 and foo_v3, which are faster for larger arrays; foo_v2 is faster independent of density, and foo_v3 is faster for sparser arrays. The function foo_v2 essentially creates a 2D array with width N*nthreads as well as an array counta which contains the counts for each thread. This is better explained with code. The following code would loop over all the elements written out to T:
for(int ithread=0; ithread<nthreads; ithread++) {
for(int i=0; i<counta[ithread]; i++) {
T[ithread*N/nthreads + i]
}
}
The function foo_v3 creates a 1D array as requested. In all cases N has to be pretty large to overcome the OpenMP overhead. The code below defaults to 256MB with a density of M of about 10%. The OpenMP functions are both faster by over a factor of 2 on my 4-core Sandy Bridge system. If you put the density at 50%, foo_v2 is still faster by about a factor of 2, but foo_v3 is no longer faster.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
int foo_v1(int *M, int *T, const int N) {
int count = 0;
for(int i = 0; i<N; i++) {
T[count] = i;
count += M[i];
}
return count;
}
int foo_v2(int *M, int *T, int *&counta, const int N) {
int nthreads;
#pragma omp parallel
{
nthreads = omp_get_num_threads();
const int ithread = omp_get_thread_num();
#pragma omp single
counta = new int[nthreads];
int count_private = 0;
#pragma omp for
for(int i = 0; i<N; i++) {
T[ithread*N/nthreads + count_private] = i;
count_private += M[i];
}
counta[ithread] = count_private;
}
return nthreads;
}
int foo_v3(int *M, int *T, const int N) {
int count = 0;
int *counta = 0;
#pragma omp parallel reduction(+:count)
{
const int nthreads = omp_get_num_threads();
const int ithread = omp_get_thread_num();
#pragma omp single
{
counta = new int[nthreads+1];
counta[0] = 0;
}
int *Tprivate = new int[N/nthreads];
int count_private = 0;
#pragma omp for nowait
for(int i = 0; i<N; i++) {
Tprivate[count_private] = i;
count_private += M[i];
}
counta[ithread+1] = count_private;
count += count_private;
#pragma omp barrier
int offset = 0;
for(int i=0; i<(ithread+1); i++) {
offset += counta[i];
}
for(int i=0; i<count_private; i++) {
T[offset + i] = Tprivate[i];
}
delete[] Tprivate;
}
delete[] counta;
return count;
}
void compare(const int *T1, const int *T2, const int N, const int count, const int *counta, const int nthreads) {
int diff = 0;
int n = 0;
for(int ithread=0; ithread<nthreads; ithread++) {
for(int i=0; i<counta[ithread]; i++) {
int i2 = N*ithread/nthreads+i;
//printf("%d %d\n", T1[n], T2[i2]);
int tmp = T1[n++] - T2[i2];
if(tmp<0) tmp*=-1;
diff += tmp;
}
}
printf("diff %d\n", diff);
}
void compare_v2(const int *T1, const int *T2, const int count) {
int diff = 0;
int n = 0;
for(int i=0; i<count; i++) {
int tmp = T1[i] - T2[i];
//if(tmp!=0) printf("%i %d %d\n", i, T1[i], T2[i]);
if(tmp<0) tmp*=-1;
diff += tmp;
}
printf("diff %d\n", diff);
}
int main() {
const int N = 1 << 26;
printf("%f MB\n", 4.0*N/1024/1024);
int *M = new int[N];
int *T1 = new int[N];
int *T2 = new int[N];
int *T3 = new int[N];
int *counta;
double dtime;
for(int i=0; i<N; i++) {
M[i] = ((rand()%10)==0);
}
//int repeat = 10000;
int repeat = 1;
int count1, count2;
int nthreads;
dtime = omp_get_wtime();
for(int i=0; i<repeat; i++) count1 = foo_v1(M, T1, N);
dtime = omp_get_wtime() - dtime;
printf("time v1 %f\n", dtime);
dtime = omp_get_wtime();
for(int i=0; i<repeat; i++) nthreads = foo_v2(M, T2, counta, N);
dtime = omp_get_wtime() - dtime;
printf("time v2 %f\n", dtime);
compare(T1, T2, N, count1, counta, nthreads);
dtime = omp_get_wtime();
for(int i=0; i<repeat; i++) count2 = foo_v3(M, T3, N);
dtime = omp_get_wtime() - dtime;
printf("time v2 %f\n", dtime);
printf("count1 %d, count2 %d\n", count1, count2);
compare_v2(T1, T3, count1);
}
The critical operation should be atomic instead of critical; actually in your case you have to use the atomic capture clause:
int pos, count = 0; // pos declared outside the loop
#pragma omp parallel for private(pos) // and privatized, count is implicitly
for(int i = 0; i < N; ++i) { // shared by all the threads
if(M[i]) {
#pragma omp atomic capture
pos = count++;
T[pos] = i;
}
}
Take a look at this answer to have an overview of all the possibilities for atomic operations with OpenMP.
I am trying to use OpenMP to parallelize QuickSort, both the partition part and the recursive QuickSort part. My C code is as follows:
#include "stdlib.h"
#include "stdio.h"
#include "omp.h"
// parallel partition
int ParPartition(int *a, int p, int r) {
int b[r-p];
int key = *(a+r); // use the last element in the array as the pivot
int lt[r-p]; // mark 1 at the position where its element is smaller than the key, else 0
int gt[r-p]; // mark 1 at the position where its element is bigger than the key, else 0
int cnt_lt = 0; // count 1 in the lt array
int cnt_gt = 0; // count 1 in the gt array
int j=p;
int k = 0; // the position of the pivot
// deal with gt and lt array
#pragma omp parallel for
for ( j=p; j<r; ++j) {
b[j-p] = *(a+j);
if (*(a+j) < key) {
lt[j-p] = 1;
gt[j-p] = 0;
} else {
lt[j-p] = 0;
gt[j-p] = 1;
}
}
// calculate the new position of the elements
for ( j=0; j<(r-p); ++j) {
if (lt[j]) {
++cnt_lt;
lt[j] = cnt_lt;
} else
lt[j] = cnt_lt;
if (gt[j]) {
++cnt_gt;
gt[j] = cnt_gt;
} else
gt[j] = cnt_gt;
}
// move the pivot
k = lt[r-p-1];
*(a+p+k) = key;
// move elements to their new positon
#pragma omp parallel for
for ( j=p; j<r; ++j) {
if (b[j-p] < key)
*(a+p+lt[j-p]-1) = b[j-p];
else if (b[j-p] > key)
*(a+k+gt[j-p]) = b[j-p];
}
return (k+p);
}
void ParQuickSort(int *a, int p, int r) {
int q;
if (p<r) {
q = ParPartition(a, p, r);
#pragma omp parallel sections
{
#pragma omp section
ParQuickSort(a, p, q-1);
#pragma omp section
ParQuickSort(a, q+1, r);
}
}
}
int main() {
int a[10] = {5, 3, 8, 4, 0, 9, 2, 1, 7, 6};
ParQuickSort(a, 0, 9);
int i=0;
for (; i!=10; ++i)
printf("%d\t", a[i]);
printf("\n");
return 0;
}
For the example in the main function, the sorting result is:
0 9 9 2 2 2 6 7 7 7
I used gdb to debug. In the early recursions, all went well, but in some recursions it suddenly messed up and began producing duplicate elements, which then generates the result above.
Can someone help me figure out where the problem is?
I decided to post this answer because:
the accepted answer is wrong, and the user seems inactive these days. There is a race-condition on
#pragma omp parallel for
for(i = p; i < r; i++){
if(a[i] < a[r]){
lt[lt_n++] = a[i]; //<- race condition lt_n is shared
}else{
gt[gt_n++] = a[i]; //<- race condition gt_n is shared
}
}
Nonetheless, even if it were correct, the modern answer to this question is to use OpenMP tasks instead of sections.
I am providing the community with a full runnable example of this approach, including tests and profiling.
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#define TASK_SIZE 100
unsigned int rand_interval(unsigned int min, unsigned int max)
{
// https://stackoverflow.com/questions/2509679/
int r;
const unsigned int range = 1 + max - min;
const unsigned int buckets = RAND_MAX / range;
const unsigned int limit = buckets * range;
do
{
r = rand();
}
while (r >= limit);
return min + (r / buckets);
}
void fillupRandomly (int *m, int size, unsigned int min, unsigned int max){
for (int i = 0; i < size; i++)
m[i] = rand_interval(min, max);
}
void init(int *a, int size){
for(int i = 0; i < size; i++)
a[i] = 0;
}
void printArray(int *a, int size){
for(int i = 0; i < size; i++)
printf("%d ", a[i]);
printf("\n");
}
int isSorted(int *a, int size){
for(int i = 0; i < size - 1; i++)
if(a[i] > a[i + 1])
return 0;
return 1;
}
int partition(int * a, int p, int r)
{
int lt[r-p];
int gt[r-p];
int i;
int j;
int key = a[r];
int lt_n = 0;
int gt_n = 0;
for(i = p; i < r; i++){
if(a[i] < a[r]){
lt[lt_n++] = a[i];
}else{
gt[gt_n++] = a[i];
}
}
for(i = 0; i < lt_n; i++){
a[p + i] = lt[i];
}
a[p + lt_n] = key;
for(j = 0; j < gt_n; j++){
a[p + lt_n + j + 1] = gt[j];
}
return p + lt_n;
}
void quicksort(int * a, int p, int r)
{
int div;
if(p < r){
div = partition(a, p, r);
#pragma omp task shared(a) if(r - p > TASK_SIZE)
quicksort(a, p, div - 1);
#pragma omp task shared(a) if(r - p > TASK_SIZE)
quicksort(a, div + 1, r);
}
}
int main(int argc, char *argv[])
{
srand(123456);
int N = (argc > 1) ? atoi(argv[1]) : 10;
int print = (argc > 2) ? atoi(argv[2]) : 0;
int numThreads = (argc > 3) ? atoi(argv[3]) : 2;
int *X = malloc(N * sizeof(int));
int *tmp = malloc(N * sizeof(int));
omp_set_dynamic(0); /** Explicitly disable dynamic teams **/
omp_set_num_threads(numThreads); /** Use N threads for all parallel regions **/
// Dealing with fail memory allocation
if(!X || !tmp)
{
if(X) free(X);
if(tmp) free(tmp);
return (EXIT_FAILURE);
}
fillupRandomly (X, N, 0, 5);
double begin = omp_get_wtime();
#pragma omp parallel
{
#pragma omp single
quicksort(X, 0, N - 1); // last valid index, since partition() uses a[r] as the pivot
}
double end = omp_get_wtime();
printf("Time: %f (s) \n",end-begin);
assert(1 == isSorted(X, N));
if(print){
printArray(X, N);
}
free(X);
free(tmp);
return (EXIT_SUCCESS);
}
How to run:
This program accepts three parameters:
The size of the array;
Whether or not to print the array: 0 for no, otherwise yes;
The number of threads to run in parallel.
Mini Benchmark
On a 4-core machine, with input 100000:
1 Thread -> Time: 0.784504 (s)
2 Threads -> Time: 0.424008 (s) ~ speedup 1.85x
4 Threads -> Time: 0.282944 (s) ~ speedup 2.77x
I am sorry about my first comment; it is not relevant to your problem. I have not found the true cause of your issue (maybe your element-moving code has the problem). Following your approach, I wrote a similar program, and it works fine. (I am also new to OpenMP.)
#include <stdio.h>
#include <stdlib.h>
int partition(int * a, int p, int r)
{
int lt[r-p];
int gt[r-p];
int i;
int j;
int key = a[r];
int lt_n = 0;
int gt_n = 0;
#pragma omp parallel for
for(i = p; i < r; i++){
if(a[i] < a[r]){
lt[lt_n++] = a[i];
}else{
gt[gt_n++] = a[i];
}
}
for(i = 0; i < lt_n; i++){
a[p + i] = lt[i];
}
a[p + lt_n] = key;
for(j = 0; j < gt_n; j++){
a[p + lt_n + j + 1] = gt[j];
}
return p + lt_n;
}
void quicksort(int * a, int p, int r)
{
int div;
if(p < r){
div = partition(a, p, r);
#pragma omp parallel sections
{
#pragma omp section
quicksort(a, p, div - 1);
#pragma omp section
quicksort(a, div + 1, r);
}
}
}
int main(void)
{
int a[10] = {5, 3, 8, 4, 0, 9, 2, 1, 7, 6};
int i;
quicksort(a, 0, 9);
for(i = 0;i < 10; i++){
printf("%d\t", a[i]);
}
printf("\n");
return 0;
}
I've implemented parallel quicksort in a production environment, although with concurrent processes (i.e. fork() and join()) and not OpenMP. I also found a pretty good pthread solution, but a concurrent process solution was the best in terms of worst-case runtime. Let me start by saying that it doesn't seem like you're making copies of your input array for each thread, so you'll definitely encounter race conditions which can corrupt your data.
Essentially, what is happening is that you have created an array a in shared memory, and when you do a #pragma omp parallel sections, you spawn as many worker threads as there are #pragma omp section's. Each time a worker thread tries to access and modify elements of a, it executes a series of instructions: "read the n'th value of a from the given address", "modify the n'th value of a", "write the n'th value of a back to the given address". Since you have multiple threads with no locking or synchronization, the read, modify, and write instructions may be executed in any order by multiple processors, so the threads may overwrite each other's modifications or read a non-updated value.
The best solution that I found (after many weeks of testing and benchmarking many solutions that I came up with) is to subdivide the list log(n) times, where n is the number of processors. For example, if you have a quad core machine (n = 4), subdivide the list 2 times (log(4) = 2) choosing pivots that are the medians of the data set. It is important that the pivots are medians, because otherwise you can end up with a case where a poorly chosen pivot causes the lists to be distributed unevenly amongst processes. Then each process does quicksort on its local subarray, then merges its results with the results of other processes. This is called "hyperquicksort", and from an initial github search, I found this. I can't vouch for the code in there, and can't publish any of the code that I wrote since it is protected under an NDA.
By the way, one of the best parallel sorting algorithm is PSRS (Parallel Sorting by Regular Sampling), which keeps list sizes more balanced amongst processes, doesn't unnecessarily communicate keys between processes, and can work on an arbitrary number of concurrent processes (they don't necessarily have to be a power of 2).