I'm trying to do some multithreaded, high-performance matrix multiplication in C. The code below is the program I wrote: it works fine when the number of cores is 12 (my PC has 12 hardware threads, and it also works when I manually fix the value to 12), but switching it to a lower value (e.g. 10) gives me strange results. Does anyone have an idea of what the problem could be?
Tested and working perfectly with 12 cores (or threads, call them whatever you want); with a lower number of cores it doesn't work anymore (it looks like it ends execution almost immediately).
I tried with different values, but it looks like there is an error in the code that I can't figure out.
The error shows up with large matrices, but sometimes also with small ones.
//
// Created by christian on 06/09/2019.
//
#pragma GCC optimize("O3", "unroll-loops", "omit-frame-pointer", "inline") //Optimization flags
#pragma GCC option("arch=native", "tune=native", "no-zero-upper") //Enable AVX
#pragma GCC target("avx") //Enable AVX
#include <time.h> // for clock_t, clock(), CLOCKS_PER_SEC
#include <sys/time.h>
#include <stdio.h> //AVX/SSE Extensions are included in stdio.h
#include <unistd.h>
#include <stdlib.h>
#include <pthread.h>
int ops = 0;
//define matrix size (in this case we'll use a square matrix)
#define DIM 200 //DO NOT EXCEED 10000 (modification to the stack size needed)
float matrix[DIM][DIM];
float result_matrix[DIM][DIM];
float *matrix_ptr = (float *) &matrix;
float *result_ptr = (float *) &result_matrix;
// set the number of logical cores to 1 (just in case the auto-detection doesn't work properly)
int cores = 1;
//functions prototypes
void single_multiply(int row);
void *thread_multiply(void *offset);
int detect_number_of_cores();
void fill_matrix();
int main() {
//two instructions needed for pseudo-random float numbers
srand((unsigned int) time(NULL));
//detect the number of active cores
cores = detect_number_of_cores();
//matrix filling with random float values
fill_matrix();
printf("------------- MATRIX MULTIPLICATION -------------\n");
printf("--- multi-thread (vectorization enabled) v1.0 ---\n");
// printf("\n ORIGINAL MATRIX");
// for(int c=0; c<DIM; c++){
// printf("\n");
// for(int k=0; k<DIM; k++){
// printf("%f \t", matrix[c][k]);
// }
// }
//uncomment and modify this value to force a particular number of threads (not recommended)
//cores = 4;
printf("\n Currently using %i cores", cores);
printf("\n Matrix size: %i x %i", DIM, DIM);
//time detection struct declaration
struct timeval start, end;
gettimeofday(&start, NULL);
//decisional tree for the number of threads to be used
if (cores == 0 || cores == 1 || cores > DIM) {
//passing 0 because it has to start from the first row
single_multiply(0);
//this value may not be correct if matrix size exceeds 80x80 due to thread lock problems
printf("\n Total multiply ops: %i", ops);
gettimeofday(&end, NULL);
long seconds = (end.tv_sec - start.tv_sec);
long micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
printf("\n\n Time elapsed is %d seconds and %d micros\n", seconds, micros);
return 0;
} else {
//split the matrix in more parts (as much as the number of active cores)
int rows_por_thread = DIM / cores;
printf("\n Rows por Thread: %i", rows_por_thread);
//calculate the rest of the division (if there is one obviously)
int rest = DIM % cores;
printf("\n Rest: %i \n", rest);
if (rest == 0) {
//execute just the multi-thread function n times
int times = rows_por_thread;
//create an array of thread-like objects
pthread_t threads[cores];
//create an array with the arguments for each thread
int thread_args[cores];
//launching the threads according to the available cores
int i = 0;
int error;
for (int c = 0; c < DIM; c += rows_por_thread) {
thread_args[i] = c;
i++;
}
for (int c = 0; c < cores; c++) {
error = pthread_create(&threads[c], NULL, thread_multiply, (void *) &thread_args[c]);
if (error != 0) {
printf("\n Error in thread %i creation, exiting...", c);
}
printf("created thread n %i with argument: %i \n", c, thread_args[c]);
}
printf("\n ... working ...");
for (int c = 0; c < cores; c++) {
pthread_join(threads[i], NULL);
printf("\n Waiting to join thread n: %i", c);
}
} else {
//THE PROBLEM MUST BE INSIDE THIS ELSE STATEMENT
//execute the multi-thread function n times and the single-thread function for the remaining rows
printf("\n The number of cores is NOT a divisor of the size of the matrix. \n");
//create an array of thread-like objects
pthread_t threads[cores];
//create an array with the arguments for each thread
int thread_args[cores];
//launching the threads according to the available cores
int i = 0; //counter for the thread ID
int entrypoint_residual_rows = 0; //first unprocessed residual row
//launching the threads according to the available cores
for (int c = 0; c < DIM; c += rows_por_thread) {
thread_args[i] = c;
i++;
}
entrypoint_residual_rows = cores * rows_por_thread;
int error;
//launch the threads
for (int c = 0; c < cores; c++) {
error = pthread_create(&threads[c], NULL, thread_multiply, (void *) &thread_args[c]);
if (error != 0) {
printf("\n Error in thread %i creation, exiting...", c);
}
printf("created thread n %i with argument: %i \n", c, thread_args[c]);
}
printf("\n ... working ...\n");
//join all the previous generated threads
for (int c = 0; c < cores; c++) {
pthread_join(threads[i], NULL);
printf("\n Waiting to join thread n: %i", c);
}
printf("\n entry-point index for the single function %i ", entrypoint_residual_rows);
single_multiply(entrypoint_residual_rows);
}
}
// printf("\n MULTIPLIED MATRIX");
// for (int c = 0; c < DIM; c++) {
// printf("\n");
// for (int k = 0; k < DIM; k++) {
// printf("%f \t", result_matrix[c][k]);
// }
// }
gettimeofday(&end, NULL);
printf("\n All threads joined correctly");
long seconds = (end.tv_sec - start.tv_sec);
long micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
printf("\n\n Time elapsed is %d seconds and %d micros\n", seconds, micros);
//this value may not be correct if matrix size exceeds 80x80 due to thread lock problems
printf("\n Total multiply ops: %i", ops);
return 0;
}
//detect number of cores of the CPU (logical cores)
int detect_number_of_cores() {
return (int) sysconf(_SC_NPROCESSORS_ONLN); // Get the number of logical CPUs.
}
//matrix filling function
void fill_matrix() {
float a = 5.0;
for (int c = 0; c < DIM; c++)
for (int d = 0; d < DIM; d++) {
matrix[c][d] = (float) rand() / (float) (RAND_MAX) * a;
}
}
//row by row multiplication algorithm (mono-thread version)
void single_multiply(int row) {
for (int i = row; i < DIM; i++) {
for (int j = 0; j < DIM; j++) {
*(result_ptr + i * DIM + j) = 0;
ops++;
for (int k = 0; k < DIM; k++) {
*(result_ptr + i * DIM + j) += *(matrix_ptr + i * DIM + k) * *(matrix_ptr + k * DIM + j);
}
}
}
}
//thread for the multiplication algorithm
void *thread_multiply(void *offset) {
//de-reference the parameter passed by the main-thread
int *row_offset = (int *) offset;
//multiplication loops
for (int i = *row_offset; i < (*row_offset + (DIM / cores)); i++) {
for (int j = 0; j < DIM; j++) {
*(result_ptr + i * DIM + j) = 0;
ops++;
for (int k = 0; k < DIM; k++) {
*(result_ptr + i * DIM + j) += *(matrix_ptr + i * DIM + k) * *(matrix_ptr + k * DIM + j);
}
}
}
return NULL;
}
This is what the output looks like (note that the number of ops in the result should be equal to size x size):
------------- MATRIX MULTIPLICATION -------------
--- multi-thread (vectorization enabled) v1.0 ---
Currently using 4 cores
Matrix size: 200 x 200
Rows por Thread: 50
Rest: 0
created thread n 0 with argument: 0
created thread n 1 with argument: 50
created thread n 2 with argument: 100
created thread n 3 with argument: 150
... working ...
Waiting to join thread n: 0
Waiting to join thread n: 1
Waiting to join thread n: 2
Waiting to join thread n: 3
All threads joined correctly
Time elapsed is 0 seconds and 804 micros
Total multiply ops: 2200
Process finished with exit code 0
This pthread_join here looks extremely fishy -- observe how the loop variable is c, but you index the array on i:
for (int c = 0; c < cores; c++) {
pthread_join(threads[i], NULL);
printf("\n Waiting to join thread n: %i", c);
}
I doubt it's doing the right thing.
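For reference, a corrected join loop would index on the loop variable (a sketch):
for (int c = 0; c < cores; c++) {
    pthread_join(threads[c], NULL);  /* join thread c, not threads[i] with a stale i */
    printf("\n Joined thread n: %i", c);
}
Note that by the time this loop runs, the earlier argument-filling loop has already incremented i up to cores, so threads[i] reads past the end of the array.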
In thread_multiply, the unadorned line:
ops++;
looks a bit suspicious. Did you not say you were running multiple instances of these threads?
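For what it's worth, the most direct patch is to serialize the increment (a sketch, assuming a file-scope mutex added next to ops; the refactor below removes the counter contention altogether and is the better fix):
pthread_mutex_t ops_mutex = PTHREAD_MUTEX_INITIALIZER;

/* ... inside the j loop, instead of the bare increment: */
pthread_mutex_lock(&ops_mutex);
ops++;
pthread_mutex_unlock(&ops_mutex);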
As a general comment, you should look to have your functions a bit better defined; for example if you changed your single_multiply to be:
int single_multiply(int RowStart, int RowEnd) {
int ops = 0;
....
return ops;
}
then
void *thread_multiply(void *p) {
int *rows = p;
int ops;
ops = single_multiply(rows[0], rows[1]);
return (void *)(intptr_t)ops; /* cast via intptr_t (from <stdint.h>) to avoid an int-to-pointer warning */
}
you have:
reduced the code that cares about things like 'cores' to the one place where it actually matters.
removed contention on the counter (you can collect the counts in pthread_join; see the sketch below).
removed the redundant, nearly identical code.
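The join loop in main can then gather the per-thread counts from the thread exit values; a minimal sketch (assuming the threads array from the code above, and intptr_t from <stdint.h>):
int total_ops = 0;
for (int c = 0; c < cores; c++) {
    void *ret;
    pthread_join(threads[c], &ret);
    total_ops += (int)(intptr_t)ret;  /* undo the cast made in thread_multiply */
}
printf("\n Total multiply ops: %i", total_ops);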
Thank you all, this is what it looks like now. Honestly I would have expected better performance, but at least it seems to be working. Does anyone have ideas for performance improvements I could make?
//
// Created by christian on 06/09/2019.
//
#pragma GCC optimize("O3", "unroll-loops", "omit-frame-pointer", "inline") //Optimization flags
#pragma GCC option("arch=native", "tune=native", "no-zero-upper") //Enable AVX
#pragma GCC target("avx") //Enable AVX
#include <time.h> // for clock_t, clock(), CLOCKS_PER_SEC
#include <sys/time.h>
#include <stdio.h> //AVX/SSE Extensions are included in stdio.h
#include <unistd.h>
#include <stdlib.h>
#include <pthread.h>
//define matrix size (in this case we'll use a square matrix)
#define DIM 4000 //DO NOT EXCEED 10000 (modification to the stack size needed)
float matrix[DIM][DIM];
float result_matrix[DIM][DIM];
float *matrix_ptr = (float *) &matrix;
float *result_ptr = (float *) &result_matrix;
// set the number of logical cores to 1 (just in case the auto-detection doesn't work properly)
int cores = 1;
//functions prototypes
void single_multiply(int rowStart, int rowEnd);
void *thread_multiply(void *offset);
int detect_number_of_cores();
void fill_matrix();
int main() {
//two instructions needed for pseudo-random float numbers
srand((unsigned int) time(NULL));
//detect the number of active cores
cores = detect_number_of_cores();
//matrix filling with random float values
fill_matrix();
printf("------------- MATRIX MULTIPLICATION -------------\n");
printf("--- multi-thread (vectorization enabled) v1.0 ---\n");
// printf("\n ORIGINAL MATRIX");
// for(int c=0; c<DIM; c++){
// printf("\n");
// for(int k=0; k<DIM; k++){
// printf("%f \t", matrix[c][k]);
// }
// }
//uncomment and modify this value to force a particular number of threads (not recommended)
//cores = 4;
printf("\n Currently using %i cores", cores);
printf("\n Matrix size: %i x %i", DIM, DIM);
//time detection struct declaration
struct timeval start, end;
gettimeofday(&start, NULL);
//decisional tree for the number of threads to be used
if (cores == 0 || cores == 1 || cores > DIM) {
//passing 0 because it has to start from the first row
single_multiply(0, DIM);
gettimeofday(&end, NULL);
long seconds = (end.tv_sec - start.tv_sec);
long micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
printf("\n\n Time elapsed is %ld seconds and %ld micros\n", seconds, micros);
return 0;
} else {
//split the matrix in more parts (as much as the number of active cores)
int rows_por_thread = DIM / cores;
printf("\n Rows por Thread: %i", rows_por_thread);
//calculate the rest of the division (if there is one obviously)
int rest = DIM % cores;
printf("\n Rest: %i \n", rest);
if (rest == 0) {
//execute just the multi-thread function n times
int times = rows_por_thread;
//create an array of thread-like objects
pthread_t threads[cores];
//create an array with the arguments for each thread
int thread_args[cores];
//launching the threads according to the available cores
int i = 0;
int error;
for (int c = 0; c < DIM; c += rows_por_thread) {
thread_args[i] = c;
i++;
}
for (int c = 0; c < cores; c++) {
error = pthread_create(&threads[c], NULL, thread_multiply, (void *) &thread_args[c]);
if (error != 0) {
printf("\n Error in thread %i creation", c);
}
printf("created thread n %i with argument: %i \n", c, thread_args[c]);
}
printf("\n ... working ...");
for (int c = 0; c < cores; c++) {
error = pthread_join(threads[c], NULL);
if (error != 0) {
printf("\n Error in thread %i join", c);
}
printf("\n Waiting to join thread n: %i", c);
}
} else {
//THE PROBLEM MUST BE INSIDE THIS ELSE STATEMENT
//execute the multi-thread function n times and the single-thread function for the remaining rows
printf("\n The number of cores is NOT a divisor of the size of the matrix. \n");
//create an array of thread-like objects
pthread_t threads[cores];
//create an array with the arguments for each thread
int thread_args[cores];
//launching the threads according to the available cores
int i = 0; //counter for the thread ID
int entrypoint_residual_rows = 0; //first unprocessed residual row
//launching the threads according to the available cores
for (int c = 0; c < DIM - rest; c += rows_por_thread) {
thread_args[i] = c;
i++;
}
entrypoint_residual_rows = cores * rows_por_thread;
int error;
//launch the threads
for (int c = 0; c < cores; c++) {
error = pthread_create(&threads[c], NULL, thread_multiply, (void *) &thread_args[c]);
if (error != 0) {
printf("\n Error in thread %i creation, exiting...", c);
}
printf("created thread n %i with argument: %i \n", c, thread_args[c]);
}
printf("\n ... working ...\n");
//join all the previous generated threads
for (int c = 0; c < cores; c++) {
pthread_join(threads[c], NULL);
printf("\n Waiting to join thread n: %i", c);
}
printf("\n entry-point index for the single function %i ", entrypoint_residual_rows);
single_multiply(entrypoint_residual_rows, DIM);
}
}
// printf("\n MULTIPLIED MATRIX");
// for (int c = 0; c < DIM; c++) {
// printf("\n");
// for (int k = 0; k < DIM; k++) {
// printf("%f \t", result_matrix[c][k]);
// }
// }
gettimeofday(&end, NULL);
printf("\n All threads joined correctly");
long seconds = (end.tv_sec - start.tv_sec);
long micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
printf("\n\n Time elapsed is %d seconds and %d micros\n", seconds, micros);
return 0;
}
//detect number of cores of the CPU (logical cores)
int detect_number_of_cores() {
return (int) sysconf(_SC_NPROCESSORS_ONLN); // Get the number of logical CPUs.
}
//matrix filling function
void fill_matrix() {
float a = 5.0;
for (int c = 0; c < DIM; c++)
for (int d = 0; d < DIM; d++) {
matrix[c][d] = (float) rand() / (float) (RAND_MAX) * a;
}
}
//row by row multiplication algorithm (mono-thread version)
void single_multiply(int rowStart, int rowEnd) {
for (int i = rowStart; i < rowEnd; i++) {
//printf("\n %i", i);
for (int j = 0; j < DIM; j++) {
*(result_ptr + i * DIM + j) = 0;
for (int k = 0; k < DIM; k++) {
*(result_ptr + i * DIM + j) += *(matrix_ptr + i * DIM + k) * *(matrix_ptr + k * DIM + j);
}
}
}
}
//thread for the multiplication algorithm
void *thread_multiply(void *offset) {
//de-reference the parameter passed by the main-thread
int *row_offset = (int *) offset;
printf(" Starting at line %i ending at line %i \n ", *row_offset, *row_offset + (DIM / cores));
single_multiply(*row_offset, *row_offset + (DIM / cores));
printf("\n ended at line %i", *row_offset + (DIM / cores));
return NULL;
}
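On the performance question: one classic improvement for row-major C is to interchange the two inner loops (i-k-j order) so that the innermost loop walks both matrix[k] and result_matrix[i] with unit stride, instead of striding down a column of matrix. A sketch of single_multiply with that change (same results, assuming the globals from the listing above; usually much friendlier to the cache and to auto-vectorization):
void single_multiply(int rowStart, int rowEnd) {
    for (int i = rowStart; i < rowEnd; i++) {
        for (int j = 0; j < DIM; j++)
            result_matrix[i][j] = 0;
        for (int k = 0; k < DIM; k++) {
            float a_ik = matrix[i][k];                      /* loaded once per k */
            for (int j = 0; j < DIM; j++)
                result_matrix[i][j] += a_ik * matrix[k][j]; /* unit-stride row walks */
        }
    }
}
Beyond that, blocking/tiling the loops and linking against an optimized BLAS (e.g. cblas_sgemm) are the usual next steps.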
Related
I am trying to write an application that calculates the L2 norm of the difference of two arrays. I need to parallelize my calculation.
Here is the code that I have parallelized:
double time_start_openmp = omp_get_wtime();
#pragma omp parallel for
for (i = 0; i < n; i++)
{
numberOfThreads = omp_get_num_threads();
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
time_end_openmp = omp_get_wtime();
l2_norm = sqrt(l2_norm);
openmp_exec_time = time_end_openmp - time_start_openmp;
printf("OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
I compile the code as:
gcc -fopenmp -g -ggdb -Wall -lm -o test test.c
I am running this code with 1 thread and with 32 threads. The output is the exact opposite of what I expected. Here is an example output:
[hayri#hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=32
[hayri#hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 32 0.001084 0.000000000000e+00
[hayri#hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=1
[hayri#hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 1 0.000106 0.000000000000e+00
Am I reading this wrong, or is using 32 threads 10 times slower than using 1 thread? So, what am I doing wrong here?
Here is my full code:
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <math.h>
#define MATSIZE 2000
static size_t totalMemUsage = 0;
size_t vectors_dot_prod(double *x, double *y, size_t n)
{
double res = 0.0;
size_t i;
for (i = 0; i < n; i++)
{
res += x[i] * y[i];
}
return res;
}
size_t vectors_dot_prod2(double *x, double *y, size_t n)
{
size_t res = 0.0;
size_t i = 0;
for (; i <= n - 4; i += 4)
{
res += (x[i] * y[i] +
x[i + 1] * y[i + 1] +
x[i + 2] * y[i + 2] +
x[i + 3] * y[i + 3]);
}
for (; i < n; i++)
{
res += x[i] * y[i];
}
return res;
}
void matrix_vector_mult(double **mat, double *vec, double *result, size_t rows, size_t cols)
{ // in matrix form: result = mat * vec;
size_t i;
for (i = 0; i < rows; i++)
{
result[i] = vectors_dot_prod2(mat[i], vec, cols);
}
}
double get_random()
{
double range = 1000;
double div = RAND_MAX / range;
double randomNumber = (rand() / div);
// printf("%d\n", randomNumber);
return randomNumber;
}
void print_2d_arr(double *arr, size_t row, size_t col)
{
size_t i, j, index;
for (i = 0; i < row; i++)
{
for (j = 0; j < col; j++)
{
index = i * col + j;
printf("%3f ", arr[index]);
}
printf("\n");
}
}
void print_1d_arr(double *arr, size_t row)
{
size_t i;
for (i = 0; i < row; i++)
{
printf("%f, ", arr[i]);
}
printf("\n");
}
size_t **fullfillArrayWithRandomNumbers(double *arr, size_t n)
{
/*
* Fulfilling the array with random numbers
* */
size_t i;
for (i = 0; i < n; i++)
{
arr[i] = get_random();
}
return 0;
}
double *allocarray1D(size_t size)
{
double *array = calloc(size, sizeof(double));
totalMemUsage = totalMemUsage + size * sizeof(double);
return array;
}
size_t ParallelRowMatrixVectorMultiply(size_t n, double *a, double *b, double *x, MPI_Comm comm)
{
size_t i, j;
size_t nlocal;
double *fb;
int npes, myrank;
MPI_Comm_size(comm, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
fb = (double *)malloc(n * sizeof(double));
nlocal = n / npes;
MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
for (i = 0; i < nlocal; i++)
{
x[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
x[i] += a[index] * fb[j];
}
}
free(fb);
return 0;
}
size_t ParallelRowMatrixVectorMultiply_WithoutAllgather(size_t n, double *a, double *b, double *x_partial, double *x, MPI_Comm comm)
{
// Process 0 sends b to everyone
MPI_Bcast(b, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
size_t i, j;
size_t nlocal;
// double *fb;
int npes, myrank;
MPI_Comm_size(comm, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
// fb = (double *)malloc(n * sizeof(double));
nlocal = n / npes;
// MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
for (i = 0; i < nlocal; i++)
{
x_partial[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
// printf("%f x %f\n", a[index], b[j]);
x_partial[i] += a[index] * b[j];
}
}
// free(b);
// Process 0 gathers x_partials to create x
MPI_Gather(x_partial, nlocal, MPI_DOUBLE, x, nlocal, MPI_DOUBLE, 0, MPI_COMM_WORLD);
return 0;
}
size_t SequentialMatrixMultiply(size_t n, double *a, double *b, double *x)
{
size_t i, j;
for (i = 0; i < n; i++)
{
x[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
// printf("%f x %f\n", a[index], b[j]);
x[i] += a[index] * b[j];
}
}
return 0;
}
int main(int argc, char *argv[])
{
// Global declarations
size_t i;
// MPI_Status status;
// Initialize the MPI environment
MPI_Init(&argc, &argv);
// Get the number of processes
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Get the rank of the process
int taskid;
MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
// Get the name of the processor
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);
if (argc != 2)
{
if (taskid == 0)
printf("Usage: %s <N>\n", argv[0]);
MPI_Finalize();
return 0;
}
srand(time(NULL) + taskid);
size_t n = atoi(argv[1]);
size_t nOverK = n / world_size;
double *a = allocarray1D(n * n);
double *b = allocarray1D(n);
double *x = allocarray1D(n);
double *x_partial = allocarray1D(nOverK);
double *xseq = allocarray1D(n);
double *a_partial = allocarray1D(n * nOverK);
if (a == NULL || b == NULL || x == NULL || xseq == NULL || x_partial == NULL)
{
if (taskid == 0)
printf("Allocation failed\n");
MPI_Finalize();
return 0;
}
// Process 0 creates A matrix.
if (taskid == 0)
{
fullfillArrayWithRandomNumbers(a, n * n);
// Process 0 produces the b
fullfillArrayWithRandomNumbers(b, n);
}
// Process 0 sends a_partial to everyone
if (!(world_size == 1 && n == 64000))
{
MPI_Scatter(a, n * nOverK, MPI_DOUBLE, a_partial, n * nOverK, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
double time_start = MPI_Wtime();
ParallelRowMatrixVectorMultiply_WithoutAllgather(n, a_partial, b, x_partial, x, MPI_COMM_WORLD);
double time_end = MPI_Wtime();
double parallel_exec_time = time_end - time_start;
double *exec_times = allocarray1D(world_size);
// Process 0 gathers x_partials to create x
MPI_Gather(&parallel_exec_time, 1, MPI_DOUBLE, exec_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// print_1d_arr(x, n);
if (taskid == 0)
{
SequentialMatrixMultiply(n, a, b, xseq);
// check difference between x and xseq using OpenMP
//print_1d_arr(exec_times, world_size);
// print_1d_arr(xseq, n);
double max_exec, min_exec, avg_exec;
min_exec = 1000;
for (i = 0; i < world_size; i++)
{
if (max_exec < exec_times[i])
{
max_exec = exec_times[i];
}
if (min_exec > exec_times[i])
{
min_exec = exec_times[i];
}
avg_exec += exec_times[i];
}
avg_exec = avg_exec / world_size;
long double time_start_openmp = omp_get_wtime();
long double time_end_openmp, openmp_exec_time, min_exec_time, max_exec_time, avg_exec_time;
max_exec_time = 0;
max_exec_time = 1000;
long double l2_norm = 0;
size_t numberOfThreads = 0;
size_t r = 0;
double *diff_vector = allocarray1D(n);
size_t nrepeat = 10000;
if (world_size == 1)
{
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
}
else
{
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
}
l2_norm = sqrt(l2_norm);
time_end_openmp = omp_get_wtime();
openmp_exec_time = time_end_openmp - time_start_openmp;
// print matrix size, number of processors, number of threads, time, time_openmp, L2 norm of difference of x and xseq (use %.12e while printing norm)
if (world_size == 1)
{
printf("OPENMP: %d %ld %Lf %.12e\n", n, numberOfThreads, openmp_exec_time, openmp_exec_time, l2_norm);
printf("NEW_OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
}
printf("MIN_AVG_MAX: %d %d %f %f %f\n", n, world_size, min_exec, max_exec, avg_exec);
printf("MPI: %d %d %f %.12Lf %.12e\n", n, world_size, max_exec, l2_norm, l2_norm);
totalMemUsage = totalMemUsage / (1024 * 1024 * 1024);
printf("TOTALMEMUSAGE: %zu\n", totalMemUsage);
//printf("process: %d %d %d %f %.12e\n", taskid, n, world_size, parallel_exec_time, l2_norm);
//printf("%d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
}
MPI_Finalize();
return 0;
}
Here is the output:
cn009
36
mpicc -fopenmp -g -ggdb -lm -o rowmv rowmv.c
OPENMP: 32000 1 0.000299 2.991110086441e-04
MIN_AVG_MAX: 32000 1 3.112523 3.112523 3.112523
MPI: 32000 1 3.112523 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 2 0.000535 5.350699648261e-04
MIN_AVG_MAX: 32000 1 3.125519 3.125519 3.125519
MPI: 32000 1 3.125519 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 4 0.000434 4.341900348663e-04
MIN_AVG_MAX: 32000 1 3.170650 3.170650 3.170650
MPI: 32000 1 3.170650 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 8 0.000454 4.542167298496e-04
MIN_AVG_MAX: 32000 1 3.168685 3.168685 3.168685
MPI: 32000 1 3.168685 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 16 0.000507 5.065393634140e-04
MIN_AVG_MAX: 32000 1 3.158761 3.158761 3.158761
MPI: 32000 1 3.158761 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 32 0.000875 8.752988651395e-04
MIN_AVG_MAX: 32000 1 3.166051 3.166051 3.166051
MPI: 32000 1 3.166051 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
Am I reading this wrong, or is using 32 threads 10 times slower than using 1
thread? So, what am I doing wrong here?
In the portion of code that is being both profiled and parallelized with OpenMP:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
there is a race condition, namely the access to the variable l2_norm. Moreover, you can drop the private(i), since the index variable (i.e., i) of the parallelized loop is implicitly made private by OpenMP. The race condition can be fixed with an OpenMP reduction. Furthermore, your loop is not actually distributing the iterations among threads as you intended: because you added the parallel clause again to that #pragma omp for, and assuming that you have nested parallelism disabled (which it is by default), each of the threads created in the outer parallel region will execute "sequentially" the code within that region, namely:
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
Hence, each thread will execute all N iterations of the loop that you intended to parallelize, which removes the parallelism and adds extra overhead (e.g., thread creation) to the sequential code. To fix these problems (i.e., the race condition and the "nested" parallel region), change this code to:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp for reduction(+:l2_norm)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
Now, having fixed those problems, you are still left with another problem (performance-wise): the parallel loop is being performed in the context of a hybrid OpenMP + MPI parallelization, and you did not explicitly bind the OpenMP threads (within the MPI processes) to the corresponding cores. Without that explicit binding, one cannot be sure on which cores those threads will end up. More often than not, having multiple threads running on the same logical core will increase the overall execution time of the application being parallelized.
If your application uses threads, then you probably want to ensure that you are either not bound at all (by specifying --bind-to none), or bound to multiple cores using an appropriate binding level or a specific number of processing elements per application process. You can solve this problem by either:
disabling the binding with the MPI flag --bind-to none, to enable threads to be assigned to different cores;
or bind the threads explicitly. Check this SO thread on how to map threads to cores in hybrid parallelizations such as MPI + OpenMP.
By explicitly setting the number of threads per process, you can avoid having multiple threads end up on the same core and fighting over the same resources.
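For example, a sketch assuming Open MPI's mpirun and the rowmv binary from the compile line in the question (the exact flag spelling depends on your MPI implementation):
export OMP_NUM_THREADS=8
mpirun -np 4 --bind-to none ./rowmv 32000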
Advice:
IMO you should first test the performance of OpenMP alone, without any MPI processes. In this context, test the scalability of the code by measuring the sequential version against 2 threads, then 4, 8, and so on, gradually increasing the number of threads. Eventually, there will be a number of threads at which the code simply stops scaling. Naturally, the amount of parallel work performed by the threads has to be big enough to overcome the overhead of parallelism, so you should also test with bigger and bigger inputs.
After having profiled, tested, and improved your OpenMP version, you can then extend that shared-memory parallelization to multiple processes using MPI.
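For example, a quick scaling sweep over thread counts might look like this (assuming bash and the test binary from the question):
for t in 1 2 4 8 16 32; do
    OMP_NUM_THREADS=$t ./test 10000
done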
Besides the race condition in updating a shared variable as noted in #dreamcrash's answer, your code is not distributing the work properly.
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
~~~~~~~~
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
The parallel construct in the inner loop makes it a nested combined parallel for construct. It means that each thread in the team executing the outer parallel loop spawns a brand new parallel region and distributes the i-loop over the threads in it. There is no distribution happening in the outer parallel region and you end up with N threads all repeating the exact same work. By default nested parallelism is disabled, so the nested parallel region runs sequentially and your code is effectively doing this:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
There is no distribution of work and all threads write to the same locations in the diff_vector[] array.
On one hand, this code in general is memory-bound, since the amount of computation per byte of data is low: modern CPUs can do many multiplications and subtractions per cycle, while fetching data from memory and writing results back there takes many cycles. Memory-bound problems don't get any faster with more threads, since the limiting factor is the memory bandwidth. This isn't that big of a problem in your case, because 32K array entries take up 256 KB of memory, which fits in most CPU caches; the L3 cache is blazing fast, but still slower than the fastest (and much smaller) L1 cache of a single CPU core. On the other hand, writing to the same memory areas from multiple threads results in true and false sharing, with the associated inter-thread cache invalidation, which usually results in the parallel code running way slower than the sequential version.
There are tools that can help you analyse the performance of your code and spot problems. As I already wrote in a comment, Intel VTune is one of them and is freely available as part of the oneAPI toolkit. Intel Inspector is another one (again, free and part of the oneAPI toolkit) and it finds problems such as data races. The two tools work very well together and I couldn't recommend them strongly enough to any aspiring parallel programmer.
There is also a minor race condition writing to numberOfThreads, but since all values written are the same, that isn't much of a logical problem. The correct version of the code in question should be:
#pragma omp parallel
{
#pragma omp master
numberOfThreads = omp_get_num_threads();
#pragma omp for reduction(+:l2_norm)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
I'm learning OpenMP and I'm trying to do a simple task: A[r][c] * X[c] = B[r] (matrix vector multiplication).
The problem is that the sequential code is faster than the parallel version, and I don't know why!
My code:
#include <omp.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/types.h>
// Defined variables
#define row_matriz_A 80000
#define col_matriz_A 800
#define THREADS_NUM 4
// FUNCTION - GENERATE MATRICES
void gerarMatrizes(int r, int c, int mA[], int vX[], int vB[]){...}
// FUNCTION - SEQUENTIAL MULTIPLICATION
void multSequencial(int r, int c, int mA[], int vX[], int vB[]){
// Variables
int i, j, offset, sum;
struct timeval tv1,tv2;
double t1, t2;
// Begin Time
gettimeofday(&tv1, NULL);
t1 = (double)(tv1.tv_sec) + (double)(tv1.tv_usec)/ 1000000.00;
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
offset = i * c + j;
sum += mA[offset] * vX[j];
}
vB[i] = sum;
}
// End time
gettimeofday(&tv2, NULL);
t2 = (double)(tv2.tv_sec) + (double)(tv2.tv_usec)/ 1000000.00;
printf("\nO tempo de execucao sequencial foi: %lf segundos.\n", (t2 - t1));
return;
}
// FUNCTION - PARALLEL MULTIPLICATION WITH OpenMP
void matvecHost(int r, int c, int mA[], int vX[], int vB[]){
// Variables
int tID, i, j, offset, sum;
struct timeval tv1, tv2;
double t1, t2;
// Init vB
for(i = 0; i < r; i++) vB[i] = 0;
// BEGIN Time
gettimeofday(&tv1, NULL);
t1 = (double)(tv1.tv_sec) + (double)(tv1.tv_usec)/ 1000000.00;
omp_set_num_threads(THREADS_NUM);
#pragma omp parallel private(tID, i, j) shared(mA, vB, vX)
{
tID = omp_get_thread_num();
#pragma omp for
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
offset = i * c + j;
sum += mA[offset] * vX[j];
}
vB[i] = sum;
}
}
// End time
gettimeofday(&tv2, NULL);
t2 = (double)(tv2.tv_sec) + (double)(tv2.tv_usec)/ 1000000.00;
printf("\nO tempo de execucao OpenMP foi: %lf segundos.\n", (t2 - t1));
return;
}
// FUNCTION - MAIN
int main(int argc, char * argv[]) {
int row, col;
row = row_matriz_A;
col = col_matriz_A;
int *matrizA = (int *)calloc(row * col, sizeof(int));
int *vectorX = (int *)calloc(col * 1, sizeof(int));
int *vectorB = (int *)calloc(row * 1, sizeof(int));
gerarMatrizes(row, col, matrizA, vectorX, vectorB);
multSequencial(row, col, matrizA, vectorX, vectorB);
matvecHost(row, col, matrizA, vectorX, vectorB);
return 0;
}
Previous solutions that did not work:
Using collapse on my nested for loops
Increasing the row and column sizes
Increasing the number of threads (a teacher recommended using a thread count equal to the number of physical threads)
Using malloc instead of m[i][j]
EDIT - ANSWER
My parallel block was corrected based on the accepted answer:
#pragma omp parallel private(i, j, sum) shared(mA, vB, vX)
{
#pragma omp for
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
sum += mA[i * c + j] * vX[j];
}
vB[i] = sum;
}
}
I still have a doubt:
If I define i, j and sum inside my parallel block, will they be set as private automatically? Does this improve the speed of my code or not?
You have race conditions on sum and offset - those are shared between the threads instead of being thread-private.
This also likely explains the slowdown: On x86, the CPU will actually work hard to make sure accesses to shared variables "work". This involves flushing cache lines after every (!) write to offset and sum - so all the threads are wildly writing into the same variables, but each one has to wait until the write from the previous thread (on a different core) has arrived in the local cache again after having been flushed. And of course it will produce completely nonsensical results.
I don't know why you are declaring all your variables at the start of the function; that's prone to exactly this kind of mistake. If you declared i, j, sum and offset (and the unused tID) in the smallest possible scopes instead, you would never have had this problem, because they would automatically be thread-private in that case.
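A minimal sketch of that style, with every loop variable declared at its point of use (each is then automatically private to the thread executing it):
#pragma omp parallel shared(mA, vB, vX)
{
    #pragma omp for
    for (int i = 0; i < r; i++) {   /* i declared in the loop: private per thread */
        int sum = 0;                /* so are sum and j below */
        for (int j = 0; j < c; j++) {
            sum += mA[i * c + j] * vX[j];
        }
        vB[i] = sum;
    }
}
This also answers the follow-up doubt above: yes, variables declared inside the parallel region are private automatically, and compared with the shared sum/offset version this removes the cache-line ping-pong described above.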
I am implementing sparse matrix multiplication using the MKL library and benchmarking it on various Intel processors like the Intel Xeon E5 and Intel Xeon Phi.
Although I am able to benchmark with satisfactory results on the Xeon E5, every time I run the same code on the Xeon Phi I get a segmentation fault after 3 iterations from the called "mkl_dcsrmultcsr" function. I am not able to figure out the reason for this; please let me know what it might be.
Following is the code:
#include "stdio.h"
#include "stdlib.h"
#include "time.h"
#include "omp.h"
#include "mkl.h"
#include "mkl_spblas.h"
double timerval ()
{
struct timeval st;
gettimeofday(&st, NULL);
return (st.tv_sec+st.tv_usec*1e-6);
}
int main(){
double *nz, *nzc;
int *ia,*ja, *ic,*jc,*pos;
int info=1;
int i, j, k;
FILE *fp1,*fp2,*fp3,*fp4;
double avg_time = 0, s_time, e_time;
//open file to write results
//FILE *fp1;
char trans = 'N';
int sort = 1;
int m=4;
int iterations;
int request = 0;
/* iterate the loop for input size from 2exp3 to 2exp10 */
for (iterations=0; iterations<8; iterations++)
{
m *= 2; // increase the dimension of Matrix with every iteration
int n = m; // Assuming a square matrix.
int nzmax =m*n ;
double dense_const = 0.05;
int temp5, temp6,temp3,temp4;
int density=(m*n)*(dense_const);
//memory allocation for matrix A and B
nz = calloc((m*n),sizeof(double));
ia = calloc((m*n),sizeof(int));
ja = calloc((m*n),sizeof(int));
//memory allocation for product matrix C
nzc =calloc((m*n),sizeof(double));
ic = calloc((m*n),sizeof(int));
jc = calloc((m*n),sizeof(int));
//Configuration parameters
k=0;
//density of the sparse matrix to be created. Assume 5% density.
//position array for random initialisation of positions in input matrix
pos= calloc((m*n), sizeof(int));
int temp,temp1;
// printf("the density is %d\n",density);
// printf("check 1:\n");
//randomly initialise positions
for(i=0;i<density;i++)
{
temp1=rand()%(m*n);
pos[i]=temp1;
}
// printf("check 2:\n");
//sort the 'pos' array
for (i = 0 ; i < density; i++)
{
int d = i;
int t;
while ( d > 0 && pos[d] < pos[d-1])
{
t = pos[d];
pos[d] = pos[d-1];
pos[d-1] = t;
d--;
}
}
//printf("check 3:\n");
// initialise with non zero elements and extract column and row ptr vector
j=1;
ja[0]=1;
int p=0;
for(i = 0; i < density; i++)
{
temp=pos[i];
nz[k] = rand();
// nz[k] = 1;
ia[k] = temp%m;
k++;
p++;
temp5= pos[i];
temp6=pos[i+1];
temp3=temp5-(temp5%m);
temp4=temp6-(temp6%m);
if(!(temp3== temp4))
{
if((temp3+m==temp6))
{}
else
{
ja[j]=p+1;
j++;
}
}
}
printf("check1\n");
request = 0;
s_time = timerval();
for(i=0; i<1000;i++)
{
#pragma omp parallel
{
mkl_dcsrmultcsr(&trans, &request, &sort, &n, &n, &n, nz, ia, ja, nz, ia, ja, nzc, jc, ic, &nzmax, &info);
}
}
e_time = timerval();
avg_time = (e_time - s_time);
/* write the timing information in "output.txt"*/
avg_time = avg_time / 1000;
printf("check 5:\n");
if((fp2 = fopen("output.txt","a"))==NULL)
{
printf("error opening file\n");
}
//fseek(fp1,1000,SEEK_END);
fprintf (fp2, "\n Input size: %d x %d ,Time: %lf and density is %d and info is %d \n", m,n, avg_time, density,info);
fclose(fp2);
//mkl_free_buffers();
free(ja);
free(ia);
free(nz);
free(pos);
free(jc);
free(ic);
free(nzc);
}
return 0;
}
You can use mkl_(thread_)free_buffers() before a call to the multiplication functions. That worked out for me!
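For example, in the benchmark loop above (a sketch; mkl_free_buffers() is MKL's documented call for releasing the library's internal work buffers, and the sketch also drops the #pragma omp parallel wrapper, since MKL routines such as mkl_dcsrmultcsr are typically threaded internally):
for (i = 0; i < 1000; i++)
{
    mkl_free_buffers();   /* release MKL's internal buffers before the next call */
    mkl_dcsrmultcsr(&trans, &request, &sort, &n, &n, &n,
                    nz, ia, ja, nz, ia, ja, nzc, jc, ic, &nzmax, &info);
}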
I wrote a parallel pthreads program computing the column sum norm of the product of
two n*n sized matrices. The right matrix is vertically partitioned. The user inputs the matrix size n and the number of threads (p) so that:
p threads are involved in the parallel computations.
The 1-dimensional parallel algorithm of matrix multiplication is employed:
the right matrix is partitioned in one dimension into p equal slices (A*B, where B is partitioned into p slices)
there is one-to-one mapping between the partitions and threads
each thread is responsible for computation of the corresponding slice of the resulting matrix
The code:
double *A;
double *B;
double *C;
int n;
double matrix_norm;
typedef struct {
double *b;
double *c;
int num_of_columns;
pthread_mutex_t *mutex;
} matrix_slice;
void *matrix_slice_multiply(void *arg){
matrix_slice *slice = arg;
int i, j;
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, slice->num_of_columns, n, 1.0, A, n, slice->b, n, 0.0, slice->c, n);
// compute column norm of each slice
double slice_norm = 0.0;
for(j = 0; j < slice->num_of_columns; j++) {
double column_sum=0.;
for(i = 0; i < n; i++)
column_sum += *(slice->c + i * n + j);
if(column_sum>slice_norm)
slice_norm=column_sum;
}
pthread_mutex_lock(slice->mutex);
if (slice_norm>matrix_norm)
matrix_norm=slice_norm;
pthread_mutex_unlock(slice->mutex);
pthread_exit(NULL);
}
int main(void) {
int num_of_thrds, num_of_columns_per_slice;
pthread_t *working_thread;
matrix_slice *slice;
pthread_mutex_t *mutex;
int i = 0;
printf ("Please enter matrix dimension n : ");
scanf("%d", &n);
printf ("Please enter number of threads : ");
scanf("%d", &num_of_thrds);
while (num_of_thrds > n) {
printf("number of threads must not be greater than matrix dimension\n");
printf ("Please enter number of threads : ");
scanf("%d", &num_of_thrds);
}
// allocate memory for the matrices
///////////////////// Matrix A //////////////////////////
A = (double *)malloc(n * n * sizeof(double));
if (!A) {
printf("memory failed \n");
exit(1);
}
///////////////////// Matrix B //////////////////////////
B = (double *)malloc(n * n * sizeof(double));
if (!B) {
printf("memory failed \n");
exit(1);
}
///////////////////// Matrix C //////////////////////////
C = (double *)malloc(n * n * sizeof(double));
if (!C) {
printf("memory failed \n");
exit(1);
}
// initialize the matrices
for (i = 0; i < n * n; i++) {
A[i] = rand() % 15;
B[i] = rand() % 10;
C[i] = 0.;
}
clock_t t1 = clock();
working_thread = malloc(num_of_thrds * sizeof(pthread_t));
slice = malloc(num_of_thrds * sizeof(matrix_slice));
mutex = malloc(sizeof(pthread_mutex_t));
num_of_columns_per_slice = n / num_of_thrds;
for(i = 0; i < num_of_thrds; i++){
slice[i].b = B + i * num_of_columns_per_slice;
slice[i].c = C + i * num_of_columns_per_slice;
slice[i].mutex = mutex;
slice[i].num_of_columns = (i == num_of_thrds - 1) ? n-i * num_of_columns_per_slice : num_of_columns_per_slice;
pthread_create(&working_thread[i], NULL, matrix_slice_multiply, (void *)&slice[i]);
}
for(i = 0; i < num_of_thrds; i++)
pthread_join(working_thread[i], NULL);
clock_t t2=clock();
printf("elapsed time: %f\n", (double)(t2 - t1)/CLOCKS_PER_SEC);
printf("column sum norm is %f\n", matrix_norm);
//deallocate memory
free(A);
free(B);
free(C);
free(working_thread);
free(slice);
return 0;
}
I ran the program dozens of times with various inputs, and it turned out that the more threads I used, the more time it took. This is quite counter-intuitive. Shouldn't more threads improve the performance?
The savings from running computations in parallel need to be greater than the overhead of creating, maintaining, and switching between the threads. Instead of running dozens of times with a large number of threads, run a single very large operation once with the same number of threads as there are cores on your system.
Using threads for operations that may cause the program to wait for a resource, such as reads/writes to the file system or network, can help improve the performance of your application; but if there are no such operations, the overhead of creating threads, acquiring and releasing mutexes, and performing context switches between the threads may make the application slower.
I have been working on a program that counts the number of occurrences of a specific integer value in a 2D array (matrix). Each position of the matrix is first initialized to an integer value between 0 and
n. Once initialized, the program searches for and counts the total number of occurrences of a specific value.
The program is run by taking in the parameters as command line arguments:
programName rows cols n c
rows – number of rows of the matrix
cols – number of columns of the matrix
n – the upper bound of the random values of the matrix, values can be 0–(n-1)
c – the value to search for in the matrix, note c must be between 0–(n-1)
After this, the program implements the search using 1 to 10 threads and displays the execution time and number of occurrences.
I seem to have all of this working as I wish; the problem is that whenever I enter a value over 4 on the command line for rows, I keep getting a segmentation fault.
I am at a loss as to what is causing this. Please help me understand what error in my code may be contributing to it. Thank you.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#define NUM_THREADS 10
int **arr;
int rows, cols, n, c, totalOccurrence, done, numThreads;
int threadCounter[10];
void *matrixThread (void *threadid)
{
long tid;
tid = (long)threadid;
long lowBound = tid * (rows / numThreads);
long highBound = lowBound + (rows / numThreads);
int localcount = 0;
if (tid == numThreads - 1)
{
highBound = rows;
}
long i;
int ic;
for (i = lowBound; i < highBound; i++)
{
for (ic = 0; ic < cols; ic++)
{
if (arr[i][ic] == c)
{
localcount++;
}
}
}
threadCounter[tid] = localcount;
pthread_exit(NULL);
}
int main (int argc, char *argv[])
{
pthread_t threads[NUM_THREADS];
if (argc != 5)
{
printf("Error: Invalid number of arguments\n");
}
else
{
rows = strtol(argv[1], NULL, 10);
cols = strtol(argv[2], NULL, 10);
n = strtol(argv[3], NULL, 10);
c = strtol(argv[4], NULL, 10);
int r, cl;
arr = (int**)malloc(rows * sizeof(int));
for (r = 0; r < rows; r++)
{
arr[r] = malloc(cols * sizeof(int));
}
int randomNum;
srand(time(NULL));
for (r = 0; r < rows; r++)
{
for (cl = 0; cl < cols; cl++)
{
randomNum = rand() % n;
arr[r][cl] = randomNum;
}
}
long rc, t;
for (numThreads = 1; numThreads <= 10; numThreads++)
{
struct timeval start,end;
double elapsed_time;
gettimeofday(&start, NULL);
for (t = 0; t < numThreads; t++)
{
rc = pthread_create(&threads[t], NULL, matrixThread, (void *)t);
if (rc)
{
printf ("Error: Thread could not be created; return %d", rc);
exit(-1);
}
}
for (t = 0; t < numThreads; t++)
{
pthread_join(threads[t], NULL);
}
totalOccurrence = 0;
int q;
for (q = 0; q < numThreads; q++)
{
totalOccurrence += threadCounter[q];
}
gettimeofday(&end, NULL);
elapsed_time = (end.tv_sec + end.tv_usec/1000000.10000) - (start.tv_sec + start.tv_usec/1000000.10000);
printf("\nNumber of threads: %d " , numThreads);
printf("Total Occurrences of %d: %d " ,c, totalOccurrence);
printf("Elapsed time: %.8f\n" , elapsed_time);
totalOccurrence = 0;
}
}
pthread_exit(NULL);
}
Here is one problem:
arr = (int**)malloc(rows * sizeof(int));
should be:
arr = (int**)malloc(rows * sizeof(int *));
The allocation of the row pointers should be like this:
arr = (int**)malloc(rows * sizeof(int*));
The sizes of data types can vary, but the size of a pointer is constant on a particular machine architecture. On a 64-bit machine a pointer is 8 bytes, while an int is usually 4 bytes (gcc). So here you have allocated only half the memory you need for the row pointers. That's why, when you try to pass more than 4 for rows, it crashes with an invalid memory read.
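A way to make this class of mistake harder is to size allocations from the object being assigned rather than from a spelled-out type (a sketch):
arr = malloc(rows * sizeof *arr);               /* size of one row pointer, whatever its type */
for (r = 0; r < rows; r++)
{
    arr[r] = malloc(cols * sizeof *arr[r]);     /* size of one int */
}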
Your program will also leak memory, as you are not freeing the allocated memory. Free it like this at the end:
for (r = 0; r < rows; r++)
{
free (arr[r]);
}
free (arr);