Related
I am creating 2 programs to test the differences in run time of serial matrix multiply vs that of parallel matrix multiply. The parallel code that I have written is actually running slower than serial code, and running the program with additional cores enabled provides no speedup at all... using more cores actually seems to slow down the parallel program.
What is going on here? This is my parallel code. To use it, pass in the matrix size and the number of threads (see my usage below).
#include <stdio.h>
#include <stdlib.h> // rand(), srand()
#include <unistd.h>
#include <time.h>
#include <pthread.h>
// Time struct + prototypes
struct timespec time1, time2, diffTime;
struct timespec timespecDifference(struct timespec start, struct timespec end); // For timing
double** reserveMatrix(int nRows, int nCols);
void printMat(double** mat1, int rows, int cols);
void* matMult(void* arg);
// Argstruct
// Per-thread work descriptor: main() fills in one of these for each pthread
// and hands its address to matMult() through pthread_create()'s void* argument.
typedef struct {
double** result; // Matrix the thread writes its computed rows into
int tid; // Self-assigned thread ID (index of the thread in main's arrays)
int size; // Dimension of the (size x size) matrices
int s; // First row this thread computes
int e; // Last row this thread computes (inclusive)
} argStr;
// global variables for use by all threads
int size; // Size of a row and column.
int numThreads; // Number of pThreads to do work
double** mat1;
double** mat2;
double** mat3;
// Main function
//
// Usage: PMM <matrix size> <number of threads>
//
// Multiplies two size x size matrices, splitting the rows of the product
// between numThreads pthreads, and prints the elapsed wall-clock time as
// "Pthread Matrix Multiply, <size>, <threads>, <seconds>".
// Returns 0 on success and -1 on bad arguments or on a resource failure.
int main(int argc, char *argv[]) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s <size> <numThreads>\n",
                argc > 0 ? argv[0] : "PMM");
        return -1;
    }
    size = atoi(argv[1]);
    numThreads = atoi(argv[2]);

    // Validate before allocating anything: a zero/negative size would make
    // reserveMatrix() write through a zero-byte allocation, and a
    // non-positive thread count would create zero-length VLAs below.
    if (size <= 0 || numThreads <= 0) {
        fprintf(stderr, "size and numThreads must both be positive\n");
        return -1;
    }

    mat1 = reserveMatrix(size, size);
    mat2 = reserveMatrix(size, size);
    mat3 = reserveMatrix(size, size);
    if (mat1 == NULL || mat2 == NULL || mat3 == NULL) {
        fprintf(stderr, "matrix allocation failed\n");
        return -1;
    }

    // Describe each thread's slice of rows. Every thread writes into the
    // shared product matrix mat3; the row ranges [s, e] are disjoint, so no
    // locking is needed and no per-thread result matrix has to be allocated
    // (or merged afterwards).
    pthread_t theThreads[numThreads];
    argStr data[numThreads];
    for (int i = 0; i < numThreads; i++) {
        data[i].result = mat3;
        data[i].tid = i;                               // Self-assigned thread ID
        data[i].size = size;                           // Matrix dimension
        data[i].s = size * i / numThreads;             // First row (inclusive)
        data[i].e = size * (i + 1) / numThreads - 1;   // Last row (inclusive)
    }

    // Start timer: only thread creation, the multiply and the joins are timed.
    clock_gettime(CLOCK_MONOTONIC, &time1);

    // Start the threads; stop launching at the first failure.
    int started = 0;
    while (started < numThreads) {
        if (pthread_create(&theThreads[started], NULL, matMult, &data[started]) != 0) {
            fprintf(stderr, "pthread_create failed for thread %d\n", started);
            break;
        }
        started++;
    }

    // Wait for every thread that was actually started.
    for (int i = 0; i < started; i++) {
        pthread_join(theThreads[i], NULL);
    }

    // Stop timer and find time taken
    clock_gettime(CLOCK_MONOTONIC, &time2);
    diffTime = timespecDifference(time1, time2);
    double cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);

    int status = -1;
    if (started == numThreads) {
        printf("Pthread Matrix Multiply, %d, %d, %lf\n", size, numThreads, cpuTimeUsed);
        status = 0;
    }

    // reserveMatrix() makes two allocations per matrix: the row-pointer
    // table and one contiguous data block that row 0 points at.
    double **owned[] = { mat1, mat2, mat3 };
    for (size_t i = 0; i < sizeof owned / sizeof owned[0]; i++) {
        free(owned[i][0]);
        free(owned[i]);
    }
    return status;
}
// Elapsed time between two timestamps.
// Returns end - start as a normalised timespec: when the nanosecond field of
// `end` is smaller than that of `start`, one second is borrowed so that
// tv_nsec stays non-negative.
struct timespec timespecDifference(struct timespec start, struct timespec end)
{
    struct timespec delta;
    const long nanos = end.tv_nsec - start.tv_nsec;
    const int borrow = nanos < 0;

    delta.tv_sec = end.tv_sec - start.tv_sec - borrow;
    delta.tv_nsec = borrow ? 1000000000 + nanos : nanos;
    return delta;
}
// Reserve matrix function
//
// Allocates an nRows x nCols matrix of doubles as ONE contiguous data block
// plus a table of row pointers into it, and fills the cells with 0, 1, 2, ...
// in row-major order.
//
// Returns the row-pointer table, or NULL when a dimension is not positive,
// the requested size does not fit in size_t, or memory cannot be obtained.
// Ownership: release with free(m[0]) followed by free(m).
double** reserveMatrix(int nRows, int nCols) {
    if (nRows <= 0 || nCols <= 0) {
        return NULL;
    }
    const size_t rows = (size_t)nRows;
    const size_t cols = (size_t)nCols;

    // Do the size arithmetic in size_t and reject products that would wrap.
    if (cols > ((size_t)-1) / sizeof(double) / rows) {
        return NULL;
    }

    double **matrix1 = malloc(rows * sizeof *matrix1);
    if (matrix1 == NULL) {
        return NULL;
    }
    double *cells = malloc(rows * cols * sizeof *cells);
    if (cells == NULL) {
        free(matrix1);
        return NULL;
    }

    // Assign row pointers to "segment" out the data
    for (size_t r = 0; r < rows; ++r) {
        matrix1[r] = cells + r * cols;
    }
    // Give values to the array
    for (size_t i = 0; i < rows * cols; i++) {
        cells[i] = (double)i;
    }
    return matrix1;
}
// Dump a rows x cols matrix to stdout, one matrix row per output line with
// every value followed by ", ", then a closing marker line.
void printMat(double** mat1, int rows, int cols) {
    int r = 0;
    while (r < rows) {
        int c = 0;
        while (c < cols) {
            printf("%f, ", mat1[r][c]);
            ++c;
        }
        printf("\n");
        ++r;
    }
    printf("End of array print\n");
}
// Thread entry point: computes rows [s, e] (inclusive) of mat1 * mat2 and
// stores them in the result matrix named by the argStr passed in `arg`.
// Reads the global input matrices mat1 and mat2; always returns NULL.
void* matMult(void* arg) {
    const argStr *job = (const argStr *)arg;
    const int n = job->size;      // Dimension of the square matrices
    const int lastRow = job->e;

    for (int row = job->s; row <= lastRow; ++row) {
        for (int col = 0; col < n; ++col) {
            double *cell = &job->result[row][col];
            const double *lhs = mat1[row];
            *cell = 0.0;
            for (int k = 0; k < n; ++k) {
                *cell += lhs[k] * mat2[k][col];
            }
        }
    }
    return NULL;
}
This is my serial code. To use it, pass in the same value for the row count and the column count (see my usage below).
#include <stdio.h>
#include <stdlib.h> // rand(), srand()
#include <unistd.h>
#include <time.h>
// Matrix multiply code
// **** Time struct **** //
struct timespec time1, time2, diffTime;
// Prototypes
struct timespec timespecDifference(struct timespec start, struct timespec end); // For timing
double** matrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols);
double** transMatrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols);
double** reserveMatrix(int nRows, int nCols);
double matrixProduct(double** mat1, double** mat2, int nRows, int nCols);
void printMat(double** mat1, int rows, int cols);
// Begin main
//
// Usage: MM <rows> <cols>   (rows must equal cols)
//
// Times a plain triple-loop multiply of two matrices, then transposes the
// second operand in place and times the multiply again with the variant that
// walks both operands row-wise. Prints one line per measurement.
// Returns 0 on success, 1 on bad arguments or allocation failure.
int main(int argc, char *argv[])
{
    if (argc < 3) {
        fprintf(stderr, "usage: %s <rows> <cols>\n", argc > 0 ? argv[0] : "MM");
        return 1;
    }
    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);

    // All matrices are allocated rows x cols, the multiply routines index
    // operands with both dimensions, and the transpose below is done in
    // place: every one of these is only valid for square matrices.
    if (rows <= 0 || rows != cols) {
        fprintf(stderr, "rows and cols must be equal and positive\n");
        return 1;
    }

    // Declare the ARRAYS and populate them
    double** arr1 = reserveMatrix(rows, cols);
    double** arr2 = reserveMatrix(rows, cols);
    double** arr3 = reserveMatrix(rows, cols);
    double** arr4 = reserveMatrix(rows, cols);
    if (arr1 == NULL || arr2 == NULL || arr3 == NULL || arr4 == NULL) {
        fprintf(stderr, "matrix allocation failed\n");
        return 1;
    }

    //Start Clock
    clock_gettime(CLOCK_MONOTONIC, &time1);
    arr3 = matrixMultiply(arr1, arr2, arr3, rows, cols);
    // Stop timer and find time taken
    clock_gettime(CLOCK_MONOTONIC, &time2);
    diffTime = timespecDifference(time1, time2);
    double cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
    //Print Time
    printf("Matrix Multiply, %d, %lf\n", rows, cpuTimeUsed);

    // Perform a transposition of matrix 2 (swap across the main diagonal)
    for (int r = 0; r < rows; ++r) {
        for (int c = r + 1; c < cols; ++c) {
            double val = arr2[r][c];
            arr2[r][c] = arr2[c][r];
            arr2[c][r] = val;
        }
    }

    // Run matrix multiply again on the newly transposed data.
    //Start Clock
    clock_gettime(CLOCK_MONOTONIC, &time1);
    arr4 = transMatrixMultiply(arr1, arr2, arr4, rows, cols);
    // Stop timer and find time taken
    clock_gettime(CLOCK_MONOTONIC, &time2);
    diffTime = timespecDifference(time1, time2);
    cpuTimeUsed = ((double)diffTime.tv_sec + (double)diffTime.tv_nsec / 1000000000.0);
    //Print Time
    printf("Trans Matrix Multiply, %d, %lf\n", rows, cpuTimeUsed);

    // reserveMatrix() makes two allocations per matrix: the row-pointer
    // table and one contiguous data block that row 0 points at.
    double **owned[] = { arr1, arr2, arr3, arr4 };
    for (size_t i = 0; i < sizeof owned / sizeof owned[0]; i++) {
        free(owned[i][0]);
        free(owned[i]);
    }
    return 0;
}
// Timer helper: returns end - start as a timespec whose tv_nsec is kept
// non-negative by borrowing one second when the nanosecond fields underflow.
struct timespec timespecDifference(struct timespec start, struct timespec end)
{
    struct timespec elapsed;

    elapsed.tv_sec = end.tv_sec - start.tv_sec;
    elapsed.tv_nsec = end.tv_nsec - start.tv_nsec;
    if (elapsed.tv_nsec < 0) {
        elapsed.tv_sec -= 1;
        elapsed.tv_nsec += 1000000000;
    }
    return elapsed;
}
// Straightforward triple-loop multiply: result = matrix1 * matrix2.
// The inner (summation) index runs over nRows, so the operands are expected
// to be square. Returns `result` for convenience.
double** matrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols) {
    for (int row = 0; row < nRows; ++row) {
        for (int col = 0; col < nCols; ++col) {
            double *cell = &result[row][col];
            const double *lhs = matrix1[row];
            *cell = 0.0;
            for (int k = 0; k < nRows; ++k) {
                *cell += lhs[k] * matrix2[k][col];
            }
        }
    }
    return result;
}
// Multiply variant for a pre-transposed right-hand operand: each output
// cell is the dot product of a row of matrix1 with a ROW of matrix2, so both
// operands are read sequentially. Returns `result`.
double** transMatrixMultiply(double** matrix1, double** matrix2, double** result, int nRows, int nCols) {
    for (int outer = 0; outer < nCols; ++outer) {
        for (int inner = 0; inner < nRows; ++inner) {
            double *cell = &result[outer][inner];
            const double *lhs = matrix1[outer];
            const double *rhs = matrix2[inner];
            *cell = 0.0;
            for (int k = 0; k < nCols; ++k) {
                *cell += lhs[k] * rhs[k];
            }
        }
    }
    return result;
}
// Reserve data function. Reserves and populates array data.
//
// Allocates an nRows x nCols matrix of doubles as one contiguous data block
// plus a table of row pointers into it; cell k (row-major) is set to k.
//
// Returns the row-pointer table, or NULL when a dimension is not positive,
// the byte count would overflow size_t, or an allocation fails.
// Ownership: release with free(m[0]) followed by free(m).
double** reserveMatrix(int nRows, int nCols) {
    if (nRows <= 0 || nCols <= 0) {
        return NULL;
    }
    const size_t rowCount = (size_t)nRows;
    const size_t colCount = (size_t)nCols;

    // Size arithmetic is done in size_t; reject products that would wrap.
    if (colCount > ((size_t)-1) / sizeof(double) / rowCount) {
        return NULL;
    }

    double **matrix1 = malloc(rowCount * sizeof *matrix1);
    double *storage = malloc(rowCount * colCount * sizeof *storage);
    if (matrix1 == NULL || storage == NULL) {
        free(storage);
        free(matrix1);
        return NULL;
    }

    // Assign row pointers to "segment" out the data
    for (size_t r = 0; r < rowCount; ++r) {
        matrix1[r] = &storage[r * colCount];
    }
    // Give values to the array
    for (size_t k = 0; k < rowCount * colCount; k++) {
        storage[k] = (double)k;
    }
    return matrix1;
}
// Sum of squared element-wise differences between two matrices. Both are
// read through row 0 as flat arrays of nRows * nCols doubles, so their cells
// must be contiguous. A result of 0.0 means the matrices are identical.
double matrixProduct(double** mat1, double** mat2, int nRows, int nCols) {
    const int count = nRows * nCols;
    double total = 0.0;
    for (int k = 0; k < count; k++) {
        const double diff = mat1[0][k] - mat2[0][k];
        total += diff * diff;
    }
    return total;
}
// Writes a rows x cols matrix to stdout: each matrix row on its own line,
// each value followed by ", ", and a trailing "End of array print" line.
void printMat(double** mat1, int rows, int cols) {
    for (int row = 0; row != rows && row < rows; ++row) {
        for (int col = 0; col < cols; ++col) {
            const double value = mat1[row][col];
            printf("%f, ", value);
        }
        printf("\n");
    }
    printf("End of array print\n");
}
Here is the linux output of me compiling and running this code. At matrix size 1200 x 1200 the run time differences are not that pronounced, but the serial code ends up being significantly faster than the parallel at sizes above 1500 x 1500:
MYPC:~/Projects/matrixMultiply/phase3$ gcc matrixMult.c -o MM
MYPC:~/Projects/matrixMultiply/phase3$ gcc pMatMult.c -lpthread -o PMM
MYPC:~/Projects/matrixMultiply/phase3$ ./MM 1200 1200
Matrix Multiply, 1200, 25.487388
Trans Matrix Multiply, 1200, 16.452777
MYPC:~/Projects/matrixMultiply/phase3$ ./PMM 1200 2
Pthread Matrix Multiply, 1200, 2, 22.495115
MYPC:~/Projects/matrixMultiply/phase3$ ./PMM 1200 4
Pthread Matrix Multiply, 1200, 4, 22.181686
The sections in bold contain the meaningful output. It reads
name of the process
matrix size
number of threads spawned (in pThread program only)
run time
Any help would be appreciated. I will be instantly replying to questions for the next 2 hours.
The solution was to terminate extra processes that were running on my ubuntu machine. The code worked perfectly fine as a few users pointed out. Killing all other processes on the machine, then running my parallel code provided the expected speedups.
I am not sure of the precise technical reason this is going on other than the machine wasn't prioritizing my program when it had others running, resulting in slower times.
I'm learning OpenMP and I'm trying to do a simple task: A[r][c] * X[c] = B[r] (matrix vector multiplication).
The problem is: the sequential code is faster than parallel and I don't know why!
My code:
#include <omp.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/types.h>
// Defined variables
#define row_matriz_A 80000
#define col_matriz_A 800
#define THREADS_NUM 4
// FUNCAO - GERAR MATRIZES
void gerarMatrizes(int r, int c, int mA[], int vX[], int vB[]){...}
// FUNCTION - SEQUENTIAL MULTIPLICATION
// Computes vB = mA * vX on a single thread, where mA is an r x c matrix
// stored row-major in a flat array, and prints the elapsed wall-clock time.
void multSequencial(int r, int c, int mA[], int vX[], int vB[]){
    struct timeval started, finished;

    // Begin time
    gettimeofday(&started, NULL);
    const double begin = (double)(started.tv_sec) + (double)(started.tv_usec)/ 1000000.00;

    for (int row = 0; row < r; row++) {
        int acc = 0;
        for (int col = 0; col < c; col++) {
            acc += mA[row * c + col] * vX[col];
        }
        vB[row] = acc;
    }

    // End time
    gettimeofday(&finished, NULL);
    const double end = (double)(finished.tv_sec) + (double)(finished.tv_usec)/ 1000000.00;
    printf("\nO tempo de execucao sequencial foi: %lf segundos.\n", (end - begin));
}
// FUNCTION - PARALLEL MULTIPLICATION WITH OpenMP
// Computes vB = mA * vX, where mA is an r x c matrix stored row-major in a
// flat array, distributing the rows across OpenMP threads, and prints the
// elapsed wall-clock time.
//
// The per-row accumulator and both loop indices are declared inside the
// parallel loop, which makes them private to each thread. Declaring them at
// function scope (and leaving them out of the private clause) makes them
// shared: every thread then races on the same accumulator, which produces
// wrong results and heavy cache-line contention.
void matvecHost(int r, int c, int mA[], int vX[], int vB[]){
    struct timeval tv1, tv2;
    double t1, t2;

    // Init vB
    for (int i = 0; i < r; i++) vB[i] = 0;

    // BEGIN Time
    gettimeofday(&tv1, NULL);
    t1 = (double)(tv1.tv_sec) + (double)(tv1.tv_usec)/ 1000000.00;

#ifdef _OPENMP
    // Only call into the OpenMP runtime when it is actually enabled, so the
    // function still builds (and runs serially) without -fopenmp.
    omp_set_num_threads(THREADS_NUM);
#endif

    #pragma omp parallel for shared(mA, vB, vX)
    for (int i = 0; i < r; i++) {
        int sum = 0;                          // thread-private accumulator
        for (int j = 0; j < c; j++) {
            sum += mA[i * c + j] * vX[j];
        }
        vB[i] = sum;                          // each row is written by one thread only
    }

    // End time
    gettimeofday(&tv2, NULL);
    t2 = (double)(tv2.tv_sec) + (double)(tv2.tv_usec)/ 1000000.00;
    printf("\nO tempo de execucao OpenMP foi: %lf segundos.\n", (t2 - t1));
    return;
}
// FUNCTION - MAIN
// Allocates the row_matriz_A x col_matriz_A matrix and the two vectors,
// fills them, then runs the sequential and the OpenMP multiplication (each
// prints its own timing). Returns 0 on success, 1 if memory is unavailable.
int main(int argc, char * argv[]) {
    (void)argc;
    (void)argv;

    const int row = row_matriz_A;
    const int col = col_matriz_A;

    // Element count is computed in size_t so large dimensions cannot
    // overflow int before reaching calloc.
    int *matrizA = calloc((size_t)row * (size_t)col, sizeof *matrizA);
    int *vectorX = calloc((size_t)col, sizeof *vectorX);
    int *vectorB = calloc((size_t)row, sizeof *vectorB);
    if (matrizA == NULL || vectorX == NULL || vectorB == NULL) {
        fprintf(stderr, "out of memory\n");
        free(vectorB);
        free(vectorX);
        free(matrizA);
        return 1;
    }

    gerarMatrizes(row, col, matrizA, vectorX, vectorB);
    multSequencial(row, col, matrizA, vectorX, vectorB);
    matvecHost(row, col, matrizA, vectorX, vectorB);

    free(vectorB);
    free(vectorX);
    free(matrizA);
    return 0;
}
Previous solutions that did not work:
Use collapse on my nested for loops
Increase the row and column sizes
Increase the number of threads (a teacher recommended using a thread count equal to the number of physical threads)
Use malloc instead of m[i][j]
EDIT - ANSWER
My parallel block was correctly changed based on the correct answer:
// Corrected parallel region: i, j and sum are listed as private, so every
// thread works on its own copies; the arrays stay shared.
#pragma omp parallel private(i, j, sum) shared(mA, vB, vX)
{
// Rows are divided among the threads of the enclosing parallel region.
#pragma omp for
for(i = 0; i < r; i++){
sum = 0; // per-thread accumulator for row i
for(j = 0; j < c; j++){
sum += mA[i * c + j] * vX[j]; // mA is indexed as a flat row-major array
}
vB[i] = sum;
}
}
I still have one doubt:
If I declare i, j and sum inside my parallel block, will they be private automatically? Will this improve the speed of my code or not?
You have race conditions on sum and offset - those are shared between the threads instead of being thread-private.
This also likely explains the slowdown: On x86, the CPU will actually work hard to make sure accesses to shared variables "work". This involves flushing cache lines after every (!) write to offset and sum - so all the threads are wildly writing into the same variables, but each one has to wait until the write from the previous thread (on a different core) has arrived in the local cache again after having been flushed. And of course it will produce completely nonsensical results.
I don't know why you are declaring all your variables at the start of the function - that's prone to this kind of mistake. If you declared i, j, sum and offset (and the unused tID) in the smallest possible scopes instead, you would never have had this problem, because they would be thread-private automatically in that case.
I found out about variable-length arrays (VLAs) in C99, but it looks like they behave almost the same as malloc + free.
The practical differences I found:
Too big array handling:
unsigned size = 4000000000;
int* ptr = malloc(size); // ptr is 0, program doesn't crash
int array[size]; // segmentation fault, program crashes
Memory leaks: only possible in dynamic array allocation:
int* ptr = malloc(size);
...
if(...)
return;
...
free(ptr);
Lifetime of the object and the possibility of returning it from a function: a dynamically allocated array lives until its memory is freed, and it can be returned from the function that allocated the memory.
Resizing: resizing possible only with pointers to allocated memory.
My questions are:
What are more differences (I'm interested in practical advice)?
What are more problems a programmer can have with both ways of arrays with variable length?
When to choose VLA but when dynamic array allocation?
What is faster: VLA or malloc+free?
Some practical advice:
VLAs are in practice located on the space-limited stack, while malloc() and its friends allocate on the heap, which is likely to allow bigger allocations. Moreover, you have more control over that process, as malloc() can return NULL if it fails. In other words, you have to be careful with VLAs not to blow your stack at runtime.
Not all compilers support VLA, e.g. Visual Studio. Moreover C11 marked them as optional feature and allows not to support them when __STDC_NO_VLA__ macro is defined.
From my experience (numerical programs like finding prime numbers with trial division, Miller-Rabin etc.) I wouldn't say that VLAs are any faster than malloc(). There is some overhead of malloc() call of course, but what seems to be more important is data access efficiency.
Here is some quick & dirty comparison using GNU/Linux x86-64 and GCC compiler. Note that results may vary from platform to another or even compiler's version. You might use as some basic (though very far of being complete) data-access malloc() vs VLA benchmark.
prime-trial-gen.c:
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
bool isprime(int n);
/* Writes every prime below 10 000 to primes.txt, one per line.
 * Returns 0 on success and 1 if the file cannot be opened or written.
 * I/O failures are reported explicitly rather than through assert(), which
 * is compiled out when NDEBUG is defined. */
int main(void)
{
    FILE *fp = fopen("primes.txt", "w");
    if (fp == NULL) {
        perror("primes.txt");
        return 1;
    }

    /* 2 is the only even prime; the loop below visits odd candidates only. */
    fprintf(fp, "%d\n", 2);
    for (int i = 3; i < 10000; i += 2)
        if (isprime(i))
            fprintf(fp, "%d\n", i);

    /* fclose() flushes buffered output, so a write error can surface here. */
    if (fclose(fp) != 0) {
        perror("primes.txt");
        return 1;
    }
    return 0;
}
/* Trial-division primality test.
 * Returns true when n is prime and false otherwise, including for every
 * n < 2. The loop bound is written as i <= n / i rather than i * i <= n so
 * that it cannot overflow for n close to INT_MAX. */
bool isprime(int n)
{
    if (n < 2)
        return false;
    if (n % 2 == 0)
        return n == 2;      /* 2 is the only even prime */
    for (int i = 3; i <= n / i; i += 2)
        if (n % i == 0)
            return false;
    return true;
}
Compile & run:
$ gcc -std=c99 -pedantic -Wall -W prime-trial-gen.c
$ ./a.out
Then here is the second program, which makes use of the generated "primes dictionary":
prime-trial-test.c:
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
bool isprime(int n, int pre_prime[], int num_pre_primes);
int get_num_lines(FILE *fp);
/* Loads the primes listed in primes.txt (one per line) into an array - a VLA
 * when WANT_VLA is non-zero, a malloc()ed block otherwise - and uses them as
 * trial divisors to count the primes below 10 000 000.
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the file is missing, empty or
 * malformed, or when memory is unavailable. */
int main(void)
{
    FILE *fp = fopen("primes.txt", "r");
    if (fp == NULL) {
        perror("primes.txt");
        return EXIT_FAILURE;
    }

    int num_lines = get_num_lines(fp);
    rewind(fp);

    /* A zero-length VLA is undefined behaviour and malloc(0) may legally
     * return NULL, so an empty table is rejected up front. */
    if (num_lines <= 0) {
        fprintf(stderr, "primes.txt contains no primes\n");
        fclose(fp);
        return EXIT_FAILURE;
    }

#if WANT_VLA
    int pre_prime[num_lines];
#else
    int *pre_prime = malloc(num_lines * sizeof *pre_prime);
    if (pre_prime == NULL) {
        perror("malloc");
        fclose(fp);
        return EXIT_FAILURE;
    }
#endif

    /* The read must not live inside assert(): it would disappear under
     * NDEBUG, and fscanf's EOF return value (-1) would count as success. */
    for (int i = 0; i < num_lines; i++) {
        if (fscanf(fp, "%d", pre_prime + i) != 1) {
            fprintf(stderr, "primes.txt: cannot read entry %d\n", i);
            fclose(fp);
#if !WANT_VLA
            free(pre_prime);
#endif
            return EXIT_FAILURE;
        }
    }
    fclose(fp);

    /* NOTE: primes.txt holds primes <= 10 000 (10**4), thus we are safe upto 10**8 */
    int num_primes = 1; // 2
    for (int i = 3; i < 10 * 1000 * 1000; i += 2)
        if (isprime(i, pre_prime, num_lines))
            ++num_primes;
    printf("pi(10 000 000) = %d\n", num_primes);

#if !WANT_VLA
    free(pre_prime);
#endif
    return EXIT_SUCCESS;
}
/* Table-driven trial division: tries the entries of pre_prime in order,
 * stopping at the first one whose square exceeds n (or when the table is
 * exhausted). Returns false as soon as an entry divides n, true otherwise. */
bool isprime(int n, int pre_prime[], int num_pre_primes)
{
    int idx = 0;
    while (idx < num_pre_primes) {
        const int p = pre_prime[idx];
        if (p * p > n)
            break;
        if (n % p == 0)
            return false;
        ++idx;
    }
    return true;
}
/* Counts the newline characters from the stream's current position up to
 * end-of-file. The stream is left at EOF; rewind it before reading again. */
int get_num_lines(FILE *fp)
{
    int newlines = 0;
    for (int ch = fgetc(fp); ch != EOF; ch = fgetc(fp)) {
        if (ch == '\n')
            newlines++;
    }
    return newlines;
}
Compile & run (malloc version):
$ gcc -O2 -std=c99 -pedantic -Wall -W prime-trial-test.c
$ time ./a.out
pi(10 000 000) = 664579
real 0m1.930s
user 0m1.903s
sys 0m0.013s
Compile & run (VLA version):
$ gcc -DWANT_VLA=1 -O2 -std=c99 -pedantic -Wall -W prime-trial-test.c
$ time ./a.out
pi(10 000 000) = 664579
real 0m1.929s
user 0m1.907s
sys 0m0.007s
As you might check π(10**7) is indeed 664,579. Notice that both execution times are almost the same.
One advantage of VLAs is that you can pass variably-dimensioned arrays to functions, which can be handy when dealing with (sanely sized) matrices, for example:
// Caller side of the VLA approach: the matrix is an ordinary local array
// whose dimensions are run-time values.
int n = 4;
int m = 5;
int matrix[n][m]; // variable-length array: n and m are not constants
// …code to initialize matrix…
another_func(n, m, matrix);
// No call to free()
where:
// Receives an n x m matrix through a variably-modified parameter type, so
// ordinary two-dimensional indexing works. Sums every element as a stand-in
// for real work.
void another_func(int n, int m, int matrix[n][m])
{
    int total = 0;
    int row = 0;
    while (row < n)
    {
        int col = 0;
        while (col < m)
        {
            // …use matrix just like normal…
            total += matrix[row][col];
            col++;
        }
        row++;
    }
    // …do something with total…
}
This is particularly valuable since the alternatives using malloc() without using VLA as well mean that you either have to do subscript calculations manually in the called function, or you have to create a vector of pointers.
Manual subscript calculations
// Caller side of the manual-subscript approach: one flat heap block holding
// n * m ints, released by the caller when done.
int n = 4;
int m = 5;
int *matrix = malloc(sizeof(*matrix) * n * m);
// …code to initialize matrix…
another_func2(n, m, matrix);
free(matrix);
and:
// Receives an n x m matrix as a flat array; element (row, col) lives at
// index row * m + col. Sums every element as a stand-in for real work.
void another_func2(int n, int m, int *matrix)
{
    int total = 0;
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < m; ++col)
        {
            // …do manual subscripting…
            const int flat = row * m + col;
            total += matrix[flat];
        }
    }
    // …do something with total…
}
Vector of pointers
// Caller side of the vector-of-pointers approach: one allocation for the
// table of row pointers plus one allocation per row; every row must be
// freed before the table itself.
int n = 4;
int m = 5;
int **matrix = malloc(sizeof(*matrix) * n);
for (int i = 0; i < n; i++)
    matrix[i] = malloc(sizeof(*matrix[i]) * m); // m ints per row
// …code to initialize matrix…
another_func3(n, m, matrix); // the int ** variant, not the flat-array one
for (int i = 0; i < n; i++)
    free(matrix[i]);
free(matrix);
and:
// Receives an n x m matrix as a table of row pointers. Sums every element
// as a stand-in for real work.
void another_func3(int n, int m, int **matrix)
{
    int total = 0;
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < m; ++col)
        {
            // …use matrix 'just' like normal…
            // …but each access first loads the row pointer, then the element…
            int *cells = matrix[row];
            total += cells[col];
        }
    }
    // …do something with total…
}
This form can be optimized to two allocations:
// Two-allocation variant of the vector-of-pointers approach: one block for
// the row-pointer table and one contiguous block for all n * m values, with
// each row pointer aimed at its slice of the value block.
int n = 4;
int m = 5;
int **matrix = malloc(sizeof(*matrix) * n);
int *values = malloc(sizeof(*values) * n * m);
for (int i = 0; i < n; i++)
    matrix[i] = &values[i * m];
// …code to initialize matrix…
another_func3(n, m, matrix); // matrix is an int **, so use the int ** variant
free(values);
free(matrix);
Advantage VLA
There is less bookkeeping work to do when you use VLAs. But if you need to deal with preposterously sized arrays, malloc() still scores. You can use VLAs with malloc() et al if you're careful — see calloc() for an array of array with negative index in C for an example.
I'm attempting to implement concurrent kernel launches for a very complex CUDA kernel, so I thought I'd start out with a simple example. It just launches a kernel which does a sum reduction. Simple enough. Here it is:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cuda.h>
extern __shared__ char dsmem[];
__device__ double *scratch_space;
// Block-level sum reduction that does not modify its input: the first
// `length` values of `a` are copied into the buffer scratch_space points to,
// and the pairwise reduction is carried out on that copy.
//
// Must be called by every thread of the block, because it contains
// __syncthreads() barriers. `length` is an unsigned short, so the element
// count is limited to what that type can hold.
// Returns the value left in slot 0 of the scratch buffer.
//
// NOTE(review): every thread reads b[0] on return and no barrier follows. If
// the caller invokes this again straight away (kernel_thing does so in a
// loop), a fast thread can begin refilling the scratch buffer while slower
// threads have not yet read the result - confirm whether any thread other
// than the one that refills slot 0 relies on the returned value.
__device__ double NDreduceSum(double *a, unsigned short length)
{
const int tid = threadIdx.x;
unsigned short k = length;
double *b;
b = scratch_space;
// Strided copy: thread tid copies elements tid, tid + blockDim.x, ...
for (int i = tid; i < length; i+= blockDim.x)
b[i] = a[i];
__syncthreads(); // the copy must be complete before anyone reduces
do {
k = (k + 1) / 2; // half of the active range, rounded up
// Fold the upper half onto the lower half; the bound check skips the
// unpaired middle element when the active length is odd.
if (tid < k && tid + k < length)
b[tid] += b[tid + k];
length = k;
__syncthreads(); // finish this step before starting the next
} while (k > 1);
return b[0];
}
// In-place block-level sum reduction: partial sums are accumulated directly
// in `a`, so the caller's array is overwritten. Must be called by every
// thread of the block, because it contains a __syncthreads() barrier.
// Returns a[0], which holds the total once the loop finishes.
__device__ double reduceSum(double *a, unsigned short length)
{
const int tid = threadIdx.x;
unsigned short k = length;
do
{
k = (k + 1) / 2; // half of the active range, rounded up
// Fold the upper half onto the lower half; the bound check skips the
// unpaired middle element when the active length is odd.
if (tid < k && tid + k < length)
a[tid] += a[tid + k];
length = k;
__syncthreads(); // finish this step before starting the next
}
while (k > 1);
return a[0];
}
// Benchmark kernel: sums the first `size` elements of `ad` three different
// ways and has thread 0 print the three sums with their clock() tick counts.
//   sum_1 - NDreduceSum (reduction on a scratch copy), called 1,000,000 times
//   sum_2 - plain serial loop executed by thread 0 only
//   sum_3 - reduceSum, which reduces in place and therefore overwrites `ad`
// The launch must supply dynamic shared memory large enough for `size`
// doubles, because the scratch buffer is placed at the start of dsmem.
__global__ void kernel_thing(double *ad, int size)
{
double sum_1, sum_2, sum_3;
// NOTE(review): these hold clock() results but are declared time_t - confirm
// the intended type; they are printed with %ld below.
time_t begin, end, t1, t2, t3;
// Every thread stores the same address into the global device pointer.
scratch_space = (double *) &dsmem[0];
for (int j = 0; j < 1000000; j++) {
// begin/end are overwritten on every pass, so t1 below covers only the
// final call, not the whole loop.
begin = clock();
sum_1 = NDreduceSum(ad, size);
end = clock();
}
__syncthreads();
t1 = end - begin;
begin = clock();
sum_2 = 0;
// Serial reference sum; all other threads keep sum_2 == 0.
if (threadIdx.x == 0) {
for (int i = 0; i < size; i++) {
sum_2 += ad[i];
}
}
__syncthreads();
end = clock();
t2 = end - begin;
__syncthreads();
begin = clock();
sum_3 = reduceSum(ad, size); // destructive: ad now holds partial sums
end = clock();
__syncthreads();
t3 = end - begin;
if (threadIdx.x == 0) {
printf("Sum found: %lf and %lf and %lf. In %ld and %ld and %ld ticks.\n", sum_1, sum_2, sum_3, t1, t2, t3);
}
}
// Aborts with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err = (call);                               \
        if (cuda_check_err != cudaSuccess) {                               \
            fprintf(stderr, "%s:%d: %s failed: %s\n", __FILE__, __LINE__,  \
                    #call, cudaGetErrorString(cuda_check_err));            \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Launches kernel_thing on two independent streams, each with its own
// 512-element input, and reports the time each stream spent on its kernel.
//
// Each start/stop event is recorded INTO THE STREAM IT MEASURES. Recording
// an event without a stream argument places it on the default stream, which
// (with streams created by cudaStreamCreate) orders it after all earlier
// work and before all later work on those streams - putting such a record
// between the two launches forces the kernels to run back to back.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    int i;
    const int size = 512;
    const size_t bytes = sizeof(double) * size;
    double *a, *ad, *b, *bd;
    double sum_a, sum_b;
    cudaStream_t stream_a, stream_b;
    cudaEvent_t a_start, a_stop, b_start, b_stop;

    a = (double *) malloc(bytes);
    b = (double *) malloc(bytes);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        return EXIT_FAILURE;
    }

    // Random inputs plus host-side reference sums to compare against the
    // values the kernels print.
    srand48(time(0));
    for (i = 0; i < size; i++) {
        a[i] = drand48();
    }
    for (i = 0; i < size; i++) {
        b[i] = drand48();
    }
    sum_a = 0;
    for (i = 0; i < size; i++) {
        sum_a += a[i];
    }
    sum_b = 0;
    for (i = 0; i < size; i++) {
        sum_b += b[i];
    }
    printf("Looking for sum_a %lf\n", sum_a);
    printf("Looking for sum_b %lf\n", sum_b);

    CUDA_CHECK(cudaEventCreate(&a_start));
    CUDA_CHECK(cudaEventCreate(&b_start));
    CUDA_CHECK(cudaEventCreate(&a_stop));
    CUDA_CHECK(cudaEventCreate(&b_stop));
    CUDA_CHECK(cudaMalloc((void **) &ad, bytes));
    CUDA_CHECK(cudaMalloc((void **) &bd, bytes));
    CUDA_CHECK(cudaStreamCreate(&stream_a));
    CUDA_CHECK(cudaStreamCreate(&stream_b));

    // NOTE: a and b come from malloc (pageable memory), so these copies do
    // not overlap with kernel execution; pinned host memory would be needed
    // for that.
    CUDA_CHECK(cudaMemcpyAsync(ad, a, bytes, cudaMemcpyHostToDevice, stream_a));
    CUDA_CHECK(cudaMemcpyAsync(bd, b, bytes, cudaMemcpyHostToDevice, stream_b));

    CUDA_CHECK(cudaEventRecord(a_start, stream_a));
    kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(a_stop, stream_a));

    CUDA_CHECK(cudaEventRecord(b_start, stream_b));
    kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(b_stop, stream_b));

    CUDA_CHECK(cudaMemcpyAsync(a, ad, bytes, cudaMemcpyDeviceToHost, stream_a));
    CUDA_CHECK(cudaMemcpyAsync(b, bd, bytes, cudaMemcpyDeviceToHost, stream_b));

    // Wait for all work on both streams (kernels and copies) before reading
    // the timings or releasing any buffer the streams still use.
    CUDA_CHECK(cudaStreamSynchronize(stream_a));
    CUDA_CHECK(cudaStreamSynchronize(stream_b));

    float a_ms = 0;
    float b_ms = 0;
    CUDA_CHECK(cudaEventElapsedTime(&a_ms, a_start, a_stop));
    CUDA_CHECK(cudaEventElapsedTime(&b_ms, b_start, b_stop));
    printf("%lf ms for A.\n", a_ms);
    printf("%lf ms for B.\n", b_ms);

    CUDA_CHECK(cudaEventDestroy(a_start));
    CUDA_CHECK(cudaEventDestroy(a_stop));
    CUDA_CHECK(cudaEventDestroy(b_start));
    CUDA_CHECK(cudaEventDestroy(b_stop));
    CUDA_CHECK(cudaStreamDestroy(stream_a));
    CUDA_CHECK(cudaStreamDestroy(stream_b));

    free(a);
    free(b);
    CUDA_CHECK(cudaFree(ad));
    CUDA_CHECK(cudaFree(bd));
    return 0;
}
Compiled like so:
# Builds the example with nvcc for the sm_35 architecture.
CFLAGS = -arch sm_35
CC = nvcc

.PHONY: all clean

all: parallel

# $^ expands to the prerequisites and $@ to the target being built.
parallel: parallel.cu
	$(LINK.c) $^ -o $@

clean:
	rm -f *.o core parallel
I'm using a single Tesla K20X.
When I run this simple example, I get the following output:
Looking for sum_a 247.983945
Looking for sum_b 248.033749
Sum found: 247.983945 and 247.983945 and 247.983945. In 3242 and 51600 and 4792 ticks.
Sum found: 248.033749 and 248.033749 and 248.033749. In 3314 and 52000 and 4497 ticks.
4645.079102 ms for A.
4630.725098 ms for B.
Application 577759 resources: utime ~8s, stime ~2s, Rss ~82764, inblocks ~406, outblocks ~967
So, as you can see, each of the kernels gets the correct results and takes around 4.5 s, which is what I got in an earlier one-kernel version. Great! However, as you can see from the aprun output, the wall time is actually around 10 s, which is much more than the one-kernel version. So, it looks like the kernels are either not launching in parallel, or I'm not getting nearly the speed-up (2x) that I was expecting from concurrent kernel launches.
To tl;dr this question:
Am I missing anything in my code example? Are the kernels actually launching in parallel?
What kind of speed-up should I expect with a Tesla K20X? Shouldn't the kernels run exactly in parallel, completing twice the work in the same time? How many kernels can I expect to run efficiently in parallel?
Thanks for your help.
The cudaEventRecord operations in between your kernels are causing serialization.
Right now the results you are getting:
4645.079102 ms for A.
4630.725098 ms for B.
are back-to-back due to this serialization.
Instead, just time the entire kernel launch sequence:
cudaEventRecord(a_start);
kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
cudaEventRecord(a_stop);
And I think you will see an elapsed time for (a_start, a_stop) that is roughly the same as one of your previous kernels (~4600ms) indicating more or less full concurrency. I used CUDA 6 RC, copied data back to the host rather than printf from kernel, and eliminated the cudaEventRecord operations between the kernel calls, and I got an overall execution time of ~4.8s. If I didn't modify the cudaEventRecord arrangement, instead my execution time was ~8.3s
A few other notes:
I wouldn't use printf from kernel when running tests like these.
You won't get overlap of compute and cudaMemcpyAsync when the host buffer is allocated with malloc. You need to use cudaHostAlloc.
I would start with running and understanding the concurrent kernels cuda sample first.
You may want to review the appropriate section of the programming guide
I know the basics of OpenMP and I know that in order to parallelize a for its iterations must not depend on previous iterations. Also one can use reductions, but they support only basic operators such as +, -,/, *, &&, ||.
How I can make this for parallel?
// Fills cmin2 and cmin2_res for every cell (i, j) with i, j >= 1. Each cell
// reads the already-computed values of its left neighbour (i, j - 1) and of
// the cell at (i - 1, j), so iterations are not independent of each other.
for (i = 1; i < n; ++i) {
for (j = 1; j < n; ++j) {
// left: combine with the neighbour at (i, j - 1); when res differs between
// the two cells, the cmin2 / cmin2_res sources are swapped
if (res[i][j - 1] != res[i][j]) {
cmin2[i][j][0] = min(cmin2_res[i][j - 1][0] + 1, cmin[i][j][0]);
cmin2_res[i][j][0] = min(cmin2[i][j - 1][0] + 1, cmin_res[i][j][0]);
} else {
cmin2[i][j][0] = min(cmin2[i][j - 1][0] + 1, cmin[i][j][0]);
cmin2_res[i][j][0] = min(cmin2_res[i][j - 1][0] + 1, cmin_res[i][j][0]);
}
// up: fold in the neighbour at (i - 1, j) together with the value just
// computed for this cell; sources are swapped again when res differs
if (res[i - 1][j] != res[i][j]) {
cmin2[i][j][0] = min3(cmin2[i][j][0], cmin2_res[i - 1][j][0] + 1, cmin[i][j][1]);
cmin2_res[i][j][0] = min3(cmin2_res[i][j][0], cmin2[i - 1][j][0] + 1, cmin_res[i][j][1]);
} else {
cmin2[i][j][0] = min3(cmin2[i][j][0], cmin2[i - 1][j][0] + 1, cmin[i][j][1]);
cmin2_res[i][j][0] = min3(cmin2_res[i][j][0], cmin2_res[i - 1][j][0] + 1, cmin_res[i][j][1]);
}
}
}
My question is rather how I can decompose this for to be able to run it in parallel (and maybe use reductions if possible).
The problem is that at each iteration the operations must be done in this order, because I have 3 more groups of for like this.
P.S. min and min3 are macros.
There's a brute force way to do what you want, but a better parallelization will require a little more input about what you want in and out of the routines.
The data dependencies in your loop look like this, in i-j space:
i →
..........
j .....1....
↓ ....12....
...123....
where the value at point 3 depends on the values at the points marked 2, and those depend on the values at the points marked 1, etc. Because of this diagonal structure, you can re-order the loops to traverse the grid diagonally, e.g. the first iteration is over (0,1), (1,0), then over (0,2), (1,1), (2,0), and so on. A simplified version of your problem looks like the code below:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
int **int2darray(int n, int m);
void free2darray(int **array);
void init2darray(int **array, int n, int m);
void tick(struct timeval *timer);
double tock(struct timeval *timer);
/* Fills two identical N x N grids with the recurrence
 *     a[i][j] = a[i-1][j] + a[i][j-1]
 * once serially and once with OpenMP, walking anti-diagonals so that all
 * cells of one diagonal (which only depend on the previous diagonal) can be
 * computed in parallel. Prints the accumulated difference and both timings. */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    const int N=10000;
    int **serialarr, **omparr;
    struct timeval serialtimer, omptimer;
    double serialtime, omptime;

    serialarr = int2darray(N,N);
    omparr    = int2darray(N,N);
    if (serialarr == NULL || omparr == NULL) {
        fprintf(stderr, "allocation failed\n");
        return 1;
    }

    init2darray(serialarr, N, N);
    init2darray(omparr, N, N);

    /* serial calculation */
    tick(&serialtimer);
    for (int i=1; i<N; i++)
        for (int j=1; j<N; j++)
            serialarr[i][j] = serialarr[i-1][j] + serialarr[i][j-1];
    serialtime = tock(&serialtimer);

    /* omp */
    tick(&omptimer);
    /* N is listed explicitly: with default(none), compilers that follow
     * OpenMP 4.0+ no longer treat a const variable as implicitly shared and
     * reject the region otherwise. firstprivate is the spelling accepted by
     * both older and newer GCC releases. */
    #pragma omp parallel shared(omparr) firstprivate(N) default(none)
    {
        /* Diagonals i + j = ipj that start on the top edge of the grid. */
        for (int ipj=1; ipj<=N; ipj++) {
            #pragma omp for
            for (int j=1; j<ipj; j++) {
                int i = ipj - j;
                omparr[i][j] = omparr[i-1][j] + omparr[i][j-1];
            }
        }
        /* Remaining diagonals, which start on the right-hand edge. */
        for (int ipj=N+1; ipj<2*N-1; ipj++) {
            #pragma omp for
            for (int j=ipj-N+1; j<N; j++) {
                int i = ipj - j;
                omparr[i][j] = omparr[i-1][j] + omparr[i][j-1];
            }
        }
    }
    omptime = tock(&omptimer);

    /* compare results; accumulate in a wide type so that many mismatches
     * cannot overflow the counter */
    long long abserr = 0;
    for (int i=0; i<N; i++)
        for (int j=0; j<N; j++)
            abserr += llabs((long long)omparr[i][j] - (long long)serialarr[i][j]);

    printf("Difference between serial and OMP array: %lld\n", abserr);
    printf("Serial time = %lf\n", serialtime);
    printf("OMP time    = %lf\n", omptime);

    free2darray(omparr);
    free2darray(serialarr);
    return 0;
}
/* Allocates an n x m int matrix as one contiguous data block plus a table of
 * row pointers into it. The cells are left uninitialised.
 * Returns NULL when a dimension is not positive, the byte count would
 * overflow size_t, or memory is unavailable.
 * Ownership: release with free(array[0]) followed by free(array). */
int **int2darray(int n, int m) {
    if (n <= 0 || m <= 0)
        return NULL;

    const size_t rows = (size_t)n;
    const size_t cols = (size_t)m;
    /* Size arithmetic is done in size_t; reject products that would wrap. */
    if (cols > ((size_t)-1) / sizeof(int) / rows)
        return NULL;

    int *data = malloc(rows * cols * sizeof *data);
    int **array = malloc(rows * sizeof *array);
    if (data == NULL || array == NULL) {
        free(array);
        free(data);
        return NULL;
    }

    for (size_t i = 0; i < rows; i++)
        array[i] = &data[i * cols];
    return array;
}
/* Releases a matrix whose row 0 points at the start of its single data
 * block: the data block is freed first, then the row-pointer table.
 * Passing NULL is a no-op, mirroring free(). */
void free2darray(int **array) {
    if (array == NULL)
        return;
    free(array[0]);
    free(array);
}
/* Sets cell (i, j) of an n x m matrix to its row-major index i*m + j. */
void init2darray(int **array, int n, int m) {
    int row = 0;
    while (row < n) {
        int col = 0;
        while (col < m) {
            array[row][col] = row * m + col;
            ++col;
        }
        ++row;
    }
}
/* Starts a measurement: stores the current wall-clock time in *timer. */
void tick(struct timeval *timer) {
    (void)gettimeofday(timer, NULL);
}
/* Ends a measurement: returns the seconds elapsed since *timer was set,
 * as a double with microsecond resolution. */
double tock(struct timeval *timer) {
    struct timeval current;
    gettimeofday(&current, NULL);

    const double fraction = (current.tv_usec - timer->tv_usec) / 1.0e6;
    return fraction + (current.tv_sec - timer->tv_sec);
}
Running gives:
$ gcc -fopenmp -Wall -O2 loops.c -o loops -std=c99
$ export OMP_NUM_THREADS=8
$ ./loops
Difference between serial and OMP array: 0
Serial time = 0.246649
OMP time = 0.174936
You'll notice the speedup is pretty poor, even with large N, because the amount of computation per iteration is small, it's the inner loop that's parallelized, and we're going through memory in a weird, cache-unfriendly order.
Some of the above could probably be fixed, but it would help a bit more to know more about what you're trying to do; eg, do you care about the cmin2_res arrays, or are they just intermediate products? In words, what are you trying to calculate?