Concurrent Kernel Launch Example - CUDA - c

I'm attempting to implement concurrent kernel launches for a very complex CUDA kernel, so I thought I'd start out with a simple example. It just launches a kernel which does a sum reduction. Simple enough. Here it is:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cuda.h>
extern __shared__ char dsmem[];
__device__ double *scratch_space;
__device__ double NDreduceSum(double *a, unsigned short length)
{
const int tid = threadIdx.x;
unsigned short k = length;
double *b;
b = scratch_space;
for (int i = tid; i < length; i+= blockDim.x)
b[i] = a[i];
__syncthreads();
do {
k = (k + 1) / 2;
if (tid < k && tid + k < length)
b[tid] += b[tid + k];
length = k;
__syncthreads();
} while (k > 1);
return b[0];
}
__device__ double reduceSum(double *a, unsigned short length)
{
const int tid = threadIdx.x;
unsigned short k = length;
do
{
k = (k + 1) / 2;
if (tid < k && tid + k < length)
a[tid] += a[tid + k];
length = k;
__syncthreads();
}
while (k > 1);
return a[0];
}
__global__ void kernel_thing(double *ad, int size)
{
double sum_1, sum_2, sum_3;
time_t begin, end, t1, t2, t3;
scratch_space = (double *) &dsmem[0];
for (int j = 0; j < 1000000; j++) {
begin = clock();
sum_1 = NDreduceSum(ad, size);
end = clock();
}
__syncthreads();
t1 = end - begin;
begin = clock();
sum_2 = 0;
if (threadIdx.x == 0) {
for (int i = 0; i < size; i++) {
sum_2 += ad[i];
}
}
__syncthreads();
end = clock();
t2 = end - begin;
__syncthreads();
begin = clock();
sum_3 = reduceSum(ad, size);
end = clock();
__syncthreads();
t3 = end - begin;
if (threadIdx.x == 0) {
printf("Sum found: %lf and %lf and %lf. In %ld and %ld and %ld ticks.\n", sum_1, sum_2, sum_3, t1, t2, t3);
}
}
int main(int argc, char **argv)
{
int i;
const int size = 512;
double *a, *ad, *b, *bd;
double sum_a, sum_b;
cudaStream_t stream_a, stream_b;
cudaError_t result;
cudaEvent_t a_start, a_stop, b_start, b_stop;
a = (double *) malloc(sizeof(double) * size);
b = (double *) malloc(sizeof(double) * size);
srand48(time(0));
for (i = 0; i < size; i++) {
a[i] = drand48();
}
for (i = 0; i < size; i++) {
b[i] = drand48();
}
sum_a = 0;
for (i = 0; i < size; i++) {
sum_a += a[i];
}
sum_b = 0;
for (i = 0; i < size; i++) {
sum_b += b[i];
}
printf("Looking for sum_a %lf\n", sum_a);
printf("Looking for sum_b %lf\n", sum_b);
cudaEventCreate(&a_start);
cudaEventCreate(&b_start);
cudaEventCreate(&a_stop);
cudaEventCreate(&b_stop);
cudaMalloc((void **) &ad, sizeof(double) * size);
cudaMalloc((void **) &bd, sizeof(double) * size);
result = cudaStreamCreate(&stream_a);
result = cudaStreamCreate(&stream_b);
result = cudaMemcpyAsync(ad, a, sizeof(double) * size, cudaMemcpyHostToDevice, stream_a);
result = cudaMemcpyAsync(bd, b, sizeof(double) * size, cudaMemcpyHostToDevice, stream_b);
cudaEventRecord(a_start);
kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
cudaEventRecord(a_stop);
cudaEventRecord(b_start);
kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
cudaEventRecord(b_stop);
result = cudaMemcpyAsync(a, ad, sizeof(double) * size, cudaMemcpyDeviceToHost, stream_a);
result = cudaMemcpyAsync(b, bd, sizeof(double) * size, cudaMemcpyDeviceToHost, stream_b);
cudaEventSynchronize(a_stop);
cudaEventSynchronize(b_stop);
float a_ms = 0;
float b_ms = 0;
cudaEventElapsedTime(&a_ms, a_start, a_stop);
cudaEventElapsedTime(&b_ms, b_start, b_stop);
printf("%lf ms for A.\n", a_ms);
printf("%lf ms for B.\n", b_ms);
result = cudaStreamDestroy(stream_a);
result = cudaStreamDestroy(stream_b);
if (result != cudaSuccess) {
printf("I should probably do this after each important operation.\n");
}
/*
printf("Matrix after:\n");
for (i = 0; i < size; i++) {
printf("%lf ", a[i]);
}
printf("\n");
*/
free(a);
free(b);
cudaFree(ad);
cudaFree(bd);
return 0;
}
Compiled like so:
CFLAGS = -arch sm_35
CC = nvcc
all: parallel
parallel: parallel.cu
$(LINK.c) $^ -o $#
clean:
rm -f *.o core parallel
I'm using a single Tesla K20X.
When I run this simple example, I get the following output:
Looking for sum_a 247.983945
Looking for sum_b 248.033749
Sum found: 247.983945 and 247.983945 and 247.983945. In 3242 and 51600 and 4792 ticks.
Sum found: 248.033749 and 248.033749 and 248.033749. In 3314 and 52000 and 4497 ticks.
4645.079102 ms for A.
4630.725098 ms for B.
Application 577759 resources: utime ~8s, stime ~2s, Rss ~82764, inblocks ~406, outblocks ~967
So, as you can see, each of the kernels gets the correct results and takes around 4.5 s, which is what I got in an earlier one-kernel version. Great! However, as you can see from the aprun output, the wall time is actually around 10 s, which is much more than the one-kernel version. So, it looks like the kernels are either not launching in parallel, or I'm not getting nearly the speed-up (2x) that I was expecting from concurrent kernel launches.
To tl;dr this question:
Am I missing anything in my code example? Are the kernels actually launching in parallel?
What kind of speed-up should I expect with a Tesla K20X? Shouldn't the kernels run exactly in parallel, completing twice the work in the same time? How many kernels can I expect to run efficiently in parallel?
Thanks for you help.

The cudaEventRecord operations in between your kernels are causing serialization.
Right now the results you are getting:
4645.079102 ms for A.
4630.725098 ms for B.
are back-to-back due to this serialization.
Instead, just time the entire kernel launch sequence:
cudaEventRecord(a_start);
kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
cudaEventRecord(a_stop);
And I think you will see an elapsed time for (a_start, a_stop) that is roughly the same as one of your previous kernels (~4600ms) indicating more or less full concurrency. I used CUDA 6 RC, copied data back to the host rather than printf from kernel, and eliminated the cudaEventRecord operations between the kernel calls, and I got an overall execution time of ~4.8s. If I didn't modify the cudaEventRecord arrangement, instead my execution time was ~8.3s
A few other notes:
I wouldn't use printf from kernel when running tests like these.
You won't get overlap of compute and cudaMemcpyAsync when the host buffer is allocated with malloc. You need to use cudaHostAlloc.
I would start with running and understanding the concurrent kernels cuda sample first.
You may want to review the appropriate section of the programming guide

Related

Why openmp 32 thread is much slower than 1 thread?

I am trying to write an application calculating l2 norm of 2 arrays. I have to parallel my calculation.
Here is the code that I have parallelized:
double time_start_openmp = omp_get_wtime();
#pragma omp parallel for
for (i = 0; i < n; i++)
{
numberOfThreads = omp_get_num_threads();
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
time_end_openmp = omp_get_wtime();
l2_norm = sqrt(l2_norm);
openmp_exec_time = time_end_openmp - time_start_openmp;
printf("OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
I compile the code as:
gcc -fopenmp -g -ggdb -Wall -lm -o test test.c
I am running this code with 1 threads and 32 threads. The output is the exact opposite of what's expected. Here is an example output:
[hayri#hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=32
[hayri#hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 32 0.001084 0.000000000000e+00
[hayri#hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=1
[hayri#hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 1 0.000106 0.000000000000e+00
Am I seeing wrong or using 32 threads is 10 times slower than 1 thread? So, what am I doing wrong here?
Here is my full code:
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <math.h>
#define MATSIZE 2000
static size_t totalMemUsage = 0;
size_t vectors_dot_prod(double *x, double *y, size_t n)
{
double res = 0.0;
size_t i;
for (i = 0; i < n; i++)
{
res += x[i] * y[i];
}
return res;
}
size_t vectors_dot_prod2(double *x, double *y, size_t n)
{
size_t res = 0.0;
size_t i = 0;
for (; i <= n - 4; i += 4)
{
res += (x[i] * y[i] +
x[i + 1] * y[i + 1] +
x[i + 2] * y[i + 2] +
x[i + 3] * y[i + 3]);
}
for (; i < n; i++)
{
res += x[i] * y[i];
}
return res;
}
void matrix_vector_mult(double **mat, double *vec, double *result, size_t rows, size_t cols)
{ // in matrix form: result = mat * vec;
size_t i;
for (i = 0; i < rows; i++)
{
result[i] = vectors_dot_prod2(mat[i], vec, cols);
}
}
double get_random()
{
double range = 1000;
double div = RAND_MAX / range;
double randomNumber = (rand() / div);
// printf("%d\n", randomNumber);
return randomNumber;
}
void print_2d_arr(double *arr, size_t row, size_t col)
{
size_t i, j, index;
for (i = 0; i < row; i++)
{
for (j = 0; j < col; j++)
{
index = i * col + j;
printf("%3f ", arr[index]);
}
printf("\n");
}
}
void print_1d_arr(double *arr, size_t row)
{
size_t i;
for (i = 0; i < row; i++)
{
printf("%f, ", arr[i]);
}
printf("\n");
}
size_t **fullfillArrayWithRandomNumbers(double *arr, size_t n)
{
/*
* Fulfilling the array with random numbers
* */
size_t i;
for (i = 0; i < n; i++)
{
arr[i] = get_random();
}
return 0;
}
double *allocarray1D(size_t size)
{
double *array = calloc(size, sizeof(double));
totalMemUsage = totalMemUsage + size * sizeof(double);
return array;
}
size_t ParallelRowMatrixVectorMultiply(size_t n, double *a, double *b, double *x, MPI_Comm comm)
{
size_t i, j;
size_t nlocal;
double *fb;
int npes, myrank;
MPI_Comm_size(comm, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
fb = (double *)malloc(n * sizeof(double));
nlocal = n / npes;
MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
for (i = 0; i < nlocal; i++)
{
x[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
x[i] += a[index] * fb[j];
}
}
free(fb);
return 0;
}
size_t ParallelRowMatrixVectorMultiply_WithoutAllgather(size_t n, double *a, double *b, double *x_partial, double *x, MPI_Comm comm)
{
// Process 0 sends b to everyone
MPI_Bcast(b, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
size_t i, j;
size_t nlocal;
// double *fb;
int npes, myrank;
MPI_Comm_size(comm, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
// fb = (double *)malloc(n * sizeof(double));
nlocal = n / npes;
// MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
for (i = 0; i < nlocal; i++)
{
x_partial[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
// printf("%f x %f\n", a[index], b[j]);
x_partial[i] += a[index] * b[j];
}
}
// free(b);
// Process 0 gathers x_partials to create x
MPI_Gather(x_partial, nlocal, MPI_DOUBLE, x, nlocal, MPI_DOUBLE, 0, MPI_COMM_WORLD);
return 0;
}
size_t SequentialMatrixMultiply(size_t n, double *a, double *b, double *x)
{
size_t i, j;
for (i = 0; i < n; i++)
{
x[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
// printf("%f x %f\n", a[index], b[j]);
x[i] += a[index] * b[j];
}
}
return 0;
}
int main(int argc, char *argv[])
{
// Global declerations
size_t i;
// MPI_Status status;
// Initialize the MPI environment
MPI_Init(&argc, &argv);
// Get the number of processes
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Get the rank of the process
int taskid;
MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
// Get the name of the processor
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);
if (argc != 2)
{
if (taskid == 0)
printf("Usage: %s <N>\n", argv[0]);
MPI_Finalize();
return 0;
}
srand(time(NULL) + taskid);
size_t n = atoi(argv[1]);
size_t nOverK = n / world_size;
double *a = allocarray1D(n * n);
double *b = allocarray1D(n);
double *x = allocarray1D(n);
double *x_partial = allocarray1D(nOverK);
double *xseq = allocarray1D(n);
double *a_partial = allocarray1D(n * nOverK);
if (a == NULL || b == NULL || x == NULL || xseq == NULL || x_partial == NULL)
{
if (taskid == 0)
printf("Allocation failed\n");
MPI_Finalize();
return 0;
}
// Process 0 creates A matrix.
if (taskid == 0)
{
fullfillArrayWithRandomNumbers(a, n * n);
// Process 0 produces the b
fullfillArrayWithRandomNumbers(b, n);
}
// Process 0 sends a_partial to everyone
if (!(world_size == 1 && n == 64000))
{
MPI_Scatter(a, n * nOverK, MPI_DOUBLE, a_partial, n * nOverK, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
double time_start = MPI_Wtime();
ParallelRowMatrixVectorMultiply_WithoutAllgather(n, a_partial, b, x_partial, x, MPI_COMM_WORLD);
double time_end = MPI_Wtime();
double parallel_exec_time = time_end - time_start;
double *exec_times = allocarray1D(world_size);
// Process 0 gathers x_partials to create x
MPI_Gather(&parallel_exec_time, 1, MPI_DOUBLE, exec_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// print_1d_arr(x, n);
if (taskid == 0)
{
SequentialMatrixMultiply(n, a, b, xseq);
// check difference between x and xseq using OpenMP
//print_1d_arr(exec_times, world_size);
// print_1d_arr(xseq, n);
double max_exec, min_exec, avg_exec;
min_exec = 1000;
for (i = 0; i < world_size; i++)
{
if (max_exec < exec_times[i])
{
max_exec = exec_times[i];
}
if (min_exec > exec_times[i])
{
min_exec = exec_times[i];
}
avg_exec += exec_times[i];
}
avg_exec = avg_exec / world_size;
long double time_start_openmp = omp_get_wtime();
long double time_end_openmp, openmp_exec_time, min_exec_time, max_exec_time, avg_exec_time;
max_exec_time = 0;
max_exec_time = 1000;
long double l2_norm = 0;
size_t numberOfThreads = 0;
size_t r = 0;
double *diff_vector = allocarray1D(n);
size_t nrepeat = 10000;
if (world_size == 1)
{
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
}
else
{
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
}
l2_norm = sqrt(l2_norm);
time_end_openmp = omp_get_wtime();
openmp_exec_time = time_end_openmp - time_start_openmp;
// print matrix size, number of processors, number of threads, time, time_openmp, L2 norm of difference of x and xseq (use %.12e while printing norm)
if (world_size == 1)
{
printf("OPENMP: %d %ld %Lf %.12e\n", n, numberOfThreads, openmp_exec_time, openmp_exec_time, l2_norm);
printf("NEW_OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
}
printf("MIN_AVG_MAX: %d %d %f %f %f\n", n, world_size, min_exec, max_exec, avg_exec);
printf("MPI: %d %d %f %.12Lf %.12e\n", n, world_size, max_exec, l2_norm, l2_norm);
totalMemUsage = totalMemUsage / (1024 * 1024 * 1024);
printf("TOTALMEMUSAGE: %zu\n", totalMemUsage);
//printf("process: %d %d %d %f %.12e\n", taskid, n, world_size, parallel_exec_time, l2_norm);
//printf("%d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
}
MPI_Finalize();
return 0;
}
Here is the output;
cn009
36
mpicc -fopenmp -g -ggdb -lm -o rowmv rowmv.c
OPENMP: 32000 1 0.000299 2.991110086441e-04
MIN_AVG_MAX: 32000 1 3.112523 3.112523 3.112523
MPI: 32000 1 3.112523 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 2 0.000535 5.350699648261e-04
MIN_AVG_MAX: 32000 1 3.125519 3.125519 3.125519
MPI: 32000 1 3.125519 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 4 0.000434 4.341900348663e-04
MIN_AVG_MAX: 32000 1 3.170650 3.170650 3.170650
MPI: 32000 1 3.170650 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 8 0.000454 4.542167298496e-04
MIN_AVG_MAX: 32000 1 3.168685 3.168685 3.168685
MPI: 32000 1 3.168685 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 16 0.000507 5.065393634140e-04
MIN_AVG_MAX: 32000 1 3.158761 3.158761 3.158761
MPI: 32000 1 3.158761 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 32 0.000875 8.752988651395e-04
MIN_AVG_MAX: 32000 1 3.166051 3.166051 3.166051
MPI: 32000 1 3.166051 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
Am I seeing wrong or using 32 threads is 10 times slower than 1
thread? So, what am I doing wrong here?
In the portion of code that is being both profiled and parallelized with OpenMP:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
there is a race condition, namely the access to the variable l2_norm. Moreover, you can drop the private(i), since the index variable (i.e., i) in the parallelized loop will be set implicitly as private by OpenMP. The race condition can be fixed with the OpenMP reduction. Furthermore, your loop is not actually distributing the iterations among threads as you wanted. Because you added again the parallel clause to that #pragma omp for, and assuming that you have nested parallelism disabled, which by default it is, each of the threads created in the outer parallel region will execute "sequentially" the code within that region, namely:
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
Hence, each thread will execute all the N iterations of the loop that you intended to be parallelized. Consequently, removing the parallelism and adding additional overhead (e.g., thread creation) to the sequential code. To fix those problems (i.e., race condition and "nested" parallel region) change this code to:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp for reduction(+:l2_norm)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
Now, having fixed those problems you are left still with another problem (performance-wise), namely that the parallel loop is being performed in the context of a hybrid parallelization of OpenMP + MPI, and you did not explicitly bind the OpenMP threads (within the MPI processes) to the corresponded cores. Without that explicit binding, one cannot be sure in which cores those threads will end up. Naturally, more often than not, having multiple threads running in the same logical core will increase the overall execution of the application being parallelized.
If your application uses threads, then you probably want to ensure that you are either not bound at all (by specifying --bind-to none), or bound to multiple cores using an appropriate binding level or a specific number of processing elements per application process. You can solve this problem by either:
disabling the binding with the MPI flag --bind-to none, to enable threads to be assigned to different cores;
or perform the bound of threads, accordingly. Check this SO thread on how to map the threads to cores in Hybrid parallelizations such as MPI + OpenMP.
By explicitly setting the number of threads per process accordingly, you can avoid that multiple threads end up in the same core, and consequently, avoid that threads within the same core fight for the same resources.
Advice:
IMO you should first test the performance of the OpenMP alone, without any MPI process. In this context, test the scalability of code by measuring the sequential version against 2 threads, then 4, 8, and so on, gradually increasing the number of threads. Eventually, there will be a number of threads for which the code simply stops scaling. Naturally, the amount of parallel work being performed by the threads has to be big enough to overcome the overhead of parallelism. Therefore, you should also test around with bigger and bigger inputs.
After having profiled, tested an improved your OpenMP version you can then extent that shared-memory parallelization with multiple processes using MPI.
Besides the race condition in updating a shared variable as noted in #dreamcrash's answer, your code is not distributing the work properly.
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
~~~~~~~~
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
The parallel construct in the inner loop makes it a nested combined parallel for construct. It means that each thread in the team executing the outer parallel loop spawns a brand new parallel region and distributes the i-loop over the threads in it. There is no distribution happening in the outer parallel region and you end up with N threads all repeating the exact same work. By default nested parallelism is disabled, so the nested parallel region runs sequentially and your code is effectively doing this:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
There is no distribution of work and all threads write to the same locations in the diff_vector[] array.
On one hand, this code in general is a memory-bound one since the amount of computation per byte of data is low - modern CPUs can do many multiplications and subtractions per cycle while fetching data from memory and writing results back there takes many cycles. Memory-bound problems don't get any faster with more threads since the limiting factor is the memory bandwidth. This isn't that big of a problem in your case because 32K array entries take up 256 KB of memory and that fits in most CPU caches, and the L3 cache is blazing fast, but is still larger than the fastest L1 cache of a single CPU core. On the other hand, writing to the same memory areas from multiple threads results in true and false sharing, with the associated inter-thread cache invalidation, which usually results in the parallel code running way slower than the sequential version.
There are tools that can help you analyse the performance of your code and spot problems. As I already wrote in a comment, Intel VTune is one of them and is freely available as part of the oneAPI toolkit. Intel Inspector is another one (again, free and part of the oneAPI toolkit) and it finds problems such as data races. The two tools work very well together and I couldn't recommend them strongly enough to any aspiring parallel programmer.
There is also a minor race condition writing to numberOfThreads, but since all values written are the same, that isn't much of a logical problem. The correct version of the code in question should be:
#pragma omp parallel
{
#pragma omp master
numberOfThreads = omp_get_num_threads();
#pragma omp parallel reduction(+:l2_norm)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}

OpenMP parallel multiplication slower than Sequential multiplication

I'm learning OpenMP and I'm trying to do a simple task: A[r][c] * X[c] = B[r] (matrix vector multiplication).
The problem is: the sequential code is faster than parallel and I don't know why!
My code:
#include <omp.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/types.h>
// Defined variables
#define row_matriz_A 80000
#define col_matriz_A 800
#define THREADS_NUM 4
// FUNCAO - GERAR MATRIZES
void gerarMatrizes(int r, int c, int mA[], int vX[], int vB[]){...}
// FUNCAO - SEQUENTIAL MULTIPLICATION
void multSequencial(int r, int c, int mA[], int vX[], int vB[]){
// Variables
int i, j, offset, sum;
struct timeval tv1,tv2;
double t1, t2;
// Begin Time
gettimeofday(&tv1, NULL);
t1 = (double)(tv1.tv_sec) + (double)(tv1.tv_usec)/ 1000000.00;
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
offset = i * c + j;
sum += mA[offset] * vX[j];
}
vB[i] = sum;
}
// End time
gettimeofday(&tv2, NULL);
t2 = (double)(tv2.tv_sec) + (double)(tv2.tv_usec)/ 1000000.00;
printf("\nO tempo de execucao sequencial foi: %lf segundos.\n", (t2 - t1));
return;
}
// FUNCAO - MULTIPLICACAO PARALELA COM OpenMP
void matvecHost(int r, int c, int mA[], int vX[], int vB[]){
// Variaveis
int tID, i, j, offset, sum;
struct timeval tv1, tv2;
double t1, t2;
// Init vB
for(i = 0; i < r; i++) vB[i] = 0;
// BEGIN Time
gettimeofday(&tv1, NULL);
t1 = (double)(tv1.tv_sec) + (double)(tv1.tv_usec)/ 1000000.00;
omp_set_num_threads(THREADS_NUM);
#pragma omp parallel private(tID, i, j) shared(mA, vB, vX)
{
tID = omp_get_thread_num();
#pragma omp for
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
offset = i * c + j;
sum += mA[offset] * vX[j];
}
vB[i] = sum;
}
}
// End time
gettimeofday(&tv2, NULL);
t2 = (double)(tv2.tv_sec) + (double)(tv2.tv_usec)/ 1000000.00;
printf("\nO tempo de execucao OpenMP foi: %lf segundos.\n", (t2 - t1));
return;
}
// FUNCAO - PRINCIPAL
int main(int argc, char * argv[]) {
int row, col;
row = row_matriz_A;
col = col_matriz_A;
int *matrizA = (int *)calloc(row * col, sizeof(int));
int *vectorX = (int *)calloc(col * 1, sizeof(int));
int *vectorB = (int *)calloc(row * 1, sizeof(int));
gerarMatrizes(row, col, matrizA, vectorX, vectorB);
multSequencial(row, col, matrizA, vectorX, vectorB);
matvecHost(row, col, matrizA, vectorX, vectorB);
return 0;
}
Previous solutions that did not worked:
Use collapse in my squared for
Increse rows and columns size
Increase thread numbers (A teacher recommend to use thread number == threads physical number)
Use malloc instead of m[i][j]
EDIT - ANSWER
My parallel block was correctly changed based on the correct answer:
#pragma omp parallel private(i, j, sum) shared(mA, vB, vX)
{
#pragma omp for
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
sum += mA[i * c + j] * vX[j];
}
vB[i] = sum;
}
}
I still got some a doubt:
If I define i, j and sum inside my parallel block, they will be set as private automatically? This improve the speed in my code or not?
You have race conditions on sum and offset - those are shared between the threads instead of being thread-private.
This also likely explains the slowdown: On x86, the CPU will actually work hard to make sure accesses to shared variables "work". This involves flushing cache lines after every (!) write to offset and sum - so all the threads are wildly writing into the same variables, but each one has to wait until the write from the previous thread (on a different core) has arrived in the local cache again after having been flushed. And of course it will produce completely nonsensical results.
I don't know why you are declaring all your variables at the start of the function - that's prone to these kind of mistakes. If you declared i, j, sum and offset (and the unused tID) in the smallest possible scopes instead, you wouldn't ever had this problem because they would be thread-private automatically in that case.

Apply an operation to an entire block of memory in C

As I know in C language in order to multiply an array by a scalar, you have to iterate over each element using a for-loop. And as I know also the source code for the R software environment is written primarily in C. And from there when I have a big matrix in R like mat = matrix(5, nrow = 1100, ncol = 1100) and then multiply it by a constant and measure the time of this operation, just like so:
t_start = Sys.time()
mat = mat *5
print(Sys.time()-t_start)
output:
Time difference of 0.005984068 secs
But doing the same thing using for-loops, it takes too much time:
t_start = Sys.time()
for(i in 1:1100)
{
for(j in 1:1100)
{
mat[i,j] = mat[i,j] * 5
}
}
print(Sys.time()-t_start)
output:
Time difference of 0.1437349 secs
The second way is ~24 times slower, now I'm assuming that behind the scene the first way is also has been done using for-loops, if so, why the time difference is too big?!
I'm wondering if there is a better way to apply an operation to an entire block of memory in C, without iterating over each element using loops.
I would like to get some answers from C language perspective, as I'm working currently with C. And those pieces of R-code was just to show two different ways of doing this that R provides and C do not.
Even use the C language for loop, it is faster than the first way In R language.
so you don't have to worry about for-loop slower in c language.
See the results below.
C language for loop: 0.00093478 secs
gcc -otest test.c -g -Wall -O2
./test
Time difference of 0.00093478 secs
the first way In R language: 0.004915237 secs
./Rscript first.R
Time difference of 0.004915237 secs
test.c code:
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
typedef struct matrix {
int nrow;
int ncol;
int *buf;
int *(array[]);
} matrix;
double sys_time()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts.tv_sec + ts.tv_nsec / 1000000000.0;
}
void test(matrix *mat)
{
int i, j;
double t_start, t_end;
t_start = sys_time();
for(i = 0; i < mat->nrow; i++) {
for(j = 0; j < mat->ncol; j++) {
mat->array[i][j] *= 5;
}
}
t_end = sys_time();
printf("Time difference of %g secs\n", t_end - t_start);
}
matrix *create_matrix(int val, int nrow, int ncol)
{
matrix *mat;
int *buf;
int i, j;
mat = (matrix *)malloc(sizeof(*mat) + nrow * sizeof(int *));
buf = (int *)malloc(sizeof(int) * nrow * ncol);
mat->buf = buf;
mat->nrow = nrow;
mat->ncol = ncol;
for(i = 0; i < nrow; i++) {
for(j = 0; j < ncol; j++)
buf[j] = val;
mat->array[i] = buf;
buf += ncol;
}
return mat;
}
void destroy_matrix(matrix *mat)
{
free(mat->buf);
free(mat);
}
int main()
{
matrix *mat;
mat = create_matrix(5, 1100, 1100);
test(mat);
destroy_matrix(mat);
return 0;
}

CUDA - Sieve of Eratosthenes division into parts

I'm writing implementation of Sieve of Eratosthenes (https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes) on GPU. But no sth like this - http://developer-resource.blogspot.com/2008/07/cuda-sieve-of-eratosthenes.html
Method:
Creating n-element array with default values 0/1 (0 - prime, 1 - no) and passing it on GPU (I know that it can be done directly in kernel but it's not problem in this moment).
Each thread in block checks multiples of a single number. Each block checks in total sqrt(n) possibilities. Each block == different interval.
Marking multiples as 1 and passing data back to the host.
Code:
#include <stdio.h>
#include <stdlib.h>
#define THREADS 1024
__global__ void kernel(int *global, int threads) {
extern __shared__ int cache[];
int tid = threadIdx.x + 1;
int offset = blockIdx.x * blockDim.x;
int number = offset + tid;
cache[tid - 1] = global[number];
__syncthreads();
int start = offset + 1;
int end = offset + threads;
for (int i = start; i <= end; i++) {
if ((i != tid) && (tid != 1) && (i % tid == 0)) {
cache[i - offset - 1] = 1;
}
}
__syncthreads();
global[number] = cache[tid - 1];
}
int main(int argc, char *argv[]) {
int *array, *dev_array;
int n = atol(argv[1]);
int n_sqrt = floor(sqrt((double)n));
size_t array_size = n * sizeof(int);
array = (int*) malloc(n * sizeof(int));
array[0] = 1;
array[1] = 1;
for (int i = 2; i < n; i++) {
array[i] = 0;
}
cudaMalloc((void**)&dev_array, array_size);
cudaMemcpy(dev_array, array, array_size, cudaMemcpyHostToDevice);
int threads = min(n_sqrt, THREADS);
int blocks = n / threads;
int shared = threads * sizeof(int);
kernel<<<blocks, threads, shared>>>(dev_array, threads);
cudaMemcpy(array, dev_array, array_size, cudaMemcpyDeviceToHost);
int count = 0;
for (int i = 0; i < n; i++) {
if (array[i] == 0) {
count++;
}
}
printf("Count: %d\n", count);
return 0;
}
Run:
./sieve 10240000
It works correctly when n = 16, 64, 1024, 102400... but for n = 10240000 I getting incorrect result. Where is problem?
This code has a variety of problems, in my view.
You are fundamentally accessing items out of range. Consider this sequence in your kernel:
int tid = threadIdx.x + 1;
int offset = blockIdx.x * blockDim.x;
int number = offset + tid;
cache[tid - 1] = global[number];
You (in some cases -- see below) have launched a thread array exactly equal in size to your global array. So what happens when the highest numbered thread runs the above code? number = threadIdx.x+1+blockIdx.x*blockDim.x. This number index will be one beyond the end of your array. This is true for many possible values of n. This problem would have been evident to you if you had either used proper cuda error checking or had run your code with cuda-memcheck. You should always do those things when you are having trouble with a CUDA code and also before asking for help from others.
The code only has a chance of working correctly if the input n is a perfect square. The reason for this is contained in these lines of code (as well as dependencies in the kernel):
int n = atol(argv[1]);
int n_sqrt = floor(sqrt((double)n));
...
int threads = min(n_sqrt, THREADS);
int blocks = n / threads;
(note that the correct function here would be atoi not atol, but I digress...) Unless n is a perfect square, the resultant n_sqrt will be somewhat less than the actual square root of n. This will lead you to compute a total thread array that is smaller than the necessary size. (It's OK if you don't believe me at this point. Run the code I will post below and input a size like 1025, then see if the number of threads * blocks is of sufficient size to cover an array of 1025.)
As you've stated:
Each block checks in total sqrt(n) possibilities.
Hopefully this also points out the danger of non-perfect square n, but we must now ask "what if n is larger than the square of the largest threadblock size (1024)? The answer is that the code will not work correctly in many cases - and your chosen input of 10240000, although a perfect square, exceeds 1024^2 (1048576) and it does not work for this reason. Your algorithm (which I claim is not a Sieve of Eratosthenes) requires that each block be able to check sqrt(n) possibilities, just as you stated in the question. When that no longer becomes possible because of the limits of threads per block, then your algorithm starts to break.
Here is a code that makes some attempt to fix issue #1 above, and at least give an explanation for the failures associated with #2 and #3:
#include <stdio.h>
#include <stdlib.h>
#define THREADS 1024
#define MAX 10240000
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void kernel(int *global, int threads) {
extern __shared__ int cache[];
int tid = threadIdx.x + 1;
int offset = blockIdx.x * blockDim.x;
int number = offset + tid;
if ((blockIdx.x != (gridDim.x-1)) || (threadIdx.x != (blockDim.x-1))){
cache[tid - 1] = global[number];
__syncthreads();
int start = offset + 1;
int end = offset + threads;
for (int i = start; i <= end; i++) {
if ((i != tid) && (tid != 1) && (i % tid == 0)) {
cache[i - offset - 1] = 1;
}
}
__syncthreads();
global[number] = cache[tid - 1];}
}
int cpu_sieve(int n){
int limit = floor(sqrt(n));
int *test_arr = (int *)malloc(n*sizeof(int));
if (test_arr == NULL) return -1;
memset(test_arr, 0, n*sizeof(int));
for (int i = 2; i < limit; i++)
if (!test_arr[i]){
int j = i*i;
while (j <= n){
test_arr[j] = 1;
j += i;}}
int count = 0;
for (int i = 2; i < n; i++)
if (!test_arr[i]) count++;
return count;
}
int main(int argc, char *argv[]) {
int *array, *dev_array;
if (argc != 2) {printf("must supply n as command line parameter\n"); return 1;}
int n = atoi(argv[1]);
if ((n < 1) || (n > MAX)) {printf("n out of range %d\n", n); return 1;}
int n_sqrt = floor(sqrt((double)n));
size_t array_size = n * sizeof(int);
array = (int*) malloc(n * sizeof(int));
array[0] = 1;
array[1] = 1;
for (int i = 2; i < n; i++) {
array[i] = 0;
}
cudaMalloc((void**)&dev_array, array_size);
cudaMemcpy(dev_array, array, array_size, cudaMemcpyHostToDevice);
int threads = min(n_sqrt, THREADS);
int blocks = n / threads;
int shared = threads * sizeof(int);
printf("threads = %d, blocks = %d\n", threads, blocks);
kernel<<<blocks, threads, shared>>>(dev_array, threads);
cudaMemcpy(array, dev_array, array_size, cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
int count = 0;
for (int i = 0; i < n; i++) {
if (array[i] == 0) {
count++;
}
}
printf("Count: %d\n", count);
printf("CPU Sieve: %d\n", cpu_sieve(n));
return 0;
}
There are a couple of issues, I think, but here's a pointer to the actual problem: The sieve of Eratosthenes removes iteratively multiples of already encountered prime numbers, and you want to separate the work-load into thread-blocks, where each thread-block operates on a piece of shared memory (cache, in your example). Thread-blocks, however, are generally independent from all other thread-blocks and cannot easily communicate with one another. One example to illustrate the problem: The thread with index 0 in thread-block with index 0 removes multiples of 2. Thread blocks with index > 0 have no way to know about this.

Is this a possible way to optimize multidimensional arrays on heap?

Below is a usual way to allocate multidimensional arrays on heap, by using pointers to pointers.
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
But I decided to make a version that allocates only one chunck of memory and does element accessing by multiplication to an index value.
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[i * aai2.length2])
The second version appreas to run about 20% faster when running the test code below. What is likely to be the cause, and is this a generally applicable optimization technique? Are there some libraries that define array types of this kind for optimization purpose?
I made a huge mistake in the test code before edit. The first version ran slower because its allocation and deallocation kept place inside the for-loop while the second one did it only once before entering the loop. See the comments in the test code below. After making the two tests equal, I find that the first version can run even faster, especially after optimization. The more complex operations and various copies I put into the test code, I see the first one always run a little bit faster. It seems that the multiplication for indexing is slow in my machine? I'm not sure for the cause, though.
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
int main() {
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
ArrayArrayInt2Create(&aai2, N, N); //Mistake Here!!
for (int k = 0; k < N; k += 1) {
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
}
ArrayArrayInt2Delete(&aai2); //Should go inside the loop block..
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
return 0;
}
Yes, using arithmetic and a single base pointer is what the compiler does internally for non-dynamically allocated 2D (n-dimensional) arrays.
You gain the most performance because there's a single calculation and indexed lookup. With the 2D array shown, there are two pointer lookups and two index calculations per array access (one index calculation and lookup to get to the right array, and then the second to access the element in the right array). With a 3D array, there'd be three index calculations and three lookups.
You also allocate less memory, and need fewer memory allocations, but those are second order effects.
Also, as WhozCraig points out in a comment but I didn't mention, you get better locality of reference and potential for smarter prefetch with a single big chunk of memory compared with multiple smaller chunks (that add up to more memory than the single big chunk).
I tested this file (sim2d.c) compiled with GCC 4.9.1 on Mac OS X 10.10.2 Yosemite.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
static void *MjMalloc(size_t nbytes)
{
void *rv = malloc(nbytes);
if (rv == 0)
{
fprintf(stderr, "Memory allocation failure (%zu bytes)\n", nbytes);
exit(1);
}
return rv;
}
/* Mechanism 1 */
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
/* Mechanism 2 */
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[(i) * aai2.length2])
#define aai2At2(aai2, i, j) (aai2.array[(i) * aai2.length2 + (j)])
/* Head-to-head testing */
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
#define N_CYCLES 1000
static void one_test_cycle(void)
{
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai1: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At2(aai2, i, j) = i;
}
while ((j -= 1) >= 0) {
sum += aai2At2(aai2, i, j) - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai3: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
}
static void print_now(const char *tag)
{
time_t now = time(0);
struct tm *lt = localtime(&now);
char buffer[32];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", lt);
printf("%s: %s\n", tag, buffer);
}
int main(void)
{
print_now("Started");
for (int i = 0; i < 3; i++)
one_test_cycle();
print_now("Finished");
return 0;
}
There are two slightly different ways of accessing the aai2 data. I also separated the array size (N = 2000) from the number of cycles in a single test (N_CYCLES = 1000). The timing results I got were:
Started: 2015-04-07 07:40:41
aai1: sum = 4000000000; time = 6.80
aai2: sum = 4000000000; time = 5.99
aai3: sum = 4000000000; time = 5.98
aai1: sum = 4000000000; time = 6.75
aai2: sum = 4000000000; time = 6.02
aai3: sum = 4000000000; time = 5.99
aai1: sum = 4000000000; time = 6.72
aai2: sum = 4000000000; time = 6.01
aai3: sum = 4000000000; time = 5.99
Finished: 2015-04-07 07:41:38
I was getting similar patterns with (N_CYCLE = 2000), but it was taking twice as long to run — surprise, surprise.
I'm seeing a small but noticeable benefit (about 13% decrease) from the single allocation code, but no significant difference between the two timings for the 'aai2' tests.
Basic statistics:
# All data
# Count = 9
# Mean = 6.250000e+00
# Std Dev = 3.807230e-01
# aai1 only:
# Count = 3
# Mean = 6.756667e+00
# Std Dev = 4.041452e-02
# aai2 and aai3:
# Count = 6
# Mean = 5.996667e+00
# Std Dev = 1.505545e-02
# aai2 only:
# Count = 3
# Mean = 6.006667e+00
# Std Dev = 1.527525e-02
# aai3 only:
# Count = 3
# Mean = 5.986667e+00
# Std Dev = 5.773503e-03
Clearly, formally making sure the machine is otherwise unloaded, and running many more iterations of the test, and similar benchmarking steps might improve the data, but the single allocation aai2 mechanism performs better on this machine than the multi-allocation aai mechanism. (Tangential aside: why do people not put a suffix 1 on their first version when they have two or more versions of the code?)
Hardware: 17" Mac Book Pro, early-2011, 2.3 GHz Intel Core i7, 16 GiB 1333 MHz DDR3 RAM.

Resources