I noticed that two different ways of initializing an array in C seem to result in very different running times after compiling with -O3 optimization. Here is a minimal (albeit meaningless) example that reproduces the difference:
#include <stdio.h>
#include <time.h>
int main(void) {
    int i, j, k;
    int size = 10000;
    int a[size];
    clock_t time1 = clock();
    for (i = 0; i < size; i++) {
        for (j = 0; j < 300000; j++) {
            for (k = 0; k < 700000; k++) {
                a[i] = j + k;
            }
        }
    }
    clock_t time2 = clock();
    double time = (double)(time2 - time1) / CLOCKS_PER_SEC * 1000.0;
    printf("%f\n", time);
    getchar();
    return 0;
}
Compile this program with gcc (Ubuntu 5.4.0-6ubuntu1~16.04.4) 5.4.0 20160609 with -O3 optimization turned on. The program takes about 0.02 s to finish on my computer.
Now change the array declaration from "int a[size];" to "static int a[10000];" and keep everything else the same. Compile in the same environment, again with -O3. This time the program runs for about 0.001 s.
Can anyone explain why there is such a difference? Thanks!
I think this largely depends on the compiler. My GCC 5.4 completely removes the loop when static is present, probably because it can figure out that the computations have no side effects ("dead code elimination"). For some reason it fails to do so when a VLA is present (that's a missed optimization).
As a side note, to reliably measure performance you need to prevent the compiler from optimizing too much. In your case I'd suggest separating array creation from the computation, e.g. like
void __attribute__((noinline, noclone)) benchmark(int *a, int size) {
    int i, j, k;
    for (i = 0; i < size; i++)
        for (j = 0; j < 300000; j++)
            for (k = 0; k < 700000; k++)
                a[i] = j + k;
}

int main(void) {
    int size = 10000;
    int a[size];
    clock_t time1 = clock();
    benchmark(a, size);
    clock_t time2 = clock();
    double time = (double)(time2 - time1) / CLOCKS_PER_SEC * 1000.0;
    printf("%f\n", time);
    getchar();
    return 0;
}
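As an aside, you can check whether the dead-code elimination actually happened by inspecting the generated assembly; a quick way (vla_test.c is just a placeholder name for your source file):

gcc -O3 -S -o vla_test.s vla_test.c

If the loop was eliminated, the .s file will contain no loop between the two clock() calls in the static version.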
You guys have helped me so much with this code. Let me preface by saying I do not know C very well and am trying really hard to do this.
This is what the program should do:
Create a list of random numbers of length 10 million
Sort the list of random numbers using the shell sort function (it still doesn't work properly... I think it's how I am passing the pointer to the function)
Make the list 1 million longer
Repeat up to 100 million while recording the time (the time shows up as 0.000000 for some reason)
I'm just trying to test this shell sort program against the quicksort built into the standard library.
I've tried with and without pointers. The commented-out section should work when it's done; it just messes things up more lol
Please help me out, you guys have been so great so far...
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void shellSort(int *A, int n);
void checkSort(int *A, int n);
int main(){
    /* Initialize random array */
    int unsorted_list[10000000];
    int *ptr = &unsorted_list[0];
    int random_number;
    int i;
    srand(time(NULL));
    for (i = 0; i < 10000000; i++) {
        random_number = rand();
        unsorted_list[i] = random_number % 10000000;
    }

    //Do C shell sort
    double shell_results[10][2];
    double clock_diff;
    int j = 10000000;
    clock_t t0, t1;
    int k;
    for (i = 0; i < 10; i++) {
        /* Sort the list using shellSort and take the time difference */
        t0 = clock();
        shellSort(ptr, j);
        t1 = clock();
        /* Take difference in time */
        clock_diff = (t1 - t0)/CLOCKS_PER_SEC;
        /* Add time and list length to the results array */
        shell_results[i][0] = (double)j;
        shell_results[i][1] = clock_diff;
        /* Check to make sure the array has been sorted */
        checkSort(ptr, j);
        /* Re-initialize a longer array */
        //j += 1000000;
        //for (k = 0; k < j; k++) {
        //    random_number = rand();
        //    unsorted_list[k] = random_number % 1000000;
        //}
        printf("%d", (int)shell_results[i][0]);
        printf(" ");
        printf("%f", shell_results[i][1]);
        printf("\n");
    }
    return 0;
}

void shellSort(int *A, int n){
    int gap, i, j, temp;
    for (gap = n/2; gap > 0; gap /= 2)
        for (i = gap; i < n; i++)
            for (j = i - gap; j >= 0 && A[j] > A[j + gap]; j -= gap) {
                temp = A[j];
                A[j] = A[j + gap];
                A[j + gap] = temp;
            }
}

void checkSort(int *A, int n){
    int i;
    for (i = 0; i < n; i++) {
        if (A[i] > A[i + 1]) {
            printf("Error in sorting \n");
            break;
        }
    }
}
You probably don't have 40 megabytes of stack space (10 million ints at 4 bytes each). Make that array global, declare it with static, or allocate it dynamically using malloc(). If you choose the latter, don't forget to free() it.
Later, when you need to use the 100,000,000 element array, make sure to use a new allocation for it!
Well, there is no way you are going to have that amount of space available on the stack. Allocate it from the heap using malloc(). Remember to free() it afterwards.
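For reference, a minimal sketch of the dynamic-allocation fix (assuming the rest of the program stays the same; the NULL check and the cast before the division are additions here, not part of the original code):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void) {
    size_t n = 10000000;
    /* Allocate the big array on the heap instead of the stack */
    int *unsorted_list = malloc(n * sizeof *unsorted_list);
    if (unsorted_list == NULL) {
        fprintf(stderr, "allocation failed\n");
        return 1;
    }
    srand(time(NULL));
    for (size_t i = 0; i < n; i++)
        unsorted_list[i] = rand() % 10000000;

    clock_t t0 = clock();
    /* ... shellSort(unsorted_list, n) here ... */
    clock_t t1 = clock();
    /* Cast before dividing: (t1 - t0)/CLOCKS_PER_SEC is integer
       division and truncates to 0, which is likely why the time
       printed as 0.000000 */
    double clock_diff = (double)(t1 - t0) / CLOCKS_PER_SEC;
    printf("%zu %f\n", n, clock_diff);

    free(unsorted_list); /* release the heap allocation when done */
    return 0;
}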
I'm using the "read" benchmark from Why is writing to memory much slower than reading it?, and I added just two lines:
#pragma omp parallel for
for(unsigned dummy = 0; dummy < 1; ++dummy)
They should have no effect, because OpenMP should only parallelize the outer loop, but the code now consistently runs twice as fast.
Update: These lines aren't even necessary. Simply adding
omp_get_num_threads();
(implicitly declared) in the same place has the same effect.
Complete code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
unsigned long do_xor(const unsigned long* p, unsigned long n)
{
    unsigned long i, x = 0;
    for (i = 0; i < n; ++i)
        x ^= p[i];
    return x;
}

int main()
{
    unsigned long n, r, i;
    unsigned long *p;
    clock_t c0, c1;
    double elapsed;

    n = 1000 * 1000 * 1000; /* GB */
    r = 100;                /* repeat */
    p = calloc(n/sizeof(unsigned long), sizeof(unsigned long));

    c0 = clock();
    #pragma omp parallel for
    for (unsigned dummy = 0; dummy < 1; ++dummy)
        for (i = 0; i < r; ++i) {
            p[0] = do_xor(p, n / sizeof(unsigned long)); /* "use" the result */
            printf("%4ld/%4ld\r", i, r);
            fflush(stdout);
        }
    c1 = clock();

    elapsed = (c1 - c0) / (double)CLOCKS_PER_SEC;
    printf("Bandwidth = %6.3f GB/s (Giga = 10^9)\n", (double)n * r / elapsed / 1e9);
    free(p);
}
Compiled and executed with
gcc -O3 -Wall -fopenmp single_iteration.c && time taskset -c 0 ./a.out
The wall time reported by time is 3.4s vs 7.5s.
GCC 7.3.0 (Ubuntu)
The reason for the performance difference is not actually any difference in code, but in how the memory is mapped. In the fast case you are reading from zero pages, i.e. all virtual addresses are mapped to a single physical page, so nothing has to be read from memory. In the slow case every virtual page is backed by its own physical page, so the data actually has to be fetched from memory. For details see this answer from a slightly different context.
On the other hand, it is not caused by calling omp_get_num_threads or the pragma itself, but merely by linking to the OpenMP runtime library. You can confirm that by using -Wl,--no-as-needed -fopenmp. If you just specify -fopenmp but don't use it at all, the linker will omit the library.
Now, unfortunately, I am still missing the final piece of the puzzle: why does linking to OpenMP change the behavior of calloc regarding zeroed pages?
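One way to test the zero-page explanation (a sketch; the memset is my addition, not in the original benchmark) is to dirty the buffer right after allocating it. Once every page has its own physical backing, reads can no longer be served from a single shared zero page, so the fast and slow builds should converge:

p = calloc(n/sizeof(unsigned long), sizeof(unsigned long));
/* Write to the whole buffer so each virtual page gets its own
   physical page before the timed reads begin */
memset(p, 1, n);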
I want to parallelize a for loop which contains a nested comparison function for qsort:
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
int main(){
    int i;
    #pragma omp parallel for
    for (i = 0; i < 100; i++) {
        int *index = (int *)malloc(sizeof(int)*10);
        double *tmp_array = (double *)malloc(sizeof(double)*10);
        int j;
        for (j = 0; j < 10; j++) {
            tmp_array[j] = rand();
            index[j] = j;
        }
        // QuickSort the index array based on tmp_array:
        int simcmp(const void *a, const void *b){
            int ia = *(int *)a;
            int ib = *(int *)b;
            if ((tmp_array[ia] - tmp_array[ib]) > 1e-12) {
                return -1;
            } else {
                return 1;
            }
        }
        qsort(index, 10, sizeof(*index), simcmp);
        free(index);
        free(tmp_array);
    }
    return 0;
}
When I try to compile this, I get the following error:
internal compiler error: in get_expr_operands, at tree-ssa-operands.c:881
As far as I can tell, this error is due to the nested comparison function. Is there a way to make OpenMP work with this nested comparison function? If not, is there a good way to achieve a similar result without a nested comparison function?
Edit:
I'm using the GNU C compiler, where nested functions are permitted. The code compiles and runs fine without the pragma statement. I can't define simcmp outside of the for loop because tmp_array would then have to be a global variable, which would mess up the multithreading. However, if somebody has a suggestion for achieving the same result without a nested function, that would be most welcome.
I realize this has been self-answered, but here are some standard C and OpenMP options. The qsort_r function is a good classic choice, but it's worth noting that qsort_s is part of the C11 standard, and is thus portable wherever C11 is offered (which does not include Windows; they don't quite offer C99 yet).
As for doing it in OpenMP without the nested comparison function, still using the original qsort, there are two ways that come to mind. The first is to use the classic global variable in combination with OpenMP's threadprivate:
#include <stdlib.h>

static int *index = NULL;
static double *tmp_array = NULL;
#pragma omp threadprivate(index, tmp_array)

int simcmp(const void *a, const void *b){
    int ia = *(int *)a;
    int ib = *(int *)b;
    double aa = tmp_array[ia];
    double bb = tmp_array[ib];
    if ((aa - bb) > 1e-12) {
        return -1;
    } else {
        return 1;
    }
}

int main(){
    int i;
    #pragma omp parallel for
    for (i = 0; i < 100; i++) {
        index = (int *)malloc(sizeof(int)*10);
        tmp_array = (double *)malloc(sizeof(double)*10);
        int j;
        for (j = 0; j < 10; j++) {
            tmp_array[j] = rand();
            index[j] = j;
        }
        // QuickSort the index array based on tmp_array,
        // using plain qsort with the threadprivate globals:
        qsort(index, 10, sizeof(*index), simcmp);
        free(index);
        free(tmp_array);
    }
    return 0;
}
The version above causes every thread in the parallel region to use a private copy of the global variables index and tmp_array, which takes care of the issue. This is probably the most portable version you can write in standard C and OpenMP, with the only likely incompatible platforms being those that do not implement thread local memory (some microcontrollers, etc.).
If you want to avoid the global variable and still have portability and use OpenMP, then I would recommend using C++11 and the std::sort algorithm with a lambda:
std::sort(index, index + 10, [=](const int& a, const int& b){
    // std::sort expects a bool "less-than" predicate, not a
    // -1/1 comparator; return true when a should come before b.
    return (tmp_array[a] - tmp_array[b]) > 1e-12;
});
I solved my problem with qsort_r, which allows you to pass an additional pointer to the comparison function.
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int simcmp(const void *a, const void *b, void *tmp_array){
    int ia = *(int *)a;
    int ib = *(int *)b;
    double aa = ((double *)tmp_array)[ia];
    double bb = ((double *)tmp_array)[ib];
    if ((aa - bb) > 1e-12) {
        return -1;
    } else {
        return 1;
    }
}

int main(){
    int i;
    #pragma omp parallel for
    for (i = 0; i < 100; i++) {
        int *index = (int *)malloc(sizeof(int)*10);
        double *tmp_array = (double *)malloc(sizeof(double)*10);
        int j;
        for (j = 0; j < 10; j++) {
            tmp_array[j] = rand();
            index[j] = j;
        }
        // QuickSort the index array based on tmp_array:
        qsort_r(index, 10, sizeof(*index), simcmp, tmp_array);
        free(index);
        free(tmp_array);
    }
    return 0;
}
This compiles and runs with no issues. However, it is not completely ideal, as qsort_r is platform- and compiler-dependent. There is a portable version of qsort_r here, where the author summarizes my problem nicely:
If you want to qsort() an array with a comparison operator that takes
parameters you need to use global variables to pass those parameters
(not possible when writing multithreaded code), or use qsort_r/qsort_s
which are not portable (there are separate GNU/BSD/Windows versions
and they all take different arguments).
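To make the quoted incompatibility concrete, these are the differing prototypes (to the best of my knowledge; check your platform's man pages):

/* glibc (GNU): context pointer last, comparator takes it last */
void qsort_r(void *base, size_t nmemb, size_t size,
             int (*compar)(const void *, const void *, void *),
             void *arg);

/* BSD/macOS: context pointer before the comparator, and the
   comparator takes it first */
void qsort_r(void *base, size_t nmemb, size_t size, void *thunk,
             int (*compar)(void *, const void *, const void *));

/* C11 Annex K (optional): GNU-style argument order, but with
   rsize_t sizes and an errno_t return value */
errno_t qsort_s(void *base, rsize_t nmemb, rsize_t size,
                int (*compar)(const void *, const void *, void *),
                void *context);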
I'm attempting to implement block matrix multiplication and make it more parallel.
This is my code:
int i, j, jj, k, kk;
float sum;
int en = 4 * (2048/4);

#pragma omp parallel for collapse(2)
for (i = 0; i < 2048; i++) {
    for (j = 0; j < 2048; j++) {
        C[i][j] = 0;
    }
}

for (kk = 0; kk < en; kk += 4) {
    for (jj = 0; jj < en; jj += 4) {
        for (i = 0; i < 2048; i++) {
            for (j = jj; j < jj + 4; j++) {
                sum = C[i][j];
                for (k = kk; k < kk + 4; k++) {
                    sum += A[i][k] * B[k][j];
                }
                C[i][j] = sum;
            }
        }
    }
}
I've been playing around with OpenMP, but so far I've had no luck figuring out the best way to get this done in the least amount of time.
Getting good performance from matrix multiplication is a big job. Since "The best code is the code I don't have to write", a much better use of your time would be to understand how to use a BLAS library.
If you are using X86 processors, the Intel Math Kernel Library (MKL) is available free, and includes optimized, parallelized, matrix multiplication operations.
https://software.intel.com/en-us/articles/free-mkl
(FWIW, I work for Intel, but not on MKL :-))
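For reference, a minimal sketch of what the BLAS route looks like (assuming a CBLAS implementation such as MKL or OpenBLAS is installed and linked with the vendor's recommended flags):

#include <cblas.h>

/* C = alpha*A*B + beta*C for row-major n x n single-precision
   matrices; the library does the blocking and threading for you */
void matmul(const float *A, const float *B, float *C, int n) {
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                n, n, n,        /* M, N, K        */
                1.0f, A, n,     /* alpha, A, lda  */
                B, n,           /*        B, ldb  */
                0.0f, C, n);    /* beta,  C, ldc  */
}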
I recently started looking into dense matrix multiplication (GEMM) again. It turns out the Clang compiler is really good at optimizing GEMM without needing any intrinsics (GCC still needs intrinsics). The following code gets 60% of the peak FLOPS of my four-core/eight-hardware-thread Skylake system. It uses block matrix multiplication.
Hyper-threading gives worse performance, so make sure you only use a number of threads equal to the number of cores, and bind threads to prevent thread migration.
export OMP_PROC_BIND=true
export OMP_NUM_THREADS=4
Then compile like this
clang -Ofast -march=native -fopenmp -Wall gemm_so.c
The code
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <omp.h>
#include <x86intrin.h>

#define SM 80

typedef __attribute((aligned(64))) float * restrict fast_float;

static void reorder2(fast_float a, fast_float b, int n) {
    for (int i = 0; i < SM; i++)
        memcpy(&b[i*SM], &a[i*n], sizeof(float)*SM);
}

static void kernel(fast_float a, fast_float b, fast_float c, int n) {
    for (int i = 0; i < SM; i++) {
        for (int k = 0; k < SM; k++) {
            for (int j = 0; j < SM; j++) {
                c[i*n + j] += a[i*n + k]*b[k*SM + j];
            }
        }
    }
}

void gemm(fast_float a, fast_float b, fast_float c, int n) {
    int bk = n/SM;
    #pragma omp parallel
    {
        float *b2 = _mm_malloc(sizeof(float)*SM*SM, 64);
        #pragma omp for collapse(3)
        for (int i = 0; i < bk; i++) {
            for (int j = 0; j < bk; j++) {
                for (int k = 0; k < bk; k++) {
                    reorder2(&b[SM*(k*n + j)], b2, n);
                    kernel(&a[SM*(i*n + k)], b2, &c[SM*(i*n + j)], n);
                }
            }
        }
        _mm_free(b2);
    }
}

static int doublecmp(const void *x, const void *y) {
    return *(double*)x < *(double*)y ? -1 : *(double*)x > *(double*)y;
}

double median(double *x, int n) {
    qsort(x, n, sizeof(double), doublecmp);
    return 0.5*(x[n/2] + x[(n-1)/2]);
}

int main(void) {
    int cores = 4;
    double frequency = 3.1; // i7-6700HQ turbo with 4 cores
    double peak = 32*cores*frequency;
    int n = SM*10*2;
    int mem = sizeof(float) * n * n;
    float *a = _mm_malloc(mem, 64);
    float *b = _mm_malloc(mem, 64);
    float *c = _mm_malloc(mem, 64);
    memset(a, 1, mem), memset(b, 1, mem);

    printf("%dx%d matrix\n", n, n);
    printf("memory of matrices: %.2f MB\n", 3.0*mem*1E-6);
    printf("peak SP GFLOPS %.2f\n", peak);
    puts("");

    while (1) {
        int r = 10;
        double times[r];
        for (int j = 0; j < r; j++) {
            times[j] = -omp_get_wtime();
            gemm(a, b, c, n);
            times[j] += omp_get_wtime();
        }
        double flop = 2.0*1E-9*n*n*n; // GFLOP
        double time_mid = median(times, r);
        double flops_low = flop/times[r-1], flops_mid = flop/time_mid, flops_high = flop/times[0];
        printf("%.2f %.2f %.2f %.2f\n",
               100*flops_low/peak, 100*flops_mid/peak, 100*flops_high/peak, flops_high);
    }
}
This does GEMM 10 times per iteration of an infinite loop and prints the low, median, and high ratio of FLOPS to peak FLOPS, followed by the highest measured FLOPS.
You will need to adjust the following lines
int cores = 4;
double frequency = 3.1; // i7-6700HQ turbo 4 cores
double peak = 32*cores*frequency;
to match the number of physical cores, the frequency across all cores (with turbo if enabled), and the number of single-precision floating-point operations per cycle per core, which is 16 for Core2 through Ivy Bridge, 32 for Haswell through Kaby Lake, and 64 for Xeon Phi Knights Landing.
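With the values in the listing, that works out to 32 FLOP/cycle × 4 cores × 3.1 GHz ≈ 397 peak SP GFLOPS, so the 60%-of-peak figure quoted above corresponds to roughly 238 GFLOPS.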
This code may be less efficient on NUMA systems. It also does not do nearly as well on Knights Landing (I have just started looking into this).