How does the OpenACC copyin directive work? - c

According to the OpenACC documentation:
copyin - Create space for the listed variables on the device, initialize the variable by copying
data to the device at the beginning of the region, and release the space on the device when
done without copying the data back the the host.
I've created a test example program
int main(int argc, char** argv)
{
int teste[] = { -15 };
#pragma acc data copyin(teste[0:1])
{
#pragma acc parallel loop
for (int p = 0; p < 5000; p++) {
teste[0] = p;
}
}
printf("%d", teste[0]);
return 0;
}
According to the Docs the program should output -15 since the data is modified on the device and the result is not copied back to the host. But once I compile and run this code, the output is 4999
My compiler is gcc (tdm64-1) 10.3.0 and I'm running the program at a computer with separate device and host memory
I'd like to know why is this not working, and what could I do to prevent the copy from the device back to the host.
Here's the program running using git bash on windows:
$ cat test.c && echo "" &&gcc -fopenacc test.c && ./a.exe
#include <stdio.h>
int main(int argc, char** argv)
{
int teste[] = { -15 };
#pragma acc data copyin(teste[0:1])
{
#pragma acc parallel loop
for (int p = 0; p < 5000; p++) {
teste[0] = p;
}
}
printf("%d\n", teste[0]);
return 0;
}
4999
I also got access to a Linux Machine, and even using nvc I could not get the correct results
cat test.c && echo "" && /opt/nvidia/hpc_sdk/Linux_x86_64/2021/compilers/bin/nvc -acc -Minfo=accel test.c && ./a.out
#include <stdio.h>
int main(int argc, char** argv)
{
int teste[] = { -15 };
#pragma acc data copyin(teste[0:1])
{
#pragma acc parallel loop
for (int p = 0; p < 5000; p++) {
teste[0] = p;
}
}
printf("%d\n", teste[0]);
return 0;
}
main:
9, Generating copyin(teste[:]) [if not already present]
Generating NVIDIA GPU code
12, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
4999

The program should print -15 since the value isn't changed on the host. Hence this is either a bug in gcc or you're not actually enabling OpenACC. What compiler flags are you using?
Here's the output using nvc targeting an NVIDIA A100:
% cat test.c
#include <stdio.h>
int main(int argc, char** argv)
{
int teste[] = { -15 };
#pragma acc data copyin(teste[0:1])
{
#pragma acc parallel loop
for (int p = 0; p < 5000; p++) {
teste[0] = p;
}
}
printf("%d\n", teste[0]);
return 0;
}
% nvc test.c -acc -Minfo=accel ; a.out
main:
10, Generating copyin(teste[:]) [if not already present]
Generating NVIDIA GPU code
13, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
-15

Related

openacc error when assigning values to dynamically allocated struct member array of struct referenced by pointer

I am trying to wrap my head around combining openacc with pointers to structs containing dynamically allocated members. The code below fails with
Failing in Thread:1
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
when compiled using nvc ("nvc 20.9-0 LLVM 64-bit target on x86-64 Linux -tp haswell"). As far as I can tell I am following the approach suggested eg in the OpenACC 'getting started' guide. But somehow presumably the pointers don't stick (?) on the device. Does anyone know what goes wrong here?
#include <stdlib.h>
#include <stdio.h>
typedef struct grid
{
int N;
double *X;
} grid;
void allocate(grid* g, int N)
{
g->N = N;
g->X = (double*) malloc(sizeof(double) * g->N);
#pragma acc enter data create(g[0:1])
#pragma acc enter data create(g->X[0:N])
}
void release(grid* g)
{
#pragma acc exit data delete(g->X[0:g->N])
#pragma acc exit data delete(g[0:1])
free(g->X);
}
void fill(grid * g)
{
int i;
#pragma acc parallel loop
for (i = 0; i < g->N; i++)
{
g->X[i] = 42; // the cuprit, commenting this removes the error too
}
}
int main()
{
grid g;
allocate(&g, 10);
fill(&g);
release(&g);
return 0;
}```
From the compiler feedback messages you'll see something like:
fill:
32, Accelerator restriction: size of the GPU copy of g is unknown
Generating Tesla code
32, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
32, Generating implicit copyin(g) [if not already present]
37, Generating update self(g->X[:g->N])
The problem being that the compiler can't implicitly copy aggregate types with dynamic data members so you need to add a "present(g)" to indicate that g is already the device.
Also, you'll want to copyin g in order to get the value of N on the device and no need to include the array shape in the exit data delete directive. For example:
% cat test.c
#include <stdlib.h>
#include <stdio.h>
typedef struct grid
{
int N;
double *X;
} grid;
void allocate(grid* g, int N)
{
g->N = N;
g->X = (double*) malloc(sizeof(double) * g->N);
#pragma acc enter data copyin(g[0:1])
#pragma acc enter data create(g->X[0:N])
}
void release(grid* g)
{
#pragma acc exit data delete(g->X)
#pragma acc exit data delete(g)
free(g->X);
}
void fill(grid * g)
{
int i;
#pragma acc parallel loop present(g)
for (i = 0; i < g->N; i++)
{
g->X[i] = 42; // the cuprit, commenting this removes the error too
}
#pragma acc update self(g->X[:g->N])
for (i = 0; i < 4; i++)
{
printf("%d : %f \n",i,g->X[i]);
}
}
int main()
{
grid g;
allocate(&g, 10);
fill(&g);
release(&g);
return 0;
}
% nvc -acc test.c -Minfo=accel -V20.9 ; a.out
allocate:
17, Generating enter data copyin(g[:1])
Generating enter data create(g->X[:N])
release:
24, Generating exit data delete(g[:1],g->X[:1])
fill:
32, Generating present(g[:1])
Generating Tesla code
32, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
37, Generating update self(g->X[:g->N])
0 : 42.000000
1 : 42.000000
2 : 42.000000
3 : 42.000000

multifile direct compile ok, direct compile error

There are 3 files(generator.c, generator.h and main.c).
generator.c: There is only 1 function (gen fun) which is used to generate an array to store 10 random-generate numbers in generator.c.
generator.h:Declaration of generator.c.
main.c: There is only 1 function (main fun) in main.c which is used to print the number generated previously.
If generator.c is included in main.c and I compile it directly by execute "gcc main.c". The result is ok.
But while I compile it using " gcc -c generator.h, gcc -c main.c, gcc generator.o main.o ", it reported a warning "warning: assignment makes pointer from integer without a cast" at " p = gen(arr); " sentence in main funciton. And the final result was "Segmentation fault (core dumped)". The debug information showed "Cannot access memory at address" if i try to visit the value of pointer *p(i.e. array[0]) in the while loop of main function.
//////generator.c///////
int * gen( int arr[])
{
int i = 0;
int * p = arr;
int len = 10;
srand( (unsigned)((time)(NULL)));
while (i< len)
{
*p = rand() % ( len +1) + 0;
i ++;
p++;
}
return arr;
}
//////generator.h//////
int * gen( int arr[]);
//////main.c///////
int main(void)
{
int i = 0;
int arr[10]={0};
int * p;
p = gen(arr);
while (i < 10)
{
printf("output is %d\n",*p);// Segmentation fault (core dumped)
i++;
p++;
}
return 0;
}
Based on the addition to your question, it appears you are confused about how to include generator.h and then to compile the code. First your generator.h should be:
//////generator.h//////
#ifndef GENERATOR_H
#define GENERATOR_H 1
int *gen (int arr[]);
#endif
(edit: added appropriate Header Guards to prevent multiple inclusion of generator.h)
Your generator.c would then be:
//////generator.c///////
#include <stdlib.h>
#include "generator.h"
int *gen (int arr[])
{
int i = 0;
int * p = arr;
int len = 10;
while (i< len)
{
*p = rand() % len + 1;
i ++;
p++;
}
return arr;
}
And finally your main.c (I called it gen.c) would be:
//////main.c///////
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "generator.h"
int main(void)
{
int i = 0;
int arr[10]={0};
int *p;
srand( (unsigned)((time)(NULL)));
p = gen(arr);
while (i < 10)
{
printf ("output is %d\n",*p);
i++;
p++;
}
return 0;
}
Compile
$ gcc -Wall -Wextra -pedantic -std=c11 -Ofast generator.c -o bin/gen gen.c
(note: I would also encourage adding -Wshadow as a normal part of your compile string as well to identify any shadowed variables)
Example Use/Output
$ ./bin/gen
output is 8
output is 1
output is 5
output is 4
output is 9
output is 5
output is 4
output is 6
output is 5
output is 6
Look things over and let me know if you have further questions.

Why does this OpenMP code work on Linux, but not Windows?

Edit: solved! Windows limits the stack size to where my buffer does not fit; linux does not (additionaly I was accessing memory outside of my array... oops). Using gcc, you can set the stack size like so: gcc -Wl --stack,N [your other flags n stuff] where N is the size of the stack in bytes. Final working compile command: gcc -Wl --stack,8000000 -fopenmp openmp.c -o openmp
An interesting sidenote is that the rand() function seems to produce smaller patterns than in Linux, because I can see patterns (tiling) in the generated noise on Windows, but not on Linux. As always, if you need it to be absolutely random, use a cryptographically secure rand function.
Pre edit:
This piece of code is supposed to make a screenbuffer of randomnoise, then write that to a file. It works on linux (ubuntu 19) but not on windows (8.1).
The error message:
Unhandled exception at 0x0000000000413C46 in openmp.exe:
0xC00000FD: Stack overflow (parameters: 0x0000000000000001, 0x0000000000043D50).
0000000000413C46 or qword ptr [rcx],0
// gcc -fopenmp openmp.c -o openmp
// ./openmp
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <stdint.h>
int main(int argc, char **argv)
{
int w = 1920;
int h = 1080;
int thread_id, nloops;
unsigned char buffer[w][h][3]; // 1920 x 1080 pixels, 3 channels
printf("Did setup\n");
#pragma omp parallel private(thread_id, nloops)
{
nloops = 0;
thread_id = omp_get_thread_num();
printf("Thread %d started\n", thread_id);
#pragma omp for
for (int x = 0; x < w; x++){
for (int y = 0; y < h; y++){
nloops++;
unsigned char r = rand();
unsigned char g = rand();
unsigned char b = rand();
buffer[x][y][0] = r;
buffer[x][y][1] = g;
buffer[x][y][2] = b;
}
}
printf("Thread %d performed %d iterations of the loop.\n", thread_id, nloops);
}
FILE* image = fopen("render.ppm","w");
fprintf(image, "P3\n%d %d\n%d\n", w, h, 255);
for (int x = 0; x < w; x++){
for (int y = 0; y < h-1; y++){
fprintf(image, "%d %d %d ", buffer[x][y][0], buffer[x][y][1], buffer[x][y][2]);
}
fprintf(image, "%d %d %d\n", buffer[w][h][0], buffer[w][h][1], buffer[w][h][2]);
}
printf("%fmb\n", ((float)sizeof(buffer))/1000000);
return 0;
}
The local buffer variable wants 1920 * 1080 * 3 (6,220,800) bytes of space. This is more than the default stack size on a Windows application.
If you were using the Microsoft tools, you could use the /STACK linker option to specify a larger stack.
With the GCC toolchain, you can use the --stack,8000000 option to set a larger stack size.
Or you can dynamically allocate space for buffer using malloc.
A third alternative is to use the editbin tool to specify the size after the executable is built.
In
fprintf(image, "%d %d %d\n", buffer[w][h][0], buffer[w][h][1], buffer[w][h][2]);
you are accessing buffer out of bounds. The highest valid indices for buffer are w - 1 and h - 1:
fprintf(image, "%d %d %d\n", buffer[w - 1][h - 1][0], buffer[w - 1][h - 1][1], buffer[w - 1][h - 1][2]);

Fail to link c code to lapack / blas : undefined reference

i have been trying for hours and it drives me crazy. The last error I get is :
demo_cblas.c:(.text+0x83): undefined reference to `clapack_sgetrf'
demo_cblas.c:(.text+0xa3): undefined reference to `clapack_sgetri'
I am compiling the code using
/usr/bin/gcc -o demo_cblas demo_cblas.c -L /usr/lib64 -l :libgfortran.so.3 -L /usr/lib64 \
-llapack -L /usr/lib64 -lblas
I try with and without libgfortran, with different compilers gcc-33, gcc-47, gcc-48. The test code is not from me but comes from this forum ...
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "clapack.h"
#include "cblas.h"
void invertMatrix(float *a, unsigned int height){
int info, ipiv[height];
info = clapack_sgetrf(CblasColMajor, height, height, a, height, ipiv);
info = clapack_sgetri(CblasColMajor, height, a, height, ipiv);
}
void displayMatrix(float *a, unsigned int height, unsigned int width)
{
int i, j;
for(i = 0; i < height; i++){
for(j = 0; j < width; j++)
{
printf("%1.3f ", a[height*j + i]);
}
printf("\n");
}
printf("\n");
}
int main(int argc, char *argv[])
{
int i;
float a[9], b[9], c[9];
srand(time(NULL));
for(i = 0; i < 9; i++)
{
a[i] = 1.0f*rand()/RAND_MAX;
b[i] = a[i];
}
displayMatrix(a, 3, 3);
return 0;
}
I am on Suse 12.3 64bits. In /usr/lib64 I have liblapack.a liblapack.so, ... and libblas.a libblas.so, ... and libgfortran.so.3
The same code without the function "invertMatrix" (the one using the library) compiles fine.
Any idea or suggestion ?
Thank you all for your help.
Vava
I'm quite positive that you also need to link to libcblas, which is the c wrapper library for libblas. Note that libblas is a FORTRAN library which therefore does not contain the function clapack_* you're calling.
I've just got this working on FreeBSD with:
gcc -o test test.c \
-llapack -lblas -lalapack -lcblas
I'd installed math/atlas (from ports) and the lapack and blas packages.
See my question here

pthreads and drand48 concurrency performance

According to specification, the function rand() in C uses mutexes to lock context (http://sourcecodebrowser.com/uclibc/0.9.27/rand_8c.html). So if I use multiple threads that call it, my program will be slow because all threads will try to access this lock region.
So, I have found drand48(), another random number generator function, which does not have locks (http://sourcecodebrowser.com/uclibc/0.9.27/drand48_8c.html#af9329f9acef07ca14ea2256191c3ce74). But, somehow, my parallel program is still slower than the serial one! The code is pasted bellow:
Serial version:
#include <cstdlib>
#define M 100000000
int main()
{
for (int i = 0; i < M; ++i)
drand48();
return 0;
}
Parallel version:
#include <pthread.h>
#include <cstdlib>
#define M 100000000
#define N 4
pthread_t threads[N];
void* f(void* p)
{
for (int i = 0; i < M/N; ++i)
drand48();
}
int main()
{
for (int i = 0; i < N; ++i)
pthread_create(&threads[i], NULL, f, NULL);
for (int i = 0; i < N; ++i)
pthread_join(threads[i], NULL);
return 0;
}
I executed both codes. The serial one runs in ~0.6 seconds and the parallel in ~2.1 seconds.
Could anyone explain me why this happens?
Some additional information: I have 4 cores on my PC. I compile the serial version using
g++ serial.cpp -o serial
and the parallel using
g++ parallel.cpp -lpthread -o parallel
Edit:
Apparently, this performance loss happens whenever I updates a global variable in my threads. In the exemple below, the x variable is the global (note that in the parallel example, the operation will be non thread-safe):
Serial:
#include <cstdlib>
#define M 1000000000
int x = 0;
int main()
{
for (int i = 0; i < M; ++i)
x = x + 10 - 10;
return 0;
}
Parallel:
#include <pthread.h>
#include <cstdlib>
#define M 1000000000
#define N 4
pthread_t threads[N];
int x;
void* f(void* p)
{
for (int i = 0; i < M/N; ++i)
x = x + 10 - 10;
}
int main()
{
for (int i = 0; i < N; ++i)
pthread_create(&threads[i], NULL, f, NULL);
for (int i = 0; i < N; ++i)
pthread_join(threads[i], NULL);
return 0;
}
Note that the drand48() uses the global struct variable _libc_drand48_data.
drand48() uses the global struct variable _libc_drand48_data, it keeps state there (writes to it), and is therefore the source of cache line contention, which is very likely the source of the performance degradation. It isn't false sharing as I initially suspected and wrote in the comments, it is bona fide sharing. The reason there is no locking in the implementation of drand48() is two fold:
drand48() is not required to be thread-safe "The drand48(), lrand48(), and mrand48() functions need not be thread-safe."
If two threads happen to access it at the same time, and their writes to memory are interleaved there is no harm done - the data structure is not corrupted, and it is, after all, supposed to return pseudo random data.
There are some subtle considerations (race conditions) in the use of drand48() when one thread is initializing state, but considered harmless
Notice below in __drand48_iterate how it stores to three 16-bit words in the global variable, this is where the random generator keeps its state, and this is the source of the cache-line contention between your threads
xsubi[0] = result & 0xffff;
xsubi[1] = (result >> 16) & 0xffff;
xsubi[2] = (result >> 32) & 0xffff;
Source code
You provided the link to drand48() source code which I've included below for reference. The problem is cache line contention when the state is updated
#include <stdlib.h>
/* Global state for non-reentrant functions. Defined in drand48-iter.c. */
extern struct drand48_data __libc_drand48_data;
double drand48(void)
{
double result;
erand48_r (__libc_drand48_data.__x, &__libc_drand48_data, &result);
return result;
}
And here is the source for erand48_r
extern int __drand48_iterate(unsigned short xsubi[3], struct drand48_data *buffer);
int erand48_r (xsubi, buffer, result)
unsigned short int xsubi[3];
struct drand48_data *buffer;
double *result;
{
union ieee754_double temp;
/* Compute next state. */
if (__drand48_iterate (xsubi, buffer) < 0)
return -1;
/* Construct a positive double with the 48 random bits distributed over
its fractional part so the resulting FP number is [0.0,1.0). */
temp.ieee.negative = 0;
temp.ieee.exponent = IEEE754_DOUBLE_BIAS;
temp.ieee.mantissa0 = (xsubi[2] << 4) | (xsubi[1] >> 12);
temp.ieee.mantissa1 = ((xsubi[1] & 0xfff) << 20) | (xsubi[0] << 4);
/* Please note the lower 4 bits of mantissa1 are always 0. */
*result = temp.d - 1.0;
return 0;
}
And the implementation of __drand48_iterate which is where it writes back to the global
int
__drand48_iterate (unsigned short int xsubi[3], struct drand48_data *buffer)
{
uint64_t X;
uint64_t result;
/* Initialize buffer, if not yet done. */
if (unlikely(!buffer->__init))
{
buffer->__a = 0x5deece66dull;
buffer->__c = 0xb;
buffer->__init = 1;
}
/* Do the real work. We choose a data type which contains at least
48 bits. Because we compute the modulus it does not care how
many bits really are computed. */
X = (uint64_t) xsubi[2] << 32 | (uint32_t) xsubi[1] << 16 | xsubi[0];
result = X * buffer->__a + buffer->__c;
xsubi[0] = result & 0xffff;
xsubi[1] = (result >> 16) & 0xffff;
xsubi[2] = (result >> 32) & 0xffff;
return 0;
}

Resources