Error in cudaMemcpyToSymbol using CUDA 5 - c

The Problem
I have prepared one sample CUDA code using the constant memory. I can run this in cuda 4.2 successfully but I get "invalid device symbol" when I compile using the CUDA 5.
I have attached the sample code here.
The Code
#include <iostream>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda.h>
struct CParameter
{
int A;
float B;
float C;
float D;
};
__constant__ CParameter * CONSTANT_PARAMETER;
#define PARAMETER "CONSTANT_PARAMETER"
bool ERROR_CHECK(cudaError_t Status)
{
if(Status != cudaSuccess)
{
printf(cudaGetErrorString(Status));
return false;
}
return true;
}
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N)
{
a[idx] = CONSTANT_PARAMETER->A * a[idx];
}
}
////Main Function/////
int main(void)
{
/////Variable Definition
const int N = 10;
size_t size = N * sizeof(float);
cudaError_t Status = cudaSuccess;
CParameter * m_dParameter;
CParameter * m_hParameter;
float * m_D;
float * m_H;
//Memory Allocation Host
m_hParameter = new CParameter;
m_H = new float[N];
//Memory Allocation Device
cudaMalloc((void **) &m_D, size);
cudaMalloc((void**)&m_dParameter,sizeof(CParameter));
////Data Initialization
for (int i=0; i<N; i++)
m_H[i] = (float)i;
m_hParameter->A = 5;
m_hParameter->B = 3;
m_hParameter->C = 98;
m_hParameter->D = 100;
//Memory Copy from Host To Device
Status = cudaMemcpy(m_D, m_H, size, cudaMemcpyHostToDevice);
ERROR_CHECK(Status);
Status = cudaMemcpy(m_dParameter,m_hParameter,sizeof(CParameter),cudaMemcpyHostToDevice);
ERROR_CHECK(Status);
Status = cudaMemcpyToSymbol(PARAMETER, &m_dParameter, sizeof(m_dParameter));
ERROR_CHECK(Status);
// Do calculation on device:
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<<n_blocks, block_size>>>(m_D,N);
// Retrieve result from device and store it in host array
cudaMemcpy(m_H, m_D, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++)
printf("%d %f\n", i, m_H[i]);
// Cleanup
free(m_H);
free(m_hParameter);
cudaFree(m_dParameter);
cudaFree(m_D);
return 0;
}
I have tried WINDOWS: CUDA 5.0 Production Release and the Graphics card is GTX 590.
Any help will be appreciated.

In an effort to avoid being "Stringly Typed", the use of character strings to refer to device symbols was deprecated in CUDA runtime API functions in CUDA 4.1, and removed in CUDA 5.0.
The CUDA 5 release notes read:
** The use of a character string to indicate a device symbol, which was possible
with certain API functions, is no longer supported. Instead, the symbol should be
used directly.
If you change your code to the following, it should work.
Status = cudaMemcpyToSymbol(CONSTANT_PARAMETER, &m_dParameter, sizeof(m_dParameter));
ERROR_CHECK(Status);

From the CUDA 5.0 Release Notes:
** The use of a character string to indicate a device symbol, which was possible with certain API functions, is no longer supported. Instead, the symbol should be used directly. "
These API functions still exist, but they accept the target symbol argument only as a bare identifier now, not as either a bare identifier or a string literal naming an ident. E.g.
__ device__ __ constant__ type ident;
main() { cudaMemcpyToSymbol("ident", ...); } // no longer valid, returns cudaErrorInvalidSymbol
main() { cudaMemcpyToSymbol(ident, ...); } // valid
So get rid of this:
#define PARAMETER "CONSTANT_PARAMETER"
And change this:
Status = cudaMemcpyToSymbol(PARAMETER, &m_dParameter, sizeof(m_dParameter));
To this:
Status = cudaMemcpyToSymbol(CONSTANT_PARAMETER, &m_dParameter, sizeof(m_dParameter));
And I think it will work.

Related

How to manually compile shared dpi lib?

I try to import some C-function that generates an array in SystemVerilog.
Here is code:
#include "svdpi.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
void getPacket(int packetSize, svOpenArrayHandle fpSig, svOpenArrayHandle err)
{
int* cfpSig = (int*) calloc(packetSize, sizeof(int));
double* cerr = (double*)calloc(packetSize, sizeof(double));
for(int i = 0; i < packetSize; ++i)
{
cfpSig[i] = i;
cerr[i] = 1.1*i;
printf("%d %f\n",cfpSig[i],cerr[i]);
}
printf("----------");
memcpy((int*) svGetArrayPtr(fpSig),cfpSig,packetSize);
memcpy((int*) svGetArrayPtr(err),cerr,packetSize);
free(cfpSig);
free(cerr);
}
import "DPI-C" function void getPacket(input int packetSize,
output int fpSig[], output real err[]);
module top();
initial begin
parameter int packetSize = 4;
int fpSig[packetSize];
real err[packetSize];
getPacket(packetSize,fpSig,err);
for(int i = 0; i < packetSize; ++i) begin
$display("fpSig: %d\nerr : %f",fpSig[i],err[i]);
end
end
endmodule
But when I compile the c-code manually, an error is generated at the linking stage: undefined reference to 'svGetArrayPtr'.
I have not previously worked with svOpenArrayHandle and it was enough to connect the header file "svdpi.h". I tried to look for some svdpi.dll lib in the questa install folder, but didn't find it.
If I compile c-file by vlog it's working fine, but I want to compile it manually because I plan to include matlab libs and compiling via vlog will become uncomfortable.
In Questasim simulator, the library containing the svGetArrayPtr symbol is mtipli.dll

Call GNU Octave functions in C?

I want to use matrix algebra and optimization. I have tested different C and C++ libraries for matrix algebra but the problem with those is they cannot handle garbage data as good as GNU Octave does. Garbage data in C and C++ goes low to like e-8 but in GNU Octave it will be pushed down way to low as e-17. That's very useful if you planning to use garbage data from e.g measurement in calculations. They don't effect nothing of your results.
But GNU Octave have a C++ API, which I don't really understand how to use. But I want to use C and call GNU Octave functions from C.
Is that possible that I can create a struct that contains a 2D array and dimensions, and send it to GNU Octave and I will return a struct again that have the result and the dimension e.g solution.
There is a c mex interface. However the octave interpreter must be embedded and initialized before any mex function can be called. As of Octave 4.4 octave_main as suggested by the linked answer has been deprecated and some other changes also are needed for it to be useful for mex programs. So I have prepared a c++ source file calloctave.cc containing the functions mexCallOctave and free_arg_list and its header calloctave.h.
calloctave.cc
// calloctave.cc
#include "interpreter.h"
#include "mxarray.h"
#include "parse.h"
extern "C"
int
mexCallOctave (int nargout, mxArray *argout[], int nargin,
mxArray *argin[], const char *fname)
{
static octave::interpreter embedded_interpreter;
if (!embedded_interpreter.initialized())
embedded_interpreter.execute ();
octave_value_list args;
args.resize (nargin);
for (int i = 0; i < nargin; i++)
args(i) = mxArray::as_octave_value (argin[i]);
bool execution_error = false;
octave_value_list retval;
retval = octave::feval (fname, args, nargout);
int num_to_copy = retval.length ();
if (nargout < retval.length ())
num_to_copy = nargout;
for (int i = 0; i < num_to_copy; i++)
{
argout[i] = new mxArray (retval(i));
}
while (num_to_copy < nargout)
argout[num_to_copy++] = nullptr;
return execution_error ? 1 : 0;
}
extern "C"
void
free_arg_list (int nargs, mxArray* arglist[])
{
for(int i = 0; i < nargs; i++)
delete arglist[i];
}
calloctave.h
// calloctave.h
#pragma once
#include "mex.h"
#if defined (__cplusplus)
extern "C" {
#endif
int
mexCallOctave (int nargout, mxArray *argout[], int nargin,
mxArray *argin[], const char *fname);
void
free_arg_list (int nargs, mxArray* arglist[]);
#if defined (__cplusplus)
}
#endif
Here is a basic introduction into mex files. You can compile an example hello world program adding the option --verbose as mkoctfile --mex --verbose hello.c to get the list of compiler options that you need to use them for compilation of your actual programs. Note that because calloctave.cc is a c++ source it should be compiled using a c++ compiler such as g++.
In the following example a m function "myfunction" is called. It gets one input and produces one output. mexCallOctave is used for calling the octave function and it has the same signature as mexCallMATLAB.
myfunction.m
% myfunction.m
function out= myfunction( a )
out = sum(a);
endfunction
main.c
//main.c
#include <stdio.h>
#include "calloctave.h"
int main()
{
double input_data[] = {0,1,2,3,4,5,6,7,8,9,10};
const int nargin = 1;
const int nargout = 1;
mxArray* rhs[nargin];
mxArray* lhs[nargout];
// allocate mex array
rhs[0] = mxCreateDoubleMatrix( 10, 1, mxREAL);
double* rhs_ptr = mxGetPr( rhs[0] );
// copy data from input buffer to mex array
for (int i = 0 ; i < 10; i++)
rhs_ptr[i] = input_data[i];
// call octave function
mexCallOctave(nargout, lhs, nargin, rhs, "myfunction");
double* lhs_ptr = mxGetPr( lhs[0] );
double output_data = *lhs_ptr;
// show the result
printf ("result = %f", output_data);
// free memory
mxDestroyArray(rhs[0]);
free_arg_list(nargout, lhs);
}

Memory corruption when implementing a Pure Data external using C

I am currently trying to implement an external for Pure Data using C. I've not been using C for a while, and I have a memory corruption problem. I don't know what to do, so I'm asking for your help.
Here is the code. Please remember that it is a C code for Pure Data.
t_int *myfft_tilde_perform(t_int *w) {
t_myfft_tilde *fft = (t_myfft_tilde *)w[1];
t_float *in = (t_float *)w[2];
t_float *out = (t_float *)w[3];
int n = (int)w[4];
int i;
for(i=0; i<n; i++)
{
fft->bigbuffer[fft->nextindex] = in[i];
fft->nextindex = (fft->nextindex+1)%TAILLE;
}
if (!fft->isfull && fft->nextindex==0)
{
fft->isfull=1;
}
if (fft->isfull)
{
for(i=0; i<TAILLE; i++)
{
fft->bigbuffercpy[i] = fft->window[i] * fft->bigbuffer[i];
}
rdft(TAILLE, 1, fft->bigbuffercpy, fft->bitshuffle, fft->weighting);
if (fft->nextindex==0)
{
i=(TAILLE-1)-64;
}
else
{
i=fft->nextindex-64;
}
int j;
for(j=0; j<64; j++)
{
out[j] = fft->bigbuffercpy[i+j];
}
}
return (w+5);
}
void myfft_tilde_dsp(t_myfft_tilde *x, t_signal **sp) {
dsp_add(myfft_tilde_perform, 4, x, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n);
}
void myfft_tilde_free(t_myfft_tilde *x) {
if (x->bitshuffle)
freebytes(x->bitshuffle, TAILLE*2*sizeof(int));
if (x->weighting)
freebytes(x->weighting, TAILLE*2*sizeof(float));
if (x->window)
freebytes(x->window, TAILLE*sizeof(float));
if (x->bigbuffer)
freebytes(x->bigbuffer, TAILLE*sizeof(float));
if (x->bigbuffercpy)
freebytes(x->bigbuffercpy, TAILLE*sizeof(float));
}
void *myfft_tilde_new(void) {
t_myfft_tilde *fft = (t_myfft_tilde *)pd_new(myfft_tilde_class);
fft->x_out = outlet_new(&fft->x_obj, &s_signal);
fft->bitshuffle = (int *)calloc(TAILLE*2, sizeof(int));
fft->weighting = (float *)calloc(TAILLE*2, sizeof(float));
fft->window = (float *)calloc(TAILLE, sizeof(float));
fft->bigbuffer = (float *)calloc(TAILLE, sizeof(float));
fft->bigbuffercpy = (float *)calloc(TAILLE, sizeof(float));
int i;
for(i=0; i<TAILLE; i++)
{
fft->window[i] = 0.54 - 0.46*cos(TWOPI*i/TAILLE);
}
fft->nextindex=0;
fft->isfull=0;
init_rdft(TAILLE*2, fft->bitshuffle, fft->weighting);
return (void *)fft;
}
void myfft_tilde_setup(void) {
myfft_tilde_class = class_new(gensym("myfft~"),
(t_newmethod)myfft_tilde_new,
(t_method)myfft_tilde_free,
0, sizeof(t_myfft_tilde),
CLASS_DEFAULT, A_GIMME, 0);
CLASS_MAINSIGNALIN(myfft_tilde_class, t_myfft_tilde, f);
class_addmethod(myfft_tilde_class, (t_method)myfft_tilde_dsp,
gensym("dsp"), 0);
}
The functions that you don't know are functions that have been given to me. The memory corruption error occurs this way : I compile the code using a given makefile, then I try to run a given Pure Data file using the resulting pd_linux file in my console, and I immediately get this message in the console :
*** Error in `puredata': malloc(): memory corruption: 0x084f4570 ***
Pd: signal 6
I tried even to remove all the calloc I've done, but the error still occurs...
you are using class_new() wrongly.
it's arguments are:
name
constructor
destructor
size of the member-data struct
flags
arg-specification for the constructor
a 0 (terminator)
you have swapped size and flags, giving an effective size of 0 bytes. pd_new() will then allocate 0 (zero, naught) bytes for your fft-struct, which leads to a memory corruption once you access the struct members (as you do in the next line).
apart from that: please always use t_sample instead of t_float (or float!!) for sample values.
always use t_float instead of float for message numbers.

How to produce sound in C on Linux?

I need a way to play certain musical notes in my C program on Linux.
When using windows, it is possible to #include <dos.h> and use straight forward functions like sound(note/frequency), delay(time in ms), and the self explaining nosound().
Is there anything parallel on Linux?
Thanks
I like the tip above concerning libao - I just gave it a try and it works nicely. Here is a similar level of complexity using OpenAL to synthesize a raw audio buffer in PCM format then to render as audio
// sudo apt-get install libopenal-dev
// gcc -o openal_play_monday openal_play_monday.c -lopenal -lm
#include <stdio.h>
#include <stdlib.h> // gives malloc
#include <math.h>
#ifdef __APPLE__
#include <OpenAL/al.h>
#include <OpenAL/alc.h>
#elif __linux
#include <AL/al.h>
#include <AL/alc.h>
#include <unistd.h>
#endif
ALCdevice * openal_output_device;
ALCcontext * openal_output_context;
ALuint internal_buffer;
ALuint streaming_source[1];
int al_check_error(const char * given_label) {
ALenum al_error;
al_error = alGetError();
if(AL_NO_ERROR != al_error) {
printf("ERROR - %s (%s)\n", alGetString(al_error), given_label);
return al_error;
}
return 0;
}
void MM_init_al() {
const char * defname = alcGetString(NULL, ALC_DEFAULT_DEVICE_SPECIFIER);
openal_output_device = alcOpenDevice(defname);
openal_output_context = alcCreateContext(openal_output_device, NULL);
alcMakeContextCurrent(openal_output_context);
// setup buffer and source
alGenBuffers(1, & internal_buffer);
al_check_error("failed call to alGenBuffers");
}
void MM_exit_al() {
ALenum errorCode = 0;
// Stop the sources
alSourceStopv(1, & streaming_source[0]); // streaming_source
int ii;
for (ii = 0; ii < 1; ++ii) {
alSourcei(streaming_source[ii], AL_BUFFER, 0);
}
// Clean-up
alDeleteSources(1, &streaming_source[0]);
alDeleteBuffers(16, &streaming_source[0]);
errorCode = alGetError();
alcMakeContextCurrent(NULL);
errorCode = alGetError();
alcDestroyContext(openal_output_context);
alcCloseDevice(openal_output_device);
}
void MM_render_one_buffer() {
/* Fill buffer with Sine-Wave */
// float freq = 440.f;
float freq = 100.f;
float incr_freq = 0.1f;
int seconds = 4;
// unsigned sample_rate = 22050;
unsigned sample_rate = 44100;
double my_pi = 3.14159;
size_t buf_size = seconds * sample_rate;
// allocate PCM audio buffer
short * samples = malloc(sizeof(short) * buf_size);
printf("\nhere is freq %f\n", freq);
int i=0;
for(; i<buf_size; ++i) {
samples[i] = 32760 * sin( (2.f * my_pi * freq)/sample_rate * i );
freq += incr_freq; // change freq just to make things interesting
if (100.0 > freq || freq > 5000.0) {
incr_freq *= -1.0f; // toggle direction of freq increment
}
}
/* upload buffer to OpenAL */
alBufferData( internal_buffer, AL_FORMAT_MONO16, samples, buf_size, sample_rate);
al_check_error("populating alBufferData");
free(samples);
/* Set-up sound source and play buffer */
// ALuint src = 0;
// alGenSources(1, &src);
// alSourcei(src, AL_BUFFER, internal_buffer);
alGenSources(1, & streaming_source[0]);
alSourcei(streaming_source[0], AL_BUFFER, internal_buffer);
// alSourcePlay(src);
alSourcePlay(streaming_source[0]);
// ---------------------
ALenum current_playing_state;
alGetSourcei(streaming_source[0], AL_SOURCE_STATE, & current_playing_state);
al_check_error("alGetSourcei AL_SOURCE_STATE");
while (AL_PLAYING == current_playing_state) {
printf("still playing ... so sleep\n");
sleep(1); // should use a thread sleep NOT sleep() for a more responsive finish
alGetSourcei(streaming_source[0], AL_SOURCE_STATE, & current_playing_state);
al_check_error("alGetSourcei AL_SOURCE_STATE");
}
printf("end of playing\n");
/* Dealloc OpenAL */
MM_exit_al();
} // MM_render_one_buffer
int main() {
MM_init_al();
MM_render_one_buffer();
}
If you want to take OpenAL further ... take a gander at this
https://github.com/scottstensland/render-audio-openal
Out of the box OpenAL plays a buffer of PCM audio just fine ... however it leaves as an exercise the ability to play a stream. In that github repo I wrote an audio server using OpenAL which implements playing streaming audio ... enjoy
Windows uses its own one and only sound architecture, therefore you can access the sound() routine.
Different linux machines, depending on the packages installed, may require different approaches.
Maybe the utility beep (out of this question on stackexchange) can guide you to the right direction
one way
including
#include<conio.h>
and in side main() or where you want to use call print("\a")
printf("\a");
2nd way
including header file
#include <windows.h>
and calling function
Beep(500, 500);
Beep(freq, dur); where freq =beep frequency which is int and dutation in also int

CUDA extern texture declaration

I want to declare my texture once and use it in all my kernels and files. Therefore, I declare it as extern in a header and include the header on all other files (following the SO How do I use extern to share variables between source files?)
I have a header cudaHeader.cuh file containing my texture:
extern texture<uchar4, 2, cudaReadModeElementType> texImage;
In my file1.cu, I allocate my CUDA array and bind it to the texture:
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc< uchar4 >( );
cudaStatus=cudaMallocArray( &cu_array_image, &channelDesc, width, height );
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMallocArray failed! cu_array_image couldn't be created.\n");
return cudaStatus;
}
cudaStatus=cudaMemcpyToArray( cu_array_image, 0, 0, image, size_image, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpyToArray failed! Copy from the host memory to the device texture memory failed.\n");
return cudaStatus;
}
// set texture parameters
texImage.addressMode[0] = cudaAddressModeWrap;
texImage.addressMode[1] = cudaAddressModeWrap;
texImage.filterMode = cudaFilterModePoint;
texImage.normalized = false; // access with normalized texture coordinates
// Bind the array to the texture
cudaStatus=cudaBindTextureToArray( texImage, cu_array_image, channelDesc);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaBindTextureToArray failed! cu_array couldn't be bind to texImage.\n");
return cudaStatus;
}
In file2.cu, I use the texture in the kernel function as follows:
__global__ void kernel(int width, int height, unsigned char *dev_image) {
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
if(y< height) {
uchar4 tempcolor=tex2D(texImage, x, y);
//if(tempcolor.x==0)
// printf("tempcolor.x %d \n", tempcolor.x);
dev_image[y*width*3+x*3]= tempcolor.x;
dev_image[y*width*3+x*3+1]= tempcolor.y;
dev_image[y*width*3+x*3+2]= tempcolor.z;
}
}
The problem is that my texture contains nothing or corrupt values when I use it in my file2.cu. Even if I use the function kernel directly in file1.cu, the data are not correct.
If I add: texture<uchar4, 2, cudaReadModeElementType> texImage; in file1.cu and file2.cu, the compiler says that there is a redefinition.
EDIT:
I tried the same thing with CUDA version 5.0 but the same problem appears. If I print the address of texImage in file1.cu and file2.cu, I don't have the same address. There must have a problem with the declaration of the variable texImage.
This is a very old question and answers were provided in the comments by talonmies and Tom. In the pre-CUDA 5.0 scenario, extern textures were not feasible due to the lack of a true linker leading to extern linkage possibilities. As a consequence, and as mentioned by Tom,
you can have different compilation units, but they cannot reference each other
In the post-CUDA 5.0 scenario, extern textures are possible and I want to provide a simple example below, showing this in the hope that it could be useful to other users.
kernel.cu compilation unit
#include <stdio.h>
texture<int, 1, cudaReadModeElementType> texture_test;
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*************************/
/* LOCAL KERNEL FUNCTION */
/*************************/
__global__ void kernel1() {
printf("ThreadID = %i; Texture value = %i\n", threadIdx.x, tex1Dfetch(texture_test, threadIdx.x));
}
__global__ void kernel2();
/********/
/* MAIN */
/********/
int main() {
const int N = 16;
// --- Host data allocation and initialization
int *h_data = (int*)malloc(N * sizeof(int));
for (int i=0; i<N; i++) h_data[i] = i;
// --- Device data allocation and host->device memory transfer
int *d_data; gpuErrchk(cudaMalloc((void**)&d_data, N * sizeof(int)));
gpuErrchk(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaBindTexture(NULL, texture_test, d_data, N * sizeof(int)));
kernel1<<<1, 16>>>();
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
kernel2<<<1, 16>>>();
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaUnbindTexture(texture_test));
}
kernel2.cu compilation unit
#include <stdio.h>
extern texture<int, 1, cudaReadModeElementType> texture_test;
/**********************************************/
/* DIFFERENT COMPILATION UNIT KERNEL FUNCTION */
/**********************************************/
__global__ void kernel2() {
printf("Texture value = %i\n", tex1Dfetch(texture_test, threadIdx.x));
}
Remember to compile generating relocatable device code, namely, -rdc = true, to enable external linkage

Resources