#include "mbed.h"
#include "TextLCD.h"
#include "HCSR04.h"
PinName pin_ECHO = D12; // Echo
PinName pin_TRIG = D11; // Trig
HCSR04 Ultra(pin_ECHO, pin_TRIG); // Sensor
// rs, rw, e, d0-d3
TextLCD lcd(D1, D2, D4, D5, D6, D7);
float Ultra_cm(void);
int main() {
uint16_t dist = 0;
lcd.printf(" Ultra\n");
lcd.printf(" Dist : [cm]");
while(1) {
dist = Ultra_cm();
lcd.locate(8, 1);
lcd.printf("%3d", dist);
wait(0.5);
}
}
float Ultra_cm(void)
{
Ultra.Measurment();
while(!Ultra.isNewDataReady());
return Ultra.getDistance_cm();
}
I'm trying to create a project where I have a ultrasonic sensor,HC-SR04, that will read the distance of an object and display it on a LCD(1602). This is the code I am using but when I compile it says the Ultra.Measurment() has no member and I don't know what I'm missing that is the issue. Any suggestion or help would be appreciated. The board I'm using is a Nucleo-L496ZG.
Related
I have directly tried to convert C++ example of mathGL Into C code here:
#include <mgl2/mgl_cf.h>
#include <mgl2/wnd_cf.h>
#include <mgl2/fltk.h>
#include <pthread.h>
#include <synchapi.h>
//mglFLTK *gr=NULL; // pointer to window
HMGL gr = NULL;//HMGL mgl_create_graph_fltk
void *calc(void *) // function with calculations
{
// mglPoint pnt; // some data for plot
// HMDT dat;
// dat= mgl_create_data () ;
for(long i=0;;i++) // do calculation
{
// long_calculations(); // which can be very long
// Sleep(200);
for(uint64_t i=0;i<200000;i++);
// pnt.Set(2*mgl_rnd()-1,2*mgl_rnd()-1);
if(gr)
{
// gr->Clf(); // make new drawing
mgl_clf ( gr) ;
// draw something
// gr->Line(mglPoint(),pnt,"Ar2");
mgl_line(gr,0,0,0,2*mgl_rnd()-1,2*mgl_rnd()-1,0,"Ar2",2);
char str[16]; snprintf(str,15,"i=%ld",i);
// gr->Puts(mglPoint(),str);
mgl_puts(gr,0,0,0,str,":C",-.7);
// don’t forgot to update window
// gr->Update();
mgl_wnd_update(gr);
}
}
}
But it doesn't work. I've added printf before mgl_wnd_update to printing i in console and it only prints one time: i=0. I'm using windows 10 and installed minGW GCC and mathgl by MSYS2 through eclipse IDE. Though I've created the not threaded one here:
int main(int argc,char **argv)
{
gr = mgl_create_graph_fltk(NULL, "First C graph", NULL, NULL);
mgl_fltk_thr ();
while(1)
{
Sleep(100);
mgl_clf ( gr) ;
// draw something
// gr->Line(mglPoint(),pnt,"Ar2");
mgl_line(gr,0,0,0,2*mgl_rnd()-1,2*mgl_rnd()-1,0,"Ar2",-.5);
char str1[16]; snprintf(str1,15,"i=%ld",1);
// gr->Puts(mglPoint(),str);
mgl_puts(gr,0,0,0,str1,":C",-.5);
// don’t forgot to update window
// gr->Update();
mgl_wnd_update(gr);
}
return 0;
}
And this is works. What is wrong with that pthreaded code? It seems that the thread loop only works once. But this loops works well on none-threade program!!
code:
#include <BLEDevice.h>
#include <BLEUtils.h>
#include <BLEServer.h>
#include <BLE2902.h>
byte flags = 0b00011110;
byte bpm;
byte rr;
byte heart[8] = { 0b00011110, 60, 60, 60, 60 , 60, 60, 60};
byte hrmPos[1] = {2};
bool _BLEClientConnected = false;
#define heartRateService BLEUUID((uint16_t)0x180D)
BLECharacteristic heartRateMeasurementCharacteristics(BLEUUID((uint16_t)0x2A37), BLECharacteristic::PROPERTY_NOTIFY);
BLECharacteristic sensorPositionCharacteristic(BLEUUID((uint16_t)0x2A38), BLECharacteristic::PROPERTY_READ);
BLEDescriptor heartRateDescriptor(BLEUUID((uint16_t)0x2901));
BLEDescriptor sensorPositionDescriptor(BLEUUID((uint16_t)0x2901));
class MyServerCallbacks : public BLEServerCallbacks {
void onConnect(BLEServer* pServer) {
_BLEClientConnected = true;
};
void onDisconnect(BLEServer* pServer) {
_BLEClientConnected = false;
}
};
void InitBLE() {
BLEDevice::init("Sensor X");
// Create the BLE Server
BLEServer *pServer = BLEDevice::createServer();
pServer->setCallbacks(new MyServerCallbacks());
// Create the BLE Service
BLEService *pHeart = pServer->createService(heartRateService);
pHeart->addCharacteristic(&heartRateMeasurementCharacteristics);
heartRateMeasurementCharacteristics.addDescriptor(&heartRateDescriptor);
heartRateMeasurementCharacteristics.addDescriptor(new BLE2902());
pHeart->addCharacteristic(&sensorPositionCharacteristic);
sensorPositionCharacteristic.addDescriptor(&sensorPositionDescriptor);
pServer->getAdvertising()->addServiceUUID(heartRateService);
pHeart->start();
// Start advertising
pServer->getAdvertising()->start();
}
void setup() {
Serial.begin(115200);
Serial.println("Start");
InitBLE();
bpm = 1;
rr = 800;
}
void loop() {
// put your main code here, to run repeatedly:
heart[1] = (byte)bpm;
int energyUsed = 3000;
heart[3] = energyUsed / 256;
heart[6] = (byte)bpm;
heart[2] = energyUsed - (heart[2] * 256);
Serial.println(bpm);
Serial.println(rr);
heartRateMeasurementCharacteristics.setValue(heart, 8);
heartRateMeasurementCharacteristics.notify();
sensorPositionCharacteristic.setValue(hrmPos, 1);
rr++;
bpm++;
delay(2000);
}
I have the above code - and I have been looking at this here but I can't figure out how do I pass the RR data over BLE. I set the flags according to the BLE spec shown here. But whenever I use an app to see the data "HRV Logger", it says that I am not transmitted RR data. I thought all I had to do was pass heart[6] data and that would show up as RR. I have dummy information being passed since I'm not connected to any sensors. Help would be helpful, determining how to pass the RR data across, thanks! (for future reference I am using the Sparkfun ESP32, but this shouldn't affect solving the issue since it has to do with protocols)
I am trying to parallelize a simple mandelbrot c program, yet I get this error that has to do with not including acc routine information. Also, I am not sure whether I should be copying data in and out of the parallel section. PS I am relatively new to parallel programming, so any advice with learning it would be appreciated.
(Warning when compiled)
PGC-S-0155-Procedures called in a compute region must have acc routine information: fwrite (mandelbrot.c: 88)
PGC-S-0155-Accelerator region ignored; see -Minfo messages (mandelbrot.c: 51)
main:
51, Accelerator region ignored
88, Accelerator restriction: call to 'fwrite' with no acc routine information
PGC/x86-64 Linux 16.10-0: compilation completed with severe errors
Here is my code:
#include <stdio.h>
#include <math.h>
int main()
{
/* screen ( integer) coordinate */
int iX,iY;
const int iXmax = 800;
const int iYmax = 800;
/* world ( double) coordinate = parameter plane*/
double Cx,Cy;
const double CxMin=-2.5;
const double CxMax=1.5;
const double CyMin=-2.0;
const double CyMax=2.0;
/* */
double PixelWidth=(CxMax-CxMin)/iXmax;
double PixelHeight=(CyMax-CyMin)/iYmax;
/* color component ( R or G or B) is coded from 0 to 255 */
/* it is 24 bit color RGB file */
const int MaxColorComponentValue=255;
FILE * fp;
char *filename="new1.ppm";
char *comment="# ";/* comment should start with # */
static unsigned char color[3];
/* Z=Zx+Zy*i ; Z0 = 0 */
double Zx, Zy;
double Zx2, Zy2; /* Zx2=Zx*Zx; Zy2=Zy*Zy */
/* */
int Iteration;
const int IterationMax=200;
/* bail-out value , radius of circle ; */
const double EscapeRadius=2;
double ER2=EscapeRadius*EscapeRadius;
/*create new file,give it a name and open it in binary mode */
fp= fopen(filename,"wb"); /* b - binary mode */
/*write ASCII header to the file*/
fprintf(fp,"P6\n %s\n %d\n %d\n %d\n",comment,iXmax,iYmax,MaxColorComponentValue);
/* compute and write image data bytes to the file*/
#pragma acc parallel loop present(CyMin, iY, PixelHeight, iX, iXmax, CxMin, PixelWidth, Zx, Zy, Zx2, Zy2, Iteration, IterationMax)
for(iY=0;iY<iYmax;iY++)
{
Cy=CyMin + iY*PixelHeight;
if (fabs(Cy)< PixelHeight/2) Cy=0.0; /* Main antenna */
#pragma acc loop
for(iX=0;iX<iXmax;iX++)
{
Cx=CxMin + iX*PixelWidth;
/* initial value of orbit = critical point Z= 0 */
Zx=0.0;
Zy=0.0;
Zx2=Zx*Zx;
Zy2=Zy*Zy;
/* */
#pragma acc loop
for (Iteration=0;Iteration<IterationMax && ((Zx2+Zy2)<ER2);Iteration++)
{
Zy=2*Zx*Zy + Cy;
Zx=Zx2-Zy2 +Cx;
Zx2=Zx*Zx;
Zy2=Zy*Zy;
};
/* compute pixel color (24 bit = 3 bytes) */
if (Iteration==IterationMax)
{ /* interior of Mandelbrot set = black */
color[0]=0;
color[1]=0;
color[2]=0;
}
else
{ /* exterior of Mandelbrot set = white */
color[0]=255; /* Red*/
color[1]=255; /* Green */
color[2]=255;/* Blue */
};
/*write color to the file*/
fwrite(color,1,3,fp);
}
}
fclose(fp);
return 0;
}
Since you can't access a file from the GPU, you'll want to capture the results to arrays, copy them back to the host, and then output the results to a file.
Also, the "present" clause indicates that you've already copied the data over to the device and the program will abort if it's not there. Given the usage, I think you meant to use "private", which indicates that the variable should be private to the execution level. However scalars are private by default in OpenACC, so there's no need to manually privatize these variables. If you do manually privatize a variable, be sure to put it at the correct loop level. For example, if you privatize "Zx" on the outer loop, it will only private to that loop level. It would be shared by all the vectors of the inner loop! Again, here, it's best to just let the compiler handle privatizing the scalars, but just be mindful of where you privatize things in the few cases where you have to manually privatize variables.
Here's a corrected version of your code.
#include <stdio.h>
#include <math.h>
int main()
{
/* screen ( integer) coordinate */
int iX,iY;
const int iXmax = 800;
const int iYmax = 800;
/* world ( double) coordinate = parameter plane*/
double Cx,Cy;
const double CxMin=-2.5;
const double CxMax=1.5;
const double CyMin=-2.0;
const double CyMax=2.0;
/* */
double PixelWidth=(CxMax-CxMin)/iXmax;
double PixelHeight=(CyMax-CyMin)/iYmax;
/* color component ( R or G or B) is coded from 0 to 255 */
/* it is 24 bit color RGB file */
const int MaxColorComponentValue=255;
FILE * fp;
char *filename="new1.ppm";
char *comment="# ";/* comment should start with # */
static unsigned char color[3];
unsigned char red[iXmax][iYmax];
unsigned char blue[iXmax][iYmax];
unsigned char green[iXmax][iYmax];
/* Z=Zx+Zy*i ; Z0 = 0 */
double Zx, Zy;
double Zx2, Zy2; /* Zx2=Zx*Zx; Zy2=Zy*Zy */
/* */
int Iteration;
const int IterationMax=200;
/* bail-out value , radius of circle ; */
const double EscapeRadius=2;
double ER2=EscapeRadius*EscapeRadius;
/*create new file,give it a name and open it in binary mode */
fp= fopen(filename,"wb"); /* b - binary mode */
/*write ASCII header to the file*/
fprintf(fp,"P6\n %s\n %d\n %d\n %d\n",comment,iXmax,iYmax,MaxColorComponentValue);
/* compute and write image data bytes to the file*/
#pragma acc parallel loop copyout(red,blue,green)
for(iY=0;iY<iYmax;iY++)
{
Cy=CyMin + iY*PixelHeight;
if (fabs(Cy)< PixelHeight/2) Cy=0.0; /* Main antenna */
#pragma acc loop
for(iX=0;iX<iXmax;iX++)
{
Cx=CxMin + iX*PixelWidth;
/* initial value of orbit = critical point Z= 0 */
Zx=0.0;
Zy=0.0;
Zx2=Zx*Zx;
Zy2=Zy*Zy;
/* */
#pragma acc loop
for (Iteration=0;Iteration<IterationMax && ((Zx2+Zy2)<ER2);Iteration++)
{
Zy=2*Zx*Zy + Cy;
Zx=Zx2-Zy2 +Cx;
Zx2=Zx*Zx;
Zy2=Zy*Zy;
};
/* compute pixel color (24 bit = 3 bytes) */
if (Iteration==IterationMax)
{ /* interior of Mandelbrot set = black */
red[iX][iY]=0;
blue[iX][iY]=0;
green[iX][iY]=0;
}
else
{ /* exterior of Mandelbrot set = white */
red[iX][iY]=255;
blue[iX][iY]=255;
green[iX][iY]=255;
}; }
}
/*write color to the file*/
for(iY=0;iY<iYmax;iY++) {
for(iX=0;iX<iXmax;iX++) {
color[0] = red[iX][iY];
color[1] = blue[iX][iY];
color[2] = green[iX][iY];
fwrite(color,1,3,fp);
}
}
fclose(fp);
return 0;
}
When using OpenACC, the parallel regions are offloaded to a device, like a GPU. GPU devices normally don't have access to the entire system or the IO, and have a reduced subset of the standard library implemented.
In your case, the fwrite function call cannot be offloaded to a device, since you cannot access the disk from the accelerator.
You could do that in OpenMP, where parallel regions are executed on CPU or MIC threads, which typically have access to the entire system library.
The acc routine directive that PGC is suggesting would allow you to annotate any function to create a device version of it.
I need a way to play certain musical notes in my C program on Linux.
When using windows, it is possible to #include <dos.h> and use straight forward functions like sound(note/frequency), delay(time in ms), and the self explaining nosound().
Is there anything parallel on Linux?
Thanks
I like the tip above concerning libao - I just gave it a try and it works nicely. Here is a similar level of complexity using OpenAL to synthesize a raw audio buffer in PCM format then to render as audio
// sudo apt-get install libopenal-dev
// gcc -o openal_play_monday openal_play_monday.c -lopenal -lm
#include <stdio.h>
#include <stdlib.h> // gives malloc
#include <math.h>
#ifdef __APPLE__
#include <OpenAL/al.h>
#include <OpenAL/alc.h>
#elif __linux
#include <AL/al.h>
#include <AL/alc.h>
#include <unistd.h>
#endif
ALCdevice * openal_output_device;
ALCcontext * openal_output_context;
ALuint internal_buffer;
ALuint streaming_source[1];
int al_check_error(const char * given_label) {
ALenum al_error;
al_error = alGetError();
if(AL_NO_ERROR != al_error) {
printf("ERROR - %s (%s)\n", alGetString(al_error), given_label);
return al_error;
}
return 0;
}
void MM_init_al() {
const char * defname = alcGetString(NULL, ALC_DEFAULT_DEVICE_SPECIFIER);
openal_output_device = alcOpenDevice(defname);
openal_output_context = alcCreateContext(openal_output_device, NULL);
alcMakeContextCurrent(openal_output_context);
// setup buffer and source
alGenBuffers(1, & internal_buffer);
al_check_error("failed call to alGenBuffers");
}
void MM_exit_al() {
ALenum errorCode = 0;
// Stop the sources
alSourceStopv(1, & streaming_source[0]); // streaming_source
int ii;
for (ii = 0; ii < 1; ++ii) {
alSourcei(streaming_source[ii], AL_BUFFER, 0);
}
// Clean-up
alDeleteSources(1, &streaming_source[0]);
alDeleteBuffers(16, &streaming_source[0]);
errorCode = alGetError();
alcMakeContextCurrent(NULL);
errorCode = alGetError();
alcDestroyContext(openal_output_context);
alcCloseDevice(openal_output_device);
}
void MM_render_one_buffer() {
/* Fill buffer with Sine-Wave */
// float freq = 440.f;
float freq = 100.f;
float incr_freq = 0.1f;
int seconds = 4;
// unsigned sample_rate = 22050;
unsigned sample_rate = 44100;
double my_pi = 3.14159;
size_t buf_size = seconds * sample_rate;
// allocate PCM audio buffer
short * samples = malloc(sizeof(short) * buf_size);
printf("\nhere is freq %f\n", freq);
int i=0;
for(; i<buf_size; ++i) {
samples[i] = 32760 * sin( (2.f * my_pi * freq)/sample_rate * i );
freq += incr_freq; // change freq just to make things interesting
if (100.0 > freq || freq > 5000.0) {
incr_freq *= -1.0f; // toggle direction of freq increment
}
}
/* upload buffer to OpenAL */
alBufferData( internal_buffer, AL_FORMAT_MONO16, samples, buf_size, sample_rate);
al_check_error("populating alBufferData");
free(samples);
/* Set-up sound source and play buffer */
// ALuint src = 0;
// alGenSources(1, &src);
// alSourcei(src, AL_BUFFER, internal_buffer);
alGenSources(1, & streaming_source[0]);
alSourcei(streaming_source[0], AL_BUFFER, internal_buffer);
// alSourcePlay(src);
alSourcePlay(streaming_source[0]);
// ---------------------
ALenum current_playing_state;
alGetSourcei(streaming_source[0], AL_SOURCE_STATE, & current_playing_state);
al_check_error("alGetSourcei AL_SOURCE_STATE");
while (AL_PLAYING == current_playing_state) {
printf("still playing ... so sleep\n");
sleep(1); // should use a thread sleep NOT sleep() for a more responsive finish
alGetSourcei(streaming_source[0], AL_SOURCE_STATE, & current_playing_state);
al_check_error("alGetSourcei AL_SOURCE_STATE");
}
printf("end of playing\n");
/* Dealloc OpenAL */
MM_exit_al();
} // MM_render_one_buffer
int main() {
MM_init_al();
MM_render_one_buffer();
}
If you want to take OpenAL further ... take a gander at this
https://github.com/scottstensland/render-audio-openal
Out of the box OpenAL plays a buffer of PCM audio just fine ... however it leaves as an exercise the ability to play a stream. In that github repo I wrote an audio server using OpenAL which implements playing streaming audio ... enjoy
Windows uses its own one and only sound architecture, therefore you can access the sound() routine.
Different linux machines, depending on the packages installed, may require different approaches.
Maybe the utility beep (out of this question on stackexchange) can guide you to the right direction
one way
including
#include<conio.h>
and in side main() or where you want to use call print("\a")
printf("\a");
2nd way
including header file
#include <windows.h>
and calling function
Beep(500, 500);
Beep(freq, dur); where freq =beep frequency which is int and dutation in also int
I have a program which (for now) calculates values of two functions in random points on GPU , sends these values back to host, and then visualizes them. This is what I get, some nice semi-random points:
Now, if I modify my kernel code, and add the local array initalization code at the very end,
__global__ void optymalize(curandState * state, float* testPoints)
{
int ind=blockDim.x*blockIdx.x+threadIdx.x;
int step=blockDim.x*gridDim.x;
for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
{
float* x=generateX(state);
testPoints[i]=ZDT_f1(x);
testPoints[i+1]=ZDT_f2(x);
}
//works fine with 'new'
//float* test_array=new float[2];
float test_array[2]={1.0f,2.0f};
}
I get something like this everytime:
Does anyone know the cause of this behavior? All the drawn points are computed BEFORE test_array is initialized, yet they are affected by it. It doesn't happen when I initialize test_array before the 'for' loop.
Host/device code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand_kernel.h"
#include "device_functions.h"
#include <random>
#include <iostream>
#include <time.h>
#include <fstream>
using namespace std;
#define XSIZE 5
#define TEST_POINTS 100
#define NOF 2
#define BLOCK_COUNT 64
#define THR_COUNT 128
#define POINTS_PER_THREAD (NOF*TEST_POINTS+THR_COUNT*BLOCK_COUNT-1)/(THR_COUNT*BLOCK_COUNT)
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=false)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__device__ float g(float* x)
{
float tmp=1;
for(int i=1;i<XSIZE;i++)
tmp*=x[i];
return 1+9*(tmp/(XSIZE-1));
}
__device__ float ZDT_f1(float* x)
{
return x[0];
}
__device__ float ZDT_f2(float* x)
{
float gp=g(x);
return gp*(1-sqrtf(x[0]/gp));
}
__device__ bool oneDominatesTwo(float* x1, float* x2)
{
for(int i=0;i<XSIZE;i++)
if(x1[i]>=x2[i])
return false;
return true;
}
__device__ float* generateX(curandState* globalState)
{
int ind = threadIdx.x;
float x[XSIZE];
for(int i=0;i<XSIZE;i++)
x[i]=curand_uniform(&globalState[ind]);
return x;
}
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
int id = blockDim.x*blockIdx.x+threadIdx.x;
curand_init ( seed, id, 0, &state[id] );
}
__global__ void optymalize(curandState * state, float* testPoints)
{
int ind=blockDim.x*blockIdx.x+threadIdx.x;
int step=blockDim.x*gridDim.x;
for(int i=ind*2;i<NOF*TEST_POINTS;i+=step*2)
{
float* x=generateX(state);
testPoints[i]=ZDT_f1(x);
testPoints[i+1]=ZDT_f2(x);
}
__syncthreads();
//float* test_array=new float[2];
//test_array[0]=1.0f;
//test_array[1]=1.0f;
float test_array[2]={1.0f,1.0f};
}
void saveResultToFile(float* result)
{
ofstream resultFile;
resultFile.open ("result.txt");
for(unsigned int i=0;i<NOF*TEST_POINTS;i+=NOF)
{
resultFile << result[i] << " "<<result[i+1]<<"\n";
}
resultFile.close();
}
int main()
{
float* dev_fPoints;
float* fPoints=new float[NOF*TEST_POINTS];
gpuErrchk(cudaMalloc((void**)&dev_fPoints, NOF * TEST_POINTS * sizeof(float)));
curandState* devStates;
gpuErrchk(cudaMalloc(&devStates,THR_COUNT*sizeof(curandState)));
cudaEvent_t start;
gpuErrchk(cudaEventCreate(&start));
cudaEvent_t stop;
gpuErrchk(cudaEventCreate(&stop));
gpuErrchk(cudaThreadSetLimit(cudaLimitMallocHeapSize, 128*1024*1024));
gpuErrchk(cudaEventRecord(start, NULL));
setup_kernel<<<BLOCK_COUNT, THR_COUNT>>>(devStates,unsigned(time(NULL)));
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaGetLastError());
optymalize<<<BLOCK_COUNT,THR_COUNT>>>(devStates, dev_fPoints);
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaMemcpy(fPoints, dev_fPoints, NOF * TEST_POINTS * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaEventRecord(stop, NULL));
gpuErrchk(cudaEventSynchronize(stop));
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, start, stop);
cout<<"Kernel execution time: "<<msecTotal<< "ms"<<endl;
saveResultToFile(fPoints);
system("start pythonw plot_data.py result.txt");
cudaFree(dev_fPoints);
cudaFree(devStates);
system("pause");
return 0;
}
Plot script code:
import matplotlib.pyplot as plt;
import sys;
if len(sys.argv)<2:
print("Usage: python PlotScript <filename>");
sys.exit(0);
path=sys.argv[1];
x=[]
y=[]
with open(path,"r") as f:
for line in f:
vals=line.strip().split(" ");
x.append(vals[0]);
y.append(vals[1]);
plt.plot(x,y,'ro')
plt.show();
The basic problem was in code you originally didn't show in your question, specifically this:
__device__ float* generateX(curandState* globalState)
{
int ind = threadIdx.x;
float x[XSIZE];
for(int i=0;i<XSIZE;i++)
x[i]=curand_uniform(&globalState[ind]);
return x;
}
Returning an address or reference to a local scope variable from a function results in undefined behaviour. It is only valid to use x by reference or value within generateX while it is in scope. There should be no surprise that adding or moving other local scope variables around within the kernel changes the kernel behaviour.
Fix this function so it populates an array passed by reference, rather than returning the address of a local scope array. And pay attention to compiler warnings - there will have been one for this which should have immediately set off alarm bells that there was something wrong.