Related
I'm trying to compute the average vector in RGB tristimulus space with libpng in C, and NumPy in Python, but I'm getting different results with each. I'm quite confident Python is giving the correct result with this image of [ 127.5 127.5 0. ]. However, with the following block of C I get the preposterous result of [ 38.406494 38.433670 38.459641 ]. I've been staring at my code for weeks without any give, so I thought I'd see if others may have an idea.
Also, I've tested this code with other images and it gives similar preposterous results. It's quite curious because all three numbers usually match for the first 4 or so digits. I'm not sure what may be causing this.
/* See if our average vector matches that of Python's */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <png.h>
// For getting the PNG data and header/information back
typedef struct
{
uint32_t width; // width of image
uint32_t height; // height of image
int bit_depth; // bits/pixel component (should be 8 in RGB)
png_bytep datap; // data
} rTuple;
#define PNG_BYTES_TO_CHECK 8
#define CHANNELS 3
int
check_PNG_signature(unsigned char *buffer)
{
unsigned i;
const unsigned char signature[8] = { 0x89, 0x50, 0x4e, 0x47,
0x0d, 0x0a, 0x1a, 0x0a };
for (i = 0; i < PNG_BYTES_TO_CHECK; ++i)
{
if (buffer[i] != signature[i])
{
fprintf(stderr, "** File sig does not match PNG, received ");
for (i = 0; i < PNG_BYTES_TO_CHECK; ++i)
fprintf(stderr, "%.2X ", buffer[i]);
fprintf(stderr, "\n");
abort();
}
}
return 1;
}
rTuple
read_png_file(char *file_name)
{
/* Get PNG data - I've pieced this together by reading `example.c` from
beginning to end */
printf("** Reading data from %s\n", file_name);
png_uint_32 width, height; // holds width and height of image
uint32_t row; // for iteration later
int bit_depth, color_type, interlace_type;
unsigned char *buff = malloc(PNG_BYTES_TO_CHECK * sizeof(char));
memset(buff, 0, PNG_BYTES_TO_CHECK * sizeof(char));
FILE *fp = fopen(file_name, "rb");
if (fp == NULL) abort();
if (fread(buff, 1, PNG_BYTES_TO_CHECK, fp) != PNG_BYTES_TO_CHECK) {
fprintf(stderr, "** Could not read %d bytes\n", PNG_BYTES_TO_CHECK);
abort();
}
check_PNG_signature(buff);
rewind(fp);
// create and initialize the png_struct, which will be destroyed later
png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING
, NULL /* Following 3 mean use stderr & longjump method */
, NULL
, NULL
);
if (!png_ptr) abort();
png_infop info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr) abort();
// following I/O initialization method is required
png_init_io(png_ptr, fp);
png_set_sig_bytes(png_ptr, 0); // libpng has this built in too
// call to png_read_info() gives us all of the information from the
// PNG file before the first IDAT (image data chunk)
png_read_info(png_ptr, info_ptr);
// Get header metadata now
png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type,
&interlace_type, NULL, NULL);
// Scale 16-bit images to 8-bits as accurately as possible (shouldn't be an
// issue though, since we're working with RGB data)
#ifdef PNG_READ_SCALE_16_TO_8_SUPPORTED
png_set_scale_16(png_ptr);
#else
png_set_strip_16(png_ptr);
#endif
png_set_packing(png_ptr);
// PNGs we're working with should have a color_type RGB
if (color_type == PNG_COLOR_TYPE_PALETTE)
png_set_palette_to_rgb(png_ptr);
// Required since we selected the RGB palette
png_read_update_info(png_ptr, info_ptr);
// Allocate memory to _hold_ the image data now (lines 547-)
png_bytep row_pointers[height];
for (row = 0; row < height; ++row)
row_pointers[row] = NULL;
for (row = 0; row < height; ++row)
row_pointers[row] = png_malloc(png_ptr,\
png_get_rowbytes(png_ptr, info_ptr)
);
png_read_image(png_ptr, row_pointers);
png_read_end(png_ptr, info_ptr);
// Now clean up - the image data is in memory
png_destroy_read_struct(&png_ptr, &info_ptr, NULL);
fclose(fp);
rTuple t = { width, height, bit_depth, *row_pointers };
return t;
}
int
main(int argc, char *argv[])
{
if (argc != 2) {
printf("** Provide filename\n");
abort();
}
char *fileName = argv[1];
// get data read
rTuple data = read_png_file(fileName);
/* let's try computing the absolute average vector */
uint32_t i, j, k;
double *avV = malloc(CHANNELS * sizeof(double));
memset(avV, 0, sizeof(double) * CHANNELS);
double new_px[CHANNELS];
png_bytep row, px;
for (i = 0; i < data.height; ++i)
{
row = &data.datap[i];
for (j = 0; j < data.width; ++j)
{
px = &(row[j * sizeof(int)]);
for (k = 0; k < CHANNELS; ++k) {
new_px[k] = (double)px[k];
avV[k] += new_px[k];
}
}
}
double size = (double)data.width * (double)data.height;
for (k = 0; k < CHANNELS; ++k) {
avV[k] /= size;
printf("channel %d: %lf\n", k + 1, avV[k]);
}
printf("\n");
return 0;
}
Now with Python I'm just opening an image with a simple context manager and computing np.mean(image_data, axis=(0, 1)), which yields my result above.
Basically, you had a couple bugs (libpng side and pointer arithmetic) that I try to find them by comparing your code with this Github gist. The followings are the list of changes that I have made to produce the same image mean as Python NumPy.
In rTuple struct, you need to change the png_bytep datap to a pointer of type png_byte using: png_bytep *datap;.
In read_png_file, use png_set_filler to add the filler byte after reading the image. See here for more information about it.
if(color_type == PNG_COLOR_TYPE_RGB ||
color_type == PNG_COLOR_TYPE_GRAY ||
color_type == PNG_COLOR_TYPE_PALETTE)
png_set_filler(png_ptr, 0xFF, PNG_FILLER_AFTER);
In read_png_file, update the changes before allocating the row_pointers using png_read_update_info(png_ptr, info_ptr);
Again, in read_png_file, change the way you are mallocing memory for image pixels using:
png_bytep *row_pointers = (png_bytep*)malloc(sizeof(png_bytep) * height);
for(row = 0; row < height; row++)
{
row_pointers[row] = malloc(png_get_rowbytes(png_ptr,info_ptr));
}
In main, change row = &data.datap[i]; to row = data.datap[i]; as your accessing a pointer here.
I did not want to populate the answer with a code that is barely the same as the question, so if you want to just copy and paste the answer, this is the link to the complete code.
I am working on a programming assignment in C, which is about creating basic automation for cinema halls.
For holding data of halls, I define a structure like this:
typedef struct {
char *hallName;
char *movieName;
seat** hallSeats;
int studentCount;
int fullFareCount;
int totalSum;
int width;
int height;
}Hall;
So I am given a text file with commands and whenever I came up with a specific command, I should create a separate hall. For that reason, I created another function for that.
Hall makeHall(char **temp) //TEMP HOLDING THE LINES FROM FILE
{
int width = strToInt(temp[3]);
int height = strToInt(temp[4]);
char currentRowLetter = 'A';
int currentRow;
int currentSeat;
seat **hall = malloc(sizeof(seat*) * width );
for (currentRow=0 ; currentRow < width ; currentRow++)
{
hall[currentRow] = malloc(sizeof(seat) * height );
for(currentSeat=0; currentSeat < height ; currentSeat++)
{
hall[currentRow][currentSeat].rowLetter = currentRowLetter;
hall[currentRow][currentSeat].seatNumber = currentSeat + 1;
hall[currentRow][currentSeat].seatTaken = ' ';
}
++currentRowLetter;
}
Hall newHall;
newHall.hallName = temp[1];
newHall.movieName = temp[2];
newHall.hallSeats = hall;
newHall.width = width;
newHall.height = height;
return newHall;
}
Since I will have multiple halls, I created a Hall array in order to access them later.
Hall *allHalls = malloc(sizeof(Hall) * 10); /*Hall placeholder*/
While I iterate over the lines, I check commands and create halls or sell tickets.
Hall *allHalls = malloc(sizeof(Hall) * 10); /*Hall placeholder*/
FILE *f;
f = fopen("input.txt", "rt");
char *line = malloc (sizeof(char) * 200); /*LINE HOLDER*/
int currentLineNumber = 0;
char *tmp;
int hallNumber = 0;
while (1) { /*PARSING FILE*/
if (fgets(line,200, f) == NULL) break; /*FILE END CHECKER*/
currentLineNumber++;
tmp = strtok(line," ");
char **temp = malloc(sizeof(char*) * 6);
int currentWordNumber = 0;
while(tmp != NULL) /*PARSING LINES*/
{
temp[currentWordNumber] = malloc(strlen(tmp) + 1);
strcpy(temp[currentWordNumber],tmp);
tmp = strtok (NULL, " ");
currentWordNumber++;
}
if(!strcmp("CREATEHALL",temp[0]))
{
allHalls[hallNumber] = makeHall(temp); /*<<<<<<<PROBLEM*/
hallNumber++;
printf("%d\n",hallNumber);
}
Now that's the part I am lost at. Whenever I tried to access the array, the program crashes.
I thought it was a memory problem, so increased memory allocated by malloc for allHalls to 40 (even though it should not be a problem, since file only gives 3 different halls) and program no longer crashes, but instead overwrites the previous hall in the array.
I tried multiple solutions but none of them came out any good, so closest I get is this.
I did use java a lot before, so I am still stuck to OOP and pretty new to C.
EDIT
Seat is defined as
typedef struct {
char rowLetter;
int seatNumber;
char seatTaken;
}seat;
also example createhall command is
CREATEHALL Hall_A Avatar 24 20
while the numbers at the end being width and height for hall
EDIT : CODE
I got the bug:
At the bottom of the while(1) loop in main you do a free(allHalls); so now there are no more halls and you get a segfault...
It was in the code you didn't show us:
while (1) {
...
if(!strcmp("CREATEHALL",temp[0]))
{
allHalls[hallNumber] = makeHall(temp); /*<<<<<<<PROBLEM*/
hallNumber++;
printf("%d\n",hallNumber);
}
....
free(temp);
free(allHalls); // <-- there's your bug
}
fclose(f);
free(line);
As mentioned I have a Zynq SoC (ZC706 Eval Board) and I'm trying to read an image from the SD Card. To do this I'm using the FatFs lib (http://elm-chan.org/fsw/ff/00index_e.html).
In my code I read 4096 Byte from the file and save it to a buffer. After that i copy the buffer to an unsigned char pointer that size I increase after every read operation.
Then I'm using realloc, the for loop in the copyU32ArrayToUnsignedCharArray function 'failed' because the size variable is overwritten by the out array.
Code that overwrite the "size" in the copyU32ArrayToUnsignedCharArray function:
u32 buffer[1024];
unsigned char *img = NULL;
bytesreaded = 0;
for (;;) {
br=0;
fr = f_read(&fil, buffer, sizeof(buffer), &br); /* Read a chunk of source file */
if (fr || br == 0)
break; /* error or eof */
img = realloc(img,br);
copyU32ArrayToUnsignedCharArray(buffer, &img[bytesreaded], br/4); // /4 because u32(32 bit) in to unsigned char(8 bit)
bytesreaded += br; // update readed bytes
}
The code that worked:
u32 buffer[1024];
unsigned char *img = NULL;
img = malloc(512*512*3+100);
bytesreaded = 0;
for (;;) {
br=0;
fr = f_read(&fil, buffer, sizeof(buffer), &br); /* Read a chunk of source file */
if (fr || br == 0)
break; /* error or eof */
copyU32ArrayToUnsignedCharArray(buffer, &img[bytesreaded], br/4); // /4 because u32(32 bit) in to unsigned char(8 bit)
bytesreaded += br; // update readed bytes
}
The copyU32ArrayToUnsignedCharArray function:
void copyU32ArrayToUnsignedCharArray(u32 *in, unsigned char* out, uint size){
int i,x;
x = 0;
for (i = 0; i < size; i++) {
if(size != 1024)
break;
in[i] = Xil_In32BE(&in[i]);
out[x] = (u32) in[i] >> 24;
out[x + 1] = (u32) in[i] >> 16 & 0x00FF;
out[x + 2] = (u32) in[i] >> 8 & 0x0000FF;
out[x + 3] = (u32) in[i] & 0x000000FF;
x += 4;
}
}
I want to use realloc because I don't know how big the image will be that I read.
Update:
Some further information to the code that doesn't work. I debugged it and the pointer to *img isn't null, so the realloc was successfully. If I'm using gdb the following things happen in the copyU32ArrayToUnsignedCharArray function:
- pointer to the variable "out" is 0x001125a8
- the address of the "size" variable is 0x0011309c (the value that is stored at this location is correct)
- the space in memory between this two variables is 0xaf4 = 2804 dec (difference of the two addresses)
- if the for loop within the copyU32ArrayToUnsignedCharArray function reached i=702 and x=2808 the size variable is changed to another value
Sincerely,
Arno
I solved the problem with the hint from Notlikethat. The problem was the small heap size. Increasing the heap is done by editing the linker script file
Firstly, i'm not very familiarized with C, i come from Java, C#, C++... and possibly i inherited defects from this languages in order to realize this practice, well i have the follows question, here is my code:
#include <stdio.h>
#include <stdlib.h>
void decrypt(unsigned long* v, unsigned long* k);
const int MAX = 32;
const long delta = 0x9e3779b9;
long sum=0xC6EF3720;
int main() {
FILE *fp;
FILE *destino;
unsigned long v[2];
unsigned long k[4] = { 128, 129, 130, 131 };
unsigned long tam=0;
char* buffer;
char* aux[sizeof(unsigned long)];
int i;
if ((fp = fopen("image.png", "rb")) == NULL) {
printf ("Error! \n ");
return 0;
}
else {
fread(&aux,sizeof(unsigned long),1,fp);
memcpy(&tam,&aux,sizeof(unsigned long));
buffer = (char*)malloc(tam);
//fread(&buffer,1,tam,fp);
char *buffer2[28568];
fread(&buffer2,1,28568,fp);
/*for(i = 0;i < tam;++i) {
printf("%c", ((char *)buffer2)[i]);
}*/
for(i=4;i<tam;i+=8) {
memcpy(&v,&buffer2[i],8);
decrypt(&v,&k);
}
if ((result= fopen("image2.png", "rb")) == NULL) {
printf ("Error! \n ");
return 0;
}
else {
fwrite(v,sizeof(unsigned long)*2,1,result);
fclose (result);
fclose(fp);
}
}
return 0;
}
void decrypt(unsigned long* v, unsigned long* k) {
int i=0;
while(i<MAX) {
v[1] = v[1] -((4 << v[0])+(k[2]^v[0])+(sum^(5 >> v[0]))+k[3]);
v[0] = v[0] -((4 << v[1])+(k[0]^v[1])+(sum^(5 >> v[1]))+k[1]);
sum = sum-delta;
i++;
}
}
Where tam is the size of my binary file (image in this case) where i store first 4 bytes (unsigned long) where is located the size in my png file (28568)
When i create my char* buffer i have to assign dynamically with malloc but when i make a new fread from my file i get a "No source available for "msvrct!memcpy() at 0xrandom_memory_address" from Eclipse when i debug, well, i comment this line and i try to make it manually set a new buffer2 with 28568 as size of my array, apparently works, making a iteration of buffer2 prints ascii characters values but when i call decrypt for make the decryption of my image, the final result is stored in v array which i have to copy in a new file, i tried to search how to make a empty image png in C but i didn't find anything, so i created a copy of my encrypt image calling it "image2.png" but i suppose this not the "clean solution" for that, because for the other hand is not working at all.
For more explanation about this exercise just say that the decrypt funcion work with blocks of 8 bytes (64 bits) that through a key (array k) make a series of operation where they store in v array itself, crossing through the loop 8 in 8 and retrieve the value of buffer in v in each one, after the loop execution we have the result in v and only left to copy in a new file where finally show up the image decrypt.
It's a very complex practice for all of one newbies in C, it's driving my crazy trying to figure out what i doing wrong.
I hope anyone can see what i'm not able to for now.
I think you are having problems with the declarations of the buffers. I think the correct should be:
FILE *fp;
FILE *destino;
unsigned long v[2];
unsigned long k[4] = { 128, 129, 130, 131 };
unsigned long tam=0;
char* buffer;
char aux[sizeof(unsigned long)]; // without the "*"
int i;
if ((fp = fopen("image.png", "rb")) == NULL) {
printf ("Error! \n ");
return 0;
}
else {
fread(aux,sizeof(unsigned long),1,fp);
memcpy(&tam,aux,sizeof(unsigned long));
buffer = (char*)malloc(tam);
//fread(buffer,1,tam,fp); // without the "&" in this case
char buffer2[28568]; // without the "*"
fread(buffer2,1,28568,fp); // or fread(buffer,1,tam,fp);
/*for(i = 0;i < tam;++i) {
printf("%c", buffer2[i]); // or buufer[i] if you change to use it again
}*/
for(i=4;i<tam;i+=8) {
memcpy(v,&buffer2[i],8);
decrypt(v,k);
}
...
I don't fully understand what you are trying to accomplish, but one problem is here:
char* aux[sizeof(unsigned long)];
// ... some code ...
fread(&aux,sizeof(unsigned long),1,fp);
Understand that char* aux[sizeof(unsigned long)]; means that you are declaring a double pointer, but fread() prototype states that the destination is a single pointer:
size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
so what you should be doing instead is:
char aux[sizeof(unsigned long)];
// ... some code ...
fread(aux,sizeof(unsigned long),1,fp);
Don't complicate things that are not complicated!
You also do this mistake in other parts of your code, you need to re-check everything, ok? Again:
char *buffer2[28568];
fread(&buffer2,1,28568,fp);
should be:
char buffer2[28568];
fread(buffer2, 1, 28568, fp);
// or: fread(buffer2, 1, sizeof(buffer2), fp);
There are some interesting tutorials on pointers and arrays, I suggest you read some.
I am currently having exceptional difficulty with CUDA programming--more specifically, in copying and reading an array which the device sends back to the host. When I attempt to read the data which I am supposed to have returned to me, all I get is junk data. Could anyone take a look at my code snippets and tell me what I'm doing wrong? Thank you very much!
struct intss {
u_int32_t one;
u_int32_t two;
};
int main()
{
int block_size = 3;
int grid_size = 1;
intss *device_fb = 0;
intss *host_fb = 0;
int num_bytes_fb = (block_size*grid_size)*sizeof(intss);
host_fb = (intss*)malloc(num_bytes_fb);
cudaMalloc((void **)&device_fb, num_bytes_fb);
....
render2<<<block_size,grid_size>>>(device_fb, device_pixelspercore, samples, obj_list_flat_dev, numOpsPerCore, lnumdev, camdev, lightsdev, uranddev, iranddev);
....
cudaMemcpy(host_fb, device_fb, num_bytes_fb, cudaMemcpyDeviceToHost);
printf("output %d ", host_fb[0].one);
printf("output %d ", host_fb[1].one);
printf("output %d ", host_fb[2].one);
//Note that I'm only looking at elements the 3 elements 0-2 from host_fb. I am doing this because block_size*grid_size = 3. Is this wrong?
cudaFree(device_fb);
free(host_fb);
}
__global__ void render2(intss *device_fb, struct parallelPixels *pixelsPerCore, int samples, double *obj_list_flat_dev, int numOpsPerCore, int lnumdev, struct camera camdev, struct vec3 *lightsdev, struct vec3 *uranddev, int *iranddev) //SPECIFY ARGUMENTS!!!
{
int index = blockIdx.x * blockDim.x + threadIdx.x; //DETERMINING INDEX BASED ON WHICH THREAD IS CURRENTLY RUNNING
....
//computing data...
device_fb[index].one = (((u_int32_t)(MIN(r, 1.0) * 255.0) & 0xff) << RSHIFT |
((u_int32_t)(MIN(g, 1.0) * 255.0) & 0xff) << GSHIFT |
((u_int32_t)(MIN(b, 1.0) * 255.0) & 0xff) << BSHIFT);
}
EDIT:
Thanks to a suggestion, I have implemented the CudaErrorCheck function in my program, and there seems to be a pattern in which functions are giving me errors.
In my program, I have a bunch of global host arrays(obj_list, lights, urand, irand). Whenever I attempt to use cudaMemCpy to copy these host arrays to device arrays, I receive the following error:
"Cuda error in file 'cudatrace.cu' in line x : invalid argument."
obj_list and lights are filled in the following function, load_scene():
void load_scene(FILE *fp) {
char line[256], *ptr, type;
obj_list = (sphere *)malloc(sizeof(struct sphere));
obj_list->next = 0;
objCounter = 0;
while((ptr = fgets(line, 256, fp))) {
int i;
struct vec3 pos, col;
double rad, spow, refl;
while(*ptr == ' ' || *ptr == '\t') ptr++;
if(*ptr == '#' || *ptr == '\n') continue;
if(!(ptr = strtok(line, DELIM))) continue;
type = *ptr;
for(i=0; i<3; i++) {
if(!(ptr = strtok(0, DELIM))) break;
*((double*)&pos.x + i) = atof(ptr);
}
if(type == 'l') {
lights[lnum++] = pos;
continue;
}
if(!(ptr = strtok(0, DELIM))) continue;
rad = atof(ptr);
for(i=0; i<3; i++) {
if(!(ptr = strtok(0, DELIM))) break;
*((double*)&col.x + i) = atof(ptr);
}
if(type == 'c') {
cam.pos = pos;
cam.targ = col;
cam.fov = rad;
continue;
}
if(!(ptr = strtok(0, DELIM))) continue;
spow = atof(ptr);
if(!(ptr = strtok(0, DELIM))) continue;
refl = atof(ptr);
if(type == 's') {
objCounter++;
struct sphere *sph = (sphere *)malloc(sizeof(*sph));
sph->next = obj_list->next;
obj_list->next = sph;
sph->pos = pos;
sph->rad = rad;
sph->mat.col = col;
sph->mat.spow = spow;
sph->mat.refl = refl;
} else {
fprintf(stderr, "unknown type: %c\n", type);
}
}
}
urand and irand are filled in main as follows:
/* initialize the random number tables for the jitter */
for(i=0; i<NRAN; i++) urand[i].x = (double)rand() / RAND_MAX - 0.5;
for(i=0; i<NRAN; i++) urand[i].y = (double)rand() / RAND_MAX - 0.5;
for(i=0; i<NRAN; i++) irand[i] = (int)(NRAN * ((double)rand() / RAND_MAX));
I don't think the invalid argument could be caused by the device array, since the cudaMalloc call creating the device array before the cudaMemcpy call did not have a CudaError message. For example, in the following lines of code:
cudaErrorCheck(cudaMalloc((void **)&lightsdev, MAX_LIGHTS*sizeof(struct vec3)) );
cudaErrorCheck( cudaMemcpy(&lightsdev, &lights, sizeof(struct vec3) * MAX_LIGHTS, cudaMemcpyHostToDevice) );
cudaMalloc did not produce an error, but cudaMemcpy did.
If I have not provided enough information on my code, I have pasted the entire code to: http://pastebin.com/UgzABPgH
(Note that in the pastebin version, I took out the CudaErrorCheck functions on the CudaMemcpy's which were producing the errors.)
Thank you very much!
EDIT:
Actually, I just tried to see what would happen if urand and irand were not global, and if they were initialized alongside the device arrays uranddev and iranddev. I'm still getting the same "invalid argument" error, so the whether or not a variable is global must not relate to the problem.
It is absolutely impossible to say anything when you have posted incomplete, uncompilable code with no proper description of the actual problem. You will get better answers by asking better questions on StackOverflow.
Having said that. the most likely problem isn't that the data is not being copied to or from the device, it is that the kernel itself is not running. Every CUDA runtime API call returns a status code, and you should be checking all of them. You can define an error checking macro like this one:
#include <stdio.h>
#define cudaErrorCheck(call) { cudaAssert(call,__FILE__,__LINE__) }
void cudaAssert(const cudaError err, const char *file, const int line)
{
if( cudaSuccess != err) {
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",
file, line, cudaGetErrorString(err) );
exit(1);
}
}
and wrap every API call in it, like this:
cudaErrorCheck( cudaMemcpy(host_fb, device_fb, num_bytes_fb, cudaMemcpyDeviceToHost) );
For the kernel launch, itself you can check for a launch failure or runtime error like this:
kernel<<<....>>>();
cudaErrorCheck( cudaPeekAtLastError() ); // Checks for launch error
cudaErrorCheck( cudaThreadSynchronize() ); // Checks for execution error
My suggestion is add thorough error checking to your code and then come back and edit your question with the results you get. Then someone might be able to offer concrete suggestions about what is happening.
I think you're not using the <<< >>> syntax correctly.
Here's a kernel invocation from the CUDA Programming Guide:
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
which would mean that the grid size should go first.
There's also a limitation on the maximum size for the arguments to a kernel. See this. If you go above it, I'm not sure whether the compiler complains or just goes on to do nasty things.
If I remove all the arguments but device_fb, and just set device_fb[index]=index in the kernel, I can read the values successfully.