AVX2 1GB long array - c

I have a 1gb long array with floats in a .bin file. After i read it how can i sum the elements with avx2 instrucion, and print the result?
I edited my code with Jake 'Alquimista' LEE's answer.
The problem is the result much smaller than it will be. And other question, how can i add a constant to each number that i readed from .bin file?
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
inline float sumf(const float *pSrc, uint32_t len)
{
__m256 sum, in;
float sumr;
uint32_t sumi;
uint32_t lenr = len & 7;
while (len--)
len >>= 3;
sum = _mm256_set1_ps(0.0f);
{
in = _mm256_loadu_ps(pSrc++);
sum = _mm256_add_ps(in, sum);
}
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sumi = _mm256_extract_epi32(*(__m256i *)&sum, 0);
sumr = *(float *)&sumi;
while (lenr--)
{
sumr += *pSrc++;
}
return sumr;
}
int main(void)
{
FILE *file;
float *buffer2;
uint32_t fileLen;
if((file = fopen("example.bin","rb"))==NULL)
{
printf("Error! opening file");
exit(1);
}
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);
buffer2=(float *)malloc(fileLen+1);
if (!buffer2)
{
fprintf(stderr, "Memory error!");
fclose(file);
return 0;
}
fread(buffer2, fileLen, 1, file);
fclose(file);
printf( "File size : %lu Bits \n", fileLen );
for(int i = 0; i<10; i++)
printf("%f \n", buffer2[i]);
float sum =sumf(buffer2,fileLen);
printf("%f\n",s);
free(buffer2);
return 0;
}

Reading 1GB file into memory is big memory and I/O overhead. Although I'm not very familiar with AVX2, i read articles from Internet & i could come up with the following solution which is actually tested and proved to be working.
My solution consists of reading the file as chuncks of 512 Bytes (Blocks of 128 floats) then summing up the pairs of vectors (16 Total vectors per block) so that at the end we get a final __m256 vector, by casting it to a float* we could sum up its individual components to get the final result.
A case where the file is not 128-floats aligned is handled in the last for loop by summing up individual floats.
The code is commented but in case you have any suggestions to add more explanation to the answer then feel free to do so.
#include <immintrin.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
int make_floatf(char *, int);
float avx_sfadd(char*);
char error_buf[1024];
#define PERROR() \
do { \
strerror_r(errno, error_buf, 1024); \
printf("Error: %s\n", error_buf); \
fclose(fp); \
return -1; \
} while(0)
/* This function generates a .bin file containing blocks
* of 128 floating point numbers
*/
int make_floatf(char *filename, int nblocks)
{
FILE *fp = NULL;
if(!(fp = fopen(filename, "wb+")))
PERROR();
float *block_ptr = malloc(sizeof(float) * 128); /* 512 Bytes block of 128 floats */
if(!block_ptr)
PERROR();
int j, i;
for(j = 0; j < nblocks; j++)
{
for(i = 0; i < 128; i++)
block_ptr[i] = 1.0;
int ret = fwrite(block_ptr, sizeof(float), 128, fp);
if(ret < 128)
{
free(block_ptr);
PERROR();
}
}
free(block_ptr);
fclose(fp);
return 0;
}
/* This function reads the .bin file as chuncks of 512B
* blocks (128 floating point numbers) and calculates thier sum.
* The final sum in a form of vector is looped through and its
* components are summed up to get the final result.
*/
float avx_sfadd(char *filename)
{
FILE *fp = NULL;
__m256 v1;
__m256 v2;
__m256 sum = _mm256_setzero_ps();
if(!(fp = fopen(filename, "rb")))
PERROR();
struct stat stat_buf;
stat(filename, &stat_buf);
size_t fsize = stat_buf.st_size;
size_t nblocks = fsize / (sizeof(float) * 128);
size_t rem_size = fsize - nblocks * sizeof(float) * 128;
size_t rem_floats = rem_size / (sizeof(float));
printf("File size: %ld\nnblocks:%ld\nnremfloats: %ld\n",\
fsize, nblocks, rem_floats);
/* This memory area will hold the 128 floating point numbers per block */
float *block_ptr = malloc(sizeof(float) * 128);
if(!block_ptr)
PERROR();
int i;
for(i = 0; i < nblocks; i++)
{
int ret = fread(block_ptr, sizeof(float), 128, fp);
if(ret < 128)
PERROR();
/* Summing up vectors in a block of 16 vectors (128 floats) */
int j;
for(j = 0; j < 16; j += 2)
{
v1 = _mm256_loadu_ps(block_ptr + j*8);
v2 = _mm256_loadu_ps(block_ptr + (j+1)*8);
sum += _mm256_add_ps(v1, v2);
}
}
/* Handling the case if the last chunck of the file doesn't make
* a complete block.
*/
float rem_sum = 0;
if(rem_size > 0)
{
int ret = fread(block_ptr, 1, rem_size, fp);
if(ret < rem_floats)
PERROR();
int j;
for(j = 0; j < rem_floats; j++)
rem_sum += block_ptr[j];
}
float final_sum = rem_sum;
float *sum_ptr = (float*)∑ /* The final vector hold the sum of all vectors */
/* Summing up the values of the last vector to get the final result */
int k;
for(k = 0; k < 8; k++)
final_sum += sum_ptr[k];
free(block_ptr);
fclose(fp);
return final_sum;
}
int main(int argc, char **argv)
{
if(argc < 2){
puts("./main filename [nblocks]");
return 0;
}
/* ./main filename number_of_block_to_create (eg. ./main floats.bin 1024 )*/
else if(argc == 3){
if(!make_floatf(argv[1], atoi(argv[2])))
puts("File has been created sucessfully\n");
}
/* ./main filename (eg. ./main floats.bin) to calculate sum*/
else
printf("avx_sum = %f\n", avx_sfadd(argv[1])) :
return 0;
}

Here's (most likely) your bug:
while (len--)
len >>= 3;
That's a while loop. As long as len != 0, you replace len with (len - 1) >> 3. And then you change it to -1. No loop to be seen.

inline float sumf(const float *pSrc, uint32_t len)
{
__m256 sum, in;
float sumr;
uint32_t sumi;
uint32_t lenr = len & 7;
len >>= 3;
sum = _mm256_set1_ps(0.0f);
while (len--)
{
in = _mm256_loadu_ps(pSrc++);
sum = _mm256_add_ps(in, sum);
}
in = *(__m256 *)&_mm256_permute4x64_pd(*(__m256d *)&sum, 0b01001110);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sumi = _mm256_extract_epi32(*(__m256i *)&sum, 0);
sumr = *(float *)&sumi;
while (lenr--)
{
sumr += *pSrc++;
}
return sumr;
}
The function above will do. However, I don't think that it will bring much of a performance gain, if any, since it's a very trivial one, and the compiler will do auto-vectorize it anyway.
Please note that you have to typecast the pointer to float *, and divide filelen by sizeof(float) when you pass them as arguments.

Related

MPI basic image processing - convolution using MPI

I am trying to learn MPI, so i created this simple program which is doing the convolution on grayscale uint8 image. I modified the openMP code which was working quite ok, with sliceing the vectorized image to parts for each processor - to scatterDataSize, but i am getting weird error from MPI:
"Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
mpirun noticed that process rank 1 with PID 5360 on node wn25 exited on signal 11 (Segmentation fault)."
I've tried to make it using MPI_Scatter to brodcast the image to all processes, and after convolution collect the data with MPI_Gather or MPI_Allgather, but result from both is the same...
argv are image dimensions - the images are i.e. input1000_800.bin so the program is being executed as:
mpirun -np 4 ./main 1000 800 and the source code of my program is:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <mpi.h>
const int ROOT = 0;
const double filter[][5] = { {0,0,1,0,0},
{0,1,2,1,0},
{1,2,-16,2,1},
{0,1,2,1,0},
{0,0,1,0,0} };
unsigned char normalize(double value);
double convolution(int i, int j, unsigned char *image, int height, int width, int filterDimension);
void saveImage(char* filename[], unsigned char* image, long fileLength);
long getFileSize(char* filename[]);
unsigned char * readImage(char* filename[]);
int main(int argc, char * argv[])
{
int width = atoi(argv[1]);
int height = atoi(argv[2]);
const char * prefixInFile = "../../labMPI/infile";
const char * prefixOutFile = "result";
const char * fileExtension = ".bin";
char outFileName[64], fileName[64];
unsigned char * image, *buffer, *data;
long fileSize;
int processorsAmount, processId, scatterDataSize;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &processorsAmount);
MPI_Comm_rank(MPI_COMM_WORLD, &processId);
strcpy(fileName, prefixInFile);
strcat(fileName, argv[1]);
strcat(fileName, "_");
strcat(fileName, argv[2]);
strcat(fileName, fileExtension);
strcpy(outFileName, prefixOutFile);
strcat(outFileName, argv[1]);
strcat(outFileName, "_");
strcat(outFileName, argv[2]);
strcat(outFileName, fileExtension);
fileSize = getFileSize(fileName);
data = (unsigned char *)malloc(fileSize * sizeof(unsigned char));
scatterDataSize = fileSize / processorsAmount;
int startPoint = scatterDataSize * processId;
int endPoint = scatterDataSize * (processId + 1);
printf("filesize: %d\nprocessorsAmount: %d\niam: %d\nscatterDataSize: %d\nstartPoint: %d\nendPoint: %d\n",
fileSize, processorsAmount, processId, scatterDataSize, startPoint, endPoint);
if (processId == ROOT)
{
printf("Reading file %s on ROOT \n output file will be: %s \n", fileName, outFileName);
image = readImage(fileName);
}
MPI_Bcast(image,fileSize, MPI_UNSIGNED_CHAR, ROOT, MPI_COMM_WORLD);
#pragma omp parallel for
for (int i = startPoint; i < endPoint; i++)
{
register int col = i % width;
register int row = i / width;
register long idx = col + width * row;
data[idx] = normalize(convolution(col, row, image, height, width, 5));
}
/* i am not sure how to collect processed data on workers back to root and perform save */
MPI_Bcast(data,fileSize, MPI_UNSIGNED_CHAR, ROOT, MPI_COMM_WORLD);
if (processId == ROOT)
{
saveImage(outFileName, data, fileSize);
printf("Image processing is finished");
free(image);
}
free(data);
MPI_Finalize();
return 0;
}
double convolution(int i, int j, unsigned char *image, int height, int width, int filterDimension)
{
register int filterHeight, filterWidth, kernelCenter, ii, jj;
filterHeight = filterWidth = filterDimension;
kernelCenter = filterHeight / 2;
register double tmp = 0;
for (long m = 0; m < filterHeight; ++m) {
for (long n = 0; n < filterWidth; ++n) {
ii = i + (kernelCenter - m);
jj = j + (kernelCenter - n);
if (ii >= 0 && ii < width && jj >= 0 && jj < height)
tmp += image[jj * width + ii] * filter[m][n];
}
}
return tmp;
}
unsigned char * readImage(char* filename[])
{
FILE *inFile = fopen(filename, "rb");
fseek(inFile, 0, SEEK_END);
long long fileLength = ftell(inFile);
fseek(inFile, 0, SEEK_SET);
unsigned char * image = (unsigned char *)malloc(fileLength * sizeof(unsigned char));
fread(image, sizeof(unsigned char), fileLength, inFile);
fclose(inFile);
return image;
}
long getFileSize(char* filename[])
{
FILE *inFile = fopen(filename, "rb");
fseek(inFile, 0, SEEK_END);
long fileLength = ftell(inFile);
fclose(inFile);
return fileLength;
}
void saveImage(char* filename[], unsigned char* image, long fileLength)
{
FILE *write = fopen(filename, "wb");
fwrite(image, sizeof(unsigned char), fileLength * sizeof(unsigned char), write);
fclose(write);
}
unsigned char normalize(double value)
{
if (value > 255)
{
value = 255;
}
else if (value < 0) {
value = 0;
}
return (unsigned char)value;
}

Unable to sequentially process a file > 250kb using Windows file mapping with n-sized chunks

I am trying to make a software that takes a txt file and xor every 4 byte with a pre-defined number.
I am doing this mapping the file in memory and opening chunks of the file with MapViewOfFile of size n.
The algorithm I'm attaching works well for txt files of less than 250 kb. But for file > 250kb it only xor some parts of the file and I cannot understand why and how to fix this.
Can someone help me?
#include "stdafx.h"
#include "Windows.h"
#include <stdio.h>
#include <stdint.h>
#include <iso646.h>
#include <math.h>
unsigned int strToUl(char *s)
{
int size = 4;
unsigned int ul = 0;
memcpy(&ul, (unsigned int *)s, size);
return ul;
}
char *ulToStr(unsigned int *ul)
{
int size = 4;
char *tch = (char *)calloc(size, sizeof(char *));
memcpy(tch, (char *)ul, size);
return tch;
}
unsigned int uixor(unsigned int n, unsigned int seed)
{
srand(seed);
unsigned int mask = rand();
char ch[5] = { 0 };
strcpy_s(ch, 5, ulToStr(&n));
for (int j = 0; j < 5; j++)
{
ch[j] = ch[j] ^ mask;
}
return strToUl(ch);
}
BOOL mapWriteChunk(PHANDLE phFile, DWORD dwFileSize, int start, int buffsize, uint32_t xork)
{
DWORD offset = start;// / 4;// / sizeof(DWORD);
SYSTEM_INFO SysInfo;
GetSystemInfo(&SysInfo);
DWORD dwSysGran = SysInfo.dwAllocationGranularity;
DWORD dwFileMapStart = (offset/dwSysGran) * dwSysGran;
DWORD dwMapViewSize = (offset % dwSysGran) + buffsize;
DWORD dwFileMapSize = offset + buffsize;
unsigned int *ulMVBuffer = (unsigned int *)MapViewOfFile(*phFile, FILE_MAP_ALL_ACCESS, 0, dwFileMapStart, 0);
if (ulMVBuffer == NULL)
{
printf("ulMVBuffer = NULL\n");
}
int iViewDelta = offset - dwFileMapStart;
for (int i = 0; i < buffsize; i++)
{
unsigned int *u = (unsigned int *)ulMVBuffer + (iViewDelta + i);
unsigned int u1 = *u;
unsigned int u2 = uixor(u1, xork);
*u = u2;
printf("write on %d -> ", iViewDelta);
}
UnmapViewOfFile(ulMVBuffer);
return TRUE;
}
int main()
{
char name[] = "test.txt";
OFSTRUCT tOfStrIn;
tOfStrIn.cBytes = sizeof tOfStrIn;
HANDLE hFile = (HANDLE)OpenFile(name, &tOfStrIn, OF_READWRITE);
DWORD dwFileSize = GetFileSize(hFile, NULL);
HANDLE hFileMap = CreateFileMapping(hFile, NULL, PAGE_READWRITE, 0, dwFileSize, NULL);
if (hFileMap == NULL)
{
printf("hFileMap = NULL\n");
}
int pos = 0;
int chunk = 4;
int bSize = dwFileSize / sizeof(DWORD);
int rseed = 10;
for (pos = 0; pos < bSize; pos+=chunk)
{
mapWriteChunk(&hFileMap, dwFileSize, pos, chunk, rseed);
}
CloseHandle(hFile);
CloseHandle(hFileMap);
system("PAUSE");
return 0;
}
Ok, I figured out the problem and I'm writing here so anyone who have the same problem, know what's wrong.
Talk is cheap, I show you the code (and then I'll explain):
char *ulMVBuffer = (char *)MapViewOfFile(phFile, FILE_MAP_ALL_ACCESS, 0, dwFileMapStart, 0);
if (ulMVBuffer == NULL)
{
printf("ulMVBuffer = NULL\n");
}
int iViewDelta = offset - dwFileMapStart;
unsigned int mask = myrand(xork);
for(int i = 0; i < buffsize; i++)
{
unsigned int c = ulMVBuffer[iViewDelta + i] ^ mask;
ulMVBuffer[iViewDelta + i] = c;
}
So you have to map the memory using a char pointer and then, when you use the XOR operator like that:
unsigned int c = ulMVBuffer[iViewDelta + i] ^ mask;
You obtain the XOR to be applied to a group of 4 bytes and not only on 1 byte, because - as far as I understood playing around - the XOR between a char (1 byte) and a unsigned int (4 bytes) forces the operator to pick 3 more bytes from the memory and use it for the bitwise operation.
This wasn't working using a pointer to unsigned int because, I guess, it stored the bytes from the memory in a different fashion (maybe OS or machine dependent?) and so you were able to XOR only 1 byte every 4 and not groups of 4 bytes all together.
If anyone has a better understanding to this or wants to add more to this solution, I will be more than happy to read it!

C - Read struct in file

Hi I want to read some information in a struct that i wrote in file with fwrite but there's a problem I can't extract these information. I got 2 file
tttfs.h :
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
typedef struct _disk disk;
typedef struct _block block;
struct _block{
uint8_t *unBlock;
};
struct _disk{
int id;
block *diskBlock;
};
tfs_create.c :
#include "tttfs.h"
uint8_t little[4];
int tttfs_create(int size, char *name);
void inttolitend(uint32_t x, uint8_t* lit_int);
int main(){
tttfs_create(7, "disk.tfs");
int f = 0;
if((f=open("disk.tfs",O_RDONLY)) < -1) //I took this from an example.
return 1;
disk *d = malloc(sizeof(disk)); //Create a disk
d->diskBlock = malloc(1024); //The block where I want to write the information from the file.
lseek(f,0,SEEK_SET);//I want to read from the beginning
read(f,d->diskBlock,1024); //I write all the information from the beginning to 1024th byte of my file
int i;
for(i=0; i<4; i++){
printf("%d\n", (uint8_t)&d->diskBlock[i]);//print my result.
}
}
int tttfs_create(int size, char *name){
FILE *f = NULL;
if ((f = fopen(name, "wb"))!=NULL) /** si ouverture ok **/
{
disk *d = malloc(sizeof(disk));
d->diskBlock = malloc(sizeof(block) * size);
d->id = 1;
int i;
for(i = 0; i<size; i++){
d->diskBlock[i].unBlock = malloc(sizeof(uint8_t) * 1024);
}
inttolitend(size, little);
for(i = 0; i<4; i++){
d->diskBlock[0].unBlock[i] = little[i];
}
for(i = 0; i<size; i++){
fwrite(&d->diskBlock[i],sizeof(block),1,f);
}
}
else
printf("Erreur\n\n");
return 0;
}
void inttolitend(uint32_t x, uint8_t* lit_int){
lit_int[3] = (uint8_t)x / (256*256*256);
lit_int[2] = (uint8_t)(x % (256*256*256)) / (256*256);
lit_int[1] = (uint8_t)((x % (256*256*256)) % (256*256)) / 256;
lit_int[0] = (uint8_t)((x % (256*256*256)) % (256*256)) % 256;
}
With this I create a disk with 7 block and I write all 7 block in my file. In my first block I wrote the size of my disk (7) in little endian. So block[0] = 00, block[1] = 00, block[2] = 01, block[3] = 11 or something like that.
But when I print my result I get :
0
8
16
24
Not what I expected I try without readand i got the same result. So my programm didn't write the information from the disk. Or there's a problem when I write my block in the disk ?
i can not be so sure what you want exactly but this should work
#define BLOCK_BUFF_SIZE (1024*sizeof(block))
int tttfs_create(int size, char *name);
int tttfs_load(char *fname, disk *pdisk)
void inttolitend(uint32_t x, uint8_t* lit_int);
int tttfs_load(char *fname, disk *pdisk){
int fd;
unsigned int n;
int i;
fd = open("disk.tfs",_O_RDONLY);
if(fd==-1){
perror("tttfs_load.open");
return -1;
}
n=lseek (fd,0,SEEK_END);
lseek(fd,0,SEEK_SET);
if(n==(unsigned int)-1){
close(fd);
return -1;
}
if(n){
n/=(BLOCK_BUFF_SIZE);
pdisk->diskBlock=malloc(n*sizeof(block));
for(i=0;i<n;i++){
pdisk->diskBlock[i].unBlock=malloc(BLOCK_BUFF_SIZE);
read(fd,pdisk->diskBlock[i].unBlock,BLOCK_BUFF_SIZE);
}
}
close(fd);
return n;
}
int main(){
unsigned int n;
disk d;
int i;
tttfs_create(7, "disk.tfs");
n=tttfs_load("disk.tfs",&d);
if(!n || n==(-1)) return -1;
for(i=0; i<4; i++){
printf("%d\n", (uint8_t)(d.diskBlock->unBlock[i]));//print my result.
}
}
int tttfs_create(int size, char *name){
FILE *f = NULL;
disk d;
int i;
if (!(f = fopen(name, "wb"))) {
perror("tttfs_create:open()");
return 0;
}
d.diskBlock = malloc(sizeof(block) * size);
d.id = 1;
for(i = 0; i<size; i++){
d.diskBlock[i].unBlock = malloc(BLOCK_BUFF_SIZE);
}
inttolitend(size, (uint8_t*)(d.diskBlock->unBlock));
for(i=0;i<size;i++){
fwrite(d.diskBlock[i].unBlock ,BLOCK_BUFF_SIZE,1,f);
}
return 1;
}
void inttolitend(uint32_t x, uint8_t* lit_int){
lit_int[3] = ((x<<24) & 0xff);
lit_int[2] = ((x<<16) & 0xff);
lit_int[1] = ((x<<8) & 0xff);
lit_int[0] = ((x<<0) & 0xff);
}

atomicAdd causing error Unable to Launch/Execute Kernel

I have the following CUDA C code:
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
atomicAdd(&(histo_private[buffer[i]]),1);
i+=stride;
}
which causes my program to crash with the error: "unable to launch/execute kernel"
Here buffer is an input array of integers to this function of size elements and histo_private is an array of integers in shared memory of histo_size elements.
I know this isn't an index out of bounds error because when I use the code:
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
int a = histo_private[buffer[i]];
i+=stride;
}
So I gather that there is something wrong with the atomicAdd function and/or the memory address of this 32-bit int array.
The kernel.cu file contains the following code:
// Define your kernels in this file you may use more than one kernel if you
// need to
// INSERT KERNEL(S) HERE
__global__ void histo_kernel(unsigned int* buffer, unsigned int size, int* histo, unsigned int histo_size)
{
extern __shared__ int histo_private[];
if(threadIdx.x < histo_size)
histo_private[threadIdx.x] = 0;
__syncthreads();
// compute block's histogram
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
//int a = histo_private[buffer[i]];
atomicAdd(&(histo_private[buffer[i]]),1);
i+=stride;
}
// store to global histogram
__syncthreads();
//if(threadIdx.x < histo_size)
// atomicAdd(&(histo[threadIdx.x]),histo_private[threadIdx.x]);
}
// ensures that no bins contains more than 255 elements
__global__ void enforce_saturation(int* histo, unsigned int histo_size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < histo_size)
{
if(histo[i] > 255) // this will be necessary to prevent data loss
histo[i] = 255; // when converting from int to uint8_t
}
}
__global__ void construct_histo(uint8_t* histo_unpacked, int* histo, unsigned int histo_size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < histo_size)
histo_unpacked[i] = histo[i];
}
// unpacks the input array into an output array with 'spaces'
__global__ void unpack(uint8_t* in, uint8_t* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[4*i] = in[i];
out[4*i+1] = 0;
out[4*i+2] = 0;
out[4*i+3] = 0;
}
}
// converts the input uint8_t array to an int array
__global__ void convert(uint8_t* in, int* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[i] = (int) in[4*i];
}
}
// converts the input int array to a uint8_t array
__global__ void convert_back(int* in, uint8_t* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[i] = (uint8_t) in[i];
}
}
void histogram(unsigned int* input, uint8_t* bins, unsigned int num_elements, unsigned int num_bins)
{
int BLOCK_SIZE = (int) num_bins;
BLOCK_SIZE = 512;
dim3 dim_grid, dim_block;
dim_block.x = BLOCK_SIZE; dim_block.y = dim_block.z = 1;
dim_grid.x = 1+(num_elements-1)/BLOCK_SIZE; dim_grid.y = dim_grid.z = 1;
// create an array of uint8_t to be converted into an array of int
uint8_t* bins_unpacked;
cudaMalloc((void**)&bins_unpacked, 4 * num_bins * sizeof(uint8_t));
// unpack the input uint8_t array
unpack<<<dim_grid,dim_block>>>(bins, bins_unpacked, num_bins);
// need an int version of bins_d
int* bins_int_d;
cudaMalloc((void**)&bins_int_d, num_bins * sizeof(int));
// convert the uint8_t array to an int array
convert<<<dim_grid,dim_block>>>(bins_unpacked, bins_int_d, num_bins);
// run kernel and enforce saturation requirements
int histo_private_size = num_bins;
histo_kernel<<<dim_grid,dim_block,histo_private_size>>>(input, num_elements, bins_int_d, num_bins);
enforce_saturation<<<dim_grid,dim_block>>>(bins_int_d,num_bins);
// convert the int array back to uint8_t
convert_back<<<dim_grid,dim_block>>>(bins_int_d, bins, num_bins);
}
While the function that calls this last histogram function is in main.cu (I did NOT make this second file--it was provided to me--also, I have been testing this on consistent data by compiling via make test-mode):
#include <stdio.h>
#include <stdint.h>
#include "support.h"
#include "kernel.cu"
int main(int argc, char* argv[])
{
Timer timer;
// Initialize host variables ----------------------------------------------
#if TEST_MODE
printf("\n***Running in test mode***\n"); fflush(stdout);
#endif
printf("\nSetting up the problem..."); fflush(stdout);
startTime(&timer);
unsigned int *in_h;
uint8_t* bins_h;
unsigned int *in_d;
uint8_t* bins_d;
unsigned int num_elements, num_bins;
cudaError_t cuda_ret;
if(argc == 1) {
num_elements = 1000000;
num_bins = 4096;
} else if(argc == 2) {
num_elements = atoi(argv[1]);
num_bins = 4096;
} else if(argc == 3) {
num_elements = atoi(argv[1]);
num_bins = atoi(argv[2]);
} else {
printf("\n Invalid input parameters!"
"\n Usage: ./histogram # Input: 1,000,000, Bins: 4,096"
"\n Usage: ./histogram <m> # Input: m, Bins: 4,096"
"\n Usage: ./histogram <m> <n> # Input: m, Bins: n"
"\n");
exit(0);
}
initVector(&in_h, num_elements, num_bins);
bins_h = (uint8_t*) malloc(num_bins*sizeof(uint8_t));
// TESTING
for(unsigned int i = 0; i < num_bins; ++i)
{
bins_h[i] = i;
//printf("uint8_t Element %u: is %u \n", i, bins_h[i]);
}
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
printf(" Input size = %u\n Number of bins = %u\n", num_elements,
num_bins);
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMalloc((void**)&in_d, num_elements * sizeof(unsigned int));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
cuda_ret = cudaMalloc((void**)&bins_d, num_bins * sizeof(uint8_t));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMemcpy(in_d, in_h, num_elements * sizeof(unsigned int),
cudaMemcpyHostToDevice);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");
cuda_ret = cudaMemset(bins_d, 0, num_bins * sizeof(uint8_t));
if(cuda_ret != cudaSuccess) FATAL("Unable to set device memory");
// TESTING
//cuda_ret = cudaMemcpy(bins_d, bins_h, num_bins * sizeof(uint8_t),
// cudaMemcpyHostToDevice);
//if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
histogram(in_d, bins_d, num_elements, num_bins);
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMemcpy(bins_h, bins_d, num_bins * sizeof(uint8_t),
cudaMemcpyDeviceToHost);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to host");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
#if TEST_MODE
printf("\nResult:\n");
for(unsigned int binIdx = 0; binIdx < num_bins; ++binIdx) {
printf("Bin %u: %u elements\n", binIdx, bins_h[binIdx]);
}
printf("\nElements Vec:\n");
for(unsigned int i = 0; i < num_elements; ++i) {
printf("Element %u: %u is \n", i, in_h[i]);
}
#endif
// Verify correctness -----------------------------------------------------
printf("Verifying results..."); fflush(stdout);
verify(in_h, bins_h, num_elements, num_bins);
// Free memory ------------------------------------------------------------
cudaFree(in_d); cudaFree(bins_d);
free(in_h); free(bins_h);
return 0;
}
Turns out that this was just an index out of bounds error. The element buffer[i] was greater than the length of histo_private. As another poster mentioned, this was not obvious due to the following artifact of the c compiler:
The compiler is permitted to assume every access is within bounds. That line of my test code did nothing if the access is within bounds and therefore the compiler is permitted to assume that line of code does nothing. Thus it didn't require an access so the successful run of the test code was misleading. Once that line was changed to where the variable hist_private was modified at buffer[i], runtime errors came about.

Negative transformation of an image in C

#include < stdio.h >
#include < conio.h >
#include < stdlib.h >
#include < process.h >
#include < string.h >
#include < math.h >
int count = 0;
typedef struct bitmap24 {
unsigned char header[54];
unsigned char * pixels;
}BMP;
void readBMP(char * filename) {
int i;
FILE * f = fopen(filename, "rb");
FILE * f1 = fopen("save.bmp", "wb");
FILE * pixelVals = fopen("vals.dat", "w");
unsigned char bmppad[3] = {
0,
0,
0
};
if (!f) {
printf("Could not read file!\n");
exit(0);
}
unsigned char info[54];
fread(info, sizeof(unsigned char), 54, f);
int width = * (int * ) & info[18];
int height = * (int * ) & info[22];
unsigned char * img = NULL;
if (img)
free(img);
img = (unsigned char * ) malloc(3 * width * height);
memset(img, 0, sizeof(img));
fwrite(info, sizeof(unsigned char), 54, f1);
int length = width * height;
unsigned long int image[10000][3];
for (i = 0; i < length; i++) {
image[i][2] = getc(f); // blue
image[i][1] = getc(f); // green
image[i][0] = getc(f); // red
img[count] = 255 - (unsigned char) image[i][0];
//img[count] = 10*(unsigned char)log10((double)image[i][0]+1);
count += 1;
img[count] = 255 - (unsigned char) image[i][2];
//img[count] = 10*(unsigned char)log10((double)image[i][3]+1);
count += 1;
img[count] = 255 - (unsigned char) image[i][2];
//img[count] = 10*(unsigned char)log10((double)image[i][2]+1);
count += 1;
printf("pixel %d : [%d,%d,%d]\n", i + 1, image[i][0], image[i][4], image[i][2]);
fprintf(pixelVals, "pixel %d : [%d,%d,%d]\n", i + 1, image[i][0], image[i][5], image[i][2]);
}
for (i = height - 1; i >= 0; i--) {
fwrite(img + (width * (height - i - 1) * 3), 3, width, f1);
fwrite(bmppad, 1, (4 - (width * 3) % 4) % 4, f1);
}
fclose(f);
fclose(f1);
fclose(pixelVals);
}
void main() {
char * fileName = "bitgray.bmp";
readBMP(fileName);
getch();
}
I am not getting the correct result when the image is saved. I am using 24bit bmp image of dimensions 114 X 81. The image was coming out to be inverted initially but that issue was solved. But I am still getting a slanted image. I know the problem is in the last 'for' loop.
How should I solve it ?
Bitmap scanlines are padded to 4-byte boundary. So you need to add an extra two bytes so that the row is divisible by 4. At the moment, you have 114 * 3 = 342 bytes of pixel data per line. The next number divisible by 4 is 344.
So, at the end of reading each line, just read an extra two bytes and discard them.
In general, you can work out the extra bytes like this:
extra = (alignment - ((width * bytesPerPixel) % alignment)) % alignment;
Where in this case alignment is 4.
From memory, there is a field in the header that should contain the value of the full scanwidth (width * bytesPerPixel + extra), but it's a good idea not to expect it to be correct because you can calculate it easily.
You must also be aware of this padding rule when you save a bitmap.
Your second for loop looks strange. I believe it should be:
for(i = 0; i < height; i++) {...}
or:
for(i = height-1; i >= 0; i--) {...}

Resources