MPI basic image processing - convolution using MPI - c

I am trying to learn MPI, so i created this simple program which is doing the convolution on grayscale uint8 image. I modified the openMP code which was working quite ok, with sliceing the vectorized image to parts for each processor - to scatterDataSize, but i am getting weird error from MPI:
"Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
mpirun noticed that process rank 1 with PID 5360 on node wn25 exited on signal 11 (Segmentation fault)."
I've tried to make it using MPI_Scatter to brodcast the image to all processes, and after convolution collect the data with MPI_Gather or MPI_Allgather, but result from both is the same...
argv are image dimensions - the images are i.e. input1000_800.bin so the program is being executed as:
mpirun -np 4 ./main 1000 800 and the source code of my program is:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <mpi.h>
const int ROOT = 0;
const double filter[][5] = { {0,0,1,0,0},
{0,1,2,1,0},
{1,2,-16,2,1},
{0,1,2,1,0},
{0,0,1,0,0} };
unsigned char normalize(double value);
double convolution(int i, int j, unsigned char *image, int height, int width, int filterDimension);
void saveImage(char* filename[], unsigned char* image, long fileLength);
long getFileSize(char* filename[]);
unsigned char * readImage(char* filename[]);
int main(int argc, char * argv[])
{
int width = atoi(argv[1]);
int height = atoi(argv[2]);
const char * prefixInFile = "../../labMPI/infile";
const char * prefixOutFile = "result";
const char * fileExtension = ".bin";
char outFileName[64], fileName[64];
unsigned char * image, *buffer, *data;
long fileSize;
int processorsAmount, processId, scatterDataSize;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &processorsAmount);
MPI_Comm_rank(MPI_COMM_WORLD, &processId);
strcpy(fileName, prefixInFile);
strcat(fileName, argv[1]);
strcat(fileName, "_");
strcat(fileName, argv[2]);
strcat(fileName, fileExtension);
strcpy(outFileName, prefixOutFile);
strcat(outFileName, argv[1]);
strcat(outFileName, "_");
strcat(outFileName, argv[2]);
strcat(outFileName, fileExtension);
fileSize = getFileSize(fileName);
data = (unsigned char *)malloc(fileSize * sizeof(unsigned char));
scatterDataSize = fileSize / processorsAmount;
int startPoint = scatterDataSize * processId;
int endPoint = scatterDataSize * (processId + 1);
printf("filesize: %d\nprocessorsAmount: %d\niam: %d\nscatterDataSize: %d\nstartPoint: %d\nendPoint: %d\n",
fileSize, processorsAmount, processId, scatterDataSize, startPoint, endPoint);
if (processId == ROOT)
{
printf("Reading file %s on ROOT \n output file will be: %s \n", fileName, outFileName);
image = readImage(fileName);
}
MPI_Bcast(image,fileSize, MPI_UNSIGNED_CHAR, ROOT, MPI_COMM_WORLD);
#pragma omp parallel for
for (int i = startPoint; i < endPoint; i++)
{
register int col = i % width;
register int row = i / width;
register long idx = col + width * row;
data[idx] = normalize(convolution(col, row, image, height, width, 5));
}
/* i am not sure how to collect processed data on workers back to root and perform save */
MPI_Bcast(data,fileSize, MPI_UNSIGNED_CHAR, ROOT, MPI_COMM_WORLD);
if (processId == ROOT)
{
saveImage(outFileName, data, fileSize);
printf("Image processing is finished");
free(image);
}
free(data);
MPI_Finalize();
return 0;
}
double convolution(int i, int j, unsigned char *image, int height, int width, int filterDimension)
{
register int filterHeight, filterWidth, kernelCenter, ii, jj;
filterHeight = filterWidth = filterDimension;
kernelCenter = filterHeight / 2;
register double tmp = 0;
for (long m = 0; m < filterHeight; ++m) {
for (long n = 0; n < filterWidth; ++n) {
ii = i + (kernelCenter - m);
jj = j + (kernelCenter - n);
if (ii >= 0 && ii < width && jj >= 0 && jj < height)
tmp += image[jj * width + ii] * filter[m][n];
}
}
return tmp;
}
unsigned char * readImage(char* filename[])
{
FILE *inFile = fopen(filename, "rb");
fseek(inFile, 0, SEEK_END);
long long fileLength = ftell(inFile);
fseek(inFile, 0, SEEK_SET);
unsigned char * image = (unsigned char *)malloc(fileLength * sizeof(unsigned char));
fread(image, sizeof(unsigned char), fileLength, inFile);
fclose(inFile);
return image;
}
long getFileSize(char* filename[])
{
FILE *inFile = fopen(filename, "rb");
fseek(inFile, 0, SEEK_END);
long fileLength = ftell(inFile);
fclose(inFile);
return fileLength;
}
void saveImage(char* filename[], unsigned char* image, long fileLength)
{
FILE *write = fopen(filename, "wb");
fwrite(image, sizeof(unsigned char), fileLength * sizeof(unsigned char), write);
fclose(write);
}
unsigned char normalize(double value)
{
if (value > 255)
{
value = 255;
}
else if (value < 0) {
value = 0;
}
return (unsigned char)value;
}

Related

multithreading implementation read, rotate, and save an image using 2 threads

I have written this program that should use two threads one that reads in an image into a buffer and the second turns the image 90 degrees and saves it to a file. I need to run this for couple of minutes and collect some data. My problem is that the code runs ok without the while loop counter but I am having issue running multiple times. I assume the problem is with my threaded implementation. When the while loop is include the output image is just a set of vertical lines without the while counter image is reconstructed correctly. thank you
I have written this program that should use two threads one that reads in an image into a buffer and the second turns the image 90 degrees and saves it to a file. I need to run this for couple of minutes and collect some data. My problem is that the problem runs ok without the while loop counter but I am having issue running multiple times. I assume the problem is with my threaded implementation. thank you
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define WIDTH 512
#define HEIGHT 512
#define DEPTH 255
pthread_mutex_t mtx;
static const unsigned MAX = 8;
char *buffer = (char *)malloc((MAX) * sizeof(char));
void *grab(void *buffer)
{
FILE *infile;
size_t i;
pthread_mutex_lock(&mtx);
char *localbuffer = (char *)buffer;
infile = fopen("lena512.pgm", "r");
for (i = 0; i < 4; ++i)
fscanf(infile, "%*[^\n]\n");
//buffer = (char *)malloc((WIDTH/64) * sizeof(char));
//buffer = (char *)malloc((512) * sizeof(char)); /
for (i = 0; i < HEIGHT; ++i)
{
fread(&localbuffer[i * WIDTH], sizeof(char),
WIDTH, infile);
}
pthread_mutex_unlock(&mtx);
//fclose(infile);
//return localbuffer;
pthread_exit(NULL);
}
void *analyze(void *buffer)
{
pthread_mutex_lock(&mtx);
char *localbuffer = (char *)buffer;
FILE *outfile;
size_t i;
char *analyzed =
(char *)malloc(WIDTH * HEIGHT * sizeof(char));
for (int x = 0; x < WIDTH; x++)
{
for (int y = 0; y < HEIGHT; y++)
{
int offset = HEIGHT * x + y;
analyzed[offset] = localbuffer[y * HEIGHT - 1 + x];
}
}
outfile = fopen("analyzed.pgm", "w");
fputs("P5\n", outfile);
fprintf(outfile, "%d %d\n%d\n", WIDTH, HEIGHT, DEPTH);
for (i = 0; i < HEIGHT; ++i)
{
fwrite(&analyzed[i * WIDTH], sizeof(char),
WIDTH, outfile);
}
pthread_mutex_unlock(&mtx);
//fclose(outfile);
free(analyzed);
//return localbuffer;
pthread_exit(NULL);
}
int main(void)
{
char *localbuffer;
pthread_t thread1, thread2;
int counter = 10000;
while (counter != 0)
{
pthread_create(&thread1, NULL, grab, buffer);
pthread_create(&thread2, NULL, analyze, buffer);
pthread_join(thread1, NULL);
pthread_join(thread2, NULL);
}
return 0;
}

Unable to sequentially process a file > 250kb using Windows file mapping with n-sized chunks

I am trying to make a software that takes a txt file and xor every 4 byte with a pre-defined number.
I am doing this mapping the file in memory and opening chunks of the file with MapViewOfFile of size n.
The algorithm I'm attaching works well for txt files of less than 250 kb. But for file > 250kb it only xor some parts of the file and I cannot understand why and how to fix this.
Can someone help me?
#include "stdafx.h"
#include "Windows.h"
#include <stdio.h>
#include <stdint.h>
#include <iso646.h>
#include <math.h>
unsigned int strToUl(char *s)
{
int size = 4;
unsigned int ul = 0;
memcpy(&ul, (unsigned int *)s, size);
return ul;
}
char *ulToStr(unsigned int *ul)
{
int size = 4;
char *tch = (char *)calloc(size, sizeof(char *));
memcpy(tch, (char *)ul, size);
return tch;
}
unsigned int uixor(unsigned int n, unsigned int seed)
{
srand(seed);
unsigned int mask = rand();
char ch[5] = { 0 };
strcpy_s(ch, 5, ulToStr(&n));
for (int j = 0; j < 5; j++)
{
ch[j] = ch[j] ^ mask;
}
return strToUl(ch);
}
BOOL mapWriteChunk(PHANDLE phFile, DWORD dwFileSize, int start, int buffsize, uint32_t xork)
{
DWORD offset = start;// / 4;// / sizeof(DWORD);
SYSTEM_INFO SysInfo;
GetSystemInfo(&SysInfo);
DWORD dwSysGran = SysInfo.dwAllocationGranularity;
DWORD dwFileMapStart = (offset/dwSysGran) * dwSysGran;
DWORD dwMapViewSize = (offset % dwSysGran) + buffsize;
DWORD dwFileMapSize = offset + buffsize;
unsigned int *ulMVBuffer = (unsigned int *)MapViewOfFile(*phFile, FILE_MAP_ALL_ACCESS, 0, dwFileMapStart, 0);
if (ulMVBuffer == NULL)
{
printf("ulMVBuffer = NULL\n");
}
int iViewDelta = offset - dwFileMapStart;
for (int i = 0; i < buffsize; i++)
{
unsigned int *u = (unsigned int *)ulMVBuffer + (iViewDelta + i);
unsigned int u1 = *u;
unsigned int u2 = uixor(u1, xork);
*u = u2;
printf("write on %d -> ", iViewDelta);
}
UnmapViewOfFile(ulMVBuffer);
return TRUE;
}
int main()
{
char name[] = "test.txt";
OFSTRUCT tOfStrIn;
tOfStrIn.cBytes = sizeof tOfStrIn;
HANDLE hFile = (HANDLE)OpenFile(name, &tOfStrIn, OF_READWRITE);
DWORD dwFileSize = GetFileSize(hFile, NULL);
HANDLE hFileMap = CreateFileMapping(hFile, NULL, PAGE_READWRITE, 0, dwFileSize, NULL);
if (hFileMap == NULL)
{
printf("hFileMap = NULL\n");
}
int pos = 0;
int chunk = 4;
int bSize = dwFileSize / sizeof(DWORD);
int rseed = 10;
for (pos = 0; pos < bSize; pos+=chunk)
{
mapWriteChunk(&hFileMap, dwFileSize, pos, chunk, rseed);
}
CloseHandle(hFile);
CloseHandle(hFileMap);
system("PAUSE");
return 0;
}
Ok, I figured out the problem and I'm writing here so anyone who have the same problem, know what's wrong.
Talk is cheap, I show you the code (and then I'll explain):
char *ulMVBuffer = (char *)MapViewOfFile(phFile, FILE_MAP_ALL_ACCESS, 0, dwFileMapStart, 0);
if (ulMVBuffer == NULL)
{
printf("ulMVBuffer = NULL\n");
}
int iViewDelta = offset - dwFileMapStart;
unsigned int mask = myrand(xork);
for(int i = 0; i < buffsize; i++)
{
unsigned int c = ulMVBuffer[iViewDelta + i] ^ mask;
ulMVBuffer[iViewDelta + i] = c;
}
So you have to map the memory using a char pointer and then, when you use the XOR operator like that:
unsigned int c = ulMVBuffer[iViewDelta + i] ^ mask;
You obtain the XOR to be applied to a group of 4 bytes and not only on 1 byte, because - as far as I understood playing around - the XOR between a char (1 byte) and a unsigned int (4 bytes) forces the operator to pick 3 more bytes from the memory and use it for the bitwise operation.
This wasn't working using a pointer to unsigned int because, I guess, it stored the bytes from the memory in a different fashion (maybe OS or machine dependent?) and so you were able to XOR only 1 byte every 4 and not groups of 4 bytes all together.
If anyone has a better understanding to this or wants to add more to this solution, I will be more than happy to read it!

AVX2 1GB long array

I have a 1gb long array with floats in a .bin file. After i read it how can i sum the elements with avx2 instrucion, and print the result?
I edited my code with Jake 'Alquimista' LEE's answer.
The problem is the result much smaller than it will be. And other question, how can i add a constant to each number that i readed from .bin file?
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
inline float sumf(const float *pSrc, uint32_t len)
{
__m256 sum, in;
float sumr;
uint32_t sumi;
uint32_t lenr = len & 7;
while (len--)
len >>= 3;
sum = _mm256_set1_ps(0.0f);
{
in = _mm256_loadu_ps(pSrc++);
sum = _mm256_add_ps(in, sum);
}
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sumi = _mm256_extract_epi32(*(__m256i *)&sum, 0);
sumr = *(float *)&sumi;
while (lenr--)
{
sumr += *pSrc++;
}
return sumr;
}
int main(void)
{
FILE *file;
float *buffer2;
uint32_t fileLen;
if((file = fopen("example.bin","rb"))==NULL)
{
printf("Error! opening file");
exit(1);
}
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);
buffer2=(float *)malloc(fileLen+1);
if (!buffer2)
{
fprintf(stderr, "Memory error!");
fclose(file);
return 0;
}
fread(buffer2, fileLen, 1, file);
fclose(file);
printf( "File size : %lu Bits \n", fileLen );
for(int i = 0; i<10; i++)
printf("%f \n", buffer2[i]);
float sum =sumf(buffer2,fileLen);
printf("%f\n",s);
free(buffer2);
return 0;
}
Reading 1GB file into memory is big memory and I/O overhead. Although I'm not very familiar with AVX2, i read articles from Internet & i could come up with the following solution which is actually tested and proved to be working.
My solution consists of reading the file as chuncks of 512 Bytes (Blocks of 128 floats) then summing up the pairs of vectors (16 Total vectors per block) so that at the end we get a final __m256 vector, by casting it to a float* we could sum up its individual components to get the final result.
A case where the file is not 128-floats aligned is handled in the last for loop by summing up individual floats.
The code is commented but in case you have any suggestions to add more explanation to the answer then feel free to do so.
#include <immintrin.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
int make_floatf(char *, int);
float avx_sfadd(char*);
char error_buf[1024];
#define PERROR() \
do { \
strerror_r(errno, error_buf, 1024); \
printf("Error: %s\n", error_buf); \
fclose(fp); \
return -1; \
} while(0)
/* This function generates a .bin file containing blocks
* of 128 floating point numbers
*/
int make_floatf(char *filename, int nblocks)
{
FILE *fp = NULL;
if(!(fp = fopen(filename, "wb+")))
PERROR();
float *block_ptr = malloc(sizeof(float) * 128); /* 512 Bytes block of 128 floats */
if(!block_ptr)
PERROR();
int j, i;
for(j = 0; j < nblocks; j++)
{
for(i = 0; i < 128; i++)
block_ptr[i] = 1.0;
int ret = fwrite(block_ptr, sizeof(float), 128, fp);
if(ret < 128)
{
free(block_ptr);
PERROR();
}
}
free(block_ptr);
fclose(fp);
return 0;
}
/* This function reads the .bin file as chuncks of 512B
* blocks (128 floating point numbers) and calculates thier sum.
* The final sum in a form of vector is looped through and its
* components are summed up to get the final result.
*/
float avx_sfadd(char *filename)
{
FILE *fp = NULL;
__m256 v1;
__m256 v2;
__m256 sum = _mm256_setzero_ps();
if(!(fp = fopen(filename, "rb")))
PERROR();
struct stat stat_buf;
stat(filename, &stat_buf);
size_t fsize = stat_buf.st_size;
size_t nblocks = fsize / (sizeof(float) * 128);
size_t rem_size = fsize - nblocks * sizeof(float) * 128;
size_t rem_floats = rem_size / (sizeof(float));
printf("File size: %ld\nnblocks:%ld\nnremfloats: %ld\n",\
fsize, nblocks, rem_floats);
/* This memory area will hold the 128 floating point numbers per block */
float *block_ptr = malloc(sizeof(float) * 128);
if(!block_ptr)
PERROR();
int i;
for(i = 0; i < nblocks; i++)
{
int ret = fread(block_ptr, sizeof(float), 128, fp);
if(ret < 128)
PERROR();
/* Summing up vectors in a block of 16 vectors (128 floats) */
int j;
for(j = 0; j < 16; j += 2)
{
v1 = _mm256_loadu_ps(block_ptr + j*8);
v2 = _mm256_loadu_ps(block_ptr + (j+1)*8);
sum += _mm256_add_ps(v1, v2);
}
}
/* Handling the case if the last chunck of the file doesn't make
* a complete block.
*/
float rem_sum = 0;
if(rem_size > 0)
{
int ret = fread(block_ptr, 1, rem_size, fp);
if(ret < rem_floats)
PERROR();
int j;
for(j = 0; j < rem_floats; j++)
rem_sum += block_ptr[j];
}
float final_sum = rem_sum;
float *sum_ptr = (float*)∑ /* The final vector hold the sum of all vectors */
/* Summing up the values of the last vector to get the final result */
int k;
for(k = 0; k < 8; k++)
final_sum += sum_ptr[k];
free(block_ptr);
fclose(fp);
return final_sum;
}
int main(int argc, char **argv)
{
if(argc < 2){
puts("./main filename [nblocks]");
return 0;
}
/* ./main filename number_of_block_to_create (eg. ./main floats.bin 1024 )*/
else if(argc == 3){
if(!make_floatf(argv[1], atoi(argv[2])))
puts("File has been created sucessfully\n");
}
/* ./main filename (eg. ./main floats.bin) to calculate sum*/
else
printf("avx_sum = %f\n", avx_sfadd(argv[1])) :
return 0;
}
Here's (most likely) your bug:
while (len--)
len >>= 3;
That's a while loop. As long as len != 0, you replace len with (len - 1) >> 3. And then you change it to -1. No loop to be seen.
inline float sumf(const float *pSrc, uint32_t len)
{
__m256 sum, in;
float sumr;
uint32_t sumi;
uint32_t lenr = len & 7;
len >>= 3;
sum = _mm256_set1_ps(0.0f);
while (len--)
{
in = _mm256_loadu_ps(pSrc++);
sum = _mm256_add_ps(in, sum);
}
in = *(__m256 *)&_mm256_permute4x64_pd(*(__m256d *)&sum, 0b01001110);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sumi = _mm256_extract_epi32(*(__m256i *)&sum, 0);
sumr = *(float *)&sumi;
while (lenr--)
{
sumr += *pSrc++;
}
return sumr;
}
The function above will do. However, I don't think that it will bring much of a performance gain, if any, since it's a very trivial one, and the compiler will do auto-vectorize it anyway.
Please note that you have to typecast the pointer to float *, and divide filelen by sizeof(float) when you pass them as arguments.

atomicAdd causing error Unable to Launch/Execute Kernel

I have the following CUDA C code:
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
atomicAdd(&(histo_private[buffer[i]]),1);
i+=stride;
}
which causes my program to crash with the error: "unable to launch/execute kernel"
Here buffer is an input array of integers to this function of size elements and histo_private is an array of integers in shared memory of histo_size elements.
I know this isn't an index out of bounds error because when I use the code:
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
int a = histo_private[buffer[i]];
i+=stride;
}
So I gather that there is something wrong with the atomicAdd function and/or the memory address of this 32-bit int array.
The kernel.cu file contains the following code:
// Define your kernels in this file you may use more than one kernel if you
// need to
// INSERT KERNEL(S) HERE
__global__ void histo_kernel(unsigned int* buffer, unsigned int size, int* histo, unsigned int histo_size)
{
extern __shared__ int histo_private[];
if(threadIdx.x < histo_size)
histo_private[threadIdx.x] = 0;
__syncthreads();
// compute block's histogram
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
//int a = histo_private[buffer[i]];
atomicAdd(&(histo_private[buffer[i]]),1);
i+=stride;
}
// store to global histogram
__syncthreads();
//if(threadIdx.x < histo_size)
// atomicAdd(&(histo[threadIdx.x]),histo_private[threadIdx.x]);
}
// ensures that no bins contains more than 255 elements
__global__ void enforce_saturation(int* histo, unsigned int histo_size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < histo_size)
{
if(histo[i] > 255) // this will be necessary to prevent data loss
histo[i] = 255; // when converting from int to uint8_t
}
}
__global__ void construct_histo(uint8_t* histo_unpacked, int* histo, unsigned int histo_size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < histo_size)
histo_unpacked[i] = histo[i];
}
// unpacks the input array into an output array with 'spaces'
__global__ void unpack(uint8_t* in, uint8_t* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[4*i] = in[i];
out[4*i+1] = 0;
out[4*i+2] = 0;
out[4*i+3] = 0;
}
}
// converts the input uint8_t array to an int array
__global__ void convert(uint8_t* in, int* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[i] = (int) in[4*i];
}
}
// converts the input int array to a uint8_t array
__global__ void convert_back(int* in, uint8_t* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[i] = (uint8_t) in[i];
}
}
void histogram(unsigned int* input, uint8_t* bins, unsigned int num_elements, unsigned int num_bins)
{
int BLOCK_SIZE = (int) num_bins;
BLOCK_SIZE = 512;
dim3 dim_grid, dim_block;
dim_block.x = BLOCK_SIZE; dim_block.y = dim_block.z = 1;
dim_grid.x = 1+(num_elements-1)/BLOCK_SIZE; dim_grid.y = dim_grid.z = 1;
// create an array of uint8_t to be converted into an array of int
uint8_t* bins_unpacked;
cudaMalloc((void**)&bins_unpacked, 4 * num_bins * sizeof(uint8_t));
// unpack the input uint8_t array
unpack<<<dim_grid,dim_block>>>(bins, bins_unpacked, num_bins);
// need an int version of bins_d
int* bins_int_d;
cudaMalloc((void**)&bins_int_d, num_bins * sizeof(int));
// convert the uint8_t array to an int array
convert<<<dim_grid,dim_block>>>(bins_unpacked, bins_int_d, num_bins);
// run kernel and enforce saturation requirements
int histo_private_size = num_bins;
histo_kernel<<<dim_grid,dim_block,histo_private_size>>>(input, num_elements, bins_int_d, num_bins);
enforce_saturation<<<dim_grid,dim_block>>>(bins_int_d,num_bins);
// convert the int array back to uint8_t
convert_back<<<dim_grid,dim_block>>>(bins_int_d, bins, num_bins);
}
While the function that calls this last histogram function is in main.cu (I did NOT make this second file--it was provided to me--also, I have been testing this on consistent data by compiling via make test-mode):
#include <stdio.h>
#include <stdint.h>
#include "support.h"
#include "kernel.cu"
int main(int argc, char* argv[])
{
Timer timer;
// Initialize host variables ----------------------------------------------
#if TEST_MODE
printf("\n***Running in test mode***\n"); fflush(stdout);
#endif
printf("\nSetting up the problem..."); fflush(stdout);
startTime(&timer);
unsigned int *in_h;
uint8_t* bins_h;
unsigned int *in_d;
uint8_t* bins_d;
unsigned int num_elements, num_bins;
cudaError_t cuda_ret;
if(argc == 1) {
num_elements = 1000000;
num_bins = 4096;
} else if(argc == 2) {
num_elements = atoi(argv[1]);
num_bins = 4096;
} else if(argc == 3) {
num_elements = atoi(argv[1]);
num_bins = atoi(argv[2]);
} else {
printf("\n Invalid input parameters!"
"\n Usage: ./histogram # Input: 1,000,000, Bins: 4,096"
"\n Usage: ./histogram <m> # Input: m, Bins: 4,096"
"\n Usage: ./histogram <m> <n> # Input: m, Bins: n"
"\n");
exit(0);
}
initVector(&in_h, num_elements, num_bins);
bins_h = (uint8_t*) malloc(num_bins*sizeof(uint8_t));
// TESTING
for(unsigned int i = 0; i < num_bins; ++i)
{
bins_h[i] = i;
//printf("uint8_t Element %u: is %u \n", i, bins_h[i]);
}
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
printf(" Input size = %u\n Number of bins = %u\n", num_elements,
num_bins);
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMalloc((void**)&in_d, num_elements * sizeof(unsigned int));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
cuda_ret = cudaMalloc((void**)&bins_d, num_bins * sizeof(uint8_t));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMemcpy(in_d, in_h, num_elements * sizeof(unsigned int),
cudaMemcpyHostToDevice);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");
cuda_ret = cudaMemset(bins_d, 0, num_bins * sizeof(uint8_t));
if(cuda_ret != cudaSuccess) FATAL("Unable to set device memory");
// TESTING
//cuda_ret = cudaMemcpy(bins_d, bins_h, num_bins * sizeof(uint8_t),
// cudaMemcpyHostToDevice);
//if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
histogram(in_d, bins_d, num_elements, num_bins);
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMemcpy(bins_h, bins_d, num_bins * sizeof(uint8_t),
cudaMemcpyDeviceToHost);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to host");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
#if TEST_MODE
printf("\nResult:\n");
for(unsigned int binIdx = 0; binIdx < num_bins; ++binIdx) {
printf("Bin %u: %u elements\n", binIdx, bins_h[binIdx]);
}
printf("\nElements Vec:\n");
for(unsigned int i = 0; i < num_elements; ++i) {
printf("Element %u: %u is \n", i, in_h[i]);
}
#endif
// Verify correctness -----------------------------------------------------
printf("Verifying results..."); fflush(stdout);
verify(in_h, bins_h, num_elements, num_bins);
// Free memory ------------------------------------------------------------
cudaFree(in_d); cudaFree(bins_d);
free(in_h); free(bins_h);
return 0;
}
Turns out that this was just an index out of bounds error. The element buffer[i] was greater than the length of histo_private. As another poster mentioned, this was not obvious due to the following artifact of the c compiler:
The compiler is permitted to assume every access is within bounds. That line of my test code did nothing if the access is within bounds and therefore the compiler is permitted to assume that line of code does nothing. Thus it didn't require an access so the successful run of the test code was misleading. Once that line was changed to where the variable hist_private was modified at buffer[i], runtime errors came about.

fread() in MPI is giving Signal 7 Bus Error

I am a newbie to C and MPI.
I have the following code which I am using with MPI.
#include "RabinKarp.c"
#include <stdio.h>
#include <stdlib.h>
#include<string.h>
#include<math.h>
#include </usr/include/mpi/mpi.h>
typedef struct {
int lowerOffset;
int upperOffset;
int processorNumber;
} patternPartitioning;
int rank;
FILE *fp;
char* filename = "/home/rohit/Downloads/10_seqs_2000_3000_bp.fasta";
int n = 0;
int d = 0;
//number of processors
int k, i = 0, lower_limit, upper_limit;
int main(int argc, char** argv) {
char* pattern= "taaat";
patternPartitioning partition[k];
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &k);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
fp = fopen(filename, "rb");
if (fp != '\0') {
fseek(fp, 0L, SEEK_END);
n = ftell(fp);
fseek(fp, 0L, SEEK_SET);
}
//Do for Master Processor
if(rank ==0){
int m = strlen(pattern);
printf("pattern length is %d \n", m);
d = (int)(n - m + 1) / k;
for (i = 0; i <= k - 2; i++) {
lower_limit = round(i * d);
upper_limit = round((i + 1) * d) + m - 1;
partition->lowerOffset = lower_limit;
partition->upperOffset = upper_limit;
partition->processorNumber = i+1;
// k-2 times calculate the limits like this
printf(" the lower limit is %d and upper limit is%d\n",
partition->lowerOffset, partition->upperOffset);
int mpi_send_block[2];
mpi_send_block[0]= lower_limit;
mpi_send_block[1] = upper_limit;
MPI_Send(mpi_send_block, 2, MPI_INT, i+1, i+1, MPI_COMM_WORLD);
//int MPI_Send(void *buf, int count, MPI_Datatype dtype, int dest, int tag, MPI_Comm comm);
}
// for the last processor calculate the index here
lower_limit = round((k - 1) * d);
upper_limit = n;
partition->lowerOffset = lower_limit;
partition->upperOffset = n;
partition->processorNumber = k;
printf("Processor : %d : has start : %d : and end : %d :\n",rank,partition->lowerOffset,partition->upperOffset);
//perform the search here
int size = partition->upperOffset-partition->lowerOffset;
char *text = (char*) malloc (size);
fseek(fp,partition->lowerOffset , SEEK_SET);
fread(&text, sizeof(char), size, fp);
printf("read in rank0");
fputs(text,stdout);
int number =0;
fputs(text,stdout);
fputs(pattern,stdout);
number = rabincarp(text,pattern);
for (i = 0; i <= k - 2; i++) {
int res[1];
res[0]=0;
MPI_Status status;
// MPI_Recv(res, 1, MPI_INT, i+1, i+1, MPI_COMM_WORLD, &status);
// number = number + res[0];
}
printf("\n\ntotal number of result found:%d\n", number);
} else {
patternPartitioning mypartition;
MPI_Status status;
int number[1];
int mpi_recv_block[2];
MPI_Recv(mpi_recv_block, 2, MPI_INT, 0, rank, MPI_COMM_WORLD,
&status);
printf("Processor : %d : has start : %d : and end : %d :\n",rank,mpi_recv_block[0],mpi_recv_block[1]);
//perform the search here
int size = mpi_recv_block[1]-mpi_recv_block[0];
char *text = (char*) malloc (size);
fseek(fp,mpi_recv_block[0] , SEEK_SET);
fread(&text, sizeof(char), size, fp);
printf("read in rank1");
// fread(text,size,size,fp);
printf("length of text segment by proc: %d is %d",rank,(int)strlen(text));
number[0] = rabincarp(text,pattern);
//MPI_Send(number, 1, MPI_INT, 0, rank, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
fclose(fp);
return (EXIT_SUCCESS);
}
if I run( mpirun -np 2 pnew ) this I am getting the following error:
[localhost:03265] *** Process received signal ***
[localhost:03265] *** Process received signal ***
--------------------------------------------------------------------------
mpirun noticed that process rank 1 with PID 3265 on node localhost exited on signal 7 (Bus error).
so if I remove the fread() statements I dont get the error.. can anyone tell me what am I missing?
char *text = (char*) malloc (size);
fseek(fp,partition->lowerOffset , SEEK_SET);
fread(&text, sizeof(char), size, fp);
The documentation for fread says "The function fread() reads nmemb elements of data, each size bytes long, from the stream pointed to by stream, storing them at the location given by ptr."
Since text is a char *, &text is the address of a char *. That won't have enough space to hold the data you're reading. You want to pass fread the address of the memory you allocated, not the address of the variable holding that address! (So remove the &.)
if (fp != '\0') {
fp is FILE* , '\0' is an int constant.
This is not the error, but I suggest you compile with a higher warning level to catch this kind of errors.

Resources