multithreading implementation read, rotate, and save an image using 2 threads - c

I have written this program that should use two threads one that reads in an image into a buffer and the second turns the image 90 degrees and saves it to a file. I need to run this for couple of minutes and collect some data. My problem is that the code runs ok without the while loop counter but I am having issue running multiple times. I assume the problem is with my threaded implementation. When the while loop is include the output image is just a set of vertical lines without the while counter image is reconstructed correctly. thank you
I have written this program that should use two threads one that reads in an image into a buffer and the second turns the image 90 degrees and saves it to a file. I need to run this for couple of minutes and collect some data. My problem is that the problem runs ok without the while loop counter but I am having issue running multiple times. I assume the problem is with my threaded implementation. thank you
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define WIDTH 512
#define HEIGHT 512
#define DEPTH 255
pthread_mutex_t mtx;
static const unsigned MAX = 8;
char *buffer = (char *)malloc((MAX) * sizeof(char));
void *grab(void *buffer)
{
FILE *infile;
size_t i;
pthread_mutex_lock(&mtx);
char *localbuffer = (char *)buffer;
infile = fopen("lena512.pgm", "r");
for (i = 0; i < 4; ++i)
fscanf(infile, "%*[^\n]\n");
//buffer = (char *)malloc((WIDTH/64) * sizeof(char));
//buffer = (char *)malloc((512) * sizeof(char)); /
for (i = 0; i < HEIGHT; ++i)
{
fread(&localbuffer[i * WIDTH], sizeof(char),
WIDTH, infile);
}
pthread_mutex_unlock(&mtx);
//fclose(infile);
//return localbuffer;
pthread_exit(NULL);
}
void *analyze(void *buffer)
{
pthread_mutex_lock(&mtx);
char *localbuffer = (char *)buffer;
FILE *outfile;
size_t i;
char *analyzed =
(char *)malloc(WIDTH * HEIGHT * sizeof(char));
for (int x = 0; x < WIDTH; x++)
{
for (int y = 0; y < HEIGHT; y++)
{
int offset = HEIGHT * x + y;
analyzed[offset] = localbuffer[y * HEIGHT - 1 + x];
}
}
outfile = fopen("analyzed.pgm", "w");
fputs("P5\n", outfile);
fprintf(outfile, "%d %d\n%d\n", WIDTH, HEIGHT, DEPTH);
for (i = 0; i < HEIGHT; ++i)
{
fwrite(&analyzed[i * WIDTH], sizeof(char),
WIDTH, outfile);
}
pthread_mutex_unlock(&mtx);
//fclose(outfile);
free(analyzed);
//return localbuffer;
pthread_exit(NULL);
}
int main(void)
{
char *localbuffer;
pthread_t thread1, thread2;
int counter = 10000;
while (counter != 0)
{
pthread_create(&thread1, NULL, grab, buffer);
pthread_create(&thread2, NULL, analyze, buffer);
pthread_join(thread1, NULL);
pthread_join(thread2, NULL);
}
return 0;
}

Related

Performance of multithreaded algorithm to find max number in array

I'm trying to learn about multithreaded algorithms so I've implemented a simple find max number function of an array.
I've made a baseline program (findMax1.c) which loads from a file about 263 million int numbers into memory.
Then I simply use a for loop to find the max number. Then I've made another program (findMax2.c) which uses 4 threads.
I chose 4 threads because the CPU (intel i5 4460) I'm using has 4 cores and 1 thread per core. So my guess is that
if I assign each core a chunk of the array to process it would be more efficient because that way I'll have fewer cache
misses. Now, each thread finds the max number from each chunk, then I join all threads to finally find the max number
from all those chunks. The baseline program findMax1.c takes about 660ms to complete the task, so my initial thought was
that findMax2.c (which uses 4 threads) would take about 165ms (660ms / 4) to complete since now I have 4 threads running
all in parallel to do the same task, but findMax2.c takes about 610ms. Only 50ms less than findMax1.c.
What am I missing? is there something wrong with the implementation of the threaded program?
findMax1.c
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
int main(void)
{
int i, *array, max = 0, position;
size_t array_size_in_bytes = 1024*1024*1024, elements_read, array_size;
FILE *f;
clock_t t;
double time;
array = (int*) malloc(array_size_in_bytes);
assert(array != NULL); // assert if condition is falsa
printf("Loading array...");
t = clock();
f = fopen("numbers.bin", "rb");
assert(f != NULL);
elements_read = fread(array, array_size_in_bytes, 1, f);
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
assert(elements_read == 1);
printf("done!\n");
printf("File load time: %f [s]\n", time);
fclose(f);
array_size = array_size_in_bytes / sizeof(int);
printf("Finding max...");
t = clock();
for(i = 0; i < array_size; i++)
if(array[i] > max)
{
max = array[i];
position = i;
}
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
return 0;
}
findMax2.c:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#define NUM_THREADS 4
int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];
void *thread(void *arg)
{
size_t array_size_in_bytes = 1024*1024*1024;
int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
pthread_t id = pthread_self();
cpu_set_t cpuset;
if (*core_id < 0 || *core_id >= num_cores)
return NULL;
CPU_ZERO(&cpuset);
CPU_SET(*core_id, &cpuset);
rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
if(rc != 0)
{
printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
return NULL;
}
printf("Thread running on CPU %d\n", sched_getcpu());
array_size = (int) (array_size_in_bytes / sizeof(int));
chunk_size = (int) (array_size / NUM_THREADS);
offset = chunk_size * (*core_id);
// Find max number in the array chunk
for(i = offset; i < (offset + chunk_size); i++)
{
if(array[i] > max_chunk[*core_id])
{
max_chunk[*core_id] = array[i];
pos_chunk[*core_id] = i;
}
}
return NULL;
}
void load_array(void)
{
FILE *f;
size_t array_size_in_bytes = 1024*1024*1024, elements_read;
array = (int*) malloc(array_size_in_bytes);
assert(array != NULL); // assert if condition is false
printf("Loading array...");
f = fopen("numbers.bin", "rb");
assert(f != NULL);
elements_read = fread(array, array_size_in_bytes, 1, f);
assert(elements_read == 1);
printf("done!\n");
fclose(f);
}
int main(void)
{
int i, max = 0, position, id[NUM_THREADS], rc;
clock_t t;
double time;
load_array();
printf("Finding max...");
t = clock();
// Create threads
for(i = 0; i < NUM_THREADS; i++)
{
id[i] = i; // uso id para pasarle un puntero distinto a cada thread
rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id + i));
if (rc != 0)
printf("Can't create thread! rc = %d\n", rc);
else
printf("Thread %lu created\n", tid[i]);
}
// Join threads
for(i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
// Find max number from all chunks
for(i = 0; i < NUM_THREADS; i++)
if(max_chunk[i] > max)
{
max = max_chunk[i];
position = pos_chunk[i];
}
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
free(array);
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
pthread_exit(NULL);
return 0;
}
First of all, you're measuring your time wrong.
clock() measures process CPU time, i.e., time used by all threads. The real elapsed time will be fraction of that. clock_gettime(CLOCK_MONOTONIC,...) should yield better measurements.
Second, your core loops aren't at all comparable.
In the multithreaded program you're writing in each loop iteration to global variables that are very close to each other and that is horrible for cache contention.
You could space that global memory apart (make each array item a cache-aligned struct (_Alignas(64))) and that'll help the time, but a better and fairer approach would be to use local variables (which should go into registers), copying the approach of the first loop, and then write out the chunk result to memory at the end of the loop:
int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array+offset; i < chunk_size; i++)
if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;
Here's your modified test program with expected speedups (I'm getting approx. a 2x speedup on my two-core processor).
(I've also taken the liberty of replacing the file load with in-memory initialization, to make it simpler to test.)
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#include <stdint.h>
struct timespec ts0,ts1;
uint64_t sc_timespec_diff(struct timespec Ts1, struct timespec Ts0) { return (Ts1.tv_sec - Ts0.tv_sec)*1000000000+(Ts1.tv_nsec - Ts0.tv_nsec); }
#define NUM_THREADS 4
int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];
void *thread(void *arg)
{
size_t array_size_in_bytes = 1024*1024*1024;
int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
#if 1 //shouldn't make much difference
pthread_t id = pthread_self();
cpu_set_t cpuset;
if (*core_id < 0 || *core_id >= num_cores)
return NULL;
CPU_ZERO(&cpuset);
CPU_SET(*core_id, &cpuset);
rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
if(rc != 0)
{
printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
return NULL;
}
printf("Thread running on CPU %d\n", sched_getcpu());
#endif
array_size = (int) (array_size_in_bytes / sizeof(int));
chunk_size = (int) (array_size / NUM_THREADS);
offset = chunk_size * (*core_id);
// Find max number in the array chunk
#if 0 //horrible for caches
for(i = offset; i < (offset + chunk_size); i++)
{
if(array[i] > max_chunk[*core_id])
{
max_chunk[*core_id] = array[i];
pos_chunk[*core_id] = i;
}
}
#else
int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array+offset; i < chunk_size; i++)
if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;
#endif
return NULL;
}
void load_array(void)
{
FILE *f;
size_t array_size_in_bytes = 1024*1024*1024, array_size=array_size_in_bytes/sizeof(int);
array = (int*) malloc(array_size_in_bytes);
if(array == NULL) abort(); // assert if condition is false
for(size_t i=0; i<array_size; i++) array[i]=i;
}
int main(void)
{
int i, max = 0, position, id[NUM_THREADS], rc;
clock_t t;
double time;
load_array();
printf("Finding max...");
t = clock();
clock_gettime(CLOCK_MONOTONIC,&ts0);
// Create threads
for(i = 0; i < NUM_THREADS; i++)
{
id[i] = i; // uso id para pasarle un puntero distinto a cada thread
rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id + i));
if (rc != 0)
printf("Can't create thread! rc = %d\n", rc);
else
printf("Thread %lu created\n", tid[i]);
}
// Join threads
for(i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
// Find max number from all chunks
for(i = 0; i < NUM_THREADS; i++)
if(max_chunk[i] > max)
{
max = max_chunk[i];
position = pos_chunk[i];
}
clock_gettime(CLOCK_MONOTONIC,&ts1);
printf("Time2 %.6LF\n", sc_timespec_diff(ts1,ts0)/1E9L);
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
free(array);
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
pthread_exit(NULL);
return 0;
}
My timings:
0.188917 for the signle threaded version
2.511590 for the original multithreaded version (measured with clock_gettime(CLOCK_MONOTONIC,...)
0.099802 with the modified threaded version (measured with clock_gettime(CLOCK_MONOTONIC,...)
ran on a Linux machine with Intel(R) Core(TM) i7-2620M CPU # 2.70GHz.

MPI basic image processing - convolution using MPI

I am trying to learn MPI, so i created this simple program which is doing the convolution on grayscale uint8 image. I modified the openMP code which was working quite ok, with sliceing the vectorized image to parts for each processor - to scatterDataSize, but i am getting weird error from MPI:
"Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
mpirun noticed that process rank 1 with PID 5360 on node wn25 exited on signal 11 (Segmentation fault)."
I've tried to make it using MPI_Scatter to brodcast the image to all processes, and after convolution collect the data with MPI_Gather or MPI_Allgather, but result from both is the same...
argv are image dimensions - the images are i.e. input1000_800.bin so the program is being executed as:
mpirun -np 4 ./main 1000 800 and the source code of my program is:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
#include <mpi.h>
const int ROOT = 0;
const double filter[][5] = { {0,0,1,0,0},
{0,1,2,1,0},
{1,2,-16,2,1},
{0,1,2,1,0},
{0,0,1,0,0} };
unsigned char normalize(double value);
double convolution(int i, int j, unsigned char *image, int height, int width, int filterDimension);
void saveImage(char* filename[], unsigned char* image, long fileLength);
long getFileSize(char* filename[]);
unsigned char * readImage(char* filename[]);
int main(int argc, char * argv[])
{
int width = atoi(argv[1]);
int height = atoi(argv[2]);
const char * prefixInFile = "../../labMPI/infile";
const char * prefixOutFile = "result";
const char * fileExtension = ".bin";
char outFileName[64], fileName[64];
unsigned char * image, *buffer, *data;
long fileSize;
int processorsAmount, processId, scatterDataSize;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &processorsAmount);
MPI_Comm_rank(MPI_COMM_WORLD, &processId);
strcpy(fileName, prefixInFile);
strcat(fileName, argv[1]);
strcat(fileName, "_");
strcat(fileName, argv[2]);
strcat(fileName, fileExtension);
strcpy(outFileName, prefixOutFile);
strcat(outFileName, argv[1]);
strcat(outFileName, "_");
strcat(outFileName, argv[2]);
strcat(outFileName, fileExtension);
fileSize = getFileSize(fileName);
data = (unsigned char *)malloc(fileSize * sizeof(unsigned char));
scatterDataSize = fileSize / processorsAmount;
int startPoint = scatterDataSize * processId;
int endPoint = scatterDataSize * (processId + 1);
printf("filesize: %d\nprocessorsAmount: %d\niam: %d\nscatterDataSize: %d\nstartPoint: %d\nendPoint: %d\n",
fileSize, processorsAmount, processId, scatterDataSize, startPoint, endPoint);
if (processId == ROOT)
{
printf("Reading file %s on ROOT \n output file will be: %s \n", fileName, outFileName);
image = readImage(fileName);
}
MPI_Bcast(image,fileSize, MPI_UNSIGNED_CHAR, ROOT, MPI_COMM_WORLD);
#pragma omp parallel for
for (int i = startPoint; i < endPoint; i++)
{
register int col = i % width;
register int row = i / width;
register long idx = col + width * row;
data[idx] = normalize(convolution(col, row, image, height, width, 5));
}
/* i am not sure how to collect processed data on workers back to root and perform save */
MPI_Bcast(data,fileSize, MPI_UNSIGNED_CHAR, ROOT, MPI_COMM_WORLD);
if (processId == ROOT)
{
saveImage(outFileName, data, fileSize);
printf("Image processing is finished");
free(image);
}
free(data);
MPI_Finalize();
return 0;
}
double convolution(int i, int j, unsigned char *image, int height, int width, int filterDimension)
{
register int filterHeight, filterWidth, kernelCenter, ii, jj;
filterHeight = filterWidth = filterDimension;
kernelCenter = filterHeight / 2;
register double tmp = 0;
for (long m = 0; m < filterHeight; ++m) {
for (long n = 0; n < filterWidth; ++n) {
ii = i + (kernelCenter - m);
jj = j + (kernelCenter - n);
if (ii >= 0 && ii < width && jj >= 0 && jj < height)
tmp += image[jj * width + ii] * filter[m][n];
}
}
return tmp;
}
unsigned char * readImage(char* filename[])
{
FILE *inFile = fopen(filename, "rb");
fseek(inFile, 0, SEEK_END);
long long fileLength = ftell(inFile);
fseek(inFile, 0, SEEK_SET);
unsigned char * image = (unsigned char *)malloc(fileLength * sizeof(unsigned char));
fread(image, sizeof(unsigned char), fileLength, inFile);
fclose(inFile);
return image;
}
long getFileSize(char* filename[])
{
FILE *inFile = fopen(filename, "rb");
fseek(inFile, 0, SEEK_END);
long fileLength = ftell(inFile);
fclose(inFile);
return fileLength;
}
void saveImage(char* filename[], unsigned char* image, long fileLength)
{
FILE *write = fopen(filename, "wb");
fwrite(image, sizeof(unsigned char), fileLength * sizeof(unsigned char), write);
fclose(write);
}
unsigned char normalize(double value)
{
if (value > 255)
{
value = 255;
}
else if (value < 0) {
value = 0;
}
return (unsigned char)value;
}

AVX2 1GB long array

I have a 1gb long array with floats in a .bin file. After i read it how can i sum the elements with avx2 instrucion, and print the result?
I edited my code with Jake 'Alquimista' LEE's answer.
The problem is the result much smaller than it will be. And other question, how can i add a constant to each number that i readed from .bin file?
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
inline float sumf(const float *pSrc, uint32_t len)
{
__m256 sum, in;
float sumr;
uint32_t sumi;
uint32_t lenr = len & 7;
while (len--)
len >>= 3;
sum = _mm256_set1_ps(0.0f);
{
in = _mm256_loadu_ps(pSrc++);
sum = _mm256_add_ps(in, sum);
}
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sumi = _mm256_extract_epi32(*(__m256i *)&sum, 0);
sumr = *(float *)&sumi;
while (lenr--)
{
sumr += *pSrc++;
}
return sumr;
}
int main(void)
{
FILE *file;
float *buffer2;
uint32_t fileLen;
if((file = fopen("example.bin","rb"))==NULL)
{
printf("Error! opening file");
exit(1);
}
fseek(file, 0, SEEK_END);
fileLen=ftell(file);
fseek(file, 0, SEEK_SET);
buffer2=(float *)malloc(fileLen+1);
if (!buffer2)
{
fprintf(stderr, "Memory error!");
fclose(file);
return 0;
}
fread(buffer2, fileLen, 1, file);
fclose(file);
printf( "File size : %lu Bits \n", fileLen );
for(int i = 0; i<10; i++)
printf("%f \n", buffer2[i]);
float sum =sumf(buffer2,fileLen);
printf("%f\n",s);
free(buffer2);
return 0;
}
Reading 1GB file into memory is big memory and I/O overhead. Although I'm not very familiar with AVX2, i read articles from Internet & i could come up with the following solution which is actually tested and proved to be working.
My solution consists of reading the file as chuncks of 512 Bytes (Blocks of 128 floats) then summing up the pairs of vectors (16 Total vectors per block) so that at the end we get a final __m256 vector, by casting it to a float* we could sum up its individual components to get the final result.
A case where the file is not 128-floats aligned is handled in the last for loop by summing up individual floats.
The code is commented but in case you have any suggestions to add more explanation to the answer then feel free to do so.
#include <immintrin.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
int make_floatf(char *, int);
float avx_sfadd(char*);
char error_buf[1024];
#define PERROR() \
do { \
strerror_r(errno, error_buf, 1024); \
printf("Error: %s\n", error_buf); \
fclose(fp); \
return -1; \
} while(0)
/* This function generates a .bin file containing blocks
* of 128 floating point numbers
*/
int make_floatf(char *filename, int nblocks)
{
FILE *fp = NULL;
if(!(fp = fopen(filename, "wb+")))
PERROR();
float *block_ptr = malloc(sizeof(float) * 128); /* 512 Bytes block of 128 floats */
if(!block_ptr)
PERROR();
int j, i;
for(j = 0; j < nblocks; j++)
{
for(i = 0; i < 128; i++)
block_ptr[i] = 1.0;
int ret = fwrite(block_ptr, sizeof(float), 128, fp);
if(ret < 128)
{
free(block_ptr);
PERROR();
}
}
free(block_ptr);
fclose(fp);
return 0;
}
/* This function reads the .bin file as chuncks of 512B
* blocks (128 floating point numbers) and calculates thier sum.
* The final sum in a form of vector is looped through and its
* components are summed up to get the final result.
*/
float avx_sfadd(char *filename)
{
FILE *fp = NULL;
__m256 v1;
__m256 v2;
__m256 sum = _mm256_setzero_ps();
if(!(fp = fopen(filename, "rb")))
PERROR();
struct stat stat_buf;
stat(filename, &stat_buf);
size_t fsize = stat_buf.st_size;
size_t nblocks = fsize / (sizeof(float) * 128);
size_t rem_size = fsize - nblocks * sizeof(float) * 128;
size_t rem_floats = rem_size / (sizeof(float));
printf("File size: %ld\nnblocks:%ld\nnremfloats: %ld\n",\
fsize, nblocks, rem_floats);
/* This memory area will hold the 128 floating point numbers per block */
float *block_ptr = malloc(sizeof(float) * 128);
if(!block_ptr)
PERROR();
int i;
for(i = 0; i < nblocks; i++)
{
int ret = fread(block_ptr, sizeof(float), 128, fp);
if(ret < 128)
PERROR();
/* Summing up vectors in a block of 16 vectors (128 floats) */
int j;
for(j = 0; j < 16; j += 2)
{
v1 = _mm256_loadu_ps(block_ptr + j*8);
v2 = _mm256_loadu_ps(block_ptr + (j+1)*8);
sum += _mm256_add_ps(v1, v2);
}
}
/* Handling the case if the last chunck of the file doesn't make
* a complete block.
*/
float rem_sum = 0;
if(rem_size > 0)
{
int ret = fread(block_ptr, 1, rem_size, fp);
if(ret < rem_floats)
PERROR();
int j;
for(j = 0; j < rem_floats; j++)
rem_sum += block_ptr[j];
}
float final_sum = rem_sum;
float *sum_ptr = (float*)∑ /* The final vector hold the sum of all vectors */
/* Summing up the values of the last vector to get the final result */
int k;
for(k = 0; k < 8; k++)
final_sum += sum_ptr[k];
free(block_ptr);
fclose(fp);
return final_sum;
}
int main(int argc, char **argv)
{
if(argc < 2){
puts("./main filename [nblocks]");
return 0;
}
/* ./main filename number_of_block_to_create (eg. ./main floats.bin 1024 )*/
else if(argc == 3){
if(!make_floatf(argv[1], atoi(argv[2])))
puts("File has been created sucessfully\n");
}
/* ./main filename (eg. ./main floats.bin) to calculate sum*/
else
printf("avx_sum = %f\n", avx_sfadd(argv[1])) :
return 0;
}
Here's (most likely) your bug:
while (len--)
len >>= 3;
That's a while loop. As long as len != 0, you replace len with (len - 1) >> 3. And then you change it to -1. No loop to be seen.
inline float sumf(const float *pSrc, uint32_t len)
{
__m256 sum, in;
float sumr;
uint32_t sumi;
uint32_t lenr = len & 7;
len >>= 3;
sum = _mm256_set1_ps(0.0f);
while (len--)
{
in = _mm256_loadu_ps(pSrc++);
sum = _mm256_add_ps(in, sum);
}
in = *(__m256 *)&_mm256_permute4x64_pd(*(__m256d *)&sum, 0b01001110);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sum = _mm256_hadd_ps(sum, in);
sumi = _mm256_extract_epi32(*(__m256i *)&sum, 0);
sumr = *(float *)&sumi;
while (lenr--)
{
sumr += *pSrc++;
}
return sumr;
}
The function above will do. However, I don't think that it will bring much of a performance gain, if any, since it's a very trivial one, and the compiler will do auto-vectorize it anyway.
Please note that you have to typecast the pointer to float *, and divide filelen by sizeof(float) when you pass them as arguments.

Storing pixel values of a BMP file

I'm trying to store the pixel values of a BMP file in a 2D dynamically allocated array of structs but it keeps giving a segmentation fault. Here's what I have so far:
#include <stdio.h>
#include <stdlib.h>
typedef struct PIXEL{
unsigned char Red, Green, Blue;
}*pixel;
int main (int argc, char *argv[])
{
//variable declarations and open the file
FILE* fin = fopen(argv[1], "rb");
if (fin == NULL){
printf("Error opening file.\n");
exit(0);
}
unsigned char info[54];
int width, height, i, j;
fread(info, sizeof(unsigned char), 54, fin); //read the header
width = *(int*)&info[18];
height = *(int*)&info[22];
pixel **image = (pixel **) malloc(sizeof(pixel *) * width); //reserve enough space for RGB for each pixel
for (i = 0; i < width; i++){
image[i] = (pixel *) malloc(sizeof(pixel) * height);
}
for (i = 0; i < width; i++){
for (j = 0; j < height; j++){
image[i][j]->Blue = getc(fin); //put the blue value of the pixel
image[i][j]->Green = getc(fin); //green value
image[i][j]->Red = getc(fin); //red value
printf("Pixel %d: [%d, %d, %d]\n", (i+1)*(j+1), image[i][j]->Blue, image[i][j]->Green, image[i][j]->Blue);
}
}
fclose(fin);
return 0;
}
You did not check for valid width and height values from the header. If for any reason they are huge (for instance if the file read failed) this will crash.
Also, %d in printf expects an int. You should cast your unsigned chars to int or it may crash.

Pthread_join() Causing segment default error

What the following code trying to accomplish is just to compute the Matrix Multiplication of A and B to get matrix C. It uses nXn threads to compute each entry of C independently. So the code works on Cygwin, but not on linux. I keep getting segment default with the Pthread_join calls.
#define _REENTRANT // Make sure the library functions are MT (muti-thread) safe
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#define BUFFER_SIZE 512
// Declare a structure data type that will be used to pass arguments to the worker threads
typedef struct args_for_thread_t{
int *rowA;
int rowIdx;
int *columnB;
int columnIdx;
int **matrixC;
} ARGS_FOR_THREAD;
/* Global Variables */
int numRows,numColumns;
/*Function Prototype*/
void *computeC(void *this_arg);
void printMatrix(int** matrix,int numRows,int numColumns);
int main(void){
const char filename[] = "input_data.txt";
FILE *file = fopen(filename,"r");
char *delims = " ";
int **matrixA,**matrixB,**matrixC;
int flagB = 0; //Indicate wether the program should process matrixB
int i,j;
if (file){
char line[BUFFER_SIZE];
int rowIdx = 0;
while (fgets(line,sizeof(line), file)){
char substr[BUFFER_SIZE], *result;
//fputs(line,stdout);
result = strtok(line, delims);
int columnIdx = 0;
//Once we reach a line break, we start the processing of matrix B
if (!strcmp(line,"\n")){
flagB = 1;
rowIdx = 0; //Reset the rowIdx
continue; //Skip the new line, and start to read data into matrix B
}
while (result != NULL){
if (!strcmp(result,"ROWS")){ //To retrieve the number of rows
result = strtok(NULL,delims);
numRows = atoi(result);
matrixA = (int **) malloc(numRows*sizeof(int*));
matrixB = (int **) malloc(numRows*sizeof(int*));
matrixC = (int **) malloc(numRows*sizeof(int*));
rowIdx = -1;
result = strtok(NULL,delims);
}
else if (!strcmp(result,"COLUMNS")){//To retrieve the number of Columns
result = strtok(NULL,delims);
numColumns = atoi(result);
for (i=0;i<numRows;i++){ //Malloc the columns
matrixA[i] = (int *) malloc(numColumns*sizeof(int));
matrixB[i] = (int *) malloc(numColumns*sizeof(int));
matrixC[i] = (int *) malloc(numColumns*sizeof(int));
}
rowIdx = -1;
result = strtok(NULL,delims);
}
else if (!flagB){ //Processing Matrix A
matrixA[rowIdx][columnIdx] = atoi(result);
columnIdx++;
result = strtok(NULL,delims);
}
else if (flagB){ //Processing Matrix B
matrixB[rowIdx][columnIdx] = atoi(result);
columnIdx++;
result = strtok(NULL,delims);
}
}
rowIdx++;
}
}
else{
printf("No Such File exists!\n");
}
//At this point, matrix A and matrix B are both ready for computation. We will start to compute the product of the two matrices
int num_threads = numRows*numColumns; //The toal number of worker threads
pthread_t *worker_thread = (pthread_t *) malloc(sizeof(pthread_t)*num_threads);
ARGS_FOR_THREAD *args_for_thread;
for(i = 0; i < numRows; i++){
for(j = 0; j < numColumns; j++){
args_for_thread = (ARGS_FOR_THREAD *)malloc(sizeof(ARGS_FOR_THREAD)); // Allocate memory for the structure that will be used to pack the arguments
args_for_thread-> rowA = matrixA[i];
//We need to allocate the corresponding column in B for multiplication
int k;
args_for_thread->columnB =(int *) malloc(sizeof(int)*numRows);
for (k=0;k<numRows;k++){
args_for_thread-> columnB[k] = matrixB[k][j];
}
//rowIdx and columnIdx gives the corresponding entry for matrix C
args_for_thread-> rowIdx = i;
args_for_thread-> columnIdx = j;
args_for_thread-> matrixC = matrixC;
if((pthread_create(&worker_thread[i], NULL, computeC, (void *)args_for_thread)) != 0){
printf("Cannot create thread \n");
exit(0);
}
}
}
// Wait for all the worker threads to finish
for(i = 0; i < num_threads; i++)
pthread_join(worker_thread[i], NULL);
//Print out the Final Matrix C
printMatrix(matrixC,numRows,numColumns);
//Clean up pointers
for(i = 0; i < numRows; i++){
free(matrixA[i]);
free(matrixB[i]);
free(matrixC[i]);
}
free(matrixA);
free(matrixB);
free(matrixC);
}
void printMatrix(int** matrix,int numRows, int numColumns){
int i,j;
for (i=0;i<numRows;i++){
for (j=0;j<numColumns;j++){
printf("%d ",matrix[i][j]);
if (j==numColumns-1){
printf("\n");
}
}
}
}
/* Function that will be executed by all the worker threads. It will compute the i,j entry for column C */
void *computeC(void *this_arg){
ARGS_FOR_THREAD *arg = (ARGS_FOR_THREAD *) this_arg;
int rowIdx = arg->rowIdx;
int columnIdx = arg->columnIdx;
int *rowA = arg->rowA;
int *columnB = arg->columnB;
int **matrixC = arg->matrixC;
int i;
int sum = 0;
for (i=0;i<numRows;i++){ //Compute entry for matrix C. Since A,B are nxn square matrix, we can use either numRows or numColumns as the size
sum += rowA[i]*columnB[i];
}
matrixC[rowIdx][columnIdx] = sum;
free((void *) arg); // Free up the structure
pthread_exit(NULL);
}
What is the issue here? Thank you.
Here:
pthread_create(&worker_thread[i] ...
You create i * j threads, yet you only provide worker_threads[i] hence your program keeps using the same pthread_t variables. It later fails when you try to join the threads with undefined pthread_t values.
Replace by:
pthread_create(&worker_thread[i*numColumns+j] ...

Resources