I'm new in the world of C programming language and I was trying to read an image as a binary file. Unfortunately, when I tried to read the file I got an incomplete sequence ("\377\330\377", <incomplete sequence \340>).
I've already tried to use fread reading only the size of a byte and now I'm trying to use sizeof(unsigned char), but I always get the same incomplete sequence. Here is a part of the code:
caratteri **createStruct(unsigned char c[], caratteri car[], caratteri *ptr[], long size) {
for (long i = 0; i < size; i++) {
ptr[i] = &car[i];
car[i].first = &c[i];
car[i].last = &c[(size - 1 + i) % size];
car[i].index=i;
}
return ptr;
}
int compare(const void *a, const void *b) {
caratteri *ca = *(caratteri **) a;
caratteri *cb = *(caratteri **) b;
unsigned char *c1;
unsigned char *c2;
c1 = ca->first;
c2 = cb->first;
while (*c1 - *c2 == 0) {
c1++;
c2++;
}
return (*c1 - *c2);
}
caratteri **bwt(long size, FILE *file) {
FILE *risultato;
unsigned char *c = malloc(sizeof(unsigned char) * size);
fread(c, sizeof(unsigned char), size, file);
caratteri *car = malloc(sizeof(caratteri) * size);
caratteri **pCaratteri = malloc(sizeof(caratteri *) * size);
pCaratteri = createStruct(c, car, pCaratteri, size);
qsort(pCaratteri, size, sizeof(pCaratteri), compare);
risultato=fopen("risultato","wb");
for(long i = 0; i < size; i++)
fputc(*pCaratteri[i]->last,risultato);
fclose(risultato);
return pCaratteri;
}
int main() {
FILE *file;
file = fopen("thumbnail.jpg","rb");
if (file == NULL) {
printf("Errore di apertura file!");
exit(2);
}
fseek(file, SEEK_SET, SEEK_END);
long size = ftell(file)+1;
rewind(file);
caratteri **car = bwt(size, file);
FILE *risultato;
decryptbwt(risultato);
return 0;
}
This is not the full code but I need this part to work properly.
Also, I need this code to work with every kind of file (.txt, .jpg, .png, etc) and I need the \0 character in case of a txt file, that's why I used ftell(file)+1 but I'm not sure this is appropriate. Anyway, how can I read a file in binary mode avoiding the problem of incomplete sequences?
Thank you for your answers and sorry for the dumb question but I'm new in this world.
I need to take an extra \0 because I'm using an algorithm that needs an end of string character, that's why I'm using ftell with a +1. This algorithm should work with every kind of file so I need to read it correctly and then use the burrows wheeler transform in order to sort the file that I need to compress. Also, I'm not sure that I have to use the fseek in that way but I think there is not another way to get the size of the file.
I am trying to read a character array that contains the contents of many large files. The character array is going to be quite large, because the files are large, so I want to do it using multithreading (pthread). I want the user to be able to designate how many threads they want to run. I have something working, but increasing the number of threads does nothing to affect performance (i.e. 1 thread finishes just as fast as 10). In fact, it seems to be just the opposite: telling the program to use 10 threads runs much slower than telling it to use 1.
Here is the method for slicing up the character array according to the number of threads the user passes to the program. I know this is wrong, I could use some advice here.
//Universal variables
int numThreads;
size_t sizeOfAllFiles; // Size, in bytes, of allFiles
char* allFiles; // Where all of the files are stored, together
void *zip(void *nthread);
void *zip(void *nThread) {
int currentThread = *(int*)nThread;
int remainder = sizeOfAllFiles % currentThread;
int slice = (sizeOfAllFiles-remainder) / currentThread;
// I subtracted the remainder for my testing
// because I didn't want to worry about whether
// the char array's size is evenly divisible by numThreads
int i = (slice * (currentThread-1));
char currentChar = allFiles[i]; //Used for iterating
while(i<(slice * currentThread) && i>=(slice * (currentThread-1))) {
i++;
// Do things with the respective thread's
// 'slice' of the array.
.....
}
return 0;
}
And here is how I am spawning the threads, which I am almost positive that I am doing correctly:
for (int j = 1; j <= threadNum; j++) {
k = malloc(sizeof(int));
*k = j;
if (pthread_create (&thread[j], NULL, zip, k) != 0) {
printf("Error\n");
free(thread);
exit(EXIT_FAILURE);
}
}
for (int i = 1; i <= threadNum; i++)
pthread_join (thread[i], NULL);
This is all really confusing for me so if I could get some help on this, I'd greatly appreciate it. I specifically am struggling with the slicing part (cutting it up correctly), and with not seeing performance gains by using more than one thread. Thanks in advance.
I'm starting by throwing a test program at you:
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
#include <stddef.h>
#include <time.h>
bool
EnlargeBuffer(char ** const buffer_pointer,
size_t * const buffer_size)
{
char * larger_buffer = realloc(*buffer_pointer,
2 * *buffer_size);
if (! larger_buffer) {
larger_buffer = realloc(*buffer_pointer,
*buffer_size + 100);
if (! larger_buffer) {
return false;
}
*buffer_size += 100;
} else {
*buffer_size *= 2;
}
*buffer_pointer = larger_buffer;
printf("(Buffer size now at %zu)\n", *buffer_size);
return true;
}
bool
ReadAll(FILE * const source,
char ** pbuffer,
size_t * pbuffer_size,
size_t * pwrite_index)
{
int c;
while ((c = fgetc(source)) != EOF) {
assert(*pwrite_index < *pbuffer_size);
(*pbuffer)[(*pwrite_index)++] = c;
if (*pwrite_index == *pbuffer_size) {
if (! EnlargeBuffer(pbuffer, pbuffer_size)) {
free(*pbuffer);
return false;
}
}
}
if (ferror(source)) {
free(*pbuffer);
return false;
}
return true;
}
unsigned
CountAs(char const * const buffer,
size_t size)
{
unsigned count = 0;
while (size--)
{
if (buffer[size] == 'A') ++count;
}
return count;
}
int
main(int argc, char ** argv)
{
char * buffer = malloc(100);
if (! buffer) return 1;
size_t buffer_size = 100;
size_t write_index = 0;
clock_t begin = clock();
for (int i = 1; i < argc; ++i)
{
printf("Reading %s now ... \n", argv[i]);
FILE * const file = fopen(argv[i], "r");
if (! file) return 1;
if (! ReadAll(file, &buffer, &buffer_size, &write_index))
{
return 1;
}
fclose(file);
}
clock_t end = clock();
printf("Reading done, took %f seconds\n",
(double)(end - begin) / CLOCKS_PER_SEC);
begin = clock();
unsigned const as = CountAs(buffer, write_index);
end = clock();
printf("All files have %u 'A's, counting took %f seconds\n",
as,
(double)(end - begin) / CLOCKS_PER_SEC);
}
This program reads all files (passed as command line arguments) into one big large char * buffer, and then counts all bytes which are == 'A'. It also times both of these steps.
Example run with (shortened) output on my system:
# gcc -Wall -Wextra -std=c11 -pedantic allthefiles.c
# dd if=/dev/zero of=large_file bs=1M count=1000
# ./a.out allthefiles.c large_file
Reading allthefiles.c now ...
(Buffer size now at 200)
...
(Buffer size now at 3200)
Reading large_file now ...
(Buffer size now at 6400)
(Buffer size now at 12800)
...
(Buffer size now at 1677721600)
Reading done, took 4.828559 seconds
All files have 7 'A's, counting took 0.764503 seconds
Reading took almost 5 seconds, but counting (= iterating once, in a single thread, over all bytes) took a bit less than 1 second.
You're optimizing at the wrong place!
Using 1 thread to read all files, and then using N threads to operate on that one buffer isn't going to bring you places. The fastest way to read 1 file is to use 1 thread. For multiple files, use 1 thread per file!
So, in order to achieve the speedup that you need to show for your assignment:
Create a pool of threads with variable size.
Have a pool of tasks, where each task consists of
read one file
compute it's run-length encoding
store the run-length encoded file
let the threads take tasks from your task pool.
Things to consider: How do you combine the results of each task? Without requiring (costly) synchronization.
I have been finding a way to brute-force finding a int64_t in a file in C.
I have written the following code.
int64_t readbyte = 0, totalreadbytes = 0;
int64_t totalfound = 0;
const int64_t magic = MAGIC_NUMBER;
char *buffer = (char *)malloc(BUFFER_SIZE);
int64_t *offsets = (int64_t *)malloc(sizeof(int64_t) * (1 << 24));
if (buffer == NULL || offsets == NULL)
{
return -3;
}
while ((readbyte = fread(buffer, 1, BUFFER_SIZE, inptr)) > 0)
{
for (int i = 0; i <= readbyte - 8; i++)
{
if (memcmp(buffer + i, &magic, sizeof(magic))==0)
{
offsets[totalfound++] = totalreadbytes + i;
}
}
totalreadbytes += readbyte - 8;
fseek(inptr, -8, SEEK_CUR);
}
// Do something to those offsets found
free(offsets);
free(buffer);
I have been wondering if there is a way better to find that int64_t, because my goal is to find them in a file as large as 60gigs and there maybe several hundred thousands of them in that file
Backing up and re-reading data is going to slow things down quite a bit.
Building on #melpomene comment, here's a very simple way to do it with mmap():
uint64_t needle;
struct stat sb;
int fd = open( filename, O_RDONLY );
fstat( fd, &sb );
unsigned char *haystack = mmap( NULL, sb.st_size,
PROT_READ, MAP_PRIVATE, fd, 0 );
close( fd );
off_t bytesToSearch = sb.st_size - sizeof( needle );
// <= so the last bytes get searched
for ( off_t ii = 0; ii <= bytesToSearch; ii++ )
{
if ( 0 == memcmp( haystack + ii, &needle, sizeof( needle ) ) )
{
// found it!
}
}
Error checking and proper headers omitted for clarity.
There are a lot of ways to improve the performance of that. This IO pattern is the worst possible use of mmap() with regards to performance - read every byte in the file just once, then throw the mappings away. Because mapping a file isn't all that fast in the first place, and it impacts the entire machine.
It'd probably be a lot faster to just use open() and read() with direct IO in large page-sized chunks into page-aligned memory, especially if the file is a significant fraction of the system's RAM. But that would make the code much more complex, as the comparisons would have to span buffers - it's almost certainly much faster to use two buffers and copy a few bytes out to search across a break between buffers than it is to back up and do a non-aligned read.
I am currently making a small test program for simple file checking. The program writes two small matrices(A and B) to files, closes and reopens them, reads in the matrices from the files, multiplies them and writes the resulting matrix(C) to a new file. It then closes and reopens this file containing the answer and prints it out for me to check if the IO operation proceeded correctly.
My problem is that the result matrix reads differently than expected.
I consider myself a beginner in C and of file input/output operations and this is the code that is causing me trouble. I am using WinXP, Codeblocks and Mingw.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define bufferA(i,k) (bufferA[i*cols+k])
#define bufferB(k,j) (bufferB[k*cols+j])
#define bufferC(i,j) (bufferC[i*cols+j])
void printMatrix(int *nMatrixToPrint, int nNumberOfElements, int nDimension) {
// This function prints out the element of an Array. This array represents a matrix in memory.
int nIndex;
printf("\n");
for (nIndex = 0; nIndex < nNumberOfElements; nIndex++) {
if (nIndex % nDimension == 0)
printf("\n");
printf("%d,",nMatrixToPrint[nIndex]);
}
return;
}
int main(int argc, char *argv[]) {
int nElements = 16, nDim = 4;
int A[4][4] = {{1,2,3,1},{2,2,1,2},{4,2,3,1},{5,1,1,3}};
int B[4][4] = {{3,2,1,4},{2,2,3,3},{4,1,3,2},{2,2,5,1}};
// Create files of A and B, delete old ones if present
FILE *fpA = fopen("A.dat", "w+");
FILE *fpB = fopen("B.dat", "w+");
// Write data to them
fwrite((int*)A, sizeof(*A), nElements, fpA);
fwrite((int*)B, sizeof(*B), nElements, fpB);
// and close them
fclose(fpA);
fclose(fpB);
// Reopen files
fpA = fopen("A.dat", "r");
fpB = fopen("B.dat", "r");
// Allocate memory
int *bufferA = (int*)malloc(nElements * sizeof(*bufferA));
int *bufferB = (int*)malloc(nElements * sizeof(*bufferB));
int *bufferC = (int*)calloc(nElements, sizeof(*bufferC));
// Read files
fread(bufferA, sizeof(int), nElements, fpA);
fread(bufferB, sizeof(int), nElements, fpB);
printf("\nA");
printMatrix(bufferA, nElements, nDim);
printf("\n\nB");
printMatrix(bufferB, nElements, nDim);
// Matrix multiplication
// Calculate and write to C
int i,j,k = 0; // Loop indices
int n = nDim,l = nDim, m = nDim, cols = nDim;
// multiply
for (i = 0; i < n; i++) { // Columns
for (j = 0; j < m; j++) { // Rows
//C(i,j) = 0;
for (k = 0; k < l; k++) {
bufferC(i,j) += bufferA(i,k) * bufferB(k,j);
}
}
}
printf("\n\nC_buffer");
printMatrix(bufferC, nElements, nDim);
// Create C and write to it
FILE* Cfile = fopen("C.dat", "w");
fwrite(bufferC, sizeof(*bufferC), nElements, Cfile);
// Close files
fclose(fpA);
fclose(fpB);
fclose(Cfile);
// reopen C for reading
Cfile = fopen("C.dat", "r");
// Obtain file size
fseek(Cfile , 0 , SEEK_END);
long lSize = ftell(Cfile);
rewind(Cfile);
printf("\nC file length is: %ld", lSize);
// read data into bufferA
fread(bufferA, sizeof(int), lSize, Cfile);
fclose(Cfile);
printf("\n\nC_file");
printMatrix(bufferA, nElements, nDim);
// Free allocated memory and remove dangling pointers
free(bufferA); bufferA = NULL;
free(bufferB); bufferB = NULL;
free(bufferC); bufferC = NULL;
exit(0);
}
Which gives me the following output:
A
1,2,3,1,
2,2,1,2,
4,2,3,1,
5,1,1,3,
B
3,2,1,4,
2,2,3,3,
4,1,3,2,
2,2,5,1,
C_buffer
21,11,21,17,
18,13,21,18,
30,17,24,29,
27,19,26,28,
C file length is: 64
C_file
21,11,21,17,
18,13,21,18,
30,17,24,29,
27,19,1,3,
As you can see, the last two elements in C_file are wrong, instead the output shows the last two elements in A as I was writing the file contents into bufferA. A switch to bufferB would swap the last two characters with the last elements in B which is still erroneous. A filecopy into another project would yield the last two integers as whatever was in ram at that malloc address.
My question is as follows: Why does not fwrite write the proper data into the file. Why does it manage the first 14 elements but not the last two? And how does this differ from my previous correct uses of fwrite and fread when I wrote and retrieved the elements of A and B?
You are writing binary data, and therefore you have to open the file in binary mode, the default is text mode. This makes a difference on windows, but not on *nix, which explains why it works for the other people here.
for all your fopen calls, include the letter 'b' in the mode argument, e.g. replace "w+" with "w+b" , replace "r" with "rb" and so on.
Your program runs just fine on my Mac.
The results would look better if printMatrix() output a final newline. Perhaps the unterminated line is causing some sort of confusion on your system?