I am processing a *.fastq.gz file that looks something like this:
#NB501139:187:H2Y5LBGXB:1:11101:17094:1060 2:N:0:CTTTGCGG
AGAGGATCCGTGTGANANNNGANNNCNNCCGNCTNNTANNAGATCACTTAGNNANNNNACAGCAGAAAANNNNNNNNNACAAGGTTGAAANTNTNTNN
+
A/AA///E/////EE#A###//###A##EEE#AE##EA##EE//<AE<EE<##/####/EE</EAE<//#########/E///</EE<<<#E#A#E##
#NB501139:187:H2Y5LBGXB:1:11101:3442:1060 2:N:0:CTTTGCGG
ACTGAGTCACGCACCNANNNCCNNNCNGCCGNCANNGCNNTGCACCGGTGGNCTNNNNTGTGTACTGAGNNTNNNNNNCATGCACACAGANTNCTCNN
+
AAAAAEEEEE/EEE6#<###EE###E#EEE/#AE##<E##EEEEEE/EEEA#EE####EAEEEEEEE/<##A######EEE/E<E/EEE/#A#AEA##
The file could be larger than RAM, so I process it block by block using file streaming, then output the result. By block, I mean 4 lines are a block.
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>

#define MAX_LINE_LENGTH 128

typedef struct fastq
{
    char *id;
    char *seq;
    char *qual;
} fastq;

/* Free a record and its members. */
void free_fastq(fastq *fq)
{
    free(fq->id);
    free(fq->seq);
    free(fq->qual);
    free(fq);
}

/* Read one 4-line record; return NULL at end of file. */
fastq *get_fastq(gzFile file)
{
    fastq *out = malloc(sizeof(fastq));
    out->id = malloc(MAX_LINE_LENGTH);
    out->seq = malloc(MAX_LINE_LENGTH);
    out->qual = malloc(MAX_LINE_LENGTH);
    char temp[MAX_LINE_LENGTH];

    if (gzgets(file, out->id, MAX_LINE_LENGTH) != NULL)
    {
        gzgets(file, out->seq, MAX_LINE_LENGTH);
        gzgets(file, temp, MAX_LINE_LENGTH);   /* the "+" separator line */
        gzgets(file, out->qual, MAX_LINE_LENGTH);
    }
    else
    {
        free_fastq(out);
        return NULL;
    }
    return out;
}
To limit the number of I/O calls, I am looking for a way to read 4 lines at a time rather than calling gzgets() 4 times per block. I'm not sure whether this will actually save time; any suggestion is welcome.
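One thing worth knowing before restructuring the reads: gzgets() pulls lines out of zlib's in-memory decompression buffer, so four gzgets() calls per record do not mean four disk reads. If the per-call overhead still matters, a minimal sketch (assuming zlib 1.2.4 or newer, where gzbuffer() is available, and reusing get_fastq() and free_fastq() from above) is to enlarge that internal buffer and keep the simple 4-call loop:

#include <stdio.h>
#include <zlib.h>

int main(void)
{
    gzFile file = gzopen("reads.fastq.gz", "rb");   /* hypothetical input name */
    if (file == NULL)
    {
        fprintf(stderr, "cannot open input\n");
        return 1;
    }

    /* Enlarge zlib's internal buffer (default 8192 bytes) so the underlying
       read() calls fetch the compressed stream in bigger chunks. */
    gzbuffer(file, 1 << 20);   /* 1 MiB */

    fastq *rec;
    while ((rec = get_fastq(file)) != NULL)
    {
        /* ... process one 4-line record ... */
        free_fastq(rec);
    }

    gzclose(file);
    return 0;
}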
I'm new to the world of the C programming language and I was trying to read an image as a binary file. Unfortunately, when I tried to read the file I got an incomplete sequence ("\377\330\377", <incomplete sequence \340>).
I've already tried to use fread reading only the size of a byte, and now I'm trying to use sizeof(unsigned char), but I always get the same incomplete sequence. Here is a part of the code:
#include <stdio.h>
#include <stdlib.h>

/* Struct inferred from the usage below (the full definition is elsewhere). */
typedef struct caratteri {
    unsigned char *first;
    unsigned char *last;
    long index;
} caratteri;

caratteri **createStruct(unsigned char c[], caratteri car[], caratteri *ptr[], long size) {
    for (long i = 0; i < size; i++) {
        ptr[i] = &car[i];
        car[i].first = &c[i];
        car[i].last = &c[(size - 1 + i) % size];
        car[i].index = i;
    }
    return ptr;
}

int compare(const void *a, const void *b) {
    caratteri *ca = *(caratteri **) a;
    caratteri *cb = *(caratteri **) b;
    unsigned char *c1 = ca->first;
    unsigned char *c2 = cb->first;
    while (*c1 - *c2 == 0) {
        c1++;
        c2++;
    }
    return (*c1 - *c2);
}

caratteri **bwt(long size, FILE *file) {
    FILE *risultato;
    unsigned char *c = malloc(sizeof(unsigned char) * size);
    fread(c, sizeof(unsigned char), size, file);
    caratteri *car = malloc(sizeof(caratteri) * size);
    caratteri **pCaratteri = malloc(sizeof(caratteri *) * size);
    pCaratteri = createStruct(c, car, pCaratteri, size);
    qsort(pCaratteri, size, sizeof(caratteri *), compare);
    risultato = fopen("risultato", "wb");
    for (long i = 0; i < size; i++)
        fputc(*pCaratteri[i]->last, risultato);
    fclose(risultato);
    return pCaratteri;
}

int main() {
    FILE *file = fopen("thumbnail.jpg", "rb");
    if (file == NULL) {
        printf("Errore di apertura file!");
        exit(2);
    }
    fseek(file, 0L, SEEK_END);   /* offset first, then whence */
    long size = ftell(file) + 1;
    rewind(file);
    caratteri **car = bwt(size, file);
    FILE *risultato;
    decryptbwt(risultato);       /* defined elsewhere (not shown) */
    return 0;
}
This is not the full code, but I need this part to work properly.
Also, I need this code to work with every kind of file (.txt, .jpg, .png, etc.), and I need a \0 character at the end in the case of a text file; that's why I used ftell(file)+1, but I'm not sure this is appropriate. Anyway, how can I read a file in binary mode while avoiding the problem of incomplete sequences?
Thank you for your answers, and sorry for the dumb question, but I'm new to this world.
I need the extra \0 because I'm using an algorithm that needs an end-of-string character; that's why I'm using ftell with a +1. The algorithm should work with every kind of file, so I need to read the file correctly and then apply the Burrows-Wheeler transform to sort the data I want to compress. Also, I'm not sure I am using fseek the right way, but I don't think there is another way to get the size of the file.
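For reference, the "\377\330\377", <incomplete sequence \340> output is just how the debugger displays raw JPEG bytes that are not valid text; it does not by itself mean the read failed. A minimal sketch of reading a whole file in binary mode and checking how many bytes actually arrived could look like this (the file name and the hex dump are only for illustration):

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    FILE *file = fopen("thumbnail.jpg", "rb");
    if (file == NULL) {
        perror("fopen");
        return 1;
    }

    /* Find the file size. */
    fseek(file, 0L, SEEK_END);
    long size = ftell(file);
    rewind(file);

    /* One extra byte for the '\0' terminator the algorithm needs. */
    unsigned char *data = malloc(size + 1);
    size_t got = fread(data, 1, size, file);
    fclose(file);
    data[got] = '\0';   /* note: binary data may itself contain '\0' bytes */

    printf("read %zu of %ld bytes\n", got, size);
    for (size_t i = 0; i < 16 && i < got; i++)
        printf("%02x ", data[i]);   /* printed as hex, nothing looks "incomplete" */
    printf("\n");

    free(data);
    return 0;
}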
I have multiple TIFF images stored inside a zip file and would like to read their pixel values in C. I am very new to C, so please forgive the dodgy code. I have got to the point where I have a char * with the contents of the TIFF file, but I can't work out how to process it with libtiff (or something similar). libtiff seems to require that I pass TIFFOpen a filename to open. I could write the TIFF to a temporary file, but it feels like there must be a more efficient way.
So far I have:
#include <stdlib.h>
#include <stdio.h>
#include <zip.h>
#include <string.h>

int main()
{
    // Open the ZIP archive
    int err = 0;
    struct zip *z = zip_open("test.zip", 0, &err);

    // Determine how many files are inside and iterate through them
    int num_files = zip_get_num_entries(z, 0);
    printf("%d\n", num_files);
    int i;
    for (i = 0; i < num_files; i++)
    {
        const char *filename = zip_get_name(z, i, 0);

        // If the file name ends in .tif
        if (strlen(filename) > 4 && !strcmp(filename + strlen(filename) - 4, ".tif"))
        {
            printf("%s\n", filename);

            // Get information about the file
            struct zip_stat st;
            zip_stat_init(&st);
            zip_stat(z, filename, 0, &st);
            printf("%lld\n", st.size);

            // Allocate memory for the decompressed contents
            char *contents = malloc(st.size);

            // Read the file
            struct zip_file *f = zip_fopen(z, filename, 0);
            zip_fread(f, contents, st.size);
            zip_fclose(f);

            // Do something with the contents

            // Free memory
            free(contents);
        }
    }

    // And close the archive
    zip_close(z);
    return 0;
}
EDIT: My question is similar to this one, but the accepted answer there relates to C++ and I'm not sure how to translate it to plain C.
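One way to avoid the temporary file is libtiff's TIFFClientOpen(), which accepts I/O callbacks instead of a filename. The following is a minimal sketch, assuming libtiff 4.x (where the callbacks use thandle_t, tmsize_t and toff_t); the mem_stream struct and read_tiff_from_memory() are names invented for the example, and the buffer passed in is the one filled by zip_fread() above:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <tiffio.h>

// An in-memory "file" handed to libtiff through TIFFClientOpen()
typedef struct {
    unsigned char *data;
    toff_t size;
    toff_t pos;
} mem_stream;

static tmsize_t mem_read(thandle_t h, void *buf, tmsize_t len)
{
    mem_stream *s = (mem_stream *) h;
    if ((toff_t) len > s->size - s->pos)
        len = (tmsize_t) (s->size - s->pos);
    memcpy(buf, s->data + s->pos, (size_t) len);
    s->pos += len;
    return len;
}

static tmsize_t mem_write(thandle_t h, void *buf, tmsize_t len)
{
    (void) h; (void) buf; (void) len;
    return 0;   // read-only stream
}

static toff_t mem_seek(thandle_t h, toff_t off, int whence)
{
    mem_stream *s = (mem_stream *) h;
    switch (whence) {
        case SEEK_SET: s->pos = off; break;
        case SEEK_CUR: s->pos += off; break;
        case SEEK_END: s->pos = s->size + off; break;
    }
    return s->pos;
}

static int mem_close(thandle_t h) { (void) h; return 0; }
static toff_t mem_size(thandle_t h) { return ((mem_stream *) h)->size; }

// Call this with the buffer filled by zip_fread() instead of writing a temp file
static void read_tiff_from_memory(char *contents, size_t size)
{
    mem_stream s = { (unsigned char *) contents, (toff_t) size, 0 };
    TIFF *tif = TIFFClientOpen("memory", "r", (thandle_t) &s,
                               mem_read, mem_write, mem_seek,
                               mem_close, mem_size, NULL, NULL);
    if (tif != NULL) {
        uint32_t width = 0, height = 0;
        TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width);
        TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height);
        printf("%u x %u\n", (unsigned) width, (unsigned) height);
        // ... read pixels, e.g. with TIFFReadScanline() or TIFFReadRGBAImage() ...
        TIFFClose(tif);
    }
}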
For a course assignment I have to implement the Huffman compression & decompression algorithm, first in the classic serial way, and then try to parallelize it using various methods (OpenMP, MPI, pthreads). The goal of the project is not necessarily to make it faster, but to analyze the results and discuss why they turn out the way they do.
The serial version works perfectly. However, for the parallel version I am stuck on a file-reading problem. In the serial version, I have a piece of code that looks like this:
char *buffer = calloc(1, MAX_BUFF_SZ);

while ((bytes_read = fread(buffer, 1, MAX_BUFF_SZ, input)) > 0) {
    compress_chunk(buffer, t, output);
    memset(buffer, 0, MAX_BUFF_SZ);
}
This reads at most MAX_BUFF_SZ bytes from the input file and then compresses them. I used the memset call for the case when bytes_read < MAX_BUFF_SZ (though a cleaner solution probably exists).
However, for the parallel version (using OpenMP, for example), I want each thread to analyze only a portion of the file, with the reading still done in chunks. Knowing that each thread has an id thread_id and there are at most total_threads threads, I calculate the start and end positions as follows:
int slice_size = (file_size + total_threads - 1) / total_threads;
int start = slice_size * thread_id;
int end = min((thread_id + 1) * slice_size, file_size);
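As a quick sanity check of that arithmetic, a throwaway example (with a made-up file_size of 100 bytes and 4 threads) shows the slices come out as half-open ranges [start, end):

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    int file_size = 100;       /* made-up size, just for the check */
    int total_threads = 4;
    int slice_size = (file_size + total_threads - 1) / total_threads;   /* 25 */

    for (int thread_id = 0; thread_id < total_threads; thread_id++) {
        int start = slice_size * thread_id;
        int end = min((thread_id + 1) * slice_size, file_size);
        printf("thread %d: [%d, %d)\n", thread_id, start, end);
    }
    /* prints [0,25), [25,50), [50,75), [75,100) */
    return 0;
}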
I can move to the start position with a simple fseek(input, start, SEEK_SET) call. However, I am not able to read the content in chunks correctly. I tried the following code (just to make sure the operation is okay):
int total_bytes = 0;
while ((bytes_read = fread(buffer, 1, MAX_BUFF_SZ, input)) > 0) {
    total_bytes += bytes_read;
    if (total_bytes >= end) {
        int diff = total_bytes - end;
        buffer[diff] = '\0';
        break;
    }
    fwrite(buffer, 1, bytes_read, output);
    memset(buffer, 0, MAX_BUFF_SZ);
}
output is a different file for each thread. Even when I try with just 2 threads, some characters are missing from the output files. I think I am close to the right solution and have something like an off-by-one error.
So the question is: how can I read a slice of a file, but in chunks? Can you please help me identify the bug in the above code and make it work?
Edit:
If MAX_BUFF_SZ is bigger than the size of the input and I have, for example, 4 threads, how should clean code look that ensures T0 does all the work while T1, T2 and T3 do nothing?
Some simple code that can be used to test the behavior is the following (note that it is not from the Huffman code; it is auxiliary code to test things):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>

#define MAX_BUFF_SZ 32

#define min(a, b) \
   ({ __typeof__ (a) _a = (a); \
      __typeof__ (b) _b = (b); \
      _a < _b ? _a : _b; })

int get_filesize(char *filename) {
    FILE *f = fopen(filename, "r");
    fseek(f, 0L, SEEK_END);
    int size = ftell(f);
    fclose(f);
    return size;
}

static void compress(char *filename, int id, int tt) {
    int total_bytes = 0;
    int bytes_read;
    char *newname;
    char *buffer;
    FILE *output;
    FILE *input;
    int fsize;
    int slice;
    int start;
    int end;

    newname = (char *) malloc(strlen(filename) + 16);   /* room for "-", id and '\0' */
    sprintf(newname, "%s-%d", filename, id);

    fsize = get_filesize(filename);
    buffer = calloc(1, MAX_BUFF_SZ);
    input = fopen(filename, "r");
    output = fopen(newname, "w");

    slice = (fsize + tt - 1) / tt;
    end = min((id + 1) * slice, fsize);
    start = slice * id;
    fseek(input, start, SEEK_SET);

    while ((bytes_read = fread(buffer, 1, MAX_BUFF_SZ, input)) > 0) {
        total_bytes += bytes_read;
        printf("%s\n", buffer);
        if (total_bytes >= end) {
            int diff = total_bytes - end;
            buffer[diff] = '\0';
            break;
        }
        fwrite(buffer, 1, bytes_read, output);
        memset(buffer, 0, MAX_BUFF_SZ);
    }

    fclose(output);
    fclose(input);
}

int main() {
    omp_set_num_threads(4);

    #pragma omp parallel
    {
        int tt = omp_get_num_threads();
        int id = omp_get_thread_num();
        compress("test.txt", id, tt);
    }
}
You can compile it with gcc test.c -o test -fopenmp. You may generate a file test.txt with some random characters, more than 32 (or change the max buffer size).
Edit 2:
Again, my problem is reading a slice of a file in chunks, not the analysis per se. I know how to do that part. It's a university course; I can't just say "I/O bound, end of story, analysis complete".
Apparently I just had to take a pen and paper and sketch out the indices. After playing around with them, I came up with the following code (encbuff and written_bits are auxiliary variables I use, since I am actually writing bits to a file and use an intermediate buffer to limit the writes):
while ((bytes_read = fread(buffer, 1, MAX_BUFF_SZ, input)) > 0) {
    total_bytes += bytes_read;
    if (start + total_bytes > end) {
        int diff = start + total_bytes - end;
        buffer[bytes_read - diff] = '\0';
        compress_chunk(buffer, t, output, encbuff, &written_bits);
        break;
    }
    compress_chunk(buffer, t, output, encbuff, &written_bits);
    memset(buffer, 0, MAX_BUFF_SZ);
}
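An equivalent way to write this, sketched below with the same variable names, is to cap each fread() at the number of bytes left in the slice; this also answers the edit above, because a thread whose slice is empty (start >= end) never reads anything. It assumes buffer holds MAX_BUFF_SZ + 1 bytes so it can always be NUL-terminated, and compress_chunk(), t, encbuff and written_bits are the same names as in the loop above:

/* buffer is assumed to be calloc(1, MAX_BUFF_SZ + 1) so it can always be
   NUL-terminated after a read. */
int total_bytes = 0;
int remaining = end - start;   /* 0 for a thread whose slice is empty */

fseek(input, start, SEEK_SET);
while (remaining > 0 &&
       (bytes_read = fread(buffer, 1, min(MAX_BUFF_SZ, remaining), input)) > 0) {
    remaining -= bytes_read;
    total_bytes += bytes_read;
    buffer[bytes_read] = '\0';
    compress_chunk(buffer, t, output, encbuff, &written_bits);
}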
I also finished implementing the OpenMP version. For small files the serial one is faster, but starting from about 25 MB the parallel one starts to beat the serial one by about 35-45%. Thank you all for the advice.
Cheers!
First of all, I'm not very familiar with C; I come from Java, C#, C++... and I have possibly carried habits over from those languages into this exercise. I have the following question; here is my code:
#include <stdio.h>
#include <stdlib.h>

void decrypt(unsigned long* v, unsigned long* k);

const int MAX = 32;
const long delta = 0x9e3779b9;
long sum=0xC6EF3720;

int main() {
    FILE *fp;
    FILE *destino;
    unsigned long v[2];
    unsigned long k[4] = { 128, 129, 130, 131 };
    unsigned long tam=0;
    char* buffer;
    char* aux[sizeof(unsigned long)];
    int i;

    if ((fp = fopen("image.png", "rb")) == NULL) {
        printf ("Error! \n ");
        return 0;
    }
    else {
        fread(&aux,sizeof(unsigned long),1,fp);
        memcpy(&tam,&aux,sizeof(unsigned long));
        buffer = (char*)malloc(tam);
        //fread(&buffer,1,tam,fp);
        char *buffer2[28568];
        fread(&buffer2,1,28568,fp);
        /*for(i = 0;i < tam;++i) {
            printf("%c", ((char *)buffer2)[i]);
        }*/
        for(i=4;i<tam;i+=8) {
            memcpy(&v,&buffer2[i],8);
            decrypt(&v,&k);
        }
        if ((result= fopen("image2.png", "rb")) == NULL) {
            printf ("Error! \n ");
            return 0;
        }
        else {
            fwrite(v,sizeof(unsigned long)*2,1,result);
            fclose (result);
            fclose(fp);
        }
    }
    return 0;
}

void decrypt(unsigned long* v, unsigned long* k) {
    int i=0;
    while(i<MAX) {
        v[1] = v[1] -((4 << v[0])+(k[2]^v[0])+(sum^(5 >> v[0]))+k[3]);
        v[0] = v[0] -((4 << v[1])+(k[0]^v[1])+(sum^(5 >> v[1]))+k[1]);
        sum = sum-delta;
        i++;
    }
}
tam is the size of my binary file (an image in this case): the first 4 bytes of the file (an unsigned long) hold the size of the PNG data (28568).
When I create my char* buffer I have to allocate it dynamically with malloc, but when I do another fread from the file I get "No source available for "msvcrt!memcpy() at 0xrandom_memory_address"" in Eclipse while debugging. So I commented out that line and tried to do it manually, declaring a new buffer2 with 28568 as the size of the array. That apparently works: iterating over buffer2 prints ASCII character values. But when I call decrypt to decrypt the image, the final result ends up in the v array, which I then have to copy into a new file. I searched for how to create an empty PNG image in C but didn't find anything, so I made a copy of my encrypted image and called it "image2.png", but I suppose that is not a clean solution, and in any case it is not working at all.
To explain the exercise a bit more: the decrypt function works on blocks of 8 bytes (64 bits). Using a key (the k array) it performs a series of operations and stores the result back into the v array itself. The loop walks the buffer 8 bytes at a time, copying each block into v; after the loop we have the result in v, and all that is left is to copy it into a new file, which should finally contain the decrypted image.
It's a very complex exercise for a newbie in C, and it's driving me crazy trying to figure out what I'm doing wrong.
I hope someone can see what I'm not able to see for now.
I think you are having problems with the declarations of the buffers. I think the correct code should be:
FILE *fp;
FILE *destino;
unsigned long v[2];
unsigned long k[4] = { 128, 129, 130, 131 };
unsigned long tam=0;
char* buffer;
char aux[sizeof(unsigned long)]; // without the "*"
int i;

if ((fp = fopen("image.png", "rb")) == NULL) {
    printf ("Error! \n ");
    return 0;
}
else {
    fread(aux,sizeof(unsigned long),1,fp);
    memcpy(&tam,aux,sizeof(unsigned long));
    buffer = (char*)malloc(tam);
    //fread(buffer,1,tam,fp); // without the "&" in this case
    char buffer2[28568]; // without the "*"
    fread(buffer2,1,28568,fp); // or fread(buffer,1,tam,fp);
    /*for(i = 0;i < tam;++i) {
        printf("%c", buffer2[i]); // or buffer[i] if you change to use it again
    }*/
    for(i=4;i<tam;i+=8) {
        memcpy(v,&buffer2[i],8);
        decrypt(v,k);
    }
    ...
I don't fully understand what you are trying to accomplish, but one problem is here:
char* aux[sizeof(unsigned long)];
// ... some code ...
fread(&aux,sizeof(unsigned long),1,fp);
Understand that char* aux[sizeof(unsigned long)]; declares an array of pointers (effectively a double pointer once it decays), but fread()'s prototype states that the destination is a plain pointer to the buffer being filled:
size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
so what you should be doing instead is:
char aux[sizeof(unsigned long)];
// ... some code ...
fread(aux,sizeof(unsigned long),1,fp);
Don't complicate things that are not complicated!
You also make this mistake in other parts of your code, so you need to re-check everything, OK? Again:
char *buffer2[28568];
fread(&buffer2,1,28568,fp);
should be:
char buffer2[28568];
fread(buffer2, 1, 28568, fp);
// or: fread(buffer2, 1, sizeof(buffer2), fp);
There are some interesting tutorials on pointers and arrays; I suggest you read a few of them.
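To tie the corrections together, here is a rough sketch of the overall read-decrypt-write flow built on the fixes above. It keeps the original assumptions (the first unsigned long of the file holds the payload size, and unsigned long is 4 bytes as in the poster's build), calls the poster's decrypt() unchanged, and writes every decrypted 8-byte block to the output file rather than only the last one:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* decrypt() is the poster's function, declared here and defined elsewhere. */
void decrypt(unsigned long *v, unsigned long *k);

int main(void) {
    unsigned long v[2];
    unsigned long k[4] = { 128, 129, 130, 131 };
    unsigned long tam = 0;

    FILE *fp = fopen("image.png", "rb");
    if (fp == NULL) {
        printf("Error!\n");
        return 1;
    }

    /* First field of the file: payload size, stored as an unsigned long
       (mirrors the original code's assumption about the file format). */
    fread(&tam, sizeof(unsigned long), 1, fp);

    unsigned char *buffer = malloc(tam);        /* plain byte buffer, no '*' */
    size_t got = fread(buffer, 1, tam, fp);
    fclose(fp);

    FILE *result = fopen("image2.png", "wb");   /* "wb": this file is written */
    if (result == NULL) {
        printf("Error!\n");
        free(buffer);
        return 1;
    }

    /* Walk the payload 8 bytes at a time, decrypt each block in place,
       and write it out immediately so no block is lost.  Like the original,
       this assumes unsigned long is 4 bytes, so v[2] spans exactly 8 bytes. */
    for (size_t i = 0; i + 8 <= got; i += 8) {
        memcpy(v, &buffer[i], 8);
        decrypt(v, k);
        fwrite(v, 8, 1, result);
    }

    fclose(result);
    free(buffer);
    return 0;
}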