Read from buffer C - c

I am trying to create a simple c program that strips the HTML from a webpage and keeps the text. So far i have come up with the code below. It uses cURL to get the contents of the webpage and write it to a file. How do i go through the memory buffer and remove all HTML tags and output to text to either the terminal or a file?
#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#define WEBPAGE_URL "http://homepages.paradise.net.nz/adrianfu/index.html"
#define DESTINATION_FILE "/home/acwest/data.txt"
size_t write_data( void *ptr, size_t size, size_t nmeb, void *stream)
{
return fwrite(ptr,size,nmeb,stream);
}
int main()
{
int in_tag = 0;
char * buffer;
char c;
long lSize;
size_t result;
FILE * file = fopen(DESTINATION_FILE,"w+");
if (file==NULL) {
fputs ("File error",stderr);
exit (1);
}
CURL *handle = curl_easy_init();
curl_easy_setopt(handle,CURLOPT_URL,WEBPAGE_URL); /*Using the http protocol*/
curl_easy_setopt(handle,CURLOPT_WRITEFUNCTION, write_data);
curl_easy_setopt(handle,CURLOPT_WRITEDATA, file);
curl_easy_perform(handle);
curl_easy_cleanup(handle);
// obtain file size:
fseek (file, 0, SEEK_END);
lSize = ftell (file);
rewind (file);
// allocate memory to contain the whole file:
buffer = (char*) malloc (sizeof(char)*lSize);
if (buffer == NULL) {
fputs ("Memory error",stderr);
exit (2);
}
// copy the file into the buffer:
result = fread (buffer,1,lSize,file);
if (result != lSize) {
fputs ("Reading error",stderr);
exit (3);
}
}

Curl will not help you with parsing HTML, and it is a complicated task. You can read the language specification and write a parser. There's an open source C++ project at http://www.mbayer.de/html2text/ or a python script at https://github.com/aaronsw/html2text. You can also install and use html2text from the command-line or execute it from your c code.

Related

Storing .raw File Data as a Pointer Using C

I am attempting to read a '.raw' file which stores the contents of an image that was taken on a camera using C. I would like to store these contents into a uint16_t *.
In the following code I attempt to store this data into a pointer, using fread(), and then write this data into a test file, using fwrite(), to check if my data was correct.
However, when I write the file back it is completely black when I check it.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#define MAX_ROW 2560
#define MAX_COL 2160
int main()
{
char filename[32] = "image1.raw";
FILE * image_raw = fopen(filename, "rb");
fseek(image_raw, 0, 2);
long filesize = ftell(image_raw);
/*READ IMAGE DATA*/
uint16_t * image_data_ptr;
image_data_ptr = (uint16_t *)malloc(sizeof(uint16_t)*MAX_ROW*MAX_COL);
fread(image_data_ptr, sizeof(uint16_t), filesize, image_raw);
fclose(image_raw);
/*TEST WRITING THE SAME DATA BACK INTO TEST RAW FILE*/
FILE *fp;
fp = fopen("TEST.raw", "w");
fwrite(image_data_ptr, sizeof(uint16_t), filesize, fp);
fclose(fp);
return 0;
}
There are multiple issues with your code:
lack of error handling.
not seeking the input file back to offset 0 after seeking it to get its size. Consider using stat() or equivalent to get the file size without having to seek the file at all.
not dividing filesize by sizeof(uint16_t) when reading from the input file, or writing to the output file. filesize is expressed in bytes, but fread/fwrite are expressed in number of items of a given size instead, and your items are not 1 byte in size.
not opening the output file in binary mode.
leaking the buffer you allocate.
With that said, try something more like this instead:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
int main()
{
char filename[32] = "image1.raw";
FILE *image_raw = fopen(filename, "rb");
if (!image_raw) {
fprintf(stderr, "Can't open input file\n");
return -1;
}
if (fseek(image_raw, 0, SEEK_END) != 0) {
fprintf(stderr, "Can't seek input file\n");
fclose(image_raw);
return -1;
}
long filesize = ftell(image_raw);
if (filesize == -1L) {
fprintf(stderr, "Can't get input file size\n");
fclose(image_raw);
return -1;
}
rewind(image_raw);
long numSamples = filesize / sizeof(uint16_t);
/*READ IMAGE DATA*/
uint16_t *image_data_ptr = (uint16_t*) malloc(filesize);
if (!image_data_ptr) {
fprintf(stderr, "Can't allocate memory\n");
fclose(image_raw);
return -1;
}
size_t numRead = fread(image_data_ptr, sizeof(uint16_t), numSamples, image_raw);
if (numRead != numSamples) {
fprintf(stderr, "Can't read samples from file\n");
free(image_data_ptr);
fclose(image_raw);
return -1;
}
fclose(image_raw);
/*TEST WRITING THE SAME DATA BACK INTO TEST RAW FILE*/
FILE *fp = fopen("TEST.raw", "wb");
if (!fp) {
fprintf(stderr, "Can't open output file\n");
free(image_data_ptr);
return -1;
}
if (fwrite(image_data_ptr, sizeof(uint16_t), numSamples, fp) != numSamples) {
fprintf(stderr, "Can't write to output file\n");
fclose(fp);
free(image_data_ptr);
return -1;
}
fclose(fp);
free(image_data_ptr);
return 0;
}
You have already a great answer and useful comments
anyway, consider that if you want to iterate over your file, loaded in memory as a whole, as an array of unsigned words:
if the file size could be odd what to do at the last byte/word
you may read the file as a whole in a single call, after having the file size determined
fstat() is the normal way to get the file size
get the file name from the command line as an argument is much more flexible than recompile the program or change the file name in order to use the program
The code below does just that:
uses image.raw as a default for the file name, but allowing you to enter the file name on the command line
uses fstat() to get the file size
uses a single fread() call to read the entire file as a single record
A test using the original program file as input:
Mode LastWriteTime Length Name
---- ------------- ------ ----
-a---- 20/07/2021 17:40 1067 main.c
PS > gcc -Wall -o tst main.c
PS > ./tst main.c
File is "main.c". Size is 1067 bytes
File "main.c" loaded in memory.
PS > ./tst xys
File is "xys". Could not open: No such file or directory
The C example
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
int main(int argc, char**argv)
{
const char* default_file = "image.raw";
char f_name[256];
if (argc < 2)
strcpy(f_name, default_file);
else
strcpy(f_name, argv[1]);
FILE* F = fopen(f_name, "rb");
if (F == NULL)
{
printf("File is \"%s\". ", f_name);
perror("Could not open");
return -1;
}
struct stat info;
fstat(_fileno(F),&info);
printf("File is \"%s\". Size is %lu bytes\n", f_name, info.st_size);
uint16_t* image = malloc(info.st_size);
if (image == NULL)
{ perror("malloc() error");
return -2;
};
if (fread(image, info.st_size, 1, F) != 1)
{ perror("read error");
free(image);
return -3;
};
// use 'image'
printf("File \"%s\" loaded in memory.\n", f_name);
free(image);
fclose(F);
return 0;
}

C program to open binary elf files, read from them, and print them out (like objcopy)

I am trying to implement a functionality similar to objcopy where bytes of a binary file (specifically the .text section) will be printed out using open() and read(). How would I set the buffer sizes and iterate till the end of a .text section so that I don't read more bytes than I have to in order to avoid errors?
Here is how you read a file using open() and read().
P.S I used fopen() and fread() instead of open() and read() because I am currently working with a Windows machine. However, the results will be the same for either.
int main()
{
FILE *file = fopen("input.txt", "r");
char buffer[2048];
if (file)
{
/* Loop will continue until an end of file is reached i.e. fread returns 0 elements read */
while (fread(buffer, 4, 1, file) == 1)
{
printf("%s", buffer);
}
fclose(file);
}
}
Update: For interpreting ELF files specifically, I would recommend taking a look at the following resources:
Check out the following code snippet. It shows how you can interpret an ELF file.
#include <stdio.h>
#include <libelf.h>
#include <stdlib.h>
#include <string.h>
static void failure(void);
void main(int argc, char **argv)
{
Elf32_Shdr *shdr;
Elf32_Ehdr *ehdr;
Elf *elf;
Elf_Scn *scn;
Elf_Data *data;
int fd;
unsigned int cnt;
/* Open the input file */
if ((fd = open(argv[1], O_RDONLY)) == -1)
exit(1);
/* Obtain the ELF descriptor */
(void)elf_version(EV_CURRENT);
if ((elf = elf_begin(fd, ELF_C_READ, NULL)) == NULL)
failure();
/* Obtain the .shstrtab data buffer */
if (((ehdr = elf32_getehdr(elf)) == NULL) ||
((scn = elf_getscn(elf, ehdr->e_shstrndx)) == NULL) ||
((data = elf_getdata(scn, NULL)) == NULL))
failure();
/* Traverse input filename, printing each section */
for (cnt = 1, scn = NULL; scn = elf_nextscn(elf, scn); cnt++)
{
if ((shdr = elf32_getshdr(scn)) == NULL)
failure();
(void)printf("[%d] %s\n", cnt,
(char *)data->d_buf + shdr->sh_name);
}
} /* end main */
static void
failure()
{
(void)fprintf(stderr, "%s\n", elf_errmsg(elf_errno()));
exit(1);
}
I would also recommend checking out the elfutils library, which can be found here.

Sending .amr files to SIM900 over serial in c

I'm trying to send .amr files from my desktop to a SIM900 GSM module over UART.
I'm using teuniz's RS232 library.
I do the initialisation using AT commands and then read the file into a buffer and write it to the UART using the RS232_SendByte() library function byte-by-byte, but it doesn't seem to work.
I send the following AT commands:
AT+CFSINIT
AT+CFSWFILE=\"audio.amr\",0,6694,13000 # After which I get the CONNECT message from the SIM900 module
# Here's where I send the file
AT+CFSGFIS=\"audio.amr\"
Here's my code:
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include "rs232.h"
char *readFile(char *filename, int *size) {
char *source = NULL;
FILE *fp = fopen(filename, "rb");
if (fp != NULL) {
/* Go to the end of the file. */
if (fseek(fp, 0L, SEEK_END) == 0) {
/* Get the size of the file. */
long bufsize = ftell(fp);
if (bufsize == -1) { return NULL; }
/* Allocate our buffer to that size. */
source = malloc(sizeof(char) * (bufsize + 1));
if(!source) return NULL;
/* Go back to the start of the file. */
if (fseek(fp, 0L, SEEK_SET) != 0) { return NULL; }
/* Read the entire file into memory. */
size_t newLen = fread(source, sizeof(char), bufsize, fp);
if ( ferror( fp ) != 0 ) {
fputs("Error reading file", stderr);
free(source);
return NULL;
} else {
source[newLen++] = 0; /* Just to be safe. */
}
*size = bufsize;
}
fclose(fp);
}
return source;
}
int main(int argc, char *argv[])
{
int ret = 0, cport_nr = 2, bdrate=38400;
char data[2000] = {0};
if(RS232_OpenComport(cport_nr, bdrate)) {
printf("Can not open comport\n");
ret = -1;
goto END;
}
int size;
unsigned char *filebuf = readFile("audio.amr", &size);
if (!filebuf) {
ret = -1;
goto END_1;
}
/* Initialization */
RS232_cputs(cport_nr, "AT");
RS232_cputs(cport_nr, "AT+CFSINIT");
sleep(1);
RS232_cputs(cport_nr, "AT+CFSWFILE=\"audio.amr\",0,6694,13000");
/* Wait for CONNECT */
sleep(2);
printf("Sending file of size: %d\n", size);
int i;
for (i = 0; i < size; ++i) {
putchar(filebuf[i]);
RS232_SendByte(cport_nr, filebuf[i]);
}
free(filebuf);
sleep(1);
/* Check if file transferred right */
RS232_cputs(cport_nr, "AT+CFSGFIS=\"audio.amr\"");
END_1:
RS232_CloseComport(cport_nr);
END:
return ret;
}
EDIT 1
Normally, the procedure to send a file to SIM900 using AT commands would be as documented here:
AT+CFSINIT # Initialize flash; Response is OK
AT+CFSWFILE=<filename>,<writeMode>,<fileSize>,<InputTime> # Write file with these parameter; Response is CONNECT; So this is when I start sending the file
Here's where I send the file. If it worked and the sent file size matched the <filesize> sent in the above command, SIM900 must respond with OK, which it doesn't. :(
AT+CFSGFIS=<filename> # Gives the file size on flash. This gives me an error since the file didn't upload correctly.
This leads me to beleive there's something wrong with my program. I'm reading the file in binary mode. And the size reported is exacty the same as I specify in the AT+CFSWFILE=<filename>,<writeMode>,<fileSize>,<InputTime> command.

How to Calculate MD5 of xls file in C language

I have made many researches about MD5 of an xls file but my effort seems be in vain
I tried to used lirary and recommendation in this link "https://stackoverflow.com/questions/27858288/calculate-md5-for-a-file-in-c-language"
but , still give wrong result ,
can you help me ??
Well I used to answer the link you gave but then the question was closed.
The idea is as follows. First read the file into a buffer. You can do this using following function:
unsigned char * readFile(const char *path)
{
FILE * pFile;
long lSize;
unsigned char * buffer;
size_t result;
pFile = fopen (path , "rb" );
if (pFile==NULL) {fputs ("File error",stderr); exit (1);}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
lSize = ftell (pFile);
rewind (pFile);
// allocate memory to contain the whole file:
buffer = malloc (sizeof(char)*lSize);
if (buffer == NULL) {fputs ("Memory error",stderr); exit (2);}
// copy the file into the buffer:
result = fread (buffer,1,lSize,pFile);
if (result != lSize) {fputs ("Reading error",stderr); exit (3);}
// terminate
fclose (pFile);
return buffer;
}
Read the file
unsigned char * data = readFile("c:\\file.xls");
Then you must apply MD5 on this buffer of data. You can use code similar
to the one in that question (though I am not sure which library/implementation
of md5 author of that question used). e.g.,
char hash[64] = {0};
md5_byte_t digest[16] = {0};
md5_init(&state);
md5_append(&state, (const md5_byte_t *)data, filesize);
md5_finish(&state,digest);
int i=0;
for(i; i<16; i++)
{
snprintf(hash + i*2,sizeof(hash),"%02x",digest[i]);
}
Now hash should store the hash of the file, encoded in hexadecimal string. ps. Indeed that sample is incorrectly using strlen with binary file. That is why I suggested the readFile method above; that function also contains code to get file size - you can use that code to get file size and then pass the file size to md5_append method.
ps. also don't forget to free data when you are done with it.
MD5 of xls file is very same of MD5 of any other kind of file since it operates on bytes. See by example openssl implementation openssl/crypto/md5/md5.c and md5test.c ( code is in git://git.openssl.org/openssl.git ).
The problem is that your example uses strlen to determine the file size. But .xls format is binary, so strlen will not work properly.
Adapt the function to return the total data read from the file, and it should work.
Edit. Try something like this code:
void *addr;
struct stat s;
int ret, fd;
ret = stat(filename, &s);
if (ret) {
fprintf(stderr, "Error while stat()ing file: %m\n");
return -1;
}
fd = open(filename, O_RDONLY);;
if (fd < 0) {
fprintf(stderr, "Error while opening file: %m\n");
return -1;
}
addr = mmap(NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (addr == MAP_FAILED) {
fprintf(stderr, "Error while mapping file: %m\n");
close(fd);
return -1;
}
md5_init(&state);
md5_append(&state,addr, s.st_size);
md5_finish(&state,digest);

Segmentation Fault Error over Writing the Twitter Stream on a Text File

I have a C program that opens a stream of tweets from Twitter API. What this program aims to do is to open a stream and write the stream to a text file. This program is successful when it just prints the stream in the terminal but when I change the code to write to a file there's now a "segmentation fault" error for about 40 seconds into the execution. size_t writefunc(void *ptr, size_t size, size_t nmemb, struct string *s) is the callback function that writes the stream to a file particularly at
fp=fopen("istream.txt", "a");
fprintf(fp, "%s", s->ptr);
fclose(fp);
Why is there a "segmentation fault" error? How should I fix this? I guess I am using the file pointer the wrong way.
==================================
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
struct string {
char *ptr;
size_t len;
};
void init_string(struct string *s) {
s->len = 0;
s->ptr = malloc(s->len+1);
if (s->ptr == NULL) {
fprintf(stderr, "malloc() failed\n");
exit(EXIT_FAILURE);
}
s->ptr[0] = '\0';
}
size_t writefunc(void *ptr, size_t size, size_t nmemb, struct string *s)
{
size_t new_len = s->len + size*nmemb;
size_t max_buffer = 10240;
FILE *fp;
fp=fopen("istream.txt", "a"); // <------------- the ERROR! Remove this.
s->ptr = realloc(s->ptr, new_len+1);
if (s->ptr == NULL) {
fprintf(stderr, "realloc() failed\n");
exit(EXIT_FAILURE);
}
memcpy(s->ptr+s->len, ptr, size*nmemb);
s->ptr[new_len] = '\0';
s->len = new_len;
if( s->len >= max_buffer )
{
fp=fopen("istream.txt", "a");
fprintf(fp, "%s", s->ptr);
fclose(fp);
fflush( stdout );
free(s->ptr);
init_string( s );
}
return size*nmemb;
}
int main(void)
{
CURL *curl;
CURLcode res;
curl = curl_easy_init();
if(curl) {
struct string s;
init_string(&s);
curl_easy_setopt(curl, CURLOPT_URL, "https://stream.twitter.com/1/statuses/sample.json");
//curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
curl_easy_setopt(curl, CURLOPT_USERPWD, "neilmarion:password");
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writefunc);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &s);
res = curl_easy_perform(curl);
//printf("%s\n", s.ptr);
free(s.ptr);
/* always cleanup */
curl_easy_cleanup(curl);
}
return 0;
}
Most things looks correct in your program.
Most likely if fp=fopen("istream.txt", "a"); fails to open the file, it could be a segfault in the consecutive line.
Try to print the string in stderr instead to see. If that works, than issue is indeed in fp.
Other comment:
Since you are opening the file in append mode, there is actually no point in collecting the data in the buffer and keep re-allocating the buffer. You can at once append the file.
Every time you enter writefunc you open istream.txt:
FILE *fp;
fp=fopen("istream.txt", "a");
but you never call fclose on the fp. You do have this:
fp=fopen("istreaam.txt", "a");
fprintf(fp, "%s", s->ptr);
fclose(fp);
later on but the second fopen means that you no longer have a reference to the what the first fopen returned.
Eventually, you will run out of open files and further fopen calls will fail. Apparently this takes about 40 seconds. So get rid of the first fopen and try again.

Resources