Unix HEAD command implementation in C fails on larger lines - c

I am currently implementing the Unix HEAD command with C and using only system functions. So far, it works perfectly on files, which have lines with less length than the one that I specified for my buffer:
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#define LINES_TO_READ 10
#define BUFF_SIZE 4096
int main(int argc, char const *argv[]) {
for (ssize_t i = 1; i < argc; i++) {
const char *filename = argv[i];
int fd = open(filename, O_RDONLY);
if (fd < 0) {
perror("open");
return -1;
}
char ch, buffer[BUFF_SIZE];
size_t index = 0, lines = 1;
ssize_t rresult, wresult;
// Read the file byte by byte
while ((rresult = read(fd, &ch, 1)) != 0) {
if (rresult < 0) {
perror("read");
return -1;
}
// Check if the current character is a new line (the line ends here)
if (ch == '\n') {
buffer[index] = ch;
buffer[index + 1] = '\0';
ch = 0;
index = 0;
// Print the line
wresult = 0;
ssize_t buffer_length = strlen(buffer);
while (wresult != buffer_length) {
ssize_t res = write(STDOUT_FILENO, buffer + wresult, buffer_length - wresult);
if (wresult < 0) {
perror("write");
return -1;
}
wresult += res;
}
// Stop if we read 10 lines already
if (lines == LINES_TO_READ) {
break;
}
lines++;
} else {
buffer[index++] = ch;
}
}
if (close(fd) < 0) {
perror("close");
return -1;
}
}
return 0;
}
And it works on files, which have a line length with less than BUFF_SIZE (as now set, 4096).
How to avoid this and make it work for whatever the line length is?

Don't read one byte at a time. Read a chunk (4096 or 8192 bytes are reasonable sizes, or use PIPE_BUF (from limits.h)) into a buffer. Output each character while counting newlines. If you print enough newlines, terminate. If you reach the end of the buffer and haven't printed enough lines, read more data into the buffer.

Related

How do I read from a file and output specific strings in c

I'm writing a program that will read from /etc/passwd and output the username and shell.
For example, here is the first line of the /etc/passwd file:
root:x:0:0:root:/root:/bin/bash
I need to only output the user and the shell. In this instance it would print:
root:/bin/bash
The values are separated by ':' so I just need to print the string before the first ':' and the string after the 6th ':'
Here is the code I have so far:
#include <string.h>
#define BUFFERSIZE 4096
int printf(const char *text, ...);
int main(void) {
int fd;
int buff_size = 1;
char buff[BUFFERSIZE];
int size;
fd = open("/etc/passwd", O_RDONLY);
if (fd < 0) {
printf("Error opening file \n");
return -1;
}
size = strlen(buff - 17);
size = size + 1;
while ((size = read(fd, buff, 1)) > 0) {
buff[1] = '\0';
write(STDOUT_FILENO, buff, size);
}
}
(I am creating prototypes for printf because one of the requirements was to write the program without including <stdio.h> or <stdlib.h>)
Another approach is to use a single loop and a state variable to track the state of where you are in each line based on the number of colons read. The state-variable ncolon does that below. Essentially you read every character and check whether the loop is in a state where you should write the character as output or not. You condition the write on the number of colons, whether you are before the 1st or after the last.
Putting it altogether, you could do:
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
int main (int argc, char **argv) {
int fd, /* file descriptor */
ofd = STDOUT_FILENO, /* output file descriptor */
ncolon = 0; /* counter - number of colons seen */
/* open file given on command line or read from stdin otherwise */
if ((fd = argc > 1 ? open (argv[1], O_RDONLY) : STDIN_FILENO) == -1) {
return 1;
}
for (;;) { /* loop continually */
unsigned char c; /* storage for character */
int rtn; /* var to save return */
if ((rtn = read (fd, &c, 1)) < 1) { /* validate read of 1 char */
if (rtn == -1) { /* return on error */
return 1;
}
break; /* break read loop on EOF */
}
if (ncolon < 1 || ncolon == 6) { /* if before 1st or after last */
write (ofd, &c, 1); /* output char */
}
if (c == '\n') { /* reset ncolon on newline */
ncolon = 0;
}
else if (c == ':') { /* increment on colon */
ncolon += 1;
}
}
if (fd != STDIN_FILENO) { /* close file */
close (fd);
}
}
Example Use/Output
$ ./read_etc-passwd /etc/passwd
root:/bin/bash
messagebus:/usr/bin/false
systemd-network:/usr/sbin/nologin
systemd-timesync:/usr/sbin/nologin
nobody:/bin/bash
mail:/usr/sbin/nologin
chrony:/usr/sbin/nologin
...
Confirm the Format
$ diff <(./read_etc-passwd /etc/passwd) <(awk -F: '{print $1":"$7}' /etc/passwd)
(no output means program output and awk output were identical)
Your program has undefined behavior when you evaluate strlen(buff - 17). It is unclear why you do this.
You can solve the problem with these simple steps:
read one byte at a time
count the ':' on the line
output the byte if the count is equal to 0 or equal to 6.
reset the count at newline (and print the newline)
Note that read(fd, &b, 1) and write(1, &b, 1) return -1 in case of error or interruption and should be restarted if errno is EINTR.
Here is a modified version:
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
int main(void) {
int fd;
unsigned char b;
int count;
ssize_t ret;
fd = open("/etc/passwd", O_RDONLY);
if (fd < 0) {
write(2, "Error opening /etc/password\n", 28);
return 1;
}
count = 0;
for (;;) {
ret = read(fd, &b, 1);
if (ret == 0) { // end of file
break;
}
if (ret < 0) { // error
if (errno == EINTR)
continue;
write(2, "Read error on /etc/password\n", 28);
return 1;
}
if (b == '\n') {
// reset count, print b
count = 0;
} else
if (b == ':') {
// increment count, print ':' only if count == 1
count = count + 1;
if (count != 1)
continue;
} else
if (count != 0 && count != 6) {
// print b only if count is 0 or 6
continue;
}
for (;;) {
ret = write(1, &b, 1);
if (ret == 1)
break;
if (ret < 0 && errno = EINTR)
continue;
write(2, "Write error\n", 12);
return 1;
}
}
close(fd);
return 0;
}

C mmap implementation for linux command

I have the following code which basically reproduce the functionality of the wc command in linux. My question is how I can rewrite the code using mmap? I know I can use struct stat sb; and then char *file_in_memory = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); but I can't get it work/ I don't know how to implement it correctly in the while loop while ((n = read(file, buffer, LUNG_BUF - 1)) > 0). In my tries after I run the code it will display only values of 0.
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#define LUNG_BUF 4096
int main(int argc, char** argv)
{
int bytes = 0;
int words = 0;
int newLine = 0;
int max_value; // the maximum of the above three
int dim; // string width of the max value
char buffer[LUNG_BUF];
enum states { WHITESPACE, WORD };
int state = WHITESPACE;
if ( argc !=2 )
{
printf( "No file name\n%s", argv[0]);
}
else
{
int file = open(argv[1], O_RDONLY);
if(file < 0)
{
printf("can not open :%s\n",argv[1]);
}
else
{
char *thefile = argv[1];
size_t n;
while ((n = read(file, buffer, LUNG_BUF - 1)) > 0)
{
buffer[n] = '\0';
char *ptr = buffer;
while (*ptr)
{
bytes++;
if (*ptr == ' ' || *ptr == '\t')
{
state = WHITESPACE;
}
else if (*ptr == '\n')
{
newLine++;
state = WHITESPACE;
}
else
{
if (state == WHITESPACE)
{
words++;
}
state = WORD;
}
ptr++;
}
}
// find out the largest value of all and determine the printed width of it
max_value = newLine;
if (words > max_value)
max_value = words;
if (bytes > max_value)
max_value = bytes;
dim = snprintf(NULL, 0, "%d", max_value);
// print lines, words, bytes and filename aligned to the longest number
printf("%*d %*d %*d %s\n", dim, newLine, dim, words, dim, bytes, thefile);
}
}
}
The script that I was trying:
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#define LUNG_BUF 4096
int main(int argc, char** argv)
{
int bytes = 0;
int words = 0;
int newLine = 0;
int max_value; // the maximum of the above three
int dim; // string width of the max value
char buffer[LUNG_BUF];
enum states { WHITESPACE, WORD };
int state = WHITESPACE;
if ( argc !=2 )
{
printf( "No file name\n%s", argv[0]);
}
else
{
int file = open(argv[1], O_RDONLY);
if(file < 0)
{
printf("can not open :%s\n",argv[1]);
}
else
{
char *thefile = argv[1];
size_t n;
struct stat sb;
char *file_in_memory = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
for(int i=0;i<=sb.st_size;i++)
{
buffer[i] = '\0';
char *ptr = buffer;
while (*ptr)
{
bytes++;
if (*ptr == ' ' || *ptr == '\t')
{
state = WHITESPACE;
}
else if (*ptr == '\n')
{
newLine++;
state = WHITESPACE;
}
else
{
if (state == WHITESPACE)
{
words++;
}
state = WORD;
}
ptr++;
}
}
// find out the largest value of all and determine the printed width of it
max_value = newLine;
if (words > max_value)
max_value = words;
if (bytes > max_value)
max_value = bytes;
dim = snprintf(NULL, 0, "%d", max_value);
// print lines, words, bytes and filename aligned to the longest number
printf("%*d %*d %*d %s\n", dim, newLine, dim, words, dim, bytes, thefile);
munmap(file_in_memory, sb.st_size);
close(file);
}
}
}
The code you posted above didn't compile and had quite a few problems. I've tidied it up a bit below, hopefully this will help. I tried not to change it too much so you could see what I did.
You hadn't actually called stat and the fd variable you had passed to mmap was not the variable you used to open the file.
I would always compile your code with "-Wall -Werror" if you can.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
int main(int argc, char** argv)
{
if ( argc !=2 )
{
printf( "No file name\n%s", argv[0]);
exit(-1);
}
char *fileName = argv[1];
int file = open(fileName, O_RDONLY);
if(file < 0)
{
perror("Error: ");
exit(-1);
}
struct stat sb = {0};
stat(fileName, &sb);
char *filePtr = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, file, 0);
if (filePtr == MAP_FAILED)
{
perror("Error:");
exit(-1);
}
int bytes = sb.st_size;
int words = 0;
int newLine = 0;
enum states { WHITESPACE, WORD };
int state = WHITESPACE;
for(size_t pos=0;pos<=sb.st_size;pos++)
{
if (state == WHITESPACE)
{
if (filePtr[pos] == '\n')
{
newLine++;
}
else if ((filePtr[pos] != ' ') && (filePtr[pos] != '\t'))
{
state = WORD;
}
}
else // (state == WORD)
{
if (filePtr[pos] == ' ' || filePtr[pos] == '\t')
{
state = WHITESPACE;
words++;
}
else if (filePtr[pos] == '\n')
{
state = WHITESPACE;
words++;
newLine++;
}
}
}
// Max value is always bytes
int dim = snprintf(NULL, 0, "%d", bytes);
// print lines, words, bytes and filename aligned to the longest number
printf("%*d %*d %*d %s\n", dim, newLine, dim, words, dim, bytes, fileName);
munmap(filePtr, sb.st_size);
close(file);
}

find longest string in a file with linux kernel function and write in second file [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 4 years ago.
Improve this question
hi i want to Write a command that reads a string from a file using the Linux kernel functions, and then writes the string with the largest length in the second file./ its in c /
But I have a problem with the fact that first, the program takes the string from the input, not from the file that is in the system, and the second is that it displays the output instead of the second file.
I did a lot of work but I could not do it if you can correct the code for me
/* Trivial file copy program using low-level I/O */
#include <fcntl.h>
#include <stdlib.h>
#define BSIZE 16384
#include <stdio.h>
#include <string.h>
#include <ctype.h>
void main()
{
int fin, fout; /* Input and output handles */
char buf[BSIZE];
int count;
if ((fin = open("foo", O_RDONLY)) < 0) {
perror("foo");
exit(1);
}
if ((fout = open("bar", O_WRONLY | O_CREAT, 0644)) < 0) {
perror("bar");
exit(2);
}
while ((count = read(fin, buf, BSIZE)) > 0)
{
char string[100], word[20], max[20], min[20], c;
int i = 0, j = 0, flag = 0;
printf("Enter string: ");
i = 0;
do
{
fflush(stdin);
c = getchar();
string[i++] = c;
} while (c != '\n');
string[i - 1] = '\0';
for (i = 0; i < strlen(string); i++)
{
while (i < strlen(string) && !isspace(string[i]) && isalnum(string[i]))
{
word[j++] = string[i++];
}
if (j != 0)
{
word[j] = '\0';
if (!flag)
{
flag = !flag;
strcpy(max, word);
strcpy(min, word);
}
if (strlen(word) > strlen(max))
{
strcpy(max, word);
}
if (strlen(word) < strlen(min))
{
strcpy(min, word);
}
j = 0;
}
}
printf("The largest word is '%s' and smallest word is '%s' in '%s'.\n", max, min, string);
return 0;
}
write(fout, buf, count);
close(fin);
close(fout);
}
There are several issues here, but you're basically reading from standard input when you shouldn't be. In fact more than half of that code is unnecessary.
I have a small example here, that sticks to the wording of your original question and at least shows what part of the code needs cutting:
/* Find the longest word in a text file (foo) and write that word out to another file (bar) */
/* A word is a group of characters delimited by white space */
/* A word is shorter than 16384 characters */
/* We are trying to use low-level system functions (e.g. open, close, read, write) */
/* instead of standard library functions (e.g. fopen, fclose, fread, fwrite) */
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define BSIZE 16384
/* read a single word from a file */
static int read_word(int fd, char *buf)
{
int i = 0;
char c = 0;
int bytes = read(fd, &c, 1); /* extract a single character */
while (isspace(c) && bytes != 0) /* skip whitespace */
bytes = read(fd, &c, 1);
while (!isspace(c) && bytes != 0) {
buf[i++] = c; /* store character */
bytes = read(fd, &c, 1);
}
return i; /* return number of characters read */
}
int main()
{
int fin = open("foo", O_RDONLY);
if (fin < 0) {
perror("foo");
exit(1);
}
int fout = open("bar", O_WRONLY | O_CREAT, 0644);
if (fout < 0) {
perror("bar");
exit(2);
}
char buf[BSIZE] = {0};
char largest[BSIZE] = {0};
int bytes = 0;
int count = 0;
do {
bytes = read_word(fin, buf); /* read next word */
if (bytes > count) {
strncpy(largest, buf, BSIZE - 1); /* preserve largest word seen so far */
count = bytes;
}
memset(buf, 0, BSIZE);
} while (bytes > 0);
write(fout, largest, count); /* write largest word to file */
close(fin);
close(fout);
printf ("\nLargest word found was %s # %d characters length.\n", largest, count);
return 0;
}

How to get a line from csv file with a custom fgets

I'm currently writing a program in C that reads in from a CSV file, I have a defined buffer size but am having trouble separating each line from the buffer. I can see where the line ends by checking for a '\n' char. I cannot extract that line from the buffer for parsing however. Anybody have some ideas?
#ifndef BUFFSIZE
#define BUFFSIZE 4096
#endif
int main() {
int fd;
int fdBin;
char * buf = malloc(BUFFSIZE);
int count = 0;
bool EOFFlag = false;
fd = open("SongCSV.csv", O_RDONLY);
fdBin = open("BinarySongData.bin", O_CREAT | O_WRONLY, "0600");
if (fd == -1) {
printf("failed to open a file\n");
exit(1);
}
off_t offset = 0;
off_t offsetOld = 0;
int readBytes;
while (!EOFFlag) {
offsetOld = offset;
offset = lseek(fd, offset - offsetOld, SEEK_CUR);
readBytes = read(fd, buf, BUFFSIZE);
printf("\n\n%lld\n\n", (offset));
int i = 0;
int commaCounter = 0;
while (i < readBytes) {
if (buf[i] != '\n') {
}
if (buf[i] == '\n') {
printf("\t\t THIS IS END OF LINE \t%d", i);
commaCounter = 0;
}
if (buf[i] == ',') {
commaCounter++;
if (commaCounter == 4) {
printf("****Album Name****");
}
}
write(fdBin, buf, BUFFSIZE);
printf("%c", buf[i]);
i++;
}
if (readBytes < BUFFSIZE) {
EOFFlag = true;
printf("\nREACHED END OF FILE");
}
printf("\n");
printf("AA: END OF LINE ****%d*****", count);
count++;
}
close(fd);
close(fdBin);
return 0;
}
I do it this way, easy and simple. I just did it quickly, any doubts just ask me, Cheers.
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
int main()
{
int len = sending();
char *firstline;
int i = 0;
char buf[0];
int rd ;
int fd = open("hey.csv", O_RDONLY);
rd = read(fd, buf, 1);
firstline = malloc(sizeof(char) * len);
while (i != len)
{
firstline[i] = buf[0];
i++;
rd = read(fd, buf, 1);
}
firstline[i] = '\0';
printf("%s\n", firstline);
return (0);
}
int sending()
{
int fd = open("hey.csv", O_RDONLY);
char buf[1];
int r = 0;
r = read(fd, buf, 1);
int len = 0;
while (buf[0] != '\n')//getting exact size to malloc
{
len++;
r = read(fd, buf, 1);
}
return len;
}

Linux C read file UNICODE formatted text (notepad Windows)

Is there a way to read a text file, under Linux with C, saved on Windows as "UNICODE" with notepad?
The text in Linux with nano editor looks like:
��T^#e^#s^#t^#
^#
but under vi editor is read properly as:
Test
I must specify the text is normal strings ANSI (no Unicode characters or foreign languages related).
Tried like this but no result:
#include <stdio.h>
#include <wchar.h>
#include <locale.h>
int main() {
char *loc = setlocale(LC_ALL, 0);
setlocale(LC_ALL, loc);
FILE * f = fopen("unicode.txt", "r");
wint_t c;
while((c = fgetwc(f)) != WEOF) {
wprintf(L"%lc\n", c);
}
return 0;
}
UPDATE:
Forgot to mention the file format is Little-endian UTF-16 Unicode text or UTF-16LE
Include <wchar.h>, set an UTF-8 locale (setlocale(LC_ALL, "en_US.UTF-8") is fine), open the file or stream in byte-oriented mode (handle=fopen(filename, "rb"), fwide(handle,-1), i.e. in not-wide mode). Then you can use
wint_t getwc_utf16le(FILE *const in)
{
int lo, hi, code, also;
if ((lo = getc(in)) == EOF)
return WEOF;
if ((hi = getc(in)) == EOF)
return lo; /* Or abort; input sequence ends prematurely */
code = lo + 256 * hi;
if (code < 0xD800 || code > 0xDBFF)
return code; /* Or abort; input sequence is not UTF16-LE */
if ((lo = getc(in)) == EOF)
return code; /* Or abort; input sequence ends prematurely */
if ((hi = getc(in)) == EOF) {
ungetc(lo, in);
return code; /* Or abort; input sequence ends prematurely */
}
/* Note: if ((lo + 256*hi) < 0xDC00 || (lo + 256*hi) > 0xDFFF)
* the input sequence is not valid UTF16-LE. */
return 0x10000 + ((code & 0x3FF) << 10) + ((lo + 256 * hi) & 0x3FF);
}
to read code points from such an input file, assuming it contains UTF16-LE data.
The above function is more permissive than strictly necessary, but it does parse all UTF16-LE I could throw at it (including the sometimes problematic U+100000..U+10FFFF code points), so if the input is correct, this function should handle it just fine.
Because the locale is set to UTF-8 in Linux, and Linux implementations support the full Unicode set, the code points match the ones produced by above functions, and you can safely use wide character functions (from <wchar.h>) to handle the input.
Often the first character in the file is BOM, "byte-order mark", 0xFEFF. You can ignore it if it is the first character in the file. Elsewhere it is the zero-width non-breaking space. In my experience, those two bytes at the start of a file that is supposed to be text, is quite reliable indicator that the file is UTF16-LE. (So, you could peek at the first two bytes, and if they match those, assume it is UTF16-LE.)
Remember that wide-character end-of-file is WEOF, not EOF.
Hope this helps.
Edited 20150505: Here is a helper function one could use instead, to read inputs (using low-level unistd.h interface), converting to UTF-8: read_utf8.h:
#ifndef READ_UTF8_H
#define READ_UTF8_H
/* Read input from file descriptor fd,
* convert it to UTF-8 (using "UTF8//TRANSLIT" iconv conversion),
* and appending to the specified buffer.
* (*dataptr) points to a dynamically allocated buffer (may reallocate),
* (*sizeptr) points to the size allocated for that buffer,
* (*usedptr) points to the amount of data already in the buffer.
* You may initialize the values to NULL,0,0, in which case they will
* be dynamically allocated as needed.
*/
int read_utf8(char **dataptr, size_t *sizeptr, size_t *usedptr, const int fd, const char *const charset);
#endif /* READ_UTF8_H */
read_utf8.c:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <iconv.h>
#include <string.h>
#include <errno.h>
#define INPUT_CHUNK 16384
#define OUTPUT_CHUNK 8192
int read_utf8(char **dataptr, size_t *sizeptr, size_t *usedptr, const int fd, const char *const charset)
{
char *data;
size_t size;
size_t used;
char *input_data;
size_t input_size, input_head, input_tail;
int input_more;
iconv_t conversion = (iconv_t)-1;
if (!dataptr || !sizeptr || !usedptr || fd == -1 || !charset || !*charset)
return errno = EINVAL;
if (*dataptr) {
data = *dataptr;
size = *sizeptr;
used = *usedptr;
if (used > size)
return errno = EINVAL;
} else {
data = NULL;
size = 0;
used = 0;
}
conversion = iconv_open("UTF8//TRANSLIT", charset);
if (conversion == (iconv_t)-1)
return errno = ENOTSUP;
input_size = INPUT_CHUNK;
input_data = malloc(input_size);
if (!input_data) {
if (conversion != (iconv_t)-1)
iconv_close(conversion);
errno = ENOMEM;
return 0;
}
input_head = 0;
input_tail = 0;
input_more = 1;
while (1) {
if (input_tail > input_head) {
if (input_head > 0) {
memmove(input_data, input_data + input_head, input_tail - input_head);
input_tail -= input_head;
input_head = 0;
}
} else {
input_head = 0;
input_tail = 0;
}
if (input_more && input_tail < input_size) {
ssize_t n;
do {
n = read(fd, input_data + input_tail, input_size - input_tail);
} while (n == (ssize_t)-1 && errno == EINTR);
if (n > (ssize_t)0)
input_tail += n;
else
if (n == (ssize_t)0)
input_more = 0;
else
if (n != (ssize_t)-1) {
free(input_data);
iconv_close(conversion);
return errno = EIO;
} else {
const int errcode = errno;
free(input_data);
iconv_close(conversion);
return errno = errcode;
}
}
if (input_head == 0 && input_tail == 0)
break;
if (used + OUTPUT_CHUNK > size) {
size = (used / (size_t)OUTPUT_CHUNK + (size_t)2) * (size_t)OUTPUT_CHUNK;
data = realloc(data, size);
if (!data) {
free(input_data);
iconv_close(conversion);
return errno = ENOMEM;
}
*dataptr = data;
*sizeptr = size;
}
{
char *source_ptr = input_data + input_head;
size_t source_len = input_tail - input_head;
char *target_ptr = data + used;
size_t target_len = size - used;
size_t n;
n = iconv(conversion, &source_ptr, &source_len, &target_ptr, &target_len);
if (n == (size_t)-1 && errno == EILSEQ) {
free(input_data);
iconv_close(conversion);
return errno = EILSEQ;
}
if (source_ptr == input_data + input_head && target_ptr == data + used) {
free(input_data);
iconv_close(conversion);
return errno = EDEADLK;
}
input_head = (size_t)(source_ptr - input_data);
used = (size_t)(target_ptr - data);
*usedptr = used;
}
}
free(input_data);
iconv_close(conversion);
if (used + 16 >= size) {
size = (used | 15) + 17;
data = realloc(data, size);
if (!data)
return errno = ENOMEM;
*dataptr = data;
*sizeptr = size;
memset(data + used, 0, size - used);
} else
if (used + 32 < size)
memset(data + used, 0, size - used);
else
memset(data + used, 0, 32);
return errno = 0;
}
and an example program, example.c, on how to use it:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include "read_utf8.h"
int main(int argc, char *argv[])
{
char *file_buffer = NULL;
size_t file_allocd = 0;
size_t file_length = 0;
int fd;
if (argc != 3 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
fprintf(stderr, " %s FILENAME CHARSET\n", argv[0]);
fprintf(stderr, " %s FILENAME CHARSET//IGNORE\n", argv[0]);
fprintf(stderr, "\n");
return EXIT_FAILURE;
}
do {
fd = open(argv[1], O_RDONLY | O_NOCTTY);
} while (fd == -1 && errno == EINTR);
if (fd == -1) {
fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
return EXIT_FAILURE;
}
if (read_utf8(&file_buffer, &file_allocd, &file_length, fd, argv[2])) {
if (errno == ENOTSUP)
fprintf(stderr, "%s: Unsupported character set.\n", argv[2]);
else
fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
return EXIT_FAILURE;
}
errno = EIO;
if (close(fd)) {
fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
return EXIT_FAILURE;
}
fprintf(stderr, "%s: read %zu bytes, allocated %zu.\n", argv[1], file_length, file_allocd);
if (file_length > 0)
if (fwrite(file_buffer, file_length, 1, stdout) != 1) {
fprintf(stderr, "Error writing to standard output.\n");
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
This lets you read (either into an empty, dynamically allocated buffer, or append to an existing dynamically allocated buffer) using any character set supported by your system (use iconv --list to see the list), auto-converting the contents to UTF-8.
It uses a temporary input buffer (of INPUT_CHUNK bytes) to read the file part by part, and reallocates the output buffer in multiples of OUTPUT_CHUNK bytes, keeping at least OUTPUT_CHUNK bytes available for each conversion. The constants may need a bit of tuning for different use cases; they're by no means optimal or even suggested values. Larger ones lead to faster code, especially for INPUT_CHUNK, as most filesystems perform better when reading large chunks (2097152 is suggested size currently, if I/O performance is important) -- but you should have OUTPUT_CHUNK at similar size, or perhaps twice that, to reduce the number of reallocations needed. (You can trim the resulting buffer afterwards, to used+1 bytes, using realloc(), to avoid memory waste.)

Resources