Linux C read file UNICODE formatted text (notepad Windows) - c

Is there a way to read a text file, under Linux with C, saved on Windows as "UNICODE" with notepad?
The text in Linux with nano editor looks like:
��T^#e^#s^#t^#
^#
but under vi editor is read properly as:
Test
I must specify the text is normal strings ANSI (no Unicode characters or foreign languages related).
Tried like this but no result:
#include <stdio.h>
#include <wchar.h>
#include <locale.h>
/* Question's attempt: read "unicode.txt" with the wide-character stdio
 * functions and print every character read on its own line. */
int main() {
/* Passing a null locale string only queries the current locale name. */
char *loc = setlocale(LC_ALL, 0);
/* Re-applies the name that was just queried. */
setlocale(LC_ALL, loc);
/* NOTE(review): the fopen() result is used below without a NULL check. */
FILE * f = fopen("unicode.txt", "r");
wint_t c;
/* fgetwc() signals end of input or an error with WEOF (not EOF). */
while((c = fgetwc(f)) != WEOF) {
wprintf(L"%lc\n", c);
}
return 0;
}
UPDATE:
Forgot to mention the file format is Little-endian UTF-16 Unicode text or UTF-16LE

Include <wchar.h>, set a UTF-8 locale (setlocale(LC_ALL, "en_US.UTF-8") is fine), and open the file or stream in byte-oriented mode (handle=fopen(filename, "rb"), fwide(handle,-1), i.e. not in wide mode). Then you can use
/* Read one code point from a byte-oriented stream holding UTF-16LE data.
 *
 * in: stream opened in byte (not wide) mode.
 *
 * Returns the decoded code point, or WEOF when no byte could be read at all.
 * The decoder is deliberately permissive:
 *  - a single trailing byte is returned as-is;
 *  - a 16-bit unit outside the high-surrogate range 0xD800..0xDBFF
 *    (including a lone low surrogate) is returned unchanged;
 *  - a high surrogate that is cut short by end of input is returned unchanged;
 *  - the unit following a high surrogate is NOT checked to be a low surrogate.
 */
wint_t getwc_utf16le(FILE *const in)
{
    int lo, hi, code;

    if ((lo = getc(in)) == EOF)
        return WEOF;

    if ((hi = getc(in)) == EOF)
        return lo; /* Or abort; input sequence ends prematurely */

    /* Little-endian: the low byte comes first. */
    code = lo + 256 * hi;

    if (code < 0xD800 || code > 0xDBFF)
        return code; /* Or abort; input sequence is not UTF16-LE */

    /* High surrogate: a second 16-bit unit is needed to complete the pair. */
    if ((lo = getc(in)) == EOF)
        return code; /* Or abort; input sequence ends prematurely */

    if ((hi = getc(in)) == EOF) {
        /* Only one byte of the second unit exists; give it back to the stream. */
        ungetc(lo, in);
        return code; /* Or abort; input sequence ends prematurely */
    }

    /* Note: if ((lo + 256*hi) < 0xDC00 || (lo + 256*hi) > 0xDFFF)
     *       the input sequence is not valid UTF16-LE. */

    /* Combine the low 10 bits of each surrogate and add the 0x10000 offset. */
    return 0x10000 + ((code & 0x3FF) << 10) + ((lo + 256 * hi) & 0x3FF);
}
to read code points from such an input file, assuming it contains UTF16-LE data.
The above function is more permissive than strictly necessary, but it does parse all UTF16-LE I could throw at it (including the sometimes problematic U+100000..U+10FFFF code points), so if the input is correct, this function should handle it just fine.
Because the locale is set to UTF-8 in Linux, and Linux implementations support the full Unicode set, the code points match the ones produced by above functions, and you can safely use wide character functions (from <wchar.h>) to handle the input.
Often the first character in the file is the BOM ("byte-order mark"), 0xFEFF. You can ignore it if it is the first character in the file; elsewhere it is the zero-width non-breaking space. In my experience, finding those two bytes at the start of a file that is supposed to be text is quite a reliable indicator that the file is UTF16-LE. (So, you could peek at the first two bytes, and if they match, assume the file is UTF16-LE.)
Remember that wide-character end-of-file is WEOF, not EOF.
Hope this helps.
Edited 20150505: Here is a helper function one could use instead, to read inputs (using low-level unistd.h interface), converting to UTF-8: read_utf8.h:
#ifndef READ_UTF8_H
#define READ_UTF8_H
/* Read input from file descriptor fd until end of input,
 * convert it from the character set named by charset to UTF-8
 * (using the "UTF8//TRANSLIT" iconv conversion),
 * and append the result to the specified buffer.
 *
 * dataptr points to the pointer to a dynamically allocated buffer
 *         (the buffer may be reallocated, so *dataptr may change),
 * sizeptr points to the size allocated for that buffer,
 * usedptr points to the amount of data already in the buffer.
 * You may initialize the pointed-to values to NULL,0,0, in which case
 * the buffer will be dynamically allocated as needed.
 *
 * On return errno is 0 on success or an error code on failure, and the
 * function returns that same value.
 * NOTE(review): confirm that every failure path in the implementation
 * returns nonzero.
 */
int read_utf8(char **dataptr, size_t *sizeptr, size_t *usedptr, const int fd, const char *const charset);
#endif /* READ_UTF8_H */
read_utf8.c:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <iconv.h>
#include <string.h>
#include <errno.h>
/* Size of the temporary buffer used for each read() from the descriptor. */
#define INPUT_CHUNK 16384
/* The output buffer grows in multiples of this, and at least this much
 * room is available before every iconv() call. */
#define OUTPUT_CHUNK 8192

/* Read everything from fd, convert it from charset to UTF-8 and append the
 * result to a dynamically allocated buffer.
 *
 * dataptr: address of the buffer pointer; *dataptr may be NULL to start
 *          with no buffer, and may be changed by reallocation.
 * sizeptr: address of the allocated size of *dataptr.
 * usedptr: address of the number of bytes already in use in *dataptr.
 * fd:      descriptor to read from (must not be -1).
 * charset: non-empty iconv name of the input character set.
 *
 * Returns 0 on success, otherwise a nonzero errno code; errno is set to the
 * same value. Codes produced here: EINVAL (bad arguments), ENOTSUP
 * (iconv_open failed), ENOMEM, EIO or the read() error, EILSEQ (invalid
 * input sequence), EDEADLK (conversion made no progress, e.g. the input ends
 * in an incomplete sequence).
 *
 * On success the bytes after the data, up to the allocated size, are zero,
 * and there are always at least 17 of them. On failure *dataptr remains a
 * valid buffer (or NULL) that the caller still owns.
 */
int read_utf8(char **dataptr, size_t *sizeptr, size_t *usedptr, const int fd, const char *const charset)
{
    char *data;
    size_t size;
    size_t used;
    char *input_data;
    size_t input_size, input_head, input_tail;
    int input_more;
    iconv_t conversion;

    if (!dataptr || !sizeptr || !usedptr || fd == -1 || !charset || !*charset)
        return errno = EINVAL;

    if (*dataptr) {
        data = *dataptr;
        size = *sizeptr;
        used = *usedptr;
        if (used > size)
            return errno = EINVAL;
    } else {
        data = NULL;
        size = 0;
        used = 0;
    }

    conversion = iconv_open("UTF8//TRANSLIT", charset);
    if (conversion == (iconv_t)-1)
        return errno = ENOTSUP;

    input_size = INPUT_CHUNK;
    input_data = malloc(input_size);
    if (!input_data) {
        iconv_close(conversion);
        /* Report the failure in the return value too, like every other path. */
        return errno = ENOMEM;
    }

    input_head = 0;
    input_tail = 0;
    input_more = 1;

    while (1) {

        /* Move any unconverted input to the start of the input buffer. */
        if (input_tail > input_head) {
            if (input_head > 0) {
                memmove(input_data, input_data + input_head, input_tail - input_head);
                input_tail -= input_head;
                input_head = 0;
            }
        } else {
            input_head = 0;
            input_tail = 0;
        }

        /* Top up the input buffer, retrying reads interrupted by a signal. */
        if (input_more && input_tail < input_size) {
            ssize_t n;

            do {
                n = read(fd, input_data + input_tail, input_size - input_tail);
            } while (n == (ssize_t)-1 && errno == EINTR);

            if (n > (ssize_t)0) {
                input_tail += (size_t)n;
            } else if (n == (ssize_t)0) {
                input_more = 0;
            } else if (n != (ssize_t)-1) {
                free(input_data);
                iconv_close(conversion);
                return errno = EIO;
            } else {
                const int errcode = errno;
                free(input_data);
                iconv_close(conversion);
                return errno = errcode;
            }
        }

        /* Nothing buffered and nothing more to read: done. */
        if (input_head == 0 && input_tail == 0)
            break;

        /* Make sure at least OUTPUT_CHUNK bytes of output room exist. */
        if (used + OUTPUT_CHUNK > size) {
            const size_t new_size = (used / (size_t)OUTPUT_CHUNK + (size_t)2) * (size_t)OUTPUT_CHUNK;
            char *const grown = realloc(data, new_size);
            if (!grown) {
                free(input_data);
                iconv_close(conversion);
                return errno = ENOMEM;
            }
            data = grown;
            size = new_size;
            *dataptr = data;
            *sizeptr = size;
        }

        {
            char *source_ptr = input_data + input_head;
            size_t source_len = input_tail - input_head;
            char *target_ptr = data + used;
            size_t target_len = size - used;
            size_t n;

            n = iconv(conversion, &source_ptr, &source_len, &target_ptr, &target_len);
            if (n == (size_t)-1 && errno == EILSEQ) {
                free(input_data);
                iconv_close(conversion);
                return errno = EILSEQ;
            }

            /* Neither input consumed nor output produced: we would loop forever. */
            if (source_ptr == input_data + input_head && target_ptr == data + used) {
                free(input_data);
                iconv_close(conversion);
                return errno = EDEADLK;
            }

            input_head = (size_t)(source_ptr - input_data);
            used = (size_t)(target_ptr - data);
            *usedptr = used;
        }
    }

    free(input_data);
    iconv_close(conversion);

    /* Guarantee at least 17 bytes of padding after the data. */
    if (used + 16 >= size) {
        const size_t new_size = (used | 15) + 17;
        char *const grown = realloc(data, new_size);
        if (!grown)
            return errno = ENOMEM;
        data = grown;
        size = new_size;
        *dataptr = data;
        *sizeptr = size;
    }

    /* Zero exactly the unused tail; never write beyond the allocation. */
    memset(data + used, 0, size - used);

    return errno = 0;
}
and an example program, example.c, on how to use it:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include "read_utf8.h"
/* Example driver: convert the file named by argv[1] from the character set
 * named by argv[2] to UTF-8 with read_utf8() and copy the result to standard
 * output. Diagnostics and the byte counts go to standard error. */
int main(int argc, char *argv[])
{
    char *text = NULL;
    size_t text_allocated = 0;
    size_t text_length = 0;
    int descriptor;

    /* Anything other than exactly two arguments, or a help flag, prints usage. */
    const int show_usage = (argc != 3) || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help");
    if (show_usage) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
        fprintf(stderr, " %s FILENAME CHARSET\n", argv[0]);
        fprintf(stderr, " %s FILENAME CHARSET//IGNORE\n", argv[0]);
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    /* Open the input, retrying when a signal interrupts the call. */
    for (;;) {
        descriptor = open(argv[1], O_RDONLY | O_NOCTTY);
        if (descriptor != -1 || errno != EINTR)
            break;
    }
    if (descriptor == -1) {
        fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
        return EXIT_FAILURE;
    }

    if (read_utf8(&text, &text_allocated, &text_length, descriptor, argv[2]) != 0) {
        if (errno != ENOTSUP)
            fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
        else
            fprintf(stderr, "%s: Unsupported character set.\n", argv[2]);
        return EXIT_FAILURE;
    }

    /* Default error code in case close() fails. */
    errno = EIO;
    if (close(descriptor) != 0) {
        fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
        return EXIT_FAILURE;
    }

    fprintf(stderr, "%s: read %zu bytes, allocated %zu.\n", argv[1], text_length, text_allocated);

    if (text_length > 0 && fwrite(text, text_length, 1, stdout) != 1) {
        fprintf(stderr, "Error writing to standard output.\n");
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
This lets you read (either into an empty, dynamically allocated buffer, or append to an existing dynamically allocated buffer) using any character set supported by your system (use iconv --list to see the list), auto-converting the contents to UTF-8.
It uses a temporary input buffer (of INPUT_CHUNK bytes) to read the file part by part, and reallocates the output buffer in multiples of OUTPUT_CHUNK bytes, keeping at least OUTPUT_CHUNK bytes available for each conversion. The constants may need a bit of tuning for different use cases; they're by no means optimal or even suggested values. Larger ones lead to faster code, especially for INPUT_CHUNK, as most filesystems perform better when reading large chunks (2097152 is suggested size currently, if I/O performance is important) -- but you should have OUTPUT_CHUNK at similar size, or perhaps twice that, to reduce the number of reallocations needed. (You can trim the resulting buffer afterwards, to used+1 bytes, using realloc(), to avoid memory waste.)

Related

How do I read from a file and output specific strings in c

I'm writing a program that will read from /etc/passwd and output the username and shell.
For example, here is the first line of the /etc/passwd file:
root:x:0:0:root:/root:/bin/bash
I need to only output the user and the shell. In this instance it would print:
root:/bin/bash
The values are separated by ':' so I just need to print the string before the first ':' and the string after the 6th ':'
Here is the code I have so far:
#include <string.h>
#define BUFFERSIZE 4096
int printf(const char *text, ...);
/* Question's attempt: copy /etc/passwd to standard output one byte at a
 * time. It does not yet select the user name and shell fields. */
int main(void) {
int fd;
int buff_size = 1;
char buff[BUFFERSIZE];
int size;
fd = open("/etc/passwd", O_RDONLY);
if (fd < 0) {
printf("Error opening file \n");
return -1;
}
/* NOTE(review): buff - 17 points before the start of buff, and buff is
 * uninitialized; the computed size is overwritten by the loop below. */
size = strlen(buff - 17);
size = size + 1;
/* Reads a single byte per iteration and writes back what was read. */
while ((size = read(fd, buff, 1)) > 0) {
buff[1] = '\0';
write(STDOUT_FILENO, buff, size);
}
}
(I am creating prototypes for printf because one of the requirements was to write the program without including <stdio.h> or <stdlib.h>)
Another approach is to use a single loop and a state variable to track the state of where you are in each line based on the number of colons read. The state-variable ncolon does that below. Essentially you read every character and check whether the loop is in a state where you should write the character as output or not. You condition the write on the number of colons, whether you are before the 1st or after the last.
Putting it all together, you could do:
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
/* Print the first and the seventh colon-separated field of every input line.
 * Input is the file named by argv[1], or standard input when no argument is
 * given. Returns 1 if the file cannot be opened or a read fails. */
int main (int argc, char **argv) {

    const int out = STDOUT_FILENO;  /* where selected bytes are written */
    int in = STDIN_FILENO;          /* where bytes are read from */
    int colons_seen = 0;            /* colons met so far on the current line */
    unsigned char byte;
    int got;

    if (argc > 1) {
        in = open (argv[1], O_RDONLY);
    }
    if (in == -1) {
        return 1;
    }

    /* One byte per iteration; the loop ends on end of input or on error. */
    while ((got = read (in, &byte, 1)) >= 1) {

        /* Before the first colon (which is itself emitted) or after the sixth. */
        if (colons_seen < 1 || colons_seen == 6) {
            write (out, &byte, 1);
        }

        switch (byte) {
        case '\n':
            colons_seen = 0;        /* a new line starts counting afresh */
            break;
        case ':':
            colons_seen += 1;
            break;
        default:
            break;
        }
    }

    if (got == -1) {                /* read error */
        return 1;
    }

    if (in != STDIN_FILENO) {
        close (in);
    }
    return 0;
}
Example Use/Output
$ ./read_etc-passwd /etc/passwd
root:/bin/bash
messagebus:/usr/bin/false
systemd-network:/usr/sbin/nologin
systemd-timesync:/usr/sbin/nologin
nobody:/bin/bash
mail:/usr/sbin/nologin
chrony:/usr/sbin/nologin
...
Confirm the Format
$ diff <(./read_etc-passwd /etc/passwd) <(awk -F: '{print $1":"$7}' /etc/passwd)
(no output means program output and awk output were identical)
Your program has undefined behavior when you evaluate strlen(buff - 17). It is unclear why you do this.
You can solve the problem with these simple steps:
read one byte at a time
count the ':' on the line
output the byte if the count is equal to 0 or equal to 6.
reset the count at newline (and print the newline)
Note that read(fd, &b, 1) and write(1, &b, 1) return -1 in case of error or interruption and should be restarted if errno is EINTR.
Here is a modified version:
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
/* Print "user:shell" for every line of /etc/passwd using only read()/write().
 *
 * Bytes are read one at a time. A byte is emitted when it belongs to the
 * first field (no colon seen yet) or to the seventh field (six colons seen),
 * when it is the first colon of a line, or when it is the newline.
 * Interrupted reads and writes (EINTR) are restarted.
 * Returns 0 on success, 1 on any open/read/write error.
 */
int main(void) {
    static const char open_error[] = "Error opening /etc/passwd\n";
    static const char read_error[] = "Read error on /etc/passwd\n";
    static const char write_error[] = "Write error\n";
    int fd;
    unsigned char b;
    int count;      /* number of ':' seen on the current line */
    ssize_t ret;

    fd = open("/etc/passwd", O_RDONLY);
    if (fd < 0) {
        write(2, open_error, sizeof open_error - 1);
        return 1;
    }

    count = 0;
    for (;;) {
        ret = read(fd, &b, 1);
        if (ret == 0) {     // end of file
            break;
        }
        if (ret < 0) {      // error
            if (errno == EINTR)
                continue;
            write(2, read_error, sizeof read_error - 1);
            return 1;
        }

        if (b == '\n') {
            // reset count, print b
            count = 0;
        } else if (b == ':') {
            // increment count, print ':' only if count == 1
            count = count + 1;
            if (count != 1)
                continue;
        } else if (count != 0 && count != 6) {
            // print b only if count is 0 or 6
            continue;
        }

        for (;;) {
            ret = write(1, &b, 1);
            if (ret == 1)
                break;
            /* Comparison, not assignment: retry only an interrupted write. */
            if (ret < 0 && errno == EINTR)
                continue;
            write(2, write_error, sizeof write_error - 1);
            return 1;
        }
    }

    close(fd);
    return 0;
}

Is there a way to check for string content after a null-terminating byte?

I want to pipe an input to my program that only accepts lines that are valid for my regular expression, i.e. a number of max length 3, followed by at least one white space character, followed by a negative or positive number of max length 7.
The following call
echo -e '1 1\n1 1\x00junk' | ./myProgram
gets through my regular expression but shouldn't.
I guess it's because of getline(): my regular expression only sees the content in front of the null byte and ignores everything after it.
Is there a way to check for the content after a null-terminating byte without potentially violating the access of the allocated memory of my string, so that the given call ends in an error ?
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <regex.h>
#include <errno.h>
/* Question's attempt: read standard input line by line with getline() and
 * reject the input as soon as one line fails to match the regular expression. */
int main() {
int reg;
regex_t regex;
/* NOTE(review): the alternatives "0a" and "0d0a" are literal characters in
 * this pattern, not byte values. */
char *regexStr = "^[0-9]{1,3} +-?[0-9]{1,7}(\n|\r|\r\n|0a|0d0a)?$";
int size = 16;
char *buffer = malloc(size * sizeof(char));
size_t len = size;
ssize_t nread;
if (regcomp(&regex, regexStr, REG_EXTENDED)) {
fprintf(stderr, "Couldn't compile regular expression.\n");
return -1;
}
while ((nread = getline(&buffer, &len, stdin)) != EOF) {
/* NOTE(review): errno is tested here after a successful getline() and is
 * never cleared beforehand. */
if (errno == ENOMEM) { // error if getline() couldnt allocate buffer
fprintf(stderr, "Couldn't allocate enough memory.\n");
return -1;
}
/* buffer is passed as a C string; nread is not used, so regexec() is given
 * no length beyond the string terminator. */
reg = regexec(&regex, buffer, 0, NULL, 0);
if (reg == REG_NOMATCH) { //input invalid if regular expression doesnt match with line
fprintf(stderr, "Input invalid.\n");
return -1;
}
printf("%s", buffer);
}
printf("\n");
printf("Input was valid.\n");
return 0;
}
Matching null bytes with regexec is tricky but could be achieved on some architectures with the optional flag REG_STARTEND as documented by KamilCuk, but this feature is non standard.
There is a simple solution for POSIX systems: unlike fgets(), getline() returns the number of bytes read from the stream, so you can detect if any of these bytes is a null byte by comparing nread with strlen(buffer).
Here is a modified version, with some other fixes:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <regex.h>
#include <errno.h>
/* Validate standard input line by line.
 *
 * Every line must match the pattern (1-3 digits, one or more spaces, an
 * optionally negative number of 1-7 digits, optional line ending) and must
 * not contain an embedded null byte. Valid lines are echoed to standard
 * output. Returns 0 when all input was valid and -1 on the first problem.
 */
int main() {
    const char *pattern = "^[0-9]{1,3} +-?[0-9]{1,7}(\n|\r|\r\n)?$";
    regex_t compiled;
    char *line = NULL;      /* getline() allocates and grows this */
    size_t capacity = 0;
    ssize_t length;

    if (regcomp(&compiled, pattern, REG_EXTENDED) != 0) {
        fprintf(stderr, "Couldn't compile regular expression.\n");
        return -1;
    }

    /* errno is cleared before each call so it can be inspected on failure. */
    while (errno = 0, (length = getline(&line, &capacity, stdin)) >= 0) {

        /* getline() reports the bytes read; a shorter strlen() means a null
         * byte is hiding part of the line from regexec(). */
        if ((size_t)length != strlen(line)) {
            fprintf(stderr, "Invalid input: contains null bytes\n");
            return -1;
        }

        if (regexec(&compiled, line, 0, NULL, 0) == REG_NOMATCH) {
            fprintf(stderr, "Input invalid.\n");
            return -1;
        }

        printf("%s", line);
    }

    /* The loop ended because getline() failed: out of memory, or end of input. */
    if (errno == ENOMEM) {
        fprintf(stderr, "Couldn't allocate enough memory.\n");
        return -1;
    }

    printf("\n");
    printf("Input was valid.\n");
    return 0;
}
From man regexec:
REG_STARTEND
Use pmatch[0] on the input string, starting at byte pmatch[0].rm_so and ending before byte pmatch[0].rm_eo. This allows
matching embedded NUL bytes and avoids a strlen(3) on large strings. It does not use nmatch on input, and does not change
REG_NOTBOL or REG_NEWLINE processing. This flag is a BSD extension, not present in POSIX.
Use REG_STARTEND. Like so:
/* With REG_STARTEND, pmatch[0] is an input: it delimits the bytes to search. */
regmatch_t match[1] = {0};
match[0].rm_so = 0;
/* End of the range is the byte count returned by getline(), so bytes after
 * an embedded null byte are included in the match. */
match[0].rm_eo = nread;
reg = regexec(&regex, buffer, 0, match, REG_STARTEND);
results in:
$ echo -e '1 1\n1 1\x00junk' | ./a.out
1 1
Input invalid.

Can the Pagemap folder of processes in the Linux kernel be read(64bit per read) a finite number of times?

I'm trying to keep track of the number of writes per physical page in the file "proc/PID/pagemap".But the file is binary, and the size shown in the file properties is 0, and the following function reads 0 as well.
/* Question's fragment: query the size of the already opened descriptor fd
 * with fstat() and print it; exits on failure. */
struct stat buf;
int iRet = fstat(fd, &buf);
if(iRet == -1)
{
perror("fstat error");
exit(-1);
}
/* NOTE(review): st_size has type off_t; "%ld" assumes off_t is long. */
printf("the size of file is : %ld\n", buf.st_size);
I wrote a monitor program that reads a process's pagemap 64 bits at a time and records bit 55 (the soft-dirty bit) to check whether a page has been written. Of course, before doing this I cleared all soft-dirty bits in the process's pagemap. This method is provided by the Linux kernel. My problem is that when I read the pagemap through a file descriptor (I also tried an fstream), the reading only ends when the process I'm monitoring finishes, as if the file were infinite. I know the process's logical address management is dynamic, but I want to know how I can count the number of writes properly. Should I read a part of this infinite file at fixed time intervals? And how many entries should I read? T _ T.
You need something like the following:
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/* One mapped address range of a process together with the 64-bit pagemap
 * entry of each of its pages. Regions are chained into a singly linked list
 * through next; page[] is a flexible array member, so the allocation must
 * reserve room for pages entries after the structure. */
struct pagemap_region {
struct pagemap_region *next;
uintptr_t addr; /* First address within region */
uintptr_t ends; /* First address after region */
size_t pages; /* Number of pages in this region */
uint64_t page[]; /* 64-bit pagemap flags per page */
};
/* Release every region of a pagemap list. A NULL list is a no-op. */
static void free_pagemaps(struct pagemap_region *list)
{
    struct pagemap_region *node;
    struct pagemap_region *following;

    for (node = list; node != NULL; node = following) {
        /* Remember the successor before this node goes away. */
        following = node->next;

        /* Scrub the bookkeeping fields before handing the memory back. */
        node->addr = 0;
        node->ends = 0;
        node->pages = 0;

        free(node);
    }
}
/* Collect the pagemap entries of every mapping of a process.
 *
 * pid: process to inspect; a value <= 0 selects the calling process
 *      (/proc/self).
 *
 * The address ranges are parsed from /proc/PID/maps, one struct
 * pagemap_region per line, and the 64-bit entries of each range are then
 * read from /proc/PID/pagemap with pread(). Ranges whose entries cannot be
 * read (pread() returns 0) get all-zero entries.
 *
 * Returns the list (most recently parsed region first), to be released with
 * free_pagemaps(), or NULL with errno set: EINVAL, ENOMEM, ESRCH (maps file
 * could not be opened), EIO (parse or read failure), or the open() error for
 * the pagemap file. NULL is also returned when no region was found.
 */
struct pagemap_region *get_pagemaps(const pid_t pid)
{
    struct pagemap_region *list = NULL;
    size_t page;
    long page_size;
    char *line_ptr = NULL;
    size_t line_max = 256;
    ssize_t line_len;
    FILE *maps;
    int n, fd;

    /* sysconf() reports failure as -1; never divide by a bogus page size. */
    page_size = sysconf(_SC_PAGESIZE);
    if (page_size < 1) {
        errno = EINVAL;
        return NULL;
    }
    page = (size_t)page_size;

    /* We reuse this for the input line buffer. */
    line_ptr = malloc(line_max);
    if (!line_ptr) {
        errno = ENOMEM;
        return NULL;
    }

    /* First, fill it with the path to the map pseudo-file. */
    if (pid > 0)
        n = snprintf(line_ptr, line_max, "/proc/%d/maps", (int)pid);
    else
        n = snprintf(line_ptr, line_max, "/proc/self/maps");
    if (n < 0 || (size_t)n + 1 >= line_max) {
        free(line_ptr);
        errno = EINVAL;
        return NULL;
    }

    /* Read the maps pseudo-file. */
    maps = fopen(line_ptr, "re"); /* Read-only, close-on-exec */
    if (!maps) {
        free(line_ptr);
        errno = ESRCH;
        return NULL;
    }

    while (1) {
        struct pagemap_region *curr;
        unsigned long addr, ends;
        size_t pages;
        char *ptr, *end;

        line_len = getline(&line_ptr, &line_max, maps);
        if (line_len < 0)
            break;

        /* Start address of the region. */
        end = ptr = line_ptr;
        errno = 0;
        addr = strtoul(ptr, &end, 16);
        if (errno || end == ptr || *end != '-')
            break;

        /* End address of the region. */
        ptr = ++end;
        errno = 0;
        ends = strtoul(ptr, &end, 16);
        if (errno || end == ptr || *end != ' ')
            break;

        /* Number of pages in the region. */
        pages = (ends - addr) / page;
        if (addr + page * pages != ends || (addr % page) != 0)
            break;

        /* Allocate new region map. */
        curr = malloc(sizeof (struct pagemap_region) + pages * sizeof curr->page[0]);
        if (!curr)
            break;
        curr->addr = addr;
        curr->ends = ends;
        curr->pages = pages;

        /* Prepend to the region list. */
        curr->next = list;
        list = curr;
    }

    /* Any issues when reading the maps pseudo-file?
     * (Leaving the loop early for any reason means end of file was not reached.) */
    if (!feof(maps) || ferror(maps)) {
        fclose(maps);
        free(line_ptr);
        free_pagemaps(list);
        errno = EIO;
        return NULL;
    } else
    if (fclose(maps)) {
        free(line_ptr);
        free_pagemaps(list);
        errno = EIO;
        return NULL;
    }

    /* Reuse the line buffer for the pagemap pseudo-file path */
    if (pid > 0)
        n = snprintf(line_ptr, line_max, "/proc/%d/pagemap", (int)pid);
    else
        n = snprintf(line_ptr, line_max, "/proc/self/pagemap");
    if (n < 0 || (size_t)n + 1 >= line_max) {
        free(line_ptr);
        free_pagemaps(list);
        errno = ENOMEM;
        return NULL;
    }

    do {
        fd = open(line_ptr, O_RDONLY | O_NOCTTY | O_CLOEXEC);
    } while (fd == -1 && errno == EINTR);
    if (fd == -1) {
        n = errno;
        free(line_ptr);
        free_pagemaps(list);
        errno = n;
        return NULL;
    }

    /* Path no longer needed. */
    free(line_ptr);
    line_ptr = NULL;
    line_max = 0;

    /* Read each pagemap section. */
    for (struct pagemap_region *curr = list; curr != NULL; curr = curr->next) {
        off_t offset = (size_t)(curr->addr / page) * (sizeof curr->page[0]);
        unsigned char *ptr = (unsigned char *)&(curr->page[0]);
        size_t need = curr->pages * sizeof curr->page[0];
        ssize_t bytes;

        while (need > 0) {
            bytes = pread(fd, ptr, need, offset);
            if (bytes > 0) {
                /* Only a positive count may be compared with the unsigned
                 * need; a -1 converted to size_t would look like "enough". */
                if ((size_t)bytes >= need)
                    break;
                ptr += bytes;
                offset += bytes;
                need -= (size_t)bytes;
            } else
            if (bytes == 0) {
                /* Assume this is a region we can't access, like [VSYSCALL]; clear the rest of the bits. */
                memset(ptr, 0, need);
                break;
            } else
            if (bytes != -1 || errno != EINTR) {
                close(fd);
                free_pagemaps(list);
                errno = EIO;
                return NULL;
            }
            /* Otherwise the call was interrupted by a signal: retry. */
        }
    }

    if (close(fd) == -1) {
        free_pagemaps(list);
        errno = EIO;
        return NULL;
    }

    return list;
}
/* Print a per-page map of the process given as the only argument:
 * 'R' when bit 63 of the pagemap entry is set, 'S' when bit 62 is set,
 * '.' otherwise. A PID of -1 selects this process itself.
 * Returns EXIT_FAILURE for an invalid PID or when the page maps cannot be
 * obtained; the usage text is printed (and EXIT_SUCCESS returned) when the
 * argument count is wrong or help is requested.
 */
int main(int argc, char *argv[])
{
    struct pagemap_region *list, *curr;
    long pid;
    char *end;

    if (argc != 2 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
        /* The program name is argv[0]; argv[1] is the PID or help flag. */
        const char *argv0 = (argc > 0 && argv && argv[0]) ? argv[0] : "(this)";
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv0);
        fprintf(stderr, " %s PID\n", argv0);
        fprintf(stderr, "\n");
        fprintf(stderr, "This program prints the a map of the pages of process PID;\n");
        fprintf(stderr, "R for pages in RAM, S for pages in swap space, and . for others.\n");
        fprintf(stderr, "You can use -1 for the PID of this process itself.\n");
        fprintf(stderr, "\n");
        return EXIT_SUCCESS;
    }

    /* The whole argument must be a decimal number. */
    end = argv[1];
    errno = 0;
    pid = strtol(argv[1], &end, 10);
    if (errno || end == argv[1] || *end) {
        fprintf(stderr, "%s: Invalid PID.\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* Accept -1, or a positive value that survives the round trip to pid_t. */
    if (pid != -1 && (pid < 1 || (long)(pid_t)pid != pid)) {
        fprintf(stderr, "%s: Not a valid PID.\n", argv[1]);
        return EXIT_FAILURE;
    }

    list = get_pagemaps(pid);
    if (!list) {
        fprintf(stderr, "%s.\n", strerror(errno));
        return EXIT_FAILURE;
    }

    for (curr = list; curr != NULL; curr = curr->next) {
        printf("Region %p - %p: %zu pages\n", (void *)(curr->addr), (void *)(curr->ends), curr->pages);
        for (uint64_t *map = curr->page; map < curr->page + curr->pages; map++) {
            if ((*map >> 63) & 1)
                putchar('R');
            else
            if ((*map >> 62) & 1)
                putchar('S');
            else
                putchar('.');
        }
        putchar('\n');
    }

    return EXIT_SUCCESS;
}
We read /proc/PID/maps line by line, and construct a struct pagemap_region for each line; this contains the start address, the end address, and the number of pages in the region. (I didn't bother to support huge pages, though; if you do, consider parsing /proc/PID/smaps instead. There, if a line begins with 0-9 or lowercase a-f, it specifies a region; otherwise the line begins with a capital letter A-Z and specifies a property of that region.)
Each struct pagemap_region also contains room for the 64-bit pagemap value per page. After the regions have been found/chosen – this one tries all –, the /proc/PID/pagemap file is opened, and the corresponding data read from the proper location using pread(), which works like read(), but also takes the file offset as an extra parameter.
Not all regions are accessible. I do believe [VSYSCALL] is one of those, but being a kernel-userspace interface, its pagemap bits are uninteresting anyway. Instead of removing such regions from the list, the above just clears the bits to zero.
This is not intended as a "do it exactly like this, just copy and paste this" answer, but as a suggestion of how to start going about this, perhaps exploring a bit, comparing the results or behaviour to your particular needs; a sort of a rough outline for an initial suggestion only.
Also, as I wrote it in a single sitting, it's likely got nasty bugs in it. (If I knew where or for sure, I'd fix them; it's just that bugs happen.)

Unix HEAD command implementation in C fails on larger lines

I am currently implementing the Unix HEAD command with C and using only system functions. So far, it works perfectly on files, which have lines with less length than the one that I specified for my buffer:
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#define LINES_TO_READ 10
#define BUFF_SIZE 4096
/* Question's attempt at head(1): for every file named on the command line,
 * collect bytes into buffer until a newline and write each completed line,
 * stopping after LINES_TO_READ lines. */
int main(int argc, char const *argv[]) {
for (ssize_t i = 1; i < argc; i++) {
const char *filename = argv[i];
int fd = open(filename, O_RDONLY);
if (fd < 0) {
perror("open");
return -1;
}
char ch, buffer[BUFF_SIZE];
size_t index = 0, lines = 1;
ssize_t rresult, wresult;
// Read the file byte by byte
while ((rresult = read(fd, &ch, 1)) != 0) {
if (rresult < 0) {
perror("read");
return -1;
}
// Check if the current character is a new line (the line ends here)
if (ch == '\n') {
buffer[index] = ch;
buffer[index + 1] = '\0';
ch = 0;
index = 0;
// Print the line
wresult = 0;
ssize_t buffer_length = strlen(buffer);
while (wresult != buffer_length) {
ssize_t res = write(STDOUT_FILENO, buffer + wresult, buffer_length - wresult);
/* NOTE(review): this tests wresult, not the value res just returned by write(). */
if (wresult < 0) {
perror("write");
return -1;
}
wresult += res;
}
// Stop if we read 10 lines already
if (lines == LINES_TO_READ) {
break;
}
lines++;
} else {
/* NOTE(review): index is never checked against BUFF_SIZE, so a line longer
 * than the buffer is stored past its end. */
buffer[index++] = ch;
}
}
if (close(fd) < 0) {
perror("close");
return -1;
}
}
return 0;
}
And it works on files, which have a line length with less than BUFF_SIZE (as now set, 4096).
How to avoid this and make it work for whatever the line length is?
Don't read one byte at a time. Read a chunk (4096 or 8192 bytes are reasonable sizes, or use PIPE_BUF (from limits.h)) into a buffer. Output each character while counting newlines. If you print enough newlines, terminate. If you reach the end of the buffer and haven't printed enough lines, read more data into the buffer.

Is it possible "force" UTF-8 in a C program?

Usually when I want my program to use UTF-8 encoding, I write setlocale (LC_ALL, "");. But today I found that this just sets the locale to the environment's default locale, and I can't know whether the environment is using UTF-8 by default.
I wonder is there any way to force the character encoding to be UTF-8? Also, is there any way to check whether my program is using UTF-8?
It is possible, but it is the completely wrong thing to do.
First of all, the current locale is for the user to decide. It is not just the character set, but also the language, date and time formats, and so on. Your program has absolutely no "right" to mess with it.
If you cannot localize your program, just tell the user the environmental requirements your program has, and let them worry about it.
Really, you should not really rely on UTF-8 being the current encoding, but use wide character support, including functions like wctype(), mbstowcs(), and so on. POSIXy systems also provide iconv_open() and iconv() function family in their C libraries to convert between encodings (which should always include conversion to and from wchar_t); on Windows, you need a separate version libiconv library. This is how for example the GCC compiler handles different character sets. (Internally, it uses Unicode/UTF-8, but if you ask it to, it can do the necessary conversions to work with other character sets.)
I am personally a strong proponent of using UTF-8 everywhere, but overriding the user locale in a program is horrific. Abominable. Distasteful; like a desktop applet changing the display resolution because the programmer is particularly fond of certain one.
I would be happy to write some example code to show how to correctly solve any character-set-sensible situation, but there are so many, I don't know where to start.
If the OP amends their question to state exactly what problem overriding the character set is supposed to solve, I'm willing to show how to use the aforementioned utilities and POSIX facilities (or equivalent freely available libraries on Windows) to solve it correctly.
If this seems harsh to someone, it is, but only because taking the easy and simple route here (overriding the user's locale setting) is so ... wrong, purely on technical grounds. Even no action is better, and actually quite acceptable, as long as you just document your application only handles UTF-8 input/output.
Example 1. Localized Happy New Year!
#include <stdlib.h>
#include <locale.h>
#include <stdio.h>
#include <wchar.h>
/* Print a New Year greeting in several languages through the wide-character
 * output functions, using whatever locale the user has configured. */
int main(void)
{
    static const wchar_t *const greetings[] = {
        L"Happy New Year!\n",
        L"С новым годом!\n",
        L"新年好!\n",
        L"賀正!\n",
        L"¡Feliz año nuevo!\n",
        L"Hyvää uutta vuotta!\n",
    };
    size_t i;

    /* Adopt the user's current locale rather than imposing one. */
    setlocale(LC_ALL, "");

    /* Standard output is used with wide functions only. */
    fwide(stdout, 1);

    /* Leading Byte Order Mark, for Windows compatibility: if the output is
     * saved to a file, it helps Windows applications recognize the file as
     * Unicode. Other systems neither need nor use it. */
    fputwc(L'\uFEFF', stdout);

    for (i = 0; i < sizeof greetings / sizeof greetings[0]; i++)
        wprintf(L"%ls", greetings[i]);

    return EXIT_SUCCESS;
}
Note that wprintf() takes a wide string (wide string constants are of form L"", wide character constants L'', as opposed to normal/narrow counterparts "" and ''). Formats are still the same; %s prints a normal/narrow string, and %ls a wide string.
Example 2. Reading input lines from standard input, and optionally saving them to a file. The file name is supplied on the command line.
#include <stdlib.h>
#include <string.h>
#include <locale.h>
#include <wctype.h>
#include <wchar.h>
#include <errno.h>
#include <stdio.h>
/* Line clean-up options for getwline(). The single-purpose values are
 * distinct bits tested with bitwise AND; TRIM (7) and CLEANUP (31) are
 * pre-combined masks of the values above them. Note that TRIM, being
 * 1|2|4, also includes TRIM_NEWLINE. */
typedef enum {
TRIM_LEFT = 1, /* Remove leading whitespace and control characters */
TRIM_RIGHT = 2, /* Remove trailing whitespace and control characters */
TRIM_NEWLINE = 4, /* Remove newline at end of line */
TRIM = 7, /* Remove leading and trailing whitespace and control characters */
OMIT_NUL = 8, /* Skip NUL characters (embedded binary zeros, L'\0') */
OMIT_CONTROLS = 16, /* Skip control characters */
CLEANUP = 31, /* All of the above. */
COMBINE_LWS = 32, /* Combine all whitespace into a single space */
} trim_opts;
/* Read an unlimited-length line from a wide input stream.
*
* This function takes a pointer to a wide string pointer,
* pointer to the number of wide characters dynamically allocated for it,
* the stream to read from, and a set of options on how to treat the line.
*
* If an error occurs, this will return 0 with errno set to nonzero error number.
* Use strerror(errno) to obtain the error description (as a narrow string).
*
* If there is no more data to read from the stream,
* this will return 0 with errno 0, and feof(stream) will return true.
*
* If an empty line is read,
* this will return 0 with errno 0, but feof(stream) will return false.
*
* Typically, you initialize variables like
* wchar_t *line = NULL;
* size_t size = 0;
* before calling this function, so that subsequent calls the same, dynamically
* allocated buffer for the line, and it is automatically grown if necessary.
* There are no built-in limits to line lengths this way.
*/
size_t getwline(wchar_t **const lineptr,
size_t *const sizeptr,
FILE *const in,
trim_opts const trimming)
{
wchar_t *line;
size_t size;
size_t used = 0;
wint_t wc;
fpos_t startpos;
int seekable;
if (lineptr == NULL || sizeptr == NULL || in == NULL) {
errno = EINVAL;
return 0;
}
if (*lineptr != NULL) {
line = *lineptr;
size = *sizeptr;
} else {
line = NULL;
size = 0;
*sizeptr = 0;
}
/* In error cases, we can try and get back to this position
* in the input stream, as we cannot really return the data
* read thus far. However, some streams like pipes are not seekable,
* so in those cases we should not even try.
* Use (seekable) as a flag to remember if we should try.
*/
if (fgetpos(in, &startpos) == 0)
seekable = 1;
else
seekable = 0;
while (1) {
/* When we read a wide character from a wide stream,
* fgetwc() will return WEOF with errno set if an error occurs.
* However, fgetwc() will return WEOF with errno *unchanged*
* if there is no more input in the stream.
* To detect which of the two happened, we need to clear errno
* first.
*/
errno = 0;
wc = fgetwc(in);
if (wc == WEOF) {
if (errno) {
const int saved_errno = errno;
if (seekable)
fsetpos(in, &startpos);
errno = saved_errno;
return 0;
}
if (ferror(in)) {
if (seekable)
fsetpos(in, &startpos);
errno = EIO;
return 0;
}
break;
}
/* Dynamically grow line buffer if necessary.
* We need room for the current wide character,
* plus at least the end-of-string mark, L'\0'.
*/
if (used + 2 > size) {
/* Size policy. This can be anything you see fit,
* as long as it yields size >= used + 2.
*
* This one increments size to next multiple of
* 1024 (minus 16). It works well in practice,
* but do not think of it as the "best" way.
* It is just a robust choice.
*/
size = (used | 1023) + 1009;
line = realloc(line, size * sizeof line[0]);
if (!line) {
/* Memory allocation failed. */
if (seekable)
fsetpos(in, &startpos);
errno = ENOMEM;
return 0;
}
*lineptr = line;
*sizeptr = size;
}
/* Append character to buffer. */
if (!trimming)
line[used++] = wc;
else {
/* Check if we have reasons to NOT add the character to buffer. */
do {
/* Omit NUL if asked to. */
if (trimming & OMIT_NUL)
if (wc == L'\0')
break;
/* Omit controls if asked to. */
if (trimming & OMIT_CONTROLS)
if (iswcntrl(wc))
break;
/* If we are at start of line, and we are left-trimming,
* only graphs (printable non-whitespace characters) are added. */
if (trimming & TRIM_LEFT)
if (wc == L'\0' || !iswgraph(wc))
break;
/* Combine whitespaces if asked to. */
if (trimming & COMBINE_LWS)
if (iswspace(wc)) {
if (used > 0 && line[used-1] == L' ')
break;
else
wc = L' ';
}
/* Okay, add the character to buffer. */
line[used++] = wc;
} while (0);
}
/* End of the line? */
if (wc == L'\n')
break;
}
/* The above loop will only end (break out)
* if end of line or end of input was found,
* and no error occurred.
*/
/* Trim right if asked to. */
if (trimming & TRIM_RIGHT)
while (used > 0 && iswspace(line[used-1]))
--used;
else
if (trimming & TRIM_NEWLINE)
while (used > 0 && (line[used-1] == L'\r' || line[used-1] == L'\n'))
--used;
/* Ensure we have room for end-of-string L'\0'. */
if (used >= size) {
size = used + 1;
line = realloc(line, size * sizeof line[0]);
if (!line) {
if (seekable)
fsetpos(in, &startpos);
errno = ENOMEM;
return 0;
}
*lineptr = line;
*sizeptr = size;
}
/* Add end of string mark. */
line[used] = L'\0';
/* Successful return. */
errno = 0;
return used;
}
/* Return how many wide characters in the L'\0'-terminated string 'ws'
 * are classified as alphabetic by iswalpha() in the current locale.
 * A NULL pointer is treated as an empty string and yields 0.
 */
size_t count_letters(const wchar_t *ws)
{
    size_t letters = 0;
    const wchar_t *p;

    if (ws == NULL)
        return 0;

    for (p = ws; *p != L'\0'; p++) {
        if (iswalpha(*p))
            letters++;
    }

    return letters;
}
/* Example program: read lines from standard input until a line consisting
 * of a single '.' (or end of input), report how many wide characters and
 * letters each cleaned-up line has, and optionally save the lines to a file.
 *
 * Usage: PROGRAM FILENAME [ PROMPT ]
 * Use '-' as FILENAME to skip saving. If PROMPT is given, it is printed
 * before each line is read.
 *
 * Returns EXIT_SUCCESS normally (and after printing the usage text);
 * EXIT_FAILURE if the output file cannot be opened, if reading standard
 * input fails, or if writing/closing the output file fails.
 */
int main(int argc, char *argv[])
{
    /* argv[0] may be missing when argc == 0; never pass NULL to %s. */
    const char *const argv0 = (argc > 0 && argv[0] != NULL) ? argv[0] : "(program)";
    FILE *out;
    wchar_t *line = NULL;
    size_t size = 0;
    size_t len;
    int exit_status = EXIT_SUCCESS;

    setlocale(LC_ALL, "");

    /* Standard input and output should use wide characters. */
    fwide(stdin, 1);
    fwide(stdout, 1);

    /* Check if the user asked for help. */
    if (argc < 2 || argc > 3 || strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "/?") == 0) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s [ -h | --help | /? ]\n", argv0);
        fprintf(stderr, " %s FILENAME [ PROMPT ]\n", argv0);
        fprintf(stderr, "\n");
        fprintf(stderr, "The program will read input lines until an only '.' is supplied.\n");
        fprintf(stderr, "If you do not want to save the output to a file,\n");
        fprintf(stderr, "use '-' as the FILENAME.\n");
        fprintf(stderr, "\n");
        return EXIT_SUCCESS;
    }

    /* Open file for output, unless it is "-". */
    if (strcmp(argv[1], "-") == 0)
        out = NULL; /* No output to file */
    else {
        out = fopen(argv[1], "w");
        if (out == NULL) {
            fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
            return EXIT_FAILURE;
        }
        /* The output file is used with wide strings. */
        fwide(out, 1);
    }

    while (1) {
        /* Prompt? Note: our prompt string is narrow, but stdout is wide. */
        if (argc > 2) {
            wprintf(L"%s\n", argv[2]);
            fflush(stdout);
        }

        len = getwline(&line, &size, stdin, CLEANUP);
        if (len == 0) {
            /* Zero length means error, end of input, or an empty line;
             * only the last of these lets us carry on. */
            if (errno) {
                fprintf(stderr, "Error reading standard input: %s.\n", strerror(errno));
                exit_status = EXIT_FAILURE;
                break;
            }
            if (feof(stdin))
                break;
        }

        /* The user does not wish to supply more lines? */
        if (wcscmp(line, L".") == 0)
            break;

        /* Print the line to the file. */
        if (out != NULL) {
            fputws(line, out);
            fputwc(L'\n', out);
        }

        /* Tell the user what we read. */
        wprintf(L"Received %lu wide characters, %lu of which were letterlike.\n",
                (unsigned long)len, (unsigned long)count_letters(line));
        fflush(stdout);
    }

    /* The line buffer is no longer needed, so we can discard it.
     * Note that free(NULL) is safe, so we do not need to check.
     */
    free(line);
    /* Resetting the variables helps with debugging,
     * and avoids reuse-after-free() errors. */
    line = NULL;
    size = 0;

    /* Close the output file, so that write errors (which may only show up
     * when the buffered data is flushed) are detected and reported. */
    if (out != NULL) {
        if (ferror(out)) {
            fprintf(stderr, "%s: %s.\n", argv[1], strerror(EIO));
            exit_status = EXIT_FAILURE;
        }
        if (fclose(out)) {
            fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
            exit_status = EXIT_FAILURE;
        }
        out = NULL;
    }

    return exit_status;
}
The getwline() function above is pretty much at the most complicated end of functions you might need when dealing with localized wide character support. It allows you to read localized input lines without length restrictions, and optionally trims and cleans up (removing control codes and embedded binary zeros) the returned string. It also works fine with both LF and CR-LF (\n and \r\n) newline encodings.
Try:
setlocale(LC_ALL, "en_US.UTF-8");
You can run locale -a in the terminal to get a full list of locales supported by your system ("en_US.UTF-8" should be supported by most/all UTF-8 supporting systems).
EDIT 1 (alternate spelling)
In the comments, Lee points out that some systems have an alternate spelling, "en_US.utf8" (which surprised me, but we learn new stuff every day).
Since setlocale returns NULL when it fails, you can chain these calls:
if(!setlocale(LC_ALL, "en_US.UTF-8") && !setlocale(LC_ALL, "en_US.utf8"))
printf("failed to set locale to UTF-8");
EDIT 2 (finding out if we're using UTF-8)
To find out if the locale is set to UTF-8 (after attempting to set it), you can either check for the returned value (NULL means the call failed) or check the locale used.
Option 1:
char * result;
if((result = setlocale (LC_ALL, "en_US.UTF-8")) == NULL)
printf("failed to set locale to UTF-8");
Option 2:
setlocale (LC_ALL, "en_US.UTF-8"); // set
char * result = setlocale (LC_ALL, NULL); // review
if(!strstr(result, "UTF-8"))
printf("failed to set locale to UTF-8");
This is not an answer, but a third, quite complex example, on how to use wide character I/O. This was too long to add to my actual answer to this question.
This example shows how to read and process CSV files (RFC-4180 format, optionally with limited backslash escape support) using wide strings.
The following code is CC0/public domain, so you are free to use it any way you like, even include in your own proprietary projects, but if it breaks anything, you get to keep all the bits and not complain to me. (I'll be happy to include any bug fixes if you find and report them in a comment below, though.)
The logic of the code is robust, however. In particular, it supports universal newlines, all four common newline types: Unix-like LF (\n), old CR LF (\r\n), old Mac CR (\r), and the occasionally encountered weird LF CR (\n\r). There are no built-in limitations wrt. the length of a field, the number of fields in a record, or the number of records in a file. It works very nicely if you need to convert CSV or process CSV input stream-like (field by field or record-by-record), without having to have more than one in memory at one point. If you want to construct structures to describe the records and fields in memory, you'll need to add some scaffolding code for that.
Because of universal newline support, when reading input interactively, this program might require two consecutive end-of-inputs (Ctrl+Z in Windows and MS-DOS, Ctrl+D everywhere else), as the first one is usually "consumed" by the csv_next_field() or csv_skip_field() function, and the csv_next_record() function needs to re-read it again to actually detect it. However, you do not normally ask the user to input CSV data interactively, so this should be an acceptable quirk.
#include <stdlib.h>
#include <locale.h>
#include <string.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
#include <errno.h>
/* RFC-4180 -format CSV file processing using wide input streams.
*
* #define BACKSLASH_ESCAPES if you additionally wish to have
* \\, \a, \b, \t, \n, \v, \f, \r, \", and \, de-escaped to their
* C string equivalents when reading CSV fields.
*/
/* Result codes of the csv_*() functions.
 * CSV_OK and CSV_END are the two non-error outcomes;
 * every error code is negative, so callers can test (status < 0).
 */
typedef enum {
    CSV_OK                 =  0,
    CSV_END                =  1,
    CSV_INVALID_PARAMETERS = -1,
    CSV_FORMAT_ERROR       = -2,
    CSV_CHARSET_ERROR      = -3,
    CSV_READ_ERROR         = -4,
    CSV_OUT_OF_MEMORY      = -5,
} csv_status;

/* Return a constant, human-readable (narrow) description of 'code'.
 * Values that are not a csv_status constant yield a generic message.
 * The returned string must not be modified or freed.
 */
const char *csv_error(const csv_status code)
{
    static const struct {
        csv_status  code;
        const char *text;
    } messages[] = {
        { CSV_OK,                 "No error" },
        { CSV_END,                "At end" },
        { CSV_INVALID_PARAMETERS, "Invalid parameters" },
        { CSV_FORMAT_ERROR,       "Bad CSV format" },
        { CSV_CHARSET_ERROR,      "Illegal character in CSV file (incorrect locale?)" },
        { CSV_READ_ERROR,         "Read error" },
        { CSV_OUT_OF_MEMORY,      "Out of memory" },
    };
    size_t i;

    for (i = 0; i < sizeof messages / sizeof messages[0]; i++) {
        if (messages[i].code == code)
            return messages[i].text;
    }

    return "Unknown csv_status code";
}
/* Start the next record. Automatically skips any remaining fields in current record.
* Returns CSV_OK if successful, CSV_END if no more records, or a negative CSV_ error code.
* 'in' is the wide-oriented input stream the CSV data is read from. */
csv_status csv_next_record (FILE *const in);
/* Skip the next field. Returns CSV_OK if successful, CSV_END if no more fields in current record,
* or a negative CSV_ error code. Nothing is stored; the field contents are discarded. */
csv_status csv_skip_field (FILE *const in);
/* Read the next field. Returns CSV_OK if successful, CSV_END if no more fields in current record,
* or a negative CSV_ error code.
* If this returns CSV_OK, then *dataptr is a dynamically allocated wide string to the field
* contents, space allocated for *sizeptr wide characters; and if lengthptr is not NULL, then
* *lengthptr is the number of wide characters in said wide string.
* Initialize *dataptr to NULL and *sizeptr to 0 before the first call; the same buffer is then
* reused and grown by later calls, and the caller releases it with free() when done. */
csv_status csv_next_field (FILE *const in, wchar_t **const dataptr,
size_t *const sizeptr,
size_t *const lengthptr);
/* Translate a WEOF just returned by fgetwc() into a status code.
 * errno must have been cleared before that fgetwc() call.
 * 'clean_eof' is the status to report when the stream simply ended
 * (no conversion error, no read error); errno is then reset to 0.
 */
static csv_status internal_quoted_weof(FILE *const in, const csv_status clean_eof)
{
    if (errno == EILSEQ)
        return CSV_CHARSET_ERROR;
    if (errno != 0)
        return CSV_READ_ERROR;
    if (ferror(in)) {
        errno = EIO;
        return CSV_READ_ERROR;
    }
    errno = 0;
    return clean_eof;
}

/* Skip the rest of a quoted field; the opening doublequote has already
 * been consumed by the caller.
 *
 * Returns CSV_OK if the field was followed by a comma (which is consumed),
 * CSV_END if it was followed by a newline (pushed back) or end of input,
 * CSV_FORMAT_ERROR if input ends inside the quotes or something other
 * than whitespace, a comma, or a newline follows the closing doublequote,
 * or CSV_CHARSET_ERROR / CSV_READ_ERROR if reading fails.
 */
static csv_status internal_skip_quoted(FILE *const in)
{
    for (;;) {
        wint_t ch;

        errno = 0;
        ch = fgetwc(in);
        if (ch == WEOF)
            return internal_quoted_weof(in, CSV_FORMAT_ERROR);

#ifdef BACKSLASH_ESCAPES
        if (ch == L'\\') {
            /* The escaped character, whatever it is, belongs to the field. */
            errno = 0;
            ch = fgetwc(in);
            if (ch == WEOF)
                return internal_quoted_weof(in, CSV_END);
            continue;
        }
#endif

        if (ch != L'"')
            continue;

        /* A doublequote: either the first half of an escaped "" pair,
         * or the end of the field. */
        errno = 0;
        ch = fgetwc(in);
        if (ch == L'"')
            continue;

        /* End of the field. Skip blanks (but not newlines) after it. */
        while (ch != WEOF && ch != L'\n' && ch != L'\r' && iswspace(ch)) {
            errno = 0;
            ch = fgetwc(in);
        }

        if (ch == WEOF)
            return internal_quoted_weof(in, CSV_END);

        if (ch == L',') {
            errno = 0;
            return CSV_OK;
        }

        /* Anything else is pushed back: a newline ends the record,
         * any other character after the closing quote is malformed CSV. */
        ungetwc(ch, in);
        errno = 0;
        return (ch == L'\n' || ch == L'\r') ? CSV_END : CSV_FORMAT_ERROR;
    }
}
/* Translate a WEOF just returned by fgetwc() into a status code.
 * errno must have been cleared before that fgetwc() call.
 * A plain end of input yields CSV_END with errno reset to 0.
 */
static csv_status internal_unquoted_weof(FILE *const in)
{
    if (errno == EILSEQ)
        return CSV_CHARSET_ERROR;
    if (errno != 0)
        return CSV_READ_ERROR;
    if (ferror(in)) {
        errno = EIO;
        return CSV_READ_ERROR;
    }
    errno = 0;
    return CSV_END;
}

/* Skip the rest of an unquoted field. 'wc' is the first character of the
 * field, already read by the caller (with errno cleared beforehand);
 * it may also be WEOF, a comma, or a newline.
 *
 * Returns CSV_OK if the field ended at a comma (which is consumed),
 * CSV_END if it ended at a newline (pushed back) or at end of input,
 * or CSV_CHARSET_ERROR / CSV_READ_ERROR if reading fails.
 */
static csv_status internal_skip_unquoted(FILE *const in, wint_t wc)
{
    for (;;) {
        if (wc == WEOF)
            return internal_unquoted_weof(in);

        if (wc == L',') {
            errno = 0;
            return CSV_OK;
        }

        if (wc == L'\r' || wc == L'\n') {
            ungetwc(wc, in);
            errno = 0;
            return CSV_END;
        }

#ifdef BACKSLASH_ESCAPES
        if (wc == L'\\') {
            /* Consume the escaped character so that an escaped comma
             * or newline does not end the field. */
            errno = 0;
            if (fgetwc(in) == WEOF)
                return internal_unquoted_weof(in);
        }
#endif

        /* Ordinary field character: move on to the next one. */
        errno = 0;
        wc = fgetwc(in);
    }
}
csv_status csv_next_record(FILE *const in)
{
while (1) {
wint_t wc;
csv_status status;
do {
errno = 0;
wc = fgetwc(in);
} while (wc != WEOF && wc != L'\n' && wc != L'\r' && iswspace(wc));
if (wc == WEOF) {
if (errno == EILSEQ)
return CSV_CHARSET_ERROR;
if (errno)
return CSV_READ_ERROR;
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
errno = 0;
return CSV_END;
}
if (wc == L'\n' || wc == L'\r') {
wint_t next_wc;
errno = 0;
next_wc = fgetwc(in);
if (next_wc == WEOF) {
if (errno == EILSEQ)
return CSV_CHARSET_ERROR;
if (errno)
return CSV_READ_ERROR;
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
errno = 0;
return CSV_END;
}
if ((wc == L'\n' && next_wc == L'\r') ||
(wc == L'\r' && next_wc == L'\n')) {
errno = 0;
return CSV_OK;
}
ungetwc(next_wc, in);
errno = 0;
return CSV_OK;
}
if (wc == L'"')
status = internal_skip_quoted(in);
else
status = internal_skip_unquoted(in, wc);
if (status < 0)
return status;
}
}
csv_status csv_skip_field(FILE *const in)
{
wint_t wc;
if (!in) {
errno = EINVAL;
return CSV_INVALID_PARAMETERS;
} else
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
/* Skip leading whitespace. */
do {
errno = 0;
wc = fgetwc(in);
} while (wc != WEOF && wc != L'\n' && wc != L'\r' && iswspace(wc));
if (wc == L'"')
return internal_skip_quoted(in);
else
return internal_skip_unquoted(in, wc);
}
/* Read the next field of the current record from the wide stream 'in'.
 *
 * Whitespace before the field (other than CR and LF) is skipped.
 * A field that starts with a doublequote extends to the closing doublequote;
 * inside it, "" stands for one doublequote, and commas and newlines are data.
 * Whitespace between the closing doublequote and the following comma,
 * newline, or end of input is skipped. Any other field extends to the next
 * comma, CR, LF, or end of input, and is stored as-is (trailing whitespace
 * is kept). The terminating comma is consumed; a terminating CR or LF is
 * pushed back to the stream.
 *
 * Returns:
 *   CSV_OK                  A field was read. *dataptr points to its
 *                           L'\0'-terminated contents, *sizeptr is the
 *                           allocated size in wide characters, and, if
 *                           lengthptr is not NULL, *lengthptr is the length.
 *   CSV_END                 No more fields in this record (the next
 *                           non-blank character is a newline, or the input
 *                           has ended).
 *   CSV_INVALID_PARAMETERS  in, dataptr, or sizeptr is NULL (errno EINVAL).
 *   CSV_FORMAT_ERROR        Input ended inside a quoted field, or the
 *                           closing doublequote was followed by something
 *                           other than whitespace, comma, or newline.
 *   CSV_CHARSET_ERROR       fgetwc() failed with EILSEQ.
 *   CSV_READ_ERROR          Any other read error.
 *   CSV_OUT_OF_MEMORY       The buffer could not be grown (errno ENOMEM);
 *                           *dataptr still refers to the previous buffer.
 *
 * errno is 0 after CSV_OK and CSV_END. If lengthptr is not NULL,
 * *lengthptr is 0 after any return other than CSV_OK.
 */
csv_status csv_next_field(FILE *const in, wchar_t **const dataptr,
size_t *const sizeptr,
size_t *const lengthptr)
{
wchar_t *data;
size_t size;
size_t used = 0; /* length */
wint_t wc;
/* Report length 0 unless a field is successfully read. */
if (lengthptr)
*lengthptr = 0;
if (!in || !dataptr || !sizeptr) {
errno = EINVAL;
return CSV_INVALID_PARAMETERS;
} else
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
/* Reuse the caller's buffer if there is one; otherwise start with none. */
if (*dataptr) {
data = *dataptr;
size = *sizeptr;
} else {
data = NULL;
size = 0;
*sizeptr = 0;
}
/* Skip leading whitespace. */
/* CR and LF are not skipped here: they end the record. errno is cleared
 * before each read so that an error can be told apart from end of input. */
do {
errno = 0;
wc = fgetwc(in);
} while (wc != WEOF && wc != L'\n' && wc != L'\r' && iswspace(wc));
/* Nothing left to read: either an error, or no more fields. */
if (wc == WEOF) {
if (errno == EILSEQ)
return CSV_CHARSET_ERROR;
if (errno)
return CSV_READ_ERROR;
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
errno = 0;
return CSV_END;
}
/* End of record: push the newline back so csv_next_record() sees it. */
if (wc == L'\n' || wc == L'\r') {
ungetwc(wc, in);
errno = 0;
return CSV_END;
}
/* Quoted field: the opening doublequote is not part of the contents. */
if (wc == L'"')
while (1) {
errno = 0;
wc = getwc(in);
if (wc == WEOF) {
if (errno == EILSEQ)
return CSV_CHARSET_ERROR;
if (errno)
return CSV_READ_ERROR;
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
/* Input ended before the closing doublequote. */
errno = 0;
return CSV_FORMAT_ERROR;
} else
if (wc == L'"') {
/* Either the first half of an escaped "" pair, or the closing quote;
 * the next character tells which. */
errno = 0;
wc = getwc(in);
if (wc != L'"') {
/* Not an escaped doublequote. */
/* Skip blanks after the closing quote, then require a field or
 * record terminator. */
while (wc != WEOF && wc != L'\n' && wc != L'\r' && iswspace(wc)) {
errno = 0;
wc = getwc(in);
}
if (wc == WEOF) {
if (errno == EILSEQ)
return CSV_CHARSET_ERROR;
if (errno)
return CSV_READ_ERROR;
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
} else
if (wc == L'\n' || wc == L'\r') {
ungetwc(wc, in);
} else
if (wc != L',') {
errno = 0;
return CSV_FORMAT_ERROR;
}
/* Field complete (ended by comma, newline, or end of input). */
break;
}
/* Escaped pair: fall through and store a single doublequote. */
#ifdef BACKSLASH_ESCAPES
} else
if (wc == L'\\') {
errno = 0;
wc = getwc(in);
/* NOTE(review): a backslash followed by a NUL character is dropped
 * entirely (neither is stored) - confirm this is intended. */
if (wc == L'\0')
continue;
else
if (wc == WEOF) {
if (errno == EILSEQ)
return CSV_CHARSET_ERROR;
if (errno)
return CSV_READ_ERROR;
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
/* NOTE(review): end of input right after a backslash ends the field and
 * it is returned with CSV_OK, although the closing doublequote is missing;
 * without a backslash the same situation yields CSV_FORMAT_ERROR. */
break;
} else
/* Translate the escape; for an unknown escape, keep the backslash
 * and push the following character back to be read normally. */
switch (wc) {
case L'a': wc = L'\a'; break;
case L'b': wc = L'\b'; break;
case L't': wc = L'\t'; break;
case L'n': wc = L'\n'; break;
case L'v': wc = L'\v'; break;
case L'f': wc = L'\f'; break;
case L'r': wc = L'\r'; break;
case L'\\': wc = L'\\'; break;
case L'"': wc = L'"'; break;
case L',': wc = L','; break;
default:
ungetwc(wc, in);
wc = L'\\';
}
#endif
}
/* Store wc, growing the buffer first if needed. */
if (used + 2 > size) {
/* Allocation policy.
* Anything that yields size >= used + 2 is acceptable.
* This one allocates in roughly 1024 byte chunks,
* and is known to be robust (but not optimal) in practice. */
size = (used | 1023) + 1009;
data = realloc(data, size * sizeof data[0]);
if (!data) {
/* *dataptr was not touched, so the caller still owns the old buffer. */
errno = ENOMEM;
return CSV_OUT_OF_MEMORY;
}
*dataptr = data;
*sizeptr = size;
}
data[used++] = wc;
}
else
/* Unquoted field: wc already holds its first character. */
while (1) {
/* A comma ends the field and is consumed. */
if (wc == L',')
break;
/* A newline ends the field and the record; leave it in the stream. */
if (wc == L'\n' || wc == L'\r') {
ungetwc(wc, in);
break;
}
#ifdef BACKSLASH_ESCAPES
if (wc == L'\\') {
errno = 0;
wc = fgetwc(in);
if (wc == WEOF) {
if (errno == EILSEQ)
return CSV_CHARSET_ERROR;
if (errno)
return CSV_READ_ERROR;
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
/* Input ended after the backslash: store the backslash itself. */
wc = L'\\';
} else
/* Translate the escape; for an unknown escape, keep the backslash
 * and push the following character back to be read normally. */
switch (wc) {
case L'a': wc = L'\a'; break;
case L'b': wc = L'\b'; break;
case L't': wc = L'\t'; break;
case L'n': wc = L'\n'; break;
case L'v': wc = L'\v'; break;
case L'f': wc = L'\f'; break;
case L'r': wc = L'\r'; break;
case L'"': wc = L'"'; break;
case L',': wc = L','; break;
case L'\\': wc = L'\\'; break;
default:
ungetwc(wc, in);
wc = L'\\';
}
}
#endif
/* Store wc, growing the buffer first if needed. */
if (used + 2 > size) {
/* Allocation policy.
* Anything that yields size >= used + 2 is acceptable.
* This one allocates in roughly 1024 byte chunks,
* and is known to be robust (but not optimal) in practice. */
size = (used | 1023) + 1009;
data = realloc(data, size * sizeof data[0]);
if (!data) {
/* *dataptr was not touched, so the caller still owns the old buffer. */
errno = ENOMEM;
return CSV_OUT_OF_MEMORY;
}
*dataptr = data;
*sizeptr = size;
}
data[used++] = wc;
/* Read the next character of the field. */
errno = 0;
wc = getwc(in);
if (wc == WEOF) {
if (errno == EILSEQ)
return CSV_CHARSET_ERROR;
if (errno)
return CSV_READ_ERROR;
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
/* End of input also ends the field. */
break;
}
}
/* Ensure there is room for the end-of-string mark. */
/* (This also allocates the buffer for an empty field when none exists yet.) */
if (used >= size) {
size = used + 1;
data = realloc(data, size * sizeof data[0]);
if (!data) {
errno = ENOMEM;
return CSV_OUT_OF_MEMORY;
}
*dataptr = data;
*sizeptr = size;
}
data[used] = L'\0';
if (lengthptr)
*lengthptr = used;
errno = 0;
return CSV_OK;
}
/* Helper function: print a wide string as if in quotes, but backslash-escape
 * special characters.
 *
 * Writes the first 'len' wide characters of 'ws' to the wide stream 'out'.
 * NUL, \a, \b, \t, \n, \v, \f, \r, the doublequote, and the backslash are
 * written as two-character C escapes. Other printable characters (per
 * iswprint()) are written as-is. Everything else is written in hexadecimal,
 * as \xNNNN for values up to 0xFFFF and \xNNNNNNNN for larger ones.
 *
 * Does nothing if 'out' or 'ws' is NULL. The surrounding quotes themselves
 * are not printed; that is left to the caller.
 */
static void wquoted(FILE *const out, const wchar_t *ws, const size_t len)
{
    size_t i;

    if (!out || !ws)
        return;

    for (i = 0; i < len; i++) {
        const wchar_t wc = ws[i];
        const wchar_t *escape = NULL;

        switch (wc) {
        case L'\0': escape = L"\\0";  break;
        case L'\a': escape = L"\\a";  break;
        case L'\b': escape = L"\\b";  break;
        case L'\t': escape = L"\\t";  break;
        case L'\n': escape = L"\\n";  break;
        case L'\v': escape = L"\\v";  break;
        case L'\f': escape = L"\\f";  break;
        case L'\r': escape = L"\\r";  break;
        case L'"':  escape = L"\\\""; break;
        case L'\\': escape = L"\\\\"; break;
        default:                      break;
        }

        if (escape)
            fputws(escape, out);
        else if (iswprint(wc))
            fputwc(wc, out);
        else if (wc <= 0xFFFF)
            fwprintf(out, L"\\x%04x", (unsigned int)wc);
        else
            /* The argument is unsigned long, so the conversion needs the
             * 'l' length modifier; a bare %x here is undefined behaviour. */
            fwprintf(out, L"\\x%08lx", (unsigned long)wc);
    }
}
/* Read CSV data from 'in' and print every field to standard output,
 * labelled with its 1-based record and field numbers, escaped contents,
 * and length.
 *
 * 'filename' is used only as a prefix in error messages on standard error.
 * Returns 0 when the input has been read to the end, or -1 after reporting
 * an error.
 */
static int show_csv(FILE *const in, const char *const filename)
{
    wchar_t *text = NULL;       /* Field buffer, reused for every field */
    size_t text_size = 0;
    size_t text_len = 0;
    unsigned long record_no;
    unsigned long field_no;
    csv_status rc;

    for (record_no = 1UL; ; record_no++) {

        /* Fields of this record, until csv_next_field() says CSV_END. */
        for (field_no = 0UL; ; ) {
            rc = csv_next_field(in, &text, &text_size, &text_len);
            if (rc == CSV_END)
                break;
            if (rc < 0)
                goto failed;

            field_no++;
            wprintf(L"Record %lu, field %lu is \"", record_no, field_no);
            wquoted(stdout, text, text_len);
            wprintf(L"\", %lu characters.\n", (unsigned long)text_len);
        }

        /* Move on to the next record, if there is one. */
        rc = csv_next_record(in);
        if (rc == CSV_END) {
            free(text);
            return 0;
        }
        if (rc < 0)
            goto failed;
    }

failed:
    fprintf(stderr, "%s: %s.\n", filename, csv_error(rc));
    free(text);
    return -1;
}
/* Print the command-line help to standard error.
 * 'argv0' is the program name shown in the usage lines.
 * Always returns EXIT_SUCCESS, so callers can 'return usage(...)' from main().
 */
static int usage(const char *argv0)
{
    FILE *const err = stderr;

    fputs("\n", err);
    fprintf(err, "Usage: %s [ -h | --help | /? ]\n", argv0);
    fprintf(err, " %s CSV-FILE [ ... ]\n", argv0);
    fputs("\n", err);
    fputs("Use special file name '-' to read from standard input.\n", err);
    fputs("\n", err);

    return EXIT_SUCCESS;
}
/* Example program: show the contents of each CSV file named on the command
 * line, field by field. The special file name '-' reads standard input.
 *
 * Prints the usage text (and returns EXIT_SUCCESS) when run without
 * arguments or with -h, --help, or /?. Returns EXIT_FAILURE as soon as
 * a file cannot be opened, parsed, read, or closed.
 */
int main(int argc, char *argv[])
{
    FILE *in;
    int arg;

    setlocale(LC_ALL, "");

    /* Standard input and output are used with wide characters. */
    fwide(stdin, 1);
    fwide(stdout, 1);

    /* No file names at all: show the help instead of silently doing
     * nothing. argv[0] may be missing when argc == 0. */
    if (argc < 2)
        return usage((argc > 0 && argv[0] != NULL) ? argv[0] : "(program)");

    for (arg = 1; arg < argc; arg++) {
        if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "--help") || !strcmp(argv[arg], "/?"))
            return usage(argv[0]);

        if (!strcmp(argv[arg], "-")) {
            if (show_csv(stdin, "(standard input)"))
                return EXIT_FAILURE;
        } else {
            in = fopen(argv[arg], "r");
            if (!in) {
                fprintf(stderr, "%s: %s.\n", argv[arg], strerror(errno));
                return EXIT_FAILURE;
            }

            /* show_csv() has already reported the problem;
             * just release the stream before bailing out. */
            if (show_csv(in, argv[arg])) {
                fclose(in);
                return EXIT_FAILURE;
            }

            if (ferror(in)) {
                fprintf(stderr, "%s: %s.\n", argv[arg], strerror(EIO));
                fclose(in);
                return EXIT_FAILURE;
            }

            if (fclose(in)) {
                fprintf(stderr, "%s: %s.\n", argv[arg], strerror(EIO));
                return EXIT_FAILURE;
            }
        }
    }

    return EXIT_SUCCESS;
}
The use of the above csv_next_field(), csv_skip_field(), and csv_next_record() is quite straightforward.
Open the CSV file normally, then call fwide(stream, 1) on it to tell the C library you intend to use the wide string variants instead of the standard narrow string I/O functions.
Create four variables, and initialize the first two:
wchar_t *field = NULL;
size_t allocated = 0;
size_t length;
csv_status status;
field is a pointer to the dynamically allocated contents of each field you read. It is allocated automatically; essentially, you don't need to worry about it at all. allocated holds the currently allocated size (in wide characters, including terminating L'\0'), and we'll use length and status later.
At this point, you are ready to read or skip the first field in the first record.
You do not wish to call csv_next_record() at this point, unless you wish to skip the very first record entirely in the file.
Call status = csv_skip_field(stream); to skip the next field, or status = csv_next_field(stream, &field, &allocated, &length); to read it.
If status == CSV_OK, you have the field contents in the wide string field. It contains length wide characters.
If status == CSV_END, there were no more fields in the current record. (The field is unchanged, and you should not examine it.)
Otherwise, status < 0, and it describes an error code. You can use csv_error(status) to obtain a (narrow) string describing it.
At any point, you can move (skip) to the start of the next record by calling status = csv_next_record(stream);.
If it returns CSV_OK, there might be a new record available. (We only know when you try to read or skip the first field. This is similar to how standard C library function feof() only tells you whether you have tried to read past the end of input, it does not tell whether there is more data available or not.)
If it returns CSV_END, you already have processed the last record, and there are no more records.
Otherwise, it returns a negative error code, status < 0. You can use csv_error(status) to obtain a (narrow) string describing it.
After you are done, discard the field buffer:
free(field);
field = NULL;
allocated = 0;
You do not actually need to reset the variables to NULL and zero, but I recommend it. In fact, you can do the above at any point (when you are no longer interested in the contents of the current field), as the csv_next_field() will then automatically allocate a new buffer as necessary.
Note that free(NULL); is always safe and does nothing. You do not need to check if field is NULL or not before freeing it. This is also the reason why I recommend initializing the variables immediately when you declare them. It just makes everything so much easier to handle.
The compiled example program takes one or more CSV file names as command-line parameters, then reads the files and reports the contents of each field in the file. If you have a particularly fiendishly complex CSV file, this is optimal for checking if this approach reads all the fields correctly.

Resources