Polish characters in C using files and lists - c

I need to get proper Polish characters "ąężźćśół". I used some solutions like setlocale, system chcp, wchar_t. Everything goes well as long as I don't use files/lists. wscanf, wprintf and wchar_t works perfectly.
But if I'm trying to read something from a file and save that into a list (even in array), then trying to put that to the screen, I can't get proper Polish characters, and in case of the lists, I'm getting different results from time to time for example, z` , A2 , like random characters from nowhere. I've been trying to get good results by using fscanf and fgets with w(wide) variations, but it doesn't work. Did I something wrong?
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <locale.h>
struct dyk{
wchar_t line[200];
struct dyk *next;
};
typedef struct dyk dyk;
void printdyk(char name[100]){
dyk *wyp;
wyp = malloc(sizeof(dyk));
wchar_t yt[100];
FILE *dyktando;
dyktando = fopen(name, "r+");
if(dyktando == NULL){
wprintf(L"Błąd otwarcia pliku!\n"); //Can't open file
}else{
fgets(&wyp->line, sizeof(dyk), dyktando); //reading from file and send to the list
wprintf(L"%s\n", wyp->line); //write text from the list on the screen
wchar_t yt[100];
wscanf(L"%s", &yt); //testing strings comparing, so I have to put some variables
int n=strcmp(yt, wyp->line); //str compare
printf("%d", n); //result, it gives me -1 every time
}
fclose(dyktando);
}
I tested function with txt file that contents only one character "ż". Can't read from file properly. At the start of main function I put these 2 lines:
system("chcp 852");
setlocale(LC_ALL, ".852");
I'm using codeblock, mingw32-gcc compiler, and no flags.

You are not using wchar_t compatible functions everywhere in your code. In particular:
fgets(&wyp->line, sizeof(dyk), dyktando); //reading from file and send to the list
The wchar_t compatible version is fgetws. Also, wyp->line (without the & operator) is the correct argument.
int n=strcmp(yt, wyp->line); //str compare
wcscmp should be used instead.
Also note that sizeof on a wchar_t array is not correct when a function expects length in characters rather than bytes (like fgetws does).

A comment OP (Amatheon) made indicates that the true underlying problem is how to properly read files using wide-character functions.
To ensure maximum compatibility and portability, let's restrict to C99. Consider the following example program:
#include <stdlib.h>
#include <locale.h>
#include <string.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
#include <errno.h>
#ifdef USE_ERRNO_CONSTANTS
#define SET_ERRNO(value) (errno = (value))
#else
#define SET_ERRNO(value)
#endif
ssize_t get_wide_delimited(wchar_t **lineptr, size_t *sizeptr, wint_t delim, FILE *stream)
{
wchar_t *line = NULL;
size_t size = 0;
size_t used = 0;
wint_t wc;
if (!lineptr || !sizeptr || !stream) {
/* Invalid function parameters. NULL pointers are not allowed. */
SET_ERRNO(EINVAL);
return -1;
}
if (ferror(stream)) {
/* Stream is already in error state. */
SET_ERRNO(EIO);
return -1;
}
if (*sizeptr > 0) {
line = *lineptr;
size = *sizeptr;
} else {
*lineptr = NULL;
}
while (1) {
wc = fgetwc(stream);
if (wc == WEOF || wc == delim)
break;
if (used + 1 > size) {
/* Growth policy. We wish to allocate a chunk of memory at once,
so we don't need to do realloc() too often as it is a bit slow,
relatively speaking. On the other hand, we don't want to do
too large allocations, because that would waste memory.
Anything that makes 'size' larger than 'used' will work.
*/
if (used < 254)
size = 256;
else
if (used < 65536)
size = 2 * used;
else
size = (used | 65535) + 65521;
line = realloc(line, size * sizeof (wchar_t));
if (!line) {
/* Out of memory. */
SET_ERRNO(ENOMEM);
return -1;
}
*lineptr = line;
*sizeptr = size;
}
line[used++] = wc;
}
if (wc == WEOF) {
/* Verify that the WEOF did not indicate a read error. */
if (ferror(stream)) {
/* Read error. */
SET_ERRNO(EIO);
return -1;
}
}
/* Ensure there is enough room for the delimiter and end-of-string mark. */
if (used + 2 > size) {
/* We could reuse the reallocation policy here,
with the exception that the minimum is used + 2, not used + 1.
For simplicity, we use the minimum reallocation instead.
*/
size = used + 2;
line = realloc(line, size * sizeof (wchar_t));
if (!line) {
/* Out of memory. */
SET_ERRNO(ENOMEM);
return -1;
}
*lineptr = line;
*sizeptr = size;
}
/* Append the delimiter, unless end-of-stream mark. */
if (wc != WEOF)
line[used++] = wc;
/* Append the end-of-string nul wide char,
but do not include it in the returned length. */
line[used] = L'\0';
/* Success! */
return (ssize_t)used;
}
ssize_t get_wide_line(wchar_t **lineptr, size_t *sizeptr, FILE *stream)
{
return get_wide_delimited(lineptr, sizeptr, L'\n', stream);
}
int main(int argc, char *argv[])
{
wchar_t *line = NULL, *p;
size_t size = 0;
unsigned long linenum;
FILE *in;
int arg;
if (!setlocale(LC_ALL, ""))
fprintf(stderr, "Warning: Your C library does not support your current locale.\n");
if (fwide(stdout, 1) < 1)
fprintf(stderr, "Warning: Your C library does not support wide standard output.\n");
if (argc < 2 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
fprintf(stderr, " %s FILENAME [ FILENAME ... ]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "This program will output the named files, using wide I/O.\n");
fprintf(stderr, "\n");
return EXIT_FAILURE;
}
for (arg = 1; arg < argc; arg++) {
in = fopen(argv[arg], "r");
if (!in) {
fprintf(stderr, "%s: %s.\n", argv[arg], strerror(errno));
return EXIT_FAILURE;
}
if (fwide(in, 1) < 1) {
fprintf(stderr, "%s: Wide input is not supported from this file.\n", argv[arg]);
fclose(in);
return EXIT_FAILURE;
}
linenum = 0;
while (get_wide_line(&line, &size, in) > 0) {
linenum++;
/* We use another pointer to the line for simplicity.
We must not modify 'line' (except via 'free(line); line=NULL; size=0;'
or a similar reallocation), because it points to dynamically allocated buffer. */
p = line;
/* Remove leading whitespace. */
while (iswspace(*p))
p++;
/* Trim off the line at the first occurrence of newline or carriage return.
(The line will also end at the first embedded nul wide character, L'\0',
if the file contains any.) */
p[wcscspn(p, L"\r\n")] = L'\0';
wprintf(L"%s: Line %lu: '%ls', %zu characters.\n", argv[arg], linenum, p, wcslen(p));
}
if (ferror(in)) {
fprintf(stderr, "%s: Read error.\n", argv[arg]);
fclose(in);
return EXIT_FAILURE;
}
if (fclose(in)) {
fprintf(stderr, "%s: Delayed read error.\n", argv[arg]);
return EXIT_FAILURE;
}
wprintf(L"%s: Total %lu lines read.\n", argv[arg], linenum);
fflush(stdout);
}
free(line);
line = NULL;
size = 0;
return EXIT_SUCCESS;
}
Because the EINVAL, EIO, and ENOMEM errno constants are not defined in the C standards, the get_wide_line() and get_wide_delimited() only set errno if you define the USE_ERRNO_CONSTANTS preprocessor value.
The get_wide_line() and get_wide_delimited() are reimplementations of the getwline() and getwdelim() functions from ISO/IEC TR 24731-2:2010; the wide-character equivalents of the POSIX.1 getline() and getdelim() functions. Unlike fgets() or fgetws(), these use a dynamically allocated buffer to hold the line, so there is no fixed line length limits, other than available memory.
I've explicitly marked the code to be under Creative Commons Zero license: No Rights Reserved. It means you can use it in your own code, under whatever license you want.
Note: I would really love users to push their vendors and C standard committee members to get these included in the bog-standard C library part in the next version of the C standard. As you can see from above, they can be implemented in standard C already; it is just that the C library itself can do the same much more efficiently. The GNU C library is a perfect example of that (although even they are stalling with the implementation, because lack of standardization). Just think how many buffer overflow bugs would be avoided if people used getline()/getdelim()/getwline()/getwdelim() instead of fgets()/fgetws()! And avoid having to think about what the maximum reasonable line length in each instance would be to, too. Win-win!
(In fact, we could switch the return type to size_t, and use 0 instead of -1 as the error indicator. That would limit the changes to the text of the C standard to the addition of the four functions. It saddens and irritates me to no end, to have such a significant group of trivial functions so callously and ignorantly overlooked, for no sensible reason. Please, bug your vendors and any C standards committee members you have access to about this, as incessantly and relentlessly as you can manage. Both you and they deserve it.)
The essential parts of the program are
if (!setlocale(LC_ALL, ""))
This tells the C library to use the locale the user has specified.
Please, do not hardcode the locale value into your programs. In most operating systems, all you need to do is to change the LANG or LC_ALL environment variable to the locale you want to use, before running your program.
You might think that "well, I can hardcode it this time, because this is the locale used for this data", but even that can be a mistake, because new locales can be created at any time. This is particularly annoying when the character set part is hardcoded. For example, the ISO 8859 single-byte character set used in Western Europe is ISO 8859-15, not ISO 8859-1, because ISO 8859-15 has the € character in it, whereas ISO 8859-1 does not. If you have hardcoded ISO 8859-1 in your program, then it cannot correctly handle the € character at all.
if (fwide(stream, 1) < 1) for both stdout and file handles
While the C library does internally do an equivalent of the fwide() call based on which type of I/O function you use on the file handle the very first time, the explicit check is much better.
In particular, if the C library cannot support wide I/O to the file or stream represented by the handle, fwide() will return negative. (Unless the second parameter is also zero, it should never return zero; because of the issues in standardization, I recommend a strict return value check approach in this case, to catch vendors who decide to try to make life as difficult as possible for programmers trying to write portable code while technically still fulfilling the standard text, like Microsoft is doing. They even stuffed the C standard committee with their own representatives, so they could tweak C11 away from C99 features they didn't want to support, plus get a stamp of approval of their own nonstandard extensions nobody used before, to help create barriers for developers writing portable C code. Yeah, I don't trust their behaviour at all.)
ssize_t len = get_wide_line(&line, &size, handle);
If you initialize wchar_t *line = NULL; and size_t size = 0; prior to first call to get_wide_line() or get_wide_delimited(), the function will dynamically resize the buffer as needed.
The return value is negative if and only if an error occurs. (The functions should never return zero.)
When a line is read successfully, the return value reflects the number of wide characters in the buffer, including the delimiter (newline, L'\n' for get_wide_delimited()), and is always positive (greater than zero). The contents in the buffer will have a terminating end-of-wide-string character, L'\0', but it is not counted in the return value.
Note that when the delimiter is not L'\0', the buffer may contain embedded wide nul characters, L'\0'. In that case, len > wcslen(line).
The above example programs skips any leading whitespace on each input line, and trims off the line at the first linefeed (L'\n'), carriage return (L'\r'), or nul (L'\0'). Because of this, the return value len is only checked for success (a positive return value greater than zero).
free(line); line = NULL; size = 0;
It is okay to discard the line at any point its contents are no longer needed. I recommend explicitly setting the line pointer to NULL, and the size to zero, to avoid use-after-free bugs. Furthermore, that allows any following get_wide_line() or get_wide_delimited() to correctly dynamically allocate a new buffer.
ferror(handle) after a wide input function fails
Just like with narrow streams and EOF, there are two cases why wide input functions might return WEOF (or return -1, depending on the function): because there is no more input, or because a read error occurred.
There is no reason whatsoever to write computer programs that ignore read or write errors, without reporting them to the user. Sure, they are rare, but not so rare that a programmer can sanely expect them to never occur. (In fact, with Flash memory on flimsy circuits stored in weak plastic housings and subjected to human-sized stresses (I've sat on mine time and time again), the errors aren't that rare.) It is just evil, rather similar to food preparers being too lazy to wash their hands, causing fecal bacteria outbreaks every now and then. Don't be a fecal bacteria spreader equivalent programmer.
Let's say you have a harebrained lecturer who does not allow you to use the above get_wide_line() or get_wide_delimited() functions.
Don't worry. We can implement the same program using fgetws(), if we restrict line to some fixed upper limit (of wide characters). Lines longer than that will read as two or more lines instead:
#include <stdlib.h>
#include <locale.h>
#include <string.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
#include <errno.h>
#ifndef MAX_WIDE_LINE_LEN
#define MAX_WIDE_LINE_LEN 1023
#endif
int main(int argc, char *argv[])
{
wchar_t line[MAX_WIDE_LINE_LEN + 1], *p;
unsigned long linenum;
FILE *in;
int arg;
if (!setlocale(LC_ALL, ""))
fprintf(stderr, "Warning: Your C library does not support your current locale.\n");
if (fwide(stdout, 1) < 1)
fprintf(stderr, "Warning: Your C library does not support wide standard output.\n");
if (argc < 2 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
fprintf(stderr, " %s FILENAME [ FILENAME ... ]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "This program will output the named files, using wide I/O.\n");
fprintf(stderr, "\n");
return EXIT_FAILURE;
}
for (arg = 1; arg < argc; arg++) {
in = fopen(argv[arg], "r");
if (!in) {
fprintf(stderr, "%s: %s.\n", argv[arg], strerror(errno));
return EXIT_FAILURE;
}
if (fwide(in, 1) < 1) {
fprintf(stderr, "%s: Wide input is not supported from this file.\n", argv[arg]);
fclose(in);
return EXIT_FAILURE;
}
linenum = 0;
while (1) {
/* If line is an array, (sizeof line / sizeof line[0]) evaluates to
the number of elements in it. This does not work if line is a pointer
to dynamically allocated memory. In that case, you need to remember
number of wide characters you allocated for in a separate variable,
and use that variable here instead. */
p = fgetws(line, sizeof line / sizeof line[0], in);
if (!p)
break;
/* Have a new line. */
linenum++;
/* Remove leading whitespace. */
while (iswspace(*p))
p++;
/* Trim off the line at the first occurrence of newline or carriage return.
(The line will also end at the first embedded nul wide character, L'\0',
if the file contains any.) */
p[wcscspn(p, L"\r\n")] = L'\0';
wprintf(L"%s: Line %lu: '%ls', %zu characters.\n", argv[arg], linenum, p, wcslen(p));
}
if (ferror(in)) {
fprintf(stderr, "%s: Read error.\n", argv[arg]);
fclose(in);
return EXIT_FAILURE;
}
if (fclose(in)) {
fprintf(stderr, "%s: Delayed read error.\n", argv[arg]);
return EXIT_FAILURE;
}
wprintf(L"%s: Total %lu lines read.\n", argv[arg], linenum);
fflush(stdout);
}
return EXIT_SUCCESS;
}
Aside from the function used to read each line, the difference is that instead of keeping the while loop condition as while ((p = fgetws(line, ...))) { ... }, I changed to the while (1) { p = fgetws(line, ...); if (!p) break; ... form that I believe is more readable.
I did deliberately show the longer, more complicated-looking one first, and this simpler one last, in the hopes that you would see that the more complicated-looking one actually has the simpler main() -- if we don't just count lines of code or something equally silly, but look at how many opportunities for mistakes there are.
As OP themselves wrote in a comment, the size of the buffer passed to fgets() or fgetws() is a real issue. There are rules of thumb, but they all suffer from being fragile against edits (especially the differences between arrays and pointers). With getline()/getdelim()/getwline()/getwdelim()/get_wide_line()/get_wide_delimited(), the rule of thumb is wchar_t *line = NULL; size_t size = 0; ssize_t len; and len = get_wide_line(&line, &size, handle);. No variations, and simple to remember and use. Plus it gets rid of any fixed limitations.

Related

fgets statement reads first line and not sure how to modify because I have to return a pointer [duplicate]

I need to copy the contents of a text file to a dynamically-allocated character array.
My problem is getting the size of the contents of the file; Google reveals that I need to use fseek and ftell, but for that the file apparently needs to be opened in binary mode, and that gives only garbage.
EDIT: I tried opening in text mode, but I get weird numbers. Here's the code (I've omitted simple error checking for clarity):
long f_size;
char* code;
size_t code_s, result;
FILE* fp = fopen(argv[0], "r");
fseek(fp, 0, SEEK_END);
f_size = ftell(fp); /* This returns 29696, but file is 85 bytes */
fseek(fp, 0, SEEK_SET);
code_s = sizeof(char) * f_size;
code = malloc(code_s);
result = fread(code, 1, f_size, fp); /* This returns 1045, it should be the same as f_size */
The root of the problem is here:
FILE* fp = fopen(argv[0], "r");
argv[0] is your executable program, NOT the parameter. It certainly won't be a text file. Try argv[1], and see what happens then.
You cannot determine the size of a file in characters without reading the data, unless you're using a fixed-width encoding.
For example, a file in UTF-8 which is 8 bytes long could be anything from 2 to 8 characters in length.
That's not a limitation of the file APIs, it's a natural limitation of there not being a direct mapping from "size of binary data" to "number of characters."
If you have a fixed-width encoding then you can just divide the size of the file in bytes by the number of bytes per character. ASCII is the most obvious example of this, but if your file is encoded in UTF-16 and you happen to be on a system which treats UTF-16 code points as the "native" internal character type (which includes Java, .NET and Windows) then you can predict the number of "characters" to allocate as if UTF-16 were fixed width. (UTF-16 is variable width due to Unicode characters above U+FFFF being encoded in multiple code points, but a lot of the time developers ignore this.)
I'm pretty sure argv[0] won't be an text file.
Give this a try (haven't compiled this, but I've done this a bazillion times, so I'm pretty sure it's at least close):
char* readFile(char* filename)
{
FILE* file = fopen(filename,"r");
if(file == NULL)
{
return NULL;
}
fseek(file, 0, SEEK_END);
long int size = ftell(file);
rewind(file);
char* content = calloc(size + 1, 1);
fread(content,1,size,file);
return content;
}
If you're developing for Linux (or other Unix-like operating systems), you can retrieve the file-size with stat before opening the file:
#include <stdio.h>
#include <sys/stat.h>
int main() {
struct stat file_stat;
if(stat("main.c", &file_stat) != 0) {
perror("could not stat");
return (1);
}
printf("%d\n", (int) file_stat.st_size);
return (0);
}
EDIT: As I see the code, I have to get into the line with the other posters:
The array that takes the arguments from the program-call is constructed this way:
[0] name of the program itself
[1] first argument given
[2] second argument given
[n] n-th argument given
You should also check argc before trying to use a field other than '0' of the argv-array:
if (argc < 2) {
printf ("Usage: %s arg1", argv[0]);
return (1);
}
argv[0] is the path to the executable and thus argv[1] will be the first user submitted input. Try to alter and add some simple error-checking, such as checking if fp == 0 and we might be ble to help you further.
You can open the file, put the cursor at the end of the file, store the offset, and go back to the top of the file, and make the difference.
You can use fseek for text files as well.
fseek to end of file
ftell the offset
fseek back to the begining
and you have size of the file
Kind of hard with no sample code, but fstat (or stat) will tell you how big the file is. You allocate the memory required, and slurp the file in.
Another approach is to read the file a piece at a time and extend your dynamic buffer as needed:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define PAGESIZE 128
int main(int argc, char **argv)
{
char *buf = NULL, *tmp = NULL;
size_t bufSiz = 0;
char inputBuf[PAGESIZE];
FILE *in;
if (argc < 2)
{
printf("Usage: %s filename\n", argv[0]);
return 0;
}
in = fopen(argv[1], "r");
if (in)
{
/**
* Read a page at a time until reaching the end of the file
*/
while (fgets(inputBuf, sizeof inputBuf, in) != NULL)
{
/**
* Extend the dynamic buffer by the length of the string
* in the input buffer
*/
tmp = realloc(buf, bufSiz + strlen(inputBuf) + 1);
if (tmp)
{
/**
* Add to the contents of the dynamic buffer
*/
buf = tmp;
buf[bufSiz] = 0;
strcat(buf, inputBuf);
bufSiz += strlen(inputBuf) + 1;
}
else
{
printf("Unable to extend dynamic buffer: releasing allocated memory\n");
free(buf);
buf = NULL;
break;
}
}
if (feof(in))
printf("Reached the end of input file %s\n", argv[1]);
else if (ferror(in))
printf("Error while reading input file %s\n", argv[1]);
if (buf)
{
printf("File contents:\n%s\n", buf);
printf("Read %lu characters from %s\n",
(unsigned long) strlen(buf), argv[1]);
}
free(buf);
fclose(in);
}
else
{
printf("Unable to open input file %s\n", argv[1]);
}
return 0;
}
There are drawbacks with this approach; for one thing, if there isn't enough memory to hold the file's contents, you won't know it immediately. Also, realloc() is relatively expensive to call, so you don't want to make your page sizes too small.
However, this avoids having to use fstat() or fseek()/ftell() to figure out how big the file is beforehand.

C: Parse a file to obtain number of columns while reading by mmap

I have a file such as the following:
1-3-5 2 1
2 3-4-1 2
4-1 2-41-2 3-4
I want to return the number of columns of this file. I am reading the file with mmap in C. I have been trying to do with strtok(), but failing, so far. This is just a testfile, my original file is in GB scale.
pmap = mmap(0,mystat.st_size,PROT_READ|PROT_WRITE,MAP_PRIVATE,fd,0);
char *start = pmap;
char *token;
token = strtok(start, "\t");
while (token != NULL){
printf("%s \n",token);
token = strtok(NULL, "\t");
col_len++;
}
I have been trying something on these lines, but, obviously there is a logical error. I am getting the following output:
number of cols = 1
Although, the # of cols should be 3.
It'd be great if you guys can help with any idea on how to parse this kind of a file using mmap.
I am using mmap because of faster execution for a single pass over the file.
It is hard to provide a definitive answer without a definitive question; as written, the question does not contain complete code, does not show the precise input, and does not show the debugging output.
But it is possible to provide some suggestions based on the non-applicability of strtok to this problem.
(strtok modifies its first argument, so it is really not a good idea to use it with an mmaped resource. However, that is not directly relevant to the problem you are having.)
You should ensure that the columns in the file are really separated by tabs. It seems to me most likely that the file contains spaces, not tabs, which is why the program reports that the entire file contains one column. If this were the only problem, you could call strtok with the second argument " \t" rather than "\t". But remember that strtok combines successive delimiters into a single separator so if the file is tab-separated and there are empty fields, strtok will not report the empty fields.
Related to the phrase "entire file" above, you do not tell strtok to recognized a newline character as terminating a token. So the strtok loop will try to analyze the entire file, counting the last field of each line as part of the same token as the first field of the next line. That is surely not what you want.
However, strtok overwrites the column delimiter that it finds, so if you did fix the strtok calls to include \n as a delimiter character, you would no longer be able to tell where the lines ended. That is probably important to your code, and it is a key reason why strtok is not an appropriate tool in this case. The Gnu strtok manpage (man strtok, emphasis added) provides a warning about this very issue (in the BUGS section at the end):
Be cautious when using these functions. If you do use them, note that:
These functions modify their first argument.
These functions cannot be used on constant strings.
The identity of the delimiting byte is lost.
There is no guarantee that a file ends with a NUL character. In fact, the file is very unlikely to contain a NUL character, and it is undefined behaviour to reference bytes in the mmap'ed region which are not in the file, but in practice most OSs will mmap an integral number of pages, zero-filling the last page. So 4095 times out of 4096, you will not notice this problem, and the 4096th time when the file is precisely an integral number of pages, your program will crash and burn, along with whatever sensitive equipment it is controlling. This is another reason strtok should never be used on mmaped files.
My comment was actually not correct, as you use MAP_PRIVATE, you don't risk destroying your file. But still, if you modify the memory area, the touched pages are copied, and you probably don't want this overhead, otherwise you could just copy the file to RAM from the beginning. So I'd still say: don't use strtok() here.
A solution with an own loop based on the functions in <ctype.h> is quite simple, though. As I wanted to try it myself, see here a working program to demonstrate it (the relevant part is the countCols() function):
#define _POSIX_C_SOURCE 200112L
#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
int countCols(const char *line, size_t maxlen)
{
int cols = 0;
int incol = 0;
const char *c = line;
while (maxlen && (!isspace(*c) || isblank(*c)))
{
if (isblank(*c))
{
incol = 0;
}
else
{
if (!incol)
{
incol = 1;
++cols;
}
}
++c;
--maxlen;
}
return cols;
}
int main(int argc, char **argv)
{
if (argc != 2)
{
fprintf(stderr, "Usage: %s [file]\n", argv[0]);
return EXIT_FAILURE;
}
struct stat st;
if (stat(argv[1], &st) < 0)
{
fprintf(stderr, "Could not stat `%s': %s\n", argv[1],
strerror(errno));
return EXIT_FAILURE;
}
int dataFd = open(argv[1], O_RDONLY);
if (dataFd < 0)
{
fprintf(stderr, "Could not open `%s': %s\n", argv[1],
strerror(errno));
return EXIT_FAILURE;
}
char *data = mmap(0, st.st_size, PROT_READ, MAP_PRIVATE, dataFd, 0);
if (data == MAP_FAILED)
{
close(dataFd);
fprintf(stderr, "Could not mmap `%s': %s\n", argv[1],
strerror(errno));
return EXIT_FAILURE;
}
int cols = countCols(data, st.st_size);
printf("found %d columns.\n", cols);
munmap(data, st.st_size);
return EXIT_SUCCESS;
}

How multibyte string is converted to wide-character string in fxprintf.c in glibc?

Currently, the logic in glibc source of perror is such:
If stderr is oriented, use it as is, else dup() it and use perror() on dup()'ed fd.
If stderr is wide-oriented, the following logic from stdio-common/fxprintf.c is used:
size_t len = strlen (fmt) + 1;
wchar_t wfmt[len];
for (size_t i = 0; i < len; ++i)
{
assert (isascii (fmt[i]));
wfmt[i] = fmt[i];
}
res = __vfwprintf (fp, wfmt, ap);
The format string is converted to wide-character form by the following code, which I do not understand:
wfmt[i] = fmt[i];
Also, it uses isascii assert:
assert (isascii(fmt[i]));
But the format string is not always ascii in wide-character programs, because we may use UTF-8 format string, which can contain non-7bit value(s).
Why there is no assert warning when we run the following code (assuming UTF-8 locale and UTF-8 compiler encoding)?
#include <stdio.h>
#include <errno.h>
#include <wchar.h>
#include <locale.h>
int main(void)
{
setlocale(LC_CTYPE, "en_US.UTF-8");
fwide(stderr, 1);
errno = EINVAL;
perror("привет мир"); /* note, that the string is multibyte */
return 0;
}
$ ./a.out
привет мир: Invalid argument
Can we use dup() on wide-oriented stderr to make it not wide-oriented? In such case the code could be rewritten without using this mysterious conversion, taking into account the fact that perror() takes only multibyte strings (const char *s) and locale messages are all multibyte anyway.
Turns out we can. The following code demonstrates this:
#include <stdio.h>
#include <wchar.h>
#include <unistd.h>
int main(void)
{
fwide(stdout,1);
FILE *fp;
int fd = -1;
if ((fd = fileno (stdout)) == -1) return 1;
if ((fd = dup (fd)) == -1) return 1;
if ((fp = fdopen (fd, "w+")) == NULL) return 1;
wprintf(L"stdout: %d, dup: %d\n", fwide(stdout, 0), fwide(fp, 0));
return 0;
}
$ ./a.out
stdout: 1, dup: 0
BTW, is it worth posting an issue about this improvement to glibc developers?
NOTE
Using dup() is limited with respect to buffering. I wonder if it is considered in the implementation of perror() in glibc. The following example demonstrates this issue.
The output is done not in the order of writing to the stream, but in the order in which the data in the buffer is written-off.
Note, that the order of values in the output is not the same as in the program, because the output of fprintf is written-off first (because of "\n"), and the output of fwprintf is written off when program exits.
#include <wchar.h>
#include <stdio.h>
#include <unistd.h>
int main(void)
{
wint_t wc = L'b';
fwprintf(stdout, L"%lc", wc);
/* --- */
FILE *fp;
int fd = -1;
if ((fd = fileno (stdout)) == -1) return 1;
if ((fd = dup (fd)) == -1) return 1;
if ((fp = fdopen (fd, "w+")) == NULL) return 1;
char c = 'h';
fprintf(fp, "%c\n", c);
return 0;
}
$ ./a.out
h
b
But if we use \n in fwprintf, the output is the same as in the program:
$ ./a.out
b
h
perror() manages to get away with that, because in GNU libc stderr is unbuffered. But will it work safely in programs where stderr is manually set to buffered mode?
This is the patch that I would propose to glibc developers:
diff -urN glibc-2.24.orig/stdio-common/perror.c glibc-2.24/stdio-common/perror.c
--- glibc-2.24.orig/stdio-common/perror.c 2016-08-02 09:01:36.000000000 +0700
+++ glibc-2.24/stdio-common/perror.c 2016-10-10 16:46:03.814756394 +0700
## -36,7 +36,7 ##
errstring = __strerror_r (errnum, buf, sizeof buf);
- (void) __fxprintf (fp, "%s%s%s\n", s, colon, errstring);
+ (void) _IO_fprintf (fp, "%s%s%s\n", s, colon, errstring);
}
## -55,7 +55,7 ##
of the stream. What is supposed to happen when the stream isn't
oriented yet? In this case we'll create a new stream which is
using the same underlying file descriptor. */
- if (__builtin_expect (_IO_fwide (stderr, 0) != 0, 1)
+ if (__builtin_expect (_IO_fwide (stderr, 0) < 0, 1)
|| (fd = __fileno (stderr)) == -1
|| (fd = __dup (fd)) == -1
|| (fp = fdopen (fd, "w+")) == NULL)
NOTE: It wasn't easy to find concrete questions in this post; on the whole, the post seems to be an attempt to engage in a discussion about implementation details of glibc, which it seems to me would be better directed to a forum specifically oriented to development of that library such as the libc-alpha mailing list. (Or see https://www.gnu.org/software/libc/development.html for other options.) This sort of discussion is not really a good match for StackOverflow, IMHO. Nonetheless, I tried to answer the questions I could find.
How does wfmt[i] = fmt[i]; convert from multibyte to wide character?
Actually, the code is:
assert(isascii(fmt[i]));
wfmt[i] = fmt[i];
which is based on the fact that the numeric value of an ascii character is the same as a wchar_t. Strictly speaking, this need not be the case. The C standard specifies:
Each member of the basic character set shall have a code value equal to its value when used as the lone character in an integer character constant if an implementation does not define __STDC_MB_MIGHT_NEQ_WC__. (§7.19/2)
(gcc does not define that symbol.)
However, that only applies to characters in the basic set, not to all characters recognized by isascii. The basic character set contains the 91 printable ascii characters as well as space, newline, horizontal tab, vertical tab and form feed. So it is theoretically possible that one of the remaining control characters will not be correctly converted. However, the actual format string used in the call to __fxprintf only contains characters from the basic character set, so in practice this pedantic detail is not important.
Why there is no assert warning when we execute perror("привет мир");?
Because only the format string is being converted, and the format string (which is "%s%s%s\n") contains only ascii characters. Since the format string contains %s (and not %ls), the argument is expected to be char* (and not wchar_t*) in both the narrow- and wide-character orientations.
Can we use dup() on wide-oriented stderr to make it not wide-oriented?
That would not be a good idea. First, if the stream has an orientation, it might also have a non-empty internal buffer. Since that buffer is part of the stdio library and not of the underlying Posix fd, it will not be shared with the duplicate fd. So the message printed by perror might be interpolated in the middle of some existing output. In addition, it is possible that the multibyte encoding has shift states, and that the output stream is not currently in the initial shift state. In that case, outputting an ascii sequence could result in garbled output.
In the actual implementation, the dup is only performed on streams without orientation; these streams have never had any output directed at them, so they are definitely still in the initial shift state with an empty buffer (if the stream is buffered).
Is it worth posting an issue about this improvement to glibc developers?
That is up to you, but don't do it here. The normal way of doing that would be to file a bug. There is no reason to believe that glibc developers read SO questions, and even if they do, someone would have to copy the issue to a bug, and also copy any proposed patch.
it uses isascii assert.
This is OK. You are not supposed to call this function. It is a glibc internal. Note the two underscores in front of the name. When called from perror, the argument in question is "%s%s%s\n", which is entirely ASCII.
But the format string is not always ascii in wide-character programs, because we may use UTF-8
First, UTF-8 has nothing to do with wide characters. Second, the format string is always ASCII because the function is only called by other glibc functions that know what they are doing.
perror("привет мир");
This is not the format string, this is one of the arguments that corresponds to one of the %s in the actual format string.
Can we use dup() on wide-oriented stderr
You cannot use dup on a FILE*, it operates on POSIX
file descriptors that don't have orientation.
This is the patch that I would propose to glibc developers:
Why? What isn't working?

C: writing the following code into functions

Dear respected programmers. Please could you help me (again) on how to put the following code into functions for my program.
I have read on-line and understand how functions work but when I do it myself it all goes pear shaped/wrong(I am such a noob).
Please could you help with how to for example to write the code below into functions.(like opening the input file).
My initial code looks like:
main (int argc, char **argv)
{
int bytes_read, bytes_written;
struct stat inode;
int input_fd, output_fd;
char buffer[64];
int eof = 0;
int i;
/* Check the command line arguments */
if (argc != 3)
{
printf("syntax is: %s \n", <fromfile> <tofile>\n", argv[0]);
exit (1);
}
/* Check the input file exists and is a file */
if ((stat(argv[1], &inode) == -1) || (!S_ISREG(inode.st_mode)))
{
printf("%s is not a file\n", argv[1]);
exit(2);
}
/* Check that the output file doesnt exist */
if (stat(argv[2], &inode) != -1)
{
printf("Warning: The file %s already exists. Not going to overwrite\n", argv[2]);
exit(2);
}
/* Open the input file for reading */
input_fd = open(argv[1], O_RDONLY, 0);
if (input_fd == -1)
{
printf("%s cannot be opened\n", argv[1]);
exit(3);
}
output_fd = open(argv[2], O_CREAT | O_WRONLY | O_EXCL , S_IRUSR|S_IWUSR);
if (output_fd == -1)
{
printf("%s cannot be opened\n", argv[2]);
exit(3);
}
/* Begin processing the input file here */
while (!eof)
{
bytes_read = read(input_fd, buffer, sizeof(buffer));
if (bytes_read == -1)
{
printf("%s cannot be read\n", argv[1]);
exit(4);
}
if (bytes_read > > 0)
{
bytes_written = write(output_fd, buffer, bytes_read);
if (bytes_written == -1)
{
printf("There was an error writing to the file %s\n",argv[2]);
exit(4);
}
if (bytes_written != bytes_read)
{
printf("Devistating failure! Bytes have either magically appeared and been written or dissapeard and been skipped. Data is inconsistant!\n");
exit(101);
}
}
else
{
eof = 1;
}
}
close(input_fd);
close(output_fd);
}
My attempt at opening an output file:
void outputFile(int argc, char **argv)
{
/* Check that the output file doesnt exist */
if (stat(argv[argc-1], &inode) != -1)
{
printf("Warning: The file %s already exists. Not going to overwrite\n", argv[argc-1]);
return -1;
}
/*Opening ouput files*/
file_desc_out = open(argv[i],O_CREAT | O_WRONLY | O_EXCL , S_IRUSR|S_IWUSR);
if(file_desc_out == -1)
{
printf("Error: %s cannot be opened. \n",argv[i]); //insted of argv[2] have pointer i.
return -1;
}
}
Any help on how I would now reference to this in my program is appreciated thank you.
I tried:
ouputfile (but I cant figure out what goes here and why either).
Maybe the most useful function for you is:
#include <stdio.h>
#include <stdarg.h>
extern void error_exit(int rc, const char *format, ...); /* In a header */
void error_exit(int rc, const char *format, ...)
{
va_list args;
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
exit(rc);
}
You can then write:
if (stat(argv[2], &inode) != -1)
error_exit(2, "Warning: The file %s exists. Not going to overwrite\n",
argv[2]);
Which has the merit of brevity.
You write functions to do sub-tasks. Deciding where to break up your code into functions is tricky - as much art as science. Your code is not so big that it is completely awful to leave it as it is - one function (though the error handling can be simplified as above).
If you want to practice writing functions, consider splitting it up:
open_input_file()
open_output_file()
checked_read()
checked_write()
checked_close()
These functions would allow your main code to be written as:
int main(int argc, char **argv)
{
int bytes_read;
int input_fd, output_fd;
char buffer[64];
if (argc != 3)
error_exit(1, "Usage: %s <fromfile> <tofile>\n", argv[0]);
input_fd = open_input_file(argv[1]);
output_fd = open_output_file(argv[2]);
while ((bytes_read = checked_read(input_fd, buffer, sizeof(buffer)) > 0)
check_write(output_fd, buffer, bytes_read);
checked_close(input_fd);
checked_close(output_fd);
return 0;
}
Because you've tucked the error handling out of sight, it is now much easier to see the structure of the program. If you don't have enough functions yet, you can bury the loop into a function void file_copy(int fd_in, int fd_out). That removes more clutter from main() and leaves you with very simple code.
Given an initial attempt at a function to open the output file:
void outputFile(int argc, char **argv)
{
/* Check that the output file doesnt exist */
if (stat(argv[argc-1], &inode) != -1)
{
printf("Warning: The file %s already exists. Not going to overwrite\n", argv[argc-1]);
return -1;
}
/*Opening ouput files*/
file_desc_out = open(argv[i],O_CREAT | O_WRONLY | O_EXCL , S_IRUSR|S_IWUSR);
if(file_desc_out == -1)
{
printf("Error: %s cannot be opened. \n",argv[i]); //insted of argv[2] have pointer i.
return -1;
}
}
Critique:
You have to define the variables used by the function in the function (you will want to avoid global variables as much as possible, and there is no call for any global variable in this code).
You have to define the return type. You are opening a file - how is the file descriptor going to be returned to the calling code? So, the return type should be int.
You pass only the information needed to the function - a simple form of 'information hiding'. In this case, you only need to pass the name of the file; the information about file modes and the like is implicit in the name of the function.
In general, you have to decide how to handle errors. Unless you have directives otherwise from your homework setter, it is reasonable to exit on error with an appropriate message. If you return an error indicator, then the calling code has to test for it, and decide what to do about the error.
Errors and warnings should be written to stderr, not to stdout. The main program output (if any) goes to stdout.
Your code is confused about whether argv[i] or argv[argc-1] is the name of the output file. In a sense, this criticism is irrelevant once you pass just the filename to the function. However, consistency is a major virtue in programming, and using the same expression to identify the same thing is usually a good idea.
Consistency of layout is also important. Don't use both if( and if ( in your programs; use the canonical if ( notation as used by the language's founding fathers, K&R.
Similarly, be consistent with no spaces before commas, a space after a comma, and be consistent with spaces around operators such as '|'. Consistency makes your code easier to read, and you'll be reading your code a lot more often than you write it (at least, once you've finished your course, you will do more reading than writing).
You cannot have return -1; inside a function that returns no value.
When you a splitting up code into functions, you need to copy/move the paragraphs of code that you are extracting, leaving behind a call to the new function. You also need to copy the relevant local variables from the calling function into the new function - possibly eliminating the variables in the calling function if they are no longer used there. You do compile with most warnings enabled, don't you? You want to know about unused variables etc.
When you create the new function, one of the most important parts is working out what the correct signature of the function is. Does it return a value? If so, which value, and what is its type? If not, how does it handle errors? In this case, you probably want the function to bail out (terminate the program) if it runs into an error. In bigger systems, you might need to consistently return an error indicator (0 implies success, negative implies failure, different negatives indicating different errors). When you work with function that return an error indicator, it is almost always crucial that you check the error indicators in the calling code. For big programs, big swathes of the code can be all about error handling. Similarly, you need to work out which values are passed into the function.
I'm omitting advice about things such as 'be const correct' as overkill for your stage in learning to program in C.
you seem to actually understand how to make a function. making a function really isnt that hard. first, you need to kind of understand that a function has a type. in otherwords, argc has type int and argv has type char *, your function (currently) has type void. void means it has no value, which means when you return, you return nothing.
however, if you look at your code, you do return -1. it looks like you want to return an interger. so you should change the top from void outputfile(...) to int outputfile(...).
next, your function must return. it wont compile if there is a circumstance where it won't return (besides infinite loops). so at the very bottom, if no errors happen, it will reach the end. since you're no longer using "void" as the return type, you must return something before the end of the function. so i suggest putting a return 1; to show that everything went great
There's several things.
The function return type isn't what you want. You either want to return a file descriptor or an error code. IIRC, the file descriptor is a nonnegative int, so you can use a return type of int rather than void. You also need to return something on either path, either -1 or file_desc_out.
You probably don't want to pass in the command-line arguments as a whole, but rather something like argv[argc - 1]. In that case, the argument should be something like char * filename rather than the argc/argv it has now. (Note that the argv[i] you've got in the last printf is almost certainly wrong.)
This means it would be called something like
int file_desc_out = outputFile(argv[argc - 1]);
You need to have all variables declared in the function, specifically inode and file_desc_out.
Finally, put an extra level of indentation on the code inside the { and } of the function itself.

C readline function

In an assignment for college it was suggested to use the C readline function in an exercise. I have searched for its reference but still haven't found it. Does it really exist? In which header? Can you please post the link to the reference?
Readline exists in two places, libreadline and libedit (also called libeditline). Both have an identical interface. The difference is libreadline is licensed under the GPL, libedit is 3 clause BSD. Licensing is really not a concern for an assignment, at least I don't think it is. Either license allows you to use the code freely. If you link against readline, be sure to make the whole program GPL 2 or later which will satisfy whatever version of the GPL governs the system readline. It may be GPL2+ or GPL3+, depending on the age of the system. I'm not advocating either license, that's up to you.
Note, take care to install either / or and adjust linking as needed (-lreadline or -ledit or -leditline). Both are libraries and not a part of the standard C library.
Edit (afterthought):
If releasing a program to the wild, its a nice gesture to allow the user to configure it with their readline of choice. For instance: --with-readline or --with-libedit, etc. This allows a binary package that conforms to their choice of license, at least as far as readline is concerned.
Links: Readline and Edit/Editline.
I don't think it's a standard function.
I simple implementation would be like this:
char *Readline(char *in) {
char *cptr;
if (cptr = fgets(in, MAX_LINE, stdin)) {
/* kill preceding whitespace but leave \n so we're guaranteed to have something
while(*cptr == ' ' || *cptr == '\t') {
cptr++;
}
return cptr;
} else {
return 0;
}
}
It uses fgets() to read up to MAX_LINE - 1 characters into the buffer 'in'. It strips preceding whitespace and returns a pointer to the first non-whitespace character.
Not sure if you tried reading this from the GNU C Library: ssize_t getline (char **lineptr, size_t *n, FILE *stream).
This function reads a line from a file and can even re-allocate more space if needed.
An example of this is found in the manpage of getline. Below is a copy of it.
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
int
main(int argc, char *argv[])
{
FILE *stream;
char *line = NULL;
size_t len = 0;
ssize_t nread;
if (argc != 2) {
fprintf(stderr, "Usage: %s <file>\n", argv[0]);
exit(EXIT_FAILURE);
}
stream = fopen(argv[1], "r");
if (stream == NULL) {
perror("fopen");
exit(EXIT_FAILURE);
}
while ((nread = getline(&line, &len, stream)) != -1) {
printf("Retrieved line of length %zu:\n", nread);
fwrite(line, nread, 1, stdout);
}
free(line);
fclose(stream);
exit(EXIT_SUCCESS);
}
If you need a "readLine()" function, like the readLine() in Java-BufferedReader, you can also freely use my function «char* get_line(FILE *filePointer)» in "line.h", which I wrote just for this purpose: https://github.com/pheek/line.h/blob/master/line.h
It doesn't exist.
They were mistaken and referred to gets() from stdio.h.
Also this is a very unsafe function due to no maximum size to read parameter, making it immediate security whole (lookup buffer overrun attack). You may use fgets() instead.

Resources