Creating file that uses UTF-8 encoding - c

I am trying to create a file and encode its content in the UTF-8 format using C. I have tried several things and looked around but I can not seem to find a solution to the problem.
This is the code I am currently trying (u8_wc_tout8 function taken from here):
int u8_wc_toutf8(char *dest, u_int32_t ch)
{
if (ch < 0x80) {
dest[0] = (char)ch;
return 1;
}
if (ch < 0x800) {
dest[0] = (ch>>6) | 0xC0;
dest[1] = (ch & 0x3F) | 0x80;
return 2;
}
if (ch < 0x10000) {
dest[0] = (ch>>12) | 0xE0;
dest[1] = ((ch>>6) & 0x3F) | 0x80;
dest[2] = (ch & 0x3F) | 0x80;
return 3;
}
if (ch < 0x110000) {
dest[0] = (ch>>18) | 0xF0;
dest[1] = ((ch>>12) & 0x3F) | 0x80;
dest[2] = ((ch>>6) & 0x3F) | 0x80;
dest[3] = (ch & 0x3F) | 0x80;
return 4;
}
return 0;
}
int main ()
{
printf(setlocale(LC_ALL, "")); //Prints C.UTF-8
FILE * fout;
fout=fopen("out.txt","w");
u_int32_t c = 'Å';
char convertedChar[6];
int cNum = u8_wc_toutf8(convertedChar, c);
printf(convertedChar); //Prints ?
fprintf(fout, convertedChar);
fclose(fout);
printf("\nFile has been created...\n");
return 0;
}
When I run this from the command prompt in Windows it prints ? and when I open the file created I get some weird characters. If I check the encoding in Firefox on the file it says:
"windows-1252"
Are there any better ways to check the encoding of the file?
Any tips to point me in the right direction would be really nice, it feels like this should not be that hard to do.

You should allocate the memory for convertedChar and set c to 197, which is the unicode char id of the angstrom character (Å). Then you can now encode this character in utf-8 or anything else if you want:
int main ()
{
FILE * fout;
fout=fopen("out.txt","wb");
u_int32_t c = 197; // Or 0xC5
char convertedChar[4];
int cNum = u8_wc_toutf8(convertedChar, c);
fwrite(convertedChar, sizeof(char), cNum, fout);
fclose(fout);
printf("\nFile has been created...\n");
return 0;
}
And in the case, for example, that your locale uses UTF-8 encoding, then you can use this to print the character on your console:
wchar_t wc;
mbtowc(&wc, convertedChar, sizeof(wchar_t));
putwc(wc, stdout);

Related

Why does multibyte character to char32_t conversion use UTF-8 as the multibyte encoding instead of the locale-specific one?

I have been trying to convert Chinese character input from Windows command prompt in Big5 to UTF-8 by first converting the received input to char32_t in UTF-32 encoding, then convert it to UTF-8. I've been calling the function mbtoc32 from <uchar.h> to do this job, however it kept sending "Encoding error".
The following is the conditions I have encountered:
Converting the sequence (Big5) to a wchar_t representation by mbstowcs is successful.
mbrtoc32 takes the multibyte sequence as UTF-8, though the locale is not. (Set to "", returns "Chinese (Traditional)_Hong Kong SAR.950" on my machine)
Below is the code I've been writing to try to debug my problem, however no success. It tries to convert the "香" Chinese character (U+9999) into the multibyte representation, then tries to convert the Big5 encoding of "香" (0xADBB) into wchar_t and char32_t. However, converting from multibyte (Big5) to char32_t returns encoding error. (In contradictory, inputting the UTF-8 sequence of "香" to mbrtoc32 does return 0x9999 successfully)
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char32_t chi_c = 0x9999;
printf("Character U+9999 is 香\n");
char *mbc = (char *)calloc(32, sizeof(char));
size_t mb_len;
mb_len = c32rtomb(mbc, chi_c, &state);
int i;
printf("The multibyte representation of U+9999 is:\n");
// 0xE9A699, UTF-8
for (i = 0; i < mb_len; i++){
printf("%#2x\t", *(mbc + i));
}
char *src_mbs = (char *)calloc(32, sizeof(char));
// "香" in Big5 encoding
*(src_mbs + 0) = 0xad;
*(src_mbs + 1) = 0xbb;
wchar_t res_wc;
mbtowc(&res_wc, src_mbs, 32); // Success, res_wc == 0x9999
char32_t res_c32;
mb_len = mbrtoc32(&res_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", res_wc);
}
return 0;
}
I've also read documentation from cppreference.com, it said,
In any case, the multibyte character encoding used by this function is specified by the currently active C locale.
I expect mbrtoc32 to behave like mbtowc, which is converting the character from the locale-specific encoding to UTF-32 (in this case Big5 to UTF-32).
Is there any solutions to use mbrtoc32 to convert the multibyte character into char32_t without having the "Encoding error"?
P.S.: I'm using Mingw-64 on Windows 10, compiled with gcc.
I've found the problem. The Mingw-w64 I'm using is expecting all multi-byte string passed to mbrtoc32 and c32rtomb to be in UTF-8 encoding.
Code for mbrtoc32:
size_t mbrtoc32 (char32_t *__restrict__ pc32,
const char *__restrict__ s,
size_t n,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (*s == 0)
{
*pc32 = 0;
return 0;
}
/* ASCII character - high bit unset */
if ((*s & 0x80) == 0)
{
*pc32 = *s;
return 1;
}
/* Multibyte chars */
if ((*s & 0xE0) == 0xC0) /* 110xxxxx needs 2 bytes */
{
if (n < 2)
return (size_t)-2;
*pc32 = ((s[0] & 31) << 6) | (s[1] & 63);
return 2;
}
else if ((*s & 0xf0) == 0xE0) /* 1110xxxx needs 3 bytes */
{
if (n < 3)
return (size_t)-2;
*pc32 = ((s[0] & 15) << 12) | ((s[1] & 63) << 6) | (s[2] & 63);
return 3;
}
else if ((*s & 0xF8) == 0xF0) /* 11110xxx needs 4 bytes */
{
if (n < 4)
return (size_t)-2;
*pc32 = ((s[0] & 7) << 18) | ((s[1] & 63) << 12) | ((s[2] & 63) << 6) | (s[4] & 63);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
and for c32rtomb:
size_t c32rtomb (char *__restrict__ s,
char32_t c32,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (c32 <= 0x7F) /* 7 bits needs 1 byte */
{
*s = (char)c32 & 0x7F;
return 1;
}
else if (c32 <= 0x7FF) /* 11 bits needs 2 bytes */
{
s[1] = 0x80 | (char)(c32 & 0x3F);
s[0] = 0xC0 | (char)(c32 >> 6);
return 2;
}
else if (c32 <= 0xFFFF) /* 16 bits needs 3 bytes */
{
s[2] = 0x80 | (char)(c32 & 0x3F);
s[1] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[0] = 0xE0 | (char)(c32 >> 12);
return 3;
}
else if (c32 <= 0x1FFFFF) /* 21 bits needs 4 bytes */
{
s[3] = 0x80 | (char)(c32 & 0x3F);
s[2] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[1] = 0x80 | (char)((c32 >> 12) & 0x3F);
s[0] = 0xF0 | (char)(c32 >> 18);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
both of these functions expected the given multi-byte string to be in UTF-8 without considering the locale settings. Functions mbrtoc32 and c32rtomb on glibc simply calls their wide character counterpart to convert the characters. As
wide character convertions are working properly on Mingw-w64, I used mbrtowc and wcrtomb to replace mbrtoc32 and c32rtomb respectively like the way on glibc:
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char *src_mbs = "\xad\xbb"; // "香" in Big5 encoding
char32_t src_c32 = 0x9999; // "香" code point
unsigned char *r_mbc = (char *)calloc(32, sizeof(char));
if (r_mbc == NULL){
perror("Failed to allocate memory");
return errno;
}
size_t mb_len = wcrtomb(r_mbc, (wchar_t)src_c32, &state); // Returns 0xADBB, Big5 of "香", OK
printf("Character U+9999 is %s, ( ", r_mbc);
for (int i = 0; i < mb_len; i++){
printf("%#hhx ", *(r_mbc + i));
}
printf(")\n");
// mb_len = c32rtomb(r_mbc, src_c32, &state); // Returns 0xE9A699, UTF-8 representation of "香", expected Big5
// printf("\nThe multibyte representation of U+9999 is:\n");
// for (i = 0; i < mb_len; i++){
// printf("%#hhX\t", *(r_mbc + i));
// }
char32_t r_c32 = 0;
// mb_len = mbrtoc32(&r_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
mb_len = mbrtowc((wchar_t *)&r_c32, src_mbs, (size_t)3, &state); // Returns 0x9999, OK
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", r_c32);
}
return 0;
}

How to dump utf8 from octal ISO-8859-1 in C

I'm trying to output the right character in utf8 given the following octal sequence \303\255 and \346\234\254, but I don't get the correct output.
#include <stdio.h>
#include <stdlib.h>
int encode(char *buf, unsigned char ch){
if(ch < 0x80) {
*buf++ = (char)ch;
return 1;
}
if(ch < 0x800) {
*buf++ = (ch >> 6) | 0xC0;
*buf++ = (ch & 0x3F) | 0x80;
return 2;
}
if(ch < 0x10000) {
*buf++ = (ch >> 12) | 0xE0;
*buf++ = ((ch >> 6) & 0x3F) | 0x80;
*buf++ = (ch & 0x3F) | 0x80;
return 3;
}
if(ch < 0x110000) {
*buf++ = (ch >> 18) | 0xF0;
*buf++ = ((ch >> 12) & 0x3F) | 0x80;
*buf++ = ((ch >> 6) & 0x3F) | 0x80;
*buf++ = (ch & 0x3F) | 0x80;
return 4;
}
return 0;
}
void output (char *str) {
char *buffer = calloc(8, sizeof(char));
int n = 0;
while(*str) {
n = encode(buffer + n, *str++);
}
printf("%s\n", buffer);
free (buffer);
}
int main() {
char *str1 = "\303\255";
char *str2 = "\346\234\254";
output(str1);
output(str2);
return 0;
}
Outputs: í & æ¬ instead of í & 本
The problem is that the code sequence you use is already UTF-8
/* Both of these are already UTF-8 chars. */
char *str1 = "\303\255";
char *str2 = "\346\234\254";
So your encode function is trying to encode an already encoded UTF-8 which should not work.
When i print these sequences in my UTF-8 enabled terminal i see what you are expecting to see:
$ printf "%s\n" $'\303\255'
í
$ printf "%s\n" $'\346\234\254'
本
So maybe you need to rethink what you are trying to accomplish and post a new question if you have new problems there.
It's a pity, but you cannot compare a char value (being it signed or unsigned) with values over 0x100. You are missing something if you try to convert one byte (iso-8859-1) values to utf-8. The iso-8859-1 characters have the same code values as their UTF counterparts, so the conversion is fairly straightforward, as will be shown below.
First of all, all the iso-8859-1 characters are the same as their UTF counterparts, so the first transformation is the identity: We convert each value in iso-8859-1 to the same value in UTF (look that when I say UTF y mean the UTF code for that character, without using any codification, as when I say UTF-8, which is actually an encoding of UTF in eight bit bytes)
UTF values in the range 0x80...0xff must be encoded with two bytes, the first byte using bits 7 and 6 with pattern 110000xx being xx the two most significant bits of the input code, and followed by a second byte with 10xxxxxx being xxxxxx the six least significant bits (bits 5 to 0) of the input code. For UTF values in the range 0x00...0x7f you encode them with just the same byte as the UTF code.
The following function does preciselly this:
size_t iso2utf( unsigned char *buf, unsigned char iso )
{
size_t res = 0;
if ( iso & 0x80 ) {
*buf++ = 0xc0 | (iso >> 6); /* the 110000xx part */
*buf++ = 0x80 | (iso & 0x3f); /* ... and the 10xxxxxx part. */
res += 2;
} else {
*buf++ = iso; /* a 0xxxxxxx character, untouched. */
res++;
}
*buf = '\0';
return res;
} /* iso2utf */
If you want a complete UTF into UTF-8 encoder, you can try this (I used a different approach, as there can be as much as seven bytes per UTF char ---actually not so much, as currently only 24 or 25 bit codes are used):
#include <string.h>
#include <stdlib.h>
typedef unsigned int UTF; /* you can use wchar_t if you prefer */
typedef unsigned char BYTE;
/* I will assume that UTF string is also zero terminated */
size_t utf_utf8 (BYTE *out, UTF *in)
{
size_t res = 0;
for (;*in;in++) {
UTF c = *in; /* copy the UTF value */
/* we are constructing the string backwards, so finally
* we have it properly ordered. */
size_t n = 0; /* number of characters for this one */
BYTE aux[7], /* buffer to construct the string */
*p = aux + sizeof aux; /* point one cell past the end */
static UTF limits[] = { 0x80, 0x20, 0x10, 0x08, 0x4, 0x2, 0x01};
static UTF masks[] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
for (;c >= limits[n]; c >>= 6) {
*--p = 0x80 | (c & 0x3f); n++;
} /* for */
*--p = masks[n] | c; n++;
memcpy(out, p, n); out += n; res += n;
} /* for */
*out = '\0'; /* terminate string */
return res;
} /* utf_utf8 */
See that the seven bytes per UTF code is hardwired, as it is the fact of UTF codes being 32bit integer. I don't expect UTF codes to go further past the 32 bit limit, but in that case, both, the UTF typedef, and the sizes and contents of the tables aux, limits and masks might be changed accordingly. There's a maximum limit of 7 or 8 for the number of characters used for the utf-8 encoding also, and it's not specified in the standard in any form how to proceed if the UTF codespace should run out of codes any time, so better not to mesh too much with this.
Useless function parameter: unsigned char ch
/// In the following bad code, `if(ch < 0x10000)` is never true
int encode(char *buf, unsigned char ch){
if(ch < 0x80) {
...
return 1;
if(ch < 0x800) {
...
return 2;
if(ch < 0x10000) {
Sorry, GTG.
Note: Code incorrectly does not detect high and low surrogates.

How to iterate through unicode characters and print them on the screen with printf in C?

I want to iterate through all (at least the 16 bit) unicode characters and print them on the screen with C.
I know there are related questions on SO but they don't solve the problem with printf in C, but this is what I want to achieve, if it's possible after all. I think it should be possible maybe with a trick I'm not aware of.
Since I want to use printf, I thought about something like this:
for (int i = 0x0000; i <= 0xffff; i++) {
//then somehow increment the string
char str[] = "\u25A1\n";
printf("%s", str);
char str[] = "\u25A2\n";
printf("%s", str);
char str[] = "\u25A3\n";
printf("%s", str);
...
}
But it's a bit of a problem to increment the unicode code point, here \u25A1. I'm aware it's not possible per se because some characters like \u0000 are not printable and the compiler says no. But apart from that, how could I increment from hexadecimal 0000 to ffff and print the character with printf.
If the __STDC_ISO_10646__ macro is defined, wide characters correspond to Unicode codepoints. So, assuming a locale that can represent the characters you are interested in, you can just printf() wide characters via the %lc format conversion:
#include <stdio.h>
#include <locale.h>
#ifndef __STDC_ISO_10646__
#error "Oops, our wide chars are not Unicode codepoints, sorry!"
#endif
int main()
{
int i;
setlocale(LC_ALL, "");
for (i = 0; i < 0xffff; i++) {
printf("%x - %lc\n", i, i);
}
return 0;
}
In C99, you can use wide character to multibyte character conversion functions wctomb() or wcrtomb() to convert each code point to a local representation, using the current character set. (The code points are in the current character set, not Unicode.) Remember to use setlocale() to ensure conversion functions are aware of the user locale (most importantly, the current character set used). The conversion functions use the LC_CTYPE category, but you should still use setlocale(LC_ALL, ""); as for any other locale-aware program.
(Not all systems have the C.UTF-8 locale installed, so I do not recommend trying to override the locale to the standard C with UTF-8 using setlocale(LC_ALL, "C.UTF-8");. It works on some systems, but not all. AFAIK it does not work in Fedora-based Linux distributions, for example.)
Because you want to output all Unicode code points, I suggest a different approach: Use one of the Universal Character Set Transformation Formats, i.e. UTF-8, UTF-16 (UCS-2 was superseded by UTF-16 in 1996), or UTF-32 (also known as UCS-4). UTF-8 is the one most often used on the Web -- in particular, on this very web page you're looking at right now -- and is very easy to use.
For further reading on why you should prefer UTF-8 over "native wide strings", see utf8everywhere.org.
If you want truly portable code, you can use this header file, utf8.h, to convert UTF-8 to unicode code points (utf8_to_code()) and Unicode code points to UTF-8 (code_to_utf8()):
#ifndef UTF8_H
#define UTF8_H
#include <stdlib.h>
#include <errno.h>
#define UTF8_MAXLEN 6
static size_t utf8_to_code(const unsigned char *const buffer, unsigned int *const codeptr)
{
if (!buffer) {
errno = EINVAL;
return 0;
}
if (*buffer == 0U) {
errno = 0;
return 0;
}
if (*buffer < 128U) {
if (codeptr)
*codeptr = buffer[0];
return 1;
}
if (*buffer < 192U) {
errno = EILSEQ;
return 0;
}
if (*buffer < 224U) {
if (buffer[1] >= 128U && buffer[1] < 192U)
return ((buffer[0] - 192U) << 6U)
| (buffer[1] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 240U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U)
return ((buffer[0] - 224U) << 12U)
| ((buffer[1] - 128U) << 6U)
| (buffer[2] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 248U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U &&
buffer[3] >= 128U && buffer[3] < 192U)
return ((buffer[0] - 240U) << 18U)
| ((buffer[1] - 128U) << 12U)
| ((buffer[2] - 128U) << 6U)
| (buffer[3] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 252U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U &&
buffer[3] >= 128U && buffer[3] < 192U &&
buffer[4] >= 128U && buffer[4] < 192U)
return ((buffer[0] - 248U) << 24U)
| ((buffer[1] - 128U) << 18U)
| ((buffer[2] - 128U) << 12U)
| ((buffer[3] - 128U) << 6U)
| (buffer[4] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 254U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U &&
buffer[3] >= 128U && buffer[3] < 192U &&
buffer[4] >= 128U && buffer[4] < 192U &&
buffer[5] >= 128U && buffer[5] < 192U)
return ((buffer[0] - 252U) << 30U)
| ((buffer[1] - 128U) << 24U)
| ((buffer[2] - 128U) << 18U)
| ((buffer[3] - 128U) << 12U)
| ((buffer[4] - 128U) << 6U)
| (buffer[5] - 128U);
errno = EILSEQ;
return 0;
}
errno = EILSEQ;
return 0;
}
static size_t code_to_utf8(unsigned char *const buffer, const unsigned int code)
{
if (code < 128U) {
buffer[0] = code;
return 1;
}
if (code < 2048U) {
buffer[0] = 0xC0U | (code >> 6U);
buffer[1] = 0x80U | (code & 0x3FU);
return 2;
}
if (code < 65536) {
buffer[0] = 0xE0U | (code >> 12U);
buffer[1] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[2] = 0x80U | (code & 0x3FU);
return 3;
}
if (code < 2097152U) {
buffer[0] = 0xF0U | (code >> 18U);
buffer[1] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[3] = 0x80U | (code & 0x3FU);
return 4;
}
if (code < 67108864U) {
buffer[0] = 0xF8U | (code >> 24U);
buffer[1] = 0x80U | ((code >> 18U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[3] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[4] = 0x80U | (code & 0x3FU);
return 5;
}
if (code <= 2147483647U) {
buffer[0] = 0xFCU | (code >> 30U);
buffer[1] = 0x80U | ((code >> 24U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 18U) & 0x3FU);
buffer[3] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[4] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[5] = 0x80U | (code & 0x3FU);
return 6;
}
errno = EINVAL;
return 0;
}
#endif /* UTF8_H */
It is not fast, but it should be easy to understand, and supports all possible Unicode code points (U+0000 to U+10FFFF, inclusive), on all systems with at least 32-bit unsigned ints. On systems with 16-bit unsigned ints, your compiler may warn about unreachable code, and it'll only support the first 65536 code points (U+0000 to U+FFFF).
Using above utf8.h, you can easily write a C program that outputs a HTML page containing the Unicode characters you want (excluding control characters U+0000-U+001F and U+007F-U+00BF, inclusive, and invalid code points U+D800-U+DFFF, inclusive). For example, page.c:
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include "utf8.h"
int main(void)
{
unsigned char ch[UTF8_MAXLEN + 1];
unsigned int i;
const char *str;
size_t n, len;
/* HTML5 DOCTYPE */
printf("<!DOCTYPE html>\n");
printf("<html>\n");
/* Header part. */
printf(" <head>\n");
printf(" <title> Unicode character list </title>\n");
printf(" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n");
printf(" <style type=\"text/css\">\n");
/* with internal CSS stylesheet: */
printf(" html {\n");
printf(" font-family: \"DejaVu Mono\", \"Courier New\", \"Courier\", monospace;\n");
printf(" font-weight: normal;\n");
printf(" font-size: 100%%;\n");
printf(" text-decoration: none;\n");
printf(" background: #f7f7f7;\n");
printf(" color: #000000;\n");
printf(" padding: 0 0 0 0;\n");
printf(" border: 0 none;\n");
printf(" margin: 0 0 0 0\n");
printf(" }\n");
printf(" body {\n");
printf(" background: #ffffff;\n");
printf(" padding: 0.5em 1em 0.5em 1em;\n");
printf(" border: 1px solid #cccccc;\n");
printf(" margin: 0 auto auto auto;\n");
printf(" width: 12em;\n");
printf(" text-align: center;\n");
printf(" }\n");
printf(" p {\n");
printf(" padding: 0 0 0 0;\n");
printf(" border: 0 none;\n");
printf(" margin: 0 0 0 0;\n");
printf(" outline: 0 none;\n");
printf(" text-align: center;\n");
printf(" }\n");
printf(" p.odd {\n");
printf(" background: #efefef;\n");
printf(" }\n");
printf(" p.even {\n");
printf(" background: #f7f7f7;\n");
printf(" }\n");
printf(" span.code {\n");
printf(" width: 8em;\n");
printf(" text-align: right;\n");
printf(" }\n");
printf(" span.char {\n");
printf(" width: 4em;\n");
printf(" text-align: left;\n");
printf(" }\n");
printf(" </style>\n");
printf(" </head>\n");
/* Body part. */
printf(" <body>\n");
n = 0;
for (i = 0U; i <= 0xFFFFU; i++) {
/* Skip Unicode control characters. */
if ((i >= 0U && i <= 31U) ||
(i >= 127U && i <= 159U))
continue;
/* Skip invalid Unicode code points. */
if (i >= 0xD800U && i <= 0xDFFFU)
continue;
len = code_to_utf8(ch, i);
if (len > 0) {
ch[len] = '\0';
/* HTML does not like " & < > */
if (i == 32U)
str = " ";
else
if (i == 34U)
str = """;
else
if (i == 38U)
str = "&";
else
if (i == 60U)
str = "<";
else
if (i == 62U)
str = ">";
else
str = (const char *)ch;
if (n & 1) {
printf(" <p class=\"odd\" title=\"%u in decimal, &#%u; = %s\">", i, i, str);
printf("<span class=\"code\">U+%04X</span>", i);
printf(" <span class=\"char\">%s</span>", str);
printf("</p>\n");
} else {
printf(" <p class=\"even\" title=\"%u in decimal, &#%u; = %s\">", i, i, str);
printf("<span class=\"code\">U+%04X</span>", i);
printf(" <span class=\"char\">%s</span>", str);
printf("</p>\n");
}
n++;
}
}
printf(" </body>\n");
printf("</html>\n");
return EXIT_SUCCESS;
}
Redirect the output to a file, and you can open the file in whatever browser you prefer. If your browser is sane, and does not treat local files any different to those it obtains from a web server, then you should see the correct output.
(If you see multiple characters per code point after U+00A0, your browser has decided that because the file is local, it is using a different character set that it explicitly states it uses. Switch to a sane browser if that happens, or override the character set selection.)
If you want, you can just print the codes out as UTF-8 text, say using text.c:
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include "utf8.h"
int main(void)
{
unsigned char ch[UTF8_MAXLEN + 1];
unsigned int i;
size_t len;
for (i = 0U; i <= 0xFFFFU; i++) {
/* Skip Unicode control characters. */
if ((i >= 0U && i <= 31U) ||
(i >= 127U && i <= 159U))
continue;
/* Skip invalid Unicode code points. */
if (i >= 0xD800U && i <= 0xDFFFU)
continue;
len = code_to_utf8(ch, i);
if (len > 0) {
ch[len] = '\0';
printf("U+%04X %s \n", i, ch);
}
}
return EXIT_SUCCESS;
}
but then you must either be sure your terminal or terminal emulator supports UTF-8 and uses an UTF-8 locale, or you redirect the output to a text file and open that file in an editor which either assumes the file uses UTF-8 or lets you explicitly select the UTF-8 character set.
Note that there is a space before and after each character. Because some of the code points are combining characters, they may not show up at all unless they can be combined with another character, and most (all?) combine with space just fine.
If you use Windows, then you must conform to Microsoft stupidity, and add a special "byte order mark" -- printf("\xEF\xBB\xBF"); -- to the beginning of the output, so that its utilities like Notepad recognizes the file as UTF-8. It's a Windows-only wart, and treat it as such.
Questions?
The function to convert a 16-bit Unicode codepoint to a multibyte character sequence is c16rtomb; there is also c32rtomb if you want to handle 32-bit codepoints:
#include <uchar.h>
mbstate_t ps;
char buf[MB_CUR_MAX];
size_t bytes = c16rtomb(buf, i, &ps);
if (bytes != (size_t) -1) {
printf("%.*s\n", bytes, buf);
}
If c16rtomb is not available you will need to use platform-specific facilities.
I would go for something like this (using raw UTF-8 encoding):
char unicode[3] = { 0x00, 0x00, 0x00 };
for(size_t i=0; i<0xffff; i++)
{
printf("%s\n", unicode);
uint16_t * code = &unicode[0];
*code = *code +1;
}
Define a string on 3 bytes, the last one is the NULL terminating byte allowing a display via printf
Consider the two first bytes as your 16-bit unicode and increment it on each loop
Of course it can be optimized as:
Many characters won't be displayable
The cast char* -> uint16_t is not very elegant (triggers a warning)
As there is 2 bytes for UTF-8 encoding it will actually browse 11 bits of codepoints. To get the 16 bits you might want to actually use uint32_t and define a 5 bytes char* buffer
[EDIT] As stated in the comment, this loop will actually generates a lot of invalid UTF-8 sequences.
Indeed, going from U+007F to U+0080 is a +1 for code points but in UTF-8 you jump from 0x7F to 0xC280: you need to exclude some ranges in the loop.

Simulating basic machine language in C.

So for my assignment I have to simulate basic machine language with C. The machine has 16 registers (reg[]), a program counter(pc) and memory (mem[]) all of which are unsigned chars. The instructions are read in from a file and are in the format:
B404 (1RXY = Load register R with the value at memory address XY). All numbers are in hex. C000 is the halt command.
Now my problem is that when I print out the instructions (when stored in a,b,c,d and cd) b and d have leading zeros. How do I get rid of them so the instruction is printed as above? (Line 48).
Also it appears some of my if statements aren't called as if I put a printf() in them and they are on the file it is never printed. (Lines 48 to 78).
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int main () {
FILE *file;
file = fopen("a3.txt","r");
unsigned int temp = 0;
unsigned char pc, mem[256], reg[16],a,b,c,d,cd;
unsigned int count;
count = 0;
pc = 0;
for (int i=0; i < 16;++i) {
reg[i] = 0;
}
if (file == NULL) {
printf("\n Error opening the file");
exit(0);
}
for (int i=0; !feof(file); i += 2) {
fscanf(file, "%X", &temp);
mem[i] = ((temp & 0xFF00) >> 8);
mem[i + 1] = (temp & 0xFF);
if (feof(file)) {
break; // exit from while
}
++count;
}
int fclose(FILE *file);
/*while (mem[pc] != 0xC0) {*/
for (unsigned int i=0; i < count-1;++i) {
if (mem[i] == 0xC0) {
break; // exit from for
exit(1);
}
cd = mem[pc + 1];
a = (mem[pc] & 0xF0);
b = (mem[pc] & 0xF);
c = (mem[pc + 1] & 0xF0);
d = (mem[pc + 1] & 0xF);
printf("%02X ",pc);
printf("%X%X%X%X - [",a,b,c,d);
printf("%02X %02X %02X %02X ",reg[0],reg[1],reg[2],reg[3]);
printf("%02X %02X %02X %02X ",reg[4],reg[5],reg[6],reg[7]);
printf("%02X %02X %02X %02X ",reg[8],reg[9],reg[10],reg[11]);
printf("%02X %02X %02X %02X]\n",reg[12],reg[13],reg[14],reg[15]);
if (a == 0x1) {
reg[b] = reg[cd];
}
if (a == 0x2) {
reg[b] = cd;
//printf("2 reporting in");
}
if (a == 0x3) {
reg[cd] = reg[b];
}
if (a == 0x4) {
reg[d] = reg[c];
}
if (a == 0x05) {
reg[d] = (reg[b] + reg[c]);
//printf("5 reporting in");
}
if (a == 0x7) {
reg[d] = (reg[b] | reg[c]);
}
if (a == 0x8) {
reg[d] = (reg[b] & reg[c]);
}
if (a == 0x9) {
reg[d] = (reg[b] ^ reg[c]);
}
if (a == 0xA0) {
reg[b] = (reg[b] >> reg[d]) | (reg[b] << (32 - reg[d]));
}
pc += 2;
if (a == 0xB0) {
//printf("B reporting in");
if (reg[b] == reg[0]) {
pc = cd;
}
}
}
return 0;
}
Thanks!
To get the high bits of an unsigned char, use the following function:
unsigned char hi(unsigned char bits) {
return (bits >> 4) & 0x0F;
}
And when you are doing the instruction dispatch, it is better to use a switch statement instead of an if ... else if ... else cascade. Then you can have a simple default: fprintf(stderr, "unknown instruction: %u\n", a); exit(1); for error detection.
Also, 0xA0 is not the value 10, its rather 160, so you should either write 10 or 0x0A.
Update:
The code for extracting a, b, c and d should look like this:
a = (mem[pc + 0] >> 4) & 0x0F;
b = (mem[pc + 0] >> 0) & 0x0F;
c = (mem[pc + 1] >> 4) & 0x0F;
d = (mem[pc + 1] >> 0) & 0x0F;
This is simple to read and easy to check for correctness. One can clearly see the uniform handling, and that each value is restricted to the range [0x00;0x0F].

Mapping multibyte characters to their unicode point representation

How do you map a single UTF-8 character to its unicode point in C?
[For example, È would be mapped to 00c8].
If your platform's wchar_t stores unicode (if it's a 32-bit type, it probably does) and you have an UTF-8 locale, you can call mbrtowc (from C90.1).
mbstate_t state = {0};
wchar_t wch;
char s[] = "\303\210";
size_t n;
memset(&state, 0, sizeof(state));
setlocale(LC_CTYPE, "en_US.utf8"); /*error checking omitted*/
n = mbrtowc(&wch, s, strlen(s), &state);
if (n <= (size_t)-2) printf("%lx\n", (unsigned long)wch);
For more flexibility, you can call the iconv interface.
char s[] = "\303\210";
iconv_t cd = iconv_open("UTF-8", "UCS-4");
if (cd != -1) {
char *inp = s;
size_t ins = strlen(s);
uint32_t c;
uint32_t *outp = &c;
size_t outs = 0;
if (iconv(cd, &inp, &ins, &outp, &outs) + 1 >= 2) printf("%lx\n", c);
iconv_close(cd);
}
Some things to look at :
libiconv
ConvertUTF.h
MultiByteToWideChar (under windows)
An reasonably fast implementation of an UTF-8 to UCS-2 converter. Surrogate and characters outside the BMP left as exercice.
The function returns the number of bytes consumed from the input s string. A negative value represents an error.
The resulting unicode character is put at the address p points to.
int utf8_to_wchar(wchar_t *p, const char *s)
{
const unsigned char *us = (const unsigned char *)s;
p[0] = 0;
if(!*us)
return 0;
else
if(us[0] < 0x80) {
p[0] = us[0];
return 1;
}
else
if(((us[0] & 0xE0) == 0xC0) && (us[1] & 0xC0) == 0x80) {
p[0] = ((us[0] & 0x1F) << 6) | (us[1] & 0x3F);
#ifdef DETECT_OVERLONG
if(p[0] < 0x80) return -2;
#endif
return 2;
}
else
if(((us[0] & 0xF0) == 0xE0) && (us[1] & 0xC0) == 0x80 && (us[2] & 0xC0) == 0x80) {
p[0] = ((us[0] & 0x0F) << 12) | ((us[1] & 0x3F) << 6) | (us[2] & 0x3F);
#ifdef DETECT_OVERLONG
if(p[0] < 0x800) return -2;
#endif
return 3;
}
return -1;
}

Resources