convert ucs(Universal Character Set) character to unicode? - c

I am reading code from somebody, I happended to see code as follow.
According to comment, this function is to Convert a UCS character to an UTF-8 string. But what is ucs character, what is the rule to convert ucs to unicode, where can I find the documents?
/*
* Convert a UCS character to an UTF-8 string
*
* Returns the string length of the result
*/
size_t
tUcs2Utf8(ULONG ulChar, char *szResult, size_t tMaxResultLen)
{
if (szResult == NULL || tMaxResultLen == 0) {
return 0;
}
if (ulChar < 0x80 && tMaxResultLen >= 2) {
szResult[0] = (char)ulChar;
szResult[1] = '\0';
return 1;
}
if (ulChar < 0x800 && tMaxResultLen >= 3) {
szResult[0] = (char)(0xc0 | ulChar >> 6);
szResult[1] = (char)(0x80 | (ulChar & 0x3f));
szResult[2] = '\0';
return 2;
}
if (ulChar < 0x10000 && tMaxResultLen >= 4) {
szResult[0] = (char)(0xe0 | ulChar >> 12);
szResult[1] = (char)(0x80 | (ulChar >> 6 & 0x3f));
szResult[2] = (char)(0x80 | (ulChar & 0x3f));
szResult[3] = '\0';
return 3;
}
if (ulChar < 0x200000 && tMaxResultLen >= 5) {
szResult[0] = (char)(0xf0 | ulChar >> 18);
szResult[1] = (char)(0x80 | (ulChar >> 12 & 0x3f));
szResult[2] = (char)(0x80 | (ulChar >> 6 & 0x3f));
szResult[3] = (char)(0x80 | (ulChar & 0x3f));
szResult[4] = '\0';
return 4;
}
szResult[0] = '\0';
return 0;
} /* end of tUcs2Utf8 */

Universal Character Set is an ISO standard. It defines the same characters as Unicode, so there's no need for character conversion. Every version of UCS is essentially a small subset of a certain version of the Unicode standard. New characters are first added to Unicode and every so often, UCS is synchronized with Unicode. Appendix C of the Unicode standard contains a table that shows the relationship between different versions.
Also note that the code you posted uses a non-standard upper limit of 0x200000. This should be changed to 0x110000.

Related

Why does multibyte character to char32_t conversion use UTF-8 as the multibyte encoding instead of the locale-specific one?

I have been trying to convert Chinese character input from Windows command prompt in Big5 to UTF-8 by first converting the received input to char32_t in UTF-32 encoding, then convert it to UTF-8. I've been calling the function mbtoc32 from <uchar.h> to do this job, however it kept sending "Encoding error".
The following is the conditions I have encountered:
Converting the sequence (Big5) to a wchar_t representation by mbstowcs is successful.
mbrtoc32 takes the multibyte sequence as UTF-8, though the locale is not. (Set to "", returns "Chinese (Traditional)_Hong Kong SAR.950" on my machine)
Below is the code I've been writing to try to debug my problem, however no success. It tries to convert the "香" Chinese character (U+9999) into the multibyte representation, then tries to convert the Big5 encoding of "香" (0xADBB) into wchar_t and char32_t. However, converting from multibyte (Big5) to char32_t returns encoding error. (In contradictory, inputting the UTF-8 sequence of "香" to mbrtoc32 does return 0x9999 successfully)
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char32_t chi_c = 0x9999;
printf("Character U+9999 is 香\n");
char *mbc = (char *)calloc(32, sizeof(char));
size_t mb_len;
mb_len = c32rtomb(mbc, chi_c, &state);
int i;
printf("The multibyte representation of U+9999 is:\n");
// 0xE9A699, UTF-8
for (i = 0; i < mb_len; i++){
printf("%#2x\t", *(mbc + i));
}
char *src_mbs = (char *)calloc(32, sizeof(char));
// "香" in Big5 encoding
*(src_mbs + 0) = 0xad;
*(src_mbs + 1) = 0xbb;
wchar_t res_wc;
mbtowc(&res_wc, src_mbs, 32); // Success, res_wc == 0x9999
char32_t res_c32;
mb_len = mbrtoc32(&res_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", res_wc);
}
return 0;
}
I've also read documentation from cppreference.com, it said,
In any case, the multibyte character encoding used by this function is specified by the currently active C locale.
I expect mbrtoc32 to behave like mbtowc, which is converting the character from the locale-specific encoding to UTF-32 (in this case Big5 to UTF-32).
Is there any solutions to use mbrtoc32 to convert the multibyte character into char32_t without having the "Encoding error"?
P.S.: I'm using Mingw-64 on Windows 10, compiled with gcc.
I've found the problem. The Mingw-w64 I'm using is expecting all multi-byte string passed to mbrtoc32 and c32rtomb to be in UTF-8 encoding.
Code for mbrtoc32:
size_t mbrtoc32 (char32_t *__restrict__ pc32,
const char *__restrict__ s,
size_t n,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (*s == 0)
{
*pc32 = 0;
return 0;
}
/* ASCII character - high bit unset */
if ((*s & 0x80) == 0)
{
*pc32 = *s;
return 1;
}
/* Multibyte chars */
if ((*s & 0xE0) == 0xC0) /* 110xxxxx needs 2 bytes */
{
if (n < 2)
return (size_t)-2;
*pc32 = ((s[0] & 31) << 6) | (s[1] & 63);
return 2;
}
else if ((*s & 0xf0) == 0xE0) /* 1110xxxx needs 3 bytes */
{
if (n < 3)
return (size_t)-2;
*pc32 = ((s[0] & 15) << 12) | ((s[1] & 63) << 6) | (s[2] & 63);
return 3;
}
else if ((*s & 0xF8) == 0xF0) /* 11110xxx needs 4 bytes */
{
if (n < 4)
return (size_t)-2;
*pc32 = ((s[0] & 7) << 18) | ((s[1] & 63) << 12) | ((s[2] & 63) << 6) | (s[4] & 63);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
and for c32rtomb:
size_t c32rtomb (char *__restrict__ s,
char32_t c32,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (c32 <= 0x7F) /* 7 bits needs 1 byte */
{
*s = (char)c32 & 0x7F;
return 1;
}
else if (c32 <= 0x7FF) /* 11 bits needs 2 bytes */
{
s[1] = 0x80 | (char)(c32 & 0x3F);
s[0] = 0xC0 | (char)(c32 >> 6);
return 2;
}
else if (c32 <= 0xFFFF) /* 16 bits needs 3 bytes */
{
s[2] = 0x80 | (char)(c32 & 0x3F);
s[1] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[0] = 0xE0 | (char)(c32 >> 12);
return 3;
}
else if (c32 <= 0x1FFFFF) /* 21 bits needs 4 bytes */
{
s[3] = 0x80 | (char)(c32 & 0x3F);
s[2] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[1] = 0x80 | (char)((c32 >> 12) & 0x3F);
s[0] = 0xF0 | (char)(c32 >> 18);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
both of these functions expected the given multi-byte string to be in UTF-8 without considering the locale settings. Functions mbrtoc32 and c32rtomb on glibc simply calls their wide character counterpart to convert the characters. As
wide character convertions are working properly on Mingw-w64, I used mbrtowc and wcrtomb to replace mbrtoc32 and c32rtomb respectively like the way on glibc:
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char *src_mbs = "\xad\xbb"; // "香" in Big5 encoding
char32_t src_c32 = 0x9999; // "香" code point
unsigned char *r_mbc = (char *)calloc(32, sizeof(char));
if (r_mbc == NULL){
perror("Failed to allocate memory");
return errno;
}
size_t mb_len = wcrtomb(r_mbc, (wchar_t)src_c32, &state); // Returns 0xADBB, Big5 of "香", OK
printf("Character U+9999 is %s, ( ", r_mbc);
for (int i = 0; i < mb_len; i++){
printf("%#hhx ", *(r_mbc + i));
}
printf(")\n");
// mb_len = c32rtomb(r_mbc, src_c32, &state); // Returns 0xE9A699, UTF-8 representation of "香", expected Big5
// printf("\nThe multibyte representation of U+9999 is:\n");
// for (i = 0; i < mb_len; i++){
// printf("%#hhX\t", *(r_mbc + i));
// }
char32_t r_c32 = 0;
// mb_len = mbrtoc32(&r_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
mb_len = mbrtowc((wchar_t *)&r_c32, src_mbs, (size_t)3, &state); // Returns 0x9999, OK
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", r_c32);
}
return 0;
}

How to print Unicode codepoints as characters in C?

I have an array of uint32_t elements that each store a codepoint for a non-latin Unicode character. How do I print them on the console or store them in a file as UTF-8 encoded characters? I understand that they may fail to render properly on a console, but they should display fine if I open them in a compatible editor.
I have tried using wprintf(L"%lc", UINT32_T_VARIABLE), and fwprintf(FILE_STREAM, L"%lc", UINT32_T_VARIABLE) but to no avail.
You must first select the proper locale with:
#include <locale.h>
setlocale(LC_ALL, "C.UTF-8");
or
setlocale(LC_ALL, "en_US.UTF-8");
And then use printf or fprintf with the %lc format:
printf("%lc", UINT32_T_VARIABLE);
This will work only for Unicode code points small enough to fit in a wchar_t. For a more complete and portable solution, you may nee to implement the Unicode to UTF-8 conversion yourself, which is not very difficult.
Best to use existing code when available.
Rolling ones own Unicode code-point to UTF8 is simply, yet easy to mess up. The answer took 2 edits to fix. #Jonathan Leffler #chqrlie, so rigorous testing is recommended for any self-coded solution. Follows is lightly tested code to convert a code-point to an array.
Note that the result is not a string.
// Populate utf8 with 0-4 bytes
// Return length used in utf8[]
// 0 implies bad codepoint
unsigned Unicode_CodepointToUTF8(uint8_t *utf8, uint32_t codepoint) {
if (codepoint <= 0x7F) {
utf8[0] = codepoint;
return 1;
}
if (codepoint <= 0x7FF) {
utf8[0] = 0xC0 | (codepoint >> 6);
utf8[1] = 0x80 | (codepoint & 0x3F);
return 2;
}
if (codepoint <= 0xFFFF) {
// detect surrogates
if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return 0;
utf8[0] = 0xE0 | (codepoint >> 12);
utf8[1] = 0x80 | ((codepoint >> 6) & 0x3F);
utf8[2] = 0x80 | (codepoint & 0x3F);
return 3;
}
if (codepoint <= 0x10FFFF) {
utf8[0] = 0xF0 | (codepoint >> 18);
utf8[1] = 0x80 | ((codepoint >> 12) & 0x3F);
utf8[2] = 0x80 | ((codepoint >> 6) & 0x3F);
utf8[3] = 0x80 | (codepoint & 0x3F);
return 4;
}
return 0;
}
// Sample usage
uint32_t cp = foo();
uint8_t utf8[4];
unsigned len = Unicode_CodepointToUTF8(utf8, cp);
if (len == 0) Handle_BadCodePoint();
size_t y = fwrite(utf8, 1, len, stream_opened_in_binary_mode);

Creating file that uses UTF-8 encoding

I am trying to create a file and encode its content in the UTF-8 format using C. I have tried several things and looked around but I can not seem to find a solution to the problem.
This is the code I am currently trying (u8_wc_tout8 function taken from here):
int u8_wc_toutf8(char *dest, u_int32_t ch)
{
if (ch < 0x80) {
dest[0] = (char)ch;
return 1;
}
if (ch < 0x800) {
dest[0] = (ch>>6) | 0xC0;
dest[1] = (ch & 0x3F) | 0x80;
return 2;
}
if (ch < 0x10000) {
dest[0] = (ch>>12) | 0xE0;
dest[1] = ((ch>>6) & 0x3F) | 0x80;
dest[2] = (ch & 0x3F) | 0x80;
return 3;
}
if (ch < 0x110000) {
dest[0] = (ch>>18) | 0xF0;
dest[1] = ((ch>>12) & 0x3F) | 0x80;
dest[2] = ((ch>>6) & 0x3F) | 0x80;
dest[3] = (ch & 0x3F) | 0x80;
return 4;
}
return 0;
}
int main ()
{
printf(setlocale(LC_ALL, "")); //Prints C.UTF-8
FILE * fout;
fout=fopen("out.txt","w");
u_int32_t c = 'Å';
char convertedChar[6];
int cNum = u8_wc_toutf8(convertedChar, c);
printf(convertedChar); //Prints ?
fprintf(fout, convertedChar);
fclose(fout);
printf("\nFile has been created...\n");
return 0;
}
When I run this from the command prompt in Windows it prints ? and when I open the file created I get some weird characters. If I check the encoding in Firefox on the file it says:
"windows-1252"
Are there any better ways to check the encoding of the file?
Any tips to point me in the right direction would be really nice, it feels like this should not be that hard to do.
You should allocate the memory for convertedChar and set c to 197, which is the unicode char id of the angstrom character (Å). Then you can now encode this character in utf-8 or anything else if you want:
int main ()
{
FILE * fout;
fout=fopen("out.txt","wb");
u_int32_t c = 197; // Or 0xC5
char convertedChar[4];
int cNum = u8_wc_toutf8(convertedChar, c);
fwrite(convertedChar, sizeof(char), cNum, fout);
fclose(fout);
printf("\nFile has been created...\n");
return 0;
}
And in the case, for example, that your locale uses UTF-8 encoding, then you can use this to print the character on your console:
wchar_t wc;
mbtowc(&wc, convertedChar, sizeof(wchar_t));
putwc(wc, stdout);

How to iterate through unicode characters and print them on the screen with printf in C?

I want to iterate through all (at least the 16 bit) unicode characters and print them on the screen with C.
I know there are related questions on SO but they don't solve the problem with printf in C, but this is what I want to achieve, if it's possible after all. I think it should be possible maybe with a trick I'm not aware of.
Since I want to use printf, I thought about something like this:
for (int i = 0x0000; i <= 0xffff; i++) {
//then somehow increment the string
char str[] = "\u25A1\n";
printf("%s", str);
char str[] = "\u25A2\n";
printf("%s", str);
char str[] = "\u25A3\n";
printf("%s", str);
...
}
But it's a bit of a problem to increment the unicode code point, here \u25A1. I'm aware it's not possible per se because some characters like \u0000 are not printable and the compiler says no. But apart from that, how could I increment from hexadecimal 0000 to ffff and print the character with printf.
If the __STDC_ISO_10646__ macro is defined, wide characters correspond to Unicode codepoints. So, assuming a locale that can represent the characters you are interested in, you can just printf() wide characters via the %lc format conversion:
#include <stdio.h>
#include <locale.h>
#ifndef __STDC_ISO_10646__
#error "Oops, our wide chars are not Unicode codepoints, sorry!"
#endif
int main()
{
int i;
setlocale(LC_ALL, "");
for (i = 0; i < 0xffff; i++) {
printf("%x - %lc\n", i, i);
}
return 0;
}
In C99, you can use wide character to multibyte character conversion functions wctomb() or wcrtomb() to convert each code point to a local representation, using the current character set. (The code points are in the current character set, not Unicode.) Remember to use setlocale() to ensure conversion functions are aware of the user locale (most importantly, the current character set used). The conversion functions use the LC_CTYPE category, but you should still use setlocale(LC_ALL, ""); as for any other locale-aware program.
(Not all systems have the C.UTF-8 locale installed, so I do not recommend trying to override the locale to the standard C with UTF-8 using setlocale(LC_ALL, "C.UTF-8");. It works on some systems, but not all. AFAIK it does not work in Fedora-based Linux distributions, for example.)
Because you want to output all Unicode code points, I suggest a different approach: Use one of the Universal Character Set Transformation Formats, i.e. UTF-8, UTF-16 (UCS-2 was superseded by UTF-16 in 1996), or UTF-32 (also known as UCS-4). UTF-8 is the one most often used on the Web -- in particular, on this very web page you're looking at right now -- and is very easy to use.
For further reading on why you should prefer UTF-8 over "native wide strings", see utf8everywhere.org.
If you want truly portable code, you can use this header file, utf8.h, to convert UTF-8 to unicode code points (utf8_to_code()) and Unicode code points to UTF-8 (code_to_utf8()):
#ifndef UTF8_H
#define UTF8_H
#include <stdlib.h>
#include <errno.h>
#define UTF8_MAXLEN 6
static size_t utf8_to_code(const unsigned char *const buffer, unsigned int *const codeptr)
{
if (!buffer) {
errno = EINVAL;
return 0;
}
if (*buffer == 0U) {
errno = 0;
return 0;
}
if (*buffer < 128U) {
if (codeptr)
*codeptr = buffer[0];
return 1;
}
if (*buffer < 192U) {
errno = EILSEQ;
return 0;
}
if (*buffer < 224U) {
if (buffer[1] >= 128U && buffer[1] < 192U)
return ((buffer[0] - 192U) << 6U)
| (buffer[1] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 240U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U)
return ((buffer[0] - 224U) << 12U)
| ((buffer[1] - 128U) << 6U)
| (buffer[2] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 248U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U &&
buffer[3] >= 128U && buffer[3] < 192U)
return ((buffer[0] - 240U) << 18U)
| ((buffer[1] - 128U) << 12U)
| ((buffer[2] - 128U) << 6U)
| (buffer[3] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 252U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U &&
buffer[3] >= 128U && buffer[3] < 192U &&
buffer[4] >= 128U && buffer[4] < 192U)
return ((buffer[0] - 248U) << 24U)
| ((buffer[1] - 128U) << 18U)
| ((buffer[2] - 128U) << 12U)
| ((buffer[3] - 128U) << 6U)
| (buffer[4] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 254U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U &&
buffer[3] >= 128U && buffer[3] < 192U &&
buffer[4] >= 128U && buffer[4] < 192U &&
buffer[5] >= 128U && buffer[5] < 192U)
return ((buffer[0] - 252U) << 30U)
| ((buffer[1] - 128U) << 24U)
| ((buffer[2] - 128U) << 18U)
| ((buffer[3] - 128U) << 12U)
| ((buffer[4] - 128U) << 6U)
| (buffer[5] - 128U);
errno = EILSEQ;
return 0;
}
errno = EILSEQ;
return 0;
}
static size_t code_to_utf8(unsigned char *const buffer, const unsigned int code)
{
if (code < 128U) {
buffer[0] = code;
return 1;
}
if (code < 2048U) {
buffer[0] = 0xC0U | (code >> 6U);
buffer[1] = 0x80U | (code & 0x3FU);
return 2;
}
if (code < 65536) {
buffer[0] = 0xE0U | (code >> 12U);
buffer[1] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[2] = 0x80U | (code & 0x3FU);
return 3;
}
if (code < 2097152U) {
buffer[0] = 0xF0U | (code >> 18U);
buffer[1] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[3] = 0x80U | (code & 0x3FU);
return 4;
}
if (code < 67108864U) {
buffer[0] = 0xF8U | (code >> 24U);
buffer[1] = 0x80U | ((code >> 18U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[3] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[4] = 0x80U | (code & 0x3FU);
return 5;
}
if (code <= 2147483647U) {
buffer[0] = 0xFCU | (code >> 30U);
buffer[1] = 0x80U | ((code >> 24U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 18U) & 0x3FU);
buffer[3] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[4] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[5] = 0x80U | (code & 0x3FU);
return 6;
}
errno = EINVAL;
return 0;
}
#endif /* UTF8_H */
It is not fast, but it should be easy to understand, and supports all possible Unicode code points (U+0000 to U+10FFFF, inclusive), on all systems with at least 32-bit unsigned ints. On systems with 16-bit unsigned ints, your compiler may warn about unreachable code, and it'll only support the first 65536 code points (U+0000 to U+FFFF).
Using above utf8.h, you can easily write a C program that outputs a HTML page containing the Unicode characters you want (excluding control characters U+0000-U+001F and U+007F-U+00BF, inclusive, and invalid code points U+D800-U+DFFF, inclusive). For example, page.c:
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include "utf8.h"
int main(void)
{
unsigned char ch[UTF8_MAXLEN + 1];
unsigned int i;
const char *str;
size_t n, len;
/* HTML5 DOCTYPE */
printf("<!DOCTYPE html>\n");
printf("<html>\n");
/* Header part. */
printf(" <head>\n");
printf(" <title> Unicode character list </title>\n");
printf(" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n");
printf(" <style type=\"text/css\">\n");
/* with internal CSS stylesheet: */
printf(" html {\n");
printf(" font-family: \"DejaVu Mono\", \"Courier New\", \"Courier\", monospace;\n");
printf(" font-weight: normal;\n");
printf(" font-size: 100%%;\n");
printf(" text-decoration: none;\n");
printf(" background: #f7f7f7;\n");
printf(" color: #000000;\n");
printf(" padding: 0 0 0 0;\n");
printf(" border: 0 none;\n");
printf(" margin: 0 0 0 0\n");
printf(" }\n");
printf(" body {\n");
printf(" background: #ffffff;\n");
printf(" padding: 0.5em 1em 0.5em 1em;\n");
printf(" border: 1px solid #cccccc;\n");
printf(" margin: 0 auto auto auto;\n");
printf(" width: 12em;\n");
printf(" text-align: center;\n");
printf(" }\n");
printf(" p {\n");
printf(" padding: 0 0 0 0;\n");
printf(" border: 0 none;\n");
printf(" margin: 0 0 0 0;\n");
printf(" outline: 0 none;\n");
printf(" text-align: center;\n");
printf(" }\n");
printf(" p.odd {\n");
printf(" background: #efefef;\n");
printf(" }\n");
printf(" p.even {\n");
printf(" background: #f7f7f7;\n");
printf(" }\n");
printf(" span.code {\n");
printf(" width: 8em;\n");
printf(" text-align: right;\n");
printf(" }\n");
printf(" span.char {\n");
printf(" width: 4em;\n");
printf(" text-align: left;\n");
printf(" }\n");
printf(" </style>\n");
printf(" </head>\n");
/* Body part. */
printf(" <body>\n");
n = 0;
for (i = 0U; i <= 0xFFFFU; i++) {
/* Skip Unicode control characters. */
if ((i >= 0U && i <= 31U) ||
(i >= 127U && i <= 159U))
continue;
/* Skip invalid Unicode code points. */
if (i >= 0xD800U && i <= 0xDFFFU)
continue;
len = code_to_utf8(ch, i);
if (len > 0) {
ch[len] = '\0';
/* HTML does not like " & < > */
if (i == 32U)
str = " ";
else
if (i == 34U)
str = """;
else
if (i == 38U)
str = "&";
else
if (i == 60U)
str = "<";
else
if (i == 62U)
str = ">";
else
str = (const char *)ch;
if (n & 1) {
printf(" <p class=\"odd\" title=\"%u in decimal, &#%u; = %s\">", i, i, str);
printf("<span class=\"code\">U+%04X</span>", i);
printf(" <span class=\"char\">%s</span>", str);
printf("</p>\n");
} else {
printf(" <p class=\"even\" title=\"%u in decimal, &#%u; = %s\">", i, i, str);
printf("<span class=\"code\">U+%04X</span>", i);
printf(" <span class=\"char\">%s</span>", str);
printf("</p>\n");
}
n++;
}
}
printf(" </body>\n");
printf("</html>\n");
return EXIT_SUCCESS;
}
Redirect the output to a file, and you can open the file in whatever browser you prefer. If your browser is sane, and does not treat local files any different to those it obtains from a web server, then you should see the correct output.
(If you see multiple characters per code point after U+00A0, your browser has decided that because the file is local, it is using a different character set that it explicitly states it uses. Switch to a sane browser if that happens, or override the character set selection.)
If you want, you can just print the codes out as UTF-8 text, say using text.c:
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include "utf8.h"
int main(void)
{
unsigned char ch[UTF8_MAXLEN + 1];
unsigned int i;
size_t len;
for (i = 0U; i <= 0xFFFFU; i++) {
/* Skip Unicode control characters. */
if ((i >= 0U && i <= 31U) ||
(i >= 127U && i <= 159U))
continue;
/* Skip invalid Unicode code points. */
if (i >= 0xD800U && i <= 0xDFFFU)
continue;
len = code_to_utf8(ch, i);
if (len > 0) {
ch[len] = '\0';
printf("U+%04X %s \n", i, ch);
}
}
return EXIT_SUCCESS;
}
but then you must either be sure your terminal or terminal emulator supports UTF-8 and uses an UTF-8 locale, or you redirect the output to a text file and open that file in an editor which either assumes the file uses UTF-8 or lets you explicitly select the UTF-8 character set.
Note that there is a space before and after each character. Because some of the code points are combining characters, they may not show up at all unless they can be combined with another character, and most (all?) combine with space just fine.
If you use Windows, then you must conform to Microsoft stupidity, and add a special "byte order mark" -- printf("\xEF\xBB\xBF"); -- to the beginning of the output, so that its utilities like Notepad recognizes the file as UTF-8. It's a Windows-only wart, and treat it as such.
Questions?
The function to convert a 16-bit Unicode codepoint to a multibyte character sequence is c16rtomb; there is also c32rtomb if you want to handle 32-bit codepoints:
#include <uchar.h>
mbstate_t ps;
char buf[MB_CUR_MAX];
size_t bytes = c16rtomb(buf, i, &ps);
if (bytes != (size_t) -1) {
printf("%.*s\n", bytes, buf);
}
If c16rtomb is not available you will need to use platform-specific facilities.
I would go for something like this (using raw UTF-8 encoding):
char unicode[3] = { 0x00, 0x00, 0x00 };
for(size_t i=0; i<0xffff; i++)
{
printf("%s\n", unicode);
uint16_t * code = &unicode[0];
*code = *code +1;
}
Define a string on 3 bytes, the last one is the NULL terminating byte allowing a display via printf
Consider the two first bytes as your 16-bit unicode and increment it on each loop
Of course it can be optimized as:
Many characters won't be displayable
The cast char* -> uint16_t is not very elegant (triggers a warning)
As there is 2 bytes for UTF-8 encoding it will actually browse 11 bits of codepoints. To get the 16 bits you might want to actually use uint32_t and define a 5 bytes char* buffer
[EDIT] As stated in the comment, this loop will actually generates a lot of invalid UTF-8 sequences.
Indeed, going from U+007F to U+0080 is a +1 for code points but in UTF-8 you jump from 0x7F to 0xC280: you need to exclude some ranges in the loop.

Mapping multibyte characters to their unicode point representation

How do you map a single UTF-8 character to its unicode point in C?
[For example, È would be mapped to 00c8].
If your platform's wchar_t stores unicode (if it's a 32-bit type, it probably does) and you have an UTF-8 locale, you can call mbrtowc (from C90.1).
mbstate_t state = {0};
wchar_t wch;
char s[] = "\303\210";
size_t n;
memset(&state, 0, sizeof(state));
setlocale(LC_CTYPE, "en_US.utf8"); /*error checking omitted*/
n = mbrtowc(&wch, s, strlen(s), &state);
if (n <= (size_t)-2) printf("%lx\n", (unsigned long)wch);
For more flexibility, you can call the iconv interface.
char s[] = "\303\210";
iconv_t cd = iconv_open("UTF-8", "UCS-4");
if (cd != -1) {
char *inp = s;
size_t ins = strlen(s);
uint32_t c;
uint32_t *outp = &c;
size_t outs = 0;
if (iconv(cd, &inp, &ins, &outp, &outs) + 1 >= 2) printf("%lx\n", c);
iconv_close(cd);
}
Some things to look at :
libiconv
ConvertUTF.h
MultiByteToWideChar (under windows)
An reasonably fast implementation of an UTF-8 to UCS-2 converter. Surrogate and characters outside the BMP left as exercice.
The function returns the number of bytes consumed from the input s string. A negative value represents an error.
The resulting unicode character is put at the address p points to.
int utf8_to_wchar(wchar_t *p, const char *s)
{
const unsigned char *us = (const unsigned char *)s;
p[0] = 0;
if(!*us)
return 0;
else
if(us[0] < 0x80) {
p[0] = us[0];
return 1;
}
else
if(((us[0] & 0xE0) == 0xC0) && (us[1] & 0xC0) == 0x80) {
p[0] = ((us[0] & 0x1F) << 6) | (us[1] & 0x3F);
#ifdef DETECT_OVERLONG
if(p[0] < 0x80) return -2;
#endif
return 2;
}
else
if(((us[0] & 0xF0) == 0xE0) && (us[1] & 0xC0) == 0x80 && (us[2] & 0xC0) == 0x80) {
p[0] = ((us[0] & 0x0F) << 12) | ((us[1] & 0x3F) << 6) | (us[2] & 0x3F);
#ifdef DETECT_OVERLONG
if(p[0] < 0x800) return -2;
#endif
return 3;
}
return -1;
}

Resources