Mapping multibyte characters to their unicode point representation

Mapping multibyte characters to their unicode point representation - c

How do you map a single UTF-8 character to its unicode point in C?
[For example, È would be mapped to 00c8].

If your platform's wchar_t stores unicode (if it's a 32-bit type, it probably does) and you have an UTF-8 locale, you can call mbrtowc (from C90.1).
mbstate_t state = {0};
wchar_t wch;
char s[] = "\303\210";
size_t n;
memset(&state, 0, sizeof(state));
setlocale(LC_CTYPE, "en_US.utf8"); /*error checking omitted*/
n = mbrtowc(&wch, s, strlen(s), &state);
if (n <= (size_t)-2) printf("%lx\n", (unsigned long)wch);
For more flexibility, you can call the iconv interface.
char s[] = "\303\210";
iconv_t cd = iconv_open("UTF-8", "UCS-4");
if (cd != -1) {
char *inp = s;
size_t ins = strlen(s);
uint32_t c;
uint32_t *outp = &c;
size_t outs = 0;
if (iconv(cd, &inp, &ins, &outp, &outs) + 1 >= 2) printf("%lx\n", c);
iconv_close(cd);
}

Some things to look at :
libiconv
ConvertUTF.h
MultiByteToWideChar (under windows)

An reasonably fast implementation of an UTF-8 to UCS-2 converter. Surrogate and characters outside the BMP left as exercice.
The function returns the number of bytes consumed from the input s string. A negative value represents an error.
The resulting unicode character is put at the address p points to.
int utf8_to_wchar(wchar_t *p, const char *s)
{
const unsigned char *us = (const unsigned char *)s;
p[0] = 0;
if(!*us)
return 0;
else
if(us[0] < 0x80) {
p[0] = us[0];
return 1;
}
else
if(((us[0] & 0xE0) == 0xC0) && (us[1] & 0xC0) == 0x80) {
p[0] = ((us[0] & 0x1F) << 6) | (us[1] & 0x3F);
#ifdef DETECT_OVERLONG
if(p[0] < 0x80) return -2;
#endif
return 2;
}
else
if(((us[0] & 0xF0) == 0xE0) && (us[1] & 0xC0) == 0x80 && (us[2] & 0xC0) == 0x80) {
p[0] = ((us[0] & 0x0F) << 12) | ((us[1] & 0x3F) << 6) | (us[2] & 0x3F);
#ifdef DETECT_OVERLONG
if(p[0] < 0x800) return -2;
#endif
return 3;
}
return -1;
}

Related

Why does multibyte character to char32_t conversion use UTF-8 as the multibyte encoding instead of the locale-specific one?

I have been trying to convert Chinese character input from Windows command prompt in Big5 to UTF-8 by first converting the received input to char32_t in UTF-32 encoding, then convert it to UTF-8. I've been calling the function mbtoc32 from <uchar.h> to do this job, however it kept sending "Encoding error".
The following is the conditions I have encountered:
Converting the sequence (Big5) to a wchar_t representation by mbstowcs is successful.
mbrtoc32 takes the multibyte sequence as UTF-8, though the locale is not. (Set to "", returns "Chinese (Traditional)_Hong Kong SAR.950" on my machine)
Below is the code I've been writing to try to debug my problem, however no success. It tries to convert the "香" Chinese character (U+9999) into the multibyte representation, then tries to convert the Big5 encoding of "香" (0xADBB) into wchar_t and char32_t. However, converting from multibyte (Big5) to char32_t returns encoding error. (In contradictory, inputting the UTF-8 sequence of "香" to mbrtoc32 does return 0x9999 successfully)
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char32_t chi_c = 0x9999;
printf("Character U+9999 is 香\n");
char *mbc = (char *)calloc(32, sizeof(char));
size_t mb_len;
mb_len = c32rtomb(mbc, chi_c, &state);
int i;
printf("The multibyte representation of U+9999 is:\n");
// 0xE9A699, UTF-8
for (i = 0; i < mb_len; i++){
printf("%#2x\t", *(mbc + i));
}
char *src_mbs = (char *)calloc(32, sizeof(char));
// "香" in Big5 encoding
*(src_mbs + 0) = 0xad;
*(src_mbs + 1) = 0xbb;
wchar_t res_wc;
mbtowc(&res_wc, src_mbs, 32); // Success, res_wc == 0x9999
char32_t res_c32;
mb_len = mbrtoc32(&res_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", res_wc);
}
return 0;
}
I've also read documentation from cppreference.com, it said,
In any case, the multibyte character encoding used by this function is specified by the currently active C locale.
I expect mbrtoc32 to behave like mbtowc, which is converting the character from the locale-specific encoding to UTF-32 (in this case Big5 to UTF-32).
Is there any solutions to use mbrtoc32 to convert the multibyte character into char32_t without having the "Encoding error"?
P.S.: I'm using Mingw-64 on Windows 10, compiled with gcc.

I've found the problem. The Mingw-w64 I'm using is expecting all multi-byte string passed to mbrtoc32 and c32rtomb to be in UTF-8 encoding.
Code for mbrtoc32:
size_t mbrtoc32 (char32_t *__restrict__ pc32,
const char *__restrict__ s,
size_t n,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (*s == 0)
{
*pc32 = 0;
return 0;
}
/* ASCII character - high bit unset */
if ((*s & 0x80) == 0)
{
*pc32 = *s;
return 1;
}
/* Multibyte chars */
if ((*s & 0xE0) == 0xC0) /* 110xxxxx needs 2 bytes */
{
if (n < 2)
return (size_t)-2;
*pc32 = ((s[0] & 31) << 6) | (s[1] & 63);
return 2;
}
else if ((*s & 0xf0) == 0xE0) /* 1110xxxx needs 3 bytes */
{
if (n < 3)
return (size_t)-2;
*pc32 = ((s[0] & 15) << 12) | ((s[1] & 63) << 6) | (s[2] & 63);
return 3;
}
else if ((*s & 0xF8) == 0xF0) /* 11110xxx needs 4 bytes */
{
if (n < 4)
return (size_t)-2;
*pc32 = ((s[0] & 7) << 18) | ((s[1] & 63) << 12) | ((s[2] & 63) << 6) | (s[4] & 63);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
and for c32rtomb:
size_t c32rtomb (char *__restrict__ s,
char32_t c32,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (c32 <= 0x7F) /* 7 bits needs 1 byte */
{
*s = (char)c32 & 0x7F;
return 1;
}
else if (c32 <= 0x7FF) /* 11 bits needs 2 bytes */
{
s[1] = 0x80 | (char)(c32 & 0x3F);
s[0] = 0xC0 | (char)(c32 >> 6);
return 2;
}
else if (c32 <= 0xFFFF) /* 16 bits needs 3 bytes */
{
s[2] = 0x80 | (char)(c32 & 0x3F);
s[1] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[0] = 0xE0 | (char)(c32 >> 12);
return 3;
}
else if (c32 <= 0x1FFFFF) /* 21 bits needs 4 bytes */
{
s[3] = 0x80 | (char)(c32 & 0x3F);
s[2] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[1] = 0x80 | (char)((c32 >> 12) & 0x3F);
s[0] = 0xF0 | (char)(c32 >> 18);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
both of these functions expected the given multi-byte string to be in UTF-8 without considering the locale settings. Functions mbrtoc32 and c32rtomb on glibc simply calls their wide character counterpart to convert the characters. As
wide character convertions are working properly on Mingw-w64, I used mbrtowc and wcrtomb to replace mbrtoc32 and c32rtomb respectively like the way on glibc:
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char *src_mbs = "\xad\xbb"; // "香" in Big5 encoding
char32_t src_c32 = 0x9999; // "香" code point
unsigned char *r_mbc = (char *)calloc(32, sizeof(char));
if (r_mbc == NULL){
perror("Failed to allocate memory");
return errno;
}
size_t mb_len = wcrtomb(r_mbc, (wchar_t)src_c32, &state); // Returns 0xADBB, Big5 of "香", OK
printf("Character U+9999 is %s, ( ", r_mbc);
for (int i = 0; i < mb_len; i++){
printf("%#hhx ", *(r_mbc + i));
}
printf(")\n");
// mb_len = c32rtomb(r_mbc, src_c32, &state); // Returns 0xE9A699, UTF-8 representation of "香", expected Big5
// printf("\nThe multibyte representation of U+9999 is:\n");
// for (i = 0; i < mb_len; i++){
// printf("%#hhX\t", *(r_mbc + i));
// }
char32_t r_c32 = 0;
// mb_len = mbrtoc32(&r_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
mb_len = mbrtowc((wchar_t *)&r_c32, src_mbs, (size_t)3, &state); // Returns 0x9999, OK
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", r_c32);
}
return 0;
}

Creating file that uses UTF-8 encoding

I am trying to create a file and encode its content in the UTF-8 format using C. I have tried several things and looked around but I can not seem to find a solution to the problem.
This is the code I am currently trying (u8_wc_tout8 function taken from here):
int u8_wc_toutf8(char *dest, u_int32_t ch)
{
if (ch < 0x80) {
dest[0] = (char)ch;
return 1;
}
if (ch < 0x800) {
dest[0] = (ch>>6) | 0xC0;
dest[1] = (ch & 0x3F) | 0x80;
return 2;
}
if (ch < 0x10000) {
dest[0] = (ch>>12) | 0xE0;
dest[1] = ((ch>>6) & 0x3F) | 0x80;
dest[2] = (ch & 0x3F) | 0x80;
return 3;
}
if (ch < 0x110000) {
dest[0] = (ch>>18) | 0xF0;
dest[1] = ((ch>>12) & 0x3F) | 0x80;
dest[2] = ((ch>>6) & 0x3F) | 0x80;
dest[3] = (ch & 0x3F) | 0x80;
return 4;
}
return 0;
}
int main ()
{
printf(setlocale(LC_ALL, "")); //Prints C.UTF-8
FILE * fout;
fout=fopen("out.txt","w");
u_int32_t c = 'Å';
char convertedChar[6];
int cNum = u8_wc_toutf8(convertedChar, c);
printf(convertedChar); //Prints ?
fprintf(fout, convertedChar);
fclose(fout);
printf("\nFile has been created...\n");
return 0;
}
When I run this from the command prompt in Windows it prints ? and when I open the file created I get some weird characters. If I check the encoding in Firefox on the file it says:
"windows-1252"
Are there any better ways to check the encoding of the file?
Any tips to point me in the right direction would be really nice, it feels like this should not be that hard to do.

You should allocate the memory for convertedChar and set c to 197, which is the unicode char id of the angstrom character (Å). Then you can now encode this character in utf-8 or anything else if you want:
int main ()
{
FILE * fout;
fout=fopen("out.txt","wb");
u_int32_t c = 197; // Or 0xC5
char convertedChar[4];
int cNum = u8_wc_toutf8(convertedChar, c);
fwrite(convertedChar, sizeof(char), cNum, fout);
fclose(fout);
printf("\nFile has been created...\n");
return 0;
}
And in the case, for example, that your locale uses UTF-8 encoding, then you can use this to print the character on your console:
wchar_t wc;
mbtowc(&wc, convertedChar, sizeof(wchar_t));
putwc(wc, stdout);

How to dump utf8 from octal ISO-8859-1 in C

I'm trying to output the right character in utf8 given the following octal sequence \303\255 and \346\234\254, but I don't get the correct output.
#include <stdio.h>
#include <stdlib.h>
int encode(char *buf, unsigned char ch){
if(ch < 0x80) {
*buf++ = (char)ch;
return 1;
}
if(ch < 0x800) {
*buf++ = (ch >> 6) | 0xC0;
*buf++ = (ch & 0x3F) | 0x80;
return 2;
}
if(ch < 0x10000) {
*buf++ = (ch >> 12) | 0xE0;
*buf++ = ((ch >> 6) & 0x3F) | 0x80;
*buf++ = (ch & 0x3F) | 0x80;
return 3;
}
if(ch < 0x110000) {
*buf++ = (ch >> 18) | 0xF0;
*buf++ = ((ch >> 12) & 0x3F) | 0x80;
*buf++ = ((ch >> 6) & 0x3F) | 0x80;
*buf++ = (ch & 0x3F) | 0x80;
return 4;
}
return 0;
}
void output (char *str) {
char *buffer = calloc(8, sizeof(char));
int n = 0;
while(*str) {
n = encode(buffer + n, *str++);
}
printf("%s\n", buffer);
free (buffer);
}
int main() {
char *str1 = "\303\255";
char *str2 = "\346\234\254";
output(str1);
output(str2);
return 0;
}
Outputs: Ã & æ¬ instead of í & 本

The problem is that the code sequence you use is already UTF-8
/* Both of these are already UTF-8 chars. */
char *str1 = "\303\255";
char *str2 = "\346\234\254";
So your encode function is trying to encode an already encoded UTF-8 which should not work.
When i print these sequences in my UTF-8 enabled terminal i see what you are expecting to see:
$ printf "%s\n" $'\303\255'
í
$ printf "%s\n" $'\346\234\254'
本
So maybe you need to rethink what you are trying to accomplish and post a new question if you have new problems there.

It's a pity, but you cannot compare a char value (being it signed or unsigned) with values over 0x100. You are missing something if you try to convert one byte (iso-8859-1) values to utf-8. The iso-8859-1 characters have the same code values as their UTF counterparts, so the conversion is fairly straightforward, as will be shown below.
First of all, all the iso-8859-1 characters are the same as their UTF counterparts, so the first transformation is the identity: We convert each value in iso-8859-1 to the same value in UTF (look that when I say UTF y mean the UTF code for that character, without using any codification, as when I say UTF-8, which is actually an encoding of UTF in eight bit bytes)
UTF values in the range 0x80...0xff must be encoded with two bytes, the first byte using bits 7 and 6 with pattern 110000xx being xx the two most significant bits of the input code, and followed by a second byte with 10xxxxxx being xxxxxx the six least significant bits (bits 5 to 0) of the input code. For UTF values in the range 0x00...0x7f you encode them with just the same byte as the UTF code.
The following function does preciselly this:
size_t iso2utf( unsigned char *buf, unsigned char iso )
{
size_t res = 0;
if ( iso & 0x80 ) {
*buf++ = 0xc0 | (iso >> 6); /* the 110000xx part */
*buf++ = 0x80 | (iso & 0x3f); /* ... and the 10xxxxxx part. */
res += 2;
} else {
*buf++ = iso; /* a 0xxxxxxx character, untouched. */
res++;
}
*buf = '\0';
return res;
} /* iso2utf */
If you want a complete UTF into UTF-8 encoder, you can try this (I used a different approach, as there can be as much as seven bytes per UTF char ---actually not so much, as currently only 24 or 25 bit codes are used):
#include <string.h>
#include <stdlib.h>
typedef unsigned int UTF; /* you can use wchar_t if you prefer */
typedef unsigned char BYTE;
/* I will assume that UTF string is also zero terminated */
size_t utf_utf8 (BYTE *out, UTF *in)
{
size_t res = 0;
for (;*in;in++) {
UTF c = *in; /* copy the UTF value */
/* we are constructing the string backwards, so finally
* we have it properly ordered. */
size_t n = 0; /* number of characters for this one */
BYTE aux[7], /* buffer to construct the string */
*p = aux + sizeof aux; /* point one cell past the end */
static UTF limits[] = { 0x80, 0x20, 0x10, 0x08, 0x4, 0x2, 0x01};
static UTF masks[] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
for (;c >= limits[n]; c >>= 6) {
*--p = 0x80 | (c & 0x3f); n++;
} /* for */
*--p = masks[n] | c; n++;
memcpy(out, p, n); out += n; res += n;
} /* for */
*out = '\0'; /* terminate string */
return res;
} /* utf_utf8 */
See that the seven bytes per UTF code is hardwired, as it is the fact of UTF codes being 32bit integer. I don't expect UTF codes to go further past the 32 bit limit, but in that case, both, the UTF typedef, and the sizes and contents of the tables aux, limits and masks might be changed accordingly. There's a maximum limit of 7 or 8 for the number of characters used for the utf-8 encoding also, and it's not specified in the standard in any form how to proceed if the UTF codespace should run out of codes any time, so better not to mesh too much with this.

Useless function parameter: unsigned char ch
/// In the following bad code, `if(ch < 0x10000)` is never true
int encode(char *buf, unsigned char ch){
if(ch < 0x80) {
...
return 1;
if(ch < 0x800) {
...
return 2;
if(ch < 0x10000) {
Sorry, GTG.
Note: Code incorrectly does not detect high and low surrogates.

How to iterate through unicode characters and print them on the screen with printf in C?

I want to iterate through all (at least the 16 bit) unicode characters and print them on the screen with C.
I know there are related questions on SO but they don't solve the problem with printf in C, but this is what I want to achieve, if it's possible after all. I think it should be possible maybe with a trick I'm not aware of.
Since I want to use printf, I thought about something like this:
for (int i = 0x0000; i <= 0xffff; i++) {
//then somehow increment the string
char str[] = "\u25A1\n";
printf("%s", str);
char str[] = "\u25A2\n";
printf("%s", str);
char str[] = "\u25A3\n";
printf("%s", str);
...
}
But it's a bit of a problem to increment the unicode code point, here \u25A1. I'm aware it's not possible per se because some characters like \u0000 are not printable and the compiler says no. But apart from that, how could I increment from hexadecimal 0000 to ffff and print the character with printf.

If the __STDC_ISO_10646__ macro is defined, wide characters correspond to Unicode codepoints. So, assuming a locale that can represent the characters you are interested in, you can just printf() wide characters via the %lc format conversion:
#include <stdio.h>
#include <locale.h>
#ifndef __STDC_ISO_10646__
#error "Oops, our wide chars are not Unicode codepoints, sorry!"
#endif
int main()
{
int i;
setlocale(LC_ALL, "");
for (i = 0; i < 0xffff; i++) {
printf("%x - %lc\n", i, i);
}
return 0;
}

In C99, you can use wide character to multibyte character conversion functions wctomb() or wcrtomb() to convert each code point to a local representation, using the current character set. (The code points are in the current character set, not Unicode.) Remember to use setlocale() to ensure conversion functions are aware of the user locale (most importantly, the current character set used). The conversion functions use the LC_CTYPE category, but you should still use setlocale(LC_ALL, ""); as for any other locale-aware program.
(Not all systems have the C.UTF-8 locale installed, so I do not recommend trying to override the locale to the standard C with UTF-8 using setlocale(LC_ALL, "C.UTF-8");. It works on some systems, but not all. AFAIK it does not work in Fedora-based Linux distributions, for example.)
Because you want to output all Unicode code points, I suggest a different approach: Use one of the Universal Character Set Transformation Formats, i.e. UTF-8, UTF-16 (UCS-2 was superseded by UTF-16 in 1996), or UTF-32 (also known as UCS-4). UTF-8 is the one most often used on the Web -- in particular, on this very web page you're looking at right now -- and is very easy to use.
For further reading on why you should prefer UTF-8 over "native wide strings", see utf8everywhere.org.
If you want truly portable code, you can use this header file, utf8.h, to convert UTF-8 to unicode code points (utf8_to_code()) and Unicode code points to UTF-8 (code_to_utf8()):
#ifndef UTF8_H
#define UTF8_H
#include <stdlib.h>
#include <errno.h>
#define UTF8_MAXLEN 6
static size_t utf8_to_code(const unsigned char *const buffer, unsigned int *const codeptr)
{
if (!buffer) {
errno = EINVAL;
return 0;
}
if (*buffer == 0U) {
errno = 0;
return 0;
}
if (*buffer < 128U) {
if (codeptr)
*codeptr = buffer[0];
return 1;
}
if (*buffer < 192U) {
errno = EILSEQ;
return 0;
}
if (*buffer < 224U) {
if (buffer[1] >= 128U && buffer[1] < 192U)
return ((buffer[0] - 192U) << 6U)
| (buffer[1] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 240U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U)
return ((buffer[0] - 224U) << 12U)
| ((buffer[1] - 128U) << 6U)
| (buffer[2] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 248U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U &&
buffer[3] >= 128U && buffer[3] < 192U)
return ((buffer[0] - 240U) << 18U)
| ((buffer[1] - 128U) << 12U)
| ((buffer[2] - 128U) << 6U)
| (buffer[3] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 252U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U &&
buffer[3] >= 128U && buffer[3] < 192U &&
buffer[4] >= 128U && buffer[4] < 192U)
return ((buffer[0] - 248U) << 24U)
| ((buffer[1] - 128U) << 18U)
| ((buffer[2] - 128U) << 12U)
| ((buffer[3] - 128U) << 6U)
| (buffer[4] - 128U);
errno = EILSEQ;
return 0;
}
if (*buffer < 254U) {
if (buffer[1] >= 128U && buffer[1] < 192U &&
buffer[2] >= 128U && buffer[2] < 192U &&
buffer[3] >= 128U && buffer[3] < 192U &&
buffer[4] >= 128U && buffer[4] < 192U &&
buffer[5] >= 128U && buffer[5] < 192U)
return ((buffer[0] - 252U) << 30U)
| ((buffer[1] - 128U) << 24U)
| ((buffer[2] - 128U) << 18U)
| ((buffer[3] - 128U) << 12U)
| ((buffer[4] - 128U) << 6U)
| (buffer[5] - 128U);
errno = EILSEQ;
return 0;
}
errno = EILSEQ;
return 0;
}
static size_t code_to_utf8(unsigned char *const buffer, const unsigned int code)
{
if (code < 128U) {
buffer[0] = code;
return 1;
}
if (code < 2048U) {
buffer[0] = 0xC0U | (code >> 6U);
buffer[1] = 0x80U | (code & 0x3FU);
return 2;
}
if (code < 65536) {
buffer[0] = 0xE0U | (code >> 12U);
buffer[1] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[2] = 0x80U | (code & 0x3FU);
return 3;
}
if (code < 2097152U) {
buffer[0] = 0xF0U | (code >> 18U);
buffer[1] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[3] = 0x80U | (code & 0x3FU);
return 4;
}
if (code < 67108864U) {
buffer[0] = 0xF8U | (code >> 24U);
buffer[1] = 0x80U | ((code >> 18U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[3] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[4] = 0x80U | (code & 0x3FU);
return 5;
}
if (code <= 2147483647U) {
buffer[0] = 0xFCU | (code >> 30U);
buffer[1] = 0x80U | ((code >> 24U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 18U) & 0x3FU);
buffer[3] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[4] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[5] = 0x80U | (code & 0x3FU);
return 6;
}
errno = EINVAL;
return 0;
}
#endif /* UTF8_H */
It is not fast, but it should be easy to understand, and supports all possible Unicode code points (U+0000 to U+10FFFF, inclusive), on all systems with at least 32-bit unsigned ints. On systems with 16-bit unsigned ints, your compiler may warn about unreachable code, and it'll only support the first 65536 code points (U+0000 to U+FFFF).
Using above utf8.h, you can easily write a C program that outputs a HTML page containing the Unicode characters you want (excluding control characters U+0000-U+001F and U+007F-U+00BF, inclusive, and invalid code points U+D800-U+DFFF, inclusive). For example, page.c:
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include "utf8.h"
int main(void)
{
unsigned char ch[UTF8_MAXLEN + 1];
unsigned int i;
const char *str;
size_t n, len;
/* HTML5 DOCTYPE */
printf("<!DOCTYPE html>\n");
printf("<html>\n");
/* Header part. */
printf(" <head>\n");
printf(" <title> Unicode character list </title>\n");
printf(" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n");
printf(" <style type=\"text/css\">\n");
/* with internal CSS stylesheet: */
printf(" html {\n");
printf(" font-family: \"DejaVu Mono\", \"Courier New\", \"Courier\", monospace;\n");
printf(" font-weight: normal;\n");
printf(" font-size: 100%%;\n");
printf(" text-decoration: none;\n");
printf(" background: #f7f7f7;\n");
printf(" color: #000000;\n");
printf(" padding: 0 0 0 0;\n");
printf(" border: 0 none;\n");
printf(" margin: 0 0 0 0\n");
printf(" }\n");
printf(" body {\n");
printf(" background: #ffffff;\n");
printf(" padding: 0.5em 1em 0.5em 1em;\n");
printf(" border: 1px solid #cccccc;\n");
printf(" margin: 0 auto auto auto;\n");
printf(" width: 12em;\n");
printf(" text-align: center;\n");
printf(" }\n");
printf(" p {\n");
printf(" padding: 0 0 0 0;\n");
printf(" border: 0 none;\n");
printf(" margin: 0 0 0 0;\n");
printf(" outline: 0 none;\n");
printf(" text-align: center;\n");
printf(" }\n");
printf(" p.odd {\n");
printf(" background: #efefef;\n");
printf(" }\n");
printf(" p.even {\n");
printf(" background: #f7f7f7;\n");
printf(" }\n");
printf(" span.code {\n");
printf(" width: 8em;\n");
printf(" text-align: right;\n");
printf(" }\n");
printf(" span.char {\n");
printf(" width: 4em;\n");
printf(" text-align: left;\n");
printf(" }\n");
printf(" </style>\n");
printf(" </head>\n");
/* Body part. */
printf(" <body>\n");
n = 0;
for (i = 0U; i <= 0xFFFFU; i++) {
/* Skip Unicode control characters. */
if ((i >= 0U && i <= 31U) ||
(i >= 127U && i <= 159U))
continue;
/* Skip invalid Unicode code points. */
if (i >= 0xD800U && i <= 0xDFFFU)
continue;
len = code_to_utf8(ch, i);
if (len > 0) {
ch[len] = '\0';
/* HTML does not like " & < > */
if (i == 32U)
str = " ";
else
if (i == 34U)
str = """;
else
if (i == 38U)
str = "&";
else
if (i == 60U)
str = "<";
else
if (i == 62U)
str = ">";
else
str = (const char *)ch;
if (n & 1) {
printf(" <p class=\"odd\" title=\"%u in decimal, &#%u; = %s\">", i, i, str);
printf("<span class=\"code\">U+%04X</span>", i);
printf(" <span class=\"char\">%s</span>", str);
printf("</p>\n");
} else {
printf(" <p class=\"even\" title=\"%u in decimal, &#%u; = %s\">", i, i, str);
printf("<span class=\"code\">U+%04X</span>", i);
printf(" <span class=\"char\">%s</span>", str);
printf("</p>\n");
}
n++;
}
}
printf(" </body>\n");
printf("</html>\n");
return EXIT_SUCCESS;
}
Redirect the output to a file, and you can open the file in whatever browser you prefer. If your browser is sane, and does not treat local files any different to those it obtains from a web server, then you should see the correct output.
(If you see multiple characters per code point after U+00A0, your browser has decided that because the file is local, it is using a different character set that it explicitly states it uses. Switch to a sane browser if that happens, or override the character set selection.)
If you want, you can just print the codes out as UTF-8 text, say using text.c:
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include "utf8.h"
int main(void)
{
unsigned char ch[UTF8_MAXLEN + 1];
unsigned int i;
size_t len;
for (i = 0U; i <= 0xFFFFU; i++) {
/* Skip Unicode control characters. */
if ((i >= 0U && i <= 31U) ||
(i >= 127U && i <= 159U))
continue;
/* Skip invalid Unicode code points. */
if (i >= 0xD800U && i <= 0xDFFFU)
continue;
len = code_to_utf8(ch, i);
if (len > 0) {
ch[len] = '\0';
printf("U+%04X %s \n", i, ch);
}
}
return EXIT_SUCCESS;
}
but then you must either be sure your terminal or terminal emulator supports UTF-8 and uses an UTF-8 locale, or you redirect the output to a text file and open that file in an editor which either assumes the file uses UTF-8 or lets you explicitly select the UTF-8 character set.
Note that there is a space before and after each character. Because some of the code points are combining characters, they may not show up at all unless they can be combined with another character, and most (all?) combine with space just fine.
If you use Windows, then you must conform to Microsoft stupidity, and add a special "byte order mark" -- printf("\xEF\xBB\xBF"); -- to the beginning of the output, so that its utilities like Notepad recognizes the file as UTF-8. It's a Windows-only wart, and treat it as such.
Questions?

The function to convert a 16-bit Unicode codepoint to a multibyte character sequence is c16rtomb; there is also c32rtomb if you want to handle 32-bit codepoints:
#include <uchar.h>
mbstate_t ps;
char buf[MB_CUR_MAX];
size_t bytes = c16rtomb(buf, i, &ps);
if (bytes != (size_t) -1) {
printf("%.*s\n", bytes, buf);
}
If c16rtomb is not available you will need to use platform-specific facilities.

I would go for something like this (using raw UTF-8 encoding):
char unicode[3] = { 0x00, 0x00, 0x00 };
for(size_t i=0; i<0xffff; i++)
{
printf("%s\n", unicode);
uint16_t * code = &unicode[0];
*code = *code +1;
}
Define a string on 3 bytes, the last one is the NULL terminating byte allowing a display via printf
Consider the two first bytes as your 16-bit unicode and increment it on each loop
Of course it can be optimized as:
Many characters won't be displayable
The cast char* -> uint16_t is not very elegant (triggers a warning)
As there is 2 bytes for UTF-8 encoding it will actually browse 11 bits of codepoints. To get the 16 bits you might want to actually use uint32_t and define a 5 bytes char* buffer
[EDIT] As stated in the comment, this loop will actually generates a lot of invalid UTF-8 sequences.
Indeed, going from U+007F to U+0080 is a +1 for code points but in UTF-8 you jump from 0x7F to 0xC280: you need to exclude some ranges in the loop.

Is there a way to convert from UTF8 to ISO-8859-1?

My software is getting some strings in UTF8 than I need to convert to ISO 8859 1. I know that UTF8 domain is bigger than ISO 8859. But the data in UTF8 has been previously upconverted from ISO, so I should not miss anything.
I would like to know if there is an easy / direct way to convert from UTF8 to iso-8859-1.

Here is a function you might find useful: utf8_to_latin9(). It converts to ISO-8859-15 (including EURO, which ISO-8859-1 does not have), but also works correctly for the UTF-8->ISO-8859-1 conversion part of a ISO-8859-1->UTF-8->ISO-8859-1 round-trip.
The function ignores invalid code points similar to //IGNORE flag for iconv, but does not recompose decomposed UTF-8 sequences; that is, it won't turn U+006E U+0303 into U+00F1. I don't bother recomposing because iconv does not either.
The function is very careful about the string access. It will never scan beyond the buffer. The output buffer must be one byte longer than length, because it always appends the end-of-string NUL byte. The function returns the number of characters (bytes) in output, not including the end-of-string NUL byte.
/* UTF-8 to ISO-8859-1/ISO-8859-15 mapper.
* Return 0..255 for valid ISO-8859-15 code points, 256 otherwise.
*/
static inline unsigned int to_latin9(const unsigned int code)
{
/* Code points 0 to U+00FF are the same in both. */
if (code < 256U)
return code;
switch (code) {
case 0x0152U: return 188U; /* U+0152 = 0xBC: OE ligature */
case 0x0153U: return 189U; /* U+0153 = 0xBD: oe ligature */
case 0x0160U: return 166U; /* U+0160 = 0xA6: S with caron */
case 0x0161U: return 168U; /* U+0161 = 0xA8: s with caron */
case 0x0178U: return 190U; /* U+0178 = 0xBE: Y with diaresis */
case 0x017DU: return 180U; /* U+017D = 0xB4: Z with caron */
case 0x017EU: return 184U; /* U+017E = 0xB8: z with caron */
case 0x20ACU: return 164U; /* U+20AC = 0xA4: Euro */
default: return 256U;
}
}
/* Convert an UTF-8 string to ISO-8859-15.
* All invalid sequences are ignored.
* Note: output == input is allowed,
* but input < output < input + length
* is not.
* Output has to have room for (length+1) chars, including the trailing NUL byte.
*/
size_t utf8_to_latin9(char *const output, const char *const input, const size_t length)
{
unsigned char *out = (unsigned char *)output;
const unsigned char *in = (const unsigned char *)input;
const unsigned char *const end = (const unsigned char *)input + length;
unsigned int c;
while (in < end)
if (*in < 128)
*(out++) = *(in++); /* Valid codepoint */
else
if (*in < 192)
in++; /* 10000000 .. 10111111 are invalid */
else
if (*in < 224) { /* 110xxxxx 10xxxxxx */
if (in + 1 >= end)
break;
if ((in[1] & 192U) == 128U) {
c = to_latin9( (((unsigned int)(in[0] & 0x1FU)) << 6U)
| ((unsigned int)(in[1] & 0x3FU)) );
if (c < 256)
*(out++) = c;
}
in += 2;
} else
if (*in < 240) { /* 1110xxxx 10xxxxxx 10xxxxxx */
if (in + 2 >= end)
break;
if ((in[1] & 192U) == 128U &&
(in[2] & 192U) == 128U) {
c = to_latin9( (((unsigned int)(in[0] & 0x0FU)) << 12U)
| (((unsigned int)(in[1] & 0x3FU)) << 6U)
| ((unsigned int)(in[2] & 0x3FU)) );
if (c < 256)
*(out++) = c;
}
in += 3;
} else
if (*in < 248) { /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
if (in + 3 >= end)
break;
if ((in[1] & 192U) == 128U &&
(in[2] & 192U) == 128U &&
(in[3] & 192U) == 128U) {
c = to_latin9( (((unsigned int)(in[0] & 0x07U)) << 18U)
| (((unsigned int)(in[1] & 0x3FU)) << 12U)
| (((unsigned int)(in[2] & 0x3FU)) << 6U)
| ((unsigned int)(in[3] & 0x3FU)) );
if (c < 256)
*(out++) = c;
}
in += 4;
} else
if (*in < 252) { /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
if (in + 4 >= end)
break;
if ((in[1] & 192U) == 128U &&
(in[2] & 192U) == 128U &&
(in[3] & 192U) == 128U &&
(in[4] & 192U) == 128U) {
c = to_latin9( (((unsigned int)(in[0] & 0x03U)) << 24U)
| (((unsigned int)(in[1] & 0x3FU)) << 18U)
| (((unsigned int)(in[2] & 0x3FU)) << 12U)
| (((unsigned int)(in[3] & 0x3FU)) << 6U)
| ((unsigned int)(in[4] & 0x3FU)) );
if (c < 256)
*(out++) = c;
}
in += 5;
} else
if (*in < 254) { /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
if (in + 5 >= end)
break;
if ((in[1] & 192U) == 128U &&
(in[2] & 192U) == 128U &&
(in[3] & 192U) == 128U &&
(in[4] & 192U) == 128U &&
(in[5] & 192U) == 128U) {
c = to_latin9( (((unsigned int)(in[0] & 0x01U)) << 30U)
| (((unsigned int)(in[1] & 0x3FU)) << 24U)
| (((unsigned int)(in[2] & 0x3FU)) << 18U)
| (((unsigned int)(in[3] & 0x3FU)) << 12U)
| (((unsigned int)(in[4] & 0x3FU)) << 6U)
| ((unsigned int)(in[5] & 0x3FU)) );
if (c < 256)
*(out++) = c;
}
in += 6;
} else
in++; /* 11111110 and 11111111 are invalid */
/* Terminate the output string. */
*out = '\0';
return (size_t)(out - (unsigned char *)output);
}
Note that you can add custom transliteration for specific code points in the to_latin9() function, but you are limited to one-character replacements.
As it is currently written, the function can do in-place conversion safely: input and output pointers can be the same. The output string will never be longer than the input string. If your input string has room for an extra byte (for example, it has the NUL terminating the string), you can safely use the above function to convert it from UTF-8 to ISO-8859-1/15. I deliberately wrote it this way, because it should save you some effort in an embedded environment, although this approach is a bit limited wrt. customization and extension.
Edit:
I included a pair of conversion functions in an edit to this answer for both Latin-1/9 to/from UTF-8 conversion (ISO-8859-1 or -15 to/from UTF-8); the main difference is that those functions return a dynamically allocated copy, and keep the original string intact.

iconv - perform character set conversion
size_t iconv(iconv_t cd,
char **inbuf, size_t *inbytesleft,
char **outbuf, size_t *outbytesleft);
iconv_t iconv_open(const char *tocode, const char *fromcode);
tocode is "ISO_8859-1" and fromcode is "UTF-8".
Working example:
#include <iconv.h>
#include <stdio.h>
int main (void) {
iconv_t cd = iconv_open("ISO_8859-1", "UTF-8");
if (cd == (iconv_t) -1) {
perror("iconv_open failed!");
return 1;
}
char input[] = "Test äöü";
char *in_buf = &input[0];
size_t in_left = sizeof(input) - 1;
char output[32];
char *out_buf = &output[0];
size_t out_left = sizeof(output) - 1;
do {
if (iconv(cd, &in_buf, &in_left, &out_buf, &out_left) == (size_t) -1) {
perror("iconv failed!");
return 1;
}
} while (in_left > 0 && out_left > 0);
*out_buf = 0;
iconv_close(cd);
printf("%s -> %s\n", input, output);
return 0;
}

The following example uses iconv library also.
It works even when you have a file (or input stream) that contains mixed UTF-8 and ISO-8859-1 characters (this could happen, for example, if you have an UTF-8 file and edit it in environement that uses ISO-8859-1).
int Utf8ToLatin1(char* input, char* output, size_t size)
{
size_t in_left = size;
size_t out_left = size;
char *in_buf = input;
char *out_buf = output;
iconv_t cd = iconv_open("ISO_8859-1", "UTF-8");
if (cd == (iconv_t)-1) {
(void) fprintf(stderr, "iconv_open() failed, msg encoding will be kept!");
strncpy(output, input, size);
return -1;
}
do {
if (iconv(cd, &in_buf, &in_left, &out_buf, &out_left) == (size_t) -1) {
if (errno == EILSEQ) {
/* Input conversion stopped due to an input byte that
* does not belong to the input codeset.
*/
printf("Input conversion stopped due to an input byte that does not belong to the input codeset.\n");
*out_buf= *in_buf;
out_buf++ ;out_left--;
in_buf++ ;in_left--;
} else if (errno == E2BIG) {
/* Input conversion stopped due to lack of space in
* the output buffer.
*/
printf("Input conversion stopped due to lack of space in the output buffer.\n");
perror("iconv failed!, propably the encoding is already Latin, msg encoding will be kept!\n");
strncpy(output, input, size);
return -1;
} else if (errno == EINVAL) {
/* Input conversion stopped due to an incomplete
* character or shift sequence at the end of the
* input buffer.
*/
printf("Input conversion stopped due to an incomplete character or shift sequence at the end of the input buffer.\n");
*out_buf= *in_buf;
out_buf++ ;out_left--;
in_buf++ ;in_left--;
}
}
} while (in_left > 0 && out_left > 0);
*out_buf = 0;
iconv_close(cd);
printf("*********************************************************\n");
printf("ISO-8859-1:\n %s\n", input, output);
return 0;
}

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

Mapping multibyte characters to their unicode point representation - c

How do you map a single UTF-8 character to its unicode point in C? [For example, È would be mapped to 00c8].

Some things to look at : libiconv ConvertUTF.h MultiByteToWideChar (under windows)

Related

Why does multibyte character to char32_t conversion use UTF-8 as the multibyte encoding instead of the locale-specific one?

Creating file that uses UTF-8 encoding

How to dump utf8 from octal ISO-8859-1 in C

How to iterate through unicode characters and print them on the screen with printf in C?

Is there a way to convert from UTF8 to ISO-8859-1?

Categories

Resources