I have an array of uint32_t elements that each store a codepoint for a non-latin Unicode character. How do I print them on the console or store them in a file as UTF-8 encoded characters? I understand that they may fail to render properly on a console, but they should display fine if I open them in a compatible editor.
I have tried using wprintf(L"%lc", UINT32_T_VARIABLE), and fwprintf(FILE_STREAM, L"%lc", UINT32_T_VARIABLE) but to no avail.
You must first select the proper locale with:
#include <locale.h>
setlocale(LC_ALL, "C.UTF-8");
or
setlocale(LC_ALL, "en_US.UTF-8");
And then use printf or fprintf with the %lc format:
printf("%lc", UINT32_T_VARIABLE);
This will only work for Unicode code points small enough to fit in a wchar_t. For a more complete and portable solution, you may need to implement the Unicode to UTF-8 conversion yourself, which is not very difficult.
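As a minimal sketch of this approach (assuming a UTF-8 locale is available and that the code points fit in wchar_t on your platform; the sample code points below are just placeholders):
#include <locale.h>
#include <stdio.h>
#include <stdint.h>
#include <wchar.h>
int main(void) {
    setlocale(LC_ALL, "en_US.UTF-8");   /* or "C.UTF-8" where available */
    /* hypothetical sample data: a few non-Latin code points */
    uint32_t codepoints[] = { 0x0416, 0x4E2D, 0x05D0 };
    for (size_t i = 0; i < sizeof codepoints / sizeof codepoints[0]; i++) {
        /* %lc takes a wint_t; this only works if the value fits in wchar_t */
        printf("%lc", (wint_t)codepoints[i]);
    }
    printf("\n");
    return 0;
}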
Best to use existing code when available.
Rolling one's own Unicode code-point to UTF-8 converter is simple, yet easy to mess up. This answer took 2 edits to fix (thanks @Jonathan Leffler, @chqrlie), so rigorous testing is recommended for any self-coded solution. Following is lightly tested code to convert a code point to an array.
Note that the result is not a string.
// Populate utf8 with 0-4 bytes
// Return length used in utf8[]
// 0 implies bad codepoint
unsigned Unicode_CodepointToUTF8(uint8_t *utf8, uint32_t codepoint) {
if (codepoint <= 0x7F) {
utf8[0] = codepoint;
return 1;
}
if (codepoint <= 0x7FF) {
utf8[0] = 0xC0 | (codepoint >> 6);
utf8[1] = 0x80 | (codepoint & 0x3F);
return 2;
}
if (codepoint <= 0xFFFF) {
// detect surrogates
if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return 0;
utf8[0] = 0xE0 | (codepoint >> 12);
utf8[1] = 0x80 | ((codepoint >> 6) & 0x3F);
utf8[2] = 0x80 | (codepoint & 0x3F);
return 3;
}
if (codepoint <= 0x10FFFF) {
utf8[0] = 0xF0 | (codepoint >> 18);
utf8[1] = 0x80 | ((codepoint >> 12) & 0x3F);
utf8[2] = 0x80 | ((codepoint >> 6) & 0x3F);
utf8[3] = 0x80 | (codepoint & 0x3F);
return 4;
}
return 0;
}
// Sample usage
uint32_t cp = foo();
uint8_t utf8[4];
unsigned len = Unicode_CodepointToUTF8(utf8, cp);
if (len == 0) Handle_BadCodePoint();
size_t y = fwrite(utf8, 1, len, stream_opened_in_binary_mode);
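To address the original question (an array of uint32_t code points written out as UTF-8), a sketch along these lines should work; the helper name and the error handling are mine, not part of the answer above:
#include <stdint.h>
#include <stdio.h>
/* assumes Unicode_CodepointToUTF8() from above */
int Write_CodepointsUTF8(const uint32_t *cps, size_t count, FILE *out) {
    for (size_t i = 0; i < count; i++) {
        uint8_t utf8[4];
        unsigned len = Unicode_CodepointToUTF8(utf8, cps[i]);
        if (len == 0) return -1;                          /* bad code point */
        if (fwrite(utf8, 1, len, out) != len) return -1;  /* I/O error */
    }
    return 0;
}
The same loop works for stdout as long as the stream is not translating bytes; on Windows you may want the stream opened in binary mode, as in the sample usage above.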
I have been trying to convert Chinese character input from the Windows command prompt in Big5 to UTF-8, by first converting the received input to char32_t in UTF-32 encoding and then converting that to UTF-8. I have been calling the function mbrtoc32 from <uchar.h> to do this job, but it keeps reporting an encoding error.
These are the conditions I have encountered:
Converting the sequence (Big5) to a wchar_t representation by mbstowcs is successful.
mbrtoc32 treats the multibyte sequence as UTF-8, even though the locale is not. (It is set to "", which reports "Chinese (Traditional)_Hong Kong SAR.950" on my machine.)
Below is the code I have been using to try to debug my problem, with no success. It first converts the Chinese character "香" (U+9999) into its multibyte representation, then tries to convert the Big5 encoding of "香" (0xADBB) into wchar_t and char32_t. However, converting from multibyte (Big5) to char32_t returns an encoding error. (By contrast, feeding the UTF-8 sequence of "香" to mbrtoc32 does return 0x9999 successfully.)
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
#include <errno.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char32_t chi_c = 0x9999;
printf("Character U+9999 is 香\n");
char *mbc = (char *)calloc(32, sizeof(char));
size_t mb_len;
mb_len = c32rtomb(mbc, chi_c, &state);
int i;
printf("The multibyte representation of U+9999 is:\n");
// 0xE9A699, UTF-8
for (i = 0; i < mb_len; i++){
printf("%#2x\t", *(mbc + i));
}
char *src_mbs = (char *)calloc(32, sizeof(char));
// "香" in Big5 encoding
*(src_mbs + 0) = 0xad;
*(src_mbs + 1) = 0xbb;
wchar_t res_wc;
mbtowc(&res_wc, src_mbs, 32); // Success, res_wc == 0x9999
char32_t res_c32;
mb_len = mbrtoc32(&res_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", res_wc);
}
return 0;
}
I have also read the documentation on cppreference.com, which says:
In any case, the multibyte character encoding used by this function is specified by the currently active C locale.
I expect mbrtoc32 to behave like mbtowc, i.e. to convert the character from the locale-specific encoding to UTF-32 (in this case Big5 to UTF-32).
Is there any way to use mbrtoc32 to convert the multibyte character into char32_t without getting the encoding error?
P.S.: I'm using Mingw-w64 on Windows 10, compiling with gcc.
I've found the problem. The Mingw-w64 runtime I'm using expects all multi-byte strings passed to mbrtoc32 and c32rtomb to be in UTF-8 encoding.
Code for mbrtoc32:
size_t mbrtoc32 (char32_t *__restrict__ pc32,
const char *__restrict__ s,
size_t n,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (*s == 0)
{
*pc32 = 0;
return 0;
}
/* ASCII character - high bit unset */
if ((*s & 0x80) == 0)
{
*pc32 = *s;
return 1;
}
/* Multibyte chars */
if ((*s & 0xE0) == 0xC0) /* 110xxxxx needs 2 bytes */
{
if (n < 2)
return (size_t)-2;
*pc32 = ((s[0] & 31) << 6) | (s[1] & 63);
return 2;
}
else if ((*s & 0xf0) == 0xE0) /* 1110xxxx needs 3 bytes */
{
if (n < 3)
return (size_t)-2;
*pc32 = ((s[0] & 15) << 12) | ((s[1] & 63) << 6) | (s[2] & 63);
return 3;
}
else if ((*s & 0xF8) == 0xF0) /* 11110xxx needs 4 bytes */
{
if (n < 4)
return (size_t)-2;
*pc32 = ((s[0] & 7) << 18) | ((s[1] & 63) << 12) | ((s[2] & 63) << 6) | (s[4] & 63);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
and for c32rtomb:
size_t c32rtomb (char *__restrict__ s,
char32_t c32,
mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
if (c32 <= 0x7F) /* 7 bits needs 1 byte */
{
*s = (char)c32 & 0x7F;
return 1;
}
else if (c32 <= 0x7FF) /* 11 bits needs 2 bytes */
{
s[1] = 0x80 | (char)(c32 & 0x3F);
s[0] = 0xC0 | (char)(c32 >> 6);
return 2;
}
else if (c32 <= 0xFFFF) /* 16 bits needs 3 bytes */
{
s[2] = 0x80 | (char)(c32 & 0x3F);
s[1] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[0] = 0xE0 | (char)(c32 >> 12);
return 3;
}
else if (c32 <= 0x1FFFFF) /* 21 bits needs 4 bytes */
{
s[3] = 0x80 | (char)(c32 & 0x3F);
s[2] = 0x80 | (char)((c32 >> 6) & 0x3F);
s[1] = 0x80 | (char)((c32 >> 12) & 0x3F);
s[0] = 0xF0 | (char)(c32 >> 18);
return 4;
}
errno = EILSEQ;
return (size_t)-1;
}
Both of these functions expect the given multi-byte string to be in UTF-8, without considering the locale settings. On glibc, mbrtoc32 and c32rtomb simply call their wide-character counterparts to convert the characters. Since the wide-character conversions do work properly on Mingw-w64, I used mbrtowc and wcrtomb in place of mbrtoc32 and c32rtomb respectively, the same way glibc does:
#include <uchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
#include <wchar.h>
#include <errno.h>
mbstate_t state;
int main(void){
setlocale(LC_CTYPE, "");
printf("Your locale is: %s\n", setlocale(LC_CTYPE, NULL));
char *src_mbs = "\xad\xbb"; // "香" in Big5 encoding
char32_t src_c32 = 0x9999; // "香" code point
unsigned char *r_mbc = (unsigned char *)calloc(32, sizeof(char));
if (r_mbc == NULL){
perror("Failed to allocate memory");
return errno;
}
size_t mb_len = wcrtomb((char *)r_mbc, (wchar_t)src_c32, &state); // Returns 0xADBB, Big5 of "香", OK
printf("Character U+9999 is %s, ( ", r_mbc);
for (int i = 0; i < mb_len; i++){
printf("%#hhx ", *(r_mbc + i));
}
printf(")\n");
// mb_len = c32rtomb(r_mbc, src_c32, &state); // Returns 0xE9A699, UTF-8 representation of "香", expected Big5
// printf("\nThe multibyte representation of U+9999 is:\n");
// for (i = 0; i < mb_len; i++){
// printf("%#hhX\t", *(r_mbc + i));
// }
char32_t r_c32 = 0;
// mb_len = mbrtoc32(&r_c32, src_mbs, (size_t)3, &state);
// Returns (size_t)-1, encoding error
mb_len = mbrtowc((wchar_t *)&r_c32, src_mbs, (size_t)3, &state); // Returns 0x9999, OK
if (mb_len == (size_t)-1){
perror("Encoding error");
return errno;
}
else {
printf("\nThe 32-bit character representation of U+9999 is:\n%#x", r_c32);
}
return 0;
}
I am reading somebody else's code, and I happened to see the following.
According to the comment, this function converts a UCS character to a UTF-8 string. But what is a UCS character, what is the rule for converting UCS to Unicode, and where can I find the relevant documents?
/*
* Convert a UCS character to an UTF-8 string
*
* Returns the string length of the result
*/
size_t
tUcs2Utf8(ULONG ulChar, char *szResult, size_t tMaxResultLen)
{
if (szResult == NULL || tMaxResultLen == 0) {
return 0;
}
if (ulChar < 0x80 && tMaxResultLen >= 2) {
szResult[0] = (char)ulChar;
szResult[1] = '\0';
return 1;
}
if (ulChar < 0x800 && tMaxResultLen >= 3) {
szResult[0] = (char)(0xc0 | ulChar >> 6);
szResult[1] = (char)(0x80 | (ulChar & 0x3f));
szResult[2] = '\0';
return 2;
}
if (ulChar < 0x10000 && tMaxResultLen >= 4) {
szResult[0] = (char)(0xe0 | ulChar >> 12);
szResult[1] = (char)(0x80 | (ulChar >> 6 & 0x3f));
szResult[2] = (char)(0x80 | (ulChar & 0x3f));
szResult[3] = '\0';
return 3;
}
if (ulChar < 0x200000 && tMaxResultLen >= 5) {
szResult[0] = (char)(0xf0 | ulChar >> 18);
szResult[1] = (char)(0x80 | (ulChar >> 12 & 0x3f));
szResult[2] = (char)(0x80 | (ulChar >> 6 & 0x3f));
szResult[3] = (char)(0x80 | (ulChar & 0x3f));
szResult[4] = '\0';
return 4;
}
szResult[0] = '\0';
return 0;
} /* end of tUcs2Utf8 */
Universal Character Set is an ISO standard. It defines the same characters as Unicode, so there's no need for character conversion. Every version of UCS is essentially a small subset of a certain version of the Unicode standard. New characters are first added to Unicode and every so often, UCS is synchronized with Unicode. Appendix C of the Unicode standard contains a table that shows the relationship between different versions.
Also note that the code you posted uses a non-standard upper limit of 0x200000. This should be changed to 0x110000.
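A minimal sketch of that range check, written as a separate helper you could call before tUcs2Utf8 (the helper and its name are mine, not part of the original code):
/* Sketch only: reject values outside the Unicode range, and optionally
 * the UTF-16 surrogate block, before calling tUcs2Utf8(). */
int bIsEncodableUcs(unsigned long ulChar)
{
    if (ulChar >= 0x110000UL)                      /* beyond U+10FFFF */
        return 0;
    if (ulChar >= 0xD800UL && ulChar <= 0xDFFFUL)  /* UTF-16 surrogates */
        return 0;
    return 1;
}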
I'm trying to output the right characters in UTF-8 given the following octal sequences, \303\255 and \346\234\254, but I don't get the correct output.
#include <stdio.h>
#include <stdlib.h>
int encode(char *buf, unsigned char ch){
if(ch < 0x80) {
*buf++ = (char)ch;
return 1;
}
if(ch < 0x800) {
*buf++ = (ch >> 6) | 0xC0;
*buf++ = (ch & 0x3F) | 0x80;
return 2;
}
if(ch < 0x10000) {
*buf++ = (ch >> 12) | 0xE0;
*buf++ = ((ch >> 6) & 0x3F) | 0x80;
*buf++ = (ch & 0x3F) | 0x80;
return 3;
}
if(ch < 0x110000) {
*buf++ = (ch >> 18) | 0xF0;
*buf++ = ((ch >> 12) & 0x3F) | 0x80;
*buf++ = ((ch >> 6) & 0x3F) | 0x80;
*buf++ = (ch & 0x3F) | 0x80;
return 4;
}
return 0;
}
void output (char *str) {
char *buffer = calloc(8, sizeof(char));
int n = 0;
while(*str) {
n = encode(buffer + n, *str++);
}
printf("%s\n", buffer);
free (buffer);
}
int main() {
char *str1 = "\303\255";
char *str2 = "\346\234\254";
output(str1);
output(str2);
return 0;
}
Outputs: à & æ¬ instead of í & 本
The problem is that the byte sequences you use are already UTF-8.
/* Both of these are already UTF-8 chars. */
char *str1 = "\303\255";
char *str2 = "\346\234\254";
So your encode function is trying to re-encode already-encoded UTF-8, which cannot work.
When I print these sequences in my UTF-8 enabled terminal, I see what you are expecting to see:
$ printf "%s\n" $'\303\255'
í
$ printf "%s\n" $'\346\234\254'
本
So maybe you need to rethink what you are trying to accomplish and post a new question if you have new problems there.
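In C, a minimal sketch of the same point (simply writing the already-encoded bytes to stdout, with no conversion step at all) would be:
#include <stdio.h>
int main(void) {
    /* these string literals already contain UTF-8 byte sequences */
    const char *str1 = "\303\255";      /* í  (U+00ED) */
    const char *str2 = "\346\234\254";  /* 本 (U+672C) */
    /* no re-encoding needed: just write the bytes as-is */
    printf("%s\n%s\n", str1, str2);
    return 0;
}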
It's a pity, but you cannot compare a char value (be it signed or unsigned) with values of 0x100 or more, so you are missing something if you try to convert one-byte (ISO-8859-1) values to UTF-8 that way. The ISO-8859-1 characters have the same code values as their Unicode counterparts, so the conversion is fairly straightforward, as will be shown below.
First of all, all the ISO-8859-1 characters are the same as their Unicode counterparts, so the first transformation is the identity: we convert each value in ISO-8859-1 to the same value in UTF (note that when I say UTF I mean the UTF code of the character, without any particular encoding, as opposed to UTF-8, which is an encoding of UTF codes into eight-bit bytes).
UTF values in the range 0x80...0xFF must be encoded with two bytes: the first byte uses bits 7 and 6 with the pattern 110000xx, where xx are the two most significant bits of the input code, followed by a second byte 10xxxxxx, where xxxxxx are the six least significant bits (bits 5 to 0) of the input code. UTF values in the range 0x00...0x7F are encoded as the very same byte as the UTF code.
The following function does precisely this:
size_t iso2utf( unsigned char *buf, unsigned char iso )
{
size_t res = 0;
if ( iso & 0x80 ) {
*buf++ = 0xc0 | (iso >> 6); /* the 110000xx part */
*buf++ = 0x80 | (iso & 0x3f); /* ... and the 10xxxxxx part. */
res += 2;
} else {
*buf++ = iso; /* a 0xxxxxxx character, untouched. */
res++;
}
*buf = '\0';
return res;
} /* iso2utf */
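A small usage sketch for iso2utf (my own addition, not part of the original answer), converting an ISO-8859-1 string to a UTF-8 buffer:
#include <stdio.h>
/* assumes iso2utf() from above */
int main(void)
{
    /* "año" in ISO-8859-1: 'a', 0xF1 (ñ), 'o' */
    const unsigned char iso[] = { 'a', 0xF1, 'o', 0 };
    unsigned char utf8[2 * sizeof iso + 1];  /* worst case: 2 bytes per char, plus NUL */
    size_t len = 0;
    for (size_t i = 0; iso[i] != 0; i++)
        len += iso2utf(utf8 + len, iso[i]);  /* iso2utf NUL-terminates as it goes */
    printf("%s (%zu bytes)\n", (char *)utf8, len);
    return 0;
}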
If you want a complete UTF to UTF-8 encoder, you can try this (I used a different approach, as there can be as many as seven bytes per UTF code --- actually not that many, as Unicode currently only uses 21-bit code points):
#include <string.h>
#include <stdlib.h>
typedef unsigned int UTF; /* you can use wchar_t if you prefer */
typedef unsigned char BYTE;
/* I will assume that UTF string is also zero terminated */
size_t utf_utf8 (BYTE *out, UTF *in)
{
size_t res = 0;
for (;*in;in++) {
UTF c = *in; /* copy the UTF value */
/* we are constructing the string backwards, so finally
* we have it properly ordered. */
size_t n = 0; /* number of characters for this one */
BYTE aux[7], /* buffer to construct the string */
*p = aux + sizeof aux; /* point one cell past the end */
static UTF limits[] = { 0x80, 0x20, 0x10, 0x08, 0x4, 0x2, 0x01};
static UTF masks[] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
for (;c >= limits[n]; c >>= 6) {
*--p = 0x80 | (c & 0x3f); n++;
} /* for */
*--p = masks[n] | c; n++;
memcpy(out, p, n); out += n; res += n;
} /* for */
*out = '\0'; /* terminate string */
return res;
} /* utf_utf8 */
Note that the seven bytes per UTF code is hardwired, which follows from UTF codes being 32-bit integers here. I don't expect UTF codes to grow past the 32-bit limit, but if that ever happened, the UTF typedef and the sizes and contents of the aux, limits and masks tables would have to be changed accordingly. There is also a maximum of 7 or 8 bytes for a UTF-8 encoded character, and the standard does not specify in any form how to proceed if the UTF code space should ever run out of codes, so better not to mess with this too much.
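A small usage sketch of utf_utf8 (my own addition, with a made-up input array), assuming the input is a zero-terminated array of UTF codes:
#include <stdio.h>
#include <string.h>
/* assumes utf_utf8(), UTF and BYTE from above */
int main(void)
{
    UTF in[] = { 0x48, 0xF1, 0x9999, 0 };         /* 'H', 'ñ', '香', terminator */
    BYTE out[7 * (sizeof in / sizeof in[0]) + 1]; /* worst case: 7 bytes per code */
    size_t len = utf_utf8(out, in);
    printf("%s (%zu bytes)\n", (char *)out, len);
    return 0;
}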
Useless function parameter: unsigned char ch
/// In the following bad code, `if(ch < 0x10000)` is never true
int encode(char *buf, unsigned char ch){
if(ch < 0x80) {
...
return 1;
if(ch < 0x800) {
...
return 2;
if(ch < 0x10000) {
Sorry, GTG.
Note: Code incorrectly does not detect high and low surrogates.
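A sketch of the fix (mine, not the answerer's code): take the code point as a 32-bit value so that the larger branches are actually reachable, and reject the surrogate range while you are at it:
#include <stdint.h>
/* Sketch only: widen the parameter so values above 0xFF are representable,
 * and reject UTF-16 surrogates, which are not valid in UTF-8. */
int encode32(char *buf, uint32_t cp) {
    if (cp >= 0xD800 && cp <= 0xDFFF) return 0;   /* surrogate code points */
    if (cp < 0x80) {
        buf[0] = (char)cp;
        return 1;
    }
    if (cp < 0x800) {
        buf[0] = (char)(0xC0 | (cp >> 6));
        buf[1] = (char)(0x80 | (cp & 0x3F));
        return 2;
    }
    if (cp < 0x10000) {
        buf[0] = (char)(0xE0 | (cp >> 12));
        buf[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        buf[2] = (char)(0x80 | (cp & 0x3F));
        return 3;
    }
    if (cp < 0x110000) {
        buf[0] = (char)(0xF0 | (cp >> 18));
        buf[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
        buf[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
        buf[3] = (char)(0x80 | (cp & 0x3F));
        return 4;
    }
    return 0;                                     /* beyond U+10FFFF */
}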
I want to iterate through all (at least the 16 bit) unicode characters and print them on the screen with C.
I know there are related questions on SO, but they don't solve the problem with printf in C, which is what I want to use, if it's possible at all. I think it should be possible, maybe with a trick I'm not aware of.
Since I want to use printf, I thought about something like this:
for (int i = 0x0000; i <= 0xffff; i++) {
//then somehow increment the string
char str[] = "\u25A1\n";
printf("%s", str);
char str[] = "\u25A2\n";
printf("%s", str);
char str[] = "\u25A3\n";
printf("%s", str);
...
}
But it's a bit of a problem to increment the Unicode code point, here \u25A1. I'm aware it's not possible per se, because some characters like \u0000 are not printable and the compiler says no. But apart from that, how could I increment from hexadecimal 0000 to ffff and print each character with printf?
If the __STDC_ISO_10646__ macro is defined, wide characters correspond to Unicode codepoints. So, assuming a locale that can represent the characters you are interested in, you can just printf() wide characters via the %lc format conversion:
#include <stdio.h>
#include <locale.h>
#ifndef __STDC_ISO_10646__
#error "Oops, our wide chars are not Unicode codepoints, sorry!"
#endif
int main()
{
int i;
setlocale(LC_ALL, "");
for (i = 0; i < 0xffff; i++) {
printf("%x - %lc\n", i, i);
}
return 0;
}
In C99, you can use wide character to multibyte character conversion functions wctomb() or wcrtomb() to convert each code point to a local representation, using the current character set. (The code points are in the current character set, not Unicode.) Remember to use setlocale() to ensure conversion functions are aware of the user locale (most importantly, the current character set used). The conversion functions use the LC_CTYPE category, but you should still use setlocale(LC_ALL, ""); as for any other locale-aware program.
(Not all systems have the C.UTF-8 locale installed, so I do not recommend trying to override the locale to the standard C with UTF-8 using setlocale(LC_ALL, "C.UTF-8");. It works on some systems, but not all. AFAIK it does not work in Fedora-based Linux distributions, for example.)
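A minimal sketch of that conversion path (assuming the current locale can actually represent the chosen character; the example character and the terse error handling are mine):
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <wchar.h>
int main(void)
{
    setlocale(LC_ALL, "");              /* use the user's locale and character set */
    wchar_t wc = L'\u25A1';             /* WHITE SQUARE, as a wide character */
    char buf[MB_CUR_MAX + 1];
    mbstate_t st = { 0 };
    size_t len = wcrtomb(buf, wc, &st); /* convert to the locale's multibyte form */
    if (len == (size_t)-1) {
        perror("wcrtomb");
        return EXIT_FAILURE;
    }
    buf[len] = '\0';
    printf("%s\n", buf);
    return EXIT_SUCCESS;
}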
Because you want to output all Unicode code points, I suggest a different approach: Use one of the Universal Character Set Transformation Formats, i.e. UTF-8, UTF-16 (UCS-2 was superseded by UTF-16 in 1996), or UTF-32 (also known as UCS-4). UTF-8 is the one most often used on the Web -- in particular, on this very web page you're looking at right now -- and is very easy to use.
For further reading on why you should prefer UTF-8 over "native wide strings", see utf8everywhere.org.
If you want truly portable code, you can use this header file, utf8.h, to convert UTF-8 to unicode code points (utf8_to_code()) and Unicode code points to UTF-8 (code_to_utf8()):
#ifndef UTF8_H
#define UTF8_H
#include <stdlib.h>
#include <errno.h>
#define UTF8_MAXLEN 6
static size_t utf8_to_code(const unsigned char *const buffer, unsigned int *const codeptr)
{
if (!buffer) {
errno = EINVAL;
return 0;
}
if (*buffer == 0U) {
errno = 0;
return 0;
}
if (*buffer < 128U) {
if (codeptr)
*codeptr = buffer[0];
return 1;
}
if (*buffer < 192U) {
errno = EILSEQ;
return 0;
}
    if (*buffer < 224U) {
        if (buffer[1] >= 128U && buffer[1] < 192U) {
            if (codeptr)
                *codeptr = ((buffer[0] - 192U) << 6U)
                         | (buffer[1] - 128U);
            return 2;
        }
        errno = EILSEQ;
        return 0;
    }
    if (*buffer < 240U) {
        if (buffer[1] >= 128U && buffer[1] < 192U &&
            buffer[2] >= 128U && buffer[2] < 192U) {
            if (codeptr)
                *codeptr = ((buffer[0] - 224U) << 12U)
                         | ((buffer[1] - 128U) << 6U)
                         | (buffer[2] - 128U);
            return 3;
        }
        errno = EILSEQ;
        return 0;
    }
    if (*buffer < 248U) {
        if (buffer[1] >= 128U && buffer[1] < 192U &&
            buffer[2] >= 128U && buffer[2] < 192U &&
            buffer[3] >= 128U && buffer[3] < 192U) {
            if (codeptr)
                *codeptr = ((buffer[0] - 240U) << 18U)
                         | ((buffer[1] - 128U) << 12U)
                         | ((buffer[2] - 128U) << 6U)
                         | (buffer[3] - 128U);
            return 4;
        }
        errno = EILSEQ;
        return 0;
    }
    if (*buffer < 252U) {
        if (buffer[1] >= 128U && buffer[1] < 192U &&
            buffer[2] >= 128U && buffer[2] < 192U &&
            buffer[3] >= 128U && buffer[3] < 192U &&
            buffer[4] >= 128U && buffer[4] < 192U) {
            if (codeptr)
                *codeptr = ((buffer[0] - 248U) << 24U)
                         | ((buffer[1] - 128U) << 18U)
                         | ((buffer[2] - 128U) << 12U)
                         | ((buffer[3] - 128U) << 6U)
                         | (buffer[4] - 128U);
            return 5;
        }
        errno = EILSEQ;
        return 0;
    }
    if (*buffer < 254U) {
        if (buffer[1] >= 128U && buffer[1] < 192U &&
            buffer[2] >= 128U && buffer[2] < 192U &&
            buffer[3] >= 128U && buffer[3] < 192U &&
            buffer[4] >= 128U && buffer[4] < 192U &&
            buffer[5] >= 128U && buffer[5] < 192U) {
            if (codeptr)
                *codeptr = ((buffer[0] - 252U) << 30U)
                         | ((buffer[1] - 128U) << 24U)
                         | ((buffer[2] - 128U) << 18U)
                         | ((buffer[3] - 128U) << 12U)
                         | ((buffer[4] - 128U) << 6U)
                         | (buffer[5] - 128U);
            return 6;
        }
        errno = EILSEQ;
        return 0;
    }
errno = EILSEQ;
return 0;
}
static size_t code_to_utf8(unsigned char *const buffer, const unsigned int code)
{
if (code < 128U) {
buffer[0] = code;
return 1;
}
if (code < 2048U) {
buffer[0] = 0xC0U | (code >> 6U);
buffer[1] = 0x80U | (code & 0x3FU);
return 2;
}
if (code < 65536) {
buffer[0] = 0xE0U | (code >> 12U);
buffer[1] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[2] = 0x80U | (code & 0x3FU);
return 3;
}
if (code < 2097152U) {
buffer[0] = 0xF0U | (code >> 18U);
buffer[1] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[3] = 0x80U | (code & 0x3FU);
return 4;
}
if (code < 67108864U) {
buffer[0] = 0xF8U | (code >> 24U);
buffer[1] = 0x80U | ((code >> 18U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[3] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[4] = 0x80U | (code & 0x3FU);
return 5;
}
if (code <= 2147483647U) {
buffer[0] = 0xFCU | (code >> 30U);
buffer[1] = 0x80U | ((code >> 24U) & 0x3FU);
buffer[2] = 0x80U | ((code >> 18U) & 0x3FU);
buffer[3] = 0x80U | ((code >> 12U) & 0x3FU);
buffer[4] = 0x80U | ((code >> 6U) & 0x3FU);
buffer[5] = 0x80U | (code & 0x3FU);
return 6;
}
errno = EINVAL;
return 0;
}
#endif /* UTF8_H */
It is not fast, but it should be easy to understand, and supports all possible Unicode code points (U+0000 to U+10FFFF, inclusive), on all systems with at least 32-bit unsigned ints. On systems with 16-bit unsigned ints, your compiler may warn about unreachable code, and it'll only support the first 65536 code points (U+0000 to U+FFFF).
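As a quick sanity check of the header (my own snippet, not part of the original answer), you can round-trip a code point through code_to_utf8() and utf8_to_code():
#include <stdio.h>
#include "utf8.h"
int main(void)
{
    unsigned char buf[UTF8_MAXLEN + 1];
    unsigned int  code = 0x2603;            /* U+2603 SNOWMAN, picked arbitrarily */
    unsigned int  back = 0;
    size_t len = code_to_utf8(buf, code);   /* encode to UTF-8 bytes */
    buf[len] = '\0';
    size_t used = utf8_to_code(buf, &back); /* decode the bytes again */
    printf("U+%04X -> %zu bytes -> U+%04X (%zu bytes consumed)\n",
           code, len, back, used);
    return 0;
}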
Using the above utf8.h, you can easily write a C program that outputs an HTML page containing the Unicode characters you want (excluding the control characters U+0000-U+001F and U+007F-U+009F, inclusive, and the invalid code points U+D800-U+DFFF, inclusive). For example, page.c:
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include "utf8.h"
int main(void)
{
unsigned char ch[UTF8_MAXLEN + 1];
unsigned int i;
const char *str;
size_t n, len;
/* HTML5 DOCTYPE */
printf("<!DOCTYPE html>\n");
printf("<html>\n");
/* Header part. */
printf(" <head>\n");
printf(" <title> Unicode character list </title>\n");
printf(" <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n");
printf(" <style type=\"text/css\">\n");
/* with internal CSS stylesheet: */
printf(" html {\n");
printf(" font-family: \"DejaVu Mono\", \"Courier New\", \"Courier\", monospace;\n");
printf(" font-weight: normal;\n");
printf(" font-size: 100%%;\n");
printf(" text-decoration: none;\n");
printf(" background: #f7f7f7;\n");
printf(" color: #000000;\n");
printf(" padding: 0 0 0 0;\n");
printf(" border: 0 none;\n");
printf(" margin: 0 0 0 0\n");
printf(" }\n");
printf(" body {\n");
printf(" background: #ffffff;\n");
printf(" padding: 0.5em 1em 0.5em 1em;\n");
printf(" border: 1px solid #cccccc;\n");
printf(" margin: 0 auto auto auto;\n");
printf(" width: 12em;\n");
printf(" text-align: center;\n");
printf(" }\n");
printf(" p {\n");
printf(" padding: 0 0 0 0;\n");
printf(" border: 0 none;\n");
printf(" margin: 0 0 0 0;\n");
printf(" outline: 0 none;\n");
printf(" text-align: center;\n");
printf(" }\n");
printf(" p.odd {\n");
printf(" background: #efefef;\n");
printf(" }\n");
printf(" p.even {\n");
printf(" background: #f7f7f7;\n");
printf(" }\n");
printf(" span.code {\n");
printf(" width: 8em;\n");
printf(" text-align: right;\n");
printf(" }\n");
printf(" span.char {\n");
printf(" width: 4em;\n");
printf(" text-align: left;\n");
printf(" }\n");
printf(" </style>\n");
printf(" </head>\n");
/* Body part. */
printf(" <body>\n");
n = 0;
for (i = 0U; i <= 0xFFFFU; i++) {
/* Skip Unicode control characters. */
if ((i >= 0U && i <= 31U) ||
(i >= 127U && i <= 159U))
continue;
/* Skip invalid Unicode code points. */
if (i >= 0xD800U && i <= 0xDFFFU)
continue;
len = code_to_utf8(ch, i);
if (len > 0) {
ch[len] = '\0';
/* HTML does not like " & < > */
if (i == 32U)
    str = "&nbsp;";
else
if (i == 34U)
    str = "&quot;";
else
if (i == 38U)
    str = "&amp;";
else
if (i == 60U)
    str = "&lt;";
else
if (i == 62U)
    str = "&gt;";
else
    str = (const char *)ch;
if (n & 1) {
printf(" <p class=\"odd\" title=\"%u in decimal, &#%u; = %s\">", i, i, str);
printf("<span class=\"code\">U+%04X</span>", i);
printf(" <span class=\"char\">%s</span>", str);
printf("</p>\n");
} else {
printf(" <p class=\"even\" title=\"%u in decimal, &#%u; = %s\">", i, i, str);
printf("<span class=\"code\">U+%04X</span>", i);
printf(" <span class=\"char\">%s</span>", str);
printf("</p>\n");
}
n++;
}
}
printf(" </body>\n");
printf("</html>\n");
return EXIT_SUCCESS;
}
Redirect the output to a file, and you can open the file in whatever browser you prefer. If your browser is sane, and does not treat local files any different to those it obtains from a web server, then you should see the correct output.
(If you see multiple characters per code point after U+00A0, your browser has decided that because the file is local, it should use a different character set than the one the page explicitly declares. Switch to a sane browser if that happens, or override the character set selection.)
If you want, you can just print the codes out as UTF-8 text, say using text.c:
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include "utf8.h"
int main(void)
{
unsigned char ch[UTF8_MAXLEN + 1];
unsigned int i;
size_t len;
for (i = 0U; i <= 0xFFFFU; i++) {
/* Skip Unicode control characters. */
if ((i >= 0U && i <= 31U) ||
(i >= 127U && i <= 159U))
continue;
/* Skip invalid Unicode code points. */
if (i >= 0xD800U && i <= 0xDFFFU)
continue;
len = code_to_utf8(ch, i);
if (len > 0) {
ch[len] = '\0';
printf("U+%04X %s \n", i, ch);
}
}
return EXIT_SUCCESS;
}
but then you must either be sure your terminal or terminal emulator supports UTF-8 and uses an UTF-8 locale, or you redirect the output to a text file and open that file in an editor which either assumes the file uses UTF-8 or lets you explicitly select the UTF-8 character set.
Note that there is a space before and after each character. Because some of the code points are combining characters, they may not show up at all unless they can be combined with another character, and most (all?) combine with space just fine.
If you use Windows, then you must conform to Microsoft stupidity and add a special "byte order mark" -- printf("\xEF\xBB\xBF"); -- to the beginning of the output, so that its utilities like Notepad recognize the file as UTF-8. It's a Windows-only wart; treat it as such.
Questions?
The function to convert a 16-bit Unicode codepoint to a multibyte character sequence is c16rtomb; there is also c32rtomb if you want to handle 32-bit codepoints:
#include <uchar.h>
mbstate_t ps = { 0 };           /* zero-initialize the conversion state */
char buf[MB_CUR_MAX];
size_t bytes = c16rtomb(buf, i, &ps);
if (bytes != (size_t) -1) {
    printf("%.*s\n", (int) bytes, buf);
}
If c16rtomb is not available you will need to use platform-specific facilities.
I would go for something like this (using raw UTF-8 encoding):
char unicode[3] = { 0x00, 0x00, 0x00 };
for(size_t i=0; i<0xffff; i++)
{
printf("%s\n", unicode);
uint16_t *code = (uint16_t *) &unicode[0];
*code = *code +1;
}
Define a string of 3 bytes, the last one being the NUL terminator, which allows display via printf.
Treat the first two bytes as your 16-bit Unicode value and increment it on each loop iteration.
Of course this can be improved, since:
Many characters won't be displayable.
The char* -> uint16_t* cast is not very elegant (it triggers a warning).
As there are only 2 bytes of UTF-8 here, it will actually only cover 11 bits of code points. To reach the full 16 bits you would want to use uint32_t and define a 5-byte char buffer.
[EDIT] As stated in the comment, this loop actually generates a lot of invalid UTF-8 sequences.
Indeed, going from U+007F to U+0080 is a +1 for code points, but in UTF-8 you jump from 0x7F to 0xC2 0x80: you need to exclude some ranges in the loop. A safer variant is sketched below.
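A sketch of that safer approach (my own code, not from the original answer): keep the counter as a code point and re-encode it on every iteration instead of incrementing the UTF-8 bytes directly. The encoder below handles the BMP only and skips the surrogate range:
#include <stdio.h>
#include <stdint.h>
int main(void)
{
    for (uint32_t cp = 0x20; cp <= 0xFFFF; cp++) {
        if (cp >= 0xD800 && cp <= 0xDFFF)   /* skip UTF-16 surrogates */
            continue;
        char buf[4];
        int len;
        if (cp < 0x80) {                    /* 1-byte form */
            buf[0] = (char)cp;
            len = 1;
        } else if (cp < 0x800) {            /* 2-byte form */
            buf[0] = (char)(0xC0 | (cp >> 6));
            buf[1] = (char)(0x80 | (cp & 0x3F));
            len = 2;
        } else {                            /* 3-byte form (rest of the BMP) */
            buf[0] = (char)(0xE0 | (cp >> 12));
            buf[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
            buf[2] = (char)(0x80 | (cp & 0x3F));
            len = 3;
        }
        buf[len] = '\0';
        printf("U+%04X %s\n", (unsigned)cp, buf);
    }
    return 0;
}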
The data type is char, and the pattern is as follows:
source byte: [0][1][2][3][4][5][6][7]
destination: [6][7][4][5][2][3][0][1]
For example, if I pass a char with the value 29 to this function, it will do the swapping and return a char value of 116.
How can I do the swapping?
thank you.
========================
Just wondering, can I do it this way?
unsigned char mask = 128;
char num = 0, result = 0, value1 = 29;
int i, a;
for(i = 0; i < 8; i++) {
if (i == 0 || i == 1 || i == 6 || i == 7)
a = 6;
else
a = 2;
if(i < 4)
num = ((value1 & mask) >> a);
else
num = ((value1 & mask) << a);
result = (result | num);
if(i<7)
mask = mask >> 1;
}
I usually number my bits the other way -- so that bit 0 is the LSB. Following your numbering scheme:
unsigned char src = 29;
unsigned char dst = 0;
dst = (((src & 0x80) >> 6) | // bit 0
((src & 0x40) >> 6) | // bit 1
((src & 0x20) >> 2) | // bit 2
((src & 0x10) >> 2) | // bit 3
((src & 0x08) << 2) | // bit 4
((src & 0x04) << 2) | // bit 5
((src & 0x02) << 6) | // bit 6
((src & 0x01) << 6) // bit 7
);
Unless of course, you're numbering them "the right way", but drawing them "backwards" -- then just reverse what I've done above. (Not that I'm trying to start a religious war here...)
Or use a lookup table.
Just in case you don't understand that, here is more detail.
For each of the 256 possible inputs, work out the answer (by hand), then do:
unsigned char answers[256] = {0x00, 0x40,0x21.....};
unsigned char answer = answers[input];
I hasten to add that the values I gave are an example - and are certainly not correct
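If working out 256 entries by hand sounds error-prone, a sketch like this (my own addition) could generate the table once at startup, using the pairwise-swap formula from the other answers:
#include <stdio.h>
static unsigned char answers[256];
/* Build the table once, using the bit-pair reversal [01][23][45][67] -> [67][45][23][01]. */
static void build_table(void)
{
    for (int i = 0; i < 256; i++) {
        unsigned char b = (unsigned char)i;
        b = (unsigned char)(((b & 0xF0) >> 4) | ((b & 0x0F) << 4)); /* swap nibbles   */
        b = (unsigned char)(((b & 0xCC) >> 2) | ((b & 0x33) << 2)); /* swap bit pairs */
        answers[i] = b;
    }
}
int main(void)
{
    build_table();
    printf("%d\n", answers[29]);   /* prints 116 */
    return 0;
}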
See the "Reversing bit sequences" section on Bit Twiddling Hacks.
Also, if you want to do it yourself:
To read the n-th bit: int bit = value & (1 << n); If the bit is not set, bit is 0.
To set the n-th bit: value |= 1 << n; (value = value OR (1 shifted by n digits))
To clear the n-th bit: value &= ~(1 << n); (value = value AND NOT (1 shifted by n digits))
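A tiny demo of those three operations (my own snippet), using bit 3 of an example value:
#include <stdio.h>
int main(void)
{
    unsigned value = 0x29;              /* 0010 1001 in binary, an example value */
    int n = 3;
    int bit = value & (1u << n);        /* read bit n: non-zero if the bit is set */
    printf("bit %d is %s\n", n, bit ? "set" : "clear");
    value |= 1u << n;                   /* set bit n */
    printf("after set:   0x%02X\n", value);
    value &= ~(1u << n);                /* clear bit n */
    printf("after clear: 0x%02X\n", value);
    return 0;
}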
First swap the lower four bits with the higher four bits, then swap all adjacent pairs of bits:
dst = src;
dst = ((dst & 0xF0) >> 4) | ((dst & 0x0F) << 4);
dst = ((dst & 0xCC) >> 2) | ((dst & 0x33) << 2);
You may find this helpful:
http://graphics.stanford.edu/~seander/bithacks.html#BitReverseObvious
but the bit reversal there isn't exactly what you need. With just a little work you could change the "obvious" algorithm to do what you want.
source byte: [01][23][45][67] to
destination: [67][45][23][01]
Implementation:
unsigned char shiftit( unsigned char in ) {
unsigned char out;
out = (
(( in & 0xC0 ) >> 6) + /* top 2 to bottom 2 */
(( in & 0x30 ) >> 2) + /* centre-left 2 to centre-right */
(( in & 0x0C ) << 2) + /* centre-right 2 to centre-left */
(( in & 0x03 ) << 6) /* bottom 2 to top 2 */
);
return( out );
}
Returns 116 when called shiftit( 29 ).
Rotate through carry http://en.wikipedia.org/wiki/Bitwise_operation#Rotate_through_carry
So this would rotate the byte left by two bit positions:
myByte = myByte << 2 | myByte >> 6;
Note that this happens to give 116 for the example value 29, but a plain rotation only matches the requested pair reversal when bits [2][3] happen to equal bits [6][7], so it is not a general solution.