How to print UTF-16 LE in Linux?

My program writes an XML file that is supposed to be UTF-16 LE, and I print the FF FE byte-order mark first, but the file comes out as garbage. I have also used setlocale and the wchar functions. Can someone explain what is causing this?
#define XML_UNICODE "\xFF\xFE"
#define XML_HEADER "<?xml version=\"1.0\" encoding=\"UTF-16\" standalone=\"yes\"?>"
Below is a snippet of my code:
int liIndex=0;
int liSize=0;
char* sTxt=NULL;
wchar_t swTxt[LEN_XML_CONTENT];
wchar_t swUnicode[MAX_BUFF];
const char* cTxt;
const char* cUnicode;
cUnicode=XML_UNICODE;
mbstowcs(swUnicode, cUnicode, MAX_BUFF);
fwprintf(file, L"%ls", swUnicode);
cTxt=XML_HEADER;
liSize=strlen(cTxt);
mbstowcs(swTxt, cTxt, MAX_BUFF);
for (liIndex = 0; liIndex < liSize; liIndex++)
{
    fwprintf(file, L"%lc", swTxt[liIndex]);
}
fwprintf(file, L"\n");
When I check the output XML file in Notepad++, the detected encoding is already UCS-2 LE, but the content itself is garbage.
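For what it's worth: on Linux, mbstowcs() and fwprintf() go through the current locale's multibyte encoding (usually UTF-8), so code like the above never produces UTF-16 output, and "\xFF\xFE" is not even a valid multibyte sequence in a UTF-8 locale, so mbstowcs() fails on the BOM. Below is a minimal sketch of one alternative that avoids wide streams entirely, assuming iconv(3) is available; the output file name and the fixed buffer size are illustrative:
#include <stdio.h>
#include <string.h>
#include <iconv.h>

int main(void)
{
    const char *header =
        "<?xml version=\"1.0\" encoding=\"UTF-16\" standalone=\"yes\"?>\n";
    char out[4096];

    FILE *file = fopen("out.xml", "wb");
    if (file == NULL)
        return 1;

    /* write the UTF-16 LE byte-order mark as two raw bytes */
    fwrite("\xFF\xFE", 1, 2, file);

    /* convert the ASCII/UTF-8 header to UTF-16LE and write the converted bytes */
    iconv_t cd = iconv_open("UTF-16LE", "UTF-8");
    if (cd != (iconv_t) -1) {
        char *in_p = (char *) header;
        size_t in_left = strlen(header);
        char *out_p = out;
        size_t out_left = sizeof(out);

        if (iconv(cd, &in_p, &in_left, &out_p, &out_left) != (size_t) -1)
            fwrite(out, 1, sizeof(out) - out_left, file);
        iconv_close(cd);
    }
    fclose(file);
    return 0;
}
The file stays byte-oriented throughout, so no setlocale() or wide-character I/O is involved.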

Related

Change the character encode in PostgreSQL C language function

I am using PostgreSQL 9.5, 64-bit, on Windows Server.
The character encoding of the database is set to UTF8.
I'd like to create a function that manipulates multibyte strings
(e.g. cleansing, replacing, etc.).
I copied the C logic for manipulating the characters from another system;
that logic assumes the character encoding is SJIS.
I do not want to change the C logic, so I want to convert from UTF8 to SJIS inside the PostgreSQL C-language function,
like the convert_to function does. (However, since convert_to returns the bytea type, I want to get the result back as TEXT.)
Please tell me how to convert from UTF-8 to SJIS in C.
Create Function Script:
CREATE FUNCTION CLEANSING_STRING(character varying)
RETURNS character varying AS
'$libdir/MyFunc/CLEANSING_STRING.dll', 'CLEANSING_STRING'
LANGUAGE c VOLATILE STRICT;
C Source:
#include <stdio.h>
#include <string.h>
#include <postgres.h>
#include <port.h>
#include <fmgr.h>
#include <stdlib.h>
#include <builtins.h>
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif
extern PGDLLEXPORT Datum CLEANSING_STRING(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(CLEANSING_STRING);
Datum CLEANSING_STRING(PG_FUNCTION_ARGS)
{
    // Get Arg
    text *arg1 = (text *) PG_GETARG_TEXT_P(0);

    // Text to Char[]
    char *arg;
    arg = text_to_cstring(arg1);

    // UTF8 to Sjis
    //Char *sjisChar[] = foo(arg); // something like that..

    // Copied from other system. (Assumes that the character code is sjis.)
    cleansingString(sjisChar);
    replaceStrimg(sjisChar);

    // Sjis to UTF8
    //arg = bar(sjisChar); // something like that..

    // Char[] to Text and Return
    PG_RETURN_TEXT_P(cstring_to_text(arg));
}
I succeeded with the approach suggested in the comments on the question.
#include <mb/pg_wchar.h> //Add to include.
...
Datum CLEANSING_STRING(PG_FUNCTION_ARGS)
{
    // Get Arg
    text *arg1 = (text *) PG_GETARG_TEXT_P(0);

    // Text to char[]
    char *arg;
    arg = text_to_cstring(arg1);

    // UTF8 to SJIS
    char *sjisChar = pg_server_to_any(arg, strlen(arg), PG_SJIS);

    // Copied from the other system. (Assumes that the character code is SJIS.)
    cleansingString(sjisChar);
    replaceStrimg(sjisChar);

    // SJIS to UTF8: converts from SJIS back to the server encoding (UTF-8);
    // the third argument is the encoding of the conversion source.
    arg = pg_any_to_server(sjisChar, strlen(sjisChar), PG_SJIS);

    // char[] to text and return
    PG_RETURN_TEXT_P(cstring_to_text(arg));
}

iconv_open() returning EINVAL on Solaris 8

In Solaris 8, it looks like the iconv*() family of functions is broken and only supports conversions between single-byte charsets and UTF-8, which can be verified using this code example:
#include <stdio.h>
#include <errno.h>
#include <iconv.h>
#if defined(__sun) && defined(__SVR4)
#define CP1251 "ansi-1251"
#define ISO_8859_5 "ISO8859-5"
#else
#define CP1251 "CP1251"
#define ISO_8859_5 "ISO-8859-5"
#endif
void iconv_open_debug(const char *, const char *);
int main() {
iconv_open_debug(CP1251, CP1251);
iconv_open_debug(CP1251, ISO_8859_5);
iconv_open_debug(CP1251, "KOI8-R");
iconv_open_debug(CP1251, "UTF-8");
iconv_open_debug(CP1251, "WCHAR_T");
iconv_open_debug(ISO_8859_5, CP1251);
iconv_open_debug(ISO_8859_5, ISO_8859_5);
iconv_open_debug(ISO_8859_5, "KOI8-R");
iconv_open_debug(ISO_8859_5, "UTF-8");
iconv_open_debug(ISO_8859_5, "WCHAR_T");
iconv_open_debug("KOI8-R", CP1251);
iconv_open_debug("KOI8-R", ISO_8859_5);
iconv_open_debug("KOI8-R", "KOI8-R");
iconv_open_debug("KOI8-R", "UTF-8");
iconv_open_debug("KOI8-R", "WCHAR_T");
iconv_open_debug("UTF-8", CP1251);
iconv_open_debug("UTF-8", ISO_8859_5);
iconv_open_debug("UTF-8", "KOI8-R");
iconv_open_debug("UTF-8", "UTF-8");
iconv_open_debug("UTF-8", "WCHAR_T");
iconv_open_debug("WCHAR_T", CP1251);
iconv_open_debug("WCHAR_T", ISO_8859_5);
iconv_open_debug("WCHAR_T", "KOI8-R");
iconv_open_debug("WCHAR_T", "UTF-8");
iconv_open_debug("WCHAR_T", "WCHAR_T");
return 0;
}
void iconv_open_debug(const char *from, const char *to) {
    errno = 0;
    if (iconv_open(to, from) == (iconv_t) -1) {
        fprintf(stderr, "iconv_open(\"%s\", \"%s\") FAIL: errno = %d\n", to, from, errno);
        perror("iconv_open()");
    } else {
        fprintf(stdout, "iconv_open(\"%s\", \"%s\") PASS\n", to, from);
    }
}
which only prints
iconv_open("UTF-8", "ansi-1251") PASS
iconv_open("UTF-8", "ISO8859-5") PASS
iconv_open("UTF-8", "KOI8-R") PASS
iconv_open("ansi-1251", "UTF-8") PASS
iconv_open("ISO8859-5", "UTF-8") PASS
iconv_open("KOI8-R", "UTF-8") PASS
to stdout, and iconv_open() fails with EINVAL for the other pairs. Note that even conversion to the same charset (e.g. UTF-8 -> UTF-8) is not supported.
Questions
Can anyone reference a document describing the limitations of the Solaris version of iconv.h?
How can I convert a wchar_t* to a single- or multibyte string w/o relying on GNU libiconv? wcstombs() would be fine except that it relies on the current locale's charset, while I want a wide string converted to a regular string using a particular charset, possibly different from the default one.
Running sdtconvtool shows most legacy codepages are supported.
After re-running the same utility with truss -u libc::iconv_open, I learnt that conversion from one single-byte encoding to another single-byte one is done in two steps, with intermediate conversion to UTF-8.
As for conversion from "WCHAR_T", iconv(3) does support it, but "UCS-4" should be used as the source charset name, since sizeof(wchar_t) is 4 on Solaris (for both x86 and SPARC).
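Building on that, here is a minimal sketch of converting a wchar_t string to a narrow string with iconv(3) and without GNU libiconv, presenting the wide string as "UCS-4" as described above. The target charset (UTF-8 here) is just an example, and on a little-endian x86 box an explicit "UCS-4LE" may be needed:
#include <stdio.h>
#include <wchar.h>
#include <iconv.h>

int main(void)
{
    wchar_t wide[] = L"Hello, iconv";
    char narrow[256];

    /* sizeof(wchar_t) == 4 here, so the wide string can be fed to iconv as UCS-4 */
    iconv_t cd = iconv_open("UTF-8", "UCS-4");
    if (cd == (iconv_t) -1) {
        perror("iconv_open");
        return 1;
    }

    char *in_p = (char *) wide;
    size_t in_left = wcslen(wide) * sizeof(wchar_t);
    char *out_p = narrow;
    size_t out_left = sizeof(narrow) - 1;

    if (iconv(cd, &in_p, &in_left, &out_p, &out_left) == (size_t) -1) {
        perror("iconv");
        iconv_close(cd);
        return 1;
    }
    *out_p = '\0';
    iconv_close(cd);

    printf("%s\n", narrow);
    return 0;
}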

fprintf() issues utf-8 linux

OK, I managed to print UTF-8 encoded chars to the terminal, but printing to a file is not working like I expected. I am using wchar.h and locale.h as follows:
#include <stdio.h>   /* needed for FILE, fopen, printf */
#include <locale.h>
#include <wchar.h>

int main(){
    setlocale(LC_ALL, "");
    wint_t index = 0;
    FILE* fpinout = fopen("UTF-8.txt", "w");
    for(index = 0; index < 0x200; index++){
        printf("%i:\t%lc\n", index, index);        //works fine, prints utf-8 chars to terminal
        fprintf(fpinout, "%i\t%lc", index, index); //does not work, output is weird
    }
    fclose(fpinout);
}
I tried to use index there both as wint_t and wchar_t.
My UTF-8.txt file looks like this:
र㄀ĉल㌂̉ऴ㔄ԉश㜆܉स㤈उ〱ਉㄱଉ㈱ఉ㌱ഉ㐱ฉ㔱༉㘱ဉ㜱ᄉ㠱ሉ㤱ጉ〲ᐉㄲᔉ㈲ᘉ㌲ᜉ㐲᠉㔲ᤉ㘲ᨉ㜲ᬉ㠲ᰉ㤲ᴉ〳ḉㄳἉ㈳ ㌳℉㐳∉㔳⌉㘳␉㜳
┉㠳☉㤳✉〴⠉ㄴ⤉㈴⨉㌴⬉㐴Ⰹ㔴ⴉ㘴⸉㜴⼉㠴〉㤴ㄉ〵㈉ㄵ㌉㈵㐉㌵㔉㐵㘉㔵㜉㘵㠉㜵㤉㠵㨉㤵㬉〶㰉ㄶ㴉㈶㸉㌶㼉㐶䀉㔶䄉㘶䈉
㜶䌉㠶䐉㤶䔉〷䘉ㄷ䜉㈷䠉㌷䤉㐷䨉㔷䬉㘷䰉㜷䴉㠷三㤷伉〸倉ㄸ儉㈸刉㌸匉㐸吉㔸唉㘸嘉㜸圉㠸堉㤸変〹娉ㄹ嬉㈹尉㌹崉㐹帉
㔹弉㘹怉㜹愉㠹戉㤹按〱रㅤ㄰攉〱लㅦ㌰有〱ऴㅨ㔰椉〱शㅪ㜰欉〱सㅬ㤰洉ㄱरㅮㄱ漉ㄱलㅰ㌱焉ㄱऴㅲ㔱猉ㄱशㅴ㜱甉ㄱसㅶ㤱眉
㈱रㅸㄲ礉㈱लㅺ㌲笉㈱ऴㅼ㔲紉㈱शㅾ㜲缉㈱स胂㈱ह臂㌱र苂㌱ऱ菂㌱ल蓂㌱ळ藂㌱ऴ蛂㌱व蟂㌱श裂㌱ष观㌱स諂㌱ह诂㐱र賂㐱ऱ跂㐱ल軂㐱
ळ迂㐱ऴ郂㐱व釂㐱श鋂㐱ष鏂㐱स铂㐱ह闂㔱र雂㔱ऱ韂㔱ल飂㔱ळ駂㔱ऴ髂㔱व鯂㔱श鳂㔱ष鷂㔱स黂㔱ह鿂㘱रꃂ㘱ऱꇂ㘱लꋂ㘱ळꏂ㘱ऴ꓂
㘱वꗂ㘱शꛂ㘱षꟂ㘱सꣂ㘱ह꧂㜱रꫂ㜱ऱꯂ㜱ल곂㜱ळ귂㜱ऴ껂㜱व꿂㜱श냂㜱ष뇂㜱स닂㜱ह돂㠱र듂㠱ऱ뗂㠱ल뛂㠱ळ럂㠱ऴ룂㠱व맂㠱श뫂
㠱ष믂㠱स볂㠱ह뷂㤱र뻂㤱ऱ뿂㤱ल胃㤱ळ臃㤱ऴ苃㤱व菃㤱श蓃㤱ष藃㤱स蛃㤱ह蟃〲र裃〲ऱ觃〲ल諃〲ळ诃〲ऴ賃〲व跃〲श軃〲ष迃〲स郃〲ह
釃ㄲर鋃ㄲऱ鏃ㄲल铃ㄲळ闃ㄲऴ雃ㄲव韃ㄲश飃ㄲष駃ㄲस髃ㄲह鯃㈲र鳃㈲ऱ鷃㈲ल黃㈲ळ鿃㈲ऴꃃ㈲वꇃ㈲शꋃ㈲षꏃ㈲स꓃㈲हꗃ㌲रꛃ㌲ऱꟃ㌲
लꣃ㌲ळ꧃㌲ऴ꫃㌲वꯃ㌲श곃㌲ष귃㌲स껃㌲ह꿃㐲र냃㐲ऱ뇃㐲ल닃㐲ळ돃㐲ऴ듃㐲व뗃㐲श뛃㐲ष럃㐲स룃㐲ह맃㔲र뫃㔲ऱ믃㔲ल볃㔲ळ뷃㔲ऴ뻃
㔲व뿃
Any help is appreciated.
Written this way you are in effect writing UTF-32; opening the file in binary mode won't help, it will remain UTF-32LE.
You should convert to UTF-8 explicitly. Either use the ICU library or the wctomb / wcstombs / wcslen C functions ( http://man7.org/linux/man-pages/man3/wctomb.3.html ). Be aware that the wctomb* functions are locale dependent (they often won't convert correctly, e.g. Japanese text under a Greek locale). A sketch of that approach follows.
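For illustration, here is a minimal sketch of the wctomb() route mentioned above (one possible approach, not the only one): convert each wide character to the locale's multibyte encoding yourself and write the resulting bytes to a plain byte-oriented stream. The file name mirrors the one in the question; characters that wctomb() cannot convert are simply skipped.
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    setlocale(LC_ALL, "");              /* use the environment's locale, typically UTF-8 */

    FILE *fp = fopen("UTF-8.txt", "w");
    if (fp == NULL)
        return 1;

    for (wchar_t wc = 1; wc < 0x200; wc++) {   /* start at 1 to skip the NUL character */
        char buf[MB_LEN_MAX];
        int len = wctomb(buf, wc);      /* convert one wide char to multibyte bytes */
        if (len > 0) {
            fprintf(fp, "%d\t", (int) wc);
            fwrite(buf, 1, (size_t) len, fp);  /* write the encoded character as raw bytes */
            fputc('\n', fp);
        }
    }
    fclose(fp);
    return 0;
}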

what chars are special in c?

I have this text with special chars in it and I would like to "escape" the special chars so that my program compiles, but I do not know whether / : - = are special chars. Do I need to escape them as well? Here is an example:
static const char *postthis="text and spec chars";
and here is example of text which I want to put in
<text:Text
text:text="http://text.text.text/text/text/"
text:text="text:text:text-text-text">
<text:Text>
<text text="http://text.com/text">
<productID>20630175</textID>
</text>
</text:text>
</text:text>
So I put \ before < and ", but again I got an error. What do I need to escape, and how?
static const char *postthis="\<text:Text
text:text=\"http://text.text.text/text/text/\"
text:text=\"text:text:text-text-text\"\>
\<text:Text\>
\<text text="http://text.com/text\"\>
\<textID\>20630175\</textID\>
\</text\>
\</text:text\>
\</text:text\>";
I think you need to escape the end of each line in a multi-line initializer, and the quotes. If you wanted literal tabs or newlines in the string you would write those as \t and \n.
static const char *postthis="\<text:Text\
text:text=\"http://text.text.text/text/text/\"\n\
text:text=\"text:text:text-text-text\">\n\
<text:Text>\n\
<text text=\"http://text.com/text\">\n\
<textID>20630175</textID>\n\
</text>\n\
</text:text>\n\
</text:text>";
The only characters you need to escape inside a C string are \ and ". There's no reason to escape <. Line breaks in the source can be escaped with a trailing \, or you can put newlines into the string as \n (which I prefer, since it lets you indent more nicely IMO). You cannot simply embed raw, unescaped line breaks, however (which is probably your main problem).
XML supports both ' and " as quotation, so you can almost always simplify your life by using '.
static const char *postthis=
"<text:Text\n"
" text:text='http://text.text.text/text/text/'\n"
" text:text='text:text:text-text-text'>\n"
" <text:Text>\n"
" <text text='http://text.com/text'>\n"
" <productID>20630175</textID>\n"
" </text>\n"
" </text:text>\n"
"</text:text>";
Of course XML does not need the newlines, so you could just drop them unless you want this to be human-read.
Note that the above uses adjacent string literals (multiple "..." pieces that the compiler concatenates); that has been standard since ANSI C (C89), so it is not a GNU extension. If you can't use it, then you can just escape the line breaks:
static const char *postthis=
"<text:Text\
text:text='http://text.text.text/text/text/'\
...
</text:text>";
But it makes the indentation a little harder to customize without making your code look worse.
Your time is much better spent putting that text into a file and reading it in. Hand-editing XML inside string literals is guaranteed to backfire.
If you are working on an iOS project you'd be able to use the NSString initialiser:
- (id)initWithContentsOfFile:(NSString *)path usedEncoding:(NSStringEncoding *)enc error:(NSError **)error
Or if you are using vanilla C something along the lines of
#include <stdio.h>
#define BUFF_SIZE 1024
int main(int argc, char **argv) {
    char s[BUFF_SIZE];
    FILE *fp = fopen("file.xml", "r");
    int i = 0;
    int c;
    if (fp == NULL)
        return 1;
    /* check fgetc()'s return value rather than feof(), so EOF is never stored in the buffer */
    while (i < BUFF_SIZE - 1 && (c = fgetc(fp)) != EOF) {
        s[i++] = (char) c;
    }
    s[i] = '\0';
    fclose(fp);
    printf("My string is here %s\n", s);
    return 0;
}
See http://en.cppreference.com/w/cpp/language/escape for all escape sequences in a string literal; the characters that introduce them (such as \ and ") need to be escaped if you want them to appear literally. Any newlines you want in your string will need to be written as \n.
As already mentioned earlier, you could keep the text in a file and read it at run time.
If you have a lot of text you want to compile into your program, another easy solution is to use objcopy:
objcopy -I binary -O elf64-x86-64 -B i386 --rename-section .data=.rodata,alloc,load,readonly,data,contents your_text.txt your_text.o
This will give you an object file with the following symbols in it:
_binary_your_text_txt_start
_binary_your_text_txt_end
_binary_your_text_txt_size
Link against your_text.o and use the symbols to access the text. For example:
#include <stdio.h>
extern char _binary_your_text_txt_start;
extern char _binary_your_text_txt_end;
extern char _binary_your_text_txt_size;
int main(int argc, char *argv[])
{
    const char *b = &_binary_your_text_txt_start;
    const char *e = &_binary_your_text_txt_end;
    size_t s = (size_t) &_binary_your_text_txt_size;
    fwrite(b, s, 1, stdout);
    return 0;
}

PCRE is not matching utf8 characters

I'm compiling a PCRE pattern with the UTF-8 flag enabled and trying to match a UTF-8 char* string against it, but it does not match and pcre_exec returns a negative value. I'm passing 65 as the subject length to pcre_exec, which is the number of characters in the string. I believe it expects the number of bytes, so I have tried increasing the argument up to 70, but I still get the same result. I don't know what else could be making the match fail.
(If I try without the PCRE_UTF8 flag, however, it matches, but the offset vector[1] is 30, which is the index of the character just before a Unicode character in my input string.)
#include "stdafx.h"
#include "pcre.h"
#include <pcre.h> /* PCRE lib NONE */
#include <stdio.h> /* I/O lib C89 */
#include <stdlib.h> /* Standard Lib C89 */
#include <string.h> /* Strings C89 */
#include <iostream>
int main(int argc, char *argv[])
{
    pcre *reCompiled;
    int pcreExecRet;
    int subStrVec[30];
    const char *pcreErrorStr;
    int pcreErrorOffset;
    char* aStrRegex = "(\\?\\w+\\?\\s*=)?\\s*(call|exec|execute)\\s+(?<spName>\\w+)("
        // params can be an empty pair of parentheses or have parameters inside them as well.
        "\\(\\s*(?<params>[?\\w,]+)\\s*\\)"
        // paramList along with its parentheses is optional below, so a SP call can be just "exec sp_name" for a stored proc call without any parameters.
        ")?";
    reCompiled = pcre_compile(aStrRegex, 0, &pcreErrorStr, &pcreErrorOffset, NULL);
    if(reCompiled == NULL) {
        printf("ERROR: Could not compile '%s': %s\n", aStrRegex, pcreErrorStr);
        exit(1);
    }
    char* line = "?rt?=call SqlTxFunctionTesting(?înFîéld?,?outField?,?inOutField?)";
    pcreExecRet = pcre_exec(reCompiled,
                            NULL,
                            line,
                            65, // length of string
                            0,  // Start looking at this point
                            0,  // OPTIONS
                            subStrVec,
                            30); // Length of subStrVec
    printf("\nret=%d", pcreExecRet);
    //int substrLen = pcre_get_substring(line, subStrVec, pcreExecRet, 1, &mantissa);
}
1)
char * q= "î";
printf("%d, %s", q[0], q);
Output:
63, ?
(63 is the code for '?': the non-ASCII 'î' never made it into the compiled string literal, so the subject is mangled before PCRE is even involved.)
2) You must rebuild PCRE with PCRE_BUILD_PCRE16 (or 32) and PCRE_SUPPORT_UTF. And use pcre16.lib and/or pcre16.dll. Then you can try this code:
pcre16 *reCompiled;
int pcreExecRet;
int subStrVec[30];
const char *pcreErrorStr;
int pcreErrorOffset;
wchar_t* aStrRegex = L"(\\?\\w+\\?\\s*=)?\\s*(call|exec|execute)\\s+(?<spName>\\w+)("
// params can be an empty pair of parentheses or have parameters inside them as well.
L"\\(\\s*(?<params>[?,\\w\\p{L}]+)\\s*\\)"
// paramList along with its parentheses is optional below, so a SP call can be just "exec sp_name" for a stored proc call without any parameters.
L")?";
reCompiled = pcre16_compile((PCRE_SPTR16)aStrRegex, PCRE_UTF16, &pcreErrorStr, &pcreErrorOffset, NULL);
if(reCompiled == NULL) {
printf("ERROR: Could not compile '%s': %s\n", aStrRegex, pcreErrorStr);
exit(1);
}
const wchar_t* line = L"?rt?=call SqlTxFunctionTesting( ?inField?,?outField?,?inOutField?,?fd? )";
const wchar_t* mantissa=new wchar_t[wcslen(line)];
pcreExecRet = pcre16_exec(reCompiled,
NULL,
(PCRE_SPTR16)line,
wcslen(line), // length of string
0, // Start looking at this point
0, // OPTIONS
subStrVec,
30); // Length of subStrVec
printf("\nret=%d",pcreExecRet);
for (int i=0;i<pcreExecRet;i++){
int substrLen = pcre16_get_substring((PCRE_SPTR16)line, subStrVec, pcreExecRet, i, (PCRE_SPTR16 *)&mantissa);
wprintf(L"\nret string=%s, length=%i\n",mantissa,substrLen);
}
3) \w = [0-9A-Za-z_]. By default it does not match Unicode letters; that needs Unicode property support (e.g. the PCRE_UCP option, or \p{L} as used in the pattern above).
4) This can really help: http://answers.oreilly.com/topic/215-how-to-use-unicode-code-points-properties-blocks-and-scripts-in-regular-expressions/
5) from PCRE 8.33 source (pcre_exec.c:2251)
/* Find out if the previous and current characters are "word" characters.
It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
be "non-word" characters. Remember the earliest consulted character for
partial matching. */
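To tie this back to the original 8-bit code: if libpcre was built with UTF-8 support, the pcre_*() (8-bit) API can match UTF-8 subjects directly. Here is a minimal sketch of that route, assuming the pattern and subject really are UTF-8 byte strings at run time (which, per point 1 above, they are not if the compiler has already mangled the literal): pass PCRE_UTF8 (plus PCRE_UCP so \w also matches non-ASCII letters) to pcre_compile, and give pcre_exec the length in bytes.
#include <pcre.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *pattern = "(\\?\\w+\\?\\s*=)?\\s*(call|exec|execute)\\s+(?<spName>\\w+)"
                          "(\\(\\s*(?<params>[?\\w,]+)\\s*\\))?";
    const char *subject = "?rt?=call SqlTxFunctionTesting(?înFîéld?,?outField?,?inOutField?)";

    const char *err;
    int erroffset;
    int ovector[30];

    /* PCRE_UTF8 makes the library treat pattern and subject as UTF-8;
       PCRE_UCP makes \w, \d, \b use Unicode properties. */
    pcre *re = pcre_compile(pattern, PCRE_UTF8 | PCRE_UCP, &err, &erroffset, NULL);
    if (re == NULL) {
        printf("compile error at %d: %s\n", erroffset, err);
        return 1;
    }

    /* The subject length is in bytes, so use strlen(), not a character count. */
    int rc = pcre_exec(re, NULL, subject, (int) strlen(subject), 0, 0, ovector, 30);
    printf("pcre_exec returned %d\n", rc);

    pcre_free(re);
    return 0;
}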
