In Solaris 8, it looks like the iconv*() family of functions is broken and only supports conversion between single-byte charsets and UTF-8. This can be verified with the following code example:
#include <stdio.h>
#include <errno.h>
#include <iconv.h>
#if defined(__sun) && defined(__SVR4)
#define CP1251 "ansi-1251"
#define ISO_8859_5 "ISO8859-5"
#else
#define CP1251 "CP1251"
#define ISO_8859_5 "ISO-8859-5"
#endif
void iconv_open_debug(const char *, const char *);
/*
 * Probe every ordered (source, target) pair drawn from the five charset
 * names listed below and report whether iconv_open() accepts the pair.
 */
int main() {
    const char *const charsets[] = {
        CP1251, ISO_8859_5, "KOI8-R", "UTF-8", "WCHAR_T"
    };
    const size_t count = sizeof charsets / sizeof charsets[0];
    size_t src;
    size_t dst;

    /* Outer loop selects the source charset, inner loop the target. */
    for (src = 0; src < count; src++) {
        for (dst = 0; dst < count; dst++) {
            iconv_open_debug(charsets[src], charsets[dst]);
        }
    }
    return 0;
}
/*
 * Try to open a conversion descriptor from charset `from` to charset `to`
 * and report the outcome: "PASS" on stdout, or "FAIL" plus the errno value
 * and its message on stderr.
 *
 * Only the success of iconv_open() is of interest, so a descriptor that was
 * opened is closed again before returning.
 */
void iconv_open_debug(const char *from, const char *to) {
    iconv_t cd;

    errno = 0;
    cd = iconv_open(to, from);
    if (cd == (iconv_t) -1) {
        /* Save errno first: fprintf() is allowed to overwrite it. */
        const int saved_errno = errno;

        fprintf(stderr, "iconv_open(\"%s\", \"%s\") FAIL: errno = %d\n", to, from, saved_errno);
        errno = saved_errno;
        perror("iconv_open()");
        return;
    }

    fprintf(stdout, "iconv_open(\"%s\", \"%s\") PASS\n", to, from);
    /* Release the descriptor so repeated probes do not leak it. */
    iconv_close(cd);
}
which only prints
iconv_open("UTF-8", "ansi-1251") PASS
iconv_open("UTF-8", "ISO8859-5") PASS
iconv_open("UTF-8", "KOI8-R") PASS
iconv_open("ansi-1251", "UTF-8") PASS
iconv_open("ISO8859-5", "UTF-8") PASS
iconv_open("KOI8-R", "UTF-8") PASS
to stdout and returns EINVAL for other pairs. Note that even conversion to the same charset (e.g. UTF-8 -> UTF-8) is not supported.
Questions
Can anyone reference a document describing the limitations of Solaris version of iconv.h?
How can I convert a wchar_t* to a single- or multibyte string w/o relying on GNU libiconv? wcstombs() would be fine except that it relies on the current locale's charset, while I want a wide string converted to a regular string using a particular charset, possibly different from the default one.
Running sdtconvtool shows most legacy codepages are supported.
After re-running the same utility with truss -u libc::iconv_open, I learnt that conversion from one single-byte encoding to another single-byte one is done in two steps, with intermediate conversion to UTF-8.
Speaking of conversion from "WCHAR_T", iconv(3) also does support it, but "UCS-4" should be used as a source charset name since sizeof(wchar_t) is 4 on Solaris (for both x86 and SPARC).
Related
Say I want to print (i.e. write to stdout) a UTF-32-encoded string, zß水🍌, taken from the examples found on cppreference.
With U prefix, it's straightforward:
printf("%ls",U"zß水🍌")
However, it won't work unless an appropriate locale is set before printing:
setlocale(LC_ALL, "XXX.UTF-8")
My simplified question is: why should we use an XXX.UTF-8 locale setting instead of something like XXX.UTF-32?
My confusion arose while I was testing the code below:
#include <stdio.h>
#include <locale.h>
#include <wchar.h>
#include <uchar.h>
/*
 * Print the same UTF-32 string literal with "%ls" twice, first under the
 * "C" locale and then under "en_US.UTF-8", reporting any printf failure
 * through perror().
 */
void test2() {
    char32_t w_str[] = U"zß水🍌";

    /* "%ls" takes a wchar_t string, so the char32_t array may only be
       reinterpreted where both element types have the same size. */
    _Static_assert(sizeof(wchar_t) == sizeof(char32_t),
                   "wchar_t must be as wide as char32_t to print w_str with %ls");
    const wchar_t *wide = (const wchar_t *) w_str;

    printf("wchar width: %d\n", __WCHAR_WIDTH__);
    if(__STDC_UTF_32__) printf("confirmed: utf32 used.\n");

    printf("(with ' C' locale) wide string is interpreted as: ");
    /* set locale */
    if (setlocale(LC_ALL, "C") == NULL) perror("setlocale");
    /* printf signals failure with any negative value, not only -1 */
    if (printf("[%ls]", wide) < 0) { perror(" ERROR('C' locale)"); clearerr(stdout); }
    printf("\n");

    printf("(with 'utf8' locale) wide string is interpreted as: ");
    /* set locale */
    if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) perror("setlocale");
    if (printf("[%ls]", wide) < 0) { perror(" ERROR('utf8' locale)"); clearerr(stdout); }
    printf("\n");
}
/* Program entry point: run the locale experiment once. */
int main() {
    test2();
    return 0;
}
Along with output:
# gcc version 9.3.0 (Ubuntu 9.3.0-17ubuntu1~20.04)
wchar width: 32
confirmed: utf32 used.
ERROR('C' locale): Invalid or incomplete multibyte or wide character
(with ' C' locale) wide string is interpreted as: [
(with 'utf8' locale) wide string is interpreted as: [zß水🍌]
According to C11-6.4.4.4-9:
Prefix : Corresponding Type
U : char32_t
and C11-6.10.8.2-1:
__STDC_UTF_32__ The integer constant 1, intended to indicate that values of type
char32_t are UTF−32 encoded.
My complete question is: I've already specified an exact UTF-32 string literal (and I'm 100% certain that it is exactly UTF-32 encoded, having checked the assembly output of the code listed above), so isn't it more appropriate to use a locale setting like XXX.UTF-32? If not, then why is XXX.UTF-8 qualified to decode the UTF-32 byte sequence?
Because of my limited knowledge of C and SWIG, I couldn't manage to adapt any public example for converting C char pointers to Tcl strings.
I always get stuck on the problem that my Tcl variable just doesn't get dereferenced;
it ends up like this:
tcl_str = _30e84c05ef550000_p_stringout2
string_pointer.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "string_pointer.h"
/*
 * Return the model/version string wrapped in a stringout2.
 *
 * Neither `laser` nor `mv_string` is used by this implementation.
 *
 * The returned struct's modelvers field points at a buffer with static
 * storage duration, so the caller must not free it. The result is built in
 * a local struct and returned by value, so nothing is heap-allocated.
 */
stringout2 Itla_Get_Model_Version (int laser, char * mv_string)
{
    static char mod_ver[] = "PPCL600";
    stringout2 result;

    (void) laser;
    (void) mv_string;

    /* Store the pointer to the text, not its first character. */
    result.modelvers = mod_ver;
    printf ( "Itla_Get_Model_Version : read %s \n", mod_ver );
    return result;
}
string_pointer.h
#include <sys/types.h>
#include <sys/resource.h>
/* Value type returned by Itla_Get_Model_Version(). */
typedef struct {
char * modelvers; /* pointer to the model/version text */
} stringout2;
stringout2 Itla_Get_Model_Version (int laser, char * mv_string) ;
string_pointer.swig
/* File : string_pointer.swig */
%module string_pointer
%{
#include "string_pointer.h"
%}
%include "typemaps.i"
%include "cpointer.i"
%include "cstring.i"
%typemap(argout) char* (char tmp) %{
$1 = &tmp;
%}
stringout2 Itla_Get_Model_Version (int laser, char *OUTPUT) ;
%include "string_pointer.h"
test.tcl
load ./string_pointer.so
# Call the wrapped Itla_Get_Model_Version command for the given laser,
# print the first whitespace-delimited word of its result followed by a
# space, and return that word.
proc test { laser } {
# scan with %s stores the first whitespace-delimited word of the result in a
scan [Itla_Get_Model_Version $laser ] %s a
puts "$a "
return $a
}
set name [test 1 ]
puts "Itla_Get_Model_Version= $name"
when executing the tcl-script you get :
Itla_Get_Model_Version : read PPCL600
_f0a759f8d9550000_p_stringout2
Itla_Get_Model_Version= _f0a759f8d9550000_p_stringout2
so i finally need to dereference the Pointer to its value ...
But i don't know how to succeed.....
The C-function is given and can't be modified !
Anybody out there, knowing how to do it ?
If your strings are basically ASCII or UTF-8, all you need to do is to tell SWIG that your function has allocated the string it is returning. For details see, the SWIG docs on C strings.
yourcode.c
/*
 * Return a freshly allocated copy of the model/version string.
 *
 * Neither `laser` nor `mv_string` is used by this stand-in implementation.
 *
 * The caller owns the returned buffer and must release it with free().
 * Returns NULL (and prints nothing) if the allocation fails.
 */
char *Itla_Get_Model_Version (int laser, char * mv_string) {
    // I assume this is a proxy for something more complicated...
    const char *mod_ver = "PPCL600";
    size_t len = strlen(mod_ver) + 1;   /* include the terminating NUL */
    char *output = malloc(len);

    (void) laser;
    (void) mv_string;

    if (output == NULL) {
        return NULL;
    }
    memcpy(output, mod_ver, len);
    printf ( "Itla_Get_Model_Version : read %s \n", mod_ver );
    return output;
}
yourcode.h
char *Itla_Get_Model_Version(int laser, char * mv_string);
yourcode.swig
/* Tell SWIG that this function returns newly allocated memory which the
   generated wrapper should free once it has built the Tcl result */
%newobject Itla_Get_Model_Version;
/* And now we can use the standard C header */
%include "yourcode.h"
If the above simple solution doesn't work…
Things get a lot more complicated if you are using a different encoding for your strings or if you wrap them inside a structure (as you did in your question). That's when you need a typemap, particularly ones of the Tcl variety. Correctly writing a typemap depends on understanding the semantics of the values that you are producing and/or consuming and the semantics of the language that you're using. Assuming you want the wrapping, here's a very simple output typemap that might work:
%typemap(out) stringout2* {
Tcl_SetObjResult(interp, Tcl_NewStringObj($1->modelvers, -1));
free($1);
}
Your function also needs to be modified to return a stringout2* by doing return pointer2;, and not a stringout2 since otherwise you will be leaking memory on every call. You can return a stringout2, but if you are doing that then you should not allocate it with malloc, but rather keep it as a structure directly in a local variable.
In that case, the typemap you'd use is:
%typemap(out) stringout2 {
Tcl_SetObjResult(interp, Tcl_NewStringObj($1.modelvers, -1));
}
(Note the different type, different access to the field, and lack of free.)
And your structure should be declared as containing a const char * if it really is that.
If you have strings in a different encoding (and it isn't ISO 8859-1, for which you can cheat and use a binary string using Tcl_NewByteArrayObj; that's also what you want for slabbing a chunk of binary data over) then you'll need to write a typemap using Tcl_ExternalToUtfDString, and the amount of boilerplate code goes up. Tcl insists that its internal strings are in (almost) UTF-8, and ASCII is OK too as that's a strict subset; everything else must be converted.
Ask another question if that's what you need. You probably are either dealing with ASCII or binary data, so I'll leave (quite a bit more complex!) encoding conversion alone until requested.
I am using PostgreSQL 9.5 64bit version on windows server.
The character encoding of the database is set to UTF8.
I'd like to create a function that manipulates multibyte strings.
(e.g. cleansing, replace etc.)
I copied C-language logic for manipulating characters from another system.
The logic assumes that the character encoding is SJIS.
I do not want to change that C logic, so I want to convert from UTF-8 to SJIS inside the C-language function of PostgreSQL,
like the convert_to function does. (However, since convert_to returns the bytea type, I want to obtain the result as TEXT.)
Please tell me how to convert from UTF-8 to SJIS in C.
Create Function Script:
-- Register CLEANSING_STRING as a C-language function that takes and returns
-- character varying; the implementation is the symbol 'CLEANSING_STRING'
-- in $libdir/MyFunc/CLEANSING_STRING.dll.
CREATE FUNCTION CLEANSING_STRING(character varying)
RETURNS character varying AS
'$libdir/MyFunc/CLEANSING_STRING.dll', 'CLEANSING_STRING'
LANGUAGE c VOLATILE STRICT;
C Source:
#include <stdio.h>
#include <string.h>
#include <postgres.h>
#include <port.h>
#include <fmgr.h>
#include <stdlib.h>
#include <builtins.h>
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif
extern PGDLLEXPORT Datum CLEANSING_STRING(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(CLEANSING_STRING);
/*
 * SQL-callable entry point: takes one text argument, converts it to a C
 * string, passes a buffer to cleansingString() and replaceStrimg(), and
 * returns a text value built from `arg`.
 *
 * NOTE(review): this version is an incomplete sketch. `sjisChar` is never
 * declared, and both encoding conversions are still commented-out
 * placeholders, so the value returned is the unmodified input string.
 */
Datum CLEANSING_STRING(PG_FUNCTION_ARGS)
{
// Fetch the first SQL argument as a text value
text *arg1 = (text *)PG_GETARG_TEXT_P(0);
// Convert the text value to a NUL-terminated C string
char *arg;
arg = text_to_cstring(arg1);
// UTF-8 to SJIS (placeholder, conversion function still to be chosen)
//Char *sjisChar[] = foo(arg); // something like that..
// Copied from another system (assumes the character encoding is SJIS)
cleansingString(sjisChar);
replaceStrimg(sjisChar);
// SJIS to UTF-8 (placeholder, conversion function still to be chosen)
//arg = bar(sjisChar); // something like that..
// Convert the C string back to text and return it
PG_RETURN_TEXT_P(cstring_to_text(arg));
}
I got it working using the approach suggested in the comments on the question.
#include <mb/pg_wchar.h> //Add to include.
...
/*
 * SQL-callable entry point: cleanse a text value using legacy routines
 * that operate on SJIS-encoded C strings.
 *
 * The argument is converted from the server encoding to SJIS, handed to
 * cleansingString() and replaceStrimg(), converted back to the server
 * encoding, and returned as text.
 */
Datum CLEANSING_STRING(PG_FUNCTION_ARGS)
{
    // Fetch the first SQL argument as a text value
    text *arg1 = (text *)PG_GETARG_TEXT_P(0);

    // Convert the text value to a NUL-terminated C string
    char *arg = text_to_cstring(arg1);

    // Server encoding (UTF-8) to SJIS; the result is a plain char pointer
    char *sjisChar = pg_server_to_any(arg, (int) strlen(arg), PG_SJIS);

    // Copied from another system (assumes the character encoding is SJIS)
    cleansingString(sjisChar);
    replaceStrimg(sjisChar);

    // SJIS back to the server encoding; the third argument names the
    // encoding of the conversion source
    arg = pg_any_to_server(sjisChar, (int) strlen(sjisChar), PG_SJIS);

    // Convert the C string back to text and return it
    PG_RETURN_TEXT_P(cstring_to_text(arg));
}
I'm compiling a PCRE pattern with utf8 flag enabled and am trying to match a utf8 char* string against it, but it is not matching and pcre_exec returns negative. I'm passing the subject length as 65 to pcre_exec which is the number of characters in the string. I believe it expects the number of bytes so I have tried with increasing the argument till 70 but still get the same result. I don't know what else is making the match fail. Please help before I shoot myself.
(If I try without the flag PCRE_UTF8 however, it matches but the offset vector[1] is 30 which is index of the character just before a unicode character in my input string)
#include "stdafx.h"
#include "pcre.h"
#include <pcre.h> /* PCRE lib NONE */
#include <stdio.h> /* I/O lib C89 */
#include <stdlib.h> /* Standard Lib C89 */
#include <string.h> /* Strings C89 */
#include <iostream>
int main(int argc, char *argv[])
{
pcre *reCompiled;
int pcreExecRet;
int subStrVec[30];
const char *pcreErrorStr;
int pcreErrorOffset;
char* aStrRegex = "(\\?\\w+\\?\\s*=)?\\s*(call|exec|execute)\\s+(?<spName>\\w+)("
// params can be an empty pair of parenthesis or have parameters inside them as well.
"\\(\\s*(?<params>[?\\w,]+)\\s*\\)"
// paramList along with its parenthesis is optional below so a SP call can be just "exec sp_name" for a stored proc call without any parameters.
")?";
reCompiled = pcre_compile(aStrRegex, 0, &pcreErrorStr, &pcreErrorOffset, NULL);
if(reCompiled == NULL) {
printf("ERROR: Could not compile '%s': %s\n", aStrRegex, pcreErrorStr);
exit(1);
}
char* line = "?rt?=call SqlTxFunctionTesting(?înFîéld?,?outField?,?inOutField?)";
pcreExecRet = pcre_exec(reCompiled,
NULL,
line,
65, // length of string
0, // Start looking at this point
0, // OPTIONS
subStrVec,
30); // Length of subStrVec
printf("\nret=%d",pcreExecRet);
//int substrLen = pcre_get_substring(line, subStrVec, pcreExecRet, 1, &mantissa);
}
1)
char * q= "î";
printf("%d, %s", q[0], q);
Output:
63, ?
2) You must rebuild PCRE with PCRE_BUILD_PCRE16 (or 32) and PCRE_SUPPORT_UTF. And use pcre16.lib and/or pcre16.dll. Then you can try this code:
// Compile the stored-procedure-call pattern with the 16-bit PCRE API, match
// it against a sample line, and print every captured substring.
// NOTE(review): the PCRE_SPTR16 casts assume wchar_t is 16 bits wide (as on
// Windows); confirm before using this on another platform.
pcre16 *reCompiled;
int pcreExecRet;
int subStrVec[30];
const char *pcreErrorStr;
int pcreErrorOffset;
const wchar_t* aStrRegex = L"(\\?\\w+\\?\\s*=)?\\s*(call|exec|execute)\\s+(?<spName>\\w+)("
// params can be an empty pair of parentheses or have parameters inside them as well.
L"\\(\\s*(?<params>[?,\\w\\p{L}]+)\\s*\\)"
// paramList along with its parentheses is optional below so a SP call can be just "exec sp_name" for a stored proc call without any parameters.
L")?";
// PCRE_UTF16 is the UTF option that belongs to the 16-bit library.
reCompiled = pcre16_compile((PCRE_SPTR16)aStrRegex, PCRE_UTF16, &pcreErrorStr, &pcreErrorOffset, NULL);
if(reCompiled == NULL) {
    // The pattern is a wide string, so it needs %ls rather than %s.
    printf("ERROR: Could not compile '%ls': %s\n", aStrRegex, pcreErrorStr);
    exit(1);
}
const wchar_t* line = L"?rt?=call SqlTxFunctionTesting( ?inField?,?outField?,?inOutField?,?fd? )";
// pcre16_get_substring() allocates the result itself, so no buffer is
// allocated here; allocating one would only leak it.
const wchar_t* mantissa = NULL;
pcreExecRet = pcre16_exec(reCompiled,
                          NULL,
                          (PCRE_SPTR16)line,
                          (int)wcslen(line), // length of string in 16-bit units
                          0, // Start looking at this point
                          0, // OPTIONS
                          subStrVec,
                          30); // Length of subStrVec
printf("\nret=%d",pcreExecRet);
for (int i=0;i<pcreExecRet;i++){
    int substrLen = pcre16_get_substring((PCRE_SPTR16)line, subStrVec, pcreExecRet, i, (PCRE_SPTR16 *)&mantissa);
    if (substrLen < 0) {
        // Negative values are PCRE error codes; nothing was allocated.
        printf("\nsubstring %d unavailable, error=%d\n", i, substrLen);
        continue;
    }
    wprintf(L"\nret string=%ls, length=%i\n",mantissa,substrLen);
    // Each extracted substring must be released with the matching free call.
    pcre16_free_substring((PCRE_SPTR16)mantissa);
    mantissa = NULL;
}
3) \w = [0-9A-Z_a-z]. It doesn't contain Unicode symbols.
4) This can really help: http://answers.oreilly.com/topic/215-how-to-use-unicode-code-points-properties-blocks-and-scripts-in-regular-expressions/
5) from PCRE 8.33 source (pcre_exec.c:2251)
/* Find out if the previous and current characters are "word" characters.
It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
be "non-word" characters. Remember the earliest consulted character for
partial matching. */
My generated XML file contains garbage data when I write the FF FE byte-order mark in order to produce a UTF-16 LE file.
I have also used setlocale and wchar_t. Can someone explain what is causing this?
#define XML_UNICODE "\xFF\xFE"
#define XML_HEADER "<?xml version=\"1.0\" encoding=\"UTF-16\" standalone=\"yes\"?>"
Below is the snippet of my code:
/*
 * Write the XML_UNICODE bytes followed by the XML_HEADER text and a newline
 * to `file`, going through wide-character conversion and fwprintf().
 *
 * NOTE(review): fwprintf() encodes wide characters according to the
 * stream/locale, so this presumably does not emit raw UTF-16LE code units;
 * confirm how `file` was opened and which locale is active.
 */
int liIndex=0;
int liSize=0;
char* sTxt=NULL; /* not used in this snippet */
wchar_t swTxt[LEN_XML_CONTENT];
wchar_t swUnicode[MAX_BUFF];
const char* cTxt;
const char* cUnicode;
/* XML_UNICODE is the two bytes 0xFF 0xFE. */
cUnicode=XML_UNICODE;
/* NOTE(review): mbstowcs() interprets these bytes as a multibyte string in
   the current locale and its return value is not checked; verify that
   0xFF 0xFE is a valid sequence there. */
mbstowcs(swUnicode, cUnicode, MAX_BUFF);
fwprintf(file, L"%ls", swUnicode);
cTxt=XML_HEADER;
liSize=strlen(cTxt);
/* NOTE(review): the limit passed here is MAX_BUFF although swTxt is declared
   with LEN_XML_CONTENT elements; confirm the two sizes are compatible. */
mbstowcs(swTxt, cTxt, MAX_BUFF);
/* Emit the header one wide character at a time, liSize characters in all. */
for(liIndex=0; liIndex<liSize; liIndex++)
{
fwprintf(file, L"%lc", swTxt[liIndex]);
}
fwprintf(file, L"\n");
When I check the output XML file in Notepad++, the encoding is already detected as UCS-2 LE, but the content is garbage.