Change the character encode in PostgreSQL C language function - c

I am using PostgreSQL 9.5 64bit version on windows server.
The character encoding of the database is set to UTF8.
I'd like to create a function that manipulates multibyte strings.
(e.g. cleansing, replace etc.)
I copied C language logic for manipulating characters from a other system,
The logic assumes that the character code is sjis.
I do not want to change C language logic, so I want to convert from UTF8 to sjis in C language function of Postgresql.
Like the convert_to function. (However, since the convert_to function returns bytea type, I want to obtain it with TEXT type.)
Please tell me how to convert from UTF 8 to sjis in C language.
Create Function Script:
CREATE FUNCTION CLEANSING_STRING(character varying)
RETURNS character varying AS
'$libdir/MyFunc/CLEANSING_STRING.dll', 'CLEANSING_STRING'
LANGUAGE c VOLATILE STRICT;
C Source:
#include <stdio.h>
#include <string.h>
#include <postgres.h>
#include <port.h>
#include <fmgr.h>
#include <stdlib.h>
#include <builtins.h>
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif
extern PGDLLEXPORT Datum CLEANSING_STRING(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(CLEANSING_STRING);
Datum CLEANSING_STRING(PG_FUNCTION_ARGS)
{
// Get Arg
text *arg1 = (text *)PG_GETARG_TEXT_P(0);
// Text to Char[]
char *arg;
arg = text_to_cstring(arg1);
// UTF8 to Sjis
//Char *sjisChar[] = foo(arg); // something like that..
// Copied from other system.(Assumes that the character code is sjis.)
cleansingString(sjisChar);
replaceStrimg(sjisChar);
// Sjis to UTF8
//arg = bar(sjisChar); // something like that..
//Char[] to Text and Return
PG_RETURN_TEXT_P(cstring_to_text(arg));
}

Succeeded in the way I was taught by question comments.
#include <mb/pg_wchar.h> //Add to include.
...
Datum CLEANSING_STRING(PG_FUNCTION_ARGS)
{
// Get Arg
text *arg1 = (text *)PG_GETARG_TEXT_P(0);
// Text to Char[]
char *arg;
arg = text_to_cstring(arg1);
// UTF8 to Sjis
Char *sjisChar[] = pg_server_to_any(arg, strlen(arg), PG_SJIS);
// Copied from other system.(Assumes that the character code is sjis.)
cleansingString(sjisChar);
replaceStrimg(sjisChar);
// Sjis to UTF8
arg = pg_any_to_server(sjisChar, strlen(sjisChar), PG_SJIS); //It converts from SJIS to server (UTF 8), the third argument sets the encoding of the conversion source.
//Char[] to Text and Return
PG_RETURN_TEXT_P(cstring_to_text(arg));
}

Related

SWIG convert C-Pointer stringvalue to tcl string

because of my limited knowledge in C and SWIG i couldn't manage to adopt any public example for converting c-pointer chars to tcl strings ....
I always get stuck at the problem that my tcl variable just doesn't get dereferenced
like this :
tcl_str = _30e84c05ef550000_p_stringout2
string_pointer.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "string_pointer.h"
stringout2 Itla_Get_Model_Version (int laser, char * mv_string)
{
stringout2 * pointer2;
char *mod_ver ="PPCL600";
pointer2 = malloc( sizeof(stringout2) );
pointer2-> modelvers= *mod_ver;
printf ( "Itla_Get_Model_Version : read %s \n", mod_ver );
return *pointer2 ;
}
string_pointer.h
#include <sys/types.h>
#include <sys/resource.h>
typedef struct {
char * modelvers;
} stringout2;
stringout2 Itla_Get_Model_Version (int laser, char * mv_string) ;
string_pointer.swig
/* File : string_pointer.swig */
%module string_pointer
%{
#include "string_pointer.h"
%}
%include "typemaps.i"
%include "cpointer.i"
%include "cstring.i"
%typemap(argout) char* (char tmp) %{
$1 = &tmp;
%}
stringout2 Itla_Get_Model_Version (int laser, char *OUTPUT) ;
%include "string_pointer.h"
test.tcl
load ./string_pointer.so
proc test { laser } {
scan [Itla_Get_Model_Version $laser ] %s a
puts "$a "
return $a
}
set name [test 1 ]
puts "Itla_Get_Model_Version= $name"
when executing the tcl-script you get :
Itla_Get_Model_Version : read PPCL600
_f0a759f8d9550000_p_stringout2
Itla_Get_Model_Version= _f0a759f8d9550000_p_stringout2
so i finally need to dereference the Pointer to its value ...
But i don't know how to succeed.....
The C-function is given and can't be modified !
Anybody out there, knowing how to do it ?
If your strings are basically ASCII or UTF-8, all you need to do is to tell SWIG that your function has allocated the string it is returning. For details see, the SWIG docs on C strings.
yourcode.c
char *Itla_Get_Model_Version (int laser, char * mv_string) {
// I assume this is a proxy for something more complicated...
const char *mod_ver ="PPCL600";
size_t len = strlen(mod_ver) + 1;
char *output = malloc(len);
memcpy(output, mod_ver, len);
printf ( "Itla_Get_Model_Version : read %s \n", mod_ver );
return output;
}
yourcode.h
char *Itla_Get_Model_Version(int laser, char * mv_string);
yourcode.swig
/* Tell SWIG that this function returns something to be freed */
%newobject Itla_Get_Model_Version
/* And now we can use the standard C header */
%include "yourcode.h"
If the above simple solution doesn't work…
Things get a lot more complicated if you are using a different encoding for your strings or if you wrap them inside a structure (as you did in your question). That's when you need a typemap, particularly ones of the Tcl variety. Correctly writing a typemap depends on understanding the semantics of the values that you are producing and/or consuming and the semantics of the language that you're using. Assuming you want the wrapping, here's a very simple output typemap that might work:
%typemap(out) stringout2* {
Tcl_SetObjResult(interp, Tcl_NewStringObj($1->modelvers, -1));
free($1);
}
Your function also needs to be modified to return a stringout2* by doing return pointer2;, and not a stringout2 since otherwise you will be leaking memory on every call. You can return a stringout2, but if you are doing that then you should not allocate it with malloc, but rather keep it as a structure directly in a local variable.
In that case, the typemap you'd use is:
%typemap(out) stringout2 {
Tcl_SetObjResult(interp, Tcl_NewStringObj($1.modelvers, -1));
}
(Note the different type, different access to the field, and lack of free.)
And your structure should be declared as containing a const char * if it really is that.
If you have strings in a different encoding (and it isn't ISO 8859-1, for which you can cheat and use a binary string using Tcl_NewByteArrayObj; that's also what you want for slabbing a chunk of binary data over) then you'll need to write a typemap using Tcl_ExternalToUtfDString, and the amount of boilerplate code goes up. Tcl insists that its internal strings are in (almost) UTF-8, and ASCII is OK too as that's a strict subset; everything else must be converted.
Ask another question if that's what you need. You probably are either dealing with ASCII or binary data, so I'll leave (quite a bit more complex!) encoding conversion alone until requested.

How to create shortcut with Win32 API and c language

I want to write a program to create a shortcut for a specific file by using win32 API in c. my IDE is visual studio 2010.
I found this page but its sample just not compile and return many errors.
I also find this code but this always create a link with Target: "D:\Desktop\㩣睜湩潤獷湜瑯灥摡攮數" and I don't know why.
can someone tell me why the sample code of Microsoft is not working or the second one return something in Chinese shape language and also with wrong and constant location for any argument?
This is my code for MSDN sample:
#include "stdafx.h"
#include "windows.h"
#include "winnls.h"
#include "shobjidl.h"
#include "objbase.h"
#include "objidl.h"
#include "shlguid.h"
void _tmain(int argc, TCHAR *argv[])
{
CreateLink(argv[1],__argv[2],argv[3]);
}
HRESULT CreateLink(LPCWSTR lpszPathObj, LPCSTR lpszPathLink, LPCWSTR lpszDesc)
{
HRESULT hres;
IShellLink* psl;
// Get a pointer to the IShellLink interface. It is assumed that CoInitialize
// has already been called.
hres = CoCreateInstance(CLSID_ShellLink, NULL, CLSCTX_INPROC_SERVER, IID_IShellLink, (LPVOID*)&psl);
if (SUCCEEDED(hres))
{
IPersistFile* ppf;
// Set the path to the shortcut target and add the description.
psl->SetPath(lpszPathObj);
psl->SetDescription(lpszDesc);
// Query IShellLink for the IPersistFile interface, used for saving the
// shortcut in persistent storage.
hres = psl->QueryInterface(IID_IPersistFile, (LPVOID*)&ppf);
if (SUCCEEDED(hres))
{
WCHAR wsz[MAX_PATH];
// Ensure that the string is Unicode.
MultiByteToWideChar(CP_ACP, 0, lpszPathLink, -1, wsz, MAX_PATH);
// Add code here to check return value from MultiByteWideChar
// for success.
// Save the link by calling IPersistFile::Save.
hres = ppf->Save(wsz, TRUE);
ppf->Release();
}
psl->Release();
}
return hres;
}
and the errors are:
1 error C1083: Cannot open include file: 'stdafx.h' and
2 IntelliSense: cannot open source file "stdafx.h"
CreateLink(argv[1],__argv[2],argv[3]);
This call looks weird. You are using argv[] for two LPCWSTR (const wchar_t *) parameters, but are using __argv[] for an LPCSTR (const char *) parameter. You should change the 2nd parameter to LPCWSTR to match the other parameters, and then use argv[] instead of __argv[].
The TCHAR-based IShellLink works with LP(C)WSTR string parameters, and LP(C)TSTR is LP(C)WSTR when compiling for Unicode. Which you are obviously doing, given that you are passing TCHAR-based argv[] values to LPCWSTR parameters, which will only compile if TCHAR is wchar_t.
IPersistFile::Save() takes only a Unicode string as input, regardless of what TCHAR maps to. You are converting the char* value from __argv[] from ANSI to Unicode, so you may as well just get a Unicode string from argv[] to begin with, and omit the call to MultiByteToWideChar() altogether.
There is no good reason to mix ANSI and Unicode strings like this. This is something the MSDN example is getting wrong.
And since your function parameters are working with Unicode strings, you should use the IShellLinkW interface directly instead of the TCHAR-based IShellLink interface.
Try this:
#include "stdafx.h"
#include "windows.h"
#include "shobjidl.h"
#include "objbase.h"
#include "objidl.h"
#include "shlguid.h"
HRESULT CreateLink(LPCWSTR lpszPathObj, LPCWSTR lpszPathLink, LPCWSTR lpszDesc)
{
HRESULT hres;
IShellLinkW* psl;
// Get a pointer to the IShellLink interface. It is assumed that CoInitialize
// has already been called.
hres = CoCreateInstance(CLSID_ShellLink, NULL, CLSCTX_INPROC_SERVER, IID_IShellLinkW, (LPVOID*)&psl);
if (SUCCEEDED(hres))
{
IPersistFile* ppf;
// Set the path to the shortcut target and add the description.
psl->SetPath(lpszPathObj);
psl->SetDescription(lpszDesc);
// Query IShellLink for the IPersistFile interface, used for saving the
// shortcut in persistent storage.
hres = psl->QueryInterface(IID_IPersistFile, (LPVOID*)&ppf);
if (SUCCEEDED(hres))
{
// Save the link by calling IPersistFile::Save.
hres = ppf->Save(lpszPathLink, TRUE);
ppf->Release();
}
psl->Release();
}
return hres;
}
void _tmain(int argc, TCHAR *argv[])
{
if (argc > 3)
CreateLink(argv[1], argv[2], argv[3]);
}
Visual Studio generates stdafx.h and stdafx.cpp when you are using new project wizard. If Create empty project checkmark is marked, it will not generate them. These files are used to build a precompiled header file Projname.pch and a precompiled types file Stdafx.obj
For small projects you can eventually remove #include "stdafx.h", but it would be better to create new project with Create empty project unmarked.

iconv_open() returning EINVAL on Solaris 8

In Solaris 8, it looks like iconv*() family of functions is broken and only supports conversion between single-byte charsets and UTF-8, which can be verified using this code example:
#include <stdio.h>
#include <errno.h>
#include <iconv.h>
#if defined(__sun) && defined(__SVR4)
#define CP1251 "ansi-1251"
#define ISO_8859_5 "ISO8859-5"
#else
#define CP1251 "CP1251"
#define ISO_8859_5 "ISO-8859-5"
#endif
void iconv_open_debug(const char *, const char *);
int main() {
iconv_open_debug(CP1251, CP1251);
iconv_open_debug(CP1251, ISO_8859_5);
iconv_open_debug(CP1251, "KOI8-R");
iconv_open_debug(CP1251, "UTF-8");
iconv_open_debug(CP1251, "WCHAR_T");
iconv_open_debug(ISO_8859_5, CP1251);
iconv_open_debug(ISO_8859_5, ISO_8859_5);
iconv_open_debug(ISO_8859_5, "KOI8-R");
iconv_open_debug(ISO_8859_5, "UTF-8");
iconv_open_debug(ISO_8859_5, "WCHAR_T");
iconv_open_debug("KOI8-R", CP1251);
iconv_open_debug("KOI8-R", ISO_8859_5);
iconv_open_debug("KOI8-R", "KOI8-R");
iconv_open_debug("KOI8-R", "UTF-8");
iconv_open_debug("KOI8-R", "WCHAR_T");
iconv_open_debug("UTF-8", CP1251);
iconv_open_debug("UTF-8", ISO_8859_5);
iconv_open_debug("UTF-8", "KOI8-R");
iconv_open_debug("UTF-8", "UTF-8");
iconv_open_debug("UTF-8", "WCHAR_T");
iconv_open_debug("WCHAR_T", CP1251);
iconv_open_debug("WCHAR_T", ISO_8859_5);
iconv_open_debug("WCHAR_T", "KOI8-R");
iconv_open_debug("WCHAR_T", "UTF-8");
iconv_open_debug("WCHAR_T", "WCHAR_T");
return 0;
}
void iconv_open_debug(const char *from, const char *to) {
errno = 0;
if (iconv_open(to, from) == (iconv_t) -1) {
fprintf(stderr, "iconv_open(\"%s\", \"%s\") FAIL: errno = %d\n", to, from, errno);
perror("iconv_open()");
} else {
fprintf(stdout, "iconv_open(\"%s\", \"%s\") PASS\n", to, from);
}
}
which only prints
iconv_open("UTF-8", "ansi-1251") PASS
iconv_open("UTF-8", "ISO8859-5") PASS
iconv_open("UTF-8", "KOI8-R") PASS
iconv_open("ansi-1251", "UTF-8") PASS
iconv_open("ISO8859-5", "UTF-8") PASS
iconv_open("KOI8-R", "UTF-8") PASS
to stdout and returns EINVAL for other pairs. Note that even conversion to the same charset (e.g. UTF-8 -> UTF-8) is not supported.
Questions
Can anyone reference a document describing the limitations of Solaris version of iconv.h?
How can I convert a wchar_t* to a single- or multibyte string w/o relying on GNU libiconv? wcstombs() would be fine except that it relies on the current locale's charset, while I want a wide string converted to a regular string using a particular charset, possibly different from the default one.
Running sdtconvtool shows most legacy codepages are supported.
After re-running the same utility with truss -u libc::iconv_open, I learnt that conversion from one single-byte encoding to another single-byte one is done in two steps, with intermediate conversion to UTF-8.
Speaking of conversion from "WCHAR_T", iconv(3) also does support it, but "UCS-4" should be used as a source charset name since sizeof(wchar_t) is 4 on Solaris (for both x86 and SPARC).

protoc-c: Nested structure with optional string throws seg fault

Trying out Google protocol buffers for my code in C language.
messagefile.proto
===================
mesage othermessage
{
optional string otherstring = 1;
}
message onemessage
{
optional string messagestring = 1;
optional int32 aninteger = 2;
optional othermessage otr_message= 3;
}
==============================================
--> protoc-c messagefile.proto --c_out=./
this resulted in two files
--> messagefile.pb-c.c and messagefile.pb-c.h
Now my code file which would try to use the
simpleexample.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include "messagefile.pb-c.h"
#include <stdbool.h>
int main(int argc, const char * argv[])
{
onemessage msg = ONE__MESSAGE__INIT; //from generated .h code file
void *buf;
unsigned int len;
char *ptr;
//integer initialization
msg.has_aninteger = true;
msg.aninteger = 1;
//accessing the string in onemessage
msg.messagestring = malloc(sizeof("a simple string"));
strncpy(msg.messagestring,"a simple string",strlen("a simple string"));
//trying to initialize the string in the nested structure othermessage
msg.otr_message = malloc(sizeof(othermessage));
msg.otr_message->otherstring = malloc(sizeof("a not so simple string"));
strncpy(msg.otr_message->otherstring,"a not so simple string",strlen("a not so simple string"));
//lets find the length of the packed structure
len = one_message__get_packed_size(&msg); //from generated .h code
//lets arrange for as much size as len
buf = malloc(len);
//lets get the serialized structure in buf
one_message__pack_to_buffer(&msg,buf); //from generated code
//write it to a stream, for now the screen
fwrite(buf,len,1,stdout);
//free buffer
free(buf);
return 0;
}
I compile it as gcc -o testout messagefile.pb-c.c simpleexample.c -lprotobuf-c
The Problem I am facing is when trying to initialize the nested othermessage variables and then call the get_packed_size it throws a segmentation fault.
I tried various combinations and I can say that whenever having strings in a nested class, I am facing problem to access those using google protoc.
Am i missing something? Is there anything wrong.
Can anyone please help.
note:There might be a few general syntax errors please ignore them.
ThankYou.
note:There might be a few general syntax errors please ignore them.
Err... they are kinda hard to ignore since your code does not compile :-)
Anyway, apart from the syntax errors, you need to make several corrections to your code. In order to use the field otr_message, it is not sufficient to just malloc() it. You also need to initialize it so the headers in the message get the right values. This is done with init(), like this:
//trying to initialize the string in the nested structure othermessage
msg.otr_message = malloc(sizeof(othermessage));
othermessage__init(msg.otr_message);
Then you use the wrong function to do the packing to your own array. As explained here, you need to use pack() as opposed to pack_to_buffer(), like this:
//lets get the serialized structure in buf
onemessage__pack(&msg,buf); //from generated code
Finally, your strncpy() invocations have a mistake. The length calculated with strlen() does not include the null terminator, which you do need. So you need to take strlen()+1 or use sizeof(), like this:
strncpy(msg.messagestring,"a simple string",sizeof("a simple string"));
After making those changes, the example worked for me:
$ ./testout
a simple string
a not so simple string

fprintf() issues utf-8 linux

Ok, I got to print UTF-8 encoded chars to terminal. But printing to file is not working like i expected. Using wchar.h and locale.h as such:
#include <locale.h>
#include <wchar.h>
int main(){
setlocale(LC_ALL,"");
wint_t index = 0;
FILE* fpinout = fopen("UTF-8.txt","w");
for(index = 0; index < 0x200; index++){
printf("%i:\t%lc\n", index, index); //works fine, prints utf-8 chars to terminal
fprintf(fpinout,"%i\t%lc", index, index); //does not work, output is wierd
}
fclose(fpinout);
}
I tried to use index there both as wint_t and wchar_t.
My UTF-8.txt file looks like this:
र㄀ĉल㌂̉ऴ㔄ԉश㜆܉स㤈उ〱ਉㄱଉ㈱ఉ㌱ഉ㐱ฉ㔱༉㘱ဉ㜱ᄉ㠱ሉ㤱ጉ〲ᐉㄲᔉ㈲ᘉ㌲ᜉ㐲᠉㔲ᤉ㘲ᨉ㜲ᬉ㠲ᰉ㤲ᴉ〳ḉㄳἉ㈳ ㌳℉㐳∉㔳⌉㘳␉㜳
┉㠳☉㤳✉〴⠉ㄴ⤉㈴⨉㌴⬉㐴Ⰹ㔴ⴉ㘴⸉㜴⼉㠴〉㤴ㄉ〵㈉ㄵ㌉㈵㐉㌵㔉㐵㘉㔵㜉㘵㠉㜵㤉㠵㨉㤵㬉〶㰉ㄶ㴉㈶㸉㌶㼉㐶䀉㔶䄉㘶䈉
㜶䌉㠶䐉㤶䔉〷䘉ㄷ䜉㈷䠉㌷䤉㐷䨉㔷䬉㘷䰉㜷䴉㠷三㤷伉〸倉ㄸ儉㈸刉㌸匉㐸吉㔸唉㘸嘉㜸圉㠸堉㤸変〹娉ㄹ嬉㈹尉㌹崉㐹帉
㔹弉㘹怉㜹愉㠹戉㤹按〱रㅤ㄰攉〱लㅦ㌰有〱ऴㅨ㔰椉〱शㅪ㜰欉〱सㅬ㤰洉ㄱरㅮㄱ漉ㄱलㅰ㌱焉ㄱऴㅲ㔱猉ㄱशㅴ㜱甉ㄱसㅶ㤱眉
㈱रㅸㄲ礉㈱लㅺ㌲笉㈱ऴㅼ㔲紉㈱शㅾ㜲缉㈱स胂㈱ह臂㌱र苂㌱ऱ菂㌱ल蓂㌱ळ藂㌱ऴ蛂㌱व蟂㌱श裂㌱ष观㌱स諂㌱ह诂㐱र賂㐱ऱ跂㐱ल軂㐱
ळ迂㐱ऴ郂㐱व釂㐱श鋂㐱ष鏂㐱स铂㐱ह闂㔱र雂㔱ऱ韂㔱ल飂㔱ळ駂㔱ऴ髂㔱व鯂㔱श鳂㔱ष鷂㔱स黂㔱ह鿂㘱रꃂ㘱ऱꇂ㘱लꋂ㘱ळꏂ㘱ऴ꓂
㘱वꗂ㘱शꛂ㘱षꟂ㘱सꣂ㘱ह꧂㜱रꫂ㜱ऱꯂ㜱ल곂㜱ळ귂㜱ऴ껂㜱व꿂㜱श냂㜱ष뇂㜱स닂㜱ह돂㠱र듂㠱ऱ뗂㠱ल뛂㠱ळ럂㠱ऴ룂㠱व맂㠱श뫂
㠱ष믂㠱स볂㠱ह뷂㤱र뻂㤱ऱ뿂㤱ल胃㤱ळ臃㤱ऴ苃㤱व菃㤱श蓃㤱ष藃㤱स蛃㤱ह蟃〲र裃〲ऱ觃〲ल諃〲ळ诃〲ऴ賃〲व跃〲श軃〲ष迃〲स郃〲ह
釃ㄲर鋃ㄲऱ鏃ㄲल铃ㄲळ闃ㄲऴ雃ㄲव韃ㄲश飃ㄲष駃ㄲस髃ㄲह鯃㈲र鳃㈲ऱ鷃㈲ल黃㈲ळ鿃㈲ऴꃃ㈲वꇃ㈲शꋃ㈲षꏃ㈲स꓃㈲हꗃ㌲रꛃ㌲ऱꟃ㌲
लꣃ㌲ळ꧃㌲ऴ꫃㌲वꯃ㌲श곃㌲ष귃㌲स껃㌲ह꿃㐲र냃㐲ऱ뇃㐲ल닃㐲ळ돃㐲ऴ듃㐲व뗃㐲श뛃㐲ष럃㐲स룃㐲ह맃㔲र뫃㔲ऱ믃㔲ल볃㔲ळ뷃㔲ऴ뻃
㔲व뿃
Any help is appreciated.
This way you write UTF32 by fact. Opening file in binary mode won't help. it will remain UTF32LE.
You should use transformation to UTF8 encoding. Either use ICU library or wctomb / wctombs / wclen c functions ( http://man7.org/linux/man-pages/man3/wctomb.3.html ). be aware that wctomb* functions usually are locale dependent (often won't work correctly with Japanese if you have Greek locale)

Resources