或者网上有源码吗
UTF-8 -> ANSI(gb2312)
ANSI -> UTF-8
9 个解决方案
#1
MultiByteToWideChar(CP_UTF8,0,...);
WideCharToMultiByte(CP_UTF8,0,...);
WideCharToMultiByte(CP_UTF8,0,...);
#2
以前写的两个函数,可以参考一下,具体可以查看WideCharToMultiByte和MultiByteToWideChar函数的使用:
//==== Unicode -> Simplified Chinese
// Converts a string of concatenated 4-character groups (each group is turned
// into one UTF-16 code unit by StrHexToDec) into a code page 936 string.
//
// str     - input text; any trailing partial group (fewer than 4 characters)
//           is ignored.
// returns - the converted string, or an empty string when there is nothing to
//           convert or the conversion fails.
//
// NOTE(review): StrHexToDec is defined elsewhere; it is presumably a
// hex-string-to-integer helper - confirm.
CString CCodeTransDlg::UnicodeToJt(CString str) const
{
	const int iLen = str.GetLength() / 4; // number of complete 4-character groups
	if (iLen == 0)
		return CString();

	CString csSub;
	WCHAR *uUnicode = new WCHAR[iLen]; // decoded UTF-16 code units (not NUL-terminated)
	for (int li = 0; li < iLen; li++)
	{
		csSub = str.Mid(li * 4, 4);
		uUnicode[li] = StrHexToDec(csSub);
	}

	CString des;
	// uUnicode has no terminator, so its length is passed explicitly instead of -1.
	// The first call (NULL buffer, size 0) asks the API for the exact number of
	// output bytes rather than guessing 1 or 2 bytes per character.
	const int iRealLen = WideCharToMultiByte(936, 0, uUnicode, iLen, NULL, 0, NULL, NULL);
	if (iRealLen > 0)
	{
		const int iWritten = WideCharToMultiByte(936, 0, uUnicode, iLen,
			des.GetBuffer(iRealLen), iRealLen, NULL, NULL);
		des.ReleaseBuffer(iWritten); // iWritten is 0 on failure, leaving des empty
	}
	delete[] uUnicode;
	return des;
}//==== Simplified Chinese -> DBCS code
// Produces a hexadecimal dump of `str`: every element of the string is
// formatted as two lowercase hex digits and the results are concatenated
// without separators.
//
// str     - the text to dump.
// returns - the concatenated hex digits (empty for an empty input).
CString CCodeTransDlg::JtToDBCS(CString str) const
{
	CString csResult;
	CString csTmp;
	const int iLen = str.GetLength();
	for (int li = 0; li < iLen; li++)
	{
		const byte tmp = str[li];
		// "%02x" zero-pads values below 0x10; with plain "%x" such a byte
		// yields a single digit and the concatenated dump becomes ambiguous.
		csTmp.Format("%02x", tmp);
		csResult += csTmp;
	}
	return csResult;
}
//==== Unicode -> Simplified Chinese
// Converts a string of concatenated 4-character groups (each group is turned
// into one UTF-16 code unit by StrHexToDec) into a code page 936 string.
//
// str     - input text; any trailing partial group (fewer than 4 characters)
//           is ignored.
// returns - the converted string, or an empty string when there is nothing to
//           convert or the conversion fails.
//
// NOTE(review): StrHexToDec is defined elsewhere; it is presumably a
// hex-string-to-integer helper - confirm.
CString CCodeTransDlg::UnicodeToJt(CString str) const
{
	const int iLen = str.GetLength() / 4; // number of complete 4-character groups
	if (iLen == 0)
		return CString();

	CString csSub;
	WCHAR *uUnicode = new WCHAR[iLen]; // decoded UTF-16 code units (not NUL-terminated)
	for (int li = 0; li < iLen; li++)
	{
		csSub = str.Mid(li * 4, 4);
		uUnicode[li] = StrHexToDec(csSub);
	}

	CString des;
	// uUnicode has no terminator, so its length is passed explicitly instead of -1.
	// The first call (NULL buffer, size 0) asks the API for the exact number of
	// output bytes rather than guessing 1 or 2 bytes per character.
	const int iRealLen = WideCharToMultiByte(936, 0, uUnicode, iLen, NULL, 0, NULL, NULL);
	if (iRealLen > 0)
	{
		const int iWritten = WideCharToMultiByte(936, 0, uUnicode, iLen,
			des.GetBuffer(iRealLen), iRealLen, NULL, NULL);
		des.ReleaseBuffer(iWritten); // iWritten is 0 on failure, leaving des empty
	}
	delete[] uUnicode;
	return des;
}//==== Simplified Chinese -> DBCS code
// Produces a hexadecimal dump of `str`: every element of the string is
// formatted as two lowercase hex digits and the results are concatenated
// without separators.
//
// str     - the text to dump.
// returns - the concatenated hex digits (empty for an empty input).
CString CCodeTransDlg::JtToDBCS(CString str) const
{
	CString csResult;
	CString csTmp;
	const int iLen = str.GetLength();
	for (int li = 0; li < iLen; li++)
	{
		const byte tmp = str[li];
		// "%02x" zero-pads values below 0x10; with plain "%x" such a byte
		// yields a single digit and the concatenated dump becomes ambiguous.
		csTmp.Format("%02x", tmp);
		csResult += csTmp;
	}
	return csResult;
}
#3
ansi在win平台他会自动在后台把两个字节组合例如char*str="中国";
输出就可以看到中文
utf-8转换到ANSI这个问题我也在做我是这样做的
utf-8 <---->unicode <------>ansi
/*
 * Decodes one UTF-8 sequence that has been packed into an integer, first byte
 * in the most significant position (e.g. the bytes E4 B8 AD are passed as
 * 0xE4B8AD), and returns the Unicode code point.
 *
 * utf8    - packed 1- to 4-byte UTF-8 sequence; only the low 32 bits are used.
 * returns - the decoded code point, or 0 when the value does not have the bit
 *           layout of a UTF-8 sequence (0 is also the result for input 0).
 *
 * Overlong encodings and surrogate code points are not rejected.
 */
unsigned int utf82unicode(long utf8){
    /* Use the low 32 bits as an unsigned value so that 4-byte sequences (top
       bit set) are handled the same whether long is 32 or 64 bits wide. */
    const unsigned long v = static_cast<unsigned long>(utf8) & 0xFFFFFFFFUL;

    if (v <= 0x7FUL) {                              /* 0xxxxxxx */
        return static_cast<unsigned int>(v);
    }
    if ((v & 0xFFFFE0C0UL) == 0x0000C080UL) {       /* 110xxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x00001F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    if ((v & 0xFFF0C0C0UL) == 0x00E08080UL) {       /* 1110xxxx 10xxxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x000F0000UL) >> 4)
                                        | ((v & 0x00003F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    if ((v & 0xF8C0C0C0UL) == 0xF0808080UL) {       /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x07000000UL) >> 6)
                                        | ((v & 0x003F0000UL) >> 4)
                                        | ((v & 0x00003F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    return 0;                                       /* not a valid lead/continuation layout */
}
/*
 * Debug helper: prints `i` to standard output in hexadecimal (via cout) and
 * then as decimal, hexadecimal and binary text (via printf).
 *
 * Negative values are shown with a leading '-' in decimal and as their
 * unsigned bit pattern in hexadecimal and binary.
 */
void printNumber(long i){
    /* hex is a sticky stream flag: save and restore the flags so later cout
       output by the caller is not silently switched to hexadecimal. */
    const ios::fmtflags savedFlags = cout.flags();
    cout<<"In printNumber's input "<<hex<<i<<endl;
    cout.flags(savedFlags);

    const unsigned long bits = static_cast<unsigned long>(i);
    printf ("decimal: %ld\n", i);
    printf ("hexadecimal: %lx\n", bits);

    /* Build the binary digits from the least significant bit backwards;
       one char per bit plus the terminator always fits. */
    char buffer [sizeof(long)*8+1];
    char *digit = buffer + sizeof(buffer) - 1;
    *digit = '\0';
    unsigned long rest = bits;
    do {
        *--digit = static_cast<char>('0' + (rest & 1UL));
        rest >>= 1;
    } while (rest != 0);
    printf ("binary: %s\n", digit);
}
/*
 * Encodes one 16-bit code unit as UTF-8 and returns the bytes packed into an
 * integer, first byte in the most significant position (e.g. 0x4E2D yields
 * 0xE4B8AD).
 *
 * uni     - the value to encode (0x0000-0xFFFF).
 * returns - the packed 1-, 2- or 3-byte UTF-8 sequence.
 *
 * Because the parameter is 16 bits wide, a 4-byte sequence can never be
 * required; the former 4-byte and "not handled" branches were unreachable and
 * have been removed. Surrogate values are encoded like any other 3-byte value.
 */
unsigned int u2utf8(unsigned short uni)
{
    const unsigned int cp = uni;

    if (cp < 0x80u)                                  /* 0xxxxxxx */
    {
        return cp;
    }
    if (cp < 0x800u)                                 /* 110xxxxx 10xxxxxx */
    {
        return ((0xC0u | (cp >> 6)) << 8)
             | (0x80u | (cp & 0x3Fu));
    }
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    return ((0xE0u | (cp >> 12)) << 16)
         | ((0x80u | ((cp >> 6) & 0x3Fu)) << 8)
         | (0x80u | (cp & 0x3Fu));
}
哈哈!
utf82unicode没有实现全部的utf-8到unicode的转换
只做了四字节的utf-8到unicode的转换!
输出就可以看到中文
utf-8转换到ANSI这个问题我也在做我是这样做的
utf-8 <---->unicode <------>ansi
/*
 * Decodes one UTF-8 sequence that has been packed into an integer, first byte
 * in the most significant position (e.g. the bytes E4 B8 AD are passed as
 * 0xE4B8AD), and returns the Unicode code point.
 *
 * utf8    - packed 1- to 4-byte UTF-8 sequence; only the low 32 bits are used.
 * returns - the decoded code point, or 0 when the value does not have the bit
 *           layout of a UTF-8 sequence (0 is also the result for input 0).
 *
 * Overlong encodings and surrogate code points are not rejected.
 */
unsigned int utf82unicode(long utf8){
    /* Use the low 32 bits as an unsigned value so that 4-byte sequences (top
       bit set) are handled the same whether long is 32 or 64 bits wide. */
    const unsigned long v = static_cast<unsigned long>(utf8) & 0xFFFFFFFFUL;

    if (v <= 0x7FUL) {                              /* 0xxxxxxx */
        return static_cast<unsigned int>(v);
    }
    if ((v & 0xFFFFE0C0UL) == 0x0000C080UL) {       /* 110xxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x00001F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    if ((v & 0xFFF0C0C0UL) == 0x00E08080UL) {       /* 1110xxxx 10xxxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x000F0000UL) >> 4)
                                        | ((v & 0x00003F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    if ((v & 0xF8C0C0C0UL) == 0xF0808080UL) {       /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x07000000UL) >> 6)
                                        | ((v & 0x003F0000UL) >> 4)
                                        | ((v & 0x00003F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    return 0;                                       /* not a valid lead/continuation layout */
}
/*
 * Debug helper: prints `i` to standard output in hexadecimal (via cout) and
 * then as decimal, hexadecimal and binary text (via printf).
 *
 * Negative values are shown with a leading '-' in decimal and as their
 * unsigned bit pattern in hexadecimal and binary.
 */
void printNumber(long i){
    /* hex is a sticky stream flag: save and restore the flags so later cout
       output by the caller is not silently switched to hexadecimal. */
    const ios::fmtflags savedFlags = cout.flags();
    cout<<"In printNumber's input "<<hex<<i<<endl;
    cout.flags(savedFlags);

    const unsigned long bits = static_cast<unsigned long>(i);
    printf ("decimal: %ld\n", i);
    printf ("hexadecimal: %lx\n", bits);

    /* Build the binary digits from the least significant bit backwards;
       one char per bit plus the terminator always fits. */
    char buffer [sizeof(long)*8+1];
    char *digit = buffer + sizeof(buffer) - 1;
    *digit = '\0';
    unsigned long rest = bits;
    do {
        *--digit = static_cast<char>('0' + (rest & 1UL));
        rest >>= 1;
    } while (rest != 0);
    printf ("binary: %s\n", digit);
}
/*
 * Encodes one 16-bit code unit as UTF-8 and returns the bytes packed into an
 * integer, first byte in the most significant position (e.g. 0x4E2D yields
 * 0xE4B8AD).
 *
 * uni     - the value to encode (0x0000-0xFFFF).
 * returns - the packed 1-, 2- or 3-byte UTF-8 sequence.
 *
 * Because the parameter is 16 bits wide, a 4-byte sequence can never be
 * required; the former 4-byte and "not handled" branches were unreachable and
 * have been removed. Surrogate values are encoded like any other 3-byte value.
 */
unsigned int u2utf8(unsigned short uni)
{
    const unsigned int cp = uni;

    if (cp < 0x80u)                                  /* 0xxxxxxx */
    {
        return cp;
    }
    if (cp < 0x800u)                                 /* 110xxxxx 10xxxxxx */
    {
        return ((0xC0u | (cp >> 6)) << 8)
             | (0x80u | (cp & 0x3Fu));
    }
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    return ((0xE0u | (cp >> 12)) << 16)
         | ((0x80u | ((cp >> 6) & 0x3Fu)) << 8)
         | (0x80u | (cp & 0x3Fu));
}
哈哈!
utf82unicode没有实现全部的utf-8到unicode的转换
只做了四字节的utf-8到unicode的转换!
#4
我有个问题不明白,在char *str="汉"里面,用的是什么编码?是ASCII编码还是ANSI(GB2312)?我试了一下楼上的函数,从Unicode转换到ANSI时,得到的不是ANSI编码(BA BA),而是ASCII编码(-70 -70).
#5
UTF-8只是对UNICODE的一个编码方案,不要和UNICODE混淆了,UTF-8和UNICODE之间只是一个对照关系,所以编码解码都很容易。
UNICODE和本地编码方案(如GB2312)之间的转换可以通过标准的API来实现:
wcstombs
mbstowcs
使用这两个函数之前必须将你的LC_CTYPE设置成你想要的本地编码方案(使用 setlocale函数)
上面这些都是标准C++的实现方式,和使用WIN32 API:WideCharToMultiByte相比更加通用。
配合“ 回复人:woundedsoul(浪子) ”提供的UTF-8与UNICODE之间的转换(他实现得不完整,楼主要注意)
就可以在GB和UTF-8之间随意操作了:)
UNICODE和本地编码方案(如GB2312)之间的转换可以通过标准的API来实现:
wcstombs
mbstowcs
使用这两个函数之前必须将你的LC_CTYPE设置成你想要的本地编码方案(使用 setlocale函数)
上面这些都是标准C++的实现方式,和使用WIN32 API:WideCharToMultiByte相比更加通用。
配合“ 回复人:woundedsoul(浪子) ”提供的UTF-8与UNICODE之间的转换(他实现得不完整,楼主要注意)
就可以在GB和UTF-8之间随意操作了:)
#6
顺便帮你贴一个完整的UTF8到WCS的转换函数(SWI库里面的---实际上它是在APACHE的库里面的实现,不过是一个旧一些时候的版本了)
// Lookup tables for UTF-8 conversion.
//
// gUTFBytes
// A list of counts of trailing bytes for each initial byte in the input.
// Indexed by the value of the first byte of a sequence (0-255). Bytes
// 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3,
// 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
//
// gUTFOffsets
// A list of values to offset each result char type, according to how
// many source bytes went into making it. Indexed by the trailing-byte
// count (0-5); the entry is subtracted from the accumulated value after
// the bytes of a sequence have been summed and shifted.
//
// gFirstByteMark
// A list of values to mask onto the first byte of an encoded sequence,
// indexed by the number of bytes used to create the sequence.
// NOTE(review): not referenced by SWIutf8towcs; presumably used by the
// encoding direction (wide char -> UTF-8) - confirm.
static const char gUTFBytes[256] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
static const unsigned long gUTFOffsets[6] =
{ 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080 };
static const unsigned char gFirstByteMark[7] =
{ 0x00, 0x00, 0xC0, 0xE0,
0xF0, 0xF8, 0xFC };
// Converts the NUL-terminated UTF-8 string `src` into wide characters.
//
// src       - NUL-terminated UTF-8 input; its length is taken with strlen.
// dst       - output buffer; receives the decoded characters followed by L'\0'.
// maxdstlen - maximum number of characters stored before the terminator.
//             NOTE(review): the terminator is written after them, so dst
//             appears to need room for maxdstlen + 1 elements - confirm
//             against the callers.
//
// Returns SWIchar_SUCCESS when the whole input was converted,
// SWIchar_BUFFER_OVERFLOW when dst is full, SWIchar_FAIL when a sequence is
// cut short by the end of the input or decodes to a value above 0xFFFF, and
// SWIchar_ERROR for an unexpected trailing-byte count. On any failure dst is
// left without a terminator.
//
// Continuation bytes are not validated, and a stray byte in 0x80-0xBF is
// treated as a one-byte sequence (its gUTFBytes entry is 0).
SWIcharResult SWIutf8towcs( const unsigned char *src, wchar_t *dst, int maxdstlen )
{
  // Start and end of the input, and the first position we may not write to.
  const unsigned char* srcPtr = src;
  const unsigned char* srcEnd = src + strlen((const char *)src);
  wchar_t *dstEnd = dst + maxdstlen;

  // Loop until we run out of input data.
  while (srcPtr < srcEnd) {
    const unsigned char firstByte = *srcPtr;

    // Special-case ASCII, which is a leading byte value of <= 127.
    if (firstByte <= 127) {
      // Same capacity check as the multi-byte path below; without it a long
      // ASCII run would be written past the end of dst.
      if (dst >= dstEnd)
        return SWIchar_BUFFER_OVERFLOW;
      *dst++ = (wchar_t) firstByte;
      srcPtr++;
      continue;
    }

    // See how many trailing src bytes this sequence is going to require.
    const unsigned int trailingBytes = gUTFBytes[firstByte];

    // Not enough source bytes left for the whole sequence. >= because the
    // leading byte itself is implicitly counted.
    if (srcPtr + trailingBytes >= srcEnd)
      return SWIchar_FAIL;

    // Accumulate the raw bytes, shifting 6 bits per continuation byte; each
    // case deliberately falls through to the next.
    unsigned long tmpVal = 0;
    switch (trailingBytes) {
      case 5: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 4: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 3: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 2: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 1: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 0: tmpVal += *srcPtr++;
        break;
      default:
        return SWIchar_ERROR;
    }
    // Apply the per-length offset from gUTFOffsets to the accumulated value.
    tmpVal -= gUTFOffsets[trailingBytes];

    // If surrogate pairs would be required for 16-bit characters, fail.
    if (tmpVal & 0xFFFF0000)
      return SWIchar_FAIL;

    if (dst >= dstEnd)
      return SWIchar_BUFFER_OVERFLOW;
    *dst++ = (wchar_t)tmpVal;
  }

  *dst = L'\0';
  return SWIchar_SUCCESS;
}
// Lookup tables for UTF-8 conversion.
//
// gUTFBytes
// A list of counts of trailing bytes for each initial byte in the input.
// Indexed by the value of the first byte of a sequence (0-255). Bytes
// 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3,
// 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
//
// gUTFOffsets
// A list of values to offset each result char type, according to how
// many source bytes went into making it. Indexed by the trailing-byte
// count (0-5); the entry is subtracted from the accumulated value after
// the bytes of a sequence have been summed and shifted.
//
// gFirstByteMark
// A list of values to mask onto the first byte of an encoded sequence,
// indexed by the number of bytes used to create the sequence.
// NOTE(review): not referenced by SWIutf8towcs; presumably used by the
// encoding direction (wide char -> UTF-8) - confirm.
static const char gUTFBytes[256] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
static const unsigned long gUTFOffsets[6] =
{ 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080 };
static const unsigned char gFirstByteMark[7] =
{ 0x00, 0x00, 0xC0, 0xE0,
0xF0, 0xF8, 0xFC };
// Converts the NUL-terminated UTF-8 string `src` into wide characters.
//
// src       - NUL-terminated UTF-8 input; its length is taken with strlen.
// dst       - output buffer; receives the decoded characters followed by L'\0'.
// maxdstlen - maximum number of characters stored before the terminator.
//             NOTE(review): the terminator is written after them, so dst
//             appears to need room for maxdstlen + 1 elements - confirm
//             against the callers.
//
// Returns SWIchar_SUCCESS when the whole input was converted,
// SWIchar_BUFFER_OVERFLOW when dst is full, SWIchar_FAIL when a sequence is
// cut short by the end of the input or decodes to a value above 0xFFFF, and
// SWIchar_ERROR for an unexpected trailing-byte count. On any failure dst is
// left without a terminator.
//
// Continuation bytes are not validated, and a stray byte in 0x80-0xBF is
// treated as a one-byte sequence (its gUTFBytes entry is 0).
SWIcharResult SWIutf8towcs( const unsigned char *src, wchar_t *dst, int maxdstlen )
{
  // Start and end of the input, and the first position we may not write to.
  const unsigned char* srcPtr = src;
  const unsigned char* srcEnd = src + strlen((const char *)src);
  wchar_t *dstEnd = dst + maxdstlen;

  // Loop until we run out of input data.
  while (srcPtr < srcEnd) {
    const unsigned char firstByte = *srcPtr;

    // Special-case ASCII, which is a leading byte value of <= 127.
    if (firstByte <= 127) {
      // Same capacity check as the multi-byte path below; without it a long
      // ASCII run would be written past the end of dst.
      if (dst >= dstEnd)
        return SWIchar_BUFFER_OVERFLOW;
      *dst++ = (wchar_t) firstByte;
      srcPtr++;
      continue;
    }

    // See how many trailing src bytes this sequence is going to require.
    const unsigned int trailingBytes = gUTFBytes[firstByte];

    // Not enough source bytes left for the whole sequence. >= because the
    // leading byte itself is implicitly counted.
    if (srcPtr + trailingBytes >= srcEnd)
      return SWIchar_FAIL;

    // Accumulate the raw bytes, shifting 6 bits per continuation byte; each
    // case deliberately falls through to the next.
    unsigned long tmpVal = 0;
    switch (trailingBytes) {
      case 5: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 4: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 3: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 2: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 1: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 0: tmpVal += *srcPtr++;
        break;
      default:
        return SWIchar_ERROR;
    }
    // Apply the per-length offset from gUTFOffsets to the accumulated value.
    tmpVal -= gUTFOffsets[trailingBytes];

    // If surrogate pairs would be required for 16-bit characters, fail.
    if (tmpVal & 0xFFFF0000)
      return SWIchar_FAIL;

    if (dst >= dstEnd)
      return SWIchar_BUFFER_OVERFLOW;
    *dst++ = (wchar_t)tmpVal;
  }

  *dst = L'\0';
  return SWIchar_SUCCESS;
}
#7
再BTW:将ANSI和GB2312混为一谈的话,在文 化 大 革 命时期是要抓出去砍头的,楼主犯了严重的政治错位,请google一下ANSI到底是什么,GB2312又是什么。
#8
在 C++ 中,将字符与字节进行转换的方法是:mbstowcs
如果是在 Visual C++ 中,那么更好的方法是:MultiByteToWideChar
楼主可以将
ANSI(gb2312等,char[]) => UNICODE 字符串(wchar_t[])
再将
UNICODE(wchar_t[]) => utf8(char[])
(utf8 很简单,因此这一步操作也可以自己写代码完成)
关于概念,推荐文章:
http://www.regexlab.com/zh/encoding.htm
关于 utf8 的格式,推荐文章:
http://www.nk975.com/sswater/myref/index.asp?id=20
如果是在 Visual C++ 中,那么更好的方法是:MultiByteToWideChar
楼主可以将
ANSI(gb2312等,char[]) => UNICODE 字符串(wchar_t[])
再将
UNICODE(wchar_t[]) => utf8(char[])
(utf8 很简单,因此这一步操作也可以自己写代码完成)
关于概念,推荐文章:
http://www.regexlab.com/zh/encoding.htm
关于 utf8 的格式,推荐文章:
http://www.nk975.com/sswater/myref/index.asp?id=20
#9
#1
MultiByteToWideChar(CP_UTF8,0,...);
WideCharToMultiByte(CP_UTF8,0,...);
WideCharToMultiByte(CP_UTF8,0,...);
#2
以前写的两个函数,可以参考一下,具体可以查看WideCharToMultiByte和MultiByteToWideChar函数的使用:
//==== Unicode -> Simplified Chinese
// Converts a string of concatenated 4-character groups (each group is turned
// into one UTF-16 code unit by StrHexToDec) into a code page 936 string.
//
// str     - input text; any trailing partial group (fewer than 4 characters)
//           is ignored.
// returns - the converted string, or an empty string when there is nothing to
//           convert or the conversion fails.
//
// NOTE(review): StrHexToDec is defined elsewhere; it is presumably a
// hex-string-to-integer helper - confirm.
CString CCodeTransDlg::UnicodeToJt(CString str) const
{
	const int iLen = str.GetLength() / 4; // number of complete 4-character groups
	if (iLen == 0)
		return CString();

	CString csSub;
	WCHAR *uUnicode = new WCHAR[iLen]; // decoded UTF-16 code units (not NUL-terminated)
	for (int li = 0; li < iLen; li++)
	{
		csSub = str.Mid(li * 4, 4);
		uUnicode[li] = StrHexToDec(csSub);
	}

	CString des;
	// uUnicode has no terminator, so its length is passed explicitly instead of -1.
	// The first call (NULL buffer, size 0) asks the API for the exact number of
	// output bytes rather than guessing 1 or 2 bytes per character.
	const int iRealLen = WideCharToMultiByte(936, 0, uUnicode, iLen, NULL, 0, NULL, NULL);
	if (iRealLen > 0)
	{
		const int iWritten = WideCharToMultiByte(936, 0, uUnicode, iLen,
			des.GetBuffer(iRealLen), iRealLen, NULL, NULL);
		des.ReleaseBuffer(iWritten); // iWritten is 0 on failure, leaving des empty
	}
	delete[] uUnicode;
	return des;
}//==== Simplified Chinese -> DBCS code
// Produces a hexadecimal dump of `str`: every element of the string is
// formatted as two lowercase hex digits and the results are concatenated
// without separators.
//
// str     - the text to dump.
// returns - the concatenated hex digits (empty for an empty input).
CString CCodeTransDlg::JtToDBCS(CString str) const
{
	CString csResult;
	CString csTmp;
	const int iLen = str.GetLength();
	for (int li = 0; li < iLen; li++)
	{
		const byte tmp = str[li];
		// "%02x" zero-pads values below 0x10; with plain "%x" such a byte
		// yields a single digit and the concatenated dump becomes ambiguous.
		csTmp.Format("%02x", tmp);
		csResult += csTmp;
	}
	return csResult;
}
//==== Unicode -> Simplified Chinese
// Converts a string of concatenated 4-character groups (each group is turned
// into one UTF-16 code unit by StrHexToDec) into a code page 936 string.
//
// str     - input text; any trailing partial group (fewer than 4 characters)
//           is ignored.
// returns - the converted string, or an empty string when there is nothing to
//           convert or the conversion fails.
//
// NOTE(review): StrHexToDec is defined elsewhere; it is presumably a
// hex-string-to-integer helper - confirm.
CString CCodeTransDlg::UnicodeToJt(CString str) const
{
	const int iLen = str.GetLength() / 4; // number of complete 4-character groups
	if (iLen == 0)
		return CString();

	CString csSub;
	WCHAR *uUnicode = new WCHAR[iLen]; // decoded UTF-16 code units (not NUL-terminated)
	for (int li = 0; li < iLen; li++)
	{
		csSub = str.Mid(li * 4, 4);
		uUnicode[li] = StrHexToDec(csSub);
	}

	CString des;
	// uUnicode has no terminator, so its length is passed explicitly instead of -1.
	// The first call (NULL buffer, size 0) asks the API for the exact number of
	// output bytes rather than guessing 1 or 2 bytes per character.
	const int iRealLen = WideCharToMultiByte(936, 0, uUnicode, iLen, NULL, 0, NULL, NULL);
	if (iRealLen > 0)
	{
		const int iWritten = WideCharToMultiByte(936, 0, uUnicode, iLen,
			des.GetBuffer(iRealLen), iRealLen, NULL, NULL);
		des.ReleaseBuffer(iWritten); // iWritten is 0 on failure, leaving des empty
	}
	delete[] uUnicode;
	return des;
}//==== Simplified Chinese -> DBCS code
// Produces a hexadecimal dump of `str`: every element of the string is
// formatted as two lowercase hex digits and the results are concatenated
// without separators.
//
// str     - the text to dump.
// returns - the concatenated hex digits (empty for an empty input).
CString CCodeTransDlg::JtToDBCS(CString str) const
{
	CString csResult;
	CString csTmp;
	const int iLen = str.GetLength();
	for (int li = 0; li < iLen; li++)
	{
		const byte tmp = str[li];
		// "%02x" zero-pads values below 0x10; with plain "%x" such a byte
		// yields a single digit and the concatenated dump becomes ambiguous.
		csTmp.Format("%02x", tmp);
		csResult += csTmp;
	}
	return csResult;
}
#3
ansi在win平台他会自动在后台把两个字节组合例如char*str="中国";
输出就可以看到中文
utf-8转换到ANSI这个问题我也在做我是这样做的
utf-8 <---->unicode <------>ansi
/*
 * Decodes one UTF-8 sequence that has been packed into an integer, first byte
 * in the most significant position (e.g. the bytes E4 B8 AD are passed as
 * 0xE4B8AD), and returns the Unicode code point.
 *
 * utf8    - packed 1- to 4-byte UTF-8 sequence; only the low 32 bits are used.
 * returns - the decoded code point, or 0 when the value does not have the bit
 *           layout of a UTF-8 sequence (0 is also the result for input 0).
 *
 * Overlong encodings and surrogate code points are not rejected.
 */
unsigned int utf82unicode(long utf8){
    /* Use the low 32 bits as an unsigned value so that 4-byte sequences (top
       bit set) are handled the same whether long is 32 or 64 bits wide. */
    const unsigned long v = static_cast<unsigned long>(utf8) & 0xFFFFFFFFUL;

    if (v <= 0x7FUL) {                              /* 0xxxxxxx */
        return static_cast<unsigned int>(v);
    }
    if ((v & 0xFFFFE0C0UL) == 0x0000C080UL) {       /* 110xxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x00001F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    if ((v & 0xFFF0C0C0UL) == 0x00E08080UL) {       /* 1110xxxx 10xxxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x000F0000UL) >> 4)
                                        | ((v & 0x00003F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    if ((v & 0xF8C0C0C0UL) == 0xF0808080UL) {       /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x07000000UL) >> 6)
                                        | ((v & 0x003F0000UL) >> 4)
                                        | ((v & 0x00003F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    return 0;                                       /* not a valid lead/continuation layout */
}
/*
 * Debug helper: prints `i` to standard output in hexadecimal (via cout) and
 * then as decimal, hexadecimal and binary text (via printf).
 *
 * Negative values are shown with a leading '-' in decimal and as their
 * unsigned bit pattern in hexadecimal and binary.
 */
void printNumber(long i){
    /* hex is a sticky stream flag: save and restore the flags so later cout
       output by the caller is not silently switched to hexadecimal. */
    const ios::fmtflags savedFlags = cout.flags();
    cout<<"In printNumber's input "<<hex<<i<<endl;
    cout.flags(savedFlags);

    const unsigned long bits = static_cast<unsigned long>(i);
    printf ("decimal: %ld\n", i);
    printf ("hexadecimal: %lx\n", bits);

    /* Build the binary digits from the least significant bit backwards;
       one char per bit plus the terminator always fits. */
    char buffer [sizeof(long)*8+1];
    char *digit = buffer + sizeof(buffer) - 1;
    *digit = '\0';
    unsigned long rest = bits;
    do {
        *--digit = static_cast<char>('0' + (rest & 1UL));
        rest >>= 1;
    } while (rest != 0);
    printf ("binary: %s\n", digit);
}
/*
 * Encodes one 16-bit code unit as UTF-8 and returns the bytes packed into an
 * integer, first byte in the most significant position (e.g. 0x4E2D yields
 * 0xE4B8AD).
 *
 * uni     - the value to encode (0x0000-0xFFFF).
 * returns - the packed 1-, 2- or 3-byte UTF-8 sequence.
 *
 * Because the parameter is 16 bits wide, a 4-byte sequence can never be
 * required; the former 4-byte and "not handled" branches were unreachable and
 * have been removed. Surrogate values are encoded like any other 3-byte value.
 */
unsigned int u2utf8(unsigned short uni)
{
    const unsigned int cp = uni;

    if (cp < 0x80u)                                  /* 0xxxxxxx */
    {
        return cp;
    }
    if (cp < 0x800u)                                 /* 110xxxxx 10xxxxxx */
    {
        return ((0xC0u | (cp >> 6)) << 8)
             | (0x80u | (cp & 0x3Fu));
    }
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    return ((0xE0u | (cp >> 12)) << 16)
         | ((0x80u | ((cp >> 6) & 0x3Fu)) << 8)
         | (0x80u | (cp & 0x3Fu));
}
哈哈!
utf82unicode没有实现全部的utf-8到unicode的转换
只做了四字节的utf-8到unicode的转换!
输出就可以看到中文
utf-8转换到ANSI这个问题我也在做我是这样做的
utf-8 <---->unicode <------>ansi
/*
 * Decodes one UTF-8 sequence that has been packed into an integer, first byte
 * in the most significant position (e.g. the bytes E4 B8 AD are passed as
 * 0xE4B8AD), and returns the Unicode code point.
 *
 * utf8    - packed 1- to 4-byte UTF-8 sequence; only the low 32 bits are used.
 * returns - the decoded code point, or 0 when the value does not have the bit
 *           layout of a UTF-8 sequence (0 is also the result for input 0).
 *
 * Overlong encodings and surrogate code points are not rejected.
 */
unsigned int utf82unicode(long utf8){
    /* Use the low 32 bits as an unsigned value so that 4-byte sequences (top
       bit set) are handled the same whether long is 32 or 64 bits wide. */
    const unsigned long v = static_cast<unsigned long>(utf8) & 0xFFFFFFFFUL;

    if (v <= 0x7FUL) {                              /* 0xxxxxxx */
        return static_cast<unsigned int>(v);
    }
    if ((v & 0xFFFFE0C0UL) == 0x0000C080UL) {       /* 110xxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x00001F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    if ((v & 0xFFF0C0C0UL) == 0x00E08080UL) {       /* 1110xxxx 10xxxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x000F0000UL) >> 4)
                                        | ((v & 0x00003F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    if ((v & 0xF8C0C0C0UL) == 0xF0808080UL) {       /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        return static_cast<unsigned int>(((v & 0x07000000UL) >> 6)
                                        | ((v & 0x003F0000UL) >> 4)
                                        | ((v & 0x00003F00UL) >> 2)
                                        | (v & 0x0000003FUL));
    }
    return 0;                                       /* not a valid lead/continuation layout */
}
/*
 * Debug helper: prints `i` to standard output in hexadecimal (via cout) and
 * then as decimal, hexadecimal and binary text (via printf).
 *
 * Negative values are shown with a leading '-' in decimal and as their
 * unsigned bit pattern in hexadecimal and binary.
 */
void printNumber(long i){
    /* hex is a sticky stream flag: save and restore the flags so later cout
       output by the caller is not silently switched to hexadecimal. */
    const ios::fmtflags savedFlags = cout.flags();
    cout<<"In printNumber's input "<<hex<<i<<endl;
    cout.flags(savedFlags);

    const unsigned long bits = static_cast<unsigned long>(i);
    printf ("decimal: %ld\n", i);
    printf ("hexadecimal: %lx\n", bits);

    /* Build the binary digits from the least significant bit backwards;
       one char per bit plus the terminator always fits. */
    char buffer [sizeof(long)*8+1];
    char *digit = buffer + sizeof(buffer) - 1;
    *digit = '\0';
    unsigned long rest = bits;
    do {
        *--digit = static_cast<char>('0' + (rest & 1UL));
        rest >>= 1;
    } while (rest != 0);
    printf ("binary: %s\n", digit);
}
/*
 * Encodes one 16-bit code unit as UTF-8 and returns the bytes packed into an
 * integer, first byte in the most significant position (e.g. 0x4E2D yields
 * 0xE4B8AD).
 *
 * uni     - the value to encode (0x0000-0xFFFF).
 * returns - the packed 1-, 2- or 3-byte UTF-8 sequence.
 *
 * Because the parameter is 16 bits wide, a 4-byte sequence can never be
 * required; the former 4-byte and "not handled" branches were unreachable and
 * have been removed. Surrogate values are encoded like any other 3-byte value.
 */
unsigned int u2utf8(unsigned short uni)
{
    const unsigned int cp = uni;

    if (cp < 0x80u)                                  /* 0xxxxxxx */
    {
        return cp;
    }
    if (cp < 0x800u)                                 /* 110xxxxx 10xxxxxx */
    {
        return ((0xC0u | (cp >> 6)) << 8)
             | (0x80u | (cp & 0x3Fu));
    }
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    return ((0xE0u | (cp >> 12)) << 16)
         | ((0x80u | ((cp >> 6) & 0x3Fu)) << 8)
         | (0x80u | (cp & 0x3Fu));
}
哈哈!
utf82unicode没有实现全部的utf-8到unicode的转换
只做了四字节的utf-8到unicode的转换!
#4
我有个问题不明白,在char *str="汉"里面,用的是什么编码?是ASCII编码还是ANSI(GB2312)?我试了一下楼上的函数,从Unicode转换到ANSI时,得到的不是ANSI编码(BA BA),而是ASCII编码(-70 -70).
#5
UTF-8只是对UNICODE的一个编码方案,不要和UNICODE混淆了,UTF-8和UNICODE之间只是一个对照关系,所以编码解码都很容易。
UNICODE和本地编码方案(如GB2312)之间的转换可以通过标准的API来实现:
wcstombs
mbstowcs
使用这两个函数之前必须将你的LC_CTYPE设置成你想要的本地编码方案(使用 setlocale函数)
上面这些都是标准C++的实现方式,和使用WIN32 API:WideCharToMultiByte相比更加通用。
配合“ 回复人:woundedsoul(浪子) ”提供的UTF-8与UNICODE之间的转换(他实现得不完整,楼主要注意)
就可以在GB和UTF-8之间随意操作了:)
UNICODE和本地编码方案(如GB2312)之间的转换可以通过标准的API来实现:
wcstombs
mbstowcs
使用这两个函数之前必须将你的LC_CTYPE设置成你想要的本地编码方案(使用 setlocale函数)
上面这些都是标准C++的实现方式,和使用WIN32 API:WideCharToMultiByte相比更加通用。
配合“ 回复人:woundedsoul(浪子) ”提供的UTF-8与UNICODE之间的转换(他实现得不完整,楼主要注意)
就可以在GB和UTF-8之间随意操作了:)
#6
顺便帮你贴一个完整的UTF8到WCS的转换函数(SWI库里面的---实际上它是在APACHE的库里面的实现,不过是一个旧一些时候的版本了)
// Lookup tables for UTF-8 conversion.
//
// gUTFBytes
// A list of counts of trailing bytes for each initial byte in the input.
// Indexed by the value of the first byte of a sequence (0-255). Bytes
// 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3,
// 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
//
// gUTFOffsets
// A list of values to offset each result char type, according to how
// many source bytes went into making it. Indexed by the trailing-byte
// count (0-5); the entry is subtracted from the accumulated value after
// the bytes of a sequence have been summed and shifted.
//
// gFirstByteMark
// A list of values to mask onto the first byte of an encoded sequence,
// indexed by the number of bytes used to create the sequence.
// NOTE(review): not referenced by SWIutf8towcs; presumably used by the
// encoding direction (wide char -> UTF-8) - confirm.
static const char gUTFBytes[256] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
static const unsigned long gUTFOffsets[6] =
{ 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080 };
static const unsigned char gFirstByteMark[7] =
{ 0x00, 0x00, 0xC0, 0xE0,
0xF0, 0xF8, 0xFC };
// Converts the NUL-terminated UTF-8 string `src` into wide characters.
//
// src       - NUL-terminated UTF-8 input; its length is taken with strlen.
// dst       - output buffer; receives the decoded characters followed by L'\0'.
// maxdstlen - maximum number of characters stored before the terminator.
//             NOTE(review): the terminator is written after them, so dst
//             appears to need room for maxdstlen + 1 elements - confirm
//             against the callers.
//
// Returns SWIchar_SUCCESS when the whole input was converted,
// SWIchar_BUFFER_OVERFLOW when dst is full, SWIchar_FAIL when a sequence is
// cut short by the end of the input or decodes to a value above 0xFFFF, and
// SWIchar_ERROR for an unexpected trailing-byte count. On any failure dst is
// left without a terminator.
//
// Continuation bytes are not validated, and a stray byte in 0x80-0xBF is
// treated as a one-byte sequence (its gUTFBytes entry is 0).
SWIcharResult SWIutf8towcs( const unsigned char *src, wchar_t *dst, int maxdstlen )
{
  // Start and end of the input, and the first position we may not write to.
  const unsigned char* srcPtr = src;
  const unsigned char* srcEnd = src + strlen((const char *)src);
  wchar_t *dstEnd = dst + maxdstlen;

  // Loop until we run out of input data.
  while (srcPtr < srcEnd) {
    const unsigned char firstByte = *srcPtr;

    // Special-case ASCII, which is a leading byte value of <= 127.
    if (firstByte <= 127) {
      // Same capacity check as the multi-byte path below; without it a long
      // ASCII run would be written past the end of dst.
      if (dst >= dstEnd)
        return SWIchar_BUFFER_OVERFLOW;
      *dst++ = (wchar_t) firstByte;
      srcPtr++;
      continue;
    }

    // See how many trailing src bytes this sequence is going to require.
    const unsigned int trailingBytes = gUTFBytes[firstByte];

    // Not enough source bytes left for the whole sequence. >= because the
    // leading byte itself is implicitly counted.
    if (srcPtr + trailingBytes >= srcEnd)
      return SWIchar_FAIL;

    // Accumulate the raw bytes, shifting 6 bits per continuation byte; each
    // case deliberately falls through to the next.
    unsigned long tmpVal = 0;
    switch (trailingBytes) {
      case 5: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 4: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 3: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 2: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 1: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 0: tmpVal += *srcPtr++;
        break;
      default:
        return SWIchar_ERROR;
    }
    // Apply the per-length offset from gUTFOffsets to the accumulated value.
    tmpVal -= gUTFOffsets[trailingBytes];

    // If surrogate pairs would be required for 16-bit characters, fail.
    if (tmpVal & 0xFFFF0000)
      return SWIchar_FAIL;

    if (dst >= dstEnd)
      return SWIchar_BUFFER_OVERFLOW;
    *dst++ = (wchar_t)tmpVal;
  }

  *dst = L'\0';
  return SWIchar_SUCCESS;
}
// Lookup tables for UTF-8 conversion.
//
// gUTFBytes
// A list of counts of trailing bytes for each initial byte in the input.
// Indexed by the value of the first byte of a sequence (0-255). Bytes
// 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3,
// 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
//
// gUTFOffsets
// A list of values to offset each result char type, according to how
// many source bytes went into making it. Indexed by the trailing-byte
// count (0-5); the entry is subtracted from the accumulated value after
// the bytes of a sequence have been summed and shifted.
//
// gFirstByteMark
// A list of values to mask onto the first byte of an encoded sequence,
// indexed by the number of bytes used to create the sequence.
// NOTE(review): not referenced by SWIutf8towcs; presumably used by the
// encoding direction (wide char -> UTF-8) - confirm.
static const char gUTFBytes[256] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
static const unsigned long gUTFOffsets[6] =
{ 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080 };
static const unsigned char gFirstByteMark[7] =
{ 0x00, 0x00, 0xC0, 0xE0,
0xF0, 0xF8, 0xFC };
// Converts the NUL-terminated UTF-8 string `src` into wide characters.
//
// src       - NUL-terminated UTF-8 input; its length is taken with strlen.
// dst       - output buffer; receives the decoded characters followed by L'\0'.
// maxdstlen - maximum number of characters stored before the terminator.
//             NOTE(review): the terminator is written after them, so dst
//             appears to need room for maxdstlen + 1 elements - confirm
//             against the callers.
//
// Returns SWIchar_SUCCESS when the whole input was converted,
// SWIchar_BUFFER_OVERFLOW when dst is full, SWIchar_FAIL when a sequence is
// cut short by the end of the input or decodes to a value above 0xFFFF, and
// SWIchar_ERROR for an unexpected trailing-byte count. On any failure dst is
// left without a terminator.
//
// Continuation bytes are not validated, and a stray byte in 0x80-0xBF is
// treated as a one-byte sequence (its gUTFBytes entry is 0).
SWIcharResult SWIutf8towcs( const unsigned char *src, wchar_t *dst, int maxdstlen )
{
  // Start and end of the input, and the first position we may not write to.
  const unsigned char* srcPtr = src;
  const unsigned char* srcEnd = src + strlen((const char *)src);
  wchar_t *dstEnd = dst + maxdstlen;

  // Loop until we run out of input data.
  while (srcPtr < srcEnd) {
    const unsigned char firstByte = *srcPtr;

    // Special-case ASCII, which is a leading byte value of <= 127.
    if (firstByte <= 127) {
      // Same capacity check as the multi-byte path below; without it a long
      // ASCII run would be written past the end of dst.
      if (dst >= dstEnd)
        return SWIchar_BUFFER_OVERFLOW;
      *dst++ = (wchar_t) firstByte;
      srcPtr++;
      continue;
    }

    // See how many trailing src bytes this sequence is going to require.
    const unsigned int trailingBytes = gUTFBytes[firstByte];

    // Not enough source bytes left for the whole sequence. >= because the
    // leading byte itself is implicitly counted.
    if (srcPtr + trailingBytes >= srcEnd)
      return SWIchar_FAIL;

    // Accumulate the raw bytes, shifting 6 bits per continuation byte; each
    // case deliberately falls through to the next.
    unsigned long tmpVal = 0;
    switch (trailingBytes) {
      case 5: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 4: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 3: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 2: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 1: tmpVal += *srcPtr++; tmpVal <<= 6; /* fall through */
      case 0: tmpVal += *srcPtr++;
        break;
      default:
        return SWIchar_ERROR;
    }
    // Apply the per-length offset from gUTFOffsets to the accumulated value.
    tmpVal -= gUTFOffsets[trailingBytes];

    // If surrogate pairs would be required for 16-bit characters, fail.
    if (tmpVal & 0xFFFF0000)
      return SWIchar_FAIL;

    if (dst >= dstEnd)
      return SWIchar_BUFFER_OVERFLOW;
    *dst++ = (wchar_t)tmpVal;
  }

  *dst = L'\0';
  return SWIchar_SUCCESS;
}
#7
再BTW:将ANSI和GB2312混为一谈的话,在文 化 大 革 命时期是要抓出去砍头的,楼主犯了严重的政治错位,请google一下ANSI到底是什么,GB2312又是什么。
#8
在 C++ 中,将字符与字节进行转换的方法是:mbstowcs
如果是在 Visual C++ 中,那么更好的方法是:MultiByteToWideChar
楼主可以将
ANSI(gb2312等,char[]) => UNICODE 字符串(wchar_t[])
再将
UNICODE(wchar_t[]) => utf8(char[])
(utf8 很简单,因此这一步操作也可以自己写代码完成)
关于概念,推荐文章:
http://www.regexlab.com/zh/encoding.htm
关于 utf8 的格式,推荐文章:
http://www.nk975.com/sswater/myref/index.asp?id=20
如果是在 Visual C++ 中,那么更好的方法是:MultiByteToWideChar
楼主可以将
ANSI(gb2312等,char[]) => UNICODE 字符串(wchar_t[])
再将
UNICODE(wchar_t[]) => utf8(char[])
(utf8 很简单,因此这一步操作也可以自己写代码完成)
关于概念,推荐文章:
http://www.regexlab.com/zh/encoding.htm
关于 utf8 的格式,推荐文章:
http://www.nk975.com/sswater/myref/index.asp?id=20