常用字库编码的转换(Unicode,Utf8,Big5,Gb18030)

时间:2020-12-09 17:31:08

 

/*
 * Encode a zero-terminated sequence of code units as UTF-8.
 *
 * src   : input units; reading stops at the first unit equal to 0.
 * putf8 : output buffer. No capacity is passed in, so the caller must
 *         provide room for up to 3 bytes per input unit plus 1 byte for
 *         the terminating 0 that is always appended.
 *
 * Returns the number of bytes written, not counting the terminating 0.
 *
 * Every unit is encoded on its own as a 1-, 2- or 3-byte sequence;
 * surrogate pairs are not combined into 4-byte sequences.
 */
int UnicodeToUtf8(LPUNC src, BYTE* putf8)
{
    int len = 0;

    while (*src) {
        if (*src < 0x80) {
            /* 0xxxxxxx */
            putf8[len++] = (BYTE)*src;
        } else if (*src < 0x800) {
            /* 110xxxxx 10xxxxxx: 11 payload bits, top 5 go in the lead byte. */
            putf8[len++] = (BYTE)(0xC0 | (*src >> 6));
            putf8[len++] = (BYTE)(0x80 | (*src & 0x3F));
        } else {
            /* 1110xxxx 10xxxxxx 10xxxxxx: 16 payload bits, top 4 in the lead. */
            putf8[len++] = (BYTE)(0xE0 | ((*src >> 12) & 0x0F));
            putf8[len++] = (BYTE)(0x80 | ((*src >> 6) & 0x3F));
            putf8[len++] = (BYTE)(0x80 | (*src & 0x3F));
        }
        src++;
    }
    putf8[len] = 0;
    return len;
}

/*
 * Decode a single UTF-8 sequence of one, two or three bytes at src.
 *
 * src      : first byte of the sequence.
 * punicode : receives the decoded value; left untouched on failure.
 *
 * Returns the number of bytes consumed (1, 2 or 3), or 0 when the bytes
 * do not match any of the three recognised patterns.
 *
 * Continuation bytes are inspected only after the lead byte has matched,
 * and in order, so src[2] is never read unless src[1] is a valid
 * continuation byte. Overlong forms are not rejected.
 */
int Utf8ToUnicode(BYTE* src, LPUNC punicode)
{
    const BYTE lead = src[0];

    /* 0xxxxxxx: single byte, value copied through. */
    if ((lead & 0x80) == 0) {
        *punicode = (UNC)lead;
        return 1;
    }

    /* 110xxxxx 10xxxxxx: two bytes. */
    if ((lead & 0xE0) == 0xC0) {
        if ((src[1] & 0xC0) != 0x80) {
            return 0; /* error */
        }
        *punicode = (UNC)((((UNC)lead & 0x001F) << 6) |
                          ((UNC)src[1] & 0x003F));
        return 2;
    }

    /* 1110xxxx 10xxxxxx 10xxxxxx: three bytes. */
    if ((lead & 0xF0) == 0xE0) {
        if ((src[1] & 0xC0) != 0x80) {
            return 0; /* error */
        }
        if ((src[2] & 0xC0) != 0x80) {
            return 0; /* error */
        }
        *punicode = (UNC)((((UNC)lead & 0x000F) << 12) |
                          (((UNC)src[1] & 0x003F) << 6) |
                          ((UNC)src[2] & 0x003F));
        return 3;
    }

    return 0; /* error */
}

/*
 * Map one Big5 code to its Unicode value.
 *
 * big5_unicode_tbl is read as consecutive pairs: element 2*k is a Big5
 * code and element 2*k+1 is the matching Unicode value. The binary search
 * below requires the pairs to be sorted ascending by Big5 code.
 *
 * Returns the mapped value, the input itself for values below 0x80, or
 * 0x0000 when the table pointer is NULL, the code lies outside
 * 0xA140..0xF9FE, or no pair matches.
 */
UNC Big5ToUnicode(WORD big5)
{
    int first = 0;
    int last = 13502; /* highest pair index the search will probe */

    if (big5_unicode_tbl == NULL) {
        return 0x0000;
    }
    if (big5 < 0x80) {
        return big5;
    }
    if (big5 < 0xA140 || big5 > 0xF9FE) {
        return 0x0000;
    }

    while (first <= last) {
        int probe = first + (last - first) / 2;
        WORD key = big5_unicode_tbl[probe * 2];

        if (key == big5) {
            return big5_unicode_tbl[probe * 2 + 1];
        }
        if (key > big5) {
            last = probe - 1;
        } else {
            first = probe + 1;
        }
    }
    return 0x0000;
}

/*
 * Map one Unicode value to its Big5 code.
 *
 * unicode_big5_tbl is read as consecutive pairs: element 2*k is a Big5
 * code and element 2*k+1 is the matching Unicode value. The binary search
 * below requires the pairs to be sorted ascending by Unicode value.
 *
 * Returns the mapped code, the input itself for values below 0x80, or
 * 0x0 when the table pointer is NULL or no pair matches.
 */
WORD UnicodeToBig5(UNC unicode)
{
    int first = 0;
    int last = 13502; /* highest pair index the search will probe */

    if (unicode_big5_tbl == NULL) {
        return 0x0000;
    }
    if (unicode < 0x80) {
        return unicode;
    }

    while (first <= last) {
        int probe = first + (last - first) / 2;
        UNC key = unicode_big5_tbl[probe * 2 + 1];

        if (key == unicode) {
            return unicode_big5_tbl[probe * 2];
        }
        if (key > unicode) {
            last = probe - 1;
        } else {
            first = probe + 1;
        }
    }
    return 0x0;
}

/*
 * Map one single- or double-byte GB18030 code to its Unicode value.
 *
 * gb18030_unicode_tbl is read as consecutive pairs: element 2*k is a
 * GB18030 code and element 2*k+1 is the matching Unicode value. The
 * binary search below requires the pairs to be sorted ascending by
 * GB18030 code.
 *
 * Returns the mapped value; the input itself for values below 0x80;
 * 0x20AC for the value 0x80; or 0x0000 when the table pointer is NULL,
 * the code lies outside 0x8140..0xFE4F, or no pair matches.
 */
UNC Gb18030ToUnicode(WORD gb18030)
{
    int first = 0;
    int last = 21790; /* highest pair index the search will probe */

    if (gb18030_unicode_tbl == NULL) {
        return 0x0000;
    }
    if (gb18030 < 0x80) {
        return gb18030;
    }
    if (gb18030 == 0x80) {
        return 0x20AC;
    }
    if (gb18030 < 0x8140 || gb18030 > 0xFE4F) {
        return 0x0000;
    }

    while (first <= last) {
        int probe = first + (last - first) / 2;
        WORD key = gb18030_unicode_tbl[probe * 2];

        if (key == gb18030) {
            return gb18030_unicode_tbl[probe * 2 + 1];
        }
        if (key > gb18030) {
            last = probe - 1;
        } else {
            first = probe + 1;
        }
    }
    return 0x0000;
}

/*
 * Map one Unicode value to its single- or double-byte GB18030 code.
 *
 * unicode_gb18030_tbl is read as consecutive pairs: element 2*k is a
 * GB18030 code and element 2*k+1 is the matching Unicode value. The
 * binary search below requires the pairs to be sorted ascending by
 * Unicode value.
 *
 * Returns the mapped code; the input itself for values below 0x80;
 * 0x80 for the value 0x20AC; or 0x0 when the table pointer is NULL or
 * no pair matches.
 */
WORD UnicodeToGb18030(UNC unicode)
{
    int first = 0;
    int last = 21790; /* highest pair index the search will probe */

    if (unicode_gb18030_tbl == NULL) {
        return 0x0000;
    }
    if (unicode < 0x80) {
        return unicode;
    }
    if (unicode == 0x20AC) {
        return 0x80;
    }

    while (first <= last) {
        int probe = first + (last - first) / 2;
        UNC key = unicode_gb18030_tbl[probe * 2 + 1];

        if (key == unicode) {
            return unicode_gb18030_tbl[probe * 2];
        }
        if (key > unicode) {
            last = probe - 1;
        } else {
            first = probe + 1;
        }
    }
    return 0x0;
}


// Convert a zero-terminated UTF-8 string.
// Parameters:
//   src      - source string
//   dest     - destination buffer
//   dest_len - capacity of dest, counted in characters
// Return value: index of the last slot processed plus one. When the
//   terminator was reached this is the number of characters stored,
//   terminator included. When dest filled up first, the result is
//   dest_len + 1 and dest is NOT zero-terminated.
// Bytes that Utf8ToUnicode cannot decode are replaced by UNKNOWN_CHAR and
// skipped one at a time; if that happened at least once, a diagnostic dump
// is printed after the loop.
int utf8_to_unicode(BYTE* src, LPUNC dest, int dest_len)
{
    /* Explicit cast: src is BYTE*, the dump below is handed a char*. */
    char* old_src = (char *)src;
    int err = 0;
    int i = 0;

    while (i < dest_len) {
        int len = Utf8ToUnicode(src, &dest[i]);
        if (0 == len) {
            /* Undecodable byte: substitute and resynchronise on the next byte. */
            err = 1;
            len = 1;
            dest[i] = UNKNOWN_CHAR;
        }
        src += len;
        if (0x0000 == dest[i]) {
            break; /* string terminator */
        }
        i++;
    }
    if (err) {
        printf("utf8 string err! ");T();
        DUMP_BUFFER("---------------",old_src,20);
        DUMP_BUFFER("===============",(char *)dest,20);
    }
    return i + 1;
}

/*
 * Convert a zero-terminated UTF-8 string and report how many bytes could
 * not be decoded.
 *
 * src        : source string.
 * dest       : destination buffer.
 * dest_len   : capacity of dest, counted in characters.
 * perr_count : optional; when not NULL it is reset to 0 and then
 *              incremented once for every byte replaced by UNKNOWN_CHAR.
 *
 * Returns the index of the last slot processed plus one. When the
 * terminator was reached this is the number of characters stored,
 * terminator included. When dest filled up first, the result is
 * dest_len + 1 and dest is NOT zero-terminated.
 *
 * If at least one byte was undecodable, a diagnostic dump is printed
 * after the loop.
 */
int utf8_to_unicode2(BYTE* src, LPUNC dest, int dest_len, int*

perr_count)
{
    /* Explicit cast: src is BYTE*, the dump below is handed a char*. */
    char* old_src = (char *)src;
    int err = 0;
    int i = 0;

    /* The counter is optional, so guard the reset the same way as the increment. */
    if (NULL != perr_count) {
        *perr_count = 0;
    }

    while (i < dest_len) {
        int len = Utf8ToUnicode(src, &dest[i]);
        if (0 == len) {
            /* Undecodable byte: substitute and resynchronise on the next byte. */
            err = 1;
            len = 1;
            dest[i] = UNKNOWN_CHAR;
            if (NULL != perr_count) {
                (*perr_count)++;
            }
        }
        src += len;
        if (0x0000 == dest[i]) {
            break; /* string terminator */
        }
        i++;
    }
    if (err) {
        printf("utf8 string err! ");T();
        DUMP_BUFFER("---------------",old_src,20);
        DUMP_BUFFER("===============",(char *)dest,20);
    }
    return i + 1;
}

// Convert a zero-terminated GB18030 string (single- and double-byte codes).
// Parameters:
//   src      - source string
//   dest     - destination buffer
//   dest_len - capacity of dest, counted in characters
// Return value: index of the last slot processed plus one. When the
//   terminator was reached this is the number of characters stored,
//   terminator included. When dest filled up first, the result is
//   dest_len + 1 and dest is NOT zero-terminated.
// NOTE(review): a double-byte code for which Gb18030ToUnicode returns 0
//   is indistinguishable from the terminator and ends the conversion
//   early - confirm whether a replacement character is wanted instead.
int gb18030_to_unicode(BYTE* src, LPUNC dest, int dest_len)
{
    int i = 0;

    while (i < dest_len) {
        if (*src < 0x80) {
            /* Single byte below 0x80: copied through unchanged. */
            dest[i] = (UNC)(*src);
            src += 1;
        } else if (*src == 0x80) {
            /*
             * 0x80 is a single-byte code that Gb18030ToUnicode maps on its
             * own and that UnicodeToGb18030 produces. It must not be paired
             * with the following byte, otherwise that byte is swallowed and
             * the lookup fails.
             */
            dest[i] = Gb18030ToUnicode(0x80);
            src += 1;
        } else {
            /* Double-byte code: combine the current byte with the next one. */
            WORD ch = MAKEWORD(*(src + 1), *src);
            dest[i] = Gb18030ToUnicode(ch);
            src += 2;
        }
        if (0x0000 == dest[i]) {
            break; /* string terminator */
        }
        i++;
    }
    return i + 1;
}