我们native层配置文件用了两套编码,unicode和gbk2312,都是两个字节表示中文。要统一采用gbk编码,顺便熟悉一下gbk。
python3.3的测试
为方便起见,使用python做测试。python3默认的字符串类str采用unicode,s = '中文' 等价于 s = u'中文'。
dp_gbk_bs将字符串转化成gbk编码,并输出每个中文对应的unsigned short。
wd_2_gbk_val 将单个字符转化成gbk编码的unsigned short值。
gbk_val_2_wd 将gbk编码下的一个short数值还原成中文字符串。
# function 0
# Print the GBK code value of every character in a string.
def dp_gbk_bs(name):
    '''
    Dump a string as GBK and print one integer value per character.

    The whole string is first encoded to GBK and printed as bytes. Then,
    for each character, its GBK bytes are read as a little-endian unsigned
    integer and printed next to the character: a two-byte character gives
    an unsigned short (e.g. 中 = 53462, 0xd0d6), a single-byte (ASCII)
    character gives its byte value.

    name: the str to dump.
    Returns None; all output goes to stdout.
    Raises UnicodeEncodeError if name contains a character GBK cannot encode.
    '''
    # Local import: the surrounding file never imports struct.
    import struct

    gbk_bs = name.encode('gbk')
    print(name, "(gbk2312) = ", gbk_bs)

    # Decode character by character instead of assuming two bytes per
    # character, so strings mixing ASCII and Chinese do not break unpack().
    for ch in name:
        ch_bs = ch.encode('gbk')
        # '<' pins little-endian so the values do not depend on the host CPU.
        fmt = '<B' if len(ch_bs) == 1 else '<H'
        print(ch, " = ", struct.unpack(fmt, ch_bs)[0])
# function 1
def wd_2_gbk_val(wd):
    '''
    Convert a single character to the integer value of its GBK bytes.

    The GBK bytes are read as a little-endian unsigned integer, so a
    two-byte character yields an unsigned short and a single-byte (ASCII)
    character yields its byte value.

    Example: 中 = 53462 , 0xd0d6

    wd: a str holding exactly one character.
    Returns the int value.
    Raises struct.error if wd does not encode to one or two GBK bytes
    (e.g. empty string or more than one Chinese character), and
    UnicodeEncodeError if GBK cannot encode it.
    '''
    # Local import: the surrounding file never imports struct.
    import struct

    gbk_bt = wd.encode('gbk')
    # '<' pins little-endian so the documented value 0xd0d6 holds on any host.
    fmt = '<B' if len(gbk_bt) == 1 else '<H'
    return struct.unpack(fmt, gbk_bt)[0]
# Demo: dump the GBK code value of every character of a sample string.
ss = '中国技术交易大厦'
dp_gbk_bs(ss)
# function 2
# unsigned short -> little-endian bytes -> str (decoded as GBK)
def gbk_val_2_wd(us_v):
    '''
    Convert an unsigned short value back to the GBK character it encodes.

    Example: 53462 , 0xd0d6 => 中

    The value is laid out as little-endian bytes (low byte first), which
    is the byte order of the GBK sequence; values below 0x100 are treated
    as a single byte. The intermediate hex string, the byte-swapped hex
    string and the raw bytes are printed to stdout as a trace.

    us_v: int in the range 0..0xFFFF.
    Returns the decoded str.
    Raises ValueError if us_v is outside 0..0xFFFF and UnicodeDecodeError
    if the bytes are not a valid GBK sequence.
    '''
    if not 0 <= us_v <= 0xFFFF:
        raise ValueError('us_v must fit in an unsigned short: %r' % (us_v,))

    print('hex str = ', hex(us_v))

    # to_bytes avoids slicing the hex() text, which has no leading zero and
    # therefore breaks on values whose hex form has an odd number of digits.
    width = 1 if us_v < 0x100 else 2
    bys = us_v.to_bytes(width, 'little')

    print('reversed hex str = ', ''.join('%02x' % b for b in bys))
    print('bytes = ', bys)
    return bys.decode('gbk')
# Round-trip demo: convert every character to its GBK value, print the
# collected values, then convert each value back to a character.
string = '中国技术交易大厦'
val_l = []
for ss in string:
    val = wd_2_gbk_val(ss)
    print(ss, " = ", val)
    val_l.append(val)
print(val_l)
for val in val_l:
    print('------------------')
    print(val, " = ", gbk_val_2_wd(val))
中 = 53462
国 = 64185
技 = 48316
术 = 62922
交 = 48061
易 = 55250
大 = 62388
厦 = 50127
[53462, 64185, 48316, 62922, 48061, 55250, 62388, 50127]
------------------
hex str = 0xd0d6
reversed hex str = d6d0
bytes = b'\xd6\xd0'
53462 = 中
------------------
hex str = 0xfab9
reversed hex str = b9fa
bytes = b'\xb9\xfa'
64185 = 国
------------------
hex str = 0xbcbc
reversed hex str = bcbc
bytes = b'\xbc\xbc'
48316 = 技
------------------
hex str = 0xf5ca
reversed hex str = caf5
bytes = b'\xca\xf5'
62922 = 术
------------------
hex str = 0xbbbd
reversed hex str = bdbb
bytes = b'\xbd\xbb'
48061 = 交
------------------
hex str = 0xd7d2
reversed hex str = d2d7
bytes = b'\xd2\xd7'
55250 = 易
------------------
hex str = 0xf3b4
reversed hex str = b4f3
bytes = b'\xb4\xf3'
62388 = 大
------------------
hex str = 0xc3cf
reversed hex str = cfc3
bytes = b'\xcf\xc3'
50127 = 厦
C的测试
/* Shows how the 16-bit GBK value of '中' is laid out in memory. */
char buf[2] = {0};
/* unsigned: 53462 (0xd0d6) does not fit in a signed 16-bit short. */
unsigned short word = 53462;
/* Copy the two raw bytes of word. Viewed as char (where plain char is
 * signed) buf is [-42, -48]: 0xd6 is the two's-complement pattern of -42
 * and 0xd0 that of -48. Byte order in memory: 0xd6 0xd0. */
memcpy(buf, &word, 2);
/* Little-endian storage: the low byte sits at the lower address. */
unsigned char* byte_ptr = (unsigned char*)&word;
unsigned char byte0 = byte_ptr[0]; /* 214 == 0xd6, low byte as unsigned */
unsigned char byte1 = byte_ptr[1]; /* 208 == 0xd0, high byte as unsigned */
unsigned short word存储汉字‘中’的gbk2312码值,其内容拷贝到char buf数组中,buf内容为负数,buf[0]为word的低字节,buf[1]为高字节。通过byte_ptr指针取得内容为正数。
0xd0d6,word变量内存结构:
内存高地址 | 内存 低地址
d0 | d6
-48 | -42 // signed char,对应的负数补码
208 | 214 // unsigned char 对应的正数原码
buf数组内存结构
buf[0] | buf[1]
取低地址, 取高地址
-42 | -48
Unicode2GBK函数:
/**
 * Convert a NUL-terminated wide-character string to a newly allocated
 * multi-byte string using Windows code page 54936 (GB18030).
 *
 * NOTE(review): the function name says GBK but the code page used is
 * GB18030; GB18030 is designed as a superset of GBK, so characters covered
 * by GBK should produce the same bytes - confirm if strict GBK is required.
 *
 * @param pUnicode  NUL-terminated source string; must not be NULL.
 * @param ppDest    Receives the converted, NUL-terminated string on success.
 *                  The buffer is allocated with new[] and must be released
 *                  by the caller with delete[]. Not modified on failure.
 * @return 0 on success, -1 on failure (NULL argument or conversion error).
 */
int Unicode2GBK( wchar_t *pUnicode, char** ppDest)
{
#ifndef CODE_PAGE_GB18030
#define CODE_PAGE_GB18030 54936
#endif
    if ( pUnicode == 0 || ppDest == 0 )
    {
        return -1;
    }

    // First pass with no output buffer: returns the required size in bytes.
    // Because the input length is -1, that size includes the terminator.
    const int size = ::WideCharToMultiByte( CODE_PAGE_GB18030, 0,
                                            pUnicode, -1, 0, 0, 0, 0 );
    if ( size <= 0 )
    {
        return -1;
    }

    // The trailing () value-initialises the array, so the whole buffer is
    // zero-filled (the old memset only cleared sizeof(char*) bytes).
    char* pDestString = new char[size + 2]();

    // Second pass: do the actual conversion into the buffer.
    const int ret = ::WideCharToMultiByte( CODE_PAGE_GB18030, 0, pUnicode, -1,
                                           pDestString, size, 0, 0 );
    if ( ret == 0 )
    {
        delete[] pDestString;  // allocated with new[], so release with delete[]
        return -1;
    }

    *ppDest = pDestString;
    return 0;
}
refer:windows下 unicode转化成gbk: http://hi.baidu.com/zhangweijiqn/item/e2ca4c1acfcb42d4bf904284