字符串编码问题学习

我们native层配置文件用了两套编码，unicode和gbk2312，都是两个字节表示中文。要统一采用gbk编码，顺便熟悉一下gbk。

python3.3的测试

为方便起见，使用python做测试。python3默认的字符串类str采用unicode，s = '中文' 等价于 s = u'中文'。

dp_gbk_bs将字符串转化成gbk编码，并输出每个中文对应的unsigned short。

wd_2_gbk_val 将单个字符转化成gbk编码的unsigned short值。

gbk_val_2_wd 将gbk编码下的一个short数值还原成中文字符串。

# function 0
# Print the GBK code value of every character in a string.
def dp_gbk_bs(name):
    '''
    Dump a string as GBK bytes and print one integer code per character.

    The whole string is encoded with the 'gbk' codec and printed first.
    Then every character is encoded on its own and its bytes are read as a
    little-endian unsigned integer, e.g. '中' -> bytes d6 d0 -> 53462
    (0xd0d6).

    Encoding per character (instead of assuming two bytes for every
    character) keeps characters and values aligned when the string also
    contains single-byte ASCII characters.  The byte order is fixed to
    little-endian so the printed values do not depend on the host platform.

    name: the str to dump.
    Returns None; the result is only printed.
    Raises UnicodeEncodeError if a character cannot be encoded as GBK.
    '''
    gbk_bs = str.encode(name, 'gbk')
    print( name, "(gbk2312) = ", gbk_bs)

    for ch in name:
        # One or two GBK bytes per character; the first byte is the low byte.
        val = int.from_bytes(ch.encode('gbk'), 'little')
        print(ch, " = ", val)

# function 1
def wd_2_gbk_val(wd):
    '''
    Return the GBK code of one character as an unsigned integer.

    The character is encoded with the 'gbk' codec and the resulting bytes
    are read in little-endian order, so the first GBK byte becomes the low
    byte of the result:
    中  =  53462 ,  0xd0d6   (GBK bytes d6 d0)

    A single-byte (ASCII) character yields its byte value, e.g. 'A' -> 65.
    The byte order is explicit, so the value is the same on every platform.

    wd: a str that encodes to one or two GBK bytes.
    Raises ValueError if wd does not encode to exactly 1 or 2 bytes, and
    UnicodeEncodeError if wd cannot be encoded as GBK.
    '''
    gbk_bt = wd.encode('gbk')
    if not 1 <= len(gbk_bt) <= 2:
        raise ValueError(
            'expected 1 or 2 GBK bytes, got %d' % len(gbk_bt))
    return int.from_bytes(gbk_bt, 'little')

# Demo: print the GBK bytes and the per-character code values of a sample string.
ss = u'中国技术交易大厦'
dp_gbk_bs(ss)

# function 2
# Integer -> hex string -> bytes -> str decoded as GBK.
def gbk_val_2_wd(us_v):
    '''
    Turn an unsigned integer GBK code back into its character.
    53462 ,  0xd0d6  => 中

    The value is written as hex, split into byte pairs, and the pairs are
    reversed because the value holds the first GBK byte in its low byte
    (little-endian).  The resulting bytes are decoded with the 'gbk' codec.
    Each intermediate step is printed for inspection.

    us_v: non-negative int, e.g. a value produced by reading GBK bytes as a
          little-endian unsigned short.
    Returns the decoded str.
    Raises UnicodeDecodeError if the bytes are not valid GBK.
    '''
    hex_s = hex(us_v)   # hex text including the '0x' prefix
    print('hex str = ', hex_s)

    digits = hex_s[2:]  # skip '0x' header
    if len(digits) % 2:
        # hex() does not zero-pad; without this, odd-length output such as
        # 'a' or '141' would be split on the wrong byte boundaries.
        digits = '0' + digits

    hex_list = [digits[i:i + 2] for i in range(0, len(digits), 2)]

    # Swap endianness: emit the byte pairs in reverse order.
    res = ''.join(reversed(hex_list))

    print('reversed hex str = ', res)
    bys = bytes.fromhex(res)
    print('bytes = ', bys)
    return bys.decode('gbk')

# Round-trip demo: character -> GBK code value -> character.
string = '中国技术交易大厦'

# Code value of every character, in order.
val_l = [wd_2_gbk_val(ch) for ch in string]
for ss, val in zip(string, val_l):
    print( ss, " = ", val)

print(val_l)

# Convert every value back; gbk_val_2_wd prints its own intermediate steps
# before the result line is printed.
for val in val_l:
    print('------------------')
    print(val, " = ", gbk_val_2_wd(val))

输出结果：

中  =  53462
国  =  64185
技  =  48316
术  =  62922
交  =  48061
易  =  55250
大  =  62388
厦  =  50127
[53462, 64185, 48316, 62922, 48061, 55250, 62388, 50127]
------------------
hex str =  0xd0d6
reversed hex str =  d6d0
bytes =  b'\xd6\xd0'
53462  =  中
------------------
hex str =  0xfab9
reversed hex str =  b9fa
bytes =  b'\xb9\xfa'
64185  =  国
------------------
hex str =  0xbcbc
reversed hex str =  bcbc
bytes =  b'\xbc\xbc'
48316  =  技
------------------
hex str =  0xf5ca
reversed hex str =  caf5
bytes =  b'\xca\xf5'
62922  =  术
------------------
hex str =  0xbbbd
reversed hex str =  bdbb
bytes =  b'\xbd\xbb'
48061  =  交
------------------
hex str =  0xd7d2
reversed hex str =  d2d7
bytes =  b'\xd2\xd7'
55250  =  易
------------------
hex str =  0xf3b4
reversed hex str =  b4f3
bytes =  b'\xb4\xf3'
62388  =  大
------------------
hex str =  0xc3cf
reversed hex str =  cfc3
bytes =  b'\xcf\xc3'
50127  =  厦

C的测试

char buf[2] = {0};
unsigned short word = 53462; // GBK code of '中' (0xd0d6); unsigned because 53462 does not fit in a signed 16-bit short
memcpy(buf, &word, 2); // buf = [-42, -48] when char is signed: 0xd6 is the two's complement of -42, 0xd0 of -48
// byte order in memory on a little-endian machine: 0xd6 0xd0
unsigned char* byte_ptr = (unsigned char*)&word; // view the same two bytes as unsigned (little-endian storage)
unsigned char byte0 = byte_ptr[0]; // 214 (0xd6): low byte of word, read as an unsigned value
unsigned char byte1 = byte_ptr[1]; // 208 (0xd0): high byte of word, read as an unsigned value

unsigned short word存储汉字‘中’的gbk2312码值，其内容拷贝到char buf数组中，buf内容为负数，buf[0]为word的低字节，buf[1]为高字节。通过byte_ptr指针取得内容为正数。

0xd0d6，word变量内存结构：

内存高地址 | 内存低地址

d0 | d6

-48 | -42 // signed char，对应的负数补码

208 | 214 // unsigned char 对应的正数原码

buf数组内存结构

buf[0] | buf[1]

取低地址 | 取高地址

-42 | -48

Unicode2GBK函数：

/**
  * Convert a null-terminated wide-character string to a multibyte string
  * using Windows code page 54936 (GB18030).
  *
  * NOTE(review): the function name says GBK, but the code page used is
  * GB18030 (54936), not GBK (936) - confirm this is intended.
  *
  * @param pUnicode  null-terminated wide string to convert; must not be null.
  * @param ppDest    receives a newly allocated, null-terminated buffer on
  *                  success; must not be null.  The buffer is allocated with
  *                  new[] and the caller must release it with delete[].
  *                  Left untouched on failure.
  * @return 0 on success, -1 on failure (null argument or conversion error).
  */
int Unicode2GBK( wchar_t *pUnicode, char** ppDest)
{
#ifndef CODE_PAGE_GB18030
#define CODE_PAGE_GB18030 54936
#endif
    // Reject null arguments before touching them; *ppDest is written below.
    if ( pUnicode == 0 || ppDest == 0 )
    {
        return -1;
    }

    // First pass with no output buffer: ask for the required size in bytes.
    // The input length is -1 (null-terminated input).
    const int size = ::WideCharToMultiByte( CODE_PAGE_GB18030, 0/* you can do more for it*/,
                                            pUnicode, -1, 0, 0, 0, 0 );
    if ( size == 0 )
    {
        return -1;
    }

    // "()" value-initialises every byte to zero.  This replaces a memset that
    // used sizeof(pointer) and therefore cleared only the first few bytes.
    char* pDestString = new char[size + 2]();

    // Second pass: perform the conversion into the buffer.
    const int ret = ::WideCharToMultiByte( CODE_PAGE_GB18030, 0, pUnicode, -1,
                                           pDestString, size, 0, 0 );
    if ( ret == 0 )
    {
        delete[] pDestString;   // allocated with new[], so release with delete[]
        return -1;
    }

    *ppDest = pDestString;
    return 0;
}

refer：windows下 unicode转化成gbk： http://hi.baidu.com/zhangweijiqn/item/e2ca4c1acfcb42d4bf904284

秒客网

字符串编码问题学习

python3.3的测试

C的测试

0xd0d6，word变量内存结构：

buf数组内存结构

相关文章