unicode, ANSI, utf-8字符集之间的转换 C，C++

// Converts a multi-byte string (system ANSI code page, CP_ACP) to UTF-8.
//
// pu8  [out] receives the UTF-8 bytes; emptied if the final conversion fails.
// pmb  [in]  source multi-byte characters.
// mLen [in]  source length in bytes, forwarded unchanged to MultiByteToWideChar.
//
// Returns true on success; false if either conversion step fails or the
// intermediate wide-character buffer cannot be allocated.
bool MBToUTF8(vector<char>& pu8, const char* pmb, int mLen)
{
	// Step 1: multi-byte -> wide char. The first call only reports the size.
	int nLen = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, NULL, 0);
	if (nLen <= 0)
	{
		return false;
	}

	// The vector owns the intermediate buffer, so no return path can leak it.
	vector<WCHAR> wide;
	try
	{
		wide.resize(nLen);
	}
	catch (const bad_alloc&)
	{
		return false;
	}

	if (MultiByteToWideChar(CP_ACP, 0, pmb, mLen, &wide[0], nLen) != nLen)
	{
		return false;
	}

	// Step 2: wide char -> UTF-8, again sizing first.
	int utf8Len = WideCharToMultiByte(CP_UTF8, 0, &wide[0], nLen, NULL, 0, NULL, NULL);
	if (utf8Len <= 0)
	{
		return false;
	}
	pu8.resize(utf8Len);
	if (WideCharToMultiByte(CP_UTF8, 0, &wide[0], nLen, &pu8[0], utf8Len, NULL, NULL) != utf8Len)
	{
		pu8.clear();
		return false;
	}
	return true;
}

// Converts a UTF-8 string to the system ANSI code page (CP_ACP).
//
// pmb     [out] receives the multi-byte result; emptied if the final
//               conversion fails.
// pu8     [in]  source UTF-8 bytes.
// utf8Len [in]  source length in bytes, forwarded unchanged to
//               MultiByteToWideChar.
//
// Returns true on success; false if either conversion step fails or the
// intermediate wide-character buffer cannot be allocated.
bool UTF8ToMB(vector<char>& pmb, const char* pu8, int utf8Len)
{
	// Step 1: UTF-8 -> wide char. The first call only reports the size.
	int nLen = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, NULL, 0);
	if (nLen <= 0)
	{
		return false;
	}

	// The vector owns the intermediate buffer, so no return path can leak it.
	vector<WCHAR> wide;
	try
	{
		wide.resize(nLen);
	}
	catch (const bad_alloc&)
	{
		return false;
	}

	if (MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, &wide[0], nLen) != nLen)
	{
		return false;
	}

	// Step 2: wide char -> multi-byte, again sizing first.
	int MBLen = WideCharToMultiByte(CP_ACP, 0, &wide[0], nLen, NULL, 0, NULL, NULL);
	if (MBLen <= 0)
	{
		return false;
	}
	pmb.resize(MBLen);
	if (WideCharToMultiByte(CP_ACP, 0, &wide[0], nLen, &pmb[0], MBLen, NULL, NULL) != MBLen)
	{
		pmb.clear();
		return false;
	}
	return true;
}

// Converts a multi-byte string (CP_ACP) to wide characters.
// pun receives the result; pmb/mLen are handed to MultiByteToWideChar as-is.
// Returns true only when the second call writes exactly the number of
// characters the sizing call reported.
bool MBToUnicode(vector<wchar_t>& pun, const char* pmb, int mLen)
{
	// Sizing pass: no output buffer, so the API returns the required count.
	const int required = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, NULL, 0);
	if (required > 0)
	{
		pun.resize(required);

		const int written = MultiByteToWideChar(CP_ACP, 0, pmb, mLen, &pun[0], required);
		if (written == required)
		{
			return true;
		}
		// Partial or failed conversion: do not hand back a half-filled buffer.
		pun.clear();
	}
	return false;
}

// Converts wide characters to a multi-byte string (CP_ACP).
// pmb receives the result; pun/uLen are handed to WideCharToMultiByte as-is.
// Returns true only when the second call writes exactly the number of bytes
// the sizing call reported.
bool UnicodeToMB(vector<char>& pmb, const wchar_t* pun, int uLen)
{
	// Sizing pass: no output buffer, so the API returns the required count.
	const int required = WideCharToMultiByte(CP_ACP, 0, pun, uLen, NULL, 0, NULL, NULL);
	if (required > 0)
	{
		pmb.resize(required);

		const int written = WideCharToMultiByte(CP_ACP, 0, pun, uLen, &pmb[0], required, NULL, NULL);
		if (written == required)
		{
			return true;
		}
		// Partial or failed conversion: do not hand back a half-filled buffer.
		pmb.clear();
	}
	return false;
}

// Converts a UTF-8 string to wide characters.
// pun receives the result; pu8/utf8Len are handed to MultiByteToWideChar as-is.
// Returns true only when the second call writes exactly the number of
// characters the sizing call reported.
bool UTF8ToUnicode(vector<wchar_t>& pun, const char* pu8, int utf8Len)
{
	// Sizing pass: no output buffer, so the API returns the required count.
	const int required = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, NULL, 0);
	if (required > 0)
	{
		pun.resize(required);

		const int written = MultiByteToWideChar(CP_UTF8, 0, pu8, utf8Len, &pun[0], required);
		if (written == required)
		{
			return true;
		}
		// Partial or failed conversion: do not hand back a half-filled buffer.
		pun.clear();
	}
	return false;
}

// Converts wide characters to UTF-8.
// pu8 receives the result; pun/uLen are handed to WideCharToMultiByte as-is.
// Returns true only when the second call writes exactly the number of bytes
// the sizing call reported.
bool UnicodeToUTF8(vector<char>& pu8, const wchar_t* pun, int uLen)
{
	// Sizing pass: no output buffer, so the API returns the required count.
	const int required = WideCharToMultiByte(CP_UTF8, 0, pun, uLen, NULL, 0, NULL, NULL);
	if (required > 0)
	{
		pu8.resize(required);

		const int written = WideCharToMultiByte(CP_UTF8, 0, pun, uLen, &pu8[0], required, NULL, NULL);
		if (written == required)
		{
			return true;
		}
		// Partial or failed conversion: do not hand back a half-filled buffer.
		pu8.clear();
	}
	return false;
}

附：

MultiByteToWideChar和WideCharToMultiByte用法详解
//========================================================================
//TITLE:
//    MultiByteToWideChar和WideCharToMultiByte用法详解
//AUTHOR:
//    norains
//DATE:
//    第一版:Monday 25-December -2006
//    增补版:Wednesday 27-December -2006
//    修订版:Wednesday 14-March-2007 (修正之前的错误例子)
//Environment:
// EVC4.0 + Standard SDK
//========================================================================

1.使用方法详解

在本文开始之处,先简要地说一下何为短字符和宽字符.
所谓的短字符,就是用8bit来表示的字符,典型的应用是ASCII码.而宽字符,顾名思义,就是用16bit表示的字符,典型的有UNICODE.

关于windows下的ASCII和UNICODE的更多信息,可以参考这两本经典著作:《windows 程序设计》,《windows 核心编程》.这两本书关于这两种字符都有比较详细的解说.

宽字符转换为多个短字符是一个难点,不过我们只要掌握到其中的要领,便可如鱼得水.
好吧,那就让我们开始吧.

这个是我们需要转化的多字节字符串:
char sText[20] = {"多字节字符串!OK!"};

我们需要知道转化后的宽字符需要多少个数组空间.虽然在这个例程里面,我们可以直接定义一个20*2宽字符的数组,并且事实上将运行得非常轻松愉快.但假如多字节字符串更多,达到上千个乃至上万个,我们将会发现其中浪费的内存将会越来越多.所以以多字节字符的个数的两倍作为宽字符数组下标的声明绝对不是一个好主意.
  所幸,我们能够确知所需要的数组空间.
  我们只需要将MultiByteToWideChar()的第四个形参设为-1(表示源字符串以'\0'结尾),并将最后一个形参设为0,即可返回所需的宽字符数组空间的个数(包含结尾的'\0'):
  DWORD dwNum = MultiByteToWideChar (CP_ACP, 0, sText, -1, NULL, 0);
 
  接下来,我们只需要分配相应的数组空间:
  // Allocate one wchar_t per element reported by the sizing call above.
  wchar_t *pwText;
  pwText = new wchar_t[dwNum];
  // NOTE(review): this branch only runs when pwText is null, where delete[]
  // has no effect, so an allocation failure is not actually handled here.
  if(!pwText)
  {
   delete []pwText;
  }
 
  接着,我们就可以着手进行转换了.在这里以转换成UNICODE码做为例子:
  MultiByteToWideChar (CP_ACP, 0, psText, -1, sText, dwSize);
 
  最后,使用完毕当然要记得释放占用的内存:
  delete []psText;
 
 
  同理,宽字符转为多字节字符的代码如下:  
  wchar_t wText[20] = {L"宽字符转换实例!OK!"};
  DWORD dwNum = WideCharToMultiByte(CP_OEMCP,NULL,lpcwszStr,-1,NULL,0,NULL,FALSE);
  char *psText;
  psText = new char[dwNum];
  if(!psText)
  {
   delete []psText;
  }
  WideCharToMultiByte (CP_OEMCP,NULL,lpcwszStr,-1,psText,dwNum,NULL,FALSE);
  delete []psText;
 
   如果之前我们已经分配好空间,并且由于字符串较短,可以不理会浪费的空间,仅仅只是想简单地将短字符和宽字符相互转换,那有没有什么简便的方法呢?
   WIN32 API里没有符合这种要求的函数,但我们可以自己进行封装:
     
  //-------------------------------------------------------------------------------------
  //Description:
  // This function maps a character string to a wide-character (Unicode) string
  //
  //Parameters:
  // lpcszStr: [in] Pointer to the character string to be converted
  // lpwszStr: [out] Pointer to a buffer that receives the translated string.
  // dwSize: [in] Size of the buffer
  //
  //Return Values:
  // TRUE: Succeed
  // FALSE: Failed
  //
  //Example:
  // MByteToWChar(szA,szW,sizeof(szW)/sizeof(szW[0]));
  //---------------------------------------------------------------------------------------
  BOOL MByteToWChar(LPCSTR lpcszStr, LPWSTR lpwszStr, DWORD dwSize)
  {
    // Get the required size of the buffer that receives the Unicode
    // string.
    DWORD dwMinSize;
    dwMinSize = MultiByteToWideChar (CP_ACP, 0, lpcszStr, -1, NULL, 0);
 
    if(dwSize < dwMinSize)
    {
     return FALSE;
    }
 
    
    // Convert headers from ASCII to Unicode.
    MultiByteToWideChar (CP_ACP, 0, lpcszStr, -1, lpwszStr, dwMinSize);  
    return TRUE;
  }
 
  //-------------------------------------------------------------------------------------
  //Description:
  // This function maps a wide-character string to a new character string
  //
  //Parameters:
  // lpcwszStr: [in] Pointer to the character string to be converted
  // lpszStr: [out] Pointer to a buffer that receives the translated string.
  // dwSize: [in] Size of the buffer
  //
  //Return Values:
  // TRUE: Succeed
  // FALSE: Failed
  //
  //Example:
  // MByteToWChar(szW,szA,sizeof(szA)/sizeof(szA[0]));
  //---------------------------------------------------------------------------------------
  BOOL WCharToMByte(LPCWSTR lpcwszStr, LPSTR lpszStr, DWORD dwSize)
  {
   DWORD dwMinSize;
   dwMinSize = WideCharToMultiByte(CP_OEMCP,NULL,lpcwszStr,-1,NULL,0,NULL,FALSE);
   if(dwSize < dwMinSize)
   {
    return FALSE;
   }
   WideCharToMultiByte(CP_OEMCP,NULL,lpcwszStr,-1,lpszStr,dwSize,NULL,FALSE);
   return TRUE;
  }
 
 
  使用方法也很简单,示例如下:
  // Round-trip demo: wide -> multi-byte -> wide using the two wrappers above.
  wchar_t wText[10] = {L"函数示例"};
  char sText[20]= {0};
  // The last argument is the destination capacity in elements.
  // The BOOL results are ignored in this example.
  WCharToMByte(wText,sText,sizeof(sText)/sizeof(sText[0]));
  MByteToWChar(sText,wText,sizeof(wText)/sizeof(wText[0]));
 
  这两个函数的缺点在于无法动态分配内存,在转换很长的字符串时可能会浪费较多内存空间;优点是,在不考虑浪费空间的情况下转换较短字符串非常方便.

 
2.MultiByteToWideChar()函数乱码的问题

  有的朋友可能已经发现,在标准的WinCE4.2或WinCE5.0 SDK模拟器下,这个函数都无法正常工作,其转换之后的字符全是乱码.即使更改MultiByteToWideChar()参数也依然如此.
  不过这个不是代码问题,其症结在于所定制的操作系统.如果我们定制的操作系统默认语言不是中文,也会出现这种情况.由于标准的SDK默认语言为英文,所以肯定会出现这个问题.而这个问题的解决,不能简单地更改控制面板的"区域选项"的"默认语言",而是要在系统定制的时候,选择默认语言为"中文".
  系统定制时选择默认语言的位置于:
  Platform -> Setting... -> locale -> default language ,选择"中文",然后编译即可.

    
    
   
   
     
     
    
     
      
      [cpp] 
      view plain 
      copy 
       
      
    
     
     
    
    
CString my_strEditA=_T(""),my_strEditB=_T(""),my_strEditC=_T("");    
        my_strlength = my_Base64Msg.GetLength();   
        char   *pstra = new char[my_strlength];   
        for(i=0;i<my_strlength;i++)    
        {    
         *(pstra+i)=(char)my_Base64Msg[i];    
        }   
        my_testA = pstra;    
        
       //BASE64解码函数，见另一篇博文    
        my_testB = decode(my_testA);   
        ///////////////////////////////////////////////////////////////////   
        ///////////////////////////////////////////////////////////////////   
        
       //获取转换字符串长度   
        DWORD dwNum = MultiByteToWideChar (CP_ACP, 0, my_testB, -1, NULL, 0);   
        wchar_t *pwText;   
        pwText = new wchar_t[dwNum];   
         if(!pwText)   
        {   
         delete []pwText;   
        }   
        
       //转换成UNICODE码   
        
       //CP_ACP=936,简体中文   
        MultiByteToWideChar (CP_ACP, 0 , my_testB, -1, pwText, dwNum);   
        m_EditMail += pwText;   
        delete []pwText;

  
  
 
 
   
   
  
   
    
    [cpp] 
    view plain 
    copy 
     
    
  
   
   
  
  
**************************************************************     
   *         功         能: 将unicode字符转换成gb2312字串     
   *         参         数: unistr 源字串     
                            gbstr 目标字串     
                            msg_len 源串长度     
   *         返   回   值: 无     
   **************************************************************     
   void str_unic_decode( unsigned short *unistr, unsigned char *gbstr, int msg_len)     
   {     
       int   i;     
       int   index;     
       unsigned   short   ch;     
       unsigned   char   str[2];     
          
       msg_len   =   msg_len==-1?  str_unic_len(unistr)   :   msg_len;     
       for(i=0,index=0;   i<msg_len;   i++)     
       {     
           ch   =   UNICODE_TO_GB2312[unistr[i]]; //查表法    
           str[0]   =   ch   &   0xff;     
           str[1]   =   ch>>8   &   0xff;     
           if(str[1]   >   0xa0)     
           {     
               gbstr[index++]   =   str[0];     
               gbstr[index++]   =   str[1];     
           }     
           else     
           {     
               gbstr[index++]   =   str[0];     
           }     
       }     
   }

  
  
 
 
  
  
   
   UTF-8
   
   
  
  
　　现在明白了Unicode，那么UTF-8又是什么呢？又为什么会出现UTF-8呢？
   
   
  
  
　　ASCII转换成UCS-2，只是在编码前插入一个0x0。用这些编码，会包括一些控制符，比如 '\0' 或 '/'，这在UNIX和一些C函数中，将会产生严重错误。因此可以肯定，UCS-2不适合作为Unicode的外部编码。
   
   
  
  
　　因此，才诞生了UTF-8。那么UTF-8是如何编码的？又是如何解决UCS-2的问题呢？
   
   
  
  
例：
   
   
  
  
E4 BD A0　　　　　　　　11100100 10111101 10100000
   
   
  
  
这是“你”字的UTF-8编码
   
   
  
  
4F 60　　　　　　　　　　01001111 01100000
   
   
  
  
这是“你”的Unicode编码
   
   
  
  
按照UTF-8的编码规则，分解如下：xxxx0100 xx111101 xx100000
   
   
  
  
把除了x之外的数字拼接在一起，就变成“你”的Unicode编码了。
   
   
  
  
注意UTF-8的最前面３个1，表示整个UTF-8串是由３个字节构成的。
   
   
  
  
经过UTF-8编码之后，多字节序列中每个字节的最高位始终为1，因此不会再出现 '\0' 或 '/' 这类敏感字节。
   
   
  
  

以下是Unicode和UTF-8之间的转换关系表：
   
   
  
  
U-00000000 - U-0000007F: 0xxxxxxx       //没有1表示只有1个字节
   
   
  
  
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx  //前面2个1表示由2个字节
   
   
  
  
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx //前面3个1表示由3个字节
   
   
  
  
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx //依次类推
   
   
  
  
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   
   
  
  
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   
   
  
  
Unicode编码转换到UTF-8,简单的把Unicode字节流套到x中就变成UTF-8了。
   
   
  
  


所以,可以看到unicode编码和utf-8编码有线性转换关系,而unicode编码和gb2312编码不存在线性转换关系,所以我们必须使用对照表来进行unicode和gb2312编码的互换,就像阳历和农历转换算法一样,不能作线性计算
  
  
 
 
  
  
 
 
  
  
   
    
  
  
 
 
  
  
 
 
  
  
   
    
  
  
 
 
  
  
 
 
  
  
   
   由于各种编码之间不存在互相变换的算法，只能通过查表解决转换问题。自编代码进行转换在嵌入式系统中最有实际意义，该方法具有最方便的移植特性和最小的代码量。需要解决的主要技术问题有：
   
   
  
  

·获取所需的编码转换表
   
   
  
  

·实现码表的快速搜索算法（UTF-8转GB码才需要，其实就是折半查找）
   
   
  
  

·待转换字符串中的中/西文字符判别
   
   
  
  

由于折半查找要求码表是事先排序的，正变换和反变换需要各有一张转换表。转换表可以从开源软件中获取也可以自己编段程序生成一份。
   
   
  
  

由于非unicode编码字串中的西文字母只有一字节/字符，而汉字是2字节/字符，需要在转换时区别对待。判断方法在本文的前面部分有介绍。
   
   
  
  

由GB2312码转unicode时，由于转换表是按区位表排列的，可以直接由汉字的GB码通过计算得到转换表中的行列值，计算公式为：
   
   
  
  

Row = MSB - 0xA0 - 16
   
   
  
  

Col = LSB - 0xA0
   
   
  
  

由于转换表是从汉字区开始的，即第一个汉字是“啊”，开始行不是0，而是16，所以要从行值中减去一个偏移量。得到行列值后，可以直接取回表中的unicode：
   
   
  
  

Unicode = CODE_LUT[Row][Col];
   
   
  
  

今天对网上找到的转换表不太满意，于是自己编程序生成了一个新的。转换程序不大，
  
  
 
 
  
  
 
 
   
   
  
   
    
     
     [cpp] 
     view plain 
     copy 
      
     
    
   
全部源码如下：   
       
    // UnicodeCvt.cpp   
    // by mxh0506, 20081102   
       
    #include "stdafx.h"   
    #include <string.h>   
    #include <windows.h>   
       
       
       
    int _tmain(int argc, _TCHAR* argv[])   
    {   
         wchar_t wstr[8];   
         char szBuff[8];   
         FILE *fp;   
         unsigned char rowCode,colCode;   
         char szStr[64];   
         char szErr[16];   
         strcpy(szErr,"/*XX*/0x0000,");   
         fp = fopen("GB2Uni_LUT.h", "w+, ccs=UNICODE");   
         if( fp ){   
             strcpy( szStr, "unsigned short Unicode[72][96]={/n");    
             fwrite(szStr,1,strlen(szStr),fp);           
             szBuff[2] = 0;   
             for( unsigned char row = 0; row < 72; row++){                
                 for( unsigned char col = 0; col < 96; col++){                    
                     rowCode = (row + 16) + 0xA0;                   
                     colCode = col + 0xA0;                   
                     szBuff[0] = rowCode;                   
                     szBuff[1] = colCode;                   
                     if( MultiByteToWideChar(CP_THREAD_ACP,MB_ERR_INVALID_CHARS,szBuff,2,wstr,8)){                        
                         sprintf(szStr,"/*%s%X*/0x%X,",szBuff,*((unsigned short*)szBuff),wstr[0]);                       
                         fwrite(szStr,1,strlen(szStr),fp);                        
                     }else{                       
                         fwrite( szErr, 1, 13, fp );                       
                     }                   
                 }               
                 fwrite( "/n", 1, 1, fp );                
             }            
             strcpy( szStr, "};/n");            
             fwrite( szStr, 1, strlen( szStr ), fp );            
             fclose(fp);            
         }   
            
         return 0;   
            
    }   
   
 
  
  
 
 
  
  
 
 
  
  
   
   后来测试发现这段程序生成的码表中丢掉了几个花括号，不过大体功能还是正确的。如果有人感兴趣，可以加上。另外，还可以试试码表从0行(而不是16行)开始会怎样。
  
  
 
 
  
  
 
 
  
  
   
    
  
  
 
 
  
  
 
 
  
  
   
    
  
  
 
 
  
  
 
 
   
   
  
   
   
     GB2312与unicode间的转换 
    

GB2312与unicode互转的两个函数，有点简陋，待转换的字符串长度要在256以内。 
    


 
   
       
    
   
      static int 
    
_convertCharSetFromGBKToUnicode(char *from, char *to) 
    
{ 
    
        iconv_t h; 
    
        char tmp_from[256] = { '/0' }; 
    
        char tmp_to[256] = { '/0' }; 
    
        char *p_from = tmp_from; 
    
        char *p_to = tmp_to; 
    
        int size_from, size_to; 
    
        strncpy(p_from, from, sizeof(tmp_from)-1); 
    
        size_from = strlen(p_from); 
    
        size_to = sizeof(tmp_to); 
    
        if ((h = iconv_open("UTF-8", "GBK")) < 0) 
    
                return -1; 
    
        iconv(h, &p_from, &size_from, &p_to, &size_to); 
    
        iconv_close(h); 
    
        printf("GBK Code : %s, UNICODE Code : %s %d/n", tmp_from, tmp_to, size_to); 
    
        strncpy(to, tmp_to, size_to); 
    
        return 0; 
    
} 
    

/*
 * Converts the NUL-terminated UTF-8 string `from` to GBK and stores the
 * NUL-terminated result in `to`.
 *
 * At most 255 input bytes are used and at most 255 output bytes are
 * produced, so `to` must have room for 256 bytes.
 *
 * Returns 0 on success. Returns -1 if the converter cannot be opened or
 * the conversion fails; `to` is left untouched in that case.
 */
static int
_convertCharSetFromUnicodeToGBK(char *from, char *to)
{
        iconv_t h;
        char tmp_from[256] = { '\0' };
        char tmp_to[256] = { '\0' };
        char *p_from = tmp_from;
        char *p_to = tmp_to;
        size_t size_from;
        size_t size_to;
        size_t out_len;
        size_t rc;

        /* tmp_from is zero-filled, so copying at most 255 bytes keeps it
           NUL-terminated. */
        strncpy(tmp_from, from, sizeof(tmp_from) - 1);
        size_from = strlen(tmp_from);
        /* Keep one byte back so the result can always be terminated. */
        size_to = sizeof(tmp_to) - 1;

        /* iconv_open reports failure as (iconv_t)-1. */
        h = iconv_open("GBK", "UTF-8");
        if (h == (iconv_t)-1)
                return -1;

        rc = iconv(h, &p_from, &size_from, &p_to, &size_to);
        iconv_close(h);
        if (rc == (size_t)-1)
                return -1;

        /* iconv advanced p_to past the bytes it wrote. */
        out_len = (size_t)(p_to - tmp_to);

        printf("UNICODE Code : %s, GBK Code : %s %d\n", tmp_from, tmp_to, (int)out_len);

        memcpy(to, tmp_to, out_len);
        to[out_len] = '\0';

        return 0;
}

秒客网

unicode, ANSI, utf-8字符集之间的转换 C，C++

相关文章