/*
 * Report how many bytes the UTF-8 sequence starting at mstr occupies.
 *
 * mstr   -- pointer to the first byte of a candidate UTF-8 sequence
 * return -- 1..6 for a structurally valid sequence, 0 when mstr is a null
 *           pointer or the bytes do not form a valid sequence (bad lead
 *           byte, missing continuation byte, or overlong encoding).
 *
 * Notes:
 *  - A 0x00 byte is an ordinary one-byte sequence, so an empty string
 *    yields 1, not 0.
 *  - The legacy 5- and 6-byte forms are still recognised, as before.
 *  - Continuation bytes are inspected in order and the scan stops at the
 *    first one that is not 10xxxxxx.  A NUL terminator is never a
 *    continuation byte, so the scan does not run past the end of a
 *    NUL-terminated string.
 */
short Utf8TextBytes (const unsigned char *mstr)
{
    unsigned char lead;
    short length;
    short i;

    if (mstr == 0)
    {
        return 0;
    }

    lead = *mstr;

    /* Classify the lead byte. */
    if ((lead & 0x80) == 0x00)
    {
        return 1;               /* 0xxxxxxx, plain ASCII */
    }
    else if ((lead & 0xe0) == 0xc0)
    {
        length = 2;             /* 110xxxxx 10xxxxxx */
    }
    else if ((lead & 0xf0) == 0xe0)
    {
        length = 3;             /* 1110xxxx 10xxxxxx 10xxxxxx */
    }
    else if ((lead & 0xf8) == 0xf0)
    {
        length = 4;             /* 11110xxx + 3 continuation bytes */
    }
    else if ((lead & 0xfc) == 0xf8)
    {
        length = 5;             /* 111110xx + 4 continuation bytes */
    }
    else if ((lead & 0xfe) == 0xfc)
    {
        length = 6;             /* 1111110x + 5 continuation bytes */
    }
    else
    {
        return 0;               /* stray continuation byte, 0xfe or 0xff */
    }

    /* Every trailing byte must look like 10xxxxxx. */
    for (i = 1; i < length; i++)
    {
        if ((mstr[i] & 0xc0) != 0x80)
        {
            return 0;
        }
    }

    /*
     * Reject overlong encodings, i.e. values that would fit in a shorter
     * sequence.  All comparisons are done on unsigned bytes: comparing the
     * lead byte with a (char) constant never matches where plain char is
     * signed, which used to let overlong 3..6 byte forms through.
     */
    if (length == 2)
    {
        if ((lead & 0xfe) == 0xc0)      /* 1100000x */
        {
            return 0;
        }
    }
    else
    {
        /*
         * The smallest lead byte of each length (0xe0, 0xf0, 0xf8, 0xfc) is
         * also the mask selecting the bits of the second byte that must not
         * all be zero: e0 100xxxxx, f0 1000xxxx, f8 10000xxx, fc 100000xx.
         */
        unsigned char lowest_lead = (unsigned char) (0xff << (8 - length));

        if ((lead == lowest_lead) && ((mstr[1] & lowest_lead) == 0x80))
        {
            return 0;
        }
    }

    return length;
}
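/*
 * utf8Str    -- UTF-8 source string
 * utf8StrLen -- maximum UTF-8 length, in bytes
 * unStr      -- destination Unicode string
 * unMaxLen   -- maximum number of Unicode characters
 * return     -- number of characters actually converted
 */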
/*
 * Convert a NUL-terminated UTF-8 byte string into an array of WORD units,
 * one unit per decoded character.
 *
 * utf8Str    -- UTF-8 source; conversion stops at the first 0x00 byte
 * utf8StrLen -- maximum number of source bytes to consume
 * unStr      -- destination array; after the loop a 0x0000 terminator is
 *               stored behind the last converted character
 * unMaxLen   -- maximum number of characters to store.  The terminator is
 *               written in addition to these, so unStr needs room for
 *               unMaxLen + 1 elements.
 * return     -- number of characters stored, terminator excluded.  Returns 0
 *               without touching unStr when a pointer is null or
 *               utf8StrLen <= 0.
 *
 * Bytes that Utf8TextBytes() rejects are skipped one at a time and produce
 * no output.  A sequence that starts inside the utf8StrLen limit but would
 * end beyond it is not converted; conversion stops there.  Note that
 * Utf8TextBytes() itself may still look at the continuation bytes of such a
 * sequence, so the source must stay readable up to its NUL terminator.
 */
short FromUTF8ToUnicode (unsigned char *utf8Str, short utf8StrLen, WORD * unStr, unsigned short unMaxLen)
{
    const unsigned char *src = utf8Str;
    int consumed = 0;   /* source bytes used so far; int, so it cannot wrap
                           before reaching a limit larger than 255 */
    int stored = 0;     /* characters written to unStr */

    if ((utf8Str == 0) || (unStr == 0) || (utf8StrLen <= 0))
    {
        return 0;
    }

    while ((*src != 0x00) && (stored < (int) unMaxLen) && (consumed < (int) utf8StrLen))
    {
        unsigned long code_point;
        WORD unit;
        int seq_len = Utf8TextBytes (src);

        if ((seq_len < 1) || (seq_len > 6))
        {
            /* Illegal byte: skip it and try again from the next one. */
            src++;
            consumed++;
            continue;
        }

        if (seq_len > (int) utf8StrLen - consumed)
        {
            /* The sequence is cut off by the byte limit. */
            break;
        }

        if (seq_len == 1)
        {
            code_point = *src;  /* ASCII maps to itself */
        }
        else
        {
            /*
             * A lead byte of an n-byte sequence carries 7 - n payload bits
             * (mask 0x1f, 0x0f, 0x07, 0x03, 0x01 for n = 2..6); every
             * continuation byte contributes six more.
             */
            code_point = (unsigned long) (*src & (0xff >> (seq_len + 1)));
            for (int i = 1; i < seq_len; i++)
            {
                code_point = (code_point << 6) | (unsigned long) (src[i] & 0x3f);
            }
        }

        /*
         * A value that does not survive the round trip through WORD cannot
         * be represented in one unit.  Store U+FFFD (REPLACEMENT CHARACTER)
         * instead of a silently truncated, unrelated character.
         * NOTE(review): WORD is declared elsewhere; presumably a 16-bit
         * unsigned type, in which case this affects code points above
         * 0xFFFF (4..6 byte sequences) only -- confirm.
         */
        unit = (WORD) code_point;
        if ((unsigned long) unit != code_point)
        {
            unit = (WORD) 0xfffd;
        }

        *unStr++ = unit;
        src += seq_len;
        consumed += seq_len;
        stored++;
    }

    *unStr = 0x0000;
    return (short) stored;
}