我们通常所说的UNICODE其实是UTF-16,下面这几个函数实现UNICODE(UTF-16)与UTF-8编码的相互转换。
- /**
- * This file implements:
- *
- * 1. Conversion of a UTF-16 character to a UTF-8 character.
- * 2. Conversion of a UTF-8 character to a UTF-16 character.
- *
- * 3. Conversion of a UTF-16 string to a UTF-8 string.
- * 4. Conversion of a UTF-8 string to a UTF-16 string.
- */
- /* Size in bytes of the scratch buffer that holds one encoded UTF-8 character. */
- /* UnicodeToUTF8 writes at most 6 bytes, so 8 leaves spare room after the sequence. */
- #define MAX_CHARACTER_SIZE 8
/**
 * UnicodeToUTF8 - encode one code point as a UTF-8 byte sequence
 * @unicode: code point to encode (0 .. 0x7FFFFFFF)
 * @p: destination buffer; must have room for up to 6 bytes
 *
 * Uses the original 1..6 byte UTF-8 layout, so values above U+10FFFF are
 * still encoded (as 4, 5 or 6 bytes). No terminator is written.
 *
 * @return: pointer one past the last byte written. Returns @p itself
 *          (nothing written) when @p is NULL or @unicode is negative.
 */
unsigned char * UnicodeToUTF8( int unicode, unsigned char *p)
{
    unsigned char *e = p;
    unsigned int cp;

    /* A negative value is not a code point; without this check it would
     * match the "< 0x80" branch and emit a single garbage byte. */
    if (!p || unicode < 0) {
        return p;
    }
    cp = (unsigned int)unicode;

    if (cp < 0x80u) {
        /* 0xxxxxxx : 7 payload bits */
        *e++ = (unsigned char)cp;
    } else if (cp < 0x800u) {
        /* 110xxxxx 10xxxxxx : 11 payload bits */
        *e++ = (unsigned char)(((cp >> 6) & 0x1fu) | 0xc0u);
        *e++ = (unsigned char)((cp & 0x3fu) | 0x80u);
    } else if (cp < 0x10000u) {
        /* 1110xxxx + 2 continuation bytes : 16 payload bits */
        *e++ = (unsigned char)(((cp >> 12) & 0x0fu) | 0xe0u);
        *e++ = (unsigned char)(((cp >> 6) & 0x3fu) | 0x80u);
        *e++ = (unsigned char)((cp & 0x3fu) | 0x80u);
    } else if (cp < 0x200000u) {
        /* 11110xxx + 3 continuation bytes : 21 payload bits */
        *e++ = (unsigned char)(((cp >> 18) & 0x07u) | 0xf0u);
        *e++ = (unsigned char)(((cp >> 12) & 0x3fu) | 0x80u);
        *e++ = (unsigned char)(((cp >> 6) & 0x3fu) | 0x80u);
        *e++ = (unsigned char)((cp & 0x3fu) | 0x80u);
    } else if (cp < 0x4000000u) {
        /* 111110xx + 4 continuation bytes : 26 payload bits */
        *e++ = (unsigned char)(((cp >> 24) & 0x03u) | 0xf8u);
        *e++ = (unsigned char)(((cp >> 18) & 0x3fu) | 0x80u);
        *e++ = (unsigned char)(((cp >> 12) & 0x3fu) | 0x80u);
        *e++ = (unsigned char)(((cp >> 6) & 0x3fu) | 0x80u);
        *e++ = (unsigned char)((cp & 0x3fu) | 0x80u);
    } else {
        /* 1111110x + 5 continuation bytes : 31 payload bits */
        *e++ = (unsigned char)(((cp >> 30) & 0x01u) | 0xfcu);
        *e++ = (unsigned char)(((cp >> 24) & 0x3fu) | 0x80u);
        *e++ = (unsigned char)(((cp >> 18) & 0x3fu) | 0x80u);
        *e++ = (unsigned char)(((cp >> 12) & 0x3fu) | 0x80u);
        *e++ = (unsigned char)(((cp >> 6) & 0x3fu) | 0x80u);
        *e++ = (unsigned char)((cp & 0x3fu) | 0x80u);
    }

    /* One step past the end of the encoded character */
    return e;
}
/**
 * UTF8ToUnicode - decode one UTF-8 sequence into a code point
 * @ch: buffer positioned at the first byte of a UTF-8 sequence
 * @unicode: receives the decoded code point on success
 *
 * The sequence length is derived from the lead byte:
 *   >= 0xfc -> 6, >= 0xf8 -> 5, >= 0xf0 -> 4, >= 0xe0 -> 3, >= 0xc0 -> 2,
 *   anything else -> 1 (the byte value itself is the result).
 * Every trailing byte must have the form 10xxxxxx. Because each trailing
 * byte is checked before the next one is read, a sequence cut short by a
 * NUL terminator is rejected without reading past that terminator.
 *
 * @return: number of bytes consumed (1 ~ 6), usable to step to the next
 *          character; 0 when @ch or @unicode is NULL or the sequence is
 *          malformed/truncated (in which case *@unicode is left untouched).
 */
int UTF8ToUnicode (unsigned char *ch, int *unicode)
{
    unsigned int value;
    unsigned char lead;
    int len;
    int i;

    if (!ch || !unicode) {
        return 0;
    }

    /* Classify the lead byte: sequence length and its payload bits. */
    lead = ch[0];
    if (lead >= 0xfc) {
        len = 6;
        value = lead & 0x01u;
    } else if (lead >= 0xf8) {
        len = 5;
        value = lead & 0x03u;
    } else if (lead >= 0xf0) {
        len = 4;
        value = lead & 0x07u;
    } else if (lead >= 0xe0) {
        len = 3;
        value = lead & 0x0fu;
    } else if (lead >= 0xc0) {
        len = 2;
        value = lead & 0x1fu;
    } else {
        len = 1;
        value = lead;
    }

    /* Fold in the continuation bytes, 6 payload bits each. */
    for (i = 1; i < len; i++) {
        if ((ch[i] & 0xc0u) != 0x80u) {
            /* Not 10xxxxxx (this includes the NUL terminator). */
            return 0;
        }
        value = (value << 6) | (ch[i] & 0x3fu);
    }

    /* At most 31 bits are accumulated, so the value fits a non-negative int. */
    *unicode = (int)value;
    return len;
}
- /**
- * UnicodeStrToUTF8Str - Convert a utf-16 string to a utf-8 string
- * @unicde_str: A utf-16 string
- * @utf8_str: A buffer to contain utf-8 string
- * @utf8_str_size: Maximum size of the utf-8 string buffer
- *
- * @return: One step over the end of the last utf-8 character
- */
- unsigned char * UnicodeStrToUTF8Str (unsigned short * unicode_str,
- unsigned char * utf8_str, int utf8_str_size)
- {
- int unicode = 0;
- unsigned char *e = NULL, *s = NULL;
- unsigned char utf8_ch[MAX_CHARACTER_SIZE];
- s = utf8_str;
- if ((unicode_str) && (s))
- {
- while ((unicode = (int) (*unicode_str++)))
- {
- memset (utf8_ch, 0, sizeof (utf8_ch));
- if ((e = UnicodeToUTF8 (unicode, utf8_ch)) > utf8_ch)
- {
- *e = '/0';
- /* Judge whether exceed the destination buffer */
- if ((s - utf8_str + strlen ((const char *) utf8_ch)) >= utf8_str_size)
- {
- return s;
- }
- else
- {
- memcpy (s, utf8_ch, strlen ((const char *) utf8_ch));
- s += strlen ((const char *) utf8_ch);
- *s = '/0';
- }
- }
- else
- {
- /* Converting error occurs */
- return s;
- }
- }
- }
- return s;
- }
/**
 * UTF8StrToUnicodeStr - convert a NUL-terminated UTF-8 string to UTF-16
 * @utf8_str: source string, UTF-8 bytes ending with a 0 byte
 * @unicode_str: destination buffer of UTF-16 code units
 * @unicode_str_size: capacity of @unicode_str in code units (not bytes),
 *                    terminator included
 *
 * Code points up to U+FFFF are stored as one unit; U+10000..U+10FFFF are
 * stored as a surrogate pair. Conversion stops early when the next character
 * plus the terminator would not fit, when a sequence cannot be decoded, or
 * when the decoded value cannot be represented in UTF-16 (> U+10FFFF).
 * Whenever the arguments are valid the output is 0-terminated, including
 * for an empty input string.
 *
 * @return: number of UTF-16 code units written, terminator excluded.
 *          0 when a pointer is NULL or @unicode_str_size <= 0.
 */
int UTF8StrToUnicodeStr (unsigned char * utf8_str,
    unsigned short * unicode_str, int unicode_str_size)
{
    unsigned char *in = utf8_str;
    unsigned short *out = unicode_str;
    int count = 0;

    if (!utf8_str || !unicode_str || unicode_str_size <= 0) {
        return 0;
    }

    *out = 0;

    while (*in) {
        int cp = 0;
        int n = UTF8ToUnicode (in, &cp);
        int units;
        int i;

        if (n <= 0) {
            /* Converting error occurs */
            break;
        }

        /* Never step over the terminator: a truncated trailing sequence
         * would otherwise move the cursor past the end of the input. */
        for (i = 1; i < n; i++) {
            if (in[i] == 0) {
                return count;
            }
        }

        if (cp < 0 || cp > 0x10FFFF) {
            /* Not representable in UTF-16 */
            break;
        }

        units = (cp > 0xFFFF) ? 2 : 1;
        if (count + units >= unicode_str_size) {
            /* No room left for this character plus the terminator */
            break;
        }

        if (units == 2) {
            cp -= 0x10000;
            *out++ = (unsigned short)(0xD800 + (cp >> 10));
            *out++ = (unsigned short)(0xDC00 + (cp & 0x3FF));
        } else {
            *out++ = (unsigned short)cp;
        }
        *out = 0;
        count += units;

        /* Step to next utf-8 character */
        in += n;
    }

    return count;
}