从string中将UTF-8编码解码成Unicode code point

时间:2022-02-17 20:14:22

上一篇《UTF-8编码实测》中,已经能够用boost.locale库将单个字符的UTF-8编码转换成Unicode code point。

今天看了locale的部分代码,作者Artyom建议使用utf_to_utf函数搞定一切。但我的需求是将string中所有字符的Unicode code point都解析出来,utf_to_utf这个名称似乎并不贴切。

于是修改了Artyom的部分代码,现在首先看一下main.cc的调用代码:

#include "test.h"
#include "util/endian.h"
#include "util/utf.h"
#include <iostream>

using namespace std;

int main(int argc, char ** argv) {
// TEST(3 > 2);
char const * p = "一";
cout << PrintStringAsBinaryString(p) << endl;
string str = "一二三";
cout << PrintStringAsBinaryString(str) << endl;

string::iterator itor = str.begin();
vector<code_point> points;
UTF8ToUnicode(itor, str.end(), points);
cout << "code point0: 0x" << std::hex << points[0] << " binary format:B" << PrintIntAsBinaryString(points[0]) << endl;
cout << "code point1: 0x" << std::hex << points[1] << " binary format:B" << PrintIntAsBinaryString(points[1]) << endl;
cout << "code point2: 0x" << std::hex << points[2] << " binary format:B" << PrintIntAsBinaryString(points[2]) << endl;
}
运行后,字符串"一二三"中三个字符的code point被依次打印出来:

code point0: 0x4e00 binary format:B00000000000000000100111000000000
code point1: 0x4e8c binary format:B00000000000000000100111010001100
code point2: 0x4e09 binary format:B00000000000000000100111000001001

所有实现代码在utf.h中:

#ifndef UTIL_UTF_H_#define UTIL_UTF_H_#include "util/endian.h"#include "util/unicode_error.h"#include <boost/locale/utf.hpp>using namespace boost::locale::utf;string PrintStringAsBinaryString(char const* p) {  stringstream stream;  for (size_t i = 0; i < strlen(p); ++i) {    stream << PrintIntAsBinaryString(p[i]);    stream << " ";  }  return stream.str();}string PrintStringAsBinaryString(string const& str) {  stringstream stream;  for (size_t i = 0; i < str.size(); ++i) {    stream << PrintIntAsBinaryString(str[i]);    stream << " ";  }  return stream.str();}struct ParseResult {  code_point point;  size_t size;};int trail_length(char ci) {  unsigned char c = ci;  if(c < 128)    return 0;  if(BOOST_LOCALE_UNLIKELY(c < 194))    return -1;  if(c < 224)    return 1;  if(c < 240)    return 2;  if(BOOST_LOCALE_LIKELY(c <=244))    return 3;  return -1;}int width(code_point value) {  if(value <=0x7F) {    return 1;  }  else if(value <=0x7FF) {    return 2;  }  else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {    return 3;  }  else {    return 4;  }}bool is_trail(char ci) {  unsigned char c = ci;  return (c & 0xC0) == 0x80;}bool is_lead(char ci) {  return !is_trail(ci);}// Convert the UTF-8 string into template<typename Iterator>void ParseUTF8(Iterator &p, Iterator e, ParseResult& result) {  if (BOOST_LOCALE_UNLIKELY(p == e)) {    throw UnicodeError("ParseUTF8 failed");  }  unsigned char lead = *p++;  // First byte is fully validated here  int trail_size = trail_length(lead);  if(BOOST_LOCALE_UNLIKELY(trail_size < 0)) {    throw UnicodeError("ParseUTF8 failed");  }  //  // Ok as only ASCII may be of size = 0  // also optimize for ASCII text  //  if(trail_size == 0) {    result.point = lead;    result.size = 1;    return;  }              code_point c = lead & ((1<<(6-trail_size))-1);  // Read the rest  unsigned char tmp;  switch(trail_size) {  case 3:    if(BOOST_LOCALE_UNLIKELY(p==e)) {      throw UnicodeError("ParseUTF8 failed");    }    tmp = *p++;    c = (c << 6) | ( tmp & 
0x3F);  case 2:    if(BOOST_LOCALE_UNLIKELY(p==e)) {      throw UnicodeError("ParseUTF8 failed");    }    tmp = *p++;    c = (c << 6) | ( tmp & 0x3F);  case 1:    if(BOOST_LOCALE_UNLIKELY(p==e)) {      throw UnicodeError("ParseUTF8 failed");    }    tmp = *p++;    c = (c << 6) | ( tmp & 0x3F);  }  // Check code point validity: no surrogates and  // valid range  if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c))) {    throw UnicodeError("ParseUTF8 failed");  }  // make sure it is the most compact representation  if(BOOST_LOCALE_UNLIKELY(width(c) != trail_size + 1)) {    throw UnicodeError("ParseUTF8 failed");  }    result.point = c;  result.size = trail_size + 1;}// Convert the UTF-8 string that represent one single Unicode character in [start, end) to Unicode code pointtemplate<typename Iterator>code_point UTF8ToUnicode(Iterator &start, Iterator end) {  ParseResult result;  ParseUTF8(start, end, result);  return result.point;}template<typename Iterator>code_point UTF8ToUnicode(Iterator &start, Iterator end, vector<code_point>& points) {  ParseResult result;  Iterator begin = start;  while (begin < end) {    ParseUTF8(start, end, result);    points.push_back(result.point);    begin += result.size;  }}#endif

wstring我没有研究过。不过一般来讲string对我够用了,理解了Unicode和UTF-8的编码后,对系统知识理解大为提升。