C++ 的 std::string 中每一个元素都是一个字节(char)。所以在存入 UTF-8 字符串时,一个字符可能占用多个字节,这些字节是按照下面的规则编码的。
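一个字节的 8 位中,如果最高位是 0,则这个字节自己就是一个完整的字符(ASCII);如果以 110、1110、11110 …… 开头,则它是多字节字符的首字节,开头连续 1 的个数就是该字符占用的字节数;以 10 开头的字节是多字节字符的后续字节。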
1字节 | 0xxxxxxx |
2字节 | 110xxxxx 10xxxxxx |
3字节 | 1110xxxx 10xxxxxx 10xxxxxx |
4字节 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
5字节 | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
6字节 | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
(注:5 字节和 6 字节的形式只存在于早期的 UTF-8 定义中;RFC 3629 已把 UTF-8 限制为最多 4 字节。)
所以知道这个就好办了。这里有一个类,用来专门处理utf-8的字符串,实现了字符串截取,索引,长度等功能~
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; the guard name is kept as-is so nothing that tests for
// it changes, but consider renaming it (e.g. IUTF8_STRING_H).
#ifndef __IUTF8_STRING__
#define __IUTF8_STRING__

#include <string>  // std::string is part of the interface below

/**
 * A std::string holding UTF-8 text together with a table of the byte offsets
 * at which each character starts, so that the text can be measured, indexed
 * and sliced by character position instead of by byte position.
 *
 * Every `index` / `u8...` parameter below is a character position.
 */
class iutf8string
{
public:
    /** Builds the object from a std::string of UTF-8 bytes. */
    iutf8string(const std::string& );

    /** Builds the object from a NUL-terminated C string of UTF-8 bytes. */
    iutf8string(const char* );

    /**
     * Deep copy. The offset table is a heap array that the destructor
     * releases with delete[], so a copy must build its own table instead of
     * sharing the pointer (which would be freed twice).
     */
    iutf8string(const iutf8string& other)
        : data(other.data), offerset(0), _length(0)
    {
        refresh();
    }

    /**
     * Copy assignment via copy-and-swap: the new state is built completely
     * in a temporary first, then exchanged with ours; the temporary's
     * destructor releases our previous offset table.
     */
    iutf8string& operator=(const iutf8string& other)
    {
        if (this != &other) {
            iutf8string copy(other);

            data.swap(copy.data);

            int* previous_table = offerset;
            offerset = copy.offerset;
            copy.offerset = previous_table;

            int previous_length = _length;
            _length = copy._length;
            copy._length = previous_length;
        }
        return *this;
    }

    ~iutf8string();

public:
    /** Number of characters (not bytes) in the string. */
    int length();

    // NOTE(review): no definition of substring() appears next to this
    // header; calling it will fail to link unless one is supplied elsewhere.
    std::string substring(int start_index, int length);

    /** The character at `index` as its UTF-8 bytes; "" if `index` is past the end. */
    std::string get(int index);

    /** Concatenation of the two underlying byte strings. */
    iutf8string operator + (iutf8string& );

    /** Same result as get(index). */
    std::string operator [](int index);

    /** A copy of the underlying byte string. */
    std::string stlstring();

    /** Pointer to the underlying bytes; valid only while this object is alive and unchanged. */
    const char* c_str();

    /** Like substr(), but the result is wrapped in an iutf8string. */
    iutf8string utf8substr(int u8start_index, int u8_length);

    /**
     * `u8_length` characters starting at character `u8start_index`, returned
     * as UTF-8 bytes; "" when the requested range is not inside the string.
     */
    std::string substr(int u8start_index, int u8_length);

private:
    std::string data;   // the raw UTF-8 bytes
    int* offerset;      // heap array of character start offsets into `data`; owned by this object
    int _length;        // number of characters found in `data`

    /** Rebuilds `offerset` and `_length` from the current contents of `data`. */
    void refresh();
};

#endif
#include <iostream> #include <string> #include "iutf8string.h" using namespace std; iutf8string::iutf8string(const string& str) { data = str; refresh(); } iutf8string::iutf8string(const char* str) { data = string(str); refresh(); } iutf8string::~iutf8string() { delete[] offerset; } string iutf8string::stlstring() { return data; } const char* iutf8string::c_str() { return data.c_str(); } iutf8string iutf8string::operator +(iutf8string& ustr) { string temp = data + ustr.stlstring(); return iutf8string(temp); } int iutf8string::length() { return _length; } string iutf8string::get(int index) { if(index >= _length) return ""; string temp = data.substr(offerset[index], offerset[index+1] - offerset[index]); return temp; } string iutf8string::operator [](int index) { if(index >= _length) return ""; string temp = data.substr(offerset[index], offerset[index+1] - offerset[index]); return temp; } string iutf8string::substr(int u8_start_index, int u8_length) { if(u8_start_index + u8_length >= _length) return ""; return data.substr(offerset[u8_start_index], offerset[u8_start_index+u8_length] - offerset[u8_start_index]); } iutf8string iutf8string::utf8substr(int u8_start_index, int u8_length) { if(u8_start_index + u8_length >= _length) return iutf8string(""); string ret = data.substr(offerset[u8_start_index], offerset[u8_start_index+u8_length] - offerset[u8_start_index]); return iutf8string(ret); } void iutf8string::refresh() { int *tmp = new int[data.length()]; int i, tmpidx = 0; for(i = 0; i < data.length(); i++) { if(((int)data[i] > 0)||(!(((int)data[i] & 0x00000040) == 0))) { tmp[tmpidx] = i; tmpidx++; } } tmp[tmpidx] = data.length(); int *tmp2 = new int[tmpidx]; for(i = 0; i < tmpidx; i++) { tmp2[i] = tmp[i]; } delete[] tmp; offerset = tmp2; _length = tmpidx; } //----------------test code ---------------------------- int main() { iutf8string str1("_我Love你!中国 ,!"); cout << "字符串长度:" <<str1.length() <<endl; int i; cout << "[" ; for(i = 0; i < str1.length(); i++) { cout << 
str1[i] << " "; } cout << "]" << endl; string one = str1.substr(2,11); cout << one << endl; string s1("我们都是好孩子!"); iutf8string str2(s1); cout << "[" ; for(i = 0; i < str2.length(); i++) { cout << str2[i] << " "; } cout << "]" << endl; }
最后祝您,提乾涉经。告辞。