c++ 处理utf-8字符串

时间:2023-01-05 18:42:31

C++ 的 std::string 中每一个元素都是一个字节(char)。所以在装入 UTF-8 字符串的时候,一个字符可能占用多个字节,这些字节是按照下面的规则编码的。

一个字节的 8 位中,如果最高位是 0,则这个字节自身就是一个完整的字符(即 ASCII 字符)。否则,首字节开头连续的 1 的个数表示该字符一共占用几个字节,后续的每个字节都以 10 开头。

1字节 0xxxxxxx 
2字节 110xxxxx 10xxxxxx 
3字节 1110xxxx 10xxxxxx 10xxxxxx
4字节 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5字节 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6字节 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
(注:5、6 字节的形式只出现在早期的 UTF-8 定义中;自 RFC 3629 起,合法的 UTF-8 编码最多 4 字节。)

 

 

 

  

  

知道了这个规则就好办了。下面这个类专门用来处理 UTF-8 字符串,实现了按字符截取、索引、求长度等功能。

#ifndef __IUTF8_STRING__
#define __IUTF8_STRING__

/**
 * A std::string wrapper that addresses its contents by UTF-8 character
 * instead of by byte.
 *
 * The object keeps the raw bytes in `data` and a heap-allocated table
 * (`offerset`) of the byte offsets at which each character starts; the table
 * is built by refresh() and released by the destructor.
 *
 * Because the class owns that raw table, it defines its own copy constructor
 * and copy assignment: the compiler-generated ones would copy the pointer
 * and both objects would later delete[] the same table.
 */
class iutf8string
{
    public:
        /** Copies the bytes of the given string and builds the character index. */
        iutf8string(const std::string& );

        /** Copies the bytes of the given C string and builds the character index. */
        iutf8string(const char* );

        /**
         * Copy constructor: copies the bytes and builds a private offset
         * table through refresh(), so no table is ever shared.
         */
        iutf8string(const iutf8string& other)
            : data(other.data), offerset(0), _length(0)
        {
            refresh();
        }

        /**
         * Copy assignment: builds a complete copy first and then exchanges
         * state with it, so this object is left untouched if the copy fails.
         * The previous table is released when the temporary is destroyed.
         */
        iutf8string& operator = (const iutf8string& other)
        {
            if(this != &other)
            {
                iutf8string copy(other);

                data.swap(copy.data);

                int* previous_offsets = offerset;
                offerset = copy.offerset;
                copy.offerset = previous_offsets;

                int previous_length = _length;
                _length = copy._length;
                copy._length = previous_length;
            }
            return *this;
        }

        /** Releases the offset table. */
        ~iutf8string();

    public:
    
        /** Number of UTF-8 characters (not bytes) in the string. */
        int length();

        // NOTE(review): no definition of substring() appears in the visible
        // part of this file; calling it would fail at link time - confirm.
        std::string substring(int start_index, int length);

        /** Bytes of the character at the given character index, or "" if the index is rejected. */
        std::string get(int index);


        /** Concatenates the bytes of both operands into a new iutf8string. */
        iutf8string operator + (iutf8string& );
        
        /** Same lookup as get(). */
        std::string operator [](int index);

        /** Returns a copy of the underlying byte string. */
        std::string stlstring();

        /** Pointer to the underlying bytes; owned by this object. */
        const char* c_str();

        /** Character-indexed substring, returned as an iutf8string. */
        iutf8string utf8substr(int u8start_index, int u8_length);
        
        /** Character-indexed substring, returned as a plain std::string. */
        std::string substr(int u8start_index, int u8_length);

    private:

        std::string data;   // raw UTF-8 bytes
        int* offerset;      // byte offset of each character start; owned, allocated with new[] in refresh()
        int _length;        // number of characters indexed by offerset

        /** Rebuilds offerset and _length from data. */
        void refresh();
};

#endif
#include <iostream>
#include <string>
#include "iutf8string.h"

using namespace std;

// Builds an iutf8string from a std::string: the bytes are copied into `data`
// and refresh() then computes the character offset table and length.
iutf8string::iutf8string(const string& str)
    : data(str)
{
    this->refresh();
}

// Builds an iutf8string from a NUL-terminated C string.
//
// A null pointer is treated as an empty string; passing it straight to the
// std::string constructor would be undefined behaviour.
iutf8string::iutf8string(const char* str)
{
    if(str != 0)
    {
        data = string(str);
    }
    else
    {
        data = string();
    }
    refresh();
}

// Destructor: frees the offset table that refresh() allocated with new[].
iutf8string::~iutf8string()
{
    delete[] this->offerset;
}

// Returns a copy of the underlying UTF-8 bytes as a std::string.
string iutf8string::stlstring()
{
    string bytes(this->data);
    return bytes;
}

const char* iutf8string::c_str()
{
    return data.c_str();
}

// Concatenation: returns a new iutf8string holding this object's bytes
// followed by the bytes of `ustr`. Neither operand is modified.
iutf8string iutf8string::operator +(iutf8string& ustr)
{
    string joined(data);
    joined += ustr.stlstring();

    return iutf8string(joined);
}

int iutf8string::length()
{

    return _length;
}

// Returns the bytes of the character at character position `index`.
//
// Returns "" when `index` is negative or not less than length().
//
// The end of the last character is taken from the byte length of `data`
// rather than from offerset[index + 1], so the lookup never reads past the
// `_length` entries of the offset table.
string iutf8string::get(int index)
{
    if(index < 0 || index >= _length) return "";

    const int begin = offerset[index];
    const int end = (index + 1 < _length) ? offerset[index + 1]
                                          : static_cast<int>(data.length());

    return data.substr(begin, end - begin);
}

// Subscript access: returns the bytes of the character at character
// position `index`, with the same result as get(index).
//
// Returns "" when `index` is negative or not less than length().
//
// The end of the last character is taken from the byte length of `data`
// rather than from offerset[index + 1], so the lookup never reads past the
// `_length` entries of the offset table.
string iutf8string::operator [](int index)
{
    if(index < 0 || index >= _length) return "";

    const int begin = offerset[index];
    const int end = (index + 1 < _length) ? offerset[index + 1]
                                          : static_cast<int>(data.length());

    return data.substr(begin, end - begin);
}

// Returns `u8_length` characters starting at character position
// `u8_start_index`, as a plain std::string.
//
// Returns "" when either argument is negative, when `u8_length` is zero, or
// when the requested range extends past the last character. A range that
// ends exactly at the end of the string is valid and is returned in full.
string iutf8string::substr(int u8_start_index, int u8_length)
{
    if(u8_start_index < 0 || u8_length <= 0) return "";
    if(u8_start_index >= _length) return "";
    // Written as a subtraction so that start + length cannot overflow.
    if(u8_length > _length - u8_start_index) return "";

    const int end_index = u8_start_index + u8_length;
    const int begin = offerset[u8_start_index];
    // The position just past the last character is the byte length of data;
    // it is not read from the offset table, which holds only _length entries.
    const int end = (end_index < _length) ? offerset[end_index]
                                          : static_cast<int>(data.length());

    return data.substr(begin, end - begin);
}

// Returns `u8_length` characters starting at character position
// `u8_start_index`, wrapped in a new iutf8string.
//
// Returns an empty iutf8string when either argument is negative, when
// `u8_length` is zero, or when the requested range extends past the last
// character. A range that ends exactly at the end of the string is valid and
// is returned in full.
iutf8string iutf8string::utf8substr(int u8_start_index, int u8_length)
{
    if(u8_start_index < 0 || u8_length <= 0) return iutf8string("");
    if(u8_start_index >= _length) return iutf8string("");
    // Written as a subtraction so that start + length cannot overflow.
    if(u8_length > _length - u8_start_index) return iutf8string("");

    const int end_index = u8_start_index + u8_length;
    const int begin = offerset[u8_start_index];
    // The position just past the last character is the byte length of data;
    // it is not read from the offset table, which holds only _length entries.
    const int end = (end_index < _length) ? offerset[end_index]
                                          : static_cast<int>(data.length());

    string ret = data.substr(begin, end - begin);

    return iutf8string(ret);
}

void iutf8string::refresh()
{
    int *tmp = new int[data.length()];
    int i, tmpidx = 0;
    for(i = 0; i < data.length(); i++)
    {
        if(((int)data[i] > 0)||(!(((int)data[i] & 0x00000040) == 0)))
        {
            tmp[tmpidx] = i;
            tmpidx++;
        }
    }

    tmp[tmpidx] = data.length();

    int *tmp2 = new int[tmpidx];
    for(i = 0; i < tmpidx; i++)
    {
        tmp2[i] = tmp[i];
    }


    delete[] tmp;
    offerset = tmp2;
    _length = tmpidx;
}


//----------------test code ----------------------------
// Demo driver: prints the character count of a mixed ASCII/Chinese sample,
// each of its characters, a character-indexed substring, and then the
// characters of a second sample built from a std::string.
int main()
{
    iutf8string str1("_我Love你!中国  ,!");
    cout << "字符串长度:" <<str1.length() <<endl;

    // Character-by-character dump of the first sample.
    cout << "[" ;
    int pos = 0;
    while(pos < str1.length())
    {
        cout << str1[pos] << " ";
        ++pos;
    }
    cout << "]" << endl;

    // Substring addressed in characters, not bytes.
    cout << str1.substr(2,11) << endl;

    // Same dump for a sample constructed from a std::string.
    string s1("我们都是好孩子!");
    iutf8string str2(s1);
    cout << "[" ;
    pos = 0;
    while(pos < str2.length())
    {
        cout << str2[pos] << " ";
        ++pos;
    }
    cout << "]" << endl;
}

 

最后祝您,提乾涉经。告辞。