utf8字符串截取

时间:2022-06-01 21:35:42
#include <iostream>
#include <string>
#include <vector>
#include <inttypes.h>
#include <time.h>
using namespace std;
/**
 * Splits a UTF-8 encoded string into one byte sequence per character.
 *
 * The length of each sequence is taken from its lead byte only (the legacy
 * 5- and 6-byte forms are accepted); continuation bytes are not validated.
 * Any byte below 0xC0, including a stray continuation byte, is emitted as a
 * single-byte element.
 *
 * @param input  UTF-8 bytes to split.
 * @param output Receives one element per character. Elements are appended;
 *               anything already stored in the vector is kept.
 * @return output.size() after appending, so elements that were already
 *         present are included in the count.
 */
size_t utf8_to_charset(const std::string &input, std::vector<std::string> &output)
{
    const size_t total = input.length();
    size_t len = 0;
    for(size_t i = 0; i < total; i += len) {
        const unsigned char byte = static_cast<unsigned char>(input[i]);
        if(byte >= 0xFC)
            len = 6;
        else if(byte >= 0xF8)
            len = 5;
        else if(byte >= 0xF0)
            len = 4;
        else if(byte >= 0xE0)
            len = 3;
        else if(byte >= 0xC0)
            len = 2;
        else
            len = 1;
        // A truncated trailing sequence declares more bytes than remain.
        // Clamp so the index lands exactly on the end of the input instead of
        // stepping past it (which would read out of bounds and make substr
        // throw std::out_of_range).
        if(len > total - i)
            len = total - i;
        output.push_back(input.substr(i, len));
    }
    return output.size();
}
/**
 * Extracts a run of characters (not bytes) from a UTF-8 encoded string.
 *
 * The byte length of each character is taken from its lead byte only (the
 * legacy 5- and 6-byte forms are accepted); continuation bytes are not
 * validated.
 *
 * @param input  UTF-8 bytes to read from.
 * @param pos    Ordinal of the first character to copy. Characters are
 *               counted from 1, and 0 is treated the same as 1.
 * @param length Maximum number of characters to copy.
 * @return The selected characters. Empty when length is 0 or when pos lies
 *         beyond the last character; shorter than length characters when the
 *         input ends first.
 */
std::string utf8_substr(const std::string &input,size_t pos,size_t length)
{
    std::string result;
    if(length == 0) return result;

    const size_t total = input.length();
    size_t char_no = 0;   // 1-based ordinal of the character at byte offset i
    size_t copied = 0;    // characters appended to result so far
    size_t len = 0;       // byte length of the current character
    for(size_t i = 0; i < total; i += len) {
        const unsigned char byte = static_cast<unsigned char>(input[i]);
        if(byte >= 0xFC) len = 6;
        else if(byte >= 0xF8) len = 5;
        else if(byte >= 0xF0) len = 4;
        else if(byte >= 0xE0) len = 3;
        else if(byte >= 0xC0) len = 2;
        else len = 1;
        // A truncated trailing sequence declares more bytes than remain.
        // Clamp so the index stops exactly at the end of the input instead of
        // jumping past it and reading out of bounds on the next iteration.
        if(len > total - i) len = total - i;

        ++char_no;
        if(char_no < pos) continue;   // still before the requested start

        result.append(input, i, len);
        if(++copied == length) break;
    }
    return result;
}
// Demo driver: prints the result of utf8_substr on a fixed sample string for
// a series of (pos, length) pairs, one line per call.
int main(int argc,char* argv[])
{
    // Command-line arguments are not used by this demo.
    (void)argc;
    (void)argv;

    const std::string sample = "UTF-8字符串截取,123456,这是一个测试字符串,长度需要大于17个字符";

    // Each row: the label printed before the result, then the arguments
    // passed to utf8_substr.
    struct Case {
        const char *label;
        size_t pos;
        size_t length;
    };
    const Case cases[] = {
        { "utf8_substr(s,0,0) ===> ",   0,  0  },
        { "utf8_substr(s,0,1) ===> ",   0,  1  },
        { "utf8_substr(s,0,17) ===> ",  0,  17 },
        { "utf8_substr(s,0,50) ===> ",  0,  50 },
        { "utf8_substr(s,10,0) ===> ",  10, 0  },
        { "utf8_substr(s,10,1) ===> ",  10, 1  },
        { "utf8_substr(s,10,10)===> ",  10, 10 },
        { "utf8_substr(s,10,30) ===> ", 10, 30 },
    };
    const size_t case_count = sizeof(cases) / sizeof(cases[0]);

    for(size_t k = 0; k < case_count; ++k) {
        std::cout << cases[k].label
                  << utf8_substr(sample, cases[k].pos, cases[k].length)
                  << std::endl;
    }
    return 0;
}