场景:
1. 分析数据时, 获取到的数据是字符串, 但它不一定是合法且完整的 UTF-8 字符串, 直接打印或输出到文件时就会显示为乱码.
这时候就需要过滤掉其中的非法字节, 使 UTF-8 字符串能够正确显示, 比如把每个非法字节替换为 #
代码:
1. 这个函数的特点是逐个字节进行判断, 适用于任意长度、任意构造的(可能无效的) UTF-8 字符串.
/**
 * Sanitizes a byte buffer in place so that it contains only well-formed,
 * displayable UTF-8. The buffer length never changes: every rejected byte is
 * overwritten with '#', one '#' per byte.
 *
 * What is rejected:
 *  - ASCII bytes other than TAB (0x09), LF (0x0A), CR (0x0D) and 0x20..0x7E;
 *  - continuation bytes (0x80..0xBF) that do not belong to a sequence;
 *  - lead bytes that can never start a well-formed sequence
 *    (0xC0, 0xC1, 0xF5..0xFF);
 *  - multi-byte sequences that are truncated (by another byte or by the end
 *    of the buffer), overlong (E0 80..9F, F0 80..8F), encode a UTF-16
 *    surrogate (ED A0..BF) or exceed U+10FFFF (F4 90..BF). For these, the lead
 *    byte and the continuation bytes accepted so far are masked; scanning
 *    then resumes at the byte that broke the sequence.
 *
 * @param string  Buffer to sanitize; modified in place.
 * @param length  Number of bytes in the buffer.
 * @return false if string is null or length is negative (nothing is touched),
 *         true otherwise, whether or not any byte had to be replaced.
 */
bool IREUtil::FilterUtf8(unsigned char * string,int length){
    if(!string || length < 0)
    {
        return false;
    }

    int pos = 0;
    while(pos < length)
    {
        const unsigned char lead = string[pos];

        if(lead <= 0x7F)
        {
            // Single-byte character: keep only TAB/LF/CR and printable ASCII.
            const bool keep = lead == 0x09 || lead == 0x0A || lead == 0x0D ||
                              (0x20 <= lead && lead <= 0x7E);
            if(!keep)
            {
                string[pos] = '#';
            }
            pos += 1;
            continue;
        }

        // Number of continuation bytes the lead byte announces, plus the
        // range allowed for the FIRST continuation byte. That range is
        // narrower than 0x80..0xBF for a few lead bytes.
        int need = 0;
        int first_lo = 0x80;
        int first_hi = 0xBF;
        if(0xC2 <= lead && lead <= 0xDF)
        {
            need = 1;
        }
        else if(0xE0 <= lead && lead <= 0xEF)
        {
            need = 2;
            if(lead == 0xE0)
            {
                first_lo = 0xA0;    // below 0xA0 would be an overlong form
            }
            else if(lead == 0xED)
            {
                first_hi = 0x9F;    // above 0x9F would be a UTF-16 surrogate
            }
        }
        else if(0xF0 <= lead && lead <= 0xF4)
        {
            need = 3;
            if(lead == 0xF0)
            {
                first_lo = 0x90;    // below 0x90 would be an overlong form
            }
            else if(lead == 0xF4)
            {
                first_hi = 0x8F;    // above 0x8F would exceed U+10FFFF
            }
        }
        // Otherwise need stays 0: a stray continuation byte or a lead byte
        // that is never valid (0xC0, 0xC1, 0xF5..0xFF).

        // Count how many of the announced continuation bytes are really there.
        const int avail = length - pos - 1;
        int got = 0;
        while(got < need && got < avail)
        {
            const int next = string[pos + 1 + got];
            const int lo = (got == 0) ? first_lo : 0x80;
            const int hi = (got == 0) ? first_hi : 0xBF;
            if(next < lo || next > hi)
            {
                break;
            }
            ++got;
        }

        if(need > 0 && got == need)
        {
            // Complete, well-formed sequence: leave it untouched.
            pos += need + 1;
        }
        else
        {
            // Mask the lead byte and whatever part of the sequence was
            // accepted. The byte that broke the sequence (if any) is examined
            // on the next iteration as a fresh byte.
            memset(string + pos, '#', got + 1);
            pos += got + 1;
        }
    }
    return true;
}
附送一个从网络上下载的、判断是否为 UTF-8 字符串的另一种函数. 这种写法(网络上的原始版本)有一个问题: 缓冲区末尾的多字节序列必须是完整的(字节数符合 UTF-8 编码规则), 否则在读取后续字节 bytes[1]~bytes[3] 时会越界.
原文地址没有留下:
/**
 * Checks whether a byte buffer consists solely of well-formed UTF-8 sequences.
 *
 * Single-byte characters are accepted only if they are TAB (0x09), LF (0x0A),
 * CR (0x0D) or in the range 0x20..0x7E, so other ASCII control characters
 * (including NUL) make the function return false. To allow them, replace the
 * ASCII test with bytes[0] <= 0x7F.
 *
 * Multi-byte sequences must not be overlong, must not encode a UTF-16
 * surrogate and must not exceed U+10FFFF. A sequence cut off by the end of
 * the buffer is reported as invalid; no byte at or beyond string + length is
 * ever read.
 *
 * @param string  Buffer to examine.
 * @param length  Number of bytes in the buffer.
 * @return true if every byte belongs to an accepted sequence (an empty buffer
 *         is valid); false otherwise, or if string is null or length is
 *         negative.
 */
bool IREUtil::is_utf8(const unsigned char * string,int length)
{
    if(!string || length < 0)
    {
        return false;
    }

    const unsigned char * bytes = string;
    const unsigned char * const end = bytes+length;

    while(bytes < end)
    {
        // Bytes still readable starting at bytes[0]; every multi-byte test
        // below checks this before touching bytes[1..3].
        const int left = static_cast<int>(end - bytes);

        if( // ASCII (restricted to TAB, LF, CR and printable characters)
            bytes[0] == 0x09 ||
            bytes[0] == 0x0A ||
            bytes[0] == 0x0D ||
            (0x20 <= bytes[0] && bytes[0] <= 0x7E)
        )
        {
            bytes += 1;
            continue;
        }

        if( // non-overlong 2-byte
            left >= 2 &&
            (0xC2 <= bytes[0] && bytes[0] <= 0xDF) &&
            (0x80 <= bytes[1] && bytes[1] <= 0xBF)
        )
        {
            bytes += 2;
            continue;
        }

        if( left >= 3 &&
            (
                ( // excluding overlongs
                    bytes[0] == 0xE0 &&
                    (0xA0 <= bytes[1] && bytes[1] <= 0xBF) &&
                    (0x80 <= bytes[2] && bytes[2] <= 0xBF)
                ) ||
                ( // straight 3-byte
                    ((0xE1 <= bytes[0] && bytes[0] <= 0xEC) ||
                        bytes[0] == 0xEE ||
                        bytes[0] == 0xEF) &&
                    (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
                    (0x80 <= bytes[2] && bytes[2] <= 0xBF)
                ) ||
                ( // excluding surrogates
                    bytes[0] == 0xED &&
                    (0x80 <= bytes[1] && bytes[1] <= 0x9F) &&
                    (0x80 <= bytes[2] && bytes[2] <= 0xBF)
                )
            )
        )
        {
            bytes += 3;
            continue;
        }

        if( left >= 4 &&
            (
                ( // planes 1-3
                    bytes[0] == 0xF0 &&
                    (0x90 <= bytes[1] && bytes[1] <= 0xBF) &&
                    (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
                    (0x80 <= bytes[3] && bytes[3] <= 0xBF)
                ) ||
                ( // planes 4-15
                    (0xF1 <= bytes[0] && bytes[0] <= 0xF3) &&
                    (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
                    (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
                    (0x80 <= bytes[3] && bytes[3] <= 0xBF)
                ) ||
                ( // plane 16
                    bytes[0] == 0xF4 &&
                    (0x80 <= bytes[1] && bytes[1] <= 0x8F) &&
                    (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
                    (0x80 <= bytes[3] && bytes[3] <= 0xBF)
                )
            )
        )
        {
            bytes += 4;
            continue;
        }

        // Invalid lead byte, invalid continuation byte, or a sequence that is
        // truncated by the end of the buffer.
        return false;
    }

    return true;
}
欢迎指正!