.NET:脏字处理类,效率很高。

时间:2022-09-16 14:47:49

BadWordParse 类:

 

代码:
using  System;
using  System.Collections.Generic;
using  System.Text;
using  System.Collections;
using  System.IO;

namespace  charCheck
{
    
/// <summary>
/// Bad-word filter. Words are loaded from a separator-delimited dictionary and
/// indexed so that most characters of the scanned text are rejected with a
/// single table lookup before any substring/hash comparison takes place.
/// </summary>
public class BadWordParse
{
    // One slot per UTF-16 code unit. The "+ 1" makes char.MaxValue itself a
    // valid index (a table of size char.MaxValue would overflow on '\uFFFF').
    private const int CharTableSize = char.MaxValue + 1;

    // Bit 7 of fastCheck is shared by every position >= 7 inside a word.
    private const int SharedPositionBit = 7;

    // Multi-character bad words.
    private HashSet<string> hash = new HashSet<string>();

    // fastCheck[c] bit i is set when some bad word has character c at position i
    // (positions 7 and above all share bit 7).
    private byte[] fastCheck = new byte[CharTableSize];

    // charCheck[c] is true when the single character c is a bad word by itself.
    private BitArray charCheck = new BitArray(CharTableSize);

    private int maxWordLength = 0;
    private int minWordLength = int.MaxValue;
    private bool _isHave = false;
    private string _replaceString = "*";
    private char _splitString = '|';
    private string _newWord;
    private string _badWordFilePath;

    /// <summary>
    /// Whether the most recent <see cref="ReplaceBadWord"/> call found a bad word.
    /// </summary>
    public bool IsHave
    {
        get { return _isHave; }
    }

    /// <summary>
    /// Replacement text. A single-character bad word is replaced by the first
    /// character of this string; a longer bad word is replaced by this string
    /// right-padded with its first character up to the length of the word.
    /// </summary>
    /// <exception cref="ArgumentException">The value is null or empty.</exception>
    public string ReplaceString
    {
        set
        {
            // The first character is used as the padding character, so an empty
            // mask cannot work; fail here instead of later during replacement.
            if (string.IsNullOrEmpty(value))
            {
                throw new ArgumentException("The replacement string must contain at least one character.", "value");
            }

            _replaceString = value;
        }
    }

    /// <summary>
    /// Separator between words in the dictionary file. Takes effect the next
    /// time the dictionary is loaded (see <see cref="Reload"/>).
    /// </summary>
    public char SplitString
    {
        set { _splitString = value; }
    }

    /// <summary>
    /// Result of the most recent <see cref="ReplaceBadWord"/> call.
    /// </summary>
    public string NewWord
    {
        get { return _newWord; }
    }

    /// <summary>
    /// Path of the dictionary file. Changing it takes effect the next time the
    /// dictionary is loaded (see <see cref="Reload"/>).
    /// </summary>
    public string BadWordFilePath
    {
        get { return _badWordFilePath; }
        set { _badWordFilePath = value; }
    }

    /// <summary>
    /// Creates a filter and loads the dictionary from <paramref name="filePath"/>,
    /// using '|' as the word separator. A missing file yields an empty dictionary.
    /// </summary>
    /// <param name="filePath">Path of the dictionary file.</param>
    public BadWordParse(string filePath)
    {
        _badWordFilePath = filePath;
        Reload();
    }

    // Used by FromWords: creates a filter with an empty dictionary and no file.
    private BadWordParse()
    {
    }

    /// <summary>
    /// Creates a filter from an in-memory word list instead of a file.
    /// </summary>
    /// <param name="words">Bad words; null yields an empty dictionary.</param>
    /// <returns>A filter containing the given words.</returns>
    public static BadWordParse FromWords(IEnumerable<string> words)
    {
        BadWordParse parser = new BadWordParse();

        if (words != null)
        {
            foreach (string word in words)
            {
                parser.AddWord(word);
            }
        }

        return parser;
    }

    /// <summary>
    /// Discards the current dictionary and loads it again from
    /// <see cref="BadWordFilePath"/>, splitting on the current separator.
    /// If the file does not exist the dictionary is left empty.
    /// </summary>
    public void Reload()
    {
        hash.Clear();
        Array.Clear(fastCheck, 0, fastCheck.Length);
        charCheck.SetAll(false);
        maxWordLength = 0;
        minWordLength = int.MaxValue;

        if (!File.Exists(_badWordFilePath))
        {
            return;
        }

        string content;

        // NOTE(review): on .NET Core / .NET 5+ the gb2312 code page is presumably
        // only available after registering the code-pages encoding provider;
        // confirm against the target runtime.
        using (StreamReader reader = new StreamReader(_badWordFilePath, Encoding.GetEncoding("gb2312")))
        {
            content = reader.ReadToEnd();
        }

        foreach (string word in content.Split(_splitString))
        {
            AddWord(word);
        }
    }

    /// <summary>
    /// Reports whether <paramref name="text"/> contains at least one bad word.
    /// </summary>
    /// <param name="text">Text to scan; null or empty contains no bad word.</param>
    /// <returns>true if a bad word occurs anywhere in the text.</returns>
    public bool HasBadWord(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        for (int index = 0; index < text.Length; index++)
        {
            if (MatchLengthAt(text, index) > 0)
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Replaces every bad word found in <paramref name="text"/> (scanning left to
    /// right, shortest match first) and records the outcome in
    /// <see cref="IsHave"/> and <see cref="NewWord"/>.
    /// </summary>
    /// <param name="text">Text to filter; null or empty is returned unchanged.</param>
    /// <returns>The filtered text.</returns>
    public string ReplaceBadWord(string text)
    {
        // Reset so that IsHave describes this call rather than an earlier one.
        _isHave = false;

        if (string.IsNullOrEmpty(text))
        {
            _newWord = text;
            return text;
        }

        StringBuilder result = new StringBuilder(text.Length);
        char padChar = _replaceString[0];
        int index = 0;

        while (index < text.Length)
        {
            int matched = MatchLengthAt(text, index);

            if (matched == 0)
            {
                result.Append(text[index]);
                index++;
                continue;
            }

            _isHave = true;

            if (matched == 1)
            {
                result.Append(padChar);
            }
            else
            {
                result.Append(_replaceString.PadRight(matched, padChar));
            }

            // Continue after the replaced word so it is not scanned again.
            index += matched;
        }

        _newWord = result.ToString();
        return _newWord;
    }

    // Registers one dictionary entry in the lookup tables; blank entries are ignored.
    private void AddWord(string word)
    {
        if (word == null)
        {
            return;
        }

        // Drop surrounding whitespace (e.g. the newline at the end of the file),
        // which would otherwise become part of the word and prevent any match.
        word = word.Trim();

        if (word.Length == 0)
        {
            return;
        }

        maxWordLength = Math.Max(maxWordLength, word.Length);
        minWordLength = Math.Min(minWordLength, word.Length);

        for (int i = 0; i < word.Length; i++)
        {
            fastCheck[word[i]] |= (byte)(1 << Math.Min(i, SharedPositionBit));
        }

        if (word.Length == 1)
        {
            charCheck[word[0]] = true;
        }
        else
        {
            hash.Add(word);
        }
    }

    // Returns the length of the shortest bad word starting at text[start], or 0 if none starts there.
    private int MatchLengthAt(string text, int start)
    {
        char first = text[start];

        // Bit 0 marks characters that begin at least one bad word.
        if ((fastCheck[first] & 1) == 0)
        {
            return 0;
        }

        if (charCheck[first])
        {
            return 1;
        }

        int longest = Math.Min(maxWordLength, text.Length - start);

        for (int length = 2; length <= longest; length++)
        {
            int offset = length - 1;

            // Quick reject: no bad word has this character at this position,
            // so no longer candidate starting here can match either.
            if ((fastCheck[text[start + offset]] & (1 << Math.Min(offset, SharedPositionBit))) == 0)
            {
                break;
            }

            if (length >= minWordLength && hash.Contains(text.Substring(start, length)))
            {
                return length;
            }
        }

        return 0;
    }
}


}

 

 

测试代码:

 

代码:
  string  filePath  =   " F://charCheck/charCheck/badword.txt " ;  
            
string  testString  =   "" ;
            System.IO.StreamReader sr 
=   new  System.IO.StreamReader(filePath, System.Text.Encoding.GetEncoding( " gb2312 " ));
            
// testString = sr.ReadToEnd();
            sr.Close();
            sr.Dispose();
            
// uint t = GetTickCount();
            BadWordParse bwp  =   new  BadWordParse(filePath);
            
string  parsedString  =  bwp.ReplaceBadWord(testString);
            
// uint time = GetTickCount() - t;
            
// Console.Write("使用时间:" + time.ToString());
            
// Console.Write("\r\n");
            
// Console.Write("原始字符串" + parsedString);
            
// Console.Write("\r\n");
            
// Console.Write("替换后字符串" + parsedString);