.NET：高效的脏字（敏感词）处理类

BadWordParse 类：

using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.IO;

namespace charCheck
{
     /// <summary>
     /// Detects and masks "bad words" loaded from a separator-delimited dictionary file.
     /// </summary>
     /// <remarks>
     /// Multi-character words are stored in a hash set; single-character words are stored in a
     /// bit table. A per-character position mask (<c>fastCheck</c>) lets the scanner reject most
     /// text positions without building a substring or touching the hash set.
     /// </remarks>
     public class BadWordParse
    {
        // The lookup tables are indexed directly by char value, so they need one slot for
        // every possible char, including char.MaxValue itself (hence the + 1).
        private const int CharTableSize = char.MaxValue + 1;
        private const string DefaultReplacement = "*";
        private const string DefaultEncodingName = "gb2312";
        private const char DefaultSeparator = '|';

        // Dictionary words of two or more characters.
        private readonly HashSet<string> hash = new HashSet<string>();

        // fastCheck[c]: bit i (0..6) is set when c occurs at position i of some dictionary word;
        // bit 7 is set when c occurs at position 7 or later.
        private readonly byte[] fastCheck = new byte[CharTableSize];

        // charCheck[c] is true when c on its own is a dictionary word.
        private readonly BitArray charCheck = new BitArray(CharTableSize);

        private int maxWordLength = 0;
        private int minWordLength = int.MaxValue;
        private bool _isHave = false;
        private string _replaceString = DefaultReplacement;
        private char _splitString = DefaultSeparator;
        private string _newWord;
        private string _badWordFilePath;

        /// <summary>
        /// Whether the most recent <see cref="ReplaceBadWord"/> call found at least one bad word.
        /// </summary>
        public bool IsHave
        {
            get { return _isHave; }
        }

        /// <summary>
        /// Mask used by <see cref="ReplaceBadWord"/>. A single-character bad word is replaced by
        /// the first character of this string; a longer bad word is replaced by this string
        /// right-padded with its first character up to the length of the word.
        /// A null or empty value falls back to "*".
        /// </summary>
        public string ReplaceString
        {
            set { _replaceString = string.IsNullOrEmpty(value) ? DefaultReplacement : value; }
        }

        /// <summary>
        /// Separator of the dictionary file. The dictionary is split once, in the constructor,
        /// so assigning this property afterwards does not change the words already loaded;
        /// pass the separator to the constructor instead.
        /// </summary>
        public char SplitString
        {
            set { _splitString = value; }
        }

        /// <summary>
        /// Result of the most recent <see cref="ReplaceBadWord"/> call.
        /// </summary>
        public string NewWord
        {
            get { return _newWord; }
        }

        /// <summary>
        /// Path of the dictionary file. Assigning it only stores the value; the file is read
        /// in the constructor.
        /// </summary>
        public string BadWordFilePath
        {
            get { return _badWordFilePath; }
            set { _badWordFilePath = value; }
        }

        /// <summary>
        /// Loads a '|'-separated dictionary file encoded as gb2312.
        /// </summary>
        /// <param name="filePath">Dictionary file; when it does not exist the dictionary stays empty.</param>
        public BadWordParse(string filePath)
            : this(filePath, DefaultSeparator, null)
        {
        }

        /// <summary>
        /// Loads a dictionary file using the given separator and encoding.
        /// </summary>
        /// <param name="filePath">Dictionary file; when it does not exist the dictionary stays empty.</param>
        /// <param name="splitChar">Character that separates the words in the file.</param>
        /// <param name="encoding">
        /// Encoding of the file, or null for gb2312. The gb2312 encoding is resolved only when a
        /// file is actually read. NOTE: on .NET Core / .NET 5+ that code page is available only
        /// after the code-pages encoding provider has been registered by the application.
        /// </param>
        public BadWordParse(string filePath, char splitChar, Encoding encoding)
        {
            _badWordFilePath = filePath;
            _splitString = splitChar;

            if (!File.Exists(_badWordFilePath))
            {
                return;
            }

            string content;
            using (StreamReader reader = new StreamReader(
                _badWordFilePath, encoding ?? Encoding.GetEncoding(DefaultEncodingName)))
            {
                content = reader.ReadToEnd();
            }

            foreach (string entry in content.Split(_splitString))
            {
                // Trailing separators and line breaks produce blank entries. A zero-length
                // word would drag minWordLength down to 0 and disable single-character matching.
                string word = entry.Trim();
                if (word.Length > 0)
                {
                    AddWord(word);
                }
            }
        }

        /// <summary>
        /// Checks whether the text contains any dictionary word.
        /// </summary>
        /// <param name="text">Text to scan; null or empty text contains no bad word.</param>
        /// <returns>true when at least one dictionary word occurs in the text.</returns>
        public bool HasBadWord(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return false;
            }

            for (int index = 0; index < text.Length; index++)
            {
                if (MatchLengthAt(text, index) > 0)
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Masks every dictionary word found in the text and updates <see cref="IsHave"/> and
        /// <see cref="NewWord"/>.
        /// </summary>
        /// <param name="text">Text to filter; null or empty text is returned unchanged.</param>
        /// <returns>The text with each matched word replaced as described on <see cref="ReplaceString"/>.</returns>
        public string ReplaceBadWord(string text)
        {
            // Reset the flag so it describes this call only, not an earlier one.
            _isHave = false;

            if (string.IsNullOrEmpty(text))
            {
                _newWord = text;
                return text;
            }

            // The output is built in a single pass over the original text, so text that has
            // already been masked is never scanned again.
            StringBuilder result = new StringBuilder(text.Length);
            char maskChar = _replaceString[0];
            int index = 0;

            while (index < text.Length)
            {
                int matched = MatchLengthAt(text, index);
                if (matched == 0)
                {
                    result.Append(text[index]);
                    index++;
                    continue;
                }

                _isHave = true;
                if (matched == 1)
                {
                    result.Append(maskChar);
                }
                else
                {
                    result.Append(_replaceString.PadRight(matched, maskChar));
                }
                index += matched;
            }

            _newWord = result.ToString();
            return _newWord;
        }

        /// <summary>
        /// Registers one non-empty word in the lookup tables.
        /// </summary>
        private void AddWord(string word)
        {
            maxWordLength = Math.Max(maxWordLength, word.Length);
            minWordLength = Math.Min(minWordLength, word.Length);

            for (int i = 0; i < word.Length; i++)
            {
                // Positions 0..6 get their own bit; every later position shares bit 7.
                fastCheck[word[i]] |= (byte)(1 << Math.Min(i, 7));
            }

            if (word.Length == 1)
            {
                charCheck[word[0]] = true;
            }
            else
            {
                hash.Add(word);
            }
        }

        /// <summary>
        /// Returns the length of the shortest dictionary word that starts at
        /// <paramref name="start"/>, or 0 when no word starts there.
        /// </summary>
        private int MatchLengthAt(string text, int start)
        {
            char first = text[start];

            // Fast rejection: no dictionary word begins with this character.
            if ((fastCheck[first] & 1) == 0)
            {
                return 0;
            }

            // Single-character word.
            if (charCheck[first])
            {
                return 1;
            }

            // Multi-character words: extend the candidate one character at a time and stop as
            // soon as a character cannot occur at that position in any dictionary word.
            int longest = Math.Min(maxWordLength, text.Length - start);
            for (int length = 2; length <= longest; length++)
            {
                int offset = length - 1;
                if ((fastCheck[text[start + offset]] & (1 << Math.Min(offset, 7))) == 0)
                {
                    break;
                }

                if (length >= minWordLength && hash.Contains(text.Substring(start, length)))
                {
                    return length;
                }
            }
            return 0;
        }
    }

}

测试代码：

// Usage example: load the dictionary, filter a string and print the result.
string filePath = "F://charCheck/charCheck/badword.txt";

// Text to filter; replace with real input.
string testString = "";

// BadWordParse reads the dictionary itself (and tolerates a missing file),
// so no separate reader on the dictionary file is needed here.
BadWordParse bwp = new BadWordParse(filePath);
string parsedString = bwp.ReplaceBadWord(testString);

System.Console.WriteLine("原始字符串" + testString);
System.Console.WriteLine("替换后字符串" + parsedString);

秒客网

.Net ：脏字处理类，效率很高。。。。。

相关文章

.Net ： 脏字处理类，效率很高。。。。。

相关文章

.Net ：脏字处理类，效率很高。。。。。