Java敏感词过滤
一、初始化敏感词库
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * Loads the sensitive-word dictionary and turns it into a nested-map model
 * (a character trie used as a DFA) that a filter can walk one character at a time.
 *
 * <p>Every node is a {@code Map} holding two kinds of entries:
 * <ul>
 *   <li>{@code Character -> Map}: the child node reached by that character;</li>
 *   <li>{@code "isEnd" -> "1" | "0"}: whether a dictionary word ends at this node.</li>
 * </ul>
 * The root map itself carries no {@code "isEnd"} entry.
 */
public class SensitiveWordInit {
    /**
     * Default dictionary location, one word per line.
     * Word list source: https://github.com/heqiyoujing/config_file
     */
    private static final String DEFAULT_WORD_FILE = "D:\\SensitiveWord.txt";

    /** Character encoding used to decode the dictionary file. */
    private static final String ENCODING = "utf-8";

    /** Path of the dictionary file read by {@link #initKeyWord()}. */
    private final String wordFilePath;

    /** Root of the nested-map model; {@code null} until one of the init methods has run. */
    public HashMap sensitiveWordMap;

    /** Creates an initializer that reads the default dictionary file. */
    public SensitiveWordInit() {
        this(DEFAULT_WORD_FILE);
    }

    /**
     * Creates an initializer that reads the dictionary from the given file.
     *
     * @param wordFilePath path of a text file with one word per line
     */
    public SensitiveWordInit(String wordFilePath) {
        super();
        this.wordFilePath = wordFilePath;
    }

    /**
     * Reads the dictionary file and builds the model.
     *
     * <p>If the file cannot be read the stack trace is printed, as before, but the
     * method no longer returns {@code null}: callers receive an empty model (or the
     * model of an earlier successful initialisation) and therefore cannot hit a
     * {@code NullPointerException} on the result.
     *
     * <p>In a Spring web application the returned map could be published with
     * {@code application.setAttribute("sensitiveWordMap", sensitiveWordMap)}.
     *
     * @return the root of the nested-map model, never {@code null}
     */
    public Map initKeyWord() {
        try {
            Set<String> keyWordSet = readSensitiveWordFile();
            addSensitiveWordToHashMap(keyWordSet);
        } catch (Exception e) {
            e.printStackTrace();
            if (sensitiveWordMap == null) {
                sensitiveWordMap = new HashMap();
            }
        }
        return sensitiveWordMap;
    }

    /**
     * Builds the model from an in-memory word set instead of the dictionary file.
     *
     * @param keyWordSet the words to index; {@code null} is treated as an empty set
     * @return the root of the nested-map model, never {@code null}
     */
    public Map initKeyWord(Set<String> keyWordSet) {
        addSensitiveWordToHashMap(keyWordSet != null ? keyWordSet : new HashSet<String>());
        return sensitiveWordMap;
    }

    /**
     * Inserts every word into a fresh nested-map model and stores it in
     * {@link #sensitiveWordMap}.
     *
     * <p>For the words {@code "he"}, {@code "hers"} and {@code "java"} the result is:
     * <pre>
     * h = { isEnd = 0
     *       e = { isEnd = 1
     *             r = { isEnd = 0
     *                   s = { isEnd = 1 } } } }
     * j = { isEnd = 0
     *       a = { isEnd = 0
     *             v = { isEnd = 0
     *                   a = { isEnd = 1 } } } }
     * </pre>
     *
     * @param keyWordSet the words to index; {@code null} elements are skipped
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private void addSensitiveWordToHashMap(Set<String> keyWordSet) {
        // Pre-size the root by the word count to limit rehashing.
        HashMap<Object, Object> root = new HashMap<Object, Object>(keyWordSet.size());
        for (String key : keyWordSet) {
            if (key == null) {
                continue;
            }
            Map<Object, Object> nowMap = root;
            for (int i = 0; i < key.length(); i++) {
                Character keyChar = Character.valueOf(key.charAt(i));
                // Nodes mix Character and String keys, hence Map<Object, Object>.
                Map<Object, Object> child = (Map<Object, Object>) nowMap.get(keyChar);
                if (child == null) {
                    // New node: not a word end until some word terminates here.
                    child = new HashMap<Object, Object>();
                    child.put("isEnd", "0");
                    nowMap.put(keyChar, child);
                }
                nowMap = child;
                if (i == key.length() - 1) {
                    // Last character of the word: mark the node as a word end.
                    nowMap.put("isEnd", "1");
                }
            }
        }
        sensitiveWordMap = root;
    }

    /**
     * Reads the dictionary file into a set, one word per line.
     *
     * <p>Lines are trimmed, blank lines are skipped and a leading UTF-8 byte-order
     * mark is removed, so that stray whitespace or a BOM cannot produce entries
     * that never match.
     *
     * @return the words found in the file
     * @throws Exception if the file does not exist, is not a regular file, or cannot be read
     */
    private Set<String> readSensitiveWordFile() throws Exception {
        File file = new File(wordFilePath);
        // Check before opening: opening a missing file would throw FileNotFoundException
        // first and make this explicit error unreachable. isFile() implies exists().
        if (!file.isFile()) {
            throw new Exception("敏感词库文件不存在");
        }
        Set<String> set = new HashSet<String>();
        BufferedReader bufferedReader =
                new BufferedReader(new InputStreamReader(new FileInputStream(file), ENCODING));
        try {
            boolean firstLine = true;
            String txt;
            while ((txt = bufferedReader.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    if (txt.length() > 0 && txt.charAt(0) == '\uFEFF') {
                        txt = txt.substring(1); // drop the byte-order mark
                    }
                }
                txt = txt.trim();
                if (txt.length() > 0) {
                    set.add(txt);
                }
            }
        } finally {
            bufferedReader.close(); // also closes the underlying file stream
        }
        return set;
    }
}
二、检查敏感词并替换
1 import java.util.HashSet; 2 import java.util.Iterator; 3 import java.util.Map; 4 import java.util.Set; 5 6 /** 7 * 敏感词过滤 8 */ 9 public class SensitivewordFilter { 10 private Map sensitiveWordMap = null; 11 public static int minMatchTYpe = 1; //最小匹配规则 12 public static int maxMatchType = 2; //最大匹配规则 13 private static String replaceString = null; 14 /**例如:敏感词中含有中国人、中国 15 * 最小匹配规则minMatchTYpe为1时,会匹配出**人,为2时,会匹配出*** 16 * */ 17 public static void main(String[] args) throws Exception{ 18 SensitivewordFilter filter = new SensitivewordFilter(); 19 System.out.println("敏感词的数量:" + filter.sensitiveWordMap.size()); 20 String string = "dfa是面向三级装配的设计(Design for assembly)的英文简称,是指在产品设计阶段设计产品使得产品具有良好" + 21 "的可装配性,确保装配工序简单、装配效率高、装配质量高、装配不良率低和装配成本低。面向装配的设计通过一系" + 22 "列有利于装配的设计指南例如简化产品设计、减少零件数量等,女女并同装配工程师一起合作,被逼简化产品结构,近親使其便于" + 23 "装配,为提高产品质量、缩短产品开发周期和降低产品成本奠定基础"; 24 // ------获取敏感词--------- 25 Set<String> set = filter.getSensitiveWord(string, 1); 26 System.out.println("含敏感词的个数为:" + set.size() + "。包含:" + set); 27 // ------------------------替换敏感字begin---------------------- 28 Iterator<String> iterator = set.iterator(); 29 String word = null; 30 while (iterator.hasNext()) { 31 word = iterator.next(); 32 /** 33 * 得到word中敏感关键词被替换后的字符串,例如:*** 34 * */ 35 getReplaceCharsS("*", word.length()); 36 /** 37 * 将原字符串中的敏感关键词替换成带有replaceChar 38 * 或全部为replaceChar的关键词 39 * */ 40 string = string.replaceAll(word, replaceString); 41 } 42 // ------------------------替换敏感字end---------------------- 43 System.out.println(string); 44 } 45 46 /** 47 * 构造函数,初始化敏感词库 48 */ 49 public SensitivewordFilter(){ 50 sensitiveWordMap = new SensitiveWordInit().initKeyWord(); 51 } 52 53 /** 54 * 判断文字是否包含敏感字符 55 * @param matchType 匹配规则 1:最小匹配规则,2:最大匹配规则 56 */ 57 public boolean isContaintSensitiveWord(String txt,int matchType){ 58 boolean flag = false; 59 for(int i = 0 ; i < txt.length() ; i++){ 60 int matchFlag = this.CheckSensitiveWord(txt, i, matchType); //判断是否包含敏感字符 61 if(matchFlag > 0){ //大于0存在,返回true 62 flag = true; 63 } 
64 } 65 return flag; 66 } 67 68 /** 69 * 获取文字中的敏感词 70 * @param matchType 匹配规则 1:最小匹配规则,2:最大匹配规则 71 */ 72 public Set<String> getSensitiveWord(String txt , int matchType){ 73 Set<String> sensitiveWordList = new HashSet<String>(); 74 75 for(int i = 0 ; i < txt.length() ; i++){ 76 int length = CheckSensitiveWord(txt, i, matchType); //判断是否包含敏感字符 77 if(length > 0){ //存在,加入list中 78 sensitiveWordList.add(txt.substring(i, i+length)); 79 i = i + length - 1; //减1的原因,是因为for会自增 80 } 81 } 82 83 return sensitiveWordList; 84 } 85 86 /** 87 * 替换敏感字字符,默认* 88 */ 89 public String replaceSensitiveWord(String txt,int matchType,String replaceChar){ 90 String resultTxt = txt; 91 Set<String> set = getSensitiveWord(txt, matchType); //获取所有的敏感词 92 Iterator<String> iterator = set.iterator(); 93 String word = null; 94 String replaceString = null; 95 while (iterator.hasNext()) { 96 word = iterator.next(); 97 replaceString = getReplaceChars(replaceChar, word.length()); 98 resultTxt = resultTxt.replaceAll(word, replaceString); 99 } 100 101 return resultTxt; 102 } 103 104 /** 105 * 获取替换字符串 106 */ 107 private String getReplaceChars(String replaceChar,int length){ 108 String resultReplace = replaceChar; 109 for(int i = 1 ; i < length ; i++){ 110 resultReplace += replaceChar; 111 } 112 113 return resultReplace; 114 } 115 116 /** 117 * 获取替换字符串,无返回值 118 */ 119 private static void getReplaceCharsS(String replaceChar,int length){ 120 replaceString = ""; 121 String resultReplace = replaceChar; 122 for(int i = 1 ; i < length ; i++){ 123 resultReplace += replaceChar; 124 } 125 replaceString = resultReplace; 126 } 127 128 /** 129 * 检查文字中是否包含敏感字符,检查规则如下:<br> 130 */ 131 @SuppressWarnings({ "rawtypes"}) 132 public int CheckSensitiveWord(String txt,int beginIndex,int matchType){ 133 boolean flag = false; //敏感词结束标识位:用于敏感词只有1位的情况 134 int matchFlag = 0; //匹配标识数默认为0 135 char word = 0; 136 Map nowMap = sensitiveWordMap; 137 for(int i = beginIndex; i < txt.length() ; i++){ 138 word = txt.charAt(i); 139 nowMap = (Map) 
nowMap.get(word); //获取指定key 140 if(nowMap != null){ //存在,则判断是否为最后一个 141 matchFlag++; //找到相应key,匹配标识+1 142 if("1".equals(nowMap.get("isEnd"))){ //如果为最后一个匹配规则,结束循环,返回匹配标识数 143 flag = true; //结束标志位为true 144 if(SensitivewordFilter.minMatchTYpe == matchType){ //最小规则,直接返回,最大规则还需继续查找 145 break; 146 } 147 } 148 } 149 else{ //不存在,直接返回 150 break; 151 } 152 } 153 if(matchFlag < 2 || !flag){ //长度必须大于等于1,为词 154 matchFlag = 0; 155 } 156 return matchFlag; 157 } 158 159 }
三、运行结果