Java DFA 敏感词过滤

时间:2022-10-12 15:20:55

DFA算法介绍见这篇博文:
http://blog.csdn.net/chenssy/article/details/26961957
直接贴代码:

/**
 * Sensitive-word filter backed by a character trie (the "DFA" approach).
 *
 * <p>The trie is a nested {@code Map<String, Object>}: each single-character key maps to the
 * child node, and every child node carries an {@code "isEnd"} entry whose value is {@code "1"}
 * when a sensitive word ends at that node and {@code "0"} otherwise.
 *
 * <p>Matching is "shortest match per start position": for every start index the scan stops at
 * the first node flagged as a word end. Matches that begin at different positions may overlap
 * and are all reported.
 */
public class SensitiveWordUtils
{

    /** Match mode: return as soon as the first sensitive word is found. */
    public static final int MATCHTYPE_MIN = 1 << 0;

    /** Match mode: scan the whole input and report every sensitive word found. */
    public static final int MATCHTYPE_ALL = 1 << 1;

    /** Key of the end-of-word flag stored in every non-root trie node. */
    private static final String END_KEY = "isEnd";

    /** Flag value: a sensitive word ends at this node. */
    private static final String END = "1";

    /** Flag value: no sensitive word ends at this node. */
    private static final String END_NOT = "0";

    /** The dictionary of sensitive words. Must be initialized before the trie below. */
    private static final Set<String> sensitiveWordSets = defaultWords();

    /** Root of the trie built once from {@link #sensitiveWordSets}. */
    private static final Map<String, Object> sensitiveWordMap = buildWordMap(sensitiveWordSets);

    /** Creates the built-in dictionary as a plain {@code HashSet} (no anonymous subclass). */
    private static Set<String> defaultWords()
    {
        Set<String> words = new HashSet<>();
        words.add("日本人");
        words.add("本来啊");
        return words;
    }

    /** Builds the trie for the given words and returns its root node. */
    private static Map<String, Object> buildWordMap(Set<String> words)
    {
        Map<String, Object> root = new HashMap<>();
        for (String word : words)
        {
            Map<String, Object> node = root;
            for (int i = 0; i < word.length(); i++)
            {
                char c = word.charAt(i);
                Map<String, Object> next = child(node, c);
                if (next == null)
                {
                    next = new HashMap<>();
                    next.put(END_KEY, END_NOT); // not (yet) the last character of a word
                    node.put(String.valueOf(c), next);
                }
                node = next;
            }
            // An empty word never leaves the root; the root itself carries no flag.
            if (node != root)
            {
                node.put(END_KEY, END); // last character of the word
            }
        }
        return root;
    }

    /**
     * Returns the child of {@code node} reached through character {@code c}, or {@code null}.
     *
     * <p>The cast is safe: single-character keys only ever map to child nodes, and the only
     * non-node value in a node is stored under the five-character key {@code "isEnd"}.
     */
    @SuppressWarnings("unchecked")
    private static Map<String, Object> child(Map<String, Object> node, char c)
    {
        return (Map<String, Object>)node.get(String.valueOf(c));
    }

    /**
     * Finds the shortest sensitive word that starts exactly at {@code start}.
     *
     * @return the exclusive end index of that word, or {@code -1} if none starts there
     */
    private static int matchEnd(String str, int start)
    {
        Map<String, Object> node = sensitiveWordMap;
        for (int j = start; j < str.length(); j++)
        {
            node = child(node, str.charAt(j));
            if (node == null)
            {
                return -1;
            }
            if (END.equals(node.get(END_KEY)))
            {
                return j + 1;
            }
        }
        return -1;
    }

    /**
     * Counts the sensitive words found in {@code str}.
     *
     * @param str the text to scan; {@code null} is treated as containing no sensitive words
     * @param beginIndex index at which scanning starts; negative values are treated as 0, and a
     *        value at or beyond {@code str.length()} yields 0
     * @param matchType {@link #MATCHTYPE_MIN} to return 1 as soon as one word is found; any
     *        other value (normally {@link #MATCHTYPE_ALL}) counts one match per start position
     *        at which a sensitive word begins
     * @return the number of matches found under the given mode
     */
    public static int checkSensitiveWord(String str, int beginIndex, int matchType)
    {
        if (str == null)
        {
            return 0;
        }
        int count = 0;
        for (int i = Math.max(beginIndex, 0); i < str.length(); i++)
        {
            if (matchEnd(str, i) < 0)
            {
                continue;
            }
            count++;
            if (matchType == MATCHTYPE_MIN)
            {
                return count;
            }
        }
        return count;
    }

    /**
     * Collects the sensitive words found in {@code str}.
     *
     * @param str the text to scan; {@code null} is treated as containing no sensitive words
     * @param beginIndex index at which scanning starts; negative values are treated as 0, and a
     *        value at or beyond {@code str.length()} yields an empty set
     * @param matchType {@link #MATCHTYPE_MIN} to return only the first word found; any other
     *        value (normally {@link #MATCHTYPE_ALL}) collects every distinct word found
     * @return a new mutable set with the matched words; empty if there are none
     */
    public static Set<String> firstSensitiveWord(String str, int beginIndex, int matchType)
    {
        Set<String> found = new HashSet<>();
        if (str == null)
        {
            return found;
        }
        for (int i = Math.max(beginIndex, 0); i < str.length(); i++)
        {
            int end = matchEnd(str, i);
            if (end < 0)
            {
                continue;
            }
            found.add(str.substring(i, end));
            if (matchType == MATCHTYPE_MIN)
            {
                return found;
            }
        }
        return found;
    }

    /** Demo entry point: prints the number of sensitive words in a sample sentence. */
    public static void main(String[] args)
    {
        System.out.println(checkSensitiveWord("日本来啊日本人", 0, MATCHTYPE_ALL));
    }
}

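sensitiveWordMap 的结构示例:若敏感词库为「日本人、日本男人、法轮功」,则 sensitiveWordMap 为 {日={本={男={人={isEnd=1}, isEnd=0}, 人={isEnd=1}, isEnd=0}, isEnd=0}, 法={isEnd=0, 轮={isEnd=0, 功={isEnd=1}}}}。上面代码中的词库是「日本人、本来啊」,对应的 sensitiveWordMap 为 {日={本={人={isEnd=1}, isEnd=0}, isEnd=0}, 本={来={啊={isEnd=1}, isEnd=0}, isEnd=0}}。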