一个简单的实现
主要是通过循环和replace的方式进行敏感词的替换
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
class NaiveFilter:
    """Filter messages by replacing every known keyword.

    Very simple implementation: each stored keyword is looked up in the
    lower-cased message with ``str.replace``. Every occurrence of a
    keyword is replaced by ``repl`` once (not once per character).

    >>> f = NaiveFilter()
    >>> f.keywords.add("sexy")
    >>> f.filter("hello sexy baby")
    'hello * baby'
    """

    def __init__(self):
        # Lower-cased keywords to be masked.
        self.keywords = set()

    def parse(self, path):
        """Load keywords from a UTF-8 text file, one keyword per line.

        Surrounding whitespace is stripped and keywords are lower-cased.
        Blank lines are skipped.
        """
        with open(path, encoding="utf-8") as f:
            for line in f:
                keyword = line.strip().lower()
                # An empty keyword would make str.replace insert `repl`
                # between every character of the message.
                if keyword:
                    self.keywords.add(keyword)

    def filter(self, message, repl="*"):
        """Return the lower-cased message with keywords replaced by ``repl``.

        ``message`` may be ``str``, UTF-8 encoded ``bytes``, or any object
        convertible with ``str()``.
        """
        if isinstance(message, bytes):
            # str(b"...") would yield the "b'...'" repr, not the text.
            message = message.decode("utf-8")
        message = str(message).lower()
        for kw in self.keywords:
            # `keywords` is a public set; guard against an empty entry.
            if kw:
                message = message.replace(kw, repl)
        return message
|
使用BSFilter(倒排映射,Back Sorted Mapping)进行实现
对查找过程进行了优化:对于英文单词,直接按整词在索引字典中查找;对于其他语言,则采用逐字符查找匹配的方式。
BSFilter:先为关键词建立“词/字符 → 关键词”的倒排索引,过滤时只替换消息中出现的词或字符所对应的关键词,从而减少替换次数。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
class BSFilter:
    """Filter messages using an inverted index from tokens to keywords.

    Each keyword is indexed by its whitespace-separated words: a purely
    alphanumeric ASCII word is indexed as a whole, any other word is
    indexed by each of its characters. When filtering, only keywords
    reachable from the words/characters present in the message are
    replaced, which reduces the number of ``str.replace`` calls.

    Every occurrence of a keyword is replaced by ``repl`` once (not once
    per character), and the returned message is lower-cased.

    >>> f = BSFilter()
    >>> f.add("sexy")
    >>> f.filter("hello sexy baby")
    'hello * baby'
    """

    def __init__(self):
        # Keywords in insertion order; positions are used as ids in `bsdict`.
        self.keywords = []
        # Same keywords as a set, for duplicate detection.
        self.kwsets = set()
        # token (whole ASCII word or single character) -> keyword indexes
        self.bsdict = defaultdict(set)
        # Matches a word made only of ASCII letters and digits.
        self.pat_en = re.compile(r"^[0-9a-zA-Z]+$")

    def add(self, keyword):
        """Register a keyword (``str`` or UTF-8 ``bytes``), case-insensitively.

        Duplicate and blank keywords are ignored.
        """
        if not isinstance(keyword, str):
            keyword = keyword.decode("utf-8")
        keyword = keyword.lower()
        # A blank keyword produces no index entries and could never match.
        if not keyword.strip() or keyword in self.kwsets:
            return
        self.keywords.append(keyword)
        self.kwsets.add(keyword)
        index = len(self.keywords) - 1
        for word in keyword.split():
            if self.pat_en.search(word):
                self.bsdict[word].add(index)
            else:
                for char in word:
                    self.bsdict[char].add(index)

    def parse(self, path):
        """Load keywords from a UTF-8 text file, one keyword per line."""
        with open(path, "r", encoding="utf-8") as f:
            for keyword in f:
                self.add(keyword.strip())

    def filter(self, message, repl="*"):
        """Return the lower-cased message with indexed keywords replaced.

        ``message`` may be ``str`` or UTF-8 encoded ``bytes``.
        """
        if not isinstance(message, str):
            message = message.decode("utf-8")
        message = message.lower()
        for word in message.split():
            # Whole-word lookup for ASCII alphanumeric words, per-character
            # lookup otherwise (iterating a str yields its characters).
            tokens = (word,) if self.pat_en.search(word) else word
            for token in tokens:
                # .get() avoids defaultdict inserting an empty set for every
                # unseen token, which would grow the index on each call.
                for index in self.bsdict.get(token, ()):
                    message = message.replace(self.keywords[index], repl)
        return message
|
使用DFA(Deterministic Finite Automaton)进行实现
DFA即Deterministic Finite Automaton,也就是确定有穷自动机。
使用了嵌套的字典来实现。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
class DFAFilter:
    """Filter messages with a keyword trie built from nested dicts.

    Keywords are stored character by character in ``keyword_chains``; the
    key ``delimit`` inside a node marks the end of a keyword. Matching
    walks the trie along the message instead of trying each keyword in
    turn. At each position the shortest matching keyword wins and every
    matched character is replaced by ``repl``. The returned message is
    lower-cased.

    Keywords are expected not to contain the ``delimit`` character.

    >>> f = DFAFilter()
    >>> f.add("sexy")
    >>> f.filter("hello sexy baby")
    'hello **** baby'
    """

    def __init__(self):
        # Trie root: char -> child node (dict).
        self.keyword_chains = {}
        # Key stored inside a node to mark "a keyword ends here".
        self.delimit = '\x00'

    def add(self, keyword):
        """Insert a keyword (``str`` or UTF-8 ``bytes``) into the trie.

        The keyword is lower-cased and stripped; blank keywords are ignored.
        """
        if not isinstance(keyword, str):
            keyword = keyword.decode('utf-8')
        chars = keyword.lower().strip()
        if not chars:
            return
        level = self.keyword_chains
        for char in chars:
            # Reuse the existing child node or create a new one.
            level = level.setdefault(char, {})
        level[self.delimit] = 0

    def parse(self, path):
        """Load keywords from a UTF-8 text file, one keyword per line."""
        with open(path, encoding='UTF-8') as f:
            for keyword in f:
                self.add(keyword.strip())

    def filter(self, message, repl="*"):
        """Return the lower-cased message with keywords masked by ``repl``.

        ``message`` may be ``str`` or UTF-8 encoded ``bytes``.
        """
        if not isinstance(message, str):
            message = message.decode('utf-8')
        message = message.lower()
        pieces = []
        length = len(message)
        start = 0
        while start < length:
            level = self.keyword_chains
            matched = 0
            # Walk by index rather than slicing message[start:], which
            # would copy the remainder of the message at every position.
            for pos in range(start, length):
                node = level.get(message[pos])
                if not isinstance(node, dict):
                    break
                if self.delimit in node:
                    matched = pos - start + 1
                    break
                level = node
            if matched:
                pieces.append(repl * matched)
                start += matched
            else:
                # No keyword starts here: keep the character, move on by one.
                pieces.append(message[start])
                start += 1
        return ''.join(pieces)
|
到此,这篇关于 Python 敏感词过滤实现示例的文章就介绍到这了。更多相关 Python 敏感词过滤的内容,请搜索服务器之家以前的文章,或继续浏览下面的相关文章。希望大家以后多多支持服务器之家!
原文链接:https://juejin.cn/post/7002068513070268424