Let's go straight to the code:
import re
import codecs
import jieba
import pandas as pd

def simplification_text(xianbingshi):
    """Extract the text between <b> and <e> markers from each line of the source file."""
    xianbingshi_simplification = []
    with codecs.open(xianbingshi, 'r', 'utf8') as f:
        for line in f:
            line = line.strip()
            line_write = re.findall(r'(?<=<b>).*?(?=<e>)', line)
            for line in line_write:
                xianbingshi_simplification.append(line)
    with codecs.open(r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write.txt', 'w', 'utf8') as f:
        for line in xianbingshi_simplification:
            f.write(line + '\n')

def jieba_text():
    """Segment the extracted text with jieba, deduplicate the words, and write them out."""
    word_list = []
    data = open(r"C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\xianbingshi_write.txt", encoding='utf-8').read()
    seg_list = jieba.cut(data, cut_all=False)  # accurate mode
    for i in seg_list:
        word_list.append(i.strip())
    # deduplicate while preserving first-seen order
    data_quchong = pd.DataFrame({'a': word_list})
    data_quchong.drop_duplicates(subset=['a'], keep='first', inplace=True)
    word_list = data_quchong['a'].tolist()
    with codecs.open('word.txt', 'w', 'utf8') as w:
        for line in word_list:
            w.write(line + '\n')

def word_messy(word):
    """Refine the word list: drop purely numeric/alphanumeric tokens, then sort."""
    word_sub_list = []
    with codecs.open(word, 'r', 'utf8') as f:
        for line in f:
            line_sub = re.sub(r"^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?", '', line)
            word_sub_list.append(line_sub)
    word_sub_list.sort()
    with codecs.open('word.txt', 'w', 'utf8') as w:
        for line in word_sub_list:
            w.write(line.strip("\n") + '\n')

if __name__ == '__main__':
    xianbingshi = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\xianbingshi_sub_sen_all(1).txt'
    # simplification_text(xianbingshi)
    # word = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\word.txt'
    simplification_text(xianbingshi)
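A quick way to sanity-check the two core tricks in this script, the lookbehind/lookahead extraction and the pandas deduplication, is to run them on a toy input. This is only a sketch; the sample string and words below are made up for illustration:

import re
import pandas as pd

# hypothetical input line with two <b>...<e> marked spans
sample = "xx<b>患者三天前出现发热<e>yy<b>伴有咳嗽<e>zz"
print(re.findall(r'(?<=<b>).*?(?=<e>)', sample))
# -> ['患者三天前出现发热', '伴有咳嗽']

# drop_duplicates(keep='first') removes repeats but preserves first-seen order
words = ['发热', '咳嗽', '发热', '头痛']
df = pd.DataFrame({'a': words}).drop_duplicates(subset=['a'], keep='first')
print(df['a'].tolist())  # -> ['发热', '咳嗽', '头痛']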
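It is also worth seeing what the filter in word_messy actually drops. A small test with made-up tokens (only the last one is kept, since the pattern removes decimals, pure digit strings, and ASCII-alphanumeric tokens):

import re

pattern = r"^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?"
for token in ["3.14", "CT2020", "120", "发热"]:  # hypothetical sample tokens
    print(repr(re.sub(pattern, '', token)))
# -> '', '', '', '发热'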
Addendum: jieba word segmentation in Python, using re to strip punctuation.
Again, straight to the code:
import re
import jieba

# build the stopwords into a dictionary
stopwords = {}
fstop = open('stop_words.txt', 'r', encoding='utf-8', errors='ignore')
for eachWord in fstop:
    stopwords[eachWord.strip()] = eachWord.strip()  # stopword dictionary
fstop.close()

f1 = open('all.txt', 'r', encoding='utf-8', errors='ignore')
f2 = open('allutf11.txt', 'w', encoding='utf-8')
line = f1.readline()
while line:
    line = line.strip()  # strip leading/trailing whitespace
    # strip ASCII and full-width punctuation (the hyphen is escaped so it is a
    # literal character, not an accidental character range)
    line = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ", line)
    seg_list = jieba.cut(line, cut_all=False)  # jieba segmentation
    outStr = ""
    for word in seg_list:
        if word not in stopwords:
            outStr += word
            outStr += " "
    f2.write(outStr)
    line = f1.readline()
f1.close()
f2.close()
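As a minimal sketch of what the filtering loop produces, assuming a tiny in-memory stopword set (the sentence and stopwords here are made up; the real script reads them from stop_words.txt):

import jieba

stopwords = {'的', '了'}  # hypothetical stopword set
line = "我爱北京的天安门"
outStr = " ".join(w for w in jieba.cut(line, cut_all=False) if w not in stopwords)
print(outStr)  # typically: 我 爱 北京 天安门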
The above reflects my personal experience; I hope it gives everyone a useful reference, and I hope you will keep supporting 服务器之家.
Original article: https://www.cnblogs.com/yiwoqu/p/11542002.html