Word Cloud Analysis: Word Cloud Function Code and Analysis for Journey to the West (西游记)
# Wrap the word cloud analysis in a function
# Install the third-party libraries (uncomment when running in a notebook)
# !pip install jieba
# !pip install wordcloud
# Import the third-party libraries
import jieba
from wordcloud import WordCloud
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render figures inline in the notebook
%matplotlib inline
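Before defining the function, it helps to see what jieba.cut actually yields: a generator of tokens in which single characters (particles, pronouns, and the like) are common and carry little weight in a word cloud, which is why the function below filters them out. A minimal sketch, using an illustrative sentence of my own rather than the novel's text:

import jieba

# Illustrative sentence (not from the novel); jieba.cut returns a
# generator, so wrap it in list() to inspect the tokens.
sample = "孙悟空保护唐僧西天取经"
print(list(jieba.cut(sample)))
# The output typically mixes multi-character words (e.g. 孙悟空, 取经)
# with single characters; the length-1 tokens are what the
# len(seg.strip()) > 1 check in the function below discards.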
def Ciyunfenxi():
    # Read the novel text
    with open('西游记.txt', 'r', encoding='utf-8') as f:
        txt = f.read()
    # print(txt)  # dump the raw text

    # Read the Chinese stop-word list
    st_word = []
    with open('中文停用词表.txt', 'r', encoding='utf-8') as f1:
        for line in f1:
            st_word.append(line.strip())
    st_words = np.array(st_word)
    # print(st_words)  # dump the stop-word list

    # Segment the text
    segments = []
    segs = jieba.cut(txt)
    for seg in segs:
        if len(seg.strip()) > 1:  # drop single-character tokens
            segments.append(seg)

    # Collect the segmentation results in a DataFrame
    segmentDF = pd.DataFrame({'segment': segments})

    # Word-frequency statistics
    word_FR = segmentDF['segment'].value_counts()
    # print(word_FR.index)  # dump the frequency index
    # word_FR["我们"]       # look up the count of a single word

    # Remove stop words (a vectorized filter: equivalent to dropping
    # each stop word one by one, but much faster)
    word_FR = word_FR[~word_FR.index.isin(st_words)]
    # print(word_FR.head(200))  # dump the 200 most frequent words

    # Write the top-200 word frequencies for 西游记 to a file
    with open('西游记_词频.txt', 'w', encoding='utf-8') as f:
        for i in range(200):
            f.write(word_FR.index[i] + " " + str(word_FR.iloc[i]) + "\n")

    # Generate the word cloud, display it, and save it as an image.
    # font_path must point to a font with CJK glyphs or the Chinese
    # words will render as boxes; 'simhei.ttf' is a common choice on
    # Windows, so adjust the path for your system.
    wc = WordCloud(background_color="white", font_path="simhei.ttf",
                   height=700, width=1000)
    wordcloud = wc.fit_words(word_FR.to_dict())  # fit_words expects word -> count
    plt.imshow(wordcloud)
    plt.axis('off')
    wordcloud.to_file('词云图.png')

Ciyunfenxi()
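Because the top-200 frequencies are saved to 西游记_词频.txt, the cloud can later be regenerated without re-reading and re-segmenting the whole novel. A minimal sketch, assuming the word-space-count format written above; the output name 词云图2.png is hypothetical, chosen so the original image is not overwritten:

from wordcloud import WordCloud

# Rebuild the word cloud from the saved frequency file, skipping the
# slow jieba segmentation pass.
freqs = {}
with open('西游记_词频.txt', 'r', encoding='utf-8') as f:
    for line in f:
        word, count = line.rsplit(' ', 1)  # each line: "<word> <count>"
        freqs[word] = int(count)

# As above, font_path assumes a CJK-capable font; adjust for your system.
wc = WordCloud(background_color='white', font_path='simhei.ttf',
               height=700, width=1000)
wc.generate_from_frequencies(freqs)
wc.to_file('词云图2.png')  # hypothetical name, to avoid overwriting 词云图.png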