# Generate LDA topics and visualize them with pyLDAvis (pyLDAvis生成LDA主题并可视化)
# Prepare the data: one document per line, tokens separated by whitespace.
# Every backslash in the path is escaped; the previous "\d" was an invalid
# escape sequence that only worked because Python leaves unknown escapes as-is.
PATH = "C:\\Users\\mat\\Desktop\\data\\各阶段关键短语抽取\\stage1_关键短语.txt"

# Read the file inside a context manager so the handle is closed promptly.
with open(PATH, encoding='utf-8', errors='ignore') as corpus_file:
    file_object2 = corpus_file.read().split('\n')  # raw lines of the file

# Tokenised documents: each line becomes a list of its whitespace-separated
# tokens. Blank lines (e.g. after a trailing newline) become empty lists.
data_set = [line.split() for line in file_object2]

# Map every token to an integer id, then encode each document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = corpora.Dictionary(data_set)
corpus = [dictionary.doc2bow(text) for text in data_set]
# Compute the perplexity score
def perplexity(num_topics):
    """Train an LDA model with ``num_topics`` topics and score it on the corpus.

    Relies on the module-level ``corpus`` and ``dictionary`` and trains for
    30 passes.

    NOTE: no ``random_state`` is passed here (unlike ``coherence``), so the
    result may differ between runs.

    Args:
        num_topics: Number of topics handed to ``LdaModel``.

    Returns:
        Whatever ``LdaModel.log_perplexity`` reports for the trained model,
        evaluated on the same ``corpus`` it was trained on.
    """
    model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=30,
    )
    score = model.log_perplexity(corpus)
    return score
# Compute the coherence score
def coherence(num_topics):
    """Train an LDA model with ``num_topics`` topics and return its coherence.

    Relies on the module-level ``corpus``, ``dictionary`` and ``data_set``.
    The model is trained for 30 passes with ``random_state=1``.

    Args:
        num_topics: Number of topics handed to ``LdaModel``.

    Returns:
        The ``'c_v'`` coherence value reported by ``CoherenceModel`` for the
        trained model, computed against the tokenised texts in ``data_set``.
    """
    model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=30,
        random_state=1,
    )
    scorer = CoherenceModel(
        model=model,
        texts=data_set,
        dictionary=dictionary,
        coherence='c_v',
    )
    return scorer.get_coherence()
# Plot the coherence score against the number of topics.

# Configure fonts before drawing so the Chinese labels render correctly.
# 'font.sans-serif' is the valid rc key; the previous '-serif' key is rejected
# by matplotlib with a KeyError. plt.rcParams is the same object as
# matplotlib.rcParams, so both settings go through it.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Render the minus sign with an ASCII hyphen; SimHei lacks the Unicode glyph.
plt.rcParams['axes.unicode_minus'] = False

# Candidate topic counts: 1 through 14.
x = range(1, 15)
# To plot perplexity instead, use: [perplexity(i) for i in x]
y = [coherence(i) for i in x]

plt.plot(x, y)
plt.xlabel('主题数目')
plt.ylabel('coherence大小')
plt.title('主题-coherence变化情况')
plt.show()