pyLDAvis生成LDA主题并可视化

时间:2025-04-09 07:34:20
"""Choose the number of LDA topics by plotting c_v coherence per topic count.

The script reads a pre-tokenised text file (one document per line, tokens
separated by whitespace), builds a gensim dictionary and bag-of-words corpus,
trains one LDA model per candidate topic count and plots the coherence curve.
"""

# Imports this script relies on; repeating them is harmless if the
# surrounding file already imports the same names.
import matplotlib
import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel

# Input data. A raw string is used because the original literal contained the
# invalid escape sequence "\d"; the resulting path is identical.
PATH = r"C:\Users\mat\Desktop\data\各阶段关键短语抽取\stage1_关键短语.txt"

# Training settings shared by every model so that scores are comparable.
_PASSES = 30
_RANDOM_STATE = 1


def _train_lda(num_topics):
    """Train an LDA model with ``num_topics`` topics on the module-level corpus."""
    return LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=_PASSES,
        random_state=_RANDOM_STATE,
    )


def perplexity(num_topics):
    """Return ``log_perplexity`` of a freshly trained model on the corpus.

    Reads the module-level ``corpus`` and ``dictionary``.

    Args:
        num_topics: Number of topics to train the model with.

    Returns:
        The value of ``LdaModel.log_perplexity(corpus)``. Per the gensim
        documentation this is a per-word likelihood bound, not the
        perplexity itself.
    """
    return _train_lda(num_topics).log_perplexity(corpus)


def coherence(num_topics):
    """Return the c_v coherence of a freshly trained model.

    Reads the module-level ``corpus``, ``dictionary`` and ``data_set``.

    Args:
        num_topics: Number of topics to train the model with.

    Returns:
        The score reported by ``CoherenceModel.get_coherence()``.
    """
    ldamodel = _train_lda(num_topics)
    ldacm = CoherenceModel(
        model=ldamodel,
        texts=data_set,
        dictionary=dictionary,
        coherence='c_v',
    )
    return ldacm.get_coherence()


# The guard keeps the expensive work from re-running when this file is
# re-imported. NOTE(review): CoherenceModel can start worker processes, and on
# Windows those re-import the main module, so the guard is required there.
# Names assigned below are still module globals when run as a script.
if __name__ == "__main__":
    # Prepare the data: read the file line by line and close it afterwards.
    with open(PATH, encoding='utf-8', errors='ignore') as source_file:
        file_object2 = source_file.read().split('\n')

    # One token list per document. Blank lines (typically the trailing
    # newline) would become empty documents, so they are dropped.
    data_set = [
        tokens for tokens in (line.split() for line in file_object2) if tokens
    ]
    if not data_set:
        raise ValueError(f"no non-empty documents found in {PATH!r}")

    dictionary = corpora.Dictionary(data_set)
    # Build the document-term matrix.
    corpus = [dictionary.doc2bow(text) for text in data_set]

    # Configure fonts before any text is created so the Chinese labels render.
    # The original key '-serif' is not a valid rcParams entry.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    # Plot coherence against the number of topics (1 to 14 inclusive).
    x = range(1, 15)
    # z = [perplexity(i) for i in x]
    y = [coherence(i) for i in x]
    plt.plot(x, y)
    plt.xlabel('主题数目')
    plt.ylabel('coherence大小')
    plt.title('主题-coherence变化情况')
    plt.show()