统计每篇文章重要的词作为这篇文章的关键词,用 TF-IDF 来实现。生产中有很多第三方包可以调用,这里记录原理,顺便熟练 Python。
1、公式:
计算词频 TF:$TF = $ 某词在文章中的出现次数。
考虑到文章有长短之分,为了便于不同文章的比较,进行"词频"标准化:$TF = \dfrac{\text{某词出现次数}}{\text{文章总词数}}$,
或者 $TF = \dfrac{\text{某词出现次数}}{\text{文章中出现次数最多的词的出现次数}}$(下面代码采用这种)。
计算逆文档频率 IDF:$IDF = \log\dfrac{\text{语料库文档总数}}{\text{包含该词的文档数} + 1}$。
import os
import math
import operator
filepath = 'H:/data/allfiles/allfiles'
doc_word = dict()  # {filename: {word: raw occurrence count in that document}}
i = 0              # total number of documents seen
# Count per-document word frequencies and the number of documents.
for filename in os.listdir(filepath):
    with open(filepath + '/' + filename, 'r', encoding='utf-8') as f:
        freq_word = dict()
        for line in f:
            words = line.strip().split(' ')
            for word in words:
                # Skip empty tokens produced by blank lines or runs of
                # spaces. The original guard `len(words) == ''` compared
                # an int to a str, was always False, and so '' itself got
                # counted as a word.
                if not word:
                    continue
                freq_word[word] = freq_word.get(word, 0) + 1
        doc_word[filename] = freq_word
        i += 1
# Inverse document frequency.
doc_nums = float(i)  # corpus size N as float, so the division below is real-valued
doc_freq = dict()    # word -> document frequency, then overwritten with idf
# Document frequency: in how many documents does each word appear?
# (Each word is counted once per document because freq_word keys are unique.)
for freq_word in doc_word.values():
    for word in freq_word:
        doc_freq[word] = doc_freq.get(word, 0) + 1
# Convert df into idf = log(N / (df + 1)); the +1 smooths the ratio and
# avoids idf = 0 for words appearing in every document. Mutating values
# while iterating is safe here since no keys are added or removed.
for word in doc_freq:
    doc_freq[word] = math.log(doc_nums / (doc_freq[word] + 1))
# TF-IDF: normalise each raw count by the document's maximum count
# (augmented-frequency TF), then weight by idf. Scores replace the raw
# counts inside doc_word in place, and each document's scores are printed.
for filename in doc_word:
    counts = doc_word[filename]
    if not counts:
        # Empty document: no maximum count exists; the original
        # `word_sorted[0][1]` would have raised IndexError here.
        print(counts)
        continue
    # max() is O(n); the original sorted the whole dict (O(n log n))
    # only to read its first element.
    max_count = float(max(counts.values()))
    for word in counts:
        counts[word] = counts[word] * doc_freq[word] / max_count
    print(counts)