#-*- coding:utf-8 -*-
import sys
# Python 2 idiom: switch the interpreter-wide default string encoding to GBK,
# which is the codec used for implicit str <-> unicode conversions.
# NOTE(review): reload(sys) is presumably here to make setdefaultencoding
# callable again after interpreter start-up -- confirm before removing.
reload(sys)
sys.setdefaultencoding("gbk")
#code:myhaspl@qq.com
#12-4.py
import numpy as np
import jieba
import copy
import re
import math
def get_cossimi(x,y):
    """Return the cosine similarity between two numeric vectors.

    Args:
        x: Sequence (or array) of numbers.
        y: Sequence (or array) of numbers with the same length as ``x``.

    Returns:
        ``dot(x, y) / (|x| * |y|)`` as a numpy float.  When either vector
        has zero norm (all zeros, or empty input) the similarity is
        undefined and NaN is returned, so callers can filter the result
        with ``math.isnan``.
    """
    # Work in floating point so large integer counts cannot overflow and the
    # arithmetic stays vectorised inside numpy.
    myx = np.asarray(x, dtype=float)
    myy = np.asarray(y, dtype=float)
    dot = np.sum(myx * myy)
    norm_product = np.sqrt(np.sum(myx * myx)) * np.sqrt(np.sum(myy * myy))
    if norm_product == 0:
        # Undefined similarity: return NaN explicitly rather than dividing by
        # zero, which would emit a numpy RuntimeWarning.
        return np.float64(np.nan)
    return dot / norm_product
# Pairwise clause similarity report.
#
# Reads the stop-word list and the sample text, splits the sample text into
# clauses, and writes the cosine similarity of every ordered clause pair to
# 'res.xls' as tab-separated lines.


def _count_words(text, stop_words):
    """Return {word: occurrences} for the jieba tokens of text, skipping stop words."""
    counts = {}
    for word in jieba.cut(text):
        # The stripped token is tested against the stop list, but the token
        # itself (unstripped) is used as the key.
        if word.strip() not in stop_words:
            counts[word] = counts.get(word, 0) + 1
    return counts


# Load the stop-word list: UTF-8 text, one word per line.
with open('stopwords.txt') as f_stop:
    f_stop_text = f_stop.read().decode('utf-8')
# A set gives O(1) membership tests; a trailing '\r' left over from CRLF line
# endings is dropped so such words still match.
f_stop_words = set(word.rstrip(u'\r') for word in f_stop_text.split(u'\n'))

# Read the whole sample text.  It is decoded explicitly as GBK, the codec the
# implicit conversion used via the default encoding set at the top of the file.
with open('testk.txt', 'r') as f1:
    sample_text = f1.read().decode('gbk')

# Split into sentences on the full stop, then into clauses on the comma.
clauses = []
for sentence in sample_text.split(u'。'):
    for clause in sentence.split(u','):
        # The length test is applied before stripping.
        if len(clause) > 1:
            clauses.append(clause.strip())

# Tokenise and count each clause once; the counts depend only on the clause
# itself, so they do not need recomputing for every pair.
clause_counts = [_count_words(clause, f_stop_words) for clause in clauses]

with open('res.xls', 'w+') as fx:
    for sample, sample_counts in zip(clauses, clause_counts):
        # The vocabulary of the sample clause defines the vector dimensions.
        vocabulary = list(sample_counts)
        sampdata = [sample_counts[word] for word in vocabulary]
        for other, other_counts in zip(clauses, clause_counts):
            # Count only words of the other clause that occur in the sample.
            test1data = [other_counts.get(word, 0) for word in vocabulary]
            # Cosine similarity between the sample and the compared clause.
            test1simi = get_cossimi(sampdata, test1data)
            # NaN results are left out of the report.
            if not math.isnan(float(test1simi)):
                line = u"%s\t%s\t%f \t\n" % (sample, other, test1simi)
                fx.write(line.encode('gbk'))