python 对一篇文章,按逗号和句号分成一句一句的,然后在这篇文章中找到句子之间类似的情况(包含相同的词),写入文件

时间:2021-07-29 12:23:40
#-*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("gbk")
#code:myhaspl@qq.com
#12-4.py

import numpy as np
import jieba
import copy
import re
import math

def get_cossimi(x,y):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        x: sequence of numbers (e.g. term counts).
        y: sequence of numbers with the same length as ``x``.

    Returns:
        ``sum(x*y) / (|x| * |y|)`` as a float.  When either vector has zero
        norm (all zeros, or empty) the similarity is undefined and NaN is
        returned, so callers can filter it with ``math.isnan``.
    """
    # Work in floating point so the products cannot wrap around in a
    # fixed-width integer dtype.
    myx = np.asarray(x, dtype=float)
    myy = np.asarray(y, dtype=float)
    norm_product = np.sqrt(np.sum(myx * myx)) * np.sqrt(np.sum(myy * myy))
    # Guard the 0/0 case explicitly instead of relying on numpy's
    # warn-and-return-NaN behaviour for an invalid division.
    if norm_product == 0:
        return float('nan')
    return float(np.sum(myx * myy) / norm_product)

# Load the stop-word list: stopwords.txt is decoded as UTF-8 and holds one
# entry per line.
with open('stopwords.txt') as f_stop:
    f_stop_text = unicode(f_stop.read(), 'utf-8')
# Strip every entry: tokens are compared as myword.strip(), so an entry that
# keeps a trailing '\r' (CRLF file) or stray spaces could never match.
# split('\n') is kept on purpose (not splitlines()) so that blank or trailing
# lines still produce '' entries, which is what filters whitespace-only tokens.
f_stop_seg_list = [word.strip() for word in f_stop_text.split('\n')]


# Split the article into clauses: first on the Chinese full stop, then on
# commas.  The resulting clauses are shared by list8 and lines.
list8=[]
# The file is closed as soon as it has been read; f1 stays bound, so a later
# f1.close() is a harmless no-op.
with open('testk.txt','r') as f1:
    lines3 = f1.read()  # read the whole article at once

# NOTE(review): under Python 2 lines3 is a byte string and splitting it with a
# unicode separator decodes it with the process default encoding - confirm
# that testk.txt is stored in that encoding.
lines2=lines3.split(u'。')  # split on full stops

for q in lines2:
    # Chinese text normally uses the full-width comma (U+FF0C); fold it into
    # the ASCII comma so both forms act as clause separators.
    for p in q.replace(u'\uff0c', u',').split(u','):
        # Strip before the length test so whitespace-only fragments (for
        # example blank lines between paragraphs) are not kept as clauses.
        p = p.strip()
        if len(p)>1:
            list8.append(p)
lines=list8

def _content_words(sentence, stop_words):
    """Return the jieba tokens of sentence whose stripped form is not a stop word."""
    return [word for word in jieba.cut(sentence)
            if word.strip() not in stop_words]


# Compare every clause in list8 with every clause in lines and write one
# tab-separated row (clause, clause, similarity) per pair to res.xls.

# Set membership is constant-time; the stop-word list would otherwise be
# scanned once per token.
stop_word_set = set(f_stop_seg_list)

# Tokenise each candidate clause once instead of once per compared pair.
candidate_words = [_content_words(i, stop_word_set) for i in lines]

# 'with' guarantees res.xls is flushed and closed even if a comparison raises.
with open('res.xls','w+') as fx:
    for p in list8:
        # Term frequencies of the reference clause p; its words define the
        # dimensions of both vectors.
        all_words = {}
        for myword in _content_words(p, stop_word_set):
            all_words[myword] = all_words.get(myword, 0) + 1
        vocabulary = list(all_words)
        sampdata = [all_words[key] for key in vocabulary]

        for i, words in zip(lines, candidate_words):
            # Count only those words of i that also occur in p; words that
            # appear in i alone are ignored, so the score is not symmetric.
            mytest1_words = dict.fromkeys(vocabulary, 0)
            for myword in words:
                if myword in mytest1_words:
                    mytest1_words[myword] += 1
            test1data = [mytest1_words[key] for key in vocabulary]

            # Cosine similarity between the reference and candidate vectors.
            test1simi = get_cossimi(sampdata, test1data)

            # A NaN score means one of the vectors had zero norm (no shared
            # or no content words); such pairs are not written.
            if not math.isnan(float(test1simi)):
                fx.write(p+'\t'+i+'\t'+"%f "%test1simi+'\t'+'\n')

f1.close()