pre_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
#-*-coding:utf-8-*-
import MySQLdb
import MySQLdb as mdb
import os,sys,string
import jieba
import codecs
# Segment every crawled page stored in test1.spider with jieba and write one
# "<link> <tok1>/<tok2>/..." line per page to a UTF-8 text file.

# Python 2 only: reload(sys) re-exposes setdefaultencoding so that implicit
# str/unicode conversions below use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Connect to the database.
# NOTE(review): the credentials are hard-coded in the script -- consider
# moving them to configuration.
try:
    conn = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunli',
                       db='test1', charset='utf8')
except Exception as e:
    print(e)
    # Exit with a non-zero status so callers can tell the connection failed.
    sys.exit(1)

try:
    # DictCursor makes the rows addressable by column name
    # (row['link'], row['content']) in the loop below.
    cursor = conn.cursor(mdb.cursors.DictCursor)
    try:
        # Fetch the link and page content of every crawled page.
        sql = 'SELECT link,content FROM test1.spider;'
        cursor.execute(sql)
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # No commit is needed: this script only reads from the database.
    conn.close()

# Raw string so the backslashes of the Windows path are never treated as
# escape sequences; the `with` block closes the file even if a row fails.
with codecs.open(r'C:\Users\kk\Desktop\hello-result1.txt', 'w', 'utf-8') as f:
    for row in data:
        # cut_all must be the boolean False: the string 'False' is truthy
        # and would silently switch jieba to full mode.
        seg = '/'.join(jieba.cut(row['content'], cut_all=False))
        f.write(row['link'] + ' ' + seg + '\r\n')
|
jiansuo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
#-*-coding:utf-8-*-
import sys
import string
import MySQLdb
import MySQLdb as mdb
import gensim
from gensim import corpora,models,similarities
from gensim.similarities import MatrixSimilarity
import logging
import codecs
# Load every row of the segmented-page table into `rows` for the corpus below.

# Python 2 only: reload(sys) re-exposes setdefaultencoding so that implicit
# str/unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# NOTE(review): the credentials are hard-coded in the script -- consider
# moving them to configuration.
con = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunli',
                  db='test1', charset='utf8')
# Explicit try/finally instead of `with con:` so the cursor and the
# connection are always released.
# NOTE(review): what `with con:` does appears to differ between
# MySQLdb/mysqlclient releases -- confirm against the installed version.
try:
    cur = con.cursor()
    try:
        cur.execute('SELECT * FROM cutresult_copy')
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    con.close()
class MyCorpus(object):
    """Iterable view over the module-level ``rows`` result set.

    Each iteration step yields the second column of one row, converted to
    ``str`` and split on '/' into a list of tokens.
    """

    def __iter__(self):
        # `rows` is the module-level result of the SELECT executed earlier.
        for record in rows:
            tokens = str(record[1]).split('/')
            yield tokens
# Build a TF-IDF model over the corpus, read one query from stdin and print
# the similarity of the query to every document, highest score first.

# Enable logging.
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                    level=logging.INFO)
Corp = MyCorpus()

# Convert the web-page documents to TF-IDF.
dictionary = corpora.Dictionary(Corp)
# Convert each document to its bag-of-words representation.
corpus = [dictionary.doc2bow(text) for text in Corp]
# Fit the TF-IDF model on the bag-of-words corpus ...
tfidf = models.TfidfModel(corpus)
# ... and apply it to obtain the TF-IDF weights of every document.
corpus_tfidf = tfidf[corpus]

# (An earlier variant read the query from a text file instead of stdin.)
# NOTE(review): the query is split on whitespace only, whereas the corpus
# tokens are '/'-separated segments -- confirm that queries are expected to
# arrive already segmented.
query = raw_input('Enter your query:')
vec_bow = dictionary.doc2bow(query.split())
vec_tfidf = tfidf[vec_bow]

# Pass num_features explicitly so the index width always equals the
# vocabulary size and gensim does not have to scan the corpus to infer it.
index = similarities.MatrixSimilarity(corpus_tfidf,
                                      num_features=len(dictionary))
sims = index[vec_tfidf]
similarity = list(sims)
# Only the scores are printed, sorted in descending order.
print(sorted(similarity, reverse=True))
|
encodings.xml
1
2
3
4
5
6
|
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Encoding">
    <file url="PROJECT" charset="UTF-8" />
  </component>
</project>
|
misc.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
    <OptionsSetting value="true" id="Add" />
    <OptionsSetting value="true" id="Remove" />
    <OptionsSetting value="true" id="Checkout" />
    <OptionsSetting value="true" id="Update" />
    <OptionsSetting value="true" id="Status" />
    <OptionsSetting value="true" id="Edit" />
    <ConfirmationsSetting value="0" id="Add" />
    <ConfirmationsSetting value="0" id="Remove" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.11 (C:\Python27\python.exe)" project-jdk-type="Python SDK" />
</project>
|
modules.xml
1
2
3
4
5
6
7
8
|
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/爬虫练习代码.iml" filepath="$PROJECT_DIR$/.idea/爬虫练习代码.iml" />
    </modules>
  </component>
</project>
|