本文实例讲述了Python基于多线程实现抓取数据存入数据库的方法。分享给大家供大家参考,具体如下:
1. 数据库类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
"""
使用须知:
代码中数据表名 aces ,需要更改该数据表名称的注意更改
"""
import pymysql
class Database():
    """Small pymysql-backed helper for the ``aces`` table.

    The connection settings are class attributes so they can be overridden
    on the class (or a subclass) before an instance is created. Creating an
    instance opens a connection and a cursor immediately.

    NOTE: the table name ``aces`` is hard-coded in every SQL statement below;
    change it in all of them if a different table is required.
    """

    # Local database connection settings.
    host = "localhost"
    user = "root"
    password = ""
    database = "test"
    port = 3306
    charset = "utf8"
    # Placeholders; replaced by live objects in __init__.
    cursor = ''
    connet = ''

    def __init__(self):
        """Open the connection described by the class attributes and create a cursor."""
        self.connet = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            port=self.port,  # previously declared but never forwarded
            charset=self.charset,
        )
        self.cursor = self.connet.cursor()

    def dropTables(self):
        """Drop the ``aces`` table if it exists."""
        self.cursor.execute('drop table if exists aces')
        print("删表")

    def createTables(self):
        """Create the ``aces`` table (asin primary key, checked text) if missing."""
        self.cursor.execute('''create table if not exists aces
            (
            asin varchar(11) primary key not null,
            checked varchar(200));''')
        print("建表")

    def save(self, aceslist):
        """Insert one row and commit.

        Args:
            aceslist: indexable pair; element 0 is stored in ``asin`` and
                element 1 in ``checked``.
        """
        self.cursor.execute(
            "insert into aces ( asin, checked) values(%s,%s)",
            (aceslist[0], aceslist[1]),
        )
        self.connet.commit()

    def is_exists_asin(self, asin):
        """Return True if a row with this ``asin`` is already stored, else False."""
        # Parameters are passed as a one-element tuple so the driver always
        # treats the value as a single bound parameter.
        self.cursor.execute('select * from aces where asin = %s', (asin,))
        return self.cursor.fetchone() is not None
# db =Database()
|
2. 多线程任务类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
import urllib.parse
import urllib.parse
import urllib.request
from queue import Queue
import time
import random
import threading
import logging
import pymysql
from bs4 import BeautifulSoup
from local_data import Database
#一个模块中存储多个类: AmazonSpider, ThreadCrawl(threading.Thread), AmazonSpiderJob
class AmazonSpider():
    """Fetch one Amazon product page per id and record a status string in the database."""

    def __init__(self):
        """Create the spider with its own ``Database`` instance."""
        self.db = Database()

    def randHeader(self):
        """Build request headers with a randomly chosen User-Agent.

        Only the User-Agent varies between calls; the Connection, Accept and
        Accept-Language values are fixed.

        Returns:
            dict mapping header name to header value.
        """
        head_user_agent = (
            'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
            'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
            'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
            'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
            'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
            'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
        )
        return {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': random.choice(head_user_agent),
        }

    def getDataById(self, queryId, timeout=30):
        """Fetch the product page for ``queryId`` and store its status.

        Does nothing if the id is already present in the database. Otherwise
        the page ``https://www.amazon.com/dp/<queryId>`` is downloaded, the
        first ``<span id="asTitle">`` element's string is taken as the status
        (``"other"`` when no such element exists), and ``[queryId, status]``
        is saved.

        Args:
            queryId: product identifier appended to the URL.
            timeout: socket timeout in seconds for the HTTP request, so a
                stalled connection raises instead of blocking indefinitely.

        Raises:
            Whatever ``urllib.request.urlopen`` or the database layer raises;
            errors are not handled here.
        """
        # Ids already stored are skipped without any network traffic.
        if self.db.is_exists_asin(queryId):
            return
        req = urllib.request.Request(
            url="https://www.amazon.com/dp/" + str(queryId),
            headers=self.randHeader(),
        )
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as webpage:
            html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.find_all("span", id="asTitle")
        # Some pages do not contain the element at all.
        state = content[0].string if content else "other"
        print(queryId)
        print(state)
        self.db.save([queryId, state])
class _RetryItem:
    """Queue entry wrapping an item that already failed, with its failure count."""

    __slots__ = ("item", "failures")

    def __init__(self, item, failures):
        self.item = item
        self.failures = failures


class ThreadCrawl(threading.Thread):
    """Worker thread that pulls ids from a queue and processes them with an AmazonSpider.

    A failed item is put back on the queue so it is retried later, but only
    up to ``max_retries`` times; after that it is logged and dropped so the
    queue can drain and ``Queue.join()`` can return.
    """

    def __init__(self, queue, max_retries=3):
        """Set up logging and the per-thread spider.

        Args:
            queue: queue of ids to process; must provide ``get``, ``put``,
                ``qsize`` and ``task_done``.
            max_retries: how many times a failing item is re-queued before
                it is abandoned.
        """
        # %(asctime)s stamps each record when it is emitted; a timestamp
        # formatted here would be frozen at construction time.
        logging.basicConfig(
            level=logging.INFO,
            format="[%(asctime)s][AmazonSpider]-----%(message)s------",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        super().__init__()
        self.queue = queue
        self.max_retries = max_retries
        self.spider = AmazonSpider()

    def run(self):
        """Process queue entries forever; the thread is expected to run as a daemon."""
        while True:
            entry = self.queue.get()
            if isinstance(entry, _RetryItem):
                item, failures = entry.item, entry.failures
            else:
                item, failures = entry, 0
            try:
                self.spider.getDataById(item)
            except Exception:
                failures += 1
                if failures <= self.max_retries:
                    logging.warning(
                        "item %s failed (attempt %d), re-queueing",
                        item, failures, exc_info=True,
                    )
                    # Re-queue at the tail so other items are tried first.
                    self.queue.put(_RetryItem(item, failures))
                else:
                    logging.exception(
                        "giving up on item %s after %d failed attempts",
                        item, failures,
                    )
            logging.info("now queue size is: %d" % self.queue.qsize())
            # Every get() is matched by exactly one task_done(), including
            # for entries that were re-queued above.
            self.queue.task_done()
class AmazonSpiderJob():
    """Run a batch of ids through a pool of ThreadCrawl worker threads."""

    def __init__(self, size, qs):
        """Store the job configuration.

        Args:
            size: number of worker threads to start.
            qs: iterable of ids to process.
        """
        self.size = size
        self.qs = qs

    def work(self):
        """Queue every id, start the workers, and block until the queue is drained.

        Raises:
            ValueError: if there are ids to process but ``size`` is below 1;
                with no workers the final ``join()`` would never return.
        """
        toSpiderQueue = Queue()
        for q in self.qs:
            toSpiderQueue.put(q)
        if self.size < 1 and not toSpiderQueue.empty():
            raise ValueError("size must be at least 1 when there are items to process")
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            # Daemon threads do not keep the process alive once join() returns.
            # The ``daemon`` attribute replaces the deprecated setDaemon().
            t.daemon = True
            t.start()
        # Returns once task_done() has been called for every queued entry.
        toSpiderQueue.join()
|
3. 主线程类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
from amazon_s import AmazonSpiderJob #从一个模块中导入类
import pymysql
import pandas as pd
from local_data import Database
if __name__ == '__main__':
    # On the very first run the old table must be dropped and re-created.
    # When restarting an interrupted run, comment out the three lines below
    # so rows that were already saved are kept.
    # ----------------------
    db = Database()
    db.dropTables()
    db.createTables()
    # ---------------------------
    df = pd.read_excel("ASIN检查_viogico_1108.xlsx")
    # The ids to check are read from the "asin1" column of the spreadsheet.
    qs = df["asin1"].values
    print(qs)
    print(len(qs))
    # Process all ids with 8 worker threads; blocks until the queue is empty.
    amazonJob = AmazonSpiderJob(8, qs)
    amazonJob.work()
|
希望本文所述对大家Python程序设计有所帮助。
原文链接:https://blog.csdn.net/zn505119020/article/details/78590416