不得不说 python真是一个神奇的东西,学三天就能爬网站 真香
完整代码
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 17:53:13 2021
@author: 19088
"""
import urllib.request
import os
import pickle
import re
import random
import sys
#获取http代理
class getHttpAgents:
#初始化函数
def __init__( self ):
self .attArray = self .__loadAgentList()
self .myagent = ""
#注意 返回对象未进行解码
def openUrl( self ,url,istry = 1 ):
response = ""
ip = ""
if ( 0 ! = len ( self .myagent.strip())):
ip = self .myagent
i = 1
if not istry:
i = 99
while i< 100 :
try :
#print(self.attArray)
if ( 0 = = len ( self .attArray) and 0 = = len (ip.strip())):
req = urllib.request.Request(url)
#设置访问头
req.add_header( "User-Agent" , "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" )
response = urllib.request.urlopen(req)
else :
if ( 0 ! = len ( self .attArray)):
ip = random.choice( self .attArray)
if ( 0 ! = len ( self .myagent.strip())):
ip = self .myagent
print ( "以{}访问 {}" . format (ip,url))
#设置代理
proxy = { "http" :ip}
#print(proxy)
#定义一个代理字段
proxy_support = urllib.request.ProxyHandler(proxy)
#建立一个opener
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [( "User-Agent" , "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36" )]
#urllib.request.install_opener(opener)
#获得网页对象
response = opener. open (url)
except :
if not istry:
print ( "{} 无法使用" . format (ip))
else :
print ( "第{}次尝试连接!" . format (i))
else :
break ;
finally :
i + = 1
if 11 = = i and istry:
raise ValueError
if not response:
return
html = response.read()
#print(html)
return html
#检查代理池 去除掉不可用代理ip
def checkMyIpPool( self ):
agentsResult = []
agentList = self .attArray
for iter in agentList:
ip = iter
self .setMyIp(ip)
b = self .__getMyIp()
if not b:
#代理不能用
#agentList.pop(-iter)
pass
else :
agentsResult.append(ip)
#print(b)
#记录爬取过的可以使用的代理ip
self .__writeAgentList(agentsResult)
self .__setAgents(agentsResult)
self .setMyIp("")
#解析读取网页中所有的代理地址
def getAgents( self ,html):
#print(html)
#匹配 ip地址 正则表达式
pattern = re. compile (r '(<td>)\s*((25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\.){3}(25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\s*</td>' )
ipList = []
ip = pattern.finditer(html)
for ipiter in ip:
ipText = ipiter.group()
ipGroup = re.search(r "((25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\.){3}(25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)" , ipText)
ipList.append(ipGroup.group())
#匹配 端口地址 正则表达式
portList = []
pattern = re. compile (r '(<td>)\s*\d+\s*</td>' )
port = pattern.finditer(html)
for portiter in port:
portText = portiter.group()
portGroup = re.search(r "\d+" , portText)
portList.append(portGroup.group())
if ( len (ipList) is not len (portList)):
print ( "注意: ip和端口参数不匹配!" )
return
ipDict = dict ( zip (ipList,portList))
agentList = []
for key in ipDict:
agentList.append(key + ":" + ipDict.get(key))
agentsResult = []
for iter in agentList:
ip = iter
self .setMyIp(ip)
b = self .__getMyIp()
if not b:
#代理不能用
pass
#agentList.pop(-iter)
else :
agentsResult.append(ip)
self .__setAgents(agentsResult)
print ( "{} 可以使用" . format (ip))
agentsResult.extend( self .attArray)
#记录爬取过的可以使用的代理ip
if ( 0 = = len (agentsResult)):
return
self .__writeAgentList(agentsResult)
self .__setAgents(agentsResult)
self .setMyIp("")
return agentList
def __setAgents( self ,ipArray):
self .attArray = ipArray
def setMyIp( self ,ip):
self .myagent = ip
#存储爬取过的ip代理
def __writeAgentList( self , agentList):
if os.path.exists( "agent.pkl" ):
os.remove( "agent.pkl" ) #每次重新生成 要不多次 dump需要多次 load
with open ( "agent.pkl." , "wb" ) as f:
pickle.dump(agentList, f)
print ( "存储{}条代理" . format ( len (agentList)))
#加载之前存储过的ip代理
def __loadAgentList( self ):
agentlist = []
if not os.path.exists( "agent.pkl" ):
return agentlist
with open ( "agent.pkl" , "rb" ) as f:
agentlist = pickle.load(f)
print ( "加载{}条代理" . format ( len (agentlist)))
return agentlist
#获取当前使用的ip地址 类的内部方法 仅供内部调用
def __getMyIp( self ,ip = ""):
url = "https://www.baidu.com/"
html = ""
try :
html = self .openUrl(url, 0 ).decode( "utf-8" )
except :
return
#匹配ip地址
#pattern = re.compile(r'((25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\.){3}(25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)')
#groupIp=pattern.search(html)
#if groupIp:
#return groupIp.group()
else :
return html
#通过不同的网站去爬取代理
def crawlingAgents( self ,index):
try :
url = "http://ip.yqie.com/ipproxy.htm"
print (url)
html = self .openUrl(url)
html = html.decode( "utf-8" )
self .setMyIp("") #不指定ip 随机挑选一个作为代理
self .getAgents(html)
except Exception as e:
print ( "{} 爬取失败" . format (url))
#一共搜集多少页
page = index
indexCur = 1
while indexCur< = page:
try :
url = r "https://www.89ip.cn/index_{}.html" . format (indexCur)
print (url)
self .setMyIp("")
html = self .openUrl(url) #不指定ip 随机挑选一个作为代理
html = html.decode( "utf-8" )
self .getAgents(html)
except Exception as e:
print ( "{} 爬取失败" . format (url))
finally :
indexCur + = 1
indexCur = 1
while indexCur< = page:
try :
url = r "http://www.66ip.cn/{}.html" . format (indexCur)
print (url)
self .setMyIp("")
html = a.openUrl(url) #不指定ip 随机挑选一个作为代理
html = html.decode( "gb2312" )
self .getAgents(html)
except Exception as e:
print ( "{} 爬取失败" . format (url))
finally :
indexCur + = 1 indexCur = 1
while indexCur< = page:
try :
url = r "http://www.ip3366.net/?stype=1&page={}" . format (indexCur)
print (url)
self .setMyIp("")
html = a.openUrl(url) #不指定ip 随机挑选一个作为代理
html = html.decode( "gb2312" )
self .getAgents(html)
except Exception as e:
print ( "{} 爬取失败" . format (url))
finally :
indexCur + = 1
indexCur = 1
while indexCur< = page:
try :
url = r "http://www.kxdaili.com/dailiip/1/{}.html" . format (indexCur)
print (url)
self .setMyIp("")
html = a.openUrl(url) #不指定ip 随机挑选一个作为代理
html = html.decode( "utf-8" )
self .getAgents(html)
except Exception as e:
print ( "{} 爬取失败" . format (url))
finally :
indexCur + = 1
#下载图片封装类
class downLoadPictures:
#构造函数
def __init__( self ):
self .sortKey = {} #定义一个搜索关键字的字典
self .urlLoad = getHttpAgents()
self .bzmenuDict = {} #分类信息 风景 美女 什么的分类
self .sortscreenDict = {} #按照屏幕尺寸分类
self .littleSignDict = {} #分类信息下面的小分类
pass
def getPictures( self ,url):
#第一步 打开网页 读取page信息
pagerHtml = self .urlLoad.openUrl(url)
#第二步 获取 pageFolder 链接和各种分类信息 返回的是一堆folder链接的url
folderPictursUrl = self .readPages(pagerHtml).values()
if not folderPictursUrl:
print ( "获取图片集失败!" )
return
for floderiterUrl in folderPictursUrl:
folderUrl = str ( "https://www.ivsky.com/" ) + floderiterUrl
folderHtml = self .urlLoad.openUrl(folderUrl)
#第三步 读取图片集 获取单个图片的链接地址 返回的是图片集里面的一堆文件url
pictursUrlDict = self .readFolders(folderHtml)
for iterPictureKey in pictursUrlDict:
fileName = iterPictureKey + ".jpg"
pictureUrl = str ( "https://www.ivsky.com/" ) + pictursUrlDict.get(iterPictureKey)
#读取图片页相关信息
pictureHtml = self .urlLoad.openUrl(pictureUrl)
picturDownUrl = self .readPictures(pictureHtml)
pictureDownHtml = self .urlLoad.openUrl(picturDownUrl)
if not pictureDownHtml:
continue
#保存图片
with open (fileName, "wb+" ) as f:
f.write(pictureDownHtml)
#提取匹配内容中的所有链接地址
def getHrefMap( self ,html,isPicture = 0 ,isFolder = 0 ):
hrefDict = {}
pattern = re. compile (r '<a\s*.*?\s*</a>' ,re.I)
if isPicture:
pattern = re. compile (r '<p>\s*?<a\s*.*?</p>' ,re.I)
hrefIter = pattern.finditer(html)
index = 0
for iter in hrefIter:
hrefText = iter .group()
#匹配分类名字
pattern = re. compile (r '"\s*?>\s*?.*?</a>' ,re.I)
name = ""
nameGroup = pattern.search(hrefText)
if nameGroup:
name = nameGroup.group()
if ( 5 = = len (nameGroup.group().replace( " " , ""))):
pattern = re. compile (r 'title=".*?"' ,re.I)
nameGroup = pattern.search(hrefText)
if nameGroup:
name = nameGroup.group()[ 7 : - 1 ]
name = name[ 2 : - 4 ].replace( " " , '')
#匹配href
pattern = re. compile (r 'href=".*?" rel="external nofollow" ' ,re.I)
url = ""
urlGroup = pattern.search(hrefText)
if urlGroup:
url = urlGroup.group()[ 6 : - 1 ].replace( " " , '')
if isFolder:
index + = 1
name + = "_" + str (index)
hrefDict[name] = url
return hrefDict
#读取首页信息 包含各种分类的链接地址 以及图片集的地址集合
def readPages( self ,html):
html = html.decode( "utf-8" )
#检索壁纸分类
#匹配 壁纸分类信息
pattern = re. compile (r '<ul\s*class="bzmenu".*?</ul>' ,re.I)
sortClassGroup = pattern.search(html)
if sortClassGroup:
sortMessage = sortClassGroup.group()
self .bzmenuDict = self .getHrefMap(sortMessage)
#print(self.bzmenuDict)
else :
print ( "匹配壁纸分类出错!" )
return
#匹配 按照屏幕大小分类
pattern = re. compile (r '<ul\s*class="sall_dd".*?</ul>' ,re.I)
sortClassGroup = pattern.search(html)
if sortClassGroup:
sortMessage = sortClassGroup.group()
self .sortscreenDict = self .getHrefMap(sortMessage)
#print(self.sortscreenDict)
else :
print ( "匹配屏幕尺寸分类失败!" )
return
#匹配 获取小分类
pattern = re. compile (r '<div\s*class="sline".*?</div>' ,re.I)
sortClassGroup = pattern.search(html)
if sortClassGroup:
sortMessage = sortClassGroup.group()
#print(sortMessage)
self .littleSignDict = self .getHrefMap(sortMessage)
#print(self.littleSignDict)
else :
print ( "匹配小分类失败" )
return
pictureDict = {}
#匹配 图片集地址
pattern = re. compile (r '<ul\s*class="ali".*?</ul>' ,re.I)
sortClassGroup = pattern.search(html)
if sortClassGroup:
sortMessage = sortClassGroup.group()
pictureDict = self .getHrefMap(sortMessage, 1 )
#print(pictureDict)
else :
print ( "匹配图片集地址失败!" )
return #print(html)
return pictureDict
#解析每个图片集合对应的图片集内容 解析出单个图片的链接地址
def readFolders( self ,html):
if not html:
return
html = html.decode( "utf-8" )
#获取图片集里面每个图片的具体地址和名称
#匹配 获取小分类
pattern = re. compile (r '<ul\s*class="pli".*?</ul>' ,re.I)
sortClassGroup = pattern.search(html)
pictureUrlDict = {}
if sortClassGroup:
sortMessage = sortClassGroup.group()
#print(sortMessage)
pictureUrlDict = self .getHrefMap(sortMessage, 1 , 1 )
#print(pictureUrlDict)
else :
print ( "匹配小分类失败" )
return return pictureUrlDict
#解析每个图片集合对应的图片集内容 解析出单个图片的链接地址
def readPictures( self ,html):
if not html:
return html = html.decode( "utf-8" )
#获取图片集里面每个图片的具体地址和名称
#匹配 获取小分类
pattern = re. compile (r '<div\s*class="pic".*?</div>' ,re.I)
sortClassGroup = pattern.search(html)
pictureUrl = ""
if sortClassGroup:
sortMessage = sortClassGroup.group()
#匹配href
pattern = re. compile (u "src='.*?'" ,re.I)
url = ""
urlGroup = pattern.search(sortMessage)
if urlGroup:
url = urlGroup.group()[ 5 : - 1 ].replace( " " , '')
url = url.replace( 'img-pre' , 'img-picdown' )
url = url.replace( 'pre' , 'pic' )
url = str ( "https:" ) + url
#print(sortMessage)
pictureUrlDict = url
#print(url)
else :
print ( "匹配小分类失败" )
return return pictureUrlDict
class UrlUser:
def __init__( self ):
self .agent = getHttpAgents()
self .downPicture = downLoadPictures()
#下载图片调用函数
def downPictures( self ):
#url="https://www.ivsky.com/bizhi"
#b.getPictures(url)
#确定保存路径
dirPath = input ( "请输入保存路径:" )
if not os.path.exists(dirPath):
os.mkdir(dirPath)
if not os.path.isdir(dirPath):
print ( "savePath is wrong!" )
sys.exit()
os.chdir(dirPath) #切换工作目录
#url=r"https://www.ivsky.com/bizhi/nvxing_1920x1080/index_{}.html"
page = input ( "爬取前多少页的图片?\n" )
indexRe = re.search(r "\d+" , page)
if ( not indexRe):
print ( "输入页数有误!" )
indexRe = int (indexRe.group())
indexCur = 1
while indexCur< = indexRe:
try :
#注意 爬取什么类型的图片可以根据不同的网址进行设计 下载类里面已经读取了所有分类对应的地址 有兴趣可以自己完善
url = r "https://www.ivsky.com/bizhi/nvxing_1920x1080/index_{}.html" . format (indexCur)
print (url)
self .downPicture.getPictures(url)
except :
print ( "打开出错!" )
pass
finally :
indexCur + = 1
#爬取代理
def downAgents( self ):
page = input ( "爬取前多少页的代理?\n" )
indexRe = re.search(r "\d+" , page)
if ( not indexRe):
print ( "输入页数有误!" )
return
indexRe = int (indexRe.group())
self .agent.crawlingAgents(indexRe)
# 检查当前代理池是否可以
def checkPool( self ):
self .agent.checkMyIpPool()
if __name__ = = "__main__" :
print ( "*" * 20 )
print ( "1.爬取代理\n" )
print ( "2.检查代理\n" )
print ( "3.爬取图片" )
print ( "*" * 20 )
mode = input ( "请输入数字选择处理模式:\n" )
indexRe = re.search(r "\d+" , mode)
if ( not indexRe):
print ( "输入页数有误!" )
sys.exit()
indexRe = int (indexRe.group())
#实例化一个对象
uesrObj = UrlUser()
if 1 = = indexRe:
uesrObj.downAgents()
elif 2 = = indexRe:
uesrObj.checkPool()
elif 3 = = indexRe:
uesrObj.downPictures()
else :
print ( "模式选择错误!" )
sys.exit()
print ( "爬取完毕!" )
效果图
原文链接:https://blog.csdn.net/dododododoooo/article/details/117353910