【python】A Simple Greedy Crawler

Posted: 2023-03-09 14:34:20

Given a starting url, this crawler finds every URL linked from the page and then greedily crawls those links in turn.

The main points to note:

1. lxml.html.iterlinks() can find every URL on a page (see the sketch after this list).

2. Set a timeout when fetching pages; otherwise the code can easily hang on a page that never responds.

3. Catch all exceptions.

4. Crawl in breadth-first order.
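
Before the full listing, here is a minimal sketch of points 1-3 (link extraction, timeout, and catch-all exception handling). It assumes lxml is installed; the function name fetch_links and the example URL are only illustrative and are not part of the original crawler:

import urllib2
import lxml.html

def fetch_links(url):
    try:
        # point 2: a timeout keeps one unresponsive server from hanging the whole run
        response = urllib2.urlopen(url, timeout=5)
        content = response.read()
    except Exception:  # point 3: swallow any network/HTTP error and just return nothing
        return []
    # point 1: iterlinks() yields (element, attribute, link, pos) for every link on the page
    doc = lxml.html.fromstring(content)
    return [link for element, attribute, link, pos in doc.iterlinks()
            if link.startswith("http")]

print fetch_links("http://example.com/")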

The full code is below:

#!/usr/bin/env python
# encoding: utf8
# A crawler that starts from an initial url, finds every url on each fetched page,
# and greedily crawls them in breadth-first order.
# Fetched pages are saved as 1.html, 2.html, ...
# author: 匡子语

import re
import time
import lxml.html
import urllib2
from collections import deque


class ScrawURL:
    def __init__(self):
        self.total = 50000                       # number of pages to fetch
        self.urls = ["http://www.cnblogs.com/"]  # initial url
        self.html_num = 13047                    # index of the page currently being stored
        self.url_num = 0                         # position in self.urls of the next url to crawl

    def getHtml(self, url):
        print "url:%s" % url
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        try:
            request = urllib2.Request(url, headers=headers)
            # set a timeout: without it, a page that never responds can hang the program
            response = urllib2.urlopen(request, timeout=5)
            content = response.read()
            self.html_num += 1
            name = "htmls/%d.html" % self.html_num
            with open(name, "w") as f:
                f.write(content)
            print self.html_num
            return content
        except urllib2.HTTPError, e:
            return ''
        except urllib2.URLError, e:
            return ''
        except Exception, ex:  # for any other exception, also return empty content
            return ''

    def getUrl(self, content):
        try:
            print "getUrl"
            html = lxml.html.fromstring(content)
            links = lxml.html.iterlinks(html)  # automatically yields every url on the page; very handy
            urls = []
            for link in links:
                if "http" in link[2]:
                    urls.append(link[2])
            return urls
        except Exception, ex:
            return []

    def scrawl(self):  # breadth-first crawl
        print "scrawl"
        while len(self.urls) < self.total and len(self.urls) > self.url_num:  # collect enough urls
            url = self.urls[self.url_num]
            self.url_num += 1
            content = self.getHtml(url)
            if content:
                urls = self.getUrl(content)
                if urls:
                    for url in urls:
                        if url not in self.urls:  # skip urls that have already been queued
                            self.urls.append(url)
        while self.html_num < self.total and self.url_num < len(self.urls):  # fetch the remaining pages
            url = self.urls[self.url_num]
            self.url_num += 1
            self.getHtml(url)


if __name__ == "__main__":
    surl = ScrawURL()
    surl.scrawl()
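
One possible refinement, not part of the original post: the script imports collections.deque but keeps its frontier in a plain list, so the "url not in self.urls" check rescans the whole list for every extracted link. A sketch of the same breadth-first bookkeeping with a deque frontier and a set of seen urls (bfs_order and get_links are illustrative names):

from collections import deque

def bfs_order(start_url, get_links, limit):
    # get_links(url) is expected to return the list of urls found on that page
    frontier = deque([start_url])  # BFS queue of urls still to visit
    seen = set(frontier)           # O(1) membership test instead of scanning a list
    visited = []
    while frontier and len(visited) < limit:
        url = frontier.popleft()
        visited.append(url)
        for u in get_links(url):
            if u not in seen:
                seen.add(u)
                frontier.append(u)
    return visited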