Note: this crawler only scrapes the "Latest Albums" (最新套图) memes on doutula.com; the other tags have not been tested!!!
Step 1: Get the page source
1. Find the request URL for the page source.
a. Open doutula.com, click the tag of the albums you want to crawl, then click page 2 (this makes the full request URL visible).
b. Press F12 to open the developer tools, click "Network", and find the request URL in the General section of the request headers.
2. Copy the User-Agent and fake a simple request header.
3. Test code:
import requests

def get_html(url):
    # Fake a browser User-Agent so the request is not rejected as a bot
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.9 Safari/537.36'}
    req = requests.get(url=url, headers=HEADERS)
    resp = req.content
    print(resp)
    # return resp

if __name__ == "__main__":
    home_url = "http://www.doutula.com/article/list/?page=2"
    get_html(home_url)
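A quick way to confirm the faked header worked is to check the status code instead of dumping the whole page; a minimal check (expecting a 200 response here is an assumption, not something stated in the original post):

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.9 Safari/537.36'}
resp = requests.get('http://www.doutula.com/article/list/?page=2', headers=HEADERS)
print(resp.status_code)   # 200 suggests the faked header was accepted
print(len(resp.content))  # a non-trivial length suggests real page content came back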
Step 2: Get the album URLs
1. Parse the page source first.
2. Right-click any album, choose "Inspect", and find the element that holds the album's URL.
3. Test code:
from lxml import etree

# get_html from Step 1 is assumed to be defined above; it must return resp
# (uncomment its "return resp" line) for this test to work.
def get_ablum_url(html):
    soup = etree.HTML(html)
    ablum_urls = soup.xpath('//div[@class="col-sm-9"]/a/@href')
    for url in ablum_urls:
        print(url)
        # return url

if __name__ == "__main__":
    home_url = "http://www.doutula.com/article/list/?page=2"
    home_html = get_html(home_url)
    get_ablum_url(home_html)
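If the extracted href values turn out to be relative paths rather than full URLs (this depends on the page markup and is not guaranteed either way here), urllib.parse.urljoin can normalise them before they are fetched; a minimal sketch:

from urllib.parse import urljoin

BASE = 'http://www.doutula.com'

def normalise(href):
    # urljoin leaves absolute URLs untouched and resolves relative ones against BASE
    return urljoin(BASE, href)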
Step 3: Get the image URLs inside an album
1. Parse the album page source first.
2. Right-click any image in the album, choose "Inspect", and find the element that holds the image's URL.
3. Test code:
from lxml import etree

# get_html and get_ablum_url from the previous steps are assumed to be defined
# above; get_ablum_url must return an album URL here (uncomment its "return url").
def get_img(html):
    soup = etree.HTML(html)
    imgs_url = soup.xpath('//div[@class="artile_des"]/table/tbody/tr/td/a/img/@src')
    for img_url in imgs_url:
        print(img_url)

if __name__ == "__main__":
    home_url = "http://www.doutula.com/article/list/?page=2"
    home_html = get_html(home_url)
    ablum_url = get_ablum_url(home_html)   # one album URL from the list page
    ablum_html = get_html(ablum_url)       # fetch the album page itself
    get_img(ablum_html)
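If this XPath comes back empty, one common cause is that the <tbody> shown in the inspector was inserted by the browser and is not present in the raw HTML; a looser expression that skips it is worth trying inside get_img (this fallback is a suggestion, not taken from the original post):

# fall back to an expression without the browser-inserted <tbody>
imgs_url = soup.xpath('//div[@class="artile_des"]//img/@src')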
That completes the page analysis!!!
The complete code follows:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from lxml import etree
from multiprocessing import Pool
# Get the page's HTML source
def get_html(url):
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.9 Safari/537.36'}
    req = requests.get(url=url, headers=HEADERS)
    resp = req.content
    # print(resp)
    return resp

# Get the album URLs, then fetch the images inside each album
def get_ablum_url(html):
    soup = etree.HTML(html)
    ablum_urls = soup.xpath('//div[@class="col-sm-9"]/a/@href')
    for url in ablum_urls:
        ablum_html = get_html(url)
        get_img(ablum_html)

# Get the image URLs inside one album
def get_img(html):
    soup = etree.HTML(html)
    imgs_url = soup.xpath('//div[@class="artile_des"]/table/tbody/tr/td/a/img/@src')
    multiprocessing_download(imgs_url)

# Download one image, saving it under the file name taken from the URL
def download_img(img_url):
    req = requests.get(img_url)
    resp = req.content
    with open(r'C:\Users\Administrator\Desktop\doutu\%s' % img_url.split('/')[-1], 'wb') as f:
        f.write(resp)

# Download with a pool of worker processes
def multiprocessing_download(img_urls):
    p = Pool(processes=2)
    p.map_async(download_img, img_urls)
    p.close()
    p.join()

# Main entry point: crawl pages 1 through 5 of the "Latest Albums" list
def main():
    HOME_URL = 'http://www.doutula.com/article/list/?page={}'
    for i in range(1, 6):
        html = get_html(HOME_URL.format(str(i)))
        get_ablum_url(html)

if __name__ == "__main__":
    main()
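One fragile spot: download_img writes to a hard-coded Windows desktop folder, and open() fails if that folder does not exist. A more portable variant is sketched below; SAVE_DIR and its location are assumptions for illustration, not part of the original script:

import os
import requests

# Assumed save location; the original hard-codes C:\Users\Administrator\Desktop\doutu
SAVE_DIR = os.path.join(os.path.expanduser('~'), 'Desktop', 'doutu')

def download_img(img_url):
    os.makedirs(SAVE_DIR, exist_ok=True)     # create the folder on first use
    resp = requests.get(img_url)
    filename = img_url.split('/')[-1]        # keep the file name from the URL
    with open(os.path.join(SAVE_DIR, filename), 'wb') as f:
        f.write(resp.content)

Note also that multiprocessing_download builds a fresh two-process Pool for every album; reusing a single pool across albums would avoid the repeated startup cost, at the price of slightly more bookkeeping.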