看了崔大佬的文章,写了这个爬虫,学习了!原文地址
现在该网站加了反爬机制,不过在请求头(headers)里加上 Referer 字段就行了。
以下代码仅做学习记录之用:
from bs4 import BeautifulSoup
import requests
import os
import time
# Root directory under which one sub-folder per album is created.
SAVE_ROOT = r'D:\mziPic'

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# Headers sent with listing/album page requests.
PAGE_HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1"
                  " (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}

# User-Agent sent by download() for both the image page and the image itself.
IMAGE_USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')

# Characters that are not allowed in Windows file/directory names.
_INVALID_NAME_CHARS = str.maketrans({c: '_' for c in '\\/:*?"<>|'})


def _safe_dirname(name):
    """Return *name* with characters that are illegal in Windows paths replaced."""
    # Windows also rejects names that end with a dot or a space.
    cleaned = name.translate(_INVALID_NAME_CHARS).strip().rstrip('.')
    return cleaned or '_'


def _last_page_number(soup, selector):
    """Return the page count read from the second-to-last link matching *selector*.

    Presumably the last link is a "next page" anchor and the one before it is
    the highest page number -- verify against the site's markup. Falls back to
    1 when there are fewer than two links or the text is not a number, so a
    page without pagination is treated as a single page instead of crashing.
    """
    links = soup.select(selector)
    if len(links) < 2:
        return 1
    try:
        return int(links[-2].get_text())
    except ValueError:
        return 1


def get_mzi_page():
    """Walk every listing page of the site and crawl the albums on each."""
    res = requests.get('http://www.mzitu.com', headers=PAGE_HEADERS,
                       timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(res.text, 'lxml')
    page_count = _last_page_number(soup, '.nav-links a')
    for i in range(1, page_count + 1):
        # Build the URL of each listing page.
        page_url = f"http://www.mzitu.com/page/{i}/"
        # Report progress, then crawl every album on this page.
        print(f"总计{page_count}页,当前第{i}页:")
        get_mzi_channel(page_url)


def get_mzi_channel(url):
    """Crawl every album linked from the listing page at *url*.

    A directory named after each album's link text is created under
    SAVE_ROOT and passed to get_mzi_img() together with the album URL.
    """
    res = requests.get(url, headers=PAGE_HEADERS, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(res.text, 'lxml')
    # Collect the link to each album page.
    channels = soup.select('#pins li span a')
    # Visit the albums one by one and download their images.
    for count, c in enumerate(channels, start=1):
        channel_url = c.get('href')
        if not channel_url:
            # Anchor without a target: nothing to crawl.
            continue
        filepath = os.path.join(SAVE_ROOT, _safe_dirname(c.get_text()))
        # makedirs creates SAVE_ROOT as needed and tolerates existing folders,
        # so re-running the crawler does not fail on directory creation.
        os.makedirs(filepath, exist_ok=True)
        print(f"本页总计{len(channels)}个妹子,当前第{count}个妹子")  # progress
        get_mzi_img(filepath, channel_url)


def get_mzi_img(filepath, url):
    """Download all images of the album at *url* into directory *filepath*.

    The album is split across several pages, one per image, so each page
    ``<url>/<n>`` is visited in turn and handed to download().
    """
    res = requests.get(url, headers=PAGE_HEADERS, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(res.text, 'lxml')
    img_page_count = _last_page_number(soup, '.pagenavi a')
    # Avoid a double slash when the album link already ends with '/'.
    base_url = url.rstrip('/')
    for i in range(1, img_page_count + 1):
        img_page_url = f'{base_url}/{i}'
        print(f"本妹子共{img_page_count}图片,现第{i}张")
        print("img_page_url(refere):", img_page_url)
        try:
            download(filepath, img_page_url, i)
        except requests.RequestException as exc:
            # One failed image should not abort the rest of the crawl.
            print("download failed:", img_page_url, exc)


def download(filepath, img_page_url, count2):
    """Fetch the image shown on *img_page_url* and save it as ``<count2>.jpg``.

    Args:
        filepath: Existing directory the image is written into.
        img_page_url: URL of the page that embeds the image; also sent as
            the Referer of the image request.
        count2: Number used as the file name (without extension).

    Nothing is written when either request does not return HTTP 200 or the
    page contains no image.
    """
    page_headers = {'User-Agent': IMAGE_USER_AGENT}
    # No explicit 'Host' header: requests derives it from the image URL, so
    # the download keeps working if the image is served from another host.
    image_headers = {
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'DNT': '',
        'Referer': img_page_url,
        'User-Agent': IMAGE_USER_AGENT,
    }
    res = requests.get(img_page_url, headers=page_headers,
                       timeout=REQUEST_TIMEOUT)
    print("res.status_code", res.status_code)
    if res.status_code != 200:
        return

    soup = BeautifulSoup(res.text, 'lxml')
    container = soup.find('div', class_='main-image')
    img = container.find('img') if container else None
    img_url = img.get('src') if img else None
    if not img_url:
        print("main image not found:", img_page_url)
        return
    print("图片地址", img_url)

    # The site answers 403 for image requests that lack a Referer header.
    res2 = requests.get(img_url, headers=image_headers,
                        timeout=REQUEST_TIMEOUT)
    print("res2.status_code:", res2.status_code)
    if res2.status_code != 200:
        # Do not save an error page under a .jpg name.
        return
    # 'wb' rather than 'ab': a re-run must overwrite, not append to, the file.
    with open(os.path.join(filepath, f'{count2}.jpg'), 'wb') as f:
        f.write(res2.content)
    # NOTE: the original author ran this without a delay between downloads
    # and reported that the IP did not appear to get banned.


if __name__ == '__main__':
    get_mzi_page()