python爬虫一些参考代码

时间:2022-03-05 16:58:12

http://www.cnblogs.com/Dadio/p/5513594.html

这个是爬P站的代码,目前还没看,感觉很棒


from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
import os
import codecs
import sys
# HTTP headers sent with every request. 'Referer' is filled in per-artwork by
# getpic(); 'Accept' is switched to image types before the download loop runs.
headers = {
    'Accept': 'text/html',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Referer': "",
    'User-Agent': "此处为浏览器的user-agent",  # fill in your own browser's UA string
}

# 1-based index of the illustration currently being downloaded.
order = 1
def getpic(src, href, mode=""):
    """Download one illustration (or a multi-page set) into ``file_path``.

    Parameters:
        src:  thumbnail image URL scraped from the spotlight page; it is
              rewritten here to point at the original-resolution image.
        href: the artwork page URL, sent as the Referer header (the image
              host rejects requests without it).
        mode: "mul" when the artwork has multiple pages, "" for a single image.

    Reads module globals ``headers``, ``file_path``, ``number_of_file`` and
    ``order``.  Returns None; image files are written as a side effect.
    """
    def _local(u):
        # Local file name: download dir + last 15 chars of the slash-stripped URL.
        return file_path + (u.replace('/', ""))[-15:]

    os.system("cls")  # clear the console between progress updates (Windows-only)
    print("共有%d个文件需要下载" % number_of_file)

    # Works stored as .gif are animations ("ugoira") — skip them here.
    # (The original returned its explanatory string; callers ignore the value.)
    if src[-3:] == "gif":
        return

    headers['Referer'] = href

    # Rewrite the thumbnail URL into the original-resolution URL.
    ispng = False
    url = src.replace("_master1200", "")
    url = url.replace(url[20:40], "img-original")

    print('正在下载第%d个...' % order)
    if mode == 'mul':
        print("该文件含有多张图:")

    if os.path.exists(_local(url)):
        print('已下载第%d个' % order)
        return
    data = requests.get(url, headers=headers, timeout=60)

    # A non-200 answer means the original file is a .png rather than a .jpg.
    if data.status_code != 200:
        ispng = True
        url = url.replace("jpg", "png")

    if mode == 'mul':
        # Pages are numbered p0, p1, ...; download until the server stops
        # answering (or the 150-page safety cap is reached).
        ext, fmt = ("png", "png") if ispng else ("jpg", "jpeg")
        print("********正在下载第1张")
        if not os.path.exists(_local(url)):
            # timeout added for consistency with every other request here
            data = requests.get(url, headers=headers, timeout=60)
            Image.open(BytesIO(data.content)).save(_local(url), fmt)
        print("********已下载第1张")
        for i in range(150):
            url = url.replace("p%d.%s" % (i, ext), "p%d.%s" % (i + 1, ext))
            os.system("cls")
            print("********正在下载第%d张..." % (i + 2))
            if not os.path.exists(_local(url)):
                data = requests.get(url, headers=headers, timeout=60)
                if data.status_code != 200:
                    break  # no such page: the set is exhausted
                Image.open(BytesIO(data.content)).save(_local(url), fmt)
            print("********已下载第%d张" % (i + 2))
    else:
        if ispng:
            if os.path.exists(_local(url)):
                print('已下载第%d个' % order)
                return
            data = requests.get(url, headers=headers, timeout=60)
            if data.status_code == 200:
                Image.open(BytesIO(data.content)).save(_local(url), 'png')
                print('已下载第%d个' % order)
        else:
            # The first request already fetched the .jpg successfully.
            Image.open(BytesIO(data.content)).save(_local(url), 'jpeg')
            print('已下载第%d个' % order)

# --- script entry: scrape one spotlight article and download all its images ---
number = sys.argv[1]                      # spotlight article id
file_path = sys.argv[2] + '\\Picture\\'   # base download directory (edit to change path)

url_save = "http://spotlight.pics/zh/a/%s" % number
wb = requests.get(url_save, headers=headers)
wb_data = BeautifulSoup(wb.text, 'lxml')

# Build the album folder name from the article title, stripping every
# character that is illegal in a Windows directory name (plus newlines
# and spaces), per Windows naming rules.
title = wb_data.h2.string
for ch in '\n:?" <>|*/\\':
    title = title.replace(ch, "")
file_path = file_path + title + "\\"

# First run for this album: create the folder and save the blurb alongside it.
if not os.path.exists(file_path):
    introduce = str(wb_data.h2.next_sibling.next_sibling.next_element)
    os.mkdir(file_path)
    with open(file_path + "介绍.txt", "w", encoding="utf-8") as f:
        f.write("特辑号:%s\n" % number + introduce)

divs = wb_data.body.select('div[class="illust-wrap"]')
number_of_file = len(divs)
headers['Accept'] = 'image/webp,image/*,*/*;q=0.8'

for div in divs:
    wrapper_class = div.a.parent['class']  # bs4 returns class as a list
    # Animated works (ugoira) are skipped — TODO: decide how to handle them.
    if wrapper_class != ['ugoira-player', 'ui-scroll-view']:
        if wrapper_class == ['illust-multi-page-wrap']:
            getpic(div.img['src'], div.a['href'], "mul")
        else:
            getpic(div.img['src'], div.a['href'])
    order += 1