requests库,在使用ip代理时,单ip代理和多ip代理的写法不同
(目前测试通过,如有错误,请评论指正)
-
单ip代理模式
省去headers等:

    import requests
    # 注意:proxies 字典的键必须是小写的 'http'/'https',大写的 'HTTPS' 会被 requests 忽略
    proxy = {'https': '162.105.30.101:8080'}
    url = '爬取链接地址'
    response = requests.get(url, proxies=proxy)
- 多ip代理模式
import requests
# random is used to pick one proxy from the pool for each request
import random

# Proxy pool: each entry maps a URL scheme ('http'/'https') to a proxy
# address; requests uses the entry whose key matches the target URL's scheme.
# NOTE(review): these are public proxies from the original post and are
# almost certainly dead by now — refresh before real use.
proxy = [
    {
        'http': 'http://61.135.217.7:80',
        'https': 'http://61.135.217.7:80',
    },
    {
        'http': 'http://118.114.77.47:8080',
        'https': 'http://118.114.77.47:8080',
    },
    {
        'http': 'http://112.114.31.177:808',
        'https': 'http://112.114.31.177:808',
    },
    {
        'http': 'http://183.159.92.117:18118',
        'https': 'http://183.159.92.117:18118',
    },
    {
        'http': 'http://110.73.10.186:8123',
        'https': 'http://110.73.10.186:8123',
    },
]
url = '爬取链接地址'
# Pick a random proxy from the pool for this request.
response = requests.get(url, proxies=random.choice(proxy))
简单的智联招聘爬虫封装
import requests
from bs4 import BeautifulSoup
import re  # NOTE(review): imported but unused in the visible code
import ssl
import time
import random
# Disable TLS certificate verification globally so HTTPS requests succeed
# against sites with bad/self-signed certs. This is a security trade-off:
# it turns off verification for the whole process, not just this scraper.
ssl._create_default_https_context = ssl._create_unverified_context
# Pool of browser User-Agent strings; one is chosen at random per request
# (see get_job_txt) so the scraper looks less like a single automated client.
user_agent = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
# 代理若出错,替换为下方注释掉的代理池,但代理池需要更新
# proxy = [
# {
# \'http\': \'http://61.135.217.7:80\',
# \'https\': \'http://61.135.217.7:80\',
# },
# {
# \'http\': \'http://118.114.77.47:8080\',
# \'https\': \'http://118.114.77.47:8080\',
# },
# {
# \'http\': \'http://112.114.31.177:808\',
# \'https\': \'http://112.114.31.177:808\',
# },
# {
# \'http\': \'http://183.159.92.117:18118\',
# \'https\': \'http://183.159.92.117:18118\',
# },
# {
# \'http\': \'http://110.73.10.186:8123\',
# \'https\': \'http://110.73.10.186:8123\',
# },
# ]
def get_job_txt(city, kw, txt_name):
    """Scrape Zhilian Zhaopin (sou.zhaopin.com) search results into a text file.

    Args:
        city: city name, sent as the 'jl' query parameter.
        kw: job keyword, sent as the 'kw' query parameter.
        txt_name: output file name (without extension); results are appended
            to '<txt_name>.txt' as tab-separated lines of
            job / company / salary / location / link.

    Iterates result pages until an empty page is returned (end of listing)
    or 100 pages have been fetched.
    """
    # Result pages are 1-based; the original loop started at page 0
    # (off-by-one, wasting the first request).
    for page in range(1, 101):
        time.sleep(2)  # throttle requests to avoid being blocked
        url = ('https://sou.zhaopin.com/jobs/searchresult.ashx'
               '?jl={2}&kw={0}&sm=0&p={1}').format(kw, page, city)
        # Rotate the User-Agent per request so traffic looks less automated.
        html = requests.get(
            url, headers={'User-Agent': random.choice(user_agent)}
        ).content.decode()
        soup = BeautifulSoup(html, 'lxml')
        # The first '.newlist' element is the header row; skip it.
        tables = soup.select('.newlist')[1:]
        if not tables:
            # Empty page: we've gone past the last page of results.
            print('总页' + str(page))
            break
        # Open the output file once per page instead of once per row
        # (the original reopened it for every single listing).
        with open('{0}.txt'.format(txt_name), 'a+',
                  encoding='utf-8', errors='ignore') as f:
            for table in tables:
                job = table.select('.zwmc')[0].text
                company = table.select('.gsmc')[0].text
                money = table.select('.zwyx')[0].text
                place = table.select('.gzdd')[0].text
                href = table.select('.zwmc')[0].find('a')['href']
                line = job + '\t' + company + '\t' + money + '\t' + place + '\t' + href + '\n'
                print(line)
                f.write(line)
if __name__ == '__main__':
    # Interactive entry point: prompt for the search parameters, then scrape.
    city = input('输入城市')
    kw = input('输入岗位')
    txt_name = input('输入储存文件名')
    get_job_txt(city=city, kw=kw, txt_name=txt_name)
转自https://blog.csdn.net/weixin_35993084/article/details/80770157