Download address: https://www.lfd.uci.edu/~gohlke/pythonlibs/
pip install wheel
pip install lxml
pip install pyopenssl
pip install Twisted
pip install pywin32
pip install scrapy
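On Windows, some of these (lxml, Twisted, pywin32) may fail to compile with pip; in that case download the matching .whl for your Python version from the address above and install it directly. The filename below is only an example:
pip install Twisted-20.3.0-cp37-cp37m-win_amd64.whl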
scrapy startproject jandan    # create the project
cd jandan
cd jandan
items.py       defines the data items
pipelines.py   pipeline file
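For reference, scrapy startproject generates a layout like this:
jandan/
    scrapy.cfg
    jandan/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py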
Since jandan.net has anti-crawler measures, we need some extra handling.
settings.py
ROBOTSTXT_OBEY = False      # do not obey robots.txt
DOWNLOAD_DELAY = 2          # download delay (seconds)
DOWNLOAD_TIMEOUT = 15       # download timeout (seconds)
COOKIES_ENABLED = False     # disable cookies
DOWNLOADER_MIDDLEWARES = {
    # random request header (User-Agent)
    'jandan.middlewares.RandomUserAgent': 100,
    # random proxy IP
    'jandan.middlewares.RandomProxy': 200,
}
# User-Agent list
USER_AGENTS = [
    # Maxthon
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    # Firefox
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    # Chrome
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
# proxy IP list
PROXIES = [
    {"ip_port": "119.177.90.103:9999", "user_passwd": ""},
    # proxy IP without password
    {"ip_port": "101.132.122.230:3128", "user_passwd": ""},
    # proxy IP with password
    # {"ip_port": "123.139.56.238:9999", "user_passwd": "root:admin"}
]
# item pipeline, uncomment to enable
ITEM_PIPELINES = {
    'jandan.pipelines.JandanPipeline': 300,
}
# directory where downloaded images are stored
IMAGES_STORE = "images"
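Free proxies die quickly, so it can help to check an entry before adding it to PROXIES. A minimal standalone sketch using requests (the test URL httpbin.org/ip is just an assumption for illustration):
import requests

def proxy_alive(ip_port, timeout=5):
    # returns True if the proxy answers a simple GET within the timeout
    proxies = {"http": "http://" + ip_port}
    try:
        r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

print(proxy_alive("119.177.90.103:9999"))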
middlewares.py
import random
import base64

from jandan.settings import USER_AGENTS
from jandan.settings import PROXIES


class RandomUserAgent(object):
    def process_request(self, request, spider):
        # pick a random User-Agent for every request
        useragent = random.choice(USER_AGENTS)
        request.headers.setdefault("User-Agent", useragent)


class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        if not proxy["user_passwd"]:
            # proxy without authentication
            request.meta["proxy"] = "http://" + proxy["ip_port"]
        else:
            # base64 works on bytes; in py3 str is unicode, so encode first (the result is bytes)
            base64_userpasswd = base64.b64encode(proxy["user_passwd"].encode())
            request.meta["proxy"] = "http://" + proxy["ip_port"]
            # the header value is a str, so decode the bytes before concatenating
            request.headers["Proxy-Authorization"] = "Basic " + base64_userpasswd.decode()
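As a quick standalone sanity check of the auth header built above (not part of the project files):
import base64

# for a proxy entry like {"ip_port": "...", "user_passwd": "root:admin"}
print("Basic " + base64.b64encode("root:admin".encode()).decode())
# prints: Basic cm9vdDphZG1pbg==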
items.py
import scrapy


class JandanItem(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
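Once the spider fills it in, the item behaves like a dict; a tiny sketch with placeholder values:
from jandan.items import JandanItem

item = JandanItem()
item["name"] = "some title"
item["url"] = "http://example.com/a.jpg"
print(dict(item))    # {'name': 'some title', 'url': 'http://example.com/a.jpg'}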
scrapy genspider -t crawl dj jandan.net    # create a CrawlSpider-based spider
This automatically creates the spider file under spiders/. The image list on the page is rendered by JavaScript, so we render it with Selenium/PhantomJS and use BeautifulSoup to locate the elements and extract the data.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jandan.items import JandanItem
from selenium import webdriver
from bs4 import BeautifulSoup as bs4


class JdSpider(CrawlSpider):
    name = 'jd'
    allowed_domains = ['jandan.net']
    start_urls = ['http://jandan.net/pic/page-1#comments/']

    rules = (
        Rule(LinkExtractor(allow=r'pic/page-\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # render the page with PhantomJS so the JS-generated content is present
        driver = webdriver.PhantomJS()
        driver.get(response.url)
        soup = bs4(driver.page_source, 'html.parser')
        driver.quit()
        all_data = soup.find_all('div', {'class': 'row'})
        for i in all_data:
            # create a fresh item for every row
            item = JandanItem()
            name = i.find("strong")
            link = i.find('a', {'class': 'view_img_link'})
            if name is None or link is None:
                continue
            item["name"] = name.get_text().strip()
            url = link.get("href")
            if not url:
                continue
            # normalize the protocol-relative href to an absolute http URL
            item["url"] = "http://" + url.split("//")[-1]
            yield item
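To check the extraction logic without running the whole spider, you can feed BeautifulSoup a small HTML fragment shaped like the comment rows above (the markup below is a simplified assumption of the real page):
from bs4 import BeautifulSoup as bs4

sample = '''
<div class="row">
  <strong> someone </strong>
  <a class="view_img_link" href="//example.com/img/001.jpg">[view original]</a>
</div>
'''
soup = bs4(sample, 'html.parser')
row = soup.find('div', {'class': 'row'})
print(row.find('strong').get_text().strip())                  # someone
print(row.find('a', {'class': 'view_img_link'}).get('href'))  # //example.com/img/001.jpg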
pipelines.py
import json
import os
import requests


class JandanPipeline(object):
    # save as a JSON file
    # def __init__(self):
    #     self.filename = open("jandan.json", "wb")
    #     self.num = 0
    #
    # def process_item(self, item, spider):
    #     text = json.dumps(dict(item), ensure_ascii=False) + "\n"
    #     self.filename.write(text.encode("utf-8"))
    #     self.num += 1
    #     return item
    #
    # def close_spider(self, spider):
    #     self.filename.close()
    #     print("crawled " + str(self.num) + " items in total")

    # download the images to the local IMAGES_STORE directory
    def process_item(self, item, spider):
        if 'url' in item:
            # scrapy.conf is deprecated; read the setting from the running spider instead
            dir_path = spider.settings["IMAGES_STORE"]
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            su = "." + item["url"].split(".")[-1]
            path = item["name"] + su
            new_path = '%s/%s' % (dir_path, path)
            if not os.path.exists(new_path):
                response = requests.get(item["url"], stream=True)
                with open(new_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
        return item
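As an aside, Scrapy also ships a built-in ImagesPipeline that handles downloading and deduplication for you. A minimal sketch of how it would be wired up (this is an alternative to the pipeline above, not what this project uses; it requires Pillow and the item would need image_urls/images fields):
# settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = "images"

# items.py — ImagesPipeline expects these two fields
# class JandanItem(scrapy.Item):
#     image_urls = scrapy.Field()
#     images = scrapy.Field()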
scrapy crawl jd    # start the spider (the name must match the spider's name attribute, here 'jd')
scrapy shell "https://hr.tencent.com/position.php?&start=0"    # send a request and debug it interactively
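Inside the shell session, the downloaded response is available for inspection; for example (the selector is only illustrative):
response.status                                   # HTTP status of the fetched page
response.xpath('//title/text()').extract_first()  # grab the page title
view(response)                                    # open the downloaded response in a browser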
Here is my GitHub address; the project is updated regularly:
https://github.com/bjptw/workspace