环境搭建
Windows下安装Python: http://www.cnblogs.com/0bug/p/8228378.html
virtualenv的安装:http://www.cnblogs.com/0bug/p/8598458.html
创建项目的虚拟环境(Python3.6):
mkvirtualenv article_spider
安装Scrapy
1.pip install lxml
2.pip install pyopenssl
3.pip install pywin32
4.下载相应版本的twisted https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
5.安装下载的twisted : pip install C:\Users\lichengguang\Downloads\Twisted-17.9.0-cp36-cp36m-win_amd64.whl
6.pip install scrapy
选择一个项目的工程目录,创建项目:
scrapy startproject ArticleSpider
cd ArticleSpider\
scrapy genspider jobbole blog.jobbole.com
用Pycharm打开项目进行编写
设置用于调试的启动脚本main
# -*- coding:utf-8 -*-
# Debug launcher: starts the "jobbole" spider from inside the IDE, exactly as
# the command line `scrapy crawl jobbole` would.
import os
import sys

from scrapy.cmdline import execute

# Directory that contains this script; appended to the import path before
# the crawl command is executed.
project_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(project_dir)

execute(['scrapy', 'crawl', 'jobbole'])
import os

# Absolute path of the current file.
p1 = os.path.abspath(__file__)
print(p1)  # D:\Workspace\ArticleSpider\ArticleSpider\test.py

# Directory that contains the current file (one level above the file path).
p2 = os.path.dirname(p1)
print(p2)  # D:\Workspace\ArticleSpider\ArticleSpider
在settings.py里把robots协议项设置为False
ROBOTSTXT_OBEY = False
XPATH
xpath基础语法:http://www.cnblogs.com/0bug/p/8903668.html
开始编写爬虫
编写jobbole.py,初始url设置为 http://blog.jobbole.com/110287/
# -*- coding: utf-8 -*-
import scrapy


class JobboleSpider(scrapy.Spider):
    """Spider skeleton for articles on blog.jobbole.com."""

    # Spider name; this is the value passed to `scrapy crawl`.
    name = "jobbole"
    allowed_domains = ["blog.jobbole.com"]
    # Single article page used as the starting point.
    start_urls = ["http://blog.jobbole.com/110287/"]

    def parse(self, response):
        """Placeholder callback; field extraction is not implemented yet."""
        return None
启动scrapy shell
scrapy shell http://blog.jobbole.com/110287/
用xpath来找标题
>>> title = response.xpath('//div[@class="entry-header"]/h1/text()')
>>> title
[<Selector xpath='//div[@class="entry-header"]/h1/text()' data='2016 腾讯软件开发面试题(部分)'>]
>>> title.extract()
['2016 腾讯软件开发面试题(部分)']
>>> title.extract()[0]
'2016 腾讯软件开发面试题(部分)'
用css选择器的写法:
>>> title = response.css('.entry-header h1::text').extract()
>>> title
['2016 腾讯软件开发面试题(部分)']
>>> title = response.css('.entry-header h1::text').extract()[0]
>>> title
'2016 腾讯软件开发面试题(部分)'
通过xpath提取文章的具体字段
def parse(self, response):
    """Extract the individual fields of an article page using XPath.

    The extracted values are only bound to local names; nothing is returned
    or yielded yet. Missing nodes fall back to an empty string (or 0 for the
    numeric counters) instead of raising ``IndexError``.

    Args:
        response: The downloaded page; must provide ``xpath()`` returning a
            selector list with ``extract()`` and ``extract_first()``.
    """
    import re

    title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first("")

    # The date text carries surrounding whitespace and a "·" separator.
    create_date = (
        response.xpath("//p[@class='entry-meta-hide-on-mobile']/text()")
        .extract_first("")
        .strip()
        .replace("·", "")
        .strip()
    )

    praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/h10/text()").extract_first("")

    # The bookmark label mixes a number with text; keep only the first run
    # of digits and treat "no digits" as zero.
    fav_text = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract_first("")
    match_re = re.match(r".*?(\d+).*", fav_text)
    fav_nums = int(match_re.group(1)) if match_re else 0

    # Same treatment for the comment counter.
    comment_text = response.xpath("//a[@href='#article-comment']/span/text()").extract_first("")
    match_re = re.match(r".*?(\d+).*", comment_text)
    comment_nums = int(match_re.group(1)) if match_re else 0

    content = response.xpath("//div[@class='entry']").extract_first("")

    # The tag links also include the "N 评论" (comments) link; drop it.
    tag_list = response.xpath("//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
    tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
    tags = ",".join(tag_list)
通过css选择器提取文章的具体字段
def parse(self, response):
    """Extract the individual fields of an article page using CSS selectors.

    The extracted values are only bound to local names; nothing is returned
    or yielded yet. Missing nodes fall back to an empty string (or 0 for the
    numeric counters) instead of raising ``IndexError``.

    Args:
        response: The downloaded page; must provide ``css()`` returning a
            selector list with ``extract()`` and ``extract_first()``, and a
            ``meta`` mapping.
    """
    import re

    # Article cover image URL, read from the request meta when present.
    front_image_url = response.meta.get("front_image_url", "")

    title = response.css(".entry-header h1::text").extract_first("")

    # The date text carries surrounding whitespace and a "·" separator.
    create_date = (
        response.css("p.entry-meta-hide-on-mobile::text")
        .extract_first("")
        .strip()
        .replace("·", "")
        .strip()
    )

    praise_nums = response.css(".vote-post-up h10::text").extract_first("")

    # The bookmark label mixes a number with text; keep only the first run
    # of digits and treat "no digits" as zero.
    fav_text = response.css(".bookmark-btn::text").extract_first("")
    match_re = re.match(r".*?(\d+).*", fav_text)
    fav_nums = int(match_re.group(1)) if match_re else 0

    # Same treatment for the comment counter.
    comment_text = response.css("a[href='#article-comment'] span::text").extract_first("")
    match_re = re.match(r".*?(\d+).*", comment_text)
    comment_nums = int(match_re.group(1)) if match_re else 0

    content = response.css("div.entry").extract_first("")

    # The tag links also include the "N 评论" (comments) link; drop it.
    tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
    tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
    tags = ",".join(tag_list)