Project structure:
MySQL table creation statement
CREATE TABLE `myhexun` (
  `id` int(9) NOT NULL AUTO_INCREMENT COMMENT 'article id',
  `name` varchar(60) DEFAULT NULL COMMENT 'article title',
  `url` varchar(100) DEFAULT NULL COMMENT 'article url',
  `hits` int(15) DEFAULT NULL COMMENT 'article hit count',
  `comment` int(15) DEFAULT NULL COMMENT 'article comment count',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1876 DEFAULT CHARSET=utf8;
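The pipeline below connects to a database named spider, which the statement above does not create by itself. A minimal setup sketch, assuming a local MySQL server with the same root/123456 credentials used later in pipelines.py (the script name create_db.py is just an illustration):

# create_db.py -- one-off setup sketch, not part of the Scrapy project
import pymysql

# Credentials match those used in pipelines.py below; adjust if yours differ.
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456", charset="utf8")
try:
    with conn.cursor() as cur:
        # Create the target database and table if they do not exist yet.
        cur.execute("CREATE DATABASE IF NOT EXISTS spider DEFAULT CHARSET utf8")
        cur.execute("USE spider")
        cur.execute("""
            CREATE TABLE IF NOT EXISTS `myhexun` (
              `id` int(9) NOT NULL AUTO_INCREMENT COMMENT 'article id',
              `name` varchar(60) DEFAULT NULL COMMENT 'article title',
              `url` varchar(100) DEFAULT NULL COMMENT 'article url',
              `hits` int(15) DEFAULT NULL COMMENT 'article hit count',
              `comment` int(15) DEFAULT NULL COMMENT 'article comment count',
              PRIMARY KEY (`id`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8
        """)
    conn.commit()
finally:
    conn.close()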
1、pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class HexunpjtPipeline(object):
    def __init__(self):
        # Connect to the local MySQL server; charset must match the utf8 table
        # so Chinese article titles are stored correctly.
        self.conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456",
                                    db="spider", charset="utf8")

    def process_item(self, item, spider):
        # Each blog list page contains several posts, so loop over them one by one.
        for j in range(0, len(item['name'])):
            # Assign the j-th name, url, hits and comment values to local variables.
            name = item['name'][j]
            url = item['url'][j]
            hits = item['hits'][j]
            comment = item['comment'][j]
            print("name:%s,url:%s,hits:%s,comment:%s" % (name, url, hits, comment))
            # Build the SQL statement that inserts the extracted data into the table.
            sql = "insert into myhexun(name,url,hits,comment) VALUES ('" + name + "','" + url \
                  + "','" + hits + "','" + comment + "')"
            print("SQL statement: %s" % sql)
            # Execute the SQL statement through the connection's query() method.
            self.conn.query(sql)
            self.conn.commit()
        return item

    def close_spider(self, spider):
        # Close the database connection when the spider finishes.
        self.conn.close()
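The string-concatenated SQL above works for this blog's data, but it breaks on any title containing a single quote and is open to SQL injection. A safer variant of process_item, shown only as a sketch of the parameterized-query style that pymysql supports (the rest of the pipeline stays unchanged):

    def process_item(self, item, spider):
        # Same iteration as above, but let pymysql escape the values.
        for j in range(len(item['name'])):
            with self.conn.cursor() as cursor:
                cursor.execute(
                    "insert into myhexun(name,url,hits,comment) VALUES (%s, %s, %s, %s)",
                    (item['name'][j], item['url'][j], item['hits'][j], item['comment'][j])
                )
            self.conn.commit()
        return item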
2、settings.py
Uncomment the following lines:
ITEM_PIPELINES = {
'hexunpjt.pipelines.HexunpjtPipeline': 300,
}
3、items.py
# -*- coding: utf-8 -*-
import scrapy


class HexunpjtItem(scrapy.Item):
    # name stores the article title
    name = scrapy.Field()
    # url stores the article URL
    url = scrapy.Field()
    # hits stores the article hit (read) count
    hits = scrapy.Field()
    # comment stores the article comment count
    comment = scrapy.Field()
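Note that each field will hold a list, one element per post on a blog list page, which is why the pipeline indexes item['name'][j] and friends. A quick illustrative sketch (the sample values are made up) of how the item behaves like a dict of parallel lists:

from hexunpjt.items import HexunpjtItem

item = HexunpjtItem()
# The spider assigns whole lists, one element per post on the page.
item['name'] = ["post A", "post B"]
item['hits'] = ["120", "35"]
# The pipeline then walks the lists in parallel by index.
for j in range(len(item['name'])):
    print(item['name'][j], item['hits'][j])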
4、myhexunspd.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from hexunpjt.items import HexunpjtItem
import urllib.request
import re


# Example blog: http://14755969.blog.hexun.com/
class MyhexunspdSpider(scrapy.Spider):
    name = 'myhexunspd'
    # uid of the blog owner to crawl; used below to build the crawl URLs
    uid = "14755969"
    allowed_domains = ['hexun.com']
    start_urls = ['http://hexun.com/']

    def start_requests(self):
        # Pretend to be a browser for the first request.
        yield Request("http://" + str(self.uid) + ".blog.hexun.com/p1/default.html",
                      headers={'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36"})

    def parse(self, response):
        item = HexunpjtItem()
        item['name'] = response.xpath("//span[@class='ArticleTitleText']/a/text()").extract()
        item['url'] = response.xpath("//span[@class='ArticleTitleText']/a/@href").extract()
        # The hit and comment counts are loaded by a separate JavaScript file,
        # so fetch that file with urllib and parse it with regular expressions.
        # pat1 extracts the URL of the script that carries the counts.
        pat1 = '<script type="text/javascript" src="(.*?)">'
        # hcurl is the URL that serves the hit and comment counts.
        hcurl = re.compile(pat1).findall(str(response.body))[0]
        # print("counts URL: %s" % hcurl)
        # Pretend to be a browser for the urllib request as well.
        headers2 = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36')
        opener = urllib.request.build_opener()
        opener.addheaders = [headers2]
        # Install the opener globally.
        urllib.request.install_opener(opener)
        # data holds the hit and comment counts for every post on this list page.
        data = urllib.request.urlopen(hcurl).read()
        # print("counts response: %s" % data)
        # pat2 extracts the hit (read) count of each post.
        pat2 = r"click\d*?','(\d*?)'"
        # pat3 extracts the comment count of each post.
        pat3 = r"comment\d*','(\d*?)'"
        # Extract the counts and assign them to the item's hits and comment fields.
        item["hits"] = re.compile(pat2).findall(str(data))
        item["comment"] = re.compile(pat3).findall(str(data))
        yield item
        # Extract the total number of blog list pages.
        pat4 = r"blog.hexun.com/p(.*?)/"
        # findall returns a list; its second-to-last element is the total page count.
        data2 = re.compile(pat4).findall(str(response.body))
        if len(data2) >= 2:
            totalurl = data2[-2]
        else:
            totalurl = 1
        # The following print is useful while debugging and can stay commented out in normal runs.
        # print("total pages: " + str(totalurl))
        # Crawl the remaining blog list pages one by one.
        for i in range(2, int(totalurl) + 1):
            # Build the URL of the next list page to crawl.
            nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
            # Request the next page, again pretending to be a browser.
            yield Request(nexturl, callback=self.parse,
                          headers={'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36"})
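To see what pat2 and pat3 pull out of the counts script, here is a small standalone sketch. The sample string is fabricated purely so that both regexes match; it is not the real hexun.com response format:

import re

# Illustrative only: a made-up fragment shaped like the data the regexes expect.
data = "...('click104311276','2716')...('comment104311276','3')..."

pat2 = r"click\d*?','(\d*?)'"
pat3 = r"comment\d*','(\d*?)'"
print(re.findall(pat2, data))   # ['2716']  -> hit counts, in page order
print(re.findall(pat3, data))   # ['3']     -> comment counts, in page order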
5、Running the spider
scrapy crawl myhexunspd
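After the crawl finishes, you can check that the rows actually landed in MySQL. A minimal verification sketch, using the same assumed credentials as pipelines.py:

import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456", db="spider", charset="utf8")
try:
    with conn.cursor() as cur:
        cur.execute("select count(*) from myhexun")
        print("rows inserted:", cur.fetchone()[0])
        cur.execute("select name, hits, comment from myhexun order by hits desc limit 5")
        for row in cur.fetchall():
            print(row)
finally:
    conn.close()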
6、Notes
1、When inserting data into MySQL from Python, you must call commit() at the end, otherwise the data is never actually written to the database.
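This is because pymysql connections do not autocommit by default, so an INSERT that is never followed by commit() is discarded when the connection closes. A minimal sketch of the two ways to handle it (table and credentials as above; the demo row is illustrative only):

import pymysql

# Option 1: commit explicitly after the inserts, as the pipeline above does.
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456", db="spider", charset="utf8")
with conn.cursor() as cur:
    # Illustrative demo row, not real crawled data.
    cur.execute("insert into myhexun(name,url,hits,comment) VALUES (%s,%s,%s,%s)",
                ("demo title", "http://example.com", 1, 0))
conn.commit()          # without this line the row is never persisted
conn.close()

# Option 2: enable autocommit when opening the connection.
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456", db="spider",
                       charset="utf8", autocommit=True)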