爬取豆瓣电影 Top250(movie.douban.com/top250)的电影数据,并保存在 MongoDB 中。
创建项目
scrapy startproject douban
items.py
import scrapy


class DoubanItem(scrapy.Item):
    """Item holding the fields collected for a single movie entry."""

    # Title
    title = scrapy.Field()
    # Info text
    bd = scrapy.Field()
    # Rating
    star = scrapy.Field()
    # Short introduction / quote
    quote = scrapy.Field()
创建CrawlSpider,使用模版crawl
scrapy genspider -t crawl doubanmovie douban.com
import scrapy

from douban.items import DoubanItem


class DoubamovieSpider(scrapy.Spider):
    """Spider that pages through the Douban movie Top 250 list.

    Starts at ``start=0`` and requests the following pages in steps of 25
    until offset 225 has been requested, yielding one ``DoubanItem`` per
    movie block found on each page.
    """

    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]
    # Paging offset appended to ``url``; advanced by 25 after each page.
    offset = 0
    url = "https://movie.douban.com/top250?start="
    start_urls = (
        url + str(offset),
    )

    def parse(self, response):
        """Extract every movie on the page, then schedule the next page.

        Args:
            response: The downloaded list page.

        Yields:
            A ``DoubanItem`` for each ``div.info`` block, followed by a
            ``scrapy.Request`` for the next page while offset is below 225.

        Raises:
            IndexError: If a movie block lacks a title, info text or rating.
        """
        movies = response.xpath("//div[@class='info']")

        for each in movies:
            # Build a fresh item per movie. Reusing a single item across
            # iterations would leak the previous movie's ``quote`` into an
            # entry that has none, and every yielded item would be the same
            # mutable object.
            item = DoubanItem()
            # Title
            item['title'] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
            # Info text
            item['bd'] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
            # Rating
            item['star'] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]
            # Short introduction; not every movie has one, so it is optional.
            quote = each.xpath(".//p[@class='quote']/span/text()").extract()
            if quote:
                item['quote'] = quote[0]

            yield item

        # Offsets run 0, 25, ..., 225; stop once the last one was requested.
        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
pipelines.py
import pymongo
from scrapy.utils.project import get_project_settings


class DoubanPipeline(object):
    """Item pipeline that stores every scraped item in a MongoDB collection.

    Connection parameters are read from the ``MONGODB_HOST``,
    ``MONGODB_PORT``, ``MONGODB_DBNAME`` and ``MONGODB_SHEETNAME`` settings.
    """

    def __init__(self, settings=None):
        """Open the MongoDB connection and select the target collection.

        Args:
            settings: Settings object to read the ``MONGODB_*`` keys from.
                When omitted, the project settings are loaded, so the
                pipeline can still be constructed without arguments.
        """
        if settings is None:
            # ``scrapy.conf.settings`` is deprecated; load the project
            # settings explicitly instead.
            settings = get_project_settings()

        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]

        # Create the MongoDB connection; kept on self so it can be closed.
        self.client = pymongo.MongoClient(host=host, port=port)
        # Select the database.
        mydb = self.client[dbname]
        # Collection that receives the scraped items.
        self.sheet = mydb[sheetname]

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the running crawler's settings."""
        return cls(crawler.settings)

    def process_item(self, item, spider):
        """Insert ``item`` into the collection as a plain dict.

        Args:
            item: The scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item`` so later pipelines can process it.
        """
        data = dict(item)
        # ``Collection.insert`` is deprecated in PyMongo 3 and removed in 4.
        self.sheet.insert_one(data)
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()
settings.py
BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): this value has no closing parenthesis; it is kept exactly as
# it was - confirm whether that is intentional.
USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"

DOWNLOAD_DELAY = 2.5

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Downloader middlewares and their order values.
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.RandomUserAgent': 100,
    'douban.middlewares.RandomProxy': 200,
}

# List of User-Agent strings.
USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
]

# List of proxies. "user_passwd" holds "username:password" credentials.
PROXIES = [
    {"ip_port": "121.42.140.113:16816", "user_passwd": "用户名:密码"},
    # Further proxies follow the same shape, e.g.:
    # {"ip_port": "121.42.140.113:16816", "user_passwd": ""},
]

ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

# MongoDB host name
MONGODB_HOST = "127.0.0.1"

# MongoDB port number
MONGODB_PORT = 27017

# Database name
MONGODB_DBNAME = "Douban"

# Name of the collection that stores the data
MONGODB_SHEETNAME = "doubanmovies"