# Data collection from Beike new-home listings — full pipeline code (database + images)
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import pymysql
class MysqlPipeline:
    """Scrapy item pipeline that persists house items into a MySQL table.

    Reads connection settings (MYSQL_DB_HOST/PORT/NAME/USER/PASSWORD) from
    the spider's settings, inserts one row per item into HouseInfo, and
    closes the connection when the spider finishes.
    """

    def open_spider(self, spider):
        """Open the database connection and cursor when the spider starts."""
        # Read connection parameters from the project settings.
        host = spider.settings.get("MYSQL_DB_HOST")
        port = spider.settings.get("MYSQL_DB_PORT")
        dbname = spider.settings.get("MYSQL_DB_NAME")
        user = spider.settings.get("MYSQL_DB_USER")
        pwd = spider.settings.get("MYSQL_DB_PASSWORD")
        # charset="utf8mb4" so Chinese text in the scraped data is stored
        # correctly instead of being mangled by a latin1 default.
        self.db_conn = pymysql.connect(
            host=host, port=port, db=dbname, user=user, password=pwd,
            charset="utf8mb4",
        )
        # Open a cursor reused for every insert.
        self.db_cur = self.db_conn.cursor()

    def process_item(self, item, spider):
        """Insert one item into HouseInfo; commit per item so a crash
        mid-crawl does not lose all previously scraped rows."""
        values = (
            item["house"],
            item["address"],
            item["price"],
            item["total"],
        )
        # Parameterized SQL: %s placeholders are bound by pymysql, which
        # also guards against SQL injection from scraped text.
        sql = "insert into HouseInfo(house,address,price,total) values(%s,%s,%s,%s)"
        try:
            self.db_cur.execute(sql, values)
            self.db_conn.commit()
        except pymysql.MySQLError:
            # Undo the failed statement so the connection stays usable,
            # then re-raise so Scrapy logs the error for this item.
            self.db_conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider closes.

        Each item was already committed in process_item, so no final
        commit is needed here.
        """
        self.db_cur.close()
        self.db_conn.close()
class HouseImagePipeline(ImagesPipeline):
    """Image pipeline that downloads listing photos and records the saved
    path of the first downloaded image on the item as item["image_path"]."""

    def get_media_requests(self, item, info):
        """Schedule download requests for the item's image URL(s).

        Accepts either a single URL string (this spider's current shape)
        or an iterable of URLs (the standard Scrapy image_urls convention),
        so the pipeline keeps working if the spider switches to a list.
        """
        urls = item["image_urls"]
        if isinstance(urls, str):
            urls = [urls]
        for image_url in urls:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        """Handle the download results for one item.

        results is a list of (success, info_dict) pairs, e.g.
        [(True, {'url': ..., 'path': 'full/...', 'checksum': ...,
        'status': 'downloaded'})]. Keeps only successful downloads; drops
        the item if nothing was downloaded, otherwise stores the first
        saved path on the item.
        """
        image_paths = [data["path"] for ok, data in results if ok]
        if not image_paths:
            raise DropItem("items contains no images")
        item["image_path"] = image_paths[0]
        return item