Data Collection: Beike New Homes [Complete Code (Database + Images)]

Posted: 2024-10-06 07:48:59
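The code below is the item pipeline module (typically the project's pipelines.py). It defines two pipelines: MysqlPipeline, which inserts each scraped record into a MySQL table named HouseInfo, and HouseImagePipeline, a subclass of Scrapy's ImagesPipeline that downloads the listing image and stores its local path back on the item. For this to work, settings.py has to provide the MYSQL_DB_* keys the pipeline reads, enable both pipelines, and tell ImagesPipeline where to save files. A minimal sketch follows; only the MYSQL_DB_* key names come from the pipeline code, while the concrete values and the "newhouse" module path are placeholders for your own project:

# settings.py -- minimal sketch; values and the "newhouse" module path are placeholders
MYSQL_DB_HOST = "127.0.0.1"
MYSQL_DB_PORT = 3306
MYSQL_DB_NAME = "house_db"
MYSQL_DB_USER = "root"
MYSQL_DB_PASSWORD = "your_password"

IMAGES_STORE = "./images"            # directory where ImagesPipeline saves downloads

ITEM_PIPELINES = {
    "newhouse.pipelines.HouseImagePipeline": 300,   # download the image first
    "newhouse.pipelines.MysqlPipeline": 400,        # then write the record to MySQL
}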
# pipelines.py
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import pymysql


class MysqlPipeline:
    def open_spider(self, spider):
        # Read the database settings from settings.py
        host = spider.settings.get("MYSQL_DB_HOST")
        port = spider.settings.getint("MYSQL_DB_PORT")
        dbname = spider.settings.get("MYSQL_DB_NAME")
        user = spider.settings.get("MYSQL_DB_USER")
        pwd = spider.settings.get("MYSQL_DB_PASSWORD")
        # Open the database connection (utf8mb4 so Chinese text is stored correctly)
        self.db_conn = pymysql.connect(host=host, port=port, database=dbname,
                                       user=user, password=pwd, charset="utf8mb4")
        # Open a cursor
        self.db_cur = self.db_conn.cursor()

    def process_item(self, item, spider):
        # Data matching the %s placeholders
        values = (item["house"], item["address"], item["price"], item["total"])
        # SQL statement; the data part is written as %s placeholders
        sql = "insert into HouseInfo(house,address,price,total) values(%s,%s,%s,%s)"
        self.db_cur.execute(sql, values)  # execute the SQL statement
        return item

    def close_spider(self, spider):
        self.db_conn.commit()   # commit the transaction
        self.db_cur.close()     # close the cursor
        self.db_conn.close()    # close the database connection


class HouseImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Request the download of the item's image.
        # If image_urls held a list of URLs, loop over it instead:
        # for image_url in item["image_urls"]:
        #     yield Request(image_url)
        yield Request(item["image_urls"])  # here image_urls is a single URL string

    def item_completed(self, results, item, info):
        # Handle the download results.
        # results - downloaded resource info, e.g. [(True, {'url': '/', 'path': 'full/',
        #           'checksum': '037f4f643599f3e7870225798ece845b', 'status': 'downloaded'})]
        # item    - the scraped item
        image_path = [x["path"] for ok, x in results if ok]
        if not image_path:
            raise DropItem("Item contains no images")
        item["image_path"] = image_path[0]
        return item
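Both pipelines expect an item exposing the fields referenced above: house, address, price, total, image_urls and image_path. A minimal items.py that satisfies them could look like the sketch below; the field names match what the pipelines use, but the class name NewHouseItem is illustrative and not taken from the original code:

# items.py -- minimal sketch; the class name NewHouseItem is an assumption
import scrapy

class NewHouseItem(scrapy.Item):
    house = scrapy.Field()        # name of the development
    address = scrapy.Field()      # address
    price = scrapy.Field()        # unit price
    total = scrapy.Field()        # total price
    image_urls = scrapy.Field()   # a single cover-image URL string (see get_media_requests)
    image_path = scrapy.Field()   # local file path filled in by HouseImagePipeline

On the database side, MysqlPipeline only inserts rows, so a HouseInfo table with house, address, price and total columns (VARCHAR columns are sufficient for this data) has to be created in the target database beforehand.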