设计题目: 豆瓣读书的Python爬虫
1. 数据爬虫结构
2. 数据存储方案
3. 数据分析方案
# -*- coding: UTF-8 -*-
"""Douban Books crawler.

Walks a paginated book listing, follows each book link to read the
star-rating distribution, and stores one row per book in the MySQL table
``douban``.
"""
import http.client
import importlib
import sys
import time
import urllib
import urllib.error
import urllib.request

import numpy as np
import pymysql
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook, comments

# Kept from the original script (a Python 2 leftover); it ran before the
# project-local import below, so that order is preserved.
importlib.reload(sys)
import judege

# Some User Agents
hds = [{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv: Gecko/20091201 Firefox/3.5.6'},
       {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}]

# NOTE(review): the listing URL prefix is an empty string in the source this
# was recovered from; fill in the real listing URL (ending in the offset
# query parameter) before running.
LIST_URL_PREFIX = ''
PAGE_SIZE = 15          # offset step appended to LIST_URL_PREFIX per page
FIRST_PAGE = 1          # first page index crawled
LAST_PAGE = 10          # crawl stops before this page index
MAX_FETCH_ATTEMPTS = 3  # per-page retries before the page is skipped
FETCH_TIMEOUT = 30      # seconds

# Everything a fetch can reasonably raise: network errors (URLError is an
# OSError), malformed URLs / undecodable bodies (ValueError), and truncated
# HTTP responses.
_FETCH_ERRORS = (OSError, ValueError, http.client.HTTPException)

# Placeholders are filled in by the driver, which escapes every value.
INSERT_SQL = (
    'insert into douban(title,grade,comment,fivestars,fourstars,threestars,'
    'twostars,onestar,author,info,time,price)'
    'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
)


def _fetch_html(url, headers=None):
    """Download *url* and return the body decoded as UTF-8."""
    request = urllib.request.Request(url, headers=headers or {})
    with urllib.request.urlopen(request, timeout=FETCH_TIMEOUT) as response:
        return response.read().decode('utf-8')


def _fetch_with_retries(url, headers=None):
    """Fetch *url* after a random pause; return None if every attempt fails."""
    for _ in range(MAX_FETCH_ATTEMPTS):
        # Random delay of up to 5 seconds between requests.
        time.sleep(np.random.rand() * 5)
        try:
            return _fetch_html(url, headers)
        except _FETCH_ERRORS:
            print("error")
    return None


def _to_float(text, default=0.0):
    """Parse *text* as a float, returning *default* when it is not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return default


def _digits_only(text):
    """Keep only ASCII digits and '.' so a price such as '39.00元' parses."""
    return ''.join(ch for ch in text if '0' <= ch <= '9' or ch == '.')


def _normalize_author(author):
    """Give every author a '[country]' prefix so the column is uniform.

    Listed authors carry a nationality tag such as '[美]' except Chinese
    authors, who have none; those get '[中]' prepended.
    """
    author = author.strip()
    if author and not author.startswith('['):
        author = '[中]' + author
    return author


def _parse_book(book_info):
    """Extract one listing entry into a dict, or None if markup is missing."""
    title_link = book_info.find('a', {'class': 'title'})
    desc_tag = book_info.find('div', {'class': 'desc'})
    if title_link is None or desc_tag is None:
        return None

    # The description is 'author(s) / publisher / date / price'; the author
    # part may itself contain '/', so fields are taken from the right.
    desc_list = desc_tag.get_text().strip().split('/')
    if len(desc_list) < 3:
        return None

    rating_tag = book_info.find('span', {'class': 'rating_nums'})
    rating_text = rating_tag.get_text() if rating_tag is not None else ''

    return {
        'title': title_link.get_text().strip(),
        'url': title_link.get('href'),
        'author': _normalize_author('/'.join(desc_list[0:-3])),
        'info': desc_list[-3],
        'time': desc_list[-2],
        # Stored as a float, so the currency suffix is dropped first.
        'price': _to_float(_digits_only(desc_list[-1])),
        # Books without a rating are stored as 0.0.
        'rating': _to_float(rating_text),
    }


def book_spider():
    """Crawl the listing pages and insert every parsed book into MySQL.

    Pages FIRST_PAGE .. LAST_PAGE - 1 are fetched. A page that cannot be
    downloaded after MAX_FETCH_ATTEMPTS tries, or that lacks the book list
    container, is skipped instead of being retried forever.

    Returns:
        1 when the crawl has finished.
    """
    # Connect to the database.
    conn = pymysql.connect(host='localhost', user='用户名', password='密码', db='数据库名称', port=3306, charset="utf8")
    try:
        with conn.cursor() as cur:
            for page_num in range(FIRST_PAGE, LAST_PAGE):
                # Rotate through the User-Agent list page by page.
                headers = hds[page_num % len(hds)]
                url = LIST_URL_PREFIX + str(page_num * PAGE_SIZE)

                plain_text = _fetch_with_retries(url, headers)
                if plain_text is None:
                    continue

                soup = BeautifulSoup(plain_text, 'html.parser')
                list_soup = soup.find('div', {'class': 'mod book-list'})
                if list_soup is None:
                    continue

                for book_info in list_soup.findAll('dd'):
                    book = _parse_book(book_info)
                    if book is None:
                        continue
                    people_num = get_people_num(book['url'], headers)
                    cur.execute(INSERT_SQL, (
                        book['title'], book['rating'],
                        people_num[0], people_num[1], people_num[2],
                        people_num[3], people_num[4], people_num[5],
                        book['author'], book['info'], book['time'],
                        book['price'],
                    ))
                    conn.commit()
    finally:
        conn.close()
    return 1


def get_people_num(url, headers=None):
    """Read the rating breakdown from a single book's detail page.

    The per-star figures are only shown on the detail page, as percentages
    of the total number of raters, so each one is converted to a count.

    Args:
        url: Address of the book's detail page.
        headers: Optional HTTP headers to send with the request.

    Returns:
        A list of six ints: [total raters, five-star, four-star, three-star,
        two-star, one-star]. All zeros when the page cannot be fetched or
        does not carry the rating markup.
    """
    counts = [0 for _ in range(6)]
    if not url:
        return counts

    try:
        plain_text = _fetch_html(url, headers)
    except _FETCH_ERRORS:
        print('error')
        return counts

    soup = BeautifulSoup(plain_text, 'html.parser')
    rating_sum = soup.find('div', {'class': 'rating_sum'})
    percent_tags = soup.findAll('span', {'class': 'rating_per'})
    if rating_sum is None or len(percent_tags) < 5:
        return counts

    sum_spans = rating_sum.findAll('span')
    if len(sum_spans) < 2:
        return counts

    try:
        total = float(sum_spans[1].get_text().strip())
        # Percentages are listed from five stars down to one star.
        percents = [float(tag.get_text().strip().strip('%'))
                    for tag in percent_tags[:5]]
    except ValueError:
        return counts

    counts[0] = int(total)
    for position, percent in enumerate(percents, start=1):
        # percentage -> fraction -> absolute number of raters
        counts[position] = int(total * percent / 100)
    return counts


if __name__ == '__main__':
    judege_point = book_spider()
    print(judege_point)