一、选题背景
如今人们出游时,大多通过网上预订平台来选择游玩地点和项目,并查看价格。这类平台有很多,例如途牛、携程、去哪儿、飞猪等。本文选择飞猪平台,爬取丽江游玩项目及其价格、销量和评论数,然后进行可视化分析。
二、网络爬虫设计方案
爬虫名称:丽江旅游项目数据爬取
内容:通过爬虫程序爬取游玩项目,然后进行数据可视化分析。
方案描述:
1、使用requests库发送请求,获取搜索结果页面
2、解析网页,提取数据。这里采用lxml的etree.xpath
3、数据保存:将结果以追加方式写入CSV文件
三、结构特征分析
1、结构特征:内容导航型
2、HTML页面解析:
游玩主题:
价格:
出售情况:
评论条数:
3、节点查找方法:
# XPath prefix of the coun-th result card on the search page.
card = "//*[@id='content']/div[6]/div[1]/div[1]/div/div[{}]".format(coun)

# Text nodes of the four fields scraped from that card.
title = html.xpath(card + "/div[2]/div[1]/a/h3/div/text()")  # item title
price = html.xpath(card + "/div[3]/div/div/span/text()")  # price
sell = html.xpath(card + "/div[2]/p[2]/span[1]/text()")  # sales text
coumm = html.xpath(card + "/div[2]/p[2]/span[2]/text()")  # review-count text
4、遍历方法:
# Walk the 48 result cards of one page. coun is the position used in the
# XPath, so it is driven directly by the loop instead of a separate counter
# (the inner loops no longer reuse the outer loop variable either).
for coun in range(1, 49):
    card = "//*[@id='content']/div[6]/div[1]/div[1]/div/div[{}]".format(coun)

    # Title and price: keep the last matching text node; fall back to an
    # empty string so the value is always a str, never a leftover list.
    titles = html.xpath(card + "/div[2]/div[1]/a/h3/div/text()")
    title = titles[-1] if titles else ''
    prices = html.xpath(card + "/div[3]/div/div/span/text()")
    price = prices[-1] if prices else ''

    # Sales text: presumably shaped like "月售123笔" (TODO confirm); strip the
    # wrapper characters and default to '0' when the node is missing.
    sells = html.xpath(card + "/div[2]/p[2]/span[1]/text()")
    sell = sells[-1].strip('月售').strip('笔') if sells else '0'

    # Review count: keep the last purely numeric text node. Labels such as
    # '评价' / '条' are skipped without calling int(), so non-numeric text
    # cannot raise, and a count of 1 is kept. Default to '0'.
    nodes = html.xpath(card + "/div[2]/p[2]/span[2]/text()")
    counts = [node.strip() for node in nodes if node.strip().isdigit()]
    coumm = counts[-1] if counts else '0'
四、网络爬虫设计
1、数据爬取与采集
代码分析:
"""Crawl Lijiang travel products from Fliggy search results into CSV files.

Two listings are collected: results ordered by sales volume
(Feizhu_sales_volume.csv) and results in the "popular" order
(Feizhu_synthesize.csv). Each CSV has the columns title, price, sell, coumm.
"""
import csv
import os
import random
import re
import sys
import time

import requests
from lxml import etree

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
]

# Default request headers. The User-Agent here is only a starting value;
# _fetch_html draws a fresh one from USER_AGENTS for every request.
headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
}

# Search URL for the keyword "丽江" (URL-encoded); {order} selects the ranking.
SEARCH_URL = (
    'https://travelsearch.fliggy.com/index.htm'
    '?spm=181.15077045.1398723350.1.48f3620d7UbQ9z'
    '&searchType=product&keyword=%E4%B8%BD%E6%B1%9F&category=MULTI_SEARCH'
    '&pagenum={pagenum}&-1={order}&conditions=-1%3A{order}'
)

# XPath of the n-th result card, and the field paths relative to it.
CARD_XPATH = "//*[@id='content']/div[6]/div[1]/div[1]/div/div[{}]"
TITLE_PATH = "/div[2]/div[1]/a/h3/div/text()"
PRICE_PATH = "/div[3]/div/div/span/text()"
SELL_PATH = "/div[2]/p[2]/span[1]/text()"
COUMM_PATH = "/div[2]/p[2]/span[2]/text()"

CARDS_PER_PAGE = 48
CSV_HEADER = ['title', 'price', 'sell', 'coumm']
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled request hangs forever
_NUMBER_RE = re.compile(r'\d+')


def _ensure_header(csv_path):
    """Write the CSV header once, only when the file is missing or empty."""
    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        return
    with open(csv_path, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(CSV_HEADER)


def _fetch_html(url):
    """Download *url* and return the parsed lxml tree, or None for an empty body.

    Raises requests.RequestException on network/HTTP errors.
    """
    request_headers = dict(headers)
    request_headers['User-Agent'] = random.choice(USER_AGENTS)
    res = requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    res.encoding = 'utf-8'
    if not res.text.strip():
        return None
    return etree.HTML(res.text)


def _last_text(nodes, default=''):
    """Return the last text node stripped of whitespace, or *default* if none."""
    return nodes[-1].strip() if nodes else default


def _parse_items(html):
    """Yield (title, price, sell, coumm) string tuples for the cards of one page."""
    for position in range(1, CARDS_PER_PAGE + 1):
        card = CARD_XPATH.format(position)
        title = _last_text(html.xpath(card + TITLE_PATH))
        if not title:
            # No card at this position (short page or changed layout).
            continue
        price = _last_text(html.xpath(card + PRICE_PATH))
        # Sales text is presumably shaped like "月售123笔" (TODO confirm);
        # strip the wrapper characters and default to '0'.
        sell = _last_text(html.xpath(card + SELL_PATH), '0')
        sell = sell.strip('月售').strip('笔') or '0'
        # Review count: first run of digits in the joined text nodes, else '0'.
        found = _NUMBER_RE.search(''.join(html.xpath(card + COUMM_PATH)))
        coumm = found.group() if found else '0'
        yield title, price, sell, coumm


def _crawl(order, csv_path, page):
    """Fetch *page* result pages in the given *order* and append them to *csv_path*."""
    _ensure_header(csv_path)
    # NOTE(review): assumes pagenum is 1-based — confirm against the site.
    for pagenum in range(1, page + 1):
        url = SEARCH_URL.format(pagenum=pagenum, order=order)
        try:
            html = _fetch_html(url)
        except (requests.RequestException, etree.LxmlError) as err:
            # Best effort: report the failed page and carry on with the next.
            print('page {} failed: {}'.format(pagenum, err), file=sys.stderr)
        else:
            rows = list(_parse_items(html)) if html is not None else []
            # csv.writer quotes titles that contain commas and never emits
            # a trailing delimiter, so every row has exactly four fields.
            with open(csv_path, 'a', encoding='utf-8', newline='') as f:
                csv.writer(f).writerows(rows)
            for title, price, sell, coumm in rows:
                print('主题:', title, '\n',
                      '价格:', price, '元\n',
                      '已售出:', sell, '笔\n',
                      '评论:', coumm, '条\n')
        time.sleep(1)  # pause between requests


# Sorted by sales volume
def sales_volume(page):
    """Crawl *page* pages ordered by sales, then run the "popular" crawl too.

    The script entry point only calls this function, so it also triggers
    synthesize(page) once so that both CSV files are produced.
    """
    _crawl('sales_des', 'Feizhu_sales_volume.csv', page)
    synthesize(page)


# Comprehensive ("popular") ordering
def synthesize(page):
    """Crawl *page* pages in the "popular" order into Feizhu_synthesize.csv."""
    _crawl('popular', 'Feizhu_synthesize.csv', page)


if __name__ == '__main__':
    page = 2
    sales_volume(page)
    # synthesize(page)
2、数据的清洗与处理
import pandas as pd
import numpy as np

# xs: table crawled in sales-volume order; zh: table crawled in the
# comprehensive ("popular") order.
# on_bad_lines='skip' (pandas >= 1.3) replaces error_bad_lines=False, which
# was removed in pandas 2.0.
# index_col=False keeps 'title' as a regular column even when data rows end
# with a trailing comma; otherwise pandas would use the first field as index.
xs = pd.read_csv(r'D:\wtx\Feizhu_sales_volume.csv', on_bad_lines='skip', index_col=False)
zh = pd.read_csv(r'D:\wtx\Feizhu_synthesize.csv', on_bad_lines='skip', index_col=False)
xs
# Make the numeric columns really numeric. The crawler appends a header line
# each time it runs, so stray "title,price,sell,coumm" rows can sit among the
# data and turn these columns into strings (which would then sort
# lexicographically). Unparsable values become NaN and are dropped below.
# NOTE(review): assumes price/sell/coumm hold plain numbers — confirm.
for frame in (xs, zh):
    for column in ('price', 'sell', 'coumm'):
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

# Drop duplicate titles
xs = xs.drop_duplicates('title')
zh = zh.drop_duplicates('title')

# Drop rows with missing values
xs = xs.dropna(axis=0)
zh = zh.dropna(axis=0)
# Order both tables by the "sell" column, largest first.
xs = xs.sort_values(by="sell", ascending=False)
zh = zh.sort_values(by="sell", ascending=False)
按销售量排序可视化分析:
# Sales ranking chart: "sell" as a line over "price" bars, first 20 rows.
import matplotlib.pyplot as plt

top20 = xs.head(20)
x, y, z = top20['title'], top20['price'], top20['sell']

# SimHei so the Chinese labels render; unicode_minus off so minus signs
# are drawn with a plain hyphen.
plt.rcParams.update({'font.sans-serif': ['SimHei'], 'axes.unicode_minus': False})

plt.plot(x, z, '-', color='r', label="sell")
plt.bar(x, y, width=0.4, color='b', alpha=0.2, lw=3, label="price")
plt.title("飞猪丽江旅游销售量趋势图")
plt.xlabel("主题")  # x-axis label
plt.ylabel("价格")  # y-axis label
plt.xticks(rotation=90)
plt.legend(loc="best")  # legend
plt.show()
# Re-rank the sales table by review count, then draw review count (line)
# over price (bars) for the first 20 rows.
xs = xs.sort_values(by="coumm", ascending=False)

top20 = xs.head(20)
x, y, z = top20['title'], top20['price'], top20['coumm']

# SimHei so the Chinese labels render; unicode_minus off so minus signs
# are drawn with a plain hyphen.
plt.rcParams.update({'font.sans-serif': ['SimHei'], 'axes.unicode_minus': False})

plt.plot(x, z, '-', color='r', label="评论数")
plt.bar(x, y, width=0.4, color='b', alpha=0.2, lw=3, label="价格")
plt.title("飞猪丽江旅游销售量趋势图")
plt.xlabel("主题")  # x-axis label
plt.ylabel("价格")  # y-axis label
plt.xticks(rotation=90)
plt.legend(loc="best")  # legend
plt.show()
# Horizontal bar chart of price per title (x and y are defined earlier).
plt.barh(x, y, height=0.4, color='g', alpha=0.2, lw=3, label="价格")
plt.title("飞猪丽江旅游销售量水平图")
plt.xlabel("价格")  # x-axis label
plt.ylabel("主题")  # y-axis label
plt.legend(loc="best")  # legend
plt.show()
# Horizontal bar chart of review count per title (x and z are defined earlier).
plt.barh(x, z, height=0.4, color='r', alpha=0.2, lw=3, label="评论数")
plt.title("飞猪丽江旅游销售量水平图")
plt.xlabel("评论数")  # x-axis label
plt.ylabel("主题")  # y-axis label
plt.legend(loc="best")  # legend
plt.show()
# Scatter plot of review count per title. z holds the "coumm" column at this
# point (it is plotted as "评论数" in the preceding bar chart), so the y-axis
# is labelled as review count rather than sales volume.
plt.scatter(x, z, color='gray', marker='o', s=40, alpha=0.5)
plt.xticks(rotation=90)
plt.title("飞猪丽江旅游销售量散点图")
plt.xlabel("主题")  # x-axis label
plt.ylabel("评论数")  # y-axis label
plt.show()
# Box plot of z (defined earlier). The title previously contained a
# duplicated character ("销售量量盒图").
plt.boxplot(z)
plt.title("飞猪丽江旅游销售量盒图")
plt.show()
综合排序可视化分析:
# Rank the comprehensive table by price, highest first.
zh = zh.sort_values(by="price", ascending=False)
# Comprehensive ranking chart: review count (line) over price (bars),
# first 20 rows of zh.
import matplotlib.pyplot as plt

top20 = zh.head(20)
x, y = top20['title'], top20['price']
z, d = top20['sell'], top20['coumm']

# SimHei so the Chinese labels render; unicode_minus off so minus signs
# are drawn with a plain hyphen.
plt.rcParams.update({'font.sans-serif': ['SimHei'], 'axes.unicode_minus': False})

plt.plot(x, d, '-', color='r', label="coumm")
plt.bar(x, y, width=0.4, color='b', alpha=0.2, lw=3, label="price")
plt.title("飞猪丽江旅游销综合排序趋势图")
plt.xlabel("主题")  # x-axis label
plt.ylabel("价格")  # y-axis label
plt.xticks(rotation=90)
plt.legend(loc="best")  # legend
plt.show()
# Horizontal bar chart of review count per title. d holds the "coumm" column
# and the bars are labelled "评论数", so the x-axis uses the same name
# instead of "出售数".
plt.barh(x, d, alpha=0.2, height=0.4, color='pink', label="评论数", lw=3)
plt.title("飞猪丽江旅游综合排序水平图")
plt.legend(loc="best")  # legend
plt.xlabel("评论数")  # x-axis label
plt.ylabel("主题")  # y-axis label
plt.show()
# Scatter plot of review count per title. d holds the "coumm" column, so the
# y-axis is labelled as review count rather than sales volume.
plt.scatter(x, d, color='b', marker='o', s=40, alpha=0.5)
plt.xticks(rotation=90)
plt.title("飞猪丽江旅游综合排序散点图")
plt.xlabel("主题")  # x-axis label
plt.ylabel("评论数")  # y-axis label
plt.show()
# Box plot of d (defined earlier) for the comprehensive table.
plt.boxplot(x=d)
plt.title("飞猪丽江旅游综合排序盒图")
plt.show()
词云:
import pandas as pd
import numpy as np
import wordcloud as wc
from PIL import Image
import matplotlib.pyplot as plt
import random

# The image 111.jpg is loaded as an array and used as the cloud's mask.
bk = np.array(Image.open("111.jpg"))
mask = bk

# Word-cloud settings, collected in one place.
cloud_options = {
    'width': 1000,                # canvas width
    'height': 1000,               # canvas height
    'mask': mask,
    'background_color': 'black',  # background colour
    'font_path': 'msyhbd.ttc',    # font file; must be a font that contains Chinese glyphs
    'max_font_size': 400,         # largest font size
    'random_state': 50,           # NOTE(review): presumably an RNG seed — confirm in wordcloud docs
}
word_cloud = wc.WordCloud(**cloud_options)

# Build the input text: 100 random draws from a fixed list of place names.
words = ['云南', '大理', '丽江', '洱海', '昆明', '石林', '古镇', '花之城']
text = " ".join(random.choices(words, k=100))

word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()
总代码:
"""Clean the crawled Fliggy CSV files and plot them, then draw a word cloud."""
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wordcloud as wc
from PIL import Image

# xs: table crawled in sales-volume order; zh: table crawled in the
# comprehensive ("popular") order.
# on_bad_lines='skip' (pandas >= 1.3) replaces error_bad_lines=False, which
# was removed in pandas 2.0. index_col=False keeps 'title' as a regular
# column even when data rows end with a trailing comma.
xs = pd.read_csv(r'D:\wtx\Feizhu_sales_volume.csv', on_bad_lines='skip', index_col=False)
zh = pd.read_csv(r'D:\wtx\Feizhu_synthesize.csv', on_bad_lines='skip', index_col=False)

xs

# Make the numeric columns really numeric. Repeated header rows in the CSV
# would otherwise turn them into strings and make the sorts lexicographic.
# Unparsable values become NaN and are dropped below.
# NOTE(review): assumes price/sell/coumm hold plain numbers — confirm.
for frame in (xs, zh):
    for column in ('price', 'sell', 'coumm'):
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

# Drop duplicate titles
xs = xs.drop_duplicates('title')
zh = zh.drop_duplicates('title')

# Drop rows with missing values
xs = xs.dropna(axis=0)
zh = zh.dropna(axis=0)

# Sort by sales count, descending
xs = xs.sort_values(by="sell", ascending=False)
zh = zh.sort_values(by="sell", ascending=False)

# SimHei so the Chinese labels render; unicode_minus off so minus signs are
# drawn with a plain hyphen. Set once for every chart below.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# ---- Sales-ranking table -------------------------------------------------

# Sales (line) over price (bars) for the 20 best sellers
x = xs['title'].head(20)
y = xs['price'].head(20)
z = xs['sell'].head(20)
plt.plot(x, z, '-', color='r', label="sell")
plt.bar(x, y, alpha=0.2, width=0.4, color='b', lw=3, label="price")
plt.xticks(rotation=90)
plt.legend(loc="best")  # legend
plt.title("飞猪丽江旅游销售量趋势图")
plt.xlabel("主题")  # x-axis label
plt.ylabel("价格")  # y-axis label
plt.show()

# Re-rank by review count; from here on z holds review counts
xs = xs.sort_values(by="coumm", ascending=False)

x = xs['title'].head(20)
y = xs['price'].head(20)
z = xs['coumm'].head(20)
plt.plot(x, z, '-', color='r', label="评论数")
plt.bar(x, y, alpha=0.2, width=0.4, color='b', lw=3, label="价格")
plt.xticks(rotation=90)
plt.legend(loc="best")  # legend
plt.title("飞猪丽江旅游销售量趋势图")
plt.xlabel("主题")  # x-axis label
plt.ylabel("价格")  # y-axis label
plt.show()

# Horizontal bars: price (x-axis label now matches the plotted column)
plt.barh(x, y, alpha=0.2, height=0.4, color='r', label="价格", lw=3)
plt.title("飞猪丽江旅游销售量水平图")
plt.legend(loc="best")  # legend
plt.xlabel("价格")  # x-axis label
plt.ylabel("主题")  # y-axis label
plt.show()

# Horizontal bars: review count
plt.barh(x, z, alpha=0.2, height=0.4, color='r', label="评论数", lw=3)
plt.title("飞猪丽江旅游销售量水平图")
plt.legend(loc="best")  # legend
plt.xlabel("评论数")  # x-axis label
plt.ylabel("主题")  # y-axis label
plt.show()

# Scatter plot: review count (z), labelled accordingly
plt.scatter(x, z, color='gray', marker='o', s=40, alpha=0.5)
plt.xticks(rotation=90)
plt.title("飞猪丽江旅游销售量散点图")
plt.xlabel("主题")  # x-axis label
plt.ylabel("评论数")  # y-axis label
plt.show()

# Box plot of z
plt.boxplot(z)
plt.title("飞猪丽江旅游销售量盒图")
plt.show()

# ---- Comprehensive-ranking table ----------------------------------------

# Rank by review count, descending. (A sort by price directly before this
# one had no lasting effect, so only the effective sort is kept.)
zh = zh.sort_values(by="coumm", ascending=False)
zh.head(20)

# Review count (line) over price (bars) for the first 20 rows
x = zh['title'].head(20)
y = zh['price'].head(20)
z = zh['sell'].head(20)
d = zh['coumm'].head(20)
plt.plot(x, d, '-', color='r', label="coumm")
plt.bar(x, y, alpha=0.2, width=0.4, color='b', lw=3, label="price")
plt.xticks(rotation=90)
plt.legend(loc="best")  # legend
plt.title("飞猪丽江旅游销综合排序趋势图")
plt.xlabel("主题")  # x-axis label
plt.ylabel("价格")  # y-axis label
plt.show()

# Horizontal bars: review count (d), labelled accordingly
plt.barh(x, d, alpha=0.2, height=0.4, color='pink', label="评论数", lw=3)
plt.title("飞猪丽江旅游综合排序水平图")
plt.legend(loc="best")  # legend
plt.xlabel("评论数")  # x-axis label
plt.ylabel("主题")  # y-axis label
plt.show()

# Scatter plot: review count (d), labelled accordingly
plt.scatter(x, d, color='b', marker='o', s=40, alpha=0.5)
plt.xticks(rotation=90)
plt.title("飞猪丽江旅游综合排序散点图")
plt.xlabel("主题")  # x-axis label
plt.ylabel("评论数")  # y-axis label
plt.show()

# Box plot of d
plt.boxplot(d)
plt.title("飞猪丽江旅游综合排序盒图")
plt.show()

# ---- Word cloud -----------------------------------------------------------

# The image 111.jpg is loaded as an array and used as the cloud's mask.
bk = np.array(Image.open("111.jpg"))
mask = bk
word_cloud = wc.WordCloud(
    width=1000,                # canvas width
    height=1000,               # canvas height
    mask=mask,
    background_color='black',  # background colour
    font_path='msyhbd.ttc',    # font file; must be a font that contains Chinese glyphs
    max_font_size=400,         # largest font size
    random_state=50,           # NOTE(review): presumably an RNG seed — confirm in wordcloud docs
)
# Input text: 100 random draws from a fixed list of place names.
text = random.choices(['云南', '大理', '丽江', '洱海', '昆明', '石林', '古镇', '花之城'], k=100)
text = " ".join(text)
word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()
五、总结
1.经过对主题数据的分析与可视化,可以得到哪些结论?是否达到预期的目标?
从可视化分析可以得出
销售排序:跟团游、双飞游、玉龙雪山旅游项目比较热门。
综合排序:"云向旅游丽江到泸沽湖二日游纯玩商务车小泊泸沽湖旅游旅拍团游"和"云向旅游昆明大理丽江玉龙雪山6天5晚洱海旅拍纯玩跟团游双飞游"这两个项目比较受人们欢迎。
分析结果达到预期。
2.在完成此设计过程中,得到哪些收获?以及要改进的建议?
在此次设计过程中,我在数据处理中的数据筛选方面有了很大的收获,说白了就是学会了怎么进行类型转换,从而达到自己想要的效果,受益匪浅!需要改进的地方是编写程序时反应较慢,编程经验比较欠缺。