Python基础——爬虫以及简单的数据分析

时间:2025-03-01 08:05:32

目标:使用Python编写爬虫,获取链家青岛站的房产信息,然后对爬取的房产信息进行分析。

环境:win10+python3.8+pycharm

Python库:

 import requests
import bs4
from bs4 import BeautifulSoup
import lxml
import re
import xlrd
import xlwt
import xlutils.copy
import time

目标分析:

1、编写爬虫爬取链家青岛站的房产信息

  ①分析目标链接

    第一页:https://qd.fang.lianjia.com/loupan/pg1

    第二页:https://qd.fang.lianjia.com/loupan/pg2

  由上面的链接可以看出来,不同网页是使用最后的pgx来进行变化的

  所以将链接分为两部分,使用字符串拼接获得所有的房产网页链接

   WebDiZhi = []
for i in range(1,85):
UrlHTML = Url + str(i)
WebDiZhi.append(UrlHTML)

使用遍历获得所有的链接并保存为列表

  ②分析网页结构

  

 #获取目标网页的html代码并进行解析
Xu = 0
Shuliang = len(WebDiZhi)
while Xu in range(Shuliang):#循环整个列表 Web = requests.get(WebDiZhi[Xu])
WebText = Web.text #第一步、粗筛选目标信息所在的html代码,去除大部分无效信息代码
soup_One = BeautifulSoup(WebText,'html.parser')
XinXi_One = soup_One.find_all(class_="resblock-list-wrapper") #第二步、进一步筛选目标信息所在html代码,去除无效信息代码
soup_Two = BeautifulSoup(str(XinXi_One),'lxml')
XinXi_Two = soup_Two.find_all(class_="resblock-desc-wrapper")

通过两步简单的筛选将房产信息所对应的html代码筛选出来

方便进一步分析html网页标签获取不同的房产信息

  ③针对不同的房产信息定义不同的函数,通过调用函数来获取不同的房产信息并保存到目标文件中

  

  print("-----------------开始写入第{}页-------------".format(Xu))
Name = GetName(XinXi_Two) # 获取小区名称
Write_File(Name, 0,Xu)
print("---------小区名称写入成功---------")
time.sleep(3)  #延时
Nature = NatureHouse(XinXi_Two) # 获取小区住宅性质(住宅、商业性)
Write_File(Nature, 1,Xu)
print("---------小区性质写入成功---------")
time.sleep(3)
Status = StatusHouse(XinXi_Two) # 获取小区状态(在售)
Write_File(Status, 2,Xu)
print("---------小区状态写入成功---------")
time.sleep(3)
Address = AddressHouse(XinXi_Two) # 获取小区地址
Write_File(Address, 3,Xu)
print("---------小区地址写入成功---------")
time.sleep(3)
Area = AreaHouse(XinXi_Two) # 获取小区房屋面积
Write_File(Area, 4,Xu)
print("---------小区面积写入成功---------")
time.sleep(3)
Average = AveragePriceHouse(XinXi_Two) # 均价
Write_File(Average, 5,Xu)
print("---------小区均价写入成功---------")
time.sleep(3)
Total = TotalPriceHouse(XinXi_Two) # 总价
Write_File(Total, 6,Xu)
print("---------小区总价写入成功---------")
time.sleep(3)

  各房产信息函数

 def Write_File(Data, lei,Hang):
data = xlrd.open_workbook(r"F:\实例\Python实例\爬虫\111.xls")
ws = xlutils.copy.copy(data)
table = ws.get_sheet(0)
Shu = Hang * 10
for i in range(len(Data)):
table.write(i + 1 + Shu, lei, Data[i])
print("----第{}项写入成功----".format(i))
ws.save(r"F:\实例\Python实例\爬虫\111.xls") def GetName(XinXi):
"""
@param XinXi: 传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息
@return: 返回小区名称,列表类型
"""
Nmae_list = []
# 获取小区名称
Obtain_Name_One = BeautifulSoup(str(XinXi), 'lxml')
Name_One = Obtain_Name_One.findAll(class_="name")
for i in Name_One:
Get_A = BeautifulSoup(str(i), 'lxml')
Nmae_list.append(Get_A.string)
return Nmae_list """
代码以及目标信息均已获取,通过不同函数将html代码在对应函数中逐一进行解析获取函数对应信息并保存即可
以下为部分函数,其他函数未定义 """
def NatureHouse(Nature):
"""房屋性质"""
Nature_list = []
Obtain_Nature = BeautifulSoup(str(Nature), 'lxml')
Nature_one = Obtain_Nature.find_all(class_='resblock-type')
for i in Nature_one:
Get_Span = BeautifulSoup(str(i), 'lxml')
Nature_list.append(Get_Span.string)
return Nature_list def StatusHouse(Status):
"""房屋状态"""
Status_list = []
Obtain_Nature = BeautifulSoup(str(Status), 'lxml')
Status_one = Obtain_Nature.find_all(class_='sale-status')
for i in Status_one:
Get_Span = BeautifulSoup(str(i), 'lxml')
Status_list.append(Get_Span.string)
return Status_list def AddressHouse(Area):
""" @param Area:传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息
@return:
Analysis_Label_xxx:分析标签,xxx:代表第几次分析
Target_Information_xxx:目标信息,xxx:代表第几个信息部分,总共分为两部分,以及一个整体信息存储列表Target_Information_list
"""
#获取标签
Target_Information_list = []
Analysis_Label_One = BeautifulSoup(str(Area), 'lxml')
# 获取div标签,calss=resblock-location
Get_label_One = Analysis_Label_One.find_all(class_='resblock-location')
#解析标签并获得span标签
Analysis_Label_Two = BeautifulSoup(str(Get_label_One), 'lxml')
Get_label_Two = Analysis_Label_Two.find_all(name='span') #获取span标签里面的文字内容并保存在列表内 #第一个
Target_Information_One = []
for i in Get_label_Two:
#使用正则表达式取出内部信息并保存在列表中
Information_Str = re.sub(r'<.*?>','',str(i))
Target_Information_One.append(Information_Str)
#将列表内相同小区的地址进行合并,使用循环嵌套获取内容、合并最后保存在列表内
i = 1
a = 0 #第二个,第二个信息是在第一个信息的基础上合并列表内的元素得来
Target_Information_Two = []
while i <= len(Target_Information_One):
while a < i:
#将Target_Information_One中每两项进行合并
Information_Two = Target_Information_One[a]
Information_One = Target_Information_One[i]
Information_Three = Information_One + Information_Two Target_Information_Two.append(Information_Three)
a += 2
i += 2 #获取详细地址 #第三个
Target_Information_Three = []
Span_html_One = Analysis_Label_Two.find_all(name='a')
for c in Span_html_One:
Area_Str_1 = re.sub(r'<.*?>', '', str(c))
Target_Information_Three.append(Area_Str_1) # 将Target_Information_Two和Target_Information_Three两个列表中的各项元素分别进行合并并保存在Area_list列表中
A = min(len(Target_Information_Two),len(Target_Information_Three))
for i in range(A):
Target_Information_list.append(Target_Information_Two[i] + Target_Information_Three[i]) return Target_Information_list def AreaHouse(Area):
""" @param Area: 传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息
@return: 返回房屋房间数量以及房屋总面积
"""
Area_list = []
#筛选目标信息的父标签
Obtain_Area_One = BeautifulSoup(str(Area), 'lxml')
Area_one = Obtain_Area_One.find_all(class_='resblock-room') #通过正则表达式去除多余的html标签信息
Get_Area_One = []
for c in Area_one:
Area_Str_1 = re.sub(r'<.*?>', '', str(c))
Get_Area_One.append(Area_Str_1) #通过正则表达式去除多余的换行符
Get_Area_Two = []
for i in Get_Area_One:
Area_Str_2 = re.sub(r'\s+','',str(i))
Get_Area_Two.append(Area_Str_2) #开始获取房屋总面积
Obtain_Area_Two = BeautifulSoup(str(Area),'lxml')
Area_two = Obtain_Area_Two.find_all(class_='resblock-area')
#通过正则表达式去除多余的html标签信息
Get_Area_Three = []
for a in Area_two:
Area_Str_3 = re.sub(r'<.*?>', '', str(a))
Get_Area_Three.append(Area_Str_3) # 通过正则表达式去除多余的换行符
Get_Area_Four = []
for r in Get_Area_Three:
Area_Str_4 = re.sub(r'\s+', '', str(r))
Get_Area_Four.append(Area_Str_4) # 将Get_Area_Two和Get_Area_Four两个列表中的各项元素分别进行合并并保存在Area_list列表中
A = min(len(Get_Area_Two), len(Get_Area_Four))
for i in range(A):
Area_list.append(Get_Area_Two[i] + Get_Area_Four[i]) return Area_list def AveragePriceHouse(Average):
"""
房屋均价
@param Average:
@return:
"""
Average_list = []
Obtain_Average = BeautifulSoup(str(Average), 'lxml')
Average_one = Obtain_Average.find_all(class_='number')
for i in Average_one:
Get_Span = BeautifulSoup(str(i), 'lxml')
Average_list.append(Get_Span.string) return Average_list def TotalPriceHouse(Total):
"""
房屋总价 @param Total:
@return:
"""
Total_list = []
Obtain_Total = BeautifulSoup(str(Total), 'lxml')
Total_one = Obtain_Total.fjind_all(class_='second')
for i in Total_one:
Get_Span = BeautifulSoup(str(i), 'lxml')
Get_Span_one = Get_Span.string
Get_Span_two = Get_Span_one.lstrip('总价')
Total_list.append(Get_Span_two) return Total_list

  创建存储文件

 def Create_File():
name = ['名称','性质','状态','地址','面积','均价','总价',]
workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = workbook.add_sheet('shett1', cell_overwrite_ok=True)
for i in range(len(name)):
sheet.write(0, i, name[i])
workbook.save(r'F:\实例\Python实例\爬虫\111.xls')
print("文件创建成功")

2、简单的数据分析并使用饼状图表示房产均价比例

 所用到的库:

import pandas as pd
import xlrd
import re
import xlutils.copy
import matplotlib.pyplot as plt

  ①数据清洗----删除空值行

 def ExceptNull():
"""
数据清洗第一步:去除表中空值
@param df: 传入读取的xls表格数据
@return: 保存数据后返回,
"""
df = pd.DataFrame(pd.read_excel(r'F:\实例\Python实例\爬虫\111.xls'))
#查找面积列空值,使用99999填充空缺值后删除所在行
print(df['面积'].isnull().value_counts())
df["面积"] = df["面积"].fillna('')
NullKey = df[(df.面积 == '')].index.tolist()
print(NullKey)
df = df.drop(NullKey)
print("*"*30)
print(df['面积'].isnull().value_counts()) print("*"*30)
#查找总价列空值,使用99999填充空缺值后删除所在行
print(df['总价'].isnull().value_counts())
df["总价"] = df["总价"].fillna('')
NullKey1 = df[(df.总价 == '')].index.tolist()
print(NullKey1)
df = df.drop(NullKey1)
print("*"*30)
print(df['总价'].isnull().value_counts())
df.to_excel('111.xls',index=False,encoding='uf-8') print("修改后数据保存成功")

  ②数据预处理----将数据转换成易处理格式

 def LeiChuli():
Data = xlrd.open_workbook(r"F:\实例\Python实例\爬虫\111.xls")
ws = xlutils.copy.copy(Data)
Table = Data.sheet_by_name("Sheet1")
Nrows = Table.nrows
list_A = []
for i in range(1,Nrows):
A = Table.cell_value(i,6)
A_Str = re.sub('/套','',A,Nrows)
list_A.append(A_Str)
Replace = []
for i in range(len(list_A)):
Price_Str = list_A[i]
Last_Str = Price_Str[-1]
if Last_Str == '万':
A_Str = re.sub('万', '', Price_Str, 1)
Replace.append(A_Str)
else:
Replace.append(Price_Str)
table = ws.get_sheet(0)
for i in range(len(Replace)):
table.write(i + 1, 6, Replace[i])
print("------>开始写入修改后数据<------")
print("---->第{}项写入成功<----".format(i))
ws.save(r"F:\实例\Python实例\爬虫\111.xls")
print("------>数据写入完成<------")

  ③对处理后的数据进行分析并绘制饼状图

 def Data_Analysis_One():
Data = xlrd.open_workbook(r"F:\实例\Python实例\爬虫\111.xls")
ws = xlutils.copy.copy(Data)
Table = Data.sheet_by_name("Sheet1")
Nrows = Table.nrows
a,b,c,d,e,f = 0,0,0,0,0,0 for i in range(1, Nrows):
A = Table.cell_value(i, 5)
if A == "价格待定":
f += 1
else:
if int(A) <= 5000:
a += 1
elif int(A) <= 10000:
b += 1
elif int(A) <= 15000:
c += 1
elif int(A) <= 20000:
d += 1
else:
e += 1 # 开始准备绘制饼状图 #价格区间数据准备
sizes = []
Percentage_a = (a / Nrows) * 100
sizes.append(int(Percentage_a))
Percentage_b = (b / Nrows) * 100
sizes.append(int(Percentage_b))
Percentage_c = (c / Nrows) * 100
sizes.append(int(Percentage_c))
Percentage_d = (d / Nrows) * 100
sizes.append(int(Percentage_d))
Percentage_e = (e / Nrows) * 100
sizes.append(int(Percentage_e))
Percentage_f = (f / Nrows) * 100
sizes.append(int(Percentage_f))
#设置占比说明
labels = '0-5000','5001-10000','10001-15000','15001-20000','20000-','Undetermined'
explode = (0,0,0.1,0,0,0)
#开始绘制
plt.pie(sizes,explode=explode,labels=labels,autopct='%1.1f%%',shadow=True,startangle=90)
plt.axis('equal')
plt.show()
ws.save(r"F:\实例\Python实例\爬虫\111.xls")

最后附上效果图。

最后附上完整代码:

1、爬虫代码

 import requests
import bs4
from bs4 import BeautifulSoup
import lxml
import re
# import LianJia_QD_DataProcessing
import xlrd
import xlwt
import xlutils.copy
import time def GetHTML(Url):
"""
1、通过传入url组合,获取所有网页地址的url
2、获取目标网页的html代码并进行解析
3、解析后将目标信息分别写入字典类型的变量并返回 @param Url: 目标网址的不变链接
@return: 网站目标信息 """ #通过传入url组合,获取所有网页地址的url
WebDiZhi = []
for i in range(1,85):
UrlHTML = Url + str(i)
WebDiZhi.append(UrlHTML) print("共计{}页".format(len(WebDiZhi)))
# Create_File()
#获取目标网页的html代码并进行解析
Xu = 0
Shuliang = len(WebDiZhi)
while Xu in range(Shuliang):#range(len(WebDiZhi))--循环整个列表 Web = requests.get(WebDiZhi[Xu])
WebText = Web.text #第一步、粗筛选目标信息所在的html代码,去除大部分无效信息代码
soup_One = BeautifulSoup(WebText,'html.parser')
XinXi_One = soup_One.find_all(class_="resblock-list-wrapper") #第二步、进一步筛选目标信息所在html代码,去除无效信息代码
soup_Two = BeautifulSoup(str(XinXi_One),'lxml')
XinXi_Two = soup_Two.find_all(class_="resblock-desc-wrapper") print("-----------------第{}页爬取成功------------".format(Xu))
# Html.append(XinXi_Two)
# time.sleep(1)
# return Html print("-----------------开始写入第{}页-------------".format(Xu))
Name = GetName(XinXi_Two) # 获取小区名称
Write_File(Name, 0,Xu)
print("---------小区名称写入成功---------")
time.sleep(3)
Nature = NatureHouse(XinXi_Two) # 获取小区住宅性质(住宅、商业性)
Write_File(Nature, 1,Xu)
print("---------小区性质写入成功---------")
time.sleep(3)
Status = StatusHouse(XinXi_Two) # 获取小区状态(在售)
Write_File(Status, 2,Xu)
print("---------小区状态写入成功---------")
time.sleep(3)
Address = AddressHouse(XinXi_Two) # 获取小区地址
Write_File(Address, 3,Xu)
print("---------小区地址写入成功---------")
time.sleep(3)
Area = AreaHouse(XinXi_Two) # 获取小区房屋面积
Write_File(Area, 4,Xu)
print("---------小区面积写入成功---------")
time.sleep(3)
Average = AveragePriceHouse(XinXi_Two) # 均价
Write_File(Average, 5,Xu)
print("---------小区均价写入成功---------")
time.sleep(3)
Total = TotalPriceHouse(XinXi_Two) # 总价
Write_File(Total, 6,Xu)
print("---------小区总价写入成功---------")
time.sleep(3) Xu += 1 # 调用不同函数获取不同信息 def Write_File(Data, lei,Hang):
data = xlrd.open_workbook(r"F:\实例\Python实例\爬虫\111.xls")
ws = xlutils.copy.copy(data)
table = ws.get_sheet(0)
Shu = Hang * 10
for i in range(len(Data)):
table.write(i + 1 + Shu, lei, Data[i])
print("----第{}项写入成功----".format(i))
ws.save(r"F:\实例\Python实例\爬虫\111.xls") def GetName(XinXi):
"""
@param XinXi: 传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息
@return: 返回小区名称,列表类型
"""
Nmae_list = []
# 获取小区名称
Obtain_Name_One = BeautifulSoup(str(XinXi), 'lxml')
Name_One = Obtain_Name_One.findAll(class_="name")
for i in Name_One:
Get_A = BeautifulSoup(str(i), 'lxml')
Nmae_list.append(Get_A.string)
return Nmae_list """
代码以及目标信息均已获取,通过不同函数将html代码在对应函数中逐一进行解析获取函数对应信息并保存即可
以下为部分函数,其他函数未定义 """
def NatureHouse(Nature):
"""房屋性质"""
Nature_list = []
Obtain_Nature = BeautifulSoup(str(Nature), 'lxml')
Nature_one = Obtain_Nature.find_all(class_='resblock-type')
for i in Nature_one:
Get_Span = BeautifulSoup(str(i), 'lxml')
Nature_list.append(Get_Span.string)
return Nature_list def StatusHouse(Status):
"""房屋状态"""
Status_list = []
Obtain_Nature = BeautifulSoup(str(Status), 'lxml')
Status_one = Obtain_Nature.find_all(class_='sale-status')
for i in Status_one:
Get_Span = BeautifulSoup(str(i), 'lxml')
Status_list.append(Get_Span.string)
return Status_list def AddressHouse(Area):
""" @param Area:传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息
@return:
Analysis_Label_xxx:分析标签,xxx:代表第几次分析
Target_Information_xxx:目标信息,xxx:代表第几个信息部分,总共分为两部分,以及一个整体信息存储列表Target_Information_list
"""
#获取标签
Target_Information_list = []
Analysis_Label_One = BeautifulSoup(str(Area), 'lxml')
# 获取div标签,calss=resblock-location
Get_label_One = Analysis_Label_One.find_all(class_='resblock-location')
#解析标签并获得span标签
Analysis_Label_Two = BeautifulSoup(str(Get_label_One), 'lxml')
Get_label_Two = Analysis_Label_Two.find_all(name='span') #获取span标签里面的文字内容并保存在列表内 #第一个
Target_Information_One = []
for i in Get_label_Two:
#使用正则表达式取出内部信息并保存在列表中
Information_Str = re.sub(r'<.*?>','',str(i))
Target_Information_One.append(Information_Str)
#将列表内相同小区的地址进行合并,使用循环嵌套获取内容、合并最后保存在列表内
i = 1
a = 0 #第二个,第二个信息是在第一个信息的基础上合并列表内的元素得来
Target_Information_Two = []
while i <= len(Target_Information_One):
while a < i:
#将Target_Information_One中每两项进行合并
Information_Two = Target_Information_One[a]
Information_One = Target_Information_One[i]
Information_Three = Information_One + Information_Two Target_Information_Two.append(Information_Three)
a += 2
i += 2 #获取详细地址 #第三个
Target_Information_Three = []
Span_html_One = Analysis_Label_Two.find_all(name='a')
for c in Span_html_One:
Area_Str_1 = re.sub(r'<.*?>', '', str(c))
Target_Information_Three.append(Area_Str_1) # 将Target_Information_Two和Target_Information_Three两个列表中的各项元素分别进行合并并保存在Area_list列表中
A = min(len(Target_Information_Two),len(Target_Information_Three))
for i in range(A):
Target_Information_list.append(Target_Information_Two[i] + Target_Information_Three[i]) return Target_Information_list def AreaHouse(Area):
""" @param Area: 传入GetHTML函数第二步中筛选出的div标签下的html代码以及目标信息
@return: 返回房屋房间数量以及房屋总面积
"""
Area_list = []
#筛选目标信息的父标签
Obtain_Area_One = BeautifulSoup(str(Area), 'lxml')
Area_one = Obtain_Area_One.find_all(class_='resblock-room') #通过正则表达式去除多余的html标签信息
Get_Area_One = []
for c in Area_one:
Area_Str_1 = re.sub(r'<.*?>', '', str(c))
Get_Area_One.append(Area_Str_1) #通过正则表达式去除多余的换行符
Get_Area_Two = []
for i in Get_Area_One:
Area_Str_2 = re.sub(r'\s+','',str(i))
Get_Area_Two.append(Area_Str_2) #开始获取房屋总面积
Obtain_Area_Two = BeautifulSoup(str(Area),'lxml')
Area_two = Obtain_Area_Two.find_all(class_='resblock-area')
#通过正则表达式去除多余的html标签信息
Get_Area_Three = []
for a in Area_two:
Area_Str_3 = re.sub(r'<.*?>', '', str(a))
Get_Area_Three.append(Area_Str_3) # 通过正则表达式去除多余的换行符
Get_Area_Four = []
for r in Get_Area_Three:
Area_Str_4 = re.sub(r'\s+', '', str(r))
Get_Area_Four.append(Area_Str_4) # 将Get_Area_Two和Get_Area_Four两个列表中的各项元素分别进行合并并保存在Area_list列表中
A = min(len(Get_Area_Two), len(Get_Area_Four))
for i in range(A):
Area_list.append(Get_Area_Two[i] + Get_Area_Four[i]) return Area_list def AveragePriceHouse(Average):
"""
房屋均价
@param Average:
@return:
"""
Average_list = []
Obtain_Average = BeautifulSoup(str(Average), 'lxml')
Average_one = Obtain_Average.find_all(class_='number')
for i in Average_one:
Get_Span = BeautifulSoup(str(i), 'lxml')
Average_list.append(Get_Span.string) return Average_list def TotalPriceHouse(Total):
"""
房屋总价 @param Total:
@return:
"""
Total_list = []
Obtain_Total = BeautifulSoup(str(Total), 'lxml')
Total_one = Obtain_Total.fjind_all(class_='second')
for i in Total_one:
Get_Span = BeautifulSoup(str(i), 'lxml')
Get_Span_one = Get_Span.string
Get_Span_two = Get_Span_one.lstrip('总价')
Total_list.append(Get_Span_two) return Total_list def Create_File():
name = ['名称','性质','状态','地址','面积','均价','总价',]
workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = workbook.add_sheet('shett1', cell_overwrite_ok=True)
for i in range(len(name)):
sheet.write(0, i, name[i])
workbook.save(r'F:\实例\Python实例\爬虫\111.xls')
print("文件创建成功") if __name__ == '__main__':
url = "https://qd.fang.lianjia.com/loupan/pg"
Create_File()
DataHtml = GetHTML(url) print("全部房产信息写入成功")

2、数据处理代码

 import pandas as pd
import xlrd
import re
import xlutils.copy
import matplotlib.pyplot as plt def ExceptNull():
"""
数据清洗第一步:去除表中空值
@param df: 传入读取的xls表格数据
@return: 保存数据后返回,
"""
df = pd.DataFrame(pd.read_excel(r'F:\实例\Python实例\爬虫\111.xls'))
#查找面积列空值,使用99999填充空缺值后删除所在行
print(df['面积'].isnull().value_counts())
df["面积"] = df["面积"].fillna('')
NullKey = df[(df.面积 == '')].index.tolist()
print(NullKey)
df = df.drop(NullKey)
print("*"*30)
print(df['面积'].isnull().value_counts()) print("*"*30)
#查找总价列空值,使用99999填充空缺值后删除所在行
print(df['总价'].isnull().value_counts())
df["总价"] = df["总价"].fillna('')
NullKey1 = df[(df.总价 == '')].index.tolist()
print(NullKey1)
df = df.drop(NullKey1)
print("*"*30)
print(df['总价'].isnull().value_counts())
df.to_excel('111.xls',index=False,encoding='uf-8') print("修改后数据保存成功") def LeiChuli():
Data = xlrd.open_workbook(r"F:\实例\Python实例\爬虫\111.xls")
ws = xlutils.copy.copy(Data)
Table = Data.sheet_by_name("Sheet1")
Nrows = Table.nrows
list_A = []
for i in range(1,Nrows):
A = Table.cell_value(i,6)
A_Str = re.sub('/套','',A,Nrows)
list_A.append(A_Str)
Replace = []
for i in range(len(list_A)):
Price_Str = list_A[i]
Last_Str = Price_Str[-1]
if Last_Str == '万':
A_Str = re.sub('万', '', Price_Str, 1)
Replace.append(A_Str)
else:
Replace.append(Price_Str)
table = ws.get_sheet(0)
for i in range(len(Replace)):
table.write(i + 1, 6, Replace[i])
print("------>开始写入修改后数据<------")
print("---->第{}项写入成功<----".format(i))
ws.save(r"F:\实例\Python实例\爬虫\111.xls")
print("------>数据写入完成<------") def Data_Analysis_One():
Data = xlrd.open_workbook(r"F:\实例\Python实例\爬虫\111.xls")
ws = xlutils.copy.copy(Data)
Table = Data.sheet_by_name("Sheet1")
Nrows = Table.nrows
a,b,c,d,e,f = 0,0,0,0,0,0 for i in range(1, Nrows):
A = Table.cell_value(i, 5)
if A == "价格待定":
f += 1
else:
if int(A) <= 5000:
a += 1
elif int(A) <= 10000:
b += 1
elif int(A) <= 15000:
c += 1
elif int(A) <= 20000:
d += 1
else:
e += 1 # 开始准备绘制饼状图 #价格区间数据准备
sizes = []
Percentage_a = (a / Nrows) * 100
sizes.append(int(Percentage_a))
Percentage_b = (b / Nrows) * 100
sizes.append(int(Percentage_b))
Percentage_c = (c / Nrows) * 100
sizes.append(int(Percentage_c))
Percentage_d = (d / Nrows) * 100
sizes.append(int(Percentage_d))
Percentage_e = (e / Nrows) * 100
sizes.append(int(Percentage_e))
Percentage_f = (f / Nrows) * 100
sizes.append(int(Percentage_f))
#设置占比说明
labels = '0-5000','5001-10000','10001-15000','15001-20000','20000-','Undetermined'
explode = (0,0,0.1,0,0,0)
#开始绘制
plt.pie(sizes,explode=explode,labels=labels,autopct='%1.1f%%',shadow=True,startangle=90)
plt.axis('equal')
plt.show()
ws.save(r"F:\实例\Python实例\爬虫\111.xls") if __name__ == '__main__':
# ExceptNull()
# LeiChuli()
Data_Analysis_One()

数据来源于链家青岛站部分数据,因为一些原因爬取结果可能不是完全符合预期。

转发请注明出处、欢迎指教、私信。