Python——爬取B站科技区排行并把数据存入Excel
本代码可将B站的科技区的top100爬取下来,并将其数据存储到Excel文件中,具有良好的可移植性。
需要的库:
- requests 用于请求连接到特定网站
- BeautifulSoup 分析,处理得到的HTML代码
- xlwt 将得到的数据存入Excel
- time 延时
下载
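直接在命令行中输入 pip install 库名 即可自动从网上下载安装,例如:pip install requests beautifulsoup4 xlwt。注意:BeautifulSoup 的安装包名是 beautifulsoup4(导入时写作 bs4);time 是 Python 标准库,无需安装。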
代码
import requests
import time
import xlwt
from bs4 import BeautifulSoup
# Create the Excel workbook and the single worksheet that the scraper fills in.
# cell_overwrite_ok=True lets the same cell be written more than once.
workbook = xlwt.Workbook()
booksheet = workbook.add_sheet(
    'Sheet 1',
    cell_overwrite_ok=True,
)

# Short alias for the BeautifulSoup constructor, used by pa().
be = BeautifulSoup
def pa():
    """Fetch the ranking page and collect each entry's rank number and title.

    Takes no arguments and returns nothing. For every ``li.rank-item`` element
    on the page, the text of its ``div.num`` is appended to the module-level
    list ``L`` and the text of its ``a.title`` to the module-level list ``M``.
    Both lists must exist before this function is called.
    """
    # requests needs an absolute URL; a bare path raises MissingSchema.
    # NOTE(review): host restored as www.bilibili.com -- confirm it is still valid.
    r = requests.get(
        'https://www.bilibili.com/ranking/all/36/1/3/'
        '?spm_id_from=333.334.ranking_technology.8'
    )
    # An empty parser name is not a valid BeautifulSoup feature; use the
    # parser that ships with the standard library.
    soup = be(r.text, 'html.parser')
    pm_list = soup.find_all('li', attrs={'class': 'rank-item'})
    for pm in pm_list:
        num_tag = pm.find('div', class_='num')
        title_tag = pm.find('a', class_='title')
        if num_tag is None or title_tag is None:
            # Skip entries missing either field so L and M stay aligned
            # instead of failing on None.text.
            continue
        L.append(num_tag.text)
        M.append(title_tag.text)
# Scrape the ranking once a minute, 100 times. Column 0 holds the rank numbers
# and every round adds one column of titles to its right.
for x in range(100):
    # pa() appends to these module-level lists, so start each round empty.
    L = []
    M = []
    pa()
    for i, (mc, mz) in enumerate(zip(L, M)):
        booksheet.write(i, 0, mc)
        # Titles start at column 1: writing round 0 into column x would
        # overwrite the rank numbers just placed in column 0.
        booksheet.write(i, x + 1, mz)
    # Save after every round so the data gathered so far is on disk.
    # save() needs a real file name; xlwt produces the legacy .xls format.
    workbook.save('bilibili_rank.xls')
    time.sleep(60)  # wait one minute before the next scrape
更新
新增了已用时间和爬取次数的显示,后续计划加入数据变化的显示。
import requests
import time
import xlwt
import datetime
import sys
from bs4 import BeautifulSoup
# Create the Excel workbook and the single worksheet that the scraper fills in.
# cell_overwrite_ok=True lets the same cell be written more than once.
workbook = xlwt.Workbook()
booksheet = workbook.add_sheet(
    'Sheet 1',
    cell_overwrite_ok=True,
)

# Short alias for the BeautifulSoup constructor, used by pa().
be = BeautifulSoup

# Pre-fill column 0, rows 1..100, with the numbers 1..100; row 0 is left for
# the per-round header written later.
for rank in range(1, 101):
    booksheet.write(rank, 0, rank)
def pa():
    """Fetch the ranking page and collect each entry's rank number and title.

    Takes no arguments and returns nothing. For every ``li.rank-item`` element
    on the page, the text of its ``div.num`` is appended to the module-level
    list ``L`` and the text of its ``a.title`` to the module-level list ``M``.
    Both lists must exist before this function is called.
    """
    # requests needs an absolute URL; a bare path raises MissingSchema.
    # NOTE(review): host restored as www.bilibili.com -- confirm it is still valid.
    r = requests.get(
        'https://www.bilibili.com/ranking/all/36/1/3/'
        '?spm_id_from=333.334.ranking_technology.8'
    )
    # An empty parser name is not a valid BeautifulSoup feature; use the
    # parser that ships with the standard library.
    soup = be(r.text, 'html.parser')
    pm_list = soup.find_all('li', attrs={'class': 'rank-item'})
    for pm in pm_list:
        num_tag = pm.find('div', class_='num')
        title_tag = pm.find('a', class_='title')
        if num_tag is None or title_tag is None:
            # Skip entries missing either field so L and M stay aligned
            # instead of failing on None.text.
            continue
        L.append(num_tag.text)
        M.append(title_tag.text)
# The legacy .xls format written by xlwt has at most 256 columns. Column 0
# holds the rank numbers, which leaves room for 255 rounds of titles; asking
# for more makes booksheet.write() fail on the out-of-range column index.
MAX_ROUNDS = 255

mi = 0  # number of completed scrape rounds
d1 = datetime.datetime.now()  # start time, used for the elapsed-seconds display
for x in range(MAX_ROUNDS):
    d2 = datetime.datetime.now()
    # Row 0 of this round's column is the HH:MM:SS part of the timestamp.
    booksheet.write(0, x + 1, str(d2)[11:19])
    # pa() appends to these module-level lists, so start each round empty.
    L = []
    M = []
    pa()
    for i, mz in enumerate(M):
        booksheet.write(i + 1, x + 1, mz)
    mi += 1
    sec = str(round((d2 - d1).total_seconds()))
    # Must not be named `time`: that would rebind the imported module to a
    # string and make the time.sleep() call below raise AttributeError.
    status = '已爬取' + str(mi) + '次,用时' + sec + 'sec'
    sys.stdout.write("\r%s" % status)
    # No newline is written, so flush to make the progress line appear now.
    sys.stdout.flush()
    # Save after every round so the data gathered so far is on disk.
    # save() needs a real file name; xlwt produces the legacy .xls format.
    workbook.save('bilibili_rank.xls')
    time.sleep(60)  # wait one minute before the next scrape