This article walks through implementing a crawler in Python that scrapes NBA statistics. It is shared here for reference; the details follow.
The site being scraped is stat-nba.com, and the script collects NBA 2016-2017 regular-season data up to January 7, 2017.
Changing url_header and url_tail lets the same crawler fetch other specific data sets.
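For example, each page URL is built as url_header + page index + url_tail, so pointing the crawler at another season should only require editing the Season0/Season1 query parameters inside url_tail. The minimal sketch below illustrates this; the exact meaning of those parameters on stat-nba.com is an assumption taken from the 2016-2017 URL used in the full script, so verify it against the site before relying on it:

# Sketch only: reuse the URL-building pattern from getURLLists() for a
# different season. The Season0/Season1 values are an assumption based on
# the 2016-2017 URL in the full script; confirm them against stat-nba.com.
url_header = 'http://stat-nba.com/query_team.php?page='
url_tail = ('&QueryType=game&order=1&crtcol=date_out&GameType=season'
            '&PageNum=3000&Season0=2015&Season1=2016#label_show_result')
urls = [url_header + str(i) + url_tail for i in range(0, 8)]  # pages 0..7
print(urls[0])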
The full source code is as follows:
#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
import time
import urllib
from bs4 import BeautifulSoup
import re
from pyExcelerator import *
def getURLLists(url_header,url_tail,pages):
    """
    Build the list of URLs for all result pages.
    """
    url_lists = []
    url_0 = url_header + '0' + url_tail
    print url_0
    url_lists.append(url_0)
    for i in range(1,pages + 1):
        url_temp = url_header + str(i) + url_tail
        url_lists.append(url_temp)
    return url_lists
def getNBAAllData(url_lists):
    """
    Fetch the 2016-2017 NBA regular-season data from every page.
    """
    datasets = ['']
    for item in url_lists:
        data1 = getNBASingleData(item)
        datasets.extend(data1)
    # remove empty elements from the data
    for item in datasets[:]:
        if len(item) == 0:
            datasets.remove(item)
    return datasets
def getNBASingleData(url):
    """
    Fetch the NBA regular-season data from a single page.
    """
    # url = 'http://stat-nba.com/query_team.php?QueryType=game&order=1&crtcol=date_out&GameType=season&PageNum=3000&Season0=2016&Season1=2017'
    # html = requests.get(url).text
    html = urllib.urlopen(url).read()
    # print html
    soup = BeautifulSoup(html)
    data = soup.html.body.find('tbody').text
    list_data = data.split('\n')
    # with open('nba_data.txt','a') as fp:
    #     fp.write(data)
    # for item in list_data[:]:
    #     if len(item) == 0:
    #         list_data.remove(item)
    return list_data
def saveDataToExcel(datasets,sheetname,filename):
    book = Workbook()
    sheet = book.add_sheet(sheetname)
    # header row: column titles kept in Chinese, matching the source site
    sheet.write(0,0,u'序号')
    sheet.write(0,1,u'球队')
    sheet.write(0,2,u'时间')
    sheet.write(0,3,u'结果')
    sheet.write(0,4,u'主客')
    sheet.write(0,5,u'比赛')
    sheet.write(0,6,u'投篮命中率')
    sheet.write(0,7,u'命中数')
    sheet.write(0,8,u'出手数')
    sheet.write(0,9,u'三分命中率')
    sheet.write(0,10,u'三分命中数')
    sheet.write(0,11,u'三分出手数')
    sheet.write(0,12,u'罚球命中率')
    sheet.write(0,13,u'罚球命中数')
    sheet.write(0,14,u'罚球出手数')
    sheet.write(0,15,u'篮板')
    sheet.write(0,16,u'前场篮板')
    sheet.write(0,17,u'后场篮板')
    sheet.write(0,18,u'助攻')
    sheet.write(0,19,u'抢断')
    sheet.write(0,20,u'盖帽')
    sheet.write(0,21,u'失误')
    sheet.write(0,22,u'犯规')
    sheet.write(0,23,u'得分')
    num = 24
    row_cnt = 0
    data_cnt = 0
    data_len = len(datasets)
    print 'data_len:',data_len
    # write the scraped fields row by row, 24 columns per game record
    while (data_cnt < data_len):
        row_cnt += 1
        print '序号:',row_cnt
        for col in range(num):
            # print col
            sheet.write(row_cnt,col,datasets[data_cnt])
            data_cnt += 1
    book.save(filename)
def writeDataToTxt(datasets):
    fp = open('nba_data.txt','w')
    line_cnt = 1
    for i in range(len(datasets) - 1):
        # Team-name alignment: if the team name is short, or the team is the
        # Philadelphia 76ers, append two tabs after it; otherwise append one tab.
        if line_cnt % 24 == 2 and len(datasets[i]) < 5 or datasets[i] == u'费城76人':
            fp.write(datasets[i] + '\t\t')
        else:
            fp.write(datasets[i] + '\t')
        line_cnt += 1
        if line_cnt % 24 == 1:
            fp.write('\n')
    fp.close()
if __name__ == "__main__":
    pages = int(1132 / 150)
    url_header = 'http://stat-nba.com/query_team.php?page='
    url_tail = '&QueryType=game&order=1&crtcol=date_out&GameType=season&PageNum=3000&Season0=2016&Season1=2017#label_show_result'
    url_lists = getURLLists(url_header,url_tail,pages)
    datasets = getNBAAllData(url_lists)
    writeDataToTxt(datasets)
    sheetname = 'nba normal data 2016-2017'
    str_time = time.strftime('%Y-%m-%d',time.localtime(time.time()))
    filename = 'nba_normal_data' + str_time + '.xls'
    saveDataToExcel(datasets,sheetname,filename)
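Note that the script above targets Python 2 (reload(sys)/sys.setdefaultencoding, print statements, urllib.urlopen, and the long-unmaintained pyExcelerator). As a rough orientation only, here is a minimal Python 3 sketch of the single-page fetch-and-parse step, reusing the requests library the original already imports and the same tbody-text-splitting idea; it is a sketch under those assumptions, not a drop-in replacement for the whole script:

# Minimal Python 3 sketch of the per-page step (assumes requests and
# beautifulsoup4 are installed); keeps the original tbody-text-split approach.
import requests
from bs4 import BeautifulSoup

def get_single_page_data(url):
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        return []
    # drop the empty fields that the newline split produces
    return [field for field in tbody.text.split('\n') if field]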
Hopefully this article is helpful to readers working on Python programming.
Original article: https://blog.csdn.net/roytao2/article/details/54180494