本文实例讲述了Python实现提取谷歌音乐搜索结果的方法。分享给大家供大家参考。具体如下:
Python的简单脚本,用于提取谷歌音乐搜索页面中的歌曲信息,包括歌曲名,作者,专辑名,下载链接等,最多只提取10页结果。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
#! /usr/bin/env python
#coding=utf-8
'''
Created on 2011-8-19
@author: yaoboyuan
'''
from urllib import request,parse
import re,sys
def extractSongRawData(text):
    """Split a search-result page into one raw HTML fragment per song.

    The number of fragments found is printed as a progress message.

    Args:
        text: HTML source of a search-result page.

    Returns:
        A list with every ``<tbody ...>...</tbody>`` fragment of the page, in
        document order and with newlines removed. Empty when there is none.
    """
    # '.' does not match '\n' by default, so drop line breaks before matching
    # to let a single pattern span a whole <tbody> element.
    text = text.replace('\n', '')
    songList = re.findall(r'<tbody.*?</tbody>', text)
    print('search ' + str(len(songList)) + ' songs')
    return songList
def translate(text):
    """Strip bold markup from *text* and decode decimal character references.

    ``<b>`` and ``</b>`` tags are removed, and every ``&#NNN;`` reference is
    replaced in place by the character with that decimal code point. Text
    around the references is preserved.

    Args:
        text: Raw text taken from the result page.

    Returns:
        The cleaned, decoded string.
    """
    def _decode(match):
        # Leave a reference untouched if its number is not a valid code point.
        try:
            return chr(int(match.group(1), 10))
        except (ValueError, OverflowError):
            return match.group(0)

    text = text.replace('<b>', '').replace('</b>', '')
    # Substitute in place so plain text between references is not discarded.
    return re.sub(r'&#([0-9]+);', _decode, text)
def extractSongName(song):
    """Extract the song title from one raw song fragment.

    Args:
        song: HTML fragment for a single song (newlines already removed).

    Returns:
        The text of the first ``<a>`` link found from the ``Title`` cell
        onward, passed through ``translate``. An empty string is returned
        when the fragment has no ``Title`` cell or no link after it.
    """
    # Greedy on purpose: spans from the Title cell to the last </td>.
    cell = re.search(r'(?:<td class="Title).*(?:</td>)', song)
    if cell is None:
        return ''
    anchor = re.search(r'.+?<a.+?>(.*?)</a>', cell.group(0))
    if anchor is None:
        return ''
    return translate(anchor.group(1))
def extractAuthorName(song):
    """Extract the artist name from one raw song fragment.

    Args:
        song: HTML fragment for a single song (newlines already removed).

    Returns:
        The text of the first ``<a>`` link found from the ``Artist`` cell
        onward, passed through ``translate``. An empty string is returned
        when the fragment has no ``Artist`` cell or no link after it.
    """
    # Greedy on purpose: spans from the Artist cell to the last </td>.
    cell = re.search(r'(?:<td class="Artist).*(?:</td>)', song)
    if cell is None:
        return ''
    anchor = re.search(r'.+?<a.+?>(.*?)</a>', cell.group(0))
    if anchor is None:
        return ''
    return translate(anchor.group(1))
def extrackAlbumName(song):
    """Extract the album name from one raw song fragment.

    The misspelled function name is kept because callers use it.

    Args:
        song: HTML fragment for a single song (newlines already removed).

    Returns:
        The text of the first ``<a>`` link found from the ``Album`` cell
        onward, passed through ``translate``. An empty string is returned
        when the fragment has no ``Album`` cell or no link after it.
    """
    # Greedy on purpose: spans from the Album cell to the last </td>.
    cell = re.search(r'(?:<td class="Album).*(?:</td>)', song)
    if cell is None:
        return ''
    anchor = re.search(r'.+?<a.+?>(.*?)</a>', cell.group(0))
    if anchor is None:
        return ''
    return translate(anchor.group(1))
def extractID(song):
    """Extract the song id from one raw song fragment.

    Args:
        song: HTML fragment for a single song.

    Returns:
        The alphanumeric value of the first ``<tbody id="...">`` attribute.
        When no such attribute is present, *song* itself is returned
        unchanged.
    """
    match = re.search(r'<tbody id="([a-zA-Z0-9]+)"', song)
    if match is None:
        return song
    return match.group(1)
def extractLink(song):
    """Extract the download link from one raw song fragment.

    Args:
        song: HTML fragment for a single song (newlines already removed).

    Returns:
        * ``'NULL'`` when the fragment has no ``Icon`` cell with a download
          (``title="下载"``) ``onclick`` handler.
        * The raw ``onclick`` text when it contains no ``http...?`` URL.
        * Otherwise the URL up to (not including) its first ``?``, with
          ``%3D`` decoded to ``=``, followed by ``?id=`` and the value
          returned by ``extractID(song)``.
    """
    handlers = re.findall(
        r'''<td class="Icon.*?(?=title="下载").*?onclick="(.*?)>''', song)
    if not handlers:
        return 'NULL'
    handler = str(handlers[0])
    # Keep only the part of the URL before its query string.
    rawLink = re.search(r'http.*?(?=\?)', handler)
    if rawLink is None:
        return handler
    link = rawLink.group(0).replace('%3D', '=')
    return link + '?id=' + extractID(song)
def extractPageNums(text):
    """Count the pagination links on a search-result page.

    Args:
        text: HTML source of a search-result page.

    Returns:
        The number of non-overlapping occurrences of ``page_link`` in
        *text*. No upper bound is applied here.
    """
    # A plain substring count; no regular expression is needed for a literal.
    return text.count('page_link')
def extractSongInfo(song):
    """Build the list of song records for a sequence of raw song fragments.

    Args:
        song: Sequence of raw song HTML fragments.

    Returns:
        A list with one ``[songName, authorName, albumName, link]`` list per
        fragment, in input order.
    """
    return [
        [
            extractSongName(fragment),
            extractAuthorName(fragment),
            extrackAlbumName(fragment),
            extractLink(fragment),
        ]
        for fragment in song
    ]
def _fetchPage(url):
    """Download *url* and return the response body decoded as UTF-8."""
    # The context manager closes the connection even if reading fails.
    with request.urlopen(url) as response:
        return response.read().decode('utf-8')


def main():
    """Run the interactive search loop.

    Repeatedly prompts for a keyword, downloads the first result page and
    every further page advertised by its pagination links, then prints all
    collected song records with a two-digit running index.

    The loop has no exit condition of its own; it ends when an exception
    (for example from ``input`` or the network) propagates.
    """
    baseUrl = 'http://www.google.cn/music/search?q='
    while True:
        key = input('请输入歌曲名字或关键字:')
        # Percent-encode the keyword (as UTF-8) so it is safe in the query.
        url = baseUrl + parse.quote(key)
        page = _fetchPage(url)
        num = extractPageNums(page)
        print(str(num + 1) + ' pages found')
        songList = extractSongInfo(extractSongRawData(page))
        # Fetch the remaining pages. Each page URL is built from the
        # unmodified search URL so paging parameters never accumulate.
        for i in range(num):
            start = (i + 1) * 20
            page = _fetchPage(url + '&cat=song&start=%d' % start)
            songList += extractSongInfo(extractSongRawData(page))
        for index, item in enumerate(songList, 1):
            print('%02d %s' % (index, str(item)))
# Start the interactive search only when the file is run as a script.
if __name__ == '__main__':
    main()
|
希望本文所述对大家的Python程序设计有所帮助。