安卓最美应用页面爬虫,爬虫很简单,涉及的东西倒挺多的:
文件操作
正则表达式
字符串替换等等
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
|
import requests
import re
# Site root; prepended to every href scraped from the listing page before it
# is fetched.
url = "http://zuimeia.com"
# Fetch the "hot" community listing page (platform=2).
r = requests.get('http://zuimeia.com/community/app/hot/?platform=2')
# Captures the href of each app-cover link found in the listing markup.
pattern = re.compile(r'<a class="community-app-cover-wrapper" href="(.*?)" target="_blank">')
# List of href strings, in page order; empty when the markup does not match.
urlList = pattern.findall(r.content)
def requestsUrl(url):
    """Fetch one app detail page and scrape its fields with regexes.

    Args:
        url: Absolute URL of the app detail page to download.

    Returns:
        A 4-tuple ``(title, category, strdescribe, downloadUrl)``.
        ``title``, ``category`` and ``downloadUrl`` are the raw
        ``re.findall`` result lists and may be empty.  ``strdescribe`` is
        the first description match passed through ``srtReplace``, or
        ``''`` when the page contains no description block.
    """
    page = requests.get(url).content

    title = re.findall(r'"app-title"><h1>(.*?)</h1>', page)
    category = re.findall(
        r'<a class="app-tag" href="/community/app/category/title/.*?/?platform=2">(.*?)</a>',
        page)
    describe = re.findall(
        r'<div id="article_content">(.*?)<div class="community-image-wrapper">',
        page)
    # findall returns [] when the markup is absent; indexing it blindly
    # would raise IndexError, so fall back to an empty description.
    strdescribe = srtReplace(describe[0]) if describe else ''
    downloadUrl = re.findall(
        r'<a class="download-button direct hidden" href="(.*?)"',
        page)
    return title, category, strdescribe, downloadUrl
def srtReplace(string):
    """Replace a fixed set of literal HTML tags in *string* with line breaks.

    Every occurrence of the tags listed below is turned into a newline, then
    runs of consecutive newlines are collapsed into a single newline so that
    adjacent paragraphs stay separated by exactly one line break.

    Args:
        string: HTML fragment to clean up.

    Returns:
        The fragment with the listed tags replaced.  Tags that carry
        attributes, and any tag not in the list, are left untouched.
    """
    tags = (
        '<p>', '<br>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<h7>',
        '<strong>', '</p>', '<br/>', '</h1>', '</h2>', '</h3>', '</h4>',
        '</h5>', '</h6>', '</h7>', '</strong>', '<b>', '</b>',
    )
    for tag in tags:
        string = string.replace(tag, '\n')
    # Deleting '\n\n' outright would glue neighbouring paragraphs together
    # (and leave odd-length runs behind); collapse the runs instead.
    while '\n\n' in string:
        string = string.replace('\n\n', '\n')
    return string
def categornFinal(category):
    """Concatenate category names, each one followed by ``'-->'``.

    Args:
        category: Iterable of category names; each item is converted with
            ``str()`` before being appended.

    Returns:
        A single string such as ``'Tools-->Photo-->'`` (note the trailing
        arrow), or ``''`` for an empty iterable.
    """
    # str.join builds the result in one pass instead of re-allocating the
    # accumulated string on every iteration.
    return ''.join(str(name) + '-->' for name in category)
def urlReplace(url):
    """Decode HTML-escaped ampersands in a URL taken from page markup.

    Args:
        url: URL string as it appears inside an ``href`` attribute.

    Returns:
        The URL with every ``&amp;`` entity turned back into ``&``.
    """
    # Replacing '&' with '&' is a no-op; the entity form is what actually
    # appears in scraped href values and has to be decoded.
    return url.replace('&amp;', '&')
# NOTE(review): the result of this call is discarded; it looks like a leftover
# manual check of a single page. Kept so the script issues the same requests.
requestsUrl("http://zuimeia.com/community/app/27369/?platform=2")

# Visit every app linked from the listing page and append one record per app
# to the output file.
for eachUrl in urlList:
    # Scraped hrefs are joined onto the site root to form the page URL.
    eachUrl = url + eachUrl
    content = requestsUrl(eachUrl)
    # Skip pages where no title or download link was found, rather than
    # aborting the whole crawl with IndexError on the [0] lookups below.
    if not content[0] or not content[3]:
        continue
    title = content[0][0]
    category = categornFinal(content[1])
    strdescribe = content[2]
    downloadUrl = urlReplace(content[3][0])
    # Append mode: records from earlier iterations and runs are preserved.
    with open('c:/wqa.txt', 'a+') as fd:
        fd.write(
            'title:' + title + '\n'
            + 'category:' + category + '\n'
            + 'strdescribe:' + strdescribe + '\n'
            + 'downloadUrl:' + downloadUrl
            + '\n\n\n-----------------------------------------------------------------------------------------------------------------------------\n\n\n'
        )
|