本文实例讲述了Python实现登录人人网并抓取新鲜事的方法。分享给大家供大家参考。具体如下:
这里演示了Python登录人人网并抓取新鲜事的方法(抓取后的排版不太美观~~)
实现代码如下:
from sgmllib import SGMLParser
import sys,urllib2,urllib,cookielib
class spider(SGMLParser):
    """Log in to renren.com and collect news-feed text from the returned page.

    The class is an SGMLParser whose ``start_<tag>`` / ``end_<tag>`` callbacks
    drive a small state machine:

    * text inside ``<h3> ... <a>`` is accumulated into ``self.names``
      (the original comments call this "the name");
    * other text inside the ``<h3>`` and all text inside the
      ``<div class="content">`` that follows it is appended to
      ``self.dic[self.names]`` (the original comments call this "says").

    NOTE: this is Python 2 code (sgmllib, urllib2, cookielib and byte-string
    ``decode``); it is not expected to run unchanged on Python 3.
    """

    def __init__(self, email, password):
        """Store the credentials and install a cookie-aware URL opener.

        email    -- value sent as the ``email`` form field by login()
        password -- value sent as the ``password`` form field by login()
        """
        SGMLParser.__init__(self)

        # --- parser state ---
        self.h3 = False           # currently inside an <h3>
        self.h3_is_ready = False  # an </h3> was seen; start_div() is now armed
        self.div = False          # currently inside a <div class="content">
        self.h3_and_div = False   # h3 and content div are connected
        self.a = False            # inside an <a> opened within an <h3>/content div
        self.depth = 0            # <div>s currently open inside the content div
        self.names = ""           # name text gathered from <h3><a>...</a></h3>
        self.dic = {}             # names -> list of collected text chunks

        # --- login data ---
        self.email = email
        self.password = password
        self.domain = 'renren.com'
        self.file = ""            # response body; filled in by login()

        # The opener is installed process-wide so that the plain
        # urllib2.urlopen() call in login() stores and sends cookies.
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        urllib2.install_opener(opener)

    def login(self):
        """POST the credentials to the login URL and keep the body in self.file.

        Errors raised by urllib2.urlopen() propagate to the caller.
        """
        # NOTE(review): the credentials travel over plain HTTP here; confirm
        # whether the site offers an HTTPS login endpoint before real use.
        url = 'http://www.renren.com/PLogin.do'
        postdata = {
            'email': self.email,
            'password': self.password,
            'domain': self.domain,
        }
        req = urllib2.Request(url, urllib.urlencode(postdata))
        response = urllib2.urlopen(req)
        try:
            self.file = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    def start_h3(self, attrs):
        """Enter an <h3>."""
        self.h3 = True

    def end_h3(self):
        """Leave an <h3> and arm start_div() for the div that follows."""
        self.h3 = False
        self.h3_is_ready = True

    def start_a(self, attrs):
        """Track <a> elements, but only inside an <h3> or the content div."""
        if self.h3 or self.div:
            self.a = True

    def end_a(self):
        """Leave an <a>."""
        self.a = False

    def start_div(self, attrs):
        """Handle <div>: count nesting and detect ``class="content"``.

        Divs are ignored entirely until an </h3> has been seen.
        """
        if not self.h3_is_ready:
            return
        if self.div:
            # A div opened inside the content div: remember the nesting so
            # that its closing tag does not end the content div.
            self.depth += 1
        for key, value in attrs:
            if key == 'class' and value == 'content':
                self.div = True
                self.h3_and_div = True  # h3 and div is connected

    def end_div(self):
        """Handle </div>: close a nested div or reset the whole h3/div state."""
        if self.depth == 0:
            # No nested div is open, so this closes the current entry.
            self.div = False
            self.h3_and_div = False
            self.h3_is_ready = False
            self.names = ""
        elif self.div:
            self.depth -= 1

    def handle_data(self, text):
        """Route a chunk of character data according to the current state."""
        # record the name
        if self.h3 and self.a:
            self.names += text
        # record says
        if self.h3 and not self.a:
            if text:
                self.dic.setdefault(self.names, []).append(text)
            return
        if self.h3_and_div:
            self.dic.setdefault(self.names, []).append(text)

    def show(self):
        """Print one line per collected name: the name, then its text chunks.

        The collected byte strings are decoded as UTF-8 (as the original code
        did) and re-encoded to the filesystem encoding for display; bytes or
        characters that cannot be converted are replaced rather than raising.
        """
        # getfilesystemencoding() may return None on some Python 2 platforms.
        out_encoding = sys.getfilesystemencoding() or 'utf-8'
        for name, texts in self.dic.items():
            line = '%s %s' % (name, ' '.join(texts))
            print(line.decode('utf-8', 'replace').encode(out_encoding, 'replace'))
if __name__ == '__main__':
    # Run only when executed as a script, so importing this module to reuse
    # the spider class does not trigger a network login.
    # Replace the placeholders with real account credentials before running.
    renrenspider = spider('your email', 'your password')
    renrenspider.login()
    renrenspider.feed(renrenspider.file)
    # close() makes the parser process any data it is still buffering.
    renrenspider.close()
    renrenspider.show()
希望本文所述对大家的Python程序设计有所帮助。