This article walks through a working web crawler implemented in Python, shared here for reference. The details follow.
It mainly uses the urllib2 and BeautifulSoup modules.
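Before the full script, here is a minimal sketch (Python 2, like the script below) of how the two modules cooperate: urllib2 fetches the raw HTML and BeautifulSoup parses it. The URL is the listing page used later in the article; grabbing every h3 tag is purely illustrative and is not part of the original spider.

#encoding=utf-8
import urllib2
from bs4 import BeautifulSoup

# Fetch the listing page with a browser-like User-Agent, then parse it
req = urllib2.Request('http://www.whowhatwear.com/section/fashion-trends/page/1',
                      headers={'User-Agent': 'Mozilla/5.0'})
html = urllib2.urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
for h3 in soup.find_all('h3'):  # illustrative only: print each headline
    print h3.get_text().strip()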
#encoding=utf-8
import re
import urllib2
import datetime
import MySQLdb
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class Splider(object):
    def __init__(self):
        print u'Starting to crawl...'

    ## Fetch the HTML source of a page
    def getsource(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2652.0 Safari/537.36'}
        req = urllib2.Request(url=url, headers=headers)
        socket = urllib2.urlopen(req)
        content = socket.read()
        socket.close()
        return content

    ## changepage generates the link for each page number
    def changepage(self, url, total_page):
        now_page = int(re.search(r'page/(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            # note: re.sub takes count, not flags, as its 4th argument,
            # so the original re.S here is dropped
            link = re.sub(r'page/(\d+)', 'page/%d' % i, url)
            page_group.append(link)
        return page_group

    # Fetch the content of a child (article) page
    def getchildrencon(self, child_url):
        conobj = {}
        content = self.getsource(child_url)
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        content = soup.find('div', {'class': 'c-article_content'})
        img = re.findall('src="(.*?)"', str(content), re.S)
        conobj['con'] = content.get_text()
        conobj['img'] = ';'.join(img)
        return conobj

    ## Parse a listing page and collect one record per article
    def getcontent(self, html_doc):
        soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
        tag = soup.find_all('div', {'class': 'promo-feed-headline'})
        info = {}
        i = 0
        for link in tag:
            info[i] = {}
            title_desc = link.find('h3')
            info[i]['title'] = title_desc.get_text()
            post_date = link.find('div', {'class': 'post-date'})
            pos_d = post_date['data-date'][0:10]
            info[i]['content_time'] = pos_d
            info[i]['source'] = 'whowhatwear'
            source_link = link.find('a', href=re.compile(r"section=fashion-trends"))
            source_url = 'http://www.whowhatwear.com' + source_link['href']
            info[i]['source_url'] = source_url
            in_content = self.getsource(source_url)
            in_soup = BeautifulSoup(in_content, 'html.parser', from_encoding='utf-8')
            soup_content = in_soup.find('section', {'class': 'widgets-list-content'})
            info[i]['content'] = soup_content.get_text().strip('\n')
            text_con = in_soup.find('section', {'class': 'text'})
            # Guard against a missing summary section (the original referenced
            # an undefined NULL here)
            summary = text_con.get_text().strip('\n') if text_con is not None else ''
            info[i]['summary'] = summary[0:200] + '...'
            img_list = re.findall('src="(.*?)"', str(soup_content), re.S)
            info[i]['imgs'] = ';'.join(img_list)
            info[i]['create_time'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            i += 1
        return info

    ## Persist the crawled records to MySQL
    def saveinfo(self, content_info):
        conn = MySQLdb.Connect(host='127.0.0.1', user='root', passwd='123456', port=3306, db='test', charset='utf8')
        cursor = conn.cursor()
        # A parameterized query lets the driver do the escaping, instead of
        # interpolating the values into the SQL string by hand
        sql = ("insert into t_fashion_spider2"
               "(`title`,`summary`,`content`,`content_time`,`imgs`,`source`,`source_url`,`create_time`)"
               " values (%s,%s,%s,%s,%s,%s,%s,%s)")
        for each in content_info:
            for k, v in each.items():
                cursor.execute(sql, (v['title'], v['summary'], v['content'], v['content_time'],
                                     v['imgs'], v['source'], v['source_url'], v['create_time']))
        conn.commit()
        cursor.close()
        conn.close()

if __name__ == '__main__':
    classinfo = []
    p_num = 5
    url = 'http://www.whowhatwear.com/section/fashion-trends/page/1'
    jikesplider = Splider()
    all_links = jikesplider.changepage(url, p_num)
    for link in all_links:
        print u'Processing page: ' + link
        html = jikesplider.getsource(link)
        info = jikesplider.getcontent(html)
        classinfo.append(info)
    jikesplider.saveinfo(classinfo)
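Note that the script above targets Python 2: it relies on urllib2, MySQLdb, the print statement, and sys.setdefaultencoding. On Python 3 the fetch step would use urllib.request instead; the sketch below is an assumed equivalent of getsource, not part of the original script.

import urllib.request

def getsource(url):
    # Same browser-like headers as the Python 2 version above
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2652.0 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8')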
I hope this article is helpful to readers working on Python programming.