I have recently been learning Python, which naturally led me to web crawling, so I wrote a small crawler. It starts from an initial URL, parses each page, uses a regular expression to collect the links still to be crawled, extracts the text with BeautifulSoup, and saves that text through a small outputer I wrote myself. The full code for each module is listed below.
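A note on layout before the listings: the import at the top of Spider_main.py, `from baike_spider import url_manager, html_downloader, html_parser, html_outputer`, only works if the five files shown below sit together in a package named baike_spider. The layout sketched here, including the empty __init__.py and the `python -m` invocation, is my reading of that import rather than something stated in the original post.

# Assumed project layout (inferred from the import, not shown in the post):
#
#   baike_spider/
#       __init__.py          # empty; marks the directory as a package
#       spider_main.py       # the Spider_main.py listing below
#       url_manager.py
#       html_downloader.py
#       html_parser.py
#       html_outputer.py
#
# Run from the directory that contains baike_spider/:
#   python -m baike_spider.spider_main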
Spider_main.py
# coding:utf8
from baike_spider import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        # Wire up the four collaborators: URL manager, downloader, parser and outputer.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            print("self.urls.has %s" % self.urls.new_urls)
            try:
                new_url = self.urls.get_new_url()
                print("craw %d : %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:  # stop after 1000 pages
                    break
                count = count + 1
            except:
                print("craw failed")
        # Write everything collected so far to output.html and output.txt.
        self.outputer.output_html()
        self.outputer.output_txt()


if __name__ == '__main__':
    root_url = "http://www.shushu8.com/jiangnan/longzu2qianzhuan/1"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
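One debugging note on the loop above: the bare `except:` swallows every error and prints only "craw failed", which makes it hard to tell whether the download, the parse, or the collect step went wrong. A small variant of the per-page step that reports the failing URL and a full traceback is sketched below; it is my illustration, not part of the original code, and it assumes a SpiderMain-like object with the same four attributes.

# Illustrative helper (not part of the original project): one crawl step with
# explicit error reporting instead of a bare "craw failed".
import traceback


def crawl_one(spider, new_url, count):
    """Run one crawl step and report failures with the URL and a traceback."""
    try:
        html_cont = spider.downloader.download(new_url)
        new_urls, new_data = spider.parser.parse(new_url, html_cont)
        spider.urls.add_new_urls(new_urls)
        spider.outputer.collect_data(new_data)
    except Exception:
        print("craw %d failed: %s" % (count, new_url))
        traceback.print_exc()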
url_manager.py
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        print(url)
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        # print('new url is %s' % new_url)
        return new_url

    def add_new_urls(self, urls):
        print("add_new_urls %s" % urls)
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)
            print(url)
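A quick way to convince yourself of the deduplication behaviour: a URL is handed out at most once, because get_new_url() moves it from new_urls into old_urls and add_new_url() checks both sets. The snippet below is a standalone check I added for illustration; the URLs in it are made up.

# Illustrative check of UrlManager's deduplication (URLs are made up).
from baike_spider.url_manager import UrlManager

manager = UrlManager()
manager.add_new_url("http://example.com/page1")
manager.add_new_url("http://example.com/page1")    # duplicate, not added again
manager.add_new_urls({"http://example.com/page1", "http://example.com/page2"})

while manager.has_new_url():
    print("crawling", manager.get_new_url())       # each URL handed out once

manager.add_new_url("http://example.com/page1")    # already in old_urls, ignored
print(manager.has_new_url())                       # False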
html_parser.py
import re
import urllib.parse

from bs4 import BeautifulSoup


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        print("parse new_urls %s" % new_urls)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        print(page_url)
        # The chapter title is the <h1> inside the element with class "title".
        title_node = soup.find(class_="title").find("h1")
        print(title_node.get_text())
        res_data['title'] = title_node.get_text()
        print("_get_new_data")
        # The chapter body is the page's <pre> block.
        summary_node = soup.find('pre')
        print(summary_node.get_text())
        res_data['summary'] = summary_node.get_text()
        return res_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # Collect every link whose href contains "/jiangnan/" and make it absolute.
        links = soup.find_all('a', href=re.compile(r"/jiangnan/"))
        print(links)
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
            # print(new_full_url)
        return new_urls
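The link extraction in _get_new_urls() is just find_all() with a compiled regex over the href attribute, followed by urljoin() to turn relative paths into absolute URLs. A tiny self-contained run on a made-up HTML fragment (the fragment and the second link are mine, purely for illustration) shows what comes out:

import re
import urllib.parse
from bs4 import BeautifulSoup

# A made-up fragment standing in for a downloaded chapter page.
html = b"""
<a href="/jiangnan/longzu2qianzhuan/2">next chapter</a>
<a href="/other/unrelated/1">ignored link</a>
"""
page_url = "http://www.shushu8.com/jiangnan/longzu2qianzhuan/1"

soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a', href=re.compile(r"/jiangnan/")):
    print(urllib.parse.urljoin(page_url, link['href']))
# -> http://www.shushu8.com/jiangnan/longzu2qianzhuan/2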
html_downloader.py
import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
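The downloader above has no timeout, so a stalled connection can hang the whole crawl, and some servers reject urllib's default User-Agent. A variant that addresses both is sketched below; the header value and the 10-second timeout are my own choices, not something the original post specifies.

import urllib.request


# Variant downloader (my sketch): browser-like User-Agent plus a bounded wait.
class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        request = urllib.request.Request(
            url, headers={'User-Agent': 'Mozilla/5.0'})
        response = urllib.request.urlopen(request, timeout=10)
        if response.getcode() != 200:
            return None
        return response.read()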
html_outputer.py
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_txt(self):
        fout = open('output.txt', 'w', encoding='utf-8')
        for data in self.datas:
            fout.write('%s\n' % data['title'])
            fout.write('%s\n' % data['summary'])
        fout.close()

    def output_html(self):
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
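One thing to be aware of in output_html(): the scraped title and text are written into the table verbatim, so any '<', '>' or '&' in the page text will be interpreted as markup. If that matters for your pages, a variant using the standard library's html.escape() is a simple fix; this is my addition, not part of the original outputer.

import html


# Sketch, not part of the original HtmlOutputer: escape scraped text per row.
def write_row(fout, data):
    """Write one result row, escaping the scraped text so it cannot break the table."""
    fout.write('<tr>')
    fout.write('<td>%s</td>' % html.escape(data['url']))
    fout.write('<td>%s</td>' % html.escape(data['title']))
    fout.write('<td>%s</td>' % html.escape(data['summary']))
    fout.write('</tr>')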
Summary
The above is the example code for a Python crawler that downloads documents from the web. I hope it is useful; if you have any questions, please leave a comment and I will reply promptly. Many thanks, as always, for your support of 服务器之家!
Original article: https://www.cnblogs.com/hasan/archive/2018/06/12/9175592.html