Introduction
I wrote this small crawler mainly to grab internship postings from the campus forum; it is built primarily on the Requests library.
Source code
URLs.py
Its job is to take an initial URL (one that contains a page query parameter) and build the list of URLs from the current page number up to pageNum.
import re

def getURLs(url, attr, pageNum=1):
    # Given an initial url containing a page-number parameter (attr),
    # return the list of urls from the current page up to pageNum.
    all_links = []
    try:
        # current page number, e.g. page=1 -> 1
        now_page_number = int(re.search(attr + '=(\d+)', url).group(1))
        for i in range(now_page_number, pageNum + 1):
            # substitute the page number with i to build each page's url
            new_url = re.sub(attr + '=\d+', attr + '=%s' % i, url)
            all_links.append(new_url)
        return all_links
    except TypeError:
        print "arguments TypeError: attr should be a string."
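As a quick sanity check (assuming the code above is saved as URLs.py and is importable), calling getURLs with the board URL used later in the crawler expands the page parameter like this:

from URLs import getURLs   # assumes URLs.py is on the import path

links = getURLs('http://www.cc98.org/list.asp?boardid=459&page=1&action=', 'page', 3)
for link in links:
    print link
# http://www.cc98.org/list.asp?boardid=459&page=1&action=
# http://www.cc98.org/list.asp?boardid=459&page=2&action=
# http://www.cc98.org/list.asp?boardid=459&page=3&action=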
uni_2_native.py
The Chinese text in the pages fetched from the forum comes back as numeric character references of the form &#XXXX;, so the downloaded content still has to be converted into readable text.
import sys
import re

# Python 2 only: set the default encoding to utf-8 so the decoded
# Chinese characters can be mixed with byte strings without errors
reload(sys)
sys.setdefaultencoding('utf-8')

def get_native(raw):
    tostring = raw
    while True:
        # find the next numeric character reference, e.g. &#23454;
        obj = re.search('&#(.*?);', tostring, flags=re.S)
        if obj is None:
            break
        else:
            raw, code = obj.group(0), obj.group(1)
            # replace it with the corresponding unicode character
            tostring = re.sub(raw, unichr(int(code)), tostring)
    return tostring
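A minimal sketch of how the conversion behaves (the input string below is just an illustrative example, not taken from the forum):

from uni_2_native import get_native   # assumes uni_2_native.py is on the import path

raw = u'&#23454;&#20064;'      # two numeric character references
print get_native(raw)          # prints the decoded text: 实习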
Saving the results to MySQL: saveInfo.py
# -*- coding: utf-8 -*-
import MySQLdb

# despite the saveSqlite name, this class collects records in memory
# and writes them to a MySQL database via MySQLdb
class saveSqlite():
    def __init__(self):
        self.infoList = []

    # cache one topic as a dict; all four text fields are required
    def saveSingle(self, author=None, title=None, date=None, url=None, reply=0, view=0):
        if author is None or title is None or date is None or url is None:
            print "No info saved!"
        else:
            singleDict = {}
            singleDict['author'] = author
            singleDict['title'] = title
            singleDict['date'] = date
            singleDict['url'] = url
            singleDict['reply'] = reply
            singleDict['view'] = view
            self.infoList.append(singleDict)

    def toMySQL(self):
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
        cursor = conn.cursor()
        # sql = "select * from info"
        # n = cursor.execute(sql)
        # for row in cursor.fetchall():
        #     for r in row:
        #         print r
        #     print '\n'
        # clear out the old records before inserting the freshly crawled ones
        sql = "delete from info"
        cursor.execute(sql)
        conn.commit()
        sql = "insert into info(title,author,url,date,reply,view) values (%s,%s,%s,%s,%s,%s)"
        params = []
        for each in self.infoList:
            params.append((each['title'], each['author'], each['url'], each['date'], each['reply'], each['view']))
        cursor.executemany(sql, params)
        conn.commit()
        cursor.close()
        conn.close()

    def show(self):
        for each in self.infoList:
            print "author: " + each['author']
            print "title: " + each['title']
            print "date: " + each['date']
            print "url: " + each['url']
            print "reply: " + str(each['reply'])
            print "view: " + str(each['view'])
            print '\n'

if __name__ == '__main__':
    save = saveSqlite()
    save.saveSingle('网', 'aaa', '2008-10-10 10:10:10', 'www.baidu.com', 1, 1)
    # save.show()
    save.toMySQL()
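toMySQL assumes a table named info already exists in the db_name database. The column names are fixed by the insert statement above; the column types below are my own guesses, so adjust them to your data. A one-off setup script might look like this:

# create_table.py - one-off setup for the info table (column types are assumptions)
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306,
                       db='db_name', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    create table if not exists info (
        title  varchar(255),
        author varchar(64),
        url    varchar(255),
        date   varchar(32),
        reply  int,
        view   int
    )
""")
conn.commit()
cursor.close()
conn.close()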
Main crawler code
import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# forge a request header for the site you want to crawl
# (fill in the values copied from your own browser session)
headers = {
    'Accept': '',
    'Accept-Encoding': '',
    'Accept-Language': '',
    'Connection': '',
    'Cookie': '',
    'Host': '',
    'Referer': '',
    'Upgrade-Insecure-Requests': '',
    'User-Agent': ''
}

url = 'http://www.cc98.org/list.asp?boardid=459&page=1&action='
cc98 = 'http://www.cc98.org/'

print "get information from cc98..."
urls = URLs.getURLs(url, "page", 50)
savetools = saveInfo.saveSqlite()

for url in urls:
    r = requests.get(url, headers=headers)
    # decode the &#XXXX; numeric references before parsing
    html = uni_2_native.get_native(r.text)
    selector = etree.HTML(html)
    content_tr_list = selector.xpath('//form/table[@class="tableborder1 list-topic-table"]/tbody/tr')
    for each in content_tr_list:
        href = each.xpath('./td[2]/a/@href')
        if len(href) == 0:
            continue
        else:
            # print len(href)
            # not ideal to use a for loop here since the list holds just one element,
            # but I could not get the data by index
            for each_href in href:
                link = cc98 + each_href
                # the title attribute carries title, author and date on separate lines
                title_author_time = each.xpath('./td[2]/a/@title')
                # print len(title_author_time)
                for info in title_author_time:
                    info_split = info.split('\n')
                    title = info_split[0][1:len(info_split[0]) - 1]
                    author = info_split[1][3:]
                    date = info_split[2][3:]
                    # td[4] holds the "reply/view" counters
                    hot = each.xpath('./td[4]/text()')
                    # print len(hot)
                    for hot_num in hot:
                        reply_view = hot_num.strip().split('/')
                        reply, view = reply_view[0], reply_view[1]
                        savetools.saveSingle(author=author, title=title, date=date, url=link, reply=reply, view=view)

print "All got! Now saving to Database..."
# savetools.show()
savetools.toMySQL()
print "ALL CLEAR! Have Fun!"
That is all for this article; I hope it is helpful for your study.
Original post: https://blog.csdn.net/qq_22187919/article/details/60466283