Development tool: Python 3.4
Operating system: Windows 8
Main functionality: crawl the chapter list from the given novel page, save each chapter to a local text file, and record the crawled URLs in a local config file (a small sketch of the chapter-list extraction follows this header).
Target site: http://www.cishuge.com/
Novel: 灵棺夜行
Code source: written by me
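To show what the chapter-list step described above actually produces, here is a minimal sketch that applies the same regular expression the script uses to a made-up HTML fragment; the fragment and the chapter names in it are illustrative only, not copied from cishuge.com.

import re

# Hypothetical index-page fragment shaped like what the script's regex expects;
# the real page at http://www.cishuge.com/read/0/771/ may differ in its details.
sample_html = '''
<li><a href="1000001.html" title="第一章">第一章</a></li>
<li><a href="1000002.html" title="第二章">第二章</a></li>
'''

pattern = re.compile('<li><a href="(.*?)".*?>(.*?)</a></li>', re.S)
print(re.findall(pattern, sample_html))
# [('1000001.html', '第一章'), ('1000002.html', '第二章')]

The full script: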
import urllib.request
import http.cookiejar
import socket
import time
import re

# Global socket timeout and a short pause before the crawl starts.
timeout = 20
socket.setdefaulttimeout(timeout)
sleep_download_time = 10
time.sleep(sleep_download_time)

def makeMyOpener(head={
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }):
    # Build an opener that keeps cookies and sends browser-like headers.
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    header = []
    for key, value in head.items():
        elem = (key, value)
        header.append(elem)
    opener.addheaders = header
    return opener

def saveFile(save_path, txts):
    # Write one chapter to disk, one matched paragraph per line.
    f_obj = open(save_path, 'w+', encoding='utf-8')
    for item in txts:
        f_obj.write(item + '\n')
    f_obj.close()

# Fetch the chapter index page and extract (relative URL, chapter title) pairs.
code_list = 'http://www.cishuge.com/read/0/771/'
oper = makeMyOpener()
uop = oper.open(code_list, timeout=1000)
data = uop.read().decode('gbk', 'ignore')
pattern = re.compile('<li><a href="(.*?)".*?>(.*?)</a></li>', re.S)
items = re.findall(pattern, data)
print('Chapter list fetched')

# Load the log of URLs that were downloaded on previous runs.
url_path = 'url_file.txt'
try:
    url_r = open(url_path, 'r')
    url_arr = url_r.readlines(100000)
    url_r.close()
except FileNotFoundError:
    url_arr = []  # first run: no log file yet
print(len(url_arr))
url_file = open(url_path, 'a')
print('Loaded list of already-downloaded URLs')

for tmp in items:
    save_path = tmp[1].replace(' ', '') + '.txt'  # chapter title as file name
    url = code_list + tmp[0]
    if url + '\n' in url_arr:
        continue  # already downloaded on a previous run
    print('Logging: ' + url)
    url_file.write(url + '\n')
    opene = makeMyOpener()
    op1 = opene.open(url, timeout=1000)
    data = op1.read().decode('gbk', 'ignore')
    op1.close()
    # Each paragraph on the chapter page ends with <br />.
    pattern = re.compile(' (.*?)<br />', re.S)
    txts = re.findall(pattern, data)
    saveFile(save_path, txts)

url_file.close()
The code still has a few rough edges, but I'm sharing it anyway so we can improve it together.
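As one possible improvement, here is a hedged sketch of a more careful download step: it wraps the response in a with block, pauses between requests, and keeps the same gbk decoding and paragraph regex as the script above. The helper name fetch_chapter and the 2-second delay are my own choices, not part of the original code; the opener is assumed to come from makeMyOpener above.

import re
import time

def fetch_chapter(opener, url, delay=2):
    # Download one chapter page, extract the paragraphs, then pause briefly.
    with opener.open(url, timeout=20) as resp:
        data = resp.read().decode('gbk', 'ignore')
    pattern = re.compile(' (.*?)<br />', re.S)
    txts = re.findall(pattern, data)
    time.sleep(delay)  # be polite to the site between requests
    return txts

# Hypothetical usage with the opener and URL scheme from the script above:
# opener = makeMyOpener()
# txts = fetch_chapter(opener, 'http://www.cishuge.com/read/0/771/' + '1000001.html')
# saveFile('第一章.txt', txts)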