Python爬虫——小说

时间:2023-01-18 15:16:13

#encoding:utf8

import re

import urllib2

url = 'http://www.23us.com/html/55/55304/'

request = urllib2.Request(url)

response = urllib2.urlopen(request)

content = response.read().decode('gbk')

the_url = re.compile('<td class=\"L"\><a href=\"(.*?)"\>.*?</a></td>',re.S) last_url = the_url.findall(content)

for i in last_url:

print i

url = 'http://www.23us.com/html/55/55304/'+i

request = urllib2.Request(url)

response = urllib2.urlopen(request)

zhi = response.read()

code = re.compile('.*?content="text.html; charset=(.*?)".*?',re.S)

last_code = code.findall(zhi)[0]

try:

content = zhi.decode(''+last_code)

except:

try:

content = zhi.decode('gb2312')

except:

continue

last_content = re.compile('<title>(.*?)</title>.*?<dd id="contents">(.*?)</dd>',re.S)

last_content = last_content.findall(content)

if last_content==[]:

print '采集失败'

print content

for I,J in last_content:

J = J.replace('&nbsp;','').replace('<br/> <br/>','\n')

file = open('小说.txt','a+')

t = '\n\n\t\t' + I + '\n\n' + '\t' + J

file.write(t.encode('utf-8'))

file.close()