The goal is to download every novel in every category of a site, automatically creating one directory per category and saving each novel as a .txt file named after the book.
I. Crawling approach:
My approach was to find a novel site through a Baidu search, open the chapter page of one novel, and use requests and BeautifulSoup to check that it can be downloaded normally. If it downloads fine, move on to the next step.
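A quick sanity check along those lines might look like the sketch below (the chapter URL is just the example used later in this post; expect a 200 status and a readable title if the site cooperates):

import requests
from bs4 import BeautifulSoup

resp = requests.get('http://www.fhxiaoshuo.com/read/3/3414/6127874.shtml')
print(resp.status_code)             # 200 means the page downloaded normally
soup = BeautifulSoup(resp.text, 'lxml')
print(soup.title)                   # if this prints a sensible <title>, parsing works too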
II. Step by step:
1. Import the modules and set up the request headers:
from bs4 import BeautifulSoup
import requests
import time
import os
import random

my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",  # the comma matters: without it Python silently concatenates this string with the next one
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"
]
headers = {
    'User-Agent': random.choice(my_headers)  # pick a random User-Agent so the requests look like an ordinary browser
}
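Note that random.choice() runs once when the script starts, so every request in a run carries the same User-Agent. If you would rather rotate it per request, a tiny helper works (a sketch; pick_headers is my own name, not part of the original script):

def pick_headers():
    # build a fresh headers dict with a randomly chosen User-Agent
    return {'User-Agent': random.choice(my_headers)}

# usage: requests.get(url, headers=pick_headers())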
2. Fetch one chapter page of a novel and write it to the target path:
url = 'http://www.fhxiaoshuo.com/read/3/3414/6127874.shtml'  # a single chapter page used for the test
data = requests.get(url, headers=headers)
time.sleep(2)
soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'), 'lxml')  # note the .encode('ISO-8859-1').decode('GB18030') trick for this GB-encoded site
text = soup.select('div.zhangjieTXT')[0].text
title2 = soup.select('div.zhangjieming > h1')[0].text

# Strip the junk the site injects into the chapter body: a leftover ad call plus
# watermark characters and whitespace noise.
text = text.replace('ads_wz_2();', '')
junk_chars = set('\r\xa0\t\n“■◆nhub,∧♀※')
text = ''.join(i for i in text if i not in junk_chars)

print('Downloading {}'.format(title2))
with open('.\\books\\' + 'title1' + '.txt', 'ab+') as f:  # 'title1' is only a placeholder filename at this stage
    f.write((title2 + '\r\n').encode())  # chapter title
    f.write(text.encode())               # chapter body
    f.write('\r\n\r\n'.encode())         # blank line between chapters
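The .encode('ISO-8859-1').decode('GB18030') round-trip is needed because the page is GB-encoded while requests falls back to ISO-8859-1 when the response headers don't declare a charset. An equivalent and arguably clearer option (a sketch, same assumption about the site's encoding) is to tell requests the real encoding, or decode the raw bytes yourself:

data = requests.get(url, headers=headers)
# either override the guessed encoding before touching data.text ...
data.encoding = 'GB18030'
soup = BeautifulSoup(data.text, 'lxml')
# ... or decode the raw bytes directly:
# soup = BeautifulSoup(data.content.decode('GB18030', errors='replace'), 'lxml')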
3. Collect all chapter links of one novel:
def get_urls(url, fenlei_title):
    # e.g. url = 'http://www.fhxiaoshuo.com/read/3/3414/'  (a book's chapter index page)
    data = requests.get(url, headers=headers)
    time.sleep(2)
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'), 'lxml')
    title1 = soup.select('div#maininfo > div > h1')[0].text
    if not os.path.exists('.\\books\\' + fenlei_title + '\\' + title1):
        os.mkdir('.\\books\\' + fenlei_title + '\\' + title1)
    links = soup.select('div#list > dl')
    print("Downloading {}".format(title1))
    for i in links:
        data = i.select('dd > a')
        time.sleep(2)
        for m in data:
            url = m.get('href')
            get_text(url, title1, fenlei_title)
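get_urls() assumes the href values on the chapter list are absolute URLs. If the site ever serves relative paths, urljoin will normalize either form before the link is handed to get_text() (a defensive sketch, not something the original code does):

from urllib.parse import urljoin

chapter_url = urljoin(url, m.get('href'))   # absolute hrefs pass through, relative ones are resolved against the index page
get_text(chapter_url, title1, fenlei_title)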
4. Fetch every novel in one category, e.g. the wuxia category:
def get_list(url, fenlei_title):
    # e.g. url = 'http://www.fhxiaoshuo.com/sort/5/1/'  (a category listing page)
    data = requests.get(url, headers=headers)
    time.sleep(1)
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'), 'lxml')
    links = soup.select('div#alist')
    for i in links:
        data = i.select('div.info > div.title > h2 > a')
        for m in data:
            url = m.get('href')
            time.sleep(3)
            get_urls(url, fenlei_title)
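The trailing /1/ in the example URL looks like a page number, and get_list() only walks the single page it is given. If the category really is paginated (I haven't confirmed the site's scheme), a thin wrapper could iterate pages until one stops responding (a rough sketch; get_category and max_pages are hypothetical):

def get_category(base_url, fenlei_title, max_pages=50):
    # base_url like 'http://www.fhxiaoshuo.com/sort/5/' (hypothetical, ending before the page number)
    for page in range(1, max_pages + 1):
        page_url = '{}{}/'.format(base_url, page)
        resp = requests.get(page_url, headers=headers)
        if resp.status_code != 200:      # assume a missing page means we ran past the last one
            break
        get_list(page_url, fenlei_title)
        time.sleep(1)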
5. Collect all category links from the homepage:
def get_fenlei():
    url = 'http://www.fhxiaoshuo.com/'
    data = requests.get(url, headers=headers)
    time.sleep(0.5)
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'), 'lxml')
    links = soup.select('div.nav1 > ul')
    for i in links:
        data = i.select('li > a')
        for m in data:
            url = m.get('href')
            time.sleep(1)
            fenlei_title = m.text
            if not os.path.exists('.\\books\\' + fenlei_title):
                os.makedirs('.\\books\\' + fenlei_title)   # makedirs also creates .\books itself if it is missing
            get_list(url, fenlei_title)
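Since fenlei_title and title1 come straight from page text and are used verbatim as Windows directory names, a title containing characters like ?, *, or : would make the mkdir calls fail. A small sanitizer is cheap insurance (a sketch; safe_name is my own helper, not in the original script):

import re

def safe_name(name):
    # drop characters Windows forbids in file and directory names
    return re.sub(r'[\\/:*?"<>|]', '', name).strip()

# usage: os.makedirs('.\\books\\' + safe_name(fenlei_title))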
III. The complete code (time.sleep() calls pace the requests so the site isn't hit too hard):
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: ss
from bs4 import BeautifulSoup
import requests
import time
import os
import random

my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",  # keep the comma, or this string merges with the next one
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"
]
headers = {
    'User-Agent': random.choice(my_headers)  # random User-Agent
}


def get_text(url, title1, fenlei_title):
    # e.g. url = 'http://www.fhxiaoshuo.com/read/3/3414/6127874.shtml'  (one chapter page)
    data = requests.get(url, headers=headers)
    time.sleep(2)
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'), 'lxml')
    text = soup.select('div.zhangjieTXT')[0].text
    title2 = soup.select('div.zhangjieming > h1')[0].text
    # strip the injected ad call and the site's watermark/whitespace noise
    text = text.replace('ads_wz_2();', '')
    junk_chars = set('\r\xa0\t\n“■◆nhub,∧♀※')
    text = ''.join(i for i in text if i not in junk_chars)
    print('Downloading {}'.format(title2))
    with open('.\\books\\' + fenlei_title + '\\' + title1 + '\\' + title1 + '.txt', 'ab+') as f:
        f.write((title2 + '\r\n').encode())  # chapter title
        f.write(text.encode())               # chapter body
        f.write('\r\n\r\n'.encode())         # blank line between chapters


def get_urls(url, fenlei_title):
    # e.g. url = 'http://www.fhxiaoshuo.com/read/3/3414/'  (a book's chapter index)
    data = requests.get(url, headers=headers)
    time.sleep(2)
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'), 'lxml')
    title1 = soup.select('div#maininfo > div > h1')[0].text
    if not os.path.exists('.\\books\\' + fenlei_title + '\\' + title1):
        os.mkdir('.\\books\\' + fenlei_title + '\\' + title1)
    links = soup.select('div#list > dl')
    print("Downloading {}".format(title1))
    for i in links:
        data = i.select('dd > a')
        time.sleep(2)
        for m in data:
            url = m.get('href')
            get_text(url, title1, fenlei_title)


def get_list(url, fenlei_title):
    # e.g. url = 'http://www.fhxiaoshuo.com/sort/5/1/'  (a category listing page)
    data = requests.get(url, headers=headers)
    time.sleep(1)
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'), 'lxml')
    links = soup.select('div#alist')
    for i in links:
        data = i.select('div.info > div.title > h2 > a')
        for m in data:
            url = m.get('href')
            time.sleep(3)
            get_urls(url, fenlei_title)


def get_fenlei():
    url = 'http://www.fhxiaoshuo.com/'
    data = requests.get(url, headers=headers)
    time.sleep(0.5)
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'), 'lxml')
    links = soup.select('div.nav1 > ul')
    for i in links:
        data = i.select('li > a')
        for m in data:
            url = m.get('href')
            time.sleep(1)
            fenlei_title = m.text
            if not os.path.exists('.\\books\\' + fenlei_title):
                os.makedirs('.\\books\\' + fenlei_title)  # makedirs also creates .\books if it is missing
            get_list(url, fenlei_title)


get_fenlei()
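On a long crawl, a single timed-out request will crash the script and lose the current book's progress. A small wrapper around requests.get that retries and randomizes the pause would make it more robust (a sketch; fetch is my own name and the retry counts are arbitrary):

def fetch(url, retries=3):
    # requests.get with a random User-Agent, a timeout, a random pause and a simple retry
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers={'User-Agent': random.choice(my_headers)}, timeout=10)
            resp.raise_for_status()
            time.sleep(random.uniform(1, 3))   # randomized pause, slightly gentler than a fixed sleep
            return resp
        except requests.RequestException:
            time.sleep(2 * (attempt + 1))      # back off and try again
    raise RuntimeError('failed to fetch {} after {} attempts'.format(url, retries))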