import os
import re

import requests
from lxml import etree
# href = "(.*?)" title = "(.*?)"</a>
class Novel_Data(object):
    """Download the chapters of one 17k.com novel into ``./zxy/<title>.txt``.

    The chapter index at ``self.url`` is fetched, the chapter links that
    follow the "正文" (main text) marker are extracted with regular
    expressions, and each chapter page is fetched and appended to a text
    file named after the chapter title.
    """

    # Seconds to wait on any single HTTP request; without a timeout a stalled
    # connection would block the whole run indefinitely.
    _REQUEST_TIMEOUT = 10

    def __init__(self):
        self.url = "https://www.17k.com/list/3495367.html"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/94.0.4606.71 Safari/537.36 SE 2.X MetaSr 1.0 "
        }

    def get_data_index(self):
        """Fetch the chapter-index page.

        Returns:
            The page HTML decoded as UTF-8, or ``None`` when the server
            answers with a status other than 200.

        Raises:
            requests.RequestException: on connection failure or timeout.
        """
        response = requests.get(
            self.url, headers=self.headers, timeout=self._REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            return None
        # Decode the same way the chapter pages are decoded instead of
        # relying on the charset guessed from the response headers.
        response.encoding = "utf-8"
        return response.text

    @staticmethod
    def _extract_chapter_links(html):
        """Return ``(href, title)`` pairs for the links after the "正文" marker.

        All whitespace is removed from *html* before matching, so the
        returned titles contain no spaces. An empty list is returned when
        the marker is absent.
        """
        compact = re.sub(r'\s+', '', html)
        body = re.search(r'正文.*', compact)
        if body is None:
            return []
        # NOTE(review): each title is presumably followed by an `&#10;`
        # entity (a tooltip line break) in the page markup -- confirm against
        # a live index page. The closing quote is accepted as a fallback so a
        # title without the entity cannot swallow the markup that follows it.
        return re.findall(
            r'_blank"href="(.*?)"title="(.*?)(?:&#10;|")', body.group()
        )

    def parse_novel_data_index(self, r):
        """Download every chapter linked from the index HTML *r*.

        Args:
            r: HTML text of the index page, or ``None`` if it could not be
                fetched. A missing page or one without chapter links is
                reported on stdout and nothing is downloaded.
        """
        if not r:
            print("目录页获取失败")
            return
        chapters = self._extract_chapter_links(r)
        if not chapters:
            print("未找到章节链接")
            return
        # The final match is skipped; presumably it is not a real chapter
        # link -- TODO confirm against the live page.
        for href, name in chapters[:-1]:
            self.get_novel_content("https://www.17k.com" + href, name)

    def get_novel_content(self, new_href, name):
        """Fetch one chapter page and append its text to the chapter's file.

        Args:
            new_href: Absolute URL of the chapter page.
            name: Chapter title, used as the output file name.

        Raises:
            requests.RequestException: on connection failure or timeout.
        """
        response = requests.get(
            new_href, headers=self.headers, timeout=self._REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            print(name, "保存失败")
            return
        response.encoding = "utf-8"
        compact = re.sub(r'\s+', '', response.text)
        # Whitespace is already gone, so the text to keep is whatever sits
        # between a <p> and the paragraph carrying class="copy".
        txt_list = re.findall(r'<p>(.*?)</p><pclass="copy">', compact, re.S)
        if not txt_list:
            # Report failure instead of claiming success for an empty page.
            print(name, "保存失败")
            return
        for data in txt_list:
            # Paragraph boundaries become line breaks in the saved text.
            self.write_txt(name, data.replace('</p><p>', '\n'))
        print(name, "保存成功")

    def write_txt(self, name, s):
        """Append *s* to ``./zxy/<name>.txt``, creating the directory if needed.

        Characters that are not allowed in file names on common platforms
        (``\\ / : * ? " < > |``) are replaced with ``_``. The file is opened
        in append mode, so repeated runs add to existing files.
        """
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        os.makedirs("./zxy", exist_ok=True)
        with open("./zxy/" + safe_name + ".txt", "a", encoding="utf-8") as f:
            f.write(s)

    def run(self):
        """Fetch the chapter index and download every chapter it lists."""
        self.parse_novel_data_index(self.get_data_index())
if __name__ == "__main__":
    # Script entry point: build the scraper and download the whole novel.
    Novel_Data().run()