Web scraping: saving web page content as PDF with Python (URL to PDF), for example downloading every article in a Zhihu column
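The script below first collects the URLs of all articles in a Zhihu column (either through the column's JSON API or by extracting article links from a page with a regular expression), then renders each article to a PDF with pdfkit, which drives the wkhtmltopdf command-line tool. At its core the conversion is a single pdfkit call; a minimal sketch (the output filename here is just an example):

import pdfkit
pdfkit.from_url('https://zhuanlan.zhihu.com/p/657345052', 'article.pdf')  # lets wkhtmltopdf fetch and render the page

The full script follows.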
# -*- coding: utf-8 -*-
import requests
import re
import os
import json
import pdfkit
from collections import deque
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
    # other request-header fields can be added to this dict as well
}
# If a newly configured PATH entry has not taken effect yet (a restart is usually needed),
# the wkhtmltopdf directory can be appended to PATH for this process with the line below.
os.environ["PATH"] += os.pathsep + r'D:\wkhtmltox\bin'
def getUrls(zhuanlan):
    '''
    :param zhuanlan: e.g. for https://zhuanlan.zhihu.com/reinforcementlearning pass the trailing part, reinforcementlearning
    :return: the URLs of all articles in the column
    '''
    urls = []
    # p_titles = []
    offset = 0
    while True:
        url = 'https://zhuanlan.zhihu.com/api/columns/{}/articles?include=data&limit=100&offset={}'.format(zhuanlan, offset)
        html_string = requests.get(url, headers=HEADERS).text
        content = json.loads(html_string)  # the response body is JSON, so load it as a dict
        urls.extend([item['url'] for item in content['data']])  # every article URL can then be read from the JSON
        # p_titles.extend([item['title'] for item in content['data']])  # the titles are available the same way
        if len(content['data']) < 100:  # last page reached
            break
        else:
            offset += 100
    return urls
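# Example (the column slug is whatever follows zhuanlan.zhihu.com/ in the column URL):
# urls = getUrls('reinforcementlearning')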
def getUrls2(zhuanlan):
    '''
    :param zhuanlan: e.g. for https://zhuanlan.zhihu.com/reinforcementlearning pass the trailing part, reinforcementlearning
    :return: the URLs of all articles in the column
    '''
    urlindex = 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan)
    print('urlindex:', urlindex)
    resindex = requests.get(urlindex, headers=HEADERS)
    # print('resindex.text:', resindex.text)
    matchac = re.search(r'"articlesCount":(\d+),', resindex.text)  # read the total article count with a regex
    articlesCount = int(matchac.group(1))
    upper = articlesCount // 100 + 1  # each API page below holds up to 100 articles, so this is the page count
    urls = []
    for i in range(upper):
        urlpage = 'https://zhuanlan.zhihu.com/api/columns/{}/articles?include=data&limit={}&offset={}'.format(zhuanlan, 100, 100 * i)
        # the maximum allowed limit is 100
        respage = requests.get(urlpage, headers=HEADERS)
        respage.encoding = 'unicode_escape'
        matchurl = re.findall(r'"title":\s"[^"]+?",\s"url":\s"([^"]+?)",', respage.text)  # pull the article URLs out with a regex
        if len(matchurl) != 0:
            urls += matchurl
        else:
            html_string = requests.get(urlpage, headers=HEADERS).text
            content = json.loads(html_string)  # fall back to parsing the JSON response
            urls.extend([item['url'] for item in content['data']])  # and read the URLs from it directly
    return urls
def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36',
    }
    res = requests.get(url, headers=headers)
    return res.text
def extract_all_urls(html):
    pattern = re.compile(r'https://zhuanlan\.zhihu\.com/p/\d+')
    # pattern = re.compile(r'https://www.lz13.cn/[^\s]+.html')
    url_lst = pattern.findall(html)
    return url_lst
def get_urls_from_url(url):
    html = get_html(url)
    url_lst = extract_all_urls(html)
    return url_lst
def get_all_urls(web_site):
    url_access_set = set()  # URLs that have already been visited
    queue_url_set = set()
    url_lst = get_urls_from_url(web_site)
    url_access_set.add(web_site)
    queue = deque()
    for url in url_lst:
        queue.append(url)
        queue_url_set.add(url)
    # while len(queue) != 0:
    #     print(len(queue))
    #     url = queue.popleft()
    #     if url in url_access_set:
    #         continue
    #
    #     url_access_set.add(url)
    #     url_lst = get_urls_from_url(url)
    #     for url in url_lst:
    #         if url not in queue_url_set:
    #             queue.append(url)
    #             queue_url_set.add(url)
    return list(queue_url_set)
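# Note: with the breadth-first loop above left commented out, get_all_urls only returns the
# article links found on the seed page itself; uncommenting the loop makes it crawl the
# discovered pages recursively.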
def saveArticlesPdf(urls, target_path):
    os.makedirs(target_path, exist_ok=True)
    for i, url in enumerate(urls):
        print('[ {} / {} ] processing'.format(str(i + 1).zfill(3), len(urls)))
        content = requests.get(url, headers=HEADERS).text
        # print('content:', content)
        try:
            title = re.search(r'<h1\sclass="Post-Title">(.+)</h1>', content).group(1)
        except Exception as e:
            print('error content:', content)
            continue  # no title found, so skip this article instead of reusing a stale title
        content = content.replace('<noscript>', '')  # work around images failing to download (their paths are relative)
        content = content.replace('</noscript>', '')
        rstr = r"[\/\\\:\*\?\"\<\>\|]"  # characters that are illegal in filenames: / \ : * ? " < > |
        title = re.sub(rstr, " ", title)
        title = title.strip()  # tidy up leftover whitespace before using the title as a filename
        print('title:', title)
        try:
            # option 1: call the wkhtmltopdf command directly
            # os.system('wkhtmltopdf {} {}'.format(content, target_path + '/{}.pdf'.format(title)))
            # option 2: go through the pdfkit package
            pdfkit.from_string(content, target_path + '/{}.pdf'.format(title))
        except ValueError as e:
            print(title, e)
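# A possible refinement (not part of the original script): wkhtmltopdf flags can be passed to
# pdfkit through its options dict, for example to force UTF-8 and suppress the per-page
# progress output:
# pdfkit.from_string(content, target_path + '/{}.pdf'.format(title),
#                    options={'encoding': 'utf-8', 'quiet': ''})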
if __name__ == '__main__':
    zhuanlan = 'reinforcementlearning'
    # urls = getUrls(zhuanlan)
    # urls = getUrls2(zhuanlan)
    urls = get_all_urls('https://zhuanlan.zhihu.com/p/657345052')
    saveArticlesPdf(urls, r'E:\save\{}'.format(zhuanlan))
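Running the script requires the requests and pdfkit packages plus a working wkhtmltopdf installation (pdfkit is only a wrapper around the wkhtmltopdf binary). Each article is then written to E:\save\reinforcementlearning as <title>.pdf, with characters that are illegal in filenames replaced by spaces.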