本文实例为大家分享了python下载微信公众号相关文章的具体代码,供大家参考,具体内容如下
目的:从零开始学自动化测试公众号中下载“pytest”一系列文档
1、搜索微信号文章关键字搜索
2、对搜索结果前n页进行解析,获取文章标题和对应url
主要使用的是requests和bs4中的beautifulsoup
weixin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
import re
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

from weixinspider.html2doc import myhtmlparser
class weixinspider(object):
    """Search Sogou Weixin for a public account's articles and collect
    title -> URL pairs whose title contains a given keyword."""

    def __init__(self, gzh_name, pageno, keyword):
        """
        gzh_name: query string typed into the Sogou Weixin search box
        pageno:   number of result pages to crawl (pages 1..pageno)
        keyword:  case-insensitive substring the article title must contain
        """
        self.gzh_name = gzh_name
        self.pageno = pageno
        self.keyword = keyword.lower()
        self.page_url = []       # search-result page URLs, filled by get_page_url()
        self.article_list = []   # one {title: url} dict per page, filled by get_article_url()
        self.headers = {
            'user-agent': 'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) chrome/70.0.3538.110 safari/537.36'}
        self.timeout = 5
        # Characters illegal in Windows file names (the title is later used
        # as a .doc file name), plus CR/LF.
        self.pattern = r'[\\/:*?"<>|\r\n]+'

    def get_page_url(self):
        """Build the Sogou Weixin search-result URL for each page 1..pageno."""
        for i in range(1, self.pageno + 1):
            # e.g. https://weixin.sogou.com/weixin?query=...&type=2&page=2&ie=utf8
            url = "https://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%s&ie=utf8" \
                  % (quote(self.gzh_name), i)
            self.page_url.append(url)

    def get_article_url(self):
        """Fetch each result page and record matching article titles and URLs."""
        for url in self.page_url:
            # One dict per result page.  The original created a single dict
            # outside the loop and appended it every iteration, so
            # article_list held N aliases of the same dict and the download
            # loop processed every article once per page.
            article = {}
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            result = BeautifulSoup(response.text, 'html.parser')
            articles = result.select('ul[class="news-list"] > li > div[class="txt-box"] > h3 > a ')
            for a in articles:
                if self.keyword in a.text.lower():
                    # Strip characters that cannot appear in a file name.
                    new_text = re.sub(self.pattern, "", a.text)
                    article[new_text] = a["href"]
            self.article_list.append(article)
# --- script entry: crawl the search results and save each article as a .doc ---
headers = {'user-agent':
           'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) chrome/70.0.3538.110 safari/537.36'}
timeout = 5
gzh_name = 'pytest文档'
my_gzh = weixinspider(gzh_name, 5, 'pytest')
my_gzh.get_page_url()
# print(my_gzh.page_url)
my_gzh.get_article_url()
# print(my_gzh.article_list)
for article in my_gzh.article_list:
    for (title, url) in article.items():
        html_response = requests.get(url, headers=headers, timeout=timeout)
        # NOTE: the original wrote "myhtmlparser = myhtmlparser(key)", which
        # rebound the CLASS name to an instance and crashed on the second
        # iteration; a distinct variable keeps the class callable.
        parser = myhtmlparser(title)
        parser.feed(html_response.text)
        parser.doc.save(parser.docfile)
|
html2doc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
import re
from html.parser import HTMLParser

import docx
import requests
from docx import Document
from docx.shared import RGBColor
class myhtmlparser(HTMLParser):
    """Convert one WeChat article's HTML into a Word document.

    <h*> headings become level-2 headings, <p> text becomes paragraphs,
    <code> text becomes grey runs, and self-closing <img> tags are
    downloaded and embedded.  The caller saves the result via
    ``self.doc.save(self.docfile)``.
    """

    def __init__(self, docname):
        HTMLParser.__init__(self)
        self.docname = docname
        # Output path; the caller already stripped file-name-illegal chars.
        self.docfile = r"d:\pytest\%s.doc" % self.docname
        self.doc = Document()
        self.title = False        # currently inside an <h*> tag
        self.code = False         # currently inside a <code> tag
        self.text = ''            # accumulated text of the current <p>
        self.processing = None    # tag being accumulated ('p' or None)
        self.codeprocessing = None
        self.picindex = 1         # sequence number for downloaded images
        self.headers = {
            'user-agent': 'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) chrome/70.0.3538.110 safari/537.36'}
        self.timeout = 5

    def handle_startendtag(self, tag, attrs):
        # WeChat images are self-closing <img>: "data-type" carries the file
        # extension, "data-src" the download URL (data-type comes first).
        if tag == "img" and attrs:
            picname = None
            for (variable, value) in attrs:
                if variable == "data-type":
                    picname = r"d:\pytest\%s%s.%s" % (self.docname, self.picindex, value)
                    # print(picname)
                if variable == "data-src":
                    # Guard: the original raised NameError here when an <img>
                    # had no data-type attribute (picname never assigned).
                    if picname is None:
                        continue
                    picdata = requests.get(value, headers=self.headers, timeout=self.timeout)
                    self.picindex = self.picindex + 1
                    with open(picname, "wb") as pic:
                        pic.write(picdata.content)
                    try:
                        self.doc.add_picture(picname)
                    except docx.image.exceptions.UnexpectedEndOfFileError as e:
                        print(e)

    def handle_starttag(self, tag, attrs):
        if re.match(r"h(\d)", tag):
            self.title = True
        if tag == "p":
            self.processing = tag
        if tag == "code":
            self.code = True
            self.codeprocessing = tag

    def handle_data(self, data):
        if self.title:
            self.doc.add_heading(data, level=2)
        if self.processing:
            self.text = self.text + data
        if self.code:
            p = self.doc.add_paragraph()
            run = p.add_run(data)
            run.font.color.rgb = RGBColor(111, 111, 111)

    def handle_endtag(self, tag):
        # Any end tag closes a heading; headings never nest in these articles.
        self.title = False
        if tag == self.processing:
            self.doc.add_paragraph(self.text)
            self.processing = None
            self.text = ''
        if tag == self.codeprocessing:
            self.code = False
|
运行结果:
缺少部分文档,如pytest文档4,是因为搜狗微信文章搜索结果中就没有
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/yaoliuwei1426/article/details/84707163