Learning Python: scraping Sogou WeChat official account articles and storing them in MySQL
MySQL tables:
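The original post shows the table structures as a screenshot, which is not reproduced here. A minimal sketch of the schema implied by the scraper below (column names come from its SELECT/INSERT statements; the id columns, types, and lengths are assumptions) could be created with pymysql like this:

import pymysql

# Assumed schema, reconstructed from the queries in the scraper below.
# hd_gzh holds the official-account names to search for (row[1] is used as the query);
# gzh_article holds the scraped articles. Types and lengths are guesses, not from the post.
DDL = [
    """
    CREATE TABLE IF NOT EXISTS hd_gzh (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255) NOT NULL
    ) DEFAULT CHARSET=utf8
    """,
    """
    CREATE TABLE IF NOT EXISTS gzh_article (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        picture VARCHAR(512),
        author VARCHAR(255),
        content LONGTEXT
    ) DEFAULT CHARSET=utf8
    """,
]

conn = pymysql.connect(host='your database host', port=3306, user='username',
                       passwd='password', db='database name', charset='utf8')
with conn.cursor() as cursor:
    for stmt in DDL:
        cursor.execute(stmt)
conn.commit()
conn.close()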
Code:
import requests
import json
import re
import socket
import time
import pymysql
from bs4 import BeautifulSoup

# Create the database connection
conn = pymysql.connect(host='your database host', port=3306, user='username', passwd='password', db='database name', charset='utf8')
# Create a cursor
cursor = conn.cursor()
# Read the list of official accounts to scrape
cursor.execute("select * from hd_gzh")
effect_row = cursor.fetchall()

socket.setdefaulttimeout(60)
count = 1
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'}

# Abuyun IP proxy, not used for now
# proxyHost = "http-cla.abuyun.com"
# proxyPort = "9030"
# # Proxy tunnel authentication info
# proxyUser = "h56761606429t7uc"
# proxyPass = "9168eb00c4167176"
# proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
#     "host": proxyHost,
#     "port": proxyPort,
#     "user": proxyUser,
#     "pass": proxyPass,
# }
# proxies = {
#     "http": proxyMeta,
#     "https": proxyMeta,
# }

# Check whether an article with this title already exists
def checkdata(name):
    sql = "select * from gzh_article where title = '%s'"
    data = (name,)
    count = cursor.execute(sql % data)
    conn.commit()
    if count != 0:
        return False
    else:
        return True

# Insert one article
def insertdata(title, picture, author, content):
    sql = "insert into gzh_article (title,picture,author,content) values ('%s','%s','%s','%s')"
    data = (title, picture, author, content)
    cursor.execute(sql % data)
    conn.commit()
    print("Inserted one row")
    return

for row in effect_row:
    # Search Sogou Weixin for the account name stored in hd_gzh
    newsurl = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query=' + row[1] + '&ie=utf8&_sug_=n&_sug_type_='
    res = requests.get(newsurl, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # First search result links to the account's profile page
    url = 'https://weixin.sogou.com' + soup.select('.tit a')[0]['href']
    res2 = requests.get(url, headers=headers)
    res2.encoding = 'utf-8'
    soup2 = BeautifulSoup(res2.text, 'html.parser')
    # The real profile URL is assembled piece by piece in an inline script via url += '...';
    pattern = re.compile(r"url \+= '(.*?)';", re.MULTILINE | re.DOTALL)
    script = soup2.find("script")
    url2 = pattern.search(script.text).group(1)
    res3 = requests.get(url2, headers=headers)
    res3.encoding = 'utf-8'
    soup3 = BeautifulSoup(res3.text, 'html.parser')
    print()
    # The article list is embedded as JSON in "var msgList = ...;"
    pattern2 = re.compile(r"var msgList = (.*?);$", re.MULTILINE | re.DOTALL)
    script2 = soup3.find("script", text=pattern2)
    s2 = json.loads(pattern2.search(script2.text).group(1))
    # Wait 10s
    time.sleep(10)
    for news in s2["list"]:
        articleurl = "https://mp.weixin.qq.com" + news["app_msg_ext_info"]["content_url"]
        articleurl = articleurl.replace('&amp;', '&')
        res4 = requests.get(articleurl, headers=headers)
        res4.encoding = 'utf-8'
        soup4 = BeautifulSoup(res4.text, 'html.parser')
        if checkdata(news["app_msg_ext_info"]["title"]):
            insertdata(news["app_msg_ext_info"]["title"], news["app_msg_ext_info"]["cover"], news["app_msg_ext_info"]["author"], pymysql.escape_string(str(soup4)))
        count += 1
        # Wait 10s
        time.sleep(10)
        # Secondary articles pushed in the same batch
        for news2 in news["app_msg_ext_info"]["multi_app_msg_item_list"]:
            articleurl2 = "https://mp.weixin.qq.com" + news2["content_url"]
            articleurl2 = articleurl2.replace('&amp;', '&')
            res5 = requests.get(articleurl2, headers=headers)
            res5.encoding = 'utf-8'
            soup5 = BeautifulSoup(res5.text, 'html.parser')
            if checkdata(news2["title"]):
                insertdata(news2["title"], news2["cover"], news2["author"], pymysql.escape_string(str(soup5)))
            count += 1
            # Wait 10s
            time.sleep(10)
cursor.close()
conn.close()
print("Done")
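One note on the SQL above: the script builds statements with Python % string formatting, which breaks on titles containing quotes and is open to SQL injection. A small sketch of the same check/insert using pymysql's parameterized execute (my adaptation, not from the original post):

# Hypothetical rewrite of checkdata/insertdata using parameterized queries;
# cursor and conn are the same pymysql objects created above.
def checkdata(name):
    # Returns True when no article with this title exists yet
    existing = cursor.execute("select 1 from gzh_article where title = %s", (name,))
    return existing == 0

def insertdata(title, picture, author, content):
    cursor.execute(
        "insert into gzh_article (title, picture, author, content) values (%s, %s, %s, %s)",
        (title, picture, author, content),
    )
    conn.commit()
    print("Inserted one row")

With parameterized queries, the driver escapes the values itself, so the pymysql.escape_string call around the page HTML would no longer be needed.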
That is all for this article. I hope it helps with your studies, and I hope you will keep supporting 服务器之家.
Original article: https://blog.csdn.net/a2398936046/article/details/88814078