python爬虫微信公众号视频

时间:2025-03-13 10:19:27
import time import json import random import csv from selenium import webdriver from lxml import html import requests import re from http import cookiejar # 获取文章 class getEssay: def __init__(self): # 获取cookies with open('','r') as f : cookie = f.read() f.close() self.cookie = json.loads(cookie) # 获取token self.header = { "HOST": "", "User-Agent": 'Mozilla / 5.0(WindowsNT6.1;WOW64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 74.0.3729.131Safari / 537.36' } m_url = '' response = requests.get(url=m_url, cookies=self.cookie) print(response); self.token = 000000; ## 填入保存的token信息。 print(self.token) # fakeid与name self.fakeid = [] # 获取公众号信息 def getGname(self): # 请求头 headers = { 'Accept': 'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Host': '', 'Referer': '/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=%d&lang=zh_CN'%int(self.token), 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest' } # 地址 url = '/cgi-bin/searchbiz?' # query = input('请输入要搜索的公众号关键字:') # begin = int(input('请输入开始的页数:')) query = '印前制作' begin = 0 begin *= 5 # 请求参数 data = { 'action': 'search_biz', 'token': self.token, 'lang': 'zh_CN', 'f': 'json', 'ajax':' 1', 'random': random.random(), 'query': query, 'begin': begin, 'count': '1' } # 请求页面,获取数据 res = requests.get(url=url, cookies=self.cookie, headers=headers, params=data) print(res.text); name_js = res.text name_js = json.loads(name_js) list = name_js['list'] for i in list: time.sleep(1) fakeid = i['fakeid'] nickname =i['nickname'] print(nickname,fakeid) self.fakeid.append((nickname,fakeid)) # 获取文章url def getEurl(self, begin): url = '/cgi-bin/appmsg?' headers = { 'Accept': 'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Connection': 'keep-alive', 'Host': '', 'Referer': '/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=%d&lang=zh_CN'%int(self.token), 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest' } # 遍历fakeid,访问获取文章链接 for i in self.fakeid: time.sleep(1) fake = i[1] data = { 'token': self.token, 'lang': 'zh_CN', 'f': 'json', 'ajax': '1', 'random': random.random(), 'action': 'list_ex', 'begin': begin, 'count': 5, 'fakeid': fake, 'type': 9 } res = requests.get(url, cookies=self.cookie, headers=headers, params=data) js = res.text print(js) link_l = json.loads(js) self.parJson(link_l) # 解析提取url def parJson(self,link_l): l = link_l['app_msg_list'] for i in l: link = i['link'] link = self.getVideo(link) name = i['title'] self.saveData(name,link) # 保存数据进csv中 def saveData(self,name,link): with open('' ,'a',encoding='utf8') as f: w = csv.writer(f) w.writerow((name,link)) print('ok') def getVideo(self, url): # 请求要下载的url地址 html = requests.get(url); # content返回的是bytes型也就是二进制的数据。 # 我用的是正则,也可以使用xpath jsonRes = html.text # 匹配:wxv_1105179750743556096 dirRe = r"wxv_.{19}" result = re.search(dirRe, jsonRes) if result: wxv = result.group(0) print(wxv) print(html) # 页面播放形式 video_url = "/mp/readtemplate?t=pages/video_player_tmpl&auto=0&vtoken operator">+ wxv print("video_url", video_url) # 页面可下载形式 video_url_temp = "/mp/videoplayer?action=get_mp_video_play_url&preview=0&__biz=MzU1MTg5NTQxNA==&mid=2247485507&idx=4&vtoken operator">+ wxv response = requests.get(video_url_temp) content = response.content.decode() content = json.loads(content) print(content) url_info = content.get("url_info") if url_info: video_url2 = url_info[0].get("url") print(video_url2) return video_url2 else: return "" else: return "" if __name__ == '__main__': G = getEssay() G.getGname() for num in range(0,20): time.sleep(1) G.getEurl(num*5)