爬取央视热榜并存储到MongoDB

时间:2025-04-06 20:08:25
import re

import pymongo
import requests
  • headers = {
  • # 请求工具标识
  • "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (HTML, like Gecko) "
  • "Chrome/127.0.0.0 Safari/537.36"
  • }
  • url = '/top/?spm=C28340.PdNvWY0LYxCP.EtmP5mypaGE4.11'
  • res = (url, headers=headers)
  • con = ("utf8")
  • datas = (r'<ul>.*?</ul>', con, )
  • result = {
  • "热播榜": {
  • "name": "热播榜",
  • "items": []
  • },
  • "动画片": {
  • "name": "动画片",
  • "items": []
  • },
  • "电视剧": {
  • "name": "电视剧",
  • "items": []
  • },
  • "纪录片": {
  • "name": "纪录片",
  • "items": []
  • },
  • "特别节目": {
  • "name": "特别节目",
  • "items": []
  • }
  • }
  • # print(datas[1])
  • items = (
  • r'<li.*?lazy="(.*?)".*?<div class="text"><a href=".*?" target="_blank">(.*?)</a>'
  • r'</div>.*?<div class="column"><i class="icon_l"></i><a href=".*?" target="_blank">(.*?)</a>'
  • r'<i class="icon_r"></i></div>.*?</li>',
  • datas[1], )
  • for item in items:
  • # print(item)
  • result["热播榜"]["items"].append({
  • "img": item[0],
  • "title": item[1],
  • "category": item[2]
  • })
  • # pass
  • # print(datas[2])
  • items = (
  • r'<li.*?lazy="(.*?)" width="188" height="250"></a>.*?<span class="number"><i class="icon_l">'
  • r'</i><i class="txt">(.*?)</i><i class="icon_r"></i></span>.*?</div>.*?'
  • r'<a class="cover" href=".*?" target="_blank">.*?</a>.*?<div class="text">'
  • r'<a href=".*?" target="_blank">(.*?)</a></div>.*?<p><a href=".*?" target="_blank">(.*?)</a></p>.*?</li>'
  • , datas[2],
  • )
  • for item in items:
  • # print(item)
  • result["动画片"]["items"].append({
  • "img": item[0],
  • "title": item[2],
  • "category": item[1],
  • "synopsis": item[3]
  • })
  • # pass
  • # print(datas[3])
  • items = (
  • r'<li.*?lazy="(.*?)" width="188" height="250"></a>.*?<span class="number">'
  • r'<i class="icon_l"></i><i class="txt">(.*?)</i><i class="icon_r"></i></span>.*?</div>.*?'
  • r'<a class="cover" href=".*?" target="_blank">.*?</a>.*?<div class="text"><a href=".*?" target="_blank">(.*?)</a>'
  • r'</div>.*?<p><a href=".*?" target="_blank">(.*?)</a></p>.*?</li>'
  • , datas[3],
  • )
  • for item in items:
  • # print(item)
  • result["电视剧"]["items"].append({
  • "img": item[0],
  • "title": item[2],
  • "episode": item[1],
  • "synopsis": item[3]
  • })
  • # pass
  • # print(datas[4])
  • items = (
  • r'<li.*?lazy="(.*?)" width="188" height="250"></a>.*?<span class="number">'
  • r'<i class="icon_l"></i><i class="txt">(.*?)</i><i class="icon_r"></i>'
  • r'</span>.*?</div>.*?<a class="cover" href=".*?" target="_blank">.*?</a>.*?<div class="text">'
  • r'<a href=".*?" target="_blank">(.*?)</a></div>.*?<p><a href=".*?" target="_blank">(.*?)</a></p>.*?</li>'
  • , datas[4],
  • )
  • for item in items:
  • # print(item)
  • result["纪录片"]["items"].append({
  • "img": item[0],
  • "title": item[2],
  • "category": item[1],
  • "synopsis": item[3]
  • })
  • # pass
  • # print(datas[5])
  • items = (
  • r'<li.*?lazy="(.*?)" width="188" height="250"></a>.*?<span class="number">'
  • r'<i class="icon_l"></i><i class="txt">(.*?)</i><i class="icon_r"></i></span>.*?</div>.*?'
  • r'<a class="cover" href=".*?" target="_blank">.*?</a>.*?<div class="text"><a href=".*?" target="_blank">(.*?)</a>'
  • r'</div>.*?<p><a href=".*?" target="_blank">(.*?)</a></p>.*?</li>'
  • , datas[5],
  • )
  • for item in items:
  • # print(item)
  • result["特别节目"]["items"].append({
  • "img": item[0],
  • "title": item[2],
  • "tv": item[1],
  • "synopsis": item[3]
  • })
  • # pass
  • # print(result)
  • client = ()
  • db = client.get_default_database("cctv")
  • collection = db.get_collection("top")
  • collection.insert_one(result)
  • ()