爬取小红书

时间:2024-03-12 11:04:57

1.打开要爬取的网页：https://tophub.today/n/L4MdA5ldxD

2.按F12打开开发者工具，在 Network（网络）面板中查看请求，获取 headers（如 user-agent）

3.右键查看网页源代码，找到标题所在的标签（class 为 t 的 span）

4.代码实现

import requests
import pandas as pd
from bs4 import BeautifulSoup
from pandas import DataFrame
# Page to scrape (the hot-list page referenced in step 1 of the notes above).
url = 'https://tophub.today/n/L4MdA5ldxD'
def getHTMLText(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text on success, or the sentinel string '异常' when the
        request fails (connection problem, timeout, invalid URL, or an HTTP
        error status reported by ``raise_for_status``).
    """
    # requests needs a mapping of header name -> value. The previous version
    # passed a set containing a single "name: value" string, which requests
    # cannot send, so every call fell through to the error branch.
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 '
            'SE 2.X MetaSr 1.0'
        ),
    }
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
    except requests.RequestException:
        # Keep the original contract: callers receive a sentinel string
        # instead of an exception when the download fails.
        return '异常'
    # Force UTF-8 rather than relying on the encoding guessed from headers.
    r.encoding = 'utf-8'
    return r.text
def saveHTMLText(title, html, c):
    """Collect up to ``c`` ranked headlines from ``html`` into ``title``.

    Every ``<span class="t">`` element in the document is treated as one
    headline, in document order.

    Args:
        title: List that is extended in place with ``[rank, headline]`` rows;
            ranks start at 1.
        html: HTML text of the page to parse.
        c: Maximum number of rows to collect.
    """
    soup = BeautifulSoup(html, 'html.parser')
    spans = soup.find_all('span', class_='t')
    print('排名', '标题')
    # Slicing (rather than indexing 0..c-1) keeps this safe when the page
    # contains fewer than ``c`` matching elements, e.g. after a failed fetch.
    for rank, span in enumerate(spans[:c], start=1):
        text = span.get_text(strip=True)
        print(rank, text)
        title.append([rank, text])


# Script entry point. These statements were previously indented into the
# function body, which made the function call itself without end and used
# ``html`` before it had been fetched.
if __name__ == '__main__':
    title = []
    html = getHTMLText(url)
    saveHTMLText(title, html, c=10)
    df = pd.DataFrame(title, columns=['排名', '标题'])
    print(df.T)