Python web scraper: scraping the first page of Wandoujia (豌豆荚) app data (the three steps of web scraping)

Date: 2022-09-18 17:29:52
'''
Scrape Wandoujia app data
- Request URL:
    page 1:
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=vbw9lj1sRQsRddx0hD-XqCNF
'''
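# The API above is paginated through the "page" query parameter while the other
# parameters stay fixed. A minimal sketch of building such a URL programmatically
# (parameter names taken from the URL above; build_url is illustrative only and
# is not used by the main block below, which keeps the hard-coded URL):
from urllib.parse import urlencode

BASE_URL = 'https://www.wandoujia.com/wdjweb/api/category/more?'

def build_url(page):
    params = {
        'catId': 6001,
        'subCatId': 0,
        'page': page,
        'ctoken': 'vbw9lj1sRQsRddx0hD-XqCNF',
    }
    return BASE_URL + urlencode(params)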
# 1. Send the request
import re
import requests
from bs4 import BeautifulSoup

def get_page(url):
    # Return the response so the caller can read .json() / .text
    return requests.get(url)

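# Note (assumption, not from the original post): sites like this often reject
# requests that lack a browser-like User-Agent header. If the request fails,
# try something along these lines:
#     response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})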
# 2. Parse the data
def parse_data(text):
    soup = BeautifulSoup(text, 'lxml')
    # print(soup)

    # Each app is rendered as an <li class="card"> element
    li_list = soup.find_all(name='li', class_='card')
    # print(li_list)

    for li in li_list:
        # print(li)

        # App name and detail-page URL
        app_name = li.find(name='a', class_='name').text
        print(app_name)
        app_url = li.find(name='a', class_='name').attrs.get('href')
        print(app_url)

        # Install count
        download_num = li.find(name='span', class_='install-count').text
        print(download_num)

        # App size, e.g. "35.5MB"
        app_size = li.find(name='span', attrs={"title": re.compile(r'\d+MB')}).text
        print(app_size)

        app_data = '''
        App name: {}
        App URL: {}
        Downloads: {}
        App size: {}
        \n
        '''.format(app_name, app_url, download_num, app_size)
        print(app_data)

        # 3. Save the data
        with open('wandoujia.text', 'a', encoding='utf-8') as f:
            f.write(app_data)
            f.flush()

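# The selectors in parse_data() assume each card looks roughly like the fragment
# below (reconstructed from the selectors themselves, not copied from the site):
#   <li class="card">
#     <a class="name" href="https://www.wandoujia.com/apps/...">App name</a>
#     <span class="install-count">... installs</span>
#     <span title="35.5MB">35.5MB</span>
#   </li>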

if __name__ == '__main__':
    # Crawl pages 1-30 of the category listing
    for page in range(1, 31):
        url = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=vbw9lj1sRQsRddx0hD-XqCNF'.format(page)
        print(url)

        # 1. Send the request to the API endpoint and get the response
        response = get_page(url)
        # print(response.text)

        # Convert the JSON response into a Python dict
        data = response.json()
        # print(data.get('state'))  # request status info

        # The HTML fragment holding the app cards is assumed to sit under the
        # JSON payload's data -> content field
        app_html = data.get('data').get('content')

        # 2. Parse the data (parse_data also writes each record to wandoujia.text)
        parse_data(app_html)
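
The "three steps" in the title (send the request, parse the data, save the data) can be made explicit by pulling the file write out of parse_data into its own function. A minimal sketch, assuming the same wandoujia.text output file; the save_data name is my own, not from the original code:

# 3. Save the data (illustrative refactor; function name is hypothetical)
def save_data(app_data):
    with open('wandoujia.text', 'a', encoding='utf-8') as f:
        f.write(app_data)
        f.flush()

# Inside parse_data, the with-block would then shrink to a single call:
#     save_data(app_data)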