Python Crawler Practice: Scraping All of a User's Weibo Posts

Date: 2022-01-25 21:30:20

Ajax, short for Asynchronous JavaScript and XML, is not a programming language. It is a technique that uses JavaScript to exchange data with a server and update parts of a web page without reloading the page or changing its URL.

On a traditional web page, updating the content means reloading the entire page. With Ajax, the content can be updated without a full refresh: the page exchanges data with the server in the background, then uses JavaScript to modify the page, so the visible content updates in place.

You can try a few examples on W3School to get a feel for it: http://www.w3school.com.cn/ajax/ajax_xmlhttprequest_send.asp
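This crawler exploits exactly that mechanism: instead of scraping rendered HTML, it calls the same Ajax (XHR) endpoint that the mobile Weibo page requests in the background and reads the JSON it returns. Here is a minimal sketch of the idea; the endpoint, uid, and containerid are the ones used in the full script below, and Weibo may change the API or require cookies, so treat it as illustrative:

import requests

# The XHR endpoint the mobile page (m.weibo.cn) calls in the background.
url = 'https://m.weibo.cn/api/container/getIndex'
params = {
    'type': 'uid',
    'value': '2830678474',              # target user's uid
    'containerid': '1076032830678474',  # '107603' + uid
    'page': 1,
}
resp = requests.get(url, params=params,
                    headers={'X-Requested-With': 'XMLHttpRequest'})
data = resp.json()        # the endpoint returns JSON, not HTML
print(list(data.keys()))  # e.g. ['ok', 'data']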

The full code is as follows:

#!/usr/bin/env python
# coding: utf-8

import requests
from pyquery import PyQuery as pq
import pymysql
from pymongo import MongoClient
import time

'''
Scrape all of a user's Weibo posts by analysing the Ajax requests
made by the mobile site (m.weibo.cn).
'''


headers = {
    'Host': 'm.weibo.cn',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',  # marks the request as Ajax
    'Referer': 'https://m.weibo.cn/u/2830678474'
}

def get_page(page):
    '''
    Request the Ajax API and return the parsed JSON.
    :param page: page number to fetch
    :return: JSON dict on success, None otherwise
    '''
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = 'https://m.weibo.cn/api/container/getIndex'
    try:
        res = requests.get(url, headers=headers, params=params)
        if res.status_code == 200:
            return res.json()
    except requests.ConnectionError as e:
        print('Error', e.args)


def parse_page(json, page):
    '''
    Pull the fields we care about out of each card's mblog node.
    '''
    if json:
        items = json.get('data').get('cards')
        for index, item in enumerate(items):
            # The second card on the first page is not a regular post; skip it
            if page == 1 and index == 1:
                continue
            item = item.get('mblog')
            if not item:  # some cards carry no mblog payload
                continue
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()  # strip HTML tags
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo


# Store one record in MySQL
def save_mysql(result):
    table = 'weibo'
    keys = ', '.join(result.keys())
    values = ', '.join(['%s'] * len(result))
    db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                         port=3306, db='spiders', charset='utf8')
    cursor = db.cursor()
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
    try:
        if cursor.execute(sql, tuple(result.values())):
            print('Successful')
            db.commit()
    except Exception as e:
        print('Failed', e.args)
        db.rollback()
    db.close()


# Store one record in MongoDB
def save_to_mongo(result):
    client = MongoClient()
    db = client['weibo']
    collection = db['weibo']
    # insert() is deprecated in pymongo; insert_one() is the current API
    if collection.insert_one(result):
        print('Saved to Mongo')


if __name__ == '__main__':
    for page in range(1, 11):  # fetch the first 10 pages
        json = get_page(page)
        results = parse_page(json, page)
        for result in results:
            print(result)
            save_mysql(result)
            time.sleep(1)  # throttle: pause after each record
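One thing the script assumes but never shows is that a weibo table already exists in the spiders database. A plausible schema, guessed from the fields collected in parse_page (the column types here are an assumption, not the original author's DDL):

import pymysql

# Hypothetical DDL for the `weibo` table save_mysql() writes to; the
# original post does not show it, so column types are a guess based on
# the fields collected in parse_page().
db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     port=3306, db='spiders', charset='utf8')
cursor = db.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS weibo (
        id VARCHAR(20) NOT NULL PRIMARY KEY,
        text TEXT,
        attitudes INT,
        comments INT,
        reposts INT
    )
''')
db.close()

Note that the __main__ block only calls save_mysql; to also write each record to MongoDB, add a save_to_mongo(result) call inside the inner loop.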