python2: crawl all Weibo posts of a Sina account

Time: 2021-08-10 20:43:21
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
#
#  CrawlSinaBycookie.py
#  Copyright 2016 bitwater <bitwater@bitwater>
#  My Gmail is bitwater1997@gmail.com

'''
Development environment:
	Since the Chinese-language material for python2 is more complete online, the environment is python2.7 on linux (ubuntu server) with eclipse.

Libraries used:
	lxml for parsing the pages, since lxml is said to be faster than bs4 (see the sketch below).
	The simulated-login flow is too complicated, so this version is cookie-based; RSA login can come later.
	requests for fetching the URLs    # pip install requests
	codecs for writing the output file, because the default file handling has too many encoding problems (codecs is part of the standard library, no pip install needed).
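
	A minimal sketch of the requests + lxml pattern this script relies on (the URL below
	is only a placeholder, not one of the real Weibo URLs used later):

		import requests
		from lxml import etree

		html = requests.get('http://example.com/some_page').content   # raw page bytes
		selector = etree.HTML(html)                                    # parse into an element tree
		spans = selector.xpath("//span[@class='ctt']")                 # select nodes with xpath
		text = spans[0].xpath("string(.)")                             # flatten a node to plain text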

Target site:
	The mobile version of Weibo is crawled.  The cookie is obtained from https://passport.weibo.cn/signin/login
							open the site in Chrome -> right-click, Inspect -> Network -> refresh the page -> watch the captured requests -> log in -> find m.weibo.cn
							-> open it -> Request Headers -> everything after "Cookie:" is the cookie (see the sketch below)
	The user ID is taken from the profile URL, e.g. http://m.weibo.cn/u/2518300370
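
	The Cookie header copied from Chrome goes into a dict, which requests then sends with
	every GET.  A minimal sketch (the cookie value is a placeholder; the user id is the
	example one from the URL above):

		cookie = {"Cookie": "_T_WM=...; SUB=...; SSOLoginState=..."}   # paste your own Cookie header value
		user_id = 2518300370                                           # from http://m.weibo.cn/u/2518300370
		url = 'http://weibo.cn/%d?filter=1&page=1' % user_id           # filter=1 keeps only original posts
		html = requests.get(url, cookies=cookie).content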

Output:
	Written to 微博爬出输出信息.txt in the current directory.

Notes:
	No multithreading is used, so the crawl is fairly slow (a rough thread-pool sketch follows below).
	To keep it fast, nothing is printed while the crawl runs.
	Based on http://www.tuicool.com/articles/ja2ayqi
	Parts of the process were improved; when I tested it, the original article's code did not fetch all of the posts.
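
	A rough sketch of how the per-page loop could be parallelised with a thread pool
	(not part of this script; multiprocessing.dummy ships with Python 2.7, and user_id,
	cookie and allpages are assumed to be defined as in the class below):

		from multiprocessing.dummy import Pool   # thread pool with the multiprocessing API

		def get_page(page):                      # hypothetical helper: fetch and parse one page
			url = 'http://weibo.cn/%d?filter=1&page=%d' % (user_id, page)
			html = requests.get(url, cookies=cookie).content
			return etree.HTML(html).xpath("//span[@class='ctt']")

		pool = Pool(4)                           # 4 worker threads
		pages = pool.map(get_page, range(1, allpages + 1))
		pool.close()
		pool.join()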

'''

import requests
from lxml import etree
import codecs

class weibo:
    
    cookie = {"Cookie": "_T_WM=10e482205eab95a2636214f62e66b6e7;\
    SUB=_2A251T7kXDeRxGeNG6VYV8ivLyziIHXVWs8dfrDV6PUJbkdBeLUXZkW1Nyy70dahwL14LSBt_hnzScyc5bw..; \
    SUHB=0U1Js2WagBcf52; SCF=AhXb86-PSqrCi8mVQRzXm32YePzMWe4GFaKqUxe9Gj4FEmj2HggLPQ4518hS13OH_zSo590ni7K2yIxZPWngMRA.; \
    SSOLoginState=1481361735"}  # replace this with your own cookie
    
    def __init__ (self, user_id):
        self.user_id = user_id
        self.user_name = ''
        self.information = []   # basic profile fields scraped from the page
        self.weiboNum2 = 0      # number of posts crawled
        self.Cntweibo = 0       # running count of posts stored so far
        self.all_weibos = {}    # post index -> post text
        self.allpages = 0       # total number of result pages
        
    def start(self):
        
        self.getUserName()
        self.getAllweibo()
        self.writeInfo()
        
    def getUserName(self):
        
        url = 'http://weibo.cn/%d?filter=1&page=1' % (self.user_id)
        html = requests.get(url, cookies=weibo.cookie).content
        selector = etree.HTML(html)
        userName = selector.xpath("//head/title")
        
        '''
            Get the user name
        '''
        # the page title ends with a 3-character suffix ("的微博"); strip it
        self.user_name = userName[0].text[:-3]

        '''
            Get the basic profile information
        '''
        str_wb = selector.xpath("//div[@class='tip2']")[0].xpath("string(.)")
        self.information = str_wb.split()
        
        '''
            Get the total number of pages
        '''
        tmp = selector.xpath("//input[@name='mp']/@value")
        self.allpages = int(tmp[0])
        
        
    def getAllweibo(self):
    
        for page in range(1, self.allpages + 1):
            url = 'http://weibo.cn/%d?filter=1&page=%d' % (self.user_id, page)
            html = requests.get(url, cookies=weibo.cookie).content
            selector = etree.HTML(html)
            
            str_wb = selector.xpath("//span[@class='ctt']")
            
            # skip the first three ctt spans on each page; they are profile text, not posts
            for i in range(3, len(str_wb)):
                self.Cntweibo += 1
                self.all_weibos[self.Cntweibo] = str_wb[i].xpath("string(.)") + '\n'
            
        
    def writeInfo(self):
        fw = codecs.open("微博爬出输出信息.txt", "w", "utf-8")
        fw.write(self.user_name + '\n')
        fw.write(self.information[0] + '\n')   # first three profile fields
        fw.write(self.information[1] + '\n')
        fw.write(self.information[2] + '\n')
        
        fw.write(u"所有的原创微博\n")   # heading: "all original posts"
        for i in range(1, self.Cntweibo + 1):
            fw.write((u'  第%d条微博 : \n') % i + self.all_weibos[i])   # "post #i :"
        fw.close()
        
        
if __name__ == '__main__':
    
    user_id = int(raw_input("input the user id "))   # e.g. 2518300370
    wb = weibo(user_id)
    wb.start()