#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
#
# CrawlSinaBycookie.py
# Copyright 2016 bitwater <bitwater@bitwater>
# My Gmail is bitwater1997@gmail.com

'''
Development environment:
    python2.7 (most of the Chinese-language tutorials online target Python 2)
    linux ubuntu server
    eclipse

Libraries used:
    lxml for HTML parsing (reportedly faster than bs4).
    A full simulated login (rsa-encrypted) is too complex for now, so this
    version relies on a manually supplied cookie; an rsa-based login may be
    added later.
    requests for fetching pages       # pip install requests
    codecs for writing the output file, to sidestep the usual default-encoding
    problems (standard library, no pip install needed).

Site crawled:
    The mobile version of Weibo.
    The cookie is obtained from https://passport.weibo.cn/signin/login :
    open the page in Chrome -> right-click "Inspect" -> Network -> reload the
    page -> watch the captured requests -> log in -> find m.weibo.cn -> open it
    -> find Request Headers -> everything after "Cookie:" is the cookie.
    The user ID is taken from the profile URL, e.g. http://m.weibo.cn/u/2518300370

Output:
    Written to 微博爬出输出信息.txt in the current directory.

Note:
    No multithreading is used, so crawling is slow.
    To keep the run as fast as possible, nothing is printed while crawling.

Based on http://www.tuicool.com/articles/ja2ayqi with some steps optimized;
in my tests the original code did not fetch all of the posts.
'''

import requests
from lxml import etree
import codecs


class weibo:
    # Replace the value below with your own cookie
    cookie = {"Cookie": "_T_WM=10e482205eab95a2636214f62e66b6e7;\
 SUB=_2A251T7kXDeRxGeNG6VYV8ivLyziIHXVWs8dfrDV6PUJbkdBeLUXZkW1Nyy70dahwL14LSBt_hnzScyc5bw..; \
 SUHB=0U1Js2WagBcf52; SCF=AhXb86-PSqrCi8mVQRzXm32YePzMWe4GFaKqUxe9Gj4FEmj2HggLPQ4518hS13OH_zSo590ni7K2yIxZPWngMRA.; \
 SSOLoginState=1481361735"}

    def __init__(self, user_id):
        self.user_id = user_id
        self.user_name = ''
        self.information = []
        self.weiboNum2 = 0      # number of posts crawled
        self.Cntweibo = 0
        self.all_weibos = {}
        self.allpages = 0

    def start(self):
        self.getUserName()
        self.getAllweibo()
        self.writeInfo()

    def getUserName(self):
        url = 'http://weibo.cn/%d?filter=1&page=1' % (self.user_id)
        html = requests.get(url, cookies=weibo.cookie).content
        selector = etree.HTML(html)

        # user name: the page title is "<name>的微博", so drop the last 3 characters
        userName = selector.xpath("//head/title")
        self.user_name = userName[0].text[:-3]

        # basic profile information
        str_wb = selector.xpath("//div[@class='tip2']")[0].xpath("string(.)")
        self.information = str_wb.split()

        # total number of pages, read from the page-jump form field
        tmp = selector.xpath("//input[@name='mp']/@value")
        self.allpages = int(tmp[0])

    def getAllweibo(self):
        # self.allpages = 2    # uncomment to limit the crawl while testing
        for page in range(1, self.allpages + 1):
            url = 'http://weibo.cn/%d?filter=1&page=%d' % (self.user_id, page)
            html = requests.get(url, cookies=weibo.cookie).content
            selector = etree.HTML(html)

            # each post body sits in a <span class="ctt">; the first three
            # spans on a page hold profile text rather than posts, hence
            # the loop starts at index 3
            str_wb = selector.xpath("//span[@class='ctt']")
            l = len(str_wb)
            for i in range(3, l):
                self.Cntweibo = self.Cntweibo + 1
                self.all_weibos[self.Cntweibo] = str_wb[i].xpath("string(.)") + '\n'

    def writeInfo(self):
        fw = codecs.open("微博爬出输出信息.txt", "w", "utf-8")
        fw.write(self.user_name + '\n')
        fw.write(self.information[0] + '\n')
        fw.write(self.information[1] + '\n')
        fw.write(self.information[2] + '\n')
        fw.write(u"所有的原创微博\n")
        for i in range(1, self.Cntweibo + 1):
            fw.write((u' 第%d条微博 : \n') % i + self.all_weibos[i])
        fw.close()


if __name__ == '__main__':
    user_id = int(raw_input("input the user id "))
    wb = weibo(user_id)
    wb.start()
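
# ---------------------------------------------------------------------------
# Optional sketch (not part of the original crawler and not called above):
# a quick way to check that the pasted cookie is still valid before starting
# a long crawl. It reuses only the requests/lxml calls already used in this
# script. The helper name check_cookie_valid and the assumption that the
# page-jump field <input name="mp"> only appears for a logged-in session are
# mine, not the author's; if that field is missing, the cookie has most
# likely expired and should be refreshed from passport.weibo.cn.
# ---------------------------------------------------------------------------
def check_cookie_valid(user_id):
    # fetch page 1 of the profile with the class-level cookie
    url = 'http://weibo.cn/%d?filter=1&page=1' % user_id
    html = requests.get(url, cookies=weibo.cookie).content
    selector = etree.HTML(html)
    # getUserName() relies on this field too; assume its absence means
    # the login state was rejected (hypothetical check)
    return len(selector.xpath("//input[@name='mp']/@value")) > 0

# Example use (run manually before a long crawl):
#     if not check_cookie_valid(2518300370):
#         print "cookie looks expired, refresh it from passport.weibo.cn"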