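# Reference: the article "用WebCollector 2.x爬取新浪微博(无需手动获取cookie)"
# (crawling Sina Weibo with WebCollector 2.x without manually obtaining a cookie).
# Ported from Java to Python.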
import logging
import random
import re
import time
from urllib.parse import quote

import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def _login(driver, username, password):
    """Submit the weibo.cn login form through *driver* with the given credentials."""
    driver.get("http://login.weibo.cn/login/")
    time.sleep(3)  # fixed pause after requesting the login page
    driver.maximize_window()  # show the browser window maximized
    # Log in with user name and password.
    driver.find_element_by_name("mobile").send_keys(username)
    driver.find_element_by_xpath("//input[@type='password']").send_keys(password)
    # Tick the "remember" checkbox before submitting.
    driver.find_element_by_name("remember").click()
    time.sleep(1)
    # Click the login button.
    driver.find_element_by_name("submit").click()


def _print_items(html):
    """Print the text, and every link, of each element matching the ``.c`` selector."""
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.select('.c'):
        print(item.text)
        for anchor in item.select('a'):
            # Anchors without an href attribute are printed with an empty
            # link instead of raising KeyError.
            print(anchor.get('href', '') + "-------" + anchor.text)


def parse_list(url=None, *, username="微博帐号", password="密码"):
    """Log in to weibo.cn, load *url*, and print the entries found on the page.

    Args:
        url: Address of the page to fetch after logging in. When it is
            missing or empty nothing is fetched and no browser is started.
        username: Value typed into the login form's ``mobile`` field.
        password: Value typed into the login form's password field.

    Returns:
        Always ``None``; the scraped text and links are written to stdout.

    NOTE(review): ``webdriver.PhantomJS`` and the ``find_element_by_*``
    helpers are legacy Selenium APIs -- confirm the Selenium version this
    script is pinned to still provides them.
    """
    if not url:
        # Nothing to crawl; avoid launching a browser and logging in for no reason.
        return None

    driver = webdriver.PhantomJS()
    try:
        _login(driver, username, password)
        driver.get(url)
        html = driver.page_source
    finally:
        # Always shut the browser process down, even when login or
        # navigation raises.
        driver.quit()

    if not html:
        return None

    _print_items(html)
    return None
if __name__ == '__main__':
    # Script entry point: crawl page 1 of a fixed weibo.cn user URL.
    start_url = "http://weibo.cn/u/1789834424?page=1"
    parse_list(start_url)