github源码地址:
https://github.com/kuishou68/python
各类图表的实现效果
爬取的说说内容
个性化说说内容词云图
每年发表说说总数柱状图、每年点赞和评论折线图
7天好友动态柱状图、饼图
使用方法
按照你的谷歌浏览器下载指定版本的驱动 http://chromedriver.storage.googleapis.com/index.html
驱动跟两个python脚本放入同目录,我的版本是90.0.4430的,查看你自己的版本,下载后把我的chromedriver.exe替换掉!
这里用到了很多第三方包,鼠标放在报红的包名下,用alt+enter导包,如果失败则在控制台用下面的必杀技
1
|
pip install 包名 -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
|
主要代码
qq空间txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
import time
from selenium import webdriver
from lxml import etree
# Scrape the posts ("shuoshuo") of a QQ Zone space with Selenium and append
# the text of every post to qq_word.txt (one post per line).
#
# NOTE(review): this script uses the Selenium 3 style API
# (`executable_path=`, `find_element_by_id`). Those were removed in later
# Selenium 4 releases - pin Selenium 3 or migrate to `Service` /
# `find_element(By.ID, ...)` if you upgrade.
# NOTE(review): credentials are hard-coded below; replace them with your own
# and never commit a real password.
friend = '1569339843'  # QQ number of the friend; their space must be visible to you
user = '783533896'  # your own QQ number
pw = '1323mkonji.@'  # your QQ password


def _first(nodes):
    """Return the first XPath match, or '' when the query matched nothing."""
    return nodes[0] if nodes else ''


# Path of the ChromeDriver binary (must match the installed Chrome version).
chrome_driver = 'chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_driver)
try:
    # Maximize the browser window.
    driver.maximize_window()
    # Open the QQ login page.
    driver.get("http://i.qq.com")
    # The login form lives inside an iframe; select it first, otherwise the
    # elements below cannot be found.
    driver.switch_to.frame("login_frame")
    time.sleep(3)
    # Switch to the account/password login mode.
    driver.find_element_by_id("switcher_plogin").click()
    time.sleep(3)
    # Fill in the account.
    driver.find_element_by_id("u").send_keys(user)
    time.sleep(5)
    # Fill in the password.
    driver.find_element_by_id("p").send_keys(pw)
    time.sleep(5)
    # Click the login button.
    driver.find_element_by_id("login_button").click()
    time.sleep(5)
    # Hand control back to the top-level document.
    driver.switch_to.default_content()
    time.sleep(5)
    # Jump to the posts page of `friend`.
    driver.get("http://user.qzone.qq.com/" + friend + "/311")
    time.sleep(5)

    next_num = 0  # numeric suffix of the current "next page" button id
    while True:
        # Scroll down in 5 steps so the lazily loaded posts of this page
        # are rendered before the DOM is read.
        for i in range(1, 6):
            height = 20000 * i  # scroll distance in pixels for this step
            # The JavaScript method is case-sensitive: window.scrollBy.
            strword = "window.scrollBy(0," + str(height) + ")"
            driver.execute_script(strword)
            time.sleep(4)

        # WebDriver targets the outermost frame by default; the post list
        # is inside this iframe, so select it before querying.
        driver.switch_to.frame("app_canvas_frame")
        selector = etree.HTML(driver.page_source)
        divs = selector.xpath('//*[@id="msglist"]/li/div[3]')

        # Mode 'a' appends, so every page adds to the same file. The
        # explicit utf-8 encoding avoids encode errors when writing.
        with open('qq_word.txt', 'a', encoding="utf-8") as f:
            for div in divs:
                qq_name = _first(div.xpath('./div[2]/a/text()'))
                qq_content = _first(div.xpath('./div[2]/pre/text()'))
                qq_content = qq_content.replace('\n', ' ')
                qq_time = _first(div.xpath('./div[4]/div[1]/span/a/text()'))
                qq_praise = _first(
                    div.xpath('./div[4]/div[2]/span/span/a[2]/text()'))
                qq_comment = _first(div.xpath('./div[4]/div[2]/a[3]/text()'))
                print(qq_name, qq_time, qq_content, qq_praise, qq_comment)
                f.write(qq_content + "\n")

        # On the last page the "next page" button no longer carries this
        # id, so its absence means we are done.
        if driver.page_source.find('pager_next_' + str(next_num)) == -1:
            break
        # The button id changes on every page, hence the running counter.
        driver.find_element_by_id('pager_next_' + str(next_num)).click()
        next_num += 1
        # The next iteration starts by scrolling the outer page, so step
        # back out of the iframe.
        driver.switch_to.parent_frame()
finally:
    # Always close the browser, even if a step above raised.
    driver.quit()
|
各种图表的生成
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
import re

import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.charts import Pie
df_excel = pd.read_excel( 'qq_excel.xlsx' ) # loads the first sheet (sheet 0) by default into a pandas DataFrame
def gettimestr(row):
    """Return the part of the row's '时间' cell that precedes '年'.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a '时间' entry.

    Returns:
        The text before the first '年' in the cell (the whole text when the
        cell contains no '年'), or None when the cell is missing/NaN.
    """
    item = row['时间']
    # pd.isnull is an alias of pd.isna, so a single check is enough.
    if pd.isna(item):
        return None
    return item.split('年')[0]
# Count the number of posts per year.
def readcount(result, row):
    """Increment the post counter of the row's year in ``result``.

    Args:
        result: Dict mapping the year key (as returned by ``gettimestr``)
            to a running post count; updated in place.
        row: Row with a '时间' entry.

    Rows whose time cell is empty are ignored.
    """
    timedata = gettimestr(row)
    if timedata is None:
        return
    result[timedata] = result.get(timedata, 0) + 1
# Sum the number of likes per year.
def readthumb(result, row):
    """Add the like count found in the row's '赞' cell to its year's total.

    The count is the text between the first "(" and the following ")" of
    the cell. Rows with an empty cell, a cell without "(", or an empty time
    cell are ignored.

    Args:
        result: Dict mapping the year key to a running like total; updated
            in place.
        row: Row with '赞' and '时间' entries.

    Raises:
        ValueError: If the text inside the parentheses is not an integer.
    """
    item = row['赞']
    if pd.isnull(item):
        return
    parts = item.split("(")
    if len(parts) <= 1:
        # No "(" in the cell, so there is no count to read.
        return
    data = parts[1].split(")")[0]
    timedata = gettimestr(row)
    if timedata is None:
        return
    result[timedata] = result.get(timedata, 0) + int(data)
# Sum the number of comments per year.
def readcomment(result, row):
    """Add the comment count found in the row's '评论' cell to its year's total.

    The count is the text between the first "(" and the following ")" of
    the cell. Rows with an empty cell, a cell without "(", or an empty time
    cell are ignored.

    Args:
        result: Dict mapping the year key to a running comment total;
            updated in place.
        row: Row with '评论' and '时间' entries.

    Raises:
        ValueError: If the text inside the parentheses is not an integer.
    """
    item = row['评论']
    if pd.isnull(item):
        return
    parts = item.split("(")
    if len(parts) <= 1:
        # No "(" in the cell, so there is no count to read.
        return
    data = parts[1].split(")")[0]
    timedata = gettimestr(row)
    if timedata is None:
        return
    result[timedata] = result.get(timedata, 0) + int(data)
def readexcel(df_excel):
    """Aggregate per-year post, like and comment statistics.

    Args:
        df_excel: DataFrame whose rows provide the '时间', '赞' and '评论'
            columns read by ``readcount``, ``readthumb`` and
            ``readcomment``.

    Returns:
        A dict with the keys 'count', 'thumb' and 'comment', each mapping a
        year key to its total. Keys appear in the order the years are first
        encountered while iterating the rows.
    """
    count = {}
    thumb = {}
    comment = {}
    for _, row in df_excel.iterrows():
        readcount(count, row)
        readthumb(thumb, row)
        readcomment(comment, row)
    return {'count': count, 'thumb': thumb, 'comment': comment}
def getkeyandval(keyword):
    """Return the keys and values of one statistic as two parallel lists.

    Args:
        keyword: One of the keys produced by ``readexcel``
            ('count', 'thumb' or 'comment').

    Returns:
        ``[keys, values]`` - both lists are in reverse order of the
        underlying dict, so the entry that was encountered last in the
        sheet comes first.

    Raises:
        KeyError: If ``keyword`` is not a statistic produced by
            ``readexcel``.
    """
    stats = readexcel(df_excel)
    if keyword not in stats:
        # Fail with a clear message instead of an AttributeError on None.
        raise KeyError(keyword)
    data = stats[keyword]
    key = list(data)
    key.reverse()
    value = [data[item] for item in key]
    return [key, value]
# Bar chart: number of posts published per year.
def paintbar():
    """Render the posts-per-year bar chart to an HTML file.

    Reads the 'count' statistic via ``getkeyandval``, prints the x-axis
    labels, and writes the chart to "每年发表说说总数柱状图.html".
    """
    data = getkeyandval('count')
    print(data[0])
    # pyecharts v1+ supports method chaining.
    (
        Bar()
        .add_xaxis(data[0])
        .add_yaxis("每年发表说说总数", data[1])
        .render("每年发表说说总数柱状图.html")
    )


paintbar()
# Line chart: likes and comments per year.
def paintline():
    """Render the per-year like/comment line chart to an HTML file.

    Both series are aligned on the years that have at least one post (the
    keys of the 'count' statistic); a year without likes or comments is
    plotted as 0. Plotting the two statistics positionally would shift one
    series against the other whenever their year sets differ.

    The chart is written to "每年点赞和评论折现图.html".
    """
    stats = readexcel(df_excel)
    # Same ordering as getkeyandval: reverse of the encounter order.
    xaxis_data = list(stats['count'])
    xaxis_data.reverse()
    commentvalue = [stats['comment'].get(year, 0) for year in xaxis_data]
    thumbvalue = [stats['thumb'].get(year, 0) for year in xaxis_data]
    (
        Line()
        .add_xaxis(xaxis_data=xaxis_data)
        .add_yaxis("每年评论数", y_axis=commentvalue)
        .add_yaxis("每年点赞数", y_axis=thumbvalue)
        .render("每年点赞和评论折现图.html")  # write the chart
    )


paintline()
|
其他代码自行下载项目查看
以上就是python爬取网页版qq空间,生成各类图表的详细内容,更多关于python 爬取qq空间的资料请关注服务器之家其它相关文章!
原文链接:https://github.com/kuishou68/python