bzip2 zip 压缩后体积比 0.8:1

时间:2021-08-24 16:26:52

1、

对.bz2 后缀文件 跳过不处理

2、逐行同字段的json文件,压缩后大小为原文件的12.81%

测试文件近似认为为逐行json文本数据,没有进行多文件重复测试,没有统计时间;

{"uid":50013896,"uuid":"f32feacf-5f83-4866-8dfe-41bff794b8d4","ip":"666298884","site":0,"source":0,"address":"http:\/\/www.ijntv.cn\/inews\/55821723.html","engine":0,"referer":"","keyword":"","browser":11,"language":0,"screen_color":34,"screen_size":0,"system":14,"platform":61,"operator":2,"country":1,"province":0,"city":0,"flash":"0","java":"0","request_time":1547395198,"create_date":"2019-01-13 23:59:58"}
{"uid":50015357,"uuid":"388b3676-8835-49b4-827b-5c1f3ddf6bc8","ip":"1973056862","site":0,"source":0,"address":"http:\/\/www.ijntv.cn\/inews\/55218551.html","engine":0,"referer":"","keyword":"","browser":11,"language":0,"screen_color":34,"screen_size":0,"system":14,"platform":61,"operator":2,"country":1,"province":0,"city":0,"flash":"0","java":"0","request_time":1547395198,"create_date":"2019-01-13 23:59:58"}
{"uid":50016991,"uuid":"dbd44846-4b4a-4b26-aad2-8a70a7a31c74","ip":"2004569145","site":0,"source":0,"address":"http:\/\/www.ijntv.cn\/inews\/VVZv_q1-Hpas_pCYVW1sfg.html","engine":0,"referer":"","keyword":"","browser":11,"language":0,"screen_color":34,"screen_size":0,"system":14,"platform":61,"operator":1,"country":1,"province":6,"city":77,"flash":"0","java":"0","request_time":1547395198,"create_date":"2019-01-13 23:59:58"}
{"uid":50001228,"uuid":"1b4908cd-1306-40e7-bd4e-df0372bcc749","ip":"3740751066","site":0,"source":0,"address":"http:\/\/www.ijntv.cn\/inews\/CKHzIMoRfJUYOkAwNZTfMg.html","engine":0,"referer":"","keyword":"","browser":11,"language":0,"screen_color":34,"screen_size":0,"system":14,"platform":61,"operator":1,"country":1,"province":14,"city":197,"flash":"0","java":"0","request_time":1547395199,"create_date":"2019-01-13 23:59:59"}
{"ad_slots_id":1002,"uuid":"a369a303-1d70-49eb-9e73-7a2a8f028626","industry_pid":0,"industry_id":0,"ip":"1700604567","site":72,"address":"https:\/\/info.b2b168.com\/s168-47325051.html","create_date":"2019-01-13 23:59:59","ad_id":"50012715","uid":"50012715","keyword":"\u8bbe\u5907","pageinfo":""}
{"ad_slots_id":1002,"uuid":"a369a303-1d70-49eb-9e73-7a2a8f028626","industry_pid":0,"industry_id":0,"ip":"1700604567","site":72,"address":"https:\/\/info.b2b168.com\/s168-47325051.html","create_date":"2019-01-13 23:59:59","ad_id":"50015314","uid":"50015314","keyword":"\u5b81\u6ce2\u6536\u94f6\u8f6f\u4ef6","pageinfo":""}
{"ad_slots_id":1001,"uuid":"5eb7efec-9eb1-4493-9739-e466035606b4","industry_pid":0,"industry_id":0,"ip":"2029060375","site":70,"address":"http:\/\/www.jqw.com\/Businfo\/1688002049073.htm","create_date":"2019-01-13 23:59:59","ad_id":"50020536","uid":"50020536","keyword":"Sup\u53e3\u7ea2\u8272\u53f7\u63a8\u8350","pageinfo":"\u7545\u9500\u7684\u56fe\u96c6\u53f7\u8fbd2011J606\u63a8\u8350 |\u8ba2\u8d2d\u56fe\u96c6\u53f7\u8fbd2011J606_\u4f9b\u6c42\u5546\u673a_\u91d1\u6cc9\u7f51#|^#|^http:\/\/www.jqw.com\/Businfo\/1688002049073.htm"}
{"ad_slots_id":1001,"uuid":"5eb7efec-9eb1-4493-9739-e466035606b4","industry_pid":0,"industry_id":0,"ip":"2029060375","site":70,"address":"http:\/\/www.jqw.com\/Businfo\/1688002049073.htm","create_date":"2019-01-13 23:59:59","ad_id":"34064333","uid":"34064333","keyword":"\u8f6f\u4ef6\u8ba2\u5236","pageinfo":""}
{"ad_slots_id":1001,"uuid":"5eb7efec-9eb1-4493-9739-e466035606b4","industry_pid":0,"industry_id":0,"ip":"2029060375","site":70,"address":"http:\/\/www.jqw.com\/Businfo\/1688002049073.htm","create_date":"2019-01-13 23:59:59","ad_id":"50014483","uid":"50014483","keyword":"\u5c71\u6cc9\u6c34\u6279\u53d1","pageinfo":""}
{"ad_slots_id":1001,"uuid":"5eb7efec-9eb1-4493-9739-e466035606b4","industry_pid":0,"industry_id":0,"ip":"2029060375","site":70,"address":"http:\/\/www.jqw.com\/Businfo\/1688002049073.htm","create_date":"2019-01-13 23:59:59","ad_id":"34022975","uid":"34022975","keyword":"\u718a\u638c\u53f7","pageinfo":""}

  

  

137M -rw-r--r-- 1 root root 137M Jan 10 11:45 visit-2019-01-10
20M -rw-r--r-- 1 root root 20M Jan 10 11:48 visit-2019-01-10.zip

bzip2  visit-2019-01-10

16M -rw-r--r-- 1 root root 16M Jan 10 11:45 visit-2019-01-10.bz2
20M -rw-r--r-- 1 root root 20M Jan 10 11:48 visit-2019-01-10.zip

默认情况下,bzip2 theFile 会删除原文件,结果文件命名为 theFile.bz2(如需保留原文件可加 -k 参数)

压缩后的体积为zip的0.8

bzip2 -9 visit-2019-01-03-u

890M -rw-r--r-- 1 root root 890M Jan 10 11:59 visit-2019-01-03-u
65M -rw-r--r-- 1 root root 65M Jan 10 11:59 visit-2019-01-03-u.bz2
87M -rw-r--r-- 1 root root 87M Jan 10 12:00 visit-2019-01-03-u.zip

压缩后的体积为zip的0.7475,为原始文件的0.0730

用压缩后的文件覆盖原文件

"""Compress dated log files in a directory in place with ``bzip2 -9``.

Usage: python biz2SaveCost.py TARGET_DIR PASS_FEATURE

Every file in TARGET_DIR whose name matches ``*-*-*`` is compressed, except
files whose name contains PASS_FEATURE and files that already end in
``.bz2``. bzip2 replaces each input file with ``<name>.bz2``.
"""
import glob
import os
import subprocess
import sys

# Glob pattern for the dated log files, e.g. "visit-2019-01-10".
FILE_FEATURE = '*-*-*'


def select_targets(paths, pass_feature):
    """Return the subset of ``paths`` that should be compressed.

    A path is dropped when its file name already ends in ``.bz2`` or when
    its file name contains ``pass_feature``. The marker is matched against
    the file name only, so a directory name that happens to contain it
    cannot exclude every file. Input order is preserved.
    """
    selected = []
    for path in paths:
        name = os.path.basename(path)
        if name.endswith('.bz2'):
            continue
        if pass_feature in name:
            continue
        selected.append(path)
    return selected


def main(argv):
    """Compress the selected files; return a process exit status."""
    if len(argv) < 3:
        sys.stderr.write('usage: biz2SaveCost.py TARGET_DIR PASS_FEATURE\n')
        return 2
    target_dir, pass_feature = argv[1], argv[2]
    # os.path.join works whether or not TARGET_DIR ends with a slash.
    candidates = glob.glob(os.path.join(target_dir, FILE_FEATURE))
    for path in select_targets(candidates, pass_feature):
        if not os.path.isfile(path):
            continue
        # The glob result already carries the directory, so no `cd` is
        # needed; an argument list avoids shell quoting problems and `--`
        # stops a name that starts with '-' from being read as an option.
        cmd = ['bzip2', '-9', '--', path]
        print(' '.join(cmd))
        subprocess.call(cmd)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

[root@a data]# tree testBiz2Py/
testBiz2Py/
├── 2-23-3
├── 2-23-a
├── 2-23-b
├── a
└── b

0 directories, 5 files
[root@a data]# python bzip2Action/biz2SaveCost.py /data/testBiz2Py/ b
cd /data/testBiz2Py/;bzip2 -9 /data/testBiz2Py/2-23-a
cd /data/testBiz2Py/;bzip2 -9 /data/testBiz2Py/2-23-3
[root@a data]# tree testBiz2Py/
testBiz2Py/
├── 2-23-3.bz2
├── 2-23-a.bz2
├── 2-23-b
├── a
└── b

0 directories, 5 files

cd /data;du --max-depth=2 -h ./;python bzip2Action/biz2SaveCost.py /data/visitlog/ 2019-01

压缩前

[root@a data]# du --max-depth=2 -h ./
141G ./unionlog
8.0K ./bzip2Action
21G ./visitlog
169G ./
[root@a data]# tree visitlog/
visitlog/
├── visit-2018-09-18
├── visit-2018-09-19
├── visit-2018-09-20

[root@b ~]# cd /data;du --max-depth=2 -h ./;python bzip2Action/biz2SaveCost.py /data/visitlog/ 2019-01

19G ./visitlog
104G ./unionlog

1.1T ./
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-09-19
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-09-25

[root@c ~]# cd /data;du --max-depth=2 -h ./;python bzip2Action/biz2SaveCost.py /data/visitlog/ 2019-01

21G ./visitlog
141G ./unionlog

940G ./
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-11-24
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-11-01
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-11-19
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-10-22

统计压缩速度

单个文件的平均速度

总数据量的平均速度

注意增加计算压缩率的功能代码

# -*- coding: utf-8 -*-
"""Measure ``bzip2 -9`` compression speed over dated log files.

Usage: python biz2SaveCost.py TARGET_DIR PASS_FEATURE

For every file in TARGET_DIR matching ``*-*-*`` (skipping names that contain
PASS_FEATURE and files already ending in ``.bz2``) the file size and the
elapsed time are recorded. The script then prints the overall average speed
and the per-file speeds sorted in ascending order.

Note carried over from the original write-up: compression mainly consumes
CPU, i.e. it is a compute-intensive workload.
"""
import glob
import os
import random
import subprocess
import sys
import time

# Glob pattern for the dated log files, e.g. "visit-2019-01-10".
FILE_FEATURE = '*-*-*'
# Float divisor so sizes stay fractional under Python 2 as well.
BYTES_PER_MB = 1024.0 * 1024.0
# The original snippet had the real bzip2 call commented out and stood in a
# random sleep of up to 10 seconds per file. That stays the default; set to
# False to actually compress (bzip2 replaces each file with <name>.bz2).
DRY_RUN = True


def select_targets(paths, pass_feature):
    """Return the subset of ``paths`` whose speed should be measured.

    Drops names that end in ``.bz2`` (bzip2 refuses them immediately, which
    would record a meaningless, hugely inflated speed) and names that
    contain ``pass_feature``. Input order is preserved.
    """
    selected = []
    for path in paths:
        name = os.path.basename(path)
        if name.endswith('.bz2') or pass_feature in name:
            continue
        selected.append(path)
    return selected


def summarize_speeds(samples):
    """Summarize ``(megabytes, seconds)`` samples.

    Returns ``(average_speed, per_file_speeds)``: the total size divided by
    the total time, and the per-file speeds sorted ascending. Samples with a
    non-positive duration are left out of the per-file list. The average is
    ``None`` when the total time is not positive (e.g. no samples).
    """
    total_mb = sum(mb for mb, _ in samples)
    total_seconds = sum(seconds for _, seconds in samples)
    per_file = sorted(mb / seconds for mb, seconds in samples if seconds > 0)
    average = total_mb / total_seconds if total_seconds > 0 else None
    return average, per_file


def main(argv):
    """Time each selected file and print the speed summary."""
    if len(argv) < 3:
        sys.stderr.write('usage: biz2SaveCost.py TARGET_DIR PASS_FEATURE\n')
        return 2
    target_dir, pass_feature = argv[1], argv[2]
    candidates = glob.glob(os.path.join(target_dir, FILE_FEATURE))
    samples = []
    for path in select_targets(candidates, pass_feature):
        if not os.path.isfile(path):
            continue
        # Size must be read before compressing: bzip2 removes the input.
        file_mb = os.stat(path).st_size / BYTES_PER_MB
        cmd = ['bzip2', '-9', '--', path]
        print(' '.join(cmd))
        t_start = time.time()
        if DRY_RUN:
            time.sleep(random.random() * 10)
        else:
            subprocess.call(cmd)
        samples.append((file_mb, time.time() - t_start))
    average, per_file = summarize_speeds(samples)
    if average is None:
        print('no files were measured')
        return 0
    print('averageSpeed(MB/s):', average)
    print('singleSeconds(MB/s):', per_file)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

bzip2 zip  压缩后体积比 0.8:1

压缩后

[root@b data]# cd /data;du --max-depth=2 -h ./;

8.0K ./bzip2Action
4.6G ./visitlog
104G ./unionlog

1016G ./
[root@b data]#

压缩前后比值19G:4.6G =1: 0.2421052631578947,

a节点
4.9G    ./visitlog 21G:4.9G = 1:0.2333(4.9÷21≈0.2333)

c节点

4.9G ./visitlog

同a节点

[root@a data]# tree visitlog/ -h
visitlog/
├── [6.2M] visit-2018-09-18.bz2
├── [8.4M] visit-2018-09-19.bz2
├── [8.3M] visit-2018-09-20.bz2
├── [8.8M] visit-2018-09-21.bz2
├── [8.7M] visit-2018-09-22.bz2
├── [7.5M] visit-2018-09-23.bz2
├── [7.4M] visit-2018-09-24.bz2
├── [8.8M] visit-2018-09-25.bz2
├── [9.3M] visit-2018-09-26.bz2
├── [9.6M] visit-2018-09-27.bz2
├── [ 12M] visit-2018-09-28.bz2
├── [ 15M] visit-2018-09-29.bz2
├── [ 15M] visit-2018-09-30.bz2
├── [ 13M] visit-2018-10-01.bz2
├── [ 13M] visit-2018-10-02.bz2
├── [ 14M] visit-2018-10-03.bz2
├── [ 14M] visit-2018-10-04.bz2
├── [ 15M] visit-2018-10-05.bz2
├── [ 15M] visit-2018-10-06.bz2
├── [ 15M] visit-2018-10-07.bz2
├── [ 17M] visit-2018-10-08.bz2
├── [ 16M] visit-2018-10-09.bz2
├── [ 17M] visit-2018-10-10.bz2
├── [ 15M] visit-2018-10-11.bz2
├── [ 16M] visit-2018-10-12.bz2
├── [ 16M] visit-2018-10-13.bz2
├── [ 23M] visit-2018-10-14.bz2
├── [ 28M] visit-2018-10-15.bz2
├── [ 25M] visit-2018-10-16.bz2
├── [ 21M] visit-2018-10-17.bz2
├── [ 23M] visit-2018-10-18.bz2
├── [ 21M] visit-2018-10-19.bz2
├── [ 21M] visit-2018-10-20.bz2
├── [ 24M] visit-2018-10-21.bz2
├── [ 18M] visit-2018-10-22.bz2
├── [ 20M] visit-2018-10-23.bz2
├── [ 20M] visit-2018-10-24.bz2
├── [ 20M] visit-2018-10-25.bz2
├── [ 21M] visit-2018-10-26.bz2
├── [ 20M] visit-2018-10-27.bz2
├── [ 18M] visit-2018-10-28.bz2
├── [ 21M] visit-2018-10-29.bz2
├── [ 22M] visit-2018-10-30.bz2
├── [ 21M] visit-2018-10-31.bz2
├── [ 22M] visit-2018-11-01.bz2
├── [ 21M] visit-2018-11-02.bz2
├── [9.8M] visit-2018-11-03.bz2
├── [7.6M] visit-2018-11-04.bz2
├── [9.7M] visit-2018-11-05.bz2
├── [9.6M] visit-2018-11-06.bz2
├── [9.5M] visit-2018-11-07.bz2
├── [ 19M] visit-2018-11-08.bz2
├── [ 12M] visit-2018-11-09.bz2
├── [ 12M] visit-2018-11-10.bz2
├── [ 11M] visit-2018-11-11.bz2
├── [ 13M] visit-2018-11-12.bz2
├── [ 14M] visit-2018-11-13.bz2
├── [ 16M] visit-2018-11-14.bz2
├── [ 16M] visit-2018-11-15.bz2
├── [ 15M] visit-2018-11-16.bz2
├── [ 15M] visit-2018-11-17.bz2
├── [ 17M] visit-2018-11-18.bz2
├── [ 18M] visit-2018-11-19.bz2
├── [ 16M] visit-2018-11-20.bz2
├── [ 20M] visit-2018-11-21.bz2
├── [ 22M] visit-2018-11-22.bz2
├── [ 13M] visit-2018-11-23.bz2
├── [ 11M] visit-2018-11-24.bz2
├── [ 11M] visit-2018-11-25.bz2
├── [ 11M] visit-2018-11-26.bz2
├── [9.7M] visit-2018-11-27.bz2
├── [8.0M] visit-2018-11-28.bz2
├── [ 12M] visit-2018-11-29.bz2
├── [ 15M] visit-2018-11-30.bz2
├── [ 15M] visit-2018-12-01.bz2
├── [ 16M] visit-2018-12-02.bz2
├── [ 20M] visit-2018-12-03.bz2
├── [ 21M] visit-2018-12-04.bz2
├── [ 23M] visit-2018-12-05.bz2
├── [ 25M] visit-2018-12-06.bz2
├── [ 32M] visit-2018-12-07.bz2
├── [ 36M] visit-2018-12-08.bz2
├── [ 35M] visit-2018-12-09.bz2
├── [ 37M] visit-2018-12-10.bz2
├── [ 38M] visit-2018-12-11.bz2
├── [ 35M] visit-2018-12-12.bz2
├── [ 35M] visit-2018-12-13.bz2
├── [ 30M] visit-2018-12-14.bz2
├── [ 32M] visit-2018-12-15.bz2
├── [ 31M] visit-2018-12-16.bz2
├── [ 39M] visit-2018-12-17.bz2
├── [ 39M] visit-2018-12-18.bz2
├── [ 38M] visit-2018-12-19.bz2
├── [ 29M] visit-2018-12-20.bz2
├── [ 43M] visit-2018-12-21.bz2
├── [ 37M] visit-2018-12-22.bz2
├── [ 35M] visit-2018-12-23.bz2
├── [ 38M] visit-2018-12-24.bz2
├── [ 38M] visit-2018-12-25.bz2
├── [ 36M] visit-2018-12-26.bz2
├── [ 38M] visit-2018-12-27.bz2
├── [ 38M] visit-2018-12-28.bz2
├── [ 37M] visit-2018-12-29.bz2
├── [ 30M] visit-2018-12-30.bz2
├── [ 35M] visit-2018-12-31.bz2
├── [296M] visit-2019-01-01
├── [345M] visit-2019-01-02
├── [397M] visit-2019-01-03
├── [331M] visit-2019-01-04
├── [300M] visit-2019-01-05
├── [312M] visit-2019-01-06
├── [311M] visit-2019-01-07
├── [154M] visit-2019-01-08
├── [173M] visit-2019-01-09
└── [176M] visit-2019-01-10

0 directories, 115 files
[root@a data]#

[root@a tmp]# ll -ash
total 32K
4.0K drwxr-xr-x 2 root root 4.0K Jan 11 14:22 .
4.0K drwxr-xr-x 17 root root 4.0K Jan 10 16:51 ..
24K -rw-r--r-- 1 root root 21K Jan 11 14:22 a
[root@a tmp]# bzip2 -9 a
[root@a tmp]# ll -as
total 12
4 drwxr-xr-x 2 root root 4096 Jan 11 14:22 .
4 drwxr-xr-x 17 root root 4096 Jan 10 16:51 ..
4 -rw-r--r-- 1 root root 1036 Jan 11 14:22 a.bz2
[root@a tmp]# bzip2 -9 a.bz2
bzip2: Input file a.bz2 already has .bz2 suffix.
[root@a tmp]# ll -as
total 12
4 drwxr-xr-x 2 root root 4096 Jan 11 14:22 .
4 drwxr-xr-x 17 root root 4096 Jan 10 16:51 ..
4 -rw-r--r-- 1 root root 1036 Jan 11 14:22 a.bz2
[root@a tmp]#

cd /data;du --max-depth=2 -h ./;python bzip2Action/biz2SaveCost.py /data/visitlog/ 2019-01

2019年1月14日

c

6.0G ./visitlog
20G ./unionlog

b

5.8G ./visitlog
18G ./unionlog

a

20G     ./unionlog

6.0G    ./visitlog

27M -rw-r--r-- 1 nginx nginx 27M Dec 30 23:59 visit-2018-12-30.bz2
36M -rw-r--r-- 1 nginx nginx 36M Dec 31 23:59 visit-2018-12-31.bz2
312M -rw-r--r-- 1 nginx nginx 312M Jan 6 23:59 visit-2019-01-06
312M -rw-r--r-- 1 nginx nginx 312M Jan 7 23:59 visit-2019-01-07

44M -rw-r--r-- 1 nginx nginx 44M Dec 30 23:59 visit-2018-12-30.bz2
53M -rw-r--r-- 1 nginx nginx 53M Dec 31 23:59 visit-2018-12-31.bz2

882M -rw-r--r-- 1 nginx nginx 882M Jan 11 23:59 visit-2019-01-11
745M -rw-r--r-- 1 nginx nginx 745M Jan 12 23:59 visit-2019-01-12
707M -rw-r--r-- 1 nginx nginx 707M Jan 13 23:59 visit-2019-01-13
232M -rw-r--r-- 1 nginx nginx 232M Jan 14 09:21 visit-2019-01-14

压缩率计算

因为每日都有新文件写入,处理前的数据没有记录,如果不解压还原数据的话,无法计算准确的压缩率

压缩速度计算

认为cpu、内存资源充足

控制台输出的日志

cd /data/unionlog/;bzip2 -9 /data/unionlog/visit-2018-11-09.bz2
bzip2: Input file /data/unionlog/visit-2018-11-09.bz2 already has .bz2 suffix.
('averageSpeed(MB/s):', 4.211187493937172)
('singleSeconds(MB/s):', [3.9369898031816426, 3.953846125040358, 3.9544741312123928, 3.9555894807291088, 3.96099337092276, 3.983298697446923, 3.9966511209824667, 4.007815560864753, 4.013902687515588, 4.015872734532144, 4.015899236549791, 4.015963246206192, 4.01612198327753, 4.023731445780551, 4.025416758738823, 4.025951959772834, 4.030831979910141, 4.039693901910457, 4.0399486196050765, 4.040242824350764, 4.040648424669689, 4.041098180762507, 4.043051325648554, 4.051655360512291, 4.056658593948987, 4.059627164614112, 4.070020953590698, 4.073870225127285, 4.07503751826594, 4.075686989285653, 4.080265084217549, 4.082345972466677, 4.090936968718271, 4.0944335040477275, 4.099429160013611, 4.102229025161095, 4.104974537958556, 4.110126096413723, 4.118484472726296, 4.119251467116442, 4.121534548809426, 4.125553711713982, 4.12775144900931, 4.129621429296399, 4.129656881725015, 4.1315901550586105, 4.131830165781944, 4.143680130292085, 4.145293603443776, 4.146942161873823, 4.147313376948774, 4.148370367740056, 4.151411958798099, 4.153755223178981, 4.161263788273014, 4.164412810381955, 4.166850751469844, 4.167063598601332, 4.169355624609407, 4.176170448673875, 4.1940635910827355, 4.195109540816128, 4.2000814466148055, 4.200333163905996, 4.2022824476243406, 4.202418248410636, 4.20572094512217, 4.212585249380411, 4.218441487185745, 4.427734600215904, 4.837932137856126, 5.076886456535105, 6.319574088013213, 6.375565540330376, 7.40075797478448, 8.700273234442928, 9.955987719965876, 10.49359459267714, 10.496288104978296, 10.910080297989559, 13.4994372035219, 14.011910382913635, 14.077535801136763, 14.982532672419739, 15.577184977610813, 16.567526614277405, 16.72303989453991, 17.031745290872077, 19.135089160791104, 19.481129833087913, 20.071232748258293, 20.639159774908073, 28.797696550990196, 29.481318443179987, 32.960199161359675, 35.348744965782345, 35.914231265432065, 44.91248986354474, 56.068395022554554, 63.90561770409619, 67.3045079377858, 80.84762211431958, 
88.98706103023787, 175.14815255493826, 557.3579021970233])
[root@a data]#

  

cd /data/unionlog/;bzip2 -9 /data/unionlog/visit-2018-11-03.bz2
bzip2: Input file /data/unionlog/visit-2018-11-03.bz2 already has .bz2 suffix.
('averageSpeed(MB/s):', 4.037645142862762)
('singleSeconds(MB/s):', [3.4115271073680473, 3.5042062998346606, 3.509713341704194, 3.525571499281982, 3.5898929553667154, 3.6505914130679624, 3.7138527066218354, 3.7231339152271996, 3.7267693810378284, 3.7292860405119153, 3.7299857191562316, 3.7326899795857953, 3.756952872366287, 3.757740179198384, 3.758101035864619, 3.762634699575258, 3.771730878546173, 3.7786267621034892, 3.796819445397061, 3.8048003527368794, 3.8085626615863237, 3.8112231318976035, 3.8156227214117053, 3.841775745310672, 3.848201931373685, 3.851350834838122, 3.8566428423319925, 3.857162507505528, 3.863292589421678, 3.863331261341491, 3.8643059756625355, 3.8917795293132476, 3.8927296353495002, 3.893436977500035, 3.8935765838449194, 3.8965081510857744, 3.90837814215203, 3.9189434690852534, 3.931661792967054, 3.9543185154898364, 3.962796230312998, 3.9670385630201, 4.002202845736961, 4.140499586487628, 4.2275292796865545, 4.606845720648893, 4.712329383339015, 4.723474167059724, 4.763673069508994, 4.8135033859300425, 4.842742592123715, 4.950956959538387, 4.964229453203472, 5.00257129469767, 7.159600281109767, 7.301770358234334, 7.7868991551617475, 8.339245078376065, 9.1340349502132, 9.325438851566286, 9.746514302246108, 10.417916214518563, 10.505980787495512, 10.672366971761184, 10.932940199850863, 10.976049990046109, 11.65977574461905, 11.955475170447237, 12.03807663466081, 12.591329733176506, 13.033587237840626, 13.06160835399656, 13.063173024665307, 13.130837040649459, 13.3820749715684, 13.770620023288442, 14.631663068222947, 15.281334268265432, 20.345541381319876, 23.357577176500726, 23.618352333724083, 26.66225279976816, 28.154498014416912, 28.77254085414158, 33.61942092676073, 38.447972182087994, 296.4583324183667, 1415.6438387404276])
[root@b data]#

  

cd /data/unionlog/;bzip2 -9 /data/unionlog/visit-2018-11-30.bz2
bzip2: Input file /data/unionlog/visit-2018-11-30.bz2 already has .bz2 suffix.
('averageSpeed(MB/s):', 4.100344833546164)
('singleSeconds(MB/s):', [3.7181192973321773, 3.7257172341979174, 3.745482158633604, 3.7692114529613185, 3.7872264010472043, 3.7884661064039555, 3.8186428004503985, 3.82068338626644, 3.8231526245648015, 3.83125733853526, 3.8345807959000737, 3.8487431513676458, 3.8533984710523392, 3.8766502888766508, 3.8792671200198057, 3.8794958504143318, 3.8864832683027672, 3.8874083389735117, 3.890098049760509, 3.9004211061797065, 3.9012259669791716, 3.904081019180935, 3.909088169138795, 3.9100598586939057, 3.9182811647981137, 3.9223673146999176, 3.936911082605703, 3.938102928814517, 3.9431581709314845, 3.9469155257226864, 3.9477303083616584, 3.9510218414752734, 3.9544768734685007, 3.9561312758351868, 3.9603868364070123, 3.960529493355076, 3.973218434311659, 3.973952987812832, 3.9750079546493047, 3.9769556093199063, 3.990533382215301, 3.9908648419479373, 3.99253131026352, 3.9993173904820893, 4.000028933408353, 4.009540707394956, 4.0167451008623525, 4.01694265894807, 4.021467667067072, 4.025888190955974, 4.029591312996541, 4.034701091498445, 4.051077667021889, 4.051677223836611, 4.05476273834563, 4.063306221670503, 4.067358092550384, 4.068730730698932, 4.102108601845601, 4.106712519686551, 4.112994160199945, 4.123322845773183, 4.124306143488609, 4.14616216102037, 4.190121259265525, 4.2012301048613345, 4.231113928027722, 4.635490375664297, 4.672385583985039, 4.693718071514089, 4.723321575516211, 4.723548196405968, 4.786506177340032, 4.850884632133513, 4.859158112858001, 6.1953653787024, 9.018906346164437, 9.323122551505794, 10.900660226263645, 11.832227279482243, 12.166883663453696, 12.860438635558914, 13.911042528296319, 13.942537479159501, 14.251319697516088, 15.801254911294155, 18.116126384680413, 19.22717367957711, 20.43278105002856, 20.50204524305958, 21.19584266823886, 27.97025937406051, 28.08823466724362, 35.51835114999952, 36.066163946710745, 37.79058467225096, 46.753210686310574, 47.149060556499826, 55.56960474674869, 67.19344402304698, 67.72819158484143, 
88.15080202857988, 90.13568800307546, 161.22373765616393, 184.2411337594747])
[root@c data]#

  

由于统计时没有标识或过滤已有的 .bz2 文件(bzip2 对这类文件直接报错返回,耗时极短,导致列表末尾出现远高于实际的单文件速度),因此以平均值及多数文件所在的约 4MB/s 作为压缩速度;

# -*- coding: utf-8 -*-
# unionlog: assume that between the measurement on the morning of the 11th
# and the one on the morning of the 14th the directory grew by 4 calendar
# days of new, uncompressed logs.
# Estimated growth in GB: mean of the three daily sizes in MB
# (882, 745, 707), times 4 days, divided by 1024.
# res: size after compression per node (GB); ori: size before (GB).
res, ori = [20, 18, 20], [141, 104, 141]
# Float literals keep this a true division under Python 2 as well.
cut = (882 + 745 + 707) / 3.0 * 4 / 1024.0
# Per node: (size after - growth since the first measurement) / size before.
compression_ratio = [(after - cut) / before for after, before in zip(res, ori)]
print(compression_ratio)
print('压缩率均值', sum(compression_ratio) / len(compression_ratio))

[0.12029033687943262, 0.14385516826923078, 0.12029033687943262]
压缩率均值 0.128145280676032

# -*- coding: utf-8 -*-
"""Measure ``bzip2 -d`` decompression speed and per-file compression ratio.

Usage: python SCRIPT TARGET_DIR PASS_FEATURE

Every ``*.bz2`` file in TARGET_DIR matching ``*-*-*`` whose name does not
contain PASS_FEATURE is decompressed in place (bzip2 replaces ``name.bz2``
with ``name``). Speed is compressed megabytes per second; the compression
ratio of a file is compressed size / decompressed size. Only the arithmetic
mean of the ratios is reported, not a harmonic mean.
"""
import glob
import os
import subprocess
import sys
import time

# Glob pattern for the dated log files, e.g. "visit-2019-01-10.bz2".
FILE_FEATURE = '*-*-*'
BZ2_SUFFIX = '.bz2'
# Float divisor so sizes stay fractional under Python 2 as well.
BYTES_PER_MB = 1024.0 * 1024.0


def strip_bz2_suffix(path):
    """Return ``path`` without a trailing ``.bz2``; other paths unchanged.

    ``str.strip('.bz2')`` must not be used here: it removes any run of the
    characters '.', 'b', 'z', '2' from both ends, so
    'visit-2018-12-22.bz2' would become 'visit-2018-12-'.
    """
    if path.endswith(BZ2_SUFFIX):
        return path[:-len(BZ2_SUFFIX)]
    return path


def select_archives(paths, pass_feature):
    """Return the ``.bz2`` paths whose file name lacks ``pass_feature``.

    Input order is preserved.
    """
    selected = []
    for path in paths:
        name = os.path.basename(path)
        if not name.endswith(BZ2_SUFFIX):
            continue
        if pass_feature in name:
            continue
        selected.append(path)
    return selected


def summarize_decompression(samples):
    """Summarize ``(compressed_mb, decompressed_mb, seconds)`` samples.

    Returns ``(average_speed, speeds, ratios, mean_ratio)``:
    total compressed MB / total seconds; per-file speeds sorted ascending;
    per-file compressed/decompressed ratios in input order; and the
    arithmetic mean of those ratios. Samples with a non-positive duration
    (for speed) or non-positive decompressed size (for ratio) are left out.
    ``average_speed`` and ``mean_ratio`` are ``None`` when undefined.
    """
    total_mb = sum(sample[0] for sample in samples)
    total_seconds = sum(sample[2] for sample in samples)
    speeds = sorted(c / s for c, _, s in samples if s > 0)
    ratios = [c / d for c, d, _ in samples if d > 0]
    average_speed = total_mb / total_seconds if total_seconds > 0 else None
    mean_ratio = sum(ratios) / len(ratios) if ratios else None
    return average_speed, speeds, ratios, mean_ratio


def main(argv):
    """Decompress the selected archives and print the statistics once."""
    if len(argv) < 3:
        sys.stderr.write('usage: SCRIPT TARGET_DIR PASS_FEATURE\n')
        return 2
    target_dir, pass_feature = argv[1], argv[2]
    candidates = glob.glob(os.path.join(target_dir, FILE_FEATURE))
    samples = []
    for path in select_archives(candidates, pass_feature):
        # The glob result already carries the directory, so no `cd` is
        # needed; `--` protects against names starting with '-'.
        cmd = ['bzip2', '-d', '--', path]
        print(' '.join(cmd))
        try:
            # Size must be read first: bzip2 -d removes the archive.
            compressed_mb = os.stat(path).st_size / BYTES_PER_MB
            t_start = time.time()
            status = subprocess.call(cmd)
            elapsed = time.time() - t_start
            if status != 0:
                # A failed run would otherwise be timed as a "fast" file.
                sys.stderr.write('bzip2 exited with status {} for {}\n'.format(status, path))
                continue
            decompressed_mb = os.stat(strip_bz2_suffix(path)).st_size / BYTES_PER_MB
        except OSError as e:
            print(e)
            continue
        samples.append((compressed_mb, decompressed_mb, elapsed))
    average_speed, speeds, ratios, mean_ratio = summarize_decompression(samples)
    if average_speed is None:
        print('no files were measured')
        return 0
    print('averageSpeed(MB/s):', average_speed)
    print('singleSeconds(MB/s):', speeds)
    print('singleCompressionRatio:', ratios)
    print('arithmeticAverageSingleCompressionRatio:', mean_ratio)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
cd /data/unionlog/;bzip2 -9 /data/unionlog/visit-2018-11-03.bz2
bzip2: Input file /data/unionlog/visit-2018-11-03.bz2 already has .bz2 suffix.
('averageSpeed(MB/s):', 4.037645142862762)
('singleSeconds(MB/s):', [3.4115271073680473, 3.5042062998346606, 3.509713341704194, 3.525571499281982, 3.5898929553667154, 3.6505914130679624, 3.7138527066218354, 3.7231339152271996, 3.7267693810378284, 3.7292860405119153, 3.7299857191562316, 3.7326899795857953, 3.756952872366287, 3.757740179198384, 3.758101035864619, 3.762634699575258, 3.771730878546173, 3.7786267621034892, 3.796819445397061, 3.8048003527368794, 3.8085626615863237, 3.8112231318976035, 3.8156227214117053, 3.841775745310672, 3.848201931373685, 3.851350834838122, 3.8566428423319925, 3.857162507505528, 3.863292589421678, 3.863331261341491, 3.8643059756625355, 3.8917795293132476, 3.8927296353495002, 3.893436977500035, 3.8935765838449194, 3.8965081510857744, 3.90837814215203, 3.9189434690852534, 3.931661792967054, 3.9543185154898364, 3.962796230312998, 3.9670385630201, 4.002202845736961, 4.140499586487628, 4.2275292796865545, 4.606845720648893, 4.712329383339015, 4.723474167059724, 4.763673069508994, 4.8135033859300425, 4.842742592123715, 4.950956959538387, 4.964229453203472, 5.00257129469767, 7.159600281109767, 7.301770358234334, 7.7868991551617475, 8.339245078376065, 9.1340349502132, 9.325438851566286, 9.746514302246108, 10.417916214518563, 10.505980787495512, 10.672366971761184, 10.932940199850863, 10.976049990046109, 11.65977574461905, 11.955475170447237, 12.03807663466081, 12.591329733176506, 13.033587237840626, 13.06160835399656, 13.063173024665307, 13.130837040649459, 13.3820749715684, 13.770620023288442, 14.631663068222947, 15.281334268265432, 20.345541381319876, 23.357577176500726, 23.618352333724083, 26.66225279976816, 28.154498014416912, 28.77254085414158, 33.61942092676073, 38.447972182087994, 296.4583324183667, 1415.6438387404276])

  

bzip2, a block-sorting file compressor. Version 1.0.6, 6-Sept-2010.

usage: bzip2 [flags and input files in any order]

-h --help print this message
-d --decompress force decompression
-z --compress force compression
-k --keep keep (don't delete) input files
-f --force overwrite existing output files
-t --test test compressed file integrity
-c --stdout output to standard out
-q --quiet suppress noncritical error messages
-v --verbose be verbose (a 2nd -v gives more)
-L --license display software version & license
-V --version display software version & license
-s --small use less memory (at most 2500k)
-1 .. -9 set block size to 100k .. 900k
--fast alias for -1
--best alias for -9

If invoked as `bzip2', default action is to compress.
as `bunzip2', default action is to decompress.
as `bzcat', default action is to decompress to stdout.

If no file names are given, bzip2 compresses or decompresses
from standard input to standard output. You can combine
short flags, so `-v -4' means the same as -v4 or -4v, &c.