Some background: the database on Alibaba Cloud is already on a purchased 2 TB disk, which has hit its size limit and cannot be expanded any further. That is why the rarely used data in the database is being archived out with this approach.
The main idea is:
1. Read each day's data in order and write it to a corresponding CSV file.
2. Automatically compress the CSV into a zip and delete the CSV.
3. Delete the data from the database.
Here is the main code:
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import pymssql
import csv
import datetime
import os
import sys
import zipfile


class RunBack:
    tableName = "VendorAPILog"
    headerName = ["ID", "AppSid", "RequestDateTime", "RequestConTime", "VendorAPIName",
                  "RequestUrl", "RequestContent", "ResponseContent",
                  "DateYear", "DateMonth", "DateDay"]

    def __init__(self, curdate):
        print(curdate)
        self.date = datetime.datetime.strptime(curdate, "%Y-%m-%d")
        self.maxid = 0      # highest id archived so far
        self.rowids = []    # [id, DateMonth] pairs of the current batch, used for the deletes
        self.dbaccess()
        # archives are written to a "dblog" directory next to this script
        self.filepath = os.path.join(os.path.dirname(os.path.realpath(__file__)), "dblog")

    def dbaccess(self):
        self.db = pymssql.connect(host="xxxxxxxxxx.sqlserver.rds.aliyuncs.com", port=3433,
                                  user="username", password="pwd",
                                  database="tablename", charset='utf8')
        self.cursor = self.db.cursor()

    def process(self):
        os.makedirs(self.filepath, exist_ok=True)
        self.count = self.reminecount()
        if self.count > 0:
            filename = "db_name_%s.csv" % self.date.strftime("%Y%m%d")
            fullfilename = os.path.join(self.filepath, filename)
            self.csvfile = open(fullfilename, 'w', newline='', encoding='utf-8')
            self.csvwrite = csv.writer(self.csvfile)
            self.appendheader()
            # keep pulling batches until no rows are left for this day
            while self.count > 0:
                self.appendrow()
                self.count = self.reminecount()
            self.cursor.close()
            self.db.close()
            self.csvfile.close()
            # compress the csv and remove it
            self.zip()

    def appendheader(self):
        self.csvwrite.writerow(self.headerName)

    def appendrow(self):
        # read the next batch of 2000 rows from the database
        print("fetching data")
        self.cursor.execute(
            "select top 2000 * from %s where DateYear=%d and DateMonth=%d and DateDay=%d and id>%d order by id"
            % (self.tableName, self.date.year, self.date.month, self.date.day, self.maxid))
        rows = self.cursor.fetchall()
        print(self.maxid)
        # write the batch to csv and remember the ids so they can be deleted
        for row in rows:
            self.maxid = max(row[0], self.maxid)
            self.rowids.append([row[0], row[9]])   # row[9] is DateMonth
            self.csvwrite.writerow(row)
        print(self.maxid)
        self.delrow()

    def reminecount(self):
        # rows of this day that have not been archived yet
        print("counting remaining rows")
        self.cursor.execute(
            "select count(*) from %s where DateYear=%d and DateMonth=%d and DateDay=%d and id>%d"
            % (self.tableName, self.date.year, self.date.month, self.date.day, self.maxid))
        row = self.cursor.fetchone()
        print(row[0])
        return row[0]

    def zip(self):
        zipname = "db_name_%s.zip" % self.date.strftime("%Y%m%d")
        csvname = zipname.replace("zip", "csv")
        fullzipname = os.path.join(self.filepath, zipname)
        fullcsvname = os.path.join(self.filepath, csvname)
        print(fullzipname)
        print(fullcsvname)
        with zipfile.ZipFile(fullzipname, 'w', compression=zipfile.ZIP_LZMA) as zf:
            zf.write(fullcsvname, csvname)
        os.remove(fullcsvname)

    def delrow(self):
        if len(self.rowids) > 0:
            for rowid in self.rowids:
                sql = "delete from %s where id=%d and DateMonth=%d;" % (self.tableName, rowid[0], rowid[1])
                print(sql)
                self.cursor.execute(sql)
            self.db.commit()
            self.rowids = []


run = RunBack(sys.argv[1])
run.process()
# run.zip()
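The script takes the date to archive as its only command-line argument, e.g. python archive_log.py 2018-01-01 (I'm assuming the file is saved as archive_log.py; use whatever name you actually gave it). To work through a longer backlog one day at a time, a small driver loop is enough; the snippet below is just a sketch, and the date range in it is made up:

# Hypothetical driver: run the archive script once per day over a date range.
# The script name "archive_log.py" and the range are assumptions, adjust them.
import datetime
import subprocess
import sys

start = datetime.date(2018, 1, 1)
end = datetime.date(2018, 1, 31)

day = start
while day <= end:
    print("archiving", day.isoformat())
    # same as running: python archive_log.py 2018-01-01
    subprocess.run([sys.executable, "archive_log.py", day.isoformat()], check=True)
    day += datetime.timedelta(days=1)

One design choice worth pointing out in the script itself: ZIP_LZMA compresses these text logs much harder than the default ZIP_DEFLATED at the cost of more CPU time, which is a reasonable trade for an archive that is written once and rarely read back.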
Finally, please note:
Do not let the archiving process put I/O pressure on the database, otherwise it will hurt the performance of your production data.
I'm running on Alibaba Cloud and have the indexes sorted out, so in my case the archiving process has almost no impact on the production database.
The Alibaba Cloud database disk does roughly 3000 IOPS with read/write throughput around 50 MB/s. Apart from the sheer data volume, the system is not under much load, and this particular database is a log database that is essentially write-only.
You will have to evaluate and weigh your own situation.
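If you find the archive job does press on disk I/O, one simple mitigation (not part of the script above, just a sketch) is to pause briefly between batches so the reads and deletes are spread out over time:

import time

BATCH_PAUSE_SECONDS = 0.5   # assumed value; tune it against your IOPS budget

def archive_with_throttle(job, pause=BATCH_PAUSE_SECONDS):
    # job is expected to expose reminecount()/appendrow() like RunBack above
    remaining = job.reminecount()
    while remaining > 0:
        job.appendrow()        # read 2000 rows, write them to csv, delete them
        time.sleep(pause)      # give the production workload room to breathe
        remaining = job.reminecount()

Dropping something like this in place of the while loop in process() keeps the batch size at 2000 rows but caps the rate at which batches hit the database.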