Archiving Alibaba Cloud SQL Server Data with Python

Date: 2022-09-23 08:17:32

The background: we have already purchased 2 TB of disk for the database on Alibaba Cloud, the disk is full, and it cannot be expanded any further. So the rarely used data in the database is being archived out of it this way.

The main approach is:

1. Read each day's rows in order and write them out as a CSV file.

2. Automatically compress the CSV into a ZIP, then delete the CSV.

3. Delete the archived rows from the database.

 

Here is the main code:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pymssql
import csv
import datetime
import os, sys
import zipfile

class RunBack:
    tableName = "VendorAPILog"
    headerName = ["ID","AppSid","RequestDateTime","RequestConTime","VendorAPIName","RequestUrl","RequestContent","ResponseContent","DateYear","DateMonth","DateDay"]

    def __init__(self, curdate):
        print(curdate)
        self.date = datetime.datetime.strptime(curdate, "%Y-%m-%d")
        self.maxid = 0     # highest archived id so far; each batch resumes after it
        self.rowids = []   # (id, DateMonth) pairs of the current batch, pending deletion
        self.dbaccess()
        self.filepath = os.path.dirname(os.path.realpath(__file__))
        self.filepath = os.path.join(self.filepath, "dblog")
    
    def dbaccess(self):
        # placeholder connection details; fill in your own host, user, password and database name
        self.db = pymssql.connect(host="xxxxxxxxxx.sqlserver.rds.aliyuncs.com", port=3433, user="username", password="pwd", database="dbname", charset='utf8')
        self.cursor = self.db.cursor()

    def process(self):
        # make sure the output directory exists
        if not os.path.exists(self.filepath):
            os.makedirs(self.filepath)

        self.count = self.reminecount()
        if self.count > 0:
            filename = "db_name_%s.csv" % (self.date.strftime("%Y%m%d"))
            fullfilename = os.path.join(self.filepath, filename)
            self.csvfile = open(fullfilename, 'w', newline='', encoding='utf-8')
            self.csvwrite = csv.writer(self.csvfile)
            self.appendheader()

            # drain the day batch by batch until no rows are left
            while self.count > 0:
                self.appendrow()
                self.count = self.reminecount()

            self.csvfile.close()
            # compress and remove the csv
            self.zip()

        self.cursor.close()
        self.db.close()


    def appendheader(self):
        self.csvwrite.writerow(self.headerName)

    def appendrow(self):
        # pull the next batch of up to 2000 rows, ordered by id so maxid advances
        print("fetching rows")
        self.cursor.execute("select top 2000 * from %s where DateYear=%d and DateMonth=%d and DateDay=%d and id>%d order by id" % (self.tableName, self.date.year, self.date.month, self.date.day, self.maxid))
        rows = self.cursor.fetchall()

        print(self.maxid)
        # write the batch to csv and remember (id, DateMonth) for deletion
        for row in rows:
            self.maxid = max(row[0], self.maxid)

            self.rowids.append([row[0], row[9]])
            self.csvwrite.writerow(row)

        print(self.maxid)
        self.delrow()

    def reminecount(self):
        # count how many rows of the target day are still left (id above maxid)
        print("counting remaining rows")
        self.cursor.execute("select count(*) from %s where DateYear=%d and DateMonth=%d and DateDay=%d and id>%d" % (self.tableName, self.date.year, self.date.month, self.date.day, self.maxid))
        row = self.cursor.fetchone()
        print(row[0])
        return row[0]
    
    def zip(self):
        # compress the finished csv with LZMA, then remove the csv
        filename = "db_name_%s.zip" % (self.date.strftime("%Y%m%d"))
        fullfilename = os.path.join(self.filepath, filename)
        print(fullfilename)
        csvname = filename.replace(".zip", ".csv")
        csvpath = os.path.join(self.filepath, csvname)
        print(csvpath)
        with zipfile.ZipFile(fullfilename, 'w', compression=zipfile.ZIP_LZMA) as zipf:
            # store the csv under its bare name inside the archive
            zipf.write(csvpath, csvname)

        os.remove(csvpath)

    def delrow(self):
        # delete the rows that were just written to the csv, then commit the batch;
        # DateMonth is kept in the predicate alongside id to match the table's date columns
        if len(self.rowids) > 0:
            for rowid in self.rowids:
                sql = "delete from %s where id=%d and DateMonth=%d;" % (self.tableName, rowid[0], rowid[1])
                print(sql)
                self.cursor.execute(sql)
            self.db.commit()
            self.rowids = []


if len(sys.argv) < 2:
    print("usage: %s YYYY-MM-DD" % sys.argv[0])
    sys.exit(1)

run = RunBack(sys.argv[1])
run.process()
#run.zip()
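
Each run archives a single day. If you need to backfill many days, a small driver can walk a date range. This is a minimal sketch (the start and end dates are made-up values), assuming RunBack from the script above is in scope:

import datetime

start = datetime.date(2021, 1, 1)     # assumption: first day to archive
end = datetime.date(2021, 12, 31)     # assumption: last day to archive

day = start
while day <= end:
    # one RunBack per day: it opens its own connection and closes it in process()
    run = RunBack(day.strftime("%Y-%m-%d"))
    run.process()
    day += datetime.timedelta(days=1)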

Finally, a few things worth noting:

Don't let the archiving process put IO pressure on the database, or it will hurt the performance of production workloads.
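
One simple way to bound that pressure, offered as a sketch rather than what I actually run: put a short sleep between batches so the reads and deletes never saturate the disk. The pause value is an assumption you would tune against your own instance; the loop below is the batch loop from process() with the sleep added:

import time

BATCH_PAUSE = 0.5    # seconds; assumed value, tune to your IOPS headroom

while self.count > 0:
    self.appendrow()
    self.count = self.reminecount()
    time.sleep(BATCH_PAUSE)    # let production traffic breathe between batches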

In my case, on Alibaba Cloud and with the indexes already taken care of, the archiving run had almost no impact on the production database.
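
For reference, the batch queries filter on DateYear, DateMonth, DateDay and id, so an index covering those columns is what keeps each batch cheap. A hypothetical one-off setup (the index name and column order are my assumptions, not the actual DDL used):

import pymssql

# same placeholder connection details as the main script
conn = pymssql.connect(host="xxxxxxxxxx.sqlserver.rds.aliyuncs.com", port=3433,
                       user="username", password="pwd", database="dbname")
cur = conn.cursor()
# composite index matching the where clause of the batch select/count queries
cur.execute("CREATE INDEX IX_VendorAPILog_ArchiveDate ON VendorAPILog (DateYear, DateMonth, DateDay, ID)")
conn.commit()
conn.close()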

The Alibaba Cloud database disk delivers around 3,000 IOPS and roughly 50 MB/s of read/write throughput. Apart from the sheer data volume, the system is not under much load, and this particular database is a log store that is written constantly but almost never read.

 

Your situation will differ, so evaluate and measure it yourself.