Python3.7 比较两个Excel文件指定列的值的异同，并将核对结果写入Excel中（含升级版本）

背景：

最近工作中需要核对客户的历史数据，

接近400个产品，需要核对的列有15列，行数有8000+行

肉眼核对简直要吐血

心想着反正在学python呢

人生苦短

何不用python写个脚本

帮助我核对

我省出时间来做些更有意义的事情呢

前提：

我需要核对的两份Excel有以下特点：

有共同的主键
两份Excel的结构不尽相同，仅需要核对源文件47列中的15列，这些列在目标文件中都能找到

主要思路：

将两个文件的主键作为键，将要核对的列作为值，保存到字典中

然后再进行比较

实现代码：

import xlrd
import xlwt
import time


origin_dict = {}  # 初始化，用于保存源文件{key(主键):value(需要核对的列值)}
target_dict = {}  # 初始化，用于保存目标文件{key(主键):value(需要核对的列值)}

# 放在函数外部声明 会报错 还不晓得原因
# success = 0
# fail = 0


def compare_excel(ori_path,tar_path,col_ori,col_tar):
    \'\'\'
    ori_path:用于存放源文件
    tar_path:用于存放目标文件
    col_ori:源文件中需要比较的列
    col_tar:目标文件中需要比较的列
    \'\'\'
    success = 0  # 核对无差异的个数
    fail = 0  # 核对结果有差异的个数
    row_res = 0  #要写入的文件的行

    # 分别打开源文件与目标文件
    wb_ori = xlrd.open_workbook(ori_path)
    wb_tar = xlrd.open_workbook(tar_path)
    # 新建一个excel，用于存放核对结果
    wb_res = xlwt.Workbook()  

    # 分别获取源文件与目标文件的sheet
    sheet_ori = wb_ori.sheet_by_index(0)
    sheet_tar = wb_tar.sheet_by_index(0)
    # 新建一名称为result的sheet页，用于存放核对具体结果
    sheet_res = wb_res.add_sheet(\'result\')

    # 获取源文件中由主键、需核对列组成的键值对，放入字典中
    for row_ori in range(1,sheet_ori.nrows):
        cell_ori_key = sheet_ori.cell_value(row_ori,0)  #因我的源文件的主键位于第0列，故该列未参数化，大家可以视自己实际情况进行优化
        cell_ori_value = sheet_ori.cell_value(row_ori,col_ori)
        origin_dict[cell_ori_key] = cell_ori_value
    # print(\'源文件获取成功\')
    # print(\'源文件如下：%s\' % origin_dict)

    # 获取目标文件中由主键、待核对列组成的键值对，放入字典中
    for row_tar in range(1,sheet_tar.nrows):
        cell_tar_key = sheet_tar.cell_value(row_tar,0)
        cell_tar_value = sheet_tar.cell_value(row_tar,col_tar)
        target_dict[cell_tar_key] = cell_tar_value
    # print(\'目标文件获取成功\')
    # print(\'目标文件如下：%s\' % target_dict)

    # 核对逻辑
    try:
        for i in origin_dict.keys():  # 获取源文件字典的键
            if target_dict.get(i) == origin_dict.get(i):  # 对比两个字典中相同键的值
                success += 1  # 值相等，则无差异数+1
                sheet_res.write(row_res+1,0,i)  # 将键写入结果文件的第0列
                sheet_res.write(row_res+1,1,\'你俩长一样\')  #将核对无差异结果写入结果文件的第1列
                row_res += 1  # 结果文件行数+1
                print(\'金融产品 %s 核对无差异\'% i)
            else:
                fail +=1  # 值不相等，则有差异数+1
                sheet_res.write(row_res+1,0,i) 
                sheet_res.write(row_res+1,1,\'核对有差异：源文件的值为：%s，目标文件的值为:%s\' % (origin_dict.get(i),target_dict.get(i)))  # #将核对有差异结果写入结果文件的第1列
                row_res += 1  # 结果文件行数+1
                print(\'金融产品 %s 核对有差异：源文件的值为：%s，目标文件的值为:%s\' % (i,origin_dict.get(i),target_dict.get(i)))
                wb_res.save(\'result.xlsx\')  # 保存结果文件
        print(time.strftime(\'%Y-%m-%d %H-%M-%S\',time.localtime())+\'核对完成，共核对 %d 条，其中无差异 %d 条， 有差异 %d条\' % (len(origin_dict),success,fail))
    except Exception as error:
        print(str(error))


compare_excel(\'C111111.xlsx\',\'O222222.xlsx\',3,2)  # 核对源文件第4列，目标文件第3列

后续：

代码还有很多优化的空间：

比如说源文件中有的键目标文件中没有，如何提示？

比如说能否一次比较多个列的值

比如扩展成双主键/多主键，该如何比对？

如果你有方法或思路，可以跟我这个小白一起交流

*************************************************************************************************************************************************************************************************

上面说的优化空间，今天解决了多主键的问题

多主键问题解决的关键

是把多主键拼接成一个字符串，然后再作为字典的键

进而比较其值

优化的点还包括：

将核对结果追加写入源文件中

而非新建一个Excel

具体代码如下：

import xlrd
import xlwt
import xlutils
import time
from datetime import datetime
from xlrd import xldate_as_tuple
from xlutils import copy

dict_ori = {}  # 初始化，用于保存源文件{key(主键):value(需要核对的列值)}
dict_tar = {}  # 初始化，用于保存目标文件{key(主键):value(需要核对的列值)}

def Excel_Compare(ori_path,tar_path,sheet_index_ori,col_ori,col_tar):
    \'\'\'
    ori_path:源文件地址
    tar_path:目标文件地址
    sheet_index_ori:源文件中待核对的sheet的索引
    col_ori:源文件中待核对的列
    col_tar:目标文件中待核对的列
    \'\'\'
    success = 0  
    fail = 0
    space = 0  # 空行个数
    row_res = 0
    # 分别打开源文件与目标文件
    wb_ori = xlrd.open_workbook(ori_path,\'w+\')
    wb_tar = xlrd.open_workbook(tar_path)
    # 新建一个excel，用于存放核对结果
    # wb_res = xlwt.Workbook()  
    wb_res = copy.copy(wb_ori)  # 在源文件中追加写入

    # 分别获取源文件与目标文件的sheet
    sheet_ori = wb_ori.sheet_by_index(sheet_index_ori)
    sheet_tar = wb_tar.sheet_by_index(0)
    # 新建一名称为result的sheet页，用于存放核对具体结果
    # sheet_res = wb_res.add_sheet(\'result\')
    sheet_res = wb_res.get_sheet(sheet_index_ori)  # 在原sheet中追加写入

        # 获取源文件中由主键、需核对列组成的键值对，放入字典中
    for row_ori in range(1,sheet_ori.nrows):
        product_id_ori = sheet_ori.cell_value(row_ori,1)  
        # print(product_id_ori)
        product_name_ori = sheet_ori.cell_value(row_ori,4).split(\'-\')[1]  # 源文件中账套名称为：XX-xx形式，因核对时只需要后面的xx，故需要做一下处理
        # product_name_ori_ctype = sheet_ori.cell(row_ori,4).ctype
        # print(product_name_ori)
        # print(product_name_ori_ctype)
        # 以下3行代码将从Excel读出的日期数据进行转化并格式化
        date_ori_1 = sheet_ori.cell_value(row_ori,11)
        date_ori_2 = datetime(*xldate_as_tuple(date_ori_1,0))  
        date_ori = date_ori_2.strftime(\'%Y-%m-%d\')
        # print(date_ori)
        key_ori_list = [product_id_ori,product_name_ori,date_ori]
        key_ori = \'--\'.join(key_ori_list)  # 关键点，将多主键拼接成字符串
        # print(key_ori)
        income_ori_1 = sheet_ori.cell_value(row_ori,col_ori)  
        income_ori = round(float(income_ori_1),2)  # 读出来的数据为str类型，需转化为float类型，方便进行处理，注意源文件中不能有空行
        # dict_ori = {key_ori:income_ori}  # 该方式最终仅保存一次，不是想要的结果
        dict_ori[key_ori] = income_ori  # 将提取出的关键信息追加保存为字典格式
        # print("源文件数据获取成功")
        # print(dict_ori)

    #     获取源文件中由主键、需核对列组成的键值对，放入字典中
    for row_tar in range(1,sheet_tar.nrows):
        product_id_tar = sheet_tar.cell_value(row_tar,1)
        product_name_tar = sheet_tar.cell_value(row_tar,5)
        date_tar_1 = sheet_tar.cell_value(row_tar,15)
        key_tar_list = [product_id_tar,product_name_tar,date_tar_1]
        key_tar = \'--\'.join(key_tar_list)
        income_tar_1 = sheet_tar.cell_value(row_tar,col_tar)
        income_tar = round(float(income_tar_1),2)
        # income_tar_ctype = sheet_tar.cell(row_tar,19).ctype
        # print(income_tar_ctype)
        # dict_tar = {key_tar:income_tar}
        dict_tar[key_tar] = income_tar
        # print("目标文件数据获取成功")
        # print(dict_tar)

    # 核对逻辑
    try:
        for i in dict_ori.keys():
            # print(type(dict_tar.get(i)))
            # income_ori_float = float(dict_ori.get(i))
            # print(i)
            if dict_tar.get(i)  == dict_ori.get(i):  # 无差异的情况
                success += 1
                product_id_res = i.split(\'--\')[0]
                product_name_res = i.split(\'--\')[1]
                date_res = i.split(\'--\')[2]
                sheet_res.write(row_res+1,20,product_id_res)
                sheet_res.write(row_res+1,21,product_name_res)
                sheet_res.write(row_res+1,22,date_res)
                sheet_res.write(row_res+1,23,\'核对无误\')
                row_res += 1 
                print(\'金融产品:%s，账套:%s，日期：%s的收益数据核对无差异\' % (product_id_res,product_name_res,date_res))
            elif dict_ori.get(i) == 0.00 and dict_tar.get(i) == None:  #有空值的情况
                space += 1
                product_id_res = i.split(\'--\')[0]
                product_name_res = i.split(\'--\')[1]
                date_res = i.split(\'--\')[2]
                sheet_res.write(row_res+1,20,product_id_res)
                sheet_res.write(row_res+1,21,product_name_res)
                sheet_res.write(row_res+1,22,date_res)
                sheet_res.write(row_res+1,23,\'空值\') 
                row_res += 1
                print(\'金融产品:%s，账套:%s，日期：%s的数据为空\' % (product_id_res,product_name_res,date_res))
            else:  # 核对有差异的情况
                fail += 1
                product_id_res = i.split(\'--\')[0]
                product_name_res = i.split(\'--\')[1]
                date_res = i.split(\'--\')[2]
                sheet_res.write(row_res+1,20,product_id_res)
                sheet_res.write(row_res+1,21,product_name_res)
                sheet_res.write(row_res+1,22,date_res)
                sheet_res.write(row_res+1,23,\'数值有差异，源文件收益为：%s,目标文件的收益为:%s\'%(dict_ori.get(i),dict_tar.get(i)))
                row_res += 1
                print(\'金融产品:%s，账套:%s，日期：%s 的收益数据核对有差异，源文件的收益为%s,目标文件的收益为%s\'%(product_id_res,product_name_res,date_res,dict_ori.get(i),dict_tar.get(i)))
    except Exception as error:
        print(str(error))
    wb_res.save(ori_path)  # 保存源文件
    print(time.strftime(\'%Y-%m-%d %H:%M:%S\',time.localtime())+\'核对完成，共核对%d条，其中，无差异%d条，有差异%d条\'%(success+fail,success,fail))


Excel_Compare(\'CashSequence111.xlsx\',\'CashSequence222.xlsx\',3,16,19)