AudioCompare比较两个音频的相似度
项目最近遇到一个问题:有些音频文件的大小仅相差几个字节,因此文件的 MD5 值不一样,但实际上它们是同一段音频,需求就是要把这样的音频找出来。在网上搜索后,发现 AudioCompare 这个库可以比较两个音频的相似度。
项目地址:https://github.com/charlesconnell/AudioCompare
使用方法也比较简单:
./audiocompare -f file1 -f file2
./audiocompare -f file1 -d dir1
./audiocompare -d dir1 -f file1
./audiocompare -d dir1 -d dir2
如果没有匹配成功,会输出 "NO MATCH";如果匹配成功,会输出 "MATCH …" 以及匹配的 score。
测试代码如下:
# -*-coding:utf8-*-
import os
import re
import subprocess
def readTxt(path, ignore):
    """Read a text file and return its lines, skipping unwanted ones.

    Args:
        path: path of the text file to read.
        ignore: substring marking lines to drop; every line that contains
            it anywhere is left out of the result.

    Returns:
        A list with one string per kept line, in file order, with the
        trailing line terminator removed.
    """
    contentList = []
    # The context manager closes the handle even if reading fails.
    with open(path, "r") as txtFile:
        for line in txtFile:
            # Also drop a stray '\r' left over from CRLF files.
            line = line.rstrip('\r\n')
            if ignore not in line:
                contentList.append(line)
    return contentList
def compareAudio(fileNameList, compareFile):
    """Compare every pair of audio files with AudioCompare and log the results.

    Each unordered pair from fileNameList is passed to
    ``python AudioCompare-master/main.py -f A -f B`` and the first line the
    tool prints is appended to compareFile. Pairs that already appear in
    compareFile are skipped, so an interrupted run can be resumed.

    Args:
        fileNameList: paths of the audio files to compare pairwise.
        compareFile: path of the result file. It is created, together with
            its parent directory, when missing; a new file starts with a
            '#' header line.
    """
    # Create the directory holding the result file. A bare file name has no
    # directory part, in which case there is nothing to create.
    recordDir = os.path.dirname(compareFile)
    if recordDir and not os.path.isdir(recordDir):
        os.makedirs(recordDir)

    # Load earlier results before the file is opened for appending.
    alreadyRecorded = os.path.exists(compareFile)
    contentList = readTxt(compareFile, '#') if alreadyRecorded else []
    # NOTE(review): the skip check treats each recorded line as '|'-separated
    # fields containing both paths -- confirm this matches what AudioCompare
    # actually prints.
    recordedFields = [content.split('|') for content in contentList]

    with open(compareFile, 'a') as fout:
        if not alreadyRecorded:
            fout.write('# 记录音频比较结果的文件\n')
        for index, fileName in enumerate(fileNameList):
            for compareFileName in fileNameList[index + 1:]:
                # Skip pairs that were compared in an earlier run.
                if any(fileName in fields and compareFileName in fields
                       for fields in recordedFields):
                    print('已存在:' + fileName + '|' + compareFileName)
                    continue
                # An argument list (no shell) keeps file names containing
                # spaces or shell metacharacters intact.
                p = subprocess.Popen(
                    ['python', 'AudioCompare-master/main.py',
                     '-f', fileName, '-f', compareFileName],
                    stdout=subprocess.PIPE, universal_newlines=True)
                output, _ = p.communicate()
                outputLines = output.splitlines()
                if not outputLines:
                    # The tool printed nothing; there is no result to record.
                    continue
                fout.write(outputLines[0] + '\n')
def getFileSize(fileName):
    """Return the size of fileName in bytes, or None if it cannot be read.

    A failure is printed instead of raised; callers must handle None.
    """
    try:
        return os.path.getsize(fileName)
    except OSError as err:
        print(err)
        return None


def sortedNearFile(fileNamesList, level):
    """Group files whose sizes are close to each other.

    Args:
        fileNamesList: paths of the files to examine. Duplicate paths are
            counted once; files whose size cannot be read are skipped.
        level: maximum size difference, in bytes, for two files to count
            as similar.

    Returns:
        A list of groups ordered by ascending size. Each group is a list of
        (fileName, size) tuples, e.g.
        [[('a.wav', 3715364), ('b.wav', 3715370)], [('c.wav', 9000000)]].
        Only files within ``level`` bytes of a size-neighbour are returned.
        A file joins the current group while it is within ``level`` bytes of
        that group's first (smallest) file, so a group can hold one file.
    """
    # Collect the size of every readable file.
    fileSizeDict = dict()
    for fileName in fileNamesList:
        if fileName in fileSizeDict:
            continue
        size = getFileSize(fileName)
        if size is not None:
            fileSizeDict[fileName] = size

    # Sort by size so that similar files end up next to each other.
    sortedFileSizeList = sorted(fileSizeDict.items(), key=lambda item: item[1])

    # Keep each file that is within `level` of its left or right neighbour.
    # The first and last entries have only one neighbour but still count.
    # Differences are non-negative because the list is sorted ascending.
    lastIndex = len(sortedFileSizeList) - 1
    nearFileList = []
    for index, (fileName, size) in enumerate(sortedFileSizeList):
        nearPrev = index > 0 and size - sortedFileSizeList[index - 1][1] <= level
        nearNext = index < lastIndex and sortedFileSizeList[index + 1][1] - size <= level
        if nearPrev or nearNext:
            nearFileList.append((fileName, size))

    # Split the survivors into groups anchored on each group's first file.
    sortedNearFileList = []
    for item in nearFileList:
        if sortedNearFileList and item[1] - sortedNearFileList[-1][0][1] <= level:
            sortedNearFileList[-1].append(item)
        else:
            sortedNearFileList.append([item])
    return sortedNearFileList
def getFileNames(dirPath, suffix):
    """Return the paths of the entries in dirPath whose extension is suffix.

    Args:
        dirPath: directory to list; sub-directories are not descended into.
        suffix: extension to match, including the leading dot ('.wav').
            The comparison is exact, so it is case-sensitive.

    Returns:
        A list of paths, each dirPath joined with a matching entry name,
        in the order os.listdir reports them.
    """
    # os.path.join avoids a doubled separator when dirPath already ends
    # with one and uses the platform's separator.
    return [os.path.join(dirPath, fileName)
            for fileName in os.listdir(dirPath)
            if os.path.splitext(fileName)[1] == suffix]
if __name__ == '__main__':
    # Scan the current directory for .wav files, group those whose sizes
    # differ by at most 100 KiB, and compare the members of each group.
    wavFiles = getFileNames(os.path.curdir, '.wav')
    for sizeGroup in sortedNearFile(wavFiles, 100 * 1024):
        # A group with a single file has nothing to be compared against.
        if len(sizeGroup) <= 1:
            continue
        groupPaths = [entry[0] for entry in sizeGroup]
        compareAudio(groupPaths, './compare.txt')
把匹配结果记录到文件中