// Java实现文件查重去重 — duplicate-file detection and removal implemented in Java
package file;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.DecimalFormat;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;
/**
 * Scans the configured paths for duplicate files, prints every group of duplicates to
 * {@code System.err} and, when enabled, deletes the redundant copies or moves them to a
 * temporary folder.
 *
 * <p>Files are first grouped by size. Inside a same-size group, files smaller than
 * {@link #ALLOW_SIZE} are compared by MD5 digest; larger files are compared by size only and are
 * never removed unless {@link #FORCE_DELETEDUP} is set.
 */
public class Main {

    // ---------------------------------------------------------------------------------------
    // Configuration: edit the constants below before running.
    // ---------------------------------------------------------------------------------------

    /** Files or folders that are scanned for duplicates. */
    private static final String[] PATH = {
            "F:\\test",
            "F:\\test\\test1",
            "F:\\test2"
    };

    /**
     * Absolute paths (files or folders) that are skipped during the scan, e.g.
     * {@code "F:\\test\\test"}. {@code null} disables the exclusion check.
     */
    private static final List<String> EXCLUDE_PATH_LIST = new ArrayList<>(Arrays.asList(
            "F:\\test\\temp2"
    ));

    /**
     * Folder that removed duplicates are moved into. WARNING: when this is {@code null} or
     * empty, duplicates are deleted permanently instead of being moved.
     */
    private static final String TMP_DELETE_PATH = "F:\\testtmp";

    /**
     * Whether duplicates are removed at all. Only files whose MD5 was computed are removed,
     * unless {@link #FORCE_DELETEDUP} is set.
     */
    private static final boolean DELETEDUP = false;

    /** DANGEROUS: when {@code true}, files that were compared by size only may be removed too. */
    private static final boolean FORCE_DELETEDUP = false;

    /**
     * Index, inside each duplicate group, of the copy that is kept. An index outside the group
     * falls back to 0 so that one copy always survives.
     */
    private static final int NO_DELETE_INDEX = 0;

    /** A duplicate is only removed when its path contains this text; empty or null matches all. */
    private static final String DELETE_STR = "";

    /**
     * Only files smaller than this many bytes get an MD5 digest (hashing needs a full read of
     * the file and is slow on large files). 0 turns MD5 comparison off; {@code Long.MAX_VALUE}
     * hashes everything.
     */
    private static final long ALLOW_SIZE = 10L * 1024 * 1024;

    /** Smallest file size, in bytes, that is processed (inclusive), e.g. {@code 0L * 1024 * 1024}. */
    private static final long SIZE_INTERVAL_MIN = Long.MIN_VALUE;

    /** Largest file size, in bytes, that is processed (inclusive), e.g. {@code 1L * 1024 * 1024}. */
    private static final long SIZE_INTERVAL_MAX = Long.MAX_VALUE;

    /** Whether sub-folders are scanned as well. */
    private static final boolean RECURSION = true;

    /** User-defined extensions to accept, written in the same list format as the groups below. */
    private static final String CUSTOM_EXTENSIONS = "";

    /** Image extensions accepted by the scan. */
    private static final String IMAGE_EXTENSIONS = "bmp、jpg、jpeg、png、gif";

    /** Video extensions accepted by the scan. */
    private static final String VIDEO_EXTENSIONS = "mp4、3gp、avi、flv、mov、rmvb、wmv、mpg、mpeg、rm、ram、swf";

    /** Separator placed between the extension groups (the ideographic comma used inside them). */
    private static final String EXTENSION_SEPARATOR = "\u3001";

    /**
     * Extension whitelist applied to every scanned file; set to {@code null} to accept every
     * file. The groups are joined with a separator so the last entry of one group does not
     * fuse with the first entry of the next.
     */
    private static final String POINT_EXTENSIONS =
            IMAGE_EXTENSIONS + EXTENSION_SEPARATOR
                    + VIDEO_EXTENSIONS + EXTENSION_SEPARATOR
                    + CUSTOM_EXTENSIONS;

    /** Splits an extension list on ideographic commas, commas, semicolons and whitespace. */
    private static final Pattern EXTENSION_SPLITTER = Pattern.compile("[\u3001,;\\s]+");

    /**
     * Runs the scan over {@link #PATH}, prints and optionally removes the duplicates, then
     * prints the elapsed time in milliseconds to {@code System.err}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        long begin = System.currentTimeMillis();
        deal(duplicateRemove(PATH));
        System.err.println(System.currentTimeMillis() - begin);
    }

    /**
     * Prints every duplicate group and, when {@link #DELETEDUP} is set, removes the redundant
     * copies.
     *
     * @param map duplicate groups keyed by the path of the first file of each group
     */
    private static void deal(Map<String, List<FileMessage>> map) {
        int num = 1;
        for (Map.Entry<String, List<FileMessage>> entry : map.entrySet()) {
            System.err.println(num++ + " ========== " + entry.getKey() + " ==============================");
            List<FileMessage> group = entry.getValue();
            // An out-of-range keep index would otherwise delete every copy in the group.
            int keepIndex = (NO_DELETE_INDEX >= 0 && NO_DELETE_INDEX < group.size()) ? NO_DELETE_INDEX : 0;
            for (int i = 0; i < group.size(); i++) {
                FileMessage file = group.get(i);
                System.err.println(file);
                if (DELETEDUP && i != keepIndex && file.isCanDelete() && matchesDeleteFilter(file.getPath())) {
                    discard(file.getPath());
                }
            }
            System.err.println("================================");
        }
    }

    /** Returns whether {@code path} passes the {@link #DELETE_STR} filter. */
    private static boolean matchesDeleteFilter(String path) {
        return DELETE_STR == null || path.contains(DELETE_STR);
    }

    /**
     * Moves the file into {@link #TMP_DELETE_PATH} (creating that folder when needed), or
     * deletes it permanently when no temporary folder is configured. Failures are reported on
     * {@code System.err} and do not stop the run.
     */
    private static void discard(String filePath) {
        Path source = Paths.get(filePath);
        try {
            if (TMP_DELETE_PATH != null && !TMP_DELETE_PATH.isEmpty()) {
                Path trashDir = Paths.get(TMP_DELETE_PATH);
                Files.createDirectories(trashDir);
                // Flatten the original path into a single file name inside the temp folder.
                String flattened = filePath.replace(":", "-").replace(File.separator, "_");
                Files.move(source, trashDir.resolve(flattened));
            } else {
                Files.delete(source);
            }
        } catch (IOException e) {
            System.err.println("Failed to remove " + filePath + ": " + e);
        }
    }

    /**
     * Collects the files under {@code path} and groups the duplicates.
     *
     * @param path files or folders to scan
     * @return groups of two or more duplicate files, keyed by the path of the first file of the
     *         group and sorted by that key
     */
    private static Map<String, List<FileMessage>> duplicateRemove(String[] path) {
        Map<String, List<FileMessage>> mapResult = new TreeMap<>();
        List<FileMessage> fileMessageList = new ArrayList<>();
        searchFile(path, fileMessageList);
        // Several roots may overlap, so drop files that were reached more than once.
        if (path.length > 1) {
            fileMessageList = new ArrayList<>(new LinkedHashSet<>(fileMessageList));
        }
        Map<String, List<FileMessage>> bySize =
                fileMessageList.stream().collect(Collectors.groupingBy(FileMessage::getSize));
        for (List<FileMessage> sameSize : bySize.values()) {
            if (sameSize.size() < 2) {
                continue;
            }
            Map<String, List<FileMessage>> byDigest = new LinkedHashMap<>();
            for (FileMessage file : sameSize) {
                // Files that are too large to hash are compared by size only.
                String key = file.getSize();
                if (file.getSizeLong() < ALLOW_SIZE) {
                    key = calculationMD5(file.getPath());
                    if (key == null) {
                        // Unreadable file: never report or delete it as a duplicate.
                        continue;
                    }
                    file.setCanDelete(true);
                }
                file.setMd5(key);
                byDigest.computeIfAbsent(key, k -> new ArrayList<>()).add(file);
            }
            for (List<FileMessage> duplicates : byDigest.values()) {
                if (duplicates.size() > 1) {
                    mapResult.put(duplicates.get(0).getPath(), duplicates);
                }
            }
        }
        return mapResult;
    }

    /**
     * Walks every entry of {@code filePath} iteratively (breadth-first) and registers the
     * matching files. Excluded paths, unreadable folders and paths that do not exist are
     * skipped.
     *
     * @param filePath        files or folders to scan
     * @param fileMessageList receives one entry per accepted file
     */
    private static void searchFile(String[] filePath, List<FileMessage> fileMessageList) {
        Deque<File> pending = new ArrayDeque<>();
        for (String root : filePath) {
            if (isExcluded(root)) {
                continue;
            }
            File file = new File(root);
            if (file.isDirectory()) {
                pending.add(file);
                while (!pending.isEmpty()) {
                    File dir = pending.removeFirst();
                    // listFiles() returns null on I/O errors or when access is denied.
                    File[] children = dir.listFiles();
                    if (children == null) {
                        System.err.println("Cannot list directory, skipped: " + dir);
                        continue;
                    }
                    for (File child : children) {
                        if (isExcluded(child.getAbsolutePath())) {
                            continue;
                        }
                        if (child.isDirectory()) {
                            if (RECURSION) {
                                pending.add(child);
                            }
                        } else {
                            extracted(fileMessageList, child);
                        }
                    }
                }
            } else if (file.isFile()) {
                extracted(fileMessageList, file);
            } else {
                System.err.println("Path does not exist, skipped: " + root);
            }
        }
    }

    /** Returns whether {@code path} is listed in {@link #EXCLUDE_PATH_LIST}. */
    private static boolean isExcluded(String path) {
        return EXCLUDE_PATH_LIST != null && EXCLUDE_PATH_LIST.contains(path);
    }

    /**
     * Registers {@code f} when its extension is whitelisted (or no whitelist is configured) and
     * its size lies inside the configured size interval.
     */
    private static void extracted(List<FileMessage> fileMessageList, File f) {
        if (POINT_EXTENSIONS != null && !isAppointFile(f.getName(), POINT_EXTENSIONS)) {
            return;
        }
        long fileSize = f.length();
        if (fileSize < SIZE_INTERVAL_MIN || fileSize > SIZE_INTERVAL_MAX) {
            return;
        }
        fileMessageList.add(new FileMessage(f.getAbsolutePath(), String.valueOf(fileSize), f.getName()));
    }

    /**
     * Computes the MD5 digest of a file with the JDK's {@link MessageDigest}. Slow on large
     * files because the whole file has to be read.
     *
     * @param path file to hash
     * @return the digest as 32 lowercase hexadecimal characters, or {@code null} when the file
     *         cannot be read
     */
    private static String calculationMD5(String path) {
        try (FileInputStream in = new FileInputStream(path)) {
            MessageDigest digest = MessageDigest.getInstance("MD5");
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                digest.update(buffer, 0, read);
            }
            return toHex(digest.digest());
        } catch (IOException | NoSuchAlgorithmException e) {
            System.err.println("Failed to compute MD5 for " + path + ": " + e);
            return null;
        }
    }

    /** Encodes bytes as lowercase hexadecimal text. */
    private static String toHex(byte[] bytes) {
        StringBuilder hex = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            hex.append(Character.forDigit((b >> 4) & 0xF, 16));
            hex.append(Character.forDigit(b & 0xF, 16));
        }
        return hex.toString();
    }

    /**
     * Tells whether the extension of {@code fileName} is one of the listed extensions.
     *
     * <p>The list is split on ideographic commas, commas, semicolons and whitespace, and each
     * entry is compared with the whole extension, ignoring case. A name without an extension
     * (no dot, or a trailing dot) never matches.
     *
     * @param fileName       file name to test
     * @param imageExtension separated list of accepted extensions, without leading dots
     * @return {@code true} when the file's extension equals one entry of the list
     */
    public static boolean isAppointFile(String fileName, String imageExtension) {
        if (fileName == null || imageExtension == null) {
            return false;
        }
        int dot = fileName.lastIndexOf('.');
        if (dot < 0 || dot == fileName.length() - 1) {
            return false;
        }
        String extension = fileName.substring(dot + 1);
        for (String allowed : EXTENSION_SPLITTER.split(imageExtension)) {
            if (allowed.equalsIgnoreCase(extension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Formats a byte count with a binary unit (B, KB, MB, GB, TB, PB, EB) and at most one
     * decimal digit.
     *
     * @param sizeStr size in bytes as decimal text
     * @return the formatted size, or {@code "0"} when the size is zero or negative
     * @throws NumberFormatException when {@code sizeStr} is not a valid {@code long}
     */
    public static String readableFileSize(String sizeStr) {
        long size = Long.parseLong(sizeStr);
        if (size <= 0) {
            return "0";
        }
        final String[] units = new String[]{"B", "KB", "MB", "GB", "TB", "PB", "EB"};
        double value = size;
        int unitIndex = 0;
        // Repeated division never runs past the unit table, unlike a logarithm-derived index.
        while (value >= 1024 && unitIndex < units.length - 1) {
            value /= 1024;
            unitIndex++;
        }
        return new DecimalFormat("#,##0.#").format(value) + units[unitIndex];
    }

    /**
     * Description of one scanned file. Two instances are equal when their paths are equal.
     */
    static class FileMessage {
        /** Absolute path of the file. */
        private String path;
        /** File name without the folder part. */
        private String fileName;
        /** Size in bytes as decimal text. */
        private String size;
        /** MD5 digest, or the size text when the file was not hashed. */
        private String md5;
        /** Whether the file may be removed; starts as {@link #FORCE_DELETEDUP}. */
        private boolean isCanDelete = FORCE_DELETEDUP;

        public FileMessage() {
        }

        public FileMessage(String path, String size, String fileName) {
            this.path = path;
            this.size = size;
            this.fileName = fileName;
        }

        public String getPath() {
            return path;
        }

        public void setPath(String path) {
            this.path = path;
        }

        public String getSize() {
            return size;
        }

        /**
         * @return the size in bytes
         * @throws NumberFormatException when the stored size is missing or not numeric
         */
        public long getSizeLong() {
            return Long.parseLong(size);
        }

        public void setSize(String size) {
            this.size = size;
        }

        public String getMd5() {
            return md5;
        }

        public void setMd5(String md5) {
            this.md5 = md5;
        }

        public String getFileName() {
            return fileName;
        }

        public void setFileName(String fileName) {
            this.fileName = fileName;
        }

        public boolean isCanDelete() {
            return isCanDelete;
        }

        public void setCanDelete(boolean canDelete) {
            isCanDelete = canDelete;
        }

        @Override
        public String toString() {
            // A file built with the no-arg constructor has no size yet; avoid parsing null.
            String readable = (size == null) ? null : readableFileSize(size);
            return "{" +
                    "path='" + path + '\'' +
                    ", size='" + size + '\'' +
                    ", size换算='" + readable + '\'' +
                    ", md5='" + md5 + '\'' +
                    ", 计算了md5=" + isCanDelete +
                    '}';
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof FileMessage)) {
                return false;
            }
            return Objects.equals(path, ((FileMessage) obj).path);
        }

        @Override
        public int hashCode() {
            return Objects.hashCode(path);
        }
    }
}