Java实现文件查重去重

时间:2025-03-10 07:01:09
package file; import org.apache.commons.codec.digest.DigestUtils; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.text.DecimalFormat; import java.util.*; import java.util.stream.Collectors; public class Main { // 配置代码块 static { PATH = new String[]{ "F:\\test", "F:\\test\\test1", "F:\\test2" }; EXCLUDE_PATH_LIST = new ArrayList(Arrays.asList( // "F:\\test\\test", "F:\\test\\temp2" )); // 删除相关配置 TMP_DELETE_PATH = "F:\\testtmp"; DELETEDUP = false; // DELETEDUP = true; FORCE_DELETEDUP = false; NO_DELETE_INDEX = 0; DELETE_STR = ""; // 文件筛选范围相关配置 // ALLOW_SIZE = Long.MAX_VALUE; ALLOW_SIZE = 10 * 1024 * 1024; SIZE_INTERVAL_MIN = Long.MIN_VALUE; // SIZE_INTERVAL_MIN = 0*1024*1024; SIZE_INTERVAL_MAX = Long.MAX_VALUE; // SIZE_INTERVAL_MAX = 1*1024*1024; RECURSION = true; // 图片过滤后缀 视频过滤后缀 自定义过滤后缀 String CUSTOM_EXTENSIONS = ""; String IMAGE_EXTENSIONS = "bmp、jpg、jpeg、png、gif"; String VIDEO_EXTENSIONS = "mp4、3gp、avi、flv、mov、rmvb、wmv、mpg、mpeg、rm、ram、swf"; POINT_EXTENSIONS = IMAGE_EXTENSIONS + VIDEO_EXTENSIONS + CUSTOM_EXTENSIONS; } /** * 去重文件路径集合 */ private static String[] PATH; /** * 排除去重文件路径集合 */ private static List EXCLUDE_PATH_LIST; /** * !!!不设置将彻底删除 逻辑删除的文件夹路径 */ private static String TMP_DELETE_PATH; /** * 设置 null 不过滤 过滤的后缀集 */ private static String POINT_EXTENSIONS; /** * 是否递归子文件夹 */ private static boolean RECURSION; /** * 是否删除重复 必须计算了md5的文件才可以删除(除非设置 FORCE_DELETEDUP) 防止误删 */ private static boolean DELETEDUP; /** * 谨慎!!! 是否删除强制重复 没有计算md5的文件也可以删除 */ private static boolean FORCE_DELETEDUP; /** * 删除重复且路径里包含 deleteStr */ private static String DELETE_STR; /** * 不删除的文件list的索引 */ private static int NO_DELETE_INDEX; /** * 为 0表示关闭md5去重 小于 allowSize 字节 才计算md5 * 由于计算md5需要加载文件 磁盘限制 效率极低 当重复大文件过多时为了减少md5计算 */ private static long ALLOW_SIZE; /** * 文件大小处理区间最小值 */ private static long SIZE_INTERVAL_MIN; /** * 文件大小处理区间最大值 */ private static long SIZE_INTERVAL_MAX; public static void main(String[] args) { long begin = System.currentTimeMillis(); deal(duplicateRemove(PATH)); System.err.println(System.currentTimeMillis() - begin); } // 打印 删除 private static void deal(Map<String, List<FileMessage>> map) { int num = 1; for (Map.Entry<String, List<FileMessage>> entry : map.entrySet()) { System.err.println(num++ + " ========== " + entry.getKey() + " =============================="); List<FileMessage> value = entry.getValue(); for (int i = 0; i < value.size(); i++) { System.err.println(value.get(i)); if (DELETEDUP && i != NO_DELETE_INDEX && value.get(i).isCanDelete && value.get(i).getPath().contains(DELETE_STR)) { Path path2 = Paths.get(value.get(i).getPath()); try { if (TMP_DELETE_PATH != null && !"".equals(TMP_DELETE_PATH)) { Files.move(path2, Paths.get(TMP_DELETE_PATH + File.separator + value.get(i).getPath().replace(":", "-").replace(File.separator, "_"))); } else { Files.delete(path2); } } catch (IOException e) { e.printStackTrace(); } } } System.err.println("================================"); } } // 去重 private static Map<String, List<FileMessage>> duplicateRemove(String[] path) { Map<String, List<FileMessage>> map; Map<String, List<FileMessage>> mapResult = new TreeMap<>(); List<FileMessage> fileMessageList = new ArrayList<>(); searchFile(path, fileMessageList); // 多个文件查找的话 开启重复文件过滤 if (path.length > 1) { fileMessageList = new ArrayList<FileMessage>(new LinkedHashSet(fileMessageList)); } map = fileMessageList.stream().collect(Collectors.groupingBy(FileMessage::getSize)); // md5 去重 for (Iterator<Map.Entry<String, List<FileMessage>>> iterator = map.entrySet().iterator(); iterator.hasNext(); ) { Map.Entry<String, List<FileMessage>> entry = iterator.next(); List<FileMessage> value = entry.getValue(); if (value.size() > 1) { for (int i = 0; i < value.size(); i++) { String s = value.get(i).getPath(); String md5 = value.get(i).getSize(); if (value.get(i).getSizeLong() < ALLOW_SIZE) { md5 = calculationMD5(s); value.get(i).setCanDelete(true); } value.get(i).setMd5(md5); } if (value != null && value.size() > 0) { Map<String, List<FileMessage>> collect = value.stream().collect(Collectors.groupingBy(FileMessage::getMd5)); collect.forEach((k, v) -> { if (v.size() > 1) { mapResult.put(v.get(0).getPath(), v); } }); } } } return mapResult; } // 查找文件夹 循环 private static void searchFile(String[] filePath, List<FileMessage> fileMessageList) { LinkedList<File> list = new LinkedList<>(); for (int i = 0; i < filePath.length; i++) { if (EXCLUDE_PATH_LIST == null || !EXCLUDE_PATH_LIST.contains(filePath[i])) { File file = new File(filePath[i]); if (file.isDirectory()) { list.add(file); while (!list.isEmpty()) { File fileFirst = list.removeFirst(); for (File f : fileFirst.listFiles()) { if (EXCLUDE_PATH_LIST == null || !EXCLUDE_PATH_LIST.contains(f.getAbsolutePath())) { if (f.isDirectory()) { if (RECURSION) { list.add(f); } } else { extracted(fileMessageList, f); } } } } } else { extracted(fileMessageList, file); } } } } private static void extracted(List<FileMessage> fileMessageList, File f) { if (POINT_EXTENSIONS == null || isAppointFile(f.getName(), POINT_EXTENSIONS)) { long fileSize = f.length(); // 文件大小是否在处理区间 if (SIZE_INTERVAL_MIN <= fileSize && SIZE_INTERVAL_MAX >= fileSize) { FileMessage fileMessage = new FileMessage(f.getAbsolutePath(), fileSize + "", f.getName()); fileMessageList.add(fileMessage); } } } // 受磁盘影响 md5运算缓慢 少用 private static String calculationMD5(String path) { FileInputStream fileInputStream = null; try { fileInputStream = new FileInputStream(path); String md5 = DigestUtils.md5Hex(fileInputStream); return md5; } catch (IOException e) { e.printStackTrace(); } finally { try { fileInputStream.close(); } catch (IOException e) { e.printStackTrace(); } } return null; } // 判断是否是指定文件 public static boolean isAppointFile(String fileName, String imageExtension) { String extension = fileName.substring(fileName.lastIndexOf(".") + 1); return imageExtension.contains(extension.toLowerCase()); } // 单位换算 public static String readableFileSize(String sizeStr) { long size = Long.parseLong(sizeStr); if (size <= 0) { return "0"; } final String[] units = new String[]{"B", "KB", "MB", "GB", "TB"}; int digitGroups = (int) (Math.log10(size) / Math.log10(1024)); return new DecimalFormat("#,##0.#").format(size / Math.pow(1024, digitGroups)) + units[digitGroups]; } static class FileMessage { private String path; private String fileName; private String size; private String md5; private boolean isCanDelete = FORCE_DELETEDUP; public FileMessage() { } public FileMessage(String path, String size, String fileName) { this.path = path; this.size = size; this.fileName = fileName; } public String getPath() { return path; } public void setPath(String path) { this.path = path; } public String getSize() { return size; } public long getSizeLong() { return Long.parseLong(size); } public void setSize(String size) { this.size = size; } public String getMd5() { return md5; } public void setMd5(String md5) { this.md5 = md5; } public String getFileName() { return fileName; } public void setFileName(String fileName) { this.fileName = fileName; } public boolean isCanDelete() { return isCanDelete; } public void setCanDelete(boolean canDelete) { isCanDelete = canDelete; } @Override public String toString() { return "{" + "path='" + path + '\'' + // ", fileName='" + fileName + '\'' + ", size='" + size + '\'' + ", size换算='" + readableFileSize(size) + '\'' + ", md5='" + md5 + '\'' + ", 计算了md5=" + isCanDelete + '}'; } @Override public boolean equals(Object obj) { return path.equals(((FileMessage) obj).getPath()); } @Override public int hashCode() { return path.hashCode(); } } }