import os
import hashlib
import shutil

from tqdm import tqdm


class de_repeatfile(object):

    def __init__(self, sampleFile_dir, save_dir):
        """
        Remove duplicate sample files.
        :param sampleFile_dir: directory containing the sample files
        :param save_dir: directory the removed duplicates are copied to before deletion
        """
        self.sampleFile_dir = sampleFile_dir
        self.save_dir = save_dir

    def find_duplicate_files(self):
        """
        Find all duplicate files and return them as lists of file paths,
        one list per group of files with identical content.
        """
        # Map from content hash to the list of file paths with that hash
        hash_dict = {}

        for root, dirs, files in os.walk(self.sampleFile_dir):
            # root: the directory currently being walked (top level or a subdirectory)
            # dirs: subdirectories of root
            # files: files directly inside root
            for file in tqdm(files):
                # Join with root (not the top-level directory) so files in
                # subdirectories are resolved correctly
                file_path = os.path.join(root, file)
                if os.path.isdir(file_path):
                    continue
                # Compute the MD5 hash of the file contents
                with open(file_path, 'rb') as f:
                    file_hash = hashlib.md5(f.read()).hexdigest()
                # If the hash has been seen before, append this path to its group;
                # otherwise start a new group
                if file_hash in hash_dict:
                    hash_dict[file_hash].append(file_path)
                else:
                    hash_dict[file_hash] = [file_path]

        return [file_paths for file_paths in hash_dict.values() if len(file_paths) > 1]
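
    # Note (not from the original code): f.read() above loads each file fully into
    # memory before hashing. For very large sample files, a chunked variant of the
    # same MD5 computation would look roughly like this:
    #
    #     md5 = hashlib.md5()
    #     with open(file_path, 'rb') as f:
    #         for chunk in iter(lambda: f.read(8192), b''):
    #             md5.update(chunk)
    #     file_hash = md5.hexdigest()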

    def remove_duplicate_files(self):
        """
        Remove the duplicate files from the sample directory.
        """
        duplicate_files = self.find_duplicate_files()
        # Delete all duplicate files
        for files in duplicate_files:
            # Keep the first file in each group and delete the rest
            for file in files[1:]:
                # file is already a full path; back it up to save_dir, then delete it
                shutil.copy(file, self.save_dir)
                os.remove(file)
                print("Removed duplicate file:", file)

        print("Finished removing duplicate files!")
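

# Usage sketch (the paths below are placeholders, not from the original code):
# back up and remove duplicates found under "samples/", keeping one copy of each
# file and copying every removed duplicate into "duplicates_backup/".
if __name__ == "__main__":
    deduper = de_repeatfile("samples/", "duplicates_backup/")
    deduper.remove_duplicate_files()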