detect_rep/tool/md5值生成以及相同样本删除.py

31 lines
1.1 KiB
Python
Raw Permalink Normal View History

2023-04-05 10:04:49 +08:00
import os
import hashlib
import shutil
def md5sum(file_path):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is opened in binary mode and read in 1 MiB chunks, so the
    whole file is never held in memory at once.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The MD5 digest as a hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        # Two-argument iter() keeps calling f.read() until it returns the
        # sentinel b"", i.e. until end of file.
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()
def copy_executables(src_folder, dst_folder):
    """Copy every unique executable file from *src_folder* to *dst_folder*.

    Only the top level of *src_folder* is scanned (no recursion). A file is
    considered when it is a regular file and ``os.access(path, os.X_OK)``
    reports it as executable. Files are de-duplicated by the MD5 digest of
    their content: the first file seen with a given digest is copied, later
    ones are skipped. Each copy is named ``<md5><original extension>``.

    Progress is reported on standard output with ``print``.

    Args:
        src_folder: Directory to scan for executable files.
        dst_folder: Directory that receives the de-duplicated copies. It is
            created if it does not exist yet.

    Returns:
        None.

    Raises:
        OSError: If *src_folder* cannot be listed, or a file cannot be read
            or copied.
    """
    # shutil.copy does not create missing parent directories.
    os.makedirs(dst_folder, exist_ok=True)

    seen_digests = set()
    # os.listdir returns names in arbitrary order; sorting makes the choice
    # of which duplicate is kept (and thus the copied extension) repeatable.
    for file_name in sorted(os.listdir(src_folder)):
        file_path = os.path.join(src_folder, file_name)
        # NOTE(review): on Windows the X_OK check may not meaningfully filter
        # files -- confirm against the os.access documentation.
        if not (os.path.isfile(file_path) and os.access(file_path, os.X_OK)):
            continue

        md5 = md5sum(file_path)
        if md5 in seen_digests:
            print(f"Skipping duplicate file: {file_path}")
            continue
        seen_digests.add(md5)

        # Rename to the digest but keep the original extension.
        new_file_name = md5 + os.path.splitext(file_name)[1]
        new_file_path = os.path.join(dst_folder, new_file_name)
        shutil.copy(file_path, new_file_path)
        print(f"Copied file: {file_path} -> {new_file_path}")
if __name__ == '__main__':
    import sys

    # Source folder and destination (copy) folder.
    # The original hard-coded locations remain the defaults; either one can
    # be overridden from the command line:
    #     python <script> [src_folder [dst_folder]]
    default_src_folder = r"D:\detect_exp_d\data\benign_pre"
    default_dst_folder = r"D:\detect_exp_d\data\benign_last"
    src_folder = sys.argv[1] if len(sys.argv) > 1 else default_src_folder
    dst_folder = sys.argv[2] if len(sys.argv) > 2 else default_dst_folder
    copy_executables(src_folder, dst_folder)