py2.7可行版本

This commit is contained in:
huihun 2024-03-01 16:42:02 +08:00
parent 8063d079db
commit 65d25d42de

View File

@ -1,4 +1,5 @@
# coding=utf-8
import hashlib
import pickle as pk
import re
import json
@ -6,15 +7,22 @@ import os
from tqdm import tqdm
def convert(start, end, overhaul):
for workflow in range(start, end):
# workflow = 0
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
def calc_sha256(file_path, chunk_size=1 << 20):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is opened in binary mode and hashed in fixed-size chunks so
    that large samples are never loaded into memory in one piece.

    :param file_path: path of the file to hash.
    :param chunk_size: number of bytes read per iteration (default 1 MiB).
    :return: lowercase hexadecimal SHA-256 digest string.
    :raises IOError: if the file cannot be opened or read
        (``OSError`` on Python 3).
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # Two-argument iter() keeps calling f.read() until it returns the
        # empty-bytes sentinel (EOF); this form works on Python 2.7 and 3.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
def convert_malware(overhaul):
cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg"
output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl"
dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot"
raw_dir = "D:\\bishe\\dataset\\train_malware"
log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log"
process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log"
if overhaul:
if os.path.exists(log_path):
@ -67,7 +75,7 @@ def convert(start, end, overhaul):
# 为当前pe文件创建json对象
json_obj = {
'hash': data.binary_name[11:],
'hash': calc_sha256(raw_dir + "\\" + name),
# 2023.8.12 bug fix: 这里获取的是内部函数的数量
# 'function_number': data.raw_graph_list.__len__(),
'function_number': len(functions_list),
@ -119,12 +127,13 @@ def convert(start, end, overhaul):
def convert_benign(overhaul):
cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot"
output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl"
cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg"
dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot"
output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl"
raw_dir = "D:\\bishe\\dataset\\train_benign"
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"
log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log"
process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log"
if overhaul:
if os.path.exists(log_path):
@ -145,6 +154,7 @@ def convert_benign(overhaul):
continue
name = cfg[:-4] # 纯文件名
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
try:
data = pk.load(cfg_file)
@ -180,7 +190,7 @@ def convert_benign(overhaul):
# 为当前pe文件创建json对象
json_obj = {
'hash': data.binary_name[11:],
'hash': calc_sha256(raw_dir + "\\" + name),
# 2023.8.12 bug fix: 这里获取的是内部函数的数量
# 'function_number': data.raw_graph_list.__len__(),
'function_number': len(functions_list),
@ -233,4 +243,6 @@ def convert_benign(overhaul):
# Script entry point: the calls below run only when this file is executed directly.
# NOTE(review): this span is a rendered diff hunk without +/- markers, so old and
# new lines are interleaved; `convert_benign(False)` and `# convert_benign(True)`
# look like the pre-change lines -- confirm against the actual file.
if __name__ == '__main__':
# convert(35, 69)
convert_benign(False)
# convert_benign(True)
convert_benign(True)
convert_malware(True)