py2.7可行版本

This commit is contained in:
huihun 2024-03-01 16:42:02 +08:00
parent 8063d079db
commit 65d25d42de

View File

@@ -1,4 +1,5 @@
# coding=utf-8 # coding=utf-8
import hashlib
import pickle as pk import pickle as pk
import re import re
import json import json
@@ -6,15 +7,22 @@ import os
from tqdm import tqdm from tqdm import tqdm
def convert(start, end, overhaul): def calc_sha256(file_path):
for workflow in range(start, end): with open(file_path, 'rb') as f:
# workflow = 0 bytes = f.read()
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow) sha256obj = hashlib.sha256(bytes)
output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow) sha256 = sha256obj.hexdigest()
dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow) return sha256
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow) def convert_malware(overhaul):
cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg"
output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl"
dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot"
raw_dir = "D:\\bishe\\dataset\\train_malware"
log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log"
process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log"
if overhaul: if overhaul:
if os.path.exists(log_path): if os.path.exists(log_path):
@@ -67,7 +75,7 @@ def convert(start, end, overhaul):
# 为当前pe文件创建json对象 # 为当前pe文件创建json对象
json_obj = { json_obj = {
'hash': data.binary_name[11:], 'hash': calc_sha256(raw_dir + "\\" + name),
# 2023.8.12 bug fix: 这里获取的是内部函数的数量 # 2023.8.12 bug fix: 这里获取的是内部函数的数量
# 'function_number': data.raw_graph_list.__len__(), # 'function_number': data.raw_graph_list.__len__(),
'function_number': len(functions_list), 'function_number': len(functions_list),
@@ -119,12 +127,13 @@ def convert(start, end, overhaul):
def convert_benign(overhaul): def convert_benign(overhaul):
cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg" cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg"
dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot" dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot"
output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl" output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl"
raw_dir = "D:\\bishe\\dataset\\train_benign"
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log" log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log"
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log" process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log"
if overhaul: if overhaul:
if os.path.exists(log_path): if os.path.exists(log_path):
@@ -145,6 +154,7 @@ def convert_benign(overhaul):
continue continue
name = cfg[:-4] # 纯文件名 name = cfg[:-4] # 纯文件名
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r') cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
try: try:
data = pk.load(cfg_file) data = pk.load(cfg_file)
@@ -180,7 +190,7 @@ def convert_benign(overhaul):
# 为当前pe文件创建json对象 # 为当前pe文件创建json对象
json_obj = { json_obj = {
'hash': data.binary_name[11:], 'hash': calc_sha256(raw_dir + "\\" + name),
# 2023.8.12 bug fix: 这里获取的是内部函数的数量 # 2023.8.12 bug fix: 这里获取的是内部函数的数量
# 'function_number': data.raw_graph_list.__len__(), # 'function_number': data.raw_graph_list.__len__(),
'function_number': len(functions_list), 'function_number': len(functions_list),
@@ -233,4 +243,6 @@ def convert_benign(overhaul):
if __name__ == '__main__': if __name__ == '__main__':
# convert(35, 69) # convert(35, 69)
convert_benign(False) # convert_benign(True)
convert_benign(True)
convert_malware(True)