py2.7 working version

This commit is contained in:
huihun 2024-03-01 16:42:02 +08:00
parent 8063d079db
commit 65d25d42de


@@ -1,4 +1,5 @@
 # coding=utf-8
+import hashlib
 import pickle as pk
 import re
 import json
@@ -6,125 +7,133 @@ import os
 from tqdm import tqdm
 
 
-def convert(start, end, overhaul):
-    for workflow in range(start, end):
-        # workflow = 0
-        cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
-        output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
-        dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
-        log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
-        process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
-
-        if overhaul:
-            if os.path.exists(log_path):
-                os.remove(log_path)
-            if os.path.exists(process_log_path):
-                os.remove(process_log_path)
-
-        with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
-            logged = log.readline()
-            if logged == '':
-                log_index = 0
-            else:
-                log_index = int(logged)
-
-            for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
-                if index < log_index:
-                    continue
-                name = cfg[:-4]  # bare file name, without extension
-                cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
-                try:
-                    data = pk.load(cfg_file)
-                except EOFError:
-                    process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
-                    continue
-                except ValueError:
-                    process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
-                    continue
-                finally:
-                    cfg_file.close()
-                dot_file_path = os.path.join(dot_dir, name + '.dot')
-                if not os.path.exists(dot_file_path):
-                    process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
-                else:
-                    # open the .dot file to get the FCG
-                    raw_function_edges = []
-                    # 2023.8.12 bug fix: the FCG (.dot) generated by IDA contains all functions, while data.raw_graph_list only contains internal functions
-                    functions_list = []
-                    with open(dot_file_path, 'r') as dot:
-                        for line in dot:
-                            if '->' in line:
-                                raw_function_edges.append(re.findall(r'\b\d+\b', line))
-                            elif 'label' in line:
-                                functions_list.append(line[line.find('= "') + 3:line.find('",')])
-
-                    # no internal functions detected; this should not normally happen, so drop this sample to be safe
-                    if raw_function_edges.__len__() == 0:
-                        continue
-
-                    # create the JSON object for the current PE file
-                    json_obj = {
-                        'hash': data.binary_name[11:],
-                        # 2023.8.12 bug fix: this counts internal functions only
-                        # 'function_number': data.raw_graph_list.__len__(),
-                        'function_number': len(functions_list),
-                        'function_edges': [[int(d[0]) for d in raw_function_edges],
-                                           [int(d[1]) for d in raw_function_edges]],
-                        'acfg_list': [],
-                        'function_names': functions_list
-                    }
-
-                    # 2023.8.12 bug fix: data.raw_graph_list holds the internal functions detected by IDA, excluding external ones, so the function list and count must not be taken from here
-                    # read the pkl file; each acfg is decomposed from one function
-                    for acfg in data.raw_graph_list:
-                        # the function is external; no CFG needs to be built for it
-                        if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
-                            continue
-                        # index 2 because the Genius framework stores the offspring count at position 2 when extracting features
-                        offspring = [d.get('v')[2] for d in acfg.g.node.values()]
-                        # for unknown reasons the two arrays can end up with different lengths, though they should match
-                        # defer to the framework: trim bb_features to the same length as g.node
-                        diff = acfg.g.__len__() - len(acfg.bb_features)
-                        if diff != 0:
-                            del acfg.bb_features[diff:]
-                        # append the offspring count feature to bb_features
-                        for i, offs in enumerate(offspring):
-                            acfg.bb_features[i].append(offs)
-                        acfg_item = {
-                            'block_number': acfg.g.__len__(),
-                            'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
-                            'block_features': acfg.bb_features
-                        }
-                        json_obj['acfg_list'].append(acfg_item)
-                        # json_obj['function_names'].append(acfg.funcname)
-
-                    # write the result to a local JSON file
-                    result = json.dumps(json_obj, ensure_ascii=False)
-                    with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
-                        out.write(result)
-                    log.truncate(0)
-                    log.seek(0)
-                    log.write(str(index))
-                    log.flush()
-                    process_log.write("index {}, {} process done.\n".format(index, cfg))
+def calc_sha256(file_path):
+    with open(file_path, 'rb') as f:
+        bytes = f.read()
+    sha256obj = hashlib.sha256(bytes)
+    sha256 = sha256obj.hexdigest()
+    return sha256
+
+
+def convert_malware(overhaul):
+    cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg"
+    output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl"
+    dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot"
+    raw_dir = "D:\\bishe\\dataset\\train_malware"
+    log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log"
+    process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log"
+
+    if overhaul:
+        if os.path.exists(log_path):
+            os.remove(log_path)
+        if os.path.exists(process_log_path):
+            os.remove(process_log_path)
+
+    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
+        logged = log.readline()
+        if logged == '':
+            log_index = 0
+        else:
+            log_index = int(logged)
+
+        for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
+            if index < log_index:
+                continue
+            name = cfg[:-4]  # bare file name, without extension
+            cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
+            try:
+                data = pk.load(cfg_file)
+            except EOFError:
+                process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
+                continue
+            except ValueError:
+                process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
+                continue
+            finally:
+                cfg_file.close()
+            dot_file_path = os.path.join(dot_dir, name + '.dot')
+            if not os.path.exists(dot_file_path):
+                process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
+            else:
+                # open the .dot file to get the FCG
+                raw_function_edges = []
+                # 2023.8.12 bug fix: the FCG (.dot) generated by IDA contains all functions, while data.raw_graph_list only contains internal functions
+                functions_list = []
+                with open(dot_file_path, 'r') as dot:
+                    for line in dot:
+                        if '->' in line:
+                            raw_function_edges.append(re.findall(r'\b\d+\b', line))
+                        elif 'label' in line:
+                            functions_list.append(line[line.find('= "') + 3:line.find('",')])
+
+                # no internal functions detected; this should not normally happen, so drop this sample to be safe
+                if raw_function_edges.__len__() == 0:
+                    continue
+
+                # create the JSON object for the current PE file
+                json_obj = {
+                    'hash': calc_sha256(raw_dir + "\\" + name),
+                    # 2023.8.12 bug fix: this counts internal functions only
+                    # 'function_number': data.raw_graph_list.__len__(),
+                    'function_number': len(functions_list),
+                    'function_edges': [[int(d[0]) for d in raw_function_edges],
+                                       [int(d[1]) for d in raw_function_edges]],
+                    'acfg_list': [],
+                    'function_names': functions_list
+                }
+
+                # 2023.8.12 bug fix: data.raw_graph_list holds the internal functions detected by IDA, excluding external ones, so the function list and count must not be taken from here
+                # read the pkl file; each acfg is decomposed from one function
+                for acfg in data.raw_graph_list:
+                    # the function is external; no CFG needs to be built for it
+                    if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
+                        continue
+                    # index 2 because the Genius framework stores the offspring count at position 2 when extracting features
+                    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
+                    # for unknown reasons the two arrays can end up with different lengths, though they should match
+                    # defer to the framework: trim bb_features to the same length as g.node
+                    diff = acfg.g.__len__() - len(acfg.bb_features)
+                    if diff != 0:
+                        del acfg.bb_features[diff:]
+                    # append the offspring count feature to bb_features
+                    for i, offs in enumerate(offspring):
+                        acfg.bb_features[i].append(offs)
+                    acfg_item = {
+                        'block_number': acfg.g.__len__(),
+                        'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
+                        'block_features': acfg.bb_features
+                    }
+                    json_obj['acfg_list'].append(acfg_item)
+                    # json_obj['function_names'].append(acfg.funcname)
+
+                # write the result to a local JSON file
+                result = json.dumps(json_obj, ensure_ascii=False)
+                with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
+                    out.write(result)
+                log.truncate(0)
+                log.seek(0)
+                log.write(str(index))
+                log.flush()
+                process_log.write("index {}, {} process done.\n".format(index, cfg))
 
 
 def convert_benign(overhaul):
-    cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
-    dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot"
-    output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl"
-    log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
-    process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"
+    cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg"
+    dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot"
+    output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl"
+    raw_dir = "D:\\bishe\\dataset\\train_benign"
+    log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log"
+    process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log"
 
     if overhaul:
         if os.path.exists(log_path):
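
The FCG edges above are recovered from IDA's .dot output by plain text matching: any line containing '->' is treated as an edge and re.findall(r'\b\d+\b', line) pulls out the node ids, while 'label' lines yield the function names. A minimal sketch of that parsing on a hypothetical .dot fragment (real IDA output may format nodes differently):

    # Sketch of the .dot parsing used above; the sample text is invented.
    import re

    sample = '''digraph G {
     "0" [ label = "start", pos = "..." ];
     "1" [ label = "sub_401000", pos = "..." ];
     "0" -> "1";
    }'''

    raw_function_edges = []
    functions_list = []
    for line in sample.splitlines():
        if '->' in line:
            # every standalone integer on the edge line, e.g. ['0', '1']
            raw_function_edges.append(re.findall(r'\b\d+\b', line))
        elif 'label' in line:
            # slice out the text between '= "' and '",'
            functions_list.append(line[line.find('= "') + 3:line.find('",')])

    print(raw_function_edges)  # [['0', '1']]
    print(functions_list)      # ['start', 'sub_401000']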
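Each basic block's feature vector is then extended with its offspring count, which the Genius-style extractor keeps at position 2 of the per-node 'v' attribute; when g.node and bb_features disagree in length, bb_features is trimmed to match. Note that del bb_features[diff:] only trims correctly when bb_features is the longer list (negative diff). A self-contained sketch with invented numbers, using plain dicts in place of acfg.g.node:

    # Sketch of the offspring-append + trim logic; node_attrs stands in for
    # acfg.g.node and the feature vectors are invented for illustration.
    node_attrs = {
        0: {'v': [3, 1, 2]},   # position 2 holds the offspring count
        1: {'v': [5, 0, 0]},
    }
    bb_features = [[0.1, 0.2], [0.3, 0.4], [9.9, 9.9]]  # one vector too many

    offspring = [d.get('v')[2] for d in node_attrs.values()]

    diff = len(node_attrs) - len(bb_features)   # -1 here
    if diff != 0:
        del bb_features[diff:]                  # drop the surplus tail vectors

    for i, offs in enumerate(offspring):
        bb_features[i].append(offs)

    print(bb_features)  # [[0.1, 0.2, 2], [0.3, 0.4, 0]]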
@@ -145,6 +154,7 @@ def convert_benign(overhaul):
                 continue
             name = cfg[:-4]  # bare file name
             cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
+
             try:
                 data = pk.load(cfg_file)
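
The cfg pickles are opened in text mode ('r') before pk.load, which works on Python 2.7, where str is bytes, but fails on Python 3, where pickle.load requires a binary-mode file; that is presumably why this commit is labelled the "py2.7 working version". A hedged sketch (the file name is a hypothetical stand-in):

    # Loading a pickle from a text-mode handle works on Python 2.7 only;
    # on Python 3 pickle.load needs a binary-mode file object.
    # 'sample.ida' is a hypothetical stand-in for the cfg files above.
    import pickle as pk

    with open('sample.ida', 'rb') as f:   # 'rb' is safe on both 2.7 and 3
        data = pk.load(f)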
@@ -180,7 +190,7 @@ def convert_benign(overhaul):
                 # create the JSON object for the current PE file
                 json_obj = {
-                    'hash': data.binary_name[11:],
+                    'hash': calc_sha256(raw_dir + "\\" + name),
                     # 2023.8.12 bug fix: this counts internal functions only
                     # 'function_number': data.raw_graph_list.__len__(),
                     'function_number': len(functions_list),
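
With this hunk the 'hash' field becomes the SHA-256 of the raw sample on disk, via the calc_sha256 helper added above, instead of a slice of data.binary_name, so the identifier no longer depends on how IDA named the binary. A self-contained check of the same hashing approach (the path is hypothetical, and like the script it reads the whole file into memory):

    # Self-contained check of the hashing used for the 'hash' field;
    # the file path is an invented example.
    import hashlib

    def calc_sha256(file_path):
        with open(file_path, 'rb') as f:
            data = f.read()          # whole file in memory, as in the script
        return hashlib.sha256(data).hexdigest()

    digest = calc_sha256("D:\\bishe\\dataset\\train_benign\\sample.exe")
    print(digest)                    # 64 lowercase hex characters
    assert len(digest) == 64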
@@ -233,4 +243,6 @@ def convert_benign(overhaul):
 
 if __name__ == '__main__':
     # convert(35, 69)
-    convert_benign(False)
+    # convert_benign(True)
+    convert_benign(True)
+    convert_malware(True)
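
The log file acts as a one-slot checkpoint: on startup the script reads the last finished index and skips everything below it, and after each sample it rewinds and overwrites the slot with truncate(0)/seek(0)/write. One caveat: the script calls log.readline() straight after opening in 'a+' mode, and on platforms where 'a+' positions at end-of-file that read returns ''. A standalone sketch of the pattern with an explicit seek (file name hypothetical):

    # Standalone sketch of the resume/checkpoint pattern used in
    # convert_malware and convert_benign; 'progress.log' is hypothetical.
    with open('progress.log', 'a+') as log:
        log.seek(0)                    # 'a+' may start at EOF, so seek first
        logged = log.readline()
        log_index = int(logged) if logged != '' else 0

        for index, item in enumerate(['a', 'b', 'c']):
            if index < log_index:
                continue               # finished in a previous run
            # ... process item ...
            log.truncate(0)            # wipe the old checkpoint
            log.seek(0)
            log.write(str(index))      # record the last finished index
            log.flush()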