自动生成jsonl文件

This commit is contained in:
huihun 2023-12-04 14:15:10 +08:00
parent 73c9da0599
commit bd51d89a0b
2 changed files with 80 additions and 93 deletions

View File

@ -1,19 +0,0 @@
import os
import subprocess
directory = './'


def is_exe(filename):
    """Return True when *filename* has a ``.exe`` extension, in any letter case.

    The extension is taken with ``os.path.splitext`` so that a name which
    merely ends in the letters "exe" (for example ``fooexe``) is rejected.
    """
    return os.path.splitext(filename)[-1].lower() == '.exe'


if __name__ == '__main__':
    # Raw string: the Windows path contains backslashes that must be passed
    # on literally rather than being parsed as string escape sequences.
    cmd = r'D:\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path ./store/" '
    for filename in os.listdir(directory):
        if is_exe(filename):
            # Run the IDA command line through PowerShell with the file name
            # appended as the last argument.
            process = subprocess.Popen(["powershell", cmd + filename], stdout=subprocess.PIPE)
            # communicate() drains the pipe and blocks until the process
            # exits, so the executables are handled one at a time.
            process.communicate()

View File

@ -1,4 +1,5 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
import os
import sys import sys
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
import networkx as nx import networkx as nx
@ -7,11 +8,6 @@ import hashlib
import json import json
def print_obj(obj):
    """Print the attribute dictionary (``__dict__``) of *obj*."""
    attributes = obj.__dict__
    print(attributes)
def calc_sha256(file_path): def calc_sha256(file_path):
with open(file_path, 'rb') as f: with open(file_path, 'rb') as f:
bytes = f.read() bytes = f.read()
@ -25,10 +21,20 @@ import pickle
# sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant可能是间接引用的不识别。看了下所有函数的特征几乎都没有字符串常量可能都是写在别的地方然后引用的。 # sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant可能是间接引用的不识别。看了下所有函数的特征几乎都没有字符串常量可能都是写在别的地方然后引用的。
# sub_166C4 393 # sub_166C4 393
if __name__ == '__main__': if __name__ == '__main__':
file_path = '../3c580f5beca53b6599e5f04d3aa68a34bd50521d7ec5d7163849eb69f53a4150.exe' file_name_list = os.listdir('../A2C/')
testpath = '../store/3c580f5beca53b6599e5f04d3aa68a34bd50521d7ec5d7163849eb69f53a4150.exe.ida' res_file = "../sample.jsonl"
sample_file = open(res_file, mode='a')
for file_name in file_name_list:
print file_name
file_path = '../A2C/' + file_name
testpath = '../store/' + file_name + '.ida'
if os.path.exists(testpath) and os.path.splitext(file_path)[-1].lower() == '.exe':
fr = open(testpath, 'r') fr = open(testpath, 'r')
data1 = pickle.load(fr) # 一个二进制文件的acfgs data1 = pickle.load(fr) # 一个二进制文件的acfgs
# function num
function_number = len(data1.raw_graph_list)
if function_number == 0:
continue
# function_edges # function_edges
function_edge_start = [] function_edge_start = []
function_edge_end = [] function_edge_end = []
@ -39,8 +45,7 @@ if __name__ == '__main__':
fun_name_temp = [] fun_name_temp = []
# file hash # file hash
file_hash = calc_sha256(file_path) file_hash = calc_sha256(file_path)
# function num
function_number = len(data1.raw_graph_list)
acfg_list = [] acfg_list = []
# 函数级特征 # 函数级特征
for i in range(len(data1.raw_graph_list)): for i in range(len(data1.raw_graph_list)):
@ -74,7 +79,8 @@ if __name__ == '__main__':
# total instructions # total instructions
block_features.append(temp_G.node[temp]['numIns']) block_features.append(temp_G.node[temp]['numIns'])
# string or integer constants # string or integer constants
block_features.append(len(temp_G.node[temp]['strings']) if len(temp_G.node[temp]['strings']) != 0 else len( block_features.append(
len(temp_G.node[temp]['strings']) if len(temp_G.node[temp]['strings']) != 0 else len(
temp_G.node[temp]['consts'])) temp_G.node[temp]['consts']))
# offspring # offspring
block_features.append(temp_G.node[temp]['offs']) block_features.append(temp_G.node[temp]['offs'])
@ -85,15 +91,15 @@ if __name__ == '__main__':
edge_list_start.append(item[0]) edge_list_start.append(item[0])
edge_list_end.append(item[1]) edge_list_end.append(item[1])
block_edges = [edge_list_start, edge_list_end] block_edges = [edge_list_start, edge_list_end]
acfg_list_item = {"block_number": block_number, "block_edges": block_edges, "block_features": acfg_list_item_feature} acfg_list_item = {"block_number": block_number, "block_edges": block_edges,
"block_features": acfg_list_item_feature}
acfg_list.append(acfg_list_item) acfg_list.append(acfg_list_item)
json_temp = {"function_edges": function_edges, "acfg_list": acfg_list, "function_names": fun_name_temp, "hash": file_hash, "function_number": function_number} json_temp = {"function_edges": function_edges, "acfg_list": acfg_list, "function_names": fun_name_temp,
"hash": file_hash, "function_number": function_number}
json_str = json.dumps(json_temp) json_str = json.dumps(json_temp)
print json_str sample_file.write(json_str)
else:
print "删除文件" + file_path
os.remove(file_path)
sample_file.close()