From bd51d89a0b8a7f531512075604562259bc52b00b Mon Sep 17 00:00:00 2001 From: huihun <781165206@qq.com> Date: Mon, 4 Dec 2023 14:15:10 +0800 Subject: [PATCH] =?UTF-8?q?=E8=87=AA=E5=8A=A8=E7=94=9F=E6=88=90jsonl?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ida_print.py | 19 ---- raw-feature-extractor/print_test.py | 154 +++++++++++++++------------- 2 files changed, 80 insertions(+), 93 deletions(-) delete mode 100644 ida_print.py diff --git a/ida_print.py b/ida_print.py deleted file mode 100644 index 9b104e7..0000000 --- a/ida_print.py +++ /dev/null @@ -1,19 +0,0 @@ -import os -import subprocess - - -directory = './' - - -if __name__ == '__main__': - cmd = 'D:\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path ./store/" ' - - for filename in os.listdir(directory): - if filename[-3:] == 'exe': - process = subprocess.Popen(["powershell", cmd+filename], stdout=subprocess.PIPE) - - output = process.communicate()[0] - - - - diff --git a/raw-feature-extractor/print_test.py b/raw-feature-extractor/print_test.py index 885aa03..e500456 100644 --- a/raw-feature-extractor/print_test.py +++ b/raw-feature-extractor/print_test.py @@ -1,4 +1,5 @@ # -*- coding: UTF-8 -*- +import os import sys from matplotlib import pyplot as plt import networkx as nx @@ -7,11 +8,6 @@ import hashlib import json -def print_obj(obj): - "打印对象的所有属性" - print(obj.__dict__) - - def calc_sha256(file_path): with open(file_path, 'rb') as f: bytes = f.read() @@ -25,75 +21,85 @@ import pickle # sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant,可能是间接引用的,不识别。看了下所有函数的特征,几乎都没有字符串常量,可能都是写在别的地方然后引用的。 # sub_166C4 393 if __name__ == '__main__': - file_path = '../3c580f5beca53b6599e5f04d3aa68a34bd50521d7ec5d7163849eb69f53a4150.exe' - testpath = '../store/3c580f5beca53b6599e5f04d3aa68a34bd50521d7ec5d7163849eb69f53a4150.exe.ida' - fr = open(testpath, 'r') - data1 = pickle.load(fr) # 一个二进制文件的acfgs - # function_edges - function_edge_start = [] - function_edge_end = [] - for item in data1.raw_graph_list[0].old_g.edges: - function_edge_start.append(item[0]) - function_edge_end.append(item[1]) - function_edges = [function_edge_start, function_edge_end] - fun_name_temp = [] - # function hsah - file_hash = calc_sha256(file_path) - # funtion num - function_number = len(data1.raw_graph_list) - acfg_list = [] - # 函数级特征 - for i in range(len(data1.raw_graph_list)): - - # function name - fun_name_temp.append(data1.raw_graph_list[i].funcname) - # block features - temp_G = data1.raw_graph_list[i].old_g - # block_number - block_number = len(temp_G.node) - # block_features - acfg_list_item_feature = [] - for temp in range(len(temp_G.node)): - block_features = [] - # call - block_features.append(temp_G.node[temp]['numCalls']) - # transfer - block_features.append(temp_G.node[temp]['numTIs']) - # arithmetic - block_features.append(temp_G.node[temp]['numAs']) - # logic - block_features.append(temp_G.node[temp]['numLIs']) - # compare - block_features.append(temp_G.node[temp]['numCom']) - # move - block_features.append(temp_G.node[temp]['numMov']) - # termination - block_features.append(temp_G.node[temp]['numTerm']) - # date declaration - block_features.append(temp_G.node[temp]['numDD']) - # total instructions - block_features.append(temp_G.node[temp]['numIns']) - # string or integer constants - block_features.append(len(temp_G.node[temp]['strings']) if len(temp_G.node[temp]['strings']) != 0 else len( - temp_G.node[temp]['consts'])) - # offspring - block_features.append(temp_G.node[temp]['offs']) - acfg_list_item_feature.append(block_features) - edge_list_start = [] - edge_list_end = [] - for item in temp_G.edges: - edge_list_start.append(item[0]) - edge_list_end.append(item[1]) - block_edges = [edge_list_start, edge_list_end] - acfg_list_item = {"block_number": block_number, "block_edges": block_edges, "block_features": acfg_list_item_feature} - acfg_list.append(acfg_list_item) - - json_temp = {"function_edges": function_edges, "acfg_list": acfg_list, "function_names": fun_name_temp, "hash": file_hash, "function_number": function_number} - json_str = json.dumps(json_temp) - print json_str - - - + file_name_list = os.listdir('../A2C/') + res_file = "../sample.jsonl" + sample_file = open(res_file, mode='a') + for file_name in file_name_list: + print file_name + file_path = '../A2C/' + file_name + testpath = '../store/' + file_name + '.ida' + if os.path.exists(testpath) and os.path.splitext(file_path)[-1].lower() == '.exe': + fr = open(testpath, 'r') + data1 = pickle.load(fr) # 一个二进制文件的acfgs + # funtion num + function_number = len(data1.raw_graph_list) + if function_number == 0: + continue + # function_edges + function_edge_start = [] + function_edge_end = [] + for item in data1.raw_graph_list[0].old_g.edges: + function_edge_start.append(item[0]) + function_edge_end.append(item[1]) + function_edges = [function_edge_start, function_edge_end] + fun_name_temp = [] + # function hsah + file_hash = calc_sha256(file_path) + acfg_list = [] + # 函数级特征 + for i in range(len(data1.raw_graph_list)): + # function name + fun_name_temp.append(data1.raw_graph_list[i].funcname) + # block features + temp_G = data1.raw_graph_list[i].old_g + # block_number + block_number = len(temp_G.node) + # block_features + acfg_list_item_feature = [] + for temp in range(len(temp_G.node)): + block_features = [] + # call + block_features.append(temp_G.node[temp]['numCalls']) + # transfer + block_features.append(temp_G.node[temp]['numTIs']) + # arithmetic + block_features.append(temp_G.node[temp]['numAs']) + # logic + block_features.append(temp_G.node[temp]['numLIs']) + # compare + block_features.append(temp_G.node[temp]['numCom']) + # move + block_features.append(temp_G.node[temp]['numMov']) + # termination + block_features.append(temp_G.node[temp]['numTerm']) + # date declaration + block_features.append(temp_G.node[temp]['numDD']) + # total instructions + block_features.append(temp_G.node[temp]['numIns']) + # string or integer constants + block_features.append( + len(temp_G.node[temp]['strings']) if len(temp_G.node[temp]['strings']) != 0 else len( + temp_G.node[temp]['consts'])) + # offspring + block_features.append(temp_G.node[temp]['offs']) + acfg_list_item_feature.append(block_features) + edge_list_start = [] + edge_list_end = [] + for item in temp_G.edges: + edge_list_start.append(item[0]) + edge_list_end.append(item[1]) + block_edges = [edge_list_start, edge_list_end] + acfg_list_item = {"block_number": block_number, "block_edges": block_edges, + "block_features": acfg_list_item_feature} + acfg_list.append(acfg_list_item) + json_temp = {"function_edges": function_edges, "acfg_list": acfg_list, "function_names": fun_name_temp, + "hash": file_hash, "function_number": function_number} + json_str = json.dumps(json_temp) + sample_file.write(json_str) + else: + print "删除文件" + file_path + os.remove(file_path) + sample_file.close()