diff --git a/fun_count.py b/fun_count.py new file mode 100644 index 0000000..9100b3e --- /dev/null +++ b/fun_count.py @@ -0,0 +1,7 @@ +from tqdm import tqdm + + +if __name__ == '__main__': + file_name = './fun_count.jsonl' + fil = open(file_name, mode='r') + for item in tqdm(fil): diff --git a/ida_file_cerate.bat b/ida_file_cerate.bat index 1171460..18a4cc0 100644 --- a/ida_file_cerate.bat +++ b/ida_file_cerate.bat @@ -1,18 +1,20 @@ @echo off -setlocal enabledelayedexpansion +setlocal EnableDelayedExpansion set "IDA_PATH=D:\IDA_Pro_v6.8\idaq.exe" -set "FOLDER_PATH=D:\bishe\Gencoding\A2C" +set "FOLDER_PATH=D:\bishe\Gencoding\train_malware" set "SCRIPT_PATH=../raw-feature-extractor/preprocessing_ida.py" -set "SAVE_PATH=../store/" -set "LOG_PATH=../log/" +set "SAVE_PATH=../train_malware_result/" - - -for %%f in ("%FOLDER_PATH%\*.exe") do ( - echo !time! %%f - %IDA_PATH% -c -B -S"%SCRIPT_PATH% --path %SAVE_PATH%" %%f +for %%f in ("%FOLDER_PATH%\*.*") do ( + echo %%f + if /i "%%~xf"==".idb" ( + echo Found IDB file: %%f + ) else ( + echo !time! %%f + %IDA_PATH% -c -A -S"%SCRIPT_PATH% --path %SAVE_PATH%" %%f ) +) endlocal diff --git a/raw-feature-extractor/external_test.py b/raw-feature-extractor/external_test.py new file mode 100644 index 0000000..cd12037 --- /dev/null +++ b/raw-feature-extractor/external_test.py @@ -0,0 +1,96 @@ +import os +import sys +from matplotlib import pyplot as plt +import networkx as nx + +import hashlib +import json +import pickle + + +if __name__ == '__main__': + done_index = 0 + file_name_list = os.listdir('../A2C/') + res_file = "../sample.jsonl" + for file_name in file_name_list: + file_path = '../A2C/' + file_name + testpath = '../store/' + file_name + '.ida' + if os.path.exists(testpath) and os.path.splitext(file_path)[-1].lower() == '.exe': + fr = open(testpath, 'r') + data1 = pickle.load(fr) + for graph in data1.raw_graph_list: + for i in range(len(graph.old_g.node)): + if len(graph.old_g.node[i]['externs']) != 0: + print graph.old_g.node[i]['externs'] + # for i in range(len(data1.raw_graph_list)): + + + + + + + + # fr = open(testpath, 'r') + # data1 = pickle.load(fr) + # # funtion num + # function_number = len(data1.raw_graph_list) + # if function_number == 0: + # continue + # # function_edges + # function_edge_start = [] + # function_edge_end = [] + # for item in data1.raw_graph_list[0].old_g.edges: + # function_edge_start.append(item[0]) + # function_edge_end.append(item[1]) + # function_edges = [function_edge_start, function_edge_end] + # fun_name_temp = [] + # # function hsah + # acfg_list = [] + + # for i in range(len(data1.raw_graph_list)): + # + # # function name + # fun_name_temp.append(data1.raw_graph_list[i].funcname) + # # block features + # temp_G = data1.raw_graph_list[i].old_g + # # block_number + # block_number = len(temp_G.node) + # # block_features + # acfg_list_item_feature = [] + # for temp in range(len(temp_G.node)): + # block_features = [] + # # call + # block_features.append(temp_G.node[temp]['numCalls']) + # # transfer + # block_features.append(temp_G.node[temp]['numTIs']) + # # arithmetic + # block_features.append(temp_G.node[temp]['numAs']) + # # logic + # block_features.append(temp_G.node[temp]['numLIs']) + # # compare + # block_features.append(temp_G.node[temp]['numCom']) + # # move + # block_features.append(temp_G.node[temp]['numMov']) + # # termination + # block_features.append(temp_G.node[temp]['numTerm']) + # # date declaration + # block_features.append(temp_G.node[temp]['numDD']) + # # total instructions + # block_features.append(temp_G.node[temp]['numIns']) + # # string or integer constants + # block_features.append( + # len(temp_G.node[temp]['strings']) if len(temp_G.node[temp]['strings']) != 0 else len( + # temp_G.node[temp]['consts'])) + # # offspring + # block_features.append(temp_G.node[temp]['offs']) + # acfg_list_item_feature.append(block_features) + # edge_list_start = [] + # edge_list_end = [] + # for item in temp_G.edges: + # edge_list_start.append(item[0]) + # edge_list_end.append(item[1]) + # block_edges = [edge_list_start, edge_list_end] + # acfg_list_item = {"block_number": block_number, "block_edges": block_edges, + # "block_features": acfg_list_item_feature} + # acfg_list.append(acfg_list_item) + diff --git a/raw-feature-extractor/print_test.py b/raw-feature-extractor/print_test.py index e500456..2cca783 100644 --- a/raw-feature-extractor/print_test.py +++ b/raw-feature-extractor/print_test.py @@ -20,7 +20,11 @@ import pickle # sub_10F20 308 反编译代码有字符串,但是这个特征提取里没有字符串 constant,可能是间接引用的,不识别。看了下所有函数的特征,几乎都没有字符串常量,可能都是写在别的地方然后引用的。 # sub_166C4 393 + + + if __name__ == '__main__': + done_index = 0 file_name_list = os.listdir('../A2C/') res_file = "../sample.jsonl" sample_file = open(res_file, mode='a') @@ -98,7 +102,9 @@ if __name__ == '__main__': json_temp = {"function_edges": function_edges, "acfg_list": acfg_list, "function_names": fun_name_temp, "hash": file_hash, "function_number": function_number} json_str = json.dumps(json_temp) - sample_file.write(json_str) + sample_file.write(json_str + '\n') + print "完成写入" + str(done_index) + done_index += 1 else: print "删除文件" + file_path os.remove(file_path) diff --git a/req.txt b/req.txt new file mode 100644 index 0000000..00781a1 Binary files /dev/null and b/req.txt differ