diff --git a/exe2json.py b/exe2json.py index b14b162..eac91d3 100644 --- a/exe2json.py +++ b/exe2json.py @@ -13,7 +13,7 @@ import os os.environ["TOKENIZERS_PARALLELISM"] = "false" ret_trap_opcode_family = ["ret", "hlt", "int3", "ud2"] -sample_type = 'benign' +sample_type = 'malware' def extract_opcode(disasm_text): @@ -41,9 +41,10 @@ def calc_sha256(file_path): return sha256 -def get_graph_cfg_r2pipe(r2pipe_open, file_path): +def get_graph_cfg_r2pipe(r2pipe_open, file_path, feature_out): # CFG提取 acfg_item = [] + acfg_feature_item = [] try: # 获取函数列表 function_list = r2pipe_open.cmdj("aflj") @@ -132,17 +133,24 @@ def get_graph_cfg_r2pipe(r2pipe_open, file_path): # print(block_Statement) + """ + 速度过慢 + """ # bert模型转化特征 - block_feature_list = bb2vec(block_Statement) + # block_feature_list = bb2vec(block_Statement) # block_feature_list = [] + # 暂时将bb地址作为特征 后续将操作码集中转化为特征 + block_feature_list = block_addr + acfg_feature_item.append({'addr':block_addr, 'opcode':block_Statement}) + # 过滤不存在的边 for temp_edge in temp_edge_list: if temp_edge[0] in node_list and temp_edge[1] in node_list: edge_list.append(temp_edge) # 单独错误信息日志 - if block_number == 0 or len(block_feature_list) == 0: - logger.warning(f"二进制可执行文件解析出错,出错文件:{file_path},出错函数地址:{function['offset']},基础块个数{block_number},基础块特征{block_feature_list}") + if block_number == 0: + logger.warning(f"二进制可执行文件解析出错,出错文件:{file_path},出错函数地址:{function['offset']},基础块个数{block_number}") # cfg构建 acfg = { 'block_number': block_number, @@ -150,6 +158,8 @@ def get_graph_cfg_r2pipe(r2pipe_open, file_path): 'block_features': block_feature_list } acfg_item.append(acfg) + feature_out.write(json.dumps(acfg_feature_item)) + return True, "二进制可执行文件解析成功", acfg_item except Exception as e: return False, e, None @@ -222,7 +232,9 @@ def exe_to_json(file_path): # 获取r2pipe并解析文件 解析完即释放r2 r2 = get_r2pipe(file_path) fcg_Operation_flag, fcg_Operation_message, function_num, function_fcg_edge_list, function_names = get_graph_fcg_r2pipe(r2) - cfg_Operation_flag, cfg_Operation_message, cfg_item = get_graph_cfg_r2pipe(r2,file_path) + with open(os.path.join(output_path, 'feature' ,file_fingerprint + '.jsonl'), 'w') as feature_out: + cfg_Operation_flag, cfg_Operation_message, cfg_item = get_graph_cfg_r2pipe(r2,file_path, feature_out) + feature_out.close() r2.quit() # 文件json构建 @@ -256,7 +268,6 @@ if __name__ == '__main__': sample_file_path = f"/mnt/d/bishe/dataset/sample_{sample_type}" sample_file_list = os.listdir(sample_file_path) - # sample_file_list = ['001b1ca33bf52c5c09b3a852d0ac0254.exe'] multi_thread(exe_to_json, [os.path.join(sample_file_path, file_name) for file_name in sample_file_list], thread_num=THREAD_FULL) # test_file_path = '/mnt/d/bishe/exe2json/sample/VirusShare_0a3b625380161cf92c4bb10135326bb5' # exe_to_json(test_file_path)