From 795e5f050e887510e70348408a39a390c7d05315 Mon Sep 17 00:00:00 2001 From: huihun <781165206@qq.com> Date: Thu, 18 Apr 2024 10:46:29 +0800 Subject: [PATCH] =?UTF-8?q?=E5=9F=BA=E7=A1=80=E5=9D=97=E6=9D=A1=E6=95=B0?= =?UTF-8?q?=E7=B2=BE=E7=AE=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- exe2json.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/exe2json.py b/exe2json.py index 37574ba..1e8dad0 100644 --- a/exe2json.py +++ b/exe2json.py @@ -14,7 +14,7 @@ import multiprocessing ret_trap_opcode_family = ["ret", "hlt", "int3", "ud2"] -sample_type = 'malware' +sample_type = 'benign' def extract_opcode(disasm_text): @@ -71,6 +71,7 @@ def get_graph_cfg_r2pipe(r2pipe_open, file_path, output_path, file_name): block_addr = block["addr"] block_Statement = [] + node_list.append(block["addr"]) # 获取基本块的反汇编指令 disasm = r2pipe_open.cmdj("pdj " + str(block["ninstr"]) + " @" + str(block["addr"])) @@ -78,17 +79,13 @@ def get_graph_cfg_r2pipe(r2pipe_open, file_path, output_path, file_name): # if len(disasm) != 49144: # continue if disasm: - last_op = '' - # if len(disasm) > 200: - # logger.warning( - # f"基础块指令长度异常,{file_path},函数名称{function['name']}基础块地址{block['addr']},长度{len(disasm)}") + block_len = len(disasm) for op_index, op in enumerate(disasm): - op_disasm = extract_opcode(op["disasm"]) - # 防止大量重复的语句造成内存溢出 - if op_disasm == last_op: - continue - last_op = op_disasm # 提取操作码并转换为bert模型输入格式 + op_disasm = extract_opcode(op["disasm"]) + # 如果单个基础块的长度大于20且操作码重复,则跳过 + if block_len > 20 and op_disasm in block_Statement: + continue block_Statement.append(op_disasm) # 处理跳转码并构建cfg if 'jump' in op: @@ -129,6 +126,8 @@ def get_graph_cfg_r2pipe(r2pipe_open, file_path, output_path, file_name): # 当前指令是基础块的最后一条指令 if op not in ret_trap_opcode_family or op["type"] not in ["ret", "trap"]: temp_edge_list.append([block_addr, op["offset"] + op["size"]]) + if block_len > 20: + logger.warning(f"二进制可执行文件解析警告,基础块长度大于20,文件{file_path},基础块地址{block_addr},操作码长度{block_len}->{len(block_Statement)}") # debugger # print(len(disasm)) @@ -257,7 +256,7 @@ if __name__ == '__main__': sample_file_list = os.listdir(sample_file_path) print(f"max worker {os.cpu_count()}") with multiprocessing.Pool(processes=os.cpu_count()) as pool: - result = list(tqdm(pool.imap_unordered(exe_to_json, [os.path.join(sample_file_path, file_name) for file_name in sample_file_list]), total=len(sample_file_list))) + result = list(tqdm(pool.imap_unordered(exe_to_json, [os.path.join(sample_file_path, file_name) for file_name in sample_file_list[::-1]]), total=len(sample_file_list))) # with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: