基础块条数精简

This commit is contained in:
huihun 2024-04-18 10:46:29 +08:00
parent 7c8145b52a
commit 795e5f050e

View File

@@ -14,7 +14,7 @@ import multiprocessing
ret_trap_opcode_family = ["ret", "hlt", "int3", "ud2"] ret_trap_opcode_family = ["ret", "hlt", "int3", "ud2"]
sample_type = 'malware' sample_type = 'benign'
def extract_opcode(disasm_text): def extract_opcode(disasm_text):
@@ -71,6 +71,7 @@ def get_graph_cfg_r2pipe(r2pipe_open, file_path, output_path, file_name):
block_addr = block["addr"] block_addr = block["addr"]
block_Statement = [] block_Statement = []
node_list.append(block["addr"]) node_list.append(block["addr"])
# 获取基本块的反汇编指令 # 获取基本块的反汇编指令
disasm = r2pipe_open.cmdj("pdj " + str(block["ninstr"]) + " @" + str(block["addr"])) disasm = r2pipe_open.cmdj("pdj " + str(block["ninstr"]) + " @" + str(block["addr"]))
@@ -78,17 +79,13 @@ def get_graph_cfg_r2pipe(r2pipe_open, file_path, output_path, file_name):
# if len(disasm) != 49144: # if len(disasm) != 49144:
# continue # continue
if disasm: if disasm:
last_op = '' block_len = len(disasm)
# if len(disasm) > 200:
# logger.warning(
# f"基础块指令长度异常,{file_path},函数名称{function['name']}基础块地址{block['addr']},长度{len(disasm)}")
for op_index, op in enumerate(disasm): for op_index, op in enumerate(disasm):
op_disasm = extract_opcode(op["disasm"])
# 防止大量重复的语句造成内存溢出
if op_disasm == last_op:
continue
last_op = op_disasm
# 提取操作码并转换为bert模型输入格式 # 提取操作码并转换为bert模型输入格式
op_disasm = extract_opcode(op["disasm"])
# 如果单个基础块的长度大于20且操作码重复则跳过
if block_len > 20 and op_disasm in block_Statement:
continue
block_Statement.append(op_disasm) block_Statement.append(op_disasm)
# 处理跳转码并构建cfg # 处理跳转码并构建cfg
if 'jump' in op: if 'jump' in op:
@@ -129,6 +126,8 @@ def get_graph_cfg_r2pipe(r2pipe_open, file_path, output_path, file_name):
# 当前指令是基础块的最后一条指令 # 当前指令是基础块的最后一条指令
if op not in ret_trap_opcode_family or op["type"] not in ["ret", "trap"]: if op not in ret_trap_opcode_family or op["type"] not in ["ret", "trap"]:
temp_edge_list.append([block_addr, op["offset"] + op["size"]]) temp_edge_list.append([block_addr, op["offset"] + op["size"]])
if block_len > 20:
logger.warning(f"二进制可执行文件解析警告基础块长度大于20文件{file_path},基础块地址{block_addr},操作码长度{block_len}->{len(block_Statement)}")
# debugger # debugger
# print(len(disasm)) # print(len(disasm))
@@ -257,7 +256,7 @@ if __name__ == '__main__':
sample_file_list = os.listdir(sample_file_path) sample_file_list = os.listdir(sample_file_path)
print(f"max worker {os.cpu_count()}") print(f"max worker {os.cpu_count()}")
with multiprocessing.Pool(processes=os.cpu_count()) as pool: with multiprocessing.Pool(processes=os.cpu_count()) as pool:
result = list(tqdm(pool.imap_unordered(exe_to_json, [os.path.join(sample_file_path, file_name) for file_name in sample_file_list]), total=len(sample_file_list))) result = list(tqdm(pool.imap_unordered(exe_to_json, [os.path.join(sample_file_path, file_name) for file_name in sample_file_list[::-1]]), total=len(sample_file_list)))
# with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: # with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: