判断空指令

This commit is contained in:
huihun 2024-04-21 21:49:12 +08:00
parent df008989f1
commit 61a44a27c5

View File

@@ -1,11 +1,11 @@
import concurrent.futures
import os import os
from my_utils import save_json, load_json, setup_logger, multi_thread_order, THREAD_HALF, THREAD_FULL, continuation_json from my_utils import save_json, load_json, setup_logger
from bert.obtain_inst_vec import bb2vec from bert.obtain_inst_vec import bb2vec
import multiprocessing import multiprocessing
from tqdm import tqdm from tqdm import tqdm
import warnings import warnings
from datetime import datetime from datetime import datetime
import concurrent.futures
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
@@ -17,6 +17,8 @@ def addr2vec(base_file_path, index):
# 如果不是路径则开始转化 # 如果不是路径则开始转化
if file_name: if file_name:
# 无操作码标志位
none_opcode_flag = False
# 忽略已生成的文件 # 忽略已生成的文件
if os.path.exists(os.path.join(file_path, 'final', file_name)): if os.path.exists(os.path.join(file_path, 'final', file_name)):
return return
@@ -25,38 +27,50 @@ def addr2vec(base_file_path, index):
feature_json = load_json(os.path.join(file_path, 'feature', file_name)) if os.path.exists( feature_json = load_json(os.path.join(file_path, 'feature', file_name)) if os.path.exists(
os.path.join(file_path, 'feature', file_name)) else None os.path.join(file_path, 'feature', file_name)) else None
if feature_json is not None: if feature_json is not None:
# 如果出现无操作码的情况,直接跳过文件
# 对于长度过长的文件先不处理 for item in feature_json:
if len(feature_json) > 10000: if len(item['opcode']) == 0:
data = { logger.error(f"基础块无操作码 {file_name},地址{item['addr']}")
'file_name': file_name, none_opcode_flag = True
'feature_len': len(feature_json) if none_opcode_flag:
}
continuation_json(os.path.join(f'./out/json/too_long_{sample_type}.json'), data)
return return
# 对于长度过长的文件先不处理
# if len(feature_json) > 10000:
# data = {
# 'file_name': file_name,
# 'feature_len': len(feature_json)
# }
# continuation_json(os.path.join(f'./out/json/too_long_{sample_type}.json'), data)
# return
# 多线程预测bert # 多线程预测bert
feature_set = {} feature_set = {}
# with multiprocessing.Pool(processes=os.cpu_count()) as pool: with multiprocessing.Pool(processes=4) as pool:
# try: try:
# results = list(tqdm(pool.imap_unordered(bb2vec, [item for item in feature_json]), results = list(tqdm(pool.imap_unordered(bb2vec, [item for item in feature_json]),
# total=len(feature_json), total=len(feature_json),
# desc=f'{file_name} Progress:{index}/{json_files_len} ', desc=f'{file_name} Progress:{index}/{json_files_len} ',
# leave=True, leave=False,
# dynamic_ncols=True)) dynamic_ncols=True,
# for result in results: position=1))
# feature_set[result[0]] = result[1] for result in results:
# except Exception as e: if result[0]:
# logger.error(f"bert 解析出错 {file_name}{e}") feature_set[result[1]] = result[2]
else:
logger.error(f"bert解析出错 {file_name},地址{result[1]},操作码{result[2]},报错{result[3]}")
return
except Exception as e:
logger.error(f"多线程解析出错:{file_name},报错{e}")
return
# debug # debug
try: # try:
for index, feature in tqdm(enumerate(feature_json), total=len(feature_json)): # for index, feature in tqdm(enumerate(feature_json), total=len(feature_json)):
addr, feature = bb2vec(feature) # addr, feature = bb2vec(feature)
feature_set[addr] = feature # feature_set[addr] = feature
except Exception as e: # except Exception as e:
print(index) # print(index)
print(e) # print(e)
print(feature['opcode']) # print(feature['opcode'])
try: try:
for item in file_json['acfg_list']: for item in file_json['acfg_list']:
@@ -73,12 +87,12 @@ def addr2vec(base_file_path, index):
if __name__ == '__main__': if __name__ == '__main__':
logger = setup_logger('feature2json', './log/feature2json.log') logger = setup_logger('feature2json', './log/feature2json.log', reset=True)
sample_type = 'benign' sample_type = 'benign'
# json_path = os.path.join(f'./out/json/{sample_type}') # json_path = os.path.join(f'./out/json/{sample_type}')
json_path = os.path.join(f'./out/json/{sample_type}') json_path = os.path.join(f'./out/json/{sample_type}')
# json_files = os.listdir(json_path) json_files = os.listdir(json_path)
json_files = ['1710ae16c54ac149f353ba58e752ba7069f88326e6b71107598283bd0fffcbd6.jsonl'] # json_files = ['1710ae16c54ac149f353ba58e752ba7069f88326e6b71107598283bd0fffcbd6.jsonl']
json_files_len = len(json_files) json_files_len = len(json_files)
now = datetime.now() now = datetime.now()
formatted_now = now.strftime("%Y-%m-%d %H:%M:%S") formatted_now = now.strftime("%Y-%m-%d %H:%M:%S")
@@ -88,6 +102,25 @@ if __name__ == '__main__':
# total=len(json_files))) # total=len(json_files)))
# multi_thread_order(addr2vec, [os.path.join(json_path, file) for file in json_files if # multi_thread_order(addr2vec, [os.path.join(json_path, file) for file in json_files if
# os.path.isfile(os.path.join(json_path, file))], thread_num=THREAD_FULL) # os.path.isfile(os.path.join(json_path, file))], thread_num=THREAD_FULL)
for index, json_file in enumerate(json_files): # with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
# tqdm_position = 1
# try:
# future_args = {
# executor.submit(addr2vec, os.path.join(json_path, file), index, tqdm_position)
# for index, file in enumerate(json_files)
# }
# for future in tqdm(concurrent.futures.as_completed(future_args),
# total=len(json_files),
# desc='Total:',
# position=0
# ):
# tqdm_position += 1
# except Exception as e:
# print(e)
for index, json_file in tqdm(enumerate(json_files),
total=len(json_files),
desc='Total:',
position=0):
if os.path.isfile(os.path.join(json_path, json_file)): if os.path.isfile(os.path.join(json_path, json_file)):
addr2vec(os.path.join(json_path, json_file), index) addr2vec(os.path.join(json_path, json_file), index)