Gencoding_Ke/Genius3/raw-feature-extractor/ida_batch.py
TinyCaviar ad2583dba9 backup
2023-11-16 15:31:12 +08:00

201 lines
7.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
import re
import os
import subprocess
import multiprocessing
from tqdm import tqdm
import time
# 单个pe文件处理超时/s
# 多次处理,一批数据中只有少量文件会超时
# 所有数据处理完成后可以对这些数据再进行一次更长超时时间的处理,若仍然超时则放弃
TIMEOUT = 60
# 每个家族最大处理数量
MAX_FAMILY_PROCESS_NUM = 200
def call_preprocess(cmd_line):
subprocess.call(cmd_line, shell=True)
# 良性软件分析模式ida的命令中将workflow改为-1
def benign_batch_mode(overhaul):
# 总失败数据数量
total_failed = 0
log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log_benign.log'
process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log_benign.log'
benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
if overhaul:
if os.path.exists(log_path):
os.remove(log_path)
if os.path.exists(process_log_path):
os.remove(process_log_path)
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
logged = log.readline()
if logged == '':
log_index = 0
else:
log_index = int(logged)
pe_list = os.listdir(benign_pe_dir)
for index, pe in enumerate(tqdm(sorted(pe_list))):
if index < log_index:
continue
cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py -1" -oF:\iout {}'.format(
os.path.join(benign_pe_dir, pe))
p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
p.start()
flag_kill = True
start = time.time()
while time.time() - start <= TIMEOUT:
if not p.is_alive():
flag_kill = False
break
else:
time.sleep(1)
if flag_kill:
subprocess.call('taskkill /im idaq64.exe /f')
process_log.write(
"index {}, {} stuck, process terminated.\n".format(index, pe))
total_failed += 1
else:
# 正常运行结束
log.truncate(0)
log.seek(0)
log.write(str(index))
log.flush()
process_log.write("index {}, {} process done.\n".format(index, pe))
# 所有副产物删除
delete_output()
print('总失败数{}'.format(total_failed))
def mal_batch_mode(start, end, overhaul):
# 只选其中这些类的pe进行分析其他的就直接跳过
families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
# 记录ida处理报错的数据来自哪些家族
failed_family = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
# 总失败数据数量
total_failed = 0
for workflow in range(start, end):
# pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
family_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow)
process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow)
if overhaul:
if os.path.exists(log_path):
os.remove(log_path)
if os.path.exists(process_log_path):
os.remove(process_log_path)
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log, open(family_path,
'r') as family_file:
logged = log.readline()
if logged == '':
log_index = 0
else:
log_index = int(logged)
families = family_file.read()
for index, pe in enumerate(tqdm(sorted(os.listdir(pe_dir)))):
if index < log_index:
continue
# 匹配文件md5取出family文件中该md5的家族
regex = re.compile(pe[11:] + r'[\t][\S]*')
search_result = regex.findall(families)
if len(search_result) == 0:
continue
pe_family = search_result[0].split()[1]
if pe_family not in families_need_to_analyze:
continue
# FOR TEST ONLY
# cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
# workflow, os.path.join(pe_dir, pe))
cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
workflow, os.path.join(pe_dir, pe))
p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
p.start()
flag_kill = True
start = time.time()
while time.time() - start <= TIMEOUT:
if not p.is_alive():
flag_kill = False
break
else:
time.sleep(1)
if flag_kill:
subprocess.call('taskkill /im idaq64.exe /f')
process_log.write(
"index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
failed_family[pe_family] += 1
total_failed += 1
else:
# 正常运行结束
log.truncate(0)
log.seek(0)
log.write(str(index))
log.flush()
process_log.write("index {}, {} process done.\n".format(index, pe))
families_need_to_analyze[pe_family] += 1
# 一次workflow结束后将所有副产物删除
delete_output()
print(families_need_to_analyze)
print('\n')
print(failed_family, '总失败数{}'.format(total_failed))
def delete_output():
out_dir = 'F:\\iout'
for f in os.listdir(out_dir):
if os.path.exists(os.path.join(out_dir, f)):
os.remove(os.path.join(out_dir, f))
def generate_asm_batch_mode():
pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
pe_list = os.listdir(pe_dir)
for pe in tqdm(pe_list):
cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\generate_asm_file.py" -oF:\iout {}'.format(
os.path.join(pe_dir, pe))
p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
p.start()
while True:
if not p.is_alive():
break
else:
time.sleep(1)
delete_output()
# 注意该py文件必须放在IDA的根目录下且必须使用cmd命令执行否则无法链接到python库
# F:\\kkk\\IDA_6.6
if __name__ == '__main__':
benign_batch_mode(True)
# mal_batch_mode(35, 69, True)
# generate_asm_batch_mode()