Gencoding_Ke/Genius3/raw-feature-extractor/ida_batch.py
TinyCaviar ddf9ff3b59 backup
2023-09-01 11:47:19 +08:00

100 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
import re
import os
import subprocess
import multiprocessing
from tqdm import tqdm
import time
# 单个pe文件处理超时/s
# 多次处理,一批数据中只有少量文件会超时
# 所有数据处理完成后可以对这些数据再进行一次更长超时时间的处理,若仍然超时则放弃
TIMEOUT = 60
# 每个家族最大处理数量
MAX_FAMILY_PROCESS_NUM = 200
def call_preprocess(cmd_line):
subprocess.call(cmd_line, shell=True)
def batch_mode(start, end):
# 只选其中这些类的pe进行分析其他的就直接跳过
families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
for workflow in range(start, end):
# pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
family_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow)
process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow)
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log, open(family_path,
'r') as family_file:
logged = log.readline()
if logged == '':
log_index = 0
else:
log_index = int(logged)
families = family_file.read()
for index, pe in enumerate(tqdm(sorted(os.listdir(pe_dir)))):
if index < log_index:
continue
# 匹配文件md5取出family文件中该md5的家族
regex = re.compile(pe[11:] + r'[\t][\S]*')
search_result = regex.findall(families)
if len(search_result) == 0:
continue
pe_family = search_result[0].split()[1]
if pe_family not in families_need_to_analyze:
continue
# FOR TEST ONLY
# cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
# workflow, os.path.join(pe_dir, pe))
cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
workflow, os.path.join(pe_dir, pe))
p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
p.start()
flag_kill = True
start = time.time()
while time.time() - start <= TIMEOUT:
if not p.is_alive():
flag_kill = False
break
else:
time.sleep(1)
if flag_kill:
subprocess.call('taskkill /im idaq64.exe /f')
process_log.write(
"index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
else:
# 正常运行结束
log.truncate(0)
log.seek(0)
log.write(str(index))
log.flush()
process_log.write("index {}, {} process done.\n".format(index, pe))
families_need_to_analyze[pe_family] += 1
# 一次workflow结束后将所有副产物删除
delete_output()
def delete_output():
out_dir = 'F:\\iout'
for f in os.listdir(out_dir):
if os.path.exists(os.path.join(out_dir, f)):
os.remove(os.path.join(out_dir, f))
# 注意该py文件必须放在IDA的根目录下且必须使用cmd命令执行否则无法链接到python库
# F:\\kkk\\IDA_6.6
if __name__ == '__main__':
batch_mode(36, 69)