This commit is contained in:
TinyCaviar 2023-09-01 11:47:19 +08:00
parent 4637fd0d97
commit ddf9ff3b59
4 changed files with 69 additions and 23 deletions

View File

@ -55,7 +55,7 @@ def convert(start, end):
elif 'label' in line: elif 'label' in line:
functions_list.append(line[line.find('= "') + 3:line.find('",')]) functions_list.append(line[line.find('= "') + 3:line.find('",')])
# 没有内部函数被检测到,保险起见还是不要这数据了 # 没有内部函数被检测到,正常来说不应该,保险起见还是不要这数据了
if raw_function_edges.__len__() == 0: if raw_function_edges.__len__() == 0:
continue continue
@ -113,4 +113,4 @@ def convert(start, end):
if __name__ == '__main__': if __name__ == '__main__':
convert(0, 35) convert(35, 69)

View File

@ -1,4 +1,5 @@
# coding=utf-8 # coding=utf-8
import re
import os import os
import subprocess import subprocess
import multiprocessing import multiprocessing
@ -10,35 +11,52 @@ import time
# 所有数据处理完成后可以对这些数据再进行一次更长超时时间的处理,若仍然超时则放弃 # 所有数据处理完成后可以对这些数据再进行一次更长超时时间的处理,若仍然超时则放弃
TIMEOUT = 60 TIMEOUT = 60
# 每个家族最大处理数量
MAX_FAMILY_PROCESS_NUM = 200
def call_preprocess(cmd_line): def call_preprocess(cmd_line):
subprocess.call(cmd_line, shell=True) subprocess.call(cmd_line, shell=True)
def batch_mode(start, end): def batch_mode(start, end):
# 只选其中这些类的pe进行分析其他的就直接跳过
families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
for workflow in range(start, end): for workflow in range(start, end):
# workflow = 0
pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
# for test
# pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test' # pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
family_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow) log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow)
process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow) process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow)
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log: with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log, open(family_path,
'r') as family_file:
logged = log.readline() logged = log.readline()
if logged == '': if logged == '':
log_index = 0 log_index = 0
else: else:
log_index = int(logged) log_index = int(logged)
# pe = "VirusShare_bc161e5e792028e8137aa070fda53f82" families = family_file.read()
for index, pe in enumerate(tqdm(sorted(os.listdir(pe_dir)))): for index, pe in enumerate(tqdm(sorted(os.listdir(pe_dir)))):
if index < log_index: if index < log_index:
continue continue
# for test # 匹配文件md5取出family文件中该md5的家族
regex = re.compile(pe[11:] + r'[\t][\S]*')
search_result = regex.findall(families)
if len(search_result) == 0:
continue
pe_family = search_result[0].split()[1]
if pe_family not in families_need_to_analyze:
continue
# FOR TEST ONLY
# cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format( # cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
# workflow, os.path.join(pe_dir, pe)) # workflow, os.path.join(pe_dir, pe))
cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(workflow, os.path.join(pe_dir, pe)) cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
workflow, os.path.join(pe_dir, pe))
p = multiprocessing.Process(target=call_preprocess, args=[cmd_line]) p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
p.start() p.start()
@ -53,7 +71,8 @@ def batch_mode(start, end):
if flag_kill: if flag_kill:
subprocess.call('taskkill /im idaq64.exe /f') subprocess.call('taskkill /im idaq64.exe /f')
process_log.write("index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow)) process_log.write(
"index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
else: else:
# 正常运行结束 # 正常运行结束
log.truncate(0) log.truncate(0)
@ -61,6 +80,8 @@ def batch_mode(start, end):
log.write(str(index)) log.write(str(index))
log.flush() log.flush()
process_log.write("index {}, {} process done.\n".format(index, pe)) process_log.write("index {}, {} process done.\n".format(index, pe))
families_need_to_analyze[pe_family] += 1
# 一次workflow结束后将所有副产物删除 # 一次workflow结束后将所有副产物删除
delete_output() delete_output()
@ -68,10 +89,11 @@ def batch_mode(start, end):
def delete_output(): def delete_output():
out_dir = 'F:\\iout' out_dir = 'F:\\iout'
for f in os.listdir(out_dir): for f in os.listdir(out_dir):
os.remove(f) if os.path.exists(os.path.join(out_dir, f)):
os.remove(os.path.join(out_dir, f))
# 注意该py文件必须放在IDA的根目录下且必须使用cmd命令执行否则无法链接到python库 # 注意该py文件必须放在IDA的根目录下且必须使用cmd命令执行否则无法链接到python库
# F:\\kkk\\IDA_6.6 # F:\\kkk\\IDA_6.6
if __name__ == '__main__': if __name__ == '__main__':
batch_mode(20, 35) batch_mode(36, 69)

View File

@ -6,6 +6,7 @@ import time
import json import json
import random import random
import shutil import shutil
from tqdm import tqdm
def func(): def func():
@ -25,21 +26,27 @@ def func1():
def create_dir(): def create_dir():
parent_dir = "D:\\hkn\\infected\\datasets" parent_dir = "D:\\hkn\\infected\\datasets"
for workflow in range(35, 40): for workflow in range(40, 70):
# 生成raw data文件夹 # 生成raw data文件夹
# infected = "virusshare_infected{}".format(workflow) infected = "virusshare_infected{}".format(workflow)
# cfg = "virusshare_infected{}_cfg".format(workflow) cfg = "virusshare_infected{}_cfg".format(workflow)
# dot = "virusshare_infected{}_dot".format(workflow) dot = "virusshare_infected{}_dot".format(workflow)
jsonl = "virusshare_infected{}_json".format(workflow) jsonl = "virusshare_infected{}_json".format(workflow)
# os.mkdir(os.path.join(parent_dir, infected)) create(parent_dir, infected)
# os.mkdir(os.path.join(parent_dir, cfg)) create(parent_dir, cfg)
# os.mkdir(os.path.join(parent_dir, dot)) create(parent_dir, dot)
os.mkdir(os.path.join(parent_dir, jsonl)) create(parent_dir, jsonl)
# iout = "virusshare_infected{}_iout".format(workflow) # iout = "virusshare_infected{}_iout".format(workflow)
# os.rmdir(os.path.join(parent_dir, iout)) # os.rmdir(os.path.join(parent_dir, iout))
# os.rmdir(os.path.join(parent_dir, ida)) # os.rmdir(os.path.join(parent_dir, ida))
def create(parent_dir, folder):
    """Create the directory ``folder`` inside ``parent_dir`` if it is missing.

    Args:
        parent_dir: Directory that will contain the new folder.
        folder: Name of the sub-directory to create.

    A path that already exists is left untouched and no error is raised.
    Any other failure reported by ``os.mkdir`` propagates as ``OSError``.
    """
    import errno

    target = os.path.join(parent_dir, folder)
    try:
        # Create directly rather than testing os.path.exists() first: the
        # check-then-create sequence can fail if the path appears in between.
        os.mkdir(target)
    except OSError as exc:
        # Only "already exists" is an expected, ignorable outcome.
        if exc.errno != errno.EEXIST:
            raise
def change_max_item_lines(): def change_max_item_lines():
f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb') f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb')
s = f.read() s = f.read()
@ -82,7 +89,7 @@ def delete_error():
def check_json(): def check_json():
for workflow in range(5, 16): for workflow in tqdm(range(0, 69)):
json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow) json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
for json_file in os.listdir(json_dir): for json_file in os.listdir(json_dir):
f = open(os.path.join(json_dir, json_file), 'r') f = open(os.path.join(json_dir, json_file), 'r')
@ -183,6 +190,22 @@ def read_test():
print(line[line.find('= "') + 3:line.find('",')]) print(line[line.find('= "') + 3:line.find('",')])
# Temporary utility: some PE files were never classified by API; delete them outright.
def del_redundant(start=0, end=68):
    """Delete PE samples that are not mentioned in their workflow's family file.

    Args:
        start: First workflow index to process (inclusive).
        end: Workflow index to stop at (exclusive). The defaults reproduce
            the previously hard-coded ``range(0, 68)``.

    For every workflow the family file is read in full. A sample is kept when
    its file name, with the first 11 characters removed, occurs anywhere in
    that text; otherwise the sample file is removed from the PE directory.

    WARNING: destructive -- files are removed with ``os.remove``.
    """
    for workflow in range(start, end):
        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
        family_file_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
        with open(family_file_path, 'r') as f_file:
            family = f_file.read()
        for name in os.listdir(pe_dir):
            # name[11:] drops the leading 11 characters -- presumably the
            # "VirusShare_" prefix, leaving the hash; confirm against the data.
            if name[11:] not in family:
                os.remove(os.path.join(pe_dir, name))
if __name__ == '__main__': if __name__ == '__main__':
# create_dir() # create_dir()
# change_max_item_lines() # change_max_item_lines()
@ -192,8 +215,9 @@ if __name__ == '__main__':
# delete_jsonl() # delete_jsonl()
# check_json() # check_json()
split_samples() split_samples()
rename() # rename()
# half_divide() # half_divide()
# copy_train_data() # copy_train_data()
# clear_dot() # clear_dot()
# read_test() # read_test()
# del_redundant()