diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py index 84cac1c..81afb45 100644 --- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py +++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py @@ -55,7 +55,7 @@ def convert(start, end): elif 'label' in line: functions_list.append(line[line.find('= "') + 3:line.find('",')]) - # 没有内部函数被检测到,保险起见还是不要这数据了 + # 没有内部函数被检测到,正常来说不应该,保险起见还是不要这数据了 if raw_function_edges.__len__() == 0: continue @@ -113,4 +113,4 @@ def convert(start, end): if __name__ == '__main__': - convert(0, 35) + convert(35, 69) diff --git a/Genius3/raw-feature-extractor/func.pyc b/Genius3/raw-feature-extractor/func.pyc index f85ad3f..f4025f5 100644 Binary files a/Genius3/raw-feature-extractor/func.pyc and b/Genius3/raw-feature-extractor/func.pyc differ diff --git a/Genius3/raw-feature-extractor/ida_batch.py b/Genius3/raw-feature-extractor/ida_batch.py index 796b6a3..490a24d 100644 --- a/Genius3/raw-feature-extractor/ida_batch.py +++ b/Genius3/raw-feature-extractor/ida_batch.py @@ -1,4 +1,5 @@ # coding=utf-8 +import re import os import subprocess import multiprocessing @@ -10,35 +11,52 @@ import time # 所有数据处理完成后可以对这些数据再进行一次更长超时时间的处理,若仍然超时则放弃 TIMEOUT = 60 +# 每个家族最大处理数量 +MAX_FAMILY_PROCESS_NUM = 200 + def call_preprocess(cmd_line): subprocess.call(cmd_line, shell=True) def batch_mode(start, end): + # 只选其中这些类的pe进行分析,其他的就直接跳过 + families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0, + 'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0} for workflow in range(start, end): - # workflow = 0 - pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow) - # for test # pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test' + pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow) + family_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow) log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow) process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow) - with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log: + with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log, open(family_path, + 'r') as family_file: logged = log.readline() if logged == '': log_index = 0 else: log_index = int(logged) - # pe = "VirusShare_bc161e5e792028e8137aa070fda53f82" + families = family_file.read() for index, pe in enumerate(tqdm(sorted(os.listdir(pe_dir)))): if index < log_index: continue - # for test + # 匹配文件md5,取出family文件中该md5的家族 + regex = re.compile(pe[11:] + r'[\t][\S]*') + search_result = regex.findall(families) + if len(search_result) == 0: + continue + + pe_family = search_result[0].split()[1] + if pe_family not in families_need_to_analyze: + continue + + # FOR TEST ONLY # cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format( # workflow, os.path.join(pe_dir, pe)) - cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(workflow, os.path.join(pe_dir, pe)) + cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format( + workflow, os.path.join(pe_dir, pe)) p = multiprocessing.Process(target=call_preprocess, args=[cmd_line]) p.start() @@ -53,7 +71,8 @@ def batch_mode(start, end): if flag_kill: subprocess.call('taskkill /im idaq64.exe /f') - process_log.write("index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow)) + process_log.write( + "index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow)) else: # 正常运行结束 log.truncate(0) @@ -61,6 +80,8 @@ def batch_mode(start, end): log.write(str(index)) log.flush() process_log.write("index {}, {} process done.\n".format(index, pe)) + + families_need_to_analyze[pe_family] += 1 # 一次workflow结束后将所有副产物删除 delete_output() @@ -68,10 +89,11 @@ def batch_mode(start, end): def delete_output(): out_dir = 'F:\\iout' for f in os.listdir(out_dir): - os.remove(f) + if os.path.exists(os.path.join(out_dir, f)): + os.remove(os.path.join(out_dir, f)) # 注意:该py文件必须放在IDA的根目录下,且必须使用cmd命令执行,否则无法链接到python库 # F:\\kkk\\IDA_6.6 if __name__ == '__main__': - batch_mode(20, 35) + batch_mode(36, 69) diff --git a/Genius3/raw-feature-extractor/test.py b/Genius3/raw-feature-extractor/test.py index 8cb8f16..6e3c460 100644 --- a/Genius3/raw-feature-extractor/test.py +++ b/Genius3/raw-feature-extractor/test.py @@ -6,6 +6,7 @@ import time import json import random import shutil +from tqdm import tqdm def func(): @@ -25,21 +26,27 @@ def func1(): def create_dir(): parent_dir = "D:\\hkn\\infected\\datasets" - for workflow in range(35, 40): + for workflow in range(40, 70): # 生成raw data文件夹 - # infected = "virusshare_infected{}".format(workflow) - # cfg = "virusshare_infected{}_cfg".format(workflow) - # dot = "virusshare_infected{}_dot".format(workflow) + infected = "virusshare_infected{}".format(workflow) + cfg = "virusshare_infected{}_cfg".format(workflow) + dot = "virusshare_infected{}_dot".format(workflow) jsonl = "virusshare_infected{}_json".format(workflow) - # os.mkdir(os.path.join(parent_dir, infected)) - # os.mkdir(os.path.join(parent_dir, cfg)) - # os.mkdir(os.path.join(parent_dir, dot)) - os.mkdir(os.path.join(parent_dir, jsonl)) + create(parent_dir, infected) + create(parent_dir, cfg) + create(parent_dir, dot) + create(parent_dir, jsonl) # iout = "virusshare_infected{}_iout".format(workflow) # os.rmdir(os.path.join(parent_dir, iout)) # os.rmdir(os.path.join(parent_dir, ida)) +def create(parent_dir, folder): + if not os.path.exists(os.path.join(parent_dir, folder)): + os.mkdir(os.path.join(parent_dir, folder)) + + + def change_max_item_lines(): f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb') s = f.read() @@ -82,7 +89,7 @@ def delete_error(): def check_json(): - for workflow in range(5, 16): + for workflow in tqdm(range(0, 69)): json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow) for json_file in os.listdir(json_dir): f = open(os.path.join(json_dir, json_file), 'r') @@ -183,6 +190,22 @@ def read_test(): print(line[line.find('= "') + 3:line.find('",')]) +# 临时工具,有些pe文件没有经过api分类,直接删掉 +def del_redundant(): + for workflow in range(0, 68): + pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow) + family_file_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow) + + with open(family_file_path, 'r') as f_file: + family = f_file.read() + for name in os.listdir(pe_dir): + if name[11:] in family: + continue + else: + # print(name) + os.remove(os.path.join(pe_dir, name)) + + if __name__ == '__main__': # create_dir() # change_max_item_lines() @@ -192,8 +215,9 @@ if __name__ == '__main__': # delete_jsonl() # check_json() split_samples() - rename() + # rename() # half_divide() # copy_train_data() # clear_dot() - # read_test() \ No newline at end of file + # read_test() + # del_redundant()