diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py index 56bae4c..837483b 100644 --- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py +++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py @@ -119,9 +119,9 @@ def convert(start, end, overhaul): def convert_benign(overhaul): - cfg_dir = "D:\\hkn\\infected\\datasets\\benign_cfg\\new" - output_dir = "D:\\hkn\\infected\\datasets\\benign_json\\new" - dot_dir = "D:\\hkn\\infected\\datasets\\benign_dot\\new" + cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg" + dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot" + output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl" log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log" process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log" @@ -139,7 +139,8 @@ def convert_benign(overhaul): else: log_index = int(logged) - for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))): + cdg_list = os.listdir(cfg_dir) + for index, cfg in enumerate(tqdm(cdg_list)): if index < log_index: continue @@ -153,6 +154,8 @@ def convert_benign(overhaul): except ValueError: process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg)) continue + except KeyError: + process_log.write("index {}, {} process failed. KeyError occurred.\n".format(index, cfg)) finally: cfg_file.close() @@ -230,4 +233,4 @@ def convert_benign(overhaul): if __name__ == '__main__': # convert(35, 69) - convert_benign(True) + convert_benign(False) diff --git a/Genius3/raw-feature-extractor/generate_asm_file.py b/Genius3/raw-feature-extractor/generate_asm_file.py new file mode 100644 index 0000000..598806a --- /dev/null +++ b/Genius3/raw-feature-extractor/generate_asm_file.py @@ -0,0 +1,24 @@ +# coding=utf-8 +from func import * +from idc import * + + +def generate_asm_file(): + binary_name = idc.GetInputFile() + + # workflow = idc.ARGV[1] + + analysis_flags = idc.GetShortPrm(idc.INF_START_AF) + analysis_flags &= ~idc.AF_IMMOFF + idc.SetShortPrm(idc.INF_START_AF, analysis_flags) + idaapi.autoWait() + + # 生成pe文件的asm文件 + idc.GenerateFile(idc.OFILE_ASM, binary_name + ".asm", 0, idc.BADADDR, 0) + + # 由于命令行模式也必须打开ida pro,因此每次结束自动关闭ida + idc.Exit(0) + + +if __name__ == '__main__': + generate_asm_file() diff --git a/Genius3/raw-feature-extractor/ida_batch.py b/Genius3/raw-feature-extractor/ida_batch.py index 8f5bcdf..7849282 100644 --- a/Genius3/raw-feature-extractor/ida_batch.py +++ b/Genius3/raw-feature-extractor/ida_batch.py @@ -26,7 +26,7 @@ def benign_batch_mode(overhaul): log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log_benign.log' process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log_benign.log' - benign_pe_dir = 'D:\\hkn\\infected\\datasets\\benign\\new' + benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind' if overhaul: if os.path.exists(log_path): @@ -41,7 +41,8 @@ def benign_batch_mode(overhaul): else: log_index = int(logged) - for index, pe in enumerate(tqdm(sorted(os.listdir(benign_pe_dir)))): + pe_list = os.listdir(benign_pe_dir) + for index, pe in enumerate(tqdm(sorted(pe_list))): if index < log_index: continue @@ -78,7 +79,7 @@ def benign_batch_mode(overhaul): print('总失败数{}'.format(total_failed)) -def mal_batch_mode(start, end): +def mal_batch_mode(start, end, overhaul): # 只选其中这些类的pe进行分析,其他的就直接跳过 families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0, 'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0} @@ -94,6 +95,13 @@ def mal_batch_mode(start, end): family_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow) log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow) process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow) + + if overhaul: + if os.path.exists(log_path): + os.remove(log_path) + if os.path.exists(process_log_path): + os.remove(process_log_path) + with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log, open(family_path, 'r') as family_file: logged = log.readline() @@ -165,8 +173,28 @@ def delete_output(): os.remove(os.path.join(out_dir, f)) +def generate_asm_batch_mode(): + pe_dir = 'F:\\kkk\\dataset\\benign\\refind' + pe_list = os.listdir(pe_dir) + for pe in tqdm(pe_list): + cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\generate_asm_file.py" -oF:\iout {}'.format( + os.path.join(pe_dir, pe)) + + p = multiprocessing.Process(target=call_preprocess, args=[cmd_line]) + p.start() + while True: + if not p.is_alive(): + break + else: + time.sleep(1) + + delete_output() + + # 注意:该py文件必须放在IDA的根目录下,且必须使用cmd命令执行,否则无法链接到python库 # F:\\kkk\\IDA_6.6 if __name__ == '__main__': benign_batch_mode(True) - # mal_batch_mode(35, 69) + # mal_batch_mode(35, 69, True) + # generate_asm_batch_mode() + diff --git a/Genius3/raw-feature-extractor/preprocessing_ida.py b/Genius3/raw-feature-extractor/preprocessing_ida.py index 507d83c..31ac46c 100644 --- a/Genius3/raw-feature-extractor/preprocessing_ida.py +++ b/Genius3/raw-feature-extractor/preprocessing_ida.py @@ -17,11 +17,13 @@ def preprocess(): workflow = idc.ARGV[1] # workflow为特定值时分析良性软件,否则分析恶意软件 if workflow == '-1': - cfg_path = "D:\\hkn\\infected\\datasets\\benign_cfg\\new" - gdl_path = "D:\\hkn\\infected\\datasets\\benign_dot\\new\\{}.dot".format(binary_name) + cfg_path = "F:\\kkk\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name) + gdl_path = "F:\\kkk\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name) + asm_path = "F:\\kkk\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name) else: - cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow) + cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg\\{}.ida".format(workflow, binary_name) gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name) + asm_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_asm\\{}.asm".format(workflow, binary_name) analysis_flags = idc.GetShortPrm(idc.INF_START_AF) analysis_flags &= ~idc.AF_IMMOFF @@ -30,14 +32,17 @@ def preprocess(): # 生成pe文件的cfg列表 cfgs = get_func_cfgs_c(FirstSeg()) - # 生成pe文件的fcg + # 将cfg保存为.ida + pickle.dump(cfgs, open(cfg_path, 'w')) + + # 生成pe文件的fcg,保存为.dot文件 # idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL) 这个生成gdl文件,网上几乎找不到gdl这个格式 idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT) - full_path = os.path.join(cfg_path, binary_name + '.ida') - pickle.dump(cfgs, open(full_path, 'w')) + # 生成.asm文件 + idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0) - # 由于命令行模式也必须打开ida pro,因此每次结束自动关闭ida + # 关闭IDA Pro idc.Exit(0) diff --git a/Genius3/raw-feature-extractor/test.py b/Genius3/raw-feature-extractor/test.py index 722739c..7a7ad41 100644 --- a/Genius3/raw-feature-extractor/test.py +++ b/Genius3/raw-feature-extractor/test.py @@ -7,6 +7,7 @@ import json import random import shutil from tqdm import tqdm +import csv def func(): @@ -247,14 +248,45 @@ def delete_pe(): # os.remove(os.path.join(dot_dir, cfg)) +def delete_error_benign(): + jsonl_dir = 'F:\\kkk\\dataset\\benign\\refind_jsonl' + dot_dir = 'F:\\kkk\\dataset\\benign\\refind_dot' + cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg" + asm_dir = "F:\\kkk\\dataset\\benign\\refind_asm" + pe_dir = "F:\\kkk\\dataset\\benign\\refind" + alist = os.listdir(pe_dir) + for f in alist: + if not os.path.exists(os.path.join(jsonl_dir, f + '.jsonl')): + os.remove(os.path.join(pe_dir, f)) + if os.path.exists(os.path.join(asm_dir, f + '.asm')): + os.remove(os.path.join(asm_dir, f + '.asm')) + if os.path.exists(os.path.join(cfg_dir, f + '.ida')): + os.remove(os.path.join(cfg_dir, f + '.ida')) + if os.path.exists(os.path.join(dot_dir, f + '.dot')): + os.remove(os.path.join(dot_dir, f + '.dot')) + + +def generate_benign_csv(): + benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind' + csv_out = 'F:\\kkk\\dataset\\benign_family.csv' + fieldnames = ['Id', 'Class'] + with open(csv_out, "wb") as output_file: + writer = csv.DictWriter(output_file, fieldnames=fieldnames) + writer.writeheader() + for f in os.listdir(benign_pe_dir): + writer.writerow({fieldnames[0]: f, fieldnames[1]: '5'}) + + if __name__ == '__main__': + generate_benign_csv() + # create_pixel_intensity() # create_dir() # change_max_item_lines() # subprocess.call('taskkill /im idaq64.exe /f') - # delete_error() + # delete_error_benign() # test() # delete_jsonl() - delete_all_local() + # delete_all_local() # check_json() # delete_pe()