backup

2023-11-16 15:31:12 +08:00 · 2023-11-16 15:31:12 +08:00 · ad2583dba9
commit ad2583dba9
parent d599236e94
5 changed files with 110 additions and 18 deletions
--- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py
+++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
@ -119,9 +119,9 @@ def convert(start, end, overhaul):
 def convert_benign(overhaul):
-    cfg_dir = "D:\\hkn\\infected\\datasets\\benign_cfg\\new"
+    cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
-    output_dir = "D:\\hkn\\infected\\datasets\\benign_json\\new"
+    dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot"
-    dot_dir = "D:\\hkn\\infected\\datasets\\benign_dot\\new"
+    output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl"
    log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
    process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"
@ -139,7 +139,8 @@ def convert_benign(overhaul):
        else:
            log_index = int(logged)
-        for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
+        cdg_list = os.listdir(cfg_dir)
        for index, cfg in enumerate(tqdm(cdg_list)):
            if index < log_index:
                continue
@ -153,6 +154,8 @@ def convert_benign(overhaul):
            except ValueError:
                process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
                continue
            except KeyError:
                process_log.write("index {}, {} process failed. KeyError occurred.\n".format(index, cfg))
            finally:
                cfg_file.close()
@ -230,4 +233,4 @@ def convert_benign(overhaul):
 if __name__ == '__main__':
    # convert(35, 69)
-    convert_benign(True)
+    convert_benign(False)
--- a/Genius3/raw-feature-extractor/generate_asm_file.py
+++ b/Genius3/raw-feature-extractor/generate_asm_file.py
@ -0,0 +1,24 @@
 # coding=utf-8
 from func import *
 from idc import *
 def generate_asm_file():
    """Write an assembly listing for the current IDA input file, then exit IDA.

    The output name is the value returned by ``idc.GetInputFile()`` with
    ".asm" appended.
    """
    input_name = idc.GetInputFile()
    # Clear the AF_IMMOFF bit in the INF_START_AF flags, then call
    # idaapi.autoWait() before generating any output.
    start_flags = idc.GetShortPrm(idc.INF_START_AF) & ~idc.AF_IMMOFF
    idc.SetShortPrm(idc.INF_START_AF, start_flags)
    idaapi.autoWait()
    # Generate the .asm file for the PE file.
    asm_name = input_name + ".asm"
    idc.GenerateFile(idc.OFILE_ASM, asm_name, 0, idc.BADADDR, 0)
    # Command-line mode still has to open IDA Pro, so close it automatically
    # once the work is done.
    idc.Exit(0)
 if __name__ == '__main__':
    generate_asm_file()
--- a/Genius3/raw-feature-extractor/ida_batch.py
+++ b/Genius3/raw-feature-extractor/ida_batch.py
@ -26,7 +26,7 @@ def benign_batch_mode(overhaul):
    log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log_benign.log'
    process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log_benign.log'
-    benign_pe_dir = 'D:\\hkn\\infected\\datasets\\benign\\new'
+    benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
    if overhaul:
        if os.path.exists(log_path):
@ -41,7 +41,8 @@ def benign_batch_mode(overhaul):
        else:
            log_index = int(logged)
-        for index, pe in enumerate(tqdm(sorted(os.listdir(benign_pe_dir)))):
+        pe_list = os.listdir(benign_pe_dir)
        for index, pe in enumerate(tqdm(sorted(pe_list))):
            if index < log_index:
                continue
@ -78,7 +79,7 @@ def benign_batch_mode(overhaul):
    print('总失败数{}'.format(total_failed))
-def mal_batch_mode(start, end):
+def mal_batch_mode(start, end, overhaul):
    # 只选其中这些类的pe进行分析，其他的就直接跳过
    families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
                                'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
@ -94,6 +95,13 @@ def mal_batch_mode(start, end):
        family_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
        log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow)
        process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow)
        if overhaul:
            if os.path.exists(log_path):
                os.remove(log_path)
            if os.path.exists(process_log_path):
                os.remove(process_log_path)
        with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log, open(family_path,
                                                                                            'r') as family_file:
            logged = log.readline()
@ -165,8 +173,28 @@ def delete_output():
            os.remove(os.path.join(out_dir, f))
 def generate_asm_batch_mode():
    """Run generate_asm_file.py through idaq64 for every entry of the benign PE directory.

    For each name returned by ``os.listdir(pe_dir)`` a worker process is
    started with ``call_preprocess`` as its target and the idaq64 command line
    as its only argument. Entries are handled strictly one at a time: the next
    worker is not started until the previous one has exited. After the last
    entry, ``delete_output()`` is called.
    """
    pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
    pe_list = os.listdir(pe_dir)
    for pe in tqdm(pe_list):
        cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\generate_asm_file.py" -oF:\iout {}'.format(
            os.path.join(pe_dir, pe))
        p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
        p.start()
        # Block until the worker exits. join() returns as soon as the process
        # ends, instead of polling is_alive() once per second.
        p.join()
    delete_output()
 # Note: this .py file must be placed in the IDA root directory and must be run from cmd; otherwise it cannot link to the Python libraries
 # F:\\kkk\\IDA_6.6
 if __name__ == '__main__':
    benign_batch_mode(True)
-    # mal_batch_mode(35, 69)
+    # mal_batch_mode(35, 69, True)
    # generate_asm_batch_mode()
--- a/Genius3/raw-feature-extractor/preprocessing_ida.py
+++ b/Genius3/raw-feature-extractor/preprocessing_ida.py
@ -17,11 +17,13 @@ def preprocess():
    workflow = idc.ARGV[1]
    # workflow为特定值时分析良性软件，否则分析恶意软件
    if workflow == '-1':
-        cfg_path = "D:\\hkn\\infected\\datasets\\benign_cfg\\new"
+        cfg_path = "F:\\kkk\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name)
-        gdl_path = "D:\\hkn\\infected\\datasets\\benign_dot\\new\\{}.dot".format(binary_name)
+        gdl_path = "F:\\kkk\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name)
        asm_path = "F:\\kkk\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name)
    else:
-        cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
+        cfg_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg\\{}.ida".format(workflow, binary_name)
        gdl_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\{}.dot".format(workflow, binary_name)
        asm_path = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_asm\\{}.asm".format(workflow, binary_name)
    analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
    analysis_flags &= ~idc.AF_IMMOFF
@ -30,14 +32,17 @@ def preprocess():
    # 生成pe文件的cfg列表
    cfgs = get_func_cfgs_c(FirstSeg())
-    # 生成pe文件的fcg
+    # 将cfg保存为.ida
    pickle.dump(cfgs, open(cfg_path, 'w'))
    # 生成pe文件的fcg，保存为.dot文件
    # idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL) 这个生成gdl文件，网上几乎找不到gdl这个格式
    idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
-    full_path = os.path.join(cfg_path, binary_name + '.ida')
+    # 生成.asm文件
-    pickle.dump(cfgs, open(full_path, 'w'))
+    idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
-    # 由于命令行模式也必须打开ida pro，因此每次结束自动关闭ida
+    # 关闭IDA Pro
    idc.Exit(0)
--- a/Genius3/raw-feature-extractor/test.py
+++ b/Genius3/raw-feature-extractor/test.py
@ -7,6 +7,7 @@ import json
 import random
 import shutil
 from tqdm import tqdm
 import csv
 def func():
@ -247,14 +248,45 @@ def delete_pe():
            # os.remove(os.path.join(dot_dir, cfg))
 def delete_error_benign():
    """Remove benign PE files that have no .jsonl output, plus their derived files.

    For every entry of the PE directory that lacks "<name>.jsonl" in the jsonl
    directory, the PE file itself is removed, followed by its ".asm", ".ida"
    and ".dot" files wherever those exist.
    """
    pe_dir = "F:\\kkk\\dataset\\benign\\refind"
    jsonl_dir = 'F:\\kkk\\dataset\\benign\\refind_jsonl'
    # (directory, suffix) of each derived artifact, listed in removal order.
    derived = (
        ("F:\\kkk\\dataset\\benign\\refind_asm", '.asm'),
        ("F:\\kkk\\dataset\\benign\\refind_cfg", '.ida'),
        ('F:\\kkk\\dataset\\benign\\refind_dot', '.dot'),
    )
    for name in os.listdir(pe_dir):
        # Entries that already have a .jsonl file are left untouched.
        if os.path.exists(os.path.join(jsonl_dir, name + '.jsonl')):
            continue
        os.remove(os.path.join(pe_dir, name))
        for out_dir, suffix in derived:
            target = os.path.join(out_dir, name + suffix)
            if os.path.exists(target):
                os.remove(target)
 def generate_benign_csv():
    """Write a CSV listing every entry of the benign PE directory with Class '5'.

    The file has a header row ("Id", "Class") followed by one row per name
    returned by ``os.listdir`` on the PE directory.
    """
    benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
    csv_out = 'F:\\kkk\\dataset\\benign_family.csv'
    id_col, class_col = 'Id', 'Class'
    with open(csv_out, "wb") as output_file:
        writer = csv.DictWriter(output_file, fieldnames=[id_col, class_col])
        writer.writeheader()
        # The directory is listed only after the header row has been written.
        writer.writerows(
            {id_col: name, class_col: '5'} for name in os.listdir(benign_pe_dir)
        )
 if __name__ == '__main__':
    generate_benign_csv()
    # create_pixel_intensity()
    # create_dir()
    # change_max_item_lines()
    # subprocess.call('taskkill /im idaq64.exe /f')
-    # delete_error()
+    # delete_error_benign()
    # test()
    # delete_jsonl()
-    delete_all_local()
+    # delete_all_local()
    # check_json()
    # delete_pe()