批量化操作

2024-03-01 14:45:10 +08:00 · 2024-03-01 14:45:10 +08:00 · 0f1e3378a2
commit 0f1e3378a2
parent 8063d079db
2 changed files with 173 additions and 142 deletions
--- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py
+++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
@ -1,4 +1,5 @@
 # coding=utf-8
+import hashlib
 import pickle as pk
 import re
 import json
@ -6,125 +7,133 @@ import os
 from tqdm import tqdm


-def convert(start, end, overhaul):
-    for workflow in range(start, end):
-        # workflow = 0
-        cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
-        output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
-        dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
+def calc_sha256(file_path):
+    """Return the hexadecimal SHA-256 digest of the file at ``file_path``.
+
+    The file is opened in binary mode and hashed chunk by chunk, so a large
+    sample is never held in memory in one piece.
+    """
+    sha256obj = hashlib.sha256()
+    with open(file_path, 'rb') as f:
+        # iter() with a sentinel stops on the empty read returned at EOF.
+        for chunk in iter(lambda: f.read(1 << 20), b''):
+            sha256obj.update(chunk)
+    return sha256obj.hexdigest()

-        log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
-        process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)

-        if overhaul:
-            if os.path.exists(log_path):
-                os.remove(log_path)
-            if os.path.exists(process_log_path):
-                os.remove(process_log_path)
+def convert_malware(overhaul):
+    """Convert the pickled malware CFGs and their dot call graphs to .jsonl files.
+
+    For every entry of cfg_dir the matching ``.ida`` pickle and ``.dot`` file
+    are read, merged into one JSON object and written to output_dir as
+    ``<name>.jsonl``. The index of the last converted entry is stored in
+    log_path, and entries with a smaller index are skipped on the next run.
+
+    overhaul: when true, the progress log and the process log are deleted
+        first, so the conversion starts again from index 0.
+
+    NOTE(review): the progress log is opened in 'a+' mode and read with
+    readline() straight away; whether that sees the stored index depends on
+    the initial file position - confirm that resuming actually works.
+    """
+    cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg"
+    output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl"
+    dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot"
+    raw_dir = "D:\\bishe\\dataset\\train_malware"

-        with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
-            logged = log.readline()
-            if logged == '':
-                log_index = 0
+    log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log"
+    process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log"
+
+    if overhaul:
+        if os.path.exists(log_path):
+            os.remove(log_path)
+        if os.path.exists(process_log_path):
+            os.remove(process_log_path)
+
+    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
+        logged = log.readline()
+        if logged == '':
+            log_index = 0
+        else:
+            log_index = int(logged)
+
+        for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
+            if index < log_index:
+                continue
+
+            name = cfg[:-4]  # bare file name, without the extension
+            cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
+            try:
+                data = pk.load(cfg_file)
+            except EOFError:
+                process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
+                continue
+            except ValueError:
+                process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
+                continue
+            finally:
+                cfg_file.close()
+
+            dot_file_path = os.path.join(dot_dir, name + '.dot')
+            if not os.path.exists(dot_file_path):
+                process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
            else:
-                log_index = int(logged)
+                # Open the dot file to obtain the FCG
+                raw_function_edges = []
+                # 2023.8.12 bug fix: the FCG (.dot) file generated by IDA contains all functions, whereas data.raw_graph_list only contains the internal ones
+                functions_list = []
+                with open(dot_file_path, 'r') as dot:
+                    for line in dot:
+                        if '->' in line:
+                            raw_function_edges.append(re.findall(r'\b\d+\b', line))
+                        elif 'label' in line:
+                            functions_list.append(line[line.find('= "') + 3:line.find('",')])

-            for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
-                if index < log_index:
+                # No internal function was detected; this should not normally happen, so to be safe the sample is dropped
+                if raw_function_edges.__len__() == 0:
                    continue

-                name = cfg[:-4]  # 纯文件名，不带后缀
-                cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
-                try:
-                    data = pk.load(cfg_file)
-                except EOFError:
-                    process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
-                    continue
-                except ValueError:
-                    process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
-                    continue
-                finally:
-                    cfg_file.close()
+                # Build the JSON object for the current PE file
+                json_obj = {
+                    'hash': calc_sha256(raw_dir + "\\" + name),
+                    # 2023.8.12 bug fix: the expression below only yields the number of internal functions
+                    # 'function_number': data.raw_graph_list.__len__(),
+                    'function_number': len(functions_list),
+                    'function_edges': [[int(d[0]) for d in raw_function_edges],
+                                       [int(d[1]) for d in raw_function_edges]],
+                    'acfg_list': [],
+                    'function_names': functions_list
+                }

-                dot_file_path = os.path.join(dot_dir, name + '.dot')
-                if not os.path.exists(dot_file_path):
-                    process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
-                else:
-                    # 打开dot文件获取fcg
-                    raw_function_edges = []
-                    # 2023.8.12 bug fix: ida生成的fcg(.dot)文件包含了所有函数，data.raw_graph_list仅包含了内部函数
-                    functions_list = []
-                    with open(dot_file_path, 'r') as dot:
-                        for line in dot:
-                            if '->' in line:
-                                raw_function_edges.append(re.findall(r'\b\d+\b', line))
-                            elif 'label' in line:
-                                functions_list.append(line[line.find('= "') + 3:line.find('",')])
-
-                    # 没有内部函数被检测到，正常来说不应该，保险起见还是不要这数据了
-                    if raw_function_edges.__len__() == 0:
+                # 2023.8.12 bug fix: data.raw_graph_list holds the internal functions detected by IDA and excludes external ones, so the function list and function count cannot be taken from it
+                # Walk the unpickled data; each ACFG is derived from one function
+                for acfg in data.raw_graph_list:
+                    # External function: no CFG needs to be built
+                    if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
                        continue

-                    # 为当前pe文件创建json对象
-                    json_obj = {
-                        'hash': data.binary_name[11:],
-                        # 2023.8.12 bug fix: 这里获取的是内部函数的数量
-                        # 'function_number': data.raw_graph_list.__len__(),
-                        'function_number': len(functions_list),
-                        'function_edges': [[int(d[0]) for d in raw_function_edges],
-                                           [int(d[1]) for d in raw_function_edges]],
-                        'acfg_list': [],
-                        'function_names': functions_list
+                    # Index 2 is used because the Genius framework stores the offspring count at position 2 when extracting features
+                    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
+                    # For unknown reasons the two arrays may differ in length, although they should match
+                    # Trust the framework: trim bb_features to the same length as g.node
+                    diff = acfg.g.__len__() - len(acfg.bb_features)
+                    if diff != 0:
+                        del acfg.bb_features[diff:]
+                    # Append the offspring-count feature to bb_features
+
+                    for i, offs in enumerate(offspring):
+                        acfg.bb_features[i].append(offs)
+
+                    acfg_item = {
+                        'block_number': acfg.g.__len__(),
+                        'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
+                        'block_features': acfg.bb_features
                    }

-                    # 2023.8.12 bug fix: data.raw_graph_list是ida检测到的内部函数，不包括外部函数，因此函数列表和函数数量不能从这里获取
-                    # 读取pkl文件，一个acfg由一个函数分解而来
-                    for acfg in data.raw_graph_list:
-                        # 函数为外部函数，不需要构建cfg
-                        if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
-                            continue
+                    json_obj['acfg_list'].append(acfg_item)
+                    # json_obj['function_names'].append(acfg.funcname)

-                        # 这里2是因为Genius框架提取特征时将后代数量放在2
-                        offspring = [d.get('v')[2] for d in acfg.g.node.values()]
-                        # 这边可能会出现不知名的原因两个数组长度不一致，按理来说应该是一致的
-                        # 以框架为主，将bb_features数组削减为和g.node长度一致
-                        diff = acfg.g.__len__() - len(acfg.bb_features)
-                        if diff != 0:
-                            del acfg.bb_features[diff:]
-                        # 将后代数量的特征放入bb_features中
+                # Write the result to a local JSON file
+                result = json.dumps(json_obj, ensure_ascii=False)

-                        for i, offs in enumerate(offspring):
-                            acfg.bb_features[i].append(offs)
+                with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
+                    out.write(result)

-                        acfg_item = {
-                            'block_number': acfg.g.__len__(),
-                            'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
-                            'block_features': acfg.bb_features
-                        }
-
-                        json_obj['acfg_list'].append(acfg_item)
-                        # json_obj['function_names'].append(acfg.funcname)
-
-                    # 将结果写入json本地文件
-                    result = json.dumps(json_obj, ensure_ascii=False)
-
-                    with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
-                        out.write(result)
-
-                    log.truncate(0)
-                    log.seek(0)
-                    log.write(str(index))
-                    log.flush()
-                    process_log.write("index {}, {} process done.\n".format(index, cfg))
+                log.truncate(0)
+                log.seek(0)
+                log.write(str(index))
+                log.flush()
+                process_log.write("index {}, {} process done.\n".format(index, cfg))


 def convert_benign(overhaul):
-    cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
-    dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot"
-    output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl"
+    cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg"
+    dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot"
+    output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl"
+    raw_dir = "D:\\bishe\\dataset\\train_benign"

-    log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
-    process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"
+    log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log"
+    process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log"

    if overhaul:
        if os.path.exists(log_path):
@ -145,6 +154,7 @@ def convert_benign(overhaul):
                continue

            name = cfg[:-4]  # 纯文件名
+
            cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
            try:
                data = pk.load(cfg_file)
@ -180,7 +190,7 @@ def convert_benign(overhaul):

                # 为当前pe文件创建json对象
                json_obj = {
-                    'hash': data.binary_name[11:],
+                    'hash': calc_sha256(raw_dir + "\\" + name),
                    # 2023.8.12 bug fix: 这里获取的是内部函数的数量
                    # 'function_number': data.raw_graph_list.__len__(),
                    'function_number': len(functions_list),
@ -233,4 +243,6 @@ def convert_benign(overhaul):

 if __name__ == '__main__':
    # convert(35, 69)
-    convert_benign(False)
+    # convert_benign(True)
+    # Passing True requests an overhaul: convert_malware then deletes its
+    # progress and process logs before converting, so it restarts from index 0.
+    # NOTE(review): convert_benign presumably does the same - confirm.
+    convert_benign(True)
+    convert_malware(True)
--- a/Genius3/raw-feature-extractor/preprocessing_ida.py
+++ b/Genius3/raw-feature-extractor/preprocessing_ida.py
@ -1,54 +1,73 @@
-# -*- coding: UTF-8 -*-
-import pickle
-from func import *
-from idc import *
+# coding=utf-8
 import os
+import pickle
+import idc
+import idaapi

+# Constants: dataset locations and the extensions of the generated files.
+DATA_DIR = "D:\\bishe\\dataset"
+INFECTED_DIR = os.path.join(DATA_DIR, "infected")
+BENIGN_DIR = os.path.join(DATA_DIR, "benign")
+CFG_EXTENSION = ".ida"
+# Named "GDL", but the value is ".dot": generate_gdl() passes CHART_GEN_DOT.
+GDL_EXTENSION = ".dot"
+ASM_EXTENSION = ".asm"

-def preprocess():
-    # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
-    # print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
-    # print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
-    # print idc.ARGV[2]
-    # print type(idc.ARGV[2])
+def preprocess(binary_name, workflow):
+    """Analyse the loaded binary in IDA and export its CFG, call graph and ASM.
+
+    binary_name: name of the input file; used as the stem of every output.
+    workflow: command-line flag. The string "-1" selects the benign dataset,
+        any other value selects the infected (malware) dataset.
+
+    Nothing is regenerated when the CFG output already exists. idc.Exit(0) is
+    called on every path, so IDA is asked to close once this function is done.
+    """
+    # Each dataset keeps its outputs in <prefix>_cfg / <prefix>_dot /
+    # <prefix>_asm sub-directories; convert_pkl_to_json.py reads the .ida and
+    # .dot files from exactly these locations.
+    if workflow == "-1":
+        out_root, prefix = BENIGN_DIR, "refind"
+    else:
+        out_root, prefix = INFECTED_DIR, "infected"
+    # Plain concatenation instead of f-strings, which are a syntax error on
+    # Python 2. NOTE(review): the IDA 6.x style API used below implies a
+    # Python 2 interpreter - confirm.
+    cfg_path = os.path.join(out_root, prefix + "_cfg", binary_name + CFG_EXTENSION)
+    gdl_path = os.path.join(out_root, prefix + "_dot", binary_name + GDL_EXTENSION)
+    asm_path = os.path.join(out_root, prefix + "_asm", binary_name + ASM_EXTENSION)

-    binary_name = idc.GetInputFile()
-
-    workflow = idc.ARGV[1]
-    # workflow为特定值时分析良性软件，否则分析恶意软件
-    if workflow == '-1':
-        cfg_path = "D:\\bishe\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name)
-        gdl_path = "D:\\bishe\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name)
-        asm_path = "D:\\bishe\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name)
+    if os.path.exists(cfg_path):
+        # This sample was exported by an earlier run; just close IDA.
+        idc.Exit(0)
    else:
-        cfg_path = "D:\\bishe\\dataset\\infected\\infected_cfg\\{}.ida".format(binary_name)
-        gdl_path = "D:\\bishe\\dataset\\infected\\infected_dot\\{}.dot".format(binary_name)
-        asm_path = "D:\\bishe\\dataset\\infected\\infected_asm\\{}.asm".format(binary_name)
+        # Clear AF_IMMOFF in the start-up analysis flags, then wait for the
+        # auto-analysis to finish before exporting anything.
+        analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
+        analysis_flags &= ~idc.AF_IMMOFF
+        idc.SetShortPrm(idc.INF_START_AF, analysis_flags)

-    analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
-    analysis_flags &= ~idc.AF_IMMOFF
-    idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
-    idaapi.autoWait()
+        idaapi.autoWait()

-    # 生成pe文件的cfg列表
+        # Export the CFG
+        generate_cfg(binary_name, cfg_path)
+
+        # Export the call graph
+        generate_gdl(gdl_path)
+
+        # Export the ASM listing
+        generate_asm(asm_path)
+
+        # Close IDA Pro
+        idc.Exit(0)
+
+def generate_cfg(binary_name, cfg_path):
+    """Extract the per-function CFGs of the loaded binary and pickle them.
+
+    binary_name: kept for the caller's signature; not used here.
+    cfg_path: destination of the pickle, written in binary mode.
+
+    NOTE(review): convert_pkl_to_json.py opens the .ida files in text mode
+    ('r'); confirm it still reads pickles written in binary mode correctly.
+    """
+    # This commit drops the module-level "from func import *" and
+    # "from idc import *", so both names must be resolved explicitly.
+    # NOTE(review): get_func_cfgs_c is assumed to come from func - confirm.
+    from func import get_func_cfgs_c
+
-    cfgs = get_func_cfgs_c(FirstSeg())
-    # 将cfg保存为.ida
-    pickle.dump(cfgs, open(cfg_path, 'w'))
+    cfgs = get_func_cfgs_c(idc.FirstSeg())
+    with open(cfg_path, 'wb') as cfg_file:
+        pickle.dump(cfgs, cfg_file)

-    # 生成pe文件的fcg，保存为.dot文件
-    # idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL) 这个生成gdl文件，网上几乎找不到gdl这个格式
+def generate_gdl(gdl_path):
+    """Write the call graph of the loaded database to ``gdl_path`` (CHART_GEN_DOT)."""
    idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)

-    # 生成.asm文件
+def generate_asm(asm_path):
+    """Ask IDA to generate an OFILE_ASM output file at ``asm_path``."""
    idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)

-    # 关闭IDA Pro
-    idc.Exit(0)
+# Entry point
+def main():
+    """Read the input file name and the workflow argument, then run preprocess().
+
+    The workflow flag is taken from idc.ARGV[1]. When it is missing, a message
+    is printed and IDA is asked to exit with status 1, so that an autonomous
+    batch run does not stay open waiting on this sample.
+    """
+    binary_name = idc.GetInputFile()
+    try:
+        workflow = idc.ARGV[1]
+    except IndexError:
+        print("Workflow argument not provided.")
+        # preprocess() closes IDA on every path; do the same on failure.
+        idc.Exit(1)
+        return
+    preprocess(binary_name, workflow)

-
-# 通用命令行格式  idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073
-# 此处使用 idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path，完整命令行如下
-# F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82
-# D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
-if __name__ == '__main__':
-    preprocess()
+# When run as an IDA Pro script, call the main function
+if __name__ == "__main__":
+    main()