From 0f1e3378a2db6dd4c05f5f5f3eabcac9b7c75299 Mon Sep 17 00:00:00 2001
From: huihun <781165206@qq.com>
Date: Fri, 1 Mar 2024 14:45:10 +0800
Subject: [PATCH 1/2] Batch processing operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../convert_pkl_to_json.py | 216 +++++++++---------
.../preprocessing_ida.py | 99 ++++----
2 files changed, 173 insertions(+), 142 deletions(-)
diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
index 837483b..7807f52 100644
--- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py
+++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
@@ -1,4 +1,5 @@
# coding=utf-8
+import hashlib
import pickle as pk
import re
import json
@@ -6,125 +7,133 @@ import os
from tqdm import tqdm
-def convert(start, end, overhaul):
- for workflow in range(start, end):
- # workflow = 0
- cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
- output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
- dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
+def calc_sha256(file_path):
+    # hash the entire file; the digest identifies the sample in the output JSON
+    with open(file_path, 'rb') as f:
+        content = f.read()  # avoid shadowing the built-in name 'bytes'
+    return hashlib.sha256(content).hexdigest()
- log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
- process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
- if overhaul:
- if os.path.exists(log_path):
- os.remove(log_path)
- if os.path.exists(process_log_path):
- os.remove(process_log_path)
+def convert_malware(overhaul):
+ cfg_dir = "D:\\bishe\\dataset\\infected\\infected_cfg"
+ output_dir = "D:\\bishe\\dataset\\infected\\infected_jsonl"
+ dot_dir = "D:\\bishe\\dataset\\infected\\infected_dot"
+ raw_dir = "D:\\bishe\\dataset\\train_malware"
- with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
- logged = log.readline()
- if logged == '':
- log_index = 0
+ log_path = "D:\\bishe\\dataset\\logging\\convert_malware_log.log"
+ process_log_path = "D:\\bishe\\dataset\\logging\\convert_malware_process_log.log"
+
+ if overhaul:
+ if os.path.exists(log_path):
+ os.remove(log_path)
+ if os.path.exists(process_log_path):
+ os.remove(process_log_path)
+
+    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
+        log.seek(0)  # 'a+' may position at end-of-file; rewind before reading the resume index
+        logged = log.readline()
+        if logged == '':
+ log_index = 0
+ else:
+ log_index = int(logged)
+
+ for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
+ if index < log_index:
+ continue
+
+            name = cfg[:-4]  # bare file name, without extension
+            cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'rb')  # the pickle is dumped in binary mode
+ try:
+ data = pk.load(cfg_file)
+ except EOFError:
+ process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
+ continue
+ except ValueError:
+ process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
+ continue
+ finally:
+ cfg_file.close()
+
+ dot_file_path = os.path.join(dot_dir, name + '.dot')
+ if not os.path.exists(dot_file_path):
+ process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
else:
- log_index = int(logged)
+                # parse the .dot file to obtain the FCG
+                raw_function_edges = []
+                # 2023.8.12 bug fix: the FCG (.dot) generated by IDA contains all functions, while data.raw_graph_list only contains internal functions
+ functions_list = []
+ with open(dot_file_path, 'r') as dot:
+ for line in dot:
+ if '->' in line:
+ raw_function_edges.append(re.findall(r'\b\d+\b', line))
+ elif 'label' in line:
+ functions_list.append(line[line.find('= "') + 3:line.find('",')])
- for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
- if index < log_index:
+            # no internal function detected; this should not normally happen, so discard this sample to be safe
+            if len(raw_function_edges) == 0:
continue
-            name = cfg[:-4]  # bare file name, without extension
- cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
- try:
- data = pk.load(cfg_file)
- except EOFError:
- process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
- continue
- except ValueError:
- process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
- continue
- finally:
- cfg_file.close()
+                # build the JSON object for the current PE file
+ json_obj = {
+ 'hash': calc_sha256(raw_dir + "\\" + name),
+                    # 2023.8.12 bug fix: this would only count the internal functions
+ # 'function_number': data.raw_graph_list.__len__(),
+ 'function_number': len(functions_list),
+ 'function_edges': [[int(d[0]) for d in raw_function_edges],
+ [int(d[1]) for d in raw_function_edges]],
+ 'acfg_list': [],
+ 'function_names': functions_list
+ }
- dot_file_path = os.path.join(dot_dir, name + '.dot')
- if not os.path.exists(dot_file_path):
- process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
- else:
-                # parse the .dot file to obtain the FCG
- raw_function_edges = []
-                # 2023.8.12 bug fix: the FCG (.dot) generated by IDA contains all functions, while data.raw_graph_list only contains internal functions
- functions_list = []
- with open(dot_file_path, 'r') as dot:
- for line in dot:
- if '->' in line:
- raw_function_edges.append(re.findall(r'\b\d+\b', line))
- elif 'label' in line:
- functions_list.append(line[line.find('= "') + 3:line.find('",')])
-
-            # no internal function detected; this should not normally happen, so discard this sample to be safe
- if raw_function_edges.__len__() == 0:
+                # 2023.8.12 bug fix: data.raw_graph_list holds only the internal functions detected by IDA, not the external ones, so the function list and count must not be taken from it
+                # read the pkl data; each ACFG is derived from one function
+                for acfg in data.raw_graph_list:
+                    # external function: no CFG needs to be built for it
+                    if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
continue
-                # build the JSON object for the current PE file
- json_obj = {
- 'hash': data.binary_name[11:],
-                    # 2023.8.12 bug fix: this would only count the internal functions
- # 'function_number': data.raw_graph_list.__len__(),
- 'function_number': len(functions_list),
- 'function_edges': [[int(d[0]) for d in raw_function_edges],
- [int(d[1]) for d in raw_function_edges]],
- 'acfg_list': [],
- 'function_names': functions_list
+                    # index 2 because the Genius framework stores the offspring count at position 2 of each node's feature vector
+                    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
+                    # for unknown reasons the two arrays can differ in length, although they should match;
+                    # trust the framework and trim bb_features to the same length as g.node
+                    diff = len(acfg.g) - len(acfg.bb_features)
+                    if diff != 0:
+                        del acfg.bb_features[diff:]
+                    # append the offspring count to each basic block's feature vector
+                    for i, offs in enumerate(offspring):
+                        acfg.bb_features[i].append(offs)
+
+ acfg_item = {
+                        'block_number': len(acfg.g),
+ 'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
+ 'block_features': acfg.bb_features
}
-                # 2023.8.12 bug fix: data.raw_graph_list holds only the internal functions detected by IDA, not the external ones, so the function list and count must not be taken from it
-                # read the pkl data; each ACFG is derived from one function
-                for acfg in data.raw_graph_list:
-                    # external function: no CFG needs to be built for it
- if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
- continue
+ json_obj['acfg_list'].append(acfg_item)
+ # json_obj['function_names'].append(acfg.funcname)
-                    # index 2 because the Genius framework stores the offspring count at position 2 of each node's feature vector
-                    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
-                    # for unknown reasons the two arrays can differ in length, although they should match;
-                    # trust the framework and trim bb_features to the same length as g.node
- diff = acfg.g.__len__() - len(acfg.bb_features)
- if diff != 0:
- del acfg.bb_features[diff:]
-                    # append the offspring count to each basic block's feature vector
+                # write the result to a local JSON file
+ result = json.dumps(json_obj, ensure_ascii=False)
- for i, offs in enumerate(offspring):
- acfg.bb_features[i].append(offs)
+ with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
+ out.write(result)
- acfg_item = {
- 'block_number': acfg.g.__len__(),
- 'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
- 'block_features': acfg.bb_features
- }
-
- json_obj['acfg_list'].append(acfg_item)
- # json_obj['function_names'].append(acfg.funcname)
-
-                # write the result to a local JSON file
- result = json.dumps(json_obj, ensure_ascii=False)
-
- with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
- out.write(result)
-
- log.truncate(0)
- log.seek(0)
- log.write(str(index))
- log.flush()
- process_log.write("index {}, {} process done.\n".format(index, cfg))
+ log.truncate(0)
+ log.seek(0)
+ log.write(str(index))
+ log.flush()
+ process_log.write("index {}, {} process done.\n".format(index, cfg))
def convert_benign(overhaul):
- cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
- dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot"
- output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl"
+ cfg_dir = "D:\\bishe\\dataset\\benign\\refind_cfg"
+ dot_dir = "D:\\bishe\\dataset\\benign\\refind_dot"
+ output_dir = "D:\\bishe\\dataset\\benign\\refind_jsonl"
+ raw_dir = "D:\\bishe\\dataset\\train_benign"
- log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
- process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log{}.log"
+ log_path = "D:\\bishe\\dataset\\logging\\convert_benign_log.log"
+ process_log_path = "D:\\bishe\\dataset\\logging\\convert_benign_process_log.log"
if overhaul:
if os.path.exists(log_path):
@@ -145,6 +154,7 @@ def convert_benign(overhaul):
continue
            name = cfg[:-4]  # bare file name
+
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
try:
data = pk.load(cfg_file)
@@ -180,7 +190,7 @@ def convert_benign(overhaul):
                # build the JSON object for the current PE file
json_obj = {
- 'hash': data.binary_name[11:],
+ 'hash': calc_sha256(raw_dir + "\\" + name),
                    # 2023.8.12 bug fix: this would only count the internal functions
# 'function_number': data.raw_graph_list.__len__(),
'function_number': len(functions_list),
@@ -233,4 +243,6 @@ def convert_benign(overhaul):
if __name__ == '__main__':
# convert(35, 69)
- convert_benign(False)
+ # convert_benign(True)
+ convert_benign(True)
+ convert_malware(True)
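
Not part of the patch: a minimal sketch, assuming the json_obj layout built
above, of how a downstream consumer might load one emitted .jsonl file (the
sample path is hypothetical):

    import json

    with open("D:\\bishe\\dataset\\infected\\infected_jsonl\\sample.jsonl", "r") as f:
        obj = json.load(f)

    # function_edges holds two parallel lists: [sources], [targets]
    edges = list(zip(obj["function_edges"][0], obj["function_edges"][1]))
    print(obj["hash"], obj["function_number"], len(edges), len(obj["acfg_list"]))
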
diff --git a/Genius3/raw-feature-extractor/preprocessing_ida.py b/Genius3/raw-feature-extractor/preprocessing_ida.py
index 14afd45..4744c07 100644
--- a/Genius3/raw-feature-extractor/preprocessing_ida.py
+++ b/Genius3/raw-feature-extractor/preprocessing_ida.py
@@ -1,54 +1,73 @@
-# -*- coding: UTF-8 -*-
-import pickle
-from func import *
-from idc import *
+# coding=utf-8
import os
+import pickle
+import idc
+import idaapi
+# constants
+DATA_DIR = "D:\\bishe\\dataset"
+INFECTED_DIR = os.path.join(DATA_DIR, "infected")
+BENIGN_DIR = os.path.join(DATA_DIR, "benign")
+CFG_EXTENSION = ".ida"
+GDL_EXTENSION = ".dot"
+ASM_EXTENSION = ".asm"
-def preprocess():
- # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
- # print str(sys.argv) #['raw-feature-extractor/preprocessing_ida.py']
- # print str(idc.ARGV) #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
- # print idc.ARGV[2]
- # print type(idc.ARGV[2])
+def preprocess(binary_name, workflow):
+    # workflow == "-1" selects the benign dataset, anything else the malware dataset.
+    # IDA 6.8 bundles Python 2.7, which has no f-strings, so stick to concatenation.
+    if workflow != "-1":
+        base_dir, prefix = INFECTED_DIR, "infected_"
+    else:
+        base_dir, prefix = BENIGN_DIR, "refind_"
+    # keep the per-type subdirectories (infected_cfg/refind_cfg, ...) that
+    # convert_pkl_to_json.py expects to read from
+    cfg_path = os.path.join(base_dir, prefix + "cfg", binary_name + CFG_EXTENSION)
+    gdl_path = os.path.join(base_dir, prefix + "dot", binary_name + GDL_EXTENSION)
+    asm_path = os.path.join(base_dir, prefix + "asm", binary_name + ASM_EXTENSION)
- binary_name = idc.GetInputFile()
-
- workflow = idc.ARGV[1]
-    # a specific workflow value selects benign samples; otherwise malware is analyzed
- if workflow == '-1':
- cfg_path = "D:\\bishe\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name)
- gdl_path = "D:\\bishe\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name)
- asm_path = "D:\\bishe\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name)
+ if os.path.exists(cfg_path):
+ idc.Exit(0)
else:
- cfg_path = "D:\\bishe\\dataset\\infected\\infected_cfg\\{}.ida".format(binary_name)
- gdl_path = "D:\\bishe\\dataset\\infected\\infected_dot\\{}.dot".format(binary_name)
- asm_path = "D:\\bishe\\dataset\\infected\\infected_asm\\{}.asm".format(binary_name)
+ analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
+ analysis_flags &= ~idc.AF_IMMOFF
+ idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
- analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
- analysis_flags &= ~idc.AF_IMMOFF
- idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
- idaapi.autoWait()
+ idaapi.autoWait()
-    # generate the CFG list of the PE file
+    # generate the CFG
+ generate_cfg(binary_name, cfg_path)
+
+    # generate the GDL (call graph as .dot)
+    generate_gdl(gdl_path)
+
+    # generate the ASM listing
+    generate_asm(asm_path)
+
+    # close IDA Pro
+    idc.Exit(0)
+
+def generate_cfg(binary_name, cfg_path):
cfgs = get_func_cfgs_c(FirstSeg())
-    # save the CFGs as .ida
- pickle.dump(cfgs, open(cfg_path, 'w'))
+ with open(cfg_path, 'wb') as cfg_file:
+ pickle.dump(cfgs, cfg_file)
-    # generate the FCG of the PE file and save it as a .dot file
-    # idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL) emits a .gdl file, a format that is barely documented anywhere
+def generate_gdl(gdl_path):
idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
-    # generate the .asm file
+def generate_asm(asm_path):
idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
-    # close IDA Pro
-    idc.Exit(0)
+# entry point
+def main():
+ binary_name = idc.GetInputFile()
+ try:
+ workflow = idc.ARGV[1]
+ except IndexError:
+ print("Workflow argument not provided.")
+ return
+ preprocess(binary_name, workflow)
-
-# general command-line format: idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073
-# used here as: idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path; full command lines below:
-# F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82
-# D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
-if __name__ == '__main__':
- preprocess()
+# when run as an IDA Pro script, invoke main
+if __name__ == "__main__":
+ main()
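
For reference, preprocessing_ida.py is driven headlessly from IDA's command
line; "0" is the workflow argument that main() reads from idc.ARGV[1], and the
invocation below is the same one the batch script added in the next patch uses:

    D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
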
From 548eedb29236e6377d9c45b26cb02b10bc6340f9 Mon Sep 17 00:00:00 2001
From: huihun <781165206@qq.com>
Date: Fri, 1 Mar 2024 16:11:26 +0800
Subject: [PATCH 2/2] Batch processing operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.idea/Gencoding3.iml | 2 +-
.idea/misc.xml | 2 +-
.../bat_file/benign/ida_file_cerate.bat | 2 +-
.../malware/ida_file_cerate_malware.bat | 16 +
Genius3/main.py | 16 -
.../HierarchicalGraphModel_mine.py | 81 ----
.../convert_pkl_to_json.py | 2 -
Genius3/raw-feature-extractor/discovRe.py | 264 -------------
Genius3/raw-feature-extractor/func.py | 4 +-
.../graph_analysis_ida.py | 35 +-
.../preprocessing_ida.py | 16 +-
Genius3/raw-feature-extractor/read_idaFILE.py | 101 -----
Genius3/search-engine/db.py | 356 ------------------
ida_file_cerate_malware.bat | 16 -
14 files changed, 48 insertions(+), 865 deletions(-)
rename ida_file_cerate.bat => Genius3/bat_file/benign/ida_file_cerate.bat (82%)
create mode 100644 Genius3/bat_file/malware/ida_file_cerate_malware.bat
delete mode 100644 Genius3/main.py
delete mode 100644 Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
delete mode 100644 Genius3/raw-feature-extractor/discovRe.py
delete mode 100644 Genius3/raw-feature-extractor/read_idaFILE.py
delete mode 100644 Genius3/search-engine/db.py
delete mode 100644 ida_file_cerate_malware.bat
diff --git a/.idea/Gencoding3.iml b/.idea/Gencoding3.iml
index 7805102..f7a47fa 100644
--- a/.idea/Gencoding3.iml
+++ b/.idea/Gencoding3.iml
@@ -4,7 +4,7 @@
-
+
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 7ba73c2..b20e505 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,4 @@
-
+
\ No newline at end of file
diff --git a/ida_file_cerate.bat b/Genius3/bat_file/benign/ida_file_cerate.bat
similarity index 82%
rename from ida_file_cerate.bat
rename to Genius3/bat_file/benign/ida_file_cerate.bat
index f86dbde..618ba13 100644
--- a/ida_file_cerate.bat
+++ b/Genius3/bat_file/benign/ida_file_cerate.bat
@@ -2,7 +2,7 @@
setlocal EnableDelayedExpansion
-set "FOLDER_PATH=D:\bishe\dataset\train_benign"
+set "FOLDER_PATH=D:\bishe\dataset\train_benign_part0"
diff --git a/Genius3/bat_file/malware/ida_file_cerate_malware.bat b/Genius3/bat_file/malware/ida_file_cerate_malware.bat
new file mode 100644
index 0000000..c061ccf
--- /dev/null
+++ b/Genius3/bat_file/malware/ida_file_cerate_malware.bat
@@ -0,0 +1,16 @@
+@echo off
+setlocal EnableDelayedExpansion
+
+
+set "FOLDER_PATH=D:\bishe\dataset\sample_20230130_458"
+
+
+
+for %%f in ("%FOLDER_PATH%\*") do (
+ echo !time! %%f
+ D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
+
+)
+
+endlocal
+
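
Not part of the patch: a small post-run check, assuming every sample in
FOLDER_PATH should yield a matching .ida file in the cfg directory (paths taken
from the scripts in this series):

    import os

    src = "D:\\bishe\\dataset\\sample_20230130_458"
    cfg = "D:\\bishe\\dataset\\infected\\infected_cfg"
    missing = [f for f in os.listdir(src)
               if not os.path.exists(os.path.join(cfg, f + ".ida"))]
    print("{} of {} samples missing CFGs".format(len(missing), len(os.listdir(src))))
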
diff --git a/Genius3/main.py b/Genius3/main.py
deleted file mode 100644
index 266873d..0000000
--- a/Genius3/main.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# -*- coding: UTF-8 -*-
-import sys
-
-from func import *
-from raw_graphs import *
-from idc import *
-import os
-import argparse
-if __name__ == '__main__':
- print "hello"
-
- #
- # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
-    # -c delete the old database  -A auto-analysis without dialogs
-    # -B is equivalent to -c -A
-
diff --git a/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py b/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
deleted file mode 100644
index b28ad47..0000000
--- a/Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py
+++ /dev/null
@@ -1,81 +0,0 @@
-class HierarchicalGraphNeuralNetwork(nn.Module):
- def __init__(self, external_vocab: Vocab):
- super(HierarchicalGraphNeuralNetwork, self).__init__()
- self.pool = 'global_max_pool'
- # Hierarchical 1: Control Flow Graph (CFG) embedding and pooling
- cfg_filter_list =[200, 200]
- cfg_filter_list.insert(0, 11)
- self.cfg_filter_length = len(cfg_filter_list)
- cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True) for
- i in range(self.cfg_filter_length - 1)]
- cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params)
- cfg_constructor = cfg_conv['constructor']
- for i in range(self.cfg_filter_length - 1):
- setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i]))
- self.dropout = nn.Dropout(p=0.2)
- # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
- self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
- embedding_dim=cfg_filter_list[-1],
- padding_idx=external_vocab.pad_idx)
- fcg_filter_list = [200, 200]
- fcg_filter_list.insert(0, cfg_filter_list[-1])
- self.fcg_filter_length = len(fcg_filter_list)
- fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True) for
- i in range(self.fcg_filter_length - 1)]
- fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params)
- fcg_constructor = fcg_conv['constructor']
- for i in range(self.fcg_filter_length - 1):
- setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i]))
- # Last Projection Function: gradually project with more linear layers
- self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2))
- self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4))
- self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6)
- self.last_activation = nn.Softmax(dim=1)
-
- def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
- bt_all_function_edges: list):
- rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
- x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
- fcg_list = []
- fcg_internal_list = []
- for idx_batch in range(len(real_bt_positions) - 1):
- start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
- idx_x_cfg = x_cfg_pool[start_pos: end_pos]
- fcg_internal_list.append(idx_x_cfg)
- idx_x_external = self.external_embedding_layer(
- torch.tensor([bt_external_names[idx_batch]], dtype=torch.long))
- idx_x_external = idx_x_external.squeeze(dim=0)
- idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
- idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long)
- idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
- idx_graph_data.validate()
- fcg_list.append(idx_graph_data)
- fcg_batch = Batch.from_data_list(fcg_list)
- # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
- rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch) # [batch_size, max_node_size, dim]
- x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)
- batch_final = x_fcg_pool
- # step last project to the number_of_classes (multiclass)
- bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final)))
- bt_pred = self.last_activation(bt_final_embed)
- return bt_pred
-
- def forward_cfg_gnn(self, local_batch: Batch):
- in_x, edge_index = local_batch.x, local_batch.edge_index
- for i in range(self.cfg_filter_length - 1):
- out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
- out_x = torch.nn.functional.relu(out_x, inplace=True)
- out_x = self.dropout(out_x)
- in_x = out_x
- local_batch.x = in_x
- return local_batch
-
- def forward_fcg_gnn(self, function_batch: Batch):
- in_x, edge_index = function_batch.x, function_batch.edge_index
- for i in range(self.fcg_filter_length - 1):
- out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
- out_x = torch.nn.functional.relu(out_x, inplace=True)
- out_x = self.dropout(out_x)
- in_x = out_x
- function_batch.x = in_x
- return function_batch
\ No newline at end of file
diff --git a/Genius3/raw-feature-extractor/convert_pkl_to_json.py b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
index 7807f52..5b291ef 100644
--- a/Genius3/raw-feature-extractor/convert_pkl_to_json.py
+++ b/Genius3/raw-feature-extractor/convert_pkl_to_json.py
@@ -242,7 +242,5 @@ def convert_benign(overhaul):
if __name__ == '__main__':
- # convert(35, 69)
- # convert_benign(True)
convert_benign(True)
convert_malware(True)
diff --git a/Genius3/raw-feature-extractor/discovRe.py b/Genius3/raw-feature-extractor/discovRe.py
deleted file mode 100644
index 451999e..0000000
--- a/Genius3/raw-feature-extractor/discovRe.py
+++ /dev/null
@@ -1,264 +0,0 @@
-# coding=utf-8
-#
-# Reference Lister
-#
-# List all functions and all references to them in the current section.
-#
-# Implemented with the idautils module
-#
-import networkx as nx
-import pdb
-from graph_analysis_ida import *
-from graph_property import *
-
-
-# import wingdbstub
-# wingdbstub.Ensure()
-
-def get_funcs(ea):
- funcs = {}
- # Get current ea
- # Loop from start to end in the current segment
- for funcea in Functions(SegStart(ea)):
- funcname = GetFunctionName(funcea)
- func = get_func(funcea)
- blocks = FlowChart(func)
- funcs[funcname] = []
- for bl in blocks:
- start = bl.startEA
- end = bl.endEA
- funcs[funcname].append((start, end))
- return funcs
-
-
-# apparently unused function
-# def get_funcs_for_discoverRe(ea):
-# features = {}
-# for funcea in Functions(SegStart(ea)):
-# funcname = GetFunctionName(funcea)
-# print(funcname)
-# func = get_func(funcea)
-# feature = get_discoverRe_feature(func)
-# features[funcname] = feature
-# return features
-
-
-# get the 11-dimensional attribute features of every basic block:
-# calls / transfers / arithmetic / logic / compares / moves / interrupts / data declarations / total instructions / string or integer constants / number of offspring
-def get_bb_features(func):
- bb_features = []
- blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
- for bl in blocks:
- calls = calCalls(bl)
- transferIns = calTransferIns(bl)
- mathematicsIns = calArithmeticIns(bl)
- logicIns = calLogicInstructions(bl)
- cmpIns = calIns(bl, {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1})
- movIns = calIns(bl, {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1})
- interruptIns = calIns(bl, {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1})
- declareIns = calIns(bl, {'dw': 1, 'dd': 1, 'db': 1})
- totalIns = calInsts(bl)
- consts = getBBconsts(bl)
- stringOrIntConsts = len(consts[0]) + len(consts[1])
- bb_features.append([calls, transferIns, mathematicsIns, logicIns, cmpIns, movIns,
- interruptIns, declareIns, totalIns, stringOrIntConsts])
- return bb_features
-
-
-def get_discoverRe_feature(func, icfg):
- start = func.startEA
- end = func.endEA
- features = []
- FunctionCalls = getFuncCalls(func)
- # 1
- features.append(FunctionCalls)
- LogicInstr = getLogicInsts(func)
- # 2
- features.append(LogicInstr)
- Transfer = getTransferInsts(func)
- # 3
- features.append(Transfer)
- Locals = getLocalVariables(func)
- # 4
- features.append(Locals)
- BB = getBasicBlocks(func)
- # 5
- features.append(BB)
- Edges = len(icfg.edges())
- # 6
- features.append(Edges)
- Incoming = getIncommingCalls(func)
- # 7
- features.append(Incoming)
- # 8
- Instrs = getIntrs(func)
- features.append(Instrs)
- between = retrieveGP(icfg)
- # 9
- features.append(between)
-
- strings, consts = getfunc_consts(func)
- # 10
- features.append(strings)
- # 11
- features.append(consts)
- return features
-
-
-def get_func_names(ea):
- funcs = {}
- for funcea in Functions(SegStart(ea)):
- funcname = GetFunctionName(funcea)
- funcs[funcname] = funcea
- return funcs
-
-
-def get_func_bases(ea):
- funcs = {}
- for funcea in Functions(SegStart(ea)):
- funcname = GetFunctionName(funcea)
- funcs[funcea] = funcname
- return funcs
-
-
-def get_func_range(ea):
- funcs = {}
- for funcea in Functions(SegStart(ea)):
- funcname = GetFunctionName(funcea)
- func = get_func(funcea)
- funcs[funcname] = (func.startEA, func.endEA)
- return funcs
-
-
-def get_func_sequences(ea):
- funcs_bodylist = {}
- funcs = get_funcs(ea)
- for funcname in funcs:
- if funcname not in funcs_bodylist:
- funcs_bodylist[funcname] = []
- for start, end in funcs[funcname]:
- inst_addr = start
- while inst_addr <= end:
- opcode = GetMnem(inst_addr)
- funcs_bodylist[funcname].append(opcode)
- inst_addr = NextHead(inst_addr)
- return funcs_bodylist
-
-
-def get_func_cfgs(ea):
- func_cfglist = {}
- i = 0
- start, end = get_section('LOAD')
- # print start, end
- for funcea in Functions(SegStart(ea)):
- if start <= funcea <= end:
- funcname = GetFunctionName(funcea)
- func = get_func(funcea)
- print(i)
- i += 1
- try:
- icfg = cfg.cfg_construct(func)
- func_cfglist[funcname] = icfg
- except:
- pass
-
- return func_cfglist
-
-
-def get_section(t):
- base = SegByName(t)
- start = SegByBase(base)
- end = SegEnd(start)
- return start, end
-
-
-def get_func_cfg_sequences(func_cfglist):
- func_cfg_seqlist = {}
- for funcname in func_cfglist:
- func_cfg_seqlist[funcname] = {}
- cfg = func_cfglist[funcname][0]
- for start, end in cfg:
- codesq = get_sequences(start, end)
- func_cfg_seqlist[funcname][(start, end)] = codesq
-
- return func_cfg_seqlist
-
-
-def get_sequences(start, end):
- seq = []
- inst_addr = start
- while inst_addr <= end:
- opcode = GetMnem(inst_addr)
- seq.append(opcode)
- inst_addr = NextHead(inst_addr)
- return seq
-
-
-def get_stack_arg(func_addr):
- print(func_addr)
- args = []
- stack = GetFrame(func_addr)
- if not stack:
- return []
- firstM = GetFirstMember(stack)
- lastM = GetLastMember(stack)
- i = firstM
- while i <= lastM:
- mName = GetMemberName(stack, i)
- mSize = GetMemberSize(stack, i)
- if mSize:
- i = i + mSize
- else:
- i = i + 4
- if mName not in args and mName and ' s' not in mName and ' r' not in mName:
- args.append(mName)
- return args
-
- # pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))
-
-
-def processDataSegs():
- funcdata = {}
- datafunc = {}
- for n in xrange(idaapi.get_segm_qty()):
- seg = idaapi.getnseg(n)
- ea = seg.startEA
- segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
- if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
- start = idc.SegStart(ea)
- end = idc.SegEnd(ea)
- cur = start
- while cur <= end:
- refs = [v for v in DataRefsTo(cur)]
- for fea in refs:
- name = GetFunctionName(fea)
- if len(name) == 0:
- continue
- if name not in funcdata:
- funcdata[name] = [cur]
- else:
- funcdata[name].append(cur)
- if cur not in datafunc:
- datafunc[cur] = [name]
- else:
- datafunc[cur].append(name)
- cur = NextHead(cur)
- return funcdata, datafunc
-
-
-def obtainDataRefs(callgraph):
- datarefs = {}
- funcdata, datafunc = processDataSegs()
- for node in callgraph:
- if node in funcdata:
- datas = funcdata[node]
- for dd in datas:
- refs = datafunc[dd]
- refs = list(set(refs))
- if node in datarefs:
- print(refs)
- datarefs[node] += refs
- datarefs[node] = list(set(datarefs[node]))
- else:
- datarefs[node] = refs
- return datarefs
diff --git a/Genius3/raw-feature-extractor/func.py b/Genius3/raw-feature-extractor/func.py
index 33020aa..61f207a 100644
--- a/Genius3/raw-feature-extractor/func.py
+++ b/Genius3/raw-feature-extractor/func.py
@@ -16,9 +16,7 @@ from raw_graphs import *
#from discovRe_feature.discovRe import *
from discovRe import *
-sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python")
-#import wingdbstub
-#wingdbstub.Ensure()
+
diff --git a/Genius3/raw-feature-extractor/graph_analysis_ida.py b/Genius3/raw-feature-extractor/graph_analysis_ida.py
index 390f8f1..66194b3 100644
--- a/Genius3/raw-feature-extractor/graph_analysis_ida.py
+++ b/Genius3/raw-feature-extractor/graph_analysis_ida.py
@@ -119,24 +119,23 @@ def getIncommingCalls(func):
def get_stackVariables(func_addr):
- #print func_addr
- args = []
- stack = GetFrame(func_addr)
- if not stack:
- return 0
- firstM = GetFirstMember(stack)
- lastM = GetLastMember(stack)
- i = firstM
- while i <=lastM:
- mName = GetMemberName(stack,i)
- mSize = GetMemberSize(stack,i)
- if mSize:
- i = i + mSize
- else:
- i = i+4
- if mName not in args and mName and 'var_' in mName:
- args.append(mName)
- return len(args)
+ args = []
+ stack = GetFrame(func_addr)
+ if not stack:
+ return 0
+ firstM = GetFirstMember(stack)
+ lastM = GetLastMember(stack)
+ i = firstM
+ while i <= lastM:
+ mName = GetMemberName(stack, i)
+ mSize = GetMemberSize(stack, i)
+ if mSize:
+ i = i + mSize
+ else:
+ i = i + 4
+ if mName not in args and mName and 'var_' in mName:
+ args.append(mName)
+ return len(args)
# count arithmetic instructions
diff --git a/Genius3/raw-feature-extractor/preprocessing_ida.py b/Genius3/raw-feature-extractor/preprocessing_ida.py
index 4744c07..4a968da 100644
--- a/Genius3/raw-feature-extractor/preprocessing_ida.py
+++ b/Genius3/raw-feature-extractor/preprocessing_ida.py
@@ -1,7 +1,7 @@
-# coding=utf-8
import os
import pickle
-import idc
+from func import *
+from idc import *
+import idc  # keep the module name available for the qualified idc.* calls below
import idaapi
# constants
@@ -12,6 +12,7 @@ CFG_EXTENSION = ".ida"
GDL_EXTENSION = ".dot"
ASM_EXTENSION = ".asm"
+
def preprocess(binary_name, workflow):
     # workflow == "-1" selects the benign dataset, anything else the malware dataset.
     if workflow != "-1":
@@ -29,9 +30,9 @@ def preprocess(binary_name, workflow):
if os.path.exists(cfg_path):
idc.Exit(0)
else:
- analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
- analysis_flags &= ~idc.AF_IMMOFF
- idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
+ analysis_flags = idc.GetShortPrm(idc.INF_AF2)
+        analysis_flags &= ~idc.AF_IMMOFF  # ida_ida does not exist in IDA 6.8's bundled Python; AF_IMMOFF comes from idc
+ idc.SetShortPrm(idc.INF_AF2, analysis_flags)
idaapi.autoWait()
@@ -47,17 +48,21 @@ def preprocess(binary_name, workflow):
    # close IDA Pro
idc.Exit(0)
+
def generate_cfg(binary_name, cfg_path):
cfgs = get_func_cfgs_c(FirstSeg())
with open(cfg_path, 'wb') as cfg_file:
pickle.dump(cfgs, cfg_file)
+
def generate_gdl(gdl_path):
idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)
+
def generate_asm(asm_path):
idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)
+
# entry point
def main():
binary_name = idc.GetInputFile()
@@ -68,6 +73,7 @@ def main():
return
preprocess(binary_name, workflow)
+
# when run as an IDA Pro script, invoke main
if __name__ == "__main__":
main()
diff --git a/Genius3/raw-feature-extractor/read_idaFILE.py b/Genius3/raw-feature-extractor/read_idaFILE.py
deleted file mode 100644
index aae5416..0000000
--- a/Genius3/raw-feature-extractor/read_idaFILE.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# -*- coding: UTF-8 -*-
-import sys
-from matplotlib import pyplot as plt
-import networkx as nx
-import pickle
-# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
-# sys.path.insert(1, 'C:/Python27/Lib/site-packages')
-
-
-def print_obj(obj):
- # "打印对象的所有属性"
- print(obj.__dict__)
-
-
-# sub_10F20 308: the decompiled code contains strings, but the extracted features have no string constants, probably because they are referenced indirectly and not recognized. Checking all functions' features, almost none contain string constants; they are likely stored elsewhere and referenced.
-# sub_166C4 393
-if __name__ == '__main__':
- testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
- fr = open(testpath, 'r')
-    data = pickle.load(fr)  # the ACFGs of one binary
- fr.close()
-
- # print(type(data1))
- # print_obj(data1)
- # print data1.raw_graph_list[393]
- # print_obj(data1.raw_graph_list[393])
- # nx.draw(data1.raw_graph_list[393].g,with_labels=True)
- # plt.show()
-
- print("一个二进制文件的所有函数的原始特征,list。")
- print_obj(data) # acfg list
- print("\n")
-
- print("一个函数的原始特征,由old_g(discovRe方法的ACFG),g(Genius方法的ACFG),fun_feature(表示函数级别的特征的向量)三部分构成")
- print_obj(data.raw_graph_list[0]) # 一个函数的acfg
- print("其中fun_features = 函数级别特征: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks# 6 Edges # 7 IncommingCalls# 8 Intrs# 9 between # 10 strings # 11 consts")
- # feature = data.raw_graph_list[0].fun_features
- print("old_g:{}".format(data.raw_graph_list[0].old_g))
- print("g:{}".format(data.raw_graph_list[0].g))
-
-
- # G = data1.raw_graph_list[393].old_g
-    # print G.node[0]  # G.node[i] is a dict
- # for key, value in G.node[0].items():
- # print('{key}:{value}'.format(key=key, value=value))
-
-    # basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions such as INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions such as AND #8 'numTIs' transfer instruction count
-    G = data.raw_graph_list[0].g
-    print("# basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions such as INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions such as AND #8 'numTIs' transfer instruction count")
- # print(G.node[0])
- # print("\n")
-    # features of all basic blocks in the function
- for key, value in G.node.items():
- print('{}:{}'.format(key, value))
-
-
-
-    # old_g is the CFG read from IDA, so node counts, edge directions etc. are identical; g is generated from old_g and matches as well
- #old g
- G = data.raw_graph_list[0].old_g
- nx.draw(G, with_labels=True)
- #plt.title('old_g')
- plt.show()
-
-
- # g
- G = data.raw_graph_list[0].g
- nx.draw(G, with_labels=True)
- #plt.title('Genius_g')
- plt.show()
-
- # draw graph with labels
- pos = nx.spring_layout(G)
- nx.draw(G, pos)
-    node_labels = nx.get_node_attributes(G, 'v')  # networkx nodes carry attributes; g uses 'v', the vector of raw features. For old_g's attributes see cfg_constructor.py
- nx.draw_networkx_labels(G, pos, labels=node_labels)
- #plt.title('Genius_g with raw feature vector')
- plt.show()
-
-
-# 1 function calls: number of function-call instructions (call, jal, jalr) in this function; note that ARM has none of these
-
-# 2 logic instructions: number of logic instructions in this function, e.g. and, or
-
-# 3 TransferIns: number of transfer instructions (e.g. jmp; mov on ARM)
-
-# 4 LocalVariables: number of local variables
-
-# 5 BB: number of basic blocks
-
-# 6 Edges: number of ICFG edges; the ICFG is a feature from the discovRe paper and is ignored here for now
-
-# 7 IncommingCalls: number of instructions that call this function
-
-# 8 Intrs: instruction count
-
-# 9 between: betweenness from the structural features
-
-# 10 strings: string constants
-
-# 11 consts: numeric constants
\ No newline at end of file
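
The deleted reader above remains the reference for the pickled .ida layout; a
minimal replacement sketch (assuming the repo's raw_graphs classes are
importable, since pickle needs them to unpickle, and a hypothetical path):

    import pickle

    with open("D:\\bishe\\dataset\\infected\\infected_cfg\\sample.ida", "rb") as fr:
        data = pickle.load(fr)

    for acfg in data.raw_graph_list:
        print(acfg.funcname, len(acfg.g))  # one ACFG per internal function
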
diff --git a/Genius3/search-engine/db.py b/Genius3/search-engine/db.py
deleted file mode 100644
index bc6c864..0000000
--- a/Genius3/search-engine/db.py
+++ /dev/null
@@ -1,356 +0,0 @@
-import cPickle as pickle
-from search import *
-from nearpy import Engine
-from nearpy.hashes import RandomDiscretizedProjections
-from nearpy.filters import NearestFilter, UniqueFilter
-from nearpy.distances import EuclideanDistance
-from nearpy.distances import CosineDistance
-from nearpy.hashes import RandomBinaryProjections
-from nearpy.experiments import DistanceRatioExperiment
-from redis import Redis
-from nearpy.storage import RedisStorage
-from feature import *
-import numpy as np
-import os
-import pdb
-import argparse
-import time
-import numpy as np
-from refactoring import *
-import pymongo
-from pymongo import MongoClient
-
-def initDB():
- client = MongoClient()
- client = MongoClient('localhost', 27017)
- client = MongoClient('mongodb://localhost:27017/')
- db = client.test_database
- db = client['iot-encoding']
- return db
-
-db = initDB()
-posts = db.posts
-
-class db:
-
- def __init__(self):
- self.feature_list = {}
- self.engine = None
-
- def loadHashmap(self, feature_size, result_n):
- # Create redis storage adapter
- redis_object = Redis(host='localhost', port=6379, db=0)
- redis_storage = RedisStorage(redis_object)
- pdb.set_trace()
- try:
- # Get hash config from redis
- config = redis_storage.load_hash_configuration('test')
- # Config is existing, create hash with None parameters
- lshash = RandomBinaryProjections(None, None)
- # Apply configuration loaded from redis
- lshash.apply_config(config)
-
- except:
- # Config is not existing, create hash from scratch, with 10 projections
- lshash = RandomBinaryProjections('test', 0)
-
-
- # Create engine for feature space of 100 dimensions and use our hash.
- # This will set the dimension of the lshash only the first time, not when
- # using the configuration loaded from redis. Use redis storage to store
- # buckets.
- nearest = NearestFilter(1000)
- #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
- pdb.set_trace()
- self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())
-
- # Do some stuff like indexing or querying with the engine...
-
- # Finally store hash configuration in redis for later use
- redis_storage.store_hash_configuration(lshash)
-
- def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
- if fvector is None:
- return
- #ftuple = tuple([fvector])
- self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name,binary_name,funcname)))
-
- def batch_appendDB(self, binary_name, features, firmware_name=""):
- for funcname in features:
- feature = features[funcname]
- #pdb.set_trace()
- self.appendToDB(binary_name, funcname, feature, firmware_name)
-
- def batch_appendDBbyDir(self, base_dir):
- cursor = posts.find({"firmware_name":"ddwrt-r21676_result"})
- i = 0
- for v in cursor:
- print i
- i+=1
- binary_name = v['binary_name']
- funcname = v['func_name']
- firmware_name = v['firmware_name']
- feature = v['fvector']
- self.appendToDB(binary_name, funcname, feature, firmware_name)
-
- def batch_appendDBbyDir1(self, base_dir):
- image_dir = os.path.join(base_dir, "image")
- firmware_featrues={}
- bnum = 0
- fnum = 0
- i = 0
- pdb.set_trace()
- for firmware_name in os.listdir(image_dir):
- print firmware_name
- firmware_featrues[firmware_name] = {}
- firmware_dir = os.path.join(image_dir, firmware_name)
- for binary_name in os.listdir(firmware_dir):
- if binary_name.endswith(".features"):
- bnum += 1
- featrues_dir = os.path.join(firmware_dir, binary_name)
- featrues = pickle.load(open(featrues_dir, "r"))
- for funcname in featrues:
- fnum +=1
- #pdb.set_trace()
- feature = featrues[funcname]
- self.appendToDB(binary_name, funcname, feature, firmware_name)
- del featrues
- print("bnum ", bnum)
- print("fnum ", fnum)
-
- def dump(self, base_dir):
- db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
- pickle.dump(self.feature_list, open(db_dir, 'w'))
- db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
- pickle.dump(self.engine, open(db_dir, 'w'))
-
- def loadDB(self, base_dir):
- db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
- self.feature_list = pickle.load(open(db_dir, 'r'))
- db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
- self.engine = pickle.load(open(db_dir, 'r'))
-
- def findF(self, binary_name, funcname):
- x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
- return x[0]
-
-def retrieveFeaturesByDir(n, base_dir):
- firmware_featrues={}
- i = 0
- for firmware_name in os.listdir(base_dir):
- if firmware_name.endWith(".features"):
- firmware_featrues[firmware_name] = {}
- firmware_dir = os.path.join(base_dir, firmware_name)
- if i > 0:
- break
- i += 1
- pdb.set_trace()
- for binary_name in os.listdir(firmware_dir):
- featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
- featrues = pickle.load(open(featrues_dir, "r"))
- for funcname in featrues:
- feature = featrues[funcname]
- self.appendToDB(firmware_name, binary_name, funcname, feature)
- del featrues
-
-def retrieveFeatures(n, base_dir, filename, funcs):
- feature_dic = {}
- featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
- featrues = pickle.load(open(featrues_dir, "r"))
- #featuresx = retrieveFeaturesx(filename)
- for name in featrues:
- #if name in funcs:
- x = featrues[name]
- #+ featuresx[name]
- feature_dic[name] = np.asarray(x)
- return feature_dic
-
-def retrieveVuldb(base_input_dir):
- vul_path = os.path.join(base_input_dir, "vul")
- vul_db = pickle.load(open(vul_path, "r"))
- return vul_db
-
-
-def retrieveFeaturesx(filename):
- ida_input_dir = os.path.join("./data/", filename + ".features")
- featuresx = pickle.load(open(ida_input_dir, "r"))
- return featuresx
-
-def retrieveQueries(n, base_dir, filename1, featrues_src):
- queries = {}
- featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
- featrues = pickle.load(open(featrues_dir, "r"))
- #featuresx = retrieveFeaturesx(filename1)
- for name in featrues:
- #if name in featrues_src:
- x = featrues[name]
- #+ featuresx[name]
- queries[name] = np.asarray(x)
- return queries
-
-def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
- queries = {}
- featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
- featrues = pickle.load(open(featrues_dir, "r"))
- for name in featrues:
- #del featrues[name][5]
- queries[name] = np.asarray(featrues[name])
- return queries
-
-def retrieveQuery(n, base_dir, filename, funcname):
- featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
- featrues = pickle.load(open(featrues_dir, "r"))
- f = [featrues[v] for v in featrues if funcname in v ][0]
- return np.asarray(f)
-
-def parse_command():
- parser = argparse.ArgumentParser(description='Process some integers.')
- parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
- parser.add_argument('--output_dir', type=str, help="output dir")
- parser.add_argument("--filename1", type=str, help="the size of each graphlet")
- parser.add_argument("--filename2", type=str, help="the size of each graphlet")
- parser.add_argument("--size", type=int, help="the size of each graphlet")
- #parser.add_argument("--size", type=int, help="the size of each graphlet")
- args = parser.parse_args()
- return args
-
-def loadFuncs(path):
- funcs = {}
- x86_dir = os.path.join(path, "func_candid")
- #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
- fp = open(x86_dir,"r")
- for line in fp:
- items = line.split("\n")
- funcname = items[0]
- funcs[funcname] = 1
- return funcs
-
-def dump(path, featrues, queries):
- fp = open(path + "/" + "matrix", 'w')
- for name in featrues:
- row = []
- row.append("x86")
- row.append(name)
- row += featrues[name]
- fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %tuple(row))
- for name in queries:
- row = []
- row.append("mips")
- row.append(name)
- row += queries[name]
- fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
- fp.close()
-
-
-def queryBytwo(base_input_dir, filename1, filename2, n):
- threthold = 50
- db_instance = db()
- funcs = loadFuncs(base_input_dir)
- db_instance.loadHashmap(n, 50000)
- #pdb.set_trace()
- featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
- queries = retrieveQueries(n, base_input_dir, filename2, funcs)
- #queries = refactoring(queries, featrues)
- vul_db = retrieveVuldb(base_input_dir)
- pdb.set_trace()
- #dump(base_input_dir, featrues, queries)
- #start = time.time()
- #db_instance.batch_appendDBbyDir(base_input_dir)
- #end = time.time()
- #total = end - start
- #print total
- db_instance.batch_appendDB(filename1, featrues)
- pdb.set_trace()
- ranks = []
- times = []
- for threthold in xrange(1, 210, 10):
- hit = []
- i = 0
- for name in queries:
- #print i
- i += 1
- '''
- if i == 1000:
- print (sum(times)/len(times))
- pdb.set_trace()
- print "s"
- '''
- #if name not in vul_db['openssl']:
- # continue
- if name not in featrues:
- continue
- #pdb.set_trace()
- query = queries[name]
- #start = time.time()
- x = db_instance.engine.neighbours(query)
- #end = time.time()
- #total = end - start
- #times.append(total)
- #print total
- #pdb.set_trace()
- try:
- rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
- ranks.append((name, rank))
- if rank <= threthold:
- hit.append(1)
- else:
- hit.append(0)
- except:
- #pdb.set_trace()
- hit.append(0)
- pass
- #pdb.set_trace()
- acc = sum(hit) * 1.0 / len(hit)
- print acc
-
-def queryAll(base_dir, firmware_name, filename1, n):
- threthold = 155
- db_instance = db()
- db_instance.loadHashmap(n, 50000)
- queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
- start = time.time()
- pdb.set_trace()
- db_instance.batch_appendDBbyDir(n, base_dir)
- end = time.time()
- dur = end - start
- print dur
- pdb.set_trace()
- hit = []
- i = 0
- times = []
- for name in queries:
- print i
- i += 1
- query = queries[name]
- start = time.clock()
- x = db_instance.engine.neighbours(query)
- end = time.clock()
- dur = end - start
- times.append(dur)
- #pdb.set_trace()
- try:
- rank = [v for v in xrange(len(x)) if name in x[v][1]]
- if len(rank) > 1:
- pdb.set_trace()
- print "stop"
- if rank[0] <= threthold:
- hit.append(1)
- else:
- hit.append(0)
- except:
- hit.append(0)
-
- acc = sum(hit) * 1.0 / len(hit)
- mean = np.mean(times)
- std = np.std(times)
- #pdb.set_trace()
- print acc
-
-if __name__ == "__main__":
- args = parse_command()
- base_dir = args.base_input_dir
- filename1 = args.filename1
- filename2 = args.filename2
- n = args.size
- pdb.set_trace()
- queryBytwo(base_dir, filename1, filename2, n)
diff --git a/ida_file_cerate_malware.bat b/ida_file_cerate_malware.bat
deleted file mode 100644
index cd555ed..0000000
--- a/ida_file_cerate_malware.bat
+++ /dev/null
@@ -1,16 +0,0 @@
-@echo off
-setlocal EnableDelayedExpansion
-
-
-set "FOLDER_PATH=D:\bishe\dataset\train_malware"
-
-
-
-for %%f in ("%FOLDER_PATH%\*") do (
- echo !time! %%f
- D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f
-
-)
-
-endlocal
-