Compare commits
10 Commits: 8dad81779b ... 8063d079db

Author | SHA1 | Date
---|---|---
 | 8063d079db |
 | 1a8c103c43 |
 | f82f488bb3 |
 | ad2583dba9 |
 | d599236e94 |
 | ddf9ff3b59 |
 | 4637fd0d97 |
 | 2ec7e5e212 |
 | 636ec90a1c |
 | badd4eada6 |
8 .idea/.gitignore vendored Normal file
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/../../../../../:\hkn\project_folder\Gencoding3\.idea/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
17 .idea/Gencoding3.iml Normal file
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/Genius3/python" isTestSource="false" />
    </content>
    <orderEntry type="jdk" jdkName="Python 2.7" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="pytest" />
  </component>
</module>
21 .idea/deployment.xml Normal file
@@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
    <serverData>
      <paths name="304">
        <serverdata>
          <mappings>
            <mapping local="$PROJECT_DIR$" web="/" />
          </mappings>
        </serverdata>
      </paths>
      <paths name="root@region-42.seetacloud.com:58034 password">
        <serverdata>
          <mappings>
            <mapping local="$PROJECT_DIR$" web="/" />
          </mappings>
        </serverdata>
      </paths>
    </serverData>
  </component>
</project>
24 .idea/inspectionProfiles/Project_Default.xml Normal file
@@ -0,0 +1,24 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyChainedComparisonsInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoreConstantInTheMiddle" value="true" />
    </inspection_tool>
    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="E501" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N806" />
          <option value="N802" />
          <option value="N803" />
        </list>
      </option>
    </inspection_tool>
  </profile>
</component>
6 .idea/inspectionProfiles/profiles_settings.xml Normal file
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
4 .idea/misc.xml Normal file
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" />
</project>
8 .idea/modules.xml Normal file
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/Gencoding3.iml" filepath="$PROJECT_DIR$/.idea/Gencoding3.iml" />
    </modules>
  </component>
</project>
6 .idea/vcs.xml Normal file
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
623 Genius3/beautified_sample.json Normal file
@@ -0,0 +1,623 @@
{
    "function_edges": [
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
    ], // from the FCG: the source-function index and target-function index of every call edge
    "acfg_list": [ // corresponds to data.raw_graph_list
        { // one CFG corresponds to data.raw_graph_list[a]
            "block_number": 3, // number of basic blocks in the CFG √ data.raw_graph_list[a].g.__len__()
            "block_edges": [
                [0, 0, 1, 1],
                [0, 2, 0, 2]
            ], // the middle block is block 0, for unknown reasons; the first array holds the source-block IDs of all edges, the second array the target-block IDs √ data.raw_graph_list[a].g.edges
            "block_features": [ // attributes of each basic block
                [0, 2, 1, 0, 7, 0, 1, 1, 4, 0, 0], // per-block attribute features: an 11-dimensional vector of counts -- call / transfer / arithmetic / logic / comparison / move / interrupt / data-declaration instructions, total instructions, string-or-integer constants, and number of offspring
                [0, 2, 0, 0, 3, 1, 0, 1, 0, 0, 0],
                [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]
            ]
        },
        {
            "block_number": 29, // number of basic blocks in the CFG
            "block_edges": [
                [0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 18, 19, 19, 20, 20, 21, 21, 23, 24, 24, 26, 26, 27, 28],
                [16, 0, 2, 0, 4, 1, 3, 3, 3, 25, 15, 8, 6, 6, 7, 28, 12, 9, 23, 16, 25, 11, 21, 17, 13, 19, 22, 14, 19, 18, 27, 24, 23, 26, 21, 22, 25, 10, 25, 5, 14, 8]
            ],
            "block_features": [
                [8, 2, 1, 5, 36, 0, 6, 0, 2, 0, 0],
                [0, 7, 0, 0, 3, 0, 1, 1, 1, 0, 0],
                [0, 7, 0, 0, 2, 0, 1, 1, 0, 0, 0],
                [0, 7, 0, 1, 8, 1, 2, 0, 0, 0, 0],
                [0, 7, 1, 0, 2, 0, 1, 0, 0, 0, 0],
                [0, 7, 0, 0, 1, 0, 0, 0, 1, 0, 0],
                [1, 18, 0, 1, 9, 0, 2, 1, 1, 0, 0],
                [1, 21, 1, 0, 3, 0, 1, 1, 0, 0, 0],
                [0, 21, 0, 1, 4, 1, 2, 0, 0, 0, 0],
                [0, 24, 0, 2, 12, 1, 3, 0, 0, 0, 0],
                [1, 26, 0, 3, 16, 0, 4, 1, 4, 0, 0],
                [1, 2, 0, 5, 22, 0, 5, 0, 1, 0, 0],
                [5, 4, 1, 3, 21, 0, 4, 1, 3, 0, 0],
                [4, 11, 0, 2, 17, 1, 2, 0, 1, 0, 0],
                [2, 14, 0, 1, 12, 0, 2, 1, 1, 0, 0],
                [3, 17, 0, 0, 10, 0, 1, 0, 1, 0, 0],
                [1, 1, 0, 1, 5, 0, 2, 0, 0, 0, 0],
                [0, 14, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                [3, 17, 0, 0, 7, 0, 0, 0, 0, 0, 0],
                [0, 17, 0, 1, 5, 0, 2, 1, 1, 0, 0],
                [2, 28, 1, 1, 11, 1, 2, 1, 1, 0, 0],
                [0, 11, 0, 1, 8, 1, 2, 0, 0, 0, 0],
                [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0],
                [1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
                [12, 27, 1, 7, 41, 0, 8, 1, 6, 0, 0],
                [0, 0, 1, 0, 7, 1, 0, 0, 0, 1, 0],
                [2, 9, 0, 2, 17, 0, 3, 1, 3, 0, 0],
                [2, 14, 0, 0, 5, 0, 1, 0, 4, 0, 0],
                [1, 21, 4, 1, 13, 0, 2, 0, 5, 0, 0]
            ]
        }
    ],
    "function_names": [ // function names, covering both external and local functions
        "sub_401000",
        "start",
        "GetTempPathW",
        "GetFileSize",
        "GetCurrentDirectoryW",
        "DeleteFileW",
        "CloseHandle",
        "WriteFile",
        "lstrcmpW",
        "ReadFile",
        "GetModuleHandleW",
        "ExitProcess",
        "HeapCreate",
        "HeapAlloc",
        "GetModuleFileNameW",
        "CreateFileW",
        "lstrlenW",
        "ShellExecuteW",
        "wsprintfW",
        "HttpSendRequestW",
        "InternetSetOptionW",
        "InternetQueryOptionW",
        "HttpOpenRequestW",
        "HttpQueryInfoW",
        "InternetReadFile",
        "InternetConnectW",
        "InternetOpenW"
    ], // √
    "hash": "316ebb797d5196020eee013cfe771671fff4da8859adc9f385f52a74e82f4e55", // file hash; the md5 from the file name can be used instead √
    "function_number": 27 // number of functions √
}
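For quick inspection, the annotated sample above can be parsed back into graph objects. The sketch below is a minimal example under stated assumptions: the // annotations must be stripped first (strict JSON forbids comments), the file path is hypothetical, and networkx is just one possible graph container.

# Minimal sketch: strip the // annotations, load the sample, rebuild its graphs.
import json
import re
import networkx as nx

with open("beautified_sample.json") as f:          # hypothetical path
    text = re.sub(r'\s*//[^\n]*', '', f.read())    # drop the inline // comments
sample = json.loads(text)

# Function call graph: two parallel arrays of source/target function indices.
fcg = nx.DiGraph()
fcg.add_nodes_from(range(sample["function_number"]))
fcg.add_edges_from(zip(*sample["function_edges"]))

# One control flow graph per internal function, carrying the 11-dim block features.
for acfg in sample["acfg_list"]:
    cfg = nx.DiGraph()
    for block_id, feat in enumerate(acfg["block_features"]):
        cfg.add_node(block_id, v=feat)
    cfg.add_edges_from(zip(*acfg["block_edges"]))
    assert cfg.number_of_nodes() == acfg["block_number"]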
Binary files not shown (15 files).
@@ -1,3 +0,0 @@
__all__ = ['QtCore', 'QtGui', 'QtNetwork', 'QtOpenGL', 'QtSql', 'QtSvg', 'QtTest', 'QtWebKit', 'QtScript']
__version__ = "1.1.2"
__version_info__ = (1, 1, 2, "final", 1)
Binary files not shown (3 files).
45005 Genius3/python/idaapi.py
File diff suppressed because it is too large
Binary file not shown.
@@ -1,830 +0,0 @@
#---------------------------------------------------------------------
# IDAPython - Python plugin for Interactive Disassembler
#
# Copyright (c) 2004-2010 Gergely Erdelyi <gergely.erdelyi@d-dome.net>
#
# All rights reserved.
#
# For detailed copyright information see the file COPYING in
# the root of the distribution archive.
#---------------------------------------------------------------------
"""
idautils.py - High level utility functions for IDA
"""
import idaapi
import idc
import types
import os


def refs(ea, funcfirst, funcnext):
    """
    Generic reference collector - INTERNAL USE ONLY.
    """
    ref = funcfirst(ea)
    while ref != idaapi.BADADDR:
        yield ref
        ref = funcnext(ea, ref)


def CodeRefsTo(ea, flow):
    """
    Get a list of code references to 'ea'

    @param ea: Target address
    @param flow: Follow normal code flow or not
    @type flow: Boolean (0/1, False/True)

    @return: list of references (may be empty list)

    Example::

        for ref in CodeRefsTo(ScreenEA(), 1):
            print ref
    """
    if flow == 1:
        return refs(ea, idaapi.get_first_cref_to, idaapi.get_next_cref_to)
    else:
        return refs(ea, idaapi.get_first_fcref_to, idaapi.get_next_fcref_to)


def CodeRefsFrom(ea, flow):
    """
    Get a list of code references from 'ea'

    @param ea: Target address
    @param flow: Follow normal code flow or not
    @type flow: Boolean (0/1, False/True)

    @return: list of references (may be empty list)

    Example::

        for ref in CodeRefsFrom(ScreenEA(), 1):
            print ref
    """
    if flow == 1:
        return refs(ea, idaapi.get_first_cref_from, idaapi.get_next_cref_from)
    else:
        return refs(ea, idaapi.get_first_fcref_from, idaapi.get_next_fcref_from)


def DataRefsTo(ea):
    """
    Get a list of data references to 'ea'

    @param ea: Target address

    @return: list of references (may be empty list)

    Example::

        for ref in DataRefsTo(ScreenEA()):
            print ref
    """
    return refs(ea, idaapi.get_first_dref_to, idaapi.get_next_dref_to)


def DataRefsFrom(ea):
    """
    Get a list of data references from 'ea'

    @param ea: Target address

    @return: list of references (may be empty list)

    Example::

        for ref in DataRefsFrom(ScreenEA()):
            print ref
    """
    return refs(ea, idaapi.get_first_dref_from, idaapi.get_next_dref_from)


def XrefTypeName(typecode):
    """
    Convert cross-reference type codes to readable names

    @param typecode: cross-reference type code
    """
    ref_types = {
        0  : 'Data_Unknown',
        1  : 'Data_Offset',
        2  : 'Data_Write',
        3  : 'Data_Read',
        4  : 'Data_Text',
        5  : 'Data_Informational',
        16 : 'Code_Far_Call',
        17 : 'Code_Near_Call',
        18 : 'Code_Far_Jump',
        19 : 'Code_Near_Jump',
        20 : 'Code_User',
        21 : 'Ordinary_Flow'
        }
    assert typecode in ref_types, "unknown reference type %d" % typecode
    return ref_types[typecode]


def _copy_xref(xref):
    """ Make a private copy of the xref class to preserve its contents """
    class _xref(object):
        pass

    xr = _xref()
    for attr in [ 'frm', 'to', 'iscode', 'type', 'user' ]:
        setattr(xr, attr, getattr(xref, attr))
    return xr


def XrefsFrom(ea, flags=0):
    """
    Return all references from address 'ea'

    @param ea: Reference address
    @param flags: any of idaapi.XREF_* flags

    Example::
        for xref in XrefsFrom(here(), 0):
            print xref.type, XrefTypeName(xref.type), \
                  'from', hex(xref.frm), 'to', hex(xref.to)
    """
    xref = idaapi.xrefblk_t()
    if xref.first_from(ea, flags):
        yield _copy_xref(xref)
        while xref.next_from():
            yield _copy_xref(xref)


def XrefsTo(ea, flags=0):
    """
    Return all references to address 'ea'

    @param ea: Reference address
    @param flags: any of idaapi.XREF_* flags

    Example::
        for xref in XrefsTo(here(), 0):
            print xref.type, XrefTypeName(xref.type), \
                  'from', hex(xref.frm), 'to', hex(xref.to)
    """
    xref = idaapi.xrefblk_t()
    if xref.first_to(ea, flags):
        yield _copy_xref(xref)
        while xref.next_to():
            yield _copy_xref(xref)


def Threads():
    """Returns all thread IDs"""
    for i in xrange(0, idc.GetThreadQty()):
        yield idc.GetThreadId(i)


def Heads(start=None, end=None):
    """
    Get a list of heads (instructions or data)

    @param start: start address (default: inf.minEA)
    @param end: end address (default: inf.maxEA)

    @return: list of heads between start and end
    """
    if not start: start = idaapi.cvar.inf.minEA
    if not end:   end = idaapi.cvar.inf.maxEA

    ea = start
    if not idc.isHead(idc.GetFlags(ea)):
        ea = idaapi.next_head(ea, end)
    while ea != idaapi.BADADDR:
        yield ea
        ea = idaapi.next_head(ea, end)


def Functions(start=None, end=None):
    """
    Get a list of functions

    @param start: start address (default: inf.minEA)
    @param end: end address (default: inf.maxEA)

    @return: list of heads between start and end

    @note: The last function that starts before 'end' is included even
    if it extends beyond 'end'. Any function that has its chunks scattered
    in multiple segments will be reported multiple times, once in each segment
    as they are listed.
    """
    if not start: start = idaapi.cvar.inf.minEA
    if not end:   end = idaapi.cvar.inf.maxEA

    # find first function head chunk in the range
    chunk = idaapi.get_fchunk(start)
    if not chunk:
        chunk = idaapi.get_next_fchunk(start)
    while chunk and chunk.startEA < end and (chunk.flags & idaapi.FUNC_TAIL) != 0:
        chunk = idaapi.get_next_fchunk(chunk.startEA)
    func = chunk

    while func and func.startEA < end:
        startea = func.startEA
        yield startea
        func = idaapi.get_next_func(startea)


def Chunks(start):
    """
    Get a list of function chunks

    @param start: address of the function

    @return: list of function chunks (tuples of the form (start_ea, end_ea))
             belonging to the function
    """
    func_iter = idaapi.func_tail_iterator_t( idaapi.get_func( start ) )
    status = func_iter.main()
    while status:
        chunk = func_iter.chunk()
        yield (chunk.startEA, chunk.endEA)
        status = func_iter.next()


def Modules():
    """
    Returns a list of module objects with name,size,base and the rebase_to attributes
    """
    mod = idaapi.module_info_t()
    result = idaapi.get_first_module(mod)
    while result:
        yield idaapi.object_t(name=mod.name, size=mod.size, base=mod.base, rebase_to=mod.rebase_to)
        result = idaapi.get_next_module(mod)


def Names():
    """
    Returns a list of names

    @return: List of tuples (ea, name)
    """
    for i in xrange(idaapi.get_nlist_size()):
        ea = idaapi.get_nlist_ea(i)
        name = idaapi.get_nlist_name(i)
        yield (ea, name)


def Segments():
    """
    Get list of segments (sections) in the binary image

    @return: List of segment start addresses.
    """
    for n in xrange(idaapi.get_segm_qty()):
        seg = idaapi.getnseg(n)
        if seg:
            yield seg.startEA


def Entries():
    """
    Returns a list of entry points

    @return: List of tuples (index, ordinal, ea, name)
    """
    n = idaapi.get_entry_qty()
    for i in xrange(0, n):
        ordinal = idaapi.get_entry_ordinal(i)
        ea = idaapi.get_entry(ordinal)
        name = idaapi.get_entry_name(ordinal)
        yield (i, ordinal, ea, name)


def FuncItems(start):
    """
    Get a list of function items

    @param start: address of the function

    @return: ea of each item in the function
    """
    func = idaapi.get_func(start)
    if not func:
        return
    fii = idaapi.func_item_iterator_t()
    ok = fii.set(func)
    while ok:
        yield fii.current()
        ok = fii.next_code()


def Structs():
    """
    Get a list of structures

    @return: List of tuples (idx, sid, name)
    """
    idx = idc.GetFirstStrucIdx()
    while idx != idaapi.BADADDR:
        sid = idc.GetStrucId(idx)
        yield (idx, sid, idc.GetStrucName(sid))
        idx = idc.GetNextStrucIdx(idx)


def StructMembers(sid):
    """
    Get a list of structure members information (or stack vars if given a frame).

    @param sid: ID of the structure.

    @return: List of tuples (offset, name, size)

    @note: If 'sid' does not refer to a valid structure,
           an exception will be raised.
    @note: This will not return 'holes' in structures/stack frames;
           it only returns defined structure members.
    """
    m = idc.GetFirstMember(sid)
    if m == -1:
        raise Exception("No structure with ID: 0x%x" % sid)
    while (m != idaapi.BADADDR):
        name = idc.GetMemberName(sid, m)
        if name:
            yield (m, name, idc.GetMemberSize(sid, m))
        m = idc.GetStrucNextOff(sid, m)


def DecodePrecedingInstruction(ea):
    """
    Decode preceding instruction in the execution flow.

    @param ea: address to decode
    @return: (None or the decode instruction, farref)
             farref will contain 'true' if followed an xref, false otherwise
    """
    prev_addr, farref = idaapi.decode_preceding_insn(ea)
    if prev_addr == idaapi.BADADDR:
        return (None, False)
    else:
        return (idaapi.cmd.copy(), farref)



def DecodePreviousInstruction(ea):
    """
    Decodes the previous instruction and returns an insn_t like class

    @param ea: address to decode
    @return: None or a new insn_t instance
    """
    prev_addr = idaapi.decode_prev_insn(ea)
    if prev_addr == idaapi.BADADDR:
        return None

    return idaapi.cmd.copy()


def DecodeInstruction(ea):
    """
    Decodes an instruction and returns an insn_t like class

    @param ea: address to decode
    @return: None or a new insn_t instance
    """
    inslen = idaapi.decode_insn(ea)
    if inslen == 0:
        return None

    return idaapi.cmd.copy()


def GetDataList(ea, count, itemsize=1):
    """
    Get data list - INTERNAL USE ONLY
    """
    if itemsize == 1:
        getdata = idaapi.get_byte
    elif itemsize == 2:
        getdata = idaapi.get_word
    elif itemsize == 4:
        getdata = idaapi.get_long
    elif itemsize == 8:
        getdata = idaapi.get_qword
    else:
        raise ValueError, "Invalid data size! Must be 1, 2, 4 or 8"

    endea = ea + itemsize * count
    curea = ea
    while curea < endea:
        yield getdata(curea)
        curea += itemsize


def PutDataList(ea, datalist, itemsize=1):
    """
    Put data list - INTERNAL USE ONLY
    """
    putdata = None

    if itemsize == 1:
        putdata = idaapi.patch_byte
    if itemsize == 2:
        putdata = idaapi.patch_word
    if itemsize == 4:
        putdata = idaapi.patch_long

    assert putdata, "Invalid data size! Must be 1, 2 or 4"

    for val in datalist:
        putdata(ea, val)
        ea = ea + itemsize


def MapDataList(ea, length, func, wordsize=1):
    """
    Map through a list of data words in the database

    @param ea: start address
    @param length: number of words to map
    @param func: mapping function
    @param wordsize: size of words to map [default: 1 byte]

    @return: None
    """
    PutDataList(ea, map(func, GetDataList(ea, length, wordsize)), wordsize)


def GetInputFileMD5():
    """
    Return the MD5 hash of the input binary file

    @return: MD5 string or None on error
    """
    return idc.GetInputMD5()


class Strings(object):
    """
    Allows iterating over the string list. The set of strings will not be
    modified, unless asked explicitly at setup()-time.

    Example:
        s = Strings()

        for i in s:
            print "%x: len=%d type=%d -> '%s'" % (i.ea, i.length, i.type, str(i))

    """
    class StringItem(object):
        """
        Class representing each string item.
        """
        def __init__(self, si):
            self.ea = si.ea
            """String ea"""
            self.type = si.type
            """string type (ASCSTR_xxxxx)"""
            self.length = si.length
            """string length"""

        def is_1_byte_encoding(self):
            return not self.is_2_bytes_encoding() and not self.is_4_bytes_encoding()

        def is_2_bytes_encoding(self):
            return (self.type & 7) in [idaapi.ASCSTR_UTF16, idaapi.ASCSTR_ULEN2, idaapi.ASCSTR_ULEN4]

        def is_4_bytes_encoding(self):
            return (self.type & 7) == idaapi.ASCSTR_UTF32

        def _toseq(self, as_unicode):
            if self.is_2_bytes_encoding():
                conv = idaapi.ACFOPT_UTF16
                pyenc = "utf-16"
            elif self.is_4_bytes_encoding():
                conv = idaapi.ACFOPT_UTF8
                pyenc = "utf-8"
            else:
                conv = idaapi.ACFOPT_ASCII
                pyenc = 'ascii'
            strbytes = idaapi.get_ascii_contents2(self.ea, self.length, self.type, conv)
            return unicode(strbytes, pyenc, 'replace') if as_unicode else strbytes

        def __str__(self):
            return self._toseq(False)

        def __unicode__(self):
            return self._toseq(True)


    STR_C = 0x0001
    """C-style ASCII string"""
    STR_PASCAL = 0x0002
    """Pascal-style ASCII string (length byte)"""
    STR_LEN2 = 0x0004
    """Pascal-style, length is 2 bytes"""
    STR_UNICODE = 0x0008
    """Unicode string"""
    STR_LEN4 = 0x0010
    """Pascal-style, length is 4 bytes"""
    STR_ULEN2 = 0x0020
    """Pascal-style Unicode, length is 2 bytes"""
    STR_ULEN4 = 0x0040
    """Pascal-style Unicode, length is 4 bytes"""

    def clear_cache(self):
        """Clears the strings list cache"""
        self.refresh(0, 0) # when ea1=ea2 the kernel will clear the cache

    def __init__(self, default_setup = False):
        """
        Initializes the Strings enumeration helper class

        @param default_setup: Set to True to use default setup (C strings, min len 5, ...)
        """
        self.size = 0
        if default_setup:
            self.setup()
        else:
            self.refresh()

        self._si = idaapi.string_info_t()

    def refresh(self, ea1=None, ea2=None):
        """Refreshes the strings list"""
        if ea1 is None:
            ea1 = idaapi.cvar.inf.minEA
        if ea2 is None:
            ea2 = idaapi.cvar.inf.maxEA

        idaapi.refresh_strlist(ea1, ea2)
        self.size = idaapi.get_strlist_qty()


    def setup(self,
              strtypes = STR_C,
              minlen = 5,
              only_7bit = True,
              ignore_instructions = False,
              ea1 = None,
              ea2 = None,
              display_only_existing_strings = False):

        if ea1 is None:
            ea1 = idaapi.cvar.inf.minEA

        if ea2 is None:
            ea2 = idaapi.cvar.inf.maxEA

        t = idaapi.strwinsetup_t()
        t.strtypes = strtypes
        t.minlen = minlen
        t.only_7bit = only_7bit
        t.ea1 = ea1
        t.ea2 = ea2
        t.display_only_existing_strings = display_only_existing_strings
        idaapi.set_strlist_options(t)

        # Automatically refreshes
        self.refresh()


    def _get_item(self, index):
        if not idaapi.get_strlist_item(index, self._si):
            return None
        else:
            return Strings.StringItem(self._si)


    def __iter__(self):
        return (self._get_item(index) for index in xrange(0, self.size))


    def __getitem__(self, index):
        """Returns a string item or None"""
        if index >= self.size:
            raise KeyError
        else:
            return self._get_item(index)

# -----------------------------------------------------------------------
def GetIdbDir():
    """
    Get IDB directory

    This function returns directory path of the current IDB database
    """
    return os.path.dirname(idaapi.cvar.database_idb) + os.sep

# -----------------------------------------------------------------------
def GetRegisterList():
    """Returns the register list"""
    return idaapi.ph_get_regnames()

# -----------------------------------------------------------------------
def GetInstructionList():
    """Returns the instruction list of the current processor module"""
    return [i[0] for i in idaapi.ph_get_instruc() if i[0]]

# -----------------------------------------------------------------------
def _Assemble(ea, line):
    """
    Please refer to Assemble() - INTERNAL USE ONLY
    """
    if type(line) == types.StringType:
        lines = [line]
    else:
        lines = line
    ret = []
    for line in lines:
        seg = idaapi.getseg(ea)
        if not seg:
            return (False, "No segment at ea")
        ip = ea - (idaapi.ask_selector(seg.sel) << 4)
        buf = idaapi.AssembleLine(ea, seg.sel, ip, seg.bitness, line)
        if not buf:
            return (False, "Assembler failed: " + line)
        ea += len(buf)
        ret.append(buf)

    if len(ret) == 1:
        ret = ret[0]
    return (True, ret)


def Assemble(ea, line):
    """
    Assembles one or more lines (does not display any message dialogs)
    If line is a list then this function will attempt to assemble all the lines
    This function will turn on batch mode temporarily so that no messages are displayed on the screen

    @param ea: start address
    @return: (False, "Error message") or (True, asm_buf) or (True, [asm_buf1, asm_buf2, asm_buf3])
    """
    old_batch = idc.Batch(1)
    ret = _Assemble(ea, line)
    idc.Batch(old_batch)
    return ret

def _copy_obj(src, dest, skip_list = None):
    """
    Copy non private/non callable attributes from a class instance to another
    @param src: Source class to copy from
    @param dest: If it is a string then it designates the new class type that will be created and copied to.
                 Otherwise dest should be an instance of another class
    @return: A new instance or "dest"
    """
    if type(dest) == types.StringType:
        # instantiate a new destination class of the specified type name?
        dest = new.classobj(dest, (), {})
    for x in dir(src):
        # skip special and private fields
        if x.startswith("__") and x.endswith("__"):
            continue
        # skip items in the skip list
        if skip_list and x in skip_list:
            continue
        t = getattr(src, x)
        # skip callable
        if callable(t):
            continue
        setattr(dest, x, t)
    return dest

# -----------------------------------------------------------------------
class _reg_dtyp_t(object):
    """
    INTERNAL
    This class describes a register's number and dtyp.
    The equal operator is overloaded so that two instances can be tested for equality
    """
    def __init__(self, reg, dtyp):
        self.reg = reg
        self.dtyp = dtyp

    def __eq__(self, other):
        return (self.reg == other.reg) and (self.dtyp == other.dtyp)

# -----------------------------------------------------------------------
class _procregs(object):
    """Utility class allowing the users to identify registers in a decoded instruction"""
    def __getattr__(self, attr):
        ri = idaapi.reg_info_t()
        if not idaapi.parse_reg_name(attr, ri):
            raise AttributeError()
        r = _reg_dtyp_t(ri.reg, ord(idaapi.get_dtyp_by_size(ri.size)))
        self.__dict__[attr] = r
        return r

    def __setattr__(self, attr, value):
        raise AttributeError(attr)


# -----------------------------------------------------------------------
class _cpu(object):
    "Simple wrapper around GetRegValue/SetRegValue"
    def __getattr__(self, name):
        #print "cpu.get(%s)" % name
        return idc.GetRegValue(name)

    def __setattr__(self, name, value):
        #print "cpu.set(%s)" % name
        return idc.SetRegValue(value, name)


# --------------------------------------------------------------------------
class __process_ui_actions_helper(object):
    def __init__(self, actions, flags = 0):
        """Expect a list or a string with a list of actions"""
        if isinstance(actions, str):
            lst = actions.split(";")
        elif isinstance(actions, (list, tuple)):
            lst = actions
        else:
            raise ValueError, "Must pass a string, list or a tuple"

        # Remember the action list and the flags
        self.__action_list = lst
        self.__flags = flags

        # Reset action index
        self.__idx = 0

    def __len__(self):
        return len(self.__action_list)

    def __call__(self):
        if self.__idx >= len(self.__action_list):
            return False

        # Execute one action
        idaapi.process_ui_action(
            self.__action_list[self.__idx],
            self.__flags)

        # Move to next action
        self.__idx += 1

        # Reschedule
        return True


# --------------------------------------------------------------------------
def ProcessUiActions(actions, flags=0):
    """
    @param actions: A string containing a list of actions separated by semicolon, a list or a tuple
    @param flags: flags to be passed to process_ui_action()
    @return: Boolean. Returns False if the action list was empty or execute_ui_requests() failed.
    """

    # Instantiate a helper
    helper = __process_ui_actions_helper(actions, flags)
    return False if len(helper) < 1 else idaapi.execute_ui_requests((helper,))


# -----------------------------------------------------------------------
class peutils_t(object):
    """
    PE utility class. Retrieves PE information from the database.

    Constants from pe.h
    """
    PE_NODE = "$ PE header" # netnode name for PE header
    PE_ALT_DBG_FPOS = idaapi.BADADDR & -1 # altval() -> translated fpos of debuginfo
    PE_ALT_IMAGEBASE = idaapi.BADADDR & -2 # altval() -> loading address (usually pe.imagebase)
    PE_ALT_PEHDR_OFF = idaapi.BADADDR & -3 # altval() -> offset of PE header
    PE_ALT_NEFLAGS = idaapi.BADADDR & -4 # altval() -> neflags
    PE_ALT_TDS_LOADED = idaapi.BADADDR & -5 # altval() -> tds already loaded(1) or invalid(-1)
    PE_ALT_PSXDLL = idaapi.BADADDR & -6 # altval() -> if POSIX(x86) imports from PSXDLL netnode

    def __init__(self):
        self.__penode = idaapi.netnode()
        self.__penode.create(peutils_t.PE_NODE)

    imagebase = property(
        lambda self: self.__penode.altval(peutils_t.PE_ALT_IMAGEBASE)
        )

    header = property(
        lambda self: self.__penode.altval(peutils_t.PE_ALT_PEHDR_OFF)
        )

    def __str__(self):
        return "peutils_t(imagebase=%s, header=%s)" % (hex(self.imagebase), hex(self.header))

    def header(self):
        """
        Returns the complete PE header as an instance of peheader_t (defined in the SDK).
        """
        return self.__penode.valobj()

# -----------------------------------------------------------------------
cpu = _cpu()
"""This is a special class instance used to access the registers as if they were attributes of this object.
For example to access the EAX register:
    print "%x" % cpu.Eax
"""

procregs = _procregs()
"""This object is used to access the processor registers. It is useful when decoding instructions and you want to see which instruction is which.
For example:
    x = idautils.DecodeInstruction(here())
    if x[0] == procregs.Esp:
        print "This operand is the register ESP"
"""
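For orientation, here is a sketch of how the generators in this (now removed) vendored module are typically driven inside an IDA 6.x session. Python 2 syntax to match the file above; it is not runnable outside IDA, and ScreenEA/GetFunctionName come from the idc module.

# Typical idautils usage inside an IDA 6.x session (Python 2 sketch).
from idautils import Functions, XrefsTo, XrefTypeName
from idc import GetFunctionName, ScreenEA

for f_ea in Functions():              # walk every function head in the database
    print "%x %s" % (f_ea, GetFunctionName(f_ea))

for xref in XrefsTo(ScreenEA(), 0):   # who references the current address?
    print XrefTypeName(xref.type), hex(xref.frm)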
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
@@ -1,111 +0,0 @@
#!/usr/bin/env python
# -----------------------------------------------------------------------
# IDAPython - Python plugin for Interactive Disassembler
#
# Copyright (c) The IDAPython Team <idapython@googlegroups.com>
#
# All rights reserved.
#
# For detailed copyright information see the file COPYING in
# the root of the distribution archive.
# -----------------------------------------------------------------------
# init.py - Essential init routines
# -----------------------------------------------------------------------
import os
import sys
import time
import warnings
import _idaapi

# __EA64__ is set if IDA is running in 64-bit mode
__EA64__ = _idaapi.BADADDR == 0xFFFFFFFFFFFFFFFFL

# -----------------------------------------------------------------------
# Take over the standard text outputs
# -----------------------------------------------------------------------
class IDAPythonStdOut:
    """
    Dummy file-like class that receives stdout and stderr
    """
    def write(self, text):
        # NB: in case 'text' is Unicode, msg() will decode it
        # and call umsg() to print it
        _idaapi.msg(text)

    def flush(self):
        pass

    def isatty(self):
        return False

# -----------------------------------------------------------------------
def runscript(script):
    """
    Executes a script.
    This function is present for backward compatibility. Please use idaapi.IDAPython_ExecScript() instead

    @param script: script path

    @return: Error string or None on success
    """

    import idaapi
    return idaapi.IDAPython_ExecScript(script, globals())

# -----------------------------------------------------------------------
def print_banner():
    banner = [
        "Python %s " % sys.version,
        "IDAPython" + (" 64-bit" if __EA64__ else "") + " v%d.%d.%d %s (serial %d) (c) The IDAPython Team <idapython@googlegroups.com>" % IDAPYTHON_VERSION
    ]
    sepline = '-' * (max([len(s) for s in banner])+1)

    print(sepline)
    print("\n".join(banner))
    print(sepline)

# -----------------------------------------------------------------------

# Redirect stderr and stdout to the IDA message window
_orig_stdout = sys.stdout;
_orig_stderr = sys.stderr;
sys.stdout = sys.stderr = IDAPythonStdOut()

# -----------------------------------------------------------------------
# Initialize the help, with our own stdin wrapper, that'll query the user
# -----------------------------------------------------------------------
import pydoc
class IDAPythonHelpPrompter:
    def readline(self):
        return idaapi.askstr(0, '', 'Help topic?')
help = pydoc.Helper(input = IDAPythonHelpPrompter(), output = sys.stdout)

# Assign a default sys.argv
sys.argv = [""]

# Have to make sure Python finds our modules
sys.path.append(_idaapi.idadir("python"))

# Remove current directory from the top of the path search
if '' in sys.path: # On non Windows, the empty path is added
    sys.path.remove('')

if os.getcwd() in sys.path:
    sys.path.remove(os.getcwd())

# ...and add it to the end if needed
if not IDAPYTHON_REMOVE_CWD_SYS_PATH:
    sys.path.append(os.getcwd())

# Import all the required modules
from idaapi import Choose, get_user_idadir, cvar, Choose2, Appcall, Form
from idc import *
from idautils import *
import idaapi

# Load the users personal init file
userrc = os.path.join(get_user_idadir(), "idapythonrc.py")
if os.path.exists(userrc):
    idaapi.IDAPython_ExecScript(userrc, globals())

# All done, ready to rock.
81 Genius3/raw-feature-extractor/HierarchicalGraphModel_mine.py Normal file
@@ -0,0 +1,81 @@
# Imports assumed by this snippet; the committed file omits them. "Vocab" is the
# project's own vocabulary class, whose real module path is unknown here.
import torch
import torch.nn as nn
import torch_geometric
from torch_geometric.data import Batch, Data
from typing import Any
Vocab = Any  # stand-in annotation target for the project's Vocab class


class HierarchicalGraphNeuralNetwork(nn.Module):
    def __init__(self, external_vocab: Vocab):
        super(HierarchicalGraphNeuralNetwork, self).__init__()
        self.pool = 'global_max_pool'
        # Hierarchical 1: Control Flow Graph (CFG) embedding and pooling
        cfg_filter_list = [200, 200]
        cfg_filter_list.insert(0, 11)
        self.cfg_filter_length = len(cfg_filter_list)
        cfg_graphsage_params = [dict(in_channels=cfg_filter_list[i], out_channels=cfg_filter_list[i + 1], bias=True)
                                for i in range(self.cfg_filter_length - 1)]
        cfg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=cfg_graphsage_params)
        cfg_constructor = cfg_conv['constructor']
        for i in range(self.cfg_filter_length - 1):
            setattr(self, 'CFG_gnn_{}'.format(i + 1), cfg_constructor(**cfg_conv['kwargs'][i]))
        self.dropout = nn.Dropout(p=0.2)
        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
        self.external_embedding_layer = nn.Embedding(num_embeddings=external_vocab.max_vocab_size + 2,
                                                     embedding_dim=cfg_filter_list[-1],
                                                     padding_idx=external_vocab.pad_idx)
        fcg_filter_list = [200, 200]
        fcg_filter_list.insert(0, cfg_filter_list[-1])
        self.fcg_filter_length = len(fcg_filter_list)
        fcg_graphsage_params = [dict(in_channels=fcg_filter_list[i], out_channels=fcg_filter_list[i + 1], bias=True)
                                for i in range(self.fcg_filter_length - 1)]
        fcg_conv = dict(constructor=torch_geometric.nn.conv.SAGEConv, kwargs=fcg_graphsage_params)
        fcg_constructor = fcg_conv['constructor']
        for i in range(self.fcg_filter_length - 1):
            setattr(self, 'FCG_gnn_{}'.format(i + 1), fcg_constructor(**fcg_conv['kwargs'][i]))
        # Last projection function: gradually project with more linear layers
        self.pj1 = torch.nn.Linear(in_features=fcg_filter_list[-1], out_features=int(fcg_filter_list[-1] / 2))
        self.pj2 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 2), out_features=int(fcg_filter_list[-1] / 4))
        self.pj3 = torch.nn.Linear(in_features=int(fcg_filter_list[-1] / 4), out_features=6)
        self.last_activation = nn.Softmax(dim=1)

    def forward(self, real_local_batch: Batch, real_bt_positions: list, bt_external_names: list,
                bt_all_function_edges: list):
        rtn_local_batch = self.forward_cfg_gnn(local_batch=real_local_batch)
        x_cfg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_local_batch.x, batch=rtn_local_batch.batch)
        fcg_list = []
        fcg_internal_list = []
        for idx_batch in range(len(real_bt_positions) - 1):
            start_pos, end_pos = real_bt_positions[idx_batch: idx_batch + 2]
            idx_x_cfg = x_cfg_pool[start_pos: end_pos]
            fcg_internal_list.append(idx_x_cfg)
            idx_x_external = self.external_embedding_layer(
                torch.tensor([bt_external_names[idx_batch]], dtype=torch.long))
            idx_x_external = idx_x_external.squeeze(dim=0)
            idx_x_total = torch.cat([idx_x_cfg, idx_x_external], dim=0)
            idx_function_edge = torch.tensor(bt_all_function_edges[idx_batch], dtype=torch.long)
            idx_graph_data = Data(x=idx_x_total, edge_index=idx_function_edge)
            idx_graph_data.validate()
            fcg_list.append(idx_graph_data)
        fcg_batch = Batch.from_data_list(fcg_list)
        # Hierarchical 2: Function Call Graph (FCG) embedding and pooling
        rtn_fcg_batch = self.forward_fcg_gnn(function_batch=fcg_batch)  # [batch_size, max_node_size, dim]
        x_fcg_pool = torch_geometric.nn.glob.global_max_pool(x=rtn_fcg_batch.x, batch=rtn_fcg_batch.batch)
        batch_final = x_fcg_pool
        # last step: project down to the number of classes (multiclass)
        bt_final_embed = self.pj3(self.pj2(self.pj1(batch_final)))
        bt_pred = self.last_activation(bt_final_embed)
        return bt_pred

    def forward_cfg_gnn(self, local_batch: Batch):
        in_x, edge_index = local_batch.x, local_batch.edge_index
        for i in range(self.cfg_filter_length - 1):
            out_x = getattr(self, 'CFG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
            out_x = torch.nn.functional.relu(out_x, inplace=True)
            out_x = self.dropout(out_x)
            in_x = out_x
        local_batch.x = in_x
        return local_batch

    def forward_fcg_gnn(self, function_batch: Batch):
        in_x, edge_index = function_batch.x, function_batch.edge_index
        for i in range(self.fcg_filter_length - 1):
            out_x = getattr(self, 'FCG_gnn_{}'.format(i + 1))(x=in_x, edge_index=edge_index)
            out_x = torch.nn.functional.relu(out_x, inplace=True)
            out_x = self.dropout(out_x)
            in_x = out_x
        function_batch.x = in_x
        return function_batch
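A hedged driver sketch for the model above. The vocab stub, sizes, and inputs are illustrative assumptions, and it presumes a PyTorch Geometric version where torch_geometric.nn.glob and Data.validate() are both available, as the file itself does.

# Hypothetical driver for HierarchicalGraphNeuralNetwork; shapes follow the model
# above (11-dim block features, 200-dim hidden states, 6 output classes).
import torch
from torch_geometric.data import Batch, Data

class StubVocab:          # stand-in for the project's Vocab class (assumed)
    max_vocab_size = 1000
    pad_idx = 0

model = HierarchicalGraphNeuralNetwork(external_vocab=StubVocab())

# Two internal functions, each a tiny CFG with 11-dim block features.
cfg_a = Data(x=torch.rand(3, 11), edge_index=torch.tensor([[0, 0], [1, 2]]))
cfg_b = Data(x=torch.rand(2, 11), edge_index=torch.tensor([[0], [1]]))
batch = Batch.from_data_list([cfg_a, cfg_b])

pred = model(
    real_local_batch=batch,
    real_bt_positions=[0, 2],           # one sample owning CFGs 0..1
    bt_external_names=[[1, 2, 3]],      # vocab indices of its external functions
    bt_all_function_edges=[[[0, 1, 2], [1, 3, 4]]],  # FCG edges over 2 internal + 3 external nodes
)
print(pred.shape)  # -> torch.Size([1, 6])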
@@ -1,9 +1,3 @@
-import copy
-import networkx as nx
-from idautils import *
-from idaapi import *
-from idc import *
-
 import copy
 import networkx as nx
 from idautils import *
@@ -99,11 +93,11 @@ def filtering(cfg):
         bb_start = bb[0]
         bb_end = bb[1]
         re = remove(bb_start, bb_end)
-        print bb_id, re, bb_start, bb_end
+        print(bb_id, re, bb_start, bb_end)
         if re:
-            print re, bb_id
+            print(re, bb_id)
             rm_sets.append(bb_id)
-    print rm_sets
+    print(rm_sets)
     for bb_id in rm_sets:
         cfg.remove_node(bb_id)

@@ -160,16 +154,16 @@ def attributingRe(cfg, externs_eas, ea_externs):
 def attributing(cfg):
     ga = graph_analysis()
     ga.gwithoffspring(cfg)
-    print "finishing offspring"
+    print("finishing offspring")
     for node in cfg:
         stmt_num = getStmtNum(node)
         binary_value = getBinaryValue(node)
         cfg.node[node]['stmt_num'] = stmt_num
         cfg.node[node]['binary_value'] = binary_value
     ga.domChecking(cfg)
-    print "finishing domChecking"
+    print("finishing domChecking")
     ga.loopChecking(cfg)
-    print "finishing loopChecking"
+    print("finishing loopChecking")


@@ -190,17 +184,17 @@ def getBinaryValue(node):
     for x in xrange((inst_addr - start)-1):
         addr = start + x
         y = GetOriginalByte(addr)
-        print value, addr, y
+        print(value, addr, y)
         value = value | y
         value = value << 8
-        print value
+        print(value)

     addr = inst_addr - 1
     y = GetOriginalByte(addr)
-    print value, addr, y
+    print(value, addr, y)
     value = value | y
-    print node
-    print bin(value)
+    print(node)
+    print(bin(value))
     return value

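The hunks above are a mechanical Python 2 to 3 migration of print statements. The stock 2to3 fixer produces the same rewrite; the file name below is a stand-in, since the diff header does not name the file:

2to3 -f print -w cfg_constructor.py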
Binary file not shown.
236
Genius3/raw-feature-extractor/convert_pkl_to_json.py
Normal file
236
Genius3/raw-feature-extractor/convert_pkl_to_json.py
Normal file
@ -0,0 +1,236 @@
|
||||
# coding=utf-8
|
||||
import pickle as pk
|
||||
import re
|
||||
import json
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def convert(start, end, overhaul):
|
||||
for workflow in range(start, end):
|
||||
# workflow = 0
|
||||
cfg_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_cfg".format(workflow)
|
||||
output_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)
|
||||
dot_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot".format(workflow)
|
||||
|
||||
log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_log{}.log".format(workflow)
|
||||
process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
|
||||
|
||||
if overhaul:
|
||||
if os.path.exists(log_path):
|
||||
os.remove(log_path)
|
||||
if os.path.exists(process_log_path):
|
||||
os.remove(process_log_path)
|
||||
|
||||
with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
|
||||
logged = log.readline()
|
||||
if logged == '':
|
||||
log_index = 0
|
||||
else:
|
||||
log_index = int(logged)
|
||||
|
||||
for index, cfg in enumerate(tqdm(os.listdir(cfg_dir))):
|
||||
if index < log_index:
|
||||
continue
|
||||
|
||||
name = cfg[:-4] # 纯文件名,不带后缀
|
||||
cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
|
||||
try:
|
||||
data = pk.load(cfg_file)
|
||||
except EOFError:
|
||||
process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
|
||||
continue
|
||||
except ValueError:
|
||||
process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
|
||||
continue
|
||||
finally:
|
||||
cfg_file.close()
|
||||
|
||||
dot_file_path = os.path.join(dot_dir, name + '.dot')
|
||||
if not os.path.exists(dot_file_path):
|
||||
process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
|
||||
else:
|
||||
# 打开dot文件获取fcg
|
||||
raw_function_edges = []
|
||||
# 2023.8.12 bug fix: ida生成的fcg(.dot)文件包含了所有函数,data.raw_graph_list仅包含了内部函数
|
||||
functions_list = []
|
||||
with open(dot_file_path, 'r') as dot:
|
||||
for line in dot:
|
||||
if '->' in line:
|
||||
raw_function_edges.append(re.findall(r'\b\d+\b', line))
|
||||
elif 'label' in line:
|
||||
functions_list.append(line[line.find('= "') + 3:line.find('",')])
|
||||
|
||||
# 没有内部函数被检测到,正常来说不应该,保险起见还是不要这数据了
|
||||
if raw_function_edges.__len__() == 0:
|
||||
continue
|
||||
|
||||
# 为当前pe文件创建json对象
|
||||
json_obj = {
|
||||
'hash': data.binary_name[11:],
|
||||
# 2023.8.12 bug fix: 这里获取的是内部函数的数量
|
||||
# 'function_number': data.raw_graph_list.__len__(),
|
||||
'function_number': len(functions_list),
|
||||
'function_edges': [[int(d[0]) for d in raw_function_edges],
|
||||
[int(d[1]) for d in raw_function_edges]],
|
||||
'acfg_list': [],
|
||||
'function_names': functions_list
|
||||
}
|
||||
|
||||
# 2023.8.12 bug fix: data.raw_graph_list是ida检测到的内部函数,不包括外部函数,因此函数列表和函数数量不能从这里获取
|
||||
# 读取pkl文件,一个acfg由一个函数分解而来
|
||||
for acfg in data.raw_graph_list:
|
||||
# 函数为外部函数,不需要构建cfg
|
||||
if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
|
||||
continue
|
||||
|
||||
# 这里2是因为Genius框架提取特征时将后代数量放在2
|
||||
offspring = [d.get('v')[2] for d in acfg.g.node.values()]
|
||||
# 这边可能会出现不知名的原因两个数组长度不一致,按理来说应该是一致的
|
||||
# 以框架为主,将bb_features数组削减为和g.node长度一致
|
||||
diff = acfg.g.__len__() - len(acfg.bb_features)
|
||||
if diff != 0:
|
||||
del acfg.bb_features[diff:]
|
||||
# 将后代数量的特征放入bb_features中
|
||||
|
||||
for i, offs in enumerate(offspring):
|
||||
acfg.bb_features[i].append(offs)
|
||||
|
||||
acfg_item = {
|
||||
'block_number': acfg.g.__len__(),
|
||||
'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
|
||||
'block_features': acfg.bb_features
|
||||
}
|
||||
|
||||
json_obj['acfg_list'].append(acfg_item)
|
||||
# json_obj['function_names'].append(acfg.funcname)
|
||||
|
||||
# 将结果写入json本地文件
|
||||
result = json.dumps(json_obj, ensure_ascii=False)
|
||||
|
||||
with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
|
||||
out.write(result)
|
||||
|
||||
log.truncate(0)
|
||||
log.seek(0)
|
||||
log.write(str(index))
|
||||
log.flush()
|
||||
process_log.write("index {}, {} process done.\n".format(index, cfg))
|
||||
|
||||
|
||||
def convert_benign(overhaul):
    cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
    dot_dir = "F:\\kkk\\dataset\\benign\\refind_dot"
    output_dir = "F:\\kkk\\dataset\\benign\\refind_jsonl"

    log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_log.log"
    # note: the stray "{}" left over from the per-workflow variant has been dropped here
    process_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_benign_process_log.log"

    if overhaul:
        if os.path.exists(log_path):
            os.remove(log_path)
        if os.path.exists(process_log_path):
            os.remove(process_log_path)

    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
        logged = log.readline()
        if logged == '':
            log_index = 0
        else:
            log_index = int(logged)

        cdg_list = os.listdir(cfg_dir)
        for index, cfg in enumerate(tqdm(cdg_list)):
            if index < log_index:
                continue

            name = cfg[:-4]  # bare file name
            cfg_file = open(os.path.join(cfg_dir, name + '.ida'), 'r')
            try:
                data = pk.load(cfg_file)
            except EOFError:
                process_log.write("index {}, {} process failed. EOFError occurred.\n".format(index, cfg))
                continue
            except ValueError:
                process_log.write("index {}, {} process failed. ValueError occurred.\n".format(index, cfg))
                continue
            except KeyError:
                process_log.write("index {}, {} process failed. KeyError occurred.\n".format(index, cfg))
                continue  # assumed fix: mirror the other handlers so 'data' is never used unbound
            finally:
                cfg_file.close()

            dot_file_path = os.path.join(dot_dir, name + '.dot')
            if not os.path.exists(dot_file_path):
                process_log.write("index {}, {} process failed. dot file not exists.\n".format(index, cfg))
            else:
                # Open the .dot file to get the FCG
                raw_function_edges = []
                # 2023.8.12 bug fix: the FCG (.dot) generated by IDA contains every function,
                # while data.raw_graph_list only contains the internal ones
                functions_list = []
                with open(dot_file_path, 'r') as dot:
                    for line in dot:
                        if '->' in line:
                            raw_function_edges.append(re.findall(r'\b\d+\b', line))
                        elif 'label' in line:
                            functions_list.append(line[line.find('= "') + 3:line.find('",')])

                # No internal function detected; that normally should not happen,
                # so to be safe this sample is dropped
                if raw_function_edges.__len__() == 0:
                    continue

                # Create the json object for the current PE file
                json_obj = {
                    'hash': data.binary_name[11:],
                    # 2023.8.12 bug fix: this only counted the internal functions
                    # 'function_number': data.raw_graph_list.__len__(),
                    'function_number': len(functions_list),
                    'function_edges': [[int(d[0]) for d in raw_function_edges],
                                       [int(d[1]) for d in raw_function_edges]],
                    'acfg_list': [],
                    'function_names': functions_list
                }

                # 2023.8.12 bug fix: data.raw_graph_list holds only the internal functions
                # detected by IDA, not the external ones, so neither the function list nor
                # the function count may be taken from it
                # Read the pkl file; each acfg is derived from one function
                for acfg in data.raw_graph_list:
                    # External function: no CFG needs to be built for it
                    if acfg.funcname != 'start' and acfg.funcname != 'start_0' and 'sub_' not in acfg.funcname:
                        continue

                    # Index 2 because the Genius framework stores the offspring count
                    # at position 2 when extracting features
                    offspring = [d.get('v')[2] for d in acfg.g.node.values()]
                    # For unknown reasons the two arrays can differ in length, although
                    # they should match; trust the framework and trim bb_features to the
                    # same length as g.node
                    diff = acfg.g.__len__() - len(acfg.bb_features)
                    if diff != 0:
                        del acfg.bb_features[diff:]
                    # Append the offspring count to bb_features
                    for i, offs in enumerate(offspring):
                        acfg.bb_features[i].append(offs)

                    acfg_item = {
                        'block_number': acfg.g.__len__(),
                        'block_edges': [[d[0] for d in acfg.g.edges], [d[1] for d in acfg.g.edges]],
                        'block_features': acfg.bb_features
                    }

                    json_obj['acfg_list'].append(acfg_item)
                    # json_obj['function_names'].append(acfg.funcname)

                # Write the result to a local json file
                result = json.dumps(json_obj, ensure_ascii=False)

                with open(os.path.join(output_dir, name + '.jsonl'), 'w') as out:
                    out.write(result)

            log.truncate(0)
            log.seek(0)
            log.write(str(index))
            log.flush()
            process_log.write("index {}, {} process done.\n".format(index, cfg))

if __name__ == '__main__':
    # convert(35, 69)
    convert_benign(False)
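For reference, each .jsonl written above holds a single JSON object per binary whose fields mirror json_obj. A minimal sketch of a reader (the file name is a hypothetical example):

# Minimal sketch: load one converted sample and check its shape.
# The field names follow json_obj above; the path is a placeholder.
import json

with open('VirusShare_example.jsonl', 'r') as f:
    sample = json.load(f)

print(sample['hash'], sample['function_number'])
src, dst = sample['function_edges']  # two parallel lists of function indices
for acfg in sample['acfg_list']:
    # each basic block carries an 11-dim feature vector
    assert acfg['block_number'] == len(acfg['block_features'])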
@@ -1,3 +1,4 @@
# coding=utf-8
#
# Reference Lister
#
@@ -6,10 +7,11 @@
# Implemented with the idautils module
#
import networkx as nx
import cPickle as pickle
import pdb
from graph_analysis_ida import *
from graph_property import *


# import wingdbstub
# wingdbstub.Ensure()

@@ -28,15 +30,40 @@ def get_funcs(ea):
        funcs[funcname].append((start, end))
    return funcs

def get_funcs_for_discoverRe(ea):
    features = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        print funcname
        func = get_func(funcea)
        feature = get_discoverRe_feature(func)
        features[funcname] = feature
    return features

# apparently an unused function
# def get_funcs_for_discoverRe(ea):
#     features = {}
#     for funcea in Functions(SegStart(ea)):
#         funcname = GetFunctionName(funcea)
#         print(funcname)
#         func = get_func(funcea)
#         feature = get_discoverRe_feature(func)
#         features[funcname] = feature
#     return features


# Get the 11-dimensional attribute features of every basic block:
# calls / transfer / arithmetic / logic / compare / mov / termination
# / data declarations / total instructions / string-or-integer constants / offspring count
def get_bb_features(func):
    bb_features = []
    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
    for bl in blocks:
        calls = calCalls(bl)
        transferIns = calTransferIns(bl)
        mathematicsIns = calArithmeticIns(bl)
        logicIns = calLogicInstructions(bl)
        cmpIns = calIns(bl, {'cmp': 1, 'cmps': 1, 'cmpsb': 1, 'cmppd': 1, 'cmpps': 1, 'fcom': 1, 'fcomp': 1, 'fcompp': 1, 'ficom': 1, 'ficomp': 1, 'ptest': 1, 'test': 1})
        movIns = calIns(bl, {'mov': 1, 'movb': 1, 'movw': 1, 'movl': 1, 'movq': 1, 'movabsq': 1, 'push': 1, 'pop': 1, 'lea': 1})
        interruptIns = calIns(bl, {'int1': 1, 'int3': 1, 'into': 1, 'iret': 1, 'iretd': 1, 'iretq': 1})
        declareIns = calIns(bl, {'dw': 1, 'dd': 1, 'db': 1})
        totalIns = calInsts(bl)
        consts = getBBconsts(bl)
        stringOrIntConsts = len(consts[0]) + len(consts[1])
        bb_features.append([calls, transferIns, mathematicsIns, logicIns, cmpIns, movIns,
                            interruptIns, declareIns, totalIns, stringOrIntConsts])
    return bb_features


def get_discoverRe_feature(func, icfg):
    start = func.startEA
@@ -71,10 +98,13 @@ def get_discoverRe_feature(func, icfg):
    features.append(between)

    strings, consts = getfunc_consts(func)
    # 10
    features.append(strings)
    # 11
    features.append(consts)
    return features


def get_func_names(ea):
    funcs = {}
    for funcea in Functions(SegStart(ea)):
@@ -82,6 +112,7 @@ def get_func_names(ea):
        funcs[funcname] = funcea
    return funcs


def get_func_bases(ea):
    funcs = {}
    for funcea in Functions(SegStart(ea)):
@@ -89,6 +120,7 @@ def get_func_bases(ea):
        funcs[funcea] = funcname
    return funcs


def get_func_range(ea):
    funcs = {}
    for funcea in Functions(SegStart(ea)):
@@ -97,6 +129,7 @@ def get_func_range(ea):
        funcs[funcname] = (func.startEA, func.endEA)
    return funcs


def get_func_sequences(ea):
    funcs_bodylist = {}
    funcs = get_funcs(ea)
@@ -111,6 +144,7 @@ def get_func_sequences(ea):
            inst_addr = NextHead(inst_addr)
    return funcs_bodylist


def get_func_cfgs(ea):
    func_cfglist = {}
    i = 0
@@ -120,7 +154,7 @@ def get_func_cfgs(ea):
        if start <= funcea <= end:
            funcname = GetFunctionName(funcea)
            func = get_func(funcea)
            print i
            print(i)
            i += 1
            try:
                icfg = cfg.cfg_construct(func)
@@ -130,6 +164,7 @@ def get_func_cfgs(ea):

    return func_cfglist


def get_section(t):
    base = SegByName(t)
    start = SegByBase(base)
@@ -158,8 +193,9 @@ def get_sequences(start, end):
        inst_addr = NextHead(inst_addr)
    return seq


def get_stack_arg(func_addr):
    print func_addr
    print(func_addr)
    args = []
    stack = GetFrame(func_addr)
    if not stack:
@@ -180,6 +216,7 @@ def get_stack_arg(func_addr):

# pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))


def processDataSegs():
    funcdata = {}
    datafunc = {}
@@ -208,6 +245,7 @@ def processDataSegs():
        cur = NextHead(cur)
    return funcdata, datafunc


def obtainDataRefs(callgraph):
    datarefs = {}
    funcdata, datafunc = processDataSegs()
@@ -218,11 +256,9 @@ def obtainDataRefs(callgraph):
            refs = datafunc[dd]
            refs = list(set(refs))
            if node in datarefs:
                print refs
                print(refs)
                datarefs[node] += refs
                datarefs[node] = list(set(datarefs[node]))
            else:
                datarefs[node] = refs
    return datarefs

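get_bb_features above returns ten counters per basic block; the eleventh dimension (the offspring count) is appended later by the convert step, which reads it from index 2 of each node's 'v' attribute. A toy illustration of that 10 + 1 layout (plain Python, values made up, no IDA required):

# Toy illustration of the per-block feature layout; ordering follows get_bb_features.
bb = [2,   # calls
      3,   # transfer instructions
      1,   # arithmetic instructions
      0,   # logic instructions
      1,   # compare instructions
      5,   # mov-family instructions
      0,   # interrupt instructions
      0,   # data declarations
      14,  # total instructions
      2]   # string or integer constants
offspring = 4          # taken from node attribute 'v'[2] during conversion
bb.append(offspring)   # -> the final 11-dimensional block feature vector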
Binary file not shown.
@@ -11,29 +11,34 @@ from idaapi import *
from idc import *
import networkx as nx
import cfg_constructor as cfg
import cPickle as pickle
import pdb
from raw_graphs import *
#from discovRe_feature.discovRe import *
from discovRe import *

sys.path.append("D:\\hkn\\project_folder\\Gencoding3\\Genius3\\python")
#import wingdbstub
#wingdbstub.Ensure()


def print_obj(obj):
    "Print all attributes of the object"
    # "Print all attributes of the object"
    print(obj.__dict__)


def gt_funcNames(ea):
    funcs = []
    plt_func, plt_data = processpltSegs()
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        if funcname in plt_func:
            print funcname
            print(funcname)
            continue
        funcs.append(funcname)
    return funcs


def get_funcs(ea):
    funcs = {}
    # Get current ea
@@ -52,6 +57,7 @@ def get_funcs(ea):
        funcs[funcname].append((start, end))
    return funcs


# used for the callgraph generation.
def get_func_namesWithoutE(ea):
    funcs = {}
@@ -59,13 +65,14 @@ def get_func_namesWithoutE(ea):
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        if 'close' in funcname:
            print funcea
            print(funcea)
        if funcname in plt_func:
            print funcname
            print(funcname)
            continue
        funcs[funcname] = funcea
    return funcs


# used for the callgraph generation.
def get_func_names(ea):
    funcs = {}
@@ -74,6 +81,7 @@ def get_func_names(ea):
        funcs[funcname] = funcea
    return funcs


def get_func_bases(ea):
    funcs = {}
    plt_func, plt_data = processpltSegs()
@@ -84,6 +92,7 @@ def get_func_bases(ea):
        funcs[funcea] = funcname
    return funcs


def get_func_range(ea):
    funcs = {}
    for funcea in Functions(SegStart(ea)):
@@ -92,6 +101,7 @@ def get_func_range(ea):
        funcs[funcname] = (func.startEA, func.endEA)
    return funcs


def get_unified_funcname(ea):
    funcname = GetFunctionName(ea)
    if len(funcname) > 0:
@@ -99,6 +109,7 @@ def get_unified_funcname(ea):
        funcname = funcname[1:]
    return funcname


def get_func_sequences(ea):
    funcs_bodylist = {}
    funcs = get_funcs(ea)
@@ -113,6 +124,7 @@ def get_func_sequences(ea):
            inst_addr = NextHead(inst_addr)
    return funcs_bodylist


def get_func_cfgs_c(ea):
    # type: (object) -> object
    binary_name = idc.GetInputFile()
@@ -122,16 +134,18 @@ def get_func_cfgs_c(ea):
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        func = get_func(funcea)
        print i
        print(i)
        i += 1
        icfg = cfg.getCfg(func, externs_eas, ea_externs)
        func_f = get_discoverRe_feature(func, icfg[0])
        raw_g = raw_graph(funcname, icfg, func_f)  # build one raw cfg. raw_graph is a python class defined in raw_graphs.py; it holds g (this work's ACFG), old_g (discovRe's acfg) and feature (function-level features plus betweenness)
        bb_f = get_bb_features(func)
        raw_g = raw_graph(funcname, icfg, func_f, bb_f)
        raw_cfgs.append(raw_g)  # raw_graphs is another python class storing a list of raw_graph; also defined in raw_graphs.py
        #print(raw_g.__dict__)
        #print(raw_g)  since raw_graph and raw_graphs are classes, printing one directly only yields <raw_graphs.raw_graphs instance at 0x09888FD0>, not its attributes.  #https://blog.51cto.com/steed/2046408  print_obj, print(obj.__dict__)
    return raw_cfgs


def get_func_cfgs_ctest(ea):
    binary_name = idc.GetInputFile()
    raw_cfgs = raw_graphs(binary_name)
@@ -141,7 +155,7 @@ def get_func_cfgs_ctest(ea):
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        func = get_func(funcea)
        print i
        print(i)
        i += 1
        icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs)
        diffs[funcname] = (icfg, old_cfg)
@@ -150,13 +164,14 @@ def get_func_cfgs_ctest(ea):

    return diffs


def get_func_cfgs(ea):
    func_cfglist = {}
    i = 0
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        func = get_func(funcea)
        print i
        print(i)
        i += 1
        try:
            icfg = cfg.getCfg(func)
@@ -166,6 +181,7 @@ def get_func_cfgs(ea):

    return func_cfglist


def get_func_cfg_sequences(func_cfglist):
    func_cfg_seqlist = {}
    for funcname in func_cfglist:
@@ -187,8 +203,9 @@ def get_sequences(start, end):
        inst_addr = NextHead(inst_addr)
    return seq


def get_stack_arg(func_addr):
    print func_addr
    print(func_addr)
    args = []
    stack = GetFrame(func_addr)
    if not stack:
@@ -206,9 +223,9 @@ def get_stack_arg(func_addr):
        if mName not in args and mName and ' s' not in mName and ' r' not in mName:
            args.append(mName)
    return args

#pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))


def processExternalSegs():
    funcdata = {}
    datafunc = {}
@@ -226,6 +243,7 @@ def processExternalSegs():
        cur = NextHead(cur)
    return funcdata


def processpltSegs():
    funcdata = {}
    datafunc = {}
@@ -273,6 +291,7 @@ def processDataSegs():
        cur = NextHead(cur)
    return funcdata, datafunc


def obtainDataRefs(callgraph):
    datarefs = {}
    funcdata, datafunc = processDataSegs()
@@ -283,7 +302,7 @@ def obtainDataRefs(callgraph):
            refs = datafunc[dd]
            refs = list(set(refs))
            if node in datarefs:
                print refs
                print(refs)
                datarefs[node] += refs
                datarefs[node] = list(set(datarefs[node]))
            else:
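Inside IDA, the extraction above boils down to a couple of calls; this sketch mirrors the call site in preprocessing_ida.py shown later in this diff (IDA 6.x, Python 2; the output path is a placeholder):

# Sketch of the IDAPython call site, mirroring preprocessing_ida.py below.
import pickle
cfgs = get_func_cfgs_c(FirstSeg())          # raw_graphs: one raw_graph per function
pickle.dump(cfgs, open('sample.ida', 'w'))  # hypothetical output path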
Binary file not shown.
24
Genius3/raw-feature-extractor/generate_asm_file.py
Normal file
@@ -0,0 +1,24 @@
# coding=utf-8
from func import *
from idc import *


def generate_asm_file():
    binary_name = idc.GetInputFile()

    # workflow = idc.ARGV[1]

    analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
    analysis_flags &= ~idc.AF_IMMOFF
    idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
    idaapi.autoWait()

    # Generate the .asm file for the PE
    idc.GenerateFile(idc.OFILE_ASM, binary_name + ".asm", 0, idc.BADADDR, 0)

    # Command-line mode still has to open IDA Pro, so close IDA automatically each run
    idc.Exit(0)


if __name__ == '__main__':
    generate_asm_file()
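This script is meant to be driven headlessly; generate_asm_batch_mode in ida_batch.py (later in this diff) builds exactly this command line:

idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\generate_asm_file.py" -oF:\iout <pe_path>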
@@ -1,3 +1,4 @@
# coding=utf-8
from idautils import *
from idaapi import *
from idc import *
@@ -138,7 +139,7 @@ def get_stackVariables(func_addr):
    return len(args)


# Count arithmetic instructions
def calArithmeticIns(bl):
    x86_AI = {'add':1, 'sub':1, 'div':1, 'imul':1, 'idiv':1, 'mul':1, 'shl':1, 'dec':1, 'inc':1}
    mips_AI = {'add':1, 'addu':1, 'addi':1, 'addiu':1, 'mult':1, 'multu':1, 'div':1, 'divu':1}
@@ -156,6 +157,7 @@ def calArithmeticIns(bl):
        inst_addr = NextHead(inst_addr)
    return invoke_num

# Count call instructions
def calCalls(bl):
    calls = {'call':1, 'jal':1, 'jalr':1}
    start = bl[0]
@@ -169,6 +171,7 @@ def calCalls(bl):
        inst_addr = NextHead(inst_addr)
    return invoke_num

# Count instructions
def calInsts(bl):
    start = bl[0]
    end = bl[1]
@@ -196,7 +199,23 @@ def calLogicInstructions(bl):
        inst_addr = NextHead(inst_addr)
    return invoke_num


# Count occurrences of the given instruction mnemonics within a basic block
def calIns(bl, inst):
    calls = {}
    calls.update(inst)
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        opcode = GetMnem(inst_addr)
        if opcode in calls:
            invoke_num += 1
        inst_addr = NextHead(inst_addr)
    return invoke_num

def calSconstants(bl):
    calls = {}
    start = bl[0]
    end = bl[1]
    invoke_num = 0
Binary file not shown.
Binary file not shown.
200
Genius3/raw-feature-extractor/ida_batch.py
Normal file
@@ -0,0 +1,200 @@
# coding=utf-8
import re
import os
import subprocess
import multiprocessing
from tqdm import tqdm
import time

# Timeout per PE file, in seconds.
# Over repeated passes only a few files in a batch time out; once a full pass is done,
# retry those files with a longer timeout, and give up if they still time out.
TIMEOUT = 60

# Maximum number of samples processed per family
MAX_FAMILY_PROCESS_NUM = 200


def call_preprocess(cmd_line):
    subprocess.call(cmd_line, shell=True)


# Benign-software analysis mode: the workflow argument in the IDA command is set to -1
def benign_batch_mode(overhaul):
    # total number of failed samples
    total_failed = 0

    log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log_benign.log'
    process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log_benign.log'
    benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind'

    if overhaul:
        if os.path.exists(log_path):
            os.remove(log_path)
        if os.path.exists(process_log_path):
            os.remove(process_log_path)

    with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log:
        logged = log.readline()
        if logged == '':
            log_index = 0
        else:
            log_index = int(logged)

        pe_list = os.listdir(benign_pe_dir)
        for index, pe in enumerate(tqdm(sorted(pe_list))):
            if index < log_index:
                continue

            cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py -1" -oF:\iout {}'.format(
                os.path.join(benign_pe_dir, pe))

            p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
            p.start()
            flag_kill = True
            start = time.time()
            while time.time() - start <= TIMEOUT:
                if not p.is_alive():
                    flag_kill = False
                    break
                else:
                    time.sleep(1)

            if flag_kill:
                subprocess.call('taskkill /im idaq64.exe /f')
                process_log.write(
                    "index {}, {} stuck, process terminated.\n".format(index, pe))
                total_failed += 1
            else:
                # finished normally
                log.truncate(0)
                log.seek(0)
                log.write(str(index))
                log.flush()
                process_log.write("index {}, {} process done.\n".format(index, pe))
        # delete all by-products
        delete_output()

    print('total failures: {}'.format(total_failed))


def mal_batch_mode(start, end, overhaul):
    # Only PEs from these families are analyzed; all others are skipped outright
    families_need_to_analyze = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
                                'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
    # Records which families the samples that failed IDA processing came from
    failed_family = {'wacatac': 0, 'glupteba': 0, 'ulpm': 0, 'fugrafa': 0, 'tiggre': 0,
                     'redcap': 0, 'generickdz': 0, 'berbew': 0, 'agenttesla': 0, 'lazy': 0}
    # total number of failed samples
    total_failed = 0

    for workflow in range(start, end):
        # pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_test'
        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
        family_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)
        log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_log{}.log'.format(workflow)
        process_log_path = 'D:\\hkn\\infected\\datasets\\logging\\ida_process_log{}.log'.format(workflow)

        if overhaul:
            if os.path.exists(log_path):
                os.remove(log_path)
            if os.path.exists(process_log_path):
                os.remove(process_log_path)

        with open(log_path, 'a+') as log, open(process_log_path, 'a+') as process_log, open(family_path,
                                                                                            'r') as family_file:
            logged = log.readline()
            if logged == '':
                log_index = 0
            else:
                log_index = int(logged)

            families = family_file.read()
            for index, pe in enumerate(tqdm(sorted(os.listdir(pe_dir)))):
                if index < log_index:
                    continue

                # Match the file md5 and fetch that md5's family from the family file
                regex = re.compile(pe[11:] + r'[\t][\S]*')
                search_result = regex.findall(families)
                if len(search_result) == 0:
                    continue

                pe_family = search_result[0].split()[1]
                if pe_family not in families_need_to_analyze:
                    continue

                # FOR TEST ONLY
                # cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
                #     workflow, os.path.join(pe_dir, pe))
                cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py {}" -oF:\iout {}'.format(
                    workflow, os.path.join(pe_dir, pe))

                p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
                p.start()
                flag_kill = True
                start = time.time()  # note: shadows the parameter; range(start, end) was already evaluated
                while time.time() - start <= TIMEOUT:
                    if not p.is_alive():
                        flag_kill = False
                        break
                    else:
                        time.sleep(1)

                if flag_kill:
                    subprocess.call('taskkill /im idaq64.exe /f')
                    process_log.write(
                        "index {}, {} in workflow {} stuck, process terminated.\n".format(index, pe, workflow))
                    failed_family[pe_family] += 1
                    total_failed += 1
                else:
                    # finished normally
                    log.truncate(0)
                    log.seek(0)
                    log.write(str(index))
                    log.flush()
                    process_log.write("index {}, {} process done.\n".format(index, pe))
                    families_need_to_analyze[pe_family] += 1
            # after each workflow, delete all by-products
            delete_output()

    print(families_need_to_analyze)
    print('\n')
    print(failed_family, 'total failures: {}'.format(total_failed))


def delete_output():
    out_dir = 'F:\\iout'
    for f in os.listdir(out_dir):
        if os.path.exists(os.path.join(out_dir, f)):
            os.remove(os.path.join(out_dir, f))


def generate_asm_batch_mode():
    pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
    pe_list = os.listdir(pe_dir)
    for pe in tqdm(pe_list):
        cmd_line = r'idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\generate_asm_file.py" -oF:\iout {}'.format(
            os.path.join(pe_dir, pe))

        p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
        p.start()
        while True:
            if not p.is_alive():
                break
            else:
                time.sleep(1)

    delete_output()


# Note: this .py file must sit in the IDA root directory and must be run from cmd,
# otherwise it cannot link against the Python libraries.
# F:\\kkk\\IDA_6.6
if __name__ == '__main__':
    benign_batch_mode(True)
    # mal_batch_mode(35, 69, True)
    # generate_asm_batch_mode()
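The poll-and-kill watchdog is duplicated verbatim in benign_batch_mode and mal_batch_mode; it could be factored into one helper. A minimal sketch under the same assumptions (Windows, idaq64 on PATH, call_preprocess as defined above):

# Minimal sketch of the watchdog used above, factored into a helper.
import multiprocessing
import subprocess
import time

def run_with_timeout(cmd_line, timeout):
    # Run cmd_line in a child process; kill stray idaq64 instances on timeout.
    # Returns True when the command finished in time.
    p = multiprocessing.Process(target=call_preprocess, args=[cmd_line])
    p.start()
    deadline = time.time() + timeout
    while time.time() <= deadline:
        if not p.is_alive():
            return True
        time.sleep(1)  # avoid hogging the CPU while polling
    subprocess.call('taskkill /im idaq64.exe /f')
    return False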
@@ -1,56 +1,54 @@
# -*- coding: UTF-8 -*-
import sys

import pickle
from func import *
from raw_graphs import *
from idc import *
import os
import argparse
import raw_graphs

def print_obj(obj):
    "Print all attributes of the object"
    print(obj.__dict__)

def parse_command():
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument("--path", type=str, help="The directory where to store the generated .ida file")
    args = parser.parse_args()
    return args

if __name__ == '__main__':
def preprocess():
    # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius3\acfgs" hpcenter
    # print str(sys.argv)  #['raw-feature-extractor/preprocessing_ida.py']
    # print str(idc.ARGV)  #['raw-feature-extractor/preprocessing_ida.py', '--path', 'C:\\Program1\\pycharmproject\\Genius3\\acfgs']
    # print idc.ARGV[2]
    # print type(idc.ARGV[2])

    # E:\BaiduNetdiskDownload\IDA_Pro_v6.8\IDA_Pro_v6.8\idaq.exe -c -A -S"raw-feature-extractor/preprocessing_ida.py --path C:\Program1\pycharmproject\Genius4\acfgs" hpcenter
    # Measure how long raw-feature generation takes.
    start_t = time.clock()
    binary_name = idc.GetInputFile()

    workflow = idc.ARGV[1]
    # A sentinel workflow value means benign software; anything else means malware
    if workflow == '-1':
        cfg_path = "D:\\bishe\\dataset\\benign\\refind_cfg\\{}.ida".format(binary_name)
        gdl_path = "D:\\bishe\\dataset\\benign\\refind_dot\\{}.dot".format(binary_name)
        asm_path = "D:\\bishe\\dataset\\benign\\refind_asm\\{}.asm".format(binary_name)
    else:
        cfg_path = "D:\\bishe\\dataset\\infected\\infected_cfg\\{}.ida".format(binary_name)
        gdl_path = "D:\\bishe\\dataset\\infected\\infected_dot\\{}.dot".format(binary_name)
        asm_path = "D:\\bishe\\dataset\\infected\\infected_asm\\{}.asm".format(binary_name)

    args = parse_command()
    #path = args.path
    path = idc.ARGV[2]
    analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
    analysis_flags &= ~idc.AF_IMMOFF
    # turn off "automatically make offset" heuristic
    idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
    idaapi.autoWait()

    # Generate the PE file's list of CFGs
    cfgs = get_func_cfgs_c(FirstSeg())
    # Save the cfgs as .ida
    pickle.dump(cfgs, open(cfg_path, 'w'))

    end_t = time.clock()
    print (end_t - start_t)  # 1.5934438s hpcenter 83.4 KB  #35.6745299s SCGDW698 5.5mb  #14.1480888s 762kb SCMQTTIot; this covers both IDA's analysis of the binary and the script's raw-feature generation
    # Presumably grows linearly with the number of functions and basic blocks; not measured further. IDA's own binary analysis probably dominates.
    # Generate the PE's FCG and save it as a .dot file
    # idc.GenCallGdl(gdl_path, 'Call Gdl', idc.CHART_GEN_GDL) emits a .gdl file, a format barely documented anywhere
    idc.GenCallGdl(gdl_path, 'Call Gdl', idaapi.CHART_GEN_DOT)

    binary_name = idc.GetInputFile() + '.ida'
    print path
    print binary_name
    fullpath = os.path.join(path, binary_name)
    pickle.dump(cfgs, open(fullpath,'w'))
    #print binary_name
    # Generate the .asm file
    idc.GenerateFile(idc.OFILE_ASM, asm_path, 0, idc.BADADDR, 0)

    # Close IDA Pro
    idc.Exit(0)


# With this line, IDA exits as soon as the script finishes
#idc.Exit(0)
# Generic command-line format: idaq64 -c -A -S"preprocessing_ida.py arg1 arg2" VirusShare_bca58b12923073
# Here we use idaq64 -c -A -S"preprocessing_ida.py workflow" -oF:\iout pe_path; full command lines below
# F:\kkk\IDA_6.6\idaq64 -c -A -S"D:\hkn\project_folder\Gencoding3\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oF:\iout D:\hkn\infected\datasets\virusshare_infected0\VirusShare_bc161e5e792028e8137aa070fda53f82
# D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out D:\bishe\dataset\train_malware\0ACDbR5M3ZhBJajygTuf
if __name__ == '__main__':
    preprocess()
Binary file not shown.
@@ -2,24 +2,26 @@
import itertools
import sys

sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
sys.path.insert(1, 'C:/Python27/Lib/site-packages')
# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
# sys.path.insert(1, 'C:/Python27/Lib/site-packages')

import networkx as nx
#import numpy as np
import numpy as np
from subprocess import Popen, PIPE
import pdb
import os
import re,mmap
import re
import mmap
# from graph_edit_new import *

class raw_graph:
    def __init__(self, funcname, g, func_f):
    def __init__(self, funcname, g, func_f, bb_f):
        #print "create"
        self.funcname = funcname
        self.old_g = g[0]
        self.g = nx.DiGraph()
        self.entry = g[1]
        self.bb_features = bb_f  # len = number of basic blocks; each element is an 11-dim vector
        self.fun_features = func_f
        self.attributing()

@@ -54,6 +56,9 @@ class raw_graph:
                offsprings[suc] = 1
                self.getOffsprings(g, suc, offsprings)

    # Extract the acfg's attribute features:
    # calls / transfer / arithmetic / logic / compare / mov / termination
    # / data declarations / total instructions / string-or-integer constants / offspring count
    def retrieveVec(self, id_, g):
        feature_vec = []
        #numC0
@@ -96,7 +101,7 @@ class raw_graph:

    def genMotifs(self, n):
        motifs = {}
        subgs = enumerating(n)
        subgs = self.enumerating(n)
        for subg in subgs:
            if len(motifs) == 0:
                motifs[subg] = [subg]
@@ -182,7 +187,7 @@ class raw_graph:
            tg.updateG(fang, indexes, self.g)
            return tg
        pdb.set_trace()
        print "there is g which is none"
        print("there is g which is none")

    def createG(self, binary_str, n):
        g = nx.DiGraph()
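The constructor change above is what ties the pipeline together: raw_graph now carries the per-block vectors produced by get_bb_features alongside the function-level discovRe features. A hedged sketch of the call pattern (names as in this diff; the graph tuple g is whatever cfg.getCfg returned):

# Sketch: how the new 4-argument constructor is fed (see get_func_cfgs_c above).
icfg = cfg.getCfg(func, externs_eas, ea_externs)   # (graph, entry)
func_f = get_discoverRe_feature(func, icfg[0])     # 11 function-level features
bb_f = get_bb_features(func)                       # 10 counters per basic block
raw_g = raw_graph(funcname, icfg, func_f, bb_f)    # bb_features stored on the instance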
Binary file not shown.
@@ -1,24 +1,25 @@
# -*- coding: UTF-8 -*-
import sys
import sys
from matplotlib import pyplot as plt
sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
sys.path.insert(1, 'C:/Python27/Lib/site-packages')
import networkx as nx
import pickle
# sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
# sys.path.insert(1, 'C:/Python27/Lib/site-packages')


def print_obj(obj):
    "Print all attributes of the object"
    # "Print all attributes of the object"
    print(obj.__dict__)

import pickle

# sub_10F20 308: the decompiled code contains strings, yet this feature extraction records no string constant; they are probably referenced indirectly and go unrecognized. Looking over every function's features, almost none carry string constants, so they are likely defined elsewhere and merely referenced.
# sub_166C4 393
if __name__ == '__main__':

    testpath = "C:\Program1\pycharmproject\Genius3/acfgs/hpcenter.ida"
    testpath = "D:\\hkn\\infected\\datasets\\virusshare_infected23_cfg\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.ida"
    fr = open(testpath, 'r')
    data1 = pickle.load(fr)  # one binary's acfgs
    data = pickle.load(fr)  # one binary's acfgs
    fr.close()

    # print(type(data1))
    # print_obj(data1)
    # print data1.raw_graph_list[393]
@@ -26,17 +27,16 @@ if __name__ == '__main__':
    # nx.draw(data1.raw_graph_list[393].g,with_labels=True)
    # plt.show()

    print "the raw features of all functions of one binary, a list."
    print_obj(data1)  # acfg list
    print "\n"
    print("the raw features of all functions of one binary, a list.")
    print_obj(data)  # acfg list
    print("\n")

    print "one function's raw features, made up of old_g (discovRe-style ACFG), g (Genius-style ACFG) and fun_feature (a vector of function-level features)"
    print_obj(data1.raw_graph_list[393])  # one function's acfg
    print "\n"
    feature = data1.raw_graph_list[393].fun_features
    print "function-level features: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks # 6 Edges # 7 IncommingCalls # 8 Intrs # 9 between # 10 strings # 11 consts"
    print feature
    print "\n"
    print("one function's raw features, made up of old_g (discovRe-style ACFG), g (Genius-style ACFG) and fun_feature (a vector of function-level features)")
    print_obj(data.raw_graph_list[0])  # one function's acfg
    print("where fun_features = function-level features: # 1 function calls # 2 logic instructions # 3 TransferIns # 4 LocalVariables # 5 BB basicblocks # 6 Edges # 7 IncommingCalls # 8 Intrs # 9 between # 10 strings # 11 consts")
    # feature = data.raw_graph_list[0].fun_features
    print("old_g:{}".format(data.raw_graph_list[0].old_g))
    print("g:{}".format(data.raw_graph_list[0].g))


    # G = data1.raw_graph_list[393].old_g
@@ -44,26 +44,27 @@ if __name__ == '__main__':
    # for key, value in G.node[0].items():
    #     print('{key}:{value}'.format(key=key, value=value))

    # basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count? #4 'numAs' arithmetic instructions e.g. INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions e.g. AND #8 'numTIs' transfer instructions
    G = data1.raw_graph_list[393].g
    print "# basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count? #4 'numAs' arithmetic instructions e.g. INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions e.g. AND #8 'numTIs' transfer instructions"
    print G.node[0]
    print "\n"
    # for key, value in G.node[0].items():
    #     print('{key}:{value}'.format(key=key, value=value))
    # basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions e.g. INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions e.g. AND #8 'numTIs' transfer instructions
    G = data.raw_graph_list[0].g
    print("# basic-block features: #1 'consts' numeric constants #2 'strings' string constants #3 'offs' offspring count #4 'numAs' arithmetic instructions e.g. INC #5 'numCalls' call instructions #6 'numIns' instruction count #7 'numLIs' logic instructions e.g. AND #8 'numTIs' transfer instructions")
    # print(G.node[0])
    # print("\n")
    # features of every basic block in the function
    for key, value in G.node.items():
        print('{}:{}'.format(key, value))


    # old_g is read straight from IDA's CFG, so node counts, edge directions etc. match; g is generated from old_g and matches as well
    # old g
    G = data1.raw_graph_list[393].old_g
    G = data.raw_graph_list[0].old_g
    nx.draw(G, with_labels=True)
    #plt.title('old_g')
    plt.show()


    # g
    G = data1.raw_graph_list[393].g
    G = data.raw_graph_list[0].g
    nx.draw(G, with_labels=True)
    #plt.title('Genius_g')
    plt.show()
@@ -1,8 +1,380 @@
# coding=utf-8
import re
import os
import subprocess
import time
import json
import random
import shutil
from tqdm import tqdm
import csv
import pandas as pd

import pickle
testpath = "C:\Program1\pycharmproject\Genius3/acfgs/hpcenter.ida"
fr = open(testpath, 'r')
data1 = pickle.load(fr)
print(type(data1))
# # print_obj(data1)
# print cfgs.raw_graph_list[0]

def create_dir():
    parent_dir = "D:\\hkn\\infected\\datasets"
    for workflow in range(40, 70):
        # create the raw-data folders
        infected = "virusshare_infected{}".format(workflow)
        cfg = "virusshare_infected{}_cfg".format(workflow)
        dot = "virusshare_infected{}_dot".format(workflow)
        jsonl = "virusshare_infected{}_json".format(workflow)
        create(parent_dir, infected)
        create(parent_dir, cfg)
        create(parent_dir, dot)
        create(parent_dir, jsonl)
        # iout = "virusshare_infected{}_iout".format(workflow)
        # os.rmdir(os.path.join(parent_dir, iout))
        # os.rmdir(os.path.join(parent_dir, ida))


def create(parent_dir, folder):
    if not os.path.exists(os.path.join(parent_dir, folder)):
        os.mkdir(os.path.join(parent_dir, folder))


def change_max_item_lines():
    f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'rb')
    s = f.read()
    f.close()
    index = s.find(b'MAX_ITEM_LINES = 5000')
    news = s.replace(b'MAX_ITEM_LINES = 5000', b'MAX_ITEM_LINES = 50000')
    # print(news[index:index+50])
    f = open("F:\\kkk\\IDA_6.6\\cfg\\ida.cfg", 'wb')
    f.write(news)
    f.close()


def clock():
    # NOTE: relies on a Process object 'p' from the enclosing scope; kept as a snippet
    TIMEOUT = 10
    start = time.time()
    flag_kill = True
    while time.time() - start <= TIMEOUT:
        if not p.is_alive():
            flag_kill = False
            break
        else:
            time.sleep(1)  # Just to avoid hogging the CPU

    if flag_kill:
        subprocess.call('taskkill /im idaq64.exe /f')


def delete_error():
    for workflow in range(0, 35):
        convert_log_path = "D:\\hkn\\infected\\datasets\\logging\\convert_process_log{}.log".format(workflow)
        json_dir = "D:\\hkn\\infected\\datasets\\virusshare_infected{}_json".format(workflow)

        with open(convert_log_path, 'r') as log:
            for line in log:
                if 'Error occurred' in line:
                    name = line[line.find(',') + 2: line.find('.')] + '.jsonl'
                    # print(os.path.join(json_dir, name))
                    if os.path.exists(os.path.join(json_dir, name)):
                        os.remove(os.path.join(json_dir, name))


def check_json():
    print('start checking json')
    for workflow in tqdm(range(0, 69)):
        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for json_file in os.listdir(json_dir):
            f = open(os.path.join(json_dir, json_file), 'r')
            try:
                data = json.load(f)
            except UnicodeDecodeError:
                continue
            finally:
                f.close()

            if len(data['function_edges'][0]) == 0:
                print("{} {} function_edges null\n".format(workflow, json_file))
            # continue
            # for acfg in data['acfg_list']:
            #     if acfg['block_number'] != len(acfg['block_features']):
            #         print("{} {}\n".format(workflow, json_file))


# temporary helper: delete every jsonl file
def delete_jsonl():
    for workflow in range(0, 35):
        json_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for f in os.listdir(json_dir):
            os.remove(os.path.join(json_dir, f))


def delete_all_local():
    data_dirs = ['D:\\hkn\\infected\\datasets\\virusshare_train\\1',
                 'D:\\hkn\\infected\\datasets\\virusshare_train\\2',
                 'D:\\hkn\\infected\\datasets\\virusshare_train\\3',
                 'D:\\hkn\\infected\\datasets\\virusshare_train\\4',
                 'D:\\hkn\\infected\\datasets\\virusshare_train\\5',
                 ]
    for d in data_dirs:
        path = os.listdir(d)
        for f in path:
            os.remove(os.path.join(d, f))


# rename the .pt files to match what the code expects
def rename(mal_or_be, postfix):
    tag_set = ['train', 'test', 'valid']
    for tag in tag_set:
        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
        for index, f in enumerate(os.listdir(data_dir)):
            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, 'm' + f))
    for tag in tag_set:
        data_dir = 'D:/hkn/infected/datasets/proprecessed_pt/{}_{}{}/'.format(tag, mal_or_be, postfix)
        for index, f in enumerate(os.listdir(data_dir)):
            os.rename(os.path.join(data_dir, f), os.path.join(data_dir, '{}_{}.pt'.format(mal_or_be, index)))


def split_data_by_label():
    all = 'D:\\hkn\\infected\\datasets\\virusshare_train\\all_pt'
    dest = 'D:\\hkn\\infected\\datasets\\virusshare_train'
    csv_path = 'F:\\kkk\\dataset\\virusshare_AllLabel.csv'
    with open(csv_path, 'r') as label:
        label.readline()
        labels = label.readlines()
        for lines in labels:
            name, cls = lines.strip().split(',')
            fpath = os.path.join(all, name + '.pt')
            if os.path.exists(fpath):
                shutil.move(fpath, os.path.join(dest, cls))
            else:
                print(fpath, 'file not exist.')


def half_divide():
    src = 'D:\\hkn\\infected\\datasets\\proprecessed_pt'

    test = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware'
    valid = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware'

    flag = True
    for f in os.listdir(src):
        if 'pt' not in f:
            continue
        if flag:
            shutil.copy(os.path.join(src, f), test)
        else:
            shutil.copy(os.path.join(src, f), valid)
        flag = not flag


def copy_train_data():
    all = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\all'
    dest = 'D:\\hkn\\infected\\datasets\\proprecessed_pt\\train_malware'
    train = set(os.listdir(all)) - set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\test_malware')) - set(os.listdir('D:\\hkn\\infected\\datasets\\proprecessed_pt\\valid_malware'))
    for f in train:
        shutil.copy(os.path.join(all, f), dest)


def clear_dot():
    for workflow in range(0, 35):
        path = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_dot\\'.format(workflow)
        for name in os.listdir(path):
            full = os.path.join(path, name)
            f = open(full, 'r')
            data = f.read()
            f.close()
            if 'start' not in data and 'sub_' not in data:
                # print("delete")
                os.remove(full)


def read_test():
    dot_file_path = "D:\\hkn\\infected\\datasets\\virusshare_infected23_dot\\VirusShare_9ba64176b2ca61212ff56a5b4eb546ff.dot"
    with open(dot_file_path, 'r') as dot:
        for line in dot:
            if '->' in line:
                print(re.findall(r'\b\d+\b', line))
            elif 'label' in line:
                print(line[line.find('= "') + 3:line.find('",')])


# temporary tool: some PE files never went through the api classification; delete them outright
def del_redundant():
    for workflow in range(0, 68):
        pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}'.format(workflow)
        family_file_path = 'D:\\hkn\\infected\\datasets\\virusshare_family\\virusshare_family{}.txt'.format(workflow)

        with open(family_file_path, 'r') as f_file:
            family = f_file.read()
            for name in os.listdir(pe_dir):
                if name[11:] in family:
                    continue
                else:
                    # print(name)
                    os.remove(os.path.join(pe_dir, name))


def delete_pe():
    dot_dir = 'D:\\hkn\\infected\\datasets\\benign_dot'
    cfg_dir = 'D:\\hkn\\infected\\datasets\\benign_cfg'
    dot_list = os.listdir(dot_dir)
    for cfg in os.listdir(cfg_dir):
        name = cfg[:-4] + ".dot"
        if name in dot_list:
            continue
        else:
            print(os.path.join(dot_dir, name))
            # os.remove(os.path.join(dot_dir, cfg))


def delete_error_benign():
    jsonl_dir = 'F:\\kkk\\dataset\\benign\\refind_jsonl'
    dot_dir = 'F:\\kkk\\dataset\\benign\\refind_dot'
    cfg_dir = "F:\\kkk\\dataset\\benign\\refind_cfg"
    asm_dir = "F:\\kkk\\dataset\\benign\\refind_asm"
    pe_dir = "F:\\kkk\\dataset\\benign\\refind"
    alist = os.listdir(pe_dir)
    for f in alist:
        if not os.path.exists(os.path.join(jsonl_dir, f + '.jsonl')):
            os.remove(os.path.join(pe_dir, f))
            if os.path.exists(os.path.join(asm_dir, f + '.asm')):
                os.remove(os.path.join(asm_dir, f + '.asm'))
            if os.path.exists(os.path.join(cfg_dir, f + '.ida')):
                os.remove(os.path.join(cfg_dir, f + '.ida'))
            if os.path.exists(os.path.join(dot_dir, f + '.dot')):
                os.remove(os.path.join(dot_dir, f + '.dot'))


def generate_benign_csv():
    benign_pe_dir = 'F:\\kkk\\dataset\\benign\\refind'
    csv_out = 'F:\\kkk\\dataset\\benign_family.csv'
    fieldnames = ['Id', 'Class']
    with open(csv_out, "wb") as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for f in os.listdir(benign_pe_dir):
            writer.writerow({fieldnames[0]: f, fieldnames[1]: '5'})


def process_csv():
    csv_path = 'F:\\kkk\\dataset\\virusshare_AllLabel.csv'
    files = os.listdir('D:\\hkn\\infected\\datasets\\virusshare_train\\pe')
    print(files.__len__())
    df = pd.read_csv(csv_path)  # assumed fix: 'df' was used before being assigned; reading the csv is the evident intent
    df = df[df['Id'].isin(files)]
    df = df.drop_duplicates('Id')
    df['Id'] = 'VirusShare_' + df['Id']
    df.to_csv(csv_path, index=False)


def generate_virusshare_csv():
    index = {'wacatac': 1, 'ulpm': 2, 'fugrafa': 3, 'redcap': 4}
    fieldnames = ['Id', 'Class']
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe'
    family_dir = 'D:\\hkn\\infected\\datasets\\virusshare_family'
    csv_out = 'D:\\hkn\\infected\\datasets\\virusshare_family.csv'
    with open(csv_out, "wb") as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for f in tqdm(os.listdir(family_dir)):
            with open(os.path.join(family_dir, f), 'r') as family:
                lines = family.readlines()
                for line in lines:
                    md5, label = line.strip().split('\t')
                    if label in index:
                        if os.path.exists(os.path.join(pe_dir, 'VirusShare_' + md5)):
                            writer.writerow({fieldnames[0]: 'VirusShare_' + md5, fieldnames[1]: index[label]})


def findlostone():
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\pe'
    asm_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\asm'
    for f in os.listdir(pe_dir):
        if not os.path.exists(os.path.join(asm_dir, f + '.asm')):
            print(f)


def find_pe_in_original_set():
    for workflow in range(0, 69):
        data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
        for f in os.listdir(data_dir):
            if f[:-6] == 'VirusShare_0f07b29873cf503a0fb69fa064ce76a3':
                print(workflow)
                return


def select_jsonl():
    csv_paths = 'F:\\kkk\\dataset\\virusshare_family.csv'
    jsonl_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\malware_jsonl'

    with open(csv_paths, 'r') as csv_path:
        labels = csv.reader(csv_path, delimiter=',')
        data = list(labels)
        for workflow in range(0, 69):
            data_dir = 'D:\\hkn\\infected\\datasets\\virusshare_infected{}_json'.format(workflow)
            for f in os.listdir(data_dir):
                for line in data:
                    if f[:-6] in line:
                        shutil.copy(os.path.join(data_dir, f), jsonl_dir)
                        break


def generate_csv():
    pe_dir = 'D:\\hkn\\infected\\datasets\\virusshare_train\\5\\pe'
    csv_path = 'D:\\hkn\\infected\\datasets\\virusshare_train\\5\\virusshare_5.csv'
    fieldnames = ['Id', 'Class']
    with open(csv_path, "wb") as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for pe in os.listdir(pe_dir):
            writer.writerow({fieldnames[0]: pe, fieldnames[1]: 5})


def merge_csvs(cs, out):
    for i, c in enumerate(cs):
        if i == 0:
            merged = pd.read_csv(c)
        else:
            merged = pd.merge(pd.read_csv(c), merged, on='Id')
            # merged = pd.concat([merged, pd.read_csv(c)])

    # if 'Class' in merged:
    #     merged['Class'] = merged['Class'] - 1
    merged.to_csv(out, index=False)

if __name__ == '__main__':
    # find_pe_in_original_set()
    # split_data_by_label()
    # select_jsonl()
    # findlostone()
    # generate_csv()
    # generate_virusshare_csv()
    # merge_csvs([
    #     'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_1_compliment.csv',
    #     'D:\\hkn\\infected\\datasets\\virusshare_family.csv',
    #     'D:\\hkn\\infected\\datasets\\virusshare_train\\virusshare_5.csv',
    #     ],
    #     'D:\\hkn\\infected\\datasets\\virusshare_family.csv'
    # )
    process_csv()
    # generate_benign_csv()
    # create_pixel_intensity()
    # create_dir()
    # change_max_item_lines()
    # subprocess.call('taskkill /im idaq64.exe /f')
    # delete_error_benign()
    # test()
    # delete_jsonl()
    # delete_all_local()
    # check_json()
    # delete_pe()

    # rename('malware', '_backup')

    # choose 'standard' or 'benign' or 'one_family'
    # standard processes every malicious sample
    # split_samples()
    # one_family processes a single family, only for testing the original model's binary classification
    # split_samples('one_family')
    # benign processes the benign samples
    # split_samples('benign')

    # half_divide()
    # copy_train_data()
    # clear_dot()
    # read_test()
    # del_redundant()
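merge_csvs joins its inputs on the Id column, so every input csv must carry that header; a hypothetical invocation (paths are placeholders, not from the repo):

# Hypothetical usage of merge_csvs: join two label files on 'Id'.
merge_csvs(['D:\\labels_a.csv', 'D:\\labels_b.csv'], 'D:\\merged.csv')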
16
ida_file_cerate.bat
Normal file
@@ -0,0 +1,16 @@
@echo off
setlocal EnableDelayedExpansion


set "FOLDER_PATH=D:\bishe\dataset\train_benign"



for %%f in ("%FOLDER_PATH%\*") do (
    echo !time! %%f
    D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py -1" -oD:\bishe\dataset\out %%f

)

endlocal
16
ida_file_cerate_malware.bat
Normal file
@@ -0,0 +1,16 @@
@echo off
setlocal EnableDelayedExpansion


set "FOLDER_PATH=D:\bishe\dataset\train_malware"



for %%f in ("%FOLDER_PATH%\*") do (
    echo !time! %%f
    D:\IDA_Pro_v6.8\idaq64.exe -c -A -S"D:\bishe\Gencoding_KE\Genius3\raw-feature-extractor\preprocessing_ida.py 0" -oD:\bishe\dataset\out %%f

)

endlocal
@@ -1,286 +0,0 @@
import copy
import networkx as nx
from idautils import *
from idaapi import *
from idc import *

import copy
import networkx as nx
from idautils import *
from idaapi import *
from idc import *
from graph_analysis_ida import *


def getCfg(func, externs_eas, ea_externs):
    func_start = func.startEA
    func_end = func.endEA
    cfg = nx.DiGraph()
    control_blocks, main_blocks = obtain_block_sequence(func)
    i = 0
    visited = {}
    start_node = None
    for bl in control_blocks:
        start = control_blocks[bl][0]
        end = control_blocks[bl][1]
        src_node = (start, end)
        if src_node not in visited:
            src_id = len(cfg)
            visited[src_node] = src_id
            cfg.add_node(src_id)
            cfg.node[src_id]['label'] = src_node
        else:
            src_id = visited[src_node]

        #if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp':
        if start == func_start:
            cfg.node[src_id]['c'] = "start"
            start_node = src_node
        if end == func_end:
            cfg.node[src_id]['c'] = "end"
        #print control_ea, 1
        refs = CodeRefsTo(start, 0)
        for ref in refs:
            if ref in control_blocks:
                dst_node = control_blocks[ref]
                if dst_node not in visited:
                    visited[dst_node] = len(cfg)
                dst_id = visited[dst_node]
                cfg.add_edge(dst_id, src_id)
                cfg.node[dst_id]['label'] = dst_node
        #print control_ea, 1
        refs = CodeRefsTo(start, 1)
        for ref in refs:
            if ref in control_blocks:
                dst_node = control_blocks[ref]
                if dst_node not in visited:
                    visited[dst_node] = len(cfg)
                dst_id = visited[dst_node]
                cfg.add_edge(dst_id, src_id)
                cfg.node[dst_id]['label'] = dst_node
    #print "attributing"
    attributingRe(cfg, externs_eas, ea_externs)
    # removing deadnodes
    #old_cfg = copy.deepcopy(cfg)
    #transform(cfg)
    return cfg, 0

def transform(cfg):
    merging(cfg)
    filtering(cfg)

def merging(cfg):
    bb_ids = cfg.nodes()
    for bb_id in bb_ids:
        try:
            bb = cfg.node[bb_id]['label']
            bb_start = bb[0]
            bb_end = bb[1]
            succs = cfg.successors(bb_id)
            #preds = cfg.predecessors(bb_id)
            if len(succs) == 1:
                preds = cfg.predecessors(succs[0])
                if len(preds) == 1:
                    domerge(cfg, bb_id, succs[0])
        except:
            pass

def domerge(cfg, bb_id, suc_node):
    suc_nodes = cfg.successors(suc_node)
    for node in suc_nodes:
        cfg.add_edge(bb_id, node)
    cfg.remove_node(suc_node)


def filtering(cfg):
    rm_sets = []
    for bb_id in cfg:
        bb = cfg.node[bb_id]['label']
        bb_start = bb[0]
        bb_end = bb[1]
        re = remove(bb_start, bb_end)
        print bb_id, re, bb_start, bb_end
        if re:
            print re, bb_id
            rm_sets.append(bb_id)
    print rm_sets
    for bb_id in rm_sets:
        cfg.remove_node(bb_id)

def remove(bb_start, bb_end):
    seqs = getSequences(bb_start, bb_end)
    if matchseq(seqs):
        return True
    return False

def matchseq(seqs):
    mips = set(['lw', "jr", "addiu"])
    x86 = set(['add', 'pop', 'retn'])
    b_mips = set(['b', ('move','$v0')])
    b_x86 = set(['b', ('mov','$eax')])
    re_mips = set([('move','$v0')])
    re_x86 = set([('mov','$eax')])
    diff_mips = set(seqs).difference(set(mips))
    if len(diff_mips) == 0:
        return True
    diff_x86 = set(seqs).difference(set(x86))
    if len(diff_x86) == 0:
        return True
    if set(seqs) == b_mips:
        return True
    if set(seqs) == b_x86:
        return True
    if set(seqs) == re_mips:
        return True
    if set(seqs) == re_x86:
        return True
    return False


def attributingRe(cfg, externs_eas, ea_externs):
    for node_id in cfg:
        bl = cfg.node[node_id]['label']
        numIns = calInsts(bl)
        cfg.node[node_id]['numIns'] = numIns
        numCalls = calCalls(bl)
        cfg.node[node_id]['numCalls'] = numCalls
        numLIs = calLogicInstructions(bl)
        cfg.node[node_id]['numLIs'] = numLIs
        numAs = calArithmeticIns(bl)
        cfg.node[node_id]['numAs'] = numAs
        strings, consts = getBBconsts(bl)
        cfg.node[node_id]['numNc'] = len(strings) + len(consts)
        cfg.node[node_id]['consts'] = consts
        cfg.node[node_id]['strings'] = strings
        externs = retrieveExterns(bl, ea_externs)
        cfg.node[node_id]['externs'] = externs
        numTIs = calTransferIns(bl)
        cfg.node[node_id]['numTIs'] = numTIs


def attributing(cfg):
    ga = graph_analysis()
    ga.gwithoffspring(cfg)
    print "finishing offspring"
    for node in cfg:
        stmt_num = getStmtNum(node)
        binary_value = getBinaryValue(node)
        cfg.node[node]['stmt_num'] = stmt_num
        cfg.node[node]['binary_value'] = binary_value
    ga.domChecking(cfg)
    print "finishing domChecking"
    ga.loopChecking(cfg)
    print "finishing loopChecking"


def getStmtNum(node):
    start = node[0]
    end = node[1]
    stmt_num = 0
    inst_addr = start
    while inst_addr < end:
        inst_addr = NextHead(inst_addr)
        stmt_num += 1
    return stmt_num

def getBinaryValue(node):
    start = node[0]
    inst_addr = NextHead(start)
    value = 0
    addr = 0
    for x in xrange((inst_addr - start)-1):
        addr = start + x
        y = GetOriginalByte(addr)
        print value, addr, y
        value = value | y
        value = value << 8
        print value

    addr = inst_addr - 1
    y = GetOriginalByte(addr)
    print value, addr, y
    value = value | y
    print node
    print bin(value)
    return value


def cfg_construct(func):
    func_start = func.startEA
    func_end = func.endEA
    cfg = nx.DiGraph()
    seq_blocks, main_blocks = obtain_block_sequence(func)
    i = 0
    visited = {}
    for bl in seq_blocks:
        start = seq_blocks[bl][0]
        end = seq_blocks[bl][1]
        src_node = (start, end)
        if end in seq_blocks and GetMnem(PrevHead(end)) != 'jmp':
            next_start = seq_blocks[end][0]
            next_end = seq_blocks[end][1]
            next_node = (next_start, next_end)
            cfg.add_edge(src_node, next_node)
        if start == func_start:
            cfg.add_node(src_node, c='start')
            start_node = src_node
        if end == func_end:
            cfg.add_node(src_node, c='end')
        refs = CodeRefsFrom(PrevHead(end), 0)

        for ref in refs:
            #print ref
            if ref in seq_blocks:
                dst_node = (seq_blocks[ref][0], seq_blocks[ref][1])
                cfg.add_edge(src_node, dst_node)
    return cfg, start_node


def obtain_allpaths( cfg, node, path, allpaths):
    path.append(node)
    if 'c' in cfg.node[node] and cfg.node[node]['c'] == 'end':
        allpaths.append(path)
        return
    else:
        for suc in cfg.successors(node):
            if suc not in path:
                path_copy = copy.copy(path)
                obtain_allpaths(cfg, suc, path_copy, allpaths)


def obtain_block_sequence(func):
    control_blocks = {}
    main_blocks = {}
    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
    for bl in blocks:
        base = bl[0]
        end = PrevHead(bl[1])
        control_ea = checkCB(bl)
        control_blocks[control_ea] = bl
        control_blocks[end] = bl
        if func.startEA <= base <= func.endEA:
            main_blocks[base] = bl
    x = sorted(main_blocks)
    return control_blocks, x

def checkCB(bl):
    start = bl[0]
    end = bl[1]
    ea = start
    while ea < end:
        if checkCondition(ea):
            return ea
        ea = NextHead(ea)

    return PrevHead(end)

def checkCondition(ea):
    mips_branch = {"beqz":1, "beq":1, "bne":1, "bgez":1, "b":1, "bnez":1, "bgtz":1, "bltz":1, "blez":1, "bgt":1, "bge":1, "blt":1, "ble":1, "bgtu":1, "bgeu":1, "bltu":1, "bleu":1}
    x86_branch = {"jz":1, "jnb":1, "jne":1, "je":1, "jg":1, "jle":1, "jl":1, "jge":1, "ja":1, "jae":1, "jb":1, "jbe":1, "jo":1, "jno":1, "js":1, "jns":1}
    arm_branch = {"B":1, "BAL":1, "BNE":1, "BEQ":1, "BPL":1, "BMI":1, "BCC":1, "BLO":1, "BCS":1, "BHS":1, "BVC":1, "BVS":1, "BGT":1, "BGE":1, "BLT":1, "BLE":1, "BHI":1 ,"BLS":1 }
    conds = {}
    conds.update(mips_branch)
    conds.update(x86_branch)
    opcode = GetMnem(ea)
    if opcode in conds:
        return True
    return False
@ -1,228 +0,0 @@
|
||||
#
# Reference Lister
#
# List all functions and all references to them in the current section.
#
# Implemented with the idautils module
#
import networkx as nx
import cPickle as pickle
import pdb
from graph_analysis_ida import *
from graph_property import *
#import wingdbstub
#wingdbstub.Ensure()

def get_funcs(ea):
    funcs = {}
    # Get current ea
    # Loop from start to end in the current segment
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        func = get_func(funcea)
        blocks = FlowChart(func)
        funcs[funcname] = []
        for bl in blocks:
            start = bl.startEA
            end = bl.endEA
            funcs[funcname].append((start, end))
    return funcs

def get_funcs_for_discoverRe(ea):
    features = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        print funcname
        func = get_func(funcea)
        # NOTE: get_discoverRe_feature (below) takes (func, icfg); this
        # one-argument call will raise a TypeError if it is ever reached.
        feature = get_discoverRe_feature(func)
        features[funcname] = feature
    return features

def get_discoverRe_feature(func, icfg):
    start = func.startEA
    end = func.endEA
    features = []
    FunctionCalls = getFuncCalls(func)
    #1
    features.append(FunctionCalls)
    LogicInstr = getLogicInsts(func)
    #2
    features.append(LogicInstr)
    Transfer = getTransferInsts(func)
    #3
    features.append(Transfer)
    Locals = getLocalVariables(func)
    #4
    features.append(Locals)
    BB = getBasicBlocks(func)
    #5
    features.append(BB)
    Edges = len(icfg.edges())
    #6
    features.append(Edges)
    Incoming = getIncommingCalls(func)
    #7
    features.append(Incoming)
    #8
    Instrs = getIntrs(func)
    features.append(Instrs)
    between = retrieveGP(icfg)
    #9
    features.append(between)

    strings, consts = getfunc_consts(func)
    features.append(strings)
    features.append(consts)
    return features
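For reference, the order of the discovRe-style function features assembled above, as read directly from the appends:

    # index : feature
    #  0 : # of calls            (getFuncCalls)
    #  1 : # of logic instrs     (getLogicInsts)
    #  2 : # of transfer instrs  (getTransferInsts)
    #  3 : # of local variables  (getLocalVariables)
    #  4 : # of basic blocks     (getBasicBlocks)
    #  5 : # of CFG edges        (len(icfg.edges()))
    #  6 : # of incoming calls   (getIncommingCalls)
    #  7 : # of instructions     (getIntrs)
    #  8 : mean betweenness      (retrieveGP)
    #  9 : string constants, 10 : numeric constants (getfunc_consts)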
def get_func_names(ea):
    funcs = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        funcs[funcname] = funcea
    return funcs

def get_func_bases(ea):
    funcs = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        funcs[funcea] = funcname
    return funcs

def get_func_range(ea):
    funcs = {}
    for funcea in Functions(SegStart(ea)):
        funcname = GetFunctionName(funcea)
        func = get_func(funcea)
        funcs[funcname] = (func.startEA, func.endEA)
    return funcs

def get_func_sequences(ea):
    funcs_bodylist = {}
    funcs = get_funcs(ea)
    for funcname in funcs:
        if funcname not in funcs_bodylist:
            funcs_bodylist[funcname] = []
        for start, end in funcs[funcname]:
            inst_addr = start
            while inst_addr <= end:
                opcode = GetMnem(inst_addr)
                funcs_bodylist[funcname].append(opcode)
                inst_addr = NextHead(inst_addr)
    return funcs_bodylist

def get_func_cfgs(ea):
    func_cfglist = {}
    i = 0
    start, end = get_section('LOAD')
    #print start, end
    for funcea in Functions(SegStart(ea)):
        if start <= funcea <= end:
            funcname = GetFunctionName(funcea)
            func = get_func(funcea)
            print i
            i += 1
            try:
                # NOTE: 'cfg' (a CFG-constructor module) is never imported in
                # this file, so this call fails and is hidden by the bare except.
                icfg = cfg.cfg_construct(func)
                func_cfglist[funcname] = icfg
            except:
                pass

    return func_cfglist

def get_section(t):
    base = SegByName(t)
    start = SegByBase(base)
    end = SegEnd(start)
    return start, end


def get_func_cfg_sequences(func_cfglist):
    func_cfg_seqlist = {}
    for funcname in func_cfglist:
        func_cfg_seqlist[funcname] = {}
        cfg = func_cfglist[funcname][0]
        for start, end in cfg:
            codesq = get_sequences(start, end)
            func_cfg_seqlist[funcname][(start, end)] = codesq

    return func_cfg_seqlist


def get_sequences(start, end):
    seq = []
    inst_addr = start
    while inst_addr <= end:
        opcode = GetMnem(inst_addr)
        seq.append(opcode)
        inst_addr = NextHead(inst_addr)
    return seq

def get_stack_arg(func_addr):
    print func_addr
    args = []
    stack = GetFrame(func_addr)
    if not stack:
        return []
    firstM = GetFirstMember(stack)
    lastM = GetLastMember(stack)
    i = firstM
    while i <= lastM:
        mName = GetMemberName(stack, i)
        mSize = GetMemberSize(stack, i)
        if mSize:
            i = i + mSize
        else:
            i = i + 4
        if mName not in args and mName and ' s' not in mName and ' r' not in mName:
            args.append(mName)
    return args

#pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))

def processDataSegs():
    funcdata = {}
    datafunc = {}
    for n in xrange(idaapi.get_segm_qty()):
        seg = idaapi.getnseg(n)
        ea = seg.startEA
        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
        if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
            start = idc.SegStart(ea)
            end = idc.SegEnd(ea)
            cur = start
            while cur <= end:
                refs = [v for v in DataRefsTo(cur)]
                for fea in refs:
                    name = GetFunctionName(fea)
                    if len(name) == 0:
                        continue
                    if name not in funcdata:
                        funcdata[name] = [cur]
                    else:
                        funcdata[name].append(cur)
                    if cur not in datafunc:
                        datafunc[cur] = [name]
                    else:
                        datafunc[cur].append(name)
                cur = NextHead(cur)
    return funcdata, datafunc

def obtainDataRefs(callgraph):
    datarefs = {}
    funcdata, datafunc = processDataSegs()
    for node in callgraph:
        if node in funcdata:
            datas = funcdata[node]
            for dd in datas:
                refs = datafunc[dd]
                refs = list(set(refs))
                if node in datarefs:
                    print refs
                    datarefs[node] += refs
                    datarefs[node] = list(set(datarefs[node]))
                else:
                    datarefs[node] = refs
    return datarefs
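A usage sketch for the data-reference helpers above, assuming an interactive IDAPython session in which idautils/idc names are available (this module itself relies on them being star-imported elsewhere); here() is IDA's current-address helper:

    callgraph = get_func_names(here())      # function name -> entry ea
    datarefs = obtainDataRefs(callgraph)    # name -> functions sharing a data item
    print len(datarefs), 'functions with shared data references'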
@@ -1,285 +0,0 @@
#
# Reference Lister
#
# List all functions and all references to them in the current section.
#
# Implemented with the idautils module
#
from idautils import *
from idaapi import *
from idc import *
import networkx as nx
import cfg_constructor as cfg
import cPickle as pickle
import pdb
from raw_graphs import *
#from discovRe_feature.discovRe import *
from discovRe import *
#import wingdbstub
#wingdbstub.Ensure()

def gt_funcNames(ea):
    funcs = []
    plt_func, plt_data = processpltSegs()
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        if funcname in plt_func:
            print funcname
            continue
        funcs.append(funcname)
    return funcs

def get_funcs(ea):
    funcs = {}
    # Get current ea
    # Loop from start to end in the current segment
    plt_func, plt_data = processpltSegs()
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        if funcname in plt_func:
            continue
        func = get_func(funcea)
        blocks = FlowChart(func)
        funcs[funcname] = []
        for bl in blocks:
            start = bl.startEA
            end = bl.endEA
            funcs[funcname].append((start, end))
    return funcs

# used for the callgraph generation.
def get_func_namesWithoutE(ea):
    funcs = {}
    plt_func, plt_data = processpltSegs()
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        if 'close' in funcname:
            print funcea
        if funcname in plt_func:
            print funcname
            continue
        funcs[funcname] = funcea
    return funcs

# used for the callgraph generation.
def get_func_names(ea):
    funcs = {}
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        funcs[funcname] = funcea
    return funcs

def get_func_bases(ea):
    funcs = {}
    plt_func, plt_data = processpltSegs()
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        if funcname in plt_func:
            continue
        funcs[funcea] = funcname
    return funcs

def get_func_range(ea):
    funcs = {}
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        func = get_func(funcea)
        funcs[funcname] = (func.startEA, func.endEA)
    return funcs

def get_unified_funcname(ea):
    funcname = GetFunctionName(ea)
    if len(funcname) > 0:
        if '.' == funcname[0]:
            funcname = funcname[1:]
    return funcname

def get_func_sequences(ea):
    funcs_bodylist = {}
    funcs = get_funcs(ea)
    for funcname in funcs:
        if funcname not in funcs_bodylist:
            funcs_bodylist[funcname] = []
        for start, end in funcs[funcname]:
            inst_addr = start
            while inst_addr <= end:
                opcode = GetMnem(inst_addr)
                funcs_bodylist[funcname].append(opcode)
                inst_addr = NextHead(inst_addr)
    return funcs_bodylist

def get_func_cfgs_c(ea):
    binary_name = idc.GetInputFile()
    raw_cfgs = raw_graphs(binary_name)
    externs_eas, ea_externs = processpltSegs()
    i = 0
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        func = get_func(funcea)
        print i
        i += 1
        icfg = cfg.getCfg(func, externs_eas, ea_externs)
        func_f = get_discoverRe_feature(func, icfg[0])
        raw_g = raw_graph(funcname, icfg, func_f)
        raw_cfgs.append(raw_g)

    return raw_cfgs
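A sketch of how this extractor is typically driven; this mirrors the standalone preprocessing script that appears later in this diff, and FirstSeg() / GetInputFile() are standard IDAPython calls:

    cfgs = get_func_cfgs_c(FirstSeg())
    pickle.dump(cfgs, open(idc.GetInputFile() + '.ida', 'w'))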
def get_func_cfgs_ctest(ea):
    binary_name = idc.GetInputFile()
    raw_cfgs = raw_graphs(binary_name)
    externs_eas, ea_externs = processpltSegs()
    i = 0
    diffs = {}
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        func = get_func(funcea)
        print i
        i += 1
        icfg, old_cfg = cfg.getCfg(func, externs_eas, ea_externs)
        diffs[funcname] = (icfg, old_cfg)
        #raw_g = raw_graph(funcname, icfg)
        #raw_cfgs.append(raw_g)

    return diffs

def get_func_cfgs(ea):
    func_cfglist = {}
    i = 0
    for funcea in Functions(SegStart(ea)):
        funcname = get_unified_funcname(funcea)
        func = get_func(funcea)
        print i
        i += 1
        try:
            # NOTE: cfg.getCfg is called elsewhere with three arguments; this
            # bare call likely fails and is silently swallowed by the except.
            icfg = cfg.getCfg(func)
            func_cfglist[funcname] = icfg
        except:
            pass

    return func_cfglist

def get_func_cfg_sequences(func_cfglist):
    func_cfg_seqlist = {}
    for funcname in func_cfglist:
        func_cfg_seqlist[funcname] = {}
        cfg = func_cfglist[funcname][0]
        for start, end in cfg:
            codesq = get_sequences(start, end)
            func_cfg_seqlist[funcname][(start, end)] = codesq

    return func_cfg_seqlist


def get_sequences(start, end):
    seq = []
    inst_addr = start
    while inst_addr <= end:
        opcode = GetMnem(inst_addr)
        seq.append(opcode)
        inst_addr = NextHead(inst_addr)
    return seq

def get_stack_arg(func_addr):
    print func_addr
    args = []
    stack = GetFrame(func_addr)
    if not stack:
        return []
    firstM = GetFirstMember(stack)
    lastM = GetLastMember(stack)
    i = firstM
    while i <= lastM:
        mName = GetMemberName(stack, i)
        mSize = GetMemberSize(stack, i)
        if mSize:
            i = i + mSize
        else:
            i = i + 4
        if mName not in args and mName and ' s' not in mName and ' r' not in mName:
            args.append(mName)
    return args

#pickle.dump(funcs, open('C:/Documents and Settings/Administrator/Desktop/funcs','w'))

def processExternalSegs():
    funcdata = {}
    datafunc = {}
    for n in xrange(idaapi.get_segm_qty()):
        seg = idaapi.getnseg(n)
        ea = seg.startEA
        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
        if segtype in [idc.SEG_XTRN]:
            start = idc.SegStart(ea)
            end = idc.SegEnd(ea)
            cur = start
            while cur <= end:
                name = get_unified_funcname(cur)
                funcdata[name] = hex(cur)
                cur = NextHead(cur)
    return funcdata

def processpltSegs():
    funcdata = {}
    datafunc = {}
    for n in xrange(idaapi.get_segm_qty()):
        seg = idaapi.getnseg(n)
        ea = seg.startEA
        segname = SegName(ea)
        if segname in ['.plt', 'extern', '.MIPS.stubs']:
            start = seg.startEA
            end = seg.endEA
            cur = start
            while cur < end:
                name = get_unified_funcname(cur)
                funcdata[name] = hex(cur)
                datafunc[cur] = name
                cur = NextHead(cur)
    return funcdata, datafunc


def processDataSegs():
    funcdata = {}
    datafunc = {}
    for n in xrange(idaapi.get_segm_qty()):
        seg = idaapi.getnseg(n)
        ea = seg.startEA
        segtype = idc.GetSegmentAttr(ea, idc.SEGATTR_TYPE)
        if segtype in [idc.SEG_DATA, idc.SEG_BSS]:
            start = idc.SegStart(ea)
            end = idc.SegEnd(ea)
            cur = start
            while cur <= end:
                refs = [v for v in DataRefsTo(cur)]
                for fea in refs:
                    name = get_unified_funcname(fea)
                    if len(name) == 0:
                        continue
                    if name not in funcdata:
                        funcdata[name] = [cur]
                    else:
                        funcdata[name].append(cur)
                    if cur not in datafunc:
                        datafunc[cur] = [name]
                    else:
                        datafunc[cur].append(name)
                cur = NextHead(cur)
    return funcdata, datafunc

def obtainDataRefs(callgraph):
    datarefs = {}
    funcdata, datafunc = processDataSegs()
    for node in callgraph:
        if node in funcdata:
            datas = funcdata[node]
            for dd in datas:
                refs = datafunc[dd]
                refs = list(set(refs))
                if node in datarefs:
                    print refs
                    datarefs[node] += refs
                    datarefs[node] = list(set(datarefs[node]))
                else:
                    datarefs[node] = refs
    return datarefs
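A quick interactive check of the segment helpers above, assuming an ELF with a .plt section loaded in IDA; the stub names returned here are exactly the ones the function walkers skip so thunks do not pollute the call graph:

    plt_func, plt_data = processpltSegs()
    print len(plt_func), 'stub names that the function walkers will skip'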
@@ -1,257 +0,0 @@
from idautils import *
from idaapi import *
from idc import *

def getfunc_consts(func):
    strings = []
    consts = []
    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
    for bl in blocks:
        strs, conts = getBBconsts(bl)
        strings += strs
        consts += conts
    return strings, consts

def getConst(ea, offset):
    strings = []
    consts = []
    optype1 = GetOpType(ea, offset)
    if optype1 == idaapi.o_imm:
        imm_value = GetOperandValue(ea, offset)
        if 0 <= imm_value <= 10:
            consts.append(imm_value)
        else:
            if idaapi.isLoaded(imm_value) and idaapi.getseg(imm_value):
                str_value = GetString(imm_value)
                if str_value is None:
                    str_value = GetString(imm_value + 0x40000)
                    if str_value is None:
                        consts.append(imm_value)
                    else:
                        re = all(40 <= ord(c) < 128 for c in str_value)
                        if re:
                            strings.append(str_value)
                        else:
                            consts.append(imm_value)
                else:
                    re = all(40 <= ord(c) < 128 for c in str_value)
                    if re:
                        strings.append(str_value)
                    else:
                        consts.append(imm_value)
            else:
                consts.append(imm_value)
    return strings, consts

def getBBconsts(bl):
    strings = []
    consts = []
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        opcode = GetMnem(inst_addr)
        if opcode in ['la', 'jalr', 'call', 'jal']:
            inst_addr = NextHead(inst_addr)
            continue
        strings_src, consts_src = getConst(inst_addr, 0)
        strings_dst, consts_dst = getConst(inst_addr, 1)
        strings += strings_src
        strings += strings_dst
        consts += consts_src
        consts += consts_dst
        try:
            strings_dst, consts_dst = getConst(inst_addr, 2)
            consts += consts_dst
            strings += strings_dst
        except:
            pass

        inst_addr = NextHead(inst_addr)
    return strings, consts
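A usage sketch for the constant harvesting above, assuming an IDAPython session (here() is IDA's current-address helper): immediate operands up to 10 are kept as plain numeric constants, larger ones are probed as potential string pointers before falling back to numbers.

    func = get_func(here())
    strings, consts = getfunc_consts(func)
    print strings[:5], consts[:5]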
def getFuncCalls(func):
    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
    sumcalls = 0
    for bl in blocks:
        callnum = calCalls(bl)
        sumcalls += callnum
    return sumcalls

def getLogicInsts(func):
    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
    sumcalls = 0
    for bl in blocks:
        callnum = calLogicInstructions(bl)
        sumcalls += callnum
    return sumcalls

def getTransferInsts(func):
    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
    sumcalls = 0
    for bl in blocks:
        callnum = calTransferIns(bl)
        sumcalls += callnum
    return sumcalls

def getIntrs(func):
    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
    sumcalls = 0
    for bl in blocks:
        callnum = calInsts(bl)
        sumcalls += callnum
    return sumcalls

def getLocalVariables(func):
    args_num = get_stackVariables(func.startEA)
    return args_num

def getBasicBlocks(func):
    blocks = [(v.startEA, v.endEA) for v in FlowChart(func)]
    return len(blocks)

def getIncommingCalls(func):
    refs = CodeRefsTo(func.startEA, 0)
    re = len([v for v in refs])
    return re


def get_stackVariables(func_addr):
    #print func_addr
    args = []
    stack = GetFrame(func_addr)
    if not stack:
        return 0
    firstM = GetFirstMember(stack)
    lastM = GetLastMember(stack)
    i = firstM
    while i <= lastM:
        mName = GetMemberName(stack, i)
        mSize = GetMemberSize(stack, i)
        if mSize:
            i = i + mSize
        else:
            i = i + 4
        if mName not in args and mName and 'var_' in mName:
            args.append(mName)
    return len(args)


def calArithmeticIns(bl):
    x86_AI = {'add':1, 'sub':1, 'div':1, 'imul':1, 'idiv':1, 'mul':1, 'shl':1, 'dec':1, 'inc':1}
    mips_AI = {'add':1, 'addu':1, 'addi':1, 'addiu':1, 'mult':1, 'multu':1, 'div':1, 'divu':1}
    calls = {}
    calls.update(x86_AI)
    calls.update(mips_AI)
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        opcode = GetMnem(inst_addr)
        if opcode in calls:
            invoke_num += 1
        inst_addr = NextHead(inst_addr)
    return invoke_num

def calCalls(bl):
    calls = {'call':1, 'jal':1, 'jalr':1}
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        opcode = GetMnem(inst_addr)
        if opcode in calls:
            invoke_num += 1
        inst_addr = NextHead(inst_addr)
    return invoke_num

def calInsts(bl):
    start = bl[0]
    end = bl[1]
    ea = start
    num = 0
    while ea < end:
        num += 1
        ea = NextHead(ea)
    return num

def calLogicInstructions(bl):
    x86_LI = {'and':1, 'andn':1, 'andnpd':1, 'andpd':1, 'andps':1, 'andnps':1, 'test':1, 'xor':1, 'xorpd':1, 'pslld':1}
    mips_LI = {'and':1, 'andi':1, 'or':1, 'ori':1, 'xor':1, 'nor':1, 'slt':1, 'slti':1, 'sltu':1}
    calls = {}
    calls.update(x86_LI)
    calls.update(mips_LI)
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        opcode = GetMnem(inst_addr)
        if opcode in calls:
            invoke_num += 1
        inst_addr = NextHead(inst_addr)
    return invoke_num

def calSconstants(bl):
    # NOTE: 'calls' is undefined in this function, so it would raise a
    # NameError if executed; it appears to be dead code left over from calCalls.
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        opcode = GetMnem(inst_addr)
        if opcode in calls:
            invoke_num += 1
        inst_addr = NextHead(inst_addr)
    return invoke_num


def calNconstants(bl):
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        optype1 = GetOpType(inst_addr, 0)
        optype2 = GetOpType(inst_addr, 1)
        if optype1 == 5 or optype2 == 5:  # 5 == idaapi.o_imm
            invoke_num += 1
        inst_addr = NextHead(inst_addr)
    return invoke_num

def retrieveExterns(bl, ea_externs):
    externs = []
    start = bl[0]
    end = bl[1]
    inst_addr = start
    while inst_addr < end:
        refs = CodeRefsFrom(inst_addr, 1)
        try:
            ea = [v for v in refs if v in ea_externs][0]
            externs.append(ea_externs[ea])
        except:
            pass
        inst_addr = NextHead(inst_addr)
    return externs

def calTransferIns(bl):
    x86_TI = {'jmp':1, 'jz':1, 'jnz':1, 'js':1, 'je':1, 'jne':1, 'jg':1, 'jle':1, 'jge':1, 'ja':1, 'jnc':1, 'call':1}
    mips_TI = {'beq':1, 'bne':1, 'bgtz':1, "bltz":1, "bgez":1, "blez":1, 'j':1, 'jal':1, 'jr':1, 'jalr':1}
    arm_TI = {'MVN':1, "MOV":1}
    calls = {}
    calls.update(x86_TI)
    calls.update(mips_TI)
    # NOTE: arm_TI is defined but never merged into calls, so ARM transfer
    # instructions are not counted.
    start = bl[0]
    end = bl[1]
    invoke_num = 0
    inst_addr = start
    while inst_addr < end:
        opcode = GetMnem(inst_addr)
        # substring match: any mnemonic contained in one of the keys counts
        # (e.g. 'j' matches 'jmp'), which is looser than an equality test
        re = [v for v in calls if opcode in v]
        if len(re) > 0:
            invoke_num += 1
        inst_addr = NextHead(inst_addr)
    return invoke_num
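The per-block counters above feed the basic-block attributes ('numCalls', 'numIns', 'numLIs', 'numTIs', and so on) consumed by raw_graph.retrieveVec further down in this diff. A sketch over one block tuple, with hypothetical addresses:

    bl = (0x401000, 0x401040)   # (startEA, endEA) of one basic block
    print calInsts(bl), calCalls(bl), calLogicInstructions(bl), calTransferIns(bl)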
@@ -1,24 +0,0 @@
import networkx as nx
import pdb

def betweeness(g):
    #pdb.set_trace()
    betweenness = nx.betweenness_centrality(g)
    return betweenness

def eigenvector(g):
    centrality = nx.eigenvector_centrality(g)
    return centrality

def closeness_centrality(g):
    closeness = nx.closeness_centrality(g)
    return closeness

def retrieveGP(g):
    bf = betweeness(g)
    #close = closeness_centrality(g)
    #bf_sim =
    #close_sim =
    x = sorted(bf.values())
    value = sum(x) / len(x)
    return round(value, 5)
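Unlike the rest of the pipeline, retrieveGP needs nothing from IDA, so it can be sanity-checked on a toy graph:

    g = nx.DiGraph()
    g.add_edges_from([(0, 1), (1, 2), (0, 2)])
    print retrieveGP(g)   # mean betweenness centrality, rounded to 5 places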
@@ -1,27 +0,0 @@
from func import *
from raw_graphs import *
from idc import *
import os
import argparse

def parse_command():
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument("--path", type=str, help="The directory where to store the generated .ida file")
    args = parser.parse_args()
    return args

if __name__ == '__main__':

    args = parse_command()
    path = args.path
    analysis_flags = idc.GetShortPrm(idc.INF_START_AF)
    analysis_flags &= ~idc.AF_IMMOFF
    # turn off "automatically make offset" heuristic
    idc.SetShortPrm(idc.INF_START_AF, analysis_flags)
    idaapi.autoWait()
    cfgs = get_func_cfgs_c(FirstSeg())
    binary_name = idc.GetInputFile() + '.ida'
    fullpath = os.path.join(path, binary_name)
    pickle.dump(cfgs, open(fullpath, 'w'))   # pickle arrives via func's star import (cPickle)
    print binary_name
    idc.Exit(0)
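This script is meant to run inside IDA in batch mode. A typical headless invocation might look like the line below; the script and binary names are assumptions, since the diff view does not show the file names, and idal is IDA's text-mode executable:

    idal -A -S"preprocessing_ida.py --path /tmp/out" ./busybox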
@@ -1,286 +0,0 @@
import itertools
import sys
sys.path.insert(0, '/usr/local/lib/python2.7/dist-packages/')
import networkx as nx
import numpy as np   # needed by createG below
from subprocess import Popen, PIPE
import pdb
import os
import re, mmap
#from graph_edit_new import *


class raw_graph:
    def __init__(self, funcname, g, func_f):
        self.funcname = funcname
        self.old_g = g[0]
        self.g = nx.DiGraph()
        self.entry = g[1]
        self.fun_features = func_f
        self.attributing()

    def __len__(self):
        return len(self.g)

    def attributing(self):
        self.obtainOffsprings(self.old_g)
        for node in self.old_g:
            fvector = self.retrieveVec(node, self.old_g)
            self.g.add_node(node)
            self.g.node[node]['v'] = fvector

        for edge in self.old_g.edges():
            node1 = edge[0]
            node2 = edge[1]
            self.g.add_edge(node1, node2)

    def obtainOffsprings(self, g):
        nodes = g.nodes()
        for node in nodes:
            offsprings = {}
            self.getOffsprings(g, node, offsprings)
            g.node[node]['offs'] = len(offsprings)
        return g

    def getOffsprings(self, g, node, offsprings):
        node_offs = 0
        sucs = g.successors(node)
        for suc in sucs:
            if suc not in offsprings:
                offsprings[suc] = 1
                self.getOffsprings(g, suc, offsprings)

    def retrieveVec(self, id_, g):
        feature_vec = []
        #numC0
        numc = g.node[id_]['consts']
        feature_vec.append(numc)
        #nums1
        nums = g.node[id_]['strings']
        feature_vec.append(nums)
        #offsprings2
        offs = g.node[id_]['offs']
        feature_vec.append(offs)
        #numAs3
        numAs = g.node[id_]['numAs']
        feature_vec.append(numAs)
        # of calls4
        calls = g.node[id_]['numCalls']
        feature_vec.append(calls)
        # of insts5
        insts = g.node[id_]['numIns']
        feature_vec.append(insts)
        # of LIs6
        insts = g.node[id_]['numLIs']
        feature_vec.append(insts)
        # of TIs7
        insts = g.node[id_]['numTIs']
        feature_vec.append(insts)
        return feature_vec

    def enumerating(self, n):
        subgs = []
        #pdb.set_trace()
        for sub_nodes in itertools.combinations(self.g.nodes(), n):
            subg = self.g.subgraph(sub_nodes)
            u_subg = subg.to_undirected()
            if nx.is_connected(u_subg):
                subgs.append(subg)
        return subgs

    def genMotifs(self, n):
        motifs = {}
        subgs = self.enumerating(n)
        for subg in subgs:
            if len(motifs) == 0:
                motifs[subg] = [subg]
            else:
                nomatch = True
                for mt in motifs:
                    if nx.is_isomorphic(mt, subg):
                        motifs[mt].append(subg)
                        nomatch = False
                if nomatch:
                    motifs[subg] = [subg]
        return motifs

    def enumerating_efficient(self, n):
        #pdb.set_trace()
        if len(self.g) >= 200:
            return []
        with open('/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt', 'wb') as f:
            nx.write_edgelist(self.g, f, data=False)
        #pdb.set_trace()
        process = Popen(["/home/qian/workspace/FANMOD-command_line-source/executables/./fanmod_command_line_linux", str(n), "100000", "1", "/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt", "1", "0", "0", "2", "0", "0", "0", "1000", "3", "3", "/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt", "0", "1"], stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()
        if process.returncode >= 0:
            #os.system("/home/qian/software/FANMOD-command_line-source/executables/./fanmod_command_line_linux " +str(n) + " 100000 1 /home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/OUTPUT.txt 1 0 0 2 0 0 0 1000 3 3 /home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt 0 1")
            #pdb.set_trace()
            subgs = self.parseOutput("/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt.dump", n)
            #pdb.set_trace()
            os.remove("/home/qian/workspace/gEnding/gencoding/encoding/labeled/data/preprocessing/MotifCount.txt.dump")
            return subgs
        return []

    def parseOutput(self, path, n):
        pattern = re.compile('[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+')
        subgraphs = []
        with open(path, 'r') as f:
            data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
            mo = re.findall(pattern, data)
            if mo:
                results = [map(int, v.split(',')[1:]) for v in mo]
                subgraphs = self.createGraphDirectly(results)
        return subgraphs

    def parseOutputByconditions(self, path, n):
        pattern = re.compile('[0-9]+\,[0-9]+\,[0-9]+\,[0-9]+')
        subgraphs = []
        with open(path, 'r') as f:
            data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
            mo = re.findall(pattern, data)
            if mo:
                results = [map(int, v.split(',')[1:]) for v in mo]
                subgraphs = self.create_Graphbycondition_Directly(results)
        return subgraphs

    def create_Graphbycondition_Directly(self, results):
        subgs = []
        for indexes in results:
            tg = template_graph()
            subg = self.g.subgraph(indexes)
            tg.updateG(subg)
            subgs.append(tg)
            del tg
        return subgs

    def createGraphDirectly(self, results):
        #pdb.set_trace()
        #subgs = [self.g.subgraph(indexes) for indexes in results]
        subgs = []
        for indexes in results:
            tg = template_graph()
            subg = self.g.subgraph(indexes)
            tg.updateG(subg)
            subgs.append(tg)
            del tg
        return subgs

    def createGraph(self, results, n):
        binary_value = int(results[0], 2)
        indexes = [int(v) for v in results[1:]]
        fang = self.createG(results[0], n)
        if fang:
            tg = template_graph(binary_value)
            # NOTE: template_graph.updateG takes a single graph argument; this
            # three-argument call does not match the class defined below.
            tg.updateG(fang, indexes, self.g)
            return tg
        pdb.set_trace()
        print "there is g which is none"

    def createG(self, binary_str, n):
        g = nx.DiGraph()
        l = [int(v) for v in binary_str]
        #pdb.set_trace()
        shape = (n, n)
        data = np.array(l)
        ad_matrix = data.reshape(shape)
        for i in xrange(n):
            for j in xrange(n):
                if ad_matrix[i][j] == 1:
                    g.add_edge(i, j)
        return g
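For reference, the per-node attribute vector emitted by retrieveVec above, in order:

    # [consts, strings, offspring, numAs, numCalls, numIns, numLIs, numTIs]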
class raw_graphs:
    def __init__(self, binary_name):
        self.binary_name = binary_name
        self.raw_graph_list = []

    def append(self, raw_g):
        self.raw_graph_list.append(raw_g)

    def __len__(self):
        return len(self.raw_graph_list)


class graphlets:
    def __init__(self, funcname):
        self.funcname = funcname
        self.graphlets_list = []
        self.binary_name = None

    def updateBN(self, binary_name):
        self.binary_name = binary_name

    def append(self, subg):
        self.graphlets_list.append(subg)

    def appendSet(self, subgs):
        self.graphlets_list += subgs

    def __len__(self):
        return len(self.graphlets_list)


class template_graph:
    def __init__(self, value=None):
        self.value = value
        self.g = None

    def updateG(self, g):
        self.g = g
    #def updateIndexes(self, indexes):
    #    self.indexes = indexes

    #def updateAttributes(self, pg, indexes, maing):
    #    for id_ in xrange(len(indexes)):
    #        index = indexes[id_]
    #        gnode = self.findNode(index, maing)
    #        self.g.node[gnode] = pg.node[index]


class template_graphs:
    def __init__(self, size):
        self.size = size
        self.gs = []
        self.bit_len = None

    def enumeratingAll(self):
        subgs = []
        binary_value = self.genBinValue()
        for i in xrange(binary_value):
            if i == 0:
                continue
            g = self.createG(i)
            if g:
                tg = template_graph(i)
                tg.updateG(g)
                self.gs.append(tg)

    def genBinValue(self):
        n = self.size
        self.bit_len = n * n
        return 2 ** self.bit_len

    def createG(self, i):
        g = nx.DiGraph()
        l = self.genArray(i)
        #pdb.set_trace()
        shape = (self.size, self.size)
        data = np.array(l)
        ad_matrix = data.reshape(shape)
        for i in xrange(self.size):
            for j in xrange(self.size):
                if ad_matrix[i][j] == 1:
                    g.add_edge(i, j)
        u_g = g.to_undirected()
        if len(g) == self.size and nx.is_connected(u_g):
            return g
        return False

    def genArray(self, i):
        l = [int(x) for x in bin(i)[2:]]
        x = [0 for v in xrange(self.bit_len - len(l))]
        return x + l
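A construction sketch, assuming an attributed digraph plus entry node as produced by cfg.getCfg and a function-level feature list as produced by get_discoverRe_feature (both earlier in this diff); attributed_g, entry_node and func_f are placeholders:

    rg = raw_graph('main', (attributed_g, entry_node), func_f)
    container = raw_graphs('busybox')
    container.append(rg)
    print len(container)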
@@ -1,356 +0,0 @@
import cPickle as pickle
from search import *
from nearpy import Engine
from nearpy.hashes import RandomDiscretizedProjections
from nearpy.filters import NearestFilter, UniqueFilter
from nearpy.distances import EuclideanDistance
from nearpy.distances import CosineDistance
from nearpy.hashes import RandomBinaryProjections
from nearpy.experiments import DistanceRatioExperiment
from redis import Redis
from nearpy.storage import RedisStorage
from feature import *
import numpy as np
import os
import pdb
import argparse
import time
from refactoring import *
import pymongo
from pymongo import MongoClient

def initDB():
    # connect to the local MongoDB instance and select the iot-encoding database
    client = MongoClient('mongodb://localhost:27017/')
    db = client['iot-encoding']
    return db

db = initDB()
posts = db.posts


class db:   # NOTE: shadows the module-level MongoDB handle 'db' created above

    def __init__(self):
        self.feature_list = {}
        self.engine = None

    def loadHashmap(self, feature_size, result_n):
        # Create redis storage adapter
        redis_object = Redis(host='localhost', port=6379, db=0)
        redis_storage = RedisStorage(redis_object)
        pdb.set_trace()
        try:
            # Get hash config from redis
            config = redis_storage.load_hash_configuration('test')
            # Config is existing, create hash with None parameters
            lshash = RandomBinaryProjections(None, None)
            # Apply configuration loaded from redis
            lshash.apply_config(config)

        except:
            # Config is not existing, create hash from scratch, with 10 projections
            lshash = RandomBinaryProjections('test', 0)

        # Create engine for feature space of 100 dimensions and use our hash.
        # This will set the dimension of the lshash only the first time, not when
        # using the configuration loaded from redis. Use redis storage to store
        # buckets.
        nearest = NearestFilter(1000)
        #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
        pdb.set_trace()
        self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())

        # Do some stuff like indexing or querying with the engine...

        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)

    def appendToDB(self, binary_name, funcname, fvector, firmware_name=""):
        if fvector is None:
            return
        #ftuple = tuple([fvector])
        self.engine.store_vector(np.asarray(fvector), ".".join((firmware_name, binary_name, funcname)))

    def batch_appendDB(self, binary_name, features, firmware_name=""):
        for funcname in features:
            feature = features[funcname]
            #pdb.set_trace()
            self.appendToDB(binary_name, funcname, feature, firmware_name)

    def batch_appendDBbyDir(self, base_dir):
        cursor = posts.find({"firmware_name": "ddwrt-r21676_result"})
        i = 0
        for v in cursor:
            print i
            i += 1
            binary_name = v['binary_name']
            funcname = v['func_name']
            firmware_name = v['firmware_name']
            feature = v['fvector']
            self.appendToDB(binary_name, funcname, feature, firmware_name)

    def batch_appendDBbyDir1(self, base_dir):
        image_dir = os.path.join(base_dir, "image")
        firmware_featrues = {}
        bnum = 0
        fnum = 0
        i = 0
        pdb.set_trace()
        for firmware_name in os.listdir(image_dir):
            print firmware_name
            firmware_featrues[firmware_name] = {}
            firmware_dir = os.path.join(image_dir, firmware_name)
            for binary_name in os.listdir(firmware_dir):
                if binary_name.endswith(".features"):
                    bnum += 1
                    featrues_dir = os.path.join(firmware_dir, binary_name)
                    featrues = pickle.load(open(featrues_dir, "r"))
                    for funcname in featrues:
                        fnum += 1
                        #pdb.set_trace()
                        feature = featrues[funcname]
                        self.appendToDB(binary_name, funcname, feature, firmware_name)
                    del featrues
        print("bnum ", bnum)
        print("fnum ", fnum)

    def dump(self, base_dir):
        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
        pickle.dump(self.feature_list, open(db_dir, 'w'))
        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
        pickle.dump(self.engine, open(db_dir, 'w'))

    def loadDB(self, base_dir):
        db_dir = os.path.join(base_dir, "data/db/busybox.feature_mapping")
        self.feature_list = pickle.load(open(db_dir, 'r'))
        db_dir = os.path.join(base_dir, "data/db/busybox.hashmap")
        self.engine = pickle.load(open(db_dir, 'r'))

    def findF(self, binary_name, funcname):
        x = [v for v in self.feature_list if binary_name in self.feature_list[v] and funcname in self.feature_list[v][binary_name]]
        return x[0]
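A usage sketch for the LSH wrapper above, assuming a local Redis on port 6379; note that the engine is hard-wired to 192-dimensional vectors regardless of the feature_size argument, and the stray pdb.set_trace() calls in loadHashmap will pause execution unless removed:

    d = db()
    d.loadHashmap(192, 1000)
    d.appendToDB('busybox', 'main', [0.0] * 192, 'ddwrt')
    print d.engine.neighbours(np.asarray([0.0] * 192))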
def retrieveFeaturesByDir(n, base_dir):
    firmware_featrues = {}
    i = 0
    for firmware_name in os.listdir(base_dir):
        if firmware_name.endswith(".features"):
            firmware_featrues[firmware_name] = {}
            firmware_dir = os.path.join(base_dir, firmware_name)
            if i > 0:
                break
            i += 1
            pdb.set_trace()
            for binary_name in os.listdir(firmware_dir):
                featrues_dir = os.path.join(firmware_dir, binary_name + "_cb" + str(n) + ".features")
                featrues = pickle.load(open(featrues_dir, "r"))
                for funcname in featrues:
                    feature = featrues[funcname]
                    # NOTE: 'self' is undefined in this module-level function;
                    # this call only makes sense on a db instance.
                    self.appendToDB(firmware_name, binary_name, funcname, feature)
                del featrues

def retrieveFeatures(n, base_dir, filename, funcs):
    feature_dic = {}
    featrues_dir = os.path.join(base_dir, "5000", filename + "_cb" + str(n) + ".features")
    featrues = pickle.load(open(featrues_dir, "r"))
    #featuresx = retrieveFeaturesx(filename)
    for name in featrues:
        #if name in funcs:
        x = featrues[name]
        #+ featuresx[name]
        feature_dic[name] = np.asarray(x)
    return feature_dic

def retrieveVuldb(base_input_dir):
    vul_path = os.path.join(base_input_dir, "vul")
    vul_db = pickle.load(open(vul_path, "r"))
    return vul_db


def retrieveFeaturesx(filename):
    ida_input_dir = os.path.join("./data/", filename + ".features")
    featuresx = pickle.load(open(ida_input_dir, "r"))
    return featuresx

def retrieveQueries(n, base_dir, filename1, featrues_src):
    queries = {}
    featrues_dir = os.path.join(base_dir, "5000", filename1 + "_cb" + str(n) + ".features")
    featrues = pickle.load(open(featrues_dir, "r"))
    #featuresx = retrieveFeaturesx(filename1)
    for name in featrues:
        #if name in featrues_src:
        x = featrues[name]
        #+ featuresx[name]
        queries[name] = np.asarray(x)
    return queries

def retrieveQueriesbyDir(n, base_dir, firmware_name, filename1):
    queries = {}
    featrues_dir = os.path.join(base_dir, firmware_name, filename1 + "_cb" + str(n) + ".features")
    featrues = pickle.load(open(featrues_dir, "r"))
    for name in featrues:
        #del featrues[name][5]
        queries[name] = np.asarray(featrues[name])
    return queries

def retrieveQuery(n, base_dir, filename, funcname):
    featrues_dir = os.path.join(base_dir, filename + "_cb" + str(n) + ".features")
    featrues = pickle.load(open(featrues_dir, "r"))
    f = [featrues[v] for v in featrues if funcname in v][0]
    return np.asarray(f)

def parse_command():
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument("--base_input_dir", type=str, help="raw binaries to process for training")
    parser.add_argument('--output_dir', type=str, help="output dir")
    parser.add_argument("--filename1", type=str, help="the feature file used to populate the search index")
    parser.add_argument("--filename2", type=str, help="the feature file used as queries")
    parser.add_argument("--size", type=int, help="the size of each graphlet")
    #parser.add_argument("--size", type=int, help="the size of each graphlet")
    args = parser.parse_args()
    return args

def loadFuncs(path):
    funcs = {}
    x86_dir = os.path.join(path, "func_candid")
    #mips_dir = os.path.join(path, "openssl1.0.1a_mips.ida")
    fp = open(x86_dir, "r")
    for line in fp:
        items = line.split("\n")
        funcname = items[0]
        funcs[funcname] = 1
    return funcs

def dump(path, featrues, queries):
    fp = open(path + "/" + "matrix", 'w')
    for name in featrues:
        row = []
        row.append("x86")
        row.append(name)
        row += featrues[name]
        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
    for name in queries:
        row = []
        row.append("mips")
        row.append(name)
        row += queries[name]
        fp.write("%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" % tuple(row))
    fp.close()


def queryBytwo(base_input_dir, filename1, filename2, n):
    threshold = 50
    db_instance = db()
    funcs = loadFuncs(base_input_dir)
    db_instance.loadHashmap(n, 50000)
    #pdb.set_trace()
    featrues = retrieveFeatures(n, base_input_dir, filename1, funcs)
    queries = retrieveQueries(n, base_input_dir, filename2, funcs)
    #queries = refactoring(queries, featrues)
    vul_db = retrieveVuldb(base_input_dir)
    pdb.set_trace()
    #dump(base_input_dir, featrues, queries)
    #start = time.time()
    #db_instance.batch_appendDBbyDir(base_input_dir)
    #end = time.time()
    #total = end - start
    #print total
    db_instance.batch_appendDB(filename1, featrues)
    pdb.set_trace()
    ranks = []
    times = []
    for threshold in xrange(1, 210, 10):
        hit = []
        i = 0
        for name in queries:
            #print i
            i += 1
            '''
            if i == 1000:
                print (sum(times)/len(times))
                pdb.set_trace()
                print "s"
            '''
            #if name not in vul_db['openssl']:
            #    continue
            if name not in featrues:
                continue
            #pdb.set_trace()
            query = queries[name]
            #start = time.time()
            x = db_instance.engine.neighbours(query)
            #end = time.time()
            #total = end - start
            #times.append(total)
            #print total
            #pdb.set_trace()
            try:
                rank = [v for v in xrange(len(x)) if name in x[v][1]][0]
                ranks.append((name, rank))
                if rank <= threshold:
                    hit.append(1)
                else:
                    hit.append(0)
            except:
                #pdb.set_trace()
                hit.append(0)
                pass
        #pdb.set_trace()
        acc = sum(hit) * 1.0 / len(hit)
        print acc

def queryAll(base_dir, firmware_name, filename1, n):
    threshold = 155
    db_instance = db()
    db_instance.loadHashmap(n, 50000)
    queries = retrieveQueriesbyDir(n, base_dir, firmware_name, filename1)
    start = time.time()
    pdb.set_trace()
    # NOTE: batch_appendDBbyDir takes only (base_dir); this two-argument call
    # does not match the method defined above.
    db_instance.batch_appendDBbyDir(n, base_dir)
    end = time.time()
    dur = end - start
    print dur
    pdb.set_trace()
    hit = []
    i = 0
    times = []
    for name in queries:
        print i
        i += 1
        query = queries[name]
        start = time.clock()
        x = db_instance.engine.neighbours(query)
        end = time.clock()
        dur = end - start
        times.append(dur)
        #pdb.set_trace()
        try:
            rank = [v for v in xrange(len(x)) if name in x[v][1]]
            if len(rank) > 1:
                pdb.set_trace()
                print "stop"
            if rank[0] <= threshold:
                hit.append(1)
            else:
                hit.append(0)
        except:
            hit.append(0)

    acc = sum(hit) * 1.0 / len(hit)
    mean = np.mean(times)
    std = np.std(times)
    #pdb.set_trace()
    print acc

if __name__ == "__main__":
    args = parse_command()
    base_dir = args.base_input_dir
    filename1 = args.filename1
    filename2 = args.filename2
    n = args.size
    pdb.set_trace()
    queryBytwo(base_dir, filename1, filename2, n)
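A typical invocation of the two-binary experiment; the script name and paths are hypothetical, while the flags match parse_command above (--filename1 populates the index, --filename2 supplies the queries):

    python db.py --base_input_dir ./data --filename1 openssl_x86 --filename2 openssl_mips --size 5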